Blame - arch/x86/kvm/x86.c - hafnium/third_party/linux.git

blob: f1a0eebdcf64175c403455a07eacf0c3846d5c1d [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* Kernel-based Virtual Machine driver for Linux
				4	*
				5	* derived from drivers/kvm/kvm_main.c
				6	*
				7	* Copyright (C) 2006 Qumranet, Inc.
				8	* Copyright (C) 2008 Qumranet, Inc.
				9	* Copyright IBM Corporation, 2008
				10	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				11	*
				12	* Authors:
				13	* Avi Kivity <avi@qumranet.com>
				14	* Yaniv Kamay <yaniv@qumranet.com>
				15	* Amit Shah <amit.shah@qumranet.com>
				16	* Ben-Ami Yassour <benami@il.ibm.com>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	17	*/
				18
				19	#include <linux/kvm_host.h>
				20	#include "irq.h"
				21	#include "mmu.h"
				22	#include "i8254.h"
				23	#include "tss.h"
				24	#include "kvm_cache_regs.h"
				25	#include "x86.h"
				26	#include "cpuid.h"
				27	#include "pmu.h"
				28	#include "hyperv.h"
				29
				30	#include <linux/clocksource.h>
				31	#include <linux/interrupt.h>
				32	#include <linux/kvm.h>
				33	#include <linux/fs.h>
				34	#include <linux/vmalloc.h>
				35	#include <linux/export.h>
				36	#include <linux/moduleparam.h>
				37	#include <linux/mman.h>
				38	#include <linux/highmem.h>
				39	#include <linux/iommu.h>
				40	#include <linux/intel-iommu.h>
				41	#include <linux/cpufreq.h>
				42	#include <linux/user-return-notifier.h>
				43	#include <linux/srcu.h>
				44	#include <linux/slab.h>
				45	#include <linux/perf_event.h>
				46	#include <linux/uaccess.h>
				47	#include <linux/hash.h>
				48	#include <linux/pci.h>
				49	#include <linux/timekeeper_internal.h>
				50	#include <linux/pvclock_gtod.h>
				51	#include <linux/kvm_irqfd.h>
				52	#include <linux/irqbypass.h>
				53	#include <linux/sched/stat.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	54	#include <linux/sched/isolation.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	55	#include <linux/mem_encrypt.h>
				56
				57	#include <trace/events/kvm.h>
				58
				59	#include <asm/debugreg.h>
				60	#include <asm/msr.h>
				61	#include <asm/desc.h>
				62	#include <asm/mce.h>
				63	#include <linux/kernel_stat.h>
				64	#include <asm/fpu/internal.h> /* Ugh! */
				65	#include <asm/pvclock.h>
				66	#include <asm/div64.h>
				67	#include <asm/irq_remapping.h>
				68	#include <asm/mshyperv.h>
				69	#include <asm/hypervisor.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	70	#include <asm/intel_pt.h>
				71	#include <clocksource/hyperv_timer.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	72
				73	#define CREATE_TRACE_POINTS
				74	#include "trace.h"
				75
				76	#define MAX_IO_MSRS 256
				77	#define KVM_MAX_MCE_BANKS 32
				78	u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P \| MCG_SER_P;
				79	EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
				80
				81	#define emul_to_vcpu(ctxt) \
				82	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
				83
				84	/* EFER defaults:
				85	* - enable syscall per default because its emulated by KVM
				86	* - enable LME and LMA per default on 64 bit KVM
				87	*/
				88	#ifdef CONFIG_X86_64
				89	static
				90	u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE \| EFER_LME \| EFER_LMA));
				91	#else
				92	static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
				93	#endif
				94
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	95	static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
				96
/*
 * Initializer helpers for stats table entries: each expands to the offset of
 * stat.x inside struct kvm / struct kvm_vcpu, the matching KVM_STAT_* kind,
 * and any extra designated initializers supplied by the caller.
 */
#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
				102
				103	static void update_cr8_intercept(struct kvm_vcpu *vcpu);
				104	static void process_nmi(struct kvm_vcpu *vcpu);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	105	static void process_smi(struct kvm_vcpu *vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	106	static void enter_smm(struct kvm_vcpu *vcpu);
				107	static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
				108	static void store_regs(struct kvm_vcpu *vcpu);
				109	static int sync_regs(struct kvm_vcpu *vcpu);
				110
				111	struct kvm_x86_ops *kvm_x86_ops __read_mostly;
				112	EXPORT_SYMBOL_GPL(kvm_x86_ops);
				113
				114	static bool __read_mostly ignore_msrs = 0;
				115	module_param(ignore_msrs, bool, S_IRUGO \| S_IWUSR);
				116
				117	static bool __read_mostly report_ignored_msrs = true;
				118	module_param(report_ignored_msrs, bool, S_IRUGO \| S_IWUSR);
				119
				120	unsigned int min_timer_period_us = 200;
				121	module_param(min_timer_period_us, uint, S_IRUGO \| S_IWUSR);
				122
				123	static bool __read_mostly kvmclock_periodic_sync = true;
				124	module_param(kvmclock_periodic_sync, bool, S_IRUGO);
				125
				126	bool __read_mostly kvm_has_tsc_control;
				127	EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
				128	u32 __read_mostly kvm_max_guest_tsc_khz;
				129	EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
				130	u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
				131	EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
				132	u64 __read_mostly kvm_max_tsc_scaling_ratio;
				133	EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
				134	u64 __read_mostly kvm_default_tsc_scaling_ratio;
				135	EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
				136
				137	/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
				138	static u32 __read_mostly tsc_tolerance_ppm = 250;
				139	module_param(tsc_tolerance_ppm, uint, S_IRUGO \| S_IWUSR);
				140
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	141	/*
				142	* lapic timer advance (tscdeadline mode only) in nanoseconds. '-1' enables
				143	* adaptive tuning starting from default advancment of 1000ns. '0' disables
				144	* advancement entirely. Any other value is used as-is and disables adaptive
				145	* tuning, i.e. allows priveleged userspace to set an exact advancement time.
				146	*/
				147	static int __read_mostly lapic_timer_advance_ns = -1;
				148	module_param(lapic_timer_advance_ns, int, S_IRUGO \| S_IWUSR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	149
				150	static bool __read_mostly vector_hashing = true;
				151	module_param(vector_hashing, bool, S_IRUGO);
				152
				153	bool __read_mostly enable_vmware_backdoor = false;
				154	module_param(enable_vmware_backdoor, bool, S_IRUGO);
				155	EXPORT_SYMBOL_GPL(enable_vmware_backdoor);
				156
				157	static bool __read_mostly force_emulation_prefix = false;
				158	module_param(force_emulation_prefix, bool, S_IRUGO);
				159
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	160	int __read_mostly pi_inject_timer = -1;
				161	module_param(pi_inject_timer, bint, S_IRUGO \| S_IWUSR);
				162
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	163	#define KVM_NR_SHARED_MSRS 16
				164
				165	struct kvm_shared_msrs_global {
				166	int nr;
				167	u32 msrs[KVM_NR_SHARED_MSRS];
				168	};
				169
				170	struct kvm_shared_msrs {
				171	struct user_return_notifier urn;
				172	bool registered;
				173	struct kvm_shared_msr_values {
				174	u64 host;
				175	u64 curr;
				176	} values[KVM_NR_SHARED_MSRS];
				177	};
				178
				179	static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
				180	static struct kvm_shared_msrs __percpu *shared_msrs;
				181
				182	struct kvm_stats_debugfs_item debugfs_entries[] = {
				183	{ "pf_fixed", VCPU_STAT(pf_fixed) },
				184	{ "pf_guest", VCPU_STAT(pf_guest) },
				185	{ "tlb_flush", VCPU_STAT(tlb_flush) },
				186	{ "invlpg", VCPU_STAT(invlpg) },
				187	{ "exits", VCPU_STAT(exits) },
				188	{ "io_exits", VCPU_STAT(io_exits) },
				189	{ "mmio_exits", VCPU_STAT(mmio_exits) },
				190	{ "signal_exits", VCPU_STAT(signal_exits) },
				191	{ "irq_window", VCPU_STAT(irq_window_exits) },
				192	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
				193	{ "halt_exits", VCPU_STAT(halt_exits) },
				194	{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
				195	{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
				196	{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
				197	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
				198	{ "hypercalls", VCPU_STAT(hypercalls) },
				199	{ "request_irq", VCPU_STAT(request_irq_exits) },
				200	{ "irq_exits", VCPU_STAT(irq_exits) },
				201	{ "host_state_reload", VCPU_STAT(host_state_reload) },
				202	{ "fpu_reload", VCPU_STAT(fpu_reload) },
				203	{ "insn_emulation", VCPU_STAT(insn_emulation) },
				204	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
				205	{ "irq_injections", VCPU_STAT(irq_injections) },
				206	{ "nmi_injections", VCPU_STAT(nmi_injections) },
				207	{ "req_event", VCPU_STAT(req_event) },
				208	{ "l1d_flush", VCPU_STAT(l1d_flush) },
				209	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
				210	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	211	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
				212	{ "mmu_flooded", VM_STAT(mmu_flooded) },
				213	{ "mmu_recycled", VM_STAT(mmu_recycled) },
				214	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
				215	{ "mmu_unsync", VM_STAT(mmu_unsync) },
				216	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	217	{ "largepages", VM_STAT(lpages, .mode = 0444) },
				218	{ "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	219	{ "max_mmu_page_hash_collisions",
				220	VM_STAT(max_mmu_page_hash_collisions) },
				221	{ NULL }
				222	};
				223
				224	u64 __read_mostly host_xcr0;
				225
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	226	struct kmem_cache *x86_fpu_cache;
				227	EXPORT_SYMBOL_GPL(x86_fpu_cache);
				228
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	229	static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
				230
				231	static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
				232	{
				233	int i;
				234	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
				235	vcpu->arch.apf.gfns[i] = ~0;
				236	}
				237
				238	static void kvm_on_user_return(struct user_return_notifier *urn)
				239	{
				240	unsigned slot;
				241	struct kvm_shared_msrs *locals
				242	= container_of(urn, struct kvm_shared_msrs, urn);
				243	struct kvm_shared_msr_values *values;
				244	unsigned long flags;
				245
				246	/*
				247	* Disabling irqs at this point since the following code could be
				248	* interrupted and executed through kvm_arch_hardware_disable()
				249	*/
				250	local_irq_save(flags);
				251	if (locals->registered) {
				252	locals->registered = false;
				253	user_return_notifier_unregister(urn);
				254	}
				255	local_irq_restore(flags);
				256	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
				257	values = &locals->values[slot];
				258	if (values->host != values->curr) {
				259	wrmsrl(shared_msrs_global.msrs[slot], values->host);
				260	values->curr = values->host;
				261	}
				262	}
				263	}
				264
				265	static void shared_msr_update(unsigned slot, u32 msr)
				266	{
				267	u64 value;
				268	unsigned int cpu = smp_processor_id();
				269	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				270
				271	/* only read, and nobody should modify it at this time,
				272	* so don't need lock */
				273	if (slot >= shared_msrs_global.nr) {
				274	printk(KERN_ERR "kvm: invalid MSR slot!");
				275	return;
				276	}
				277	rdmsrl_safe(msr, &value);
				278	smsr->values[slot].host = value;
				279	smsr->values[slot].curr = value;
				280	}
				281
				282	void kvm_define_shared_msr(unsigned slot, u32 msr)
				283	{
				284	BUG_ON(slot >= KVM_NR_SHARED_MSRS);
				285	shared_msrs_global.msrs[slot] = msr;
				286	if (slot >= shared_msrs_global.nr)
				287	shared_msrs_global.nr = slot + 1;
				288	}
				289	EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
				290
				291	static void kvm_shared_msr_cpu_online(void)
				292	{
				293	unsigned i;
				294
				295	for (i = 0; i < shared_msrs_global.nr; ++i)
				296	shared_msr_update(i, shared_msrs_global.msrs[i]);
				297	}
				298
				299	int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
				300	{
				301	unsigned int cpu = smp_processor_id();
				302	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				303	int err;
				304
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	305	value = (value & mask) \| (smsr->values[slot].host & ~mask);
				306	if (value == smsr->values[slot].curr)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	307	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	308	err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
				309	if (err)
				310	return 1;
				311
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	312	smsr->values[slot].curr = value;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	313	if (!smsr->registered) {
				314	smsr->urn.on_user_return = kvm_on_user_return;
				315	user_return_notifier_register(&smsr->urn);
				316	smsr->registered = true;
				317	}
				318	return 0;
				319	}
				320	EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
				321
				322	static void drop_user_return_notifiers(void)
				323	{
				324	unsigned int cpu = smp_processor_id();
				325	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
				326
				327	if (smsr->registered)
				328	kvm_on_user_return(&smsr->urn);
				329	}
				330
				331	u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
				332	{
				333	return vcpu->arch.apic_base;
				334	}
				335	EXPORT_SYMBOL_GPL(kvm_get_apic_base);
				336
				337	enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
				338	{
				339	return kvm_apic_mode(kvm_get_apic_base(vcpu));
				340	}
				341	EXPORT_SYMBOL_GPL(kvm_get_apic_mode);
				342
				343	int kvm_set_apic_base(struct kvm_vcpu vcpu, struct msr_data msr_info)
				344	{
				345	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
				346	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
				347	u64 reserved_bits = ((~0ULL) << cpuid_maxphyaddr(vcpu)) \| 0x2ff \|
				348	(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);
				349
				350	if ((msr_info->data & reserved_bits) != 0 \|\| new_mode == LAPIC_MODE_INVALID)
				351	return 1;
				352	if (!msr_info->host_initiated) {
				353	if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
				354	return 1;
				355	if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
				356	return 1;
				357	}
				358
				359	kvm_lapic_set_base(vcpu, msr_info->data);
				360	return 0;
				361	}
				362	EXPORT_SYMBOL_GPL(kvm_set_apic_base);
				363
				364	asmlinkage __visible void kvm_spurious_fault(void)
				365	{
				366	/* Fault while not rebooting. We want the trace. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	367	BUG_ON(!kvm_rebooting);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	368	}
				369	EXPORT_SYMBOL_GPL(kvm_spurious_fault);
				370
				371	#define EXCPT_BENIGN 0
				372	#define EXCPT_CONTRIBUTORY 1
				373	#define EXCPT_PF 2
				374
				375	static int exception_class(int vector)
				376	{
				377	switch (vector) {
				378	case PF_VECTOR:
				379	return EXCPT_PF;
				380	case DE_VECTOR:
				381	case TS_VECTOR:
				382	case NP_VECTOR:
				383	case SS_VECTOR:
				384	case GP_VECTOR:
				385	return EXCPT_CONTRIBUTORY;
				386	default:
				387	break;
				388	}
				389	return EXCPT_BENIGN;
				390	}
				391
				392	#define EXCPT_FAULT 0
				393	#define EXCPT_TRAP 1
				394	#define EXCPT_ABORT 2
				395	#define EXCPT_INTERRUPT 3
				396
				397	static int exception_type(int vector)
				398	{
				399	unsigned int mask;
				400
				401	if (WARN_ON(vector > 31 \|\| vector == NMI_VECTOR))
				402	return EXCPT_INTERRUPT;
				403
				404	mask = 1 << vector;
				405
				406	/* #DB is trap, as instruction watchpoints are handled elsewhere */
				407	if (mask & ((1 << DB_VECTOR) \| (1 << BP_VECTOR) \| (1 << OF_VECTOR)))
				408	return EXCPT_TRAP;
				409
				410	if (mask & ((1 << DF_VECTOR) \| (1 << MC_VECTOR)))
				411	return EXCPT_ABORT;
				412
				413	/* Reserved exceptions will result in fault */
				414	return EXCPT_FAULT;
				415	}
				416
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	417	void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu)
				418	{
				419	unsigned nr = vcpu->arch.exception.nr;
				420	bool has_payload = vcpu->arch.exception.has_payload;
				421	unsigned long payload = vcpu->arch.exception.payload;
				422
				423	if (!has_payload)
				424	return;
				425
				426	switch (nr) {
				427	case DB_VECTOR:
				428	/*
				429	* "Certain debug exceptions may clear bit 0-3. The
				430	* remaining contents of the DR6 register are never
				431	* cleared by the processor".
				432	*/
				433	vcpu->arch.dr6 &= ~DR_TRAP_BITS;
				434	/*
				435	* DR6.RTM is set by all #DB exceptions that don't clear it.
				436	*/
				437	vcpu->arch.dr6 \|= DR6_RTM;
				438	vcpu->arch.dr6 \|= payload;
				439	/*
				440	* Bit 16 should be set in the payload whenever the #DB
				441	* exception should clear DR6.RTM. This makes the payload
				442	* compatible with the pending debug exceptions under VMX.
				443	* Though not currently documented in the SDM, this also
				444	* makes the payload compatible with the exit qualification
				445	* for #DB exceptions under VMX.
				446	*/
				447	vcpu->arch.dr6 ^= payload & DR6_RTM;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	448
				449	/*
				450	* The #DB payload is defined as compatible with the 'pending
				451	* debug exceptions' field under VMX, not DR6. While bit 12 is
				452	* defined in the 'pending debug exceptions' field (enabled
				453	* breakpoint), it is reserved and must be zero in DR6.
				454	*/
				455	vcpu->arch.dr6 &= ~BIT(12);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	456	break;
				457	case PF_VECTOR:
				458	vcpu->arch.cr2 = payload;
				459	break;
				460	}
				461
				462	vcpu->arch.exception.has_payload = false;
				463	vcpu->arch.exception.payload = 0;
				464	}
				465	EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);
				466
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	467	static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
				468	unsigned nr, bool has_error, u32 error_code,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	469	bool has_payload, unsigned long payload, bool reinject)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	470	{
				471	u32 prev_nr;
				472	int class1, class2;
				473
				474	kvm_make_request(KVM_REQ_EVENT, vcpu);
				475
				476	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
				477	queue:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	478	if (reinject) {
				479	/*
				480	* On vmentry, vcpu->arch.exception.pending is only
				481	* true if an event injection was blocked by
				482	* nested_run_pending. In that case, however,
				483	* vcpu_enter_guest requests an immediate exit,
				484	* and the guest shouldn't proceed far enough to
				485	* need reinjection.
				486	*/
				487	WARN_ON_ONCE(vcpu->arch.exception.pending);
				488	vcpu->arch.exception.injected = true;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	489	if (WARN_ON_ONCE(has_payload)) {
				490	/*
				491	* A reinjected event has already
				492	* delivered its payload.
				493	*/
				494	has_payload = false;
				495	payload = 0;
				496	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	497	} else {
				498	vcpu->arch.exception.pending = true;
				499	vcpu->arch.exception.injected = false;
				500	}
				501	vcpu->arch.exception.has_error_code = has_error;
				502	vcpu->arch.exception.nr = nr;
				503	vcpu->arch.exception.error_code = error_code;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	504	vcpu->arch.exception.has_payload = has_payload;
				505	vcpu->arch.exception.payload = payload;
				506	/*
				507	* In guest mode, payload delivery should be deferred,
				508	* so that the L1 hypervisor can intercept #PF before
				509	* CR2 is modified (or intercept #DB before DR6 is
				510	* modified under nVMX). However, for ABI
				511	* compatibility with KVM_GET_VCPU_EVENTS and
				512	* KVM_SET_VCPU_EVENTS, we can't delay payload
				513	* delivery unless userspace has enabled this
				514	* functionality via the per-VM capability,
				515	* KVM_CAP_EXCEPTION_PAYLOAD.
				516	*/
				517	if (!vcpu->kvm->arch.exception_payload_enabled \|\|
				518	!is_guest_mode(vcpu))
				519	kvm_deliver_exception_payload(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	520	return;
				521	}
				522
				523	/* to check exception */
				524	prev_nr = vcpu->arch.exception.nr;
				525	if (prev_nr == DF_VECTOR) {
				526	/* triple fault -> shutdown */
				527	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				528	return;
				529	}
				530	class1 = exception_class(prev_nr);
				531	class2 = exception_class(nr);
				532	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
				533	\|\| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
				534	/*
				535	* Generate double fault per SDM Table 5-5. Set
				536	* exception.pending = true so that the double fault
				537	* can trigger a nested vmexit.
				538	*/
				539	vcpu->arch.exception.pending = true;
				540	vcpu->arch.exception.injected = false;
				541	vcpu->arch.exception.has_error_code = true;
				542	vcpu->arch.exception.nr = DF_VECTOR;
				543	vcpu->arch.exception.error_code = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	544	vcpu->arch.exception.has_payload = false;
				545	vcpu->arch.exception.payload = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	546	} else
				547	/* replace previous exception with a new one in a hope
				548	that instruction re-execution will regenerate lost
				549	exception */
				550	goto queue;
				551	}
				552
				553	void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
				554	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	555	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	556	}
				557	EXPORT_SYMBOL_GPL(kvm_queue_exception);
				558
				559	void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
				560	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	561	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	562	}
				563	EXPORT_SYMBOL_GPL(kvm_requeue_exception);
				564
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	565	static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
				566	unsigned long payload)
				567	{
				568	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
				569	}
				570
				571	static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
				572	u32 error_code, unsigned long payload)
				573	{
				574	kvm_multiple_exception(vcpu, nr, true, error_code,
				575	true, payload, false);
				576	}
				577
/*
 * Finish an instruction: on success (@err == 0) skip it and return the
 * result of kvm_skip_emulated_instruction(); on failure inject #GP(0) and
 * return 1.
 */
int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (!err)
		return kvm_skip_emulated_instruction(vcpu);

	kvm_inject_gp(vcpu, 0);
	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
				588
				589	void kvm_inject_page_fault(struct kvm_vcpu vcpu, struct x86_exception fault)
				590	{
				591	++vcpu->stat.pf_guest;
				592	vcpu->arch.exception.nested_apf =
				593	is_guest_mode(vcpu) && fault->async_page_fault;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	594	if (vcpu->arch.exception.nested_apf) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	595	vcpu->arch.apf.nested_apf_token = fault->address;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	596	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
				597	} else {
				598	kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
				599	fault->address);
				600	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	601	}
				602	EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
				603
				604	static bool kvm_propagate_fault(struct kvm_vcpu vcpu, struct x86_exception fault)
				605	{
				606	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
				607	vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
				608	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	609	vcpu->arch.mmu->inject_page_fault(vcpu, fault);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	610
				611	return fault->nested_page_fault;
				612	}
				613
				614	void kvm_inject_nmi(struct kvm_vcpu *vcpu)
				615	{
				616	atomic_inc(&vcpu->arch.nmi_queued);
				617	kvm_make_request(KVM_REQ_NMI, vcpu);
				618	}
				619	EXPORT_SYMBOL_GPL(kvm_inject_nmi);
				620
				621	void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
				622	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	623	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	624	}
				625	EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
				626
				627	void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
				628	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	629	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	630	}
				631	EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);
				632
				633	/*
				634	* Checks if cpl <= required_cpl; if true, return true. Otherwise queue
				635	* a #GP and return false.
				636	*/
				637	bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
				638	{
				639	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
				640	return true;
				641	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
				642	return false;
				643	}
				644	EXPORT_SYMBOL_GPL(kvm_require_cpl);
				645
				646	bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
				647	{
				648	if ((dr != 4 && dr != 5) \|\| !kvm_read_cr4_bits(vcpu, X86_CR4_DE))
				649	return true;
				650
				651	kvm_queue_exception(vcpu, UD_VECTOR);
				652	return false;
				653	}
				654	EXPORT_SYMBOL_GPL(kvm_require_dr);
				655
				656	/*
				657	* This function will be used to read from the physical memory of the currently
				658	* running guest. The difference to kvm_vcpu_read_guest_page is that this function
				659	* can read from guest physical or from the guest's guest physical memory.
				660	*/
				661	int kvm_read_guest_page_mmu(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				662	gfn_t ngfn, void *data, int offset, int len,
				663	u32 access)
				664	{
				665	struct x86_exception exception;
				666	gfn_t real_gfn;
				667	gpa_t ngpa;
				668
				669	ngpa = gfn_to_gpa(ngfn);
				670	real_gfn = mmu->translate_gpa(vcpu, ngpa, access, &exception);
				671	if (real_gfn == UNMAPPED_GVA)
				672	return -EFAULT;
				673
				674	real_gfn = gpa_to_gfn(real_gfn);
				675
				676	return kvm_vcpu_read_guest_page(vcpu, real_gfn, data, offset, len);
				677	}
				678	EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
				679
				680	static int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
				681	void *data, int offset, int len, u32 access)
				682	{
				683	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				684	data, offset, len, access);
				685	}
				686
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	687	static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
				688	{
				689	return rsvd_bits(cpuid_maxphyaddr(vcpu), 63) \| rsvd_bits(5, 8) \|
				690	rsvd_bits(1, 2);
				691	}
				692
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	693	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	694	* Load the pae pdptrs. Return 1 if they are all valid, 0 otherwise.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	695	*/
				696	int load_pdptrs(struct kvm_vcpu vcpu, struct kvm_mmu mmu, unsigned long cr3)
				697	{
				698	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
				699	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
				700	int i;
				701	int ret;
				702	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
				703
				704	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				705	offset * sizeof(u64), sizeof(pdpte),
				706	PFERR_USER_MASK\|PFERR_WRITE_MASK);
				707	if (ret < 0) {
				708	ret = 0;
				709	goto out;
				710	}
				711	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
				712	if ((pdpte[i] & PT_PRESENT_MASK) &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	713	(pdpte[i] & pdptr_rsvd_bits(vcpu))) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	714	ret = 0;
				715	goto out;
				716	}
				717	}
				718	ret = 1;
				719
				720	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
				721	__set_bit(VCPU_EXREG_PDPTR,
				722	(unsigned long *)&vcpu->arch.regs_avail);
				723	__set_bit(VCPU_EXREG_PDPTR,
				724	(unsigned long *)&vcpu->arch.regs_dirty);
				725	out:
				726
				727	return ret;
				728	}
				729	EXPORT_SYMBOL_GPL(load_pdptrs);
				730
				731	bool pdptrs_changed(struct kvm_vcpu *vcpu)
				732	{
				733	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
				734	bool changed = true;
				735	int offset;
				736	gfn_t gfn;
				737	int r;
				738
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	739	if (!is_pae_paging(vcpu))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	740	return false;
				741
				742	if (!test_bit(VCPU_EXREG_PDPTR,
				743	(unsigned long *)&vcpu->arch.regs_avail))
				744	return true;
				745
				746	gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
				747	offset = (kvm_read_cr3(vcpu) & 0xffffffe0ul) & (PAGE_SIZE - 1);
				748	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				749	PFERR_USER_MASK \| PFERR_WRITE_MASK);
				750	if (r < 0)
				751	goto out;
				752	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
				753	out:
				754
				755	return changed;
				756	}
				757	EXPORT_SYMBOL_GPL(pdptrs_changed);
				758
				759	int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
				760	{
				761	unsigned long old_cr0 = kvm_read_cr0(vcpu);
				762	unsigned long update_bits = X86_CR0_PG \| X86_CR0_WP;
				763
				764	cr0 \|= X86_CR0_ET;
				765
				766	#ifdef CONFIG_X86_64
				767	if (cr0 & 0xffffffff00000000UL)
				768	return 1;
				769	#endif
				770
				771	cr0 &= ~CR0_RESERVED_BITS;
				772
				773	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
				774	return 1;
				775
				776	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
				777	return 1;
				778
				779	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
				780	#ifdef CONFIG_X86_64
				781	if ((vcpu->arch.efer & EFER_LME)) {
				782	int cs_db, cs_l;
				783
				784	if (!is_pae(vcpu))
				785	return 1;
				786	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
				787	if (cs_l)
				788	return 1;
				789	} else
				790	#endif
				791	if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				792	kvm_read_cr3(vcpu)))
				793	return 1;
				794	}
				795
				796	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
				797	return 1;
				798
				799	kvm_x86_ops->set_cr0(vcpu, cr0);
				800
				801	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
				802	kvm_clear_async_pf_completion_queue(vcpu);
				803	kvm_async_pf_hash_reset(vcpu);
				804	}
				805
				806	if ((cr0 ^ old_cr0) & update_bits)
				807	kvm_mmu_reset_context(vcpu);
				808
				809	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
				810	kvm_arch_has_noncoherent_dma(vcpu->kvm) &&
				811	!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
				812	kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
				813
				814	return 0;
				815	}
				816	EXPORT_SYMBOL_GPL(kvm_set_cr0);
				817
				818	void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
				819	{
				820	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) \| (msw & 0x0f));
				821	}
				822	EXPORT_SYMBOL_GPL(kvm_lmsw);
				823
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	824	void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	825	{
				826	if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
				827	!vcpu->guest_xcr0_loaded) {
				828	/* kvm_set_xcr() also depends on this */
				829	if (vcpu->arch.xcr0 != host_xcr0)
				830	xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
				831	vcpu->guest_xcr0_loaded = 1;
				832	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	833
				834	if (static_cpu_has(X86_FEATURE_PKU) &&
				835	(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) \|\|
				836	(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
				837	vcpu->arch.pkru != vcpu->arch.host_pkru)
				838	__write_pkru(vcpu->arch.pkru);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	839	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	840	EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	841
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	842	void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	843	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	844	if (static_cpu_has(X86_FEATURE_PKU) &&
				845	(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) \|\|
				846	(vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
				847	vcpu->arch.pkru = rdpkru();
				848	if (vcpu->arch.pkru != vcpu->arch.host_pkru)
				849	__write_pkru(vcpu->arch.host_pkru);
				850	}
				851
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	852	if (vcpu->guest_xcr0_loaded) {
				853	if (vcpu->arch.xcr0 != host_xcr0)
				854	xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
				855	vcpu->guest_xcr0_loaded = 0;
				856	}
				857	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	858	EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	859
				860	static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
				861	{
				862	u64 xcr0 = xcr;
				863	u64 old_xcr0 = vcpu->arch.xcr0;
				864	u64 valid_bits;
				865
				866	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
				867	if (index != XCR_XFEATURE_ENABLED_MASK)
				868	return 1;
				869	if (!(xcr0 & XFEATURE_MASK_FP))
				870	return 1;
				871	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
				872	return 1;
				873
				874	/*
				875	* Do not allow the guest to set bits that we do not support
				876	* saving. However, xcr0 bit 0 is always set, even if the
				877	* emulated CPU does not support XSAVE (see fx_init).
				878	*/
				879	valid_bits = vcpu->arch.guest_supported_xcr0 \| XFEATURE_MASK_FP;
				880	if (xcr0 & ~valid_bits)
				881	return 1;
				882
				883	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
				884	(!(xcr0 & XFEATURE_MASK_BNDCSR)))
				885	return 1;
				886
				887	if (xcr0 & XFEATURE_MASK_AVX512) {
				888	if (!(xcr0 & XFEATURE_MASK_YMM))
				889	return 1;
				890	if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
				891	return 1;
				892	}
				893	vcpu->arch.xcr0 = xcr0;
				894
				895	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
				896	kvm_update_cpuid(vcpu);
				897	return 0;
				898	}
				899
				900	int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
				901	{
				902	if (kvm_x86_ops->get_cpl(vcpu) != 0 \|\|
				903	__kvm_set_xcr(vcpu, index, xcr)) {
				904	kvm_inject_gp(vcpu, 0);
				905	return 1;
				906	}
				907	return 0;
				908	}
				909	EXPORT_SYMBOL_GPL(kvm_set_xcr);
				910
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	911	static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
				912	{
				913	u64 reserved_bits = CR4_RESERVED_BITS;
				914
				915	if (!cpu_has(c, X86_FEATURE_XSAVE))
				916	reserved_bits \|= X86_CR4_OSXSAVE;
				917
				918	if (!cpu_has(c, X86_FEATURE_SMEP))
				919	reserved_bits \|= X86_CR4_SMEP;
				920
				921	if (!cpu_has(c, X86_FEATURE_SMAP))
				922	reserved_bits \|= X86_CR4_SMAP;
				923
				924	if (!cpu_has(c, X86_FEATURE_FSGSBASE))
				925	reserved_bits \|= X86_CR4_FSGSBASE;
				926
				927	if (!cpu_has(c, X86_FEATURE_PKU))
				928	reserved_bits \|= X86_CR4_PKE;
				929
				930	if (!cpu_has(c, X86_FEATURE_LA57) &&
				931	!(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
				932	reserved_bits \|= X86_CR4_LA57;
				933
				934	if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
				935	reserved_bits \|= X86_CR4_UMIP;
				936
				937	return reserved_bits;
				938	}
				939
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	940	static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
				941	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	942	if (cr4 & cr4_reserved_bits)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	943	return -EINVAL;
				944
				945	if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
				946	return -EINVAL;
				947
				948	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
				949	return -EINVAL;
				950
				951	if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
				952	return -EINVAL;
				953
				954	if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
				955	return -EINVAL;
				956
				957	if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
				958	return -EINVAL;
				959
				960	if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
				961	return -EINVAL;
				962
				963	if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
				964	return -EINVAL;
				965
				966	return 0;
				967	}
				968
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	969	int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
				970	{
				971	unsigned long old_cr4 = kvm_read_cr4(vcpu);
				972	unsigned long pdptr_bits = X86_CR4_PGE \| X86_CR4_PSE \| X86_CR4_PAE \|
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	973	X86_CR4_SMEP;
				974	unsigned long mmu_role_bits = pdptr_bits \| X86_CR4_SMAP \| X86_CR4_PKE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	975
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	976	if (kvm_valid_cr4(vcpu, cr4))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	977	return 1;
				978
				979	if (is_long_mode(vcpu)) {
				980	if (!(cr4 & X86_CR4_PAE))
				981	return 1;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	982	if ((cr4 ^ old_cr4) & X86_CR4_LA57)
				983	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	984	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
				985	&& ((cr4 ^ old_cr4) & pdptr_bits)
				986	&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				987	kvm_read_cr3(vcpu)))
				988	return 1;
				989
				990	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
				991	if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
				992	return 1;
				993
				994	/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
				995	if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) \|\| !is_long_mode(vcpu))
				996	return 1;
				997	}
				998
				999	if (kvm_x86_ops->set_cr4(vcpu, cr4))
				1000	return 1;
				1001
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1002	if (((cr4 ^ old_cr4) & mmu_role_bits) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1003	(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
				1004	kvm_mmu_reset_context(vcpu);
				1005
				1006	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE \| X86_CR4_PKE))
				1007	kvm_update_cpuid(vcpu);
				1008
				1009	return 0;
				1010	}
				1011	EXPORT_SYMBOL_GPL(kvm_set_cr4);
				1012
				1013	int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
				1014	{
				1015	bool skip_tlb_flush = false;
				1016	#ifdef CONFIG_X86_64
				1017	bool pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
				1018
				1019	if (pcid_enabled) {
				1020	skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
				1021	cr3 &= ~X86_CR3_PCID_NOFLUSH;
				1022	}
				1023	#endif
				1024
				1025	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
				1026	if (!skip_tlb_flush) {
				1027	kvm_mmu_sync_roots(vcpu);
				1028	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				1029	}
				1030	return 0;
				1031	}
				1032
				1033	if (is_long_mode(vcpu) &&
				1034	(cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
				1035	return 1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1036	else if (is_pae_paging(vcpu) &&
				1037	!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1038	return 1;
				1039
				1040	kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
				1041	vcpu->arch.cr3 = cr3;
				1042	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
				1043
				1044	return 0;
				1045	}
				1046	EXPORT_SYMBOL_GPL(kvm_set_cr3);
				1047
				1048	int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
				1049	{
				1050	if (cr8 & CR8_RESERVED_BITS)
				1051	return 1;
				1052	if (lapic_in_kernel(vcpu))
				1053	kvm_lapic_set_tpr(vcpu, cr8);
				1054	else
				1055	vcpu->arch.cr8 = cr8;
				1056	return 0;
				1057	}
				1058	EXPORT_SYMBOL_GPL(kvm_set_cr8);
				1059
				1060	unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
				1061	{
				1062	if (lapic_in_kernel(vcpu))
				1063	return kvm_lapic_get_cr8(vcpu);
				1064	else
				1065	return vcpu->arch.cr8;
				1066	}
				1067	EXPORT_SYMBOL_GPL(kvm_get_cr8);
				1068
				1069	static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
				1070	{
				1071	int i;
				1072
				1073	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
				1074	for (i = 0; i < KVM_NR_DB_REGS; i++)
				1075	vcpu->arch.eff_db[i] = vcpu->arch.db[i];
				1076	vcpu->arch.switch_db_regs \|= KVM_DEBUGREG_RELOAD;
				1077	}
				1078	}
				1079
				1080	static void kvm_update_dr6(struct kvm_vcpu *vcpu)
				1081	{
				1082	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
				1083	kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
				1084	}
				1085
				1086	static void kvm_update_dr7(struct kvm_vcpu *vcpu)
				1087	{
				1088	unsigned long dr7;
				1089
				1090	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
				1091	dr7 = vcpu->arch.guest_debug_dr7;
				1092	else
				1093	dr7 = vcpu->arch.dr7;
				1094	kvm_x86_ops->set_dr7(vcpu, dr7);
				1095	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
				1096	if (dr7 & DR7_BP_EN_MASK)
				1097	vcpu->arch.switch_db_regs \|= KVM_DEBUGREG_BP_ENABLED;
				1098	}
				1099
				1100	static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
				1101	{
				1102	u64 fixed = DR6_FIXED_1;
				1103
				1104	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
				1105	fixed \|= DR6_RTM;
				1106	return fixed;
				1107	}
				1108
				1109	static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
				1110	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1111	size_t size = ARRAY_SIZE(vcpu->arch.db);
				1112
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1113	switch (dr) {
				1114	case 0 ... 3:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1115	vcpu->arch.db[array_index_nospec(dr, size)] = val;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1116	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
				1117	vcpu->arch.eff_db[dr] = val;
				1118	break;
				1119	case 4:
				1120	/* fall through */
				1121	case 6:
				1122	if (val & 0xffffffff00000000ULL)
				1123	return -1; /* #GP */
				1124	vcpu->arch.dr6 = (val & DR6_VOLATILE) \| kvm_dr6_fixed(vcpu);
				1125	kvm_update_dr6(vcpu);
				1126	break;
				1127	case 5:
				1128	/* fall through */
				1129	default: /* 7 */
				1130	if (val & 0xffffffff00000000ULL)
				1131	return -1; /* #GP */
				1132	vcpu->arch.dr7 = (val & DR7_VOLATILE) \| DR7_FIXED_1;
				1133	kvm_update_dr7(vcpu);
				1134	break;
				1135	}
				1136
				1137	return 0;
				1138	}
				1139
				1140	int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
				1141	{
				1142	if (__kvm_set_dr(vcpu, dr, val)) {
				1143	kvm_inject_gp(vcpu, 0);
				1144	return 1;
				1145	}
				1146	return 0;
				1147	}
				1148	EXPORT_SYMBOL_GPL(kvm_set_dr);
				1149
				1150	int kvm_get_dr(struct kvm_vcpu vcpu, int dr, unsigned long val)
				1151	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1152	size_t size = ARRAY_SIZE(vcpu->arch.db);
				1153
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1154	switch (dr) {
				1155	case 0 ... 3:
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1156	*val = vcpu->arch.db[array_index_nospec(dr, size)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1157	break;
				1158	case 4:
				1159	/* fall through */
				1160	case 6:
				1161	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
				1162	*val = vcpu->arch.dr6;
				1163	else
				1164	*val = kvm_x86_ops->get_dr6(vcpu);
				1165	break;
				1166	case 5:
				1167	/* fall through */
				1168	default: /* 7 */
				1169	*val = vcpu->arch.dr7;
				1170	break;
				1171	}
				1172	return 0;
				1173	}
				1174	EXPORT_SYMBOL_GPL(kvm_get_dr);
				1175
				1176	bool kvm_rdpmc(struct kvm_vcpu *vcpu)
				1177	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1178	u32 ecx = kvm_rcx_read(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1179	u64 data;
				1180	int err;
				1181
				1182	err = kvm_pmu_rdpmc(vcpu, ecx, &data);
				1183	if (err)
				1184	return err;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1185	kvm_rax_write(vcpu, (u32)data);
				1186	kvm_rdx_write(vcpu, data >> 32);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1187	return err;
				1188	}
				1189	EXPORT_SYMBOL_GPL(kvm_rdpmc);
				1190
				1191	/*
				1192	* List of msr numbers which we expose to userspace through KVM_GET_MSRS
				1193	* and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
				1194	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1195	* The three MSR lists(msrs_to_save, emulated_msrs, msr_based_features)
				1196	* extract the supported MSRs from the related const lists.
				1197	* msrs_to_save is selected from the msrs_to_save_all to reflect the
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1198	* capabilities of the host cpu. This capabilities test skips MSRs that are
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1199	* kvm-specific. Those are put in emulated_msrs_all; filtering of emulated_msrs
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1200	* may depend on host virtualization features rather than host cpu features.
				1201	*/
				1202
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1203	static const u32 msrs_to_save_all[] = {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1204	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
				1205	MSR_STAR,
				1206	#ifdef CONFIG_X86_64
				1207	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
				1208	#endif
				1209	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
				1210	MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1211	MSR_IA32_SPEC_CTRL,
				1212	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
				1213	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
				1214	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
				1215	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
				1216	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
				1217	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
				1218	MSR_IA32_UMWAIT_CONTROL,
				1219
				1220	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
				1221	MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
				1222	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
				1223	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
				1224	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
				1225	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
				1226	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
				1227	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
				1228	MSR_ARCH_PERFMON_PERFCTR0 + 8, MSR_ARCH_PERFMON_PERFCTR0 + 9,
				1229	MSR_ARCH_PERFMON_PERFCTR0 + 10, MSR_ARCH_PERFMON_PERFCTR0 + 11,
				1230	MSR_ARCH_PERFMON_PERFCTR0 + 12, MSR_ARCH_PERFMON_PERFCTR0 + 13,
				1231	MSR_ARCH_PERFMON_PERFCTR0 + 14, MSR_ARCH_PERFMON_PERFCTR0 + 15,
				1232	MSR_ARCH_PERFMON_PERFCTR0 + 16, MSR_ARCH_PERFMON_PERFCTR0 + 17,
				1233	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
				1234	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
				1235	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
				1236	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,
				1237	MSR_ARCH_PERFMON_EVENTSEL0 + 8, MSR_ARCH_PERFMON_EVENTSEL0 + 9,
				1238	MSR_ARCH_PERFMON_EVENTSEL0 + 10, MSR_ARCH_PERFMON_EVENTSEL0 + 11,
				1239	MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
				1240	MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
				1241	MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1242	};
				1243
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1244	static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1245	static unsigned num_msrs_to_save;
				1246
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1247	static const u32 emulated_msrs_all[] = {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1248	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
				1249	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
				1250	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
				1251	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
				1252	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
				1253	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
				1254	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
				1255	HV_X64_MSR_RESET,
				1256	HV_X64_MSR_VP_INDEX,
				1257	HV_X64_MSR_VP_RUNTIME,
				1258	HV_X64_MSR_SCONTROL,
				1259	HV_X64_MSR_STIMER0_CONFIG,
				1260	HV_X64_MSR_VP_ASSIST_PAGE,
				1261	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
				1262	HV_X64_MSR_TSC_EMULATION_STATUS,
				1263
				1264	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
				1265	MSR_KVM_PV_EOI_EN,
				1266
				1267	MSR_IA32_TSC_ADJUST,
				1268	MSR_IA32_TSCDEADLINE,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1269	MSR_IA32_ARCH_CAPABILITIES,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1270	MSR_IA32_MISC_ENABLE,
				1271	MSR_IA32_MCG_STATUS,
				1272	MSR_IA32_MCG_CTL,
				1273	MSR_IA32_MCG_EXT_CTL,
				1274	MSR_IA32_SMBASE,
				1275	MSR_SMI_COUNT,
				1276	MSR_PLATFORM_INFO,
				1277	MSR_MISC_FEATURES_ENABLES,
				1278	MSR_AMD64_VIRT_SPEC_CTRL,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1279	MSR_IA32_POWER_CTL,
				1280
				1281	/*
				1282	* The following list leaves out MSRs whose values are determined
				1283	* by arch/x86/kvm/vmx/nested.c based on CPUID or other MSRs.
				1284	* We always support the "true" VMX control MSRs, even if the host
				1285	* processor does not, so I am putting these registers here rather
				1286	* than in msrs_to_save_all.
				1287	*/
				1288	MSR_IA32_VMX_BASIC,
				1289	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
				1290	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
				1291	MSR_IA32_VMX_TRUE_EXIT_CTLS,
				1292	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
				1293	MSR_IA32_VMX_MISC,
				1294	MSR_IA32_VMX_CR0_FIXED0,
				1295	MSR_IA32_VMX_CR4_FIXED0,
				1296	MSR_IA32_VMX_VMCS_ENUM,
				1297	MSR_IA32_VMX_PROCBASED_CTLS2,
				1298	MSR_IA32_VMX_EPT_VPID_CAP,
				1299	MSR_IA32_VMX_VMFUNC,
				1300
				1301	MSR_K7_HWCR,
				1302	MSR_KVM_POLL_CONTROL,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1303	};
				1304
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1305	static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1306	static unsigned num_emulated_msrs;
				1307
				1308	/*
				1309	* List of msr numbers which are used to expose MSR-based features that
				1310	* can be used by a hypervisor to validate requested CPU features.
				1311	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1312	static const u32 msr_based_features_all[] = {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1313	MSR_IA32_VMX_BASIC,
				1314	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
				1315	MSR_IA32_VMX_PINBASED_CTLS,
				1316	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
				1317	MSR_IA32_VMX_PROCBASED_CTLS,
				1318	MSR_IA32_VMX_TRUE_EXIT_CTLS,
				1319	MSR_IA32_VMX_EXIT_CTLS,
				1320	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
				1321	MSR_IA32_VMX_ENTRY_CTLS,
				1322	MSR_IA32_VMX_MISC,
				1323	MSR_IA32_VMX_CR0_FIXED0,
				1324	MSR_IA32_VMX_CR0_FIXED1,
				1325	MSR_IA32_VMX_CR4_FIXED0,
				1326	MSR_IA32_VMX_CR4_FIXED1,
				1327	MSR_IA32_VMX_VMCS_ENUM,
				1328	MSR_IA32_VMX_PROCBASED_CTLS2,
				1329	MSR_IA32_VMX_EPT_VPID_CAP,
				1330	MSR_IA32_VMX_VMFUNC,
				1331
				1332	MSR_F10H_DECFG,
				1333	MSR_IA32_UCODE_REV,
				1334	MSR_IA32_ARCH_CAPABILITIES,
				1335	};
				1336
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1337	static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1338	static unsigned int num_msr_based_features;
				1339
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1340	static u64 kvm_get_arch_capabilities(void)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1341	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1342	u64 data = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1343
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1344	if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
				1345	rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
				1346
				1347	/*
				1348	* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
				1349	* the nested hypervisor runs with NX huge pages. If it is not,
				1350	* L1 is anyway vulnerable to ITLB_MULTIHIT explots from other
				1351	* L1 guests, so it need not worry about its own (L2) guests.
				1352	*/
				1353	data \|= ARCH_CAP_PSCHANGE_MC_NO;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1354
				1355	/*
				1356	* If we're doing cache flushes (either "always" or "cond")
				1357	* we will do one whenever the guest does a vmlaunch/vmresume.
				1358	* If an outer hypervisor is doing the cache flush for us
				1359	* (VMENTER_L1D_FLUSH_NESTED_VM), we can safely pass that
				1360	* capability to the guest too, and if EPT is disabled we're not
				1361	* vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will
				1362	* require a nested hypervisor to do a flush of its own.
				1363	*/
				1364	if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER)
				1365	data \|= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH;
				1366
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1367	if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN))
				1368	data \|= ARCH_CAP_RDCL_NO;
				1369	if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
				1370	data \|= ARCH_CAP_SSB_NO;
				1371	if (!boot_cpu_has_bug(X86_BUG_MDS))
				1372	data \|= ARCH_CAP_MDS_NO;
				1373
				1374	/*
				1375	* On TAA affected systems, export MDS_NO=0 when:
				1376	* - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
				1377	* - Updated microcode is present. This is detected by
				1378	* the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
				1379	* that VERW clears CPU buffers.
				1380	*
				1381	* When MDS_NO=0 is exported, guests deploy clear CPU buffer
				1382	* mitigation and don't complain:
				1383	*
				1384	* "Vulnerable: Clear CPU buffers attempted, no microcode"
				1385	*
				1386	* If TSX is disabled on the system, guests are also mitigated against
				1387	* TAA and clear CPU buffer mitigation is not required for guests.
				1388	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1389	if (!boot_cpu_has(X86_FEATURE_RTM))
				1390	data &= ~ARCH_CAP_TAA_NO;
				1391	else if (!boot_cpu_has_bug(X86_BUG_TAA))
				1392	data \|= ARCH_CAP_TAA_NO;
				1393	else if (data & ARCH_CAP_TSX_CTRL_MSR)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1394	data &= ~ARCH_CAP_MDS_NO;
				1395
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	1396	/* KVM does not emulate MSR_IA32_TSX_CTRL. */
				1397	data &= ~ARCH_CAP_TSX_CTRL_MSR;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1398	return data;
				1399	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1400
				1401	static int kvm_get_msr_feature(struct kvm_msr_entry *msr)
				1402	{
				1403	switch (msr->index) {
				1404	case MSR_IA32_ARCH_CAPABILITIES:
				1405	msr->data = kvm_get_arch_capabilities();
				1406	break;
				1407	case MSR_IA32_UCODE_REV:
				1408	rdmsrl_safe(msr->index, &msr->data);
				1409	break;
				1410	default:
				1411	if (kvm_x86_ops->get_msr_feature(msr))
				1412	return 1;
				1413	}
				1414	return 0;
				1415	}
				1416
				1417	static int do_get_msr_feature(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1418	{
				1419	struct kvm_msr_entry msr;
				1420	int r;
				1421
				1422	msr.index = index;
				1423	r = kvm_get_msr_feature(&msr);
				1424	if (r)
				1425	return r;
				1426
				1427	*data = msr.data;
				1428
				1429	return 0;
				1430	}
				1431
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1432	static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
				1433	{
				1434	if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT))
				1435	return false;
				1436
				1437	if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM))
				1438	return false;
				1439
				1440	if (efer & (EFER_LME \| EFER_LMA) &&
				1441	!guest_cpuid_has(vcpu, X86_FEATURE_LM))
				1442	return false;
				1443
				1444	if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX))
				1445	return false;
				1446
				1447	return true;
				1448
				1449	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1450	bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
				1451	{
				1452	if (efer & efer_reserved_bits)
				1453	return false;
				1454
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1455	return __kvm_valid_efer(vcpu, efer);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1456	}
				1457	EXPORT_SYMBOL_GPL(kvm_valid_efer);
				1458
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1459	static int set_efer(struct kvm_vcpu vcpu, struct msr_data msr_info)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1460	{
				1461	u64 old_efer = vcpu->arch.efer;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1462	u64 efer = msr_info->data;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1463
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1464	if (efer & efer_reserved_bits)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1465	return 1;
				1466
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1467	if (!msr_info->host_initiated) {
				1468	if (!__kvm_valid_efer(vcpu, efer))
				1469	return 1;
				1470
				1471	if (is_paging(vcpu) &&
				1472	(vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
				1473	return 1;
				1474	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1475
				1476	efer &= ~EFER_LMA;
				1477	efer \|= vcpu->arch.efer & EFER_LMA;
				1478
				1479	kvm_x86_ops->set_efer(vcpu, efer);
				1480
				1481	/* Update reserved bits */
				1482	if ((efer ^ old_efer) & EFER_NX)
				1483	kvm_mmu_reset_context(vcpu);
				1484
				1485	return 0;
				1486	}
				1487
				1488	void kvm_enable_efer_bits(u64 mask)
				1489	{
				1490	efer_reserved_bits &= ~mask;
				1491	}
				1492	EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
				1493
				1494	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1495	* Write @data into the MSR specified by @index. Select MSR specific fault
				1496	* checks are bypassed if @host_initiated is %true.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1497	* Returns 0 on success, non-0 otherwise.
				1498	* Assumes vcpu_load() was already called.
				1499	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1500	static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data,
				1501	bool host_initiated)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1502	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1503	struct msr_data msr;
				1504
				1505	switch (index) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1506	case MSR_FS_BASE:
				1507	case MSR_GS_BASE:
				1508	case MSR_KERNEL_GS_BASE:
				1509	case MSR_CSTAR:
				1510	case MSR_LSTAR:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1511	if (is_noncanonical_address(data, vcpu))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1512	return 1;
				1513	break;
				1514	case MSR_IA32_SYSENTER_EIP:
				1515	case MSR_IA32_SYSENTER_ESP:
				1516	/*
				1517	* IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if
				1518	* non-canonical address is written on Intel but not on
				1519	* AMD (which ignores the top 32-bits, because it does
				1520	* not implement 64-bit SYSENTER).
				1521	*
				1522	* 64-bit code should hence be able to write a non-canonical
				1523	* value on AMD. Making the address canonical ensures that
				1524	* vmentry does not fail on Intel after writing a non-canonical
				1525	* value, and that something deterministic happens if the guest
				1526	* invokes 64-bit SYSENTER.
				1527	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1528	data = get_canonical(data, vcpu_virt_addr_bits(vcpu));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1529	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1530
				1531	msr.data = data;
				1532	msr.index = index;
				1533	msr.host_initiated = host_initiated;
				1534
				1535	return kvm_x86_ops->set_msr(vcpu, &msr);
				1536	}
				1537
				1538	/*
				1539	* Read the MSR specified by @index into @data. Select MSR specific fault
				1540	* checks are bypassed if @host_initiated is %true.
				1541	* Returns 0 on success, non-0 otherwise.
				1542	* Assumes vcpu_load() was already called.
				1543	*/
				1544	static int __kvm_get_msr(struct kvm_vcpu vcpu, u32 index, u64 data,
				1545	bool host_initiated)
				1546	{
				1547	struct msr_data msr;
				1548	int ret;
				1549
				1550	msr.index = index;
				1551	msr.host_initiated = host_initiated;
				1552
				1553	ret = kvm_x86_ops->get_msr(vcpu, &msr);
				1554	if (!ret)
				1555	*data = msr.data;
				1556	return ret;
				1557	}
				1558
				1559	int kvm_get_msr(struct kvm_vcpu vcpu, u32 index, u64 data)
				1560	{
				1561	return __kvm_get_msr(vcpu, index, data, false);
				1562	}
				1563	EXPORT_SYMBOL_GPL(kvm_get_msr);
				1564
				1565	int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
				1566	{
				1567	return __kvm_set_msr(vcpu, index, data, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1568	}
				1569	EXPORT_SYMBOL_GPL(kvm_set_msr);
				1570
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1571	int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
				1572	{
				1573	u32 ecx = kvm_rcx_read(vcpu);
				1574	u64 data;
				1575
				1576	if (kvm_get_msr(vcpu, ecx, &data)) {
				1577	trace_kvm_msr_read_ex(ecx);
				1578	kvm_inject_gp(vcpu, 0);
				1579	return 1;
				1580	}
				1581
				1582	trace_kvm_msr_read(ecx, data);
				1583
				1584	kvm_rax_write(vcpu, data & -1u);
				1585	kvm_rdx_write(vcpu, (data >> 32) & -1u);
				1586	return kvm_skip_emulated_instruction(vcpu);
				1587	}
				1588	EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr);
				1589
				1590	int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu)
				1591	{
				1592	u32 ecx = kvm_rcx_read(vcpu);
				1593	u64 data = kvm_read_edx_eax(vcpu);
				1594
				1595	if (kvm_set_msr(vcpu, ecx, data)) {
				1596	trace_kvm_msr_write_ex(ecx, data);
				1597	kvm_inject_gp(vcpu, 0);
				1598	return 1;
				1599	}
				1600
				1601	trace_kvm_msr_write(ecx, data);
				1602	return kvm_skip_emulated_instruction(vcpu);
				1603	}
				1604	EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
				1605
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1606	/*
				1607	* Adapt set_msr() to msr_io()'s calling convention
				1608	*/
				1609	static int do_get_msr(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1610	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1611	return __kvm_get_msr(vcpu, index, data, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1612	}
				1613
				1614	static int do_set_msr(struct kvm_vcpu vcpu, unsigned index, u64 data)
				1615	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1616	return __kvm_set_msr(vcpu, index, *data, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1617	}
				1618
				1619	#ifdef CONFIG_X86_64
				1620	struct pvclock_gtod_data {
				1621	seqcount_t seq;
				1622
				1623	struct { /* extract of a clocksource struct */
				1624	int vclock_mode;
				1625	u64 cycle_last;
				1626	u64 mask;
				1627	u32 mult;
				1628	u32 shift;
				1629	} clock;
				1630
				1631	u64 boot_ns;
				1632	u64 nsec_base;
				1633	u64 wall_time_sec;
				1634	};
				1635
				1636	static struct pvclock_gtod_data pvclock_gtod_data;
				1637
				1638	static void update_pvclock_gtod(struct timekeeper *tk)
				1639	{
				1640	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
				1641	u64 boot_ns;
				1642
				1643	boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
				1644
				1645	write_seqcount_begin(&vdata->seq);
				1646
				1647	/* copy pvclock gtod data */
				1648	vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
				1649	vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
				1650	vdata->clock.mask = tk->tkr_mono.mask;
				1651	vdata->clock.mult = tk->tkr_mono.mult;
				1652	vdata->clock.shift = tk->tkr_mono.shift;
				1653
				1654	vdata->boot_ns = boot_ns;
				1655	vdata->nsec_base = tk->tkr_mono.xtime_nsec;
				1656
				1657	vdata->wall_time_sec = tk->xtime_sec;
				1658
				1659	write_seqcount_end(&vdata->seq);
				1660	}
				1661	#endif
				1662
				1663	void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
				1664	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1665	kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1666	kvm_vcpu_kick(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1667	}
				1668
				1669	static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
				1670	{
				1671	int version;
				1672	int r;
				1673	struct pvclock_wall_clock wc;
				1674	struct timespec64 boot;
				1675
				1676	if (!wall_clock)
				1677	return;
				1678
				1679	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
				1680	if (r)
				1681	return;
				1682
				1683	if (version & 1)
				1684	++version; /* first time write, random junk */
				1685
				1686	++version;
				1687
				1688	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
				1689	return;
				1690
				1691	/*
				1692	* The guest calculates current wall clock time by adding
				1693	* system time (updated by kvm_guest_time_update below) to the
				1694	* wall clock specified here. guest system time equals host
				1695	* system time for us, thus we must fill in host boot time here.
				1696	*/
				1697	getboottime64(&boot);
				1698
				1699	if (kvm->arch.kvmclock_offset) {
				1700	struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
				1701	boot = timespec64_sub(boot, ts);
				1702	}
				1703	wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
				1704	wc.nsec = boot.tv_nsec;
				1705	wc.version = version;
				1706
				1707	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
				1708
				1709	version++;
				1710	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
				1711	}
				1712
				1713	static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
				1714	{
				1715	do_shl32_div32(dividend, divisor);
				1716	return dividend;
				1717	}
				1718
				1719	static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz,
				1720	s8 pshift, u32 pmultiplier)
				1721	{
				1722	uint64_t scaled64;
				1723	int32_t shift = 0;
				1724	uint64_t tps64;
				1725	uint32_t tps32;
				1726
				1727	tps64 = base_hz;
				1728	scaled64 = scaled_hz;
				1729	while (tps64 > scaled64*2 \|\| tps64 & 0xffffffff00000000ULL) {
				1730	tps64 >>= 1;
				1731	shift--;
				1732	}
				1733
				1734	tps32 = (uint32_t)tps64;
				1735	while (tps32 <= scaled64 \|\| scaled64 & 0xffffffff00000000ULL) {
				1736	if (scaled64 & 0xffffffff00000000ULL \|\| tps32 & 0x80000000)
				1737	scaled64 >>= 1;
				1738	else
				1739	tps32 <<= 1;
				1740	shift++;
				1741	}
				1742
				1743	*pshift = shift;
				1744	*pmultiplier = div_frac(scaled64, tps32);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1745	}
				1746
				1747	#ifdef CONFIG_X86_64
				1748	static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0);
				1749	#endif
				1750
				1751	static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
				1752	static unsigned long max_tsc_khz;
				1753
				1754	static u32 adjust_tsc_khz(u32 khz, s32 ppm)
				1755	{
				1756	u64 v = (u64)khz * (1000000 + ppm);
				1757	do_div(v, 1000000);
				1758	return v;
				1759	}
				1760
				1761	static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale)
				1762	{
				1763	u64 ratio;
				1764
				1765	/* Guest TSC same frequency as host TSC? */
				1766	if (!scale) {
				1767	vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
				1768	return 0;
				1769	}
				1770
				1771	/* TSC scaling supported? */
				1772	if (!kvm_has_tsc_control) {
				1773	if (user_tsc_khz > tsc_khz) {
				1774	vcpu->arch.tsc_catchup = 1;
				1775	vcpu->arch.tsc_always_catchup = 1;
				1776	return 0;
				1777	} else {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1778	pr_warn_ratelimited("user requested TSC rate below hardware speed\n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1779	return -1;
				1780	}
				1781	}
				1782
				1783	/* TSC scaling required - calculate ratio */
				1784	ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits,
				1785	user_tsc_khz, tsc_khz);
				1786
				1787	if (ratio == 0 \|\| ratio >= kvm_max_tsc_scaling_ratio) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1788	pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n",
				1789	user_tsc_khz);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1790	return -1;
				1791	}
				1792
				1793	vcpu->arch.tsc_scaling_ratio = ratio;
				1794	return 0;
				1795	}
				1796
				1797	static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
				1798	{
				1799	u32 thresh_lo, thresh_hi;
				1800	int use_scaling = 0;
				1801
				1802	/* tsc_khz can be zero if TSC calibration fails */
				1803	if (user_tsc_khz == 0) {
				1804	/* set tsc_scaling_ratio to a safe value */
				1805	vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio;
				1806	return -1;
				1807	}
				1808
				1809	/* Compute a scale to convert nanoseconds in TSC cycles */
				1810	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
				1811	&vcpu->arch.virtual_tsc_shift,
				1812	&vcpu->arch.virtual_tsc_mult);
				1813	vcpu->arch.virtual_tsc_khz = user_tsc_khz;
				1814
				1815	/*
				1816	* Compute the variation in TSC rate which is acceptable
				1817	* within the range of tolerance and decide if the
				1818	* rate being applied is within that bounds of the hardware
				1819	* rate. If so, no scaling or compensation need be done.
				1820	*/
				1821	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
				1822	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
				1823	if (user_tsc_khz < thresh_lo \|\| user_tsc_khz > thresh_hi) {
				1824	pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", user_tsc_khz, thresh_lo, thresh_hi);
				1825	use_scaling = 1;
				1826	}
				1827	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
				1828	}
				1829
				1830	static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
				1831	{
				1832	u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec,
				1833	vcpu->arch.virtual_tsc_mult,
				1834	vcpu->arch.virtual_tsc_shift);
				1835	tsc += vcpu->arch.this_tsc_write;
				1836	return tsc;
				1837	}
				1838
				1839	static inline int gtod_is_based_on_tsc(int mode)
				1840	{
				1841	return mode == VCLOCK_TSC \|\| mode == VCLOCK_HVCLOCK;
				1842	}
				1843
				1844	static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
				1845	{
				1846	#ifdef CONFIG_X86_64
				1847	bool vcpus_matched;
				1848	struct kvm_arch *ka = &vcpu->kvm->arch;
				1849	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				1850
				1851	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
				1852	atomic_read(&vcpu->kvm->online_vcpus));
				1853
				1854	/*
				1855	* Once the masterclock is enabled, always perform request in
				1856	* order to update it.
				1857	*
				1858	* In order to enable masterclock, the host clocksource must be TSC
				1859	* and the vcpus need to have matched TSCs. When that happens,
				1860	* perform request to enable masterclock.
				1861	*/
				1862	if (ka->use_master_clock \|\|
				1863	(gtod_is_based_on_tsc(gtod->clock.vclock_mode) && vcpus_matched))
				1864	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				1865
				1866	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
				1867	atomic_read(&vcpu->kvm->online_vcpus),
				1868	ka->use_master_clock, gtod->clock.vclock_mode);
				1869	#endif
				1870	}
				1871
				1872	static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
				1873	{
				1874	u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
				1875	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
				1876	}
				1877
				1878	/*
				1879	* Multiply tsc by a fixed point number represented by ratio.
				1880	*
				1881	* The most significant 64-N bits (mult) of ratio represent the
				1882	* integral part of the fixed point number; the remaining N bits
				1883	* (frac) represent the fractional part, ie. ratio represents a fixed
				1884	* point number (mult + frac * 2^(-N)).
				1885	*
				1886	* N equals to kvm_tsc_scaling_ratio_frac_bits.
				1887	*/
				1888	static inline u64 __scale_tsc(u64 ratio, u64 tsc)
				1889	{
				1890	return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits);
				1891	}
				1892
				1893	u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
				1894	{
				1895	u64 _tsc = tsc;
				1896	u64 ratio = vcpu->arch.tsc_scaling_ratio;
				1897
				1898	if (ratio != kvm_default_tsc_scaling_ratio)
				1899	_tsc = __scale_tsc(ratio, tsc);
				1900
				1901	return _tsc;
				1902	}
				1903	EXPORT_SYMBOL_GPL(kvm_scale_tsc);
				1904
				1905	static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
				1906	{
				1907	u64 tsc;
				1908
				1909	tsc = kvm_scale_tsc(vcpu, rdtsc());
				1910
				1911	return target_tsc - tsc;
				1912	}
				1913
				1914	u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
				1915	{
				1916	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
				1917
				1918	return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
				1919	}
				1920	EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
				1921
				1922	static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
				1923	{
				1924	vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
				1925	}
				1926
				1927	static inline bool kvm_check_tsc_unstable(void)
				1928	{
				1929	#ifdef CONFIG_X86_64
				1930	/*
				1931	* TSC is marked unstable when we're running on Hyper-V,
				1932	* 'TSC page' clocksource is good.
				1933	*/
				1934	if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
				1935	return false;
				1936	#endif
				1937	return check_tsc_unstable();
				1938	}
				1939
				1940	void kvm_write_tsc(struct kvm_vcpu vcpu, struct msr_data msr)
				1941	{
				1942	struct kvm *kvm = vcpu->kvm;
				1943	u64 offset, ns, elapsed;
				1944	unsigned long flags;
				1945	bool matched;
				1946	bool already_matched;
				1947	u64 data = msr->data;
				1948	bool synchronizing = false;
				1949
				1950	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
				1951	offset = kvm_compute_tsc_offset(vcpu, data);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1952	ns = ktime_get_boottime_ns();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1953	elapsed = ns - kvm->arch.last_tsc_nsec;
				1954
				1955	if (vcpu->arch.virtual_tsc_khz) {
				1956	if (data == 0 && msr->host_initiated) {
				1957	/*
				1958	* detection of vcpu initialization -- need to sync
				1959	* with other vCPUs. This particularly helps to keep
				1960	* kvm_clock stable after CPU hotplug
				1961	*/
				1962	synchronizing = true;
				1963	} else {
				1964	u64 tsc_exp = kvm->arch.last_tsc_write +
				1965	nsec_to_cycles(vcpu, elapsed);
				1966	u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
				1967	/*
				1968	* Special case: TSC write with a small delta (1 second)
				1969	* of virtual cycle time against real time is
				1970	* interpreted as an attempt to synchronize the CPU.
				1971	*/
				1972	synchronizing = data < tsc_exp + tsc_hz &&
				1973	data + tsc_hz > tsc_exp;
				1974	}
				1975	}
				1976
				1977	/*
				1978	* For a reliable TSC, we can match TSC offsets, and for an unstable
				1979	* TSC, we add elapsed time in this computation. We could let the
				1980	* compensation code attempt to catch up if we fall behind, but
				1981	* it's better to try to match offsets from the beginning.
				1982	*/
				1983	if (synchronizing &&
				1984	vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
				1985	if (!kvm_check_tsc_unstable()) {
				1986	offset = kvm->arch.cur_tsc_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1987	} else {
				1988	u64 delta = nsec_to_cycles(vcpu, elapsed);
				1989	data += delta;
				1990	offset = kvm_compute_tsc_offset(vcpu, data);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1991	}
				1992	matched = true;
				1993	already_matched = (vcpu->arch.this_tsc_generation == kvm->arch.cur_tsc_generation);
				1994	} else {
				1995	/*
				1996	* We split periods of matched TSC writes into generations.
				1997	* For each generation, we track the original measured
				1998	* nanosecond time, offset, and write, so if TSCs are in
				1999	* sync, we can match exact offset, and if not, we can match
				2000	* exact software computation in compute_guest_tsc()
				2001	*
				2002	* These values are tracked in kvm->arch.cur_xxx variables.
				2003	*/
				2004	kvm->arch.cur_tsc_generation++;
				2005	kvm->arch.cur_tsc_nsec = ns;
				2006	kvm->arch.cur_tsc_write = data;
				2007	kvm->arch.cur_tsc_offset = offset;
				2008	matched = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2009	}
				2010
				2011	/*
				2012	* We also track th most recent recorded KHZ, write and time to
				2013	* allow the matching interval to be extended at each write.
				2014	*/
				2015	kvm->arch.last_tsc_nsec = ns;
				2016	kvm->arch.last_tsc_write = data;
				2017	kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz;
				2018
				2019	vcpu->arch.last_guest_tsc = data;
				2020
				2021	/* Keep track of which generation this VCPU has synchronized to */
				2022	vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation;
				2023	vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
				2024	vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
				2025
				2026	if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
				2027	update_ia32_tsc_adjust_msr(vcpu, offset);
				2028
				2029	kvm_vcpu_write_tsc_offset(vcpu, offset);
				2030	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
				2031
				2032	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
				2033	if (!matched) {
				2034	kvm->arch.nr_vcpus_matched_tsc = 0;
				2035	} else if (!already_matched) {
				2036	kvm->arch.nr_vcpus_matched_tsc++;
				2037	}
				2038
				2039	kvm_track_tsc_matching(vcpu);
				2040	spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
				2041	}
				2042
				2043	EXPORT_SYMBOL_GPL(kvm_write_tsc);
				2044
				2045	static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
				2046	s64 adjustment)
				2047	{
				2048	u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
				2049	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
				2050	}
				2051
				2052	static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
				2053	{
				2054	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
				2055	WARN_ON(adjustment < 0);
				2056	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
				2057	adjust_tsc_offset_guest(vcpu, adjustment);
				2058	}
				2059
				2060	#ifdef CONFIG_X86_64
				2061
				2062	static u64 read_tsc(void)
				2063	{
				2064	u64 ret = (u64)rdtsc_ordered();
				2065	u64 last = pvclock_gtod_data.clock.cycle_last;
				2066
				2067	if (likely(ret >= last))
				2068	return ret;
				2069
				2070	/*
				2071	* GCC likes to generate cmov here, but this branch is extremely
				2072	* predictable (it's just a function of time and the likely is
				2073	* very likely) and there's a data dependence, so force GCC
				2074	* to generate a branch instead. I don't barrier() because
				2075	* we don't actually need a barrier, and if this function
				2076	* ever gets inlined it will generate worse code.
				2077	*/
				2078	asm volatile ("");
				2079	return last;
				2080	}
				2081
				2082	static inline u64 vgettsc(u64 tsc_timestamp, int mode)
				2083	{
				2084	long v;
				2085	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				2086	u64 tsc_pg_val;
				2087
				2088	switch (gtod->clock.vclock_mode) {
				2089	case VCLOCK_HVCLOCK:
				2090	tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
				2091	tsc_timestamp);
				2092	if (tsc_pg_val != U64_MAX) {
				2093	/* TSC page valid */
				2094	*mode = VCLOCK_HVCLOCK;
				2095	v = (tsc_pg_val - gtod->clock.cycle_last) &
				2096	gtod->clock.mask;
				2097	} else {
				2098	/* TSC page invalid */
				2099	*mode = VCLOCK_NONE;
				2100	}
				2101	break;
				2102	case VCLOCK_TSC:
				2103	*mode = VCLOCK_TSC;
				2104	*tsc_timestamp = read_tsc();
				2105	v = (*tsc_timestamp - gtod->clock.cycle_last) &
				2106	gtod->clock.mask;
				2107	break;
				2108	default:
				2109	*mode = VCLOCK_NONE;
				2110	}
				2111
				2112	if (*mode == VCLOCK_NONE)
				2113	*tsc_timestamp = v = 0;
				2114
				2115	return v * gtod->clock.mult;
				2116	}
				2117
				2118	static int do_monotonic_boot(s64 t, u64 tsc_timestamp)
				2119	{
				2120	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				2121	unsigned long seq;
				2122	int mode;
				2123	u64 ns;
				2124
				2125	do {
				2126	seq = read_seqcount_begin(&gtod->seq);
				2127	ns = gtod->nsec_base;
				2128	ns += vgettsc(tsc_timestamp, &mode);
				2129	ns >>= gtod->clock.shift;
				2130	ns += gtod->boot_ns;
				2131	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
				2132	*t = ns;
				2133
				2134	return mode;
				2135	}
				2136
				2137	static int do_realtime(struct timespec64 ts, u64 tsc_timestamp)
				2138	{
				2139	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				2140	unsigned long seq;
				2141	int mode;
				2142	u64 ns;
				2143
				2144	do {
				2145	seq = read_seqcount_begin(&gtod->seq);
				2146	ts->tv_sec = gtod->wall_time_sec;
				2147	ns = gtod->nsec_base;
				2148	ns += vgettsc(tsc_timestamp, &mode);
				2149	ns >>= gtod->clock.shift;
				2150	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
				2151
				2152	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
				2153	ts->tv_nsec = ns;
				2154
				2155	return mode;
				2156	}
				2157
				2158	/* returns true if host is using TSC based clocksource */
				2159	static bool kvm_get_time_and_clockread(s64 kernel_ns, u64 tsc_timestamp)
				2160	{
				2161	/* checked again under seqlock below */
				2162	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
				2163	return false;
				2164
				2165	return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
				2166	tsc_timestamp));
				2167	}
				2168
				2169	/* returns true if host is using TSC based clocksource */
				2170	static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
				2171	u64 *tsc_timestamp)
				2172	{
				2173	/* checked again under seqlock below */
				2174	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
				2175	return false;
				2176
				2177	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
				2178	}
				2179	#endif
				2180
				2181	/*
				2182	*
				2183	* Assuming a stable TSC across physical CPUS, and a stable TSC
				2184	* across virtual CPUs, the following condition is possible.
				2185	* Each numbered line represents an event visible to both
				2186	* CPUs at the next numbered event.
				2187	*
				2188	* "timespecX" represents host monotonic time. "tscX" represents
				2189	* RDTSC value.
				2190	*
				2191	*       VCPU0 on CPU0                   | VCPU1 on CPU1
				2192	*
				2193	* 1. read timespec0,tsc0                |
				2194	* 2.                                    | timespec1 = timespec0 + N
				2195	*                                       | tsc1 = tsc0 + M
				2196	* 3. transition to guest                | transition to guest
				2197	* 4. ret0 = timespec0 + (rdtsc - tsc0)  |
				2198	* 5.                                    | ret1 = timespec1 + (rdtsc - tsc1)
				2199	*                                       | ret1 = timespec0 + N + (rdtsc - (tsc0 + M))
				2200	*
				2201	* Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity:
				2202	*
				2203	* - ret0 < ret1
				2204	* - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M))
				2205	* ...
				2206	* - 0 < N - M => M < N
				2207	*
				2208	* That is, when timespec0 != timespec1, M < N. Unfortunately that is not
				2209	* always the case (the difference between two distinct xtime instances
				2210	* might be smaller than the difference between corresponding TSC reads,
				2211	* when updating guest vcpus pvclock areas).
				2212	*
				2213	* To avoid that problem, do not allow visibility of distinct
				2214	* system_timestamp/tsc_timestamp values simultaneously: use a master
				2215	* copy of host monotonic time values. Update that master copy
				2216	* in lockstep.
				2217	*
				2218	* Rely on synchronization of host TSCs and guest TSCs for monotonicity.
				2219	*
				2220	*/
				2221
				2222	static void pvclock_update_vm_gtod_copy(struct kvm *kvm)
				2223	{
				2224	#ifdef CONFIG_X86_64
				2225	struct kvm_arch *ka = &kvm->arch;
				2226	int vclock_mode;
				2227	bool host_tsc_clocksource, vcpus_matched;
				2228
				2229	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
				2230	atomic_read(&kvm->online_vcpus));
				2231
				2232	/*
				2233	* If the host uses TSC clock, then passthrough TSC as stable
				2234	* to the guest.
				2235	*/
				2236	host_tsc_clocksource = kvm_get_time_and_clockread(
				2237	&ka->master_kernel_ns,
				2238	&ka->master_cycle_now);
				2239
				2240	ka->use_master_clock = host_tsc_clocksource && vcpus_matched
				2241	&& !ka->backwards_tsc_observed
				2242	&& !ka->boot_vcpu_runs_old_kvmclock;
				2243
				2244	if (ka->use_master_clock)
				2245	atomic_set(&kvm_guest_has_master_clock, 1);
				2246
				2247	vclock_mode = pvclock_gtod_data.clock.vclock_mode;
				2248	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
				2249	vcpus_matched);
				2250	#endif
				2251	}
				2252
				2253	void kvm_make_mclock_inprogress_request(struct kvm *kvm)
				2254	{
				2255	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
				2256	}
				2257
				2258	static void kvm_gen_update_masterclock(struct kvm *kvm)
				2259	{
				2260	#ifdef CONFIG_X86_64
				2261	int i;
				2262	struct kvm_vcpu *vcpu;
				2263	struct kvm_arch *ka = &kvm->arch;
				2264
				2265	spin_lock(&ka->pvclock_gtod_sync_lock);
				2266	kvm_make_mclock_inprogress_request(kvm);
				2267	/* no guest entries from this point */
				2268	pvclock_update_vm_gtod_copy(kvm);
				2269
				2270	kvm_for_each_vcpu(i, vcpu, kvm)
				2271	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				2272
				2273	/* guest entries allowed */
				2274	kvm_for_each_vcpu(i, vcpu, kvm)
				2275	kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
				2276
				2277	spin_unlock(&ka->pvclock_gtod_sync_lock);
				2278	#endif
				2279	}
				2280
				2281	u64 get_kvmclock_ns(struct kvm *kvm)
				2282	{
				2283	struct kvm_arch *ka = &kvm->arch;
				2284	struct pvclock_vcpu_time_info hv_clock;
				2285	u64 ret;
				2286
				2287	spin_lock(&ka->pvclock_gtod_sync_lock);
				2288	if (!ka->use_master_clock) {
				2289	spin_unlock(&ka->pvclock_gtod_sync_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2290	return ktime_get_boottime_ns() + ka->kvmclock_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2291	}
				2292
				2293	hv_clock.tsc_timestamp = ka->master_cycle_now;
				2294	hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
				2295	spin_unlock(&ka->pvclock_gtod_sync_lock);
				2296
				2297	/* both __this_cpu_read() and rdtsc() should be on the same cpu */
				2298	get_cpu();
				2299
				2300	if (__this_cpu_read(cpu_tsc_khz)) {
				2301	kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				2302	&hv_clock.tsc_shift,
				2303	&hv_clock.tsc_to_system_mul);
				2304	ret = __pvclock_read_cycles(&hv_clock, rdtsc());
				2305	} else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2306	ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2307
				2308	put_cpu();
				2309
				2310	return ret;
				2311	}
				2312
				2313	static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
				2314	{
				2315	struct kvm_vcpu_arch *vcpu = &v->arch;
				2316	struct pvclock_vcpu_time_info guest_hv_clock;
				2317
				2318	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
				2319	&guest_hv_clock, sizeof(guest_hv_clock))))
				2320	return;
				2321
				2322	/* This VCPU is paused, but it's legal for a guest to read another
				2323	* VCPU's kvmclock, so we really have to follow the specification where
				2324	* it says that version is odd if data is being modified, and even after
				2325	* it is consistent.
				2326	*
				2327	* Version field updates must be kept separate. This is because
				2328	* kvm_write_guest_cached might use a "rep movs" instruction, and
				2329	* writes within a string instruction are weakly ordered. So there
				2330	* are three writes overall.
				2331	*
				2332	* As a small optimization, only write the version field in the first
				2333	* and third write. The vcpu->pv_time cache is still valid, because the
				2334	* version field is the first in the struct.
				2335	*/
				2336	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
				2337
				2338	if (guest_hv_clock.version & 1)
				2339	++guest_hv_clock.version; /* first time write, random junk */
				2340
				2341	vcpu->hv_clock.version = guest_hv_clock.version + 1;
				2342	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				2343	&vcpu->hv_clock,
				2344	sizeof(vcpu->hv_clock.version));
				2345
				2346	smp_wmb();
				2347
				2348	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
				2349	vcpu->hv_clock.flags \|= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
				2350
				2351	if (vcpu->pvclock_set_guest_stopped_request) {
				2352	vcpu->hv_clock.flags \|= PVCLOCK_GUEST_STOPPED;
				2353	vcpu->pvclock_set_guest_stopped_request = false;
				2354	}
				2355
				2356	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
				2357
				2358	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				2359	&vcpu->hv_clock,
				2360	sizeof(vcpu->hv_clock));
				2361
				2362	smp_wmb();
				2363
				2364	vcpu->hv_clock.version++;
				2365	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
				2366	&vcpu->hv_clock,
				2367	sizeof(vcpu->hv_clock.version));
				2368	}
				2369
				2370	static int kvm_guest_time_update(struct kvm_vcpu *v)
				2371	{
				2372	unsigned long flags, tgt_tsc_khz;
				2373	struct kvm_vcpu_arch *vcpu = &v->arch;
				2374	struct kvm_arch *ka = &v->kvm->arch;
				2375	s64 kernel_ns;
				2376	u64 tsc_timestamp, host_tsc;
				2377	u8 pvclock_flags;
				2378	bool use_master_clock;
				2379
				2380	kernel_ns = 0;
				2381	host_tsc = 0;
				2382
				2383	/*
				2384	* If the host uses TSC clock, then passthrough TSC as stable
				2385	* to the guest.
				2386	*/
				2387	spin_lock(&ka->pvclock_gtod_sync_lock);
				2388	use_master_clock = ka->use_master_clock;
				2389	if (use_master_clock) {
				2390	host_tsc = ka->master_cycle_now;
				2391	kernel_ns = ka->master_kernel_ns;
				2392	}
				2393	spin_unlock(&ka->pvclock_gtod_sync_lock);
				2394
				2395	/* Keep irq disabled to prevent changes to the clock */
				2396	local_irq_save(flags);
				2397	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
				2398	if (unlikely(tgt_tsc_khz == 0)) {
				2399	local_irq_restore(flags);
				2400	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
				2401	return 1;
				2402	}
				2403	if (!use_master_clock) {
				2404	host_tsc = rdtsc();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2405	kernel_ns = ktime_get_boottime_ns();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2406	}
				2407
				2408	tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
				2409
				2410	/*
				2411	* We may have to catch up the TSC to match elapsed wall clock
				2412	* time for two reasons, even if kvmclock is used.
				2413	* 1) CPU could have been running below the maximum TSC rate
				2414	* 2) Broken TSC compensation resets the base at each VCPU
				2415	* entry to avoid unknown leaps of TSC even when running
				2416	* again on the same CPU. This may cause apparent elapsed
				2417	* time to disappear, and the guest to stand still or run
				2418	* very slowly.
				2419	*/
				2420	if (vcpu->tsc_catchup) {
				2421	u64 tsc = compute_guest_tsc(v, kernel_ns);
				2422	if (tsc > tsc_timestamp) {
				2423	adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
				2424	tsc_timestamp = tsc;
				2425	}
				2426	}
				2427
				2428	local_irq_restore(flags);
				2429
				2430	/* With all the info we got, fill in the values */
				2431
				2432	if (kvm_has_tsc_control)
				2433	tgt_tsc_khz = kvm_scale_tsc(v, tgt_tsc_khz);
				2434
				2435	if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) {
				2436	kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
				2437	&vcpu->hv_clock.tsc_shift,
				2438	&vcpu->hv_clock.tsc_to_system_mul);
				2439	vcpu->hw_tsc_khz = tgt_tsc_khz;
				2440	}
				2441
				2442	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
				2443	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
				2444	vcpu->last_guest_tsc = tsc_timestamp;
				2445
				2446	/* If the host uses TSC clocksource, then it is stable */
				2447	pvclock_flags = 0;
				2448	if (use_master_clock)
				2449	pvclock_flags \|= PVCLOCK_TSC_STABLE_BIT;
				2450
				2451	vcpu->hv_clock.flags = pvclock_flags;
				2452
				2453	if (vcpu->pv_time_enabled)
				2454	kvm_setup_pvclock_page(v);
				2455	if (v == kvm_get_vcpu(v->kvm, 0))
				2456	kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
				2457	return 0;
				2458	}
				2459
				2460	/*
				2461	* kvmclock updates which are isolated to a given vcpu, such as
				2462	* vcpu->cpu migration, should not allow system_timestamp from
				2463	* the rest of the vcpus to remain static. Otherwise ntp frequency
				2464	* correction applies to one vcpu's system_timestamp but not
				2465	* the others.
				2466	*
				2467	* So in those cases, request a kvmclock update for all vcpus.
				2468	* We need to rate-limit these requests though, as they can
				2469	* considerably slow guests that have a large number of vcpus.
				2470	* The time for a remote vcpu to update its kvmclock is bound
				2471	* by the delay we use to rate-limit the updates.
				2472	*/
				2473
				2474	#define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100)
				2475
				2476	static void kvmclock_update_fn(struct work_struct *work)
				2477	{
				2478	int i;
				2479	struct delayed_work *dwork = to_delayed_work(work);
				2480	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
				2481	kvmclock_update_work);
				2482	struct kvm *kvm = container_of(ka, struct kvm, arch);
				2483	struct kvm_vcpu *vcpu;
				2484
				2485	kvm_for_each_vcpu(i, vcpu, kvm) {
				2486	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				2487	kvm_vcpu_kick(vcpu);
				2488	}
				2489	}
				2490
				2491	static void kvm_gen_kvmclock_update(struct kvm_vcpu *v)
				2492	{
				2493	struct kvm *kvm = v->kvm;
				2494
				2495	kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
				2496	schedule_delayed_work(&kvm->arch.kvmclock_update_work,
				2497	KVMCLOCK_UPDATE_DELAY);
				2498	}
				2499
				2500	#define KVMCLOCK_SYNC_PERIOD (300 * HZ)
				2501
				2502	static void kvmclock_sync_fn(struct work_struct *work)
				2503	{
				2504	struct delayed_work *dwork = to_delayed_work(work);
				2505	struct kvm_arch *ka = container_of(dwork, struct kvm_arch,
				2506	kvmclock_sync_work);
				2507	struct kvm *kvm = container_of(ka, struct kvm, arch);
				2508
				2509	if (!kvmclock_periodic_sync)
				2510	return;
				2511
				2512	schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
				2513	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
				2514	KVMCLOCK_SYNC_PERIOD);
				2515	}
				2516
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2517	/*
				2518	* On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP.
				2519	*/
				2520	static bool can_set_mci_status(struct kvm_vcpu *vcpu)
				2521	{
				2522	/* McStatusWrEn enabled? */
				2523	if (guest_cpuid_is_amd(vcpu))
				2524	return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
				2525
				2526	return false;
				2527	}
				2528
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2529	static int set_msr_mce(struct kvm_vcpu vcpu, struct msr_data msr_info)
				2530	{
				2531	u64 mcg_cap = vcpu->arch.mcg_cap;
				2532	unsigned bank_num = mcg_cap & 0xff;
				2533	u32 msr = msr_info->index;
				2534	u64 data = msr_info->data;
				2535
				2536	switch (msr) {
				2537	case MSR_IA32_MCG_STATUS:
				2538	vcpu->arch.mcg_status = data;
				2539	break;
				2540	case MSR_IA32_MCG_CTL:
				2541	if (!(mcg_cap & MCG_CTL_P) &&
				2542	(data \|\| !msr_info->host_initiated))
				2543	return 1;
				2544	if (data != 0 && data != ~(u64)0)
				2545	return 1;
				2546	vcpu->arch.mcg_ctl = data;
				2547	break;
				2548	default:
				2549	if (msr >= MSR_IA32_MC0_CTL &&
				2550	msr < MSR_IA32_MCx_CTL(bank_num)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2551	u32 offset = array_index_nospec(
				2552	msr - MSR_IA32_MC0_CTL,
				2553	MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
				2554
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2555	/* only 0 or all 1s can be written to IA32_MCi_CTL
				2556	* some Linux kernels though clear bit 10 in bank 4 to
				2557	* workaround a BIOS/GART TBL issue on AMD K8s, ignore
				2558	* this to avoid an uncatched #GP in the guest
				2559	*/
				2560	if ((offset & 0x3) == 0 &&
				2561	data != 0 && (data \| (1 << 10)) != ~(u64)0)
				2562	return -1;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2563
				2564	/* MCi_STATUS */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2565	if (!msr_info->host_initiated &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2566	(offset & 0x3) == 1 && data != 0) {
				2567	if (!can_set_mci_status(vcpu))
				2568	return -1;
				2569	}
				2570
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2571	vcpu->arch.mce_banks[offset] = data;
				2572	break;
				2573	}
				2574	return 1;
				2575	}
				2576	return 0;
				2577	}
				2578
				2579	static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
				2580	{
				2581	struct kvm *kvm = vcpu->kvm;
				2582	int lm = is_long_mode(vcpu);
				2583	u8 blob_addr = lm ? (u8 )(long)kvm->arch.xen_hvm_config.blob_addr_64
				2584	: (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
				2585	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				2586	: kvm->arch.xen_hvm_config.blob_size_32;
				2587	u32 page_num = data & ~PAGE_MASK;
				2588	u64 page_addr = data & PAGE_MASK;
				2589	u8 *page;
				2590	int r;
				2591
				2592	r = -E2BIG;
				2593	if (page_num >= blob_size)
				2594	goto out;
				2595	r = -ENOMEM;
				2596	page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
				2597	if (IS_ERR(page)) {
				2598	r = PTR_ERR(page);
				2599	goto out;
				2600	}
				2601	if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
				2602	goto out_free;
				2603	r = 0;
				2604	out_free:
				2605	kfree(page);
				2606	out:
				2607	return r;
				2608	}
				2609
				2610	static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
				2611	{
				2612	gpa_t gpa = data & ~0x3f;
				2613
				2614	/* Bits 3:5 are reserved, Should be zero */
				2615	if (data & 0x38)
				2616	return 1;
				2617
				2618	vcpu->arch.apf.msr_val = data;
				2619
				2620	if (!(data & KVM_ASYNC_PF_ENABLED)) {
				2621	kvm_clear_async_pf_completion_queue(vcpu);
				2622	kvm_async_pf_hash_reset(vcpu);
				2623	return 0;
				2624	}
				2625
				2626	if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
				2627	sizeof(u32)))
				2628	return 1;
				2629
				2630	vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
				2631	vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
				2632	kvm_async_pf_wakeup_all(vcpu);
				2633	return 0;
				2634	}
				2635
				2636	static void kvmclock_reset(struct kvm_vcpu *vcpu)
				2637	{
				2638	vcpu->arch.pv_time_enabled = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2639	vcpu->arch.time = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2640	}
				2641
				2642	static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
				2643	{
				2644	++vcpu->stat.tlb_flush;
				2645	kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
				2646	}
				2647
				2648	static void record_steal_time(struct kvm_vcpu *vcpu)
				2649	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2650	struct kvm_host_map map;
				2651	struct kvm_steal_time *st;
				2652
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2653	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
				2654	return;
				2655
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2656	/* -EAGAIN is returned in atomic context so we can just return. */
				2657	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
				2658	&map, &vcpu->arch.st.cache, false))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2659	return;
				2660
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2661	st = map.hva +
				2662	offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
				2663
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2664	/*
				2665	* Doing a TLB flush here, on the guest's behalf, can avoid
				2666	* expensive IPIs.
				2667	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2668	trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2669	st->preempted & KVM_VCPU_FLUSH_TLB);
				2670	if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2671	kvm_vcpu_flush_tlb(vcpu, false);
				2672
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2673	vcpu->arch.st.preempted = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2674
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2675	if (st->version & 1)
				2676	st->version += 1; /* first time write, random junk */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2677
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2678	st->version += 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2679
				2680	smp_wmb();
				2681
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2682	st->steal += current->sched_info.run_delay -
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2683	vcpu->arch.st.last_steal;
				2684	vcpu->arch.st.last_steal = current->sched_info.run_delay;
				2685
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2686	smp_wmb();
				2687
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2688	st->version += 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2689
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2690	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2691	}
				2692
				2693	int kvm_set_msr_common(struct kvm_vcpu vcpu, struct msr_data msr_info)
				2694	{
				2695	bool pr = false;
				2696	u32 msr = msr_info->index;
				2697	u64 data = msr_info->data;
				2698
				2699	switch (msr) {
				2700	case MSR_AMD64_NB_CFG:
				2701	case MSR_IA32_UCODE_WRITE:
				2702	case MSR_VM_HSAVE_PA:
				2703	case MSR_AMD64_PATCH_LOADER:
				2704	case MSR_AMD64_BU_CFG2:
				2705	case MSR_AMD64_DC_CFG:
				2706	case MSR_F15H_EX_CFG:
				2707	break;
				2708
				2709	case MSR_IA32_UCODE_REV:
				2710	if (msr_info->host_initiated)
				2711	vcpu->arch.microcode_version = data;
				2712	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2713	case MSR_IA32_ARCH_CAPABILITIES:
				2714	if (!msr_info->host_initiated)
				2715	return 1;
				2716	vcpu->arch.arch_capabilities = data;
				2717	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2718	case MSR_EFER:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2719	return set_efer(vcpu, msr_info);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2720	case MSR_K7_HWCR:
				2721	data &= ~(u64)0x40; /* ignore flush filter disable */
				2722	data &= ~(u64)0x100; /* ignore ignne emulation enable */
				2723	data &= ~(u64)0x8; /* ignore TLB cache disable */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2724
				2725	/* Handle McStatusWrEn */
				2726	if (data == BIT_ULL(18)) {
				2727	vcpu->arch.msr_hwcr = data;
				2728	} else if (data != 0) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2729	vcpu_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n",
				2730	data);
				2731	return 1;
				2732	}
				2733	break;
				2734	case MSR_FAM10H_MMIO_CONF_BASE:
				2735	if (data != 0) {
				2736	vcpu_unimpl(vcpu, "unimplemented MMIO_CONF_BASE wrmsr: "
				2737	"0x%llx\n", data);
				2738	return 1;
				2739	}
				2740	break;
				2741	case MSR_IA32_DEBUGCTLMSR:
				2742	if (!data) {
				2743	/* We support the non-activated case already */
				2744	break;
				2745	} else if (data & ~(DEBUGCTLMSR_LBR \| DEBUGCTLMSR_BTF)) {
				2746	/* Values other than LBR and BTF are vendor-specific,
				2747	thus reserved and should throw a #GP */
				2748	return 1;
				2749	}
				2750	vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
				2751	__func__, data);
				2752	break;
				2753	case 0x200 ... 0x2ff:
				2754	return kvm_mtrr_set_msr(vcpu, msr, data);
				2755	case MSR_IA32_APICBASE:
				2756	return kvm_set_apic_base(vcpu, msr_info);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2757	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2758	return kvm_x2apic_msr_write(vcpu, msr, data);
				2759	case MSR_IA32_TSCDEADLINE:
				2760	kvm_set_lapic_tscdeadline_msr(vcpu, data);
				2761	break;
				2762	case MSR_IA32_TSC_ADJUST:
				2763	if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) {
				2764	if (!msr_info->host_initiated) {
				2765	s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
				2766	adjust_tsc_offset_guest(vcpu, adj);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2767	/* Before back to guest, tsc_timestamp must be adjusted
				2768	* as well, otherwise guest's percpu pvclock time could jump.
				2769	*/
				2770	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2771	}
				2772	vcpu->arch.ia32_tsc_adjust_msr = data;
				2773	}
				2774	break;
				2775	case MSR_IA32_MISC_ENABLE:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2776	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
				2777	((vcpu->arch.ia32_misc_enable_msr ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) {
				2778	if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
				2779	return 1;
				2780	vcpu->arch.ia32_misc_enable_msr = data;
				2781	kvm_update_cpuid(vcpu);
				2782	} else {
				2783	vcpu->arch.ia32_misc_enable_msr = data;
				2784	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2785	break;
				2786	case MSR_IA32_SMBASE:
				2787	if (!msr_info->host_initiated)
				2788	return 1;
				2789	vcpu->arch.smbase = data;
				2790	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2791	case MSR_IA32_POWER_CTL:
				2792	vcpu->arch.msr_ia32_power_ctl = data;
				2793	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2794	case MSR_IA32_TSC:
				2795	kvm_write_tsc(vcpu, msr_info);
				2796	break;
				2797	case MSR_SMI_COUNT:
				2798	if (!msr_info->host_initiated)
				2799	return 1;
				2800	vcpu->arch.smi_count = data;
				2801	break;
				2802	case MSR_KVM_WALL_CLOCK_NEW:
				2803	case MSR_KVM_WALL_CLOCK:
				2804	vcpu->kvm->arch.wall_clock = data;
				2805	kvm_write_wall_clock(vcpu->kvm, data);
				2806	break;
				2807	case MSR_KVM_SYSTEM_TIME_NEW:
				2808	case MSR_KVM_SYSTEM_TIME: {
				2809	struct kvm_arch *ka = &vcpu->kvm->arch;
				2810
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2811	if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
				2812	bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
				2813
				2814	if (ka->boot_vcpu_runs_old_kvmclock != tmp)
				2815	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				2816
				2817	ka->boot_vcpu_runs_old_kvmclock = tmp;
				2818	}
				2819
				2820	vcpu->arch.time = data;
				2821	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
				2822
				2823	/* we verify if the enable bit is set... */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2824	vcpu->arch.pv_time_enabled = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2825	if (!(data & 1))
				2826	break;
				2827
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2828	if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2829	&vcpu->arch.pv_time, data & ~1ULL,
				2830	sizeof(struct pvclock_vcpu_time_info)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2831	vcpu->arch.pv_time_enabled = true;
				2832
				2833	break;
				2834	}
				2835	case MSR_KVM_ASYNC_PF_EN:
				2836	if (kvm_pv_enable_async_pf(vcpu, data))
				2837	return 1;
				2838	break;
				2839	case MSR_KVM_STEAL_TIME:
				2840
				2841	if (unlikely(!sched_info_on()))
				2842	return 1;
				2843
				2844	if (data & KVM_STEAL_RESERVED_MASK)
				2845	return 1;
				2846
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2847	vcpu->arch.st.msr_val = data;
				2848
				2849	if (!(data & KVM_MSR_ENABLED))
				2850	break;
				2851
				2852	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
				2853
				2854	break;
				2855	case MSR_KVM_PV_EOI_EN:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2856	if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2857	return 1;
				2858	break;
				2859
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	2860	case MSR_KVM_POLL_CONTROL:
				2861	/* only enable bit supported */
				2862	if (data & (-1ULL << 1))
				2863	return 1;
				2864
				2865	vcpu->arch.msr_kvm_poll_control = data;
				2866	break;
				2867
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2868	case MSR_IA32_MCG_CTL:
				2869	case MSR_IA32_MCG_STATUS:
				2870	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
				2871	return set_msr_mce(vcpu, msr_info);
				2872
				2873	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
				2874	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
				2875	pr = true; /* fall through */
				2876	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
				2877	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
				2878	if (kvm_pmu_is_valid_msr(vcpu, msr))
				2879	return kvm_pmu_set_msr(vcpu, msr_info);
				2880
				2881	if (pr \|\| data != 0)
				2882	vcpu_unimpl(vcpu, "disabled perfctr wrmsr: "
				2883	"0x%x data 0x%llx\n", msr, data);
				2884	break;
				2885	case MSR_K7_CLK_CTL:
				2886	/*
				2887	* Ignore all writes to this no longer documented MSR.
				2888	* Writes are only relevant for old K7 processors,
				2889	* all pre-dating SVM, but a recommended workaround from
				2890	* AMD for these chips. It is possible to specify the
				2891	* affected processor models on the command line, hence
				2892	* the need to ignore the workaround.
				2893	*/
				2894	break;
				2895	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
				2896	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
				2897	case HV_X64_MSR_CRASH_CTL:
				2898	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
				2899	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
				2900	case HV_X64_MSR_TSC_EMULATION_CONTROL:
				2901	case HV_X64_MSR_TSC_EMULATION_STATUS:
				2902	return kvm_hv_set_msr_common(vcpu, msr, data,
				2903	msr_info->host_initiated);
				2904	case MSR_IA32_BBL_CR_CTL3:
				2905	/* Drop writes to this legacy MSR -- see rdmsr
				2906	* counterpart for further detail.
				2907	*/
				2908	if (report_ignored_msrs)
				2909	vcpu_unimpl(vcpu, "ignored wrmsr: 0x%x data 0x%llx\n",
				2910	msr, data);
				2911	break;
				2912	case MSR_AMD64_OSVW_ID_LENGTH:
				2913	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2914	return 1;
				2915	vcpu->arch.osvw.length = data;
				2916	break;
				2917	case MSR_AMD64_OSVW_STATUS:
				2918	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				2919	return 1;
				2920	vcpu->arch.osvw.status = data;
				2921	break;
				2922	case MSR_PLATFORM_INFO:
				2923	if (!msr_info->host_initiated \|\|
				2924	(!(data & MSR_PLATFORM_INFO_CPUID_FAULT) &&
				2925	cpuid_fault_enabled(vcpu)))
				2926	return 1;
				2927	vcpu->arch.msr_platform_info = data;
				2928	break;
				2929	case MSR_MISC_FEATURES_ENABLES:
				2930	if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT \|\|
				2931	(data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT &&
				2932	!supports_cpuid_fault(vcpu)))
				2933	return 1;
				2934	vcpu->arch.msr_misc_features_enables = data;
				2935	break;
				2936	default:
				2937	if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
				2938	return xen_hvm_config(vcpu, data);
				2939	if (kvm_pmu_is_valid_msr(vcpu, msr))
				2940	return kvm_pmu_set_msr(vcpu, msr_info);
				2941	if (!ignore_msrs) {
				2942	vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
				2943	msr, data);
				2944	return 1;
				2945	} else {
				2946	if (report_ignored_msrs)
				2947	vcpu_unimpl(vcpu,
				2948	"ignored wrmsr: 0x%x data 0x%llx\n",
				2949	msr, data);
				2950	break;
				2951	}
				2952	}
				2953	return 0;
				2954	}
				2955	EXPORT_SYMBOL_GPL(kvm_set_msr_common);
				2956
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2957	static int get_msr_mce(struct kvm_vcpu vcpu, u32 msr, u64 pdata, bool host)
				2958	{
				2959	u64 data;
				2960	u64 mcg_cap = vcpu->arch.mcg_cap;
				2961	unsigned bank_num = mcg_cap & 0xff;
				2962
				2963	switch (msr) {
				2964	case MSR_IA32_P5_MC_ADDR:
				2965	case MSR_IA32_P5_MC_TYPE:
				2966	data = 0;
				2967	break;
				2968	case MSR_IA32_MCG_CAP:
				2969	data = vcpu->arch.mcg_cap;
				2970	break;
				2971	case MSR_IA32_MCG_CTL:
				2972	if (!(mcg_cap & MCG_CTL_P) && !host)
				2973	return 1;
				2974	data = vcpu->arch.mcg_ctl;
				2975	break;
				2976	case MSR_IA32_MCG_STATUS:
				2977	data = vcpu->arch.mcg_status;
				2978	break;
				2979	default:
				2980	if (msr >= MSR_IA32_MC0_CTL &&
				2981	msr < MSR_IA32_MCx_CTL(bank_num)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	2982	u32 offset = array_index_nospec(
				2983	msr - MSR_IA32_MC0_CTL,
				2984	MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
				2985
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2986	data = vcpu->arch.mce_banks[offset];
				2987	break;
				2988	}
				2989	return 1;
				2990	}
				2991	*pdata = data;
				2992	return 0;
				2993	}
				2994
				2995	int kvm_get_msr_common(struct kvm_vcpu vcpu, struct msr_data msr_info)
				2996	{
				2997	switch (msr_info->index) {
				2998	case MSR_IA32_PLATFORM_ID:
				2999	case MSR_IA32_EBL_CR_POWERON:
				3000	case MSR_IA32_DEBUGCTLMSR:
				3001	case MSR_IA32_LASTBRANCHFROMIP:
				3002	case MSR_IA32_LASTBRANCHTOIP:
				3003	case MSR_IA32_LASTINTFROMIP:
				3004	case MSR_IA32_LASTINTTOIP:
				3005	case MSR_K8_SYSCFG:
				3006	case MSR_K8_TSEG_ADDR:
				3007	case MSR_K8_TSEG_MASK:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3008	case MSR_VM_HSAVE_PA:
				3009	case MSR_K8_INT_PENDING_MSG:
				3010	case MSR_AMD64_NB_CFG:
				3011	case MSR_FAM10H_MMIO_CONF_BASE:
				3012	case MSR_AMD64_BU_CFG2:
				3013	case MSR_IA32_PERF_CTL:
				3014	case MSR_AMD64_DC_CFG:
				3015	case MSR_F15H_EX_CFG:
				3016	msr_info->data = 0;
				3017	break;
				3018	case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
				3019	case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
				3020	case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
				3021	case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
				3022	case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
				3023	if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
				3024	return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
				3025	msr_info->data = 0;
				3026	break;
				3027	case MSR_IA32_UCODE_REV:
				3028	msr_info->data = vcpu->arch.microcode_version;
				3029	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3030	case MSR_IA32_ARCH_CAPABILITIES:
				3031	if (!msr_info->host_initiated &&
				3032	!guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES))
				3033	return 1;
				3034	msr_info->data = vcpu->arch.arch_capabilities;
				3035	break;
				3036	case MSR_IA32_POWER_CTL:
				3037	msr_info->data = vcpu->arch.msr_ia32_power_ctl;
				3038	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3039	case MSR_IA32_TSC:
				3040	msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
				3041	break;
				3042	case MSR_MTRRcap:
				3043	case 0x200 ... 0x2ff:
				3044	return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
				3045	case 0xcd: /* fsb frequency */
				3046	msr_info->data = 3;
				3047	break;
				3048	/*
				3049	* MSR_EBC_FREQUENCY_ID
				3050	* Conservative value valid for even the basic CPU models.
				3051	* Models 0,1: 000 in bits 23:21 indicating a bus speed of
				3052	* 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
				3053	* and 266MHz for model 3, or 4. Set Core Clock
				3054	* Frequency to System Bus Frequency Ratio to 1 (bits
				3055	* 31:24) even though these are only valid for CPU
				3056	* models > 2, however guests may end up dividing or
				3057	* multiplying by zero otherwise.
				3058	*/
				3059	case MSR_EBC_FREQUENCY_ID:
				3060	msr_info->data = 1 << 24;
				3061	break;
				3062	case MSR_IA32_APICBASE:
				3063	msr_info->data = kvm_get_apic_base(vcpu);
				3064	break;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3065	case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3066	return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
				3067	break;
				3068	case MSR_IA32_TSCDEADLINE:
				3069	msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
				3070	break;
				3071	case MSR_IA32_TSC_ADJUST:
				3072	msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr;
				3073	break;
				3074	case MSR_IA32_MISC_ENABLE:
				3075	msr_info->data = vcpu->arch.ia32_misc_enable_msr;
				3076	break;
				3077	case MSR_IA32_SMBASE:
				3078	if (!msr_info->host_initiated)
				3079	return 1;
				3080	msr_info->data = vcpu->arch.smbase;
				3081	break;
				3082	case MSR_SMI_COUNT:
				3083	msr_info->data = vcpu->arch.smi_count;
				3084	break;
				3085	case MSR_IA32_PERF_STATUS:
				3086	/* TSC increment by tick */
				3087	msr_info->data = 1000ULL;
				3088	/* CPU multiplier */
				3089	msr_info->data \|= (((uint64_t)4ULL) << 40);
				3090	break;
				3091	case MSR_EFER:
				3092	msr_info->data = vcpu->arch.efer;
				3093	break;
				3094	case MSR_KVM_WALL_CLOCK:
				3095	case MSR_KVM_WALL_CLOCK_NEW:
				3096	msr_info->data = vcpu->kvm->arch.wall_clock;
				3097	break;
				3098	case MSR_KVM_SYSTEM_TIME:
				3099	case MSR_KVM_SYSTEM_TIME_NEW:
				3100	msr_info->data = vcpu->arch.time;
				3101	break;
				3102	case MSR_KVM_ASYNC_PF_EN:
				3103	msr_info->data = vcpu->arch.apf.msr_val;
				3104	break;
				3105	case MSR_KVM_STEAL_TIME:
				3106	msr_info->data = vcpu->arch.st.msr_val;
				3107	break;
				3108	case MSR_KVM_PV_EOI_EN:
				3109	msr_info->data = vcpu->arch.pv_eoi.msr_val;
				3110	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3111	case MSR_KVM_POLL_CONTROL:
				3112	msr_info->data = vcpu->arch.msr_kvm_poll_control;
				3113	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3114	case MSR_IA32_P5_MC_ADDR:
				3115	case MSR_IA32_P5_MC_TYPE:
				3116	case MSR_IA32_MCG_CAP:
				3117	case MSR_IA32_MCG_CTL:
				3118	case MSR_IA32_MCG_STATUS:
				3119	case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
				3120	return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
				3121	msr_info->host_initiated);
				3122	case MSR_K7_CLK_CTL:
				3123	/*
				3124	* Provide expected ramp-up count for K7. All other
				3125	* are set to zero, indicating minimum divisors for
				3126	* every field.
				3127	*
				3128	* This prevents guest kernels on AMD host with CPU
				3129	* type 6, model 8 and higher from exploding due to
				3130	* the rdmsr failing.
				3131	*/
				3132	msr_info->data = 0x20000000;
				3133	break;
				3134	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
				3135	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
				3136	case HV_X64_MSR_CRASH_CTL:
				3137	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
				3138	case HV_X64_MSR_REENLIGHTENMENT_CONTROL:
				3139	case HV_X64_MSR_TSC_EMULATION_CONTROL:
				3140	case HV_X64_MSR_TSC_EMULATION_STATUS:
				3141	return kvm_hv_get_msr_common(vcpu,
				3142	msr_info->index, &msr_info->data,
				3143	msr_info->host_initiated);
				3144	break;
				3145	case MSR_IA32_BBL_CR_CTL3:
				3146	/* This legacy MSR exists but isn't fully documented in current
				3147	* silicon. It is however accessed by winxp in very narrow
				3148	* scenarios where it sets bit #19, itself documented as
				3149	* a "reserved" bit. Best effort attempt to source coherent
				3150	* read data here should the balance of the register be
				3151	* interpreted by the guest:
				3152	*
				3153	* L2 cache control register 3: 64GB range, 256KB size,
				3154	* enabled, latency 0x1, configured
				3155	*/
				3156	msr_info->data = 0xbe702111;
				3157	break;
				3158	case MSR_AMD64_OSVW_ID_LENGTH:
				3159	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				3160	return 1;
				3161	msr_info->data = vcpu->arch.osvw.length;
				3162	break;
				3163	case MSR_AMD64_OSVW_STATUS:
				3164	if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW))
				3165	return 1;
				3166	msr_info->data = vcpu->arch.osvw.status;
				3167	break;
				3168	case MSR_PLATFORM_INFO:
				3169	if (!msr_info->host_initiated &&
				3170	!vcpu->kvm->arch.guest_can_read_msr_platform_info)
				3171	return 1;
				3172	msr_info->data = vcpu->arch.msr_platform_info;
				3173	break;
				3174	case MSR_MISC_FEATURES_ENABLES:
				3175	msr_info->data = vcpu->arch.msr_misc_features_enables;
				3176	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3177	case MSR_K7_HWCR:
				3178	msr_info->data = vcpu->arch.msr_hwcr;
				3179	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3180	default:
				3181	if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
				3182	return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
				3183	if (!ignore_msrs) {
				3184	vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
				3185	msr_info->index);
				3186	return 1;
				3187	} else {
				3188	if (report_ignored_msrs)
				3189	vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
				3190	msr_info->index);
				3191	msr_info->data = 0;
				3192	}
				3193	break;
				3194	}
				3195	return 0;
				3196	}
				3197	EXPORT_SYMBOL_GPL(kvm_get_msr_common);
				3198
				3199	/*
				3200	* Read or write a bunch of msrs. All parameters are kernel addresses.
				3201	*
				3202	* @return number of msrs set successfully.
				3203	*/
				3204	static int __msr_io(struct kvm_vcpu vcpu, struct kvm_msrs msrs,
				3205	struct kvm_msr_entry *entries,
				3206	int (do_msr)(struct kvm_vcpu vcpu,
				3207	unsigned index, u64 *data))
				3208	{
				3209	int i;
				3210
				3211	for (i = 0; i < msrs->nmsrs; ++i)
				3212	if (do_msr(vcpu, entries[i].index, &entries[i].data))
				3213	break;
				3214
				3215	return i;
				3216	}
				3217
				3218	/*
				3219	* Read or write a bunch of msrs. Parameters are user addresses.
				3220	*
				3221	* @return number of msrs set successfully.
				3222	*/
				3223	static int msr_io(struct kvm_vcpu vcpu, struct kvm_msrs __user user_msrs,
				3224	int (do_msr)(struct kvm_vcpu vcpu,
				3225	unsigned index, u64 *data),
				3226	int writeback)
				3227	{
				3228	struct kvm_msrs msrs;
				3229	struct kvm_msr_entry *entries;
				3230	int r, n;
				3231	unsigned size;
				3232
				3233	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3234	if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3235	goto out;
				3236
				3237	r = -E2BIG;
				3238	if (msrs.nmsrs >= MAX_IO_MSRS)
				3239	goto out;
				3240
				3241	size = sizeof(struct kvm_msr_entry) * msrs.nmsrs;
				3242	entries = memdup_user(user_msrs->entries, size);
				3243	if (IS_ERR(entries)) {
				3244	r = PTR_ERR(entries);
				3245	goto out;
				3246	}
				3247
				3248	r = n = __msr_io(vcpu, &msrs, entries, do_msr);
				3249	if (r < 0)
				3250	goto out_free;
				3251
				3252	r = -EFAULT;
				3253	if (writeback && copy_to_user(user_msrs->entries, entries, size))
				3254	goto out_free;
				3255
				3256	r = n;
				3257
				3258	out_free:
				3259	kfree(entries);
				3260	out:
				3261	return r;
				3262	}
				3263
				3264	static inline bool kvm_can_mwait_in_guest(void)
				3265	{
				3266	return boot_cpu_has(X86_FEATURE_MWAIT) &&
				3267	!boot_cpu_has_bug(X86_BUG_MONITOR) &&
				3268	boot_cpu_has(X86_FEATURE_ARAT);
				3269	}
				3270
				3271	int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
				3272	{
				3273	int r = 0;
				3274
				3275	switch (ext) {
				3276	case KVM_CAP_IRQCHIP:
				3277	case KVM_CAP_HLT:
				3278	case KVM_CAP_MMU_SHADOW_CACHE_CONTROL:
				3279	case KVM_CAP_SET_TSS_ADDR:
				3280	case KVM_CAP_EXT_CPUID:
				3281	case KVM_CAP_EXT_EMUL_CPUID:
				3282	case KVM_CAP_CLOCKSOURCE:
				3283	case KVM_CAP_PIT:
				3284	case KVM_CAP_NOP_IO_DELAY:
				3285	case KVM_CAP_MP_STATE:
				3286	case KVM_CAP_SYNC_MMU:
				3287	case KVM_CAP_USER_NMI:
				3288	case KVM_CAP_REINJECT_CONTROL:
				3289	case KVM_CAP_IRQ_INJECT_STATUS:
				3290	case KVM_CAP_IOEVENTFD:
				3291	case KVM_CAP_IOEVENTFD_NO_LENGTH:
				3292	case KVM_CAP_PIT2:
				3293	case KVM_CAP_PIT_STATE2:
				3294	case KVM_CAP_SET_IDENTITY_MAP_ADDR:
				3295	case KVM_CAP_XEN_HVM:
				3296	case KVM_CAP_VCPU_EVENTS:
				3297	case KVM_CAP_HYPERV:
				3298	case KVM_CAP_HYPERV_VAPIC:
				3299	case KVM_CAP_HYPERV_SPIN:
				3300	case KVM_CAP_HYPERV_SYNIC:
				3301	case KVM_CAP_HYPERV_SYNIC2:
				3302	case KVM_CAP_HYPERV_VP_INDEX:
				3303	case KVM_CAP_HYPERV_EVENTFD:
				3304	case KVM_CAP_HYPERV_TLBFLUSH:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3305	case KVM_CAP_HYPERV_SEND_IPI:
				3306	case KVM_CAP_HYPERV_CPUID:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3307	case KVM_CAP_PCI_SEGMENT:
				3308	case KVM_CAP_DEBUGREGS:
				3309	case KVM_CAP_X86_ROBUST_SINGLESTEP:
				3310	case KVM_CAP_XSAVE:
				3311	case KVM_CAP_ASYNC_PF:
				3312	case KVM_CAP_GET_TSC_KHZ:
				3313	case KVM_CAP_KVMCLOCK_CTRL:
				3314	case KVM_CAP_READONLY_MEM:
				3315	case KVM_CAP_HYPERV_TIME:
				3316	case KVM_CAP_IOAPIC_POLARITY_IGNORED:
				3317	case KVM_CAP_TSC_DEADLINE_TIMER:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3318	case KVM_CAP_DISABLE_QUIRKS:
				3319	case KVM_CAP_SET_BOOT_CPU_ID:
				3320	case KVM_CAP_SPLIT_IRQCHIP:
				3321	case KVM_CAP_IMMEDIATE_EXIT:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3322	case KVM_CAP_PMU_EVENT_FILTER:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3323	case KVM_CAP_GET_MSR_FEATURES:
				3324	case KVM_CAP_MSR_PLATFORM_INFO:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3325	case KVM_CAP_EXCEPTION_PAYLOAD:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3326	r = 1;
				3327	break;
				3328	case KVM_CAP_SYNC_REGS:
				3329	r = KVM_SYNC_X86_VALID_FIELDS;
				3330	break;
				3331	case KVM_CAP_ADJUST_CLOCK:
				3332	r = KVM_CLOCK_TSC_STABLE;
				3333	break;
				3334	case KVM_CAP_X86_DISABLE_EXITS:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3335	r \|= KVM_X86_DISABLE_EXITS_HLT \| KVM_X86_DISABLE_EXITS_PAUSE \|
				3336	KVM_X86_DISABLE_EXITS_CSTATE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3337	if(kvm_can_mwait_in_guest())
				3338	r \|= KVM_X86_DISABLE_EXITS_MWAIT;
				3339	break;
				3340	case KVM_CAP_X86_SMM:
				3341	/* SMBASE is usually relocated above 1M on modern chipsets,
				3342	* and SMM handlers might indeed rely on 4G segment limits,
				3343	* so do not report SMM to be available if real mode is
				3344	* emulated via vm86 mode. Still, do not go to great lengths
				3345	* to avoid userspace's usage of the feature, because it is a
				3346	* fringe case that is not enabled except via specific settings
				3347	* of the module parameters.
				3348	*/
				3349	r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
				3350	break;
				3351	case KVM_CAP_VAPIC:
				3352	r = !kvm_x86_ops->cpu_has_accelerated_tpr();
				3353	break;
				3354	case KVM_CAP_NR_VCPUS:
				3355	r = KVM_SOFT_MAX_VCPUS;
				3356	break;
				3357	case KVM_CAP_MAX_VCPUS:
				3358	r = KVM_MAX_VCPUS;
				3359	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3360	case KVM_CAP_MAX_VCPU_ID:
				3361	r = KVM_MAX_VCPU_ID;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3362	break;
				3363	case KVM_CAP_PV_MMU: /* obsolete */
				3364	r = 0;
				3365	break;
				3366	case KVM_CAP_MCE:
				3367	r = KVM_MAX_MCE_BANKS;
				3368	break;
				3369	case KVM_CAP_XCRS:
				3370	r = boot_cpu_has(X86_FEATURE_XSAVE);
				3371	break;
				3372	case KVM_CAP_TSC_CONTROL:
				3373	r = kvm_has_tsc_control;
				3374	break;
				3375	case KVM_CAP_X2APIC_API:
				3376	r = KVM_X2APIC_API_VALID_FLAGS;
				3377	break;
				3378	case KVM_CAP_NESTED_STATE:
				3379	r = kvm_x86_ops->get_nested_state ?
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3380	kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
				3381	break;
				3382	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
				3383	r = kvm_x86_ops->enable_direct_tlbflush != NULL;
				3384	break;
				3385	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
				3386	r = kvm_x86_ops->nested_enable_evmcs != NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3387	break;
				3388	default:
				3389	break;
				3390	}
				3391	return r;
				3392
				3393	}
				3394
				3395	long kvm_arch_dev_ioctl(struct file *filp,
				3396	unsigned int ioctl, unsigned long arg)
				3397	{
				3398	void __user argp = (void __user )arg;
				3399	long r;
				3400
				3401	switch (ioctl) {
				3402	case KVM_GET_MSR_INDEX_LIST: {
				3403	struct kvm_msr_list __user *user_msr_list = argp;
				3404	struct kvm_msr_list msr_list;
				3405	unsigned n;
				3406
				3407	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3408	if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3409	goto out;
				3410	n = msr_list.nmsrs;
				3411	msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3412	if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3413	goto out;
				3414	r = -E2BIG;
				3415	if (n < msr_list.nmsrs)
				3416	goto out;
				3417	r = -EFAULT;
				3418	if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				3419	num_msrs_to_save * sizeof(u32)))
				3420	goto out;
				3421	if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				3422	&emulated_msrs,
				3423	num_emulated_msrs * sizeof(u32)))
				3424	goto out;
				3425	r = 0;
				3426	break;
				3427	}
				3428	case KVM_GET_SUPPORTED_CPUID:
				3429	case KVM_GET_EMULATED_CPUID: {
				3430	struct kvm_cpuid2 __user *cpuid_arg = argp;
				3431	struct kvm_cpuid2 cpuid;
				3432
				3433	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3434	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3435	goto out;
				3436
				3437	r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
				3438	ioctl);
				3439	if (r)
				3440	goto out;
				3441
				3442	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3443	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3444	goto out;
				3445	r = 0;
				3446	break;
				3447	}
				3448	case KVM_X86_GET_MCE_CAP_SUPPORTED: {
				3449	r = -EFAULT;
				3450	if (copy_to_user(argp, &kvm_mce_cap_supported,
				3451	sizeof(kvm_mce_cap_supported)))
				3452	goto out;
				3453	r = 0;
				3454	break;
				3455	case KVM_GET_MSR_FEATURE_INDEX_LIST: {
				3456	struct kvm_msr_list __user *user_msr_list = argp;
				3457	struct kvm_msr_list msr_list;
				3458	unsigned int n;
				3459
				3460	r = -EFAULT;
				3461	if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
				3462	goto out;
				3463	n = msr_list.nmsrs;
				3464	msr_list.nmsrs = num_msr_based_features;
				3465	if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
				3466	goto out;
				3467	r = -E2BIG;
				3468	if (n < msr_list.nmsrs)
				3469	goto out;
				3470	r = -EFAULT;
				3471	if (copy_to_user(user_msr_list->indices, &msr_based_features,
				3472	num_msr_based_features * sizeof(u32)))
				3473	goto out;
				3474	r = 0;
				3475	break;
				3476	}
				3477	case KVM_GET_MSRS:
				3478	r = msr_io(NULL, argp, do_get_msr_feature, 1);
				3479	break;
				3480	}
				3481	default:
				3482	r = -EINVAL;
				3483	}
				3484	out:
				3485	return r;
				3486	}
				3487
				3488	static void wbinvd_ipi(void *garbage)
				3489	{
				3490	wbinvd();
				3491	}
				3492
				3493	static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu)
				3494	{
				3495	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
				3496	}
				3497
				3498	void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
				3499	{
				3500	/* Address WBINVD may be executed by guest */
				3501	if (need_emulate_wbinvd(vcpu)) {
				3502	if (kvm_x86_ops->has_wbinvd_exit())
				3503	cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
				3504	else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
				3505	smp_call_function_single(vcpu->cpu,
				3506	wbinvd_ipi, NULL, 1);
				3507	}
				3508
				3509	kvm_x86_ops->vcpu_load(vcpu, cpu);
				3510
				3511	/* Apply any externally detected TSC adjustments (due to suspend) */
				3512	if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
				3513	adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
				3514	vcpu->arch.tsc_offset_adjustment = 0;
				3515	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				3516	}
				3517
				3518	if (unlikely(vcpu->cpu != cpu) \|\| kvm_check_tsc_unstable()) {
				3519	s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
				3520	rdtsc() - vcpu->arch.last_host_tsc;
				3521	if (tsc_delta < 0)
				3522	mark_tsc_unstable("KVM discovered backwards TSC");
				3523
				3524	if (kvm_check_tsc_unstable()) {
				3525	u64 offset = kvm_compute_tsc_offset(vcpu,
				3526	vcpu->arch.last_guest_tsc);
				3527	kvm_vcpu_write_tsc_offset(vcpu, offset);
				3528	vcpu->arch.tsc_catchup = 1;
				3529	}
				3530
				3531	if (kvm_lapic_hv_timer_in_use(vcpu))
				3532	kvm_lapic_restart_hv_timer(vcpu);
				3533
				3534	/*
				3535	* On a host with synchronized TSC, there is no need to update
				3536	* kvmclock on vcpu->cpu migration
				3537	*/
				3538	if (!vcpu->kvm->arch.use_master_clock \|\| vcpu->cpu == -1)
				3539	kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
				3540	if (vcpu->cpu != cpu)
				3541	kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu);
				3542	vcpu->cpu = cpu;
				3543	}
				3544
				3545	kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu);
				3546	}
				3547
				3548	static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
				3549	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3550	struct kvm_host_map map;
				3551	struct kvm_steal_time *st;
				3552
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3553	if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
				3554	return;
				3555
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3556	if (vcpu->arch.st.preempted)
				3557	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3558
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3559	if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
				3560	&vcpu->arch.st.cache, true))
				3561	return;
				3562
				3563	st = map.hva +
				3564	offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
				3565
				3566	st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
				3567
				3568	kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3569	}
				3570
				3571	void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
				3572	{
				3573	int idx;
				3574
				3575	if (vcpu->preempted)
				3576	vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
				3577
				3578	/*
				3579	* Disable page faults because we're in atomic context here.
				3580	* kvm_write_guest_offset_cached() would call might_fault()
				3581	* that relies on pagefault_disable() to tell if there's a
				3582	* bug. NOTE: the write to guest memory may not go through if
				3583	* during postcopy live migration or if there's heavy guest
				3584	* paging.
				3585	*/
				3586	pagefault_disable();
				3587	/*
				3588	* kvm_memslots() will be called by
				3589	* kvm_write_guest_offset_cached() so take the srcu lock.
				3590	*/
				3591	idx = srcu_read_lock(&vcpu->kvm->srcu);
				3592	kvm_steal_time_set_preempted(vcpu);
				3593	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				3594	pagefault_enable();
				3595	kvm_x86_ops->vcpu_put(vcpu);
				3596	vcpu->arch.last_host_tsc = rdtsc();
				3597	/*
				3598	* If userspace has set any breakpoints or watchpoints, dr6 is restored
				3599	* on every vmexit, but if not, we might have a stale dr6 from the
				3600	* guest. do_debug expects dr6 to be cleared after it runs, do the same.
				3601	*/
				3602	set_debugreg(0, 6);
				3603	}
				3604
				3605	static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
				3606	struct kvm_lapic_state *s)
				3607	{
				3608	if (vcpu->arch.apicv_active)
				3609	kvm_x86_ops->sync_pir_to_irr(vcpu);
				3610
				3611	return kvm_apic_get_state(vcpu, s);
				3612	}
				3613
				3614	static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
				3615	struct kvm_lapic_state *s)
				3616	{
				3617	int r;
				3618
				3619	r = kvm_apic_set_state(vcpu, s);
				3620	if (r)
				3621	return r;
				3622	update_cr8_intercept(vcpu);
				3623
				3624	return 0;
				3625	}
				3626
				3627	static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
				3628	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3629	/*
				3630	* We can accept userspace's request for interrupt injection
				3631	* as long as we have a place to store the interrupt number.
				3632	* The actual injection will happen when the CPU is able to
				3633	* deliver the interrupt.
				3634	*/
				3635	if (kvm_cpu_has_extint(vcpu))
				3636	return false;
				3637
				3638	/* Acknowledging ExtINT does not happen if LINT0 is masked. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3639	return (!lapic_in_kernel(vcpu) \|\|
				3640	kvm_apic_accept_pic_intr(vcpu));
				3641	}
				3642
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3643	static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
				3644	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3645	/*
				3646	* Do not cause an interrupt window exit if an exception
				3647	* is pending or an event needs reinjection; userspace
				3648	* might want to inject the interrupt manually using KVM_SET_REGS
				3649	* or KVM_SET_SREGS. For that to work, we must be at an
				3650	* instruction boundary and with no events half-injected.
				3651	*/
				3652	return (kvm_arch_interrupt_allowed(vcpu) &&
				3653	kvm_cpu_accept_dm_intr(vcpu) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3654	!kvm_event_needs_reinjection(vcpu) &&
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3655	!vcpu->arch.exception.pending);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3656	}
				3657
				3658	static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
				3659	struct kvm_interrupt *irq)
				3660	{
				3661	if (irq->irq >= KVM_NR_INTERRUPTS)
				3662	return -EINVAL;
				3663
				3664	if (!irqchip_in_kernel(vcpu->kvm)) {
				3665	kvm_queue_interrupt(vcpu, irq->irq, false);
				3666	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3667	return 0;
				3668	}
				3669
				3670	/*
				3671	* With in-kernel LAPIC, we only use this to inject EXTINT, so
				3672	* fail for in-kernel 8259.
				3673	*/
				3674	if (pic_in_kernel(vcpu->kvm))
				3675	return -ENXIO;
				3676
				3677	if (vcpu->arch.pending_external_vector != -1)
				3678	return -EEXIST;
				3679
				3680	vcpu->arch.pending_external_vector = irq->irq;
				3681	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3682	return 0;
				3683	}
				3684
				3685	static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu)
				3686	{
				3687	kvm_inject_nmi(vcpu);
				3688
				3689	return 0;
				3690	}
				3691
				3692	static int kvm_vcpu_ioctl_smi(struct kvm_vcpu *vcpu)
				3693	{
				3694	kvm_make_request(KVM_REQ_SMI, vcpu);
				3695
				3696	return 0;
				3697	}
				3698
				3699	static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu,
				3700	struct kvm_tpr_access_ctl *tac)
				3701	{
				3702	if (tac->flags)
				3703	return -EINVAL;
				3704	vcpu->arch.tpr_access_reporting = !!tac->enabled;
				3705	return 0;
				3706	}
				3707
				3708	static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
				3709	u64 mcg_cap)
				3710	{
				3711	int r;
				3712	unsigned bank_num = mcg_cap & 0xff, bank;
				3713
				3714	r = -EINVAL;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3715	if (!bank_num \|\| bank_num > KVM_MAX_MCE_BANKS)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3716	goto out;
				3717	if (mcg_cap & ~(kvm_mce_cap_supported \| 0xff \| 0xff0000))
				3718	goto out;
				3719	r = 0;
				3720	vcpu->arch.mcg_cap = mcg_cap;
				3721	/* Init IA32_MCG_CTL to all 1s */
				3722	if (mcg_cap & MCG_CTL_P)
				3723	vcpu->arch.mcg_ctl = ~(u64)0;
				3724	/* Init IA32_MCi_CTL to all 1s */
				3725	for (bank = 0; bank < bank_num; bank++)
				3726	vcpu->arch.mce_banks[bank*4] = ~(u64)0;
				3727
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3728	kvm_x86_ops->setup_mce(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3729	out:
				3730	return r;
				3731	}
				3732
				3733	static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
				3734	struct kvm_x86_mce *mce)
				3735	{
				3736	u64 mcg_cap = vcpu->arch.mcg_cap;
				3737	unsigned bank_num = mcg_cap & 0xff;
				3738	u64 *banks = vcpu->arch.mce_banks;
				3739
				3740	if (mce->bank >= bank_num \|\| !(mce->status & MCI_STATUS_VAL))
				3741	return -EINVAL;
				3742	/*
				3743	* if IA32_MCG_CTL is not all 1s, the uncorrected error
				3744	* reporting is disabled
				3745	*/
				3746	if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) &&
				3747	vcpu->arch.mcg_ctl != ~(u64)0)
				3748	return 0;
				3749	banks += 4 * mce->bank;
				3750	/*
				3751	* if IA32_MCi_CTL is not all 1s, the uncorrected error
				3752	* reporting is disabled for the bank
				3753	*/
				3754	if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0)
				3755	return 0;
				3756	if (mce->status & MCI_STATUS_UC) {
				3757	if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) \|\|
				3758	!kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
				3759	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				3760	return 0;
				3761	}
				3762	if (banks[1] & MCI_STATUS_VAL)
				3763	mce->status \|= MCI_STATUS_OVER;
				3764	banks[2] = mce->addr;
				3765	banks[3] = mce->misc;
				3766	vcpu->arch.mcg_status = mce->mcg_status;
				3767	banks[1] = mce->status;
				3768	kvm_queue_exception(vcpu, MC_VECTOR);
				3769	} else if (!(banks[1] & MCI_STATUS_VAL)
				3770	\|\| !(banks[1] & MCI_STATUS_UC)) {
				3771	if (banks[1] & MCI_STATUS_VAL)
				3772	mce->status \|= MCI_STATUS_OVER;
				3773	banks[2] = mce->addr;
				3774	banks[3] = mce->misc;
				3775	banks[1] = mce->status;
				3776	} else
				3777	banks[1] \|= MCI_STATUS_OVER;
				3778	return 0;
				3779	}
				3780
				3781	static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
				3782	struct kvm_vcpu_events *events)
				3783	{
				3784	process_nmi(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3785
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	3786
				3787	if (kvm_check_request(KVM_REQ_SMI, vcpu))
				3788	process_smi(vcpu);
				3789
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3790	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3791	* The API doesn't provide the instruction length for software
				3792	* exceptions, so don't report them. As long as the guest RIP
				3793	* isn't advanced, we should expect to encounter the exception
				3794	* again.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3795	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3796	if (kvm_exception_is_soft(vcpu->arch.exception.nr)) {
				3797	events->exception.injected = 0;
				3798	events->exception.pending = 0;
				3799	} else {
				3800	events->exception.injected = vcpu->arch.exception.injected;
				3801	events->exception.pending = vcpu->arch.exception.pending;
				3802	/*
				3803	* For ABI compatibility, deliberately conflate
				3804	* pending and injected exceptions when
				3805	* KVM_CAP_EXCEPTION_PAYLOAD isn't enabled.
				3806	*/
				3807	if (!vcpu->kvm->arch.exception_payload_enabled)
				3808	events->exception.injected \|=
				3809	vcpu->arch.exception.pending;
				3810	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3811	events->exception.nr = vcpu->arch.exception.nr;
				3812	events->exception.has_error_code = vcpu->arch.exception.has_error_code;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3813	events->exception.error_code = vcpu->arch.exception.error_code;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3814	events->exception_has_payload = vcpu->arch.exception.has_payload;
				3815	events->exception_payload = vcpu->arch.exception.payload;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3816
				3817	events->interrupt.injected =
				3818	vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
				3819	events->interrupt.nr = vcpu->arch.interrupt.nr;
				3820	events->interrupt.soft = 0;
				3821	events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
				3822
				3823	events->nmi.injected = vcpu->arch.nmi_injected;
				3824	events->nmi.pending = vcpu->arch.nmi_pending != 0;
				3825	events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
				3826	events->nmi.pad = 0;
				3827
				3828	events->sipi_vector = 0; /* never valid when reporting to user space */
				3829
				3830	events->smi.smm = is_smm(vcpu);
				3831	events->smi.pending = vcpu->arch.smi_pending;
				3832	events->smi.smm_inside_nmi =
				3833	!!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK);
				3834	events->smi.latched_init = kvm_lapic_latched_init(vcpu);
				3835
				3836	events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
				3837	\| KVM_VCPUEVENT_VALID_SHADOW
				3838	\| KVM_VCPUEVENT_VALID_SMM);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3839	if (vcpu->kvm->arch.exception_payload_enabled)
				3840	events->flags \|= KVM_VCPUEVENT_VALID_PAYLOAD;
				3841
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3842	memset(&events->reserved, 0, sizeof(events->reserved));
				3843	}
				3844
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3845	static void kvm_smm_changed(struct kvm_vcpu *vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3846
				3847	static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
				3848	struct kvm_vcpu_events *events)
				3849	{
				3850	if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
				3851	\| KVM_VCPUEVENT_VALID_SIPI_VECTOR
				3852	\| KVM_VCPUEVENT_VALID_SHADOW
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3853	\| KVM_VCPUEVENT_VALID_SMM
				3854	\| KVM_VCPUEVENT_VALID_PAYLOAD))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3855	return -EINVAL;
				3856
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3857	if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) {
				3858	if (!vcpu->kvm->arch.exception_payload_enabled)
				3859	return -EINVAL;
				3860	if (events->exception.pending)
				3861	events->exception.injected = 0;
				3862	else
				3863	events->exception_has_payload = 0;
				3864	} else {
				3865	events->exception.pending = 0;
				3866	events->exception_has_payload = 0;
				3867	}
				3868
				3869	if ((events->exception.injected \|\| events->exception.pending) &&
				3870	(events->exception.nr > 31 \|\| events->exception.nr == NMI_VECTOR))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3871	return -EINVAL;
				3872
				3873	/* INITs are latched while in SMM */
				3874	if (events->flags & KVM_VCPUEVENT_VALID_SMM &&
				3875	(events->smi.smm \|\| events->smi.pending) &&
				3876	vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED)
				3877	return -EINVAL;
				3878
				3879	process_nmi(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3880	vcpu->arch.exception.injected = events->exception.injected;
				3881	vcpu->arch.exception.pending = events->exception.pending;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3882	vcpu->arch.exception.nr = events->exception.nr;
				3883	vcpu->arch.exception.has_error_code = events->exception.has_error_code;
				3884	vcpu->arch.exception.error_code = events->exception.error_code;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3885	vcpu->arch.exception.has_payload = events->exception_has_payload;
				3886	vcpu->arch.exception.payload = events->exception_payload;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3887
				3888	vcpu->arch.interrupt.injected = events->interrupt.injected;
				3889	vcpu->arch.interrupt.nr = events->interrupt.nr;
				3890	vcpu->arch.interrupt.soft = events->interrupt.soft;
				3891	if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
				3892	kvm_x86_ops->set_interrupt_shadow(vcpu,
				3893	events->interrupt.shadow);
				3894
				3895	vcpu->arch.nmi_injected = events->nmi.injected;
				3896	if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
				3897	vcpu->arch.nmi_pending = events->nmi.pending;
				3898	kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
				3899
				3900	if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
				3901	lapic_in_kernel(vcpu))
				3902	vcpu->arch.apic->sipi_vector = events->sipi_vector;
				3903
				3904	if (events->flags & KVM_VCPUEVENT_VALID_SMM) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3905	if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) {
				3906	if (events->smi.smm)
				3907	vcpu->arch.hflags \|= HF_SMM_MASK;
				3908	else
				3909	vcpu->arch.hflags &= ~HF_SMM_MASK;
				3910	kvm_smm_changed(vcpu);
				3911	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3912
				3913	vcpu->arch.smi_pending = events->smi.pending;
				3914
				3915	if (events->smi.smm) {
				3916	if (events->smi.smm_inside_nmi)
				3917	vcpu->arch.hflags \|= HF_SMM_INSIDE_NMI_MASK;
				3918	else
				3919	vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
				3920	if (lapic_in_kernel(vcpu)) {
				3921	if (events->smi.latched_init)
				3922	set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
				3923	else
				3924	clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
				3925	}
				3926	}
				3927	}
				3928
				3929	kvm_make_request(KVM_REQ_EVENT, vcpu);
				3930
				3931	return 0;
				3932	}
				3933
				3934	static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
				3935	struct kvm_debugregs *dbgregs)
				3936	{
				3937	unsigned long val;
				3938
				3939	memcpy(dbgregs->db, vcpu->arch.db, sizeof(vcpu->arch.db));
				3940	kvm_get_dr(vcpu, 6, &val);
				3941	dbgregs->dr6 = val;
				3942	dbgregs->dr7 = vcpu->arch.dr7;
				3943	dbgregs->flags = 0;
				3944	memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
				3945	}
				3946
				3947	static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
				3948	struct kvm_debugregs *dbgregs)
				3949	{
				3950	if (dbgregs->flags)
				3951	return -EINVAL;
				3952
				3953	if (dbgregs->dr6 & ~0xffffffffull)
				3954	return -EINVAL;
				3955	if (dbgregs->dr7 & ~0xffffffffull)
				3956	return -EINVAL;
				3957
				3958	memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
				3959	kvm_update_dr0123(vcpu);
				3960	vcpu->arch.dr6 = dbgregs->dr6;
				3961	kvm_update_dr6(vcpu);
				3962	vcpu->arch.dr7 = dbgregs->dr7;
				3963	kvm_update_dr7(vcpu);
				3964
				3965	return 0;
				3966	}
				3967
				3968	#define XSTATE_COMPACTION_ENABLED (1ULL << 63)
				3969
				3970	static void fill_xsave(u8 dest, struct kvm_vcpu vcpu)
				3971	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3972	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3973	u64 xstate_bv = xsave->header.xfeatures;
				3974	u64 valid;
				3975
				3976	/*
				3977	* Copy legacy XSAVE area, to avoid complications with CPUID
				3978	* leaves 0 and 1 in the loop below.
				3979	*/
				3980	memcpy(dest, xsave, XSAVE_HDR_OFFSET);
				3981
				3982	/* Set XSTATE_BV */
				3983	xstate_bv &= vcpu->arch.guest_supported_xcr0 \| XFEATURE_MASK_FPSSE;
				3984	(u64 )(dest + XSAVE_HDR_OFFSET) = xstate_bv;
				3985
				3986	/*
				3987	* Copy each region from the possibly compacted offset to the
				3988	* non-compacted offset.
				3989	*/
				3990	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
				3991	while (valid) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3992	u64 xfeature_mask = valid & -valid;
				3993	int xfeature_nr = fls64(xfeature_mask) - 1;
				3994	void *src = get_xsave_addr(xsave, xfeature_nr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3995
				3996	if (src) {
				3997	u32 size, offset, ecx, edx;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	3998	cpuid_count(XSTATE_CPUID, xfeature_nr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	3999	&size, &offset, &ecx, &edx);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4000	if (xfeature_nr == XFEATURE_PKRU)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4001	memcpy(dest + offset, &vcpu->arch.pkru,
				4002	sizeof(vcpu->arch.pkru));
				4003	else
				4004	memcpy(dest + offset, src, size);
				4005
				4006	}
				4007
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4008	valid -= xfeature_mask;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4009	}
				4010	}
				4011
				4012	static void load_xsave(struct kvm_vcpu vcpu, u8 src)
				4013	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4014	struct xregs_state *xsave = &vcpu->arch.guest_fpu->state.xsave;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4015	u64 xstate_bv = (u64 )(src + XSAVE_HDR_OFFSET);
				4016	u64 valid;
				4017
				4018	/*
				4019	* Copy legacy XSAVE area, to avoid complications with CPUID
				4020	* leaves 0 and 1 in the loop below.
				4021	*/
				4022	memcpy(xsave, src, XSAVE_HDR_OFFSET);
				4023
				4024	/* Set XSTATE_BV and possibly XCOMP_BV. */
				4025	xsave->header.xfeatures = xstate_bv;
				4026	if (boot_cpu_has(X86_FEATURE_XSAVES))
				4027	xsave->header.xcomp_bv = host_xcr0 \| XSTATE_COMPACTION_ENABLED;
				4028
				4029	/*
				4030	* Copy each region from the non-compacted offset to the
				4031	* possibly compacted offset.
				4032	*/
				4033	valid = xstate_bv & ~XFEATURE_MASK_FPSSE;
				4034	while (valid) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4035	u64 xfeature_mask = valid & -valid;
				4036	int xfeature_nr = fls64(xfeature_mask) - 1;
				4037	void *dest = get_xsave_addr(xsave, xfeature_nr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4038
				4039	if (dest) {
				4040	u32 size, offset, ecx, edx;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4041	cpuid_count(XSTATE_CPUID, xfeature_nr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4042	&size, &offset, &ecx, &edx);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4043	if (xfeature_nr == XFEATURE_PKRU)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4044	memcpy(&vcpu->arch.pkru, src + offset,
				4045	sizeof(vcpu->arch.pkru));
				4046	else
				4047	memcpy(dest, src + offset, size);
				4048	}
				4049
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4050	valid -= xfeature_mask;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4051	}
				4052	}
				4053
				4054	static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu,
				4055	struct kvm_xsave *guest_xsave)
				4056	{
				4057	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
				4058	memset(guest_xsave, 0, sizeof(struct kvm_xsave));
				4059	fill_xsave((u8 *) guest_xsave->region, vcpu);
				4060	} else {
				4061	memcpy(guest_xsave->region,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4062	&vcpu->arch.guest_fpu->state.fxsave,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4063	sizeof(struct fxregs_state));
				4064	(u64 )&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)] =
				4065	XFEATURE_MASK_FPSSE;
				4066	}
				4067	}
				4068
				4069	#define XSAVE_MXCSR_OFFSET 24
				4070
				4071	static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu,
				4072	struct kvm_xsave *guest_xsave)
				4073	{
				4074	u64 xstate_bv =
				4075	(u64 )&guest_xsave->region[XSAVE_HDR_OFFSET / sizeof(u32)];
				4076	u32 mxcsr = (u32 )&guest_xsave->region[XSAVE_MXCSR_OFFSET / sizeof(u32)];
				4077
				4078	if (boot_cpu_has(X86_FEATURE_XSAVE)) {
				4079	/*
				4080	* Here we allow setting states that are not present in
				4081	* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
				4082	* with old userspace.
				4083	*/
				4084	if (xstate_bv & ~kvm_supported_xcr0() \|\|
				4085	mxcsr & ~mxcsr_feature_mask)
				4086	return -EINVAL;
				4087	load_xsave(vcpu, (u8 *)guest_xsave->region);
				4088	} else {
				4089	if (xstate_bv & ~XFEATURE_MASK_FPSSE \|\|
				4090	mxcsr & ~mxcsr_feature_mask)
				4091	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4092	memcpy(&vcpu->arch.guest_fpu->state.fxsave,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4093	guest_xsave->region, sizeof(struct fxregs_state));
				4094	}
				4095	return 0;
				4096	}
				4097
				4098	static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu,
				4099	struct kvm_xcrs *guest_xcrs)
				4100	{
				4101	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
				4102	guest_xcrs->nr_xcrs = 0;
				4103	return;
				4104	}
				4105
				4106	guest_xcrs->nr_xcrs = 1;
				4107	guest_xcrs->flags = 0;
				4108	guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK;
				4109	guest_xcrs->xcrs[0].value = vcpu->arch.xcr0;
				4110	}
				4111
				4112	static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu,
				4113	struct kvm_xcrs *guest_xcrs)
				4114	{
				4115	int i, r = 0;
				4116
				4117	if (!boot_cpu_has(X86_FEATURE_XSAVE))
				4118	return -EINVAL;
				4119
				4120	if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS \|\| guest_xcrs->flags)
				4121	return -EINVAL;
				4122
				4123	for (i = 0; i < guest_xcrs->nr_xcrs; i++)
				4124	/* Only support XCR0 currently */
				4125	if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) {
				4126	r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
				4127	guest_xcrs->xcrs[i].value);
				4128	break;
				4129	}
				4130	if (r)
				4131	r = -EINVAL;
				4132	return r;
				4133	}
				4134
				4135	/*
				4136	* kvm_set_guest_paused() indicates to the guest kernel that it has been
				4137	* stopped by the hypervisor. This function will be called from the host only.
				4138	* EINVAL is returned when the host attempts to set the flag for a guest that
				4139	* does not support pv clocks.
				4140	*/
				4141	static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
				4142	{
				4143	if (!vcpu->arch.pv_time_enabled)
				4144	return -EINVAL;
				4145	vcpu->arch.pvclock_set_guest_stopped_request = true;
				4146	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				4147	return 0;
				4148	}
				4149
				4150	static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
				4151	struct kvm_enable_cap *cap)
				4152	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4153	int r;
				4154	uint16_t vmcs_version;
				4155	void __user *user_ptr;
				4156
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4157	if (cap->flags)
				4158	return -EINVAL;
				4159
				4160	switch (cap->cap) {
				4161	case KVM_CAP_HYPERV_SYNIC2:
				4162	if (cap->args[0])
				4163	return -EINVAL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4164	/* fall through */
				4165
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4166	case KVM_CAP_HYPERV_SYNIC:
				4167	if (!irqchip_in_kernel(vcpu->kvm))
				4168	return -EINVAL;
				4169	return kvm_hv_activate_synic(vcpu, cap->cap ==
				4170	KVM_CAP_HYPERV_SYNIC2);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4171	case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
				4172	if (!kvm_x86_ops->nested_enable_evmcs)
				4173	return -ENOTTY;
				4174	r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
				4175	if (!r) {
				4176	user_ptr = (void __user *)(uintptr_t)cap->args[0];
				4177	if (copy_to_user(user_ptr, &vmcs_version,
				4178	sizeof(vmcs_version)))
				4179	r = -EFAULT;
				4180	}
				4181	return r;
				4182	case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
				4183	if (!kvm_x86_ops->enable_direct_tlbflush)
				4184	return -ENOTTY;
				4185
				4186	return kvm_x86_ops->enable_direct_tlbflush(vcpu);
				4187
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4188	default:
				4189	return -EINVAL;
				4190	}
				4191	}
				4192
				4193	long kvm_arch_vcpu_ioctl(struct file *filp,
				4194	unsigned int ioctl, unsigned long arg)
				4195	{
				4196	struct kvm_vcpu *vcpu = filp->private_data;
				4197	void __user argp = (void __user )arg;
				4198	int r;
				4199	union {
				4200	struct kvm_lapic_state *lapic;
				4201	struct kvm_xsave *xsave;
				4202	struct kvm_xcrs *xcrs;
				4203	void *buffer;
				4204	} u;
				4205
				4206	vcpu_load(vcpu);
				4207
				4208	u.buffer = NULL;
				4209	switch (ioctl) {
				4210	case KVM_GET_LAPIC: {
				4211	r = -EINVAL;
				4212	if (!lapic_in_kernel(vcpu))
				4213	goto out;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4214	u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
				4215	GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4216
				4217	r = -ENOMEM;
				4218	if (!u.lapic)
				4219	goto out;
				4220	r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
				4221	if (r)
				4222	goto out;
				4223	r = -EFAULT;
				4224	if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
				4225	goto out;
				4226	r = 0;
				4227	break;
				4228	}
				4229	case KVM_SET_LAPIC: {
				4230	r = -EINVAL;
				4231	if (!lapic_in_kernel(vcpu))
				4232	goto out;
				4233	u.lapic = memdup_user(argp, sizeof(*u.lapic));
				4234	if (IS_ERR(u.lapic)) {
				4235	r = PTR_ERR(u.lapic);
				4236	goto out_nofree;
				4237	}
				4238
				4239	r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
				4240	break;
				4241	}
				4242	case KVM_INTERRUPT: {
				4243	struct kvm_interrupt irq;
				4244
				4245	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4246	if (copy_from_user(&irq, argp, sizeof(irq)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4247	goto out;
				4248	r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
				4249	break;
				4250	}
				4251	case KVM_NMI: {
				4252	r = kvm_vcpu_ioctl_nmi(vcpu);
				4253	break;
				4254	}
				4255	case KVM_SMI: {
				4256	r = kvm_vcpu_ioctl_smi(vcpu);
				4257	break;
				4258	}
				4259	case KVM_SET_CPUID: {
				4260	struct kvm_cpuid __user *cpuid_arg = argp;
				4261	struct kvm_cpuid cpuid;
				4262
				4263	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4264	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4265	goto out;
				4266	r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
				4267	break;
				4268	}
				4269	case KVM_SET_CPUID2: {
				4270	struct kvm_cpuid2 __user *cpuid_arg = argp;
				4271	struct kvm_cpuid2 cpuid;
				4272
				4273	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4274	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4275	goto out;
				4276	r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
				4277	cpuid_arg->entries);
				4278	break;
				4279	}
				4280	case KVM_GET_CPUID2: {
				4281	struct kvm_cpuid2 __user *cpuid_arg = argp;
				4282	struct kvm_cpuid2 cpuid;
				4283
				4284	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4285	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4286	goto out;
				4287	r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
				4288	cpuid_arg->entries);
				4289	if (r)
				4290	goto out;
				4291	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4292	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4293	goto out;
				4294	r = 0;
				4295	break;
				4296	}
				4297	case KVM_GET_MSRS: {
				4298	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				4299	r = msr_io(vcpu, argp, do_get_msr, 1);
				4300	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				4301	break;
				4302	}
				4303	case KVM_SET_MSRS: {
				4304	int idx = srcu_read_lock(&vcpu->kvm->srcu);
				4305	r = msr_io(vcpu, argp, do_set_msr, 0);
				4306	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				4307	break;
				4308	}
				4309	case KVM_TPR_ACCESS_REPORTING: {
				4310	struct kvm_tpr_access_ctl tac;
				4311
				4312	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4313	if (copy_from_user(&tac, argp, sizeof(tac)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4314	goto out;
				4315	r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
				4316	if (r)
				4317	goto out;
				4318	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4319	if (copy_to_user(argp, &tac, sizeof(tac)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4320	goto out;
				4321	r = 0;
				4322	break;
				4323	};
				4324	case KVM_SET_VAPIC_ADDR: {
				4325	struct kvm_vapic_addr va;
				4326	int idx;
				4327
				4328	r = -EINVAL;
				4329	if (!lapic_in_kernel(vcpu))
				4330	goto out;
				4331	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4332	if (copy_from_user(&va, argp, sizeof(va)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4333	goto out;
				4334	idx = srcu_read_lock(&vcpu->kvm->srcu);
				4335	r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
				4336	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				4337	break;
				4338	}
				4339	case KVM_X86_SETUP_MCE: {
				4340	u64 mcg_cap;
				4341
				4342	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4343	if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4344	goto out;
				4345	r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap);
				4346	break;
				4347	}
				4348	case KVM_X86_SET_MCE: {
				4349	struct kvm_x86_mce mce;
				4350
				4351	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4352	if (copy_from_user(&mce, argp, sizeof(mce)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4353	goto out;
				4354	r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
				4355	break;
				4356	}
				4357	case KVM_GET_VCPU_EVENTS: {
				4358	struct kvm_vcpu_events events;
				4359
				4360	kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
				4361
				4362	r = -EFAULT;
				4363	if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
				4364	break;
				4365	r = 0;
				4366	break;
				4367	}
				4368	case KVM_SET_VCPU_EVENTS: {
				4369	struct kvm_vcpu_events events;
				4370
				4371	r = -EFAULT;
				4372	if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
				4373	break;
				4374
				4375	r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
				4376	break;
				4377	}
				4378	case KVM_GET_DEBUGREGS: {
				4379	struct kvm_debugregs dbgregs;
				4380
				4381	kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);
				4382
				4383	r = -EFAULT;
				4384	if (copy_to_user(argp, &dbgregs,
				4385	sizeof(struct kvm_debugregs)))
				4386	break;
				4387	r = 0;
				4388	break;
				4389	}
				4390	case KVM_SET_DEBUGREGS: {
				4391	struct kvm_debugregs dbgregs;
				4392
				4393	r = -EFAULT;
				4394	if (copy_from_user(&dbgregs, argp,
				4395	sizeof(struct kvm_debugregs)))
				4396	break;
				4397
				4398	r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
				4399	break;
				4400	}
				4401	case KVM_GET_XSAVE: {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4402	u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4403	r = -ENOMEM;
				4404	if (!u.xsave)
				4405	break;
				4406
				4407	kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);
				4408
				4409	r = -EFAULT;
				4410	if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
				4411	break;
				4412	r = 0;
				4413	break;
				4414	}
				4415	case KVM_SET_XSAVE: {
				4416	u.xsave = memdup_user(argp, sizeof(*u.xsave));
				4417	if (IS_ERR(u.xsave)) {
				4418	r = PTR_ERR(u.xsave);
				4419	goto out_nofree;
				4420	}
				4421
				4422	r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
				4423	break;
				4424	}
				4425	case KVM_GET_XCRS: {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4426	u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4427	r = -ENOMEM;
				4428	if (!u.xcrs)
				4429	break;
				4430
				4431	kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);
				4432
				4433	r = -EFAULT;
				4434	if (copy_to_user(argp, u.xcrs,
				4435	sizeof(struct kvm_xcrs)))
				4436	break;
				4437	r = 0;
				4438	break;
				4439	}
				4440	case KVM_SET_XCRS: {
				4441	u.xcrs = memdup_user(argp, sizeof(*u.xcrs));
				4442	if (IS_ERR(u.xcrs)) {
				4443	r = PTR_ERR(u.xcrs);
				4444	goto out_nofree;
				4445	}
				4446
				4447	r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
				4448	break;
				4449	}
				4450	case KVM_SET_TSC_KHZ: {
				4451	u32 user_tsc_khz;
				4452
				4453	r = -EINVAL;
				4454	user_tsc_khz = (u32)arg;
				4455
				4456	if (user_tsc_khz >= kvm_max_guest_tsc_khz)
				4457	goto out;
				4458
				4459	if (user_tsc_khz == 0)
				4460	user_tsc_khz = tsc_khz;
				4461
				4462	if (!kvm_set_tsc_khz(vcpu, user_tsc_khz))
				4463	r = 0;
				4464
				4465	goto out;
				4466	}
				4467	case KVM_GET_TSC_KHZ: {
				4468	r = vcpu->arch.virtual_tsc_khz;
				4469	goto out;
				4470	}
				4471	case KVM_KVMCLOCK_CTRL: {
				4472	r = kvm_set_guest_paused(vcpu);
				4473	goto out;
				4474	}
				4475	case KVM_ENABLE_CAP: {
				4476	struct kvm_enable_cap cap;
				4477
				4478	r = -EFAULT;
				4479	if (copy_from_user(&cap, argp, sizeof(cap)))
				4480	goto out;
				4481	r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
				4482	break;
				4483	}
				4484	case KVM_GET_NESTED_STATE: {
				4485	struct kvm_nested_state __user *user_kvm_nested_state = argp;
				4486	u32 user_data_size;
				4487
				4488	r = -EINVAL;
				4489	if (!kvm_x86_ops->get_nested_state)
				4490	break;
				4491
				4492	BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
				4493	r = -EFAULT;
				4494	if (get_user(user_data_size, &user_kvm_nested_state->size))
				4495	break;
				4496
				4497	r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
				4498	user_data_size);
				4499	if (r < 0)
				4500	break;
				4501
				4502	if (r > user_data_size) {
				4503	if (put_user(r, &user_kvm_nested_state->size))
				4504	r = -EFAULT;
				4505	else
				4506	r = -E2BIG;
				4507	break;
				4508	}
				4509
				4510	r = 0;
				4511	break;
				4512	}
				4513	case KVM_SET_NESTED_STATE: {
				4514	struct kvm_nested_state __user *user_kvm_nested_state = argp;
				4515	struct kvm_nested_state kvm_state;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4516	int idx;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4517
				4518	r = -EINVAL;
				4519	if (!kvm_x86_ops->set_nested_state)
				4520	break;
				4521
				4522	r = -EFAULT;
				4523	if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
				4524	break;
				4525
				4526	r = -EINVAL;
				4527	if (kvm_state.size < sizeof(kvm_state))
				4528	break;
				4529
				4530	if (kvm_state.flags &
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4531	~(KVM_STATE_NESTED_RUN_PENDING \| KVM_STATE_NESTED_GUEST_MODE
				4532	\| KVM_STATE_NESTED_EVMCS))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4533	break;
				4534
				4535	/* nested_run_pending implies guest_mode. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4536	if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING)
				4537	&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4538	break;
				4539
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4540	idx = srcu_read_lock(&vcpu->kvm->srcu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4541	r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	4542	srcu_read_unlock(&vcpu->kvm->srcu, idx);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4543	break;
				4544	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4545	case KVM_GET_SUPPORTED_HV_CPUID: {
				4546	struct kvm_cpuid2 __user *cpuid_arg = argp;
				4547	struct kvm_cpuid2 cpuid;
				4548
				4549	r = -EFAULT;
				4550	if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
				4551	goto out;
				4552
				4553	r = kvm_vcpu_ioctl_get_hv_cpuid(vcpu, &cpuid,
				4554	cpuid_arg->entries);
				4555	if (r)
				4556	goto out;
				4557
				4558	r = -EFAULT;
				4559	if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
				4560	goto out;
				4561	r = 0;
				4562	break;
				4563	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4564	default:
				4565	r = -EINVAL;
				4566	}
				4567	out:
				4568	kfree(u.buffer);
				4569	out_nofree:
				4570	vcpu_put(vcpu);
				4571	return r;
				4572	}
				4573
				4574	vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu vcpu, struct vm_fault vmf)
				4575	{
				4576	return VM_FAULT_SIGBUS;
				4577	}
				4578
				4579	static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr)
				4580	{
				4581	int ret;
				4582
				4583	if (addr > (unsigned int)(-3 * PAGE_SIZE))
				4584	return -EINVAL;
				4585	ret = kvm_x86_ops->set_tss_addr(kvm, addr);
				4586	return ret;
				4587	}
				4588
				4589	static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
				4590	u64 ident_addr)
				4591	{
				4592	return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
				4593	}
				4594
				4595	static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4596	unsigned long kvm_nr_mmu_pages)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4597	{
				4598	if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
				4599	return -EINVAL;
				4600
				4601	mutex_lock(&kvm->slots_lock);
				4602
				4603	kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
				4604	kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
				4605
				4606	mutex_unlock(&kvm->slots_lock);
				4607	return 0;
				4608	}
				4609
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4610	static unsigned long kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4611	{
				4612	return kvm->arch.n_max_mmu_pages;
				4613	}
				4614
				4615	static int kvm_vm_ioctl_get_irqchip(struct kvm kvm, struct kvm_irqchip chip)
				4616	{
				4617	struct kvm_pic *pic = kvm->arch.vpic;
				4618	int r;
				4619
				4620	r = 0;
				4621	switch (chip->chip_id) {
				4622	case KVM_IRQCHIP_PIC_MASTER:
				4623	memcpy(&chip->chip.pic, &pic->pics[0],
				4624	sizeof(struct kvm_pic_state));
				4625	break;
				4626	case KVM_IRQCHIP_PIC_SLAVE:
				4627	memcpy(&chip->chip.pic, &pic->pics[1],
				4628	sizeof(struct kvm_pic_state));
				4629	break;
				4630	case KVM_IRQCHIP_IOAPIC:
				4631	kvm_get_ioapic(kvm, &chip->chip.ioapic);
				4632	break;
				4633	default:
				4634	r = -EINVAL;
				4635	break;
				4636	}
				4637	return r;
				4638	}
				4639
				4640	static int kvm_vm_ioctl_set_irqchip(struct kvm kvm, struct kvm_irqchip chip)
				4641	{
				4642	struct kvm_pic *pic = kvm->arch.vpic;
				4643	int r;
				4644
				4645	r = 0;
				4646	switch (chip->chip_id) {
				4647	case KVM_IRQCHIP_PIC_MASTER:
				4648	spin_lock(&pic->lock);
				4649	memcpy(&pic->pics[0], &chip->chip.pic,
				4650	sizeof(struct kvm_pic_state));
				4651	spin_unlock(&pic->lock);
				4652	break;
				4653	case KVM_IRQCHIP_PIC_SLAVE:
				4654	spin_lock(&pic->lock);
				4655	memcpy(&pic->pics[1], &chip->chip.pic,
				4656	sizeof(struct kvm_pic_state));
				4657	spin_unlock(&pic->lock);
				4658	break;
				4659	case KVM_IRQCHIP_IOAPIC:
				4660	kvm_set_ioapic(kvm, &chip->chip.ioapic);
				4661	break;
				4662	default:
				4663	r = -EINVAL;
				4664	break;
				4665	}
				4666	kvm_pic_update_irq(pic);
				4667	return r;
				4668	}
				4669
				4670	static int kvm_vm_ioctl_get_pit(struct kvm kvm, struct kvm_pit_state ps)
				4671	{
				4672	struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state;
				4673
				4674	BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels));
				4675
				4676	mutex_lock(&kps->lock);
				4677	memcpy(ps, &kps->channels, sizeof(*ps));
				4678	mutex_unlock(&kps->lock);
				4679	return 0;
				4680	}
				4681
				4682	static int kvm_vm_ioctl_set_pit(struct kvm kvm, struct kvm_pit_state ps)
				4683	{
				4684	int i;
				4685	struct kvm_pit *pit = kvm->arch.vpit;
				4686
				4687	mutex_lock(&pit->pit_state.lock);
				4688	memcpy(&pit->pit_state.channels, ps, sizeof(*ps));
				4689	for (i = 0; i < 3; i++)
				4690	kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
				4691	mutex_unlock(&pit->pit_state.lock);
				4692	return 0;
				4693	}
				4694
				4695	static int kvm_vm_ioctl_get_pit2(struct kvm kvm, struct kvm_pit_state2 ps)
				4696	{
				4697	mutex_lock(&kvm->arch.vpit->pit_state.lock);
				4698	memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels,
				4699	sizeof(ps->channels));
				4700	ps->flags = kvm->arch.vpit->pit_state.flags;
				4701	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
				4702	memset(&ps->reserved, 0, sizeof(ps->reserved));
				4703	return 0;
				4704	}
				4705
				4706	static int kvm_vm_ioctl_set_pit2(struct kvm kvm, struct kvm_pit_state2 ps)
				4707	{
				4708	int start = 0;
				4709	int i;
				4710	u32 prev_legacy, cur_legacy;
				4711	struct kvm_pit *pit = kvm->arch.vpit;
				4712
				4713	mutex_lock(&pit->pit_state.lock);
				4714	prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY;
				4715	cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY;
				4716	if (!prev_legacy && cur_legacy)
				4717	start = 1;
				4718	memcpy(&pit->pit_state.channels, &ps->channels,
				4719	sizeof(pit->pit_state.channels));
				4720	pit->pit_state.flags = ps->flags;
				4721	for (i = 0; i < 3; i++)
				4722	kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
				4723	start && i == 0);
				4724	mutex_unlock(&pit->pit_state.lock);
				4725	return 0;
				4726	}
				4727
				4728	static int kvm_vm_ioctl_reinject(struct kvm *kvm,
				4729	struct kvm_reinject_control *control)
				4730	{
				4731	struct kvm_pit *pit = kvm->arch.vpit;
				4732
				4733	if (!pit)
				4734	return -ENXIO;
				4735
				4736	/* pit->pit_state.lock was overloaded to prevent userspace from getting
				4737	* an inconsistent state after running multiple KVM_REINJECT_CONTROL
				4738	* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
				4739	*/
				4740	mutex_lock(&pit->pit_state.lock);
				4741	kvm_pit_set_reinject(pit, control->pit_reinject);
				4742	mutex_unlock(&pit->pit_state.lock);
				4743
				4744	return 0;
				4745	}
				4746
				4747	/**
				4748	* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
				4749	* @kvm: kvm instance
				4750	* @log: slot id and address to which we copy the log
				4751	*
				4752	* Steps 1-4 below provide general overview of dirty page logging. See
				4753	* kvm_get_dirty_log_protect() function description for additional details.
				4754	*
				4755	* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
				4756	* always flush the TLB (step 4) even if previous step failed and the dirty
				4757	* bitmap may be corrupt. Regardless of previous outcome the KVM logging API
				4758	* does not preclude user space subsequent dirty log read. Flushing TLB ensures
				4759	* writes will be marked dirty for next log read.
				4760	*
				4761	* 1. Take a snapshot of the bit and clear it if needed.
				4762	* 2. Write protect the corresponding page.
				4763	* 3. Copy the snapshot to the userspace.
				4764	* 4. Flush TLB's if needed.
				4765	*/
				4766	int kvm_vm_ioctl_get_dirty_log(struct kvm kvm, struct kvm_dirty_log log)
				4767	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4768	bool flush = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4769	int r;
				4770
				4771	mutex_lock(&kvm->slots_lock);
				4772
				4773	/*
				4774	* Flush potentially hardware-cached dirty pages to dirty_bitmap.
				4775	*/
				4776	if (kvm_x86_ops->flush_log_dirty)
				4777	kvm_x86_ops->flush_log_dirty(kvm);
				4778
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4779	r = kvm_get_dirty_log_protect(kvm, log, &flush);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4780
				4781	/*
				4782	* All the TLBs can be flushed out of mmu lock, see the comments in
				4783	* kvm_mmu_slot_remove_write_access().
				4784	*/
				4785	lockdep_assert_held(&kvm->slots_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4786	if (flush)
				4787	kvm_flush_remote_tlbs(kvm);
				4788
				4789	mutex_unlock(&kvm->slots_lock);
				4790	return r;
				4791	}
				4792
				4793	int kvm_vm_ioctl_clear_dirty_log(struct kvm kvm, struct kvm_clear_dirty_log log)
				4794	{
				4795	bool flush = false;
				4796	int r;
				4797
				4798	mutex_lock(&kvm->slots_lock);
				4799
				4800	/*
				4801	* Flush potentially hardware-cached dirty pages to dirty_bitmap.
				4802	*/
				4803	if (kvm_x86_ops->flush_log_dirty)
				4804	kvm_x86_ops->flush_log_dirty(kvm);
				4805
				4806	r = kvm_clear_dirty_log_protect(kvm, log, &flush);
				4807
				4808	/*
				4809	* All the TLBs can be flushed out of mmu lock, see the comments in
				4810	* kvm_mmu_slot_remove_write_access().
				4811	*/
				4812	lockdep_assert_held(&kvm->slots_lock);
				4813	if (flush)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4814	kvm_flush_remote_tlbs(kvm);
				4815
				4816	mutex_unlock(&kvm->slots_lock);
				4817	return r;
				4818	}
				4819
				4820	int kvm_vm_ioctl_irq_line(struct kvm kvm, struct kvm_irq_level irq_event,
				4821	bool line_status)
				4822	{
				4823	if (!irqchip_in_kernel(kvm))
				4824	return -ENXIO;
				4825
				4826	irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
				4827	irq_event->irq, irq_event->level,
				4828	line_status);
				4829	return 0;
				4830	}
				4831
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4832	int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
				4833	struct kvm_enable_cap *cap)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4834	{
				4835	int r;
				4836
				4837	if (cap->flags)
				4838	return -EINVAL;
				4839
				4840	switch (cap->cap) {
				4841	case KVM_CAP_DISABLE_QUIRKS:
				4842	kvm->arch.disabled_quirks = cap->args[0];
				4843	r = 0;
				4844	break;
				4845	case KVM_CAP_SPLIT_IRQCHIP: {
				4846	mutex_lock(&kvm->lock);
				4847	r = -EINVAL;
				4848	if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS)
				4849	goto split_irqchip_unlock;
				4850	r = -EEXIST;
				4851	if (irqchip_in_kernel(kvm))
				4852	goto split_irqchip_unlock;
				4853	if (kvm->created_vcpus)
				4854	goto split_irqchip_unlock;
				4855	r = kvm_setup_empty_irq_routing(kvm);
				4856	if (r)
				4857	goto split_irqchip_unlock;
				4858	/* Pairs with irqchip_in_kernel. */
				4859	smp_wmb();
				4860	kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT;
				4861	kvm->arch.nr_reserved_ioapic_pins = cap->args[0];
				4862	r = 0;
				4863	split_irqchip_unlock:
				4864	mutex_unlock(&kvm->lock);
				4865	break;
				4866	}
				4867	case KVM_CAP_X2APIC_API:
				4868	r = -EINVAL;
				4869	if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
				4870	break;
				4871
				4872	if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
				4873	kvm->arch.x2apic_format = true;
				4874	if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
				4875	kvm->arch.x2apic_broadcast_quirk_disabled = true;
				4876
				4877	r = 0;
				4878	break;
				4879	case KVM_CAP_X86_DISABLE_EXITS:
				4880	r = -EINVAL;
				4881	if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS)
				4882	break;
				4883
				4884	if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) &&
				4885	kvm_can_mwait_in_guest())
				4886	kvm->arch.mwait_in_guest = true;
				4887	if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT)
				4888	kvm->arch.hlt_in_guest = true;
				4889	if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE)
				4890	kvm->arch.pause_in_guest = true;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4891	if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE)
				4892	kvm->arch.cstate_in_guest = true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4893	r = 0;
				4894	break;
				4895	case KVM_CAP_MSR_PLATFORM_INFO:
				4896	kvm->arch.guest_can_read_msr_platform_info = cap->args[0];
				4897	r = 0;
				4898	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4899	case KVM_CAP_EXCEPTION_PAYLOAD:
				4900	kvm->arch.exception_payload_enabled = cap->args[0];
				4901	r = 0;
				4902	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4903	default:
				4904	r = -EINVAL;
				4905	break;
				4906	}
				4907	return r;
				4908	}
				4909
				4910	long kvm_arch_vm_ioctl(struct file *filp,
				4911	unsigned int ioctl, unsigned long arg)
				4912	{
				4913	struct kvm *kvm = filp->private_data;
				4914	void __user argp = (void __user )arg;
				4915	int r = -ENOTTY;
				4916	/*
				4917	* This union makes it completely explicit to gcc-3.x
				4918	* that these two variables' stack usage should be
				4919	* combined, not added together.
				4920	*/
				4921	union {
				4922	struct kvm_pit_state ps;
				4923	struct kvm_pit_state2 ps2;
				4924	struct kvm_pit_config pit_config;
				4925	} u;
				4926
				4927	switch (ioctl) {
				4928	case KVM_SET_TSS_ADDR:
				4929	r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
				4930	break;
				4931	case KVM_SET_IDENTITY_MAP_ADDR: {
				4932	u64 ident_addr;
				4933
				4934	mutex_lock(&kvm->lock);
				4935	r = -EINVAL;
				4936	if (kvm->created_vcpus)
				4937	goto set_identity_unlock;
				4938	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	4939	if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	4940	goto set_identity_unlock;
				4941	r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr);
				4942	set_identity_unlock:
				4943	mutex_unlock(&kvm->lock);
				4944	break;
				4945	}
				4946	case KVM_SET_NR_MMU_PAGES:
				4947	r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
				4948	break;
				4949	case KVM_GET_NR_MMU_PAGES:
				4950	r = kvm_vm_ioctl_get_nr_mmu_pages(kvm);
				4951	break;
				4952	case KVM_CREATE_IRQCHIP: {
				4953	mutex_lock(&kvm->lock);
				4954
				4955	r = -EEXIST;
				4956	if (irqchip_in_kernel(kvm))
				4957	goto create_irqchip_unlock;
				4958
				4959	r = -EINVAL;
				4960	if (kvm->created_vcpus)
				4961	goto create_irqchip_unlock;
				4962
				4963	r = kvm_pic_init(kvm);
				4964	if (r)
				4965	goto create_irqchip_unlock;
				4966
				4967	r = kvm_ioapic_init(kvm);
				4968	if (r) {
				4969	kvm_pic_destroy(kvm);
				4970	goto create_irqchip_unlock;
				4971	}
				4972
				4973	r = kvm_setup_default_irq_routing(kvm);
				4974	if (r) {
				4975	kvm_ioapic_destroy(kvm);
				4976	kvm_pic_destroy(kvm);
				4977	goto create_irqchip_unlock;
				4978	}
				4979	/* Write kvm->irq_routing before enabling irqchip_in_kernel. */
				4980	smp_wmb();
				4981	kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL;
				4982	create_irqchip_unlock:
				4983	mutex_unlock(&kvm->lock);
				4984	break;
				4985	}
				4986	case KVM_CREATE_PIT:
				4987	u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
				4988	goto create_pit;
				4989	case KVM_CREATE_PIT2:
				4990	r = -EFAULT;
				4991	if (copy_from_user(&u.pit_config, argp,
				4992	sizeof(struct kvm_pit_config)))
				4993	goto out;
				4994	create_pit:
				4995	mutex_lock(&kvm->lock);
				4996	r = -EEXIST;
				4997	if (kvm->arch.vpit)
				4998	goto create_pit_unlock;
				4999	r = -ENOMEM;
				5000	kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
				5001	if (kvm->arch.vpit)
				5002	r = 0;
				5003	create_pit_unlock:
				5004	mutex_unlock(&kvm->lock);
				5005	break;
				5006	case KVM_GET_IRQCHIP: {
				5007	/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
				5008	struct kvm_irqchip *chip;
				5009
				5010	chip = memdup_user(argp, sizeof(*chip));
				5011	if (IS_ERR(chip)) {
				5012	r = PTR_ERR(chip);
				5013	goto out;
				5014	}
				5015
				5016	r = -ENXIO;
				5017	if (!irqchip_kernel(kvm))
				5018	goto get_irqchip_out;
				5019	r = kvm_vm_ioctl_get_irqchip(kvm, chip);
				5020	if (r)
				5021	goto get_irqchip_out;
				5022	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5023	if (copy_to_user(argp, chip, sizeof(*chip)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5024	goto get_irqchip_out;
				5025	r = 0;
				5026	get_irqchip_out:
				5027	kfree(chip);
				5028	break;
				5029	}
				5030	case KVM_SET_IRQCHIP: {
				5031	/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
				5032	struct kvm_irqchip *chip;
				5033
				5034	chip = memdup_user(argp, sizeof(*chip));
				5035	if (IS_ERR(chip)) {
				5036	r = PTR_ERR(chip);
				5037	goto out;
				5038	}
				5039
				5040	r = -ENXIO;
				5041	if (!irqchip_kernel(kvm))
				5042	goto set_irqchip_out;
				5043	r = kvm_vm_ioctl_set_irqchip(kvm, chip);
				5044	if (r)
				5045	goto set_irqchip_out;
				5046	r = 0;
				5047	set_irqchip_out:
				5048	kfree(chip);
				5049	break;
				5050	}
				5051	case KVM_GET_PIT: {
				5052	r = -EFAULT;
				5053	if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
				5054	goto out;
				5055	r = -ENXIO;
				5056	if (!kvm->arch.vpit)
				5057	goto out;
				5058	r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
				5059	if (r)
				5060	goto out;
				5061	r = -EFAULT;
				5062	if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
				5063	goto out;
				5064	r = 0;
				5065	break;
				5066	}
				5067	case KVM_SET_PIT: {
				5068	r = -EFAULT;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5069	if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5070	goto out;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5071	mutex_lock(&kvm->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5072	r = -ENXIO;
				5073	if (!kvm->arch.vpit)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5074	goto set_pit_out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5075	r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5076	set_pit_out:
				5077	mutex_unlock(&kvm->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5078	break;
				5079	}
				5080	case KVM_GET_PIT2: {
				5081	r = -ENXIO;
				5082	if (!kvm->arch.vpit)
				5083	goto out;
				5084	r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
				5085	if (r)
				5086	goto out;
				5087	r = -EFAULT;
				5088	if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
				5089	goto out;
				5090	r = 0;
				5091	break;
				5092	}
				5093	case KVM_SET_PIT2: {
				5094	r = -EFAULT;
				5095	if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
				5096	goto out;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5097	mutex_lock(&kvm->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5098	r = -ENXIO;
				5099	if (!kvm->arch.vpit)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5100	goto set_pit2_out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5101	r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5102	set_pit2_out:
				5103	mutex_unlock(&kvm->lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5104	break;
				5105	}
				5106	case KVM_REINJECT_CONTROL: {
				5107	struct kvm_reinject_control control;
				5108	r = -EFAULT;
				5109	if (copy_from_user(&control, argp, sizeof(control)))
				5110	goto out;
				5111	r = kvm_vm_ioctl_reinject(kvm, &control);
				5112	break;
				5113	}
				5114	case KVM_SET_BOOT_CPU_ID:
				5115	r = 0;
				5116	mutex_lock(&kvm->lock);
				5117	if (kvm->created_vcpus)
				5118	r = -EBUSY;
				5119	else
				5120	kvm->arch.bsp_vcpu_id = arg;
				5121	mutex_unlock(&kvm->lock);
				5122	break;
				5123	case KVM_XEN_HVM_CONFIG: {
				5124	struct kvm_xen_hvm_config xhc;
				5125	r = -EFAULT;
				5126	if (copy_from_user(&xhc, argp, sizeof(xhc)))
				5127	goto out;
				5128	r = -EINVAL;
				5129	if (xhc.flags)
				5130	goto out;
				5131	memcpy(&kvm->arch.xen_hvm_config, &xhc, sizeof(xhc));
				5132	r = 0;
				5133	break;
				5134	}
				5135	case KVM_SET_CLOCK: {
				5136	struct kvm_clock_data user_ns;
				5137	u64 now_ns;
				5138
				5139	r = -EFAULT;
				5140	if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
				5141	goto out;
				5142
				5143	r = -EINVAL;
				5144	if (user_ns.flags)
				5145	goto out;
				5146
				5147	r = 0;
				5148	/*
				5149	* TODO: userspace has to take care of races with VCPU_RUN, so
				5150	* kvm_gen_update_masterclock() can be cut down to locked
				5151	* pvclock_update_vm_gtod_copy().
				5152	*/
				5153	kvm_gen_update_masterclock(kvm);
				5154	now_ns = get_kvmclock_ns(kvm);
				5155	kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
				5156	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
				5157	break;
				5158	}
				5159	case KVM_GET_CLOCK: {
				5160	struct kvm_clock_data user_ns;
				5161	u64 now_ns;
				5162
				5163	now_ns = get_kvmclock_ns(kvm);
				5164	user_ns.clock = now_ns;
				5165	user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0;
				5166	memset(&user_ns.pad, 0, sizeof(user_ns.pad));
				5167
				5168	r = -EFAULT;
				5169	if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
				5170	goto out;
				5171	r = 0;
				5172	break;
				5173	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5174	case KVM_MEMORY_ENCRYPT_OP: {
				5175	r = -ENOTTY;
				5176	if (kvm_x86_ops->mem_enc_op)
				5177	r = kvm_x86_ops->mem_enc_op(kvm, argp);
				5178	break;
				5179	}
				5180	case KVM_MEMORY_ENCRYPT_REG_REGION: {
				5181	struct kvm_enc_region region;
				5182
				5183	r = -EFAULT;
				5184	if (copy_from_user(&region, argp, sizeof(region)))
				5185	goto out;
				5186
				5187	r = -ENOTTY;
				5188	if (kvm_x86_ops->mem_enc_reg_region)
				5189	r = kvm_x86_ops->mem_enc_reg_region(kvm, &region);
				5190	break;
				5191	}
				5192	case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
				5193	struct kvm_enc_region region;
				5194
				5195	r = -EFAULT;
				5196	if (copy_from_user(&region, argp, sizeof(region)))
				5197	goto out;
				5198
				5199	r = -ENOTTY;
				5200	if (kvm_x86_ops->mem_enc_unreg_region)
				5201	r = kvm_x86_ops->mem_enc_unreg_region(kvm, &region);
				5202	break;
				5203	}
				5204	case KVM_HYPERV_EVENTFD: {
				5205	struct kvm_hyperv_eventfd hvevfd;
				5206
				5207	r = -EFAULT;
				5208	if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
				5209	goto out;
				5210	r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
				5211	break;
				5212	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5213	case KVM_SET_PMU_EVENT_FILTER:
				5214	r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
				5215	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5216	default:
				5217	r = -ENOTTY;
				5218	}
				5219	out:
				5220	return r;
				5221	}
				5222
				5223	static void kvm_init_msr_list(void)
				5224	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5225	struct x86_pmu_capability x86_pmu;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5226	u32 dummy[2];
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5227	unsigned i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5228
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5229	BUILD_BUG_ON_MSG(INTEL_PMC_MAX_FIXED != 4,
				5230	"Please update the fixed PMCs in msrs_to_saved_all[]");
				5231
				5232	perf_get_x86_pmu_capability(&x86_pmu);
				5233
				5234	num_msrs_to_save = 0;
				5235	num_emulated_msrs = 0;
				5236	num_msr_based_features = 0;
				5237
				5238	for (i = 0; i < ARRAY_SIZE(msrs_to_save_all); i++) {
				5239	if (rdmsr_safe(msrs_to_save_all[i], &dummy[0], &dummy[1]) < 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5240	continue;
				5241
				5242	/*
				5243	* Even MSRs that are valid in the host may not be exposed
				5244	* to the guests in some cases.
				5245	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5246	switch (msrs_to_save_all[i]) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5247	case MSR_IA32_BNDCFGS:
				5248	if (!kvm_mpx_supported())
				5249	continue;
				5250	break;
				5251	case MSR_TSC_AUX:
				5252	if (!kvm_x86_ops->rdtscp_supported())
				5253	continue;
				5254	break;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	5255	case MSR_IA32_UMWAIT_CONTROL:
				5256	if (!boot_cpu_has(X86_FEATURE_WAITPKG))
				5257	continue;
				5258	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5259	case MSR_IA32_RTIT_CTL:
				5260	case MSR_IA32_RTIT_STATUS:
				5261	if (!kvm_x86_ops->pt_supported())
				5262	continue;
				5263	break;
				5264	case MSR_IA32_RTIT_CR3_MATCH:
				5265	if (!kvm_x86_ops->pt_supported() \|\|
				5266	!intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
				5267	continue;
				5268	break;
				5269	case MSR_IA32_RTIT_OUTPUT_BASE:
				5270	case MSR_IA32_RTIT_OUTPUT_MASK:
				5271	if (!kvm_x86_ops->pt_supported() \|\|
				5272	(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
				5273	!intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
				5274	continue;
				5275	break;
				5276	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
				5277	if (!kvm_x86_ops->pt_supported() \|\|
				5278	msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
				5279	intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
				5280	continue;
				5281	break;
				5282	case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR0 + 17:
				5283	if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_PERFCTR0 >=
				5284	min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
				5285	continue;
				5286	break;
				5287	case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL0 + 17:
				5288	if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
				5289	min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
				5290	continue;
				5291	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5292	default:
				5293	break;
				5294	}
				5295
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5296	msrs_to_save[num_msrs_to_save++] = msrs_to_save_all[i];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5297	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5298
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5299	for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
				5300	if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5301	continue;
				5302
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5303	emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5304	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5305
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5306	for (i = 0; i < ARRAY_SIZE(msr_based_features_all); i++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5307	struct kvm_msr_entry msr;
				5308
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5309	msr.index = msr_based_features_all[i];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5310	if (kvm_get_msr_feature(&msr))
				5311	continue;
				5312
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5313	msr_based_features[num_msr_based_features++] = msr_based_features_all[i];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5314	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5315	}
				5316
				5317	static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
				5318	const void *v)
				5319	{
				5320	int handled = 0;
				5321	int n;
				5322
				5323	do {
				5324	n = min(len, 8);
				5325	if (!(lapic_in_kernel(vcpu) &&
				5326	!kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
				5327	&& kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
				5328	break;
				5329	handled += n;
				5330	addr += n;
				5331	len -= n;
				5332	v += n;
				5333	} while (len);
				5334
				5335	return handled;
				5336	}
				5337
				5338	static int vcpu_mmio_read(struct kvm_vcpu vcpu, gpa_t addr, int len, void v)
				5339	{
				5340	int handled = 0;
				5341	int n;
				5342
				5343	do {
				5344	n = min(len, 8);
				5345	if (!(lapic_in_kernel(vcpu) &&
				5346	!kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
				5347	addr, n, v))
				5348	&& kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
				5349	break;
				5350	trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
				5351	handled += n;
				5352	addr += n;
				5353	len -= n;
				5354	v += n;
				5355	} while (len);
				5356
				5357	return handled;
				5358	}
				5359
/* Forward a segment update for @seg to the vendor set_segment() hook. */
static void kvm_set_segment(struct kvm_vcpu *vcpu,
			struct kvm_segment *var, int seg)
{
	kvm_x86_ops->set_segment(vcpu, var, seg);
}
				5365
/* Forward a segment query for @seg to the vendor get_segment() hook, which receives @var. */
void kvm_get_segment(struct kvm_vcpu *vcpu,
		     struct kvm_segment *var, int seg)
{
	kvm_x86_ops->get_segment(vcpu, var, seg);
}
				5371
				5372	gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
				5373	struct x86_exception *exception)
				5374	{
				5375	gpa_t t_gpa;
				5376
				5377	BUG_ON(!mmu_is_nested(vcpu));
				5378
				5379	/* NPT walks are always user-walks */
				5380	access \|= PFERR_USER_MASK;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5381	t_gpa = vcpu->arch.mmu->gva_to_gpa(vcpu, gpa, access, exception);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5382
				5383	return t_gpa;
				5384	}
				5385
				5386	gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
				5387	struct x86_exception *exception)
				5388	{
				5389	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				5390	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				5391	}
				5392
				5393	gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
				5394	struct x86_exception *exception)
				5395	{
				5396	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				5397	access \|= PFERR_FETCH_MASK;
				5398	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				5399	}
				5400
				5401	gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
				5402	struct x86_exception *exception)
				5403	{
				5404	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				5405	access \|= PFERR_WRITE_MASK;
				5406	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				5407	}
				5408
/*
 * Translate @gva with an access mask of 0, i.e. without the CPL-derived
 * user bit the read/fetch/write variants add.
 */
/* uses this to access any guest's mapped memory without checking CPL */
gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
				struct x86_exception *exception)
{
	return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
}
				5415
				5416	static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
				5417	struct kvm_vcpu *vcpu, u32 access,
				5418	struct x86_exception *exception)
				5419	{
				5420	void *data = val;
				5421	int r = X86EMUL_CONTINUE;
				5422
				5423	while (bytes) {
				5424	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
				5425	exception);
				5426	unsigned offset = addr & (PAGE_SIZE-1);
				5427	unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
				5428	int ret;
				5429
				5430	if (gpa == UNMAPPED_GVA)
				5431	return X86EMUL_PROPAGATE_FAULT;
				5432	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
				5433	offset, toread);
				5434	if (ret < 0) {
				5435	r = X86EMUL_IO_NEEDED;
				5436	goto out;
				5437	}
				5438
				5439	bytes -= toread;
				5440	data += toread;
				5441	addr += toread;
				5442	}
				5443	out:
				5444	return r;
				5445	}
				5446
				5447	/* used for instruction fetching */
				5448	static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
				5449	gva_t addr, void *val, unsigned int bytes,
				5450	struct x86_exception *exception)
				5451	{
				5452	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5453	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
				5454	unsigned offset;
				5455	int ret;
				5456
				5457	/* Inline kvm_read_guest_virt_helper for speed. */
				5458	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access\|PFERR_FETCH_MASK,
				5459	exception);
				5460	if (unlikely(gpa == UNMAPPED_GVA))
				5461	return X86EMUL_PROPAGATE_FAULT;
				5462
				5463	offset = addr & (PAGE_SIZE-1);
				5464	if (WARN_ON(offset + bytes > PAGE_SIZE))
				5465	bytes = (unsigned)PAGE_SIZE - offset;
				5466	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
				5467	offset, bytes);
				5468	if (unlikely(ret < 0))
				5469	return X86EMUL_IO_NEEDED;
				5470
				5471	return X86EMUL_CONTINUE;
				5472	}
				5473
/*
 * Read @bytes bytes from guest-virtual address @addr into @val via
 * kvm_read_guest_virt_helper().  PFERR_USER_MASK is requested only when the
 * current privilege level is 3.  @exception is zeroed before the read, so it
 * must point to valid storage.
 */
int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
			       gva_t addr, void *val, unsigned int bytes,
			       struct x86_exception *exception)
{
	u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;

	/*
	 * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
	 * is returned, but our callers are not ready for that and they blindly
	 * call kvm_inject_page_fault.  Ensure that they at least do not leak
	 * uninitialized kernel stack memory into cr2 and error code.
	 */
	memset(exception, 0, sizeof(*exception));
	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
					  exception);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_virt);
				5491
				5492	static int emulator_read_std(struct x86_emulate_ctxt *ctxt,
				5493	gva_t addr, void *val, unsigned int bytes,
				5494	struct x86_exception *exception, bool system)
				5495	{
				5496	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5497	u32 access = 0;
				5498
				5499	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
				5500	access \|= PFERR_USER_MASK;
				5501
				5502	return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
				5503	}
				5504
				5505	static int kvm_read_guest_phys_system(struct x86_emulate_ctxt *ctxt,
				5506	unsigned long addr, void *val, unsigned int bytes)
				5507	{
				5508	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5509	int r = kvm_vcpu_read_guest(vcpu, addr, val, bytes);
				5510
				5511	return r < 0 ? X86EMUL_IO_NEEDED : X86EMUL_CONTINUE;
				5512	}
				5513
				5514	static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
				5515	struct kvm_vcpu *vcpu, u32 access,
				5516	struct x86_exception *exception)
				5517	{
				5518	void *data = val;
				5519	int r = X86EMUL_CONTINUE;
				5520
				5521	while (bytes) {
				5522	gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
				5523	access,
				5524	exception);
				5525	unsigned offset = addr & (PAGE_SIZE-1);
				5526	unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
				5527	int ret;
				5528
				5529	if (gpa == UNMAPPED_GVA)
				5530	return X86EMUL_PROPAGATE_FAULT;
				5531	ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
				5532	if (ret < 0) {
				5533	r = X86EMUL_IO_NEEDED;
				5534	goto out;
				5535	}
				5536
				5537	bytes -= towrite;
				5538	data += towrite;
				5539	addr += towrite;
				5540	}
				5541	out:
				5542	return r;
				5543	}
				5544
				5545	static int emulator_write_std(struct x86_emulate_ctxt ctxt, gva_t addr, void val,
				5546	unsigned int bytes, struct x86_exception *exception,
				5547	bool system)
				5548	{
				5549	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5550	u32 access = PFERR_WRITE_MASK;
				5551
				5552	if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
				5553	access \|= PFERR_USER_MASK;
				5554
				5555	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
				5556	access, exception);
				5557	}
				5558
				5559	int kvm_write_guest_virt_system(struct kvm_vcpu vcpu, gva_t addr, void val,
				5560	unsigned int bytes, struct x86_exception *exception)
				5561	{
				5562	/* kvm_write_guest_virt_system can pull in tons of pages. */
				5563	vcpu->arch.l1tf_flush_l1d = true;
				5564
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5565	/*
				5566	* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
				5567	* is returned, but our callers are not ready for that and they blindly
				5568	* call kvm_inject_page_fault. Ensure that they at least do not leak
				5569	* uninitialized kernel stack memory into cr2 and error code.
				5570	*/
				5571	memset(exception, 0, sizeof(*exception));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5572	return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
				5573	PFERR_WRITE_MASK, exception);
				5574	}
				5575	EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system);
				5576
/*
 * Emulate the current instruction with EMULTYPE_TRAP_UD.
 *
 * When force_emulation_prefix is set and the five bytes at the current
 * linear RIP can be read and equal the signature "\xf\xbkvm", RIP is first
 * advanced past the signature and EMULTYPE_TRAP_UD_FORCED is used instead.
 *
 * Returns the result of kvm_emulate_instruction().
 */
int handle_ud(struct kvm_vcpu *vcpu)
{
	int emul_type = EMULTYPE_TRAP_UD;
	char sig[5]; /* ud2; .ascii "kvm" */
	struct x86_exception e;

	/*
	 * The checks short-circuit left to right: guest memory is only read
	 * when the prefix is enabled, and compared only if the read succeeded.
	 */
	if (force_emulation_prefix &&
	    kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
				sig, sizeof(sig), &e) == 0 &&
	    memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
		emul_type = EMULTYPE_TRAP_UD_FORCED;
	}

	return kvm_emulate_instruction(vcpu, emul_type);
}
EXPORT_SYMBOL_GPL(handle_ud);
				5594
				5595	static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
				5596	gpa_t gpa, bool write)
				5597	{
				5598	/* For APIC access vmexit */
				5599	if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
				5600	return 1;
				5601
				5602	if (vcpu_match_mmio_gpa(vcpu, gpa)) {
				5603	trace_vcpu_match_mmio(gva, gpa, write, true);
				5604	return 1;
				5605	}
				5606
				5607	return 0;
				5608	}
				5609
				5610	static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
				5611	gpa_t gpa, struct x86_exception exception,
				5612	bool write)
				5613	{
				5614	u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
				5615	\| (write ? PFERR_WRITE_MASK : 0);
				5616
				5617	/*
				5618	* currently PKRU is only applied to ept enabled guest so
				5619	* there is no pkey in EPT page table for L1 guest or EPT
				5620	* shadow page table for L2 guest.
				5621	*/
				5622	if (vcpu_match_mmio_gva(vcpu, gva)
				5623	&& !permission_fault(vcpu, vcpu->arch.walk_mmu,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5624	vcpu->arch.mmio_access, 0, access)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5625	*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT \|
				5626	(gva & (PAGE_SIZE - 1));
				5627	trace_vcpu_match_mmio(gva, *gpa, write, false);
				5628	return 1;
				5629	}
				5630
				5631	*gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
				5632
				5633	if (*gpa == UNMAPPED_GVA)
				5634	return -1;
				5635
				5636	return vcpu_is_mmio_gpa(vcpu, gva, *gpa, write);
				5637	}
				5638
				5639	int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
				5640	const void *val, int bytes)
				5641	{
				5642	int ret;
				5643
				5644	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
				5645	if (ret < 0)
				5646	return 0;
				5647	kvm_page_track_write(vcpu, gpa, val, bytes);
				5648	return 1;
				5649	}
				5650
				5651	struct read_write_emulator_ops {
				5652	int (read_write_prepare)(struct kvm_vcpu vcpu, void *val,
				5653	int bytes);
				5654	int (read_write_emulate)(struct kvm_vcpu vcpu, gpa_t gpa,
				5655	void *val, int bytes);
				5656	int (read_write_mmio)(struct kvm_vcpu vcpu, gpa_t gpa,
				5657	int bytes, void *val);
				5658	int (read_write_exit_mmio)(struct kvm_vcpu vcpu, gpa_t gpa,
				5659	void *val, int bytes);
				5660	bool write;
				5661	};
				5662
				5663	static int read_prepare(struct kvm_vcpu vcpu, void val, int bytes)
				5664	{
				5665	if (vcpu->mmio_read_completed) {
				5666	trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
				5667	vcpu->mmio_fragments[0].gpa, val);
				5668	vcpu->mmio_read_completed = 0;
				5669	return 1;
				5670	}
				5671
				5672	return 0;
				5673	}
				5674
/*
 * Read @bytes bytes of guest memory at @gpa into @val.
 * Returns 1 when kvm_vcpu_read_guest() returns 0, and 0 otherwise.
 */
static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
			void *val, int bytes)
{
	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
}
				5680
/* Write @bytes bytes from @val to @gpa; returns emulator_write_phys()'s result. */
static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa,
			 void *val, int bytes)
{
	return emulator_write_phys(vcpu, gpa, val, bytes);
}
				5686
				5687	static int write_mmio(struct kvm_vcpu vcpu, gpa_t gpa, int bytes, void val)
				5688	{
				5689	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
				5690	return vcpu_mmio_write(vcpu, gpa, bytes, val);
				5691	}
				5692
/*
 * Trace an unsatisfied MMIO read (with no data) and return
 * X86EMUL_IO_NEEDED.  @vcpu and @val are unused.
 */
static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
			  void *val, int bytes)
{
	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
	return X86EMUL_IO_NEEDED;
}
				5699
/*
 * Copy up to 8 bytes of the first MMIO fragment's data into
 * vcpu->run->mmio.data and return X86EMUL_CONTINUE.  @gpa, @val and @bytes
 * are unused; the fragment supplies the data and length.
 */
static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa,
			   void *val, int bytes)
{
	struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0];

	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
	return X86EMUL_CONTINUE;
}
				5708
/* Callback table for emulated reads; .write is left at its zero default. */
static const struct read_write_emulator_ops read_emultor = {
	.read_write_prepare = read_prepare,
	.read_write_emulate = read_emulate,
	.read_write_mmio = vcpu_mmio_read,
	.read_write_exit_mmio = read_exit_mmio,
};
				5715
/* Callback table for emulated writes; no prepare hook is installed. */
static const struct read_write_emulator_ops write_emultor = {
	.read_write_emulate = write_emulate,
	.read_write_mmio = write_mmio,
	.read_write_exit_mmio = write_exit_mmio,
	.write = true,
};
				5722
/*
 * Perform one read or write (selected by @ops) of @bytes bytes at
 * guest-virtual address @addr.
 *
 * The access is tried, in order, as ordinary guest memory (only when the
 * address is not classified as MMIO), then as in-kernel MMIO.  Whatever
 * remains unhandled after that is queued as an entry in
 * vcpu->mmio_fragments.
 *
 * Returns X86EMUL_PROPAGATE_FAULT when the address translation fails and
 * X86EMUL_CONTINUE in every other case, including when a fragment was
 * queued.
 */
static int emulator_read_write_onepage(unsigned long addr, void *val,
				       unsigned int bytes,
				       struct x86_exception *exception,
				       struct kvm_vcpu *vcpu,
				       const struct read_write_emulator_ops *ops)
{
	gpa_t gpa;
	int handled, ret;
	bool write = ops->write;
	struct kvm_mmio_fragment *frag;
	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;

	/*
	 * If the exit was due to a NPF we may already have a GPA.
	 * If the GPA is present, use it to avoid the GVA to GPA table walk.
	 * Note, this cannot be used on string operations since string
	 * operation using rep will only have the initial GPA from the NPF
	 * occurred.
	 */
	if (vcpu->arch.gpa_available &&
	    emulator_can_use_gpa(ctxt) &&
	    (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
		gpa = vcpu->arch.gpa_val;
		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
	} else {
		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
		if (ret < 0)
			return X86EMUL_PROPAGATE_FAULT;
	}

	/* ret == 0 means "not MMIO": try the plain guest-memory access first. */
	if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes))
		return X86EMUL_CONTINUE;

	/*
	 * Is this MMIO handled locally?
	 */
	handled = ops->read_write_mmio(vcpu, gpa, bytes, val);
	if (handled == bytes)
		return X86EMUL_CONTINUE;

	/* Skip the part that was handled in-kernel; queue only the remainder. */
	gpa += handled;
	bytes -= handled;
	val += handled;

	WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS);
	frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++];
	frag->gpa = gpa;
	frag->data = val;
	frag->len = bytes;
	return X86EMUL_CONTINUE;
}
				5774
/*
 * Emulate a read or write (selected by @ops) of @bytes bytes at
 * guest-virtual address @addr, splitting it in two when it crosses a page
 * boundary.
 *
 * If any part was queued as an MMIO fragment, vcpu->run is filled in for a
 * KVM_EXIT_MMIO exit describing the first fragment and the result of
 * ops->read_write_exit_mmio() is returned.  Otherwise the X86EMUL_* code of
 * the per-page helper is returned.
 */
static int emulator_read_write(struct x86_emulate_ctxt *ctxt,
			unsigned long addr,
			void *val, unsigned int bytes,
			struct x86_exception *exception,
			const struct read_write_emulator_ops *ops)
{
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
	gpa_t gpa;
	int rc;

	/* The prepare hook is optional; a non-zero result ends the access here. */
	if (ops->read_write_prepare &&
		  ops->read_write_prepare(vcpu, val, bytes))
		return X86EMUL_CONTINUE;

	vcpu->mmio_nr_fragments = 0;

	/* Crossing a page boundary? */
	if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
		int now;

		/* Number of bytes from addr up to the end of its page. */
		now = -addr & ~PAGE_MASK;
		rc = emulator_read_write_onepage(addr, val, now, exception,
						 vcpu, ops);

		if (rc != X86EMUL_CONTINUE)
			return rc;
		addr += now;
		/* Outside 64-bit mode the address is truncated to 32 bits. */
		if (ctxt->mode != X86EMUL_MODE_PROT64)
			addr = (u32)addr;
		val += now;
		bytes -= now;
	}

	rc = emulator_read_write_onepage(addr, val, bytes, exception,
					 vcpu, ops);
	if (rc != X86EMUL_CONTINUE)
		return rc;

	/* Nothing was queued, so no MMIO exit needs to be set up. */
	if (!vcpu->mmio_nr_fragments)
		return rc;

	gpa = vcpu->mmio_fragments[0].gpa;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	/* At most 8 bytes of the first fragment are described per exit. */
	vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len);
	vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;
	vcpu->run->mmio.phys_addr = gpa;

	return ops->read_write_exit_mmio(vcpu, gpa, val, bytes);
}
				5828
				5829	static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
				5830	unsigned long addr,
				5831	void *val,
				5832	unsigned int bytes,
				5833	struct x86_exception *exception)
				5834	{
				5835	return emulator_read_write(ctxt, addr, val, bytes,
				5836	exception, &read_emultor);
				5837	}
				5838
				5839	static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
				5840	unsigned long addr,
				5841	const void *val,
				5842	unsigned int bytes,
				5843	struct x86_exception *exception)
				5844	{
				5845	return emulator_read_write(ctxt, addr, (void *)val, bytes,
				5846	exception, &write_emultor);
				5847	}
				5848
				5849	#define CMPXCHG_TYPE(t, ptr, old, new) \
				5850	(cmpxchg((t )(ptr), (t )(old), (t )(new)) == (t *)(old))
				5851
				5852	#ifdef CONFIG_X86_64
				5853	# define CMPXCHG64(ptr, old, new) CMPXCHG_TYPE(u64, ptr, old, new)
				5854	#else
				5855	# define CMPXCHG64(ptr, old, new) \
				5856	(cmpxchg64((u64 )(ptr), (u64 )(old), (u64 )(new)) == (u64 *)(old))
				5857	#endif
				5858
				5859	static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
				5860	unsigned long addr,
				5861	const void *old,
				5862	const void *new,
				5863	unsigned int bytes,
				5864	struct x86_exception *exception)
				5865	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5866	struct kvm_host_map map;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5867	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5868	gpa_t gpa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5869	char *kaddr;
				5870	bool exchanged;
				5871
				5872	/* guests cmpxchg8b have to be emulated atomically */
				5873	if (bytes > 8 \|\| (bytes & (bytes - 1)))
				5874	goto emul_write;
				5875
				5876	gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
				5877
				5878	if (gpa == UNMAPPED_GVA \|\|
				5879	(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
				5880	goto emul_write;
				5881
				5882	if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
				5883	goto emul_write;
				5884
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5885	if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5886	goto emul_write;
				5887
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5888	kaddr = map.hva + offset_in_page(gpa);
				5889
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5890	switch (bytes) {
				5891	case 1:
				5892	exchanged = CMPXCHG_TYPE(u8, kaddr, old, new);
				5893	break;
				5894	case 2:
				5895	exchanged = CMPXCHG_TYPE(u16, kaddr, old, new);
				5896	break;
				5897	case 4:
				5898	exchanged = CMPXCHG_TYPE(u32, kaddr, old, new);
				5899	break;
				5900	case 8:
				5901	exchanged = CMPXCHG64(kaddr, old, new);
				5902	break;
				5903	default:
				5904	BUG();
				5905	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	5906
				5907	kvm_vcpu_unmap(vcpu, &map, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5908
				5909	if (!exchanged)
				5910	return X86EMUL_CMPXCHG_FAILED;
				5911
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	5912	kvm_page_track_write(vcpu, gpa, new, bytes);
				5913
				5914	return X86EMUL_CONTINUE;
				5915
				5916	emul_write:
				5917	printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
				5918
				5919	return emulator_write_emulated(ctxt, addr, new, bytes, exception);
				5920	}
				5921
				5922	static int kernel_pio(struct kvm_vcpu vcpu, void pd)
				5923	{
				5924	int r = 0, i;
				5925
				5926	for (i = 0; i < vcpu->arch.pio.count; i++) {
				5927	if (vcpu->arch.pio.in)
				5928	r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, vcpu->arch.pio.port,
				5929	vcpu->arch.pio.size, pd);
				5930	else
				5931	r = kvm_io_bus_write(vcpu, KVM_PIO_BUS,
				5932	vcpu->arch.pio.port, vcpu->arch.pio.size,
				5933	pd);
				5934	if (r)
				5935	break;
				5936	pd += vcpu->arch.pio.size;
				5937	}
				5938	return r;
				5939	}
				5940
				5941	static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size,
				5942	unsigned short port, void *val,
				5943	unsigned int count, bool in)
				5944	{
				5945	vcpu->arch.pio.port = port;
				5946	vcpu->arch.pio.in = in;
				5947	vcpu->arch.pio.count = count;
				5948	vcpu->arch.pio.size = size;
				5949
				5950	if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
				5951	vcpu->arch.pio.count = 0;
				5952	return 1;
				5953	}
				5954
				5955	vcpu->run->exit_reason = KVM_EXIT_IO;
				5956	vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
				5957	vcpu->run->io.size = size;
				5958	vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE;
				5959	vcpu->run->io.count = count;
				5960	vcpu->run->io.port = port;
				5961
				5962	return 0;
				5963	}
				5964
				5965	static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
				5966	int size, unsigned short port, void *val,
				5967	unsigned int count)
				5968	{
				5969	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5970	int ret;
				5971
				5972	if (vcpu->arch.pio.count)
				5973	goto data_avail;
				5974
				5975	memset(vcpu->arch.pio_data, 0, size * count);
				5976
				5977	ret = emulator_pio_in_out(vcpu, size, port, val, count, true);
				5978	if (ret) {
				5979	data_avail:
				5980	memcpy(val, vcpu->arch.pio_data, size * count);
				5981	trace_kvm_pio(KVM_PIO_IN, port, size, count, vcpu->arch.pio_data);
				5982	vcpu->arch.pio.count = 0;
				5983	return 1;
				5984	}
				5985
				5986	return 0;
				5987	}
				5988
				5989	static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
				5990	int size, unsigned short port,
				5991	const void *val, unsigned int count)
				5992	{
				5993	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				5994
				5995	memcpy(vcpu->arch.pio_data, val, size * count);
				5996	trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
				5997	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
				5998	}
				5999
				6000	static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
				6001	{
				6002	return kvm_x86_ops->get_segment_base(vcpu, seg);
				6003	}
				6004
				6005	static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
				6006	{
				6007	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
				6008	}
				6009
				6010	static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
				6011	{
				6012	if (!need_emulate_wbinvd(vcpu))
				6013	return X86EMUL_CONTINUE;
				6014
				6015	if (kvm_x86_ops->has_wbinvd_exit()) {
				6016	int cpu = get_cpu();
				6017
				6018	cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
				6019	smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
				6020	wbinvd_ipi, NULL, 1);
				6021	put_cpu();
				6022	cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
				6023	} else
				6024	wbinvd();
				6025	return X86EMUL_CONTINUE;
				6026	}
				6027
				6028	int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
				6029	{
				6030	kvm_emulate_wbinvd_noskip(vcpu);
				6031	return kvm_skip_emulated_instruction(vcpu);
				6032	}
				6033	EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
				6034
				6035
				6036
				6037	static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
				6038	{
				6039	kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
				6040	}
				6041
				6042	static int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr,
				6043	unsigned long *dest)
				6044	{
				6045	return kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
				6046	}
				6047
				6048	static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
				6049	unsigned long value)
				6050	{
				6051
				6052	return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
				6053	}
				6054
				6055	static u64 mk_cr_64(u64 curr_cr, u32 new_val)
				6056	{
				6057	return (curr_cr & ~((1ULL << 32) - 1)) \| new_val;
				6058	}
				6059
				6060	static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
				6061	{
				6062	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6063	unsigned long value;
				6064
				6065	switch (cr) {
				6066	case 0:
				6067	value = kvm_read_cr0(vcpu);
				6068	break;
				6069	case 2:
				6070	value = vcpu->arch.cr2;
				6071	break;
				6072	case 3:
				6073	value = kvm_read_cr3(vcpu);
				6074	break;
				6075	case 4:
				6076	value = kvm_read_cr4(vcpu);
				6077	break;
				6078	case 8:
				6079	value = kvm_get_cr8(vcpu);
				6080	break;
				6081	default:
				6082	kvm_err("%s: unexpected cr %u\n", __func__, cr);
				6083	return 0;
				6084	}
				6085
				6086	return value;
				6087	}
				6088
				6089	static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
				6090	{
				6091	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6092	int res = 0;
				6093
				6094	switch (cr) {
				6095	case 0:
				6096	res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
				6097	break;
				6098	case 2:
				6099	vcpu->arch.cr2 = val;
				6100	break;
				6101	case 3:
				6102	res = kvm_set_cr3(vcpu, val);
				6103	break;
				6104	case 4:
				6105	res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
				6106	break;
				6107	case 8:
				6108	res = kvm_set_cr8(vcpu, val);
				6109	break;
				6110	default:
				6111	kvm_err("%s: unexpected cr %u\n", __func__, cr);
				6112	res = -1;
				6113	}
				6114
				6115	return res;
				6116	}
				6117
				6118	static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
				6119	{
				6120	return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
				6121	}
				6122
				6123	static void emulator_get_gdt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				6124	{
				6125	kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
				6126	}
				6127
				6128	static void emulator_get_idt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				6129	{
				6130	kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
				6131	}
				6132
				6133	static void emulator_set_gdt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				6134	{
				6135	kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
				6136	}
				6137
				6138	static void emulator_set_idt(struct x86_emulate_ctxt ctxt, struct desc_ptr dt)
				6139	{
				6140	kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
				6141	}
				6142
				6143	static unsigned long emulator_get_cached_segment_base(
				6144	struct x86_emulate_ctxt *ctxt, int seg)
				6145	{
				6146	return get_segment_base(emul_to_vcpu(ctxt), seg);
				6147	}
				6148
				6149	static bool emulator_get_segment(struct x86_emulate_ctxt ctxt, u16 selector,
				6150	struct desc_struct desc, u32 base3,
				6151	int seg)
				6152	{
				6153	struct kvm_segment var;
				6154
				6155	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
				6156	*selector = var.selector;
				6157
				6158	if (var.unusable) {
				6159	memset(desc, 0, sizeof(*desc));
				6160	if (base3)
				6161	*base3 = 0;
				6162	return false;
				6163	}
				6164
				6165	if (var.g)
				6166	var.limit >>= 12;
				6167	set_desc_limit(desc, var.limit);
				6168	set_desc_base(desc, (unsigned long)var.base);
				6169	#ifdef CONFIG_X86_64
				6170	if (base3)
				6171	*base3 = var.base >> 32;
				6172	#endif
				6173	desc->type = var.type;
				6174	desc->s = var.s;
				6175	desc->dpl = var.dpl;
				6176	desc->p = var.present;
				6177	desc->avl = var.avl;
				6178	desc->l = var.l;
				6179	desc->d = var.db;
				6180	desc->g = var.g;
				6181
				6182	return true;
				6183	}
				6184
				6185	static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
				6186	struct desc_struct *desc, u32 base3,
				6187	int seg)
				6188	{
				6189	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6190	struct kvm_segment var;
				6191
				6192	var.selector = selector;
				6193	var.base = get_desc_base(desc);
				6194	#ifdef CONFIG_X86_64
				6195	var.base \|= ((u64)base3) << 32;
				6196	#endif
				6197	var.limit = get_desc_limit(desc);
				6198	if (desc->g)
				6199	var.limit = (var.limit << 12) \| 0xfff;
				6200	var.type = desc->type;
				6201	var.dpl = desc->dpl;
				6202	var.db = desc->d;
				6203	var.s = desc->s;
				6204	var.l = desc->l;
				6205	var.g = desc->g;
				6206	var.avl = desc->avl;
				6207	var.present = desc->p;
				6208	var.unusable = !var.present;
				6209	var.padding = 0;
				6210
				6211	kvm_set_segment(vcpu, &var, seg);
				6212	return;
				6213	}
				6214
				6215	static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
				6216	u32 msr_index, u64 *pdata)
				6217	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6218	return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6219	}
				6220
				6221	static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
				6222	u32 msr_index, u64 data)
				6223	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6224	return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6225	}
				6226
				6227	static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
				6228	{
				6229	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6230
				6231	return vcpu->arch.smbase;
				6232	}
				6233
				6234	static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase)
				6235	{
				6236	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6237
				6238	vcpu->arch.smbase = smbase;
				6239	}
				6240
				6241	static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
				6242	u32 pmc)
				6243	{
				6244	return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
				6245	}
				6246
				6247	static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
				6248	u32 pmc, u64 *pdata)
				6249	{
				6250	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
				6251	}
				6252
				6253	static void emulator_halt(struct x86_emulate_ctxt *ctxt)
				6254	{
				6255	emul_to_vcpu(ctxt)->arch.halt_request = 1;
				6256	}
				6257
				6258	static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
				6259	struct x86_instruction_info *info,
				6260	enum x86_intercept_stage stage)
				6261	{
				6262	return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
				6263	}
				6264
				6265	static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
				6266	u32 eax, u32 ebx, u32 ecx, u32 edx, bool check_limit)
				6267	{
				6268	return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
				6269	}
				6270
				6271	static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
				6272	{
				6273	return kvm_register_read(emul_to_vcpu(ctxt), reg);
				6274	}
				6275
				6276	static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val)
				6277	{
				6278	kvm_register_write(emul_to_vcpu(ctxt), reg, val);
				6279	}
				6280
				6281	static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
				6282	{
				6283	kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
				6284	}
				6285
				6286	static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
				6287	{
				6288	return emul_to_vcpu(ctxt)->arch.hflags;
				6289	}
				6290
				6291	static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
				6292	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6293	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				6294
				6295	vcpu->arch.hflags = emul_flags;
				6296	kvm_mmu_reset_context(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6297	}
				6298
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6299	static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
				6300	const char *smstate)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6301	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6302	return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate);
				6303	}
				6304
				6305	static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
				6306	{
				6307	kvm_smm_changed(emul_to_vcpu(ctxt));
				6308	}
				6309
				6310	static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr)
				6311	{
				6312	return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6313	}
				6314
				6315	static const struct x86_emulate_ops emulate_ops = {
				6316	.read_gpr = emulator_read_gpr,
				6317	.write_gpr = emulator_write_gpr,
				6318	.read_std = emulator_read_std,
				6319	.write_std = emulator_write_std,
				6320	.read_phys = kvm_read_guest_phys_system,
				6321	.fetch = kvm_fetch_guest_virt,
				6322	.read_emulated = emulator_read_emulated,
				6323	.write_emulated = emulator_write_emulated,
				6324	.cmpxchg_emulated = emulator_cmpxchg_emulated,
				6325	.invlpg = emulator_invlpg,
				6326	.pio_in_emulated = emulator_pio_in_emulated,
				6327	.pio_out_emulated = emulator_pio_out_emulated,
				6328	.get_segment = emulator_get_segment,
				6329	.set_segment = emulator_set_segment,
				6330	.get_cached_segment_base = emulator_get_cached_segment_base,
				6331	.get_gdt = emulator_get_gdt,
				6332	.get_idt = emulator_get_idt,
				6333	.set_gdt = emulator_set_gdt,
				6334	.set_idt = emulator_set_idt,
				6335	.get_cr = emulator_get_cr,
				6336	.set_cr = emulator_set_cr,
				6337	.cpl = emulator_get_cpl,
				6338	.get_dr = emulator_get_dr,
				6339	.set_dr = emulator_set_dr,
				6340	.get_smbase = emulator_get_smbase,
				6341	.set_smbase = emulator_set_smbase,
				6342	.set_msr = emulator_set_msr,
				6343	.get_msr = emulator_get_msr,
				6344	.check_pmc = emulator_check_pmc,
				6345	.read_pmc = emulator_read_pmc,
				6346	.halt = emulator_halt,
				6347	.wbinvd = emulator_wbinvd,
				6348	.fix_hypercall = emulator_fix_hypercall,
				6349	.intercept = emulator_intercept,
				6350	.get_cpuid = emulator_get_cpuid,
				6351	.set_nmi_mask = emulator_set_nmi_mask,
				6352	.get_hflags = emulator_get_hflags,
				6353	.set_hflags = emulator_set_hflags,
				6354	.pre_leave_smm = emulator_pre_leave_smm,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6355	.post_leave_smm = emulator_post_leave_smm,
				6356	.set_xcr = emulator_set_xcr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6357	};
				6358
				6359	static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
				6360	{
				6361	u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
				6362	/*
				6363	* an sti; sti; sequence only disable interrupts for the first
				6364	* instruction. So, if the last instruction, be it emulated or
				6365	* not, left the system with the INT_STI flag enabled, it
				6366	* means that the last instruction is an sti. We should not
				6367	* leave the flag on in this case. The same goes for mov ss
				6368	*/
				6369	if (int_shadow & mask)
				6370	mask = 0;
				6371	if (unlikely(int_shadow \|\| mask)) {
				6372	kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
				6373	if (!mask)
				6374	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6375	}
				6376	}
				6377
				6378	static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
				6379	{
				6380	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				6381	if (ctxt->exception.vector == PF_VECTOR)
				6382	return kvm_propagate_fault(vcpu, &ctxt->exception);
				6383
				6384	if (ctxt->exception.error_code_valid)
				6385	kvm_queue_exception_e(vcpu, ctxt->exception.vector,
				6386	ctxt->exception.error_code);
				6387	else
				6388	kvm_queue_exception(vcpu, ctxt->exception.vector);
				6389	return false;
				6390	}
				6391
				6392	static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
				6393	{
				6394	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				6395	int cs_db, cs_l;
				6396
				6397	kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
				6398
				6399	ctxt->eflags = kvm_get_rflags(vcpu);
				6400	ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
				6401
				6402	ctxt->eip = kvm_rip_read(vcpu);
				6403	ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
				6404	(ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 :
				6405	(cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 :
				6406	cs_db ? X86EMUL_MODE_PROT32 :
				6407	X86EMUL_MODE_PROT16;
				6408	BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK);
				6409	BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
				6410	BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
				6411
				6412	init_decode_cache(ctxt);
				6413	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
				6414	}
				6415
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6416	void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6417	{
				6418	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				6419	int ret;
				6420
				6421	init_emulate_ctxt(vcpu);
				6422
				6423	ctxt->op_bytes = 2;
				6424	ctxt->ad_bytes = 2;
				6425	ctxt->_eip = ctxt->eip + inc_eip;
				6426	ret = emulate_int_real(ctxt, irq);
				6427
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6428	if (ret != X86EMUL_CONTINUE) {
				6429	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				6430	} else {
				6431	ctxt->eip = ctxt->_eip;
				6432	kvm_rip_write(vcpu, ctxt->eip);
				6433	kvm_set_rflags(vcpu, ctxt->eflags);
				6434	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6435	}
				6436	EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
				6437
				6438	static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type)
				6439	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6440	++vcpu->stat.insn_emulation_fail;
				6441	trace_kvm_emulate_insn_failed(vcpu);
				6442
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6443	if (emulation_type & EMULTYPE_VMWARE_GP) {
				6444	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
				6445	return 1;
				6446	}
				6447
				6448	if (emulation_type & EMULTYPE_SKIP) {
				6449	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
				6450	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
				6451	vcpu->run->internal.ndata = 0;
				6452	return 0;
				6453	}
				6454
				6455	kvm_queue_exception(vcpu, UD_VECTOR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6456
				6457	if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
				6458	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
				6459	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
				6460	vcpu->run->internal.ndata = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6461	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6462	}
				6463
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6464	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6465	}
				6466
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6467	static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6468	bool write_fault_to_shadow_pgtable,
				6469	int emulation_type)
				6470	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6471	gpa_t gpa = cr2_or_gpa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6472	kvm_pfn_t pfn;
				6473
				6474	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
				6475	return false;
				6476
				6477	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
				6478	return false;
				6479
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6480	if (!vcpu->arch.mmu->direct_map) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6481	/*
				6482	* Write permission should be allowed since only
				6483	* write access need to be emulated.
				6484	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6485	gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6486
				6487	/*
				6488	* If the mapping is invalid in guest, let cpu retry
				6489	* it to generate fault.
				6490	*/
				6491	if (gpa == UNMAPPED_GVA)
				6492	return true;
				6493	}
				6494
				6495	/*
				6496	* Do not retry the unhandleable instruction if it faults on the
				6497	* readonly host memory, otherwise it will goto a infinite loop:
				6498	* retry instruction -> write #PF -> emulation fail -> retry
				6499	* instruction -> ...
				6500	*/
				6501	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
				6502
				6503	/*
				6504	* If the instruction failed on the error pfn, it can not be fixed,
				6505	* report the error to userspace.
				6506	*/
				6507	if (is_error_noslot_pfn(pfn))
				6508	return false;
				6509
				6510	kvm_release_pfn_clean(pfn);
				6511
				6512	/* The instructions are well-emulated on direct mmu. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6513	if (vcpu->arch.mmu->direct_map) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6514	unsigned int indirect_shadow_pages;
				6515
				6516	spin_lock(&vcpu->kvm->mmu_lock);
				6517	indirect_shadow_pages = vcpu->kvm->arch.indirect_shadow_pages;
				6518	spin_unlock(&vcpu->kvm->mmu_lock);
				6519
				6520	if (indirect_shadow_pages)
				6521	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				6522
				6523	return true;
				6524	}
				6525
				6526	/*
				6527	* if emulation was due to access to shadowed page table
				6528	* and it failed try to unshadow page and re-enter the
				6529	* guest to let CPU execute the instruction.
				6530	*/
				6531	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				6532
				6533	/*
				6534	* If the access faults on its page table, it can not
				6535	* be fixed by unprotecting shadow page and it should
				6536	* be reported to userspace.
				6537	*/
				6538	return !write_fault_to_shadow_pgtable;
				6539	}
				6540
				6541	static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6542	gpa_t cr2_or_gpa, int emulation_type)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6543	{
				6544	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6545	unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6546
				6547	last_retry_eip = vcpu->arch.last_retry_eip;
				6548	last_retry_addr = vcpu->arch.last_retry_addr;
				6549
				6550	/*
				6551	* If the emulation is caused by #PF and it is non-page_table
				6552	* writing instruction, it means the VM-EXIT is caused by shadow
				6553	* page protected, we can zap the shadow page and retry this
				6554	* instruction directly.
				6555	*
				6556	* Note: if the guest uses a non-page-table modifying instruction
				6557	* on the PDE that points to the instruction, then we will unmap
				6558	* the instruction and go to an infinite loop. So, we cache the
				6559	* last retried eip and the last fault address, if we meet the eip
				6560	* and the address again, we can break out of the potential infinite
				6561	* loop.
				6562	*/
				6563	vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
				6564
				6565	if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
				6566	return false;
				6567
				6568	if (WARN_ON_ONCE(is_guest_mode(vcpu)))
				6569	return false;
				6570
				6571	if (x86_page_table_writing_insn(ctxt))
				6572	return false;
				6573
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6574	if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6575	return false;
				6576
				6577	vcpu->arch.last_retry_eip = ctxt->eip;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6578	vcpu->arch.last_retry_addr = cr2_or_gpa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6579
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6580	if (!vcpu->arch.mmu->direct_map)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6581	gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6582
				6583	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
				6584
				6585	return true;
				6586	}
				6587
				6588	static int complete_emulated_mmio(struct kvm_vcpu *vcpu);
				6589	static int complete_emulated_pio(struct kvm_vcpu *vcpu);
				6590
				6591	static void kvm_smm_changed(struct kvm_vcpu *vcpu)
				6592	{
				6593	if (!(vcpu->arch.hflags & HF_SMM_MASK)) {
				6594	/* This is a good place to trace that we are exiting SMM. */
				6595	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
				6596
				6597	/* Process a latched INIT or SMI, if any. */
				6598	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6599	}
				6600
				6601	kvm_mmu_reset_context(vcpu);
				6602	}
				6603
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6604	static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7,
				6605	unsigned long *db)
				6606	{
				6607	u32 dr6 = 0;
				6608	int i;
				6609	u32 enable, rwlen;
				6610
				6611	enable = dr7;
				6612	rwlen = dr7 >> 16;
				6613	for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4)
				6614	if ((enable & 3) && (rwlen & 15) == type && db[i] == addr)
				6615	dr6 \|= (1 << i);
				6616	return dr6;
				6617	}
				6618
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6619	static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6620	{
				6621	struct kvm_run *kvm_run = vcpu->run;
				6622
				6623	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
				6624	kvm_run->debug.arch.dr6 = DR6_BS \| DR6_FIXED_1 \| DR6_RTM;
				6625	kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
				6626	kvm_run->debug.arch.exception = DB_VECTOR;
				6627	kvm_run->exit_reason = KVM_EXIT_DEBUG;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6628	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6629	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6630	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS);
				6631	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6632	}
				6633
				6634	int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
				6635	{
				6636	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6637	int r;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6638
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6639	r = kvm_x86_ops->skip_emulated_instruction(vcpu);
				6640	if (unlikely(!r))
				6641	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6642
				6643	/*
				6644	* rflags is the old, "raw" value of the flags. The new value has
				6645	* not been saved yet.
				6646	*
				6647	* This is correct even for TF set by the guest, because "the
				6648	* processor will not generate this exception after the instruction
				6649	* that sets the TF flag".
				6650	*/
				6651	if (unlikely(rflags & X86_EFLAGS_TF))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6652	r = kvm_vcpu_do_singlestep(vcpu);
				6653	return r;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6654	}
				6655	EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
				6656
				6657	static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu vcpu, int r)
				6658	{
				6659	if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
				6660	(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
				6661	struct kvm_run *kvm_run = vcpu->run;
				6662	unsigned long eip = kvm_get_linear_rip(vcpu);
				6663	u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
				6664	vcpu->arch.guest_debug_dr7,
				6665	vcpu->arch.eff_db);
				6666
				6667	if (dr6 != 0) {
				6668	kvm_run->debug.arch.dr6 = dr6 \| DR6_FIXED_1 \| DR6_RTM;
				6669	kvm_run->debug.arch.pc = eip;
				6670	kvm_run->debug.arch.exception = DB_VECTOR;
				6671	kvm_run->exit_reason = KVM_EXIT_DEBUG;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6672	*r = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6673	return true;
				6674	}
				6675	}
				6676
				6677	if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) &&
				6678	!(kvm_get_rflags(vcpu) & X86_EFLAGS_RF)) {
				6679	unsigned long eip = kvm_get_linear_rip(vcpu);
				6680	u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
				6681	vcpu->arch.dr7,
				6682	vcpu->arch.db);
				6683
				6684	if (dr6 != 0) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6685	vcpu->arch.dr6 &= ~DR_TRAP_BITS;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6686	vcpu->arch.dr6 \|= dr6 \| DR6_RTM;
				6687	kvm_queue_exception(vcpu, DB_VECTOR);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6688	*r = 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6689	return true;
				6690	}
				6691	}
				6692
				6693	return false;
				6694	}
				6695
				6696	static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt)
				6697	{
				6698	switch (ctxt->opcode_len) {
				6699	case 1:
				6700	switch (ctxt->b) {
				6701	case 0xe4: /* IN */
				6702	case 0xe5:
				6703	case 0xec:
				6704	case 0xed:
				6705	case 0xe6: /* OUT */
				6706	case 0xe7:
				6707	case 0xee:
				6708	case 0xef:
				6709	case 0x6c: /* INS */
				6710	case 0x6d:
				6711	case 0x6e: /* OUTS */
				6712	case 0x6f:
				6713	return true;
				6714	}
				6715	break;
				6716	case 2:
				6717	switch (ctxt->b) {
				6718	case 0x33: /* RDPMC */
				6719	return true;
				6720	}
				6721	break;
				6722	}
				6723
				6724	return false;
				6725	}
				6726
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6727	int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
				6728	int emulation_type, void *insn, int insn_len)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6729	{
				6730	int r;
				6731	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				6732	bool writeback = true;
				6733	bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
				6734
				6735	vcpu->arch.l1tf_flush_l1d = true;
				6736
				6737	/*
				6738	* Clear write_fault_to_shadow_pgtable here to ensure it is
				6739	* never reused.
				6740	*/
				6741	vcpu->arch.write_fault_to_shadow_pgtable = false;
				6742	kvm_clear_exception_queue(vcpu);
				6743
				6744	if (!(emulation_type & EMULTYPE_NO_DECODE)) {
				6745	init_emulate_ctxt(vcpu);
				6746
				6747	/*
				6748	* We will reenter on the same instruction since
				6749	* we do not set complete_userspace_io. This does not
				6750	* handle watchpoints yet, those would be handled in
				6751	* the emulate_ops.
				6752	*/
				6753	if (!(emulation_type & EMULTYPE_SKIP) &&
				6754	kvm_vcpu_check_breakpoint(vcpu, &r))
				6755	return r;
				6756
				6757	ctxt->interruptibility = 0;
				6758	ctxt->have_exception = false;
				6759	ctxt->exception.vector = -1;
				6760	ctxt->perm_ok = false;
				6761
				6762	ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
				6763
				6764	r = x86_decode_insn(ctxt, insn, insn_len);
				6765
				6766	trace_kvm_emulate_insn_start(vcpu);
				6767	++vcpu->stat.insn_emulation;
				6768	if (r != EMULATION_OK) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6769	if ((emulation_type & EMULTYPE_TRAP_UD) \|\|
				6770	(emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
				6771	kvm_queue_exception(vcpu, UD_VECTOR);
				6772	return 1;
				6773	}
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6774	if (reexecute_instruction(vcpu, cr2_or_gpa,
				6775	write_fault_to_spt,
				6776	emulation_type))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6777	return 1;
				6778	if (ctxt->have_exception) {
				6779	/*
				6780	* #UD should result in just EMULATION_FAILED, and trap-like
				6781	* exception should not be encountered during decode.
				6782	*/
				6783	WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR \|\|
				6784	exception_type(ctxt->exception.vector) == EXCPT_TRAP);
				6785	inject_emulated_exception(vcpu);
				6786	return 1;
				6787	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6788	return handle_emulation_failure(vcpu, emulation_type);
				6789	}
				6790	}
				6791
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6792	if ((emulation_type & EMULTYPE_VMWARE_GP) &&
				6793	!is_vmware_backdoor_opcode(ctxt)) {
				6794	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
				6795	return 1;
				6796	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6797
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6798	/*
				6799	* Note, EMULTYPE_SKIP is intended for use only by vendor callbacks
				6800	* for kvm_skip_emulated_instruction(). The caller is responsible for
				6801	* updating interruptibility state and injecting single-step #DBs.
				6802	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6803	if (emulation_type & EMULTYPE_SKIP) {
				6804	kvm_rip_write(vcpu, ctxt->_eip);
				6805	if (ctxt->eflags & X86_EFLAGS_RF)
				6806	kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6807	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6808	}
				6809
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6810	if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6811	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6812
				6813	/* this is needed for vmware backdoor interface to work since it
				6814	changes registers values during IO operation */
				6815	if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
				6816	vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
				6817	emulator_invalidate_register_cache(ctxt);
				6818	}
				6819
				6820	restart:
				6821	/* Save the faulting GPA (cr2) in the address field */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6822	ctxt->exception.address = cr2_or_gpa;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6823
				6824	r = x86_emulate_insn(ctxt);
				6825
				6826	if (r == EMULATION_INTERCEPTED)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6827	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6828
				6829	if (r == EMULATION_FAILED) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6830	if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6831	emulation_type))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6832	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6833
				6834	return handle_emulation_failure(vcpu, emulation_type);
				6835	}
				6836
				6837	if (ctxt->have_exception) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6838	r = 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6839	if (inject_emulated_exception(vcpu))
				6840	return r;
				6841	} else if (vcpu->arch.pio.count) {
				6842	if (!vcpu->arch.pio.in) {
				6843	/* FIXME: return into emulator if single-stepping. */
				6844	vcpu->arch.pio.count = 0;
				6845	} else {
				6846	writeback = false;
				6847	vcpu->arch.complete_userspace_io = complete_emulated_pio;
				6848	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6849	r = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6850	} else if (vcpu->mmio_needed) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6851	++vcpu->stat.mmio_exits;
				6852
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6853	if (!vcpu->mmio_is_write)
				6854	writeback = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6855	r = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6856	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
				6857	} else if (r == EMULATION_RESTART)
				6858	goto restart;
				6859	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6860	r = 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6861
				6862	if (writeback) {
				6863	unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
				6864	toggle_interruptibility(vcpu, ctxt->interruptibility);
				6865	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6866	if (!ctxt->have_exception \|\|
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6867	exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
				6868	kvm_rip_write(vcpu, ctxt->eip);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	6869	if (r && (ctxt->tf \|\| (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6870	r = kvm_vcpu_do_singlestep(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6871	__kvm_set_rflags(vcpu, ctxt->eflags);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6872	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6873
				6874	/*
				6875	* For STI, interrupts are shadowed; so KVM_REQ_EVENT will
				6876	* do nothing, and it will be requested again as soon as
				6877	* the shadow expires. But we still need to check here,
				6878	* because POPF has no interrupt shadow.
				6879	*/
				6880	if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF))
				6881	kvm_make_request(KVM_REQ_EVENT, vcpu);
				6882	} else
				6883	vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
				6884
				6885	return r;
				6886	}
				6887
				6888	int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type)
				6889	{
				6890	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
				6891	}
				6892	EXPORT_SYMBOL_GPL(kvm_emulate_instruction);
				6893
				6894	int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu,
				6895	void *insn, int insn_len)
				6896	{
				6897	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
				6898	}
				6899	EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer);
				6900
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6901	static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu)
				6902	{
				6903	vcpu->arch.pio.count = 0;
				6904	return 1;
				6905	}
				6906
				6907	static int complete_fast_pio_out(struct kvm_vcpu *vcpu)
				6908	{
				6909	vcpu->arch.pio.count = 0;
				6910
				6911	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip)))
				6912	return 1;
				6913
				6914	return kvm_skip_emulated_instruction(vcpu);
				6915	}
				6916
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6917	static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size,
				6918	unsigned short port)
				6919	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6920	unsigned long val = kvm_rax_read(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6921	int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
				6922	size, port, &val, 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6923	if (ret)
				6924	return ret;
				6925
				6926	/*
				6927	* Workaround userspace that relies on old KVM behavior of %rip being
				6928	* incremented prior to exiting to userspace to handle "OUT 0x7e".
				6929	*/
				6930	if (port == 0x7e &&
				6931	kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
				6932	vcpu->arch.complete_userspace_io =
				6933	complete_fast_pio_out_port_0x7e;
				6934	kvm_skip_emulated_instruction(vcpu);
				6935	} else {
				6936	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
				6937	vcpu->arch.complete_userspace_io = complete_fast_pio_out;
				6938	}
				6939	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6940	}
				6941
				6942	static int complete_fast_pio_in(struct kvm_vcpu *vcpu)
				6943	{
				6944	unsigned long val;
				6945
				6946	/* We should only ever be called with arch.pio.count equal to 1 */
				6947	BUG_ON(vcpu->arch.pio.count != 1);
				6948
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6949	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) {
				6950	vcpu->arch.pio.count = 0;
				6951	return 1;
				6952	}
				6953
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6954	/* For size less than 4 we merge, else we zero extend */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6955	val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6956
				6957	/*
				6958	* Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
				6959	* the copy and tracing
				6960	*/
				6961	emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
				6962	vcpu->arch.pio.port, &val, 1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6963	kvm_rax_write(vcpu, val);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6964
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6965	return kvm_skip_emulated_instruction(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6966	}
				6967
				6968	static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size,
				6969	unsigned short port)
				6970	{
				6971	unsigned long val;
				6972	int ret;
				6973
				6974	/* For size less than 4 we merge, else we zero extend */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6975	val = (size < 4) ? kvm_rax_read(vcpu) : 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6976
				6977	ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
				6978	&val, 1);
				6979	if (ret) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6980	kvm_rax_write(vcpu, val);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6981	return ret;
				6982	}
				6983
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6984	vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6985	vcpu->arch.complete_userspace_io = complete_fast_pio_in;
				6986
				6987	return 0;
				6988	}
				6989
				6990	int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in)
				6991	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6992	int ret;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6993
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6994	if (in)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6995	ret = kvm_fast_pio_in(vcpu, size, port);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6996	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	6997	ret = kvm_fast_pio_out(vcpu, size, port);
				6998	return ret && kvm_skip_emulated_instruction(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6999	}
				7000	EXPORT_SYMBOL_GPL(kvm_fast_pio);
				7001
				7002	static int kvmclock_cpu_down_prep(unsigned int cpu)
				7003	{
				7004	__this_cpu_write(cpu_tsc_khz, 0);
				7005	return 0;
				7006	}
				7007
				7008	static void tsc_khz_changed(void *data)
				7009	{
				7010	struct cpufreq_freqs *freq = data;
				7011	unsigned long khz = 0;
				7012
				7013	if (data)
				7014	khz = freq->new;
				7015	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
				7016	khz = cpufreq_quick_get(raw_smp_processor_id());
				7017	if (!khz)
				7018	khz = tsc_khz;
				7019	__this_cpu_write(cpu_tsc_khz, khz);
				7020	}
				7021
				7022	#ifdef CONFIG_X86_64
				7023	static void kvm_hyperv_tsc_notifier(void)
				7024	{
				7025	struct kvm *kvm;
				7026	struct kvm_vcpu *vcpu;
				7027	int cpu;
				7028
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7029	mutex_lock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7030	list_for_each_entry(kvm, &vm_list, vm_list)
				7031	kvm_make_mclock_inprogress_request(kvm);
				7032
				7033	hyperv_stop_tsc_emulation();
				7034
				7035	/* TSC frequency always matches when on Hyper-V */
				7036	for_each_present_cpu(cpu)
				7037	per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
				7038	kvm_max_guest_tsc_khz = tsc_khz;
				7039
				7040	list_for_each_entry(kvm, &vm_list, vm_list) {
				7041	struct kvm_arch *ka = &kvm->arch;
				7042
				7043	spin_lock(&ka->pvclock_gtod_sync_lock);
				7044
				7045	pvclock_update_vm_gtod_copy(kvm);
				7046
				7047	kvm_for_each_vcpu(cpu, vcpu, kvm)
				7048	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				7049
				7050	kvm_for_each_vcpu(cpu, vcpu, kvm)
				7051	kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu);
				7052
				7053	spin_unlock(&ka->pvclock_gtod_sync_lock);
				7054	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7055	mutex_unlock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7056	}
				7057	#endif
				7058
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7059	static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7060	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7061	struct kvm *kvm;
				7062	struct kvm_vcpu *vcpu;
				7063	int i, send_ipi = 0;
				7064
				7065	/*
				7066	* We allow guests to temporarily run on slowing clocks,
				7067	* provided we notify them after, or to run on accelerating
				7068	* clocks, provided we notify them before. Thus time never
				7069	* goes backwards.
				7070	*
				7071	* However, we have a problem. We can't atomically update
				7072	* the frequency of a given CPU from this function; it is
				7073	* merely a notifier, which can be called from any CPU.
				7074	* Changing the TSC frequency at arbitrary points in time
				7075	* requires a recomputation of local variables related to
				7076	* the TSC for each VCPU. We must flag these local variables
				7077	* to be updated and be sure the update takes place with the
				7078	* new frequency before any guests proceed.
				7079	*
				7080	* Unfortunately, the combination of hotplug CPU and frequency
				7081	* change creates an intractable locking scenario; the order
				7082	* of when these callouts happen is undefined with respect to
				7083	* CPU hotplug, and they can race with each other. As such,
				7084	* merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
				7085	* undefined; you can actually have a CPU frequency change take
				7086	* place in between the computation of X and the setting of the
				7087	* variable. To protect against this problem, all updates of
				7088	* the per_cpu tsc_khz variable are done in an interrupt
				7089	* protected IPI, and all callers wishing to update the value
				7090	* must wait for a synchronous IPI to complete (which is trivial
				7091	* if the caller is on the CPU already). This establishes the
				7092	* necessary total order on variable updates.
				7093	*
				7094	* Note that because a guest time update may take place
				7095	* anytime after the setting of the VCPU's request bit, the
				7096	* correct TSC value must be set before the request. However,
				7097	* to ensure the update actually makes it to any guest which
				7098	* starts running in hardware virtualization between the set
				7099	* and the acquisition of the spinlock, we must also ping the
				7100	* CPU after setting the request bit.
				7101	*
				7102	*/
				7103
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7104	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7105
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7106	mutex_lock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7107	list_for_each_entry(kvm, &vm_list, vm_list) {
				7108	kvm_for_each_vcpu(i, vcpu, kvm) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7109	if (vcpu->cpu != cpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7110	continue;
				7111	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7112	if (vcpu->cpu != raw_smp_processor_id())
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7113	send_ipi = 1;
				7114	}
				7115	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7116	mutex_unlock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7117
				7118	if (freq->old < freq->new && send_ipi) {
				7119	/*
				7120	* We upscale the frequency. Must make the guest
				7121	* doesn't see old kvmclock values while running with
				7122	* the new frequency, otherwise we risk the guest sees
				7123	* time go backwards.
				7124	*
				7125	* In case we update the frequency for another cpu
				7126	* (which might be in guest context) send an interrupt
				7127	* to kick the cpu out of guest context. Next time
				7128	* guest context is entered kvmclock will be updated,
				7129	* so the guest will not see stale values.
				7130	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7131	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7132	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7133	}
				7134
				7135	static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
				7136	void *data)
				7137	{
				7138	struct cpufreq_freqs *freq = data;
				7139	int cpu;
				7140
				7141	if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
				7142	return 0;
				7143	if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
				7144	return 0;
				7145
				7146	for_each_cpu(cpu, freq->policy->cpus)
				7147	__kvmclock_cpufreq_notifier(freq, cpu);
				7148
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7149	return 0;
				7150	}
				7151
				7152	static struct notifier_block kvmclock_cpufreq_notifier_block = {
				7153	.notifier_call = kvmclock_cpufreq_notifier
				7154	};
				7155
				7156	static int kvmclock_cpu_online(unsigned int cpu)
				7157	{
				7158	tsc_khz_changed(NULL);
				7159	return 0;
				7160	}
				7161
				7162	static void kvm_timer_init(void)
				7163	{
				7164	max_tsc_khz = tsc_khz;
				7165
				7166	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
				7167	#ifdef CONFIG_CPU_FREQ
				7168	struct cpufreq_policy policy;
				7169	int cpu;
				7170
				7171	memset(&policy, 0, sizeof(policy));
				7172	cpu = get_cpu();
				7173	cpufreq_get_policy(&policy, cpu);
				7174	if (policy.cpuinfo.max_freq)
				7175	max_tsc_khz = policy.cpuinfo.max_freq;
				7176	put_cpu();
				7177	#endif
				7178	cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
				7179	CPUFREQ_TRANSITION_NOTIFIER);
				7180	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7181
				7182	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
				7183	kvmclock_cpu_online, kvmclock_cpu_down_prep);
				7184	}
				7185
				7186	DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
				7187	EXPORT_PER_CPU_SYMBOL_GPL(current_vcpu);
				7188
				7189	int kvm_is_in_guest(void)
				7190	{
				7191	return __this_cpu_read(current_vcpu) != NULL;
				7192	}
				7193
				7194	static int kvm_is_user_mode(void)
				7195	{
				7196	int user_mode = 3;
				7197
				7198	if (__this_cpu_read(current_vcpu))
				7199	user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
				7200
				7201	return user_mode != 0;
				7202	}
				7203
				7204	static unsigned long kvm_get_guest_ip(void)
				7205	{
				7206	unsigned long ip = 0;
				7207
				7208	if (__this_cpu_read(current_vcpu))
				7209	ip = kvm_rip_read(__this_cpu_read(current_vcpu));
				7210
				7211	return ip;
				7212	}
				7213
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7214	static void kvm_handle_intel_pt_intr(void)
				7215	{
				7216	struct kvm_vcpu *vcpu = __this_cpu_read(current_vcpu);
				7217
				7218	kvm_make_request(KVM_REQ_PMI, vcpu);
				7219	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
				7220	(unsigned long *)&vcpu->arch.pmu.global_status);
				7221	}
				7222
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7223	static struct perf_guest_info_callbacks kvm_guest_cbs = {
				7224	.is_in_guest = kvm_is_in_guest,
				7225	.is_user_mode = kvm_is_user_mode,
				7226	.get_guest_ip = kvm_get_guest_ip,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7227	.handle_intel_pt_intr = kvm_handle_intel_pt_intr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7228	};
				7229
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7230	#ifdef CONFIG_X86_64
				7231	static void pvclock_gtod_update_fn(struct work_struct *work)
				7232	{
				7233	struct kvm *kvm;
				7234
				7235	struct kvm_vcpu *vcpu;
				7236	int i;
				7237
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7238	mutex_lock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7239	list_for_each_entry(kvm, &vm_list, vm_list)
				7240	kvm_for_each_vcpu(i, vcpu, kvm)
				7241	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				7242	atomic_set(&kvm_guest_has_master_clock, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7243	mutex_unlock(&kvm_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7244	}
				7245
				7246	static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
				7247
				7248	/*
				7249	* Notification about pvclock gtod data update.
				7250	*/
				7251	static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
				7252	void *priv)
				7253	{
				7254	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
				7255	struct timekeeper *tk = priv;
				7256
				7257	update_pvclock_gtod(tk);
				7258
				7259	/* disable master clock if host does not trust, or does not
				7260	* use, TSC based clocksource.
				7261	*/
				7262	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
				7263	atomic_read(&kvm_guest_has_master_clock) != 0)
				7264	queue_work(system_long_wq, &pvclock_gtod_work);
				7265
				7266	return 0;
				7267	}
				7268
				7269	static struct notifier_block pvclock_gtod_notifier = {
				7270	.notifier_call = pvclock_gtod_notify,
				7271	};
				7272	#endif
				7273
				7274	int kvm_arch_init(void *opaque)
				7275	{
				7276	int r;
				7277	struct kvm_x86_ops *ops = opaque;
				7278
				7279	if (kvm_x86_ops) {
				7280	printk(KERN_ERR "kvm: already loaded the other module\n");
				7281	r = -EEXIST;
				7282	goto out;
				7283	}
				7284
				7285	if (!ops->cpu_has_kvm_support()) {
				7286	printk(KERN_ERR "kvm: no hardware support\n");
				7287	r = -EOPNOTSUPP;
				7288	goto out;
				7289	}
				7290	if (ops->disabled_by_bios()) {
				7291	printk(KERN_ERR "kvm: disabled by bios\n");
				7292	r = -EOPNOTSUPP;
				7293	goto out;
				7294	}
				7295
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7296	/*
				7297	* KVM explicitly assumes that the guest has an FPU and
				7298	* FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the
				7299	* vCPU's FPU state as a fxregs_state struct.
				7300	*/
				7301	if (!boot_cpu_has(X86_FEATURE_FPU) \|\| !boot_cpu_has(X86_FEATURE_FXSR)) {
				7302	printk(KERN_ERR "kvm: inadequate fpu\n");
				7303	r = -EOPNOTSUPP;
				7304	goto out;
				7305	}
				7306
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7307	r = -ENOMEM;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7308	x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu),
				7309	__alignof__(struct fpu), SLAB_ACCOUNT,
				7310	NULL);
				7311	if (!x86_fpu_cache) {
				7312	printk(KERN_ERR "kvm: failed to allocate cache for x86 fpu\n");
				7313	goto out;
				7314	}
				7315
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7316	shared_msrs = alloc_percpu(struct kvm_shared_msrs);
				7317	if (!shared_msrs) {
				7318	printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7319	goto out_free_x86_fpu_cache;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7320	}
				7321
				7322	r = kvm_mmu_module_init();
				7323	if (r)
				7324	goto out_free_percpu;
				7325
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7326	kvm_x86_ops = ops;
				7327
				7328	kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
				7329	PT_DIRTY_MASK, PT64_NX_MASK, 0,
				7330	PT_PRESENT_MASK, 0, sme_me_mask);
				7331	kvm_timer_init();
				7332
				7333	perf_register_guest_info_callbacks(&kvm_guest_cbs);
				7334
				7335	if (boot_cpu_has(X86_FEATURE_XSAVE))
				7336	host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
				7337
				7338	kvm_lapic_init();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7339	if (pi_inject_timer == -1)
				7340	pi_inject_timer = housekeeping_enabled(HK_FLAG_TIMER);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7341	#ifdef CONFIG_X86_64
				7342	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
				7343
				7344	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
				7345	set_hv_tscchange_cb(kvm_hyperv_tsc_notifier);
				7346	#endif
				7347
				7348	return 0;
				7349
				7350	out_free_percpu:
				7351	free_percpu(shared_msrs);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7352	out_free_x86_fpu_cache:
				7353	kmem_cache_destroy(x86_fpu_cache);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7354	out:
				7355	return r;
				7356	}
				7357
				7358	void kvm_arch_exit(void)
				7359	{
				7360	#ifdef CONFIG_X86_64
				7361	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
				7362	clear_hv_tscchange_cb();
				7363	#endif
				7364	kvm_lapic_exit();
				7365	perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
				7366
				7367	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
				7368	cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
				7369	CPUFREQ_TRANSITION_NOTIFIER);
				7370	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
				7371	#ifdef CONFIG_X86_64
				7372	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7373	cancel_work_sync(&pvclock_gtod_work);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7374	#endif
				7375	kvm_x86_ops = NULL;
				7376	kvm_mmu_module_exit();
				7377	free_percpu(shared_msrs);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7378	kmem_cache_destroy(x86_fpu_cache);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7379	}
				7380
				7381	int kvm_vcpu_halt(struct kvm_vcpu *vcpu)
				7382	{
				7383	++vcpu->stat.halt_exits;
				7384	if (lapic_in_kernel(vcpu)) {
				7385	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
				7386	return 1;
				7387	} else {
				7388	vcpu->run->exit_reason = KVM_EXIT_HLT;
				7389	return 0;
				7390	}
				7391	}
				7392	EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
				7393
				7394	int kvm_emulate_halt(struct kvm_vcpu *vcpu)
				7395	{
				7396	int ret = kvm_skip_emulated_instruction(vcpu);
				7397	/*
				7398	* TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered
				7399	* KVM_EXIT_DEBUG here.
				7400	*/
				7401	return kvm_vcpu_halt(vcpu) && ret;
				7402	}
				7403	EXPORT_SYMBOL_GPL(kvm_emulate_halt);
				7404
				7405	#ifdef CONFIG_X86_64
				7406	static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr,
				7407	unsigned long clock_type)
				7408	{
				7409	struct kvm_clock_pairing clock_pairing;
				7410	struct timespec64 ts;
				7411	u64 cycle;
				7412	int ret;
				7413
				7414	if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK)
				7415	return -KVM_EOPNOTSUPP;
				7416
				7417	if (kvm_get_walltime_and_clockread(&ts, &cycle) == false)
				7418	return -KVM_EOPNOTSUPP;
				7419
				7420	clock_pairing.sec = ts.tv_sec;
				7421	clock_pairing.nsec = ts.tv_nsec;
				7422	clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle);
				7423	clock_pairing.flags = 0;
				7424	memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad));
				7425
				7426	ret = 0;
				7427	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
				7428	sizeof(struct kvm_clock_pairing)))
				7429	ret = -KVM_EFAULT;
				7430
				7431	return ret;
				7432	}
				7433	#endif
				7434
				7435	/*
				7436	* kvm_pv_kick_cpu_op: Kick a vcpu.
				7437	*
				7438	* @apicid - apicid of vcpu to be kicked.
				7439	*/
				7440	static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
				7441	{
				7442	struct kvm_lapic_irq lapic_irq;
				7443
				7444	lapic_irq.shorthand = 0;
				7445	lapic_irq.dest_mode = 0;
				7446	lapic_irq.level = 0;
				7447	lapic_irq.dest_id = apicid;
				7448	lapic_irq.msi_redir_hint = false;
				7449
				7450	lapic_irq.delivery_mode = APIC_DM_REMRD;
				7451	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
				7452	}
				7453
				7454	void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
				7455	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7456	if (!lapic_in_kernel(vcpu)) {
				7457	WARN_ON_ONCE(vcpu->arch.apicv_active);
				7458	return;
				7459	}
				7460	if (!vcpu->arch.apicv_active)
				7461	return;
				7462
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7463	vcpu->arch.apicv_active = false;
				7464	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
				7465	}
				7466
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7467	static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
				7468	{
				7469	struct kvm_vcpu *target = NULL;
				7470	struct kvm_apic_map *map;
				7471
				7472	rcu_read_lock();
				7473	map = rcu_dereference(kvm->arch.apic_map);
				7474
				7475	if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id])
				7476	target = map->phys_map[dest_id]->vcpu;
				7477
				7478	rcu_read_unlock();
				7479
				7480	if (target && READ_ONCE(target->ready))
				7481	kvm_vcpu_yield_to(target);
				7482	}
				7483
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7484	int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
				7485	{
				7486	unsigned long nr, a0, a1, a2, a3, ret;
				7487	int op_64_bit;
				7488
				7489	if (kvm_hv_hypercall_enabled(vcpu->kvm))
				7490	return kvm_hv_hypercall(vcpu);
				7491
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7492	nr = kvm_rax_read(vcpu);
				7493	a0 = kvm_rbx_read(vcpu);
				7494	a1 = kvm_rcx_read(vcpu);
				7495	a2 = kvm_rdx_read(vcpu);
				7496	a3 = kvm_rsi_read(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7497
				7498	trace_kvm_hypercall(nr, a0, a1, a2, a3);
				7499
				7500	op_64_bit = is_64_bit_mode(vcpu);
				7501	if (!op_64_bit) {
				7502	nr &= 0xFFFFFFFF;
				7503	a0 &= 0xFFFFFFFF;
				7504	a1 &= 0xFFFFFFFF;
				7505	a2 &= 0xFFFFFFFF;
				7506	a3 &= 0xFFFFFFFF;
				7507	}
				7508
				7509	if (kvm_x86_ops->get_cpl(vcpu) != 0) {
				7510	ret = -KVM_EPERM;
				7511	goto out;
				7512	}
				7513
				7514	switch (nr) {
				7515	case KVM_HC_VAPIC_POLL_IRQ:
				7516	ret = 0;
				7517	break;
				7518	case KVM_HC_KICK_CPU:
				7519	kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7520	kvm_sched_yield(vcpu->kvm, a1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7521	ret = 0;
				7522	break;
				7523	#ifdef CONFIG_X86_64
				7524	case KVM_HC_CLOCK_PAIRING:
				7525	ret = kvm_pv_clock_pairing(vcpu, a0, a1);
				7526	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7527	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7528	case KVM_HC_SEND_IPI:
				7529	ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
				7530	break;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7531	case KVM_HC_SCHED_YIELD:
				7532	kvm_sched_yield(vcpu->kvm, a0);
				7533	ret = 0;
				7534	break;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7535	default:
				7536	ret = -KVM_ENOSYS;
				7537	break;
				7538	}
				7539	out:
				7540	if (!op_64_bit)
				7541	ret = (u32)ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7542	kvm_rax_write(vcpu, ret);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7543
				7544	++vcpu->stat.hypercalls;
				7545	return kvm_skip_emulated_instruction(vcpu);
				7546	}
				7547	EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
				7548
				7549	static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
				7550	{
				7551	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
				7552	char instruction[3];
				7553	unsigned long rip = kvm_rip_read(vcpu);
				7554
				7555	kvm_x86_ops->patch_hypercall(vcpu, instruction);
				7556
				7557	return emulator_write_emulated(ctxt, rip, instruction, 3,
				7558	&ctxt->exception);
				7559	}
				7560
				7561	static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
				7562	{
				7563	return vcpu->run->request_interrupt_window &&
				7564	likely(!pic_in_kernel(vcpu->kvm));
				7565	}
				7566
				7567	static void post_kvm_run_save(struct kvm_vcpu *vcpu)
				7568	{
				7569	struct kvm_run *kvm_run = vcpu->run;
				7570
				7571	kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
				7572	kvm_run->flags = is_smm(vcpu) ? KVM_RUN_X86_SMM : 0;
				7573	kvm_run->cr8 = kvm_get_cr8(vcpu);
				7574	kvm_run->apic_base = kvm_get_apic_base(vcpu);
				7575	kvm_run->ready_for_interrupt_injection =
				7576	pic_in_kernel(vcpu->kvm) \|\|
				7577	kvm_vcpu_ready_for_interrupt_injection(vcpu);
				7578	}
				7579
				7580	static void update_cr8_intercept(struct kvm_vcpu *vcpu)
				7581	{
				7582	int max_irr, tpr;
				7583
				7584	if (!kvm_x86_ops->update_cr8_intercept)
				7585	return;
				7586
				7587	if (!lapic_in_kernel(vcpu))
				7588	return;
				7589
				7590	if (vcpu->arch.apicv_active)
				7591	return;
				7592
				7593	if (!vcpu->arch.apic->vapic_addr)
				7594	max_irr = kvm_lapic_find_highest_irr(vcpu);
				7595	else
				7596	max_irr = -1;
				7597
				7598	if (max_irr != -1)
				7599	max_irr >>= 4;
				7600
				7601	tpr = kvm_lapic_get_cr8(vcpu);
				7602
				7603	kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
				7604	}
				7605
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7606	static void kvm_inject_exception(struct kvm_vcpu *vcpu)
				7607	{
				7608	if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
				7609	vcpu->arch.exception.error_code = false;
				7610	kvm_x86_ops->queue_exception(vcpu);
				7611	}
				7612
				7613	static int inject_pending_event(struct kvm_vcpu *vcpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7614	{
				7615	int r;
				7616
				7617	/* try to reinject previous events if any */
				7618
				7619	if (vcpu->arch.exception.injected)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7620	kvm_inject_exception(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7621	/*
				7622	* Do not inject an NMI or interrupt if there is a pending
				7623	* exception. Exceptions and interrupts are recognized at
				7624	* instruction boundaries, i.e. the start of an instruction.
				7625	* Trap-like exceptions, e.g. #DB, have higher priority than
				7626	* NMIs and interrupts, i.e. traps are recognized before an
				7627	* NMI/interrupt that's pending on the same instruction.
				7628	* Fault-like exceptions, e.g. #GP and #PF, are the lowest
				7629	* priority, but are only generated (pended) during instruction
				7630	* execution, i.e. a pending fault-like exception means the
				7631	* fault occurred on the previous instruction and must be
				7632	* serviced prior to recognizing any new events in order to
				7633	* fully complete the previous instruction.
				7634	*/
				7635	else if (!vcpu->arch.exception.pending) {
				7636	if (vcpu->arch.nmi_injected)
				7637	kvm_x86_ops->set_nmi(vcpu);
				7638	else if (vcpu->arch.interrupt.injected)
				7639	kvm_x86_ops->set_irq(vcpu);
				7640	}
				7641
				7642	/*
				7643	* Call check_nested_events() even if we reinjected a previous event
				7644	* in order for caller to determine if it should require immediate-exit
				7645	* from L2 to L1 due to pending L1 events which require exit
				7646	* from L2 to L1.
				7647	*/
				7648	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7649	r = kvm_x86_ops->check_nested_events(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7650	if (r != 0)
				7651	return r;
				7652	}
				7653
				7654	/* try to inject new event if pending */
				7655	if (vcpu->arch.exception.pending) {
				7656	trace_kvm_inj_exception(vcpu->arch.exception.nr,
				7657	vcpu->arch.exception.has_error_code,
				7658	vcpu->arch.exception.error_code);
				7659
				7660	WARN_ON_ONCE(vcpu->arch.exception.injected);
				7661	vcpu->arch.exception.pending = false;
				7662	vcpu->arch.exception.injected = true;
				7663
				7664	if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
				7665	__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) \|
				7666	X86_EFLAGS_RF);
				7667
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7668	if (vcpu->arch.exception.nr == DB_VECTOR) {
				7669	/*
				7670	* This code assumes that nSVM doesn't use
				7671	* check_nested_events(). If it does, the
				7672	* DR6/DR7 changes should happen before L1
				7673	* gets a #VMEXIT for an intercepted #DB in
				7674	* L2. (Under VMX, on the other hand, the
				7675	* DR6/DR7 changes should not happen in the
				7676	* event of a VM-exit to L1 for an intercepted
				7677	* #DB in L2.)
				7678	*/
				7679	kvm_deliver_exception_payload(vcpu);
				7680	if (vcpu->arch.dr7 & DR7_GD) {
				7681	vcpu->arch.dr7 &= ~DR7_GD;
				7682	kvm_update_dr7(vcpu);
				7683	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7684	}
				7685
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7686	kvm_inject_exception(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7687	}
				7688
				7689	/* Don't consider new event if we re-injected an event */
				7690	if (kvm_event_needs_reinjection(vcpu))
				7691	return 0;
				7692
				7693	if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
				7694	kvm_x86_ops->smi_allowed(vcpu)) {
				7695	vcpu->arch.smi_pending = false;
				7696	++vcpu->arch.smi_count;
				7697	enter_smm(vcpu);
				7698	} else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
				7699	--vcpu->arch.nmi_pending;
				7700	vcpu->arch.nmi_injected = true;
				7701	kvm_x86_ops->set_nmi(vcpu);
				7702	} else if (kvm_cpu_has_injectable_intr(vcpu)) {
				7703	/*
				7704	* Because interrupts can be injected asynchronously, we are
				7705	* calling check_nested_events again here to avoid a race condition.
				7706	* See https://lkml.org/lkml/2014/7/2/60 for discussion about this
				7707	* proposal and current concerns. Perhaps we should be setting
				7708	* KVM_REQ_EVENT only on certain events and not unconditionally?
				7709	*/
				7710	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	7711	r = kvm_x86_ops->check_nested_events(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7712	if (r != 0)
				7713	return r;
				7714	}
				7715	if (kvm_x86_ops->interrupt_allowed(vcpu)) {
				7716	kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
				7717	false);
				7718	kvm_x86_ops->set_irq(vcpu);
				7719	}
				7720	}
				7721
				7722	return 0;
				7723	}
				7724
				7725	static void process_nmi(struct kvm_vcpu *vcpu)
				7726	{
				7727	unsigned limit = 2;
				7728
				7729	/*
				7730	* x86 is limited to one NMI running, and one NMI pending after it.
				7731	* If an NMI is already in progress, limit further NMIs to just one.
				7732	* Otherwise, allow two (and we'll inject the first one immediately).
				7733	*/
				7734	if (kvm_x86_ops->get_nmi_mask(vcpu) \|\| vcpu->arch.nmi_injected)
				7735	limit = 1;
				7736
				7737	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
				7738	vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit);
				7739	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7740	}
				7741
				7742	static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
				7743	{
				7744	u32 flags = 0;
				7745	flags \|= seg->g << 23;
				7746	flags \|= seg->db << 22;
				7747	flags \|= seg->l << 21;
				7748	flags \|= seg->avl << 20;
				7749	flags \|= seg->present << 15;
				7750	flags \|= seg->dpl << 13;
				7751	flags \|= seg->s << 12;
				7752	flags \|= seg->type << 8;
				7753	return flags;
				7754	}
				7755
				7756	static void enter_smm_save_seg_32(struct kvm_vcpu vcpu, char buf, int n)
				7757	{
				7758	struct kvm_segment seg;
				7759	int offset;
				7760
				7761	kvm_get_segment(vcpu, &seg, n);
				7762	put_smstate(u32, buf, 0x7fa8 + n * 4, seg.selector);
				7763
				7764	if (n < 3)
				7765	offset = 0x7f84 + n * 12;
				7766	else
				7767	offset = 0x7f2c + (n - 3) * 12;
				7768
				7769	put_smstate(u32, buf, offset + 8, seg.base);
				7770	put_smstate(u32, buf, offset + 4, seg.limit);
				7771	put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
				7772	}
				7773
				7774	#ifdef CONFIG_X86_64
				7775	static void enter_smm_save_seg_64(struct kvm_vcpu vcpu, char buf, int n)
				7776	{
				7777	struct kvm_segment seg;
				7778	int offset;
				7779	u16 flags;
				7780
				7781	kvm_get_segment(vcpu, &seg, n);
				7782	offset = 0x7e00 + n * 16;
				7783
				7784	flags = enter_smm_get_segment_flags(&seg) >> 8;
				7785	put_smstate(u16, buf, offset, seg.selector);
				7786	put_smstate(u16, buf, offset + 2, flags);
				7787	put_smstate(u32, buf, offset + 4, seg.limit);
				7788	put_smstate(u64, buf, offset + 8, seg.base);
				7789	}
				7790	#endif
				7791
				7792	static void enter_smm_save_state_32(struct kvm_vcpu vcpu, char buf)
				7793	{
				7794	struct desc_ptr dt;
				7795	struct kvm_segment seg;
				7796	unsigned long val;
				7797	int i;
				7798
				7799	put_smstate(u32, buf, 0x7ffc, kvm_read_cr0(vcpu));
				7800	put_smstate(u32, buf, 0x7ff8, kvm_read_cr3(vcpu));
				7801	put_smstate(u32, buf, 0x7ff4, kvm_get_rflags(vcpu));
				7802	put_smstate(u32, buf, 0x7ff0, kvm_rip_read(vcpu));
				7803
				7804	for (i = 0; i < 8; i++)
				7805	put_smstate(u32, buf, 0x7fd0 + i * 4, kvm_register_read(vcpu, i));
				7806
				7807	kvm_get_dr(vcpu, 6, &val);
				7808	put_smstate(u32, buf, 0x7fcc, (u32)val);
				7809	kvm_get_dr(vcpu, 7, &val);
				7810	put_smstate(u32, buf, 0x7fc8, (u32)val);
				7811
				7812	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
				7813	put_smstate(u32, buf, 0x7fc4, seg.selector);
				7814	put_smstate(u32, buf, 0x7f64, seg.base);
				7815	put_smstate(u32, buf, 0x7f60, seg.limit);
				7816	put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
				7817
				7818	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
				7819	put_smstate(u32, buf, 0x7fc0, seg.selector);
				7820	put_smstate(u32, buf, 0x7f80, seg.base);
				7821	put_smstate(u32, buf, 0x7f7c, seg.limit);
				7822	put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
				7823
				7824	kvm_x86_ops->get_gdt(vcpu, &dt);
				7825	put_smstate(u32, buf, 0x7f74, dt.address);
				7826	put_smstate(u32, buf, 0x7f70, dt.size);
				7827
				7828	kvm_x86_ops->get_idt(vcpu, &dt);
				7829	put_smstate(u32, buf, 0x7f58, dt.address);
				7830	put_smstate(u32, buf, 0x7f54, dt.size);
				7831
				7832	for (i = 0; i < 6; i++)
				7833	enter_smm_save_seg_32(vcpu, buf, i);
				7834
				7835	put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
				7836
				7837	/* revision id */
				7838	put_smstate(u32, buf, 0x7efc, 0x00020000);
				7839	put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
				7840	}
				7841
#ifdef CONFIG_X86_64
/*
 * Build the 64-bit SMM state-save image for @vcpu in @buf: the sixteen
 * general-purpose registers, RIP, RFLAGS, DR6/DR7, CR0/CR3/CR4, SMBASE,
 * the revision id, EFER, TR, IDT, LDTR, GDT and the six segment
 * registers.
 */
static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
	struct desc_ptr dt;
	struct kvm_segment seg;
	unsigned long val;
	int i;

	/* Registers are laid out downwards from 0x7ff8, 8 bytes each. */
	for (i = 0; i < 16; i++)
		put_smstate(u64, buf, 0x7ff8 - i * 8, kvm_register_read(vcpu, i));

	put_smstate(u64, buf, 0x7f78, kvm_rip_read(vcpu));
	put_smstate(u32, buf, 0x7f70, kvm_get_rflags(vcpu));

	kvm_get_dr(vcpu, 6, &val);
	put_smstate(u64, buf, 0x7f68, val);
	kvm_get_dr(vcpu, 7, &val);
	put_smstate(u64, buf, 0x7f60, val);

	put_smstate(u64, buf, 0x7f58, kvm_read_cr0(vcpu));
	put_smstate(u64, buf, 0x7f50, kvm_read_cr3(vcpu));
	put_smstate(u64, buf, 0x7f48, kvm_read_cr4(vcpu));

	put_smstate(u32, buf, 0x7f00, vcpu->arch.smbase);

	/* revision id */
	put_smstate(u32, buf, 0x7efc, 0x00020064);

	put_smstate(u64, buf, 0x7ed0, vcpu->arch.efer);

	kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
	put_smstate(u16, buf, 0x7e90, seg.selector);
	put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
	put_smstate(u32, buf, 0x7e94, seg.limit);
	put_smstate(u64, buf, 0x7e98, seg.base);

	kvm_x86_ops->get_idt(vcpu, &dt);
	put_smstate(u32, buf, 0x7e84, dt.size);
	put_smstate(u64, buf, 0x7e88, dt.address);

	kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
	put_smstate(u16, buf, 0x7e70, seg.selector);
	put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
	put_smstate(u32, buf, 0x7e74, seg.limit);
	put_smstate(u64, buf, 0x7e78, seg.base);

	kvm_x86_ops->get_gdt(vcpu, &dt);
	put_smstate(u32, buf, 0x7e64, dt.size);
	put_smstate(u64, buf, 0x7e68, dt.address);

	for (i = 0; i < 6; i++)
		enter_smm_save_seg_64(vcpu, buf, i);
}
#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7896
				7897	static void enter_smm(struct kvm_vcpu *vcpu)
				7898	{
				7899	struct kvm_segment cs, ds;
				7900	struct desc_ptr dt;
				7901	char buf[512];
				7902	u32 cr0;
				7903
				7904	trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
				7905	memset(buf, 0, 512);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7906	#ifdef CONFIG_X86_64
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7907	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
				7908	enter_smm_save_state_64(vcpu, buf);
				7909	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7910	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7911	enter_smm_save_state_32(vcpu, buf);
				7912
				7913	/*
				7914	* Give pre_enter_smm() a chance to make ISA-specific changes to the
				7915	* vCPU state (e.g. leave guest mode) after we've saved the state into
				7916	* the SMM state-save area.
				7917	*/
				7918	kvm_x86_ops->pre_enter_smm(vcpu, buf);
				7919
				7920	vcpu->arch.hflags \|= HF_SMM_MASK;
				7921	kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
				7922
				7923	if (kvm_x86_ops->get_nmi_mask(vcpu))
				7924	vcpu->arch.hflags \|= HF_SMM_INSIDE_NMI_MASK;
				7925	else
				7926	kvm_x86_ops->set_nmi_mask(vcpu, true);
				7927
				7928	kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
				7929	kvm_rip_write(vcpu, 0x8000);
				7930
				7931	cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE \| X86_CR0_EM \| X86_CR0_TS \| X86_CR0_PG);
				7932	kvm_x86_ops->set_cr0(vcpu, cr0);
				7933	vcpu->arch.cr0 = cr0;
				7934
				7935	kvm_x86_ops->set_cr4(vcpu, 0);
				7936
				7937	/* Undocumented: IDT limit is set to zero on entry to SMM. */
				7938	dt.address = dt.size = 0;
				7939	kvm_x86_ops->set_idt(vcpu, &dt);
				7940
				7941	__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
				7942
				7943	cs.selector = (vcpu->arch.smbase >> 4) & 0xffff;
				7944	cs.base = vcpu->arch.smbase;
				7945
				7946	ds.selector = 0;
				7947	ds.base = 0;
				7948
				7949	cs.limit = ds.limit = 0xffffffff;
				7950	cs.type = ds.type = 0x3;
				7951	cs.dpl = ds.dpl = 0;
				7952	cs.db = ds.db = 0;
				7953	cs.s = ds.s = 1;
				7954	cs.l = ds.l = 0;
				7955	cs.g = ds.g = 1;
				7956	cs.avl = ds.avl = 0;
				7957	cs.present = ds.present = 1;
				7958	cs.unusable = ds.unusable = 0;
				7959	cs.padding = ds.padding = 0;
				7960
				7961	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
				7962	kvm_set_segment(vcpu, &ds, VCPU_SREG_DS);
				7963	kvm_set_segment(vcpu, &ds, VCPU_SREG_ES);
				7964	kvm_set_segment(vcpu, &ds, VCPU_SREG_FS);
				7965	kvm_set_segment(vcpu, &ds, VCPU_SREG_GS);
				7966	kvm_set_segment(vcpu, &ds, VCPU_SREG_SS);
				7967
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7968	#ifdef CONFIG_X86_64
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7969	if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
				7970	kvm_x86_ops->set_efer(vcpu, 0);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	7971	#endif
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	7972
				7973	kvm_update_cpuid(vcpu);
				7974	kvm_mmu_reset_context(vcpu);
				7975	}
				7976
				7977	static void process_smi(struct kvm_vcpu *vcpu)
				7978	{
				7979	vcpu->arch.smi_pending = true;
				7980	kvm_make_request(KVM_REQ_EVENT, vcpu);
				7981	}
				7982
				7983	void kvm_make_scan_ioapic_request(struct kvm *kvm)
				7984	{
				7985	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
				7986	}
				7987
				7988	static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
				7989	{
				7990	if (!kvm_apic_present(vcpu))
				7991	return;
				7992
				7993	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);
				7994
				7995	if (irqchip_split(vcpu->kvm))
				7996	kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
				7997	else {
				7998	if (vcpu->arch.apicv_active)
				7999	kvm_x86_ops->sync_pir_to_irr(vcpu);
				8000	if (ioapic_in_kernel(vcpu->kvm))
				8001	kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
				8002	}
				8003
				8004	if (is_guest_mode(vcpu))
				8005	vcpu->arch.load_eoi_exitmap_pending = true;
				8006	else
				8007	kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu);
				8008	}
				8009
				8010	static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu)
				8011	{
				8012	u64 eoi_exit_bitmap[4];
				8013
				8014	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
				8015	return;
				8016
				8017	bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
				8018	vcpu_to_synic(vcpu)->vec_bitmap, 256);
				8019	kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
				8020	}
				8021
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8022	void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
				8023	unsigned long start, unsigned long end)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8024	{
				8025	unsigned long apic_address;
				8026
				8027	/*
				8028	* The physical address of apic access page is stored in the VMCS.
				8029	* Update it when it becomes invalid.
				8030	*/
				8031	apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
				8032	if (start <= apic_address && apic_address < end)
				8033	kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8034	}
				8035
				8036	void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
				8037	{
				8038	struct page *page = NULL;
				8039
				8040	if (!lapic_in_kernel(vcpu))
				8041	return;
				8042
				8043	if (!kvm_x86_ops->set_apic_access_page_addr)
				8044	return;
				8045
				8046	page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
				8047	if (is_error_page(page))
				8048	return;
				8049	kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
				8050
				8051	/*
				8052	* Do not pin apic access page in memory, the MMU notifier
				8053	* will call us again if it is migrated or swapped out.
				8054	*/
				8055	put_page(page);
				8056	}
				8057	EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
				8058
				8059	void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
				8060	{
				8061	smp_send_reschedule(vcpu->cpu);
				8062	}
				8063	EXPORT_SYMBOL_GPL(__kvm_request_immediate_exit);
				8064
				8065	/*
				8066	* Returns 1 to let vcpu_run() continue the guest execution loop without
				8067	* exiting to the userspace. Otherwise, the value will be returned to the
				8068	* userspace.
				8069	*/
				8070	static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
				8071	{
				8072	int r;
				8073	bool req_int_win =
				8074	dm_request_for_irq_injection(vcpu) &&
				8075	kvm_cpu_accept_dm_intr(vcpu);
				8076
				8077	bool req_immediate_exit = false;
				8078
				8079	if (kvm_request_pending(vcpu)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8080	if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
				8081	if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
				8082	r = 0;
				8083	goto out;
				8084	}
				8085	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8086	if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu))
				8087	kvm_mmu_unload(vcpu);
				8088	if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
				8089	__kvm_migrate_timers(vcpu);
				8090	if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
				8091	kvm_gen_update_masterclock(vcpu->kvm);
				8092	if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
				8093	kvm_gen_kvmclock_update(vcpu);
				8094	if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
				8095	r = kvm_guest_time_update(vcpu);
				8096	if (unlikely(r))
				8097	goto out;
				8098	}
				8099	if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
				8100	kvm_mmu_sync_roots(vcpu);
				8101	if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
				8102	kvm_mmu_load_cr3(vcpu);
				8103	if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
				8104	kvm_vcpu_flush_tlb(vcpu, true);
				8105	if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
				8106	vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
				8107	r = 0;
				8108	goto out;
				8109	}
				8110	if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) {
				8111	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
				8112	vcpu->mmio_needed = 0;
				8113	r = 0;
				8114	goto out;
				8115	}
				8116	if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
				8117	/* Page is swapped out. Do synthetic halt */
				8118	vcpu->arch.apf.halted = true;
				8119	r = 1;
				8120	goto out;
				8121	}
				8122	if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu))
				8123	record_steal_time(vcpu);
				8124	if (kvm_check_request(KVM_REQ_SMI, vcpu))
				8125	process_smi(vcpu);
				8126	if (kvm_check_request(KVM_REQ_NMI, vcpu))
				8127	process_nmi(vcpu);
				8128	if (kvm_check_request(KVM_REQ_PMU, vcpu))
				8129	kvm_pmu_handle_event(vcpu);
				8130	if (kvm_check_request(KVM_REQ_PMI, vcpu))
				8131	kvm_pmu_deliver_pmi(vcpu);
				8132	if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) {
				8133	BUG_ON(vcpu->arch.pending_ioapic_eoi > 255);
				8134	if (test_bit(vcpu->arch.pending_ioapic_eoi,
				8135	vcpu->arch.ioapic_handled_vectors)) {
				8136	vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI;
				8137	vcpu->run->eoi.vector =
				8138	vcpu->arch.pending_ioapic_eoi;
				8139	r = 0;
				8140	goto out;
				8141	}
				8142	}
				8143	if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu))
				8144	vcpu_scan_ioapic(vcpu);
				8145	if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu))
				8146	vcpu_load_eoi_exitmap(vcpu);
				8147	if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
				8148	kvm_vcpu_reload_apic_access_page(vcpu);
				8149	if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
				8150	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
				8151	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
				8152	r = 0;
				8153	goto out;
				8154	}
				8155	if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
				8156	vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
				8157	vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
				8158	r = 0;
				8159	goto out;
				8160	}
				8161	if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
				8162	vcpu->run->exit_reason = KVM_EXIT_HYPERV;
				8163	vcpu->run->hyperv = vcpu->arch.hyperv.exit;
				8164	r = 0;
				8165	goto out;
				8166	}
				8167
				8168	/*
				8169	* KVM_REQ_HV_STIMER has to be processed after
				8170	* KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
				8171	* depend on the guest clock being up-to-date
				8172	*/
				8173	if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
				8174	kvm_hv_process_stimers(vcpu);
				8175	}
				8176
				8177	if (kvm_check_request(KVM_REQ_EVENT, vcpu) \|\| req_int_win) {
				8178	++vcpu->stat.req_event;
				8179	kvm_apic_accept_events(vcpu);
				8180	if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
				8181	r = 1;
				8182	goto out;
				8183	}
				8184
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8185	if (inject_pending_event(vcpu) != 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8186	req_immediate_exit = true;
				8187	else {
				8188	/* Enable SMI/NMI/IRQ window open exits if needed.
				8189	*
				8190	* SMIs have three cases:
				8191	* 1) They can be nested, and then there is nothing to
				8192	* do here because RSM will cause a vmexit anyway.
				8193	* 2) There is an ISA-specific reason why SMI cannot be
				8194	* injected, and the moment when this changes can be
				8195	* intercepted.
				8196	* 3) Or the SMI can be pending because
				8197	* inject_pending_event has completed the injection
				8198	* of an IRQ or NMI from the previous vmexit, and
				8199	* then we request an immediate exit to inject the
				8200	* SMI.
				8201	*/
				8202	if (vcpu->arch.smi_pending && !is_smm(vcpu))
				8203	if (!kvm_x86_ops->enable_smi_window(vcpu))
				8204	req_immediate_exit = true;
				8205	if (vcpu->arch.nmi_pending)
				8206	kvm_x86_ops->enable_nmi_window(vcpu);
				8207	if (kvm_cpu_has_injectable_intr(vcpu) \|\| req_int_win)
				8208	kvm_x86_ops->enable_irq_window(vcpu);
				8209	WARN_ON(vcpu->arch.exception.pending);
				8210	}
				8211
				8212	if (kvm_lapic_enabled(vcpu)) {
				8213	update_cr8_intercept(vcpu);
				8214	kvm_lapic_sync_to_vapic(vcpu);
				8215	}
				8216	}
				8217
				8218	r = kvm_mmu_reload(vcpu);
				8219	if (unlikely(r)) {
				8220	goto cancel_injection;
				8221	}
				8222
				8223	preempt_disable();
				8224
				8225	kvm_x86_ops->prepare_guest_switch(vcpu);
				8226
				8227	/*
				8228	* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
				8229	* IPI are then delayed after guest entry, which ensures that they
				8230	* result in virtual interrupt delivery.
				8231	*/
				8232	local_irq_disable();
				8233	vcpu->mode = IN_GUEST_MODE;
				8234
				8235	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
				8236
				8237	/*
				8238	* 1) We should set ->mode before checking ->requests. Please see
				8239	* the comment in kvm_vcpu_exiting_guest_mode().
				8240	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8241	* 2) For APICv, we should set ->mode before checking PID.ON. This
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8242	* pairs with the memory barrier implicit in pi_test_and_set_on
				8243	* (see vmx_deliver_posted_interrupt).
				8244	*
				8245	* 3) This also orders the write to mode from any reads to the page
				8246	* tables done while the VCPU is running. Please see the comment
				8247	* in kvm_flush_remote_tlbs.
				8248	*/
				8249	smp_mb__after_srcu_read_unlock();
				8250
				8251	/*
				8252	* This handles the case where a posted interrupt was
				8253	* notified with kvm_vcpu_kick.
				8254	*/
				8255	if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
				8256	kvm_x86_ops->sync_pir_to_irr(vcpu);
				8257
				8258	if (vcpu->mode == EXITING_GUEST_MODE \|\| kvm_request_pending(vcpu)
				8259	\|\| need_resched() \|\| signal_pending(current)) {
				8260	vcpu->mode = OUTSIDE_GUEST_MODE;
				8261	smp_wmb();
				8262	local_irq_enable();
				8263	preempt_enable();
				8264	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				8265	r = 1;
				8266	goto cancel_injection;
				8267	}
				8268
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8269	if (req_immediate_exit) {
				8270	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8271	kvm_x86_ops->request_immediate_exit(vcpu);
				8272	}
				8273
				8274	trace_kvm_entry(vcpu->vcpu_id);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8275	guest_enter_irqoff();
				8276
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8277	/* Save host pkru register if supported */
				8278	vcpu->arch.host_pkru = read_pkru();
				8279
				8280	fpregs_assert_state_consistent();
				8281	if (test_thread_flag(TIF_NEED_FPU_LOAD))
				8282	switch_fpu_return();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8283
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8284	if (unlikely(vcpu->arch.switch_db_regs)) {
				8285	set_debugreg(0, 7);
				8286	set_debugreg(vcpu->arch.eff_db[0], 0);
				8287	set_debugreg(vcpu->arch.eff_db[1], 1);
				8288	set_debugreg(vcpu->arch.eff_db[2], 2);
				8289	set_debugreg(vcpu->arch.eff_db[3], 3);
				8290	set_debugreg(vcpu->arch.dr6, 6);
				8291	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8292	} else if (unlikely(hw_breakpoint_active())) {
				8293	set_debugreg(0, 7);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8294	}
				8295
				8296	kvm_x86_ops->run(vcpu);
				8297
				8298	/*
				8299	* Do this here before restoring debug registers on the host. And
				8300	* since we do this before handling the vmexit, a DR access vmexit
				8301	* can (a) read the correct value of the debug registers, (b) set
				8302	* KVM_DEBUGREG_WONT_EXIT again.
				8303	*/
				8304	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
				8305	WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
				8306	kvm_x86_ops->sync_dirty_debug_regs(vcpu);
				8307	kvm_update_dr0123(vcpu);
				8308	kvm_update_dr6(vcpu);
				8309	kvm_update_dr7(vcpu);
				8310	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
				8311	}
				8312
				8313	/*
				8314	* If the guest has used debug registers, at least dr7
				8315	* will be disabled while returning to the host.
				8316	* If we don't have active breakpoints in the host, we don't
				8317	* care about the messed up debug address registers. But if
				8318	* we have some of them active, restore the old state.
				8319	*/
				8320	if (hw_breakpoint_active())
				8321	hw_breakpoint_restore();
				8322
				8323	vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
				8324
				8325	vcpu->mode = OUTSIDE_GUEST_MODE;
				8326	smp_wmb();
				8327
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8328	kvm_x86_ops->handle_exit_irqoff(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8329
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8330	/*
				8331	* Consume any pending interrupts, including the possible source of
				8332	* VM-Exit on SVM and any ticks that occur between VM-Exit and now.
				8333	* An instruction is required after local_irq_enable() to fully unblock
				8334	* interrupts on processors that implement an interrupt shadow, the
				8335	* stat.exits increment will do nicely.
				8336	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8337	kvm_before_interrupt(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8338	local_irq_enable();
				8339	++vcpu->stat.exits;
				8340	local_irq_disable();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8341	kvm_after_interrupt(vcpu);
				8342
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8343	guest_exit_irqoff();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8344	if (lapic_in_kernel(vcpu)) {
				8345	s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
				8346	if (delta != S64_MIN) {
				8347	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, delta);
				8348	vcpu->arch.apic->lapic_timer.advance_expire_delta = S64_MIN;
				8349	}
				8350	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8351
				8352	local_irq_enable();
				8353	preempt_enable();
				8354
				8355	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				8356
				8357	/*
				8358	* Profile KVM exit RIPs:
				8359	*/
				8360	if (unlikely(prof_on == KVM_PROFILING)) {
				8361	unsigned long rip = kvm_rip_read(vcpu);
				8362	profile_hit(KVM_PROFILING, (void *)rip);
				8363	}
				8364
				8365	if (unlikely(vcpu->arch.tsc_always_catchup))
				8366	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				8367
				8368	if (vcpu->arch.apic_attention)
				8369	kvm_lapic_sync_from_vapic(vcpu);
				8370
				8371	vcpu->arch.gpa_available = false;
				8372	r = kvm_x86_ops->handle_exit(vcpu);
				8373	return r;
				8374
				8375	cancel_injection:
				8376	kvm_x86_ops->cancel_injection(vcpu);
				8377	if (unlikely(vcpu->arch.apic_attention))
				8378	kvm_lapic_sync_from_vapic(vcpu);
				8379	out:
				8380	return r;
				8381	}
				8382
				8383	static inline int vcpu_block(struct kvm kvm, struct kvm_vcpu vcpu)
				8384	{
				8385	if (!kvm_arch_vcpu_runnable(vcpu) &&
				8386	(!kvm_x86_ops->pre_block \|\| kvm_x86_ops->pre_block(vcpu) == 0)) {
				8387	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				8388	kvm_vcpu_block(vcpu);
				8389	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				8390
				8391	if (kvm_x86_ops->post_block)
				8392	kvm_x86_ops->post_block(vcpu);
				8393
				8394	if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
				8395	return 1;
				8396	}
				8397
				8398	kvm_apic_accept_events(vcpu);
				8399	switch(vcpu->arch.mp_state) {
				8400	case KVM_MP_STATE_HALTED:
				8401	vcpu->arch.pv.pv_unhalted = false;
				8402	vcpu->arch.mp_state =
				8403	KVM_MP_STATE_RUNNABLE;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8404	/* fall through */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8405	case KVM_MP_STATE_RUNNABLE:
				8406	vcpu->arch.apf.halted = false;
				8407	break;
				8408	case KVM_MP_STATE_INIT_RECEIVED:
				8409	break;
				8410	default:
				8411	return -EINTR;
				8412	break;
				8413	}
				8414	return 1;
				8415	}
				8416
				8417	static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
				8418	{
				8419	if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8420	kvm_x86_ops->check_nested_events(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8421
				8422	return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
				8423	!vcpu->arch.apf.halted);
				8424	}
				8425
				8426	static int vcpu_run(struct kvm_vcpu *vcpu)
				8427	{
				8428	int r;
				8429	struct kvm *kvm = vcpu->kvm;
				8430
				8431	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				8432	vcpu->arch.l1tf_flush_l1d = true;
				8433
				8434	for (;;) {
				8435	if (kvm_vcpu_running(vcpu)) {
				8436	r = vcpu_enter_guest(vcpu);
				8437	} else {
				8438	r = vcpu_block(kvm, vcpu);
				8439	}
				8440
				8441	if (r <= 0)
				8442	break;
				8443
				8444	kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu);
				8445	if (kvm_cpu_has_pending_timer(vcpu))
				8446	kvm_inject_pending_timer_irqs(vcpu);
				8447
				8448	if (dm_request_for_irq_injection(vcpu) &&
				8449	kvm_vcpu_ready_for_interrupt_injection(vcpu)) {
				8450	r = 0;
				8451	vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
				8452	++vcpu->stat.request_irq_exits;
				8453	break;
				8454	}
				8455
				8456	kvm_check_async_pf_completion(vcpu);
				8457
				8458	if (signal_pending(current)) {
				8459	r = -EINTR;
				8460	vcpu->run->exit_reason = KVM_EXIT_INTR;
				8461	++vcpu->stat.signal_exits;
				8462	break;
				8463	}
				8464	if (need_resched()) {
				8465	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				8466	cond_resched();
				8467	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
				8468	}
				8469	}
				8470
				8471	srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
				8472
				8473	return r;
				8474	}
				8475
				8476	static inline int complete_emulated_io(struct kvm_vcpu *vcpu)
				8477	{
				8478	int r;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8479
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8480	vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
				8481	r = kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
				8482	srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8483	return r;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8484	}
				8485
				8486	static int complete_emulated_pio(struct kvm_vcpu *vcpu)
				8487	{
				8488	BUG_ON(!vcpu->arch.pio.count);
				8489
				8490	return complete_emulated_io(vcpu);
				8491	}
				8492
				8493	/*
				8494	* Implements the following, as a state machine:
				8495	*
				8496	* read:
				8497	* for each fragment
				8498	* for each mmio piece in the fragment
				8499	* write gpa, len
				8500	* exit
				8501	* copy data
				8502	* execute insn
				8503	*
				8504	* write:
				8505	* for each fragment
				8506	* for each mmio piece in the fragment
				8507	* write gpa, len
				8508	* copy data
				8509	* exit
				8510	*/
				8511	static int complete_emulated_mmio(struct kvm_vcpu *vcpu)
				8512	{
				8513	struct kvm_run *run = vcpu->run;
				8514	struct kvm_mmio_fragment *frag;
				8515	unsigned len;
				8516
				8517	BUG_ON(!vcpu->mmio_needed);
				8518
				8519	/* Complete previous fragment */
				8520	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
				8521	len = min(8u, frag->len);
				8522	if (!vcpu->mmio_is_write)
				8523	memcpy(frag->data, run->mmio.data, len);
				8524
				8525	if (frag->len <= 8) {
				8526	/* Switch to the next fragment. */
				8527	frag++;
				8528	vcpu->mmio_cur_fragment++;
				8529	} else {
				8530	/* Go forward to the next mmio piece. */
				8531	frag->data += len;
				8532	frag->gpa += len;
				8533	frag->len -= len;
				8534	}
				8535
				8536	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
				8537	vcpu->mmio_needed = 0;
				8538
				8539	/* FIXME: return into emulator if single-stepping. */
				8540	if (vcpu->mmio_is_write)
				8541	return 1;
				8542	vcpu->mmio_read_completed = 1;
				8543	return complete_emulated_io(vcpu);
				8544	}
				8545
				8546	run->exit_reason = KVM_EXIT_MMIO;
				8547	run->mmio.phys_addr = frag->gpa;
				8548	if (vcpu->mmio_is_write)
				8549	memcpy(run->mmio.data, frag->data, min(8u, frag->len));
				8550	run->mmio.len = min(8u, frag->len);
				8551	run->mmio.is_write = vcpu->mmio_is_write;
				8552	vcpu->arch.complete_userspace_io = complete_emulated_mmio;
				8553	return 0;
				8554	}
				8555
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8556	static void kvm_save_current_fpu(struct fpu *fpu)
				8557	{
				8558	/*
				8559	* If the target FPU state is not resident in the CPU registers, just
				8560	* memcpy() from current, else save CPU state directly to the target.
				8561	*/
				8562	if (test_thread_flag(TIF_NEED_FPU_LOAD))
				8563	memcpy(&fpu->state, &current->thread.fpu.state,
				8564	fpu_kernel_xstate_size);
				8565	else
				8566	copy_fpregs_to_fpstate(fpu);
				8567	}
				8568
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8569	/* Swap (qemu) user FPU context for the guest FPU context. */
				8570	static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
				8571	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8572	fpregs_lock();
				8573
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8574	kvm_save_current_fpu(vcpu->arch.user_fpu);
				8575
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8576	/* PKRU is separately restored in kvm_x86_ops->run. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8577	__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8578	~XFEATURE_MASK_PKRU);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8579
				8580	fpregs_mark_activate();
				8581	fpregs_unlock();
				8582
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8583	trace_kvm_fpu(1);
				8584	}
				8585
				8586	/* When vcpu_run ends, restore user space FPU context. */
				8587	static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
				8588	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8589	fpregs_lock();
				8590
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8591	kvm_save_current_fpu(vcpu->arch.guest_fpu);
				8592
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8593	copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
				8594
				8595	fpregs_mark_activate();
				8596	fpregs_unlock();
				8597
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8598	++vcpu->stat.fpu_reload;
				8599	trace_kvm_fpu(0);
				8600	}
				8601
				8602	int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu vcpu, struct kvm_run kvm_run)
				8603	{
				8604	int r;
				8605
				8606	vcpu_load(vcpu);
				8607	kvm_sigset_activate(vcpu);
				8608	kvm_load_guest_fpu(vcpu);
				8609
				8610	if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) {
				8611	if (kvm_run->immediate_exit) {
				8612	r = -EINTR;
				8613	goto out;
				8614	}
				8615	kvm_vcpu_block(vcpu);
				8616	kvm_apic_accept_events(vcpu);
				8617	kvm_clear_request(KVM_REQ_UNHALT, vcpu);
				8618	r = -EAGAIN;
				8619	if (signal_pending(current)) {
				8620	r = -EINTR;
				8621	vcpu->run->exit_reason = KVM_EXIT_INTR;
				8622	++vcpu->stat.signal_exits;
				8623	}
				8624	goto out;
				8625	}
				8626
				8627	if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
				8628	r = -EINVAL;
				8629	goto out;
				8630	}
				8631
				8632	if (vcpu->run->kvm_dirty_regs) {
				8633	r = sync_regs(vcpu);
				8634	if (r != 0)
				8635	goto out;
				8636	}
				8637
				8638	/* re-sync apic's tpr */
				8639	if (!lapic_in_kernel(vcpu)) {
				8640	if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
				8641	r = -EINVAL;
				8642	goto out;
				8643	}
				8644	}
				8645
				8646	if (unlikely(vcpu->arch.complete_userspace_io)) {
				8647	int (cui)(struct kvm_vcpu ) = vcpu->arch.complete_userspace_io;
				8648	vcpu->arch.complete_userspace_io = NULL;
				8649	r = cui(vcpu);
				8650	if (r <= 0)
				8651	goto out;
				8652	} else
				8653	WARN_ON(vcpu->arch.pio.count \|\| vcpu->mmio_needed);
				8654
				8655	if (kvm_run->immediate_exit)
				8656	r = -EINTR;
				8657	else
				8658	r = vcpu_run(vcpu);
				8659
				8660	out:
				8661	kvm_put_guest_fpu(vcpu);
				8662	if (vcpu->run->kvm_valid_regs)
				8663	store_regs(vcpu);
				8664	post_kvm_run_save(vcpu);
				8665	kvm_sigset_deactivate(vcpu);
				8666
				8667	vcpu_put(vcpu);
				8668	return r;
				8669	}
				8670
				8671	static void __get_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				8672	{
				8673	if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
				8674	/*
				8675	* We are here if userspace calls get_regs() in the middle of
				8676	* instruction emulation. Registers state needs to be copied
				8677	* back from emulation context to vcpu. Userspace shouldn't do
				8678	* that usually, but some bad designed PV devices (vmware
				8679	* backdoor interface) need this to work
				8680	*/
				8681	emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
				8682	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
				8683	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8684	regs->rax = kvm_rax_read(vcpu);
				8685	regs->rbx = kvm_rbx_read(vcpu);
				8686	regs->rcx = kvm_rcx_read(vcpu);
				8687	regs->rdx = kvm_rdx_read(vcpu);
				8688	regs->rsi = kvm_rsi_read(vcpu);
				8689	regs->rdi = kvm_rdi_read(vcpu);
				8690	regs->rsp = kvm_rsp_read(vcpu);
				8691	regs->rbp = kvm_rbp_read(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8692	#ifdef CONFIG_X86_64
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8693	regs->r8 = kvm_r8_read(vcpu);
				8694	regs->r9 = kvm_r9_read(vcpu);
				8695	regs->r10 = kvm_r10_read(vcpu);
				8696	regs->r11 = kvm_r11_read(vcpu);
				8697	regs->r12 = kvm_r12_read(vcpu);
				8698	regs->r13 = kvm_r13_read(vcpu);
				8699	regs->r14 = kvm_r14_read(vcpu);
				8700	regs->r15 = kvm_r15_read(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8701	#endif
				8702
				8703	regs->rip = kvm_rip_read(vcpu);
				8704	regs->rflags = kvm_get_rflags(vcpu);
				8705	}
				8706
				8707	int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				8708	{
				8709	vcpu_load(vcpu);
				8710	__get_regs(vcpu, regs);
				8711	vcpu_put(vcpu);
				8712	return 0;
				8713	}
				8714
				8715	static void __set_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				8716	{
				8717	vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
				8718	vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
				8719
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8720	kvm_rax_write(vcpu, regs->rax);
				8721	kvm_rbx_write(vcpu, regs->rbx);
				8722	kvm_rcx_write(vcpu, regs->rcx);
				8723	kvm_rdx_write(vcpu, regs->rdx);
				8724	kvm_rsi_write(vcpu, regs->rsi);
				8725	kvm_rdi_write(vcpu, regs->rdi);
				8726	kvm_rsp_write(vcpu, regs->rsp);
				8727	kvm_rbp_write(vcpu, regs->rbp);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8728	#ifdef CONFIG_X86_64
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8729	kvm_r8_write(vcpu, regs->r8);
				8730	kvm_r9_write(vcpu, regs->r9);
				8731	kvm_r10_write(vcpu, regs->r10);
				8732	kvm_r11_write(vcpu, regs->r11);
				8733	kvm_r12_write(vcpu, regs->r12);
				8734	kvm_r13_write(vcpu, regs->r13);
				8735	kvm_r14_write(vcpu, regs->r14);
				8736	kvm_r15_write(vcpu, regs->r15);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8737	#endif
				8738
				8739	kvm_rip_write(vcpu, regs->rip);
				8740	kvm_set_rflags(vcpu, regs->rflags \| X86_EFLAGS_FIXED);
				8741
				8742	vcpu->arch.exception.pending = false;
				8743
				8744	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8745	}
				8746
				8747	int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu vcpu, struct kvm_regs regs)
				8748	{
				8749	vcpu_load(vcpu);
				8750	__set_regs(vcpu, regs);
				8751	vcpu_put(vcpu);
				8752	return 0;
				8753	}
				8754
				8755	void kvm_get_cs_db_l_bits(struct kvm_vcpu vcpu, int db, int *l)
				8756	{
				8757	struct kvm_segment cs;
				8758
				8759	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
				8760	*db = cs.db;
				8761	*l = cs.l;
				8762	}
				8763	EXPORT_SYMBOL_GPL(kvm_get_cs_db_l_bits);
				8764
				8765	static void __get_sregs(struct kvm_vcpu vcpu, struct kvm_sregs sregs)
				8766	{
				8767	struct desc_ptr dt;
				8768
				8769	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
				8770	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
				8771	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
				8772	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
				8773	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
				8774	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
				8775
				8776	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
				8777	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
				8778
				8779	kvm_x86_ops->get_idt(vcpu, &dt);
				8780	sregs->idt.limit = dt.size;
				8781	sregs->idt.base = dt.address;
				8782	kvm_x86_ops->get_gdt(vcpu, &dt);
				8783	sregs->gdt.limit = dt.size;
				8784	sregs->gdt.base = dt.address;
				8785
				8786	sregs->cr0 = kvm_read_cr0(vcpu);
				8787	sregs->cr2 = vcpu->arch.cr2;
				8788	sregs->cr3 = kvm_read_cr3(vcpu);
				8789	sregs->cr4 = kvm_read_cr4(vcpu);
				8790	sregs->cr8 = kvm_get_cr8(vcpu);
				8791	sregs->efer = vcpu->arch.efer;
				8792	sregs->apic_base = kvm_get_apic_base(vcpu);
				8793
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8794	memset(sregs->interrupt_bitmap, 0, sizeof(sregs->interrupt_bitmap));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8795
				8796	if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft)
				8797	set_bit(vcpu->arch.interrupt.nr,
				8798	(unsigned long *)sregs->interrupt_bitmap);
				8799	}
				8800
				8801	int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
				8802	struct kvm_sregs *sregs)
				8803	{
				8804	vcpu_load(vcpu);
				8805	__get_sregs(vcpu, sregs);
				8806	vcpu_put(vcpu);
				8807	return 0;
				8808	}
				8809
				8810	int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
				8811	struct kvm_mp_state *mp_state)
				8812	{
				8813	vcpu_load(vcpu);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8814	if (kvm_mpx_supported())
				8815	kvm_load_guest_fpu(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8816
				8817	kvm_apic_accept_events(vcpu);
				8818	if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
				8819	vcpu->arch.pv.pv_unhalted)
				8820	mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
				8821	else
				8822	mp_state->mp_state = vcpu->arch.mp_state;
				8823
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	8824	if (kvm_mpx_supported())
				8825	kvm_put_guest_fpu(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8826	vcpu_put(vcpu);
				8827	return 0;
				8828	}
				8829
				8830	int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
				8831	struct kvm_mp_state *mp_state)
				8832	{
				8833	int ret = -EINVAL;
				8834
				8835	vcpu_load(vcpu);
				8836
				8837	if (!lapic_in_kernel(vcpu) &&
				8838	mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
				8839	goto out;
				8840
				8841	/* INITs are latched while in SMM */
				8842	if ((is_smm(vcpu) \|\| vcpu->arch.smi_pending) &&
				8843	(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED \|\|
				8844	mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
				8845	goto out;
				8846
				8847	if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) {
				8848	vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
				8849	set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
				8850	} else
				8851	vcpu->arch.mp_state = mp_state->mp_state;
				8852	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8853
				8854	ret = 0;
				8855	out:
				8856	vcpu_put(vcpu);
				8857	return ret;
				8858	}
				8859
				8860	int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
				8861	int reason, bool has_error_code, u32 error_code)
				8862	{
				8863	struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
				8864	int ret;
				8865
				8866	init_emulate_ctxt(vcpu);
				8867
				8868	ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason,
				8869	has_error_code, error_code);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8870	if (ret) {
				8871	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
				8872	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
				8873	vcpu->run->internal.ndata = 0;
				8874	return 0;
				8875	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8876
				8877	kvm_rip_write(vcpu, ctxt->eip);
				8878	kvm_set_rflags(vcpu, ctxt->eflags);
				8879	kvm_make_request(KVM_REQ_EVENT, vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8880	return 1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8881	}
				8882	EXPORT_SYMBOL_GPL(kvm_task_switch);
				8883
				8884	static int kvm_valid_sregs(struct kvm_vcpu vcpu, struct kvm_sregs sregs)
				8885	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8886	if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) {
				8887	/*
				8888	* When EFER.LME and CR0.PG are set, the processor is in
				8889	* 64-bit mode (though maybe in a 32-bit code segment).
				8890	* CR4.PAE and EFER.LMA must be set.
				8891	*/
				8892	if (!(sregs->cr4 & X86_CR4_PAE)
				8893	\|\| !(sregs->efer & EFER_LMA))
				8894	return -EINVAL;
				8895	} else {
				8896	/*
				8897	* Not in 64-bit mode: EFER.LMA is clear and the code
				8898	* segment cannot be 64-bit.
				8899	*/
				8900	if (sregs->efer & EFER_LMA \|\| sregs->cs.l)
				8901	return -EINVAL;
				8902	}
				8903
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8904	return kvm_valid_cr4(vcpu, sregs->cr4);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8905	}
				8906
				8907	static int __set_sregs(struct kvm_vcpu vcpu, struct kvm_sregs sregs)
				8908	{
				8909	struct msr_data apic_base_msr;
				8910	int mmu_reset_needed = 0;
				8911	int cpuid_update_needed = 0;
				8912	int pending_vec, max_bits, idx;
				8913	struct desc_ptr dt;
				8914	int ret = -EINVAL;
				8915
				8916	if (kvm_valid_sregs(vcpu, sregs))
				8917	goto out;
				8918
				8919	apic_base_msr.data = sregs->apic_base;
				8920	apic_base_msr.host_initiated = true;
				8921	if (kvm_set_apic_base(vcpu, &apic_base_msr))
				8922	goto out;
				8923
				8924	dt.size = sregs->idt.limit;
				8925	dt.address = sregs->idt.base;
				8926	kvm_x86_ops->set_idt(vcpu, &dt);
				8927	dt.size = sregs->gdt.limit;
				8928	dt.address = sregs->gdt.base;
				8929	kvm_x86_ops->set_gdt(vcpu, &dt);
				8930
				8931	vcpu->arch.cr2 = sregs->cr2;
				8932	mmu_reset_needed \|= kvm_read_cr3(vcpu) != sregs->cr3;
				8933	vcpu->arch.cr3 = sregs->cr3;
				8934	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
				8935
				8936	kvm_set_cr8(vcpu, sregs->cr8);
				8937
				8938	mmu_reset_needed \|= vcpu->arch.efer != sregs->efer;
				8939	kvm_x86_ops->set_efer(vcpu, sregs->efer);
				8940
				8941	mmu_reset_needed \|= kvm_read_cr0(vcpu) != sregs->cr0;
				8942	kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
				8943	vcpu->arch.cr0 = sregs->cr0;
				8944
				8945	mmu_reset_needed \|= kvm_read_cr4(vcpu) != sregs->cr4;
				8946	cpuid_update_needed \|= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
				8947	(X86_CR4_OSXSAVE \| X86_CR4_PKE));
				8948	kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
				8949	if (cpuid_update_needed)
				8950	kvm_update_cpuid(vcpu);
				8951
				8952	idx = srcu_read_lock(&vcpu->kvm->srcu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	8953	if (is_pae_paging(vcpu)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	8954	load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
				8955	mmu_reset_needed = 1;
				8956	}
				8957	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				8958
				8959	if (mmu_reset_needed)
				8960	kvm_mmu_reset_context(vcpu);
				8961
				8962	max_bits = KVM_NR_INTERRUPTS;
				8963	pending_vec = find_first_bit(
				8964	(const unsigned long *)sregs->interrupt_bitmap, max_bits);
				8965	if (pending_vec < max_bits) {
				8966	kvm_queue_interrupt(vcpu, pending_vec, false);
				8967	pr_debug("Set back pending irq %d\n", pending_vec);
				8968	}
				8969
				8970	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
				8971	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
				8972	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
				8973	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
				8974	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
				8975	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);
				8976
				8977	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
				8978	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
				8979
				8980	update_cr8_intercept(vcpu);
				8981
				8982	/* Older userspace won't unhalt the vcpu on reset. */
				8983	if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
				8984	sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
				8985	!is_protmode(vcpu))
				8986	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				8987
				8988	kvm_make_request(KVM_REQ_EVENT, vcpu);
				8989
				8990	ret = 0;
				8991	out:
				8992	return ret;
				8993	}
				8994
				8995	int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
				8996	struct kvm_sregs *sregs)
				8997	{
				8998	int ret;
				8999
				9000	vcpu_load(vcpu);
				9001	ret = __set_sregs(vcpu, sregs);
				9002	vcpu_put(vcpu);
				9003	return ret;
				9004	}
				9005
				9006	int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
				9007	struct kvm_guest_debug *dbg)
				9008	{
				9009	unsigned long rflags;
				9010	int i, r;
				9011
				9012	vcpu_load(vcpu);
				9013
				9014	if (dbg->control & (KVM_GUESTDBG_INJECT_DB \| KVM_GUESTDBG_INJECT_BP)) {
				9015	r = -EBUSY;
				9016	if (vcpu->arch.exception.pending)
				9017	goto out;
				9018	if (dbg->control & KVM_GUESTDBG_INJECT_DB)
				9019	kvm_queue_exception(vcpu, DB_VECTOR);
				9020	else
				9021	kvm_queue_exception(vcpu, BP_VECTOR);
				9022	}
				9023
				9024	/*
				9025	* Read rflags as long as potentially injected trace flags are still
				9026	* filtered out.
				9027	*/
				9028	rflags = kvm_get_rflags(vcpu);
				9029
				9030	vcpu->guest_debug = dbg->control;
				9031	if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
				9032	vcpu->guest_debug = 0;
				9033
				9034	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
				9035	for (i = 0; i < KVM_NR_DB_REGS; ++i)
				9036	vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
				9037	vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7];
				9038	} else {
				9039	for (i = 0; i < KVM_NR_DB_REGS; i++)
				9040	vcpu->arch.eff_db[i] = vcpu->arch.db[i];
				9041	}
				9042	kvm_update_dr7(vcpu);
				9043
				9044	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
				9045	vcpu->arch.singlestep_rip = kvm_rip_read(vcpu) +
				9046	get_segment_base(vcpu, VCPU_SREG_CS);
				9047
				9048	/*
				9049	* Trigger an rflags update that will inject or remove the trace
				9050	* flags.
				9051	*/
				9052	kvm_set_rflags(vcpu, rflags);
				9053
				9054	kvm_x86_ops->update_bp_intercept(vcpu);
				9055
				9056	r = 0;
				9057
				9058	out:
				9059	vcpu_put(vcpu);
				9060	return r;
				9061	}
				9062
				9063	/*
				9064	* Translate a guest virtual address to a guest physical address.
				9065	*/
				9066	int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
				9067	struct kvm_translation *tr)
				9068	{
				9069	unsigned long vaddr = tr->linear_address;
				9070	gpa_t gpa;
				9071	int idx;
				9072
				9073	vcpu_load(vcpu);
				9074
				9075	idx = srcu_read_lock(&vcpu->kvm->srcu);
				9076	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
				9077	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				9078	tr->physical_address = gpa;
				9079	tr->valid = gpa != UNMAPPED_GVA;
				9080	tr->writeable = 1;
				9081	tr->usermode = 0;
				9082
				9083	vcpu_put(vcpu);
				9084	return 0;
				9085	}
				9086
				9087	int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu vcpu, struct kvm_fpu fpu)
				9088	{
				9089	struct fxregs_state *fxsave;
				9090
				9091	vcpu_load(vcpu);
				9092
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9093	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9094	memcpy(fpu->fpr, fxsave->st_space, 128);
				9095	fpu->fcw = fxsave->cwd;
				9096	fpu->fsw = fxsave->swd;
				9097	fpu->ftwx = fxsave->twd;
				9098	fpu->last_opcode = fxsave->fop;
				9099	fpu->last_ip = fxsave->rip;
				9100	fpu->last_dp = fxsave->rdp;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9101	memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9102
				9103	vcpu_put(vcpu);
				9104	return 0;
				9105	}
				9106
				9107	int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu vcpu, struct kvm_fpu fpu)
				9108	{
				9109	struct fxregs_state *fxsave;
				9110
				9111	vcpu_load(vcpu);
				9112
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9113	fxsave = &vcpu->arch.guest_fpu->state.fxsave;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9114
				9115	memcpy(fxsave->st_space, fpu->fpr, 128);
				9116	fxsave->cwd = fpu->fcw;
				9117	fxsave->swd = fpu->fsw;
				9118	fxsave->twd = fpu->ftwx;
				9119	fxsave->fop = fpu->last_opcode;
				9120	fxsave->rip = fpu->last_ip;
				9121	fxsave->rdp = fpu->last_dp;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9122	memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9123
				9124	vcpu_put(vcpu);
				9125	return 0;
				9126	}
				9127
				9128	static void store_regs(struct kvm_vcpu *vcpu)
				9129	{
				9130	BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES);
				9131
				9132	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS)
				9133	__get_regs(vcpu, &vcpu->run->s.regs.regs);
				9134
				9135	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS)
				9136	__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
				9137
				9138	if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS)
				9139	kvm_vcpu_ioctl_x86_get_vcpu_events(
				9140	vcpu, &vcpu->run->s.regs.events);
				9141	}
				9142
				9143	static int sync_regs(struct kvm_vcpu *vcpu)
				9144	{
				9145	if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)
				9146	return -EINVAL;
				9147
				9148	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) {
				9149	__set_regs(vcpu, &vcpu->run->s.regs.regs);
				9150	vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS;
				9151	}
				9152	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) {
				9153	if (__set_sregs(vcpu, &vcpu->run->s.regs.sregs))
				9154	return -EINVAL;
				9155	vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS;
				9156	}
				9157	if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) {
				9158	if (kvm_vcpu_ioctl_x86_set_vcpu_events(
				9159	vcpu, &vcpu->run->s.regs.events))
				9160	return -EINVAL;
				9161	vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS;
				9162	}
				9163
				9164	return 0;
				9165	}
				9166
				9167	static void fx_init(struct kvm_vcpu *vcpu)
				9168	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9169	fpstate_init(&vcpu->arch.guest_fpu->state);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9170	if (boot_cpu_has(X86_FEATURE_XSAVES))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9171	vcpu->arch.guest_fpu->state.xsave.header.xcomp_bv =
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9172	host_xcr0 \| XSTATE_COMPACTION_ENABLED;
				9173
				9174	/*
				9175	* Ensure guest xcr0 is valid for loading
				9176	*/
				9177	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
				9178
				9179	vcpu->arch.cr0 \|= X86_CR0_ET;
				9180	}
				9181
				9182	void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
				9183	{
				9184	void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9185	struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
				9186
				9187	kvm_release_pfn(cache->pfn, cache->dirty, cache);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9188
				9189	kvmclock_reset(vcpu);
				9190
				9191	kvm_x86_ops->vcpu_free(vcpu);
				9192	free_cpumask_var(wbinvd_dirty_mask);
				9193	}
				9194
				9195	struct kvm_vcpu kvm_arch_vcpu_create(struct kvm kvm,
				9196	unsigned int id)
				9197	{
				9198	struct kvm_vcpu *vcpu;
				9199
				9200	if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
				9201	printk_once(KERN_WARNING
				9202	"kvm: SMP vm created on host with unstable TSC; "
				9203	"guest TSC will not be reliable\n");
				9204
				9205	vcpu = kvm_x86_ops->vcpu_create(kvm, id);
				9206
				9207	return vcpu;
				9208	}
				9209
				9210	int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
				9211	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9212	vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
				9213	vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9214	kvm_vcpu_mtrr_init(vcpu);
				9215	vcpu_load(vcpu);
				9216	kvm_vcpu_reset(vcpu, false);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9217	kvm_init_mmu(vcpu, false);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9218	vcpu_put(vcpu);
				9219	return 0;
				9220	}
				9221
				9222	void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
				9223	{
				9224	struct msr_data msr;
				9225	struct kvm *kvm = vcpu->kvm;
				9226
				9227	kvm_hv_vcpu_postcreate(vcpu);
				9228
				9229	if (mutex_lock_killable(&vcpu->mutex))
				9230	return;
				9231	vcpu_load(vcpu);
				9232	msr.data = 0x0;
				9233	msr.index = MSR_IA32_TSC;
				9234	msr.host_initiated = true;
				9235	kvm_write_tsc(vcpu, &msr);
				9236	vcpu_put(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9237
				9238	/* poll control enabled by default */
				9239	vcpu->arch.msr_kvm_poll_control = 1;
				9240
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9241	mutex_unlock(&vcpu->mutex);
				9242
				9243	if (!kvmclock_periodic_sync)
				9244	return;
				9245
				9246	schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
				9247	KVMCLOCK_SYNC_PERIOD);
				9248	}
				9249
				9250	void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
				9251	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9252	kvm_arch_vcpu_free(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9253	}
				9254
				9255	void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
				9256	{
				9257	kvm_lapic_reset(vcpu, init_event);
				9258
				9259	vcpu->arch.hflags = 0;
				9260
				9261	vcpu->arch.smi_pending = 0;
				9262	vcpu->arch.smi_count = 0;
				9263	atomic_set(&vcpu->arch.nmi_queued, 0);
				9264	vcpu->arch.nmi_pending = 0;
				9265	vcpu->arch.nmi_injected = false;
				9266	kvm_clear_interrupt_queue(vcpu);
				9267	kvm_clear_exception_queue(vcpu);
				9268	vcpu->arch.exception.pending = false;
				9269
				9270	memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
				9271	kvm_update_dr0123(vcpu);
				9272	vcpu->arch.dr6 = DR6_INIT;
				9273	kvm_update_dr6(vcpu);
				9274	vcpu->arch.dr7 = DR7_FIXED_1;
				9275	kvm_update_dr7(vcpu);
				9276
				9277	vcpu->arch.cr2 = 0;
				9278
				9279	kvm_make_request(KVM_REQ_EVENT, vcpu);
				9280	vcpu->arch.apf.msr_val = 0;
				9281	vcpu->arch.st.msr_val = 0;
				9282
				9283	kvmclock_reset(vcpu);
				9284
				9285	kvm_clear_async_pf_completion_queue(vcpu);
				9286	kvm_async_pf_hash_reset(vcpu);
				9287	vcpu->arch.apf.halted = false;
				9288
				9289	if (kvm_mpx_supported()) {
				9290	void *mpx_state_buffer;
				9291
				9292	/*
				9293	* To avoid have the INIT path from kvm_apic_has_events() that be
				9294	* called with loaded FPU and does not let userspace fix the state.
				9295	*/
				9296	if (init_event)
				9297	kvm_put_guest_fpu(vcpu);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9298	mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
				9299	XFEATURE_BNDREGS);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9300	if (mpx_state_buffer)
				9301	memset(mpx_state_buffer, 0, sizeof(struct mpx_bndreg_state));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9302	mpx_state_buffer = get_xsave_addr(&vcpu->arch.guest_fpu->state.xsave,
				9303	XFEATURE_BNDCSR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9304	if (mpx_state_buffer)
				9305	memset(mpx_state_buffer, 0, sizeof(struct mpx_bndcsr));
				9306	if (init_event)
				9307	kvm_load_guest_fpu(vcpu);
				9308	}
				9309
				9310	if (!init_event) {
				9311	kvm_pmu_reset(vcpu);
				9312	vcpu->arch.smbase = 0x30000;
				9313
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9314	vcpu->arch.msr_misc_features_enables = 0;
				9315
				9316	vcpu->arch.xcr0 = XFEATURE_MASK_FP;
				9317	}
				9318
				9319	memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs));
				9320	vcpu->arch.regs_avail = ~0;
				9321	vcpu->arch.regs_dirty = ~0;
				9322
				9323	vcpu->arch.ia32_xss = 0;
				9324
				9325	kvm_x86_ops->vcpu_reset(vcpu, init_event);
				9326	}
				9327
				9328	void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
				9329	{
				9330	struct kvm_segment cs;
				9331
				9332	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
				9333	cs.selector = vector << 8;
				9334	cs.base = vector << 12;
				9335	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
				9336	kvm_rip_write(vcpu, 0);
				9337	}
				9338
				9339	int kvm_arch_hardware_enable(void)
				9340	{
				9341	struct kvm *kvm;
				9342	struct kvm_vcpu *vcpu;
				9343	int i;
				9344	int ret;
				9345	u64 local_tsc;
				9346	u64 max_tsc = 0;
				9347	bool stable, backwards_tsc = false;
				9348
				9349	kvm_shared_msr_cpu_online();
				9350	ret = kvm_x86_ops->hardware_enable();
				9351	if (ret != 0)
				9352	return ret;
				9353
				9354	local_tsc = rdtsc();
				9355	stable = !kvm_check_tsc_unstable();
				9356	list_for_each_entry(kvm, &vm_list, vm_list) {
				9357	kvm_for_each_vcpu(i, vcpu, kvm) {
				9358	if (!stable && vcpu->cpu == smp_processor_id())
				9359	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
				9360	if (stable && vcpu->arch.last_host_tsc > local_tsc) {
				9361	backwards_tsc = true;
				9362	if (vcpu->arch.last_host_tsc > max_tsc)
				9363	max_tsc = vcpu->arch.last_host_tsc;
				9364	}
				9365	}
				9366	}
				9367
				9368	/*
				9369	* Sometimes, even reliable TSCs go backwards. This happens on
				9370	* platforms that reset TSC during suspend or hibernate actions, but
				9371	* maintain synchronization. We must compensate. Fortunately, we can
				9372	* detect that condition here, which happens early in CPU bringup,
				9373	* before any KVM threads can be running. Unfortunately, we can't
				9374	* bring the TSCs fully up to date with real time, as we aren't yet far
				9375	* enough into CPU bringup that we know how much real time has actually
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9376	* elapsed; our helper function, ktime_get_boottime_ns() will be using boot
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9377	* variables that haven't been updated yet.
				9378	*
				9379	* So we simply find the maximum observed TSC above, then record the
				9380	* adjustment to TSC in each VCPU. When the VCPU later gets loaded,
				9381	* the adjustment will be applied. Note that we accumulate
				9382	* adjustments, in case multiple suspend cycles happen before some VCPU
				9383	* gets a chance to run again. In the event that no KVM threads get a
				9384	* chance to run, we will miss the entire elapsed period, as we'll have
				9385	* reset last_host_tsc, so VCPUs will not have the TSC adjusted and may
				9386	* loose cycle time. This isn't too big a deal, since the loss will be
				9387	* uniform across all VCPUs (not to mention the scenario is extremely
				9388	* unlikely). It is possible that a second hibernate recovery happens
				9389	* much faster than a first, causing the observed TSC here to be
				9390	* smaller; this would require additional padding adjustment, which is
				9391	* why we set last_host_tsc to the local tsc observed here.
				9392	*
				9393	* N.B. - this code below runs only on platforms with reliable TSC,
				9394	* as that is the only way backwards_tsc is set above. Also note
				9395	* that this runs for ALL vcpus, which is not a bug; all VCPUs should
				9396	* have the same delta_cyc adjustment applied if backwards_tsc
				9397	* is detected. Note further, this adjustment is only done once,
				9398	* as we reset last_host_tsc on all VCPUs to stop this from being
				9399	* called multiple times (one for each physical CPU bringup).
				9400	*
				9401	* Platforms with unreliable TSCs don't have to deal with this, they
				9402	* will be compensated by the logic in vcpu_load, which sets the TSC to
				9403	* catchup mode. This will catchup all VCPUs to real time, but cannot
				9404	* guarantee that they stay in perfect synchronization.
				9405	*/
				9406	if (backwards_tsc) {
				9407	u64 delta_cyc = max_tsc - local_tsc;
				9408	list_for_each_entry(kvm, &vm_list, vm_list) {
				9409	kvm->arch.backwards_tsc_observed = true;
				9410	kvm_for_each_vcpu(i, vcpu, kvm) {
				9411	vcpu->arch.tsc_offset_adjustment += delta_cyc;
				9412	vcpu->arch.last_host_tsc = local_tsc;
				9413	kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
				9414	}
				9415
				9416	/*
				9417	* We have to disable TSC offset matching.. if you were
				9418	* booting a VM while issuing an S4 host suspend....
				9419	* you may have some problem. Solving this issue is
				9420	* left as an exercise to the reader.
				9421	*/
				9422	kvm->arch.last_tsc_nsec = 0;
				9423	kvm->arch.last_tsc_write = 0;
				9424	}
				9425
				9426	}
				9427	return 0;
				9428	}
				9429
				9430	void kvm_arch_hardware_disable(void)
				9431	{
				9432	kvm_x86_ops->hardware_disable();
				9433	drop_user_return_notifiers();
				9434	}
				9435
				9436	int kvm_arch_hardware_setup(void)
				9437	{
				9438	int r;
				9439
				9440	r = kvm_x86_ops->hardware_setup();
				9441	if (r != 0)
				9442	return r;
				9443
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9444	cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
				9445
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9446	if (kvm_has_tsc_control) {
				9447	/*
				9448	* Make sure the user can only configure tsc_khz values that
				9449	* fit into a signed integer.
				9450	* A min value is not calculated because it will always
				9451	* be 1 on all machines.
				9452	*/
				9453	u64 max = min(0x7fffffffULL,
				9454	__scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz));
				9455	kvm_max_guest_tsc_khz = max;
				9456
				9457	kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits;
				9458	}
				9459
				9460	kvm_init_msr_list();
				9461	return 0;
				9462	}
				9463
				9464	void kvm_arch_hardware_unsetup(void)
				9465	{
				9466	kvm_x86_ops->hardware_unsetup();
				9467	}
				9468
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9469	int kvm_arch_check_processor_compat(void)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9470	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9471	return kvm_x86_ops->check_processor_compatibility();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9472	}
				9473
				9474	bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
				9475	{
				9476	return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id;
				9477	}
				9478	EXPORT_SYMBOL_GPL(kvm_vcpu_is_reset_bsp);
				9479
				9480	bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
				9481	{
				9482	return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
				9483	}
				9484
				9485	struct static_key kvm_no_apic_vcpu __read_mostly;
				9486	EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
				9487
				9488	int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
				9489	{
				9490	struct page *page;
				9491	int r;
				9492
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9493	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
				9494	if (!irqchip_in_kernel(vcpu->kvm) \|\| kvm_vcpu_is_reset_bsp(vcpu))
				9495	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				9496	else
				9497	vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
				9498
				9499	page = alloc_page(GFP_KERNEL \| __GFP_ZERO);
				9500	if (!page) {
				9501	r = -ENOMEM;
				9502	goto fail;
				9503	}
				9504	vcpu->arch.pio_data = page_address(page);
				9505
				9506	kvm_set_tsc_khz(vcpu, max_tsc_khz);
				9507
				9508	r = kvm_mmu_create(vcpu);
				9509	if (r < 0)
				9510	goto fail_free_pio_data;
				9511
				9512	if (irqchip_in_kernel(vcpu->kvm)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9513	vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
				9514	r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9515	if (r < 0)
				9516	goto fail_mmu_destroy;
				9517	} else
				9518	static_key_slow_inc(&kvm_no_apic_vcpu);
				9519
				9520	vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9521	GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9522	if (!vcpu->arch.mce_banks) {
				9523	r = -ENOMEM;
				9524	goto fail_free_lapic;
				9525	}
				9526	vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
				9527
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9528	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
				9529	GFP_KERNEL_ACCOUNT)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9530	r = -ENOMEM;
				9531	goto fail_free_mce_banks;
				9532	}
				9533
				9534	fx_init(vcpu);
				9535
				9536	vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
				9537
				9538	vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
				9539
				9540	vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
				9541
				9542	kvm_async_pf_hash_reset(vcpu);
				9543	kvm_pmu_init(vcpu);
				9544
				9545	vcpu->arch.pending_external_vector = -1;
				9546	vcpu->arch.preempted_in_kernel = false;
				9547
				9548	kvm_hv_vcpu_init(vcpu);
				9549
				9550	return 0;
				9551
				9552	fail_free_mce_banks:
				9553	kfree(vcpu->arch.mce_banks);
				9554	fail_free_lapic:
				9555	kvm_free_lapic(vcpu);
				9556	fail_mmu_destroy:
				9557	kvm_mmu_destroy(vcpu);
				9558	fail_free_pio_data:
				9559	free_page((unsigned long)vcpu->arch.pio_data);
				9560	fail:
				9561	return r;
				9562	}
				9563
				9564	void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
				9565	{
				9566	int idx;
				9567
				9568	kvm_hv_vcpu_uninit(vcpu);
				9569	kvm_pmu_destroy(vcpu);
				9570	kfree(vcpu->arch.mce_banks);
				9571	kvm_free_lapic(vcpu);
				9572	idx = srcu_read_lock(&vcpu->kvm->srcu);
				9573	kvm_mmu_destroy(vcpu);
				9574	srcu_read_unlock(&vcpu->kvm->srcu, idx);
				9575	free_page((unsigned long)vcpu->arch.pio_data);
				9576	if (!lapic_in_kernel(vcpu))
				9577	static_key_slow_dec(&kvm_no_apic_vcpu);
				9578	}
				9579
				9580	void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
				9581	{
				9582	vcpu->arch.l1tf_flush_l1d = true;
				9583	kvm_x86_ops->sched_in(vcpu, cpu);
				9584	}
				9585
				9586	int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
				9587	{
				9588	if (type)
				9589	return -EINVAL;
				9590
				9591	INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
				9592	INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
				9593	INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9594	INIT_LIST_HEAD(&kvm->arch.lpage_disallowed_mmu_pages);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9595	INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
				9596	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
				9597
				9598	/* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
				9599	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
				9600	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
				9601	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
				9602	&kvm->arch.irq_sources_bitmap);
				9603
				9604	raw_spin_lock_init(&kvm->arch.tsc_write_lock);
				9605	mutex_init(&kvm->arch.apic_map_lock);
				9606	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
				9607
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9608	kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9609	pvclock_update_vm_gtod_copy(kvm);
				9610
				9611	kvm->arch.guest_can_read_msr_platform_info = true;
				9612
				9613	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
				9614	INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
				9615
				9616	kvm_hv_init_vm(kvm);
				9617	kvm_page_track_init(kvm);
				9618	kvm_mmu_init_vm(kvm);
				9619
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9620	return kvm_x86_ops->vm_init(kvm);
				9621	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9622
/* Post-init hook for @kvm; returns the result of kvm_mmu_post_init_vm(). */
int kvm_arch_post_init_vm(struct kvm *kvm)
{
	return kvm_mmu_post_init_vm(kvm);
}
				9627
				9628	static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
				9629	{
				9630	vcpu_load(vcpu);
				9631	kvm_mmu_unload(vcpu);
				9632	vcpu_put(vcpu);
				9633	}
				9634
				9635	static void kvm_free_vcpus(struct kvm *kvm)
				9636	{
				9637	unsigned int i;
				9638	struct kvm_vcpu *vcpu;
				9639
				9640	/*
				9641	* Unpin any mmu pages first.
				9642	*/
				9643	kvm_for_each_vcpu(i, vcpu, kvm) {
				9644	kvm_clear_async_pf_completion_queue(vcpu);
				9645	kvm_unload_vcpu_mmu(vcpu);
				9646	}
				9647	kvm_for_each_vcpu(i, vcpu, kvm)
				9648	kvm_arch_vcpu_free(vcpu);
				9649
				9650	mutex_lock(&kvm->lock);
				9651	for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
				9652	kvm->vcpus[i] = NULL;
				9653
				9654	atomic_set(&kvm->online_vcpus, 0);
				9655	mutex_unlock(&kvm->lock);
				9656	}
				9657
				9658	void kvm_arch_sync_events(struct kvm *kvm)
				9659	{
				9660	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
				9661	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
				9662	kvm_free_pit(kvm);
				9663	}
				9664
				9665	int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
				9666	{
				9667	int i, r;
				9668	unsigned long hva;
				9669	struct kvm_memslots *slots = kvm_memslots(kvm);
				9670	struct kvm_memory_slot *slot, old;
				9671
				9672	/* Called with kvm->slots_lock held. */
				9673	if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
				9674	return -EINVAL;
				9675
				9676	slot = id_to_memslot(slots, id);
				9677	if (size) {
				9678	if (slot->npages)
				9679	return -EEXIST;
				9680
				9681	/*
				9682	* MAP_SHARED to prevent internal slot pages from being moved
				9683	* by fork()/COW.
				9684	*/
				9685	hva = vm_mmap(NULL, 0, size, PROT_READ \| PROT_WRITE,
				9686	MAP_SHARED \| MAP_ANONYMOUS, 0);
				9687	if (IS_ERR((void *)hva))
				9688	return PTR_ERR((void *)hva);
				9689	} else {
				9690	if (!slot->npages)
				9691	return 0;
				9692
				9693	hva = 0;
				9694	}
				9695
				9696	old = *slot;
				9697	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				9698	struct kvm_userspace_memory_region m;
				9699
				9700	m.slot = id \| (i << 16);
				9701	m.flags = 0;
				9702	m.guest_phys_addr = gpa;
				9703	m.userspace_addr = hva;
				9704	m.memory_size = size;
				9705	r = __kvm_set_memory_region(kvm, &m);
				9706	if (r < 0)
				9707	return r;
				9708	}
				9709
				9710	if (!size)
				9711	vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
				9712
				9713	return 0;
				9714	}
				9715	EXPORT_SYMBOL_GPL(__x86_set_memory_region);
				9716
				9717	int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
				9718	{
				9719	int r;
				9720
				9721	mutex_lock(&kvm->slots_lock);
				9722	r = __x86_set_memory_region(kvm, id, gpa, size);
				9723	mutex_unlock(&kvm->slots_lock);
				9724
				9725	return r;
				9726	}
				9727	EXPORT_SYMBOL_GPL(x86_set_memory_region);
				9728
/* Pre-destroy hook for @kvm; forwards to kvm_mmu_pre_destroy_vm(). */
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
	kvm_mmu_pre_destroy_vm(kvm);
}
				9733
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9734	void kvm_arch_destroy_vm(struct kvm *kvm)
				9735	{
				9736	if (current->mm == kvm->mm) {
				9737	/*
				9738	* Free memory regions allocated on behalf of userspace,
				9739	* unless the the memory map has changed due to process exit
				9740	* or fd copying.
				9741	*/
				9742	x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
				9743	x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
				9744	x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
				9745	}
				9746	if (kvm_x86_ops->vm_destroy)
				9747	kvm_x86_ops->vm_destroy(kvm);
				9748	kvm_pic_destroy(kvm);
				9749	kvm_ioapic_destroy(kvm);
				9750	kvm_free_vcpus(kvm);
				9751	kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9752	kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9753	kvm_mmu_uninit_vm(kvm);
				9754	kvm_page_track_cleanup(kvm);
				9755	kvm_hv_destroy_vm(kvm);
				9756	}
				9757
				9758	void kvm_arch_free_memslot(struct kvm kvm, struct kvm_memory_slot free,
				9759	struct kvm_memory_slot *dont)
				9760	{
				9761	int i;
				9762
				9763	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				9764	if (!dont \|\| free->arch.rmap[i] != dont->arch.rmap[i]) {
				9765	kvfree(free->arch.rmap[i]);
				9766	free->arch.rmap[i] = NULL;
				9767	}
				9768	if (i == 0)
				9769	continue;
				9770
				9771	if (!dont \|\| free->arch.lpage_info[i - 1] !=
				9772	dont->arch.lpage_info[i - 1]) {
				9773	kvfree(free->arch.lpage_info[i - 1]);
				9774	free->arch.lpage_info[i - 1] = NULL;
				9775	}
				9776	}
				9777
				9778	kvm_page_track_free_memslot(free, dont);
				9779	}
				9780
				9781	int kvm_arch_create_memslot(struct kvm kvm, struct kvm_memory_slot slot,
				9782	unsigned long npages)
				9783	{
				9784	int i;
				9785
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9786	/*
				9787	* Clear out the previous array pointers for the KVM_MR_MOVE case. The
				9788	* old arrays will be freed by __kvm_set_memory_region() if installing
				9789	* the new memslot is successful.
				9790	*/
				9791	memset(&slot->arch, 0, sizeof(slot->arch));
				9792
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9793	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				9794	struct kvm_lpage_info *linfo;
				9795	unsigned long ugfn;
				9796	int lpages;
				9797	int level = i + 1;
				9798
				9799	lpages = gfn_to_index(slot->base_gfn + npages - 1,
				9800	slot->base_gfn, level) + 1;
				9801
				9802	slot->arch.rmap[i] =
				9803	kvcalloc(lpages, sizeof(*slot->arch.rmap[i]),
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9804	GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9805	if (!slot->arch.rmap[i])
				9806	goto out_free;
				9807	if (i == 0)
				9808	continue;
				9809
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9810	linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9811	if (!linfo)
				9812	goto out_free;
				9813
				9814	slot->arch.lpage_info[i - 1] = linfo;
				9815
				9816	if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1))
				9817	linfo[0].disallow_lpage = 1;
				9818	if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1))
				9819	linfo[lpages - 1].disallow_lpage = 1;
				9820	ugfn = slot->userspace_addr >> PAGE_SHIFT;
				9821	/*
				9822	* If the gfn and userspace address are not aligned wrt each
				9823	* other, or if explicitly asked to, disable large page
				9824	* support for this slot
				9825	*/
				9826	if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) \|\|
				9827	!kvm_largepages_enabled()) {
				9828	unsigned long j;
				9829
				9830	for (j = 0; j < lpages; ++j)
				9831	linfo[j].disallow_lpage = 1;
				9832	}
				9833	}
				9834
				9835	if (kvm_page_track_create_memslot(slot, npages))
				9836	goto out_free;
				9837
				9838	return 0;
				9839
				9840	out_free:
				9841	for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
				9842	kvfree(slot->arch.rmap[i]);
				9843	slot->arch.rmap[i] = NULL;
				9844	if (i == 0)
				9845	continue;
				9846
				9847	kvfree(slot->arch.lpage_info[i - 1]);
				9848	slot->arch.lpage_info[i - 1] = NULL;
				9849	}
				9850	return -ENOMEM;
				9851	}
				9852
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9853	void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9854	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9855	struct kvm_vcpu *vcpu;
				9856	int i;
				9857
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9858	/*
				9859	* memslots->generation has been incremented.
				9860	* mmio generation may have reached its maximum value.
				9861	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9862	kvm_mmu_invalidate_mmio_sptes(kvm, gen);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9863
				9864	/* Force re-initialization of steal_time cache */
				9865	kvm_for_each_vcpu(i, vcpu, kvm)
				9866	kvm_vcpu_kick(vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9867	}
				9868
				9869	int kvm_arch_prepare_memory_region(struct kvm *kvm,
				9870	struct kvm_memory_slot *memslot,
				9871	const struct kvm_userspace_memory_region *mem,
				9872	enum kvm_mr_change change)
				9873	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	9874	if (change == KVM_MR_MOVE)
				9875	return kvm_arch_create_memslot(kvm, memslot,
				9876	mem->memory_size >> PAGE_SHIFT);
				9877
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9878	return 0;
				9879	}
				9880
				9881	static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
				9882	struct kvm_memory_slot *new)
				9883	{
				9884	/* Still write protect RO slot */
				9885	if (new->flags & KVM_MEM_READONLY) {
				9886	kvm_mmu_slot_remove_write_access(kvm, new);
				9887	return;
				9888	}
				9889
				9890	/*
				9891	* Call kvm_x86_ops dirty logging hooks when they are valid.
				9892	*
				9893	* kvm_x86_ops->slot_disable_log_dirty is called when:
				9894	*
				9895	* - KVM_MR_CREATE with dirty logging is disabled
				9896	* - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
				9897	*
				9898	* The reason is, in case of PML, we need to set D-bit for any slots
				9899	* with dirty logging disabled in order to eliminate unnecessary GPA
				9900	* logging in PML buffer (and potential PML buffer full VMEXT). This
				9901	* guarantees leaving PML enabled during guest's lifetime won't have
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9902	* any additional overhead from PML when guest is running with dirty
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9903	* logging disabled for memory slots.
				9904	*
				9905	* kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
				9906	* to dirty logging mode.
				9907	*
				9908	* If kvm_x86_ops dirty logging hooks are invalid, use write protect.
				9909	*
				9910	* In case of write protect:
				9911	*
				9912	* Write protect all pages for dirty logging.
				9913	*
				9914	* All the sptes including the large sptes which point to this
				9915	* slot are set to readonly. We can not create any new large
				9916	* spte on this slot until the end of the logging.
				9917	*
				9918	* See the comments in fast_page_fault().
				9919	*/
				9920	if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
				9921	if (kvm_x86_ops->slot_enable_log_dirty)
				9922	kvm_x86_ops->slot_enable_log_dirty(kvm, new);
				9923	else
				9924	kvm_mmu_slot_remove_write_access(kvm, new);
				9925	} else {
				9926	if (kvm_x86_ops->slot_disable_log_dirty)
				9927	kvm_x86_ops->slot_disable_log_dirty(kvm, new);
				9928	}
				9929	}
				9930
				9931	void kvm_arch_commit_memory_region(struct kvm *kvm,
				9932	const struct kvm_userspace_memory_region *mem,
				9933	const struct kvm_memory_slot *old,
				9934	const struct kvm_memory_slot *new,
				9935	enum kvm_mr_change change)
				9936	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9937	if (!kvm->arch.n_requested_mmu_pages)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9938	kvm_mmu_change_mmu_pages(kvm,
				9939	kvm_mmu_calculate_default_mmu_pages(kvm));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9940
				9941	/*
				9942	* Dirty logging tracks sptes in 4k granularity, meaning that large
				9943	* sptes have to be split. If live migration is successful, the guest
				9944	* in the source machine will be destroyed and large sptes will be
				9945	* created in the destination. However, if the guest continues to run
				9946	* in the source machine (for example if live migration fails), small
				9947	* sptes will remain around and cause bad performance.
				9948	*
				9949	* Scan sptes if dirty logging has been stopped, dropping those
				9950	* which can be collapsed into a single large-page spte. Later
				9951	* page faults will create the large-page sptes.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9952	*
				9953	* There is no need to do this in any of the following cases:
				9954	* CREATE: No dirty mappings will already exist.
				9955	* MOVE/DELETE: The old mappings will already have been cleaned up by
				9956	* kvm_arch_flush_shadow_memslot()
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9957	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9958	if (change == KVM_MR_FLAGS_ONLY &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9959	(old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
				9960	!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
				9961	kvm_mmu_zap_collapsible_sptes(kvm, new);
				9962
				9963	/*
				9964	* Set up write protection and/or dirty logging for the new slot.
				9965	*
				9966	* For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
				9967	* been zapped so no dirty logging staff is needed for old slot. For
				9968	* KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
				9969	* new and it's also covered when dealing with the new slot.
				9970	*
				9971	* FIXME: const-ify all uses of struct kvm_memory_slot.
				9972	*/
				9973	if (change != KVM_MR_DELETE)
				9974	kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
				9975	}
				9976
				9977	void kvm_arch_flush_shadow_all(struct kvm *kvm)
				9978	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	9979	kvm_mmu_zap_all(kvm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	9980	}
				9981
				9982	void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				9983	struct kvm_memory_slot *slot)
				9984	{
				9985	kvm_page_track_flush_slot(kvm, slot);
				9986	}
				9987
				9988	static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
				9989	{
				9990	return (is_guest_mode(vcpu) &&
				9991	kvm_x86_ops->guest_apic_has_interrupt &&
				9992	kvm_x86_ops->guest_apic_has_interrupt(vcpu));
				9993	}
				9994
				9995	static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
				9996	{
				9997	if (!list_empty_careful(&vcpu->async_pf.done))
				9998	return true;
				9999
				10000	if (kvm_apic_has_events(vcpu))
				10001	return true;
				10002
				10003	if (vcpu->arch.pv.pv_unhalted)
				10004	return true;
				10005
				10006	if (vcpu->arch.exception.pending)
				10007	return true;
				10008
				10009	if (kvm_test_request(KVM_REQ_NMI, vcpu) \|\|
				10010	(vcpu->arch.nmi_pending &&
				10011	kvm_x86_ops->nmi_allowed(vcpu)))
				10012	return true;
				10013
				10014	if (kvm_test_request(KVM_REQ_SMI, vcpu) \|\|
				10015	(vcpu->arch.smi_pending && !is_smm(vcpu)))
				10016	return true;
				10017
				10018	if (kvm_arch_interrupt_allowed(vcpu) &&
				10019	(kvm_cpu_has_interrupt(vcpu) \|\|
				10020	kvm_guest_apic_has_interrupt(vcpu)))
				10021	return true;
				10022
				10023	if (kvm_hv_has_stimer_pending(vcpu))
				10024	return true;
				10025
				10026	return false;
				10027	}
				10028
				10029	int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
				10030	{
				10031	return kvm_vcpu_running(vcpu) \|\| kvm_vcpu_has_events(vcpu);
				10032	}
				10033
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10034	bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
				10035	{
				10036	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
				10037	return true;
				10038
				10039	if (kvm_test_request(KVM_REQ_NMI, vcpu) \|\|
				10040	kvm_test_request(KVM_REQ_SMI, vcpu) \|\|
				10041	kvm_test_request(KVM_REQ_EVENT, vcpu))
				10042	return true;
				10043
				10044	if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
				10045	return true;
				10046
				10047	return false;
				10048	}
				10049
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10050	bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
				10051	{
				10052	return vcpu->arch.preempted_in_kernel;
				10053	}
				10054
				10055	int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
				10056	{
				10057	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
				10058	}
				10059
				10060	int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
				10061	{
				10062	return kvm_x86_ops->interrupt_allowed(vcpu);
				10063	}
				10064
				10065	unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
				10066	{
				10067	if (is_64_bit_mode(vcpu))
				10068	return kvm_rip_read(vcpu);
				10069	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
				10070	kvm_rip_read(vcpu));
				10071	}
				10072	EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
				10073
				10074	bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
				10075	{
				10076	return kvm_get_linear_rip(vcpu) == linear_rip;
				10077	}
				10078	EXPORT_SYMBOL_GPL(kvm_is_linear_rip);
				10079
				10080	unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
				10081	{
				10082	unsigned long rflags;
				10083
				10084	rflags = kvm_x86_ops->get_rflags(vcpu);
				10085	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
				10086	rflags &= ~X86_EFLAGS_TF;
				10087	return rflags;
				10088	}
				10089	EXPORT_SYMBOL_GPL(kvm_get_rflags);
				10090
				10091	static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
				10092	{
				10093	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
				10094	kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
				10095	rflags \|= X86_EFLAGS_TF;
				10096	kvm_x86_ops->set_rflags(vcpu, rflags);
				10097	}
				10098
				10099	void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
				10100	{
				10101	__kvm_set_rflags(vcpu, rflags);
				10102	kvm_make_request(KVM_REQ_EVENT, vcpu);
				10103	}
				10104	EXPORT_SYMBOL_GPL(kvm_set_rflags);
				10105
				10106	void kvm_arch_async_page_ready(struct kvm_vcpu vcpu, struct kvm_async_pf work)
				10107	{
				10108	int r;
				10109
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10110	if ((vcpu->arch.mmu->direct_map != work->arch.direct_map) \|\|
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10111	work->wakeup_all)
				10112	return;
				10113
				10114	r = kvm_mmu_reload(vcpu);
				10115	if (unlikely(r))
				10116	return;
				10117
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10118	if (!vcpu->arch.mmu->direct_map &&
				10119	work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10120	return;
				10121
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10122	vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10123	}
				10124
				10125	static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
				10126	{
				10127	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
				10128	}
				10129
				10130	static inline u32 kvm_async_pf_next_probe(u32 key)
				10131	{
				10132	return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
				10133	}
				10134
				10135	static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				10136	{
				10137	u32 key = kvm_async_pf_hash_fn(gfn);
				10138
				10139	while (vcpu->arch.apf.gfns[key] != ~0)
				10140	key = kvm_async_pf_next_probe(key);
				10141
				10142	vcpu->arch.apf.gfns[key] = gfn;
				10143	}
				10144
				10145	static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
				10146	{
				10147	int i;
				10148	u32 key = kvm_async_pf_hash_fn(gfn);
				10149
				10150	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
				10151	(vcpu->arch.apf.gfns[key] != gfn &&
				10152	vcpu->arch.apf.gfns[key] != ~0); i++)
				10153	key = kvm_async_pf_next_probe(key);
				10154
				10155	return key;
				10156	}
				10157
				10158	bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				10159	{
				10160	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
				10161	}
				10162
				10163	static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
				10164	{
				10165	u32 i, j, k;
				10166
				10167	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
				10168	while (true) {
				10169	vcpu->arch.apf.gfns[i] = ~0;
				10170	do {
				10171	j = kvm_async_pf_next_probe(j);
				10172	if (vcpu->arch.apf.gfns[j] == ~0)
				10173	return;
				10174	k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
				10175	/*
				10176	* k lies cyclically in ]i,j]
				10177	* \| i.k.j \|
				10178	* \|....j i.k.\| or \|.k..j i...\|
				10179	*/
				10180	} while ((i <= j) ? (i < k && k <= j) : (i < k \|\| k <= j));
				10181	vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
				10182	i = j;
				10183	}
				10184	}
				10185
				10186	static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
				10187	{
				10188
				10189	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
				10190	sizeof(val));
				10191	}
				10192
				10193	static int apf_get_user(struct kvm_vcpu vcpu, u32 val)
				10194	{
				10195
				10196	return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
				10197	sizeof(u32));
				10198	}
				10199
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10200	static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
				10201	{
				10202	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
				10203	return false;
				10204
				10205	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) \|\|
				10206	(vcpu->arch.apf.send_user_only &&
				10207	kvm_x86_ops->get_cpl(vcpu) == 0))
				10208	return false;
				10209
				10210	return true;
				10211	}
				10212
				10213	bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
				10214	{
				10215	if (unlikely(!lapic_in_kernel(vcpu) \|\|
				10216	kvm_event_needs_reinjection(vcpu) \|\|
				10217	vcpu->arch.exception.pending))
				10218	return false;
				10219
				10220	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
				10221	return false;
				10222
				10223	/*
				10224	* If interrupts are off we cannot even use an artificial
				10225	* halt state.
				10226	*/
				10227	return kvm_x86_ops->interrupt_allowed(vcpu);
				10228	}
				10229
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10230	void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
				10231	struct kvm_async_pf *work)
				10232	{
				10233	struct x86_exception fault;
				10234
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10235	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10236	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
				10237
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10238	if (kvm_can_deliver_async_pf(vcpu) &&
				10239	!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10240	fault.vector = PF_VECTOR;
				10241	fault.error_code_valid = true;
				10242	fault.error_code = 0;
				10243	fault.nested_page_fault = false;
				10244	fault.address = work->arch.token;
				10245	fault.async_page_fault = true;
				10246	kvm_inject_page_fault(vcpu, &fault);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10247	} else {
				10248	/*
				10249	* It is not possible to deliver a paravirtualized asynchronous
				10250	* page fault, but putting the guest in an artificial halt state
				10251	* can be beneficial nevertheless: if an interrupt arrives, we
				10252	* can deliver it timely and perhaps the guest will schedule
				10253	* another process. When the instruction that triggered a page
				10254	* fault is retried, hopefully the page will be ready in the host.
				10255	*/
				10256	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10257	}
				10258	}
				10259
				10260	void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
				10261	struct kvm_async_pf *work)
				10262	{
				10263	struct x86_exception fault;
				10264	u32 val;
				10265
				10266	if (work->wakeup_all)
				10267	work->arch.token = ~0; /* broadcast wakeup */
				10268	else
				10269	kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10270	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10271
				10272	if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
				10273	!apf_get_user(vcpu, &val)) {
				10274	if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
				10275	vcpu->arch.exception.pending &&
				10276	vcpu->arch.exception.nr == PF_VECTOR &&
				10277	!apf_put_user(vcpu, 0)) {
				10278	vcpu->arch.exception.injected = false;
				10279	vcpu->arch.exception.pending = false;
				10280	vcpu->arch.exception.nr = 0;
				10281	vcpu->arch.exception.has_error_code = false;
				10282	vcpu->arch.exception.error_code = 0;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10283	vcpu->arch.exception.has_payload = false;
				10284	vcpu->arch.exception.payload = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10285	} else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
				10286	fault.vector = PF_VECTOR;
				10287	fault.error_code_valid = true;
				10288	fault.error_code = 0;
				10289	fault.nested_page_fault = false;
				10290	fault.address = work->arch.token;
				10291	fault.async_page_fault = true;
				10292	kvm_inject_page_fault(vcpu, &fault);
				10293	}
				10294	}
				10295	vcpu->arch.apf.halted = false;
				10296	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
				10297	}
				10298
				10299	bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
				10300	{
				10301	if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
				10302	return true;
				10303	else
				10304	return kvm_can_do_async_pf(vcpu);
				10305	}
				10306
				10307	void kvm_arch_start_assignment(struct kvm *kvm)
				10308	{
				10309	atomic_inc(&kvm->arch.assigned_device_count);
				10310	}
				10311	EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);
				10312
				10313	void kvm_arch_end_assignment(struct kvm *kvm)
				10314	{
				10315	atomic_dec(&kvm->arch.assigned_device_count);
				10316	}
				10317	EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
				10318
				10319	bool kvm_arch_has_assigned_device(struct kvm *kvm)
				10320	{
				10321	return atomic_read(&kvm->arch.assigned_device_count);
				10322	}
				10323	EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
				10324
				10325	void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
				10326	{
				10327	atomic_inc(&kvm->arch.noncoherent_dma_count);
				10328	}
				10329	EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);
				10330
				10331	void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
				10332	{
				10333	atomic_dec(&kvm->arch.noncoherent_dma_count);
				10334	}
				10335	EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);
				10336
				10337	bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
				10338	{
				10339	return atomic_read(&kvm->arch.noncoherent_dma_count);
				10340	}
				10341	EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
				10342
				10343	bool kvm_arch_has_irq_bypass(void)
				10344	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10345	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10346	}
				10347
				10348	int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				10349	struct irq_bypass_producer *prod)
				10350	{
				10351	struct kvm_kernel_irqfd *irqfd =
				10352	container_of(cons, struct kvm_kernel_irqfd, consumer);
				10353
				10354	irqfd->producer = prod;
				10355
				10356	return kvm_x86_ops->update_pi_irte(irqfd->kvm,
				10357	prod->irq, irqfd->gsi, 1);
				10358	}
				10359
				10360	void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				10361	struct irq_bypass_producer *prod)
				10362	{
				10363	int ret;
				10364	struct kvm_kernel_irqfd *irqfd =
				10365	container_of(cons, struct kvm_kernel_irqfd, consumer);
				10366
				10367	WARN_ON(irqfd->producer != prod);
				10368	irqfd->producer = NULL;
				10369
				10370	/*
				10371	* When producer of consumer is unregistered, we change back to
				10372	* remapped mode, so we can re-use the current implementation
				10373	* when the irq is masked/disabled or the consumer side (KVM
				10374	* int this case doesn't want to receive the interrupts.
				10375	*/
				10376	ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
				10377	if (ret)
				10378	printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
				10379	" fails: %d\n", irqfd->consumer.token, ret);
				10380	}
				10381
				10382	int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				10383	uint32_t guest_irq, bool set)
				10384	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10385	return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
				10386	}
				10387
				10388	bool kvm_vector_hashing_enabled(void)
				10389	{
				10390	return vector_hashing;
				10391	}
				10392	EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
				10393
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10394	bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
				10395	{
				10396	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
				10397	}
				10398	EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
				10399
				10400
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	10401	int kvm_spec_ctrl_test_value(u64 value)
				10402	{
				10403	/*
				10404	* test that setting IA32_SPEC_CTRL to given value
				10405	* is allowed by the host processor
				10406	*/
				10407
				10408	u64 saved_value;
				10409	unsigned long flags;
				10410	int ret = 0;
				10411
				10412	local_irq_save(flags);
				10413
				10414	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
				10415	ret = 1;
				10416	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
				10417	ret = 1;
				10418	else
				10419	wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
				10420
				10421	local_irq_restore(flags);
				10422
				10423	return ret;
				10424	}
				10425	EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
				10426
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10427	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
				10428	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
				10429	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
				10430	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
				10431	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
				10432	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
				10433	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
				10434	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
				10435	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
				10436	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10437	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10438	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
				10439	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
				10440	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
				10441	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10442	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	10443	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
				10444	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
				10445	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
				10446	EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);