Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f1a0eeb..a5d6d79 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -18,14 +18,17 @@
#include <linux/kvm_host.h>
#include "irq.h"
+#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
+#include "kvm_emulate.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
+#include "lapic.h"
#include <linux/clocksource.h>
#include <linux/interrupt.h>
@@ -53,6 +56,7 @@
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
+#include <linux/entry-kvm.h>
#include <trace/events/kvm.h>
@@ -67,7 +71,9 @@
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
+#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
+#include <asm/emulate_prefix.h>
#include <clocksource/hyperv_timer.h>
#define CREATE_TRACE_POINTS
@@ -79,7 +85,7 @@
EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
- container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
+ ((struct kvm_vcpu *)(ctxt)->vcpu)
/* EFER defaults:
* - enable syscall per default because its emulated by KVM
@@ -94,9 +100,6 @@
static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
-#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
-#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
-
#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
@@ -108,7 +111,7 @@
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
-struct kvm_x86_ops *kvm_x86_ops __read_mostly;
+struct kvm_x86_ops kvm_x86_ops __read_mostly;
EXPORT_SYMBOL_GPL(kvm_x86_ops);
static bool __read_mostly ignore_msrs = 0;
@@ -160,87 +163,144 @@
int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, S_IRUGO | S_IWUSR);
-#define KVM_NR_SHARED_MSRS 16
+/*
+ * Restoring the host value for MSRs that are only consumed when running in
+ * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
+ * returns to userspace, i.e. the kernel can run with the guest's value.
+ */
+#define KVM_MAX_NR_USER_RETURN_MSRS 16
-struct kvm_shared_msrs_global {
+struct kvm_user_return_msrs_global {
int nr;
- u32 msrs[KVM_NR_SHARED_MSRS];
+ u32 msrs[KVM_MAX_NR_USER_RETURN_MSRS];
};
-struct kvm_shared_msrs {
+struct kvm_user_return_msrs {
struct user_return_notifier urn;
bool registered;
- struct kvm_shared_msr_values {
+ struct kvm_user_return_msr_values {
u64 host;
u64 curr;
- } values[KVM_NR_SHARED_MSRS];
+ } values[KVM_MAX_NR_USER_RETURN_MSRS];
};
-static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
-static struct kvm_shared_msrs __percpu *shared_msrs;
+static struct kvm_user_return_msrs_global __read_mostly user_return_msrs_global;
+static struct kvm_user_return_msrs __percpu *user_return_msrs;
+
+#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
+ | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
+ | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
+ | XFEATURE_MASK_PKRU)
+
+u64 __read_mostly host_efer;
+EXPORT_SYMBOL_GPL(host_efer);
+
+bool __read_mostly allow_smaller_maxphyaddr = 0;
+EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);
+
+static u64 __read_mostly host_xss;
+u64 __read_mostly supported_xss;
+EXPORT_SYMBOL_GPL(supported_xss);
struct kvm_stats_debugfs_item debugfs_entries[] = {
- { "pf_fixed", VCPU_STAT(pf_fixed) },
- { "pf_guest", VCPU_STAT(pf_guest) },
- { "tlb_flush", VCPU_STAT(tlb_flush) },
- { "invlpg", VCPU_STAT(invlpg) },
- { "exits", VCPU_STAT(exits) },
- { "io_exits", VCPU_STAT(io_exits) },
- { "mmio_exits", VCPU_STAT(mmio_exits) },
- { "signal_exits", VCPU_STAT(signal_exits) },
- { "irq_window", VCPU_STAT(irq_window_exits) },
- { "nmi_window", VCPU_STAT(nmi_window_exits) },
- { "halt_exits", VCPU_STAT(halt_exits) },
- { "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
- { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
- { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
- { "halt_wakeup", VCPU_STAT(halt_wakeup) },
- { "hypercalls", VCPU_STAT(hypercalls) },
- { "request_irq", VCPU_STAT(request_irq_exits) },
- { "irq_exits", VCPU_STAT(irq_exits) },
- { "host_state_reload", VCPU_STAT(host_state_reload) },
- { "fpu_reload", VCPU_STAT(fpu_reload) },
- { "insn_emulation", VCPU_STAT(insn_emulation) },
- { "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
- { "irq_injections", VCPU_STAT(irq_injections) },
- { "nmi_injections", VCPU_STAT(nmi_injections) },
- { "req_event", VCPU_STAT(req_event) },
- { "l1d_flush", VCPU_STAT(l1d_flush) },
- { "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
- { "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
- { "mmu_flooded", VM_STAT(mmu_flooded) },
- { "mmu_recycled", VM_STAT(mmu_recycled) },
- { "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
- { "mmu_unsync", VM_STAT(mmu_unsync) },
- { "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
- { "largepages", VM_STAT(lpages, .mode = 0444) },
- { "nx_largepages_splitted", VM_STAT(nx_lpage_splits, .mode = 0444) },
- { "max_mmu_page_hash_collisions",
- VM_STAT(max_mmu_page_hash_collisions) },
+ VCPU_STAT("pf_fixed", pf_fixed),
+ VCPU_STAT("pf_guest", pf_guest),
+ VCPU_STAT("tlb_flush", tlb_flush),
+ VCPU_STAT("invlpg", invlpg),
+ VCPU_STAT("exits", exits),
+ VCPU_STAT("io_exits", io_exits),
+ VCPU_STAT("mmio_exits", mmio_exits),
+ VCPU_STAT("signal_exits", signal_exits),
+ VCPU_STAT("irq_window", irq_window_exits),
+ VCPU_STAT("nmi_window", nmi_window_exits),
+ VCPU_STAT("halt_exits", halt_exits),
+ VCPU_STAT("halt_successful_poll", halt_successful_poll),
+ VCPU_STAT("halt_attempted_poll", halt_attempted_poll),
+ VCPU_STAT("halt_poll_invalid", halt_poll_invalid),
+ VCPU_STAT("halt_wakeup", halt_wakeup),
+ VCPU_STAT("hypercalls", hypercalls),
+ VCPU_STAT("request_irq", request_irq_exits),
+ VCPU_STAT("irq_exits", irq_exits),
+ VCPU_STAT("host_state_reload", host_state_reload),
+ VCPU_STAT("fpu_reload", fpu_reload),
+ VCPU_STAT("insn_emulation", insn_emulation),
+ VCPU_STAT("insn_emulation_fail", insn_emulation_fail),
+ VCPU_STAT("irq_injections", irq_injections),
+ VCPU_STAT("nmi_injections", nmi_injections),
+ VCPU_STAT("req_event", req_event),
+ VCPU_STAT("l1d_flush", l1d_flush),
+ VCPU_STAT("halt_poll_success_ns", halt_poll_success_ns),
+ VCPU_STAT("halt_poll_fail_ns", halt_poll_fail_ns),
+ VM_STAT("mmu_shadow_zapped", mmu_shadow_zapped),
+ VM_STAT("mmu_pte_write", mmu_pte_write),
+ VM_STAT("mmu_pde_zapped", mmu_pde_zapped),
+ VM_STAT("mmu_flooded", mmu_flooded),
+ VM_STAT("mmu_recycled", mmu_recycled),
+ VM_STAT("mmu_cache_miss", mmu_cache_miss),
+ VM_STAT("mmu_unsync", mmu_unsync),
+ VM_STAT("remote_tlb_flush", remote_tlb_flush),
+ VM_STAT("largepages", lpages, .mode = 0444),
+ VM_STAT("nx_largepages_splitted", nx_lpage_splits, .mode = 0444),
+ VM_STAT("max_mmu_page_hash_collisions", max_mmu_page_hash_collisions),
{ NULL }
};
u64 __read_mostly host_xcr0;
+u64 __read_mostly supported_xcr0;
+EXPORT_SYMBOL_GPL(supported_xcr0);
-struct kmem_cache *x86_fpu_cache;
-EXPORT_SYMBOL_GPL(x86_fpu_cache);
+static struct kmem_cache *x86_fpu_cache;
+
+static struct kmem_cache *x86_emulator_cache;
+
+/*
+ * When called, it means the previous get/set msr reached an invalid msr.
+ * Return true if we want to ignore/silent this failed msr access.
+ */
+static bool kvm_msr_ignored_check(struct kvm_vcpu *vcpu, u32 msr,
+ u64 data, bool write)
+{
+ const char *op = write ? "wrmsr" : "rdmsr";
+
+ if (ignore_msrs) {
+ if (report_ignored_msrs)
+ kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ /* Mask the error */
+ return true;
+ } else {
+ kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
+ op, msr, data);
+ return false;
+ }
+}
+
+static struct kmem_cache *kvm_alloc_emulator_cache(void)
+{
+ unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
+ unsigned int size = sizeof(struct x86_emulate_ctxt);
+
+ return kmem_cache_create_usercopy("x86_emulator", size,
+ __alignof__(struct x86_emulate_ctxt),
+ SLAB_ACCOUNT, useroffset,
+ size - useroffset, NULL);
+}
static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
int i;
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
+ for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
vcpu->arch.apf.gfns[i] = ~0;
}
static void kvm_on_user_return(struct user_return_notifier *urn)
{
unsigned slot;
- struct kvm_shared_msrs *locals
- = container_of(urn, struct kvm_shared_msrs, urn);
- struct kvm_shared_msr_values *values;
+ struct kvm_user_return_msrs *msrs
+ = container_of(urn, struct kvm_user_return_msrs, urn);
+ struct kvm_user_return_msr_values *values;
unsigned long flags;
/*
@@ -248,84 +308,89 @@
* interrupted and executed through kvm_arch_hardware_disable()
*/
local_irq_save(flags);
- if (locals->registered) {
- locals->registered = false;
+ if (msrs->registered) {
+ msrs->registered = false;
user_return_notifier_unregister(urn);
}
local_irq_restore(flags);
- for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
- values = &locals->values[slot];
+ for (slot = 0; slot < user_return_msrs_global.nr; ++slot) {
+ values = &msrs->values[slot];
if (values->host != values->curr) {
- wrmsrl(shared_msrs_global.msrs[slot], values->host);
+ wrmsrl(user_return_msrs_global.msrs[slot], values->host);
values->curr = values->host;
}
}
}
-static void shared_msr_update(unsigned slot, u32 msr)
+int kvm_probe_user_return_msr(u32 msr)
{
+ u64 val;
+ int ret;
+
+ preempt_disable();
+ ret = rdmsrl_safe(msr, &val);
+ if (ret)
+ goto out;
+ ret = wrmsrl_safe(msr, val);
+out:
+ preempt_enable();
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_probe_user_return_msr);
+
+void kvm_define_user_return_msr(unsigned slot, u32 msr)
+{
+ BUG_ON(slot >= KVM_MAX_NR_USER_RETURN_MSRS);
+ user_return_msrs_global.msrs[slot] = msr;
+ if (slot >= user_return_msrs_global.nr)
+ user_return_msrs_global.nr = slot + 1;
+}
+EXPORT_SYMBOL_GPL(kvm_define_user_return_msr);
+
+static void kvm_user_return_msr_cpu_online(void)
+{
+ unsigned int cpu = smp_processor_id();
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
u64 value;
- unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ int i;
- /* only read, and nobody should modify it at this time,
- * so don't need lock */
- if (slot >= shared_msrs_global.nr) {
- printk(KERN_ERR "kvm: invalid MSR slot!");
- return;
+ for (i = 0; i < user_return_msrs_global.nr; ++i) {
+ rdmsrl_safe(user_return_msrs_global.msrs[i], &value);
+ msrs->values[i].host = value;
+ msrs->values[i].curr = value;
}
- rdmsrl_safe(msr, &value);
- smsr->values[slot].host = value;
- smsr->values[slot].curr = value;
}
-void kvm_define_shared_msr(unsigned slot, u32 msr)
-{
- BUG_ON(slot >= KVM_NR_SHARED_MSRS);
- shared_msrs_global.msrs[slot] = msr;
- if (slot >= shared_msrs_global.nr)
- shared_msrs_global.nr = slot + 1;
-}
-EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
-
-static void kvm_shared_msr_cpu_online(void)
-{
- unsigned i;
-
- for (i = 0; i < shared_msrs_global.nr; ++i)
- shared_msr_update(i, shared_msrs_global.msrs[i]);
-}
-
-int kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
+int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
int err;
- value = (value & mask) | (smsr->values[slot].host & ~mask);
- if (value == smsr->values[slot].curr)
+ value = (value & mask) | (msrs->values[slot].host & ~mask);
+ if (value == msrs->values[slot].curr)
return 0;
- err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
+ err = wrmsrl_safe(user_return_msrs_global.msrs[slot], value);
if (err)
return 1;
- smsr->values[slot].curr = value;
- if (!smsr->registered) {
- smsr->urn.on_user_return = kvm_on_user_return;
- user_return_notifier_register(&smsr->urn);
- smsr->registered = true;
+ msrs->values[slot].curr = value;
+ if (!msrs->registered) {
+ msrs->urn.on_user_return = kvm_on_user_return;
+ user_return_notifier_register(&msrs->urn);
+ msrs->registered = true;
}
return 0;
}
-EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
+EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);
static void drop_user_return_notifiers(void)
{
unsigned int cpu = smp_processor_id();
- struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
+ struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
- if (smsr->registered)
- kvm_on_user_return(&smsr->urn);
+ if (msrs->registered)
+ kvm_on_user_return(&msrs->urn);
}
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
@@ -357,11 +422,12 @@
}
kvm_lapic_set_base(vcpu, msr_info->data);
+ kvm_recalculate_apic_map(vcpu->kvm);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);
-asmlinkage __visible void kvm_spurious_fault(void)
+asmlinkage __visible noinstr void kvm_spurious_fault(void)
{
/* Fault while not rebooting. We want the trace. */
BUG_ON(!kvm_rebooting);
@@ -503,19 +569,7 @@
vcpu->arch.exception.error_code = error_code;
vcpu->arch.exception.has_payload = has_payload;
vcpu->arch.exception.payload = payload;
- /*
- * In guest mode, payload delivery should be deferred,
- * so that the L1 hypervisor can intercept #PF before
- * CR2 is modified (or intercept #DB before DR6 is
- * modified under nVMX). However, for ABI
- * compatibility with KVM_GET_VCPU_EVENTS and
- * KVM_SET_VCPU_EVENTS, we can't delay payload
- * delivery unless userspace has enabled this
- * functionality via the per-VM capability,
- * KVM_CAP_EXCEPTION_PAYLOAD.
- */
- if (!vcpu->kvm->arch.exception_payload_enabled ||
- !is_guest_mode(vcpu))
+ if (!is_guest_mode(vcpu))
kvm_deliver_exception_payload(vcpu);
return;
}
@@ -562,11 +616,12 @@
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);
-static void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
- unsigned long payload)
+void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
+ unsigned long payload)
{
kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
+EXPORT_SYMBOL_GPL(kvm_queue_exception_p);
static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
u32 error_code, unsigned long payload)
@@ -601,15 +656,28 @@
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
+bool kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
+ struct x86_exception *fault)
{
- if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
- vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
- else
- vcpu->arch.mmu->inject_page_fault(vcpu, fault);
+ struct kvm_mmu *fault_mmu;
+ WARN_ON_ONCE(fault->vector != PF_VECTOR);
+ fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
+ vcpu->arch.walk_mmu;
+
+ /*
+ * Invalidate the TLB entry for the faulting address, if it exists,
+ * else the access will fault indefinitely (and to emulate hardware).
+ */
+ if ((fault->error_code & PFERR_PRESENT_MASK) &&
+ !(fault->error_code & PFERR_RSVD_MASK))
+ kvm_mmu_invalidate_gva(vcpu, fault_mmu, fault->address,
+ fault_mmu->root_hpa);
+
+ fault_mmu->inject_page_fault(vcpu, fault);
return fault->nested_page_fault;
}
+EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);
void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
@@ -636,7 +704,7 @@
*/
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
- if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
+ if (kvm_x86_ops.get_cpl(vcpu) <= required_cpl)
return true;
kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
return false;
@@ -718,10 +786,8 @@
ret = 1;
memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
+
out:
return ret;
@@ -731,7 +797,6 @@
bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
- bool changed = true;
int offset;
gfn_t gfn;
int r;
@@ -739,8 +804,7 @@
if (!is_pae_paging(vcpu))
return false;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail))
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_PDPTR))
return true;
gfn = (kvm_read_cr3(vcpu) & 0xffffffe0ul) >> PAGE_SHIFT;
@@ -748,17 +812,16 @@
r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
PFERR_USER_MASK | PFERR_WRITE_MASK);
if (r < 0)
- goto out;
- changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
-out:
+ return true;
- return changed;
+ return memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
}
EXPORT_SYMBOL_GPL(pdptrs_changed);
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
unsigned long old_cr0 = kvm_read_cr0(vcpu);
+ unsigned long pdptr_bits = X86_CR0_CD | X86_CR0_NW | X86_CR0_PG;
unsigned long update_bits = X86_CR0_PG | X86_CR0_WP;
cr0 |= X86_CR0_ET;
@@ -776,27 +839,27 @@
if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
return 1;
- if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
- if ((vcpu->arch.efer & EFER_LME)) {
- int cs_db, cs_l;
+ if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
+ (cr0 & X86_CR0_PG)) {
+ int cs_db, cs_l;
- if (!is_pae(vcpu))
- return 1;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
- if (cs_l)
- return 1;
- } else
-#endif
- if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
- kvm_read_cr3(vcpu)))
+ if (!is_pae(vcpu))
+ return 1;
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ if (cs_l)
return 1;
}
+#endif
+ if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
+ is_pae(vcpu) && ((cr0 ^ old_cr0) & pdptr_bits) &&
+ !load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)))
+ return 1;
if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
return 1;
- kvm_x86_ops->set_cr0(vcpu, cr0);
+ kvm_x86_ops.set_cr0(vcpu, cr0);
if ((cr0 ^ old_cr0) & X86_CR0_PG) {
kvm_clear_async_pf_completion_queue(vcpu);
@@ -821,14 +884,16 @@
}
EXPORT_SYMBOL_GPL(kvm_lmsw);
-void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
- if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE) &&
- !vcpu->guest_xcr0_loaded) {
- /* kvm_set_xcr() also depends on this */
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
- vcpu->guest_xcr0_loaded = 1;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
}
if (static_cpu_has(X86_FEATURE_PKU) &&
@@ -837,9 +902,9 @@
vcpu->arch.pkru != vcpu->arch.host_pkru)
__write_pkru(vcpu->arch.pkru);
}
-EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);
-void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
+void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
if (static_cpu_has(X86_FEATURE_PKU) &&
(kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
@@ -849,13 +914,18 @@
__write_pkru(vcpu->arch.host_pkru);
}
- if (vcpu->guest_xcr0_loaded) {
+ if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) {
+
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
- vcpu->guest_xcr0_loaded = 0;
+
+ if (vcpu->arch.xsaves_enabled &&
+ vcpu->arch.ia32_xss != host_xss)
+ wrmsrl(MSR_IA32_XSS, host_xss);
}
+
}
-EXPORT_SYMBOL_GPL(kvm_put_guest_xcr0);
+EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);
static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
@@ -893,13 +963,13 @@
vcpu->arch.xcr0 = xcr0;
if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
- if (kvm_x86_ops->get_cpl(vcpu) != 0 ||
+ if (kvm_x86_ops.get_cpl(vcpu) != 0 ||
__kvm_set_xcr(vcpu, index, xcr)) {
kvm_inject_gp(vcpu, 0);
return 1;
@@ -908,63 +978,17 @@
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
-static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
-{
- u64 reserved_bits = CR4_RESERVED_BITS;
-
- if (!cpu_has(c, X86_FEATURE_XSAVE))
- reserved_bits |= X86_CR4_OSXSAVE;
-
- if (!cpu_has(c, X86_FEATURE_SMEP))
- reserved_bits |= X86_CR4_SMEP;
-
- if (!cpu_has(c, X86_FEATURE_SMAP))
- reserved_bits |= X86_CR4_SMAP;
-
- if (!cpu_has(c, X86_FEATURE_FSGSBASE))
- reserved_bits |= X86_CR4_FSGSBASE;
-
- if (!cpu_has(c, X86_FEATURE_PKU))
- reserved_bits |= X86_CR4_PKE;
-
- if (!cpu_has(c, X86_FEATURE_LA57) &&
- !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
- reserved_bits |= X86_CR4_LA57;
-
- if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
- reserved_bits |= X86_CR4_UMIP;
-
- return reserved_bits;
-}
-
-static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
if (cr4 & cr4_reserved_bits)
return -EINVAL;
- if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMEP) && (cr4 & X86_CR4_SMEP))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_SMAP) && (cr4 & X86_CR4_SMAP))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_FSGSBASE) && (cr4 & X86_CR4_FSGSBASE))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_PKU) && (cr4 & X86_CR4_PKE))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_LA57) && (cr4 & X86_CR4_LA57))
- return -EINVAL;
-
- if (!guest_cpuid_has(vcpu, X86_FEATURE_UMIP) && (cr4 & X86_CR4_UMIP))
+ if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return -EINVAL;
return 0;
}
+EXPORT_SYMBOL_GPL(kvm_valid_cr4);
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
@@ -996,7 +1020,7 @@
return 1;
}
- if (kvm_x86_ops->set_cr4(vcpu, cr4))
+ if (kvm_x86_ops.set_cr4(vcpu, cr4))
return 1;
if (((cr4 ^ old_cr4) & mmu_role_bits) ||
@@ -1004,7 +1028,7 @@
kvm_mmu_reset_context(vcpu);
if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE))
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
return 0;
}
@@ -1025,21 +1049,21 @@
if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
if (!skip_tlb_flush) {
kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
return 0;
}
if (is_long_mode(vcpu) &&
- (cr3 & rsvd_bits(cpuid_maxphyaddr(vcpu), 63)))
+ (cr3 & vcpu->arch.cr3_lm_rsvd_bits))
return 1;
else if (is_pae_paging(vcpu) &&
!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
return 1;
- kvm_mmu_new_cr3(vcpu, cr3, skip_tlb_flush);
+ kvm_mmu_new_pgd(vcpu, cr3, skip_tlb_flush, skip_tlb_flush);
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
return 0;
}
@@ -1077,13 +1101,7 @@
}
}
-static void kvm_update_dr6(struct kvm_vcpu *vcpu)
-{
- if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
- kvm_x86_ops->set_dr6(vcpu, vcpu->arch.dr6);
-}
-
-static void kvm_update_dr7(struct kvm_vcpu *vcpu)
+void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
unsigned long dr7;
@@ -1091,11 +1109,12 @@
dr7 = vcpu->arch.guest_debug_dr7;
else
dr7 = vcpu->arch.dr7;
- kvm_x86_ops->set_dr7(vcpu, dr7);
+ kvm_x86_ops.set_dr7(vcpu, dr7);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
if (dr7 & DR7_BP_EN_MASK)
vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
+EXPORT_SYMBOL_GPL(kvm_update_dr7);
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
@@ -1117,17 +1136,14 @@
vcpu->arch.eff_db[dr] = val;
break;
case 4:
- /* fall through */
case 6:
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr6_valid(val))
return -1; /* #GP */
vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
- kvm_update_dr6(vcpu);
break;
case 5:
- /* fall through */
default: /* 7 */
- if (val & 0xffffffff00000000ULL)
+ if (!kvm_dr7_valid(val))
return -1; /* #GP */
vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
kvm_update_dr7(vcpu);
@@ -1156,15 +1172,10 @@
*val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
- /* fall through */
case 6:
- if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
- *val = vcpu->arch.dr6;
- else
- *val = kvm_x86_ops->get_dr6(vcpu);
+ *val = vcpu->arch.dr6;
break;
case 5:
- /* fall through */
default: /* 7 */
*val = vcpu->arch.dr7;
break;
@@ -1207,7 +1218,7 @@
MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
- MSR_IA32_FEATURE_CONTROL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
+ MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
MSR_IA32_SPEC_CTRL,
MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
@@ -1218,7 +1229,7 @@
MSR_IA32_UMWAIT_CONTROL,
MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
- MSR_ARCH_PERFMON_FIXED_CTR0 + 2, MSR_ARCH_PERFMON_FIXED_CTR0 + 3,
+ MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
@@ -1239,6 +1250,13 @@
MSR_ARCH_PERFMON_EVENTSEL0 + 12, MSR_ARCH_PERFMON_EVENTSEL0 + 13,
MSR_ARCH_PERFMON_EVENTSEL0 + 14, MSR_ARCH_PERFMON_EVENTSEL0 + 15,
MSR_ARCH_PERFMON_EVENTSEL0 + 16, MSR_ARCH_PERFMON_EVENTSEL0 + 17,
+
+ MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
+ MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,
+ MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
+ MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
+ MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
+ MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,
};
static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_all)];
@@ -1260,13 +1278,18 @@
HV_X64_MSR_VP_ASSIST_PAGE,
HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
HV_X64_MSR_TSC_EMULATION_STATUS,
+ HV_X64_MSR_SYNDBG_OPTIONS,
+ HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
+ HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
+ HV_X64_MSR_SYNDBG_PENDING_BUFFER,
MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
- MSR_KVM_PV_EOI_EN,
+ MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,
MSR_IA32_TSC_ADJUST,
MSR_IA32_TSCDEADLINE,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
@@ -1277,6 +1300,7 @@
MSR_MISC_FEATURES_ENABLES,
MSR_AMD64_VIRT_SPEC_CTRL,
MSR_IA32_POWER_CTL,
+ MSR_IA32_UCODE_REV,
/*
* The following list leaves out MSRs whose values are determined
@@ -1332,6 +1356,7 @@
MSR_F10H_DECFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
+ MSR_IA32_PERF_CAPABILITIES,
};
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
@@ -1371,30 +1396,25 @@
if (!boot_cpu_has_bug(X86_BUG_MDS))
data |= ARCH_CAP_MDS_NO;
- /*
- * On TAA affected systems, export MDS_NO=0 when:
- * - TSX is enabled on the host, i.e. X86_FEATURE_RTM=1.
- * - Updated microcode is present. This is detected by
- * the presence of ARCH_CAP_TSX_CTRL_MSR and ensures
- * that VERW clears CPU buffers.
- *
- * When MDS_NO=0 is exported, guests deploy clear CPU buffer
- * mitigation and don't complain:
- *
- * "Vulnerable: Clear CPU buffers attempted, no microcode"
- *
- * If TSX is disabled on the system, guests are also mitigated against
- * TAA and clear CPU buffer mitigation is not required for guests.
- */
- if (!boot_cpu_has(X86_FEATURE_RTM))
+ if (!boot_cpu_has(X86_FEATURE_RTM)) {
+ /*
+ * If RTM=0 because the kernel has disabled TSX, the host might
+ * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0
+ * and therefore knows that there cannot be TAA) but keep
+ * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts,
+ * and we want to allow migrating those guests to tsx=off hosts.
+ */
data &= ~ARCH_CAP_TAA_NO;
- else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ } else if (!boot_cpu_has_bug(X86_BUG_TAA)) {
data |= ARCH_CAP_TAA_NO;
- else if (data & ARCH_CAP_TSX_CTRL_MSR)
- data &= ~ARCH_CAP_MDS_NO;
+ } else {
+ /*
+ * Nothing to do here; we emulate TSX_CTRL if present on the
+ * host so the guest can choose between disabling TSX or
+ * using VERW to clear CPU buffers.
+ */
+ }
- /* KVM does not emulate MSR_IA32_TSX_CTRL. */
- data &= ~ARCH_CAP_TSX_CTRL_MSR;
return data;
}
@@ -1408,8 +1428,7 @@
rdmsrl_safe(msr->index, &msr->data);
break;
default:
- if (kvm_x86_ops->get_msr_feature(msr))
- return 1;
+ return kvm_x86_ops.get_msr_feature(msr);
}
return 0;
}
@@ -1421,6 +1440,14 @@
msr.index = index;
r = kvm_get_msr_feature(&msr);
+
+ if (r == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear the output for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ r = 0;
+ }
+
if (r)
return r;
@@ -1460,6 +1487,7 @@
{
u64 old_efer = vcpu->arch.efer;
u64 efer = msr_info->data;
+ int r;
if (efer & efer_reserved_bits)
return 1;
@@ -1476,7 +1504,11 @@
efer &= ~EFER_LMA;
efer |= vcpu->arch.efer & EFER_LMA;
- kvm_x86_ops->set_efer(vcpu, efer);
+ r = kvm_x86_ops.set_efer(vcpu, efer);
+ if (r) {
+ WARN_ON(r > 0);
+ return r;
+ }
/* Update reserved bits */
if ((efer ^ old_efer) & EFER_NX)
@@ -1491,6 +1523,49 @@
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);
+bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+ struct msr_bitmap_range *ranges;
+ struct kvm *kvm = vcpu->kvm;
+ bool allowed;
+ int idx;
+ u32 i;
+
+ /* x2APIC MSRs do not support filtering. */
+ if (index >= 0x800 && index <= 0x8ff)
+ return true;
+
+ idx = srcu_read_lock(&kvm->srcu);
+
+ msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu);
+ if (!msr_filter) {
+ allowed = true;
+ goto out;
+ }
+
+ allowed = msr_filter->default_allow;
+ ranges = msr_filter->ranges;
+
+ for (i = 0; i < msr_filter->count; i++) {
+ u32 start = ranges[i].base;
+ u32 end = start + ranges[i].nmsrs;
+ u32 flags = ranges[i].flags;
+ unsigned long *bitmap = ranges[i].bitmap;
+
+ if ((index >= start) && (index < end) && (flags & type)) {
+ allowed = !!test_bit(index - start, bitmap);
+ break;
+ }
+ }
+
+out:
+ srcu_read_unlock(&kvm->srcu, idx);
+
+ return allowed;
+}
+EXPORT_SYMBOL_GPL(kvm_msr_allowed);
+
/*
* Write @data into the MSR specified by @index. Select MSR specific fault
* checks are bypassed if @host_initiated is %true.
@@ -1502,6 +1577,9 @@
{
struct msr_data msr;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE))
+ return KVM_MSR_RET_FILTERED;
+
switch (index) {
case MSR_FS_BASE:
case MSR_GS_BASE:
@@ -1532,7 +1610,19 @@
msr.index = index;
msr.host_initiated = host_initiated;
- return kvm_x86_ops->set_msr(vcpu, &msr);
+ return kvm_x86_ops.set_msr(vcpu, &msr);
+}
+
+static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 data, bool host_initiated)
+{
+ int ret = __kvm_set_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID)
+ if (kvm_msr_ignored_check(vcpu, index, data, true))
+ ret = 0;
+
+ return ret;
}
/*
@@ -1541,39 +1631,136 @@
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
-static int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
- bool host_initiated)
+int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data,
+ bool host_initiated)
{
struct msr_data msr;
int ret;
+ if (!host_initiated && !kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ))
+ return KVM_MSR_RET_FILTERED;
+
msr.index = index;
msr.host_initiated = host_initiated;
- ret = kvm_x86_ops->get_msr(vcpu, &msr);
+ ret = kvm_x86_ops.get_msr(vcpu, &msr);
if (!ret)
*data = msr.data;
return ret;
}
+static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu,
+ u32 index, u64 *data, bool host_initiated)
+{
+ int ret = __kvm_get_msr(vcpu, index, data, host_initiated);
+
+ if (ret == KVM_MSR_RET_INVALID) {
+ /* Unconditionally clear *data for simplicity */
+ *data = 0;
+ if (kvm_msr_ignored_check(vcpu, index, 0, false))
+ ret = 0;
+ }
+
+ return ret;
+}
+
int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, false);
+ return kvm_get_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_get_msr);
int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data)
{
- return __kvm_set_msr(vcpu, index, data, false);
+ return kvm_set_msr_ignored_check(vcpu, index, data, false);
}
EXPORT_SYMBOL_GPL(kvm_set_msr);
+static int complete_emulated_msr(struct kvm_vcpu *vcpu, bool is_read)
+{
+ if (vcpu->run->msr.error) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ } else if (is_read) {
+ kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
+ kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
+ }
+
+ return kvm_skip_emulated_instruction(vcpu);
+}
+
+static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, true);
+}
+
+static int complete_emulated_wrmsr(struct kvm_vcpu *vcpu)
+{
+ return complete_emulated_msr(vcpu, false);
+}
+
+static u64 kvm_msr_reason(int r)
+{
+ switch (r) {
+ case KVM_MSR_RET_INVALID:
+ return KVM_MSR_EXIT_REASON_UNKNOWN;
+ case KVM_MSR_RET_FILTERED:
+ return KVM_MSR_EXIT_REASON_FILTER;
+ default:
+ return KVM_MSR_EXIT_REASON_INVAL;
+ }
+}
+
+static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index,
+ u32 exit_reason, u64 data,
+ int (*completion)(struct kvm_vcpu *vcpu),
+ int r)
+{
+ u64 msr_reason = kvm_msr_reason(r);
+
+ /* Check if the user wanted to know about this MSR fault */
+ if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason))
+ return 0;
+
+ vcpu->run->exit_reason = exit_reason;
+ vcpu->run->msr.error = 0;
+ memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad));
+ vcpu->run->msr.reason = msr_reason;
+ vcpu->run->msr.index = index;
+ vcpu->run->msr.data = data;
+ vcpu->arch.complete_userspace_io = completion;
+
+ return 1;
+}
+
+static int kvm_get_msr_user_space(struct kvm_vcpu *vcpu, u32 index, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_RDMSR, 0,
+ complete_emulated_rdmsr, r);
+}
+
+static int kvm_set_msr_user_space(struct kvm_vcpu *vcpu, u32 index, u64 data, int r)
+{
+ return kvm_msr_user_space(vcpu, index, KVM_EXIT_X86_WRMSR, data,
+ complete_emulated_wrmsr, r);
+}
+
int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu)
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data;
+ int r;
- if (kvm_get_msr(vcpu, ecx, &data)) {
+ r = kvm_get_msr(vcpu, ecx, &data);
+
+ /* MSR read failed? See if we should ask user space */
+ if (r && kvm_get_msr_user_space(vcpu, ecx, r)) {
+ /* Bounce to user space */
+ return 0;
+ }
+
+ /* MSR read failed? Inject a #GP */
+ if (r) {
trace_kvm_msr_read_ex(ecx);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1591,8 +1778,21 @@
{
u32 ecx = kvm_rcx_read(vcpu);
u64 data = kvm_read_edx_eax(vcpu);
+ int r;
- if (kvm_set_msr(vcpu, ecx, data)) {
+ r = kvm_set_msr(vcpu, ecx, data);
+
+ /* MSR write failed? See if we should ask user space */
+ if (r && kvm_set_msr_user_space(vcpu, ecx, data, r))
+ /* Bounce to user space */
+ return 0;
+
+ /* Signal all other negative errors to userspace */
+ if (r < 0)
+ return r;
+
+ /* MSR write failed? Inject a #GP */
+ if (r > 0) {
trace_kvm_msr_write_ex(ecx, data);
kvm_inject_gp(vcpu, 0);
return 1;
@@ -1603,33 +1803,113 @@
}
EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr);
+bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu)
+{
+ return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) ||
+ xfer_to_guest_mode_work_pending();
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_exit_request);
+
+/*
+ * The fast path for frequent and performance sensitive wrmsr emulation,
+ * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces
+ * the latency of virtual IPI by avoiding the expensive bits of transitioning
+ * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the
+ * other cases which must be called after interrupts are enabled on the host.
+ */
+static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
+ return 1;
+
+ if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) &&
+ ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) &&
+ ((data & APIC_MODE_MASK) == APIC_DM_FIXED) &&
+ ((u32)(data >> 32) != X2APIC_BROADCAST)) {
+
+ data &= ~(1 << 12);
+ kvm_apic_send_ipi(vcpu->arch.apic, (u32)data, (u32)(data >> 32));
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR2, (u32)(data >> 32));
+ kvm_lapic_set_reg(vcpu->arch.apic, APIC_ICR, (u32)data);
+ trace_kvm_apic_write(APIC_ICR, (u32)data);
+ return 0;
+ }
+
+ return 1;
+}
+
+static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data)
+{
+ if (!kvm_can_use_hv_timer(vcpu))
+ return 1;
+
+ kvm_set_lapic_tscdeadline_msr(vcpu, data);
+ return 0;
+}
+
+fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu)
+{
+ u32 msr = kvm_rcx_read(vcpu);
+ u64 data;
+ fastpath_t ret = EXIT_FASTPATH_NONE;
+
+ switch (msr) {
+ case APIC_BASE_MSR + (APIC_ICR >> 4):
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_EXIT_HANDLED;
+ }
+ break;
+ case MSR_IA32_TSCDEADLINE:
+ data = kvm_read_edx_eax(vcpu);
+ if (!handle_fastpath_set_tscdeadline(vcpu, data)) {
+ kvm_skip_emulated_instruction(vcpu);
+ ret = EXIT_FASTPATH_REENTER_GUEST;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (ret != EXIT_FASTPATH_NONE)
+ trace_kvm_msr_write(msr, data);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff);
+
/*
* Adapt set_msr() to msr_io()'s calling convention
*/
static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_get_msr(vcpu, index, data, true);
+ return kvm_get_msr_ignored_check(vcpu, index, data, true);
}
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
- return __kvm_set_msr(vcpu, index, *data, true);
+ return kvm_set_msr_ignored_check(vcpu, index, *data, true);
}
#ifdef CONFIG_X86_64
+struct pvclock_clock {
+ int vclock_mode;
+ u64 cycle_last;
+ u64 mask;
+ u32 mult;
+ u32 shift;
+ u64 base_cycles;
+ u64 offset;
+};
+
struct pvclock_gtod_data {
seqcount_t seq;
- struct { /* extract of a clocksource struct */
- int vclock_mode;
- u64 cycle_last;
- u64 mask;
- u32 mult;
- u32 shift;
- } clock;
+ struct pvclock_clock clock; /* extract of a clocksource struct */
+ struct pvclock_clock raw_clock; /* extract of a clocksource struct */
- u64 boot_ns;
- u64 nsec_base;
+ ktime_t offs_boot;
u64 wall_time_sec;
};
@@ -1638,40 +1918,54 @@
static void update_pvclock_gtod(struct timekeeper *tk)
{
struct pvclock_gtod_data *vdata = &pvclock_gtod_data;
- u64 boot_ns;
-
- boot_ns = ktime_to_ns(ktime_add(tk->tkr_mono.base, tk->offs_boot));
write_seqcount_begin(&vdata->seq);
/* copy pvclock gtod data */
- vdata->clock.vclock_mode = tk->tkr_mono.clock->archdata.vclock_mode;
+ vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode;
vdata->clock.cycle_last = tk->tkr_mono.cycle_last;
vdata->clock.mask = tk->tkr_mono.mask;
vdata->clock.mult = tk->tkr_mono.mult;
vdata->clock.shift = tk->tkr_mono.shift;
+ vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec;
+ vdata->clock.offset = tk->tkr_mono.base;
- vdata->boot_ns = boot_ns;
- vdata->nsec_base = tk->tkr_mono.xtime_nsec;
+ vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode;
+ vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last;
+ vdata->raw_clock.mask = tk->tkr_raw.mask;
+ vdata->raw_clock.mult = tk->tkr_raw.mult;
+ vdata->raw_clock.shift = tk->tkr_raw.shift;
+ vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec;
+ vdata->raw_clock.offset = tk->tkr_raw.base;
vdata->wall_time_sec = tk->xtime_sec;
+ vdata->offs_boot = tk->offs_boot;
+
write_seqcount_end(&vdata->seq);
}
-#endif
-void kvm_set_pending_timer(struct kvm_vcpu *vcpu)
+static s64 get_kvmclock_base_ns(void)
{
- kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
- kvm_vcpu_kick(vcpu);
+ /* Count up from boot time, but with the frequency of the raw clock. */
+ return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot));
}
+#else
+static s64 get_kvmclock_base_ns(void)
+{
+ /* Master clock not used, so we can just use CLOCK_BOOTTIME. */
+ return ktime_get_boottime_ns();
+}
+#endif
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec64 boot;
+ u64 wall_nsec;
+
+ kvm->arch.wall_clock = wall_clock;
if (!wall_clock)
return;
@@ -1691,17 +1985,12 @@
/*
* The guest calculates current wall clock time by adding
* system time (updated by kvm_guest_time_update below) to the
- * wall clock specified here. guest system time equals host
- * system time for us, thus we must fill in host boot time here.
+ * wall clock specified here. We do the reverse here.
*/
- getboottime64(&boot);
+ wall_nsec = ktime_get_real_ns() - get_kvmclock_ns(kvm);
- if (kvm->arch.kvmclock_offset) {
- struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
- boot = timespec64_sub(boot, ts);
- }
- wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
- wc.nsec = boot.tv_nsec;
+ wc.nsec = do_div(wall_nsec, 1000000000);
+ wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */
wc.version = version;
kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
@@ -1710,6 +1999,34 @@
kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
}
+static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time,
+ bool old_msr, bool host_initiated)
+{
+ struct kvm_arch *ka = &vcpu->kvm->arch;
+
+ if (vcpu->vcpu_id == 0 && !host_initiated) {
+ if (ka->boot_vcpu_runs_old_kvmclock != old_msr)
+ kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+
+ ka->boot_vcpu_runs_old_kvmclock = old_msr;
+ }
+
+ vcpu->arch.time = system_time;
+ kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
+
+ /* we verify if the enable bit is set... */
+ vcpu->arch.pv_time_enabled = false;
+ if (!(system_time & 1))
+ return;
+
+ if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
+ &vcpu->arch.pv_time, system_time & ~1ULL,
+ sizeof(struct pvclock_vcpu_time_info)))
+ vcpu->arch.pv_time_enabled = true;
+
+ return;
+}
+
static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
{
do_shl32_div32(dividend, divisor);
@@ -1838,7 +2155,7 @@
static inline int gtod_is_based_on_tsc(int mode)
{
- return mode == VCLOCK_TSC || mode == VCLOCK_HVCLOCK;
+ return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK;
}
static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
@@ -1869,12 +2186,6 @@
#endif
}
-static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
-{
- u64 curr_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
- vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
-}
-
/*
* Multiply tsc by a fixed point number represented by ratio.
*
@@ -1913,15 +2224,14 @@
u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
{
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
-
- return tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
+ return vcpu->arch.l1_tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
}
EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
- vcpu->arch.tsc_offset = kvm_x86_ops->write_l1_tsc_offset(vcpu, offset);
+ vcpu->arch.l1_tsc_offset = offset;
+ vcpu->arch.tsc_offset = kvm_x86_ops.write_l1_tsc_offset(vcpu, offset);
}
static inline bool kvm_check_tsc_unstable(void)
@@ -1931,29 +2241,28 @@
* TSC is marked unstable when we're running on Hyper-V,
* 'TSC page' clocksource is good.
*/
- if (pvclock_gtod_data.clock.vclock_mode == VCLOCK_HVCLOCK)
+ if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK)
return false;
#endif
return check_tsc_unstable();
}
-void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
+static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 data)
{
struct kvm *kvm = vcpu->kvm;
u64 offset, ns, elapsed;
unsigned long flags;
bool matched;
bool already_matched;
- u64 data = msr->data;
bool synchronizing = false;
raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
offset = kvm_compute_tsc_offset(vcpu, data);
- ns = ktime_get_boottime_ns();
+ ns = get_kvmclock_base_ns();
elapsed = ns - kvm->arch.last_tsc_nsec;
if (vcpu->arch.virtual_tsc_khz) {
- if (data == 0 && msr->host_initiated) {
+ if (data == 0) {
/*
* detection of vcpu initialization -- need to sync
* with other vCPUs. This particularly helps to keep
@@ -2023,9 +2332,6 @@
vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec;
vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write;
- if (!msr->host_initiated && guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST))
- update_ia32_tsc_adjust_msr(vcpu, offset);
-
kvm_vcpu_write_tsc_offset(vcpu, offset);
raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
@@ -2040,12 +2346,10 @@
spin_unlock(&kvm->arch.pvclock_gtod_sync_lock);
}
-EXPORT_SYMBOL_GPL(kvm_write_tsc);
-
static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
s64 adjustment)
{
- u64 tsc_offset = kvm_x86_ops->read_l1_tsc_offset(vcpu);
+ u64 tsc_offset = vcpu->arch.l1_tsc_offset;
kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
}
@@ -2079,43 +2383,43 @@
return last;
}
-static inline u64 vgettsc(u64 *tsc_timestamp, int *mode)
+static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
+ int *mode)
{
long v;
- struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
u64 tsc_pg_val;
- switch (gtod->clock.vclock_mode) {
- case VCLOCK_HVCLOCK:
+ switch (clock->vclock_mode) {
+ case VDSO_CLOCKMODE_HVCLOCK:
tsc_pg_val = hv_read_tsc_page_tsc(hv_get_tsc_page(),
tsc_timestamp);
if (tsc_pg_val != U64_MAX) {
/* TSC page valid */
- *mode = VCLOCK_HVCLOCK;
- v = (tsc_pg_val - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ *mode = VDSO_CLOCKMODE_HVCLOCK;
+ v = (tsc_pg_val - clock->cycle_last) &
+ clock->mask;
} else {
/* TSC page invalid */
- *mode = VCLOCK_NONE;
+ *mode = VDSO_CLOCKMODE_NONE;
}
break;
- case VCLOCK_TSC:
- *mode = VCLOCK_TSC;
+ case VDSO_CLOCKMODE_TSC:
+ *mode = VDSO_CLOCKMODE_TSC;
*tsc_timestamp = read_tsc();
- v = (*tsc_timestamp - gtod->clock.cycle_last) &
- gtod->clock.mask;
+ v = (*tsc_timestamp - clock->cycle_last) &
+ clock->mask;
break;
default:
- *mode = VCLOCK_NONE;
+ *mode = VDSO_CLOCKMODE_NONE;
}
- if (*mode == VCLOCK_NONE)
+ if (*mode == VDSO_CLOCKMODE_NONE)
*tsc_timestamp = v = 0;
- return v * gtod->clock.mult;
+ return v * clock->mult;
}
-static int do_monotonic_boot(s64 *t, u64 *tsc_timestamp)
+static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
{
struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
unsigned long seq;
@@ -2124,10 +2428,10 @@
do {
seq = read_seqcount_begin(>od->seq);
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
- ns >>= gtod->clock.shift;
- ns += gtod->boot_ns;
+ ns = gtod->raw_clock.base_cycles;
+ ns += vgettsc(>od->raw_clock, tsc_timestamp, &mode);
+ ns >>= gtod->raw_clock.shift;
+ ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
*t = ns;
@@ -2144,8 +2448,8 @@
do {
seq = read_seqcount_begin(>od->seq);
ts->tv_sec = gtod->wall_time_sec;
- ns = gtod->nsec_base;
- ns += vgettsc(tsc_timestamp, &mode);
+ ns = gtod->clock.base_cycles;
+ ns += vgettsc(>od->clock, tsc_timestamp, &mode);
ns >>= gtod->clock.shift;
} while (unlikely(read_seqcount_retry(>od->seq, seq)));
@@ -2162,7 +2466,7 @@
if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
return false;
- return gtod_is_based_on_tsc(do_monotonic_boot(kernel_ns,
+ return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
tsc_timestamp));
}
@@ -2287,7 +2591,7 @@
spin_lock(&ka->pvclock_gtod_sync_lock);
if (!ka->use_master_clock) {
spin_unlock(&ka->pvclock_gtod_sync_lock);
- return ktime_get_boottime_ns() + ka->kvmclock_offset;
+ return get_kvmclock_base_ns() + ka->kvmclock_offset;
}
hv_clock.tsc_timestamp = ka->master_cycle_now;
@@ -2303,7 +2607,7 @@
&hv_clock.tsc_to_system_mul);
ret = __pvclock_read_cycles(&hv_clock, rdtsc());
} else
- ret = ktime_get_boottime_ns() + ka->kvmclock_offset;
+ ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
put_cpu();
@@ -2402,7 +2706,7 @@
}
if (!use_master_clock) {
host_tsc = rdtsc();
- kernel_ns = ktime_get_boottime_ns();
+ kernel_ns = get_kvmclock_base_ns();
}
tsc_timestamp = kvm_read_l1_tsc(v, host_tsc);
@@ -2520,7 +2824,7 @@
static bool can_set_mci_status(struct kvm_vcpu *vcpu)
{
/* McStatusWrEn enabled? */
- if (guest_cpuid_is_amd(vcpu))
+ if (guest_cpuid_is_amd_or_hygon(vcpu))
return !!(vcpu->arch.msr_hwcr & BIT_ULL(18));
return false;
@@ -2587,49 +2891,80 @@
u32 page_num = data & ~PAGE_MASK;
u64 page_addr = data & PAGE_MASK;
u8 *page;
- int r;
- r = -E2BIG;
if (page_num >= blob_size)
- goto out;
- r = -ENOMEM;
+ return 1;
+
page = memdup_user(blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE);
- if (IS_ERR(page)) {
- r = PTR_ERR(page);
- goto out;
+ if (IS_ERR(page))
+ return PTR_ERR(page);
+
+ if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE)) {
+ kfree(page);
+ return 1;
}
- if (kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE))
- goto out_free;
- r = 0;
-out_free:
- kfree(page);
-out:
- return r;
+ return 0;
+}
+
+static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu)
+{
+ u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT;
+
+ return (vcpu->arch.apf.msr_en_val & mask) == mask;
}
static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
{
gpa_t gpa = data & ~0x3f;
- /* Bits 3:5 are reserved, Should be zero */
- if (data & 0x38)
+ /* Bits 4:5 are reserved, Should be zero */
+ if (data & 0x30)
return 1;
- vcpu->arch.apf.msr_val = data;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT))
+ return 1;
- if (!(data & KVM_ASYNC_PF_ENABLED)) {
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) &&
+ (data & KVM_ASYNC_PF_DELIVERY_AS_INT))
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return data ? 1 : 0;
+
+ vcpu->arch.apf.msr_en_val = data;
+
+ if (!kvm_pv_async_pf_enabled(vcpu)) {
kvm_clear_async_pf_completion_queue(vcpu);
kvm_async_pf_hash_reset(vcpu);
return 0;
}
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
- sizeof(u32)))
+ sizeof(u64)))
return 1;
vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;
+
kvm_async_pf_wakeup_all(vcpu);
+
+ return 0;
+}
+
+static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data)
+{
+ /* Bits 8-63 are reserved */
+ if (data >> 8)
+ return 1;
+
+ if (!lapic_in_kernel(vcpu))
+ return 1;
+
+ vcpu->arch.apf.msr_int_val = data;
+
+ vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK;
+
return 0;
}
@@ -2639,10 +2974,16 @@
vcpu->arch.time = 0;
}
-static void kvm_vcpu_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
+static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
- kvm_x86_ops->tlb_flush(vcpu, invalidate_gpa);
+ kvm_x86_ops.tlb_flush_all(vcpu);
+}
+
+static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ ++vcpu->stat.tlb_flush;
+ kvm_x86_ops.tlb_flush_guest(vcpu);
}
static void record_steal_time(struct kvm_vcpu *vcpu)
@@ -2665,10 +3006,14 @@
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
- trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- st->preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
- kvm_vcpu_flush_tlb(vcpu, false);
+ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
+ trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
+ st->preempted & KVM_VCPU_FLUSH_TLB);
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ kvm_vcpu_flush_tlb_guest(vcpu);
+ } else {
+ st->preempted = 0;
+ }
vcpu->arch.st.preempted = 0;
@@ -2715,6 +3060,20 @@
return 1;
vcpu->arch.arch_capabilities = data;
break;
+ case MSR_IA32_PERF_CAPABILITIES: {
+ struct kvm_msr_entry msr_ent = {.index = msr, .data = 0};
+
+ if (!msr_info->host_initiated)
+ return 1;
+ if (kvm_get_msr_feature(&msr_ent))
+ return 1;
+ if (data & ~msr_ent.data)
+ return 1;
+
+ vcpu->arch.perf_capabilities = data;
+
+ return 0;
+ }
case MSR_EFER:
return set_efer(vcpu, msr_info);
case MSR_K7_HWCR:
@@ -2746,9 +3105,9 @@
/* Values other than LBR and BTF are vendor-specific,
thus reserved and should throw a #GP */
return 1;
- }
- vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
- __func__, data);
+ } else if (report_ignored_msrs)
+ vcpu_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTLMSR 0x%llx, nop\n",
+ __func__, data);
break;
case 0x200 ... 0x2ff:
return kvm_mtrr_set_msr(vcpu, msr, data);
@@ -2778,7 +3137,7 @@
if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3))
return 1;
vcpu->arch.ia32_misc_enable_msr = data;
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
} else {
vcpu->arch.ia32_misc_enable_msr = data;
}
@@ -2792,7 +3151,27 @@
vcpu->arch.msr_ia32_power_ctl = data;
break;
case MSR_IA32_TSC:
- kvm_write_tsc(vcpu, msr_info);
+ if (msr_info->host_initiated) {
+ kvm_synchronize_tsc(vcpu, data);
+ } else {
+ u64 adj = kvm_compute_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
+ adjust_tsc_offset_guest(vcpu, adj);
+ vcpu->arch.ia32_tsc_adjust_msr += adj;
+ }
+ break;
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ /*
+ * KVM supports exposing PT to the guest, but does not support
+ * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than
+ * XSAVES/XRSTORS to save/restore PT MSRs.
+ */
+ if (data & ~supported_xss)
+ return 1;
+ vcpu->arch.ia32_xss = data;
+ kvm_update_cpuid_runtime(vcpu);
break;
case MSR_SMI_COUNT:
if (!msr_info->host_initiated)
@@ -2800,43 +3179,54 @@
vcpu->arch.smi_count = data;
break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
+ kvm_write_wall_clock(vcpu->kvm, data);
+ break;
case MSR_KVM_WALL_CLOCK:
- vcpu->kvm->arch.wall_clock = data;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
kvm_write_wall_clock(vcpu->kvm, data);
break;
case MSR_KVM_SYSTEM_TIME_NEW:
- case MSR_KVM_SYSTEM_TIME: {
- struct kvm_arch *ka = &vcpu->kvm->arch;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
- if (vcpu->vcpu_id == 0 && !msr_info->host_initiated) {
- bool tmp = (msr == MSR_KVM_SYSTEM_TIME);
-
- if (ka->boot_vcpu_runs_old_kvmclock != tmp)
- kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
-
- ka->boot_vcpu_runs_old_kvmclock = tmp;
- }
-
- vcpu->arch.time = data;
- kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu);
-
- /* we verify if the enable bit is set... */
- vcpu->arch.pv_time_enabled = false;
- if (!(data & 1))
- break;
-
- if (!kvm_gfn_to_hva_cache_init(vcpu->kvm,
- &vcpu->arch.pv_time, data & ~1ULL,
- sizeof(struct pvclock_vcpu_time_info)))
- vcpu->arch.pv_time_enabled = true;
-
+ kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
break;
- }
+ case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
+ break;
case MSR_KVM_ASYNC_PF_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
if (kvm_pv_enable_async_pf(vcpu, data))
return 1;
break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ if (kvm_pv_enable_async_pf_int(vcpu, data))
+ return 1;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+ if (data & 0x1) {
+ vcpu->arch.apf.pageready_pending = false;
+ kvm_check_async_pf_completion(vcpu);
+ }
+ break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
if (unlikely(!sched_info_on()))
return 1;
@@ -2853,11 +3243,17 @@
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8)))
return 1;
break;
case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
/* only enable bit supported */
if (data & (-1ULL << 1))
return 1;
@@ -2872,7 +3268,8 @@
case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3:
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
- pr = true; /* fall through */
+ pr = true;
+ fallthrough;
case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr))
@@ -2893,6 +3290,8 @@
*/
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
@@ -2938,17 +3337,7 @@
return xen_hvm_config(vcpu, data);
if (kvm_pmu_is_valid_msr(vcpu, msr))
return kvm_pmu_set_msr(vcpu, msr_info);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu,
- "ignored wrmsr: 0x%x data 0x%llx\n",
- msr, data);
- break;
- }
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
@@ -3013,6 +3402,17 @@
case MSR_IA32_PERF_CTL:
case MSR_AMD64_DC_CFG:
case MSR_F15H_EX_CFG:
+ /*
+ * Intel Sandy Bridge CPUs must support the RAPL (running average power
+ * limit) MSRs. Just return 0, as we do not want to expose the host
+ * data here. Do not conditionalize this on CPUID, as KVM does not do
+ * so for existing CPU-specific MSRs.
+ */
+ case MSR_RAPL_POWER_UNIT:
+ case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */
+ case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */
+ case MSR_PKG_ENERGY_STATUS: /* Total package */
+ case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */
msr_info->data = 0;
break;
case MSR_F15H_PERF_CTL0 ... MSR_F15H_PERF_CTR5:
@@ -3021,7 +3421,7 @@
case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1:
case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
+ return kvm_pmu_get_msr(vcpu, msr_info);
msr_info->data = 0;
break;
case MSR_IA32_UCODE_REV:
@@ -3033,12 +3433,31 @@
return 1;
msr_info->data = vcpu->arch.arch_capabilities;
break;
+ case MSR_IA32_PERF_CAPABILITIES:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return 1;
+ msr_info->data = vcpu->arch.perf_capabilities;
+ break;
case MSR_IA32_POWER_CTL:
msr_info->data = vcpu->arch.msr_ia32_power_ctl;
break;
- case MSR_IA32_TSC:
- msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + vcpu->arch.tsc_offset;
+ case MSR_IA32_TSC: {
+ /*
+ * Intel SDM states that MSR_IA32_TSC read adds the TSC offset
+ * even when not intercepted. AMD manual doesn't explicitly
+ * state this but appears to behave the same.
+ *
+ * On userspace reads and writes, however, we unconditionally
+ * return L1's TSC value to ensure backwards-compatible
+ * behavior for migration.
+ */
+ u64 tsc_offset = msr_info->host_initiated ? vcpu->arch.l1_tsc_offset :
+ vcpu->arch.tsc_offset;
+
+ msr_info->data = kvm_scale_tsc(vcpu, rdtsc()) + tsc_offset;
break;
+ }
case MSR_MTRRcap:
case 0x200 ... 0x2ff:
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
@@ -3064,7 +3483,6 @@
break;
case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
- break;
case MSR_IA32_TSCDEADLINE:
msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu);
break;
@@ -3092,23 +3510,63 @@
msr_info->data = vcpu->arch.efer;
break;
case MSR_KVM_WALL_CLOCK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->kvm->arch.wall_clock;
+ break;
case MSR_KVM_WALL_CLOCK_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->kvm->arch.wall_clock;
break;
case MSR_KVM_SYSTEM_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE))
+ return 1;
+
+ msr_info->data = vcpu->arch.time;
+ break;
case MSR_KVM_SYSTEM_TIME_NEW:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2))
+ return 1;
+
msr_info->data = vcpu->arch.time;
break;
case MSR_KVM_ASYNC_PF_EN:
- msr_info->data = vcpu->arch.apf.msr_val;
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_en_val;
+ break;
+ case MSR_KVM_ASYNC_PF_INT:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = vcpu->arch.apf.msr_int_val;
+ break;
+ case MSR_KVM_ASYNC_PF_ACK:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT))
+ return 1;
+
+ msr_info->data = 0;
break;
case MSR_KVM_STEAL_TIME:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME))
+ return 1;
+
msr_info->data = vcpu->arch.st.msr_val;
break;
case MSR_KVM_PV_EOI_EN:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI))
+ return 1;
+
msr_info->data = vcpu->arch.pv_eoi.msr_val;
break;
case MSR_KVM_POLL_CONTROL:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL))
+ return 1;
+
msr_info->data = vcpu->arch.msr_kvm_poll_control;
break;
case MSR_IA32_P5_MC_ADDR:
@@ -3119,6 +3577,12 @@
case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1:
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
+ case MSR_IA32_XSS:
+ if (!msr_info->host_initiated &&
+ !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))
+ return 1;
+ msr_info->data = vcpu->arch.ia32_xss;
+ break;
case MSR_K7_CLK_CTL:
/*
* Provide expected ramp-up count for K7. All other
@@ -3132,6 +3596,8 @@
msr_info->data = 0x20000000;
break;
case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+ case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER:
+ case HV_X64_MSR_SYNDBG_OPTIONS:
case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
case HV_X64_MSR_CRASH_CTL:
case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
@@ -3141,7 +3607,6 @@
return kvm_hv_get_msr_common(vcpu,
msr_info->index, &msr_info->data,
msr_info->host_initiated);
- break;
case MSR_IA32_BBL_CR_CTL3:
/* This legacy MSR exists but isn't fully documented in current
* silicon. It is however accessed by winxp in very narrow
@@ -3179,18 +3644,8 @@
break;
default:
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
- return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data);
- if (!ignore_msrs) {
- vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n",
- msr_info->index);
- return 1;
- } else {
- if (report_ignored_msrs)
- vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n",
- msr_info->index);
- msr_info->data = 0;
- }
- break;
+ return kvm_pmu_get_msr(vcpu, msr_info);
+ return KVM_MSR_RET_INVALID;
}
return 0;
}
@@ -3309,6 +3764,7 @@
case KVM_CAP_X86_ROBUST_SINGLESTEP:
case KVM_CAP_XSAVE:
case KVM_CAP_ASYNC_PF:
+ case KVM_CAP_ASYNC_PF_INT:
case KVM_CAP_GET_TSC_KHZ:
case KVM_CAP_KVMCLOCK_CTRL:
case KVM_CAP_READONLY_MEM:
@@ -3323,6 +3779,11 @@
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
+ case KVM_CAP_SET_GUEST_DEBUG:
+ case KVM_CAP_LAST_CPU:
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ case KVM_CAP_X86_MSR_FILTER:
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -3346,10 +3807,10 @@
* fringe case that is not enabled except via specific settings
* of the module parameters.
*/
- r = kvm_x86_ops->has_emulated_msr(MSR_IA32_SMBASE);
+ r = kvm_x86_ops.has_emulated_msr(MSR_IA32_SMBASE);
break;
case KVM_CAP_VAPIC:
- r = !kvm_x86_ops->cpu_has_accelerated_tpr();
+ r = !kvm_x86_ops.cpu_has_accelerated_tpr();
break;
case KVM_CAP_NR_VCPUS:
r = KVM_SOFT_MAX_VCPUS;
@@ -3376,14 +3837,20 @@
r = KVM_X2APIC_API_VALID_FLAGS;
break;
case KVM_CAP_NESTED_STATE:
- r = kvm_x86_ops->get_nested_state ?
- kvm_x86_ops->get_nested_state(NULL, NULL, 0) : 0;
+ r = kvm_x86_ops.nested_ops->get_state ?
+ kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0;
break;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- r = kvm_x86_ops->enable_direct_tlbflush != NULL;
+ r = kvm_x86_ops.enable_direct_tlbflush != NULL;
break;
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
- r = kvm_x86_ops->nested_enable_evmcs != NULL;
+ r = kvm_x86_ops.nested_ops->enable_evmcs != NULL;
+ break;
+ case KVM_CAP_SMALLER_MAXPHYADDR:
+ r = (int) allow_smaller_maxphyaddr;
+ break;
+ case KVM_CAP_STEAL_TIME:
+ r = sched_info_on();
break;
default:
break;
@@ -3445,7 +3912,7 @@
r = 0;
break;
}
- case KVM_X86_GET_MCE_CAP_SUPPORTED: {
+ case KVM_X86_GET_MCE_CAP_SUPPORTED:
r = -EFAULT;
if (copy_to_user(argp, &kvm_mce_cap_supported,
sizeof(kvm_mce_cap_supported)))
@@ -3477,9 +3944,9 @@
case KVM_GET_MSRS:
r = msr_io(NULL, argp, do_get_msr_feature, 1);
break;
- }
default:
r = -EINVAL;
+ break;
}
out:
return r;
@@ -3499,14 +3966,17 @@
{
/* Address WBINVD may be executed by guest */
if (need_emulate_wbinvd(vcpu)) {
- if (kvm_x86_ops->has_wbinvd_exit())
+ if (kvm_x86_ops.has_wbinvd_exit())
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
smp_call_function_single(vcpu->cpu,
wbinvd_ipi, NULL, 1);
}
- kvm_x86_ops->vcpu_load(vcpu, cpu);
+ kvm_x86_ops.vcpu_load(vcpu, cpu);
+
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
@@ -3573,7 +4043,7 @@
int idx;
if (vcpu->preempted)
- vcpu->arch.preempted_in_kernel = !kvm_x86_ops->get_cpl(vcpu);
+ vcpu->arch.preempted_in_kernel = !kvm_x86_ops.get_cpl(vcpu);
/*
* Disable page faults because we're in atomic context here.
@@ -3592,7 +4062,7 @@
kvm_steal_time_set_preempted(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
pagefault_enable();
- kvm_x86_ops->vcpu_put(vcpu);
+ kvm_x86_ops.vcpu_put(vcpu);
vcpu->arch.last_host_tsc = rdtsc();
/*
* If userspace has set any breakpoints or watchpoints, dr6 is restored
@@ -3606,7 +4076,7 @@
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -3725,7 +4195,7 @@
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
- kvm_x86_ops->setup_mce(vcpu);
+ kvm_x86_ops.setup_mce(vcpu);
out:
return r;
}
@@ -3783,11 +4253,25 @@
{
process_nmi(vcpu);
-
if (kvm_check_request(KVM_REQ_SMI, vcpu))
process_smi(vcpu);
/*
+ * In guest mode, payload delivery should be deferred,
+ * so that the L1 hypervisor can intercept #PF before
+ * CR2 is modified (or intercept #DB before DR6 is
+ * modified under nVMX). Unless the per-VM capability,
+ * KVM_CAP_EXCEPTION_PAYLOAD, is set, we may not defer the delivery of
+ * an exception payload and handle after a KVM_GET_VCPU_EVENTS. Since we
+ * opportunistically defer the exception payload, deliver it if the
+ * capability hasn't been requested before processing a
+ * KVM_GET_VCPU_EVENTS.
+ */
+ if (!vcpu->kvm->arch.exception_payload_enabled &&
+ vcpu->arch.exception.pending && vcpu->arch.exception.has_payload)
+ kvm_deliver_exception_payload(vcpu);
+
+ /*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
* isn't advanced, we should expect to encounter the exception
@@ -3818,11 +4302,11 @@
vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft;
events->interrupt.nr = vcpu->arch.interrupt.nr;
events->interrupt.soft = 0;
- events->interrupt.shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ events->interrupt.shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
events->nmi.injected = vcpu->arch.nmi_injected;
events->nmi.pending = vcpu->arch.nmi_pending != 0;
- events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
+ events->nmi.masked = kvm_x86_ops.get_nmi_mask(vcpu);
events->nmi.pad = 0;
events->sipi_vector = 0; /* never valid when reporting to user space */
@@ -3889,13 +4373,13 @@
vcpu->arch.interrupt.nr = events->interrupt.nr;
vcpu->arch.interrupt.soft = events->interrupt.soft;
if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
- kvm_x86_ops->set_interrupt_shadow(vcpu,
+ kvm_x86_ops.set_interrupt_shadow(vcpu,
events->interrupt.shadow);
vcpu->arch.nmi_injected = events->nmi.injected;
if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
vcpu->arch.nmi_pending = events->nmi.pending;
- kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
+ kvm_x86_ops.set_nmi_mask(vcpu, events->nmi.masked);
if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR &&
lapic_in_kernel(vcpu))
@@ -3907,6 +4391,8 @@
vcpu->arch.hflags |= HF_SMM_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_MASK;
+
+ kvm_x86_ops.nested_ops->leave_nested(vcpu);
kvm_smm_changed(vcpu);
}
@@ -3917,12 +4403,13 @@
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK;
- if (lapic_in_kernel(vcpu)) {
- if (events->smi.latched_init)
- set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- else
- clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
- }
+ }
+
+ if (lapic_in_kernel(vcpu)) {
+ if (events->smi.latched_init)
+ set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
+ else
+ clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
}
@@ -3958,7 +4445,6 @@
memcpy(vcpu->arch.db, dbgregs->db, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = dbgregs->dr6;
- kvm_update_dr6(vcpu);
vcpu->arch.dr7 = dbgregs->dr7;
kvm_update_dr7(vcpu);
@@ -4081,8 +4567,7 @@
* CPUID leaf 0xD, index 0, EDX:EAX. This is for compatibility
* with old userspace.
*/
- if (xstate_bv & ~kvm_supported_xcr0() ||
- mxcsr & ~mxcsr_feature_mask)
+ if (xstate_bv & ~supported_xcr0 || mxcsr & ~mxcsr_feature_mask)
return -EINVAL;
load_xsave(vcpu, (u8 *)guest_xsave->region);
} else {
@@ -4161,7 +4646,7 @@
case KVM_CAP_HYPERV_SYNIC2:
if (cap->args[0])
return -EINVAL;
- /* fall through */
+ fallthrough;
case KVM_CAP_HYPERV_SYNIC:
if (!irqchip_in_kernel(vcpu->kvm))
@@ -4169,9 +4654,9 @@
return kvm_hv_activate_synic(vcpu, cap->cap ==
KVM_CAP_HYPERV_SYNIC2);
case KVM_CAP_HYPERV_ENLIGHTENED_VMCS:
- if (!kvm_x86_ops->nested_enable_evmcs)
+ if (!kvm_x86_ops.nested_ops->enable_evmcs)
return -ENOTTY;
- r = kvm_x86_ops->nested_enable_evmcs(vcpu, &vmcs_version);
+ r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version);
if (!r) {
user_ptr = (void __user *)(uintptr_t)cap->args[0];
if (copy_to_user(user_ptr, &vmcs_version,
@@ -4180,10 +4665,17 @@
}
return r;
case KVM_CAP_HYPERV_DIRECT_TLBFLUSH:
- if (!kvm_x86_ops->enable_direct_tlbflush)
+ if (!kvm_x86_ops.enable_direct_tlbflush)
return -ENOTTY;
- return kvm_x86_ops->enable_direct_tlbflush(vcpu);
+ return kvm_x86_ops.enable_direct_tlbflush(vcpu);
+
+ case KVM_CAP_ENFORCE_PV_FEATURE_CPUID:
+ vcpu->arch.pv_cpuid.enforce = cap->args[0];
+ if (vcpu->arch.pv_cpuid.enforce)
+ kvm_update_pv_runtime(vcpu);
+
+ return 0;
default:
return -EINVAL;
@@ -4453,7 +4945,8 @@
r = -EINVAL;
user_tsc_khz = (u32)arg;
- if (user_tsc_khz >= kvm_max_guest_tsc_khz)
+ if (kvm_has_tsc_control &&
+ user_tsc_khz >= kvm_max_guest_tsc_khz)
goto out;
if (user_tsc_khz == 0)
@@ -4486,7 +4979,7 @@
u32 user_data_size;
r = -EINVAL;
- if (!kvm_x86_ops->get_nested_state)
+ if (!kvm_x86_ops.nested_ops->get_state)
break;
BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size));
@@ -4494,8 +4987,8 @@
if (get_user(user_data_size, &user_kvm_nested_state->size))
break;
- r = kvm_x86_ops->get_nested_state(vcpu, user_kvm_nested_state,
- user_data_size);
+ r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state,
+ user_data_size);
if (r < 0)
break;
@@ -4516,7 +5009,7 @@
int idx;
r = -EINVAL;
- if (!kvm_x86_ops->set_nested_state)
+ if (!kvm_x86_ops.nested_ops->set_state)
break;
r = -EFAULT;
@@ -4529,7 +5022,8 @@
if (kvm_state.flags &
~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE
- | KVM_STATE_NESTED_EVMCS))
+ | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING
+ | KVM_STATE_NESTED_GIF_SET))
break;
/* nested_run_pending implies guest_mode. */
@@ -4538,7 +5032,7 @@
break;
idx = srcu_read_lock(&vcpu->kvm->srcu);
- r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
@@ -4582,14 +5076,14 @@
if (addr > (unsigned int)(-3 * PAGE_SIZE))
return -EINVAL;
- ret = kvm_x86_ops->set_tss_addr(kvm, addr);
+ ret = kvm_x86_ops.set_tss_addr(kvm, addr);
return ret;
}
static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm,
u64 ident_addr)
{
- return kvm_x86_ops->set_identity_map_addr(kvm, ident_addr);
+ return kvm_x86_ops.set_identity_map_addr(kvm, ident_addr);
}
static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
@@ -4730,9 +5224,6 @@
{
struct kvm_pit *pit = kvm->arch.vpit;
- if (!pit)
- return -ENXIO;
-
/* pit->pit_state.lock was overloaded to prevent userspace from getting
* an inconsistent state after running multiple KVM_REINJECT_CONTROL
* ioctls in parallel. Use a separate lock if that ioctl isn't rare.
@@ -4744,77 +5235,13 @@
return 0;
}
-/**
- * kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
- * @kvm: kvm instance
- * @log: slot id and address to which we copy the log
- *
- * Steps 1-4 below provide general overview of dirty page logging. See
- * kvm_get_dirty_log_protect() function description for additional details.
- *
- * We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
- * always flush the TLB (step 4) even if previous step failed and the dirty
- * bitmap may be corrupt. Regardless of previous outcome the KVM logging API
- * does not preclude user space subsequent dirty log read. Flushing TLB ensures
- * writes will be marked dirty for next log read.
- *
- * 1. Take a snapshot of the bit and clear it if needed.
- * 2. Write protect the corresponding page.
- * 3. Copy the snapshot to the userspace.
- * 4. Flush TLB's if needed.
- */
-int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, struct kvm_dirty_log *log)
+void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
{
- bool flush = false;
- int r;
-
- mutex_lock(&kvm->slots_lock);
-
/*
* Flush potentially hardware-cached dirty pages to dirty_bitmap.
*/
- if (kvm_x86_ops->flush_log_dirty)
- kvm_x86_ops->flush_log_dirty(kvm);
-
- r = kvm_get_dirty_log_protect(kvm, log, &flush);
-
- /*
- * All the TLBs can be flushed out of mmu lock, see the comments in
- * kvm_mmu_slot_remove_write_access().
- */
- lockdep_assert_held(&kvm->slots_lock);
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
- mutex_unlock(&kvm->slots_lock);
- return r;
-}
-
-int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm, struct kvm_clear_dirty_log *log)
-{
- bool flush = false;
- int r;
-
- mutex_lock(&kvm->slots_lock);
-
- /*
- * Flush potentially hardware-cached dirty pages to dirty_bitmap.
- */
- if (kvm_x86_ops->flush_log_dirty)
- kvm_x86_ops->flush_log_dirty(kvm);
-
- r = kvm_clear_dirty_log_protect(kvm, log, &flush);
-
- /*
- * All the TLBs can be flushed out of mmu lock, see the comments in
- * kvm_mmu_slot_remove_write_access().
- */
- lockdep_assert_held(&kvm->slots_lock);
- if (flush)
- kvm_flush_remote_tlbs(kvm);
-
- mutex_unlock(&kvm->slots_lock);
- return r;
+ if (kvm_x86_ops.flush_log_dirty)
+ kvm_x86_ops.flush_log_dirty(kvm);
}
int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event,
@@ -4900,6 +5327,10 @@
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+ case KVM_CAP_X86_USER_SPACE_MSR:
+ kvm->arch.user_space_msr_mask = cap->args[0];
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -4907,6 +5338,125 @@
return r;
}
+static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow)
+{
+ struct kvm_x86_msr_filter *msr_filter;
+
+ msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
+ if (!msr_filter)
+ return NULL;
+
+ msr_filter->default_allow = default_allow;
+ return msr_filter;
+}
+
+static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter)
+{
+ u32 i;
+
+ if (!msr_filter)
+ return;
+
+ for (i = 0; i < msr_filter->count; i++)
+ kfree(msr_filter->ranges[i].bitmap);
+
+ kfree(msr_filter);
+}
+
+static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter,
+ struct kvm_msr_filter_range *user_range)
+{
+ struct msr_bitmap_range range;
+ unsigned long *bitmap = NULL;
+ size_t bitmap_size;
+ int r;
+
+ if (!user_range->nmsrs)
+ return 0;
+
+ bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long);
+ if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE)
+ return -EINVAL;
+
+ bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size);
+ if (IS_ERR(bitmap))
+ return PTR_ERR(bitmap);
+
+ range = (struct msr_bitmap_range) {
+ .flags = user_range->flags,
+ .base = user_range->base,
+ .nmsrs = user_range->nmsrs,
+ .bitmap = bitmap,
+ };
+
+ if (range.flags & ~(KVM_MSR_FILTER_READ | KVM_MSR_FILTER_WRITE)) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ if (!range.flags) {
+ r = -EINVAL;
+ goto err;
+ }
+
+ /* Everything ok, add this range identifier. */
+ msr_filter->ranges[msr_filter->count] = range;
+ msr_filter->count++;
+
+ return 0;
+err:
+ kfree(bitmap);
+ return r;
+}
+
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+{
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_x86_msr_filter *new_filter, *old_filter;
+ struct kvm_msr_filter filter;
+ bool default_allow;
+ bool empty = true;
+ int r = 0;
+ u32 i;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
+ empty &= !filter.ranges[i].nmsrs;
+
+ default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ if (empty && !default_allow)
+ return -EINVAL;
+
+ new_filter = kvm_alloc_msr_filter(default_allow);
+ if (!new_filter)
+ return -ENOMEM;
+
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+ if (r) {
+ kvm_free_msr_filter(new_filter);
+ return r;
+ }
+ }
+
+ mutex_lock(&kvm->lock);
+
+ /* The per-VM filter is protected by kvm->lock... */
+ old_filter = srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1);
+
+ rcu_assign_pointer(kvm->arch.msr_filter, new_filter);
+ synchronize_srcu(&kvm->srcu);
+
+ kvm_free_msr_filter(old_filter);
+
+ kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED);
+ mutex_unlock(&kvm->lock);
+
+ return 0;
+}
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5041,9 +5591,6 @@
if (!irqchip_kernel(kvm))
goto set_irqchip_out;
r = kvm_vm_ioctl_set_irqchip(kvm, chip);
- if (r)
- goto set_irqchip_out;
- r = 0;
set_irqchip_out:
kfree(chip);
break;
@@ -5108,6 +5655,9 @@
r = -EFAULT;
if (copy_from_user(&control, argp, sizeof(control)))
goto out;
+ r = -ENXIO;
+ if (!kvm->arch.vpit)
+ goto out;
r = kvm_vm_ioctl_reinject(kvm, &control);
break;
}
@@ -5173,8 +5723,8 @@
}
case KVM_MEMORY_ENCRYPT_OP: {
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_op)
- r = kvm_x86_ops->mem_enc_op(kvm, argp);
+ if (kvm_x86_ops.mem_enc_op)
+ r = kvm_x86_ops.mem_enc_op(kvm, argp);
break;
}
case KVM_MEMORY_ENCRYPT_REG_REGION: {
@@ -5185,8 +5735,8 @@
goto out;
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_reg_region)
- r = kvm_x86_ops->mem_enc_reg_region(kvm, ®ion);
+ if (kvm_x86_ops.mem_enc_reg_region)
+ r = kvm_x86_ops.mem_enc_reg_region(kvm, ®ion);
break;
}
case KVM_MEMORY_ENCRYPT_UNREG_REGION: {
@@ -5197,8 +5747,8 @@
goto out;
r = -ENOTTY;
- if (kvm_x86_ops->mem_enc_unreg_region)
- r = kvm_x86_ops->mem_enc_unreg_region(kvm, ®ion);
+ if (kvm_x86_ops.mem_enc_unreg_region)
+ r = kvm_x86_ops.mem_enc_unreg_region(kvm, ®ion);
break;
}
case KVM_HYPERV_EVENTFD: {
@@ -5213,6 +5763,9 @@
case KVM_SET_PMU_EVENT_FILTER:
r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
break;
+ case KVM_X86_SET_MSR_FILTER:
+ r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+ break;
default:
r = -ENOTTY;
}
@@ -5249,32 +5802,32 @@
continue;
break;
case MSR_TSC_AUX:
- if (!kvm_x86_ops->rdtscp_supported())
+ if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP))
continue;
break;
case MSR_IA32_UMWAIT_CONTROL:
- if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG))
continue;
break;
case MSR_IA32_RTIT_CTL:
case MSR_IA32_RTIT_STATUS:
- if (!kvm_x86_ops->pt_supported())
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT))
continue;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if (!kvm_x86_ops->pt_supported() ||
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
!intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
continue;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
case MSR_IA32_RTIT_OUTPUT_MASK:
- if (!kvm_x86_ops->pt_supported() ||
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
(!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
!intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
continue;
break;
- case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: {
- if (!kvm_x86_ops->pt_supported() ||
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) ||
msrs_to_save_all[i] - MSR_IA32_RTIT_ADDR0_A >=
intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2)
continue;
@@ -5288,7 +5841,7 @@
if (msrs_to_save_all[i] - MSR_ARCH_PERFMON_EVENTSEL0 >=
min(INTEL_PMC_MAX_GENERIC, x86_pmu.num_counters_gp))
continue;
- }
+ break;
default:
break;
}
@@ -5297,7 +5850,7 @@
}
for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) {
- if (!kvm_x86_ops->has_emulated_msr(emulated_msrs_all[i]))
+ if (!kvm_x86_ops.has_emulated_msr(emulated_msrs_all[i]))
continue;
emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i];
@@ -5360,13 +5913,13 @@
static void kvm_set_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops->set_segment(vcpu, var, seg);
+ kvm_x86_ops.set_segment(vcpu, var, seg);
}
void kvm_get_segment(struct kvm_vcpu *vcpu,
struct kvm_segment *var, int seg)
{
- kvm_x86_ops->get_segment(vcpu, var, seg);
+ kvm_x86_ops.get_segment(vcpu, var, seg);
}
gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access,
@@ -5386,14 +5939,14 @@
gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_FETCH_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5401,7 +5954,7 @@
gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
}
@@ -5450,7 +6003,7 @@
struct x86_exception *exception)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
unsigned offset;
int ret;
@@ -5475,7 +6028,7 @@
gva_t addr, void *val, unsigned int bytes,
struct x86_exception *exception)
{
- u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
+ u32 access = (kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
/*
* FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
@@ -5496,7 +6049,7 @@
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = 0;
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception);
@@ -5549,7 +6102,7 @@
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
u32 access = PFERR_WRITE_MASK;
- if (!system && kvm_x86_ops->get_cpl(vcpu) == 3)
+ if (!system && kvm_x86_ops.get_cpl(vcpu) == 3)
access |= PFERR_USER_MASK;
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
@@ -5562,13 +6115,6 @@
/* kvm_write_guest_virt_system can pull in tons of pages. */
vcpu->arch.l1tf_flush_l1d = true;
- /*
- * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED
- * is returned, but our callers are not ready for that and they blindly
- * call kvm_inject_page_fault. Ensure that they at least do not leak
- * uninitialized kernel stack memory into cr2 and error code.
- */
- memset(exception, 0, sizeof(*exception));
return kvm_write_guest_virt_helper(addr, val, bytes, vcpu,
PFERR_WRITE_MASK, exception);
}
@@ -5576,14 +6122,18 @@
int handle_ud(struct kvm_vcpu *vcpu)
{
+ static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX };
int emul_type = EMULTYPE_TRAP_UD;
char sig[5]; /* ud2; .ascii "kvm" */
struct x86_exception e;
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, NULL, 0)))
+ return 1;
+
if (force_emulation_prefix &&
kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu),
sig, sizeof(sig), &e) == 0 &&
- memcmp(sig, "\xf\xbkvm", sizeof(sig)) == 0) {
+ memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
emul_type = EMULTYPE_TRAP_UD_FORCED;
}
@@ -5611,7 +6161,7 @@
gpa_t *gpa, struct x86_exception *exception,
bool write)
{
- u32 access = ((kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
+ u32 access = ((kvm_x86_ops.get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0)
| (write ? PFERR_WRITE_MASK : 0);
/*
@@ -5730,7 +6280,7 @@
int handled, ret;
bool write = ops->write;
struct kvm_mmio_fragment *frag;
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
/*
* If the exit was due to a NPF we may already have a GPA.
@@ -5739,10 +6289,9 @@
* operation using rep will only have the initial GPA from the NPF
* occurred.
*/
- if (vcpu->arch.gpa_available &&
- emulator_can_use_gpa(ctxt) &&
- (addr & ~PAGE_MASK) == (vcpu->arch.gpa_val & ~PAGE_MASK)) {
- gpa = vcpu->arch.gpa_val;
+ if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) &&
+ (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) {
+ gpa = ctxt->gpa_val;
ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
} else {
ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
@@ -5865,6 +6414,7 @@
{
struct kvm_host_map map;
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ u64 page_line_mask;
gpa_t gpa;
char *kaddr;
bool exchanged;
@@ -5879,7 +6429,16 @@
(gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
goto emul_write;
- if (((gpa + bytes - 1) & PAGE_MASK) != (gpa & PAGE_MASK))
+ /*
+ * Emulate the atomic as a straight write to avoid #AC if SLD is
+ * enabled in the host and the access splits a cache line.
+ */
+ if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ page_line_mask = ~(cache_line_size() - 1);
+ else
+ page_line_mask = PAGE_MASK;
+
+ if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask))
goto emul_write;
if (kvm_vcpu_map(vcpu, gpa_to_gfn(gpa), &map))
@@ -5962,11 +6521,9 @@
return 0;
}
-static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port, void *val,
- unsigned int count)
+static int emulator_pio_in(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, void *val, unsigned int count)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
int ret;
if (vcpu->arch.pio.count)
@@ -5986,20 +6543,33 @@
return 0;
}
-static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
- int size, unsigned short port,
- const void *val, unsigned int count)
+static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port, void *val,
+ unsigned int count)
{
- struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ return emulator_pio_in(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
+static int emulator_pio_out(struct kvm_vcpu *vcpu, int size,
+ unsigned short port, const void *val,
+ unsigned int count)
+{
memcpy(vcpu->arch.pio_data, val, size * count);
trace_kvm_pio(KVM_PIO_OUT, port, size, count, vcpu->arch.pio_data);
return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
}
+static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
+ int size, unsigned short port,
+ const void *val, unsigned int count)
+{
+ return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count);
+}
+
static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
{
- return kvm_x86_ops->get_segment_base(vcpu, seg);
+ return kvm_x86_ops.get_segment_base(vcpu, seg);
}
static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
@@ -6012,7 +6582,7 @@
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
- if (kvm_x86_ops->has_wbinvd_exit()) {
+ if (kvm_x86_ops.has_wbinvd_exit()) {
int cpu = get_cpu();
cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
@@ -6117,27 +6687,27 @@
static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
{
- return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
+ return kvm_x86_ops.get_cpl(emul_to_vcpu(ctxt));
}
static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.get_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.get_idt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.set_gdt(emul_to_vcpu(ctxt), dt);
}
static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
{
- kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
+ kvm_x86_ops.set_idt(emul_to_vcpu(ctxt), dt);
}
static unsigned long emulator_get_cached_segment_base(
@@ -6215,13 +6785,33 @@
static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 *pdata)
{
- return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_get_msr(vcpu, msr_index, pdata);
+
+ if (r && kvm_get_msr_user_space(vcpu, msr_index, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
u32 msr_index, u64 data)
{
- return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+ int r;
+
+ r = kvm_set_msr(vcpu, msr_index, data);
+
+ if (r && kvm_set_msr_user_space(vcpu, msr_index, data, r)) {
+ /* Bounce to user space */
+ return X86EMUL_IO_NEEDED;
+ }
+
+ return r;
}
static u64 emulator_get_smbase(struct x86_emulate_ctxt *ctxt)
@@ -6241,7 +6831,7 @@
static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt,
u32 pmc)
{
- return kvm_pmu_is_valid_msr_idx(emul_to_vcpu(ctxt), pmc);
+ return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc);
}
static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt,
@@ -6259,13 +6849,30 @@
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
{
- return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
+ return kvm_x86_ops.check_intercept(emul_to_vcpu(ctxt), info, stage,
+ &ctxt->exception);
}
static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt,
- u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, bool check_limit)
+ u32 *eax, u32 *ebx, u32 *ecx, u32 *edx,
+ bool exact_only)
{
- return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, check_limit);
+ return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only);
+}
+
+static bool emulator_guest_has_long_mode(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_LM);
+}
+
+static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE);
+}
+
+static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
@@ -6280,7 +6887,7 @@
static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked)
{
- kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked);
+ kvm_x86_ops.set_nmi_mask(emul_to_vcpu(ctxt), masked);
}
static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt)
@@ -6299,7 +6906,7 @@
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
const char *smstate)
{
- return kvm_x86_ops->pre_leave_smm(emul_to_vcpu(ctxt), smstate);
+ return kvm_x86_ops.pre_leave_smm(emul_to_vcpu(ctxt), smstate);
}
static void emulator_post_leave_smm(struct x86_emulate_ctxt *ctxt)
@@ -6348,6 +6955,9 @@
.fix_hypercall = emulator_fix_hypercall,
.intercept = emulator_intercept,
.get_cpuid = emulator_get_cpuid,
+ .guest_has_long_mode = emulator_guest_has_long_mode,
+ .guest_has_movbe = emulator_guest_has_movbe,
+ .guest_has_fxsr = emulator_guest_has_fxsr,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
@@ -6358,7 +6968,7 @@
static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
{
- u32 int_shadow = kvm_x86_ops->get_interrupt_shadow(vcpu);
+ u32 int_shadow = kvm_x86_ops.get_interrupt_shadow(vcpu);
/*
* an sti; sti; sequence only disable interrupts for the first
* instruction. So, if the last instruction, be it emulated or
@@ -6369,7 +6979,7 @@
if (int_shadow & mask)
mask = 0;
if (unlikely(int_shadow || mask)) {
- kvm_x86_ops->set_interrupt_shadow(vcpu, mask);
+ kvm_x86_ops.set_interrupt_shadow(vcpu, mask);
if (!mask)
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
@@ -6377,9 +6987,9 @@
static bool inject_emulated_exception(struct kvm_vcpu *vcpu)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
- return kvm_propagate_fault(vcpu, &ctxt->exception);
+ return kvm_inject_emulated_page_fault(vcpu, &ctxt->exception);
if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
@@ -6389,13 +6999,31 @@
return false;
}
+static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu)
+{
+ struct x86_emulate_ctxt *ctxt;
+
+ ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
+ if (!ctxt) {
+ pr_err("kvm: failed to allocate vcpu's emulator\n");
+ return NULL;
+ }
+
+ ctxt->vcpu = vcpu;
+ ctxt->ops = &emulate_ops;
+ vcpu->arch.emulate_ctxt = ctxt;
+
+ return ctxt;
+}
+
static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int cs_db, cs_l;
- kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ kvm_x86_ops.get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
+ ctxt->gpa_available = false;
ctxt->eflags = kvm_get_rflags(vcpu);
ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0;
@@ -6409,13 +7037,18 @@
BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK);
BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK);
+ ctxt->interruptibility = 0;
+ ctxt->have_exception = false;
+ ctxt->exception.vector = -1;
+ ctxt->perm_ok = false;
+
init_decode_cache(ctxt);
vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
}
void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
@@ -6454,7 +7087,7 @@
kvm_queue_exception(vcpu, UD_VECTOR);
- if (!is_guest_mode(vcpu) && kvm_x86_ops->get_cpl(vcpu) == 0) {
+ if (!is_guest_mode(vcpu) && kvm_x86_ops.get_cpl(vcpu) == 0) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
vcpu->run->internal.ndata = 0;
@@ -6471,10 +7104,11 @@
gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
if (!vcpu->arch.mmu->direct_map) {
@@ -6562,10 +7196,11 @@
*/
vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0;
- if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
+ if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF))
return false;
- if (WARN_ON_ONCE(is_guest_mode(vcpu)))
+ if (WARN_ON_ONCE(is_guest_mode(vcpu)) ||
+ WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF)))
return false;
if (x86_page_table_writing_insn(ctxt))
@@ -6622,7 +7257,7 @@
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
kvm_run->debug.arch.dr6 = DR6_BS | DR6_FIXED_1 | DR6_RTM;
- kvm_run->debug.arch.pc = vcpu->arch.singlestep_rip;
+ kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
kvm_run->debug.arch.exception = DB_VECTOR;
kvm_run->exit_reason = KVM_EXIT_DEBUG;
return 0;
@@ -6633,10 +7268,10 @@
int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
int r;
- r = kvm_x86_ops->skip_emulated_instruction(vcpu);
+ r = kvm_x86_ops.skip_emulated_instruction(vcpu);
if (unlikely(!r))
return 0;
@@ -6682,9 +7317,7 @@
vcpu->arch.db);
if (dr6 != 0) {
- vcpu->arch.dr6 &= ~DR_TRAP_BITS;
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
*r = 1;
return true;
}
@@ -6724,13 +7357,47 @@
return false;
}
+/*
+ * Decode to be emulated instruction. Return EMULATION_OK if success.
+ */
+int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
+ void *insn, int insn_len)
+{
+ int r = EMULATION_OK;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+
+ init_emulate_ctxt(vcpu);
+
+ /*
+ * We will reenter on the same instruction since we do not set
+ * complete_userspace_io. This does not handle watchpoints yet,
+ * those would be handled in the emulate_ops.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_breakpoint(vcpu, &r))
+ return r;
+
+ ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
+
+ r = x86_decode_insn(ctxt, insn, insn_len);
+
+ trace_kvm_emulate_insn_start(vcpu);
+ ++vcpu->stat.insn_emulation;
+
+ return r;
+}
+EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction);
+
int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len)
{
int r;
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
bool writeback = true;
- bool write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
+ bool write_fault_to_spt;
+
+ if (unlikely(!kvm_x86_ops.can_emulate_instruction(vcpu, insn, insn_len)))
+ return 1;
vcpu->arch.l1tf_flush_l1d = true;
@@ -6738,33 +7405,14 @@
* Clear write_fault_to_shadow_pgtable here to ensure it is
* never reused.
*/
+ write_fault_to_spt = vcpu->arch.write_fault_to_shadow_pgtable;
vcpu->arch.write_fault_to_shadow_pgtable = false;
- kvm_clear_exception_queue(vcpu);
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
- init_emulate_ctxt(vcpu);
+ kvm_clear_exception_queue(vcpu);
- /*
- * We will reenter on the same instruction since
- * we do not set complete_userspace_io. This does not
- * handle watchpoints yet, those would be handled in
- * the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
-
- ctxt->interruptibility = 0;
- ctxt->have_exception = false;
- ctxt->exception.vector = -1;
- ctxt->perm_ok = false;
-
- ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
-
- r = x86_decode_insn(ctxt, insn, insn_len);
-
- trace_kvm_emulate_insn_start(vcpu);
- ++vcpu->stat.insn_emulation;
+ r = x86_decode_emulated_instruction(vcpu, emulation_type,
+ insn, insn_len);
if (r != EMULATION_OK) {
if ((emulation_type & EMULTYPE_TRAP_UD) ||
(emulation_type & EMULTYPE_TRAP_UD_FORCED)) {
@@ -6818,8 +7466,19 @@
}
restart:
- /* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2_or_gpa;
+ if (emulation_type & EMULTYPE_PF) {
+ /* Save the faulting GPA (cr2) in the address field */
+ ctxt->exception.address = cr2_or_gpa;
+
+ /* With shadow page tables, cr2 contains a GVA or nGPA. */
+ if (vcpu->arch.mmu->direct_map) {
+ ctxt->gpa_available = true;
+ ctxt->gpa_val = cr2_or_gpa;
+ }
+ } else {
+ /* Sanitize the address out of an abundance of paranoia. */
+ ctxt->exception.address = 0;
+ }
r = x86_emulate_insn(ctxt);
@@ -6860,7 +7519,7 @@
r = 1;
if (writeback) {
- unsigned long rflags = kvm_x86_ops->get_rflags(vcpu);
+ unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
if (!ctxt->have_exception ||
@@ -6868,6 +7527,8 @@
kvm_rip_write(vcpu, ctxt->eip);
if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
+ if (kvm_x86_ops.update_emulated_instruction)
+ kvm_x86_ops.update_emulated_instruction(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -6918,8 +7579,8 @@
unsigned short port)
{
unsigned long val = kvm_rax_read(vcpu);
- int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
- size, port, &val, 1);
+ int ret = emulator_pio_out(vcpu, size, port, &val, 1);
+
if (ret)
return ret;
@@ -6955,11 +7616,10 @@
val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0;
/*
- * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform
+ * Since vcpu->arch.pio.count == 1 let emulator_pio_in perform
* the copy and tracing
*/
- emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size,
- vcpu->arch.pio.port, &val, 1);
+ emulator_pio_in(vcpu, vcpu->arch.pio.size, vcpu->arch.pio.port, &val, 1);
kvm_rax_write(vcpu, val);
return kvm_skip_emulated_instruction(vcpu);
@@ -6974,8 +7634,7 @@
/* For size less than 4 we merge, else we zero extend */
val = (size < 4) ? kvm_rax_read(vcpu) : 0;
- ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port,
- &val, 1);
+ ret = emulator_pio_in(vcpu, size, port, &val, 1);
if (ret) {
kvm_rax_write(vcpu, val);
return ret;
@@ -7165,14 +7824,16 @@
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
#ifdef CONFIG_CPU_FREQ
- struct cpufreq_policy policy;
+ struct cpufreq_policy *policy;
int cpu;
- memset(&policy, 0, sizeof(policy));
cpu = get_cpu();
- cpufreq_get_policy(&policy, cpu);
- if (policy.cpuinfo.max_freq)
- max_tsc_khz = policy.cpuinfo.max_freq;
+ policy = cpufreq_cpu_get(cpu);
+ if (policy) {
+ if (policy->cpuinfo.max_freq)
+ max_tsc_khz = policy->cpuinfo.max_freq;
+ cpufreq_cpu_put(policy);
+ }
put_cpu();
#endif
cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
@@ -7196,7 +7857,7 @@
int user_mode = 3;
if (__this_cpu_read(current_vcpu))
- user_mode = kvm_x86_ops->get_cpl(__this_cpu_read(current_vcpu));
+ user_mode = kvm_x86_ops.get_cpl(__this_cpu_read(current_vcpu));
return user_mode != 0;
}
@@ -7224,7 +7885,7 @@
.is_in_guest = kvm_is_in_guest,
.is_user_mode = kvm_is_user_mode,
.get_guest_ip = kvm_get_guest_ip,
- .handle_intel_pt_intr = kvm_handle_intel_pt_intr,
+ .handle_intel_pt_intr = NULL,
};
#ifdef CONFIG_X86_64
@@ -7246,6 +7907,18 @@
static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn);
/*
+ * Indirection to move queue_work() out of the tk_core.seq write held
+ * region to prevent possible deadlocks against time accessors which
+ * are invoked with work related locks held.
+ */
+static void pvclock_irq_work_fn(struct irq_work *w)
+{
+ queue_work(system_long_wq, &pvclock_gtod_work);
+}
+
+static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn);
+
+/*
* Notification about pvclock gtod data update.
*/
static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused,
@@ -7256,13 +7929,14 @@
update_pvclock_gtod(tk);
- /* disable master clock if host does not trust, or does not
- * use, TSC based clocksource.
+ /*
+ * Disable master clock if host does not trust, or does not use,
+ * TSC based clocksource. Delegate queue_work() to irq_work as
+ * this is invoked with tk_core.seq write held.
*/
if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
atomic_read(&kvm_guest_has_master_clock) != 0)
- queue_work(system_long_wq, &pvclock_gtod_work);
-
+ irq_work_queue(&pvclock_irq_work);
return 0;
}
@@ -7273,22 +7947,22 @@
int kvm_arch_init(void *opaque)
{
+ struct kvm_x86_init_ops *ops = opaque;
int r;
- struct kvm_x86_ops *ops = opaque;
- if (kvm_x86_ops) {
+ if (kvm_x86_ops.hardware_enable) {
printk(KERN_ERR "kvm: already loaded the other module\n");
r = -EEXIST;
goto out;
}
if (!ops->cpu_has_kvm_support()) {
- printk(KERN_ERR "kvm: no hardware support\n");
+ pr_err_ratelimited("kvm: no hardware support\n");
r = -EOPNOTSUPP;
goto out;
}
if (ops->disabled_by_bios()) {
- printk(KERN_ERR "kvm: disabled by bios\n");
+ pr_err_ratelimited("kvm: disabled by bios\n");
r = -EOPNOTSUPP;
goto out;
}
@@ -7313,27 +7987,35 @@
goto out;
}
- shared_msrs = alloc_percpu(struct kvm_shared_msrs);
- if (!shared_msrs) {
- printk(KERN_ERR "kvm: failed to allocate percpu kvm_shared_msrs\n");
+ x86_emulator_cache = kvm_alloc_emulator_cache();
+ if (!x86_emulator_cache) {
+ pr_err("kvm: failed to allocate cache for x86 emulator\n");
goto out_free_x86_fpu_cache;
}
+ user_return_msrs = alloc_percpu(struct kvm_user_return_msrs);
+ if (!user_return_msrs) {
+ printk(KERN_ERR "kvm: failed to allocate percpu kvm_user_return_msrs\n");
+ goto out_free_x86_emulator_cache;
+ }
+
r = kvm_mmu_module_init();
if (r)
goto out_free_percpu;
- kvm_x86_ops = ops;
-
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
PT_DIRTY_MASK, PT64_NX_MASK, 0,
PT_PRESENT_MASK, 0, sme_me_mask);
kvm_timer_init();
+ if (ops->intel_pt_intr_in_guest && ops->intel_pt_intr_in_guest())
+ kvm_guest_cbs.handle_intel_pt_intr = kvm_handle_intel_pt_intr;
perf_register_guest_info_callbacks(&kvm_guest_cbs);
- if (boot_cpu_has(X86_FEATURE_XSAVE))
+ if (boot_cpu_has(X86_FEATURE_XSAVE)) {
host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK);
+ supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0;
+ }
kvm_lapic_init();
if (pi_inject_timer == -1)
@@ -7348,7 +8030,9 @@
return 0;
out_free_percpu:
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
+out_free_x86_emulator_cache:
+ kmem_cache_destroy(x86_emulator_cache);
out_free_x86_fpu_cache:
kmem_cache_destroy(x86_fpu_cache);
out:
@@ -7363,6 +8047,7 @@
#endif
kvm_lapic_exit();
perf_unregister_guest_info_callbacks(&kvm_guest_cbs);
+ kvm_guest_cbs.handle_intel_pt_intr = NULL;
if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
@@ -7370,11 +8055,13 @@
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ irq_work_sync(&pvclock_irq_work);
cancel_work_sync(&pvclock_gtod_work);
#endif
- kvm_x86_ops = NULL;
+ kvm_x86_ops.hardware_enable = NULL;
kvm_mmu_module_exit();
- free_percpu(shared_msrs);
+ free_percpu(user_return_msrs);
+ kmem_cache_destroy(x86_emulator_cache);
kmem_cache_destroy(x86_fpu_cache);
}
@@ -7441,8 +8128,8 @@
{
struct kvm_lapic_irq lapic_irq;
- lapic_irq.shorthand = 0;
- lapic_irq.dest_mode = 0;
+ lapic_irq.shorthand = APIC_DEST_NOSHORT;
+ lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
lapic_irq.level = 0;
lapic_irq.dest_id = apicid;
lapic_irq.msi_redir_hint = false;
@@ -7451,18 +8138,22 @@
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
-void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+bool kvm_apicv_activated(struct kvm *kvm)
{
- if (!lapic_in_kernel(vcpu)) {
- WARN_ON_ONCE(vcpu->arch.apicv_active);
- return;
- }
- if (!vcpu->arch.apicv_active)
- return;
-
- vcpu->arch.apicv_active = false;
- kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+ return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0);
}
+EXPORT_SYMBOL_GPL(kvm_apicv_activated);
+
+void kvm_apicv_init(struct kvm *kvm, bool enable)
+{
+ if (enable)
+ clear_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+ else
+ set_bit(APICV_INHIBIT_REASON_DISABLE,
+ &kvm->arch.apicv_inhibit_reasons);
+}
+EXPORT_SYMBOL_GPL(kvm_apicv_init);
static void kvm_sched_yield(struct kvm *kvm, unsigned long dest_id)
{
@@ -7506,16 +8197,21 @@
a3 &= 0xFFFFFFFF;
}
- if (kvm_x86_ops->get_cpl(vcpu) != 0) {
+ if (kvm_x86_ops.get_cpl(vcpu) != 0) {
ret = -KVM_EPERM;
goto out;
}
+ ret = -KVM_ENOSYS;
+
switch (nr) {
case KVM_HC_VAPIC_POLL_IRQ:
ret = 0;
break;
case KVM_HC_KICK_CPU:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT))
+ break;
+
kvm_pv_kick_cpu_op(vcpu->kvm, a0, a1);
kvm_sched_yield(vcpu->kvm, a1);
ret = 0;
@@ -7526,9 +8222,15 @@
break;
#endif
case KVM_HC_SEND_IPI:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI))
+ break;
+
ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
break;
case KVM_HC_SCHED_YIELD:
+ if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD))
+ break;
+
kvm_sched_yield(vcpu->kvm, a0);
ret = 0;
break;
@@ -7552,7 +8254,7 @@
char instruction[3];
unsigned long rip = kvm_rip_read(vcpu);
- kvm_x86_ops->patch_hypercall(vcpu, instruction);
+ kvm_x86_ops.patch_hypercall(vcpu, instruction);
return emulator_write_emulated(ctxt, rip, instruction, 3,
&ctxt->exception);
@@ -7581,7 +8283,7 @@
{
int max_irr, tpr;
- if (!kvm_x86_ops->update_cr8_intercept)
+ if (!kvm_x86_ops.update_cr8_intercept)
return;
if (!lapic_in_kernel(vcpu))
@@ -7600,24 +8302,27 @@
tpr = kvm_lapic_get_cr8(vcpu);
- kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
+ kvm_x86_ops.update_cr8_intercept(vcpu, tpr, max_irr);
}
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
- if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
- vcpu->arch.exception.error_code = false;
- kvm_x86_ops->queue_exception(vcpu);
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops.queue_exception(vcpu);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu)
+static void inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit)
{
int r;
+ bool can_inject = true;
/* try to reinject previous events if any */
- if (vcpu->arch.exception.injected)
+ if (vcpu->arch.exception.injected) {
kvm_inject_exception(vcpu);
+ can_inject = false;
+ }
/*
* Do not inject an NMI or interrupt if there is a pending
* exception. Exceptions and interrupts are recognized at
@@ -7633,22 +8338,28 @@
* fully complete the previous instruction.
*/
else if (!vcpu->arch.exception.pending) {
- if (vcpu->arch.nmi_injected)
- kvm_x86_ops->set_nmi(vcpu);
- else if (vcpu->arch.interrupt.injected)
- kvm_x86_ops->set_irq(vcpu);
+ if (vcpu->arch.nmi_injected) {
+ kvm_x86_ops.set_nmi(vcpu);
+ can_inject = false;
+ } else if (vcpu->arch.interrupt.injected) {
+ kvm_x86_ops.set_irq(vcpu);
+ can_inject = false;
+ }
}
+ WARN_ON_ONCE(vcpu->arch.exception.injected &&
+ vcpu->arch.exception.pending);
+
/*
* Call check_nested_events() even if we reinjected a previous event
* in order for caller to determine if it should require immediate-exit
* from L2 to L1 due to pending L1 events which require exit
* from L2 to L1.
*/
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu);
- if (r != 0)
- return r;
+ if (is_guest_mode(vcpu)) {
+ r = kvm_x86_ops.nested_ops->check_events(vcpu);
+ if (r < 0)
+ goto busy;
}
/* try to inject new event if pending */
@@ -7657,7 +8368,6 @@
vcpu->arch.exception.has_error_code,
vcpu->arch.exception.error_code);
- WARN_ON_ONCE(vcpu->arch.exception.injected);
vcpu->arch.exception.pending = false;
vcpu->arch.exception.injected = true;
@@ -7666,16 +8376,6 @@
X86_EFLAGS_RF);
if (vcpu->arch.exception.nr == DB_VECTOR) {
- /*
- * This code assumes that nSVM doesn't use
- * check_nested_events(). If it does, the
- * DR6/DR7 changes should happen before L1
- * gets a #VMEXIT for an intercepted #DB in
- * L2. (Under VMX, on the other hand, the
- * DR6/DR7 changes should not happen in the
- * event of a VM-exit to L1 for an intercepted
- * #DB in L2.)
- */
kvm_deliver_exception_payload(vcpu);
if (vcpu->arch.dr7 & DR7_GD) {
vcpu->arch.dr7 &= ~DR7_GD;
@@ -7684,42 +8384,72 @@
}
kvm_inject_exception(vcpu);
+ can_inject = false;
}
- /* Don't consider new event if we re-injected an event */
- if (kvm_event_needs_reinjection(vcpu))
- return 0;
-
- if (vcpu->arch.smi_pending && !is_smm(vcpu) &&
- kvm_x86_ops->smi_allowed(vcpu)) {
- vcpu->arch.smi_pending = false;
- ++vcpu->arch.smi_count;
- enter_smm(vcpu);
- } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
- --vcpu->arch.nmi_pending;
- vcpu->arch.nmi_injected = true;
- kvm_x86_ops->set_nmi(vcpu);
- } else if (kvm_cpu_has_injectable_intr(vcpu)) {
- /*
- * Because interrupts can be injected asynchronously, we are
- * calling check_nested_events again here to avoid a race condition.
- * See https://lkml.org/lkml/2014/7/2/60 for discussion about this
- * proposal and current concerns. Perhaps we should be setting
- * KVM_REQ_EVENT only on certain events and not unconditionally?
- */
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu);
- if (r != 0)
- return r;
- }
- if (kvm_x86_ops->interrupt_allowed(vcpu)) {
- kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
- false);
- kvm_x86_ops->set_irq(vcpu);
- }
+ /*
+ * Finally, inject interrupt events. If an event cannot be injected
+ * due to architectural conditions (e.g. IF=0) a window-open exit
+ * will re-request KVM_REQ_EVENT. Sometimes however an event is pending
+ * and can architecturally be injected, but we cannot do it right now:
+ * an interrupt could have arrived just now and we have to inject it
+ * as a vmexit, or there could already an event in the queue, which is
+ * indicated by can_inject. In that case we request an immediate exit
+ * in order to make progress and get back here for another iteration.
+ * The kvm_x86_ops hooks communicate this by returning -EBUSY.
+ */
+ if (vcpu->arch.smi_pending) {
+ r = can_inject ? kvm_x86_ops.smi_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ vcpu->arch.smi_pending = false;
+ ++vcpu->arch.smi_count;
+ enter_smm(vcpu);
+ can_inject = false;
+ } else
+ kvm_x86_ops.enable_smi_window(vcpu);
}
- return 0;
+ if (vcpu->arch.nmi_pending) {
+ r = can_inject ? kvm_x86_ops.nmi_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ --vcpu->arch.nmi_pending;
+ vcpu->arch.nmi_injected = true;
+ kvm_x86_ops.set_nmi(vcpu);
+ can_inject = false;
+ WARN_ON(kvm_x86_ops.nmi_allowed(vcpu, true) < 0);
+ }
+ if (vcpu->arch.nmi_pending)
+ kvm_x86_ops.enable_nmi_window(vcpu);
+ }
+
+ if (kvm_cpu_has_injectable_intr(vcpu)) {
+ r = can_inject ? kvm_x86_ops.interrupt_allowed(vcpu, true) : -EBUSY;
+ if (r < 0)
+ goto busy;
+ if (r) {
+ kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu), false);
+ kvm_x86_ops.set_irq(vcpu);
+ WARN_ON(kvm_x86_ops.interrupt_allowed(vcpu, true) < 0);
+ }
+ if (kvm_cpu_has_injectable_intr(vcpu))
+ kvm_x86_ops.enable_irq_window(vcpu);
+ }
+
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ *req_immediate_exit = true;
+
+ WARN_ON(vcpu->arch.exception.pending);
+ return;
+
+busy:
+ *req_immediate_exit = true;
+ return;
}
static void process_nmi(struct kvm_vcpu *vcpu)
@@ -7731,7 +8461,7 @@
* If an NMI is already in progress, limit further NMIs to just one.
* Otherwise, allow two (and we'll inject the first one immediately).
*/
- if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
+ if (kvm_x86_ops.get_nmi_mask(vcpu) || vcpu->arch.nmi_injected)
limit = 1;
vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
@@ -7821,11 +8551,11 @@
put_smstate(u32, buf, 0x7f7c, seg.limit);
put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
put_smstate(u32, buf, 0x7f70, dt.size);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7f58, dt.address);
put_smstate(u32, buf, 0x7f54, dt.size);
@@ -7875,7 +8605,7 @@
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
put_smstate(u32, buf, 0x7e84, dt.size);
put_smstate(u64, buf, 0x7e88, dt.address);
@@ -7885,7 +8615,7 @@
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7e64, dt.size);
put_smstate(u64, buf, 0x7e68, dt.address);
@@ -7915,28 +8645,28 @@
* vCPU state (e.g. leave guest mode) after we've saved the state into
* the SMM state-save area.
*/
- kvm_x86_ops->pre_enter_smm(vcpu, buf);
+ kvm_x86_ops.pre_enter_smm(vcpu, buf);
vcpu->arch.hflags |= HF_SMM_MASK;
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
- if (kvm_x86_ops->get_nmi_mask(vcpu))
+ if (kvm_x86_ops.get_nmi_mask(vcpu))
vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK;
else
- kvm_x86_ops->set_nmi_mask(vcpu, true);
+ kvm_x86_ops.set_nmi_mask(vcpu, true);
kvm_set_rflags(vcpu, X86_EFLAGS_FIXED);
kvm_rip_write(vcpu, 0x8000);
cr0 = vcpu->arch.cr0 & ~(X86_CR0_PE | X86_CR0_EM | X86_CR0_TS | X86_CR0_PG);
- kvm_x86_ops->set_cr0(vcpu, cr0);
+ kvm_x86_ops.set_cr0(vcpu, cr0);
vcpu->arch.cr0 = cr0;
- kvm_x86_ops->set_cr4(vcpu, 0);
+ kvm_x86_ops.set_cr4(vcpu, 0);
/* Undocumented: IDT limit is set to zero on entry to SMM. */
dt.address = dt.size = 0;
- kvm_x86_ops->set_idt(vcpu, &dt);
+ kvm_x86_ops.set_idt(vcpu, &dt);
__kvm_set_dr(vcpu, 7, DR7_FIXED_1);
@@ -7967,10 +8697,10 @@
#ifdef CONFIG_X86_64
if (guest_cpuid_has(vcpu, X86_FEATURE_LM))
- kvm_x86_ops->set_efer(vcpu, 0);
+ kvm_x86_ops.set_efer(vcpu, 0);
#endif
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
kvm_mmu_reset_context(vcpu);
}
@@ -7980,11 +8710,83 @@
kvm_make_request(KVM_REQ_EVENT, vcpu);
}
+void kvm_make_scan_ioapic_request_mask(struct kvm *kvm,
+ unsigned long *vcpu_bitmap)
+{
+ cpumask_var_t cpus;
+
+ zalloc_cpumask_var(&cpus, GFP_ATOMIC);
+
+ kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC,
+ NULL, vcpu_bitmap, cpus);
+
+ free_cpumask_var(cpus);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}
+void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu)
+{
+ if (!lapic_in_kernel(vcpu))
+ return;
+
+ vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm);
+ kvm_apic_update_apicv(vcpu);
+ kvm_x86_ops.refresh_apicv_exec_ctrl(vcpu);
+}
+EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv);
+
+/*
+ * NOTE: Do not hold any lock prior to calling this.
+ *
+ * In particular, kvm_request_apicv_update() expects kvm->srcu not to be
+ * locked, because it calls __x86_set_memory_region() which does
+ * synchronize_srcu(&kvm->srcu).
+ */
+void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit)
+{
+ struct kvm_vcpu *except;
+ unsigned long old, new, expected;
+
+ if (!kvm_x86_ops.check_apicv_inhibit_reasons ||
+ !kvm_x86_ops.check_apicv_inhibit_reasons(bit))
+ return;
+
+ old = READ_ONCE(kvm->arch.apicv_inhibit_reasons);
+ do {
+ expected = new = old;
+ if (activate)
+ __clear_bit(bit, &new);
+ else
+ __set_bit(bit, &new);
+ if (new == old)
+ break;
+ old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new);
+ } while (old != expected);
+
+ if (!!old == !!new)
+ return;
+
+ trace_kvm_apicv_update_request(activate, bit);
+ if (kvm_x86_ops.pre_update_apicv_exec_ctrl)
+ kvm_x86_ops.pre_update_apicv_exec_ctrl(kvm, activate);
+
+ /*
+ * Sending request to update APICV for all other vcpus,
+ * while update the calling vcpu immediately instead of
+ * waiting for another #VMEXIT to handle the request.
+ */
+ except = kvm_get_running_vcpu();
+ kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE,
+ except);
+ if (except)
+ kvm_vcpu_update_apicv(except);
+}
+EXPORT_SYMBOL_GPL(kvm_request_apicv_update);
+
static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
{
if (!kvm_apic_present(vcpu))
@@ -7996,7 +8798,7 @@
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -8016,7 +8818,7 @@
bitmap_or((ulong *)eoi_exit_bitmap, vcpu->arch.ioapic_handled_vectors,
vcpu_to_synic(vcpu)->vec_bitmap, 256);
- kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
+ kvm_x86_ops.load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
@@ -8035,26 +8837,14 @@
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
- struct page *page = NULL;
-
if (!lapic_in_kernel(vcpu))
return;
- if (!kvm_x86_ops->set_apic_access_page_addr)
+ if (!kvm_x86_ops.set_apic_access_page_addr)
return;
- page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
- if (is_error_page(page))
- return;
- kvm_x86_ops->set_apic_access_page_addr(vcpu, page_to_phys(page));
-
- /*
- * Do not pin apic access page in memory, the MMU notifier
- * will call us again if it is migrated or swapped out.
- */
- put_page(page);
+ kvm_x86_ops.set_apic_access_page_addr(vcpu);
}
-EXPORT_SYMBOL_GPL(kvm_vcpu_reload_apic_access_page);
void __kvm_request_immediate_exit(struct kvm_vcpu *vcpu)
{
@@ -8073,12 +8863,13 @@
bool req_int_win =
dm_request_for_irq_injection(vcpu) &&
kvm_cpu_accept_dm_intr(vcpu);
+ fastpath_t exit_fastpath;
bool req_immediate_exit = false;
if (kvm_request_pending(vcpu)) {
- if (kvm_check_request(KVM_REQ_GET_VMCS12_PAGES, vcpu)) {
- if (unlikely(!kvm_x86_ops->get_vmcs12_pages(vcpu))) {
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) {
r = 0;
goto out;
}
@@ -8098,10 +8889,19 @@
}
if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
kvm_mmu_sync_roots(vcpu);
- if (kvm_check_request(KVM_REQ_LOAD_CR3, vcpu))
- kvm_mmu_load_cr3(vcpu);
- if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
- kvm_vcpu_flush_tlb(vcpu, true);
+ if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu))
+ kvm_mmu_load_pgd(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) {
+ kvm_vcpu_flush_tlb_all(vcpu);
+
+ /* Flushing all ASIDs flushes the current ASID... */
+ kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
+ kvm_vcpu_flush_tlb_guest(vcpu);
+
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
r = 0;
@@ -8172,6 +8972,12 @@
*/
if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
kvm_hv_process_stimers(vcpu);
+ if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu))
+ kvm_vcpu_update_apicv(vcpu);
+ if (kvm_check_request(KVM_REQ_APF_READY, vcpu))
+ kvm_check_async_pf_completion(vcpu);
+ if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu))
+ kvm_x86_ops.msr_filter_changed(vcpu);
}
if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
@@ -8182,32 +8988,9 @@
goto out;
}
- if (inject_pending_event(vcpu) != 0)
- req_immediate_exit = true;
- else {
- /* Enable SMI/NMI/IRQ window open exits if needed.
- *
- * SMIs have three cases:
- * 1) They can be nested, and then there is nothing to
- * do here because RSM will cause a vmexit anyway.
- * 2) There is an ISA-specific reason why SMI cannot be
- * injected, and the moment when this changes can be
- * intercepted.
- * 3) Or the SMI can be pending because
- * inject_pending_event has completed the injection
- * of an IRQ or NMI from the previous vmexit, and
- * then we request an immediate exit to inject the
- * SMI.
- */
- if (vcpu->arch.smi_pending && !is_smm(vcpu))
- if (!kvm_x86_ops->enable_smi_window(vcpu))
- req_immediate_exit = true;
- if (vcpu->arch.nmi_pending)
- kvm_x86_ops->enable_nmi_window(vcpu);
- if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
- kvm_x86_ops->enable_irq_window(vcpu);
- WARN_ON(vcpu->arch.exception.pending);
- }
+ inject_pending_event(vcpu, &req_immediate_exit);
+ if (req_int_win)
+ kvm_x86_ops.enable_irq_window(vcpu);
if (kvm_lapic_enabled(vcpu)) {
update_cr8_intercept(vcpu);
@@ -8222,7 +9005,7 @@
preempt_disable();
- kvm_x86_ops->prepare_guest_switch(vcpu);
+ kvm_x86_ops.prepare_guest_switch(vcpu);
/*
* Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
@@ -8253,10 +9036,9 @@
* notified with kvm_vcpu_kick.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
- kvm_x86_ops->sync_pir_to_irr(vcpu);
+ kvm_x86_ops.sync_pir_to_irr(vcpu);
- if (vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu)
- || need_resched() || signal_pending(current)) {
+ if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
local_irq_enable();
@@ -8268,14 +9050,10 @@
if (req_immediate_exit) {
kvm_make_request(KVM_REQ_EVENT, vcpu);
- kvm_x86_ops->request_immediate_exit(vcpu);
+ kvm_x86_ops.request_immediate_exit(vcpu);
}
- trace_kvm_entry(vcpu->vcpu_id);
- guest_enter_irqoff();
-
- /* Save host pkru register if supported */
- vcpu->arch.host_pkru = read_pkru();
+ trace_kvm_entry(vcpu);
fpregs_assert_state_consistent();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
@@ -8293,7 +9071,7 @@
set_debugreg(0, 7);
}
- kvm_x86_ops->run(vcpu);
+ exit_fastpath = kvm_x86_ops.run(vcpu);
/*
* Do this here before restoring debug registers on the host. And
@@ -8303,9 +9081,8 @@
*/
if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) {
WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP);
- kvm_x86_ops->sync_dirty_debug_regs(vcpu);
+ kvm_x86_ops.sync_dirty_debug_regs(vcpu);
kvm_update_dr0123(vcpu);
- kvm_update_dr6(vcpu);
kvm_update_dr7(vcpu);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
}
@@ -8320,12 +9097,13 @@
if (hw_breakpoint_active())
hw_breakpoint_restore();
+ vcpu->arch.last_vmentry_cpu = vcpu->cpu;
vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
vcpu->mode = OUTSIDE_GUEST_MODE;
smp_wmb();
- kvm_x86_ops->handle_exit_irqoff(vcpu);
+ kvm_x86_ops.handle_exit_irqoff(vcpu);
/*
* Consume any pending interrupts, including the possible source of
@@ -8340,7 +9118,15 @@
local_irq_disable();
kvm_after_interrupt(vcpu);
- guest_exit_irqoff();
+ /*
+ * Wait until after servicing IRQs to account guest time so that any
+ * ticks that occurred while running the guest are properly accounted
+ * to the guest. Waiting until IRQs are enabled degrades the accuracy
+ * of accounting via context tracking, but the loss of accuracy is
+ * acceptable for all known use cases.
+ */
+ vtime_account_guest_exit();
+
if (lapic_in_kernel(vcpu)) {
s64 delta = vcpu->arch.apic->lapic_timer.advance_expire_delta;
if (delta != S64_MIN) {
@@ -8368,12 +9154,13 @@
if (vcpu->arch.apic_attention)
kvm_lapic_sync_from_vapic(vcpu);
- vcpu->arch.gpa_available = false;
- r = kvm_x86_ops->handle_exit(vcpu);
+ r = kvm_x86_ops.handle_exit(vcpu, exit_fastpath);
return r;
cancel_injection:
- kvm_x86_ops->cancel_injection(vcpu);
+ if (req_immediate_exit)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_x86_ops.cancel_injection(vcpu);
if (unlikely(vcpu->arch.apic_attention))
kvm_lapic_sync_from_vapic(vcpu);
out:
@@ -8383,13 +9170,13 @@
static inline int vcpu_block(struct kvm *kvm, struct kvm_vcpu *vcpu)
{
if (!kvm_arch_vcpu_runnable(vcpu) &&
- (!kvm_x86_ops->pre_block || kvm_x86_ops->pre_block(vcpu) == 0)) {
+ (!kvm_x86_ops.pre_block || kvm_x86_ops.pre_block(vcpu) == 0)) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
kvm_vcpu_block(vcpu);
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
- if (kvm_x86_ops->post_block)
- kvm_x86_ops->post_block(vcpu);
+ if (kvm_x86_ops.post_block)
+ kvm_x86_ops.post_block(vcpu);
if (!kvm_check_request(KVM_REQ_UNHALT, vcpu))
return 1;
@@ -8401,7 +9188,7 @@
vcpu->arch.pv.pv_unhalted = false;
vcpu->arch.mp_state =
KVM_MP_STATE_RUNNABLE;
- /* fall through */
+ fallthrough;
case KVM_MP_STATE_RUNNABLE:
vcpu->arch.apf.halted = false;
break;
@@ -8409,15 +9196,14 @@
break;
default:
return -EINTR;
- break;
}
return 1;
}
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
- if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu);
+ if (is_guest_mode(vcpu))
+ kvm_x86_ops.nested_ops->check_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -8453,17 +9239,11 @@
break;
}
- kvm_check_async_pf_completion(vcpu);
-
- if (signal_pending(current)) {
- r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
- ++vcpu->stat.signal_exits;
- break;
- }
- if (need_resched()) {
+ if (__xfer_to_guest_mode_work_pending()) {
srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
- cond_resched();
+ r = xfer_to_guest_mode_handle_work(vcpu);
+ if (r)
+ return r;
vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
}
}
@@ -8573,7 +9353,7 @@
kvm_save_current_fpu(vcpu->arch.user_fpu);
- /* PKRU is separately restored in kvm_x86_ops->run. */
+ /* PKRU is separately restored in kvm_x86_ops.run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
@@ -8599,8 +9379,9 @@
trace_kvm_fpu(0);
}
-int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
+int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu)
{
+ struct kvm_run *kvm_run = vcpu->run;
int r;
vcpu_load(vcpu);
@@ -8618,18 +9399,18 @@
r = -EAGAIN;
if (signal_pending(current)) {
r = -EINTR;
- vcpu->run->exit_reason = KVM_EXIT_INTR;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
++vcpu->stat.signal_exits;
}
goto out;
}
- if (vcpu->run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
+ if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) {
r = -EINVAL;
goto out;
}
- if (vcpu->run->kvm_dirty_regs) {
+ if (kvm_run->kvm_dirty_regs) {
r = sync_regs(vcpu);
if (r != 0)
goto out;
@@ -8659,7 +9440,7 @@
out:
kvm_put_guest_fpu(vcpu);
- if (vcpu->run->kvm_valid_regs)
+ if (kvm_run->kvm_valid_regs)
store_regs(vcpu);
post_kvm_run_save(vcpu);
kvm_sigset_deactivate(vcpu);
@@ -8678,7 +9459,7 @@
* that usually, but some bad designed PV devices (vmware
* backdoor interface) need this to work
*/
- emulator_writeback_register_cache(&vcpu->arch.emulate_ctxt);
+ emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
}
regs->rax = kvm_rax_read(vcpu);
@@ -8776,10 +9557,10 @@
kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
- kvm_x86_ops->get_idt(vcpu, &dt);
+ kvm_x86_ops.get_idt(vcpu, &dt);
sregs->idt.limit = dt.size;
sregs->idt.base = dt.address;
- kvm_x86_ops->get_gdt(vcpu, &dt);
+ kvm_x86_ops.get_gdt(vcpu, &dt);
sregs->gdt.limit = dt.size;
sregs->gdt.base = dt.address;
@@ -8838,8 +9619,12 @@
mp_state->mp_state != KVM_MP_STATE_RUNNABLE)
goto out;
- /* INITs are latched while in SMM */
- if ((is_smm(vcpu) || vcpu->arch.smi_pending) &&
+ /*
+ * KVM_MP_STATE_INIT_RECEIVED means the processor is in
+ * INIT state; latched init should be reported using
+ * KVM_SET_VCPU_EVENTS, so reject it here.
+ */
+ if ((kvm_vcpu_latch_init(vcpu) || vcpu->arch.smi_pending) &&
(mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED ||
mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED))
goto out;
@@ -8860,7 +9645,7 @@
int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index,
int reason, bool has_error_code, u32 error_code)
{
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
int ret;
init_emulate_ctxt(vcpu);
@@ -8876,7 +9661,6 @@
kvm_rip_write(vcpu, ctxt->eip);
kvm_set_rflags(vcpu, ctxt->eflags);
- kvm_make_request(KVM_REQ_EVENT, vcpu);
return 1;
}
EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -8892,6 +9676,8 @@
if (!(sregs->cr4 & X86_CR4_PAE)
|| !(sregs->efer & EFER_LMA))
return -EINVAL;
+ if (sregs->cr3 & vcpu->arch.cr3_lm_rsvd_bits)
+ return -EINVAL;
} else {
/*
* Not in 64-bit mode: EFER.LMA is clear and the code
@@ -8923,31 +9709,31 @@
dt.size = sregs->idt.limit;
dt.address = sregs->idt.base;
- kvm_x86_ops->set_idt(vcpu, &dt);
+ kvm_x86_ops.set_idt(vcpu, &dt);
dt.size = sregs->gdt.limit;
dt.address = sregs->gdt.base;
- kvm_x86_ops->set_gdt(vcpu, &dt);
+ kvm_x86_ops.set_gdt(vcpu, &dt);
vcpu->arch.cr2 = sregs->cr2;
mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
vcpu->arch.cr3 = sregs->cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_set_cr8(vcpu, sregs->cr8);
mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
- kvm_x86_ops->set_efer(vcpu, sregs->efer);
+ kvm_x86_ops.set_efer(vcpu, sregs->efer);
mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
- kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
+ kvm_x86_ops.set_cr0(vcpu, sregs->cr0);
vcpu->arch.cr0 = sregs->cr0;
mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
cpuid_update_needed |= ((kvm_read_cr4(vcpu) ^ sregs->cr4) &
(X86_CR4_OSXSAVE | X86_CR4_PKE));
- kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
+ kvm_x86_ops.set_cr4(vcpu, sregs->cr4);
if (cpuid_update_needed)
- kvm_update_cpuid(vcpu);
+ kvm_update_cpuid_runtime(vcpu);
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (is_pae_paging(vcpu)) {
@@ -9051,7 +9837,7 @@
*/
kvm_set_rflags(vcpu, rflags);
- kvm_x86_ops->update_bp_intercept(vcpu);
+ kvm_x86_ops.update_exception_bitmap(vcpu);
r = 0;
@@ -9179,36 +9965,92 @@
vcpu->arch.cr0 |= X86_CR0_ET;
}
-void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id)
{
- void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
- struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
-
- kvm_release_pfn(cache->pfn, cache->dirty, cache);
-
- kvmclock_reset(vcpu);
-
- kvm_x86_ops->vcpu_free(vcpu);
- free_cpumask_var(wbinvd_dirty_mask);
-}
-
-struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
- unsigned int id)
-{
- struct kvm_vcpu *vcpu;
-
if (kvm_check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
- printk_once(KERN_WARNING
- "kvm: SMP vm created on host with unstable TSC; "
- "guest TSC will not be reliable\n");
+ pr_warn_once("kvm: SMP vm created on host with unstable TSC; "
+ "guest TSC will not be reliable\n");
- vcpu = kvm_x86_ops->vcpu_create(kvm, id);
-
- return vcpu;
+ return 0;
}
-int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
+int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
{
+ struct page *page;
+ int r;
+
+ if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
+ vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
+ else
+ vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
+
+ kvm_set_tsc_khz(vcpu, max_tsc_khz);
+
+ r = kvm_mmu_create(vcpu);
+ if (r < 0)
+ return r;
+
+ if (irqchip_in_kernel(vcpu->kvm)) {
+ r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
+ if (r < 0)
+ goto fail_mmu_destroy;
+ if (kvm_apicv_activated(vcpu->kvm))
+ vcpu->arch.apicv_active = true;
+ } else
+ static_key_slow_inc(&kvm_no_apic_vcpu);
+
+ r = -ENOMEM;
+
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto fail_free_lapic;
+ vcpu->arch.pio_data = page_address(page);
+
+ vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.mce_banks)
+ goto fail_free_pio_data;
+ vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
+
+ if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
+ GFP_KERNEL_ACCOUNT))
+ goto fail_free_mce_banks;
+
+ if (!alloc_emulate_ctxt(vcpu))
+ goto free_wbinvd_dirty_mask;
+
+ vcpu->arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.user_fpu) {
+ pr_err("kvm: failed to allocate userspace's fpu\n");
+ goto free_emulate_ctxt;
+ }
+
+ vcpu->arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!vcpu->arch.guest_fpu) {
+ pr_err("kvm: failed to allocate vcpu's fpu\n");
+ goto free_user_fpu;
+ }
+ fx_init(vcpu);
+
+ vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
+ vcpu->arch.cr3_lm_rsvd_bits = rsvd_bits(cpuid_maxphyaddr(vcpu), 63);
+
+ vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
+
+ kvm_async_pf_hash_reset(vcpu);
+ kvm_pmu_init(vcpu);
+
+ vcpu->arch.pending_external_vector = -1;
+ vcpu->arch.preempted_in_kernel = false;
+
+ kvm_hv_vcpu_init(vcpu);
+
+ r = kvm_x86_ops.vcpu_create(vcpu);
+ if (r)
+ goto free_guest_fpu;
+
vcpu->arch.arch_capabilities = kvm_get_arch_capabilities();
vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT;
kvm_vcpu_mtrr_init(vcpu);
@@ -9217,11 +10059,28 @@
kvm_init_mmu(vcpu, false);
vcpu_put(vcpu);
return 0;
+
+free_guest_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+free_user_fpu:
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+free_emulate_ctxt:
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+free_wbinvd_dirty_mask:
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+fail_free_mce_banks:
+ kfree(vcpu->arch.mce_banks);
+fail_free_pio_data:
+ free_page((unsigned long)vcpu->arch.pio_data);
+fail_free_lapic:
+ kvm_free_lapic(vcpu);
+fail_mmu_destroy:
+ kvm_mmu_destroy(vcpu);
+ return r;
}
void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
{
- struct msr_data msr;
struct kvm *kvm = vcpu->kvm;
kvm_hv_vcpu_postcreate(vcpu);
@@ -9229,10 +10088,7 @@
if (mutex_lock_killable(&vcpu->mutex))
return;
vcpu_load(vcpu);
- msr.data = 0x0;
- msr.index = MSR_IA32_TSC;
- msr.host_initiated = true;
- kvm_write_tsc(vcpu, &msr);
+ kvm_synchronize_tsc(vcpu, 0);
vcpu_put(vcpu);
/* poll control enabled by default */
@@ -9240,16 +10096,38 @@
mutex_unlock(&vcpu->mutex);
- if (!kvmclock_periodic_sync)
- return;
-
- schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
- KVMCLOCK_SYNC_PERIOD);
+ if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0)
+ schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
+ KVMCLOCK_SYNC_PERIOD);
}
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- kvm_arch_vcpu_free(vcpu);
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+ int idx;
+
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ kvmclock_reset(vcpu);
+
+ kvm_x86_ops.vcpu_free(vcpu);
+
+ kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
+ free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.user_fpu);
+ kmem_cache_free(x86_fpu_cache, vcpu->arch.guest_fpu);
+
+ kvm_hv_vcpu_uninit(vcpu);
+ kvm_pmu_destroy(vcpu);
+ kfree(vcpu->arch.mce_banks);
+ kvm_free_lapic(vcpu);
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ kvm_mmu_destroy(vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ free_page((unsigned long)vcpu->arch.pio_data);
+ kvfree(vcpu->arch.cpuid_entries);
+ if (!lapic_in_kernel(vcpu))
+ static_key_slow_dec(&kvm_no_apic_vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9265,19 +10143,18 @@
vcpu->arch.nmi_injected = false;
kvm_clear_interrupt_queue(vcpu);
kvm_clear_exception_queue(vcpu);
- vcpu->arch.exception.pending = false;
memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db));
kvm_update_dr0123(vcpu);
vcpu->arch.dr6 = DR6_INIT;
- kvm_update_dr6(vcpu);
vcpu->arch.dr7 = DR7_FIXED_1;
kvm_update_dr7(vcpu);
vcpu->arch.cr2 = 0;
kvm_make_request(KVM_REQ_EVENT, vcpu);
- vcpu->arch.apf.msr_val = 0;
+ vcpu->arch.apf.msr_en_val = 0;
+ vcpu->arch.apf.msr_int_val = 0;
vcpu->arch.st.msr_val = 0;
kvmclock_reset(vcpu);
@@ -9322,7 +10199,7 @@
vcpu->arch.ia32_xss = 0;
- kvm_x86_ops->vcpu_reset(vcpu, init_event);
+ kvm_x86_ops.vcpu_reset(vcpu, init_event);
}
void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector)
@@ -9346,8 +10223,8 @@
u64 max_tsc = 0;
bool stable, backwards_tsc = false;
- kvm_shared_msr_cpu_online();
- ret = kvm_x86_ops->hardware_enable();
+ kvm_user_return_msr_cpu_online();
+ ret = kvm_x86_ops.hardware_enable();
if (ret != 0)
return ret;
@@ -9429,19 +10306,32 @@
void kvm_arch_hardware_disable(void)
{
- kvm_x86_ops->hardware_disable();
+ kvm_x86_ops.hardware_disable();
drop_user_return_notifiers();
}
-int kvm_arch_hardware_setup(void)
+int kvm_arch_hardware_setup(void *opaque)
{
+ struct kvm_x86_init_ops *ops = opaque;
int r;
- r = kvm_x86_ops->hardware_setup();
+ rdmsrl_safe(MSR_EFER, &host_efer);
+
+ if (boot_cpu_has(X86_FEATURE_XSAVES))
+ rdmsrl(MSR_IA32_XSS, host_xss);
+
+ r = ops->hardware_setup();
if (r != 0)
return r;
- cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+ memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops));
+
+ if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES))
+ supported_xss = 0;
+
+#define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f)
+ cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_);
+#undef __kvm_cpu_cap_has
if (kvm_has_tsc_control) {
/*
@@ -9463,12 +10353,21 @@
void kvm_arch_hardware_unsetup(void)
{
- kvm_x86_ops->hardware_unsetup();
+ kvm_x86_ops.hardware_unsetup();
}
-int kvm_arch_check_processor_compat(void)
+int kvm_arch_check_processor_compat(void *opaque)
{
- return kvm_x86_ops->check_processor_compatibility();
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+ struct kvm_x86_init_ops *ops = opaque;
+
+ WARN_ON(!irqs_disabled());
+
+ if (__cr4_reserved_bits(cpu_has, c) !=
+ __cr4_reserved_bits(cpu_has, &boot_cpu_data))
+ return -EIO;
+
+ return ops->check_processor_compatibility();
}
bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu)
@@ -9485,109 +10384,36 @@
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
-int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
-{
- struct page *page;
- int r;
-
- vcpu->arch.emulate_ctxt.ops = &emulate_ops;
- if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
- vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
- else
- vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED;
-
- page = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!page) {
- r = -ENOMEM;
- goto fail;
- }
- vcpu->arch.pio_data = page_address(page);
-
- kvm_set_tsc_khz(vcpu, max_tsc_khz);
-
- r = kvm_mmu_create(vcpu);
- if (r < 0)
- goto fail_free_pio_data;
-
- if (irqchip_in_kernel(vcpu->kvm)) {
- vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv(vcpu);
- r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
- if (r < 0)
- goto fail_mmu_destroy;
- } else
- static_key_slow_inc(&kvm_no_apic_vcpu);
-
- vcpu->arch.mce_banks = kzalloc(KVM_MAX_MCE_BANKS * sizeof(u64) * 4,
- GFP_KERNEL_ACCOUNT);
- if (!vcpu->arch.mce_banks) {
- r = -ENOMEM;
- goto fail_free_lapic;
- }
- vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
-
- if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
- GFP_KERNEL_ACCOUNT)) {
- r = -ENOMEM;
- goto fail_free_mce_banks;
- }
-
- fx_init(vcpu);
-
- vcpu->arch.guest_xstate_size = XSAVE_HDR_SIZE + XSAVE_HDR_OFFSET;
-
- vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu);
-
- vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT;
-
- kvm_async_pf_hash_reset(vcpu);
- kvm_pmu_init(vcpu);
-
- vcpu->arch.pending_external_vector = -1;
- vcpu->arch.preempted_in_kernel = false;
-
- kvm_hv_vcpu_init(vcpu);
-
- return 0;
-
-fail_free_mce_banks:
- kfree(vcpu->arch.mce_banks);
-fail_free_lapic:
- kvm_free_lapic(vcpu);
-fail_mmu_destroy:
- kvm_mmu_destroy(vcpu);
-fail_free_pio_data:
- free_page((unsigned long)vcpu->arch.pio_data);
-fail:
- return r;
-}
-
-void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
-{
- int idx;
-
- kvm_hv_vcpu_uninit(vcpu);
- kvm_pmu_destroy(vcpu);
- kfree(vcpu->arch.mce_banks);
- kvm_free_lapic(vcpu);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
- kvm_mmu_destroy(vcpu);
- srcu_read_unlock(&vcpu->kvm->srcu, idx);
- free_page((unsigned long)vcpu->arch.pio_data);
- if (!lapic_in_kernel(vcpu))
- static_key_slow_dec(&kvm_no_apic_vcpu);
-}
-
void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+
vcpu->arch.l1tf_flush_l1d = true;
- kvm_x86_ops->sched_in(vcpu, cpu);
+ if (pmu->version && unlikely(pmu->event_count)) {
+ pmu->need_cleanup = true;
+ kvm_make_request(KVM_REQ_PMU, vcpu);
+ }
+ kvm_x86_ops.sched_in(vcpu, cpu);
}
+void kvm_arch_free_vm(struct kvm *kvm)
+{
+ kfree(kvm->arch.hyperv.hv_pa_pg);
+ vfree(kvm);
+}
+
+
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ int ret;
+
if (type)
return -EINVAL;
+ ret = kvm_page_track_init(kvm);
+ if (ret)
+ return ret;
+
INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list);
INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
INIT_LIST_HEAD(&kvm->arch.zapped_obsolete_pages);
@@ -9605,7 +10431,7 @@
mutex_init(&kvm->arch.apic_map_lock);
spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
- kvm->arch.kvmclock_offset = -ktime_get_boottime_ns();
+ kvm->arch.kvmclock_offset = -get_kvmclock_base_ns();
pvclock_update_vm_gtod_copy(kvm);
kvm->arch.guest_can_read_msr_platform_info = true;
@@ -9614,10 +10440,9 @@
INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn);
kvm_hv_init_vm(kvm);
- kvm_page_track_init(kvm);
kvm_mmu_init_vm(kvm);
- return kvm_x86_ops->vm_init(kvm);
+ return kvm_x86_ops.vm_init(kvm);
}
int kvm_arch_post_init_vm(struct kvm *kvm)
@@ -9645,7 +10470,7 @@
kvm_unload_vcpu_mmu(vcpu);
}
kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_arch_vcpu_free(vcpu);
+ kvm_vcpu_destroy(vcpu);
mutex_lock(&kvm->lock);
for (i = 0; i < atomic_read(&kvm->online_vcpus); i++)
@@ -9665,9 +10490,9 @@
int __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
{
int i, r;
- unsigned long hva;
+ unsigned long hva, old_npages;
struct kvm_memslots *slots = kvm_memslots(kvm);
- struct kvm_memory_slot *slot, old;
+ struct kvm_memory_slot *slot;
/* Called with kvm->slots_lock held. */
if (WARN_ON(id >= KVM_MEM_SLOTS_NUM))
@@ -9675,7 +10500,7 @@
slot = id_to_memslot(slots, id);
if (size) {
- if (slot->npages)
+ if (slot && slot->npages)
return -EEXIST;
/*
@@ -9687,13 +10512,13 @@
if (IS_ERR((void *)hva))
return PTR_ERR((void *)hva);
} else {
- if (!slot->npages)
+ if (!slot || !slot->npages)
return 0;
+ old_npages = slot->npages;
hva = 0;
}
- old = *slot;
for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
struct kvm_userspace_memory_region m;
@@ -9708,24 +10533,12 @@
}
if (!size)
- vm_munmap(old.userspace_addr, old.npages * PAGE_SIZE);
+ vm_munmap(hva, old_npages * PAGE_SIZE);
return 0;
}
EXPORT_SYMBOL_GPL(__x86_set_memory_region);
-int x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, u32 size)
-{
- int r;
-
- mutex_lock(&kvm->slots_lock);
- r = __x86_set_memory_region(kvm, id, gpa, size);
- mutex_unlock(&kvm->slots_lock);
-
- return r;
-}
-EXPORT_SYMBOL_GPL(x86_set_memory_region);
-
void kvm_arch_pre_destroy_vm(struct kvm *kvm)
{
kvm_mmu_pre_destroy_vm(kvm);
@@ -9739,12 +10552,17 @@
* unless the the memory map has changed due to process exit
* or fd copying.
*/
- x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, 0, 0);
- x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_lock(&kvm->slots_lock);
+ __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
+ 0, 0);
+ __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0);
+ mutex_unlock(&kvm->slots_lock);
}
- if (kvm_x86_ops->vm_destroy)
- kvm_x86_ops->vm_destroy(kvm);
+ if (kvm_x86_ops.vm_destroy)
+ kvm_x86_ops.vm_destroy(kvm);
+ kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1));
kvm_pic_destroy(kvm);
kvm_ioapic_destroy(kvm);
kvm_free_vcpus(kvm);
@@ -9755,31 +10573,26 @@
kvm_hv_destroy_vm(kvm);
}
-void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
- struct kvm_memory_slot *dont)
+void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
int i;
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
- if (!dont || free->arch.rmap[i] != dont->arch.rmap[i]) {
- kvfree(free->arch.rmap[i]);
- free->arch.rmap[i] = NULL;
- }
+ kvfree(slot->arch.rmap[i]);
+ slot->arch.rmap[i] = NULL;
+
if (i == 0)
continue;
- if (!dont || free->arch.lpage_info[i - 1] !=
- dont->arch.lpage_info[i - 1]) {
- kvfree(free->arch.lpage_info[i - 1]);
- free->arch.lpage_info[i - 1] = NULL;
- }
+ kvfree(slot->arch.lpage_info[i - 1]);
+ slot->arch.lpage_info[i - 1] = NULL;
}
- kvm_page_track_free_memslot(free, dont);
+ kvm_page_track_free_memslot(slot);
}
-int kvm_arch_create_memslot(struct kvm *kvm, struct kvm_memory_slot *slot,
- unsigned long npages)
+static int kvm_alloc_memslot_metadata(struct kvm_memory_slot *slot,
+ unsigned long npages)
{
int i;
@@ -9820,11 +10633,9 @@
ugfn = slot->userspace_addr >> PAGE_SHIFT;
/*
* If the gfn and userspace address are not aligned wrt each
- * other, or if explicitly asked to, disable large page
- * support for this slot
+ * other, disable large page support for this slot.
*/
- if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) ||
- !kvm_largepages_enabled()) {
+ if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) {
unsigned long j;
for (j = 0; j < lpages; ++j)
@@ -9871,72 +10682,23 @@
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
- if (change == KVM_MR_MOVE)
- return kvm_arch_create_memslot(kvm, memslot,
- mem->memory_size >> PAGE_SHIFT);
-
+ if (change == KVM_MR_CREATE || change == KVM_MR_MOVE)
+ return kvm_alloc_memslot_metadata(memslot,
+ mem->memory_size >> PAGE_SHIFT);
return 0;
}
static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
- struct kvm_memory_slot *new)
+ struct kvm_memory_slot *old,
+ struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- /* Still write protect RO slot */
- if (new->flags & KVM_MEM_READONLY) {
- kvm_mmu_slot_remove_write_access(kvm, new);
- return;
- }
-
/*
- * Call kvm_x86_ops dirty logging hooks when they are valid.
- *
- * kvm_x86_ops->slot_disable_log_dirty is called when:
- *
- * - KVM_MR_CREATE with dirty logging is disabled
- * - KVM_MR_FLAGS_ONLY with dirty logging is disabled in new flag
- *
- * The reason is, in case of PML, we need to set D-bit for any slots
- * with dirty logging disabled in order to eliminate unnecessary GPA
- * logging in PML buffer (and potential PML buffer full VMEXT). This
- * guarantees leaving PML enabled during guest's lifetime won't have
- * any additional overhead from PML when guest is running with dirty
- * logging disabled for memory slots.
- *
- * kvm_x86_ops->slot_enable_log_dirty is called when switching new slot
- * to dirty logging mode.
- *
- * If kvm_x86_ops dirty logging hooks are invalid, use write protect.
- *
- * In case of write protect:
- *
- * Write protect all pages for dirty logging.
- *
- * All the sptes including the large sptes which point to this
- * slot are set to readonly. We can not create any new large
- * spte on this slot until the end of the logging.
- *
- * See the comments in fast_page_fault().
+ * Nothing to do for RO slots or CREATE/MOVE/DELETE of a slot.
+ * See comments below.
*/
- if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
- if (kvm_x86_ops->slot_enable_log_dirty)
- kvm_x86_ops->slot_enable_log_dirty(kvm, new);
- else
- kvm_mmu_slot_remove_write_access(kvm, new);
- } else {
- if (kvm_x86_ops->slot_disable_log_dirty)
- kvm_x86_ops->slot_disable_log_dirty(kvm, new);
- }
-}
-
-void kvm_arch_commit_memory_region(struct kvm *kvm,
- const struct kvm_userspace_memory_region *mem,
- const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new,
- enum kvm_mr_change change)
-{
- if (!kvm->arch.n_requested_mmu_pages)
- kvm_mmu_change_mmu_pages(kvm,
- kvm_mmu_calculate_default_mmu_pages(kvm));
+ if ((change != KVM_MR_FLAGS_ONLY) || (new->flags & KVM_MEM_READONLY))
+ return;
/*
* Dirty logging tracks sptes in 4k granularity, meaning that large
@@ -9951,27 +10713,84 @@
* page faults will create the large-page sptes.
*
* There is no need to do this in any of the following cases:
- * CREATE: No dirty mappings will already exist.
- * MOVE/DELETE: The old mappings will already have been cleaned up by
+ * CREATE: No dirty mappings will already exist.
+ * MOVE/DELETE: The old mappings will already have been cleaned up by
* kvm_arch_flush_shadow_memslot()
*/
- if (change == KVM_MR_FLAGS_ONLY &&
- (old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
- !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
+ if ((old->flags & KVM_MEM_LOG_DIRTY_PAGES) &&
+ !(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
kvm_mmu_zap_collapsible_sptes(kvm, new);
/*
- * Set up write protection and/or dirty logging for the new slot.
+ * Enable or disable dirty logging for the slot.
*
- * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of old slot have
- * been zapped so no dirty logging staff is needed for old slot. For
- * KVM_MR_FLAGS_ONLY, the old slot is essentially the same one as the
- * new and it's also covered when dealing with the new slot.
+ * For KVM_MR_DELETE and KVM_MR_MOVE, the shadow pages of the old
+ * slot have been zapped so no dirty logging updates are needed for
+ * the old slot.
+ * For KVM_MR_CREATE and KVM_MR_MOVE, once the new slot is visible
+ * any mappings that might be created in it will consume the
+ * properties of the new slot and do not need to be updated here.
*
+ * When PML is enabled, the kvm_x86_ops dirty logging hooks are
+ * called to enable/disable dirty logging.
+ *
+ * When disabling dirty logging with PML enabled, the D-bit is set
+ * for sptes in the slot in order to prevent unnecessary GPA
+ * logging in the PML buffer (and potential PML buffer full VMEXIT).
+ * This guarantees leaving PML enabled for the guest's lifetime
+ * won't have any additional overhead from PML when the guest is
+ * running with dirty logging disabled.
+ *
+ * When enabling dirty logging, large sptes are write-protected
+ * so they can be split on first write. New large sptes cannot
+ * be created for this slot until the end of the logging.
+ * See the comments in fast_page_fault().
+ * For small sptes, nothing is done if the dirty log is in the
+ * initial-all-set state. Otherwise, depending on whether pml
+ * is enabled the D-bit or the W-bit will be cleared.
+ */
+ if (new->flags & KVM_MEM_LOG_DIRTY_PAGES) {
+ if (kvm_x86_ops.slot_enable_log_dirty) {
+ kvm_x86_ops.slot_enable_log_dirty(kvm, new);
+ } else {
+ int level =
+ kvm_dirty_log_manual_protect_and_init_set(kvm) ?
+ PG_LEVEL_2M : PG_LEVEL_4K;
+
+ /*
+ * If we're with initial-all-set, we don't need
+ * to write protect any small page because
+ * they're reported as dirty already. However
+ * we still need to write-protect huge pages
+ * so that the page split can happen lazily on
+ * the first write to the huge page.
+ */
+ kvm_mmu_slot_remove_write_access(kvm, new, level);
+ }
+ } else {
+ if (kvm_x86_ops.slot_disable_log_dirty)
+ kvm_x86_ops.slot_disable_log_dirty(kvm, new);
+ }
+}
+
+void kvm_arch_commit_memory_region(struct kvm *kvm,
+ const struct kvm_userspace_memory_region *mem,
+ struct kvm_memory_slot *old,
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
+{
+ if (!kvm->arch.n_requested_mmu_pages)
+ kvm_mmu_change_mmu_pages(kvm,
+ kvm_mmu_calculate_default_mmu_pages(kvm));
+
+ /*
* FIXME: const-ify all uses of struct kvm_memory_slot.
*/
- if (change != KVM_MR_DELETE)
- kvm_mmu_slot_apply_flags(kvm, (struct kvm_memory_slot *) new);
+ kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change);
+
+ /* Free the arrays associated with the old memslot. */
+ if (change == KVM_MR_MOVE)
+ kvm_arch_free_memslot(kvm, old);
}
void kvm_arch_flush_shadow_all(struct kvm *kvm)
@@ -9988,8 +10807,8 @@
static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
return (is_guest_mode(vcpu) &&
- kvm_x86_ops->guest_apic_has_interrupt &&
- kvm_x86_ops->guest_apic_has_interrupt(vcpu));
+ kvm_x86_ops.guest_apic_has_interrupt &&
+ kvm_x86_ops.guest_apic_has_interrupt(vcpu));
}
static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
@@ -10008,11 +10827,12 @@
if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
(vcpu->arch.nmi_pending &&
- kvm_x86_ops->nmi_allowed(vcpu)))
+ kvm_x86_ops.nmi_allowed(vcpu, false)))
return true;
if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
- (vcpu->arch.smi_pending && !is_smm(vcpu)))
+ (vcpu->arch.smi_pending &&
+ kvm_x86_ops.smi_allowed(vcpu, false)))
return true;
if (kvm_arch_interrupt_allowed(vcpu) &&
@@ -10023,6 +10843,11 @@
if (kvm_hv_has_stimer_pending(vcpu))
return true;
+ if (is_guest_mode(vcpu) &&
+ kvm_x86_ops.nested_ops->hv_timer_pending &&
+ kvm_x86_ops.nested_ops->hv_timer_pending(vcpu))
+ return true;
+
return false;
}
@@ -10041,7 +10866,7 @@
kvm_test_request(KVM_REQ_EVENT, vcpu))
return true;
- if (vcpu->arch.apicv_active && kvm_x86_ops->dy_apicv_has_pending_interrupt(vcpu))
+ if (vcpu->arch.apicv_active && kvm_x86_ops.dy_apicv_has_pending_interrupt(vcpu))
return true;
return false;
@@ -10059,7 +10884,7 @@
int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_x86_ops.interrupt_allowed(vcpu, false);
}
unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
@@ -10081,7 +10906,7 @@
{
unsigned long rflags;
- rflags = kvm_x86_ops->get_rflags(vcpu);
+ rflags = kvm_x86_ops.get_rflags(vcpu);
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
rflags &= ~X86_EFLAGS_TF;
return rflags;
@@ -10093,7 +10918,7 @@
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
rflags |= X86_EFLAGS_TF;
- kvm_x86_ops->set_rflags(vcpu, rflags);
+ kvm_x86_ops.set_rflags(vcpu, rflags);
}
void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
@@ -10116,20 +10941,22 @@
return;
if (!vcpu->arch.mmu->direct_map &&
- work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
+ work->arch.cr3 != vcpu->arch.mmu->get_guest_pgd(vcpu))
return;
- vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
+ kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
+ BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));
+
return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}
static inline u32 kvm_async_pf_next_probe(u32 key)
{
- return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
+ return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
}
static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -10147,7 +10974,7 @@
int i;
u32 key = kvm_async_pf_hash_fn(gfn);
- for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
+ for (i = 0; i < ASYNC_PF_PER_VCPU &&
(vcpu->arch.apf.gfns[key] != gfn &&
vcpu->arch.apf.gfns[key] != ~0); i++)
key = kvm_async_pf_next_probe(key);
@@ -10165,6 +10992,10 @@
u32 i, j, k;
i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
+
+ if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
+ return;
+
while (true) {
vcpu->arch.apf.gfns[i] = ~0;
do {
@@ -10183,18 +11014,32 @@
}
}
-static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
+static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
{
+ u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;
- return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
- sizeof(val));
+ return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
+ sizeof(reason));
}
-static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val)
+static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
- return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val,
- sizeof(u32));
+ return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &token, offset, sizeof(token));
+}
+
+static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
+{
+ unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
+ u32 val;
+
+ if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
+ &val, offset, sizeof(val)))
+ return false;
+
+ return !val;
}
static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
@@ -10202,9 +11047,8 @@
if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
return false;
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
- (vcpu->arch.apf.send_user_only &&
- kvm_x86_ops->get_cpl(vcpu) == 0))
+ if (!kvm_pv_async_pf_enabled(vcpu) ||
+ (vcpu->arch.apf.send_user_only && kvm_x86_ops.get_cpl(vcpu) == 0))
return false;
return true;
@@ -10224,10 +11068,10 @@
* If interrupts are off we cannot even use an artificial
* halt state.
*/
- return kvm_x86_ops->interrupt_allowed(vcpu);
+ return kvm_arch_interrupt_allowed(vcpu);
}
-void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
+bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
struct x86_exception fault;
@@ -10236,7 +11080,7 @@
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (kvm_can_deliver_async_pf(vcpu) &&
- !apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
+ !apf_put_user_notpresent(vcpu)) {
fault.vector = PF_VECTOR;
fault.error_code_valid = true;
fault.error_code = 0;
@@ -10244,6 +11088,7 @@
fault.address = work->arch.token;
fault.async_page_fault = true;
kvm_inject_page_fault(vcpu, &fault);
+ return true;
} else {
/*
* It is not possible to deliver a paravirtualized asynchronous
@@ -10254,14 +11099,17 @@
* fault is retried, hopefully the page will be ready in the host.
*/
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
+ return false;
}
}
void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
struct kvm_async_pf *work)
{
- struct x86_exception fault;
- u32 val;
+ struct kvm_lapic_irq irq = {
+ .delivery_mode = APIC_DM_FIXED,
+ .vector = vcpu->arch.apf.vec
+ };
if (work->wakeup_all)
work->arch.token = ~0; /* broadcast wakeup */
@@ -10269,39 +11117,30 @@
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
- if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
- !apf_get_user(vcpu, &val)) {
- if (val == KVM_PV_REASON_PAGE_NOT_PRESENT &&
- vcpu->arch.exception.pending &&
- vcpu->arch.exception.nr == PF_VECTOR &&
- !apf_put_user(vcpu, 0)) {
- vcpu->arch.exception.injected = false;
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception.nr = 0;
- vcpu->arch.exception.has_error_code = false;
- vcpu->arch.exception.error_code = 0;
- vcpu->arch.exception.has_payload = false;
- vcpu->arch.exception.payload = 0;
- } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
- fault.vector = PF_VECTOR;
- fault.error_code_valid = true;
- fault.error_code = 0;
- fault.nested_page_fault = false;
- fault.address = work->arch.token;
- fault.async_page_fault = true;
- kvm_inject_page_fault(vcpu, &fault);
- }
+ if ((work->wakeup_all || work->notpresent_injected) &&
+ kvm_pv_async_pf_enabled(vcpu) &&
+ !apf_put_user_ready(vcpu, work->arch.token)) {
+ vcpu->arch.apf.pageready_pending = true;
+ kvm_apic_set_irq(vcpu, &irq, NULL);
}
+
vcpu->arch.apf.halted = false;
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}
-bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
+void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
- if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
+ if (!vcpu->arch.apf.pageready_pending)
+ kvm_vcpu_kick(vcpu);
+}
+
+bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
+{
+ if (!kvm_pv_async_pf_enabled(vcpu))
return true;
else
- return kvm_can_do_async_pf(vcpu);
+ return apf_pageready_slot_free(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
@@ -10350,11 +11189,17 @@
{
struct kvm_kernel_irqfd *irqfd =
container_of(cons, struct kvm_kernel_irqfd, consumer);
+ int ret;
irqfd->producer = prod;
+ kvm_arch_start_assignment(irqfd->kvm);
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm,
+ prod->irq, irqfd->gsi, 1);
- return kvm_x86_ops->update_pi_irte(irqfd->kvm,
- prod->irq, irqfd->gsi, 1);
+ if (ret)
+ kvm_arch_end_assignment(irqfd->kvm);
+
+ return ret;
}
void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
@@ -10373,23 +11218,24 @@
* when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
- ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
+ ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
if (ret)
printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
" fails: %d\n", irqfd->consumer.token, ret);
+
+ kvm_arch_end_assignment(irqfd->kvm);
}
int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set)
{
- return kvm_x86_ops->update_pi_irte(kvm, host_irq, guest_irq, set);
+ return kvm_x86_ops.update_pi_irte(kvm, host_irq, guest_irq, set);
}
bool kvm_vector_hashing_enabled(void)
{
return vector_hashing;
}
-EXPORT_SYMBOL_GPL(kvm_vector_hashing_enabled);
bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
@@ -10424,6 +11270,134 @@
}
EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
+{
+ struct x86_exception fault;
+ u32 access = error_code &
+ (PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
+
+ if (!(error_code & PFERR_PRESENT_MASK) ||
+ vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, &fault) != UNMAPPED_GVA) {
+ /*
+ * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+ * tables probably do not match the TLB. Just proceed
+ * with the error code that the processor gave.
+ */
+ fault.vector = PF_VECTOR;
+ fault.error_code_valid = true;
+ fault.error_code = error_code;
+ fault.nested_page_fault = false;
+ fault.address = gva;
+ }
+ vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
+}
+EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);
+
+/*
+ * Handles kvm_read/write_guest_virt*() result and either injects #PF or returns
+ * KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM. Return value
+ * indicates whether exit to userspace is needed.
+ */
+int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
+ struct x86_exception *e)
+{
+ if (r == X86EMUL_PROPAGATE_FAULT) {
+ kvm_inject_emulated_page_fault(vcpu, e);
+ return 1;
+ }
+
+ /*
+ * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
+ * while handling a VMX instruction KVM could've handled the request
+ * correctly by exiting to userspace and performing I/O but there
+ * doesn't seem to be a real use-case behind such requests, just return
+ * KVM_EXIT_INTERNAL_ERROR for now.
+ */
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);
+
+int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
+{
+ bool pcid_enabled;
+ struct x86_exception e;
+ unsigned i;
+ unsigned long roots_to_free = 0;
+ struct {
+ u64 pcid;
+ u64 gla;
+ } operand;
+ int r;
+
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ if (operand.pcid >> 12 != 0) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
+
+ switch (type) {
+ case INVPCID_TYPE_INDIV_ADDR:
+ if ((!pcid_enabled && (operand.pcid != 0)) ||
+ is_noncanonical_address(operand.gla, vcpu)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+ kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_SINGLE_CTXT:
+ if (!pcid_enabled && (operand.pcid != 0)) {
+ kvm_inject_gp(vcpu, 0);
+ return 1;
+ }
+
+ if (kvm_get_active_pcid(vcpu) == operand.pcid) {
+ kvm_mmu_sync_roots(vcpu);
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ }
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
+ if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].pgd)
+ == operand.pcid)
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+
+ kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
+ /*
+ * If neither the current cr3 nor any of the prev_roots use the
+ * given PCID, then nothing needs to be done here because a
+ * resync will happen anyway before switching to any other CR3.
+ */
+
+ return kvm_skip_emulated_instruction(vcpu);
+
+ case INVPCID_TYPE_ALL_NON_GLOBAL:
+ /*
+ * Currently, KVM doesn't mark global entries in the shadow
+ * page tables, so a non-global flush just degenerates to a
+ * global flush. If needed, we could optimize this later by
+ * keeping track of global entries in shadow page tables.
+ */
+
+ fallthrough;
+ case INVPCID_TYPE_ALL_INCL_GLOBAL:
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
+ return kvm_skip_emulated_instruction(vcpu);
+
+ default:
+ BUG(); /* We have already checked above that type <= 3 */
+ }
+}
+EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
@@ -10444,3 +11418,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
+EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);