Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index f486e26..3a18614 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -18,6 +18,8 @@
#define PT_MODE_SYSTEM 0
#define PT_MODE_HOST_GUEST 1
+#define PMU_CAP_FW_WRITES (1ULL << 13)
+
struct nested_vmx_msrs {
/*
* We only store the "true" versions of the VMX capability MSRs. We
@@ -101,7 +103,7 @@
(vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
}
-static inline bool vmx_mpx_supported(void)
+static inline bool cpu_has_vmx_mpx(void)
{
return (vmcs_config.vmexit_ctrl & VM_EXIT_CLEAR_BNDCFGS) &&
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_BNDCFGS);
@@ -146,15 +148,10 @@
SECONDARY_EXEC_DESC;
}
-static inline bool vmx_pku_supported(void)
-{
- return boot_cpu_has(X86_FEATURE_PKU);
-}
-
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
- SECONDARY_EXEC_RDTSCP;
+ SECONDARY_EXEC_ENABLE_RDTSCP;
}
static inline bool cpu_has_vmx_virtualize_x2apic_mode(void)
@@ -199,7 +196,7 @@
SECONDARY_EXEC_PAUSE_LOOP_EXITING;
}
-static inline bool vmx_rdrand_supported(void)
+static inline bool cpu_has_vmx_rdrand(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDRAND_EXITING;
@@ -236,7 +233,7 @@
SECONDARY_EXEC_ENCLS_EXITING;
}
-static inline bool vmx_rdseed_supported(void)
+static inline bool cpu_has_vmx_rdseed(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_RDSEED_EXITING;
@@ -247,13 +244,13 @@
return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML;
}
-static inline bool vmx_xsaves_supported(void)
+static inline bool cpu_has_vmx_xsaves(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_XSAVES;
}
-static inline bool vmx_waitpkg_supported(void)
+static inline bool cpu_has_vmx_waitpkg(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
@@ -354,4 +351,31 @@
(vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_RTIT_CTL);
}
+/*
+ * Processor Trace can operate in one of three modes:
+ * a. system-wide: trace both host/guest and output to host buffer
+ * b. host-only: only trace host and output to host buffer
+ * c. host-guest: trace host and guest simultaneously and output to their
+ * respective buffer
+ *
+ * KVM currently only supports (a) and (c).
+ */
+static inline bool vmx_pt_mode_is_system(void)
+{
+ return pt_mode == PT_MODE_SYSTEM;
+}
+static inline bool vmx_pt_mode_is_host_guest(void)
+{
+ return pt_mode == PT_MODE_HOST_GUEST;
+}
+
+static inline u64 vmx_get_perf_capabilities(void)
+{
+ /*
+ * Since counters are virtualized, KVM would support full
+ * width counting unconditionally, even if the host lacks it.
+ */
+ return PMU_CAP_FW_WRITES;
+}
+
#endif /* __KVM_X86_VMX_CAPS_H */
diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c
index 7235970..5b68034 100644
--- a/arch/x86/kvm/vmx/evmcs.c
+++ b/arch/x86/kvm/vmx/evmcs.c
@@ -4,9 +4,11 @@
#include <linux/smp.h>
#include "../hyperv.h"
+#include "../cpuid.h"
#include "evmcs.h"
#include "vmcs.h"
#include "vmx.h"
+#include "trace.h"
DEFINE_STATIC_KEY_FALSE(enable_evmcs);
@@ -159,14 +161,6 @@
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
EVMCS1_FIELD(VM_ENTRY_MSR_LOAD_ADDR, vm_entry_msr_load_addr,
HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE0, cr3_target_value0,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE1, cr3_target_value1,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE2, cr3_target_value2,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
- EVMCS1_FIELD(CR3_TARGET_VALUE3, cr3_target_value3,
- HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL),
/* 32 bit rw */
EVMCS1_FIELD(TPR_THRESHOLD, tpr_threshold,
@@ -303,14 +297,13 @@
};
const unsigned int nr_evmcs_1_fields = ARRAY_SIZE(vmcs_field_to_evmcs_1);
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf)
{
vmcs_conf->pin_based_exec_ctrl &= ~EVMCS1_UNSUPPORTED_PINCTRL;
vmcs_conf->cpu_based_2nd_exec_ctrl &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
vmcs_conf->vmexit_ctrl &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
vmcs_conf->vmentry_ctrl &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
-
}
#endif
@@ -333,39 +326,114 @@
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * vmcs_version represents the range of supported Enlightened VMCS
- * versions: lower 8 bits is the minimal version, higher 8 bits is the
- * maximum supported version. KVM supports versions from 1 to
- * KVM_EVMCS_VERSION.
- */
- if (vmx->nested.enlightened_vmcs_enabled)
- return (KVM_EVMCS_VERSION << 8) | 1;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /*
+ * vmcs_version represents the range of supported Enlightened VMCS
+ * versions: lower 8 bits is the minimal version, higher 8 bits is the
+ * maximum supported version. KVM supports versions from 1 to
+ * KVM_EVMCS_VERSION.
+ */
+ if (kvm_cpu_cap_get(X86_FEATURE_VMX) &&
+ vmx->nested.enlightened_vmcs_enabled)
+ return (KVM_EVMCS_VERSION << 8) | 1;
- return 0;
+ return 0;
+}
+
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata)
+{
+ u32 ctl_low = (u32)*pdata;
+ u32 ctl_high = (u32)(*pdata >> 32);
+
+ /*
+ * Hyper-V 2016 and 2019 try using these features even when eVMCS
+ * is enabled but there are no corresponding fields.
+ */
+ switch (msr_index) {
+ case MSR_IA32_VMX_EXIT_CTLS:
+ case MSR_IA32_VMX_TRUE_EXIT_CTLS:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ break;
+ case MSR_IA32_VMX_ENTRY_CTLS:
+ case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ break;
+ case MSR_IA32_VMX_PROCBASED_CTLS2:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
+ break;
+ case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
+ case MSR_IA32_VMX_PINBASED_CTLS:
+ ctl_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
+ break;
+ case MSR_IA32_VMX_VMFUNC:
+ ctl_low &= ~EVMCS1_UNSUPPORTED_VMFUNC;
+ break;
+ }
+
+ *pdata = ctl_low | ((u64)ctl_high << 32);
+}
+
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12)
+{
+ int ret = 0;
+ u32 unsupp_ctl;
+
+ unsupp_ctl = vmcs12->pin_based_vm_exec_control &
+ EVMCS1_UNSUPPORTED_PINCTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported pin-based VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->secondary_vm_exec_control &
+ EVMCS1_UNSUPPORTED_2NDEXEC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported secondary VM-execution controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_exit_controls &
+ EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-exit controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_entry_controls &
+ EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-entry controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ unsupp_ctl = vmcs12->vm_function_control & EVMCS1_UNSUPPORTED_VMFUNC;
+ if (unsupp_ctl) {
+ trace_kvm_nested_vmenter_failed(
+ "eVMCS: unsupported VM-function controls",
+ unsupp_ctl);
+ ret = -EINVAL;
+ }
+
+ return ret;
}
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- bool evmcs_already_enabled = vmx->nested.enlightened_vmcs_enabled;
vmx->nested.enlightened_vmcs_enabled = true;
if (vmcs_version)
*vmcs_version = nested_get_evmcs_version(vcpu);
- /* We don't support disabling the feature for simplicity. */
- if (evmcs_already_enabled)
- return 0;
-
- vmx->nested.msrs.pinbased_ctls_high &= ~EVMCS1_UNSUPPORTED_PINCTRL;
- vmx->nested.msrs.entry_ctls_high &= ~EVMCS1_UNSUPPORTED_VMENTRY_CTRL;
- vmx->nested.msrs.exit_ctls_high &= ~EVMCS1_UNSUPPORTED_VMEXIT_CTRL;
- vmx->nested.msrs.secondary_ctls_high &= ~EVMCS1_UNSUPPORTED_2NDEXEC;
- vmx->nested.msrs.vmfunc_controls &= ~EVMCS1_UNSUPPORTED_VMFUNC;
-
return 0;
}
diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h
index 07ebf68..011929a 100644
--- a/arch/x86/kvm/vmx/evmcs.h
+++ b/arch/x86/kvm/vmx/evmcs.h
@@ -10,6 +10,7 @@
#include "capabilities.h"
#include "vmcs.h"
+#include "vmcs12.h"
struct vmcs_config;
@@ -58,7 +59,9 @@
SECONDARY_EXEC_SHADOW_VMCS | \
SECONDARY_EXEC_TSC_SCALING | \
SECONDARY_EXEC_PAUSE_LOOP_EXITING)
-#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
+#define EVMCS1_UNSUPPORTED_VMEXIT_CTRL \
+ (VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | \
+ VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
#define EVMCS1_UNSUPPORTED_VMENTRY_CTRL (VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
#define EVMCS1_UNSUPPORTED_VMFUNC (VMX_VMFUNC_EPTP_SWITCHING)
@@ -184,7 +187,7 @@
vp_ap->enlighten_vmentry = 1;
}
-void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
+__init void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf);
#else /* !IS_ENABLED(CONFIG_HYPERV) */
static inline void evmcs_write64(unsigned long field, u64 value) {}
static inline void evmcs_write32(unsigned long field, u32 value) {}
@@ -193,13 +196,21 @@
static inline u32 evmcs_read32(unsigned long field) { return 0; }
static inline u16 evmcs_read16(unsigned long field) { return 0; }
static inline void evmcs_load(u64 phys_addr) {}
-static inline void evmcs_sanitize_exec_ctrls(struct vmcs_config *vmcs_conf) {}
static inline void evmcs_touch_msr_bitmap(void) {}
#endif /* IS_ENABLED(CONFIG_HYPERV) */
+enum nested_evmptrld_status {
+ EVMPTRLD_DISABLED,
+ EVMPTRLD_SUCCEEDED,
+ EVMPTRLD_VMFAIL,
+ EVMPTRLD_ERROR,
+};
+
bool nested_enlightened_vmentry(struct kvm_vcpu *vcpu, u64 *evmcs_gpa);
uint16_t nested_get_evmcs_version(struct kvm_vcpu *vcpu);
int nested_enable_evmcs(struct kvm_vcpu *vcpu,
uint16_t *vmcs_version);
+void nested_evmcs_filter_control_msr(u32 msr_index, u64 *pdata);
+int nested_evmcs_check_controls(struct vmcs12 *vmcs12);
#endif /* __KVM_X86_VMX_EVMCS_H */
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3041015..0c2389d 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/frame.h>
+#include <linux/objtool.h>
#include <linux/percpu.h>
#include <asm/debugreg.h>
@@ -10,6 +10,7 @@
#include "hyperv.h"
#include "mmu.h"
#include "nested.h"
+#include "pmu.h"
#include "trace.h"
#include "x86.h"
@@ -170,15 +171,6 @@
static int nested_vmx_failValid(struct kvm_vcpu *vcpu,
u32 vm_instruction_error)
{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- /*
- * failValid writes the error number to the current VMCS, which
- * can't be done if there isn't a current VMCS.
- */
- if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
- return nested_vmx_failInvalid(vcpu);
-
vmx_set_rflags(vcpu, (vmx_get_rflags(vcpu)
& ~(X86_EFLAGS_CF | X86_EFLAGS_PF | X86_EFLAGS_AF |
X86_EFLAGS_SF | X86_EFLAGS_OF))
@@ -191,6 +183,20 @@
return kvm_skip_emulated_instruction(vcpu);
}
+static int nested_vmx_fail(struct kvm_vcpu *vcpu, u32 vm_instruction_error)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * failValid writes the error number to the current VMCS, which
+ * can't be done if there isn't a current VMCS.
+ */
+ if (vmx->nested.current_vmptr == -1ull && !vmx->nested.hv_evmcs)
+ return nested_vmx_failInvalid(vcpu);
+
+ return nested_vmx_failValid(vcpu, vm_instruction_error);
+}
+
static void nested_vmx_abort(struct kvm_vcpu *vcpu, u32 indicator)
{
/* TODO: not to reset guest simply here. */
@@ -227,50 +233,6 @@
vmx->nested.hv_evmcs = NULL;
}
-/*
- * Free whatever needs to be freed from vmx->nested when L1 goes down, or
- * just stops using VMX.
- */
-static void free_nested(struct kvm_vcpu *vcpu)
-{
- struct vcpu_vmx *vmx = to_vmx(vcpu);
-
- if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
- return;
-
- kvm_clear_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
-
- vmx->nested.vmxon = false;
- vmx->nested.smm.vmxon = false;
- free_vpid(vmx->nested.vpid02);
- vmx->nested.posted_intr_nv = -1;
- vmx->nested.current_vmptr = -1ull;
- if (enable_shadow_vmcs) {
- vmx_disable_shadow_vmcs(vmx);
- vmcs_clear(vmx->vmcs01.shadow_vmcs);
- free_vmcs(vmx->vmcs01.shadow_vmcs);
- vmx->vmcs01.shadow_vmcs = NULL;
- }
- kfree(vmx->nested.cached_vmcs12);
- vmx->nested.cached_vmcs12 = NULL;
- kfree(vmx->nested.cached_shadow_vmcs12);
- vmx->nested.cached_shadow_vmcs12 = NULL;
- /* Unpin physical memory we referred to in the vmcs02 */
- if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
- vmx->nested.apic_access_page = NULL;
- }
- kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
- kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
- vmx->nested.pi_desc = NULL;
-
- kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
-
- nested_release_evmcs(vcpu);
-
- free_loaded_vmcs(&vmx->nested.vmcs02);
-}
-
static void vmx_sync_vmcs_host_state(struct vcpu_vmx *vmx,
struct loaded_vmcs *prev)
{
@@ -296,7 +258,7 @@
struct loaded_vmcs *prev;
int cpu;
- if (vmx->loaded_vmcs == vmcs)
+ if (WARN_ON_ONCE(vmx->loaded_vmcs == vmcs))
return;
cpu = get_cpu();
@@ -306,7 +268,54 @@
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
- vmx_segment_cache_clear(vmx);
+ vmx_register_cache_reset(vcpu);
+}
+
+/*
+ * Free whatever needs to be freed from vmx->nested when L1 goes down, or
+ * just stops using VMX.
+ */
+static void free_nested(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (WARN_ON_ONCE(vmx->loaded_vmcs != &vmx->vmcs01))
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
+
+ if (!vmx->nested.vmxon && !vmx->nested.smm.vmxon)
+ return;
+
+ kvm_clear_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
+
+ vmx->nested.vmxon = false;
+ vmx->nested.smm.vmxon = false;
+ free_vpid(vmx->nested.vpid02);
+ vmx->nested.posted_intr_nv = -1;
+ vmx->nested.current_vmptr = -1ull;
+ if (enable_shadow_vmcs) {
+ vmx_disable_shadow_vmcs(vmx);
+ vmcs_clear(vmx->vmcs01.shadow_vmcs);
+ free_vmcs(vmx->vmcs01.shadow_vmcs);
+ vmx->vmcs01.shadow_vmcs = NULL;
+ }
+ kfree(vmx->nested.cached_vmcs12);
+ vmx->nested.cached_vmcs12 = NULL;
+ kfree(vmx->nested.cached_shadow_vmcs12);
+ vmx->nested.cached_shadow_vmcs12 = NULL;
+ /* Unpin physical memory we referred to in the vmcs02 */
+ if (vmx->nested.apic_access_page) {
+ kvm_release_page_clean(vmx->nested.apic_access_page);
+ vmx->nested.apic_access_page = NULL;
+ }
+ kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
+ kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
+ vmx->nested.pi_desc = NULL;
+
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
+
+ nested_release_evmcs(vcpu);
+
+ free_loaded_vmcs(&vmx->nested.vmcs02);
}
/*
@@ -317,8 +326,6 @@
{
vcpu_load(vcpu);
vmx_leave_nested(vcpu);
- vmx_switch_vmcs(vcpu, &to_vmx(vcpu)->vmcs01);
- free_nested(vcpu);
vcpu_put(vcpu);
}
@@ -327,19 +334,19 @@
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason;
+ u32 vm_exit_reason;
unsigned long exit_qualification = vcpu->arch.exit_qualification;
if (vmx->nested.pml_full) {
- exit_reason = EXIT_REASON_PML_FULL;
+ vm_exit_reason = EXIT_REASON_PML_FULL;
vmx->nested.pml_full = false;
exit_qualification &= INTR_INFO_UNBLOCK_NMI;
} else if (fault->error_code & PFERR_RSVD_MASK)
- exit_reason = EXIT_REASON_EPT_MISCONFIG;
+ vm_exit_reason = EXIT_REASON_EPT_MISCONFIG;
else
- exit_reason = EXIT_REASON_EPT_VIOLATION;
+ vm_exit_reason = EXIT_REASON_EPT_VIOLATION;
- nested_vmx_vmexit(vcpu, exit_reason, 0, exit_qualification);
+ nested_vmx_vmexit(vcpu, vm_exit_reason, 0, exit_qualification);
vmcs12->guest_physical_address = fault->address;
}
@@ -352,9 +359,8 @@
to_vmx(vcpu)->nested.msrs.ept_caps &
VMX_EPT_EXECUTE_ONLY_BIT,
nested_ept_ad_enabled(vcpu),
- nested_ept_get_cr3(vcpu));
- vcpu->arch.mmu->set_cr3 = vmx_set_cr3;
- vcpu->arch.mmu->get_cr3 = nested_ept_get_cr3;
+ nested_ept_get_eptp(vcpu));
+ vcpu->arch.mmu->get_guest_pgd = nested_ept_get_eptp;
vcpu->arch.mmu->inject_page_fault = nested_ept_inject_page_fault;
vcpu->arch.mmu->get_pdptr = kvm_pdptr_read;
@@ -437,11 +443,6 @@
}
}
-static bool page_address_valid(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- return PAGE_ALIGNED(gpa) && !(gpa >> cpuid_maxphyaddr(vcpu));
-}
-
static int nested_vmx_check_io_bitmap_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -543,7 +544,8 @@
}
}
-static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) {
+static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap)
+{
int msr;
for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
@@ -616,6 +618,7 @@
}
/* KVM unconditionally exposes the FS/GS base MSRs to L1. */
+#ifdef CONFIG_X86_64
nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
MSR_FS_BASE, MSR_TYPE_RW);
@@ -624,6 +627,7 @@
nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0,
MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
/*
* Checking the L0->L1 bitmap is trying to verify two things:
@@ -697,11 +701,6 @@
VM_EXIT_ACK_INTR_ON_EXIT;
}
-static bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
-{
- return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
-}
-
static int nested_vmx_check_apic_access_controls(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
@@ -926,9 +925,61 @@
}
return 0;
fail:
+ /* Note, max_msr_list_size is at most 4096, i.e. this can't wrap. */
return i + 1;
}
+static bool nested_vmx_get_vmexit_msr_value(struct kvm_vcpu *vcpu,
+ u32 msr_index,
+ u64 *data)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * If the L0 hypervisor stored a more accurate value for the TSC that
+ * does not include the time taken for emulation of the L2->L1
+ * VM-exit in L0, use the more accurate value.
+ */
+ if (msr_index == MSR_IA32_TSC) {
+ int i = vmx_find_loadstore_msr_slot(&vmx->msr_autostore.guest,
+ MSR_IA32_TSC);
+
+ if (i >= 0) {
+ u64 val = vmx->msr_autostore.guest.val[i].value;
+
+ *data = kvm_read_l1_tsc(vcpu, val);
+ return true;
+ }
+ }
+
+ if (kvm_get_msr(vcpu, msr_index, data)) {
+ pr_debug_ratelimited("%s cannot read MSR (0x%x)\n", __func__,
+ msr_index);
+ return false;
+ }
+ return true;
+}
+
+static bool read_and_check_msr_entry(struct kvm_vcpu *vcpu, u64 gpa, int i,
+ struct vmx_msr_entry *e)
+{
+ if (kvm_vcpu_read_guest(vcpu,
+ gpa + i * sizeof(*e),
+ e, 2 * sizeof(u32))) {
+ pr_debug_ratelimited(
+ "%s cannot read MSR entry (%u, 0x%08llx)\n",
+ __func__, i, gpa + i * sizeof(*e));
+ return false;
+ }
+ if (nested_vmx_store_msr_check(vcpu, e)) {
+ pr_debug_ratelimited(
+ "%s check failed (%u, 0x%x, 0x%x)\n",
+ __func__, i, e->index, e->reserved);
+ return false;
+ }
+ return true;
+}
+
static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count)
{
u64 data;
@@ -940,26 +991,12 @@
if (unlikely(i >= max_msr_list_size))
return -EINVAL;
- if (kvm_vcpu_read_guest(vcpu,
- gpa + i * sizeof(e),
- &e, 2 * sizeof(u32))) {
- pr_debug_ratelimited(
- "%s cannot read MSR entry (%u, 0x%08llx)\n",
- __func__, i, gpa + i * sizeof(e));
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
return -EINVAL;
- }
- if (nested_vmx_store_msr_check(vcpu, &e)) {
- pr_debug_ratelimited(
- "%s check failed (%u, 0x%x, 0x%x)\n",
- __func__, i, e.index, e.reserved);
+
+ if (!nested_vmx_get_vmexit_msr_value(vcpu, e.index, &data))
return -EINVAL;
- }
- if (kvm_get_msr(vcpu, e.index, &data)) {
- pr_debug_ratelimited(
- "%s cannot read MSR (%u, 0x%x)\n",
- __func__, i, e.index);
- return -EINVAL;
- }
+
if (kvm_vcpu_write_guest(vcpu,
gpa + i * sizeof(e) +
offsetof(struct vmx_msr_entry, value),
@@ -973,6 +1010,60 @@
return 0;
}
+static bool nested_msr_store_list_has_msr(struct kvm_vcpu *vcpu, u32 msr_index)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 count = vmcs12->vm_exit_msr_store_count;
+ u64 gpa = vmcs12->vm_exit_msr_store_addr;
+ struct vmx_msr_entry e;
+ u32 i;
+
+ for (i = 0; i < count; i++) {
+ if (!read_and_check_msr_entry(vcpu, gpa, i, &e))
+ return false;
+
+ if (e.index == msr_index)
+ return true;
+ }
+ return false;
+}
+
+static void prepare_vmx_msr_autostore_list(struct kvm_vcpu *vcpu,
+ u32 msr_index)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmx_msrs *autostore = &vmx->msr_autostore.guest;
+ bool in_vmcs12_store_list;
+ int msr_autostore_slot;
+ bool in_autostore_list;
+ int last;
+
+ msr_autostore_slot = vmx_find_loadstore_msr_slot(autostore, msr_index);
+ in_autostore_list = msr_autostore_slot >= 0;
+ in_vmcs12_store_list = nested_msr_store_list_has_msr(vcpu, msr_index);
+
+ if (in_vmcs12_store_list && !in_autostore_list) {
+ if (autostore->nr == MAX_NR_LOADSTORE_MSRS) {
+ /*
+ * Emulated VMEntry does not fail here. Instead a less
+ * accurate value will be returned by
+ * nested_vmx_get_vmexit_msr_value() using kvm_get_msr()
+ * instead of reading the value from the vmcs02 VMExit
+ * MSR-store area.
+ */
+ pr_warn_ratelimited(
+ "Not enough msr entries in msr_autostore. Can't add msr %x\n",
+ msr_index);
+ return;
+ }
+ last = autostore->nr++;
+ autostore->val[last].index = msr_index;
+ } else if (!in_vmcs12_store_list && in_autostore_list) {
+ last = --autostore->nr;
+ autostore->val[msr_autostore_slot] = autostore->val[last];
+ }
+}
+
static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
unsigned long invalid_mask;
@@ -982,37 +1073,91 @@
}
/*
- * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are
- * emulating VM entry into a guest with EPT enabled.
- * Returns 0 on success, 1 on failure. Invalid state exit qualification code
- * is assigned to entry_failure_code on failure.
+ * Returns true if the MMU needs to be sync'd on nested VM-Enter/VM-Exit.
+ * tl;dr: the MMU needs a sync if L0 is using shadow paging and L1 didn't
+ * enable VPID for L2 (implying it expects a TLB flush on VMX transitions).
+ * Here's why.
+ *
+ * If EPT is enabled by L0 a sync is never needed:
+ * - if it is disabled by L1, then L0 is not shadowing L1 or L2 PTEs, there
+ * cannot be unsync'd SPTEs for either L1 or L2.
+ *
+ * - if it is also enabled by L1, then L0 doesn't need to sync on VM-Enter
+ * VM-Enter as VM-Enter isn't required to invalidate guest-physical mappings
+ * (irrespective of VPID), i.e. L1 can't rely on the (virtual) CPU to flush
+ * stale guest-physical mappings for L2 from the TLB. And as above, L0 isn't
+ * shadowing L1 PTEs so there are no unsync'd SPTEs to sync on VM-Exit.
+ *
+ * If EPT is disabled by L0:
+ * - if VPID is enabled by L1 (for L2), the situation is similar to when L1
+ * enables EPT: L0 doesn't need to sync as VM-Enter and VM-Exit aren't
+ * required to invalidate linear mappings (EPT is disabled so there are
+ * no combined or guest-physical mappings), i.e. L1 can't rely on the
+ * (virtual) CPU to flush stale linear mappings for either L2 or itself (L1).
+ *
+ * - however if VPID is disabled by L1, then a sync is needed as L1 expects all
+ * linear mappings (EPT is disabled so there are no combined or guest-physical
+ * mappings) to be invalidated on both VM-Enter and VM-Exit.
+ *
+ * Note, this logic is subtly different than nested_has_guest_tlb_tag(), which
+ * additionally checks that L2 has been assigned a VPID (when EPT is disabled).
+ * Whether or not L2 has been assigned a VPID by L0 is irrelevant with respect
+ * to L1's expectations, e.g. L0 needs to invalidate hardware TLB entries if L2
+ * doesn't have a unique VPID to prevent reusing L1's entries (assuming L1 has
+ * been assigned a VPID), but L0 doesn't need to do a MMU sync because L1
+ * doesn't expect stale (virtual) TLB entries to be flushed, i.e. L1 doesn't
+ * know that L0 will flush the TLB and so L1 will do INVVPID as needed to flush
+ * stale TLB entries, at which point L0 will sync L2's MMU.
+ */
+static bool nested_vmx_transition_mmu_sync(struct kvm_vcpu *vcpu)
+{
+ return !enable_ept && !nested_cpu_has_vpid(get_vmcs12(vcpu));
+}
+
+/*
+ * Load guest's/host's cr3 at nested entry/exit. @nested_ept is true if we are
+ * emulating VM-Entry into a guest with EPT enabled. On failure, the expected
+ * Exit Qualification (for a VM-Entry consistency check VM-Exit) is assigned to
+ * @entry_failure_code.
*/
static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept,
- u32 *entry_failure_code)
+ enum vm_entry_failure_code *entry_failure_code)
{
- if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) {
- if (CC(!nested_cr3_valid(vcpu, cr3))) {
- *entry_failure_code = ENTRY_FAIL_DEFAULT;
- return -EINVAL;
- }
+ if (CC(!nested_cr3_valid(vcpu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
- /*
- * If PAE paging and EPT are both on, CR3 is not used by the CPU and
- * must not be dereferenced.
- */
- if (is_pae_paging(vcpu) && !nested_ept) {
- if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
- *entry_failure_code = ENTRY_FAIL_PDPTE;
- return -EINVAL;
- }
+ /*
+ * If PAE paging and EPT are both on, CR3 is not used by the CPU and
+ * must not be dereferenced.
+ */
+ if (!nested_ept && is_pae_paging(vcpu) &&
+ (cr3 != kvm_read_cr3(vcpu) || pdptrs_changed(vcpu))) {
+ if (CC(!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))) {
+ *entry_failure_code = ENTRY_FAIL_PDPTE;
+ return -EINVAL;
}
}
- if (!nested_ept)
- kvm_mmu_new_cr3(vcpu, cr3, false);
+ /*
+ * Unconditionally skip the TLB flush on fast CR3 switch, all TLB
+ * flushes are handled by nested_vmx_transition_tlb_flush().
+ */
+ if (!nested_ept) {
+ kvm_mmu_new_pgd(vcpu, cr3, true, true);
+
+ /*
+ * A TLB flush on VM-Enter/VM-Exit flushes all linear mappings
+ * across all PCIDs, i.e. all PGDs need to be synchronized.
+ * See nested_vmx_transition_mmu_sync() for more details.
+ */
+ if (nested_vmx_transition_mmu_sync(vcpu))
+ kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
+ }
vcpu->arch.cr3 = cr3;
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
kvm_init_mmu(vcpu, false);
@@ -1024,7 +1169,9 @@
* populated by L2 differently than TLB entries populated
* by L1.
*
- * If L1 uses EPT, then TLB entries are tagged with different EPTP.
+ * If L0 uses EPT, L1 and L2 run with different EPTP because
+ * guest_mode is part of kvm_mmu_page_role. Thus, TLB entries
+ * are tagged with different EPTP.
*
* If L1 uses VPID and we allocated a vpid02, TLB entries are tagged
* with different VPID (L1 entries are tagged with vmx->vpid
@@ -1034,15 +1181,52 @@
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- return nested_cpu_has_ept(vmcs12) ||
+ return enable_ept ||
(nested_cpu_has_vpid(vmcs12) && to_vmx(vcpu)->nested.vpid02);
}
-static u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12,
+ bool is_vmenter)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+ /*
+ * If VPID is disabled, linear and combined mappings are flushed on
+ * VM-Enter/VM-Exit, and guest-physical mappings are valid only for
+ * their associated EPTP.
+ */
+ if (!enable_vpid)
+ return;
+
+ /*
+ * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings
+ * for *all* contexts to be flushed on VM-Enter/VM-Exit.
+ *
+ * If VPID is enabled and used by vmc12, but L2 does not have a unique
+ * TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
+ * a VPID for L2, flush the current context as the effective ASID is
+ * common to both L1 and L2.
+ *
+ * Defer the flush so that it runs after vmcs02.EPTP has been set by
+ * KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
+ * redundant flushes further down the nested pipeline.
+ *
+ * If a TLB flush isn't required due to any of the above, and vpid12 is
+ * changing then the new "virtual" VPID (vpid12) will reuse the same
+ * "real" VPID (vpid02), and so needs to be sync'd. There is no direct
+ * mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
+ * all nested vCPUs.
+ */
+ if (!nested_cpu_has_vpid(vmcs12)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ } else if (!nested_has_guest_tlb_tag(vcpu)) {
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
+ } else if (is_vmenter &&
+ vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
+ vmx->nested.last_vpid = vmcs12->virtual_processor_id;
+ vpid_sync_context(nested_get_vpid02(vcpu));
+ }
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -1606,10 +1790,6 @@
* vmcs12->vm_exit_msr_store_addr = evmcs->vm_exit_msr_store_addr;
* vmcs12->vm_exit_msr_load_addr = evmcs->vm_exit_msr_load_addr;
* vmcs12->vm_entry_msr_load_addr = evmcs->vm_entry_msr_load_addr;
- * vmcs12->cr3_target_value0 = evmcs->cr3_target_value0;
- * vmcs12->cr3_target_value1 = evmcs->cr3_target_value1;
- * vmcs12->cr3_target_value2 = evmcs->cr3_target_value2;
- * vmcs12->cr3_target_value3 = evmcs->cr3_target_value3;
* vmcs12->page_fault_error_code_mask =
* evmcs->page_fault_error_code_mask;
* vmcs12->page_fault_error_code_match =
@@ -1683,10 +1863,6 @@
* evmcs->vm_exit_msr_store_addr = vmcs12->vm_exit_msr_store_addr;
* evmcs->vm_exit_msr_load_addr = vmcs12->vm_exit_msr_load_addr;
* evmcs->vm_entry_msr_load_addr = vmcs12->vm_entry_msr_load_addr;
- * evmcs->cr3_target_value0 = vmcs12->cr3_target_value0;
- * evmcs->cr3_target_value1 = vmcs12->cr3_target_value1;
- * evmcs->cr3_target_value2 = vmcs12->cr3_target_value2;
- * evmcs->cr3_target_value3 = vmcs12->cr3_target_value3;
* evmcs->tpr_threshold = vmcs12->tpr_threshold;
* evmcs->virtual_processor_id = vmcs12->virtual_processor_id;
* evmcs->exception_bitmap = vmcs12->exception_bitmap;
@@ -1815,18 +1991,18 @@
* This is an equivalent of the nested hypervisor executing the vmptrld
* instruction.
*/
-static int nested_vmx_handle_enlightened_vmptrld(struct kvm_vcpu *vcpu,
- bool from_launch)
+static enum nested_evmptrld_status nested_vmx_handle_enlightened_vmptrld(
+ struct kvm_vcpu *vcpu, bool from_launch)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool evmcs_gpa_changed = false;
u64 evmcs_gpa;
if (likely(!vmx->nested.enlightened_vmcs_enabled))
- return 1;
+ return EVMPTRLD_DISABLED;
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
- return 1;
+ return EVMPTRLD_DISABLED;
if (unlikely(!vmx->nested.hv_evmcs ||
evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
@@ -1837,7 +2013,7 @@
if (kvm_vcpu_map(vcpu, gpa_to_gfn(evmcs_gpa),
&vmx->nested.hv_evmcs_map))
- return 0;
+ return EVMPTRLD_ERROR;
vmx->nested.hv_evmcs = vmx->nested.hv_evmcs_map.hva;
@@ -1866,7 +2042,7 @@
if ((vmx->nested.hv_evmcs->revision_id != KVM_EVMCS_VERSION) &&
(vmx->nested.hv_evmcs->revision_id != VMCS12_REVISION)) {
nested_release_evmcs(vcpu);
- return 0;
+ return EVMPTRLD_VMFAIL;
}
vmx->nested.dirty_vmcs12 = true;
@@ -1888,28 +2064,20 @@
}
/*
- * Clean fields data can't de used on VMLAUNCH and when we switch
+ * Clean fields data can't be used on VMLAUNCH and when we switch
* between different L2 guests as KVM keeps a single VMCS12 per L1.
*/
if (from_launch || evmcs_gpa_changed)
vmx->nested.hv_evmcs->hv_clean_fields &=
~HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
- return 1;
+ return EVMPTRLD_SUCCEEDED;
}
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- /*
- * hv_evmcs may end up being not mapped after migration (when
- * L2 was running), map it here to make sure vmcs12 changes are
- * properly reflected.
- */
- if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs)
- nested_vmx_handle_enlightened_vmptrld(vcpu, false);
-
if (vmx->nested.hv_evmcs) {
copy_vmcs12_to_enlightened(vmx);
/* All fields are clean */
@@ -1934,9 +2102,25 @@
return HRTIMER_NORESTART;
}
-static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu)
+static u64 vmx_calc_preemption_timer_value(struct kvm_vcpu *vcpu)
{
- u64 preemption_timeout = get_vmcs12(vcpu)->vmx_preemption_timer_value;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ u64 l1_scaled_tsc = kvm_read_l1_tsc(vcpu, rdtsc()) >>
+ VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE;
+
+ if (!vmx->nested.has_preemption_timer_deadline) {
+ vmx->nested.preemption_timer_deadline =
+ vmcs12->vmx_preemption_timer_value + l1_scaled_tsc;
+ vmx->nested.has_preemption_timer_deadline = true;
+ }
+ return vmx->nested.preemption_timer_deadline - l1_scaled_tsc;
+}
+
+static void vmx_start_preemption_timer(struct kvm_vcpu *vcpu,
+ u64 preemption_timeout)
+{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
@@ -1955,7 +2139,8 @@
preemption_timeout *= 1000000;
do_div(preemption_timeout, vcpu->arch.virtual_tsc_khz);
hrtimer_start(&vmx->nested.preemption_timer,
- ns_to_ktime(preemption_timeout), HRTIMER_MODE_REL);
+ ktime_add_ns(ktime_get(), preemption_timeout),
+ HRTIMER_MODE_ABS_PINNED);
}
static u64 nested_vmx_calc_efer(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
@@ -1987,7 +2172,8 @@
* consistency checks.
*/
if (enable_ept && nested_early_check)
- vmcs_write64(EPT_POINTER, construct_eptp(&vmx->vcpu, 0));
+ vmcs_write64(EPT_POINTER,
+ construct_eptp(&vmx->vcpu, 0, PT64_ROOT_4LEVEL));
/* All VMFUNCs are currently emulated through L0 vmexits. */
if (cpu_has_vmx_vmfunc())
@@ -2019,7 +2205,7 @@
* addresses are constant (for vmcs02), the counts can change based
* on L2's behavior, e.g. switching to/from long mode.
*/
- vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0);
+ vmcs_write64(VM_EXIT_MSR_STORE_ADDR, __pa(vmx->msr_autostore.guest.val));
vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
@@ -2068,11 +2254,12 @@
* EXEC CONTROLS
*/
exec_control = vmx_exec_control(vmx); /* L0's desires */
- exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
- exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
+ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
+ exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
exec_control |= vmcs12->cpu_based_vm_exec_control;
+ vmx->nested.l1_tpr_threshold = -1;
if (exec_control & CPU_BASED_TPR_SHADOW)
vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold);
#ifdef CONFIG_X86_64
@@ -2108,7 +2295,7 @@
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
SECONDARY_EXEC_ENABLE_INVPCID |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2136,6 +2323,9 @@
vmcs_write16(GUEST_INTR_STATUS,
vmcs12->guest_intr_status);
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST))
+ exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
+
secondary_exec_controls_set(vmx, exec_control);
}
@@ -2263,22 +2453,28 @@
/*
* Whether page-faults are trapped is determined by a combination of
- * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF.
- * If enable_ept, L0 doesn't care about page faults and we should
- * set all of these to L1's desires. However, if !enable_ept, L0 does
- * care about (at least some) page faults, and because it is not easy
- * (if at all possible?) to merge L0 and L1's desires, we simply ask
- * to exit on each and every L2 page fault. This is done by setting
- * MASK=MATCH=0 and (see below) EB.PF=1.
+ * 3 settings: PFEC_MASK, PFEC_MATCH and EXCEPTION_BITMAP.PF. If L0
+ * doesn't care about page faults then we should set all of these to
+ * L1's desires. However, if L0 does care about (some) page faults, it
+ * is not easy (if at all possible?) to merge L0 and L1's desires, we
+ * simply ask to exit on each and every L2 page fault. This is done by
+ * setting MASK=MATCH=0 and (see below) EB.PF=1.
* Note that below we don't need special code to set EB.PF beyond the
* "or"ing of the EB of vmcs01 and vmcs12, because when enable_ept,
* vmcs01's EB.PF is 0 so the "or" will take vmcs12's value, and when
* !enable_ept, EB.PF is 1, so the "or" will always be 1.
*/
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK,
- enable_ept ? vmcs12->page_fault_error_code_mask : 0);
- vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH,
- enable_ept ? vmcs12->page_fault_error_code_match : 0);
+ if (vmx_need_pf_intercept(&vmx->vcpu)) {
+ /*
+ * TODO: if both L0 and L1 need the same MASK and MATCH,
+ * go ahead and use it?
+ */
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, 0);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, 0);
+ } else {
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, vmcs12->page_fault_error_code_mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, vmcs12->page_fault_error_code_match);
+ }
if (cpu_has_vmx_apicv()) {
vmcs_write64(EOI_EXIT_BITMAP0, vmcs12->eoi_exit_bitmap0);
@@ -2287,6 +2483,13 @@
vmcs_write64(EOI_EXIT_BITMAP3, vmcs12->eoi_exit_bitmap3);
}
+ /*
+ * Make sure the msr_autostore list is up to date before we set the
+ * count in the vmcs02.
+ */
+ prepare_vmx_msr_autostore_list(&vmx->vcpu, MSR_IA32_TSC);
+
+ vmcs_write32(VM_EXIT_MSR_STORE_COUNT, vmx->msr_autostore.guest.nr);
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
@@ -2305,7 +2508,7 @@
* is assigned to entry_failure_code on failure.
*/
static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 *entry_failure_code)
+ enum vm_entry_failure_code *entry_failure_code)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct hv_enlightened_vmcs *hv_evmcs = vmx->nested.hv_evmcs;
@@ -2354,38 +2557,10 @@
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
- if (enable_vpid) {
- /*
- * There is no direct mapping between vpid02 and vpid12, the
- * vpid02 is per-vCPU for L0 and reused while the value of
- * vpid12 is changed w/ one invvpid during nested vmentry.
- * The vpid12 is allocated by L1 for L2, so it will not
- * influence global bitmap(for vpid01 and vpid02 allocation)
- * even if spawn a lot of nested vCPUs.
- */
- if (nested_cpu_has_vpid(vmcs12) && nested_has_guest_tlb_tag(vcpu)) {
- if (vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
- vmx->nested.last_vpid = vmcs12->virtual_processor_id;
- __vmx_flush_tlb(vcpu, nested_get_vpid02(vcpu), false);
- }
- } else {
- /*
- * If L1 use EPT, then L0 needs to execute INVEPT on
- * EPTP02 instead of EPTP01. Therefore, delay TLB
- * flush until vmcs02->eptp is fully updated by
- * KVM_REQ_LOAD_CR3. Note that this assumes
- * KVM_REQ_TLB_FLUSH is evaluated after
- * KVM_REQ_LOAD_CR3 in vcpu_enter_guest().
- */
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
- }
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, true);
if (nested_cpu_has_ept(vmcs12))
nested_ept_init_mmu_context(vcpu);
- else if (nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
- vmx_flush_tlb(vcpu, true);
/*
* This sets GUEST_CR0 to vmcs12->guest_cr0, possibly modifying those
@@ -2410,7 +2585,7 @@
* which means L1 attempted VMEntry to L2 with invalid state.
* Fail the VMEntry.
*/
- if (vmx->emulation_required) {
+ if (CC(!vmx_guest_state_valid(vcpu))) {
*entry_failure_code = ENTRY_FAIL_DEFAULT;
return -EINVAL;
}
@@ -2423,7 +2598,7 @@
/*
* Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
* on nested VM-Exit, which can occur without actually running L2 and
- * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+ * thus without hitting vmx_load_mmu_pgd(), e.g. if L1 is entering L2 with
* vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
* transition to HLT instead of running L2.
*/
@@ -2442,6 +2617,13 @@
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = vmx_inject_page_fault_nested;
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->guest_ia32_perf_global_ctrl))) {
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
+ return -EINVAL;
+ }
+
kvm_rsp_write(vcpu, vmcs12->guest_rsp);
kvm_rip_write(vcpu, vmcs12->guest_rip);
return 0;
@@ -2454,19 +2636,19 @@
return -EINVAL;
if (CC(!nested_cpu_has_virtual_nmis(vmcs12) &&
- nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING)))
+ nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING)))
return -EINVAL;
return 0;
}
-static bool valid_ept_address(struct kvm_vcpu *vcpu, u64 address)
+static bool nested_vmx_check_eptp(struct kvm_vcpu *vcpu, u64 new_eptp)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int maxphyaddr = cpuid_maxphyaddr(vcpu);
/* Check for memory type validity */
- switch (address & VMX_EPTP_MT_MASK) {
+ switch (new_eptp & VMX_EPTP_MT_MASK) {
case VMX_EPTP_MT_UC:
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPTP_UC_BIT)))
return false;
@@ -2479,16 +2661,26 @@
return false;
}
- /* only 4 levels page-walk length are valid */
- if (CC((address & VMX_EPTP_PWL_MASK) != VMX_EPTP_PWL_4))
+ /* Page-walk levels validity. */
+ switch (new_eptp & VMX_EPTP_PWL_MASK) {
+ case VMX_EPTP_PWL_5:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_5_BIT)))
+ return false;
+ break;
+ case VMX_EPTP_PWL_4:
+ if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_PAGE_WALK_4_BIT)))
+ return false;
+ break;
+ default:
return false;
+ }
/* Reserved bits should not be set */
- if (CC(address >> maxphyaddr || ((address >> 7) & 0x1f)))
+ if (CC(new_eptp >> maxphyaddr || ((new_eptp >> 7) & 0x1f)))
return false;
/* AD, if set, should be supported */
- if (address & VMX_EPTP_AD_ENABLE_BIT) {
+ if (new_eptp & VMX_EPTP_AD_ENABLE_BIT) {
if (CC(!(vmx->nested.msrs.ept_caps & VMX_EPT_AD_BIT)))
return false;
}
@@ -2537,7 +2729,7 @@
return -EINVAL;
if (nested_cpu_has_ept(vmcs12) &&
- CC(!valid_ept_address(vcpu, vmcs12->ept_pointer)))
+ CC(!nested_vmx_check_eptp(vcpu, vmcs12->ept_pointer)))
return -EINVAL;
if (nested_cpu_has_vmfunc(vmcs12)) {
@@ -2655,6 +2847,20 @@
nested_check_vm_entry_controls(vcpu, vmcs12))
return -EINVAL;
+ if (to_vmx(vcpu)->nested.enlightened_vmcs_enabled)
+ return nested_evmcs_check_controls(vmcs12);
+
+ return 0;
+}
+
+static int nested_vmx_check_address_space_size(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+#ifdef CONFIG_X86_64
+ if (CC(!!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) !=
+ !!(vcpu->arch.efer & EFER_LMA)))
+ return -EINVAL;
+#endif
return 0;
}
@@ -2676,19 +2882,22 @@
CC(!kvm_pat_valid(vmcs12->host_ia32_pat)))
return -EINVAL;
+ if ((vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->host_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
#ifdef CONFIG_X86_64
- ia32e = !!(vcpu->arch.efer & EFER_LMA);
+ ia32e = !!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE);
#else
ia32e = false;
#endif
if (ia32e) {
- if (CC(!(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE)) ||
- CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
+ if (CC(!(vmcs12->host_cr4 & X86_CR4_PAE)))
return -EINVAL;
} else {
- if (CC(vmcs12->vm_exit_controls & VM_EXIT_HOST_ADDR_SPACE_SIZE) ||
- CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
+ if (CC(vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) ||
CC(vmcs12->host_cr4 & X86_CR4_PCIDE) ||
CC((vmcs12->host_rip) >> 32))
return -EINVAL;
@@ -2706,7 +2915,6 @@
CC(vmcs12->host_ss_selector == 0 && !ia32e))
return -EINVAL;
-#ifdef CONFIG_X86_64
if (CC(is_noncanonical_address(vmcs12->host_fs_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_gs_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_gdtr_base, vcpu)) ||
@@ -2714,7 +2922,6 @@
CC(is_noncanonical_address(vmcs12->host_tr_base, vcpu)) ||
CC(is_noncanonical_address(vmcs12->host_rip, vcpu)))
return -EINVAL;
-#endif
/*
* If the load IA32_EFER VM-exit control is 1, bits reserved in the
@@ -2772,25 +2979,34 @@
static int nested_vmx_check_guest_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12,
- u32 *exit_qual)
+ enum vm_entry_failure_code *entry_failure_code)
{
bool ia32e;
- *exit_qual = ENTRY_FAIL_DEFAULT;
+ *entry_failure_code = ENTRY_FAIL_DEFAULT;
if (CC(!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0)) ||
CC(!nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)))
return -EINVAL;
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS) &&
+ CC(!kvm_dr7_valid(vmcs12->guest_dr7)))
+ return -EINVAL;
+
if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PAT) &&
CC(!kvm_pat_valid(vmcs12->guest_ia32_pat)))
return -EINVAL;
if (nested_vmx_check_vmcs_link_ptr(vcpu, vmcs12)) {
- *exit_qual = ENTRY_FAIL_VMCS_LINK_PTR;
+ *entry_failure_code = ENTRY_FAIL_VMCS_LINK_PTR;
return -EINVAL;
}
+ if ((vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL) &&
+ CC(!kvm_valid_perf_global_ctrl(vcpu_to_pmu(vcpu),
+ vmcs12->guest_ia32_perf_global_ctrl)))
+ return -EINVAL;
+
/*
* If the load IA32_EFER VM-entry control is 1, the following checks
* are performed on the field for the IA32_EFER MSR:
@@ -2842,7 +3058,7 @@
/*
* Induce a consistency check VMExit by clearing bit 1 in GUEST_RFLAGS,
* which is reserved to '1' by hardware. GUEST_RFLAGS is guaranteed to
- * be written (by preparve_vmcs02()) before the "real" VMEnter, i.e.
+ * be written (by prepare_vmcs02()) before the "real" VMEnter, i.e.
* there is no need to preserve other bits or save/restore the field.
*/
vmcs_writel(GUEST_RFLAGS, 0);
@@ -2908,9 +3124,9 @@
/*
* VMExit clears RFLAGS.IF and DR7, even on a consistency check.
*/
- local_irq_enable();
if (hw_breakpoint_active())
set_debugreg(__this_cpu_read(cpu_dr7), 7);
+ local_irq_enable();
preempt_enable();
/*
@@ -2926,8 +3142,26 @@
return 0;
}
-static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12);
+static bool nested_get_evmcs_page(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ /*
+ * hv_evmcs may end up being not mapped after migration (when
+ * L2 was running), map it here to make sure vmcs12 changes are
+ * properly reflected.
+ */
+ if (vmx->nested.enlightened_vmcs_enabled && !vmx->nested.hv_evmcs) {
+ enum nested_evmptrld_status evmptrld_status =
+ nested_vmx_handle_enlightened_vmptrld(vcpu, false);
+
+ if (evmptrld_status == EVMPTRLD_VMFAIL ||
+ evmptrld_status == EVMPTRLD_ERROR)
+ return false;
+ }
+
+ return true;
+}
static bool nested_get_vmcs12_pages(struct kvm_vcpu *vcpu)
{
@@ -2945,7 +3179,7 @@
* to it so we can release it later.
*/
if (vmx->nested.apic_access_page) { /* shouldn't happen */
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
page = kvm_vcpu_gpa_to_page(vcpu, vmcs12->apic_access_addr);
@@ -3005,9 +3239,66 @@
exec_controls_setbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
else
exec_controls_clearbit(vmx, CPU_BASED_USE_MSR_BITMAPS);
+
return true;
}
+static bool vmx_get_nested_state_pages(struct kvm_vcpu *vcpu)
+{
+ if (!nested_get_evmcs_page(vcpu)) {
+ pr_debug_ratelimited("%s: enlightened vmptrld failed\n",
+ __func__);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_EMULATION;
+ vcpu->run->internal.ndata = 0;
+
+ return false;
+ }
+
+ if (is_guest_mode(vcpu) && !nested_get_vmcs12_pages(vcpu))
+ return false;
+
+ return true;
+}
+
+static int nested_vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
+{
+ struct vmcs12 *vmcs12;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ gpa_t dst;
+
+ if (WARN_ON_ONCE(!is_guest_mode(vcpu)))
+ return 0;
+
+ if (WARN_ON_ONCE(vmx->nested.pml_full))
+ return 1;
+
+ /*
+ * Check if PML is enabled for the nested guest. Whether eptp bit 6 is
+ * set is already checked as part of A/D emulation.
+ */
+ vmcs12 = get_vmcs12(vcpu);
+ if (!nested_cpu_has_pml(vmcs12))
+ return 0;
+
+ if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
+ vmx->nested.pml_full = true;
+ return 1;
+ }
+
+ gpa &= ~0xFFFull;
+ dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
+
+ if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
+ offset_in_page(dst), sizeof(gpa)))
+ return 0;
+
+ vmcs12->guest_pml_index--;
+
+ return 0;
+}
+
/*
* Intel's VMX Instruction Reference specifies a common set of prerequisites
* for running VMX instructions (except VMXON, whose prerequisites are
@@ -3046,22 +3337,29 @@
* or KVM_SET_NESTED_STATE). Otherwise it's called from vmlaunch/vmresume.
*
* Returns:
- * NVMX_ENTRY_SUCCESS: Entered VMX non-root mode
- * NVMX_ENTRY_VMFAIL: Consistency check VMFail
- * NVMX_ENTRY_VMEXIT: Consistency check VMExit
- * NVMX_ENTRY_KVM_INTERNAL_ERROR: KVM internal error
+ * NVMX_VMENTRY_SUCCESS: Entered VMX non-root mode
+ * NVMX_VMENTRY_VMFAIL: Consistency check VMFail
+ * NVMX_VMENTRY_VMEXIT: Consistency check VMExit
+ * NVMX_VMENTRY_KVM_INTERNAL_ERROR: KVM internal error
*/
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ enum vm_entry_failure_code entry_failure_code;
bool evaluate_pending_interrupts;
- u32 exit_reason = EXIT_REASON_INVALID_STATE;
- u32 exit_qual;
+ union vmx_exit_reason exit_reason = {
+ .basic = EXIT_REASON_INVALID_STATE,
+ .failed_vmentry = 1,
+ };
+ u32 failed_index;
+
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
- (CPU_BASED_VIRTUAL_INTR_PENDING | CPU_BASED_VIRTUAL_NMI_PENDING);
+ (CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
@@ -3105,24 +3403,33 @@
return NVMX_VMENTRY_VMFAIL;
}
- if (nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+ if (nested_vmx_check_guest_state(vcpu, vmcs12,
+ &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit;
+ }
}
enter_guest_mode(vcpu);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset += vmcs12->tsc_offset;
- if (prepare_vmcs02(vcpu, vmcs12, &exit_qual))
+ if (prepare_vmcs02(vcpu, vmcs12, &entry_failure_code)) {
+ exit_reason.basic = EXIT_REASON_INVALID_STATE;
+ vmcs12->exit_qualification = entry_failure_code;
goto vmentry_fail_vmexit_guest_mode;
+ }
if (from_vmentry) {
- exit_reason = EXIT_REASON_MSR_LOAD_FAIL;
- exit_qual = nested_vmx_load_msr(vcpu,
- vmcs12->vm_entry_msr_load_addr,
- vmcs12->vm_entry_msr_load_count);
- if (exit_qual)
+ failed_index = nested_vmx_load_msr(vcpu,
+ vmcs12->vm_entry_msr_load_addr,
+ vmcs12->vm_entry_msr_load_count);
+ if (failed_index) {
+ exit_reason.basic = EXIT_REASON_MSR_LOAD_FAIL;
+ vmcs12->exit_qualification = failed_index;
goto vmentry_fail_vmexit_guest_mode;
+ }
} else {
/*
* The MMU is not initialized to point at the right entities yet and
@@ -3131,7 +3438,7 @@
* to nested_get_vmcs12_pages before the next VM-entry. The MSRs
* have already been set at vmentry time and should not be reset.
*/
- kvm_make_request(KVM_REQ_GET_VMCS12_PAGES, vcpu);
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
}
/*
@@ -3157,8 +3464,10 @@
* the timer.
*/
vmx->nested.preemption_timer_expired = false;
- if (nested_cpu_has_preemption_timer(vmcs12))
- vmx_start_preemption_timer(vcpu);
+ if (nested_cpu_has_preemption_timer(vmcs12)) {
+ u64 timer_value = vmx_calc_preemption_timer_value(vcpu);
+ vmx_start_preemption_timer(vcpu, timer_value);
+ }
/*
* Note no nested_vmx_succeed or nested_vmx_fail here. At this point
@@ -3174,7 +3483,7 @@
* 26.7 "VM-entry failures during or after loading guest state".
*/
vmentry_fail_vmexit_guest_mode:
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
leave_guest_mode(vcpu);
@@ -3185,8 +3494,7 @@
return NVMX_VMENTRY_VMEXIT;
load_vmcs12_host_state(vcpu, vmcs12);
- vmcs12->vm_exit_reason = exit_reason | VMX_EXIT_REASONS_FAILED_VMENTRY;
- vmcs12->exit_qualification = exit_qual;
+ vmcs12->vm_exit_reason = exit_reason.full;
if (enable_shadow_vmcs || vmx->nested.hv_evmcs)
vmx->nested.need_vmcs12_to_shadow_sync = true;
return NVMX_VMENTRY_VMEXIT;
@@ -3202,14 +3510,20 @@
enum nvmx_vmentry_status status;
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 interrupt_shadow = vmx_get_interrupt_shadow(vcpu);
+ enum nested_evmptrld_status evmptrld_status;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (!nested_vmx_handle_enlightened_vmptrld(vcpu, launch))
+ evmptrld_status = nested_vmx_handle_enlightened_vmptrld(vcpu, launch);
+ if (evmptrld_status == EVMPTRLD_ERROR) {
+ kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
+ } else if (CC(evmptrld_status == EVMPTRLD_VMFAIL)) {
+ return nested_vmx_failInvalid(vcpu);
+ }
- if (!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull)
+ if (CC(!vmx->nested.hv_evmcs && vmx->nested.current_vmptr == -1ull))
return nested_vmx_failInvalid(vcpu);
vmcs12 = get_vmcs12(vcpu);
@@ -3220,7 +3534,7 @@
* rather than RFLAGS.ZF, and no error number is stored to the
* VM-instruction error field.
*/
- if (vmcs12->hdr.shadow_vmcs)
+ if (CC(vmcs12->hdr.shadow_vmcs))
return nested_vmx_failInvalid(vcpu);
if (vmx->nested.hv_evmcs) {
@@ -3241,30 +3555,41 @@
* for misconfigurations which will anyway be caught by the processor
* when using the merged vmcs02.
*/
- if (interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS)
- return nested_vmx_failValid(vcpu,
- VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
+ if (CC(interrupt_shadow & KVM_X86_SHADOW_INT_MOV_SS))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_EVENTS_BLOCKED_BY_MOV_SS);
- if (vmcs12->launch_state == launch)
- return nested_vmx_failValid(vcpu,
+ if (CC(vmcs12->launch_state == launch))
+ return nested_vmx_fail(vcpu,
launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS
: VMXERR_VMRESUME_NONLAUNCHED_VMCS);
if (nested_vmx_check_controls(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+
+ if (nested_vmx_check_address_space_size(vcpu, vmcs12))
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
if (nested_vmx_check_host_state(vcpu, vmcs12))
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD);
/*
* We're finally done with prerequisite checking, and can start with
* the nested entry.
*/
vmx->nested.nested_run_pending = 1;
+ vmx->nested.has_preemption_timer_deadline = false;
status = nested_vmx_enter_non_root_mode(vcpu, true);
if (unlikely(status != NVMX_VMENTRY_SUCCESS))
goto vmentry_failed;
+ /* Emulate processing of posted interrupts on VM-Enter. */
+ if (nested_cpu_has_posted_intr(vmcs12) &&
+ kvm_apic_has_interrupt(vcpu) == vmx->nested.posted_intr_nv) {
+ vmx->nested.pi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+ kvm_apic_clear_irr(vcpu, vmx->nested.posted_intr_nv);
+ }
+
/* Hide L1D cache contents from the nested guest. */
vmx->vcpu.arch.l1tf_flush_l1d = true;
@@ -3287,8 +3612,8 @@
*/
if ((vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT) &&
!(vmcs12->vm_entry_intr_info_field & INTR_INFO_VALID_MASK) &&
- !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_NMI_PENDING) &&
- !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_VIRTUAL_INTR_PENDING) &&
+ !(vmcs12->cpu_based_vm_exec_control & CPU_BASED_NMI_WINDOW_EXITING) &&
+ !((vmcs12->cpu_based_vm_exec_control & CPU_BASED_INTR_WINDOW_EXITING) &&
(vmcs12->guest_rflags & X86_EFLAGS_IF))) {
vmx->nested.nested_run_pending = 0;
return kvm_vcpu_halt(vcpu);
@@ -3302,12 +3627,12 @@
if (status == NVMX_VMENTRY_VMEXIT)
return 1;
WARN_ON_ONCE(status != NVMX_VMENTRY_VMFAIL);
- return nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ return nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
}
/*
* On a nested exit from L2 to L1, vmcs12.guest_cr0 might not be up-to-date
- * because L2 may have changed some cr0 bits directly (CRO_GUEST_HOST_MASK).
+ * because L2 may have changed some cr0 bits directly (CR0_GUEST_HOST_MASK).
* This function returns the new value we should put in vmcs12.guest_cr0.
* It's not enough to just return the vmcs02 GUEST_CR0. Rather,
* 1. Bits that neither L0 nor L1 trapped, were set directly by L2 and are now
@@ -3385,7 +3710,7 @@
}
-static void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gfn_t gfn;
@@ -3463,41 +3788,113 @@
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}
+/*
+ * Returns true if a debug trap is pending delivery.
+ *
+ * In KVM, debug traps bear an exception payload. As such, the class of a #DB
+ * exception may be inferred from the presence of an exception payload.
+ */
+static inline bool vmx_pending_dbg_trap(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.exception.pending &&
+ vcpu->arch.exception.nr == DB_VECTOR &&
+ vcpu->arch.exception.payload;
+}
+
+/*
+ * Certain VM-exits set the 'pending debug exceptions' field to indicate a
+ * recognized #DB (data or single-step) that has yet to be delivered. Since KVM
+ * represents these debug traps with a payload that is said to be compatible
+ * with the 'pending debug exceptions' field, write the payload to the VMCS
+ * field if a VM-exit is delivered before the debug trap.
+ */
+static void nested_vmx_update_pending_dbg(struct kvm_vcpu *vcpu)
+{
+ if (vmx_pending_dbg_trap(vcpu))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vcpu->arch.exception.payload);
+}
+
+static bool nested_vmx_preemption_timer_pending(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
+ to_vmx(vcpu)->nested.preemption_timer_expired;
+}
+
static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
bool block_nested_events =
vmx->nested.nested_run_pending || kvm_event_needs_reinjection(vcpu);
+ bool mtf_pending = vmx->nested.mtf_pending;
struct kvm_lapic *apic = vcpu->arch.apic;
+ /*
+ * Clear the MTF state. If a higher priority VM-exit is delivered first,
+ * this state is discarded.
+ */
+ if (!block_nested_events)
+ vmx->nested.mtf_pending = false;
+
if (lapic_in_kernel(vcpu) &&
test_bit(KVM_APIC_INIT, &apic->pending_events)) {
if (block_nested_events)
return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ clear_bit(KVM_APIC_INIT, &apic->pending_events);
nested_vmx_vmexit(vcpu, EXIT_REASON_INIT_SIGNAL, 0, 0);
return 0;
}
- if (vcpu->arch.exception.pending &&
- nested_vmx_check_exception(vcpu, &exit_qual)) {
+ /*
+ * Process any exceptions that are not debug traps before MTF.
+ */
+ if (vcpu->arch.exception.pending && !vmx_pending_dbg_trap(vcpu)) {
if (block_nested_events)
return -EBUSY;
+ if (!nested_vmx_check_exception(vcpu, &exit_qual))
+ goto no_vmexit;
nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
return 0;
}
- if (nested_cpu_has_preemption_timer(get_vmcs12(vcpu)) &&
- vmx->nested.preemption_timer_expired) {
+ if (mtf_pending) {
+ if (block_nested_events)
+ return -EBUSY;
+ nested_vmx_update_pending_dbg(vcpu);
+ nested_vmx_vmexit(vcpu, EXIT_REASON_MONITOR_TRAP_FLAG, 0, 0);
+ return 0;
+ }
+
+ if (vcpu->arch.exception.pending) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_vmx_check_exception(vcpu, &exit_qual))
+ goto no_vmexit;
+ nested_vmx_inject_exception_vmexit(vcpu, exit_qual);
+ return 0;
+ }
+
+ if (nested_vmx_preemption_timer_pending(vcpu)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_PREEMPTION_TIMER, 0, 0);
return 0;
}
- if (vcpu->arch.nmi_pending && nested_exit_on_nmi(vcpu)) {
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
if (block_nested_events)
return -EBUSY;
+ goto no_vmexit;
+ }
+
+ if (vcpu->arch.nmi_pending && !vmx_nmi_blocked(vcpu)) {
+ if (block_nested_events)
+ return -EBUSY;
+ if (!nested_exit_on_nmi(vcpu))
+ goto no_vmexit;
+
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI,
NMI_VECTOR | INTR_TYPE_NMI_INTR |
INTR_INFO_VALID_MASK, 0);
@@ -3510,13 +3907,16 @@
return 0;
}
- if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu) && !vmx_interrupt_blocked(vcpu)) {
if (block_nested_events)
return -EBUSY;
+ if (!nested_exit_on_intr(vcpu))
+ goto no_vmexit;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
return 0;
}
+no_vmexit:
vmx_complete_nested_posted_interrupt(vcpu);
return 0;
}
@@ -3643,12 +4043,12 @@
cpu = get_cpu();
vmx->loaded_vmcs = &vmx->nested.vmcs02;
- vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->vmcs01);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
vmx->loaded_vmcs = &vmx->vmcs01;
- vmx_vcpu_load(&vmx->vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, &vmx->nested.vmcs02);
put_cpu();
}
@@ -3677,10 +4077,6 @@
vmcs12->guest_cs_ar_bytes = vmcs_read32(GUEST_CS_AR_BYTES);
vmcs12->guest_ss_ar_bytes = vmcs_read32(GUEST_SS_AR_BYTES);
- vmcs12->guest_sysenter_cs = vmcs_read32(GUEST_SYSENTER_CS);
- vmcs12->guest_sysenter_esp = vmcs_readl(GUEST_SYSENTER_ESP);
- vmcs12->guest_sysenter_eip = vmcs_readl(GUEST_SYSENTER_EIP);
-
vmcs12->guest_interruptibility_info =
vmcs_read32(GUEST_INTERRUPTIBILITY_INFO);
@@ -3690,9 +4086,10 @@
vmcs12->guest_activity_state = GUEST_ACTIVITY_ACTIVE;
if (nested_cpu_has_preemption_timer(vmcs12) &&
- vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER)
- vmcs12->vmx_preemption_timer_value =
- vmx_get_preemption_timer_value(vcpu);
+ vmcs12->vm_exit_controls & VM_EXIT_SAVE_VMX_PREEMPTION_TIMER &&
+ !vmx->nested.nested_run_pending)
+ vmcs12->vmx_preemption_timer_value =
+ vmx_get_preemption_timer_value(vcpu);
/*
* In some cases (usually, nested EPT), L2 is allowed to change its
@@ -3740,11 +4137,11 @@
* which already writes to vmcs12 directly.
*/
static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12,
- u32 exit_reason, u32 exit_intr_info,
+ u32 vm_exit_reason, u32 exit_intr_info,
unsigned long exit_qualification)
{
/* update exit information fields: */
- vmcs12->vm_exit_reason = exit_reason;
+ vmcs12->vm_exit_reason = vm_exit_reason;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
@@ -3799,8 +4196,8 @@
static void load_vmcs12_host_state(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
+ enum vm_entry_failure_code ignored;
struct kvm_segment seg;
- u32 entry_failure_code;
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
vcpu->arch.efer = vmcs12->host_ia32_efer;
@@ -3822,7 +4219,7 @@
* CR0_GUEST_HOST_MASK is already set in the original vmcs01
* (KVM doesn't change it);
*/
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
vmx_set_cr0(vcpu, vmcs12->host_cr0);
/* Same as above - no reason to call set_cr4_guest_host_mask(). */
@@ -3835,30 +4232,13 @@
* Only PDPTE load can fail as the value of cr3 was checked on entry and
* couldn't have changed.
*/
- if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code))
+ if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &ignored))
nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL);
if (!enable_ept)
vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
- /*
- * If vmcs01 doesn't use VPID, CPU flushes TLB on every
- * VMEntry/VMExit. Thus, no need to flush TLB.
- *
- * If vmcs12 doesn't use VPID, L1 expects TLB to be
- * flushed on every VMEntry/VMExit.
- *
- * Otherwise, we can preserve TLB entries as long as we are
- * able to tag L1 TLB entries differently than L2 TLB entries.
- *
- * If vmcs12 uses EPT, we need to execute this flush on EPTP01
- * and therefore we request the TLB flush to happen only after VMCS EPTP
- * has been set by KVM_REQ_LOAD_CR3.
- */
- if (enable_vpid &&
- (!nested_cpu_has_vpid(vmcs12) || !nested_has_guest_tlb_tag(vcpu))) {
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
+ nested_vmx_transition_tlb_flush(vcpu, vmcs12, false);
vmcs_write32(GUEST_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
vmcs_writel(GUEST_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
@@ -3877,8 +4257,8 @@
vcpu->arch.pat = vmcs12->host_ia32_pat;
}
if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
- vmcs_write64(GUEST_IA32_PERF_GLOBAL_CTRL,
- vmcs12->host_ia32_perf_global_ctrl);
+ WARN_ON_ONCE(kvm_set_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL,
+ vmcs12->host_ia32_perf_global_ctrl));
/* Set L1 segment info according to Intel SDM
27.5.2 Loading Host Segment and Descriptor-Table Registers */
@@ -3939,7 +4319,7 @@
static inline u64 nested_vmx_get_vmcs01_guest_efer(struct vcpu_vmx *vmx)
{
- struct shared_msr_entry *efer_msr;
+ struct vmx_uret_msr *efer_msr;
unsigned int i;
if (vm_entry_controls_get(vmx) & VM_ENTRY_LOAD_IA32_EFER)
@@ -3953,7 +4333,7 @@
return vmx->msr_autoload.guest.val[i].value;
}
- efer_msr = find_msr_entry(vmx, MSR_EFER);
+ efer_msr = vmx_find_uret_msr(vmx, MSR_EFER);
if (efer_msr)
return efer_msr->data;
@@ -3989,7 +4369,7 @@
*/
vmx_set_efer(vcpu, nested_vmx_get_vmcs01_guest_efer(vmx));
- vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
+ vcpu->arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
vmx_set_cr0(vcpu, vmcs_readl(CR0_READ_SHADOW));
vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK);
@@ -3997,7 +4377,7 @@
nested_ept_uninit_mmu_context(vcpu);
vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR3);
/*
* Use ept_save_pdptrs(vcpu) to load the MMU's cached PDPTRs
@@ -4005,7 +4385,7 @@
* VMFail, like everything else we just need to ensure our
* software model is up-to-date.
*/
- if (enable_ept)
+ if (enable_ept && is_pae_paging(vcpu))
ept_save_pdptrs(vcpu);
kvm_mmu_reset_context(vcpu);
@@ -4073,7 +4453,7 @@
* and modify vmcs12 to make it see what it would expect to see there if
* L2 was its real guest. Must only be called when in L2 (is_guest_mode())
*/
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
u32 exit_intr_info, unsigned long exit_qualification)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4082,20 +4462,42 @@
/* trying to cancel vmlaunch/vmresume is a bug */
WARN_ON_ONCE(vmx->nested.nested_run_pending);
+ if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) {
+ /*
+ * KVM_REQ_GET_NESTED_STATE_PAGES is also used to map
+ * Enlightened VMCS after migration and we still need to
+ * do that when something is forcing L2->L1 exit prior to
+ * the first L2 run.
+ */
+ (void)nested_get_evmcs_page(vcpu);
+ }
+
+ /* Service the TLB flush request for L2 before switching to L1. */
+ if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
+ kvm_vcpu_flush_tlb_current(vcpu);
+
+ /*
+ * VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
+ * now and the new vmentry. Ensure that the VMCS02 PDPTR fields are
+ * up-to-date before switching to L1.
+ */
+ if (enable_ept && is_pae_paging(vcpu))
+ vmx_ept_load_pdptrs(vcpu);
+
leave_guest_mode(vcpu);
if (nested_cpu_has_preemption_timer(vmcs12))
hrtimer_cancel(&to_vmx(vcpu)->nested.preemption_timer);
- if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
+ if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING)
vcpu->arch.tsc_offset -= vmcs12->tsc_offset;
if (likely(!vmx->fail)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
- if (exit_reason != -1)
- prepare_vmcs12(vcpu, vmcs12, exit_reason, exit_intr_info,
- exit_qualification);
+ if (vm_exit_reason != -1)
+ prepare_vmcs12(vcpu, vmcs12, vm_exit_reason,
+ exit_intr_info, exit_qualification);
/*
* Must happen outside of sync_vmcs02_to_vmcs12() as it will
@@ -4125,6 +4527,8 @@
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.guest.nr);
vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
+ if (vmx->nested.l1_tpr_threshold != -1)
+ vmcs_write32(TPR_THRESHOLD, vmx->nested.l1_tpr_threshold);
if (kvm_has_tsc_control)
decache_tsc_multiplier(vmx);
@@ -4132,35 +4536,31 @@
if (vmx->nested.change_vmcs01_virtual_apic_mode) {
vmx->nested.change_vmcs01_virtual_apic_mode = false;
vmx_set_virtual_apic_mode(vcpu);
- } else if (!nested_cpu_has_ept(vmcs12) &&
- nested_cpu_has2(vmcs12,
- SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) {
- vmx_flush_tlb(vcpu, true);
}
/* Unpin physical memory we referred to in vmcs02 */
if (vmx->nested.apic_access_page) {
- kvm_release_page_dirty(vmx->nested.apic_access_page);
+ kvm_release_page_clean(vmx->nested.apic_access_page);
vmx->nested.apic_access_page = NULL;
}
kvm_vcpu_unmap(vcpu, &vmx->nested.virtual_apic_map, true);
kvm_vcpu_unmap(vcpu, &vmx->nested.pi_desc_map, true);
vmx->nested.pi_desc = NULL;
- /*
- * We are now running in L2, mmu_notifier will force to reload the
- * page's hpa for L2 vmcs. Need to reload it for L1 before entering L1.
- */
- kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ if (vmx->nested.reload_vmcs01_apic_access_page) {
+ vmx->nested.reload_vmcs01_apic_access_page = false;
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+ }
- if ((exit_reason != -1) && (enable_shadow_vmcs || vmx->nested.hv_evmcs))
+ if ((vm_exit_reason != -1) &&
+ (enable_shadow_vmcs || vmx->nested.hv_evmcs))
vmx->nested.need_vmcs12_to_shadow_sync = true;
/* in case we halted in L2 */
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ if ((u16)vm_exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
@@ -4168,7 +4568,7 @@
INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
}
- if (exit_reason != -1)
+ if (vm_exit_reason != -1)
trace_kvm_nested_vmexit_inject(vmcs12->vm_exit_reason,
vmcs12->exit_qualification,
vmcs12->idt_vectoring_info_field,
@@ -4188,7 +4588,7 @@
* flag and the VM-instruction error field of the VMCS
* accordingly, and skip the emulated instruction.
*/
- (void)nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
+ (void)nested_vmx_fail(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD);
/*
* Restore L1's host state to KVM's software model. We're here
@@ -4205,7 +4605,7 @@
* Decode the memory-address operand of a vmx instruction, as recorded on an
* exit caused by such an instruction (run by a guest hypervisor).
* On success, returns 0. When the operand is invalid, returns 1 and throws
- * #UD or #GP.
+ * #UD, #GP, or #SS.
*/
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret)
@@ -4244,9 +4644,9 @@
else if (addr_size == 0)
off = (gva_t)sign_extend64(off, 15);
if (base_is_valid)
- off += kvm_register_read(vcpu, base_reg);
+ off += kvm_register_readl(vcpu, base_reg);
if (index_is_valid)
- off += kvm_register_read(vcpu, index_reg)<<scaling;
+ off += kvm_register_readl(vcpu, index_reg) << scaling;
vmx_get_segment(vcpu, &s, seg_reg);
/*
@@ -4331,19 +4731,45 @@
return 0;
}
-static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer)
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx;
+
+ if (!nested_vmx_allowed(vcpu))
+ return;
+
+ vmx = to_vmx(vcpu);
+ if (kvm_x86_ops.pmu_ops->is_valid_msr(vcpu, MSR_CORE_PERF_GLOBAL_CTRL)) {
+ vmx->nested.msrs.entry_ctls_high |=
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high |=
+ VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ } else {
+ vmx->nested.msrs.entry_ctls_high &=
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
+ vmx->nested.msrs.exit_ctls_high &=
+ ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
+ }
+}
+
+static int nested_vmx_get_vmptr(struct kvm_vcpu *vcpu, gpa_t *vmpointer,
+ int *ret)
{
gva_t gva;
struct x86_exception e;
+ int r;
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmcs_read32(VMX_INSTRUCTION_INFO), false,
- sizeof(*vmpointer), &gva))
- return 1;
+ sizeof(*vmpointer), &gva)) {
+ *ret = 1;
+ return -EINVAL;
+ }
- if (kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
+ r = kvm_read_guest_virt(vcpu, gva, vmpointer, sizeof(*vmpointer), &e);
+ if (r != X86EMUL_CONTINUE) {
+ *ret = kvm_handle_memory_failure(vcpu, r, &e);
+ return -EINVAL;
}
return 0;
@@ -4396,7 +4822,7 @@
goto out_shadow_vmcs;
hrtimer_init(&vmx->nested.preemption_timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL_PINNED);
+ HRTIMER_MODE_ABS_PINNED);
vmx->nested.preemption_timer.function = vmx_preemption_timer_fn;
vmx->nested.vpid02 = allocate_vpid();
@@ -4404,9 +4830,9 @@
vmx->nested.vmcs02_initialized = false;
vmx->nested.vmxon = true;
- if (pt_mode == PT_MODE_HOST_GUEST) {
+ if (vmx_pt_mode_is_host_guest()) {
vmx->pt_desc.guest.ctl = 0;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
}
return 0;
@@ -4438,8 +4864,8 @@
gpa_t vmptr;
uint32_t revision;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED
- | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ const u64 VMXON_NEEDED_FEATURES = FEAT_CTL_LOCKED
+ | FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
* The Intel VMX Instruction Reference lists a bunch of bits that are
@@ -4462,8 +4888,7 @@
}
if (vmx->nested.vmxon)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
+ return nested_vmx_fail(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
@@ -4471,8 +4896,8 @@
return 1;
}
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &ret))
+ return ret;
/*
* SDM 3: 24.11.5
@@ -4545,20 +4970,19 @@
u32 zero = 0;
gpa_t vmptr;
u64 evmcs_gpa;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMCLEAR_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMCLEAR_VMXON_POINTER);
/*
* When Enlightened VMEntry is enabled on the calling CPU we treat
@@ -4584,8 +5008,6 @@
return nested_vmx_succeed(vcpu);
}
-static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch);
-
/* Emulate the VMLAUNCH instruction */
static int handle_vmlaunch(struct kvm_vcpu *vcpu)
{
@@ -4601,17 +5023,17 @@
static int handle_vmread(struct kvm_vcpu *vcpu)
{
- unsigned long field;
- u64 field_value;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- int len;
- gva_t gva = 0;
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
: get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
struct x86_exception e;
+ unsigned long field;
+ u64 value;
+ gva_t gva = 0;
short offset;
+ int len, r;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4626,37 +5048,34 @@
return nested_vmx_failInvalid(vcpu);
/* Decode instruction info and find the field to read */
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
if (!is_guest_mode(vcpu) && is_vmcs12_ext_field(field))
copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- /* Read the field, zero-extended to a u64 field_value */
- field_value = vmcs12_read_any(vmcs12, field, offset);
+ /* Read the field, zero-extended to a u64 value */
+ value = vmcs12_read_any(vmcs12, field, offset);
/*
* Now copy part of this value to register or memory, as requested.
* Note that the number of bits actually copied is 32 or 64 depending
* on the guest's mode (32 or 64 bit), not on the given field's length.
*/
- if (vmx_instruction_info & (1u << 10)) {
- kvm_register_writel(vcpu, (((vmx_instruction_info) >> 3) & 0xf),
- field_value);
+ if (instr_info & BIT(10)) {
+ kvm_register_writel(vcpu, (((instr_info) >> 3) & 0xf), value);
} else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, true, len, &gva))
+ instr_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_write_guest_virt_system(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
return nested_vmx_succeed(vcpu);
@@ -4688,24 +5107,25 @@
static int handle_vmwrite(struct kvm_vcpu *vcpu)
{
- unsigned long field;
- int len;
- gva_t gva;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
- u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-
- /* The value to write might be 32 or 64 bits, depending on L1's long
- * mode, and eventually we need to write that into a field of several
- * possible lengths. The code below first zero-extends the value to 64
- * bit (field_value), and then copies only the appropriate number of
- * bits into the vmcs12 field.
- */
- u64 field_value = 0;
- struct x86_exception e;
struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
: get_vmcs12(vcpu);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
+ u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ struct x86_exception e;
+ unsigned long field;
short offset;
+ gva_t gva;
+ int len, r;
+
+ /*
+ * The value to write might be 32 or 64 bits, depending on L1's long
+ * mode, and eventually we need to write that into a field of several
+ * possible lengths. The code below first zero-extends the value to 64
+ * bit (value), and then copies only the appropriate number of
+ * bits into the vmcs12 field.
+ */
+ u64 value = 0;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4719,27 +5139,23 @@
get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (vmx_instruction_info & (1u << 10))
- field_value = kvm_register_readl(vcpu,
- (((vmx_instruction_info) >> 3) & 0xf));
+ if (instr_info & BIT(10))
+ value = kvm_register_readl(vcpu, (((instr_info) >> 3) & 0xf));
else {
len = is_64_bit_mode(vcpu) ? 8 : 4;
if (get_vmx_mem_address(vcpu, exit_qualification,
- vmx_instruction_info, false, len, &gva))
+ instr_info, false, len, &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &field_value, len, &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &value, len, &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
}
-
- field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((instr_info) >> 28) & 0xf));
offset = vmcs_field_to_offset(field);
if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT);
/*
* If the vCPU supports "VMWRITE to any supported field in the
@@ -4747,8 +5163,7 @@
*/
if (vmcs_field_readonly(field) &&
!nested_cpu_has_vmwrite_any_field(vcpu))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
+ return nested_vmx_fail(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
/*
* Ensure vmcs12 is up-to-date before any VMWRITE that dirties
@@ -4766,9 +5181,9 @@
* the stripped down value, L2 sees the full value as stored by KVM).
*/
if (field >= GUEST_ES_AR_BYTES && field <= GUEST_TR_AR_BYTES)
- field_value &= 0x1f0ff;
+ value &= 0x1f0ff;
- vmcs12_write_any(vmcs12, field, offset, field_value);
+ vmcs12_write_any(vmcs12, field, offset, value);
/*
* Do not track vmcs12 dirty-state if in guest-mode as we actually
@@ -4785,7 +5200,7 @@
preempt_disable();
vmcs_load(vmx->vmcs01.shadow_vmcs);
- __vmcs_writel(field, field_value);
+ __vmcs_writel(field, value);
vmcs_clear(vmx->vmcs01.shadow_vmcs);
vmcs_load(vmx->loaded_vmcs->vmcs);
@@ -4814,20 +5229,19 @@
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
gpa_t vmptr;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (nested_vmx_get_vmptr(vcpu, &vmptr))
- return 1;
+ if (nested_vmx_get_vmptr(vcpu, &vmptr, &r))
+ return r;
if (!page_address_valid(vcpu, vmptr))
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_INVALID_ADDRESS);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS);
if (vmptr == vmx->nested.vmxon_ptr)
- return nested_vmx_failValid(vcpu,
- VMXERR_VMPTRLD_VMXON_POINTER);
+ return nested_vmx_fail(vcpu, VMXERR_VMPTRLD_VMXON_POINTER);
/* Forbid normal VMPTRLD if Enlightened version was used */
if (vmx->nested.hv_evmcs)
@@ -4844,7 +5258,7 @@
* given physical address won't match the required
* VMCS12_REVISION identifier.
*/
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4854,7 +5268,7 @@
(new_vmcs12->hdr.shadow_vmcs &&
!nested_cpu_has_vmx_shadow_vmcs(vcpu))) {
kvm_vcpu_unmap(vcpu, &map, false);
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
}
@@ -4876,11 +5290,12 @@
/* Emulate the VMPTRST instruction */
static int handle_vmptrst(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qual = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qual = vmx_get_exit_qual(vcpu);
u32 instr_info = vmcs_read32(VMX_INSTRUCTION_INFO);
gpa_t current_vmptr = to_vmx(vcpu)->nested.current_vmptr;
struct x86_exception e;
gva_t gva;
+ int r;
if (!nested_vmx_check_permission(vcpu))
return 1;
@@ -4892,25 +5307,35 @@
true, sizeof(gpa_t), &gva))
return 1;
/* *_system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
- sizeof(gpa_t), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_write_guest_virt_system(vcpu, gva, (void *)¤t_vmptr,
+ sizeof(gpa_t), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
return nested_vmx_succeed(vcpu);
}
+#define EPTP_PA_MASK GENMASK_ULL(51, 12)
+
+static bool nested_ept_root_matches(hpa_t root_hpa, u64 root_eptp, u64 eptp)
+{
+ return VALID_PAGE(root_hpa) &&
+ ((root_eptp & EPTP_PA_MASK) == (eptp & EPTP_PA_MASK));
+}
+
/* Emulate the INVEPT instruction */
static int handle_invept(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u32 vmx_instruction_info, types;
- unsigned long type;
+ unsigned long type, roots_to_free;
+ struct kvm_mmu *mmu;
gva_t gva;
struct x86_exception e;
struct {
u64 eptp, gpa;
} operand;
+ int i, r;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_EPT) ||
@@ -4928,33 +5353,53 @@
types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
- VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+ return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* According to the Intel VMX instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
+ /*
+ * Nested EPT roots are always held through guest_mmu,
+ * not root_mmu.
+ */
+ mmu = &vcpu->arch.guest_mmu;
switch (type) {
- case VMX_EPT_EXTENT_GLOBAL:
case VMX_EPT_EXTENT_CONTEXT:
- /*
- * TODO: Sync the necessary shadow EPT roots here, rather than
- * at the next emulated VM-entry.
- */
+ if (!nested_vmx_check_eptp(vcpu, operand.eptp))
+ return nested_vmx_fail(vcpu,
+ VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
+
+ roots_to_free = 0;
+ if (nested_ept_root_matches(mmu->root_hpa, mmu->root_pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_CURRENT;
+
+ for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
+ if (nested_ept_root_matches(mmu->prev_roots[i].hpa,
+ mmu->prev_roots[i].pgd,
+ operand.eptp))
+ roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
+ }
+ break;
+ case VMX_EPT_EXTENT_GLOBAL:
+ roots_to_free = KVM_MMU_ROOTS_ALL;
break;
default:
- BUG_ON(1);
+ BUG();
break;
}
+ if (roots_to_free)
+ kvm_mmu_free_roots(vcpu, mmu, roots_to_free);
+
return nested_vmx_succeed(vcpu);
}
@@ -4970,6 +5415,7 @@
u64 gla;
} operand;
u16 vpid02;
+ int r;
if (!(vmx->nested.msrs.secondary_ctls_high &
SECONDARY_EXEC_ENABLE_VPID) ||
@@ -4988,21 +5434,21 @@
VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8;
if (type >= 32 || !(types & (1 << type)))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
/* according to the intel vmx instruction reference, the memory
* operand is read even if it isn't needed (e.g., for type==global)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false, sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
+ r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
+ if (r != X86EMUL_CONTINUE)
+ return kvm_handle_memory_failure(vcpu, r, &e);
+
if (operand.vpid >> 16)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
vpid02 = nested_get_vpid02(vcpu);
@@ -5010,29 +5456,39 @@
case VMX_VPID_EXTENT_INDIVIDUAL_ADDR:
if (!operand.vpid ||
is_noncanonical_address(operand.gla, vcpu))
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- if (cpu_has_vmx_invvpid_individual_addr()) {
- __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR,
- vpid02, operand.gla);
- } else
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_vcpu_addr(vpid02, operand.gla);
break;
case VMX_VPID_EXTENT_SINGLE_CONTEXT:
case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL:
if (!operand.vpid)
- return nested_vmx_failValid(vcpu,
+ return nested_vmx_fail(vcpu,
VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID);
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_context(vpid02);
break;
case VMX_VPID_EXTENT_ALL_CONTEXT:
- __vmx_flush_tlb(vcpu, vpid02, false);
+ vpid_sync_context(vpid02);
break;
default:
WARN_ON_ONCE(1);
return kvm_skip_emulated_instruction(vcpu);
}
+ /*
+ * Sync the shadow page tables if EPT is disabled, L1 is invalidating
+ * linear mappings for L2 (tagged with L2's VPID). Free all roots as
+ * VPIDs are not tracked in the MMU role.
+ *
+ * Note, this operates on root_mmu, not guest_mmu, as L1 and L2 share
+ * an MMU when EPT is disabled.
+ *
+ * TODO: sync only the affected SPTEs for INVDIVIDUAL_ADDR.
+ */
+ if (!enable_ept)
+ kvm_mmu_free_roots(vcpu, &vcpu->arch.root_mmu,
+ KVM_MMU_ROOTS_ALL);
+
return nested_vmx_succeed(vcpu);
}
@@ -5040,9 +5496,7 @@
struct vmcs12 *vmcs12)
{
u32 index = kvm_rcx_read(vcpu);
- u64 address;
- bool accessed_dirty;
- struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+ u64 new_eptp;
if (!nested_cpu_has_eptp_switching(vmcs12) ||
!nested_cpu_has_ept(vmcs12))
@@ -5051,31 +5505,21 @@
if (index >= VMFUNC_EPTP_ENTRIES)
return 1;
-
if (kvm_vcpu_read_guest_page(vcpu, vmcs12->eptp_list_address >> PAGE_SHIFT,
- &address, index * 8, 8))
+ &new_eptp, index * 8, 8))
return 1;
- accessed_dirty = !!(address & VMX_EPTP_AD_ENABLE_BIT);
-
/*
* If the (L2) guest does a vmfunc to the currently
* active ept pointer, we don't have to do anything else
*/
- if (vmcs12->ept_pointer != address) {
- if (!valid_ept_address(vcpu, address))
+ if (vmcs12->ept_pointer != new_eptp) {
+ if (!nested_vmx_check_eptp(vcpu, new_eptp))
return 1;
- kvm_mmu_unload(vcpu);
- mmu->ept_ad = accessed_dirty;
- mmu->mmu_role.base.ad_disabled = !accessed_dirty;
- vmcs12->ept_pointer = address;
- /*
- * TODO: Check what's the correct approach in case
- * mmu reload fails. Currently, we just let the next
- * reload potentially fail
- */
- kvm_mmu_reload(vcpu);
+ vmcs12->ept_pointer = new_eptp;
+
+ kvm_make_request(KVM_REQ_MMU_RELOAD, vcpu);
}
return 0;
@@ -5112,9 +5556,14 @@
return kvm_skip_emulated_instruction(vcpu);
fail:
- nested_vmx_vmexit(vcpu, vmx->exit_reason,
- vmcs_read32(VM_EXIT_INTR_INFO),
- vmcs_readl(EXIT_QUALIFICATION));
+ /*
+ * This is effectively a reflected VM-Exit, as opposed to a synthesized
+ * nested VM-Exit. Pass the original exit reason, i.e. don't hardcode
+ * EXIT_REASON_VMFUNC as the exit reason.
+ */
+ nested_vmx_vmexit(vcpu, vmx->exit_reason.full,
+ vmx_get_intr_info(vcpu),
+ vmx_get_exit_qual(vcpu));
return 1;
}
@@ -5165,7 +5614,7 @@
if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
port = exit_qualification >> 16;
size = (exit_qualification & 7) + 1;
@@ -5174,13 +5623,14 @@
}
/*
- * Return 1 if we should exit from L2 to L1 to handle an MSR access access,
+ * Return 1 if we should exit from L2 to L1 to handle an MSR access,
* rather than handle it ourselves in L0. I.e., check whether L1 expressed
* disinterest in the current event (read or write a specific MSR) by using an
* MSR bitmap. This may be the case even when L0 doesn't use MSR bitmaps.
*/
static bool nested_vmx_exit_handled_msr(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12, u32 exit_reason)
+ struct vmcs12 *vmcs12,
+ union vmx_exit_reason exit_reason)
{
u32 msr_index = kvm_rcx_read(vcpu);
gpa_t bitmap;
@@ -5194,7 +5644,7 @@
* First we need to figure out which of the four to use:
*/
bitmap = vmcs12->msr_bitmap;
- if (exit_reason == EXIT_REASON_MSR_WRITE)
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
bitmap += 2048;
if (msr_index >= 0xc0000000) {
msr_index -= 0xc0000000;
@@ -5219,7 +5669,7 @@
static bool nested_vmx_exit_handled_cr(struct kvm_vcpu *vcpu,
struct vmcs12 *vmcs12)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int cr = exit_qualification & 15;
int reg;
unsigned long val;
@@ -5235,15 +5685,6 @@
return true;
break;
case 3:
- if ((vmcs12->cr3_target_count >= 1 &&
- vmcs12->cr3_target_value0 == val) ||
- (vmcs12->cr3_target_count >= 2 &&
- vmcs12->cr3_target_value1 == val) ||
- (vmcs12->cr3_target_count >= 3 &&
- vmcs12->cr3_target_value2 == val) ||
- (vmcs12->cr3_target_count >= 4 &&
- vmcs12->cr3_target_value3 == val))
- return false;
if (nested_cpu_has(vmcs12, CPU_BASED_CR3_LOAD_EXITING))
return true;
break;
@@ -5319,70 +5760,115 @@
return 1 & (b >> (field & 7));
}
-/*
- * Return 1 if we should exit from L2 to L1 to handle an exit, or 0 if we
- * should handle it ourselves in L0 (and then continue L2). Only call this
- * when in is_guest_mode (L2).
- */
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason)
+static bool nested_vmx_exit_handled_mtf(struct vmcs12 *vmcs12)
{
- u32 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 entry_intr_info = vmcs12->vm_entry_intr_info_field;
- if (vmx->nested.nested_run_pending)
- return false;
-
- if (unlikely(vmx->fail)) {
- trace_kvm_nested_vmenter_failed(
- "hardware VM-instruction error: ",
- vmcs_read32(VM_INSTRUCTION_ERROR));
+ if (nested_cpu_has_mtf(vmcs12))
return true;
- }
/*
- * The host physical addresses of some pages of guest memory
- * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
- * Page). The CPU may write to these pages via their host
- * physical address while L2 is running, bypassing any
- * address-translation-based dirty tracking (e.g. EPT write
- * protection).
- *
- * Mark them dirty on every exit from L2 to prevent them from
- * getting out of sync with dirty tracking.
+ * An MTF VM-exit may be injected into the guest by setting the
+ * interruption-type to 7 (other event) and the vector field to 0. Such
+ * is the case regardless of the 'monitor trap flag' VM-execution
+ * control.
*/
- nested_mark_vmcs12_pages_dirty(vcpu);
+ return entry_intr_info == (INTR_INFO_VALID_MASK
+ | INTR_TYPE_OTHER_EVENT);
+}
- trace_kvm_nested_vmexit(kvm_rip_read(vcpu), exit_reason,
- vmcs_readl(EXIT_QUALIFICATION),
- vmx->idt_vectoring_info,
- intr_info,
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
- KVM_ISA_VMX);
+/*
+ * Return true if L0 wants to handle an exit from L2 regardless of whether or not
+ * L1 wants the exit. Only call this when in is_guest_mode (L2).
+ */
+static bool nested_vmx_l0_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ u32 intr_info;
- switch ((u16)exit_reason) {
+ switch ((u16)exit_reason.basic) {
case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
if (is_nmi(intr_info))
- return false;
+ return true;
else if (is_page_fault(intr_info))
- return !vmx->vcpu.arch.apf.host_apf_reason && enable_ept;
+ return vcpu->arch.apf.host_apf_flags ||
+ vmx_need_pf_intercept(vcpu);
else if (is_debug(intr_info) &&
vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return false;
+ return true;
else if (is_breakpoint(intr_info) &&
vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
- return false;
+ return true;
+ else if (is_alignment_check(intr_info) &&
+ !vmx_guest_inject_ac(vcpu))
+ return true;
+ return false;
+ case EXIT_REASON_EXTERNAL_INTERRUPT:
+ return true;
+ case EXIT_REASON_MCE_DURING_VMENTRY:
+ return true;
+ case EXIT_REASON_EPT_VIOLATION:
+ /*
+ * L0 always deals with the EPT violation. If nested EPT is
+ * used, and the nested mmu code discovers that the address is
+ * missing in the guest EPT table (EPT12), the EPT violation
+ * will be injected with nested_ept_inject_page_fault()
+ */
+ return true;
+ case EXIT_REASON_EPT_MISCONFIG:
+ /*
+ * L2 never uses directly L1's EPT, but rather L0's own EPT
+ * table (shadow on EPT) or a merged EPT table that L0 built
+ * (EPT on EPT). So any problems with the structure of the
+ * table is L0's fault.
+ */
+ return true;
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return true;
+ case EXIT_REASON_PML_FULL:
+ /* We emulate PML support to L1. */
+ return true;
+ case EXIT_REASON_VMFUNC:
+ /* VM functions are emulated through L2->L0 vmexits. */
+ return true;
+ case EXIT_REASON_ENCLS:
+ /* SGX is never exposed to L1 */
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
+/*
+ * Return 1 if L1 wants to intercept an exit from L2. Only call this when in
+ * is_guest_mode (L2).
+ */
+static bool nested_vmx_l1_wants_exit(struct kvm_vcpu *vcpu,
+ union vmx_exit_reason exit_reason)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ u32 intr_info;
+
+ switch ((u16)exit_reason.basic) {
+ case EXIT_REASON_EXCEPTION_NMI:
+ intr_info = vmx_get_intr_info(vcpu);
+ if (is_nmi(intr_info))
+ return true;
+ else if (is_page_fault(intr_info))
+ return true;
return vmcs12->exception_bitmap &
(1u << (intr_info & INTR_INFO_VECTOR_MASK));
case EXIT_REASON_EXTERNAL_INTERRUPT:
- return false;
+ return nested_exit_on_intr(vcpu);
case EXIT_REASON_TRIPLE_FAULT:
return true;
- case EXIT_REASON_PENDING_INTERRUPT:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_INTR_PENDING);
+ case EXIT_REASON_INTERRUPT_WINDOW:
+ return nested_cpu_has(vmcs12, CPU_BASED_INTR_WINDOW_EXITING);
case EXIT_REASON_NMI_WINDOW:
- return nested_cpu_has(vmcs12, CPU_BASED_VIRTUAL_NMI_PENDING);
+ return nested_cpu_has(vmcs12, CPU_BASED_NMI_WINDOW_EXITING);
case EXIT_REASON_TASK_SWITCH:
return true;
case EXIT_REASON_CPUID:
@@ -5433,7 +5919,7 @@
case EXIT_REASON_MWAIT_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MWAIT_EXITING);
case EXIT_REASON_MONITOR_TRAP_FLAG:
- return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+ return nested_vmx_exit_handled_mtf(vmcs12);
case EXIT_REASON_MONITOR_INSTRUCTION:
return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_EXITING);
case EXIT_REASON_PAUSE_INSTRUCTION:
@@ -5441,7 +5927,7 @@
nested_cpu_has2(vmcs12,
SECONDARY_EXEC_PAUSE_LOOP_EXITING);
case EXIT_REASON_MCE_DURING_VMENTRY:
- return false;
+ return true;
case EXIT_REASON_TPR_BELOW_THRESHOLD:
return nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW);
case EXIT_REASON_APIC_ACCESS:
@@ -5453,22 +5939,6 @@
* delivery" only come from vmcs12.
*/
return true;
- case EXIT_REASON_EPT_VIOLATION:
- /*
- * L0 always deals with the EPT violation. If nested EPT is
- * used, and the nested mmu code discovers that the address is
- * missing in the guest EPT table (EPT12), the EPT violation
- * will be injected with nested_ept_inject_page_fault()
- */
- return false;
- case EXIT_REASON_EPT_MISCONFIG:
- /*
- * L2 never uses directly L1's EPT, but rather L0's own EPT
- * table (shadow on EPT) or a merged EPT table that L0 built
- * (EPT on EPT). So any problems with the structure of the
- * table is L0's fault.
- */
- return false;
case EXIT_REASON_INVPCID:
return
nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_INVPCID) &&
@@ -5485,17 +5955,6 @@
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
- case EXIT_REASON_PREEMPTION_TIMER:
- return false;
- case EXIT_REASON_PML_FULL:
- /* We emulate PML support to L1. */
- return false;
- case EXIT_REASON_VMFUNC:
- /* VM functions are emulated through L2->L0 vmexits. */
- return false;
- case EXIT_REASON_ENCLS:
- /* SGX is never exposed to L1 */
- return false;
case EXIT_REASON_UMWAIT:
case EXIT_REASON_TPAUSE:
return nested_cpu_has2(vmcs12,
@@ -5505,6 +5964,61 @@
}
}
+/*
+ * Conditionally reflect a VM-Exit into L1. Returns %true if the VM-Exit was
+ * reflected into L1.
+ */
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
+ unsigned long exit_qual;
+ u32 exit_intr_info;
+
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
+ /*
+ * Late nested VM-Fail shares the same flow as nested VM-Exit since KVM
+ * has already loaded L2's state.
+ */
+ if (unlikely(vmx->fail)) {
+ trace_kvm_nested_vmenter_failed(
+ "hardware VM-instruction error: ",
+ vmcs_read32(VM_INSTRUCTION_ERROR));
+ exit_intr_info = 0;
+ exit_qual = 0;
+ goto reflect_vmexit;
+ }
+
+ trace_kvm_nested_vmexit(exit_reason.full, vcpu, KVM_ISA_VMX);
+
+ /* If L0 (KVM) wants the exit, it trumps L1's desires. */
+ if (nested_vmx_l0_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /* If L1 doesn't want the exit, handle it in L0. */
+ if (!nested_vmx_l1_wants_exit(vcpu, exit_reason))
+ return false;
+
+ /*
+ * vmcs.VM_EXIT_INTR_INFO is only valid for EXCEPTION_NMI exits. For
+ * EXTERNAL_INTERRUPT, the value for vmcs12->vm_exit_intr_info would
+ * need to be synthesized by querying the in-kernel LAPIC, but external
+ * interrupts are never reflected to L1 so it's a non-issue.
+ */
+ exit_intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(exit_intr_info)) {
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+
+ vmcs12->vm_exit_intr_error_code =
+ vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ }
+ exit_qual = vmx_get_exit_qual(vcpu);
+
+reflect_vmexit:
+ nested_vmx_vmexit(vcpu, exit_reason.full, exit_intr_info, exit_qual);
+ return true;
+}
static int vmx_get_nested_state(struct kvm_vcpu *vcpu,
struct kvm_nested_state __user *user_kvm_nested_state,
@@ -5516,8 +6030,10 @@
.flags = 0,
.format = KVM_STATE_NESTED_FORMAT_VMX,
.size = sizeof(kvm_state),
+ .hdr.vmx.flags = 0,
.hdr.vmx.vmxon_pa = -1ull,
.hdr.vmx.vmcs12_pa = -1ull,
+ .hdr.vmx.preemption_timer_deadline = 0,
};
struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
&user_kvm_nested_state->data.vmx[0];
@@ -5556,6 +6072,17 @@
if (vmx->nested.nested_run_pending)
kvm_state.flags |= KVM_STATE_NESTED_RUN_PENDING;
+
+ if (vmx->nested.mtf_pending)
+ kvm_state.flags |= KVM_STATE_NESTED_MTF_PENDING;
+
+ if (nested_cpu_has_preemption_timer(vmcs12) &&
+ vmx->nested.has_preemption_timer_deadline) {
+ kvm_state.hdr.vmx.flags |=
+ KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE;
+ kvm_state.hdr.vmx.preemption_timer_deadline =
+ vmx->nested.preemption_timer_deadline;
+ }
}
}
@@ -5604,7 +6131,6 @@
get_shadow_vmcs12(vcpu), VMCS12_SIZE))
return -EFAULT;
}
-
out:
return kvm_state.size;
}
@@ -5627,7 +6153,7 @@
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
struct vmcs12 *vmcs12;
- u32 exit_qual;
+ enum vm_entry_failure_code ignored;
struct kvm_vmx_nested_state_data __user *user_vmx_nested_state =
&user_kvm_nested_state->data.vmx[0];
int ret;
@@ -5669,6 +6195,9 @@
~(KVM_STATE_NESTED_SMM_GUEST_MODE | KVM_STATE_NESTED_SMM_VMXON))
return -EINVAL;
+ if (kvm_state->hdr.vmx.flags & ~KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE)
+ return -EINVAL;
+
/*
* SMM temporarily disables VMX, so we cannot be in guest mode,
* nor can VMLAUNCH/VMRESUME be pending. Outside SMM, SMM flags
@@ -5698,9 +6227,16 @@
if (ret)
return ret;
- /* Empty 'VMXON' state is permitted */
- if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12))
- return 0;
+ /* Empty 'VMXON' state is permitted if no VMCS loaded */
+ if (kvm_state->size < sizeof(*kvm_state) + sizeof(*vmcs12)) {
+ /* See vmx_has_valid_vmcs12. */
+ if ((kvm_state->flags & KVM_STATE_NESTED_GUEST_MODE) ||
+ (kvm_state->flags & KVM_STATE_NESTED_EVMCS) ||
+ (kvm_state->hdr.vmx.vmcs12_pa != -1ull))
+ return -EINVAL;
+ else
+ return 0;
+ }
if (kvm_state->hdr.vmx.vmcs12_pa != -1ull) {
if (kvm_state->hdr.vmx.vmcs12_pa == kvm_state->hdr.vmx.vmxon_pa ||
@@ -5710,10 +6246,12 @@
set_current_vmptr(vmx, kvm_state->hdr.vmx.vmcs12_pa);
} else if (kvm_state->flags & KVM_STATE_NESTED_EVMCS) {
/*
- * Sync eVMCS upon entry as we may not have
- * HV_X64_MSR_VP_ASSIST_PAGE set up yet.
+ * nested_vmx_handle_enlightened_vmptrld() cannot be called
+ * directly from here as HV_X64_MSR_VP_ASSIST_PAGE may not be
+ * restored yet. EVMCS will be mapped from
+ * nested_get_vmcs12_pages().
*/
- vmx->nested.need_vmcs12_to_shadow_sync = true;
+ kvm_make_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu);
} else {
return -EINVAL;
}
@@ -5739,6 +6277,9 @@
vmx->nested.nested_run_pending =
!!(kvm_state->flags & KVM_STATE_NESTED_RUN_PENDING);
+ vmx->nested.mtf_pending =
+ !!(kvm_state->flags & KVM_STATE_NESTED_MTF_PENDING);
+
ret = -EINVAL;
if (nested_cpu_has_shadow_vmcs(vmcs12) &&
vmcs12->vmcs_link_pointer != -1ull) {
@@ -5761,9 +6302,16 @@
goto error_guest_mode;
}
+ vmx->nested.has_preemption_timer_deadline = false;
+ if (kvm_state->hdr.vmx.flags & KVM_STATE_VMX_PREEMPTION_TIMER_DEADLINE) {
+ vmx->nested.has_preemption_timer_deadline = true;
+ vmx->nested.preemption_timer_deadline =
+ kvm_state->hdr.vmx.preemption_timer_deadline;
+ }
+
if (nested_vmx_check_controls(vcpu, vmcs12) ||
nested_vmx_check_host_state(vcpu, vmcs12) ||
- nested_vmx_check_guest_state(vcpu, vmcs12, &exit_qual))
+ nested_vmx_check_guest_state(vcpu, vmcs12, &ignored))
goto error_guest_mode;
vmx->nested.dirty_vmcs12 = true;
@@ -5778,7 +6326,7 @@
return ret;
}
-void nested_vmx_vcpu_setup(void)
+void nested_vmx_set_vmcs_shadowing_bitmap(void)
{
if (enable_shadow_vmcs) {
vmcs_write64(VMREAD_BITMAP, __pa(vmx_vmread_bitmap));
@@ -5809,7 +6357,7 @@
* reason is that if one of these bits is necessary, it will appear
* in vmcs01 and prepare_vmcs02, when it bitwise-or's the control
* fields of vmcs01 and vmcs02, will turn these bits off - and
- * nested_vmx_exit_reflected() will not pass related exits to L1.
+ * nested_vmx_l1_wants_exit() will not pass related exits to L1.
* These rules have exceptions below.
*/
@@ -5839,7 +6387,8 @@
#ifdef CONFIG_X86_64
VM_EXIT_HOST_ADDR_SPACE_SIZE |
#endif
- VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT;
+ VM_EXIT_LOAD_IA32_PAT | VM_EXIT_SAVE_IA32_PAT |
+ VM_EXIT_CLEAR_BNDCFGS | VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->exit_ctls_high |=
VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR |
VM_EXIT_LOAD_IA32_EFER | VM_EXIT_SAVE_IA32_EFER |
@@ -5858,7 +6407,8 @@
#ifdef CONFIG_X86_64
VM_ENTRY_IA32E_MODE |
#endif
- VM_ENTRY_LOAD_IA32_PAT;
+ VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_BNDCFGS |
+ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
msrs->entry_ctls_high |=
(VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR | VM_ENTRY_LOAD_IA32_EFER);
@@ -5872,8 +6422,8 @@
msrs->procbased_ctls_low =
CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR;
msrs->procbased_ctls_high &=
- CPU_BASED_VIRTUAL_INTR_PENDING |
- CPU_BASED_VIRTUAL_NMI_PENDING | CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_INTR_WINDOW_EXITING |
+ CPU_BASED_NMI_WINDOW_EXITING | CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_HLT_EXITING | CPU_BASED_INVLPG_EXITING |
CPU_BASED_MWAIT_EXITING | CPU_BASED_CR3_LOAD_EXITING |
CPU_BASED_CR3_STORE_EXITING |
@@ -5901,7 +6451,8 @@
/*
* secondary cpu-based controls. Do not include those that
- * depend on CPUID bits, they are added later by vmx_cpuid_update.
+ * depend on CPUID bits, they are added later by
+ * vmx_vcpu_after_set_cpuid.
*/
if (msrs->procbased_ctls_high & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
rdmsr(MSR_IA32_VMX_PROCBASED_CTLS2,
@@ -5911,7 +6462,7 @@
msrs->secondary_ctls_low = 0;
msrs->secondary_ctls_high &=
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_WBINVD_EXITING |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
@@ -5932,11 +6483,13 @@
/* nested EPT: emulate EPT also to L1 */
msrs->secondary_ctls_high |=
SECONDARY_EXEC_ENABLE_EPT;
- msrs->ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
- VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT;
- if (cpu_has_vmx_ept_execute_only())
- msrs->ept_caps |=
- VMX_EPT_EXECUTE_ONLY_BIT;
+ msrs->ept_caps =
+ VMX_EPT_PAGE_WALK_4_BIT |
+ VMX_EPT_PAGE_WALK_5_BIT |
+ VMX_EPTP_WB_BIT |
+ VMX_EPT_INVEPT_BIT |
+ VMX_EPT_EXECUTE_ONLY_BIT;
+
msrs->ept_caps &= ept_caps;
msrs->ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT |
VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT |
@@ -6058,25 +6611,30 @@
init_vmcs_shadow_fields();
}
- exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear,
- exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch,
- exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld,
- exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst,
- exit_handlers[EXIT_REASON_VMREAD] = handle_vmread,
- exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume,
- exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite,
- exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff,
- exit_handlers[EXIT_REASON_VMON] = handle_vmon,
- exit_handlers[EXIT_REASON_INVEPT] = handle_invept,
- exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid,
- exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc,
-
- kvm_x86_ops->check_nested_events = vmx_check_nested_events;
- kvm_x86_ops->get_nested_state = vmx_get_nested_state;
- kvm_x86_ops->set_nested_state = vmx_set_nested_state;
- kvm_x86_ops->get_vmcs12_pages = nested_get_vmcs12_pages,
- kvm_x86_ops->nested_enable_evmcs = nested_enable_evmcs;
- kvm_x86_ops->nested_get_evmcs_version = nested_get_evmcs_version;
+ exit_handlers[EXIT_REASON_VMCLEAR] = handle_vmclear;
+ exit_handlers[EXIT_REASON_VMLAUNCH] = handle_vmlaunch;
+ exit_handlers[EXIT_REASON_VMPTRLD] = handle_vmptrld;
+ exit_handlers[EXIT_REASON_VMPTRST] = handle_vmptrst;
+ exit_handlers[EXIT_REASON_VMREAD] = handle_vmread;
+ exit_handlers[EXIT_REASON_VMRESUME] = handle_vmresume;
+ exit_handlers[EXIT_REASON_VMWRITE] = handle_vmwrite;
+ exit_handlers[EXIT_REASON_VMOFF] = handle_vmoff;
+ exit_handlers[EXIT_REASON_VMON] = handle_vmon;
+ exit_handlers[EXIT_REASON_INVEPT] = handle_invept;
+ exit_handlers[EXIT_REASON_INVVPID] = handle_invvpid;
+ exit_handlers[EXIT_REASON_VMFUNC] = handle_vmfunc;
return 0;
}
+
+struct kvm_x86_nested_ops vmx_nested_ops = {
+ .leave_nested = vmx_leave_nested,
+ .check_events = vmx_check_nested_events,
+ .hv_timer_pending = nested_vmx_preemption_timer_pending,
+ .get_state = vmx_get_nested_state,
+ .set_state = vmx_set_nested_state,
+ .get_nested_state_pages = vmx_get_nested_state_pages,
+ .write_log_dirty = nested_vmx_write_pml_buffer,
+ .enable_evmcs = nested_enable_evmcs,
+ .get_evmcs_version = nested_get_evmcs_version,
+};
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index b8521c4..197148d 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -20,18 +20,20 @@
void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
-void nested_vmx_vcpu_setup(void);
+void nested_vmx_set_vmcs_shadowing_bitmap(void);
void nested_vmx_free_vcpu(struct kvm_vcpu *vcpu);
enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
bool from_vmentry);
-bool nested_vmx_exit_reflected(struct kvm_vcpu *vcpu, u32 exit_reason);
-void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
+bool nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu);
+void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
u32 exit_intr_info, unsigned long exit_qualification);
void nested_sync_vmcs12_to_shadow(struct kvm_vcpu *vcpu);
int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+void nested_vmx_pmu_entry_exit_ctls_update(struct kvm_vcpu *vcpu);
+void nested_mark_vmcs12_pages_dirty(struct kvm_vcpu *vcpu);
bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
int size);
@@ -45,6 +47,11 @@
return to_vmx(vcpu)->nested.cached_shadow_vmcs12;
}
+/*
+ * Note: the same condition is checked against the state provided by userspace
+ * in vmx_set_nested_state; if it is satisfied, the nested state must include
+ * the VMCS12.
+ */
static inline int vmx_has_valid_vmcs12(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -59,7 +66,14 @@
vmx->nested.hv_evmcs;
}
-static inline unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu)
+static inline u16 nested_get_vpid02(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ return vmx->nested.vpid02 ? vmx->nested.vpid02 : vmx->vpid;
+}
+
+static inline unsigned long nested_ept_get_eptp(struct kvm_vcpu *vcpu)
{
/* return the page table to be shadowed - in our case, EPT12 */
return get_vmcs12(vcpu)->ept_pointer;
@@ -67,35 +81,7 @@
static inline bool nested_ept_ad_enabled(struct kvm_vcpu *vcpu)
{
- return nested_ept_get_cr3(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
-}
-
-/*
- * Reflect a VM Exit into L1.
- */
-static inline int nested_vmx_reflect_vmexit(struct kvm_vcpu *vcpu,
- u32 exit_reason)
-{
- u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
-
- /*
- * At this point, the exit interruption info in exit_intr_info
- * is only valid for EXCEPTION_NMI exits. For EXTERNAL_INTERRUPT
- * we need to query the in-kernel LAPIC.
- */
- WARN_ON(exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT);
- if ((exit_intr_info &
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) ==
- (INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK)) {
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- vmcs12->vm_exit_intr_error_code =
- vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
- }
-
- nested_vmx_vmexit(vcpu, exit_reason, exit_intr_info,
- vmcs_readl(EXIT_QUALIFICATION));
- return 1;
+ return nested_ept_get_eptp(vcpu) & VMX_EPTP_AD_ENABLE_BIT;
}
/*
@@ -175,6 +161,11 @@
return vmcs12->pin_based_vm_exec_control & PIN_BASED_VIRTUAL_NMIS;
}
+static inline int nested_cpu_has_mtf(struct vmcs12 *vmcs12)
+{
+ return nested_cpu_has(vmcs12, CPU_BASED_MONITOR_TRAP_FLAG);
+}
+
static inline int nested_cpu_has_ept(struct vmcs12 *vmcs12)
{
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_EPT);
@@ -238,6 +229,11 @@
VM_EXIT_SAVE_VMX_PREEMPTION_TIMER;
}
+static inline bool nested_exit_on_nmi(struct kvm_vcpu *vcpu)
+{
+ return nested_cpu_has_nmi_exiting(get_vmcs12(vcpu));
+}
+
/*
* In nested virtualization, check if L1 asked to exit on external interrupts.
* For most existing hypervisors, this will always return true.
@@ -257,7 +253,7 @@
return ((val & fixed1) | fixed0) == val;
}
-static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -271,7 +267,7 @@
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr0_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr0_fixed1;
@@ -279,7 +275,7 @@
return fixed_bits_valid(val, fixed0, fixed1);
}
-static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
+static inline bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val)
{
u64 fixed0 = to_vmx(vcpu)->nested.msrs.cr4_fixed0;
u64 fixed1 = to_vmx(vcpu)->nested.msrs.cr4_fixed1;
@@ -291,4 +287,6 @@
#define nested_guest_cr4_valid nested_cr4_valid
#define nested_host_cr4_valid nested_cr4_valid
+extern struct kvm_x86_nested_ops vmx_nested_ops;
+
#endif /* __KVM_X86_VMX_NESTED_H */
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 181e352..bd70c1d 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -15,8 +15,11 @@
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
+#include "nested.h"
#include "pmu.h"
+#define MSR_PMC_FULL_WIDTH_BIT (MSR_IA32_PMC0 - MSR_IA32_PERFCTR0)
+
static struct kvm_event_hw_type_mapping intel_arch_events[] = {
/* Index must match CPUID 0x0A.EBX bit vector */
[0] = { 0x3c, 0x00, PERF_COUNT_HW_CPU_CYCLES },
@@ -46,6 +49,7 @@
if (old_ctrl == new_ctrl)
continue;
+ __set_bit(INTEL_PMC_IDX_FIXED + i, pmu->pmc_in_use);
reprogram_fixed_counter(pmc, new_ctrl, i);
}
@@ -64,10 +68,11 @@
reprogram_counter(pmu, bit);
}
-static unsigned intel_find_arch_event(struct kvm_pmu *pmu,
- u8 event_select,
- u8 unit_mask)
+static unsigned int intel_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
+ u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
for (i = 0; i < ARRAY_SIZE(intel_arch_events); i++)
@@ -115,7 +120,7 @@
}
/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */
-static int intel_is_valid_msr_idx(struct kvm_vcpu *vcpu, unsigned idx)
+static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -126,8 +131,8 @@
(fixed && idx >= pmu->nr_arch_fixed_counters);
}
-static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu,
- unsigned idx, u64 *mask)
+static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu,
+ unsigned int idx, u64 *mask)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
@@ -148,6 +153,22 @@
return &counters[array_index_nospec(idx, num_counters)];
}
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ return false;
+
+ return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+}
+
+static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
+{
+ if (!fw_writes_is_enabled(pmu_to_vcpu(pmu)))
+ return NULL;
+
+ return get_gp_pmc(pmu, msr, MSR_IA32_PMC0);
+}
+
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
@@ -163,42 +184,58 @@
default:
ret = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0) ||
get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0) ||
- get_fixed_pmc(pmu, msr);
+ get_fixed_pmc(pmu, msr) || get_fw_gp_pmc(pmu, msr);
break;
}
return ret;
}
-static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
+static struct kvm_pmc *intel_msr_idx_to_pmc(struct kvm_vcpu *vcpu, u32 msr)
{
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
struct kvm_pmc *pmc;
+ pmc = get_fixed_pmc(pmu, msr);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0);
+ pmc = pmc ? pmc : get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0);
+
+ return pmc;
+}
+
+static int intel_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
+{
+ struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
+ struct kvm_pmc *pmc;
+ u32 msr = msr_info->index;
+
switch (msr) {
case MSR_CORE_PERF_FIXED_CTR_CTRL:
- *data = pmu->fixed_ctr_ctrl;
+ msr_info->data = pmu->fixed_ctr_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_STATUS:
- *data = pmu->global_status;
+ msr_info->data = pmu->global_status;
return 0;
case MSR_CORE_PERF_GLOBAL_CTRL:
- *data = pmu->global_ctrl;
+ msr_info->data = pmu->global_ctrl;
return 0;
case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
- *data = pmu->global_ovf_ctrl;
+ msr_info->data = pmu->global_ovf_ctrl;
return 0;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
u64 val = pmc_read_counter(pmc);
- *data = val & pmu->counter_bitmask[KVM_PMC_GP];
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_GP];
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
u64 val = pmc_read_counter(pmc);
- *data = val & pmu->counter_bitmask[KVM_PMC_FIXED];
+ msr_info->data =
+ val & pmu->counter_bitmask[KVM_PMC_FIXED];
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
- *data = pmc->eventsel;
+ msr_info->data = pmc->eventsel;
return 0;
}
}
@@ -231,7 +268,7 @@
case MSR_CORE_PERF_GLOBAL_CTRL:
if (pmu->global_ctrl == data)
return 0;
- if (!(data & pmu->global_ctrl_mask)) {
+ if (kvm_valid_perf_global_ctrl(pmu, data)) {
global_ctrl_changed(pmu, data);
return 0;
}
@@ -245,14 +282,24 @@
}
break;
default:
- if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0))) {
- if (msr_info->host_initiated)
- pmc->counter = data;
- else
- pmc->counter = (s32)data;
+ if ((pmc = get_gp_pmc(pmu, msr, MSR_IA32_PERFCTR0)) ||
+ (pmc = get_gp_pmc(pmu, msr, MSR_IA32_PMC0))) {
+ if ((msr & MSR_PMC_FULL_WIDTH_BIT) &&
+ (data & ~pmu->counter_bitmask[KVM_PMC_GP]))
+ return 1;
+ if (!msr_info->host_initiated &&
+ !(msr & MSR_PMC_FULL_WIDTH_BIT))
+ data = (s64)(s32)data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ if (pmc->perf_event)
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_fixed_pmc(pmu, msr))) {
- pmc->counter = data;
+ pmc->counter += data - pmc_read_counter(pmc);
+ if (pmc->perf_event)
+ perf_event_period(pmc->perf_event,
+ get_sample_period(pmc, data));
return 0;
} else if ((pmc = get_gp_pmc(pmu, msr, MSR_P6_EVNTSEL0))) {
if (data == pmc->eventsel)
@@ -281,6 +328,7 @@
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
+ vcpu->arch.perf_capabilities = 0;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -293,6 +341,8 @@
return;
perf_get_x86_pmu_capability(&x86_pmu);
+ if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
@@ -320,7 +370,7 @@
pmu->global_ovf_ctrl_mask = pmu->global_ctrl_mask
& ~(MSR_CORE_PERF_GLOBAL_OVF_CTRL_OVF_BUF |
MSR_CORE_PERF_GLOBAL_OVF_CTRL_COND_CHGD);
- if (kvm_x86_ops->pt_supported())
+ if (vmx_pt_mode_is_host_guest())
pmu->global_ovf_ctrl_mask &=
~MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI;
@@ -329,6 +379,13 @@
(boot_cpu_has(X86_FEATURE_HLE) || boot_cpu_has(X86_FEATURE_RTM)) &&
(entry->ebx & (X86_FEATURE_HLE|X86_FEATURE_RTM)))
pmu->reserved_bits ^= HSW_IN_TX|HSW_IN_TX_CHECKPOINTED;
+
+ bitmap_set(pmu->all_valid_pmc_idx,
+ 0, pmu->nr_arch_gp_counters);
+ bitmap_set(pmu->all_valid_pmc_idx,
+ INTEL_PMC_MAX_GENERIC, pmu->nr_arch_fixed_counters);
+
+ nested_vmx_pmu_entry_exit_ctls_update(vcpu);
}
static void intel_pmu_init(struct kvm_vcpu *vcpu)
@@ -340,12 +397,14 @@
pmu->gp_counters[i].type = KVM_PMC_GP;
pmu->gp_counters[i].vcpu = vcpu;
pmu->gp_counters[i].idx = i;
+ pmu->gp_counters[i].current_config = 0;
}
for (i = 0; i < INTEL_PMC_MAX_FIXED; i++) {
pmu->fixed_counters[i].type = KVM_PMC_FIXED;
pmu->fixed_counters[i].vcpu = vcpu;
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
+ pmu->fixed_counters[i].current_config = 0;
}
}
@@ -374,12 +433,13 @@
}
struct kvm_pmu_ops intel_pmu_ops = {
- .find_arch_event = intel_find_arch_event,
+ .pmc_perf_hw_id = intel_pmc_perf_hw_id,
.find_fixed_event = intel_find_fixed_event,
.pmc_is_enabled = intel_pmc_is_enabled,
.pmc_idx_to_pmc = intel_pmc_idx_to_pmc,
+ .rdpmc_ecx_to_pmc = intel_rdpmc_ecx_to_pmc,
.msr_idx_to_pmc = intel_msr_idx_to_pmc,
- .is_valid_msr_idx = intel_is_valid_msr_idx,
+ .is_valid_rdpmc_ecx = intel_is_valid_rdpmc_ecx,
.is_valid_msr = intel_is_valid_msr,
.get_msr = intel_pmu_get_msr,
.set_msr = intel_pmu_set_msr,
diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c
new file mode 100644
index 0000000..5f8acd2
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.c
@@ -0,0 +1,334 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/kvm_host.h>
+
+#include <asm/irq_remapping.h>
+#include <asm/cpu.h>
+
+#include "lapic.h"
+#include "irq.h"
+#include "posted_intr.h"
+#include "trace.h"
+#include "vmx.h"
+
+/*
+ * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
+ * can find which vCPU should be waken up.
+ */
+static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
+static DEFINE_PER_CPU(raw_spinlock_t, blocked_vcpu_on_cpu_lock);
+
+static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+{
+ return &(to_vmx(vcpu)->pi_desc);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ /*
+ * In case of hot-plug or hot-unplug, we may have to undo
+ * vmx_vcpu_pi_put even if there is no assigned device. And we
+ * always keep PI.NDST up to date for simplicity: it makes the
+ * code easier, and CPU migration is not a fast path.
+ */
+ if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
+ return;
+
+ /*
+ * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
+ * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
+ * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
+ * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
+ * correctly.
+ */
+ if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
+ pi_clear_sn(pi_desc);
+ goto after_clear_sn;
+ }
+
+ /* The full case. */
+ do {
+ old.control = new.control = pi_desc->control;
+
+ dest = cpu_physical_id(cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ new.sn = 0;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+after_clear_sn:
+
+ /*
+ * Clear SN before reading the bitmap. The VT-d firmware
+ * writes the bitmap and reads SN atomically (5.2.3 in the
+ * spec), so it doesn't really have a memory barrier that
+ * pairs with this, but we cannot do that and we need one.
+ */
+ smp_mb__after_atomic();
+
+ if (!pi_is_pir_empty(pi_desc))
+ pi_set_on(pi_desc);
+}
+
+static bool vmx_can_use_vtd_pi(struct kvm *kvm)
+{
+ return irqchip_in_kernel(kvm) && enable_apicv &&
+ kvm_arch_has_assigned_device(kvm) &&
+ irq_remapping_cap(IRQ_POSTING_CAP);
+}
+
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
+ return;
+
+ /* Set SN when the vCPU is preempted */
+ if (vcpu->preempted)
+ pi_set_sn(pi_desc);
+}
+
+static void __pi_post_block(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+ struct pi_desc old, new;
+ unsigned int dest;
+
+ do {
+ old.control = new.control = pi_desc->control;
+ WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
+ "Wakeup handler not enabled while the VCPU is blocked\n");
+
+ dest = cpu_physical_id(vcpu->cpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'notification vector' */
+ new.nv = POSTED_INTR_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
+ raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_del(&vcpu->blocked_vcpu_list);
+ raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ vcpu->pre_pcpu = -1;
+ }
+}
+
+/*
+ * This routine does the following things for vCPU which is going
+ * to be blocked if VT-d PI is enabled.
+ * - Store the vCPU to the wakeup list, so when interrupts happen
+ * we can find the right vCPU to wake up.
+ * - Change the Posted-interrupt descriptor as below:
+ * 'NDST' <-- vcpu->pre_pcpu
+ * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
+ * - If 'ON' is set during this process, which means at least one
+ * interrupt is posted for this vCPU, we cannot block it, in
+ * this case, return 1, otherwise, return 0.
+ *
+ */
+int pi_pre_block(struct kvm_vcpu *vcpu)
+{
+ unsigned int dest;
+ struct pi_desc old, new;
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (!vmx_can_use_vtd_pi(vcpu->kvm))
+ return 0;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
+ vcpu->pre_pcpu = vcpu->cpu;
+ raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ list_add_tail(&vcpu->blocked_vcpu_list,
+ &per_cpu(blocked_vcpu_on_cpu,
+ vcpu->pre_pcpu));
+ raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
+ }
+
+ do {
+ old.control = new.control = pi_desc->control;
+
+ WARN((pi_desc->sn == 1),
+ "Warning: SN field of posted-interrupts "
+ "is set before blocking\n");
+
+ /*
+ * Since vCPU can be preempted during this process,
+ * vcpu->cpu could be different with pre_pcpu, we
+ * need to set pre_pcpu as the destination of wakeup
+ * notification event, then we can find the right vCPU
+ * to wakeup in wakeup handler if interrupts happen
+ * when the vCPU is in blocked state.
+ */
+ dest = cpu_physical_id(vcpu->pre_pcpu);
+
+ if (x2apic_enabled())
+ new.ndst = dest;
+ else
+ new.ndst = (dest << 8) & 0xFF00;
+
+ /* set 'NV' to 'wakeup vector' */
+ new.nv = POSTED_INTR_WAKEUP_VECTOR;
+ } while (cmpxchg64(&pi_desc->control, old.control,
+ new.control) != old.control);
+
+ /* We should not block the vCPU if an interrupt is posted for it. */
+ if (pi_test_on(pi_desc) == 1)
+ __pi_post_block(vcpu);
+
+ local_irq_enable();
+ return (vcpu->pre_pcpu == -1);
+}
+
+void pi_post_block(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->pre_pcpu == -1)
+ return;
+
+ WARN_ON(irqs_disabled());
+ local_irq_disable();
+ __pi_post_block(vcpu);
+ local_irq_enable();
+}
+
+/*
+ * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
+ */
+void pi_wakeup_handler(void)
+{
+ struct kvm_vcpu *vcpu;
+ int cpu = smp_processor_id();
+
+ raw_spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
+ blocked_vcpu_list) {
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ if (pi_test_on(pi_desc) == 1)
+ kvm_vcpu_kick(vcpu);
+ }
+ raw_spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+void __init pi_init_cpu(int cpu)
+{
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ raw_spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+}
+
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
+
+ return pi_test_on(pi_desc) ||
+ (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
+}
+
+
+/*
+ * pi_update_irte - set IRTE for Posted-Interrupts
+ *
+ * @kvm: kvm
+ * @host_irq: host irq of the interrupt
+ * @guest_irq: gsi of the interrupt
+ * @set: set or unset PI
+ * returns 0 on success, < 0 on failure
+ */
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set)
+{
+ struct kvm_kernel_irq_routing_entry *e;
+ struct kvm_irq_routing_table *irq_rt;
+ struct kvm_lapic_irq irq;
+ struct kvm_vcpu *vcpu;
+ struct vcpu_data vcpu_info;
+ int idx, ret = 0;
+
+ if (!vmx_can_use_vtd_pi(kvm))
+ return 0;
+
+ idx = srcu_read_lock(&kvm->irq_srcu);
+ irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
+
+ hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
+ if (e->type != KVM_IRQ_ROUTING_MSI)
+ continue;
+ /*
+ * VT-d PI cannot support posting multicast/broadcast
+ * interrupts to a vCPU, we still use interrupt remapping
+ * for these kind of interrupts.
+ *
+ * For lowest-priority interrupts, we only support
+ * those with single CPU as the destination, e.g. user
+ * configures the interrupts via /proc/irq or uses
+ * irqbalance to make the interrupts single-CPU.
+ *
+ * We will support full lowest-priority interrupt later.
+ *
+ * In addition, we can only inject generic interrupts using
+ * the PI mechanism, refuse to route others through it.
+ */
+
+ kvm_set_msi_irq(kvm, e, &irq);
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
+ /*
+ * Make sure the IRTE is in remapped mode if
+ * we don't handle it in posted mode.
+ */
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+ if (ret < 0) {
+ printk(KERN_INFO
+ "failed to back to remapped mode, irq: %u\n",
+ host_irq);
+ goto out;
+ }
+
+ continue;
+ }
+
+ vcpu_info.pi_desc_addr = __pa(&to_vmx(vcpu)->pi_desc);
+ vcpu_info.vector = irq.vector;
+
+ trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
+ vcpu_info.vector, vcpu_info.pi_desc_addr, set);
+
+ if (set)
+ ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
+ else
+ ret = irq_set_vcpu_affinity(host_irq, NULL);
+
+ if (ret < 0) {
+ printk(KERN_INFO "%s: failed to update PI IRTE\n",
+ __func__);
+ goto out;
+ }
+ }
+
+ ret = 0;
+out:
+ srcu_read_unlock(&kvm->irq_srcu, idx);
+ return ret;
+}
diff --git a/arch/x86/kvm/vmx/posted_intr.h b/arch/x86/kvm/vmx/posted_intr.h
new file mode 100644
index 0000000..0bdc413
--- /dev/null
+++ b/arch/x86/kvm/vmx/posted_intr.h
@@ -0,0 +1,99 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_POSTED_INTR_H
+#define __KVM_X86_VMX_POSTED_INTR_H
+
+#define POSTED_INTR_ON 0
+#define POSTED_INTR_SN 1
+
+/* Posted-Interrupt Descriptor */
+struct pi_desc {
+ u32 pir[8]; /* Posted interrupt requested */
+ union {
+ struct {
+ /* bit 256 - Outstanding Notification */
+ u16 on : 1,
+ /* bit 257 - Suppress Notification */
+ sn : 1,
+ /* bit 271:258 - Reserved */
+ rsvd_1 : 14;
+ /* bit 279:272 - Notification Vector */
+ u8 nv;
+ /* bit 287:280 - Reserved */
+ u8 rsvd_2;
+ /* bit 319:288 - Notification Destination */
+ u32 ndst;
+ };
+ u64 control;
+ };
+ u32 rsvd[6];
+} __aligned(64);
+
+static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
+{
+ return test_and_clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
+{
+ return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
+}
+
+static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
+{
+ return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
+}
+
+static inline void pi_set_sn(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_set_on(struct pi_desc *pi_desc)
+{
+ set_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_on(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline void pi_clear_sn(struct pi_desc *pi_desc)
+{
+ clear_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_on(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_ON,
+ (unsigned long *)&pi_desc->control);
+}
+
+static inline int pi_test_sn(struct pi_desc *pi_desc)
+{
+ return test_bit(POSTED_INTR_SN,
+ (unsigned long *)&pi_desc->control);
+}
+
+void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu);
+int pi_pre_block(struct kvm_vcpu *vcpu);
+void pi_post_block(struct kvm_vcpu *vcpu);
+void pi_wakeup_handler(void);
+void __init pi_init_cpu(int cpu);
+bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu);
+int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
+ bool set);
+
+#endif /* __KVM_X86_VMX_POSTED_INTR_H */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 481ad87..571d9ad 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -19,7 +19,7 @@
struct vmcs {
struct vmcs_hdr hdr;
u32 abort;
- char data[0];
+ char data[];
};
DECLARE_PER_CPU(struct vmcs *, current_vmcs);
@@ -72,11 +72,24 @@
struct vmcs_controls_shadow controls_shadow;
};
+static inline bool is_intr_type(u32 intr_info, u32 type)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type);
+}
+
+static inline bool is_intr_type_n(u32 intr_info, u32 type, u8 vector)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK |
+ INTR_INFO_VECTOR_MASK;
+
+ return (intr_info & mask) == (INTR_INFO_VALID_MASK | type | vector);
+}
+
static inline bool is_exception_n(u32 intr_info, u8 vector)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | vector | INTR_INFO_VALID_MASK);
+ return is_intr_type_n(intr_info, INTR_TYPE_HARD_EXCEPTION, vector);
}
static inline bool is_debug(u32 intr_info)
@@ -104,30 +117,37 @@
return is_exception_n(intr_info, GP_VECTOR);
}
+static inline bool is_alignment_check(u32 intr_info)
+{
+ return is_exception_n(intr_info, AC_VECTOR);
+}
+
static inline bool is_machine_check(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
- INTR_INFO_VALID_MASK)) ==
- (INTR_TYPE_HARD_EXCEPTION | MC_VECTOR | INTR_INFO_VALID_MASK);
+ return is_exception_n(intr_info, MC_VECTOR);
}
/* Undocumented: icebp/int1 */
static inline bool is_icebp(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_PRIV_SW_EXCEPTION | INTR_INFO_VALID_MASK);
+ return is_intr_type(intr_info, INTR_TYPE_PRIV_SW_EXCEPTION);
}
static inline bool is_nmi(u32 intr_info)
{
- return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
- == (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
+ return is_intr_type(intr_info, INTR_TYPE_NMI_INTR);
}
static inline bool is_external_intr(u32 intr_info)
{
- return (intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK))
- == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR);
+ return is_intr_type(intr_info, INTR_TYPE_EXT_INTR);
+}
+
+static inline bool is_exception_with_error_code(u32 intr_info)
+{
+ const u32 mask = INTR_INFO_VALID_MASK | INTR_INFO_DELIVER_CODE_MASK;
+
+ return (intr_info & mask) == mask;
}
enum vmcs_field_width {
diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c
index 53dfb40..c8e51c0 100644
--- a/arch/x86/kvm/vmx/vmcs12.c
+++ b/arch/x86/kvm/vmx/vmcs12.c
@@ -115,10 +115,6 @@
FIELD(CR4_GUEST_HOST_MASK, cr4_guest_host_mask),
FIELD(CR0_READ_SHADOW, cr0_read_shadow),
FIELD(CR4_READ_SHADOW, cr4_read_shadow),
- FIELD(CR3_TARGET_VALUE0, cr3_target_value0),
- FIELD(CR3_TARGET_VALUE1, cr3_target_value1),
- FIELD(CR3_TARGET_VALUE2, cr3_target_value2),
- FIELD(CR3_TARGET_VALUE3, cr3_target_value3),
FIELD(EXIT_QUALIFICATION, exit_qualification),
FIELD(GUEST_LINEAR_ADDRESS, guest_linear_address),
FIELD(GUEST_CR0, guest_cr0),
diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h
index d0c6df3..80232da 100644
--- a/arch/x86/kvm/vmx/vmcs12.h
+++ b/arch/x86/kvm/vmx/vmcs12.h
@@ -80,10 +80,7 @@
natural_width cr4_guest_host_mask;
natural_width cr0_read_shadow;
natural_width cr4_read_shadow;
- natural_width cr3_target_value0;
- natural_width cr3_target_value1;
- natural_width cr3_target_value2;
- natural_width cr3_target_value3;
+ natural_width dead_space[4]; /* Last remnants of cr3_target_value[0-3]. */
natural_width exit_qualification;
natural_width guest_linear_address;
natural_width guest_cr0;
@@ -263,10 +260,7 @@
CHECK_OFFSET(cr4_guest_host_mask, 352);
CHECK_OFFSET(cr0_read_shadow, 360);
CHECK_OFFSET(cr4_read_shadow, 368);
- CHECK_OFFSET(cr3_target_value0, 376);
- CHECK_OFFSET(cr3_target_value1, 384);
- CHECK_OFFSET(cr3_target_value2, 392);
- CHECK_OFFSET(cr3_target_value3, 400);
+ CHECK_OFFSET(dead_space, 376);
CHECK_OFFSET(exit_qualification, 408);
CHECK_OFFSET(guest_linear_address, 416);
CHECK_OFFSET(guest_cr0, 424);
diff --git a/arch/x86/kvm/vmx/vmcs_shadow_fields.h b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
index eb1ecd1..cad128d 100644
--- a/arch/x86/kvm/vmx/vmcs_shadow_fields.h
+++ b/arch/x86/kvm/vmx/vmcs_shadow_fields.h
@@ -23,12 +23,12 @@
*
* When adding or removing fields here, note that shadowed
* fields must always be synced by prepare_vmcs02, not just
- * prepare_vmcs02_full.
+ * prepare_vmcs02_rare.
*/
/*
* Keeping the fields ordered by size is an attempt at improving
- * branch prediction in vmcs_read_any and vmcs_write_any.
+ * branch prediction in vmcs12_read_any and vmcs12_write_any.
*/
/* 16-bits */
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index ca4252f..90ad7a6 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -4,6 +4,7 @@
#include <asm/bitsperlong.h>
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
+#include <asm/segment.h>
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -27,7 +28,7 @@
#define VCPU_R15 __VCPU_REGS_R15 * WORD_SIZE
#endif
- .text
+.section .noinstr.text, "ax"
/**
* vmx_vmenter - VM-Enter the current loaded VMCS
@@ -43,7 +44,7 @@
* they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
* to vmx_vmexit.
*/
-ENTRY(vmx_vmenter)
+SYM_FUNC_START(vmx_vmenter)
/* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
je 2f
@@ -58,14 +59,10 @@
ret
4: ud2
- .pushsection .fixup, "ax"
-5: jmp 3b
- .popsection
+ _ASM_EXTABLE(1b, 3b)
+ _ASM_EXTABLE(2b, 3b)
- _ASM_EXTABLE(1b, 5b)
- _ASM_EXTABLE(2b, 5b)
-
-ENDPROC(vmx_vmenter)
+SYM_FUNC_END(vmx_vmenter)
/**
* vmx_vmexit - Handle a VMX VM-Exit
@@ -77,7 +74,7 @@
* here after hardware loads the host's state, i.e. this is the destination
* referred to by VMCS.HOST_RIP.
*/
-ENTRY(vmx_vmexit)
+SYM_FUNC_START(vmx_vmexit)
#ifdef CONFIG_RETPOLINE
ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
/* Preserve guest's RAX, it's used to stuff the RSB. */
@@ -93,7 +90,7 @@
.Lvmexit_skip_rsb:
#endif
ret
-ENDPROC(vmx_vmexit)
+SYM_FUNC_END(vmx_vmexit)
/**
* __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
@@ -104,7 +101,7 @@
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
*/
-ENTRY(__vmx_vcpu_run)
+SYM_FUNC_START(__vmx_vcpu_run)
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP
#ifdef CONFIG_X86_64
@@ -138,12 +135,12 @@
cmpb $0, %bl
/* Load guest registers. Don't clobber flags. */
- mov VCPU_RBX(%_ASM_AX), %_ASM_BX
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
mov VCPU_RDX(%_ASM_AX), %_ASM_DX
+ mov VCPU_RBX(%_ASM_AX), %_ASM_BX
+ mov VCPU_RBP(%_ASM_AX), %_ASM_BP
mov VCPU_RSI(%_ASM_AX), %_ASM_SI
mov VCPU_RDI(%_ASM_AX), %_ASM_DI
- mov VCPU_RBP(%_ASM_AX), %_ASM_BP
#ifdef CONFIG_X86_64
mov VCPU_R8 (%_ASM_AX), %r8
mov VCPU_R9 (%_ASM_AX), %r9
@@ -170,13 +167,13 @@
mov WORD_SIZE(%_ASM_SP), %_ASM_AX
/* Save all guest registers, including RAX from the stack */
- __ASM_SIZE(pop) VCPU_RAX(%_ASM_AX)
- mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
- mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
- mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
- mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
- mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
- mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ pop VCPU_RAX(%_ASM_AX)
+ mov %_ASM_CX, VCPU_RCX(%_ASM_AX)
+ mov %_ASM_DX, VCPU_RDX(%_ASM_AX)
+ mov %_ASM_BX, VCPU_RBX(%_ASM_AX)
+ mov %_ASM_BP, VCPU_RBP(%_ASM_AX)
+ mov %_ASM_SI, VCPU_RSI(%_ASM_AX)
+ mov %_ASM_DI, VCPU_RDI(%_ASM_AX)
#ifdef CONFIG_X86_64
mov %r8, VCPU_R8 (%_ASM_AX)
mov %r9, VCPU_R9 (%_ASM_AX)
@@ -200,12 +197,12 @@
* free. RSP and RAX are exempt as RSP is restored by hardware during
* VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
*/
-1: xor %ebx, %ebx
- xor %ecx, %ecx
+1: xor %ecx, %ecx
xor %edx, %edx
+ xor %ebx, %ebx
+ xor %ebp, %ebp
xor %esi, %esi
xor %edi, %edi
- xor %ebp, %ebp
#ifdef CONFIG_X86_64
xor %r8d, %r8d
xor %r9d, %r9d
@@ -236,7 +233,10 @@
/* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
2: mov $1, %eax
jmp 1b
-ENDPROC(__vmx_vcpu_run)
+SYM_FUNC_END(__vmx_vcpu_run)
+
+
+.section .text, "ax"
/**
* vmread_error_trampoline - Trampoline from inline asm to vmread_error()
@@ -246,7 +246,7 @@
* Save and restore volatile registers across a call to vmread_error(). Note,
* all parameters are passed on the stack.
*/
-ENTRY(vmread_error_trampoline)
+SYM_FUNC_START(vmread_error_trampoline)
push %_ASM_BP
mov %_ASM_SP, %_ASM_BP
@@ -294,4 +294,37 @@
pop %_ASM_BP
ret
-ENDPROC(vmread_error_trampoline)
+SYM_FUNC_END(vmread_error_trampoline)
+
+SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
+ /*
+ * Unconditionally create a stack frame, getting the correct RSP on the
+ * stack (for x86-64) would take two instructions anyways, and RBP can
+ * be used to restore RSP to make objtool happy (see below).
+ */
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+#ifdef CONFIG_X86_64
+ /*
+ * Align RSP to a 16-byte boundary (to emulate CPU behavior) before
+ * creating the synthetic interrupt stack frame for the IRQ/NMI.
+ */
+ and $-16, %rsp
+ push $__KERNEL_DS
+ push %rbp
+#endif
+ pushf
+ push $__KERNEL_CS
+ CALL_NOSPEC _ASM_ARG1
+
+ /*
+ * "Restore" RSP from RBP, even though IRET has already unwound RSP to
+ * the correct value. objtool doesn't know the callee will IRET and,
+ * without the explicit restore, thinks the stack is getting walloped.
+ * Using an unwind hint is problematic due to x86-64's dynamic alignment.
+ */
+ mov %_ASM_BP, %_ASM_SP
+ pop %_ASM_BP
+ ret
+SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index e177848..94f5f21 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -13,7 +13,6 @@
* Yaniv Kamay <yaniv@qumranet.com>
*/
-#include <linux/frame.h>
#include <linux/highmem.h>
#include <linux/hrtimer.h>
#include <linux/kernel.h>
@@ -22,18 +21,22 @@
#include <linux/moduleparam.h>
#include <linux/mod_devicetable.h>
#include <linux/mm.h>
+#include <linux/objtool.h>
#include <linux/sched.h>
#include <linux/sched/smt.h>
#include <linux/slab.h>
#include <linux/tboot.h>
#include <linux/trace_events.h>
+#include <linux/entry-kvm.h>
#include <asm/apic.h>
#include <asm/asm.h>
#include <asm/cpu.h>
+#include <asm/cpu_device_id.h>
#include <asm/debugreg.h>
#include <asm/desc.h>
#include <asm/fpu/internal.h>
+#include <asm/idtentry.h>
#include <asm/io.h>
#include <asm/irq_remapping.h>
#include <asm/kexec.h>
@@ -41,6 +44,7 @@
#include <asm/mce.h>
#include <asm/mmu_context.h>
#include <asm/mshyperv.h>
+#include <asm/mwait.h>
#include <asm/spec-ctrl.h>
#include <asm/virtext.h>
#include <asm/vmx.h>
@@ -53,7 +57,6 @@
#include "lapic.h"
#include "mmu.h"
#include "nested.h"
-#include "ops.h"
#include "pmu.h"
#include "trace.h"
#include "vmcs.h"
@@ -64,11 +67,13 @@
MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");
+#ifdef MODULE
static const struct x86_cpu_id vmx_cpu_id[] = {
- X86_FEATURE_MATCH(X86_FEATURE_VMX),
+ X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
+#endif
bool __read_mostly enable_vpid = 1;
module_param_named(vpid, enable_vpid, bool, 0444);
@@ -106,8 +111,6 @@
static bool __read_mostly nested = 1;
module_param(nested, bool, S_IRUGO);
-static u64 __read_mostly host_xss;
-
bool __read_mostly enable_pml = 1;
module_param_named(pml, enable_pml, bool, S_IRUGO);
@@ -126,14 +129,14 @@
module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
#endif
+extern bool __read_mostly allow_smaller_maxphyaddr;
+module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
+
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON \
(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \
X86_CR0_WP | X86_CR0_PG | X86_CR0_PE)
-#define KVM_CR4_GUEST_OWNED_BITS \
- (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_TSD)
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
@@ -146,8 +149,27 @@
RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
RTIT_STATUS_BYTECNT))
-#define MSR_IA32_RTIT_OUTPUT_BASE_MASK \
- (~((1UL << cpuid_query_maxphyaddr(vcpu)) - 1) | 0x7f)
+/*
+ * List of MSRs that can be directly passed to the guest.
+ * In addition to these x2apic and PT MSRs are handled specially.
+ */
+static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
+ MSR_IA32_SPEC_CTRL,
+ MSR_IA32_PRED_CMD,
+ MSR_IA32_TSC,
+#ifdef CONFIG_X86_64
+ MSR_FS_BASE,
+ MSR_GS_BASE,
+ MSR_KERNEL_GS_BASE,
+#endif
+ MSR_IA32_SYSENTER_CS,
+ MSR_IA32_SYSENTER_ESP,
+ MSR_IA32_SYSENTER_EIP,
+ MSR_CORE_C1_RES,
+ MSR_CORE_C3_RESIDENCY,
+ MSR_CORE_C6_RESIDENCY,
+ MSR_CORE_C7_RESIDENCY,
+};
/*
* These 2 parameters are used to config the controls for Pause-Loop Exiting:
@@ -341,9 +363,8 @@
};
module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, 0644);
-static bool guest_state_valid(struct kvm_vcpu *vcpu);
static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type);
void vmx_vmexit(void);
@@ -398,13 +419,6 @@
*/
static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
-/*
- * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
- * can find which vCPU should be waken up.
- */
-static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
-static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
-
static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
static DEFINE_SPINLOCK(vmx_vpid_lock);
@@ -435,7 +449,11 @@
VMX_SEGMENT_FIELD(LDTR),
};
-u64 host_efer;
+static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+{
+ vmx->segment_cache.bitmask = 0;
+}
+
static unsigned long host_idt_base;
/*
@@ -443,13 +461,14 @@
* will emulate SYSCALL in legacy mode if the vendor string in guest
* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
* support this emulation, IA32_STAR must always be included in
- * vmx_msr_index[], even in i386 builds.
+ * vmx_uret_msrs_list[], even in i386 builds.
*/
-const u32 vmx_msr_index[] = {
+static const u32 vmx_uret_msrs_list[] = {
#ifdef CONFIG_X86_64
MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
#endif
MSR_EFER, MSR_TSC_AUX, MSR_STAR,
+ MSR_IA32_TSX_CTRL,
};
#if IS_ENABLED(CONFIG_HYPERV)
@@ -618,33 +637,76 @@
return flexpriority_enabled;
}
-static inline int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
+static int possible_passthrough_msr_slot(u32 msr)
+{
+ u32 i;
+
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++)
+ if (vmx_possible_passthrough_msrs[i] == msr)
+ return i;
+
+ return -ENOENT;
+}
+
+static bool is_valid_passthrough_msr(u32 msr)
+{
+ bool r;
+
+ switch (msr) {
+ case 0x800 ... 0x8ff:
+ /* x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() */
+ return true;
+ case MSR_IA32_RTIT_STATUS:
+ case MSR_IA32_RTIT_OUTPUT_BASE:
+ case MSR_IA32_RTIT_OUTPUT_MASK:
+ case MSR_IA32_RTIT_CR3_MATCH:
+ case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ /* PT MSRs. These are handled in pt_update_intercept_for_msr() */
+ return true;
+ }
+
+ r = possible_passthrough_msr_slot(msr) != -ENOENT;
+
+ WARN(!r, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
+
+ return r;
+}
+
+static inline int __vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- for (i = 0; i < vmx->nmsrs; ++i)
- if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
+ for (i = 0; i < vmx->nr_uret_msrs; ++i)
+ if (vmx_uret_msrs_list[vmx->guest_uret_msrs[i].slot] == msr)
return i;
return -1;
}
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr)
{
int i;
- i = __find_msr_index(vmx, msr);
+ i = __vmx_find_uret_msr(vmx, msr);
if (i >= 0)
- return &vmx->guest_msrs[i];
+ return &vmx->guest_uret_msrs[i];
return NULL;
}
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs)
+static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
+ struct vmx_uret_msr *msr, u64 data)
{
- vmcs_clear(loaded_vmcs->vmcs);
- if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
- vmcs_clear(loaded_vmcs->shadow_vmcs);
- loaded_vmcs->cpu = -1;
- loaded_vmcs->launched = 0;
+ int ret = 0;
+
+ u64 old_msr_data = msr->data;
+ msr->data = data;
+ if (msr - vmx->guest_uret_msrs < vmx->nr_active_uret_msrs) {
+ preempt_disable();
+ ret = kvm_set_user_return_msr(msr->slot, msr->data, msr->mask);
+ preempt_enable();
+ if (ret)
+ msr->data = old_msr_data;
+ }
+ return ret;
}
#ifdef CONFIG_KEXEC_CORE
@@ -703,8 +765,8 @@
bool ret;
u32 mask = 1 << (seg * SEG_FIELD_NR + field);
- if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
- vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
+ if (!kvm_register_is_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS)) {
+ kvm_register_mark_available(&vmx->vcpu, VCPU_EXREG_SEGMENTS);
vmx->segment_cache.bitmask = 0;
}
ret = vmx->segment_cache.bitmask & mask;
@@ -768,8 +830,8 @@
eb |= 1u << BP_VECTOR;
if (to_vmx(vcpu)->rmode.vm86_active)
eb = ~0;
- if (enable_ept)
- eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
+ if (!vmx_need_pf_intercept(vcpu))
+ eb &= ~(1u << PF_VECTOR);
/* When we are running a nested L2 guest and L1 specified for it a
* certain exception bitmap, we must trap the same exceptions and pass
@@ -778,6 +840,18 @@
*/
if (is_guest_mode(vcpu))
eb |= get_vmcs12(vcpu)->exception_bitmap;
+ else {
+ /*
+ * If EPT is enabled, #PF is only trapped if MAXPHYADDR is mismatched
+ * between guest and host. In that case we only care about present
+ * faults. For vmcs02, however, PFEC_MASK and PFEC_MATCH are set in
+ * prepare_vmcs02_rare.
+ */
+ bool selective_pf_trap = enable_ept && (eb & (1u << PF_VECTOR));
+ int mask = selective_pf_trap ? PFERR_PRESENT_MASK : 0;
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, mask);
+ vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, mask);
+ }
vmcs_write32(EXCEPTION_BITMAP, eb);
}
@@ -785,15 +859,15 @@
/*
* Check if MSR is intercepted for currently loaded MSR bitmap.
*/
-static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
+static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
{
unsigned long *msr_bitmap;
int f = sizeof(unsigned long);
- if (!cpu_has_vmx_msr_bitmap())
+ if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
return true;
- msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap;
+ msr_bitmap = vmx->loaded_vmcs->msr_bitmap;
if (msr <= 0x1fff) {
return !!test_bit(msr, msr_bitmap + 0x800 / f);
@@ -812,7 +886,7 @@
vm_exit_controls_clearbit(vmx, exit);
}
-static int find_msr(struct vmx_msrs *m, unsigned int msr)
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
{
unsigned int i;
@@ -846,7 +920,7 @@
}
break;
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (i < 0)
goto skip_guest;
--m->guest.nr;
@@ -854,7 +928,7 @@
vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, m->guest.nr);
skip_guest:
- i = find_msr(&m->host, msr);
+ i = vmx_find_loadstore_msr_slot(&m->host, msr);
if (i < 0)
return;
@@ -913,12 +987,12 @@
wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
}
- i = find_msr(&m->guest, msr);
+ i = vmx_find_loadstore_msr_slot(&m->guest, msr);
if (!entry_only)
- j = find_msr(&m->host, msr);
+ j = vmx_find_loadstore_msr_slot(&m->host, msr);
- if ((i < 0 && m->guest.nr == NR_AUTOLOAD_MSRS) ||
- (j < 0 && m->host.nr == NR_AUTOLOAD_MSRS)) {
+ if ((i < 0 && m->guest.nr == MAX_NR_LOADSTORE_MSRS) ||
+ (j < 0 && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
printk_once(KERN_WARNING "Not enough msr switch entries. "
"Can't add msr %x\n", msr);
return;
@@ -941,10 +1015,11 @@
m->host.val[j].value = host_val;
}
-static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
+static bool update_transition_efer(struct vcpu_vmx *vmx)
{
u64 guest_efer = vmx->vcpu.arch.efer;
u64 ignore_bits = 0;
+ int i;
/* Shadow paging assumes NX to be available. */
if (!enable_ept)
@@ -976,17 +1051,21 @@
else
clear_atomic_switch_msr(vmx, MSR_EFER);
return false;
- } else {
- clear_atomic_switch_msr(vmx, MSR_EFER);
-
- guest_efer &= ~ignore_bits;
- guest_efer |= host_efer & ignore_bits;
-
- vmx->guest_msrs[efer_offset].data = guest_efer;
- vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
-
- return true;
}
+
+ i = __vmx_find_uret_msr(vmx, MSR_EFER);
+ if (i < 0)
+ return false;
+
+ clear_atomic_switch_msr(vmx, MSR_EFER);
+
+ guest_efer &= ~ignore_bits;
+ guest_efer |= host_efer & ignore_bits;
+
+ vmx->guest_uret_msrs[i].data = guest_efer;
+ vmx->guest_uret_msrs[i].mask = ~ignore_bits;
+
+ return true;
}
#ifdef CONFIG_X86_32
@@ -1018,6 +1097,18 @@
}
#endif
+static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
+{
+ return vmx_pt_mode_is_host_guest() &&
+ !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
+}
+
+static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
+{
+ /* The base must be 128-byte aligned and a legal physical address. */
+ return !kvm_vcpu_is_illegal_gpa(vcpu, base) && !(base & 0x7f);
+}
+
static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
{
u32 i;
@@ -1048,7 +1139,7 @@
static void pt_guest_enter(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
/*
@@ -1065,7 +1156,7 @@
static void pt_guest_exit(struct vcpu_vmx *vmx)
{
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
return;
if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
@@ -1122,12 +1213,12 @@
* when guest state is loaded. This happens when guest transitions
* to/from long-mode by setting MSR_EFER.LMA.
*/
- if (!vmx->guest_msrs_ready) {
- vmx->guest_msrs_ready = true;
- for (i = 0; i < vmx->save_nmsrs; ++i)
- kvm_set_shared_msr(vmx->guest_msrs[i].index,
- vmx->guest_msrs[i].data,
- vmx->guest_msrs[i].mask);
+ if (!vmx->guest_uret_msrs_loaded) {
+ vmx->guest_uret_msrs_loaded = true;
+ for (i = 0; i < vmx->nr_active_uret_msrs; ++i)
+ kvm_set_user_return_msr(vmx->guest_uret_msrs[i].slot,
+ vmx->guest_uret_msrs[i].data,
+ vmx->guest_uret_msrs[i].mask);
}
@@ -1151,7 +1242,7 @@
gs_base = cpu_kernelmode_gs_base(cpu);
if (likely(is_64bit_mm(current->mm))) {
- save_fsgs_for_kvm();
+ current_save_fsgs();
fs_sel = current->thread.fsindex;
gs_sel = current->thread.gsindex;
fs_base = current->thread.fsbase;
@@ -1211,7 +1302,7 @@
#endif
load_fixmap_gdt(raw_smp_processor_id());
vmx->guest_state_loaded = false;
- vmx->guest_msrs_ready = false;
+ vmx->guest_uret_msrs_loaded = false;
}
#ifdef CONFIG_X86_64
@@ -1234,62 +1325,6 @@
}
#endif
-static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- /*
- * In case of hot-plug or hot-unplug, we may have to undo
- * vmx_vcpu_pi_put even if there is no assigned device. And we
- * always keep PI.NDST up to date for simplicity: it makes the
- * code easier, and CPU migration is not a fast path.
- */
- if (!pi_test_sn(pi_desc) && vcpu->cpu == cpu)
- return;
-
- /*
- * If the 'nv' field is POSTED_INTR_WAKEUP_VECTOR, do not change
- * PI.NDST: pi_post_block is the one expected to change PID.NDST and the
- * wakeup handler expects the vCPU to be on the blocked_vcpu_list that
- * matches PI.NDST. Otherwise, a vcpu may not be able to be woken up
- * correctly.
- */
- if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR || vcpu->cpu == cpu) {
- pi_clear_sn(pi_desc);
- goto after_clear_sn;
- }
-
- /* The full case. */
- do {
- old.control = new.control = pi_desc->control;
-
- dest = cpu_physical_id(cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- new.sn = 0;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
-after_clear_sn:
-
- /*
- * Clear SN before reading the bitmap. The VT-d firmware
- * writes the bitmap and reads SN atomically (5.2.3 in the
- * spec), so it doesn't really have a memory barrier that
- * pairs with this, but we cannot do that and we need one.
- */
- smp_mb__after_atomic();
-
- if (!pi_is_pir_empty(pi_desc))
- pi_set_on(pi_desc);
-}
-
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy)
{
@@ -1332,6 +1367,10 @@
void *gdt = get_current_gdt_ro();
unsigned long sysenter_esp;
+ /*
+ * Flush all EPTP/VPID contexts, the new pCPU may have stale
+ * TLB entries from its previous association with the vCPU.
+ */
kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
@@ -1342,14 +1381,6 @@
(unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
vmcs_writel(HOST_GDTR_BASE, (unsigned long)gdt); /* 22.2.4 */
- /*
- * VM exits change the host TR limit to 0x67 after a VM
- * exit. This is okay, since 0x67 covers everything except
- * the IO bitmap and have have code to handle the IO bitmap
- * being lost after a VM exit.
- */
- BUILD_BUG_ON(IO_BITMAP_OFFSET - 1 != 0x67);
-
rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
@@ -1366,7 +1397,7 @@
* Switches to specified vcpu, until a matching vcpu_put(), but assumes
* vcpu mutex is already taken.
*/
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1377,20 +1408,6 @@
vmx->host_debugctlmsr = get_debugctlmsr();
}
-static void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return;
-
- /* Set SN when the vCPU is preempted */
- if (vcpu->preempted)
- pi_set_sn(pi_desc);
-}
-
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
vmx_vcpu_pi_put(vcpu);
@@ -1400,42 +1417,49 @@
static bool emulation_required(struct kvm_vcpu *vcpu)
{
- return emulate_invalid_guest_state && !guest_state_valid(vcpu);
+ return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
-
unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long rflags, save_rflags;
- if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_RFLAGS)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
rflags = vmcs_readl(GUEST_RFLAGS);
- if (to_vmx(vcpu)->rmode.vm86_active) {
+ if (vmx->rmode.vm86_active) {
rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
- save_rflags = to_vmx(vcpu)->rmode.save_rflags;
+ save_rflags = vmx->rmode.save_rflags;
rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
}
- to_vmx(vcpu)->rflags = rflags;
+ vmx->rflags = rflags;
}
- return to_vmx(vcpu)->rflags;
+ return vmx->rflags;
}
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
- unsigned long old_rflags = vmx_get_rflags(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long old_rflags;
- __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
- to_vmx(vcpu)->rflags = rflags;
- if (to_vmx(vcpu)->rmode.vm86_active) {
- to_vmx(vcpu)->rmode.save_rflags = rflags;
+ if (is_unrestricted_guest(vcpu)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_RFLAGS);
+ vmx->rflags = rflags;
+ vmcs_writel(GUEST_RFLAGS, rflags);
+ return;
+ }
+
+ old_rflags = vmx_get_rflags(vcpu);
+ vmx->rflags = rflags;
+ if (vmx->rmode.vm86_active) {
+ vmx->rmode.save_rflags = rflags;
rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
}
vmcs_writel(GUEST_RFLAGS, rflags);
- if ((old_rflags ^ to_vmx(vcpu)->rflags) & X86_EFLAGS_VM)
- to_vmx(vcpu)->emulation_required = emulation_required(vcpu);
+ if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
+ vmx->emulation_required = emulation_required(vcpu);
}
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
@@ -1539,6 +1563,11 @@
return 0;
}
+static bool vmx_can_emulate_instruction(struct kvm_vcpu *vcpu, void *insn, int insn_len)
+{
+ return true;
+}
+
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
unsigned long rip, orig_rip;
@@ -1552,7 +1581,7 @@
* i.e. we end up advancing IP with some random value.
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
- to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
+ to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
orig_rip = kvm_rip_read(vcpu);
rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
#ifdef CONFIG_X86_64
@@ -1576,6 +1605,39 @@
return 1;
}
+/*
+ * Recognizes a pending MTF VM-exit and records the nested state for later
+ * delivery.
+ */
+static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!is_guest_mode(vcpu))
+ return;
+
+ /*
+ * Per the SDM, MTF takes priority over debug-trap exceptions besides
+ * T-bit traps. As instruction emulation is completed (i.e. at the
+ * instruction boundary), any #DB exception pending delivery must be a
+ * debug-trap. Record the pending MTF state to be delivered in
+ * vmx_check_nested_events().
+ */
+ if (nested_cpu_has_mtf(vmcs12) &&
+ (!vcpu->arch.exception.pending ||
+ vcpu->arch.exception.nr == DB_VECTOR))
+ vmx->nested.mtf_pending = true;
+ else
+ vmx->nested.mtf_pending = false;
+}
+
+static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
+{
+ vmx_update_emulated_instruction(vcpu);
+ return skip_emulated_instruction(vcpu);
+}
+
static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
{
/*
@@ -1626,26 +1688,19 @@
vmx_clear_hlt(vcpu);
}
-static bool vmx_rdtscp_supported(void)
+static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr)
{
- return cpu_has_vmx_rdtscp();
-}
+ struct vmx_uret_msr tmp;
+ int from, to;
-static bool vmx_invpcid_supported(void)
-{
- return cpu_has_vmx_invpcid();
-}
+ from = __vmx_find_uret_msr(vmx, msr);
+ if (from < 0)
+ return;
+ to = vmx->nr_active_uret_msrs++;
-/*
- * Swap MSR entry in host/guest MSR entry array.
- */
-static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
-{
- struct shared_msr_entry tmp;
-
- tmp = vmx->guest_msrs[to];
- vmx->guest_msrs[to] = vmx->guest_msrs[from];
- vmx->guest_msrs[from] = tmp;
+ tmp = vmx->guest_uret_msrs[to];
+ vmx->guest_uret_msrs[to] = vmx->guest_uret_msrs[from];
+ vmx->guest_uret_msrs[from] = tmp;
}
/*
@@ -1655,51 +1710,31 @@
*/
static void setup_msrs(struct vcpu_vmx *vmx)
{
- int save_nmsrs, index;
-
- save_nmsrs = 0;
+ vmx->guest_uret_msrs_loaded = false;
+ vmx->nr_active_uret_msrs = 0;
#ifdef CONFIG_X86_64
/*
* The SYSCALL MSRs are only needed on long mode guests, and only
* when EFER.SCE is set.
*/
if (is_long_mode(&vmx->vcpu) && (vmx->vcpu.arch.efer & EFER_SCE)) {
- index = __find_msr_index(vmx, MSR_STAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_LSTAR);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
- if (index >= 0)
- move_msr_up(vmx, index, save_nmsrs++);
+ vmx_setup_uret_msr(vmx, MSR_STAR);
+ vmx_setup_uret_msr(vmx, MSR_LSTAR);
+ vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK);
}
#endif
- index = __find_msr_index(vmx, MSR_EFER);
- if (index >= 0 && update_transition_efer(vmx, index))
- move_msr_up(vmx, index, save_nmsrs++);
- index = __find_msr_index(vmx, MSR_TSC_AUX);
- if (index >= 0 && guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
- move_msr_up(vmx, index, save_nmsrs++);
+ if (update_transition_efer(vmx))
+ vmx_setup_uret_msr(vmx, MSR_EFER);
- vmx->save_nmsrs = save_nmsrs;
- vmx->guest_msrs_ready = false;
+ if (guest_cpuid_has(&vmx->vcpu, X86_FEATURE_RDTSCP))
+ vmx_setup_uret_msr(vmx, MSR_TSC_AUX);
+
+ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL);
if (cpu_has_vmx_msr_bitmap())
vmx_update_msr_bitmap(&vmx->vcpu);
}
-static u64 vmx_read_l1_tsc_offset(struct kvm_vcpu *vcpu)
-{
- struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
-
- if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
- return vcpu->arch.tsc_offset - vmcs12->tsc_offset;
-
- return vcpu->arch.tsc_offset;
-}
-
static u64 vmx_write_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
@@ -1712,7 +1747,7 @@
* to the newly set TSC to get L2's TSC.
*/
if (is_guest_mode(vcpu) &&
- (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING))
+ (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETTING))
g_tsc_offset = vmcs12->tsc_offset;
trace_kvm_write_tsc_offset(vcpu->vcpu_id,
@@ -1748,11 +1783,12 @@
if (!nested)
return 1;
return vmx_get_vmx_msr(&vmcs_config.nested, msr->index, &msr->data);
+ case MSR_IA32_PERF_CAPABILITIES:
+ msr->data = vmx_get_perf_capabilities();
+ return 0;
default:
- return 1;
+ return KVM_MSR_RET_INVALID;
}
-
- return 0;
}
/*
@@ -1763,7 +1799,7 @@
static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
u32 index;
switch (msr_info->index) {
@@ -1780,6 +1816,11 @@
#endif
case MSR_EFER:
return kvm_get_msr_common(vcpu, msr_info);
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ goto find_uret_msr;
case MSR_IA32_UMWAIT_CONTROL:
if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
return 1;
@@ -1812,45 +1853,50 @@
case MSR_IA32_MCG_EXT_CTL:
if (!msr_info->host_initiated &&
!(vmx->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE))
+ FEAT_CTL_LMCE_ENABLED))
return 1;
msr_info->data = vcpu->arch.mcg_ext_ctl;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
- return vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
- &msr_info->data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
+ if (vmx_get_vmx_msr(&vmx->nested.msrs, msr_info->index,
+ &msr_info->data))
return 1;
- msr_info->data = vcpu->arch.ia32_xss;
+ /*
+ * Enlightened VMCS v1 doesn't have certain VMCS fields but
+ * instead of just ignoring the features, different Hyper-V
+ * versions are either trying to use them and fail or do some
+ * sanity checking and refuse to boot. Filter all unsupported
+ * features out.
+ */
+ if (!msr_info->host_initiated &&
+ vmx->nested.enlightened_vmcs_enabled)
+ nested_evmcs_filter_control_msr(msr_info->index,
+ &msr_info->data);
break;
case MSR_IA32_RTIT_CTL:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.ctl;
break;
case MSR_IA32_RTIT_STATUS:
- if (pt_mode != PT_MODE_HOST_GUEST)
+ if (!vmx_pt_mode_is_host_guest())
return 1;
msr_info->data = vmx->pt_desc.guest.status;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_cr3_filtering))
return 1;
msr_info->data = vmx->pt_desc.guest.cr3_match;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1859,7 +1905,7 @@
msr_info->data = vmx->pt_desc.guest.output_base;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(!intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_topa_output) &&
!intel_pt_validate_cap(vmx->pt_desc.caps,
@@ -1869,7 +1915,7 @@
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
@@ -1882,9 +1928,10 @@
if (!msr_info->host_initiated &&
!guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
return 1;
- /* Else, falls through */
+ goto find_uret_msr;
default:
- msr = find_msr_entry(vmx, msr_info->index);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_info->index);
if (msr) {
msr_info->data = msr->data;
break;
@@ -1895,15 +1942,25 @@
return 0;
}
+static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
+ u64 data)
+{
+#ifdef CONFIG_X86_64
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
+ return (u32)data;
+#endif
+ return (unsigned long)data;
+}
+
/*
- * Writes msr value into into the appropriate "register".
+ * Writes msr value into the appropriate "register".
* Returns 0 on success, non-0 otherwise.
* Assumes vcpu_load() was already called.
*/
static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr;
+ struct vmx_uret_msr *msr;
int ret = 0;
u32 msr_index = msr_info->index;
u64 data = msr_info->data;
@@ -1932,13 +1989,17 @@
vmcs_write32(GUEST_SYSENTER_CS, data);
break;
case MSR_IA32_SYSENTER_EIP:
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_eip = data;
+ }
vmcs_writel(GUEST_SYSENTER_EIP, data);
break;
case MSR_IA32_SYSENTER_ESP:
- if (is_guest_mode(vcpu))
+ if (is_guest_mode(vcpu)) {
+ data = nested_vmx_truncate_sysenter_addr(vcpu, data);
get_vmcs12(vcpu)->guest_sysenter_esp = data;
+ }
vmcs_writel(GUEST_SYSENTER_ESP, data);
break;
case MSR_IA32_DEBUGCTLMSR:
@@ -1988,15 +2049,22 @@
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging. We update the vmcs01 here for L1 as well
* since it will end up touching the MSR anyway now.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap,
+ vmx_disable_intercept_for_msr(vcpu,
MSR_IA32_SPEC_CTRL,
MSR_TYPE_RW);
break;
+ case MSR_IA32_TSX_CTRL:
+ if (!msr_info->host_initiated &&
+ !(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
+ return 1;
+ if (data & ~(TSX_CTRL_RTM_DISABLE | TSX_CTRL_CPUID_CLEAR))
+ return 1;
+ goto find_uret_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
!guest_has_pred_cmd_msr(vcpu))
@@ -2018,12 +2086,11 @@
*
* For nested:
* The handling of the MSR bitmap for L2 guests is done in
- * nested_vmx_merge_msr_bitmap. We should not touch the
+ * nested_vmx_prepare_msr_bitmap. We should not touch the
* vmcs02.msr_bitmap here since it gets completely overwritten
* in the merging.
*/
- vmx_disable_intercept_for_msr(vmx->vmcs01.msr_bitmap, MSR_IA32_PRED_CMD,
- MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W);
break;
case MSR_IA32_CR_PAT:
if (!kvm_pat_valid(data))
@@ -2046,15 +2113,15 @@
case MSR_IA32_MCG_EXT_CTL:
if ((!msr_info->host_initiated &&
!(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LMCE)) ||
+ FEAT_CTL_LMCE_ENABLED)) ||
(data & ~MCG_EXT_CTL_LMCE_EN))
return 1;
vcpu->arch.mcg_ext_ctl = data;
break;
- case MSR_IA32_FEATURE_CONTROL:
+ case MSR_IA32_FEAT_CTL:
if (!vmx_feature_control_msr_valid(vcpu, data) ||
(to_vmx(vcpu)->msr_ia32_feature_control &
- FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
+ FEAT_CTL_LOCKED && !msr_info->host_initiated))
return 1;
vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
@@ -2066,76 +2133,58 @@
if (!nested_vmx_allowed(vcpu))
return 1;
return vmx_set_vmx_msr(vcpu, msr_index, data);
- case MSR_IA32_XSS:
- if (!vmx_xsaves_supported() ||
- (!msr_info->host_initiated &&
- !(guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
- guest_cpuid_has(vcpu, X86_FEATURE_XSAVES))))
- return 1;
- /*
- * The only supported bit as of Skylake is bit 8, but
- * it is not supported on KVM.
- */
- if (data != 0)
- return 1;
- vcpu->arch.ia32_xss = data;
- if (vcpu->arch.ia32_xss != host_xss)
- add_atomic_switch_msr(vmx, MSR_IA32_XSS,
- vcpu->arch.ia32_xss, host_xss, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_XSS);
- break;
case MSR_IA32_RTIT_CTL:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
+ if (!vmx_pt_mode_is_host_guest() ||
vmx_rtit_ctl_check(vcpu, data) ||
vmx->nested.vmxon)
return 1;
vmcs_write64(GUEST_IA32_RTIT_CTL, data);
vmx->pt_desc.guest.ctl = data;
- pt_update_intercept_for_msr(vmx);
+ pt_update_intercept_for_msr(vcpu);
break;
case MSR_IA32_RTIT_STATUS:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (data & MSR_IA32_RTIT_STATUS_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (data & MSR_IA32_RTIT_STATUS_MASK)
return 1;
vmx->pt_desc.guest.status = data;
break;
case MSR_IA32_RTIT_CR3_MATCH:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_cr3_filtering))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_cr3_filtering))
return 1;
vmx->pt_desc.guest.cr3_match = data;
break;
case MSR_IA32_RTIT_OUTPUT_BASE:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)) ||
- (data & MSR_IA32_RTIT_OUTPUT_BASE_MASK))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
+ return 1;
+ if (!pt_output_base_valid(vcpu, data))
return 1;
vmx->pt_desc.guest.output_base = data;
break;
case MSR_IA32_RTIT_OUTPUT_MASK:
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (!intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_topa_output) &&
- !intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_single_range_output)))
+ if (!pt_can_write_msr(vmx))
+ return 1;
+ if (!intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_topa_output) &&
+ !intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_single_range_output))
return 1;
vmx->pt_desc.guest.output_mask = data;
break;
case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
+ if (!pt_can_write_msr(vmx))
+ return 1;
index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
- if ((pt_mode != PT_MODE_HOST_GUEST) ||
- (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) ||
- (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
- PT_CAP_num_address_ranges)))
+ if (index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
+ PT_CAP_num_address_ranges))
return 1;
if (is_noncanonical_address(data, vcpu))
return 1;
@@ -2151,23 +2200,15 @@
/* Check reserved bit, higher 32 bits should be zero */
if ((data >> 32) != 0)
return 1;
- /* Else, falls through */
+ goto find_uret_msr;
+
default:
- msr = find_msr_entry(vmx, msr_index);
- if (msr) {
- u64 old_msr_data = msr->data;
- msr->data = data;
- if (msr - vmx->guest_msrs < vmx->save_nmsrs) {
- preempt_disable();
- ret = kvm_set_shared_msr(msr->index, msr->data,
- msr->mask);
- preempt_enable();
- if (ret)
- msr->data = old_msr_data;
- }
- break;
- }
- ret = kvm_set_msr_common(vcpu, msr_info);
+ find_uret_msr:
+ msr = vmx_find_uret_msr(vmx, msr_index);
+ if (msr)
+ ret = vmx_set_guest_uret_msr(vmx, msr, data);
+ else
+ ret = kvm_set_msr_common(vcpu, msr_info);
}
return ret;
@@ -2175,7 +2216,10 @@
static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
{
- __set_bit(reg, (unsigned long *)&vcpu->arch.regs_avail);
+ unsigned long guest_owned_bits;
+
+ kvm_register_mark_available(vcpu, reg);
+
switch (reg) {
case VCPU_REGS_RSP:
vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(GUEST_RSP);
@@ -2187,7 +2231,25 @@
if (enable_ept)
ept_save_pdptrs(vcpu);
break;
+ case VCPU_EXREG_CR0:
+ guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
+
+ vcpu->arch.cr0 &= ~guest_owned_bits;
+ vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits;
+ break;
+ case VCPU_EXREG_CR3:
+ if (is_unrestricted_guest(vcpu) ||
+ (enable_ept && is_paging(vcpu)))
+ vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
+ break;
+ case VCPU_EXREG_CR4:
+ guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
+
+ vcpu->arch.cr4 &= ~guest_owned_bits;
+ vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits;
+ break;
default:
+ WARN_ON_ONCE(1);
break;
}
}
@@ -2199,44 +2261,37 @@
static __init int vmx_disabled_by_bios(void)
{
- u64 msr;
-
- rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
- if (msr & FEATURE_CONTROL_LOCKED) {
- /* launched w/ TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && tboot_enabled())
- return 1;
- /* launched w/o TXT and VMX only enabled w/ TXT */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
- && !tboot_enabled()) {
- printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
- "activate TXT before enabling KVM\n");
- return 1;
- }
- /* launched w/o TXT and VMX disabled */
- if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
- && !tboot_enabled())
- return 1;
- }
-
- return 0;
+ return !boot_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !boot_cpu_has(X86_FEATURE_VMX);
}
-static void kvm_cpu_vmxon(u64 addr)
+static int kvm_cpu_vmxon(u64 vmxon_pointer)
{
+ u64 msr;
+
cr4_set_bits(X86_CR4_VMXE);
intel_pt_handle_vmx(1);
- asm volatile ("vmxon %0" : : "m"(addr));
+ asm_volatile_goto("1: vmxon %[vmxon_pointer]\n\t"
+ _ASM_EXTABLE(1b, %l[fault])
+ : : [vmxon_pointer] "m"(vmxon_pointer)
+ : : fault);
+ return 0;
+
+fault:
+ WARN_ONCE(1, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
+ rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? 0xdeadbeef : msr);
+ intel_pt_handle_vmx(0);
+ cr4_clear_bits(X86_CR4_VMXE);
+
+ return -EFAULT;
}
static int hardware_enable(void)
{
int cpu = raw_smp_processor_id();
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
- u64 old, test_bits;
+ int r;
if (cr4_read_shadow() & X86_CR4_VMXE)
return -EBUSY;
@@ -2249,18 +2304,10 @@
!hv_get_vp_assist_page(cpu))
return -EFAULT;
- rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
+ r = kvm_cpu_vmxon(phys_addr);
+ if (r)
+ return r;
- test_bits = FEATURE_CONTROL_LOCKED;
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
- if (tboot_enabled())
- test_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
-
- if ((old & test_bits) != test_bits) {
- /* enable and lock */
- wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits);
- }
- kvm_cpu_vmxon(phys_addr);
if (enable_ept)
ept_sync_global();
@@ -2295,6 +2342,17 @@
kvm_cpu_vmxoff();
}
+/*
+ * There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
+ * directly instead of going through cpu_has(), to ensure KVM is trapping
+ * ENCLS whenever it's supported in hardware. It does not matter whether
+ * the host OS supports or has enabled SGX.
+ */
+static bool cpu_has_sgx(void)
+{
+ return cpuid_eax(0) >= 0x12 && (cpuid_eax(0x12) & BIT(0));
+}
+
static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
u32 msr, u32 *result)
{
@@ -2335,7 +2393,7 @@
CPU_BASED_CR3_STORE_EXITING |
CPU_BASED_UNCOND_IO_EXITING |
CPU_BASED_MOV_DR_EXITING |
- CPU_BASED_USE_TSC_OFFSETING |
+ CPU_BASED_USE_TSC_OFFSETTING |
CPU_BASED_MWAIT_EXITING |
CPU_BASED_MONITOR_EXITING |
CPU_BASED_INVLPG_EXITING |
@@ -2362,7 +2420,7 @@
SECONDARY_EXEC_UNRESTRICTED_GUEST |
SECONDARY_EXEC_PAUSE_LOOP_EXITING |
SECONDARY_EXEC_DESC |
- SECONDARY_EXEC_RDTSCP |
+ SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
@@ -2375,8 +2433,9 @@
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_PT_USE_GPA |
SECONDARY_EXEC_PT_CONCEAL_VMX |
- SECONDARY_EXEC_ENABLE_VMFUNC |
- SECONDARY_EXEC_ENCLS_EXITING;
+ SECONDARY_EXEC_ENABLE_VMFUNC;
+ if (cpu_has_sgx())
+ opt2 |= SECONDARY_EXEC_ENCLS_EXITING;
if (adjust_vmx_controls(min2, opt2,
MSR_IA32_VMX_PROCBASED_CTLS2,
&_cpu_based_2nd_exec_control) < 0)
@@ -2505,8 +2564,10 @@
vmcs_conf->vmexit_ctrl = _vmexit_control;
vmcs_conf->vmentry_ctrl = _vmentry_control;
- if (static_branch_unlikely(&enable_evmcs))
+#if IS_ENABLED(CONFIG_HYPERV)
+ if (enlightened_vmcs)
evmcs_sanitize_exec_ctrls(vmcs_conf);
+#endif
return 0;
}
@@ -2560,9 +2621,12 @@
if (!loaded_vmcs->vmcs)
return -ENOMEM;
+ vmcs_clear(loaded_vmcs->vmcs);
+
loaded_vmcs->shadow_vmcs = NULL;
loaded_vmcs->hv_timer_soft_disabled = false;
- loaded_vmcs_init(loaded_vmcs);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
if (cpu_has_vmx_msr_bitmap()) {
loaded_vmcs->msr_bitmap = (unsigned long *)
@@ -2670,8 +2734,6 @@
vmx->rmode.vm86_active = 0;
- vmx_segment_cache_clear(vmx);
-
vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR);
flags = vmcs_readl(GUEST_RFLAGS);
@@ -2774,13 +2836,14 @@
kvm_mmu_reset_context(vcpu);
}
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
+ struct vmx_uret_msr *msr = vmx_find_uret_msr(vmx, MSR_EFER);
+ /* Nothing to do if hardware doesn't support EFER. */
if (!msr)
- return;
+ return 0;
vcpu->arch.efer = efer;
if (efer & EFER_LMA) {
@@ -2792,6 +2855,7 @@
msr->data = efer & ~EFER_LME;
}
setup_msrs(vmx);
+ return 0;
}
#ifdef CONFIG_X86_64
@@ -2821,49 +2885,78 @@
#endif
-static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
+static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
{
- int vpid = to_vmx(vcpu)->vpid;
-
- if (!vpid_sync_vcpu_addr(vpid, addr))
- vpid_sync_context(vpid);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
- * If VPIDs are not supported or enabled, then the above is a no-op.
- * But we don't really need a TLB flush in that case anyway, because
- * each VM entry/exit includes an implicit flush when VPID is 0.
+ * INVEPT must be issued when EPT is enabled, irrespective of VPID, as
+ * the CPU is not required to invalidate guest-physical mappings on
+ * VM-Entry, even if VPID is disabled. Guest-physical mappings are
+ * associated with the root EPT structure and not any particular VPID
+ * (INVVPID also isn't required to invalidate guest-physical mappings).
*/
+ if (enable_ept) {
+ ept_sync_global();
+ } else if (enable_vpid) {
+ if (cpu_has_vmx_invvpid_global()) {
+ vpid_sync_vcpu_global();
+ } else {
+ vpid_sync_vcpu_single(vmx->vpid);
+ vpid_sync_vcpu_single(vmx->nested.vpid02);
+ }
+ }
}
-static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
+static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
- ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
-
- vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
- vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
+ if (is_guest_mode(vcpu))
+ return nested_get_vpid02(vcpu);
+ return to_vmx(vcpu)->vpid;
}
-static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest || (enable_ept && is_paging(vcpu)))
- vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
- __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
+ struct kvm_mmu *mmu = vcpu->arch.mmu;
+ u64 root_hpa = mmu->root_hpa;
+
+ /* No flush required if the current context is invalid. */
+ if (!VALID_PAGE(root_hpa))
+ return;
+
+ if (enable_ept)
+ ept_sync_context(construct_eptp(vcpu, root_hpa,
+ mmu->shadow_root_level));
+ else
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
}
-static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
- ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
-
- vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
- vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
+ /*
+ * vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
+ * vmx_flush_tlb_guest() for an explanation of why this is ok.
+ */
+ vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
-static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
+static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
+{
+ /*
+ * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
+ * vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
+ * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
+ * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
+ * i.e. no explicit INVVPID is necessary.
+ */
+ vpid_sync_context(vmx_get_current_vpid(vcpu));
+}
+
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (!test_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty))
+ if (!kvm_register_is_dirty(vcpu, VCPU_EXREG_PDPTR))
return;
if (is_pae_paging(vcpu)) {
@@ -2878,17 +2971,15 @@
{
struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
- if (is_pae_paging(vcpu)) {
- mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
- mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
- mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
- mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
- }
+ if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
+ return;
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_avail);
- __set_bit(VCPU_EXREG_PDPTR,
- (unsigned long *)&vcpu->arch.regs_dirty);
+ mmu->pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
+ mmu->pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
+ mmu->pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
+ mmu->pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
+
+ kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
}
static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
@@ -2897,8 +2988,8 @@
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
- vmx_decache_cr3(vcpu);
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3))
+ vmx_cache_reg(vcpu, VCPU_EXREG_CR3);
if (!(cr0 & X86_CR0_PG)) {
/* From paging/starting to nonpaging */
exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING |
@@ -2923,7 +3014,7 @@
unsigned long hw_cr0;
hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
else {
hw_cr0 |= KVM_VM_CR0_ALWAYS_ON;
@@ -2944,32 +3035,31 @@
}
#endif
- if (enable_ept && !enable_unrestricted_guest)
+ if (enable_ept && !is_unrestricted_guest(vcpu))
ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
vmcs_writel(CR0_READ_SHADOW, cr0);
vmcs_writel(GUEST_CR0, hw_cr0);
vcpu->arch.cr0 = cr0;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR0);
/* depends on vcpu->arch.cr0 to be set to a new value */
vmx->emulation_required = emulation_required(vcpu);
}
-static int get_ept_level(struct kvm_vcpu *vcpu)
+static int vmx_get_max_tdp_level(void)
{
- /* Nested EPT currently only supports 4-level walks. */
- if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
- return 4;
- if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
+ if (cpu_has_vmx_ept_5levels())
return 5;
return 4;
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa)
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level)
{
u64 eptp = VMX_EPTP_MT_WB;
- eptp |= (get_ept_level(vcpu) == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
+ eptp |= (root_level == 5) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
if (enable_ept_ad_bits &&
(!is_guest_mode(vcpu) || nested_ept_ad_enabled(vcpu)))
@@ -2979,19 +3069,19 @@
return eptp;
}
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
+static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, unsigned long pgd,
+ int pgd_level)
{
struct kvm *kvm = vcpu->kvm;
bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
- guest_cr3 = cr3;
if (enable_ept) {
- eptp = construct_eptp(vcpu, cr3);
+ eptp = construct_eptp(vcpu, pgd, pgd_level);
vmcs_write64(EPT_POINTER, eptp);
- if (kvm_x86_ops->tlb_remote_flush) {
+ if (kvm_x86_ops.tlb_remote_flush) {
spin_lock(&to_kvm_vmx(kvm)->ept_pointer_lock);
to_vmx(vcpu)->ept_pointer = eptp;
to_kvm_vmx(kvm)->ept_pointers_match
@@ -2999,14 +3089,15 @@
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
- /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
- if (is_guest_mode(vcpu))
- update_guest_cr3 = false;
- else if (enable_unrestricted_guest || is_paging(vcpu))
- guest_cr3 = kvm_read_cr3(vcpu);
- else
+ if (!enable_unrestricted_guest && !is_paging(vcpu))
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
- ept_load_pdptrs(vcpu);
+ else if (test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+ guest_cr3 = vcpu->arch.cr3;
+ else /* vmcs01.GUEST_CR3 is already up-to-date. */
+ update_guest_cr3 = false;
+ vmx_ept_load_pdptrs(vcpu);
+ } else {
+ guest_cr3 = pgd;
}
if (update_guest_cr3)
@@ -3024,7 +3115,7 @@
unsigned long hw_cr4;
hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) | (cr4 & ~X86_CR4_MCE);
- if (enable_unrestricted_guest)
+ if (is_unrestricted_guest(vcpu))
hw_cr4 |= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
else if (vmx->rmode.vm86_active)
hw_cr4 |= KVM_RMODE_VM_CR4_ALWAYS_ON;
@@ -3057,8 +3148,9 @@
return 1;
vcpu->arch.cr4 = cr4;
+ kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
- if (!enable_unrestricted_guest) {
+ if (!is_unrestricted_guest(vcpu)) {
if (enable_ept) {
if (!is_paging(vcpu)) {
hw_cr4 &= ~X86_CR4_PAE;
@@ -3198,7 +3290,7 @@
* tree. Newer qemu binaries with that qemu fix would not need this
* kvm hack.
*/
- if (enable_unrestricted_guest && (seg != VCPU_SREG_LDTR))
+ if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
var->type |= 0x1; /* Accessed */
vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var));
@@ -3387,11 +3479,8 @@
* not.
* We assume that registers are always usable
*/
-static bool guest_state_valid(struct kvm_vcpu *vcpu)
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
{
- if (enable_unrestricted_guest)
- return true;
-
/* real mode guest state checks */
if (!is_protmode(vcpu) || (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
@@ -3469,7 +3558,7 @@
static int init_rmode_identity_map(struct kvm *kvm)
{
struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
- int i, idx, r = 0;
+ int i, r = 0;
kvm_pfn_t identity_map_pfn;
u32 tmp;
@@ -3477,7 +3566,7 @@
mutex_lock(&kvm->slots_lock);
if (likely(kvm_vmx->ept_identity_pagetable_done))
- goto out2;
+ goto out;
if (!kvm_vmx->ept_identity_map_addr)
kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
@@ -3486,9 +3575,8 @@
r = __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
kvm_vmx->ept_identity_map_addr, PAGE_SIZE);
if (r < 0)
- goto out2;
+ goto out;
- idx = srcu_read_lock(&kvm->srcu);
r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
if (r < 0)
goto out;
@@ -3504,9 +3592,6 @@
kvm_vmx->ept_identity_pagetable_done = true;
out:
- srcu_read_unlock(&kvm->srcu, idx);
-
-out2:
mutex_unlock(&kvm->slots_lock);
return r;
}
@@ -3581,10 +3666,51 @@
spin_unlock(&vmx_vpid_lock);
}
-static __always_inline void vmx_disable_intercept_for_msr(unsigned long *msr_bitmap,
+static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __clear_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x000 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f);
+}
+
+static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr)
+{
+ int f = sizeof(unsigned long);
+
+ if (msr <= 0x1fff)
+ __set_bit(msr, msr_bitmap + 0x800 / f);
+ else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff))
+ __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f);
+}
+
+static __always_inline void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3593,36 +3719,44 @@
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __clear_bit(msr, msr_bitmap + 0x000 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filters change.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
- if (type & MSR_TYPE_W)
- /* write-low */
- __clear_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __clear_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __clear_bit(msr, msr_bitmap + 0xc00 / f);
-
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ clear_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ clear_bit(idx, vmx->shadow_msr_intercept.write);
+ }
}
+
+ if ((type & MSR_TYPE_R) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ)) {
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+ type &= ~MSR_TYPE_R;
+ }
+
+ if ((type & MSR_TYPE_W) &&
+ !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE)) {
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
+ type &= ~MSR_TYPE_W;
+ }
+
+ if (type & MSR_TYPE_R)
+ vmx_clear_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_clear_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_enable_intercept_for_msr(unsigned long *msr_bitmap,
+static __always_inline void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu,
u32 msr, int type)
{
- int f = sizeof(unsigned long);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
if (!cpu_has_vmx_msr_bitmap())
return;
@@ -3631,39 +3765,34 @@
evmcs_touch_msr_bitmap();
/*
- * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals
- * have the write-low and read-high bitmap offsets the wrong way round.
- * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff.
- */
- if (msr <= 0x1fff) {
- if (type & MSR_TYPE_R)
- /* read-low */
- __set_bit(msr, msr_bitmap + 0x000 / f);
+ * Mark the desired intercept state in shadow bitmap, this is needed
+ * for resync when the MSR filter changes.
+ */
+ if (is_valid_passthrough_msr(msr)) {
+ int idx = possible_passthrough_msr_slot(msr);
- if (type & MSR_TYPE_W)
- /* write-low */
- __set_bit(msr, msr_bitmap + 0x800 / f);
-
- } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) {
- msr &= 0x1fff;
- if (type & MSR_TYPE_R)
- /* read-high */
- __set_bit(msr, msr_bitmap + 0x400 / f);
-
- if (type & MSR_TYPE_W)
- /* write-high */
- __set_bit(msr, msr_bitmap + 0xc00 / f);
-
+ if (idx != -ENOENT) {
+ if (type & MSR_TYPE_R)
+ set_bit(idx, vmx->shadow_msr_intercept.read);
+ if (type & MSR_TYPE_W)
+ set_bit(idx, vmx->shadow_msr_intercept.write);
+ }
}
+
+ if (type & MSR_TYPE_R)
+ vmx_set_msr_bitmap_read(msr_bitmap, msr);
+
+ if (type & MSR_TYPE_W)
+ vmx_set_msr_bitmap_write(msr_bitmap, msr);
}
-static __always_inline void vmx_set_intercept_for_msr(unsigned long *msr_bitmap,
- u32 msr, int type, bool value)
+static __always_inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu,
+ u32 msr, int type, bool value)
{
if (value)
- vmx_enable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_enable_intercept_for_msr(vcpu, msr, type);
else
- vmx_disable_intercept_for_msr(msr_bitmap, msr, type);
+ vmx_disable_intercept_for_msr(vcpu, msr, type);
}
static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu)
@@ -3681,35 +3810,47 @@
return mode;
}
-static void vmx_update_msr_bitmap_x2apic(unsigned long *msr_bitmap,
- u8 mode)
+static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode)
{
+ unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap;
+ unsigned long read_intercept;
int msr;
- for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
- unsigned word = msr / BITS_PER_LONG;
- msr_bitmap[word] = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
- msr_bitmap[word + (0x800 / sizeof(long))] = ~0;
- }
+ read_intercept = (mode & MSR_BITMAP_MODE_X2APIC_APICV) ? 0 : ~0;
- if (mode & MSR_BITMAP_MODE_X2APIC) {
- /*
- * TPR reads and writes can be virtualized even if virtual interrupt
- * delivery is not in use.
- */
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW);
- if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
- vmx_enable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
- vmx_disable_intercept_for_msr(msr_bitmap, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
- }
+ for (msr = 0x800; msr <= 0x8ff; msr += BITS_PER_LONG) {
+ unsigned int read_idx = msr / BITS_PER_LONG;
+ unsigned int write_idx = read_idx + (0x800 / sizeof(long));
+
+ msr_bitmap[read_idx] = read_intercept;
+ msr_bitmap[write_idx] = ~0ul;
+ }
+}
+
+static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode)
+{
+ if (!cpu_has_vmx_msr_bitmap())
+ return;
+
+ vmx_reset_x2apic_msrs(vcpu, mode);
+
+ /*
+ * TPR reads and writes can be virtualized even if virtual interrupt
+ * delivery is not in use.
+ */
+ vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
+ !(mode & MSR_BITMAP_MODE_X2APIC));
+
+ if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
+ vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
+ vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
}
}
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
u8 mode = vmx_msr_bitmap_mode(vcpu);
u8 changed = mode ^ vmx->msr_bitmap_mode;
@@ -3717,38 +3858,27 @@
return;
if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV))
- vmx_update_msr_bitmap_x2apic(msr_bitmap, mode);
+ vmx_update_msr_bitmap_x2apic(vcpu, mode);
vmx->msr_bitmap_mode = mode;
}
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx)
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
{
- unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
u32 i;
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_STATUS,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_BASE,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_OUTPUT_MASK,
- MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap, MSR_IA32_RTIT_CR3_MATCH,
- MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, flag);
for (i = 0; i < vmx->pt_desc.addr_range; i++) {
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
- vmx_set_intercept_for_msr(msr_bitmap,
- MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * 2, MSR_TYPE_RW, flag);
+ vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * 2, MSR_TYPE_RW, flag);
}
}
-static bool vmx_get_enable_apicv(struct kvm_vcpu *vcpu)
-{
- return enable_apicv;
-}
-
static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -3769,6 +3899,29 @@
return ((rvi & 0xf0) > (vppr & 0xf0));
}
+static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u32 i;
+
+ /*
+ * Set intercept permissions for all potentially passed through MSRs
+ * again. They will automatically get filtered through the MSR filter,
+ * so we are back in sync after this.
+ */
+ for (i = 0; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
+ u32 msr = vmx_possible_passthrough_msrs[i];
+ bool read = test_bit(i, vmx->shadow_msr_intercept.read);
+ bool write = test_bit(i, vmx->shadow_msr_intercept.write);
+
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_R, read);
+ vmx_set_intercept_for_msr(vcpu, msr, MSR_TYPE_W, write);
+ }
+
+ pt_update_intercept_for_msr(vcpu);
+ vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu));
+}
+
static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
bool nested)
{
@@ -3925,15 +4078,16 @@
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
- BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
+ struct kvm_vcpu *vcpu = &vmx->vcpu;
- vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
- if (enable_ept)
- vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
+ vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
+ ~vcpu->arch.cr4_guest_rsvd_bits;
+ if (!enable_ept)
+ vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PGE;
if (is_guest_mode(&vmx->vcpu))
- vmx->vcpu.arch.cr4_guest_owned_bits &=
- ~get_vmcs12(&vmx->vcpu)->cr4_guest_host_mask;
- vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
+ vcpu->arch.cr4_guest_owned_bits &=
+ ~get_vmcs12(vcpu)->cr4_guest_host_mask;
+ vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits);
}
u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
@@ -3998,6 +4152,61 @@
return exec_control;
}
+/*
+ * Adjust a single secondary execution control bit to intercept/allow an
+ * instruction in the guest. This is usually done based on whether or not a
+ * feature has been exposed to the guest in order to correctly emulate faults.
+ */
+static inline void
+vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control,
+ u32 control, bool enabled, bool exiting)
+{
+ /*
+ * If the control is for an opt-in feature, clear the control if the
+ * feature is not exposed to the guest, i.e. not enabled. If the
+ * control is opt-out, i.e. an exiting control, clear the control if
+ * the feature _is_ exposed to the guest, i.e. exiting/interception is
+ * disabled for the associated instruction. Note, the caller is
+ * responsible presetting exec_control to set all supported bits.
+ */
+ if (enabled == exiting)
+ *exec_control &= ~control;
+
+ /*
+ * Update the nested MSR settings so that a nested VMM can/can't set
+ * controls for features that are/aren't exposed to the guest.
+ */
+ if (nested) {
+ if (enabled)
+ vmx->nested.msrs.secondary_ctls_high |= control;
+ else
+ vmx->nested.msrs.secondary_ctls_high &= ~control;
+ }
+}
+
+/*
+ * Wrapper macro for the common case of adjusting a secondary execution control
+ * based on a single guest CPUID bit, with a dedicated feature bit. This also
+ * verifies that the control is actually supported by KVM and hardware.
+ */
+#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting) \
+({ \
+ bool __enabled; \
+ \
+ if (cpu_has_vmx_##name()) { \
+ __enabled = guest_cpuid_has(&(vmx)->vcpu, \
+ X86_FEATURE_##feat_name); \
+ vmx_adjust_secondary_exec_control(vmx, exec_control, \
+ SECONDARY_EXEC_##ctrl_name, __enabled, exiting); \
+ } \
+})
+
+/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
+#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)
+
+#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
+ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx)
{
@@ -4005,7 +4214,7 @@
u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA | SECONDARY_EXEC_PT_CONCEAL_VMX);
if (!cpu_need_virtualize_apic_accesses(vcpu))
exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
@@ -4038,107 +4247,38 @@
if (!enable_pml)
exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
- if (vmx_xsaves_supported()) {
+ if (cpu_has_vmx_xsaves()) {
/* Exposing XSAVES only when XSAVE is exposed */
bool xsaves_enabled =
+ boot_cpu_has(X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) &&
guest_cpuid_has(vcpu, X86_FEATURE_XSAVES);
- if (!xsaves_enabled)
- exec_control &= ~SECONDARY_EXEC_XSAVES;
+ vcpu->arch.xsaves_enabled = xsaves_enabled;
- if (nested) {
- if (xsaves_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_XSAVES;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_XSAVES;
- }
+ vmx_adjust_secondary_exec_control(vmx, &exec_control,
+ SECONDARY_EXEC_XSAVES,
+ xsaves_enabled, false);
}
- if (vmx_rdtscp_supported()) {
- bool rdtscp_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP);
- if (!rdtscp_enabled)
- exec_control &= ~SECONDARY_EXEC_RDTSCP;
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, rdtscp, RDTSCP);
- if (nested) {
- if (rdtscp_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDTSCP;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDTSCP;
- }
- }
+ /*
+ * Expose INVPCID if and only if PCID is also exposed to the guest.
+ * INVPCID takes a #UD when it's disabled in the VMCS, but a #GP or #PF
+ * if CR4.PCIDE=0. Enumerating CPUID.INVPCID=1 would lead to incorrect
+ * behavior from the guest perspective (it would expect #GP or #PF).
+ */
+ if (!guest_cpuid_has(vcpu, X86_FEATURE_PCID))
+ guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
+ vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
- if (vmx_invpcid_supported()) {
- /* Exposing INVPCID only when PCID is exposed */
- bool invpcid_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_INVPCID) &&
- guest_cpuid_has(vcpu, X86_FEATURE_PCID);
- if (!invpcid_enabled) {
- exec_control &= ~SECONDARY_EXEC_ENABLE_INVPCID;
- guest_cpuid_clear(vcpu, X86_FEATURE_INVPCID);
- }
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
+ vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
- if (nested) {
- if (invpcid_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_INVPCID;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_INVPCID;
- }
- }
-
- if (vmx_rdrand_supported()) {
- bool rdrand_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDRAND);
- if (rdrand_enabled)
- exec_control &= ~SECONDARY_EXEC_RDRAND_EXITING;
-
- if (nested) {
- if (rdrand_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDRAND_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDRAND_EXITING;
- }
- }
-
- if (vmx_rdseed_supported()) {
- bool rdseed_enabled = guest_cpuid_has(vcpu, X86_FEATURE_RDSEED);
- if (rdseed_enabled)
- exec_control &= ~SECONDARY_EXEC_RDSEED_EXITING;
-
- if (nested) {
- if (rdseed_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_RDSEED_EXITING;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_RDSEED_EXITING;
- }
- }
-
- if (vmx_waitpkg_supported()) {
- bool waitpkg_enabled =
- guest_cpuid_has(vcpu, X86_FEATURE_WAITPKG);
-
- if (!waitpkg_enabled)
- exec_control &= ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
-
- if (nested) {
- if (waitpkg_enabled)
- vmx->nested.msrs.secondary_ctls_high |=
- SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- else
- vmx->nested.msrs.secondary_ctls_high &=
- ~SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
- }
- }
+ vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
+ ENABLE_USR_WAIT_PAUSE, false);
vmx->secondary_exec_control = exec_control;
}
@@ -4149,21 +4289,19 @@
* EPT Misconfigurations can be generated if the value of bits 2:0
* of an EPT paging-structure entry is 110b (write/execute).
*/
- kvm_mmu_set_mmio_spte_mask(VMX_EPT_RWX_MASK,
- VMX_EPT_MISCONFIG_WX_VALUE, 0);
+ kvm_mmu_set_mmio_spte_mask(VMX_EPT_MISCONFIG_WX_VALUE, 0);
}
#define VMX_XSS_EXIT_BITMAP 0
/*
- * Sets up the vmcs for emulated real mode.
+ * Noting that the initialization of Guest-state Area of VMCS is in
+ * vmx_vcpu_reset().
*/
-static void vmx_vcpu_setup(struct vcpu_vmx *vmx)
+static void init_vmcs(struct vcpu_vmx *vmx)
{
- int i;
-
if (nested)
- nested_vmx_vcpu_setup();
+ nested_vmx_set_vmcs_shadowing_bitmap();
if (cpu_has_vmx_msr_bitmap())
vmcs_write64(MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
@@ -4172,7 +4310,6 @@
/* Control */
pin_controls_set(vmx, vmx_pin_based_exec_ctrl(vmx));
- vmx->hv_deadline_tsc = -1;
exec_controls_set(vmx, vmx_exec_control(vmx));
@@ -4221,32 +4358,20 @@
if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
vmcs_write64(GUEST_IA32_PAT, vmx->vcpu.arch.pat);
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) {
- u32 index = vmx_msr_index[i];
- u32 data_low, data_high;
- int j = vmx->nmsrs;
-
- if (rdmsr_safe(index, &data_low, &data_high) < 0)
- continue;
- if (wrmsr_safe(index, data_low, data_high) < 0)
- continue;
- vmx->guest_msrs[j].index = i;
- vmx->guest_msrs[j].data = 0;
- vmx->guest_msrs[j].mask = -1ull;
- ++vmx->nmsrs;
- }
-
vm_exit_controls_set(vmx, vmx_vmexit_ctrl());
/* 22.2.1, 20.8.1 */
vm_entry_controls_set(vmx, vmx_vmentry_ctrl());
- vmx->vcpu.arch.cr0_guest_owned_bits = X86_CR0_TS;
- vmcs_writel(CR0_GUEST_HOST_MASK, ~X86_CR0_TS);
+ vmx->vcpu.arch.cr0_guest_owned_bits = KVM_POSSIBLE_CR0_GUEST_BITS;
+ vmcs_writel(CR0_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr0_guest_owned_bits);
set_cr4_guest_host_mask(vmx);
- if (vmx_xsaves_supported())
+ if (vmx->vpid != 0)
+ vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
+
+ if (cpu_has_vmx_xsaves())
vmcs_write64(XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
if (enable_pml) {
@@ -4257,7 +4382,7 @@
if (cpu_has_vmx_encls_vmexit())
vmcs_write64(ENCLS_EXITING_BITMAP, -1ull);
- if (pt_mode == PT_MODE_HOST_GUEST) {
+ if (vmx_pt_mode_is_host_guest()) {
memset(&vmx->pt_desc, 0, sizeof(vmx->pt_desc));
/* Bit[6~0] are forced to 1, writes are ignored. */
vmx->pt_desc.guest.output_mask = 0x7F;
@@ -4276,7 +4401,6 @@
vmx->msr_ia32_umwait_control = 0;
- vcpu->arch.microcode_version = 0x100000000ULL;
vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val();
vmx->hv_deadline_tsc = -1;
kvm_set_cr8(vcpu, 0);
@@ -4348,9 +4472,6 @@
kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
- if (vmx->vpid != 0)
- vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
-
cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
vmx->vcpu.arch.cr0 = cr0;
vmx_set_cr0(vcpu, cr0); /* enter rmode */
@@ -4366,7 +4487,7 @@
static void enable_irq_window(struct kvm_vcpu *vcpu)
{
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
}
static void enable_nmi_window(struct kvm_vcpu *vcpu)
@@ -4377,7 +4498,7 @@
return;
}
- exec_controls_setbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_setbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
}
static void vmx_inject_irq(struct kvm_vcpu *vcpu)
@@ -4473,31 +4594,54 @@
}
}
-static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
{
- if (to_vmx(vcpu)->nested.nested_run_pending)
- return 0;
-
- if (!enable_vnmi &&
- to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
- return 0;
-
- return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
- | GUEST_INTR_STATE_NMI));
-}
-
-static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
-{
- if (to_vmx(vcpu)->nested.nested_run_pending)
+ if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
return false;
- if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
return true;
- return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
- !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
- (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+ return (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI |
+ GUEST_INTR_STATE_NMI));
+}
+
+static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
+ return -EBUSY;
+
+ return !vmx_nmi_blocked(vcpu);
+}
+
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return false;
+
+ return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) ||
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
+}
+
+static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
+{
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return -EBUSY;
+
+ /*
+ * An IRQ must not be injected into L2 if it's supposed to VM-Exit,
+ * e.g. if the IRQ arrived asynchronously after checking nested events.
+ */
+ if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return -EBUSY;
+
+ return !vmx_interrupt_blocked(vcpu);
}
static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
@@ -4507,8 +4651,11 @@
if (enable_unrestricted_guest)
return 0;
- ret = x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
- PAGE_SIZE * 3);
+ mutex_lock(&kvm->slots_lock);
+ ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
+ PAGE_SIZE * 3);
+ mutex_unlock(&kvm->slots_lock);
+
if (ret)
return ret;
to_kvm_vmx(kvm)->tss_addr = addr;
@@ -4533,12 +4680,10 @@
vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
return false;
- /* fall through */
+ fallthrough;
case DB_VECTOR:
- if (vcpu->guest_debug &
- (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
- return false;
- /* fall through */
+ return !(vcpu->guest_debug &
+ (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP));
case DE_VECTOR:
case OF_VECTOR:
case BR_VECTOR:
@@ -4548,7 +4693,6 @@
case GP_VECTOR:
case MF_VECTOR:
return true;
- break;
}
return false;
}
@@ -4595,7 +4739,7 @@
.flags = X86_EFLAGS_IF,
};
- do_machine_check(®s, 0);
+ do_machine_check(®s);
#endif
}
@@ -4605,6 +4749,26 @@
return 1;
}
+/*
+ * If the host has split lock detection disabled, then #AC is
+ * unconditionally injected into the guest, which is the pre split lock
+ * detection behaviour.
+ *
+ * If the host has split lock detection enabled then #AC is
+ * only injected into the guest when:
+ * - Guest CPL == 3 (user mode)
+ * - Guest has #AC detection enabled in CR0
+ * - Guest EFLAGS has AC bit set
+ */
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
+{
+ if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
+ return true;
+
+ return vmx_get_cpl(vcpu) == 3 && kvm_read_cr0_bits(vcpu, X86_CR0_AM) &&
+ (kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
+}
+
static int handle_exception_nmi(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4614,7 +4778,7 @@
u32 vect_info;
vect_info = vmx->idt_vectoring_info;
- intr_info = vmx->exit_intr_info;
+ intr_info = vmx_get_intr_info(vcpu);
if (is_machine_check(intr_info) || is_nmi(intr_info))
return 1; /* handled by handle_exception_nmi_irqoff() */
@@ -4650,18 +4814,26 @@
!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
- vcpu->run->internal.ndata = 3;
+ vcpu->run->internal.ndata = 4;
vcpu->run->internal.data[0] = vect_info;
vcpu->run->internal.data[1] = intr_info;
vcpu->run->internal.data[2] = error_code;
+ vcpu->run->internal.data[3] = vcpu->arch.last_vmentry_cpu;
return 0;
}
if (is_page_fault(intr_info)) {
- cr2 = vmcs_readl(EXIT_QUALIFICATION);
- /* EPT won't cause page fault directly */
- WARN_ON_ONCE(!vcpu->arch.apf.host_apf_reason && enable_ept);
- return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
+ cr2 = vmx_get_exit_qual(vcpu);
+ if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
+ /*
+ * EPT will cause page fault only if we need to
+ * detect illegal GPAs.
+ */
+ WARN_ON_ONCE(!allow_smaller_maxphyaddr);
+ kvm_fixup_and_inject_pf_error(vcpu, cr2, error_code);
+ return 1;
+ } else
+ return kvm_handle_page_fault(vcpu, error_code, cr2, NULL, 0);
}
ex_no = intr_info & INTR_INFO_VECTOR_MASK;
@@ -4670,24 +4842,44 @@
return handle_rmode_exception(vcpu, ex_no, error_code);
switch (ex_no) {
- case AC_VECTOR:
- kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
- return 1;
case DB_VECTOR:
- dr6 = vmcs_readl(EXIT_QUALIFICATION);
+ dr6 = vmx_get_exit_qual(vcpu);
if (!(vcpu->guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))) {
- vcpu->arch.dr6 &= ~DR_TRAP_BITS;
- vcpu->arch.dr6 |= dr6 | DR6_RTM;
+ /*
+ * If the #DB was due to ICEBP, a.k.a. INT1, skip the
+ * instruction. ICEBP generates a trap-like #DB, but
+ * despite its interception control being tied to #DB,
+ * is an instruction intercept, i.e. the VM-Exit occurs
+ * on the ICEBP itself. Note, skipping ICEBP also
+ * clears STI and MOVSS blocking.
+ *
+ * For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
+ * if single-step is enabled in RFLAGS and STI or MOVSS
+ * blocking is active, as the CPU doesn't set the bit
+ * on VM-Exit due to #DB interception. VM-Entry has a
+ * consistency check that a single-step #DB is pending
+ * in this scenario as the previous instruction cannot
+ * have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
+ * don't modify RFLAGS), therefore the one instruction
+ * delay when activating single-step breakpoints must
+ * have already expired. Note, the CPU sets/clears BS
+ * as appropriate for all other VM-Exits types.
+ */
if (is_icebp(intr_info))
WARN_ON(!skip_emulated_instruction(vcpu));
+ else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
+ (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
+ (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS)))
+ vmcs_writel(GUEST_PENDING_DBG_EXCEPTIONS,
+ vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS) | DR6_BS);
- kvm_queue_exception(vcpu, DB_VECTOR);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, dr6);
return 1;
}
- kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1;
+ kvm_run->debug.arch.dr6 = dr6 | DR6_FIXED_1 | DR6_RTM;
kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
- /* fall through */
+ fallthrough;
case BP_VECTOR:
/*
* Update instruction length as we may reinject #BP from
@@ -4701,6 +4893,20 @@
kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
kvm_run->debug.arch.exception = ex_no;
break;
+ case AC_VECTOR:
+ if (vmx_guest_inject_ac(vcpu)) {
+ kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
+ return 1;
+ }
+
+ /*
+ * Handle split lock. Depending on detection mode this will
+ * either warn and disable split lock detection for this
+ * task or force SIGBUS on it.
+ */
+ if (handle_guest_split_lock(kvm_rip_read(vcpu)))
+ return 1;
+ fallthrough;
default:
kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
kvm_run->ex.exception = ex_no;
@@ -4710,7 +4916,7 @@
return 0;
}
-static int handle_external_interrupt(struct kvm_vcpu *vcpu)
+static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
{
++vcpu->stat.irq_exits;
return 1;
@@ -4729,7 +4935,7 @@
int size, in, string;
unsigned port;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
string = (exit_qualification & 16) != 0;
++vcpu->stat.io_exits;
@@ -4820,7 +5026,7 @@
int err;
int ret;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
cr = exit_qualification & 15;
reg = (exit_qualification >> 8) & 15;
switch ((exit_qualification >> 4) & 3) {
@@ -4897,7 +5103,7 @@
unsigned long exit_qualification;
int dr, dr7, reg;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
/* First, if DR does not exist, trigger UD */
@@ -4915,16 +5121,14 @@
* guest debugging itself.
*/
if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
- vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
+ vcpu->run->debug.arch.dr6 = DR6_BD | DR6_RTM | DR6_FIXED_1;
vcpu->run->debug.arch.dr7 = dr7;
vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
vcpu->run->debug.arch.exception = DB_VECTOR;
vcpu->run->exit_reason = KVM_EXIT_DEBUG;
return 0;
} else {
- vcpu->arch.dr6 &= ~DR_TRAP_BITS;
- vcpu->arch.dr6 |= DR6_BD | DR6_RTM;
- kvm_queue_exception(vcpu, DB_VECTOR);
+ kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
return 1;
}
}
@@ -4955,15 +5159,6 @@
return kvm_skip_emulated_instruction(vcpu);
}
-static u64 vmx_get_dr6(struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.dr6;
-}
-
-static void vmx_set_dr6(struct kvm_vcpu *vcpu, unsigned long val)
-{
-}
-
static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
{
get_debugreg(vcpu->arch.db[0], 0);
@@ -4982,21 +5177,6 @@
vmcs_writel(GUEST_DR7, val);
}
-static int handle_cpuid(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_cpuid(vcpu);
-}
-
-static int handle_rdmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_rdmsr(vcpu);
-}
-
-static int handle_wrmsr(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_wrmsr(vcpu);
-}
-
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
kvm_apic_update_ppr(vcpu);
@@ -5005,7 +5185,7 @@
static int handle_interrupt_window(struct kvm_vcpu *vcpu)
{
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_INTR_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5013,11 +5193,6 @@
return 1;
}
-static int handle_halt(struct kvm_vcpu *vcpu)
-{
- return kvm_emulate_halt(vcpu);
-}
-
static int handle_vmcall(struct kvm_vcpu *vcpu)
{
return kvm_emulate_hypercall(vcpu);
@@ -5025,12 +5200,13 @@
static int handle_invd(struct kvm_vcpu *vcpu)
{
- return kvm_emulate_instruction(vcpu, 0);
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(vcpu);
}
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
kvm_mmu_invlpg(vcpu, exit_qualification);
return kvm_skip_emulated_instruction(vcpu);
@@ -5062,7 +5238,7 @@
static int handle_apic_access(struct kvm_vcpu *vcpu)
{
if (likely(fasteoi)) {
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int access_type, offset;
access_type = exit_qualification & APIC_ACCESS_TYPE;
@@ -5083,7 +5259,7 @@
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
int vector = exit_qualification & 0xff;
/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
@@ -5093,7 +5269,7 @@
static int handle_apic_write(struct kvm_vcpu *vcpu)
{
- unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
u32 offset = exit_qualification & 0xfff;
/* APIC-write VM exit is trap-like and thus no need to adjust IP */
@@ -5114,7 +5290,7 @@
idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
reason = (u32)exit_qualification >> 30;
if (reason == TASK_SWITCH_GATE && idt_v) {
@@ -5134,7 +5310,7 @@
error_code =
vmcs_read32(IDT_VECTORING_ERROR_CODE);
}
- /* fall through */
+ fallthrough;
case INTR_TYPE_SOFT_EXCEPTION:
kvm_clear_exception_queue(vcpu);
break;
@@ -5164,7 +5340,7 @@
gpa_t gpa;
u64 error_code;
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
/*
* EPT violation happened while executing iret from NMI,
@@ -5199,6 +5375,18 @@
PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
vcpu->arch.exit_qualification = exit_qualification;
+
+ /*
+ * Check that the GPA doesn't exceed physical memory limits, as that is
+ * a guest page fault. We have to emulate the instruction here, because
+ * if the illegal address is that of a paging structure, then
+ * EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
+ * would also use advanced VM-exit information for EPT violations to
+ * reconstruct the page fault error code.
+ */
+ if (unlikely(allow_smaller_maxphyaddr && kvm_vcpu_is_illegal_gpa(vcpu, gpa)))
+ return kvm_emulate_instruction(vcpu, 0);
+
return kvm_mmu_page_fault(vcpu, gpa, error_code, NULL, 0);
}
@@ -5223,7 +5411,7 @@
static int handle_nmi_window(struct kvm_vcpu *vcpu)
{
WARN_ON_ONCE(!enable_vnmi);
- exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_VIRTUAL_NMI_PENDING);
+ exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
++vcpu->stat.nmi_window_exits;
kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -5236,18 +5424,11 @@
bool intr_window_requested;
unsigned count = 130;
- /*
- * We should never reach the point where we are emulating L2
- * due to invalid guest state as that means we incorrectly
- * allowed a nested VMEntry with an invalid vmcs12.
- */
- WARN_ON_ONCE(vmx->emulation_required && vmx->nested.nested_run_pending);
-
intr_window_requested = exec_controls_get(vmx) &
- CPU_BASED_VIRTUAL_INTR_PENDING;
+ CPU_BASED_INTR_WINDOW_EXITING;
while (vmx->emulation_required && count-- != 0) {
- if (intr_window_requested && vmx_interrupt_allowed(vcpu))
+ if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
return handle_interrupt_window(&vmx->vcpu);
if (kvm_test_request(KVM_REQ_EVENT, vcpu))
@@ -5271,14 +5452,12 @@
}
/*
- * Note, return 1 and not 0, vcpu_run() is responsible for
- * morphing the pending signal into the proper return code.
+ * Note, return 1 and not 0, vcpu_run() will invoke
+ * xfer_to_guest_mode() which will create a proper return
+ * code.
*/
- if (signal_pending(current))
+ if (__xfer_to_guest_mode_work_pending())
return 1;
-
- if (need_resched())
- schedule();
}
return 1;
@@ -5316,25 +5495,6 @@
}
}
-/*
- * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR.
- */
-static void wakeup_handler(void)
-{
- struct kvm_vcpu *vcpu;
- int cpu = smp_processor_id();
-
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
- list_for_each_entry(vcpu, &per_cpu(blocked_vcpu_on_cpu, cpu),
- blocked_vcpu_list) {
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (pi_test_on(pi_desc) == 1)
- kvm_vcpu_kick(vcpu);
- }
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-}
-
static void vmx_enable_tdp(void)
{
kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
@@ -5345,7 +5505,6 @@
VMX_EPT_RWX_MASK, 0ull);
ept_set_mmio_spte_mask();
- kvm_enable_tdp();
}
/*
@@ -5399,11 +5558,7 @@
{
u32 vmx_instruction_info;
unsigned long type;
- bool pcid_enabled;
gva_t gva;
- struct x86_exception e;
- unsigned i;
- unsigned long roots_to_free = 0;
struct {
u64 pcid;
u64 gla;
@@ -5425,74 +5580,12 @@
/* According to the Intel instruction reference, the memory operand
* is read even if it isn't needed (e.g., for type==all)
*/
- if (get_vmx_mem_address(vcpu, vmcs_readl(EXIT_QUALIFICATION),
+ if (get_vmx_mem_address(vcpu, vmx_get_exit_qual(vcpu),
vmx_instruction_info, false,
sizeof(operand), &gva))
return 1;
- if (kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e)) {
- kvm_inject_page_fault(vcpu, &e);
- return 1;
- }
-
- if (operand.pcid >> 12 != 0) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- pcid_enabled = kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE);
-
- switch (type) {
- case INVPCID_TYPE_INDIV_ADDR:
- if ((!pcid_enabled && (operand.pcid != 0)) ||
- is_noncanonical_address(operand.gla, vcpu)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
- kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_SINGLE_CTXT:
- if (!pcid_enabled && (operand.pcid != 0)) {
- kvm_inject_gp(vcpu, 0);
- return 1;
- }
-
- if (kvm_get_active_pcid(vcpu) == operand.pcid) {
- kvm_mmu_sync_roots(vcpu);
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
- }
-
- for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
- if (kvm_get_pcid(vcpu, vcpu->arch.mmu->prev_roots[i].cr3)
- == operand.pcid)
- roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);
-
- kvm_mmu_free_roots(vcpu, vcpu->arch.mmu, roots_to_free);
- /*
- * If neither the current cr3 nor any of the prev_roots use the
- * given PCID, then nothing needs to be done here because a
- * resync will happen anyway before switching to any other CR3.
- */
-
- return kvm_skip_emulated_instruction(vcpu);
-
- case INVPCID_TYPE_ALL_NON_GLOBAL:
- /*
- * Currently, KVM doesn't mark global entries in the shadow
- * page tables, so a non-global flush just degenerates to a
- * global flush. If needed, we could optimize this later by
- * keeping track of global entries in shadow page tables.
- */
-
- /* fall-through */
- case INVPCID_TYPE_ALL_INCL_GLOBAL:
- kvm_mmu_unload(vcpu);
- return kvm_skip_emulated_instruction(vcpu);
-
- default:
- BUG(); /* We have already checked above that type <= 3 */
- }
+ return kvm_handle_invpcid(vcpu, type, gva);
}
static int handle_pml_full(struct kvm_vcpu *vcpu)
@@ -5501,7 +5594,7 @@
trace_kvm_pml_full(vcpu->vcpu_id);
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ exit_qualification = vmx_get_exit_qual(vcpu);
/*
* PML buffer FULL happened while executing iret from NMI,
@@ -5520,14 +5613,22 @@
return 1;
}
-static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
if (!vmx->req_immediate_exit &&
- !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
+ !unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled)) {
kvm_lapic_expired_hv_timer(vcpu);
+ return EXIT_FASTPATH_REENTER_GUEST;
+ }
+ return EXIT_FASTPATH_NONE;
+}
+
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ handle_fastpath_preemption_timer(vcpu);
return 1;
}
@@ -5565,11 +5666,11 @@
[EXIT_REASON_IO_INSTRUCTION] = handle_io,
[EXIT_REASON_CR_ACCESS] = handle_cr,
[EXIT_REASON_DR_ACCESS] = handle_dr,
- [EXIT_REASON_CPUID] = handle_cpuid,
- [EXIT_REASON_MSR_READ] = handle_rdmsr,
- [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
- [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
- [EXIT_REASON_HLT] = handle_halt,
+ [EXIT_REASON_CPUID] = kvm_emulate_cpuid,
+ [EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
+ [EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
+ [EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
+ [EXIT_REASON_HLT] = kvm_emulate_halt,
[EXIT_REASON_INVD] = handle_invd,
[EXIT_REASON_INVLPG] = handle_invlpg,
[EXIT_REASON_RDPMC] = handle_rdpmc,
@@ -5613,10 +5714,24 @@
static const int kvm_vmx_max_exit_handlers =
ARRAY_SIZE(kvm_vmx_exit_handlers);
-static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
+static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2,
+ u32 *intr_info, u32 *error_code)
{
- *info1 = vmcs_readl(EXIT_QUALIFICATION);
- *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ *info1 = vmx_get_exit_qual(vcpu);
+ if (!(vmx->exit_reason.failed_vmentry)) {
+ *info2 = vmx->idt_vectoring_info;
+ *intr_info = vmx_get_intr_info(vcpu);
+ if (is_exception_with_error_code(*intr_info))
+ *error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
+ else
+ *error_code = 0;
+ } else {
+ *info2 = 0;
+ *intr_info = 0;
+ *error_code = 0;
+ }
}
static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
@@ -5697,8 +5812,6 @@
u32 vmentry_ctl, vmexit_ctl;
u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
unsigned long cr4;
- u64 efer;
- int i, n;
if (!dump_invalid_vmcs) {
pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
@@ -5710,7 +5823,6 @@
cpu_based_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
pin_based_exec_ctrl = vmcs_read32(PIN_BASED_VM_EXEC_CONTROL);
cr4 = vmcs_readl(GUEST_CR4);
- efer = vmcs_read64(GUEST_IA32_EFER);
secondary_exec_control = 0;
if (cpu_has_secondary_exec_ctrls())
secondary_exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
@@ -5722,9 +5834,7 @@
pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
- if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
- (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA))
- {
+ if (cpu_has_vmx_ept()) {
pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
@@ -5750,7 +5860,8 @@
if ((vmexit_ctl & (VM_EXIT_SAVE_IA32_PAT | VM_EXIT_SAVE_IA32_EFER)) ||
(vmentry_ctl & (VM_ENTRY_LOAD_IA32_PAT | VM_ENTRY_LOAD_IA32_EFER)))
pr_err("EFER = 0x%016llx PAT = 0x%016llx\n",
- efer, vmcs_read64(GUEST_IA32_PAT));
+ vmcs_read64(GUEST_IA32_EFER),
+ vmcs_read64(GUEST_IA32_PAT));
pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
vmcs_read64(GUEST_IA32_DEBUGCTL),
vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
@@ -5835,14 +5946,6 @@
pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
- n = vmcs_read32(CR3_TARGET_COUNT);
- for (i = 0; i + 1 < n; i += 4)
- pr_err("CR3 target%u=%016lx target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2),
- i + 1, vmcs_readl(CR3_TARGET_VALUE0 + i * 2 + 2));
- if (i < n)
- pr_err("CR3 target%u=%016lx\n",
- i, vmcs_readl(CR3_TARGET_VALUE0 + i * 2));
if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
pr_err("PLE Gap=%08x Window=%08x\n",
vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
@@ -5855,13 +5958,12 @@
* The guest has exited. See if we can fix it or if we need userspace
* assistance.
*/
-static int vmx_handle_exit(struct kvm_vcpu *vcpu)
+static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- u32 exit_reason = vmx->exit_reason;
+ union vmx_exit_reason exit_reason = vmx->exit_reason;
u32 vectoring_info = vmx->idt_vectoring_info;
-
- trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
+ u16 exit_handler_index;
/*
* Flush logged GPAs PML buffer, this will make dirty_bitmap more
@@ -5873,18 +5975,42 @@
if (enable_pml)
vmx_flush_pml_buffer(vcpu);
+ /*
+ * We should never reach this point with a pending nested VM-Enter, and
+ * more specifically emulation of L2 due to invalid guest state (see
+ * below) should never happen as that means we incorrectly allowed a
+ * nested VM-Enter with an invalid vmcs12.
+ */
+ WARN_ON_ONCE(vmx->nested.nested_run_pending);
+
/* If guest state is invalid, start emulating */
if (vmx->emulation_required)
return handle_invalid_guest_state(vcpu);
- if (is_guest_mode(vcpu) && nested_vmx_exit_reflected(vcpu, exit_reason))
- return nested_vmx_reflect_vmexit(vcpu, exit_reason);
+ if (is_guest_mode(vcpu)) {
+ /*
+ * The host physical addresses of some pages of guest memory
+ * are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
+ * Page). The CPU may write to these pages via their host
+ * physical address while L2 is running, bypassing any
+ * address-translation-based dirty tracking (e.g. EPT write
+ * protection).
+ *
+ * Mark them dirty on every exit from L2 to prevent them from
+ * getting out of sync with dirty tracking.
+ */
+ nested_mark_vmcs12_pages_dirty(vcpu);
- if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
+ if (nested_vmx_reflect_vmexit(vcpu))
+ return 1;
+ }
+
+ if (exit_reason.failed_vmentry) {
dump_vmcs();
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
- = exit_reason;
+ = exit_reason.full;
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -5893,6 +6019,7 @@
vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
vcpu->run->fail_entry.hardware_entry_failure_reason
= vmcs_read32(VM_INSTRUCTION_ERROR);
+ vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
return 0;
}
@@ -5904,28 +6031,30 @@
* will cause infinite loop.
*/
if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
- (exit_reason != EXIT_REASON_EXCEPTION_NMI &&
- exit_reason != EXIT_REASON_EPT_VIOLATION &&
- exit_reason != EXIT_REASON_PML_FULL &&
- exit_reason != EXIT_REASON_APIC_ACCESS &&
- exit_reason != EXIT_REASON_TASK_SWITCH)) {
+ (exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
+ exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
+ exit_reason.basic != EXIT_REASON_PML_FULL &&
+ exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
+ exit_reason.basic != EXIT_REASON_TASK_SWITCH)) {
+ int ndata = 3;
+
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
- vcpu->run->internal.ndata = 3;
vcpu->run->internal.data[0] = vectoring_info;
- vcpu->run->internal.data[1] = exit_reason;
+ vcpu->run->internal.data[1] = exit_reason.full;
vcpu->run->internal.data[2] = vcpu->arch.exit_qualification;
- if (exit_reason == EXIT_REASON_EPT_MISCONFIG) {
- vcpu->run->internal.ndata++;
- vcpu->run->internal.data[3] =
+ if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
+ vcpu->run->internal.data[ndata++] =
vmcs_read64(GUEST_PHYSICAL_ADDRESS);
}
+ vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
+ vcpu->run->internal.ndata = ndata;
return 0;
}
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked)) {
- if (vmx_interrupt_allowed(vcpu)) {
+ if (!vmx_interrupt_blocked(vcpu)) {
vmx->loaded_vmcs->soft_vnmi_blocked = 0;
} else if (vmx->loaded_vmcs->vnmi_blocked_time > 1000000000LL &&
vcpu->arch.nmi_pending) {
@@ -5942,20 +6071,44 @@
}
}
- if (exit_reason < kvm_vmx_max_exit_handlers
- && kvm_vmx_exit_handlers[exit_reason])
- return kvm_vmx_exit_handlers[exit_reason](vcpu);
- else {
- vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
- exit_reason);
- dump_vmcs();
- vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- vcpu->run->internal.suberror =
+ if (exit_fastpath != EXIT_FASTPATH_NONE)
+ return 1;
+
+ if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
+ goto unexpected_vmexit;
+#ifdef CONFIG_RETPOLINE
+ if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
+ return kvm_emulate_wrmsr(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
+ return handle_preemption_timer(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
+ return handle_interrupt_window(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
+ return handle_external_interrupt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_HLT)
+ return kvm_emulate_halt(vcpu);
+ else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
+ return handle_ept_misconfig(vcpu);
+#endif
+
+ exit_handler_index = array_index_nospec((u16)exit_reason.basic,
+ kvm_vmx_max_exit_handlers);
+ if (!kvm_vmx_exit_handlers[exit_handler_index])
+ goto unexpected_vmexit;
+
+ return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
+
+unexpected_vmexit:
+ vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
+ exit_reason.full);
+ dump_vmcs();
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
- vcpu->run->internal.ndata = 1;
- vcpu->run->internal.data[0] = exit_reason;
- return 0;
- }
+ vcpu->run->internal.ndata = 2;
+ vcpu->run->internal.data[0] = exit_reason.full;
+ vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu;
+ return 0;
}
/*
@@ -5968,7 +6121,7 @@
* information but as all relevant affected CPUs have 32KiB L1D cache size
* there is no point in doing so.
*/
-static void vmx_l1d_flush(struct kvm_vcpu *vcpu)
+static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
{
int size = PAGE_SIZE << L1D_CACHE_ORDER;
@@ -6001,7 +6154,7 @@
vcpu->stat.l1d_flush++;
if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
- wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
+ native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
return;
}
@@ -6031,17 +6184,17 @@
static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ int tpr_threshold;
if (is_guest_mode(vcpu) &&
nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
return;
- if (irr == -1 || tpr < irr) {
- vmcs_write32(TPR_THRESHOLD, 0);
- return;
- }
-
- vmcs_write32(TPR_THRESHOLD, irr);
+ tpr_threshold = (irr == -1 || tpr < irr) ? 0 : irr;
+ if (is_guest_mode(vcpu))
+ to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
+ else
+ vmcs_write32(TPR_THRESHOLD, tpr_threshold);
}
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
@@ -6075,7 +6228,15 @@
if (flexpriority_enabled) {
sec_exec_control |=
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
- vmx_flush_tlb(vcpu, true);
+ kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
+
+ /*
+ * Flush the TLB, reloading the APIC access page will
+ * only do so if its physical address has changed, but
+ * the guest may have inserted a non-APIC mapping into
+ * the TLB while the APIC access page was disabled.
+ */
+ kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
break;
case LAPIC_MODE_X2APIC:
@@ -6089,12 +6250,32 @@
vmx_update_msr_bitmap(vcpu);
}
-static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
+static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
{
- if (!is_guest_mode(vcpu)) {
- vmcs_write64(APIC_ACCESS_ADDR, hpa);
- vmx_flush_tlb(vcpu, true);
+ struct page *page;
+
+ /* Defer reload until vmcs01 is the current VMCS. */
+ if (is_guest_mode(vcpu)) {
+ to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
+ return;
}
+
+ if (!(secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
+ return;
+
+ page = gfn_to_page(vcpu->kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
+ if (is_error_page(page))
+ return;
+
+ vmcs_write64(APIC_ACCESS_ADDR, page_to_phys(page));
+ vmx_flush_tlb_current(vcpu);
+
+ /*
+ * Do not pin apic access page in memory, the MMU notifier
+ * will call us again if it is migrated or swapped out.
+ */
+ put_page(page);
}
static void vmx_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr)
@@ -6164,18 +6345,13 @@
/*
* If we are running L2 and L1 has a new pending interrupt
- * which can be injected, we should re-evaluate
- * what should be done with this new L1 interrupt.
- * If L1 intercepts external-interrupts, we should
- * exit from L2 to L1. Otherwise, interrupt should be
- * delivered directly to L2.
+ * which can be injected, this may cause a vmexit or it may
+ * be injected into L2. Either way, this interrupt will be
+ * processed via KVM_REQ_EVENT, not RVI, because we do not use
+ * virtual interrupt delivery to inject L1 interrupts into L2.
*/
- if (is_guest_mode(vcpu) && max_irr_updated) {
- if (nested_exit_on_intr(vcpu))
- kvm_vcpu_exiting_guest_mode(vcpu);
- else
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ if (is_guest_mode(vcpu) && max_irr_updated)
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
}
@@ -6183,14 +6359,6 @@
return max_irr;
}
-static bool vmx_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- return pi_test_on(pi_desc) ||
- (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc));
-}
-
static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
{
if (!kvm_vcpu_apicv_active(vcpu))
@@ -6210,83 +6378,59 @@
memset(vmx->pi_desc.pir, 0, sizeof(vmx->pi_desc.pir));
}
+void vmx_do_interrupt_nmi_irqoff(unsigned long entry);
+
+static void handle_interrupt_nmi_irqoff(struct kvm_vcpu *vcpu,
+ unsigned long entry)
+{
+ kvm_before_interrupt(vcpu);
+ vmx_do_interrupt_nmi_irqoff(entry);
+ kvm_after_interrupt(vcpu);
+}
+
static void handle_exception_nmi_irqoff(struct vcpu_vmx *vmx)
{
- vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ const unsigned long nmi_entry = (unsigned long)asm_exc_nmi_noist;
+ u32 intr_info = vmx_get_intr_info(&vmx->vcpu);
/* if exit due to PF check for async PF */
- if (is_page_fault(vmx->exit_intr_info))
- vmx->vcpu.arch.apf.host_apf_reason = kvm_read_and_reset_pf_reason();
-
+ if (is_page_fault(intr_info))
+ vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
/* Handle machine checks before interrupts are enabled */
- if (is_machine_check(vmx->exit_intr_info))
+ else if (is_machine_check(intr_info))
kvm_machine_check();
-
/* We need to handle NMIs before interrupts are enabled */
- if (is_nmi(vmx->exit_intr_info)) {
- kvm_before_interrupt(&vmx->vcpu);
- asm("int $2");
- kvm_after_interrupt(&vmx->vcpu);
- }
+ else if (is_nmi(intr_info))
+ handle_interrupt_nmi_irqoff(&vmx->vcpu, nmi_entry);
}
static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
{
- unsigned int vector;
- unsigned long entry;
-#ifdef CONFIG_X86_64
- unsigned long tmp;
-#endif
- gate_desc *desc;
- u32 intr_info;
+ u32 intr_info = vmx_get_intr_info(vcpu);
+ unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
+ gate_desc *desc = (gate_desc *)host_idt_base + vector;
- intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
if (WARN_ONCE(!is_external_intr(intr_info),
"KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info))
return;
- vector = intr_info & INTR_INFO_VECTOR_MASK;
- desc = (gate_desc *)host_idt_base + vector;
- entry = gate_offset(desc);
-
- kvm_before_interrupt(vcpu);
-
- asm volatile(
-#ifdef CONFIG_X86_64
- "mov %%" _ASM_SP ", %[sp]\n\t"
- "and $0xfffffffffffffff0, %%" _ASM_SP "\n\t"
- "push $%c[ss]\n\t"
- "push %[sp]\n\t"
-#endif
- "pushf\n\t"
- __ASM_SIZE(push) " $%c[cs]\n\t"
- CALL_NOSPEC
- :
-#ifdef CONFIG_X86_64
- [sp]"=&r"(tmp),
-#endif
- ASM_CALL_CONSTRAINT
- :
- THUNK_TARGET(entry),
- [ss]"i"(__KERNEL_DS),
- [cs]"i"(__KERNEL_CS)
- );
-
- kvm_after_interrupt(vcpu);
+ handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc));
}
-STACK_FRAME_NON_STANDARD(handle_external_interrupt_irqoff);
static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- if (vmx->exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
+ if (vmx->emulation_required)
+ return;
+
+ if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
handle_external_interrupt_irqoff(vcpu);
- else if (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)
+ else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
handle_exception_nmi_irqoff(vmx);
}
-static bool vmx_has_emulated_msr(int index)
+static bool vmx_has_emulated_msr(u32 index)
{
switch (index) {
case MSR_IA32_SMBASE:
@@ -6305,11 +6449,6 @@
}
}
-static bool vmx_pt_supported(void)
-{
- return pt_mode == PT_MODE_HOST_GUEST;
-}
-
static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
{
u32 exit_intr_info;
@@ -6322,11 +6461,8 @@
if (enable_vnmi) {
if (vmx->loaded_vmcs->nmi_known_unmasked)
return;
- /*
- * Can't use vmx->exit_intr_info since we're not sure what
- * the exit reason is.
- */
- exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+
+ exit_intr_info = vmx_get_intr_info(&vmx->vcpu);
unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
/*
@@ -6388,7 +6524,7 @@
break;
case INTR_TYPE_SOFT_EXCEPTION:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
- /* fall through */
+ fallthrough;
case INTR_TYPE_HARD_EXCEPTION:
if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
u32 err = vmcs_read32(error_code_field);
@@ -6398,7 +6534,7 @@
break;
case INTR_TYPE_SOFT_INTR:
vcpu->arch.event_exit_inst_len = vmcs_read32(instr_len_field);
- /* fall through */
+ fallthrough;
case INTR_TYPE_EXT_INTR:
kvm_queue_interrupt(vcpu, vector, type == INTR_TYPE_SOFT_INTR);
break;
@@ -6468,7 +6604,7 @@
}
}
-void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
+void noinstr vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp)
{
if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
vmx->loaded_vmcs->host_state.rsp = host_rsp;
@@ -6476,13 +6612,84 @@
}
}
+static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
+{
+ switch (to_vmx(vcpu)->exit_reason.basic) {
+ case EXIT_REASON_MSR_WRITE:
+ return handle_fastpath_set_msr_irqoff(vcpu);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return handle_fastpath_preemption_timer(vcpu);
+ default:
+ return EXIT_FASTPATH_NONE;
+ }
+}
+
bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
+static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
+ struct vcpu_vmx *vmx)
{
+ /*
+ * VMENTER enables interrupts (host state), but the kernel state is
+ * interrupts disabled when this is invoked. Also tell RCU about
+ * it. This is the same logic as for exit_to_user_mode().
+ *
+ * This ensures that e.g. latency analysis on the host observes
+ * guest mode as interrupt enabled.
+ *
+ * guest_enter_irqoff() informs context tracking about the
+ * transition to guest mode and if enabled adjusts RCU state
+ * accordingly.
+ */
+ instrumentation_begin();
+ trace_hardirqs_on_prepare();
+ lockdep_hardirqs_on_prepare(CALLER_ADDR0);
+ instrumentation_end();
+
+ guest_enter_irqoff();
+ lockdep_hardirqs_on(CALLER_ADDR0);
+
+ /* L1D Flush includes CPU buffer clear to mitigate MDS */
+ if (static_branch_unlikely(&vmx_l1d_should_flush))
+ vmx_l1d_flush(vcpu);
+ else if (static_branch_unlikely(&mds_user_clear))
+ mds_clear_cpu_buffers();
+
+ if (vcpu->arch.cr2 != native_read_cr2())
+ native_write_cr2(vcpu->arch.cr2);
+
+ vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ vmx->loaded_vmcs->launched);
+
+ vcpu->arch.cr2 = native_read_cr2();
+
+ /*
+ * VMEXIT disables interrupts (host state), but tracing and lockdep
+ * have them in state 'on' as recorded before entering guest mode.
+ * Same as enter_from_user_mode().
+ *
+ * context_tracking_guest_exit() restores host context and reinstates
+ * RCU if enabled and required.
+ *
+ * This needs to be done before the below as native_read_msr()
+ * contains a tracepoint and x86_spec_ctrl_restore_host() calls
+ * into world and some more.
+ */
+ lockdep_hardirqs_off(CALLER_ADDR0);
+ context_tracking_guest_exit();
+
+ instrumentation_begin();
+ trace_hardirqs_off_finish();
+ instrumentation_end();
+}
+
+static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu)
+{
+ fastpath_t exit_fastpath;
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long cr3, cr4;
+reenter_guest:
/* Record the guest's net vcpu time for enforced NMI injections. */
if (unlikely(!enable_vnmi &&
vmx->loaded_vmcs->soft_vnmi_blocked))
@@ -6491,7 +6698,7 @@
/* Don't enter VMX if guest state is invalid, let the exit handler
start emulation until we arrive back to a valid state */
if (vmx->emulation_required)
- return;
+ return EXIT_FASTPATH_NONE;
if (vmx->ple_window_dirty) {
vmx->ple_window_dirty = false;
@@ -6504,9 +6711,9 @@
*/
WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
- if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RSP))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
- if (test_bit(VCPU_REGS_RIP, (unsigned long *)&vcpu->arch.regs_dirty))
+ if (kvm_register_is_dirty(vcpu, VCPU_REGS_RIP))
vmcs_writel(GUEST_RIP, vcpu->arch.regs[VCPU_REGS_RIP]);
cr3 = __get_current_cr3_fast();
@@ -6529,7 +6736,7 @@
if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
vmx_set_interrupt_shadow(vcpu, 0);
- kvm_load_guest_xcr0(vcpu);
+ kvm_load_guest_xsave_state(vcpu);
pt_guest_enter(vmx);
@@ -6538,9 +6745,7 @@
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
- if (lapic_in_kernel(vcpu) &&
- vcpu->arch.apic->lapic_timer.timer_advance_ns)
- kvm_wait_lapic_expire(vcpu);
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -6550,19 +6755,8 @@
*/
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
- /* L1D Flush includes CPU buffer clear to mitigate MDS */
- if (static_branch_unlikely(&vmx_l1d_should_flush))
- vmx_l1d_flush(vcpu);
- else if (static_branch_unlikely(&mds_user_clear))
- mds_clear_cpu_buffers();
-
- if (vcpu->arch.cr2 != read_cr2())
- write_cr2(vcpu->arch.cr2);
-
- vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
-
- vcpu->arch.cr2 = read_cr2();
+ /* The actual VMENTER/EXIT is in the .noinstr.text section. */
+ vmx_vcpu_enter_exit(vcpu, vmx);
/*
* We do not use IBRS in the kernel. If this vCPU has used the
@@ -6579,7 +6773,7 @@
* If the L02 MSR bitmap does not intercept the MSR, then we need to
* save it.
*/
- if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL)))
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
@@ -6609,50 +6803,54 @@
loadsegment(es, __USER_DS);
#endif
- vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
- | (1 << VCPU_EXREG_RFLAGS)
- | (1 << VCPU_EXREG_PDPTR)
- | (1 << VCPU_EXREG_SEGMENTS)
- | (1 << VCPU_EXREG_CR3));
- vcpu->arch.regs_dirty = 0;
+ vmx_register_cache_reset(vcpu);
pt_guest_exit(vmx);
- kvm_put_guest_xcr0(vcpu);
+ kvm_load_host_xsave_state(vcpu);
vmx->nested.nested_run_pending = 0;
vmx->idt_vectoring_info = 0;
- vmx->exit_reason = vmx->fail ? 0xdead : vmcs_read32(VM_EXIT_REASON);
- if ((u16)vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY)
+ if (unlikely(vmx->fail)) {
+ vmx->exit_reason.full = 0xdead;
+ return EXIT_FASTPATH_NONE;
+ }
+
+ vmx->exit_reason.full = vmcs_read32(VM_EXIT_REASON);
+ if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
kvm_machine_check();
- if (vmx->fail || (vmx->exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
- return;
+ trace_kvm_exit(vmx->exit_reason.full, vcpu, KVM_ISA_VMX);
+
+ if (unlikely(vmx->exit_reason.failed_vmentry))
+ return EXIT_FASTPATH_NONE;
vmx->loaded_vmcs->launched = 1;
vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
vmx_recover_nmi_blocking(vmx);
vmx_complete_interrupts(vmx);
-}
-static struct kvm *vmx_vm_alloc(void)
-{
- struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
- GFP_KERNEL_ACCOUNT | __GFP_ZERO,
- PAGE_KERNEL);
+ if (is_guest_mode(vcpu))
+ return EXIT_FASTPATH_NONE;
- if (!kvm_vmx)
- return NULL;
+ exit_fastpath = vmx_exit_handlers_fastpath(vcpu);
+ if (exit_fastpath == EXIT_FASTPATH_REENTER_GUEST) {
+ if (!kvm_vcpu_exit_request(vcpu)) {
+ /*
+ * FIXME: this goto should be a loop in vcpu_enter_guest,
+ * but it would incur the cost of a retpoline for now.
+ * Revisit once static calls are available.
+ */
+ if (vcpu->arch.apicv_active)
+ vmx_sync_pir_to_irr(vcpu);
+ goto reenter_guest;
+ }
+ exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;
+ }
- return &kvm_vmx->kvm;
-}
-
-static void vmx_vm_free(struct kvm *kvm)
-{
- kfree(kvm->arch.hyperv.hv_pa_pg);
- vfree(to_kvm_vmx(kvm));
+ return exit_fastpath;
}
static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
@@ -6664,119 +6862,122 @@
free_vpid(vmx->vpid);
nested_vmx_free_vcpu(vcpu);
free_loaded_vmcs(vmx->loaded_vmcs);
- kfree(vmx->guest_msrs);
- kvm_vcpu_uninit(vcpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
- kmem_cache_free(kvm_vcpu_cache, vmx);
}
-static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
+static int vmx_create_vcpu(struct kvm_vcpu *vcpu)
{
- int err;
struct vcpu_vmx *vmx;
- unsigned long *msr_bitmap;
- int cpu;
+ int i, cpu, err;
- BUILD_BUG_ON_MSG(offsetof(struct vcpu_vmx, vcpu) != 0,
- "struct kvm_vcpu must be at offset 0 for arch usercopy region");
-
- vmx = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
- if (!vmx)
- return ERR_PTR(-ENOMEM);
-
- vmx->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.user_fpu) {
- printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
- err = -ENOMEM;
- goto free_partial_vcpu;
- }
-
- vmx->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
- GFP_KERNEL_ACCOUNT);
- if (!vmx->vcpu.arch.guest_fpu) {
- printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
- err = -ENOMEM;
- goto free_user_fpu;
- }
-
- vmx->vpid = allocate_vpid();
-
- err = kvm_vcpu_init(&vmx->vcpu, kvm, id);
- if (err)
- goto free_vcpu;
+ BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != 0);
+ vmx = to_vmx(vcpu);
err = -ENOMEM;
+ vmx->vpid = allocate_vpid();
+
/*
* If PML is turned on, failure on enabling PML just results in failure
* of creating the vcpu, therefore we can simplify PML logic (by
* avoiding dealing with cases, such as enabling PML partially on vcpus
- * for the guest, etc.
+ * for the guest), etc.
*/
if (enable_pml) {
vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO);
if (!vmx->pml_pg)
- goto uninit_vcpu;
+ goto free_vpid;
}
- vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
- BUILD_BUG_ON(ARRAY_SIZE(vmx_msr_index) * sizeof(vmx->guest_msrs[0])
- > PAGE_SIZE);
+ BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
- if (!vmx->guest_msrs)
- goto free_pml;
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i) {
+ u32 index = vmx_uret_msrs_list[i];
+ int j = vmx->nr_uret_msrs;
+
+ if (kvm_probe_user_return_msr(index))
+ continue;
+
+ vmx->guest_uret_msrs[j].slot = i;
+ vmx->guest_uret_msrs[j].data = 0;
+ switch (index) {
+ case MSR_IA32_TSX_CTRL:
+ /*
+ * TSX_CTRL_CPUID_CLEAR is handled in the CPUID
+ * interception. Keep the host value unchanged to avoid
+ * changing CPUID bits under the host kernel's feet.
+ *
+ * hle=0, rtm=0, tsx_ctrl=1 can be found with some
+ * combinations of new kernel and old userspace. If
+ * those guests run on a tsx=off host, do allow guests
+ * to use TSX_CTRL, but do not change the value on the
+ * host so that TSX remains always disabled.
+ */
+ if (boot_cpu_has(X86_FEATURE_RTM))
+ vmx->guest_uret_msrs[j].mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
+ else
+ vmx->guest_uret_msrs[j].mask = 0;
+ break;
+ default:
+ vmx->guest_uret_msrs[j].mask = -1ull;
+ break;
+ }
+ ++vmx->nr_uret_msrs;
+ }
err = alloc_loaded_vmcs(&vmx->vmcs01);
if (err < 0)
- goto free_msrs;
+ goto free_pml;
- msr_bitmap = vmx->vmcs01.msr_bitmap;
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_TSC, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_FS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
- if (kvm_cstate_in_guest(kvm)) {
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C1_RES, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
- vmx_disable_intercept_for_msr(msr_bitmap, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
+ /* The MSR bitmap starts with all ones */
+ bitmap_fill(vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ bitmap_fill(vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
+#ifdef CONFIG_X86_64
+ vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
+#endif
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
+ vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
+ if (kvm_cstate_in_guest(vcpu->kvm)) {
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
+ vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
}
vmx->msr_bitmap_mode = 0;
vmx->loaded_vmcs = &vmx->vmcs01;
cpu = get_cpu();
- vmx_vcpu_load(&vmx->vcpu, cpu);
- vmx->vcpu.cpu = cpu;
- vmx_vcpu_setup(vmx);
- vmx_vcpu_put(&vmx->vcpu);
+ vmx_vcpu_load(vcpu, cpu);
+ vcpu->cpu = cpu;
+ init_vmcs(vmx);
+ vmx_vcpu_put(vcpu);
put_cpu();
- if (cpu_need_virtualize_apic_accesses(&vmx->vcpu)) {
- err = alloc_apic_access_page(kvm);
+ if (cpu_need_virtualize_apic_accesses(vcpu)) {
+ err = alloc_apic_access_page(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (enable_ept && !enable_unrestricted_guest) {
- err = init_rmode_identity_map(kvm);
+ err = init_rmode_identity_map(vcpu->kvm);
if (err)
goto free_vmcs;
}
if (nested)
- nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept);
+ memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
- vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+ vcpu->arch.microcode_version = 0x100000000ULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
/*
* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
@@ -6787,24 +6988,15 @@
vmx->ept_pointer = INVALID_PAGE;
- return &vmx->vcpu;
+ return 0;
free_vmcs:
free_loaded_vmcs(vmx->loaded_vmcs);
-free_msrs:
- kfree(vmx->guest_msrs);
free_pml:
vmx_destroy_pml_buffer(vmx);
-uninit_vcpu:
- kvm_vcpu_uninit(&vmx->vcpu);
-free_vcpu:
+free_vpid:
free_vpid(vmx->vpid);
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.guest_fpu);
-free_user_fpu:
- kmem_cache_free(x86_fpu_cache, vmx->vcpu.arch.user_fpu);
-free_partial_vcpu:
- kmem_cache_free(kvm_vcpu_cache, vmx);
- return ERR_PTR(err);
+ return err;
}
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
@@ -6840,6 +7032,7 @@
break;
}
}
+ kvm_apicv_init(kvm, enable_apicv);
return 0;
}
@@ -6848,6 +7041,12 @@
struct vmcs_config vmcs_conf;
struct vmx_capability vmx_cap;
+ if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) ||
+ !this_cpu_has(X86_FEATURE_VMX)) {
+ pr_err("kvm: VMX is disabled on CPU %d\n", smp_processor_id());
+ return -EIO;
+ }
+
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
return -EIO;
if (nested)
@@ -6865,17 +7064,24 @@
u8 cache;
u64 ipat = 0;
- /* For VT-d and EPT combination
- * 1. MMIO: always map as UC
- * 2. EPT with VT-d:
- * a. VT-d without snooping control feature: can't guarantee the
- * result, try to trust guest.
- * b. VT-d with snooping control feature: snooping control feature of
- * VT-d engine can guarantee the cache correctness. Just set it
- * to WB to keep consistent with host. So the same as item 3.
- * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
- * consistent with host MTRR
+ /* We wanted to honor guest CD/MTRR/PAT, but doing so could result in
+ * memory aliases with conflicting memory types and sometimes MCEs.
+ * We have to be careful as to what are honored and when.
+ *
+ * For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
+ * UC. The effective memory type is UC or WC depending on guest PAT.
+ * This was historically the source of MCEs and we want to be
+ * conservative.
+ *
+ * When there is no need to deal with noncoherent DMA (e.g., no VT-d
+ * or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
+ * EPT memory type is set to WB. The effective memory type is forced
+ * WB.
+ *
+ * Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
+ * EPT memory type is used to emulate guest CD/MTRR.
*/
+
if (is_mmio) {
cache = MTRR_TYPE_UNCACHABLE;
goto exit;
@@ -6902,15 +7108,6 @@
return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat;
}
-static int vmx_get_lpage_level(void)
-{
- if (enable_ept && !cpu_has_vmx_ept_1g_page())
- return PT_DIRECTORY_LEVEL;
- else
- /* For shadow and EPT supported 1GB page */
- return PT_PDPE_LEVEL;
-}
-
static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx)
{
/*
@@ -6949,27 +7146,28 @@
} while (0)
entry = kvm_find_cpuid_entry(vcpu, 0x1, 0);
- cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME));
- cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC));
- cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE));
- cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE));
- cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE));
- cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE));
- cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE));
- cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR));
- cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM));
- cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX));
- cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX));
- cr4_fixed1_update(X86_CR4_PCIDE, ecx, bit(X86_FEATURE_PCID));
- cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE));
+ cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
+ cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
+ cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
+ cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
+ cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
+ cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
+ cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
+ cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
+ cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
+ cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
+ cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
+ cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
+ cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
entry = kvm_find_cpuid_entry(vcpu, 0x7, 0);
- cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE));
- cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP));
- cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP));
- cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU));
- cr4_fixed1_update(X86_CR4_UMIP, ecx, bit(X86_FEATURE_UMIP));
+ cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
+ cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
+ cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
+ cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
+ cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
+ cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
#undef cr4_fixed1_update
}
@@ -7060,10 +7258,13 @@
vmx->pt_desc.ctl_bitmask &= ~(0xfULL << (32 + i * 4));
}
-static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
+static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
+ /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */
+ vcpu->arch.xsaves_enabled = false;
+
if (cpu_has_secondary_exec_ctrls()) {
vmx_compute_secondary_exec_control(vmx);
vmcs_set_secondary_exec_control(vmx);
@@ -7071,10 +7272,12 @@
if (nested_vmx_allowed(vcpu))
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ ~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX |
+ FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
if (nested_vmx_allowed(vcpu)) {
nested_vmx_cr_fixed1_bits_update(vcpu);
@@ -7084,12 +7287,54 @@
if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
update_intel_pt_cfg(vcpu);
+
+ if (boot_cpu_has(X86_FEATURE_RTM)) {
+ struct vmx_uret_msr *msr;
+ msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
+ if (msr) {
+ bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
+ vmx_set_guest_uret_msr(vmx, msr, enabled ? 0 : TSX_CTRL_RTM_DISABLE);
+ }
+ }
+
+ set_cr4_guest_host_mask(vmx);
+
+ /* Refresh #PF interception to account for MAXPHYADDR changes. */
+ update_exception_bitmap(vcpu);
}
-static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
+static __init void vmx_set_cpu_caps(void)
{
- if (func == 1 && nested)
- entry->ecx |= bit(X86_FEATURE_VMX);
+ kvm_set_cpu_caps();
+
+ /* CPUID 0x1 */
+ if (nested)
+ kvm_cpu_cap_set(X86_FEATURE_VMX);
+
+ /* CPUID 0x7 */
+ if (kvm_mpx_supported())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
+ if (cpu_has_vmx_invpcid())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INVPCID);
+ if (vmx_pt_mode_is_host_guest())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
+
+ if (vmx_umip_emulated())
+ kvm_cpu_cap_set(X86_FEATURE_UMIP);
+
+ /* CPUID 0xD.1 */
+ supported_xss = 0;
+ if (!cpu_has_vmx_xsaves())
+ kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
+
+ /* CPUID 0x80000001 and 0x7 (RDPID) */
+ if (!cpu_has_vmx_rdtscp()) {
+ kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
+ kvm_cpu_cap_clear(X86_FEATURE_RDPID);
+ }
+
+ if (cpu_has_vmx_waitpkg())
+ kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
}
static void vmx_request_immediate_exit(struct kvm_vcpu *vcpu)
@@ -7133,20 +7378,21 @@
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
- enum x86_intercept_stage stage)
+ enum x86_intercept_stage stage,
+ struct x86_exception *exception)
{
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
- struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
switch (info->intercept) {
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
+ * Note, RDPID is hidden behind ENABLE_RDTSCP.
*/
- case x86_intercept_rdtscp:
- if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
- ctxt->exception.vector = UD_VECTOR;
- ctxt->exception.error_code_valid = false;
+ case x86_intercept_rdpid:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
+ exception->vector = UD_VECTOR;
+ exception->error_code_valid = false;
return X86EMUL_PROPAGATE_FAULT;
}
break;
@@ -7205,10 +7451,6 @@
u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
- if (kvm_mwait_in_guest(vcpu->kvm) ||
- kvm_can_post_timer_interrupt(vcpu))
- return -EOPNOTSUPP;
-
vmx = to_vmx(vcpu);
tscl = rdtsc();
guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
@@ -7257,7 +7499,8 @@
static void vmx_slot_enable_log_dirty(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
- kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
+ if (!kvm_dirty_log_manual_protect_and_init_set(kvm))
+ kvm_mmu_slot_leaf_clear_dirty(kvm, slot);
kvm_mmu_slot_largepage_remove_write_access(kvm, slot);
}
@@ -7272,42 +7515,6 @@
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
-{
- struct vmcs12 *vmcs12;
- struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t dst;
-
- if (is_guest_mode(vcpu)) {
- WARN_ON_ONCE(vmx->nested.pml_full);
-
- /*
- * Check if PML is enabled for the nested guest.
- * Whether eptp bit 6 is set is already checked
- * as part of A/D emulation.
- */
- vmcs12 = get_vmcs12(vcpu);
- if (!nested_cpu_has_pml(vmcs12))
- return 0;
-
- if (vmcs12->guest_pml_index >= PML_ENTITY_NUM) {
- vmx->nested.pml_full = true;
- return 1;
- }
-
- gpa &= ~0xFFFull;
- dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
-
- if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
- offset_in_page(dst), sizeof(gpa)))
- return 0;
-
- vmcs12->guest_pml_index--;
- }
-
- return 0;
-}
-
static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *memslot,
gfn_t offset, unsigned long mask)
@@ -7315,107 +7522,6 @@
kvm_mmu_clear_dirty_pt_masked(kvm, memslot, offset, mask);
}
-static void __pi_post_block(struct kvm_vcpu *vcpu)
-{
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
- struct pi_desc old, new;
- unsigned int dest;
-
- do {
- old.control = new.control = pi_desc->control;
- WARN(old.nv != POSTED_INTR_WAKEUP_VECTOR,
- "Wakeup handler not enabled while the VCPU is blocked\n");
-
- dest = cpu_physical_id(vcpu->cpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'notification vector' */
- new.nv = POSTED_INTR_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- if (!WARN_ON_ONCE(vcpu->pre_pcpu == -1)) {
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_del(&vcpu->blocked_vcpu_list);
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- vcpu->pre_pcpu = -1;
- }
-}
-
-/*
- * This routine does the following things for vCPU which is going
- * to be blocked if VT-d PI is enabled.
- * - Store the vCPU to the wakeup list, so when interrupts happen
- * we can find the right vCPU to wake up.
- * - Change the Posted-interrupt descriptor as below:
- * 'NDST' <-- vcpu->pre_pcpu
- * 'NV' <-- POSTED_INTR_WAKEUP_VECTOR
- * - If 'ON' is set during this process, which means at least one
- * interrupt is posted for this vCPU, we cannot block it, in
- * this case, return 1, otherwise, return 0.
- *
- */
-static int pi_pre_block(struct kvm_vcpu *vcpu)
-{
- unsigned int dest;
- struct pi_desc old, new;
- struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
-
- if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(vcpu))
- return 0;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- if (!WARN_ON_ONCE(vcpu->pre_pcpu != -1)) {
- vcpu->pre_pcpu = vcpu->cpu;
- spin_lock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- list_add_tail(&vcpu->blocked_vcpu_list,
- &per_cpu(blocked_vcpu_on_cpu,
- vcpu->pre_pcpu));
- spin_unlock(&per_cpu(blocked_vcpu_on_cpu_lock, vcpu->pre_pcpu));
- }
-
- do {
- old.control = new.control = pi_desc->control;
-
- WARN((pi_desc->sn == 1),
- "Warning: SN field of posted-interrupts "
- "is set before blocking\n");
-
- /*
- * Since vCPU can be preempted during this process,
- * vcpu->cpu could be different with pre_pcpu, we
- * need to set pre_pcpu as the destination of wakeup
- * notification event, then we can find the right vCPU
- * to wakeup in wakeup handler if interrupts happen
- * when the vCPU is in blocked state.
- */
- dest = cpu_physical_id(vcpu->pre_pcpu);
-
- if (x2apic_enabled())
- new.ndst = dest;
- else
- new.ndst = (dest << 8) & 0xFF00;
-
- /* set 'NV' to 'wakeup vector' */
- new.nv = POSTED_INTR_WAKEUP_VECTOR;
- } while (cmpxchg64(&pi_desc->control, old.control,
- new.control) != old.control);
-
- /* We should not block the vCPU if an interrupt is posted for it. */
- if (pi_test_on(pi_desc) == 1)
- __pi_post_block(vcpu);
-
- local_irq_enable();
- return (vcpu->pre_pcpu == -1);
-}
-
static int vmx_pre_block(struct kvm_vcpu *vcpu)
{
if (pi_pre_block(vcpu))
@@ -7427,135 +7533,30 @@
return 0;
}
-static void pi_post_block(struct kvm_vcpu *vcpu)
-{
- if (vcpu->pre_pcpu == -1)
- return;
-
- WARN_ON(irqs_disabled());
- local_irq_disable();
- __pi_post_block(vcpu);
- local_irq_enable();
-}
-
static void vmx_post_block(struct kvm_vcpu *vcpu)
{
- if (kvm_x86_ops->set_hv_timer)
+ if (kvm_x86_ops.set_hv_timer)
kvm_lapic_switch_to_hv_timer(vcpu);
pi_post_block(vcpu);
}
-/*
- * vmx_update_pi_irte - set IRTE for Posted-Interrupts
- *
- * @kvm: kvm
- * @host_irq: host irq of the interrupt
- * @guest_irq: gsi of the interrupt
- * @set: set or unset PI
- * returns 0 on success, < 0 on failure
- */
-static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
- uint32_t guest_irq, bool set)
-{
- struct kvm_kernel_irq_routing_entry *e;
- struct kvm_irq_routing_table *irq_rt;
- struct kvm_lapic_irq irq;
- struct kvm_vcpu *vcpu;
- struct vcpu_data vcpu_info;
- int idx, ret = 0;
-
- if (!kvm_arch_has_assigned_device(kvm) ||
- !irq_remapping_cap(IRQ_POSTING_CAP) ||
- !kvm_vcpu_apicv_active(kvm->vcpus[0]))
- return 0;
-
- idx = srcu_read_lock(&kvm->irq_srcu);
- irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- if (guest_irq >= irq_rt->nr_rt_entries ||
- hlist_empty(&irq_rt->map[guest_irq])) {
- pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
- guest_irq, irq_rt->nr_rt_entries);
- goto out;
- }
-
- hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
- if (e->type != KVM_IRQ_ROUTING_MSI)
- continue;
- /*
- * VT-d PI cannot support posting multicast/broadcast
- * interrupts to a vCPU, we still use interrupt remapping
- * for these kind of interrupts.
- *
- * For lowest-priority interrupts, we only support
- * those with single CPU as the destination, e.g. user
- * configures the interrupts via /proc/irq or uses
- * irqbalance to make the interrupts single-CPU.
- *
- * We will support full lowest-priority interrupt later.
- *
- * In addition, we can only inject generic interrupts using
- * the PI mechanism, refuse to route others through it.
- */
-
- kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
- !kvm_irq_is_postable(&irq)) {
- /*
- * Make sure the IRTE is in remapped mode if
- * we don't handle it in posted mode.
- */
- ret = irq_set_vcpu_affinity(host_irq, NULL);
- if (ret < 0) {
- printk(KERN_INFO
- "failed to back to remapped mode, irq: %u\n",
- host_irq);
- goto out;
- }
-
- continue;
- }
-
- vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu));
- vcpu_info.vector = irq.vector;
-
- trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi,
- vcpu_info.vector, vcpu_info.pi_desc_addr, set);
-
- if (set)
- ret = irq_set_vcpu_affinity(host_irq, &vcpu_info);
- else
- ret = irq_set_vcpu_affinity(host_irq, NULL);
-
- if (ret < 0) {
- printk(KERN_INFO "%s: failed to update PI IRTE\n",
- __func__);
- goto out;
- }
- }
-
- ret = 0;
-out:
- srcu_read_unlock(&kvm->irq_srcu, idx);
- return ret;
-}
-
static void vmx_setup_mce(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.mcg_cap & MCG_LMCE_P)
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
- FEATURE_CONTROL_LMCE;
+ FEAT_CTL_LMCE_ENABLED;
else
to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
- ~FEATURE_CONTROL_LMCE;
+ ~FEAT_CTL_LMCE_ENABLED;
}
-static int vmx_smi_allowed(struct kvm_vcpu *vcpu)
+static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
{
/* we need a nested vmexit to enter SMM, postpone if run is pending */
if (to_vmx(vcpu)->nested.nested_run_pending)
- return 0;
- return 1;
+ return -EBUSY;
+ return !is_smm(vcpu);
}
static int vmx_pre_enter_smm(struct kvm_vcpu *vcpu, char *smstate)
@@ -7592,14 +7593,9 @@
return 0;
}
-static int enable_smi_window(struct kvm_vcpu *vcpu)
+static void enable_smi_window(struct kvm_vcpu *vcpu)
{
- return 0;
-}
-
-static bool vmx_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
-{
- return false;
+ /* RSM will cause a vmexit anyway. */
}
static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
@@ -7607,19 +7603,175 @@
return to_vmx(vcpu)->nested.vmxon;
}
+static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
+{
+ if (is_guest_mode(vcpu)) {
+ struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
+
+ if (hrtimer_try_to_cancel(timer) == 1)
+ hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
+ }
+}
+
+static void hardware_unsetup(void)
+{
+ kvm_set_posted_intr_wakeup_handler(NULL);
+
+ if (nested)
+ nested_vmx_hardware_unsetup();
+
+ free_kvm_area();
+}
+
+static bool vmx_check_apicv_inhibit_reasons(ulong bit)
+{
+ ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) |
+ BIT(APICV_INHIBIT_REASON_HYPERV);
+
+ return supported & BIT(bit);
+}
+
+static struct kvm_x86_ops vmx_x86_ops __initdata = {
+ .hardware_unsetup = hardware_unsetup,
+
+ .hardware_enable = hardware_enable,
+ .hardware_disable = hardware_disable,
+ .cpu_has_accelerated_tpr = report_flexpriority,
+ .has_emulated_msr = vmx_has_emulated_msr,
+
+ .vm_size = sizeof(struct kvm_vmx),
+ .vm_init = vmx_vm_init,
+
+ .vcpu_create = vmx_create_vcpu,
+ .vcpu_free = vmx_free_vcpu,
+ .vcpu_reset = vmx_vcpu_reset,
+
+ .prepare_guest_switch = vmx_prepare_switch_to_guest,
+ .vcpu_load = vmx_vcpu_load,
+ .vcpu_put = vmx_vcpu_put,
+
+ .update_exception_bitmap = update_exception_bitmap,
+ .get_msr_feature = vmx_get_msr_feature,
+ .get_msr = vmx_get_msr,
+ .set_msr = vmx_set_msr,
+ .get_segment_base = vmx_get_segment_base,
+ .get_segment = vmx_get_segment,
+ .set_segment = vmx_set_segment,
+ .get_cpl = vmx_get_cpl,
+ .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
+ .set_cr0 = vmx_set_cr0,
+ .set_cr4 = vmx_set_cr4,
+ .set_efer = vmx_set_efer,
+ .get_idt = vmx_get_idt,
+ .set_idt = vmx_set_idt,
+ .get_gdt = vmx_get_gdt,
+ .set_gdt = vmx_set_gdt,
+ .set_dr7 = vmx_set_dr7,
+ .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
+ .cache_reg = vmx_cache_reg,
+ .get_rflags = vmx_get_rflags,
+ .set_rflags = vmx_set_rflags,
+
+ .tlb_flush_all = vmx_flush_tlb_all,
+ .tlb_flush_current = vmx_flush_tlb_current,
+ .tlb_flush_gva = vmx_flush_tlb_gva,
+ .tlb_flush_guest = vmx_flush_tlb_guest,
+
+ .run = vmx_vcpu_run,
+ .handle_exit = vmx_handle_exit,
+ .skip_emulated_instruction = vmx_skip_emulated_instruction,
+ .update_emulated_instruction = vmx_update_emulated_instruction,
+ .set_interrupt_shadow = vmx_set_interrupt_shadow,
+ .get_interrupt_shadow = vmx_get_interrupt_shadow,
+ .patch_hypercall = vmx_patch_hypercall,
+ .set_irq = vmx_inject_irq,
+ .set_nmi = vmx_inject_nmi,
+ .queue_exception = vmx_queue_exception,
+ .cancel_injection = vmx_cancel_injection,
+ .interrupt_allowed = vmx_interrupt_allowed,
+ .nmi_allowed = vmx_nmi_allowed,
+ .get_nmi_mask = vmx_get_nmi_mask,
+ .set_nmi_mask = vmx_set_nmi_mask,
+ .enable_nmi_window = enable_nmi_window,
+ .enable_irq_window = enable_irq_window,
+ .update_cr8_intercept = update_cr8_intercept,
+ .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
+ .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
+ .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
+ .load_eoi_exitmap = vmx_load_eoi_exitmap,
+ .apicv_post_state_restore = vmx_apicv_post_state_restore,
+ .check_apicv_inhibit_reasons = vmx_check_apicv_inhibit_reasons,
+ .hwapic_irr_update = vmx_hwapic_irr_update,
+ .hwapic_isr_update = vmx_hwapic_isr_update,
+ .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
+ .sync_pir_to_irr = vmx_sync_pir_to_irr,
+ .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
+ .dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
+
+ .set_tss_addr = vmx_set_tss_addr,
+ .set_identity_map_addr = vmx_set_identity_map_addr,
+ .get_mt_mask = vmx_get_mt_mask,
+
+ .get_exit_info = vmx_get_exit_info,
+
+ .vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
+
+ .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
+
+ .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
+
+ .load_mmu_pgd = vmx_load_mmu_pgd,
+
+ .check_intercept = vmx_check_intercept,
+ .handle_exit_irqoff = vmx_handle_exit_irqoff,
+
+ .request_immediate_exit = vmx_request_immediate_exit,
+
+ .sched_in = vmx_sched_in,
+
+ .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
+ .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
+ .flush_log_dirty = vmx_flush_log_dirty,
+ .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
+
+ .pre_block = vmx_pre_block,
+ .post_block = vmx_post_block,
+
+ .pmu_ops = &intel_pmu_ops,
+ .nested_ops = &vmx_nested_ops,
+
+ .update_pi_irte = pi_update_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
+
+ .smi_allowed = vmx_smi_allowed,
+ .pre_enter_smm = vmx_pre_enter_smm,
+ .pre_leave_smm = vmx_pre_leave_smm,
+ .enable_smi_window = enable_smi_window,
+
+ .can_emulate_instruction = vmx_can_emulate_instruction,
+ .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .migrate_timers = vmx_migrate_timers,
+
+ .msr_filter_changed = vmx_msr_filter_changed,
+};
+
static __init int hardware_setup(void)
{
unsigned long host_bndcfgs;
struct desc_ptr dt;
- int r, i;
-
- rdmsrl_safe(MSR_EFER, &host_efer);
+ int r, i, ept_lpage_level;
store_idt(&dt);
host_idt_base = dt.address;
- for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i)
- kvm_define_shared_msr(i, vmx_msr_index[i]);
+ for (i = 0; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
+ kvm_define_user_return_msr(i, vmx_uret_msrs_list[i]);
if (setup_vmcs_config(&vmcs_config, &vmx_capability) < 0)
return -EIO;
@@ -7632,8 +7784,9 @@
WARN_ONCE(host_bndcfgs, "KVM: BNDCFGS in host will be lost");
}
- if (boot_cpu_has(X86_FEATURE_XSAVES))
- rdmsrl(MSR_IA32_XSS, host_xss);
+ if (!cpu_has_vmx_mpx())
+ supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS |
+ XFEATURE_MASK_BNDCSR);
if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() ||
!(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global()))
@@ -7663,19 +7816,16 @@
* using the APIC_ACCESS_ADDR VMCS field.
*/
if (!flexpriority_enabled)
- kvm_x86_ops->set_apic_access_page_addr = NULL;
+ vmx_x86_ops.set_apic_access_page_addr = NULL;
if (!cpu_has_vmx_tpr_shadow())
- kvm_x86_ops->update_cr8_intercept = NULL;
-
- if (enable_ept && !cpu_has_vmx_ept_2m_page())
- kvm_disable_largepages();
+ vmx_x86_ops.update_cr8_intercept = NULL;
#if IS_ENABLED(CONFIG_HYPERV)
if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
&& enable_ept) {
- kvm_x86_ops->tlb_remote_flush = hv_remote_flush_tlb;
- kvm_x86_ops->tlb_remote_flush_with_range =
+ vmx_x86_ops.tlb_remote_flush = hv_remote_flush_tlb;
+ vmx_x86_ops.tlb_remote_flush_with_range =
hv_remote_flush_tlb_with_range;
}
#endif
@@ -7690,7 +7840,7 @@
if (!cpu_has_vmx_apicv()) {
enable_apicv = 0;
- kvm_x86_ops->sync_pir_to_irr = NULL;
+ vmx_x86_ops.sync_pir_to_irr = NULL;
}
if (cpu_has_vmx_tsc_scaling()) {
@@ -7703,8 +7853,16 @@
if (enable_ept)
vmx_enable_tdp();
+
+ if (!enable_ept)
+ ept_lpage_level = 0;
+ else if (cpu_has_vmx_ept_1g_page())
+ ept_lpage_level = PG_LEVEL_1G;
+ else if (cpu_has_vmx_ept_2m_page())
+ ept_lpage_level = PG_LEVEL_2M;
else
- kvm_disable_tdp();
+ ept_lpage_level = PG_LEVEL_4K;
+ kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level);
/*
* Only enable PML when hardware supports PML feature, and both EPT
@@ -7714,10 +7872,10 @@
enable_pml = 0;
if (!enable_pml) {
- kvm_x86_ops->slot_enable_log_dirty = NULL;
- kvm_x86_ops->slot_disable_log_dirty = NULL;
- kvm_x86_ops->flush_log_dirty = NULL;
- kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
+ vmx_x86_ops.slot_enable_log_dirty = NULL;
+ vmx_x86_ops.slot_disable_log_dirty = NULL;
+ vmx_x86_ops.flush_log_dirty = NULL;
+ vmx_x86_ops.enable_log_dirty_pt_masked = NULL;
}
if (!cpu_has_vmx_preemption_timer())
@@ -7745,13 +7903,11 @@
}
if (!enable_preemption_timer) {
- kvm_x86_ops->set_hv_timer = NULL;
- kvm_x86_ops->cancel_hv_timer = NULL;
- kvm_x86_ops->request_immediate_exit = __kvm_request_immediate_exit;
+ vmx_x86_ops.set_hv_timer = NULL;
+ vmx_x86_ops.cancel_hv_timer = NULL;
+ vmx_x86_ops.request_immediate_exit = __kvm_request_immediate_exit;
}
- kvm_set_posted_intr_wakeup_handler(wakeup_handler);
-
kvm_mce_cap_supported |= MCG_LMCE_P;
if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
@@ -7768,172 +7924,25 @@
return r;
}
+ vmx_set_cpu_caps();
+
r = alloc_kvm_area();
if (r)
nested_vmx_hardware_unsetup();
+
+ kvm_set_posted_intr_wakeup_handler(pi_wakeup_handler);
+
return r;
}
-static __exit void hardware_unsetup(void)
-{
- if (nested)
- nested_vmx_hardware_unsetup();
-
- free_kvm_area();
-}
-
-static struct kvm_x86_ops vmx_x86_ops __ro_after_init = {
+static struct kvm_x86_init_ops vmx_init_ops __initdata = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
- .hardware_setup = hardware_setup,
- .hardware_unsetup = hardware_unsetup,
.check_processor_compatibility = vmx_check_processor_compat,
- .hardware_enable = hardware_enable,
- .hardware_disable = hardware_disable,
- .cpu_has_accelerated_tpr = report_flexpriority,
- .has_emulated_msr = vmx_has_emulated_msr,
+ .hardware_setup = hardware_setup,
+ .intel_pt_intr_in_guest = vmx_pt_mode_is_host_guest,
- .vm_init = vmx_vm_init,
- .vm_alloc = vmx_vm_alloc,
- .vm_free = vmx_vm_free,
-
- .vcpu_create = vmx_create_vcpu,
- .vcpu_free = vmx_free_vcpu,
- .vcpu_reset = vmx_vcpu_reset,
-
- .prepare_guest_switch = vmx_prepare_switch_to_guest,
- .vcpu_load = vmx_vcpu_load,
- .vcpu_put = vmx_vcpu_put,
-
- .update_bp_intercept = update_exception_bitmap,
- .get_msr_feature = vmx_get_msr_feature,
- .get_msr = vmx_get_msr,
- .set_msr = vmx_set_msr,
- .get_segment_base = vmx_get_segment_base,
- .get_segment = vmx_get_segment,
- .set_segment = vmx_set_segment,
- .get_cpl = vmx_get_cpl,
- .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
- .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
- .decache_cr3 = vmx_decache_cr3,
- .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
- .set_cr0 = vmx_set_cr0,
- .set_cr3 = vmx_set_cr3,
- .set_cr4 = vmx_set_cr4,
- .set_efer = vmx_set_efer,
- .get_idt = vmx_get_idt,
- .set_idt = vmx_set_idt,
- .get_gdt = vmx_get_gdt,
- .set_gdt = vmx_set_gdt,
- .get_dr6 = vmx_get_dr6,
- .set_dr6 = vmx_set_dr6,
- .set_dr7 = vmx_set_dr7,
- .sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
- .cache_reg = vmx_cache_reg,
- .get_rflags = vmx_get_rflags,
- .set_rflags = vmx_set_rflags,
-
- .tlb_flush = vmx_flush_tlb,
- .tlb_flush_gva = vmx_flush_tlb_gva,
-
- .run = vmx_vcpu_run,
- .handle_exit = vmx_handle_exit,
- .skip_emulated_instruction = skip_emulated_instruction,
- .set_interrupt_shadow = vmx_set_interrupt_shadow,
- .get_interrupt_shadow = vmx_get_interrupt_shadow,
- .patch_hypercall = vmx_patch_hypercall,
- .set_irq = vmx_inject_irq,
- .set_nmi = vmx_inject_nmi,
- .queue_exception = vmx_queue_exception,
- .cancel_injection = vmx_cancel_injection,
- .interrupt_allowed = vmx_interrupt_allowed,
- .nmi_allowed = vmx_nmi_allowed,
- .get_nmi_mask = vmx_get_nmi_mask,
- .set_nmi_mask = vmx_set_nmi_mask,
- .enable_nmi_window = enable_nmi_window,
- .enable_irq_window = enable_irq_window,
- .update_cr8_intercept = update_cr8_intercept,
- .set_virtual_apic_mode = vmx_set_virtual_apic_mode,
- .set_apic_access_page_addr = vmx_set_apic_access_page_addr,
- .get_enable_apicv = vmx_get_enable_apicv,
- .refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
- .load_eoi_exitmap = vmx_load_eoi_exitmap,
- .apicv_post_state_restore = vmx_apicv_post_state_restore,
- .hwapic_irr_update = vmx_hwapic_irr_update,
- .hwapic_isr_update = vmx_hwapic_isr_update,
- .guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
- .sync_pir_to_irr = vmx_sync_pir_to_irr,
- .deliver_posted_interrupt = vmx_deliver_posted_interrupt,
- .dy_apicv_has_pending_interrupt = vmx_dy_apicv_has_pending_interrupt,
-
- .set_tss_addr = vmx_set_tss_addr,
- .set_identity_map_addr = vmx_set_identity_map_addr,
- .get_tdp_level = get_ept_level,
- .get_mt_mask = vmx_get_mt_mask,
-
- .get_exit_info = vmx_get_exit_info,
-
- .get_lpage_level = vmx_get_lpage_level,
-
- .cpuid_update = vmx_cpuid_update,
-
- .rdtscp_supported = vmx_rdtscp_supported,
- .invpcid_supported = vmx_invpcid_supported,
-
- .set_supported_cpuid = vmx_set_supported_cpuid,
-
- .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
-
- .read_l1_tsc_offset = vmx_read_l1_tsc_offset,
- .write_l1_tsc_offset = vmx_write_l1_tsc_offset,
-
- .set_tdp_cr3 = vmx_set_cr3,
-
- .check_intercept = vmx_check_intercept,
- .handle_exit_irqoff = vmx_handle_exit_irqoff,
- .mpx_supported = vmx_mpx_supported,
- .xsaves_supported = vmx_xsaves_supported,
- .umip_emulated = vmx_umip_emulated,
- .pt_supported = vmx_pt_supported,
- .pku_supported = vmx_pku_supported,
-
- .request_immediate_exit = vmx_request_immediate_exit,
-
- .sched_in = vmx_sched_in,
-
- .slot_enable_log_dirty = vmx_slot_enable_log_dirty,
- .slot_disable_log_dirty = vmx_slot_disable_log_dirty,
- .flush_log_dirty = vmx_flush_log_dirty,
- .enable_log_dirty_pt_masked = vmx_enable_log_dirty_pt_masked,
- .write_log_dirty = vmx_write_pml_buffer,
-
- .pre_block = vmx_pre_block,
- .post_block = vmx_post_block,
-
- .pmu_ops = &intel_pmu_ops,
-
- .update_pi_irte = vmx_update_pi_irte,
-
-#ifdef CONFIG_X86_64
- .set_hv_timer = vmx_set_hv_timer,
- .cancel_hv_timer = vmx_cancel_hv_timer,
-#endif
-
- .setup_mce = vmx_setup_mce,
-
- .smi_allowed = vmx_smi_allowed,
- .pre_enter_smm = vmx_pre_enter_smm,
- .pre_leave_smm = vmx_pre_leave_smm,
- .enable_smi_window = enable_smi_window,
-
- .check_nested_events = NULL,
- .get_nested_state = NULL,
- .set_nested_state = NULL,
- .get_vmcs12_pages = NULL,
- .nested_enable_evmcs = NULL,
- .nested_get_evmcs_version = NULL,
- .need_emulation_on_page_fault = vmx_need_emulation_on_page_fault,
- .apic_init_signal_blocked = vmx_apic_init_signal_blocked,
+ .runtime_ops = &vmx_x86_ops,
};
static void vmx_cleanup_l1d_flush(void)
@@ -8020,7 +8029,7 @@
}
#endif
- r = kvm_init(&vmx_x86_ops, sizeof(struct vcpu_vmx),
+ r = kvm_init(&vmx_init_ops, sizeof(struct vcpu_vmx),
__alignof__(struct vcpu_vmx), THIS_MODULE);
if (r)
return r;
@@ -8040,8 +8049,8 @@
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+
+ pi_init_cpu(cpu);
}
#ifdef CONFIG_KEXEC_CORE
@@ -8050,6 +8059,14 @@
#endif
vmx_check_vmcs12_offsets();
+ /*
+ * Shadow paging doesn't have a (further) performance penalty
+ * from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
+ * by default
+ */
+ if (!enable_ept)
+ allow_smaller_maxphyaddr = true;
+
return 0;
}
module_init(vmx_init);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 55731dd..5ff2453 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -8,11 +8,13 @@
#include <asm/intel_pt.h>
#include "capabilities.h"
-#include "ops.h"
+#include "kvm_cache_regs.h"
+#include "posted_intr.h"
#include "vmcs.h"
+#include "vmx_ops.h"
+#include "cpuid.h"
extern const u32 vmx_msr_index[];
-extern u64 host_efer;
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
@@ -20,15 +22,21 @@
#define X2APIC_MSR(r) (APIC_BASE_MSR + ((r) >> 4))
-#define NR_AUTOLOAD_MSRS 8
+#ifdef CONFIG_X86_64
+#define MAX_NR_USER_RETURN_MSRS 7
+#else
+#define MAX_NR_USER_RETURN_MSRS 4
+#endif
+
+#define MAX_NR_LOADSTORE_MSRS 8
struct vmx_msrs {
unsigned int nr;
- struct vmx_msr_entry val[NR_AUTOLOAD_MSRS];
+ struct vmx_msr_entry val[MAX_NR_LOADSTORE_MSRS];
};
-struct shared_msr_entry {
- unsigned index;
+struct vmx_uret_msr {
+ unsigned int slot; /* The MSR's slot in kvm_user_return_msrs. */
u64 data;
u64 mask;
};
@@ -42,29 +50,6 @@
SEG_FIELD_NR = 4
};
-/* Posted-Interrupt Descriptor */
-struct pi_desc {
- u32 pir[8]; /* Posted interrupt requested */
- union {
- struct {
- /* bit 256 - Outstanding Notification */
- u16 on : 1,
- /* bit 257 - Suppress Notification */
- sn : 1,
- /* bit 271:258 - Reserved */
- rsvd_1 : 14;
- /* bit 279:272 - Notification Vector */
- u8 nv;
- /* bit 287:280 - Reserved */
- u8 rsvd_2;
- /* bit 319:288 - Notification Destination */
- u32 ndst;
- };
- u64 control;
- };
- u32 rsvd[6];
-} __aligned(64);
-
#define RTIT_ADDR_RANGE 4
struct pt_ctx {
@@ -85,6 +70,29 @@
struct pt_ctx guest;
};
+union vmx_exit_reason {
+ struct {
+ u32 basic : 16;
+ u32 reserved16 : 1;
+ u32 reserved17 : 1;
+ u32 reserved18 : 1;
+ u32 reserved19 : 1;
+ u32 reserved20 : 1;
+ u32 reserved21 : 1;
+ u32 reserved22 : 1;
+ u32 reserved23 : 1;
+ u32 reserved24 : 1;
+ u32 reserved25 : 1;
+ u32 reserved26 : 1;
+ u32 enclave_mode : 1;
+ u32 smi_pending_mtf : 1;
+ u32 smi_from_vmx_root : 1;
+ u32 reserved30 : 1;
+ u32 failed_vmentry : 1;
+ };
+ u32 full;
+};
+
/*
* The nested_vmx structure is part of vcpu_vmx, and holds information we need
* for correct emulation of VMX (i.e., nested VMX) on this vcpu.
@@ -131,6 +139,7 @@
bool vmcs02_initialized;
bool change_vmcs01_virtual_apic_mode;
+ bool reload_vmcs01_apic_access_page;
/*
* Enlightened VMCS has been enabled. It does not mean that L1 has to
@@ -142,6 +151,9 @@
/* L2 must run next, and mustn't decide to exit to L1. */
bool nested_run_pending;
+ /* Pending MTF VM-exit into L1. */
+ bool mtf_pending;
+
struct loaded_vmcs vmcs02;
/*
@@ -159,12 +171,17 @@
u16 posted_intr_nv;
struct hrtimer preemption_timer;
+ u64 preemption_timer_deadline;
+ bool has_preemption_timer_deadline;
bool preemption_timer_expired;
/* to migrate it to L2 if VM_ENTRY_LOAD_DEBUG_CONTROLS is off */
u64 vmcs01_debugctl;
u64 vmcs01_guest_bndcfgs;
+ /* to migrate it to L1 if L2 writes to L1's CR8 directly */
+ int l1_tpr_threshold;
+
u16 vpid02;
u16 last_vpid;
@@ -197,14 +214,15 @@
*/
bool guest_state_loaded;
+ unsigned long exit_qualification;
u32 exit_intr_info;
u32 idt_vectoring_info;
ulong rflags;
- struct shared_msr_entry *guest_msrs;
- int nmsrs;
- int save_nmsrs;
- bool guest_msrs_ready;
+ struct vmx_uret_msr guest_uret_msrs[MAX_NR_USER_RETURN_MSRS];
+ int nr_uret_msrs;
+ int nr_active_uret_msrs;
+ bool guest_uret_msrs_loaded;
#ifdef CONFIG_X86_64
u64 msr_host_kernel_gs_base;
u64 msr_guest_kernel_gs_base;
@@ -228,6 +246,10 @@
struct vmx_msrs host;
} msr_autoload;
+ struct msr_autostore {
+ struct vmx_msrs guest;
+ } msr_autostore;
+
struct {
int vm86_active;
ulong save_rflags;
@@ -245,7 +267,7 @@
int vpid;
bool emulation_required;
- u32 exit_reason;
+ union vmx_exit_reason exit_reason;
/* Posted interrupt descriptor */
struct pi_desc pi_desc;
@@ -268,13 +290,11 @@
u64 current_tsc_ratio;
- u32 host_pkru;
-
unsigned long host_debugctlmsr;
/*
* Only bits masked by msr_ia32_feature_control_valid_bits can be set in
- * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * msr_ia32_feature_control. FEAT_CTL_LOCKED is always included
* in msr_ia32_feature_control_valid_bits.
*/
u64 msr_ia32_feature_control;
@@ -282,6 +302,13 @@
u64 ept_pointer;
struct pt_desc pt_desc;
+
+ /* Save desired MSR intercept (read: pass-through) state */
+#define MAX_POSSIBLE_PASSTHROUGH_MSRS 13
+ struct {
+ DECLARE_BITMAP(read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ DECLARE_BITMAP(write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
+ } shadow_msr_intercept;
};
enum ept_pointers_status {
@@ -304,7 +331,6 @@
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
struct loaded_vmcs *buddy);
-void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
void vmx_set_constant_host_state(struct vcpu_vmx *vmx);
@@ -316,84 +342,29 @@
void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu);
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
-void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
+int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
+u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa,
+ int root_level);
+
+bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu);
void update_exception_bitmap(struct kvm_vcpu *vcpu);
void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu);
+bool vmx_nmi_blocked(struct kvm_vcpu *vcpu);
+bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu);
bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu);
void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked);
void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu);
-struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr);
-void pt_update_intercept_for_msr(struct vcpu_vmx *vmx);
+struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
+void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
-
-#define POSTED_INTR_ON 0
-#define POSTED_INTR_SN 1
-
-static inline bool pi_test_and_set_on(struct pi_desc *pi_desc)
-{
- return test_and_set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline bool pi_test_and_clear_on(struct pi_desc *pi_desc)
-{
- return test_and_clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_and_set_pir(int vector, struct pi_desc *pi_desc)
-{
- return test_and_set_bit(vector, (unsigned long *)pi_desc->pir);
-}
-
-static inline bool pi_is_pir_empty(struct pi_desc *pi_desc)
-{
- return bitmap_empty((unsigned long *)pi_desc->pir, NR_VECTORS);
-}
-
-static inline void pi_set_sn(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_set_on(struct pi_desc *pi_desc)
-{
- set_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_on(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline void pi_clear_sn(struct pi_desc *pi_desc)
-{
- clear_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_on(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_ON,
- (unsigned long *)&pi_desc->control);
-}
-
-static inline int pi_test_sn(struct pi_desc *pi_desc)
-{
- return test_bit(POSTED_INTR_SN,
- (unsigned long *)&pi_desc->control);
-}
+int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
+void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
static inline u8 vmx_get_rvi(void)
{
@@ -426,15 +397,24 @@
BUILD_CONTROLS_SHADOW(exec, CPU_BASED_VM_EXEC_CONTROL)
BUILD_CONTROLS_SHADOW(secondary_exec, SECONDARY_VM_EXEC_CONTROL)
-static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
+static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu)
{
- vmx->segment_cache.bitmask = 0;
+ vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
+ | (1 << VCPU_EXREG_RFLAGS)
+ | (1 << VCPU_EXREG_PDPTR)
+ | (1 << VCPU_EXREG_SEGMENTS)
+ | (1 << VCPU_EXREG_CR0)
+ | (1 << VCPU_EXREG_CR3)
+ | (1 << VCPU_EXREG_CR4)
+ | (1 << VCPU_EXREG_EXIT_INFO_1)
+ | (1 << VCPU_EXREG_EXIT_INFO_2));
+ vcpu->arch.regs_dirty = 0;
}
static inline u32 vmx_vmentry_ctrl(void)
{
u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP |
VM_ENTRY_LOAD_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -445,7 +425,7 @@
static inline u32 vmx_vmexit_ctrl(void)
{
u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
- if (pt_mode == PT_MODE_SYSTEM)
+ if (vmx_pt_mode_is_system())
vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP |
VM_EXIT_CLEAR_IA32_RTIT_CTL);
/* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */
@@ -466,16 +446,32 @@
return container_of(vcpu, struct vcpu_vmx, vcpu);
}
-static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu)
+static inline unsigned long vmx_get_exit_qual(struct kvm_vcpu *vcpu)
{
- return &(to_vmx(vcpu)->pi_desc);
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_1)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_1);
+ vmx->exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+ }
+ return vmx->exit_qualification;
+}
+
+static inline u32 vmx_get_intr_info(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+ if (!kvm_register_is_available(vcpu, VCPU_EXREG_EXIT_INFO_2)) {
+ kvm_register_mark_available(vcpu, VCPU_EXREG_EXIT_INFO_2);
+ vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
+ }
+ return vmx->exit_intr_info;
}
struct vmcs *alloc_vmcs_cpu(bool shadow, int cpu, gfp_t flags);
void free_vmcs(struct vmcs *vmcs);
int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs);
-void loaded_vmcs_init(struct loaded_vmcs *loaded_vmcs);
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs);
static inline struct vmcs *alloc_vmcs(bool shadow)
@@ -484,26 +480,6 @@
GFP_KERNEL_ACCOUNT);
}
-u64 construct_eptp(struct kvm_vcpu *vcpu, unsigned long root_hpa);
-
-static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid,
- bool invalidate_gpa)
-{
- if (enable_ept && (invalidate_gpa || !enable_vpid)) {
- if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
- return;
- ept_sync_context(construct_eptp(vcpu,
- vcpu->arch.mmu->root_hpa));
- } else {
- vpid_sync_context(vpid);
- }
-}
-
-static inline void vmx_flush_tlb(struct kvm_vcpu *vcpu, bool invalidate_gpa)
-{
- __vmx_flush_tlb(vcpu, to_vmx(vcpu)->vpid, invalidate_gpa);
-}
-
static inline void decache_tsc_multiplier(struct vcpu_vmx *vmx)
{
vmx->current_tsc_ratio = vmx->vcpu.arch.tsc_scaling_ratio;
@@ -516,6 +492,27 @@
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
+static inline bool vmx_need_pf_intercept(struct kvm_vcpu *vcpu)
+{
+ if (!enable_ept)
+ return true;
+
+ return allow_smaller_maxphyaddr && cpuid_maxphyaddr(vcpu) < boot_cpu_data.x86_phys_bits;
+}
+
+static inline bool is_unrestricted_guest(struct kvm_vcpu *vcpu)
+{
+ return enable_unrestricted_guest && (!is_guest_mode(vcpu) ||
+ (secondary_exec_controls_get(to_vmx(vcpu)) &
+ SECONDARY_EXEC_UNRESTRICTED_GUEST));
+}
+
+bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu);
+static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu)
+{
+ return is_unrestricted_guest(vcpu) || __vmx_guest_state_valid(vcpu);
+}
+
void dump_vmcs(void);
#endif /* __KVM_X86_VMX_H */
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/vmx_ops.h
similarity index 95%
rename from arch/x86/kvm/vmx/ops.h
rename to arch/x86/kvm/vmx/vmx_ops.h
index 19717d0..692b0c3 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/vmx_ops.h
@@ -146,7 +146,9 @@
: : op1 : "cc" : error, fault); \
return; \
error: \
+ instrumentation_begin(); \
insn##_error(error_args); \
+ instrumentation_end(); \
return; \
fault: \
kvm_spurious_fault(); \
@@ -161,7 +163,9 @@
: : op1, op2 : "cc" : error, fault); \
return; \
error: \
+ instrumentation_begin(); \
insn##_error(error_args); \
+ instrumentation_end(); \
return; \
fault: \
kvm_spurious_fault(); \
@@ -268,42 +272,38 @@
vmx_asm2(invept, "r"(ext), "m"(operand), ext, eptp, gpa);
}
-static inline bool vpid_sync_vcpu_addr(int vpid, gva_t addr)
-{
- if (vpid == 0)
- return true;
-
- if (cpu_has_vmx_invvpid_individual_addr()) {
- __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
- return true;
- }
-
- return false;
-}
-
static inline void vpid_sync_vcpu_single(int vpid)
{
if (vpid == 0)
return;
- if (cpu_has_vmx_invvpid_single())
- __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
+ __invvpid(VMX_VPID_EXTENT_SINGLE_CONTEXT, vpid, 0);
}
static inline void vpid_sync_vcpu_global(void)
{
- if (cpu_has_vmx_invvpid_global())
- __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
+ __invvpid(VMX_VPID_EXTENT_ALL_CONTEXT, 0, 0);
}
static inline void vpid_sync_context(int vpid)
{
if (cpu_has_vmx_invvpid_single())
vpid_sync_vcpu_single(vpid);
- else
+ else if (vpid != 0)
vpid_sync_vcpu_global();
}
+static inline void vpid_sync_vcpu_addr(int vpid, gva_t addr)
+{
+ if (vpid == 0)
+ return;
+
+ if (cpu_has_vmx_invvpid_individual_addr())
+ __invvpid(VMX_VPID_EXTENT_INDIVIDUAL_ADDR, vpid, addr);
+ else
+ vpid_sync_context(vpid);
+}
+
static inline void ept_sync_global(void)
{
__invept(VMX_EPT_EXTENT_GLOBAL, 0, 0);