Update Linux to v5.10.157
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.157.tar.xz
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
Change-Id: I7b30d9e98d8c465d6b44de8e7433b4a40b3289ba
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 41b0dc3..06a776f 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -661,13 +661,16 @@
entry->edx = 0;
}
break;
- case 9:
- break;
case 0xa: { /* Architectural Performance Monitoring */
struct x86_pmu_capability cap;
union cpuid10_eax eax;
union cpuid10_edx edx;
+ if (!static_cpu_has(X86_FEATURE_ARCH_PERFMON)) {
+ entry->eax = entry->ebx = entry->ecx = entry->edx = 0;
+ break;
+ }
+
perf_get_x86_pmu_capability(&cap);
/*
@@ -810,11 +813,13 @@
entry->eax = min(entry->eax, 0x8000001f);
break;
case 0x80000001:
+ entry->ebx &= ~GENMASK(27, 16);
cpuid_entry_override(entry, CPUID_8000_0001_EDX);
cpuid_entry_override(entry, CPUID_8000_0001_ECX);
break;
case 0x80000006:
- /* L2 cache and TLB: pass through host info. */
+ /* Drop reserved bits, pass host L2 cache and TLB info. */
+ entry->edx &= ~GENMASK(17, 16);
break;
case 0x80000007: /* Advanced power management */
/* invariant TSC is CPUID.80000007H:EDX[8] */
@@ -837,6 +842,7 @@
g_phys_as = phys_as;
entry->eax = g_phys_as | (virt_as << 8);
+ entry->ecx &= ~(GENMASK(31, 16) | GENMASK(11, 8));
entry->edx = 0;
cpuid_entry_override(entry, CPUID_8000_0008_EBX);
break;
@@ -856,6 +862,9 @@
entry->ecx = entry->edx = 0;
break;
case 0x8000001a:
+ entry->eax &= GENMASK(2, 0);
+ entry->ebx = entry->ecx = entry->edx = 0;
+ break;
case 0x8000001e:
break;
/* Support memory encryption cpuid if host supports it */
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index e82151b..63efccc 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -188,9 +188,6 @@
#define X8(x...) X4(x), X4(x)
#define X16(x...) X8(x), X8(x)
-#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
-#define FASTOP_SIZE 8
-
struct opcode {
u64 flags : 56;
u64 intercept : 8;
@@ -304,9 +301,15 @@
* Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
* different operand sizes can be reached by calculation, rather than a jump
* table (which would be bigger than the code).
+ *
+ * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR
+ * and 1 for the straight line speculation INT3, leaves 7 bytes for the
+ * body of the function. Currently none is larger than 4.
*/
static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
+#define FASTOP_SIZE 16
+
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
".type " name ", @function \n\t" \
@@ -316,19 +319,21 @@
__FOP_FUNC(#name)
#define __FOP_RET(name) \
- "ret \n\t" \
+ ASM_RET \
".size " name ", .-" name "\n\t"
#define FOP_RET(name) \
__FOP_RET(#name)
-#define FOP_START(op) \
+#define __FOP_START(op, align) \
extern void em_##op(struct fastop *fake); \
asm(".pushsection .text, \"ax\" \n\t" \
".global em_" #op " \n\t" \
- ".align " __stringify(FASTOP_SIZE) " \n\t" \
+ ".align " __stringify(align) " \n\t" \
"em_" #op ":\n\t"
+#define FOP_START(op) __FOP_START(op, FASTOP_SIZE)
+
#define FOP_END \
".popsection")
@@ -428,19 +433,29 @@
FOP_END
/* Special case for SETcc - 1 instruction per cc */
+
+/*
+ * Depending on .config the SETcc functions look like:
+ *
+ * SETcc %al [3 bytes]
+ * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK]
+ * INT3 [1 byte; CONFIG_SLS]
+ */
+#define SETCC_ALIGN 16
+
#define FOP_SETCC(op) \
- ".align 4 \n\t" \
+ ".align " __stringify(SETCC_ALIGN) " \n\t" \
".type " #op ", @function \n\t" \
#op ": \n\t" \
#op " %al \n\t" \
- __FOP_RET(#op)
+ __FOP_RET(#op) \
+ ".skip " __stringify(SETCC_ALIGN) " - (.-" #op "), 0xcc \n\t"
asm(".pushsection .fixup, \"ax\"\n"
- ".global kvm_fastop_exception \n"
- "kvm_fastop_exception: xor %esi, %esi; ret\n"
+ "kvm_fastop_exception: xor %esi, %esi; " ASM_RET
".popsection");
-FOP_START(setcc)
+__FOP_START(setcc, SETCC_ALIGN)
FOP_SETCC(seto)
FOP_SETCC(setno)
FOP_SETCC(setc)
@@ -781,8 +796,7 @@
ctxt->mode, linear);
}
-static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst,
- enum x86emul_mode mode)
+static inline int assign_eip(struct x86_emulate_ctxt *ctxt, ulong dst)
{
ulong linear;
int rc;
@@ -792,41 +806,71 @@
if (ctxt->op_bytes != sizeof(unsigned long))
addr.ea = dst & ((1UL << (ctxt->op_bytes << 3)) - 1);
- rc = __linearize(ctxt, addr, &max_size, 1, false, true, mode, &linear);
+ rc = __linearize(ctxt, addr, &max_size, 1, false, true, ctxt->mode, &linear);
if (rc == X86EMUL_CONTINUE)
ctxt->_eip = addr.ea;
return rc;
}
-static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
+static inline int emulator_recalc_and_set_mode(struct x86_emulate_ctxt *ctxt)
{
- return assign_eip(ctxt, dst, ctxt->mode);
+ u64 efer;
+ struct desc_struct cs;
+ u16 selector;
+ u32 base3;
+
+ ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
+
+ if (!(ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PE)) {
+ /* Real mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_REAL;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (ctxt->eflags & X86_EFLAGS_VM) {
+ /* Protected/VM86 mode. cpu must not have long mode active */
+ if (efer & EFER_LMA)
+ return X86EMUL_UNHANDLEABLE;
+ ctxt->mode = X86EMUL_MODE_VM86;
+ return X86EMUL_CONTINUE;
+ }
+
+ if (!ctxt->ops->get_segment(ctxt, &selector, &cs, &base3, VCPU_SREG_CS))
+ return X86EMUL_UNHANDLEABLE;
+
+ if (efer & EFER_LMA) {
+ if (cs.l) {
+ /* Proper long mode */
+ ctxt->mode = X86EMUL_MODE_PROT64;
+ } else if (cs.d) {
+ /* 32 bit compatibility mode*/
+ ctxt->mode = X86EMUL_MODE_PROT32;
+ } else {
+ ctxt->mode = X86EMUL_MODE_PROT16;
+ }
+ } else {
+ /* Legacy 32 bit / 16 bit mode */
+ ctxt->mode = cs.d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
+ }
+
+ return X86EMUL_CONTINUE;
}
-static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst,
- const struct desc_struct *cs_desc)
+static inline int assign_eip_near(struct x86_emulate_ctxt *ctxt, ulong dst)
{
- enum x86emul_mode mode = ctxt->mode;
- int rc;
+ return assign_eip(ctxt, dst);
+}
-#ifdef CONFIG_X86_64
- if (ctxt->mode >= X86EMUL_MODE_PROT16) {
- if (cs_desc->l) {
- u64 efer = 0;
+static int assign_eip_far(struct x86_emulate_ctxt *ctxt, ulong dst)
+{
+ int rc = emulator_recalc_and_set_mode(ctxt);
- ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
- if (efer & EFER_LMA)
- mode = X86EMUL_MODE_PROT64;
- } else
- mode = X86EMUL_MODE_PROT32; /* temporary value */
- }
-#endif
- if (mode == X86EMUL_MODE_PROT16 || mode == X86EMUL_MODE_PROT32)
- mode = cs_desc->d ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
- rc = assign_eip(ctxt, dst, mode);
- if (rc == X86EMUL_CONTINUE)
- ctxt->mode = mode;
- return rc;
+ if (rc != X86EMUL_CONTINUE)
+ return rc;
+
+ return assign_eip(ctxt, dst);
}
static inline int jmp_rel(struct x86_emulate_ctxt *ctxt, int rel)
@@ -1055,7 +1099,7 @@
static __always_inline u8 test_cc(unsigned int condition, unsigned long flags)
{
u8 rc;
- void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf);
+ void (*fop)(void) = (void *)em_setcc + SETCC_ALIGN * (condition & 0xf);
flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF;
asm("push %[flags]; popf; " CALL_NOSPEC
@@ -1718,11 +1762,6 @@
goto exception;
}
- if (!seg_desc.p) {
- err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
- goto exception;
- }
-
dpl = seg_desc.dpl;
switch (seg) {
@@ -1762,12 +1801,6 @@
case VCPU_SREG_TR:
if (seg_desc.s || (seg_desc.type != 1 && seg_desc.type != 9))
goto exception;
- old_desc = seg_desc;
- seg_desc.type |= 2; /* busy */
- ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
- sizeof(seg_desc), &ctxt->exception);
- if (ret != X86EMUL_CONTINUE)
- return ret;
break;
case VCPU_SREG_LDTR:
if (seg_desc.s || seg_desc.type != 2)
@@ -1786,6 +1819,11 @@
break;
}
+ if (!seg_desc.p) {
+ err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
+ goto exception;
+ }
+
if (seg_desc.s) {
/* mark segment as accessed */
if (!(seg_desc.type & 1)) {
@@ -1800,8 +1838,17 @@
if (ret != X86EMUL_CONTINUE)
return ret;
if (emul_is_noncanonical_address(get_desc_base(&seg_desc) |
- ((u64)base3 << 32), ctxt))
- return emulate_gp(ctxt, 0);
+ ((u64)base3 << 32), ctxt))
+ return emulate_gp(ctxt, err_code);
+ }
+
+ if (seg == VCPU_SREG_TR) {
+ old_desc = seg_desc;
+ seg_desc.type |= 2; /* busy */
+ ret = ctxt->ops->cmpxchg_emulated(ctxt, desc_addr, &old_desc, &seg_desc,
+ sizeof(seg_desc), &ctxt->exception);
+ if (ret != X86EMUL_CONTINUE)
+ return ret;
}
load:
ctxt->ops->set_segment(ctxt, selector, &seg_desc, base3, seg);
@@ -2021,7 +2068,7 @@
if (rc != X86EMUL_CONTINUE)
return rc;
- if (ctxt->modrm_reg == VCPU_SREG_SS)
+ if (seg == VCPU_SREG_SS)
ctxt->interruptibility = KVM_X86_SHADOW_INT_MOV_SS;
if (ctxt->op_bytes > 2)
rsp_increment(ctxt, ctxt->op_bytes - 2);
@@ -2238,7 +2285,7 @@
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2319,7 +2366,7 @@
&new_desc);
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, eip, &new_desc);
+ rc = assign_eip_far(ctxt, eip);
/* Error handling is not implemented. */
if (rc != X86EMUL_CONTINUE)
return X86EMUL_UNHANDLEABLE;
@@ -2939,6 +2986,7 @@
ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
ctxt->_eip = rdx;
+ ctxt->mode = usermode;
*reg_write(ctxt, VCPU_REGS_RSP) = rcx;
return X86EMUL_CONTINUE;
@@ -3535,7 +3583,7 @@
if (rc != X86EMUL_CONTINUE)
return rc;
- rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
+ rc = assign_eip_far(ctxt, ctxt->src.val);
if (rc != X86EMUL_CONTINUE)
goto fail;
@@ -3607,8 +3655,10 @@
{
u64 tsc_aux = 0;
- if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
+ if (!ctxt->ops->guest_has_rdpid(ctxt))
return emulate_ud(ctxt);
+
+ ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -3675,11 +3725,25 @@
static int em_cr_write(struct x86_emulate_ctxt *ctxt)
{
- if (ctxt->ops->set_cr(ctxt, ctxt->modrm_reg, ctxt->src.val))
+ int cr_num = ctxt->modrm_reg;
+ int r;
+
+ if (ctxt->ops->set_cr(ctxt, cr_num, ctxt->src.val))
return emulate_gp(ctxt, 0);
/* Disable writeback. */
ctxt->dst.type = OP_NONE;
+
+ if (cr_num == 0) {
+ /*
+ * CR0 write might have updated CR0.PE and/or CR0.PG
+ * which can affect the cpu's execution mode.
+ */
+ r = emulator_recalc_and_set_mode(ctxt);
+ if (r != X86EMUL_CONTINUE)
+ return r;
+ }
+
return X86EMUL_CONTINUE;
}
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 328f37e..09ec1cd 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -207,7 +207,7 @@
struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
int ret;
- if (!synic->active && !host)
+ if (!synic->active && (!host || data))
return 1;
trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
@@ -253,6 +253,9 @@
case HV_X64_MSR_EOM: {
int i;
+ if (!synic->active)
+ break;
+
for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
kvm_hv_notify_acked_sint(vcpu, i);
break;
@@ -425,6 +428,9 @@
struct kvm_lapic_irq irq;
int ret, vector;
+ if (KVM_BUG_ON(!lapic_in_kernel(vcpu), vcpu->kvm))
+ return -EINVAL;
+
if (sint >= ARRAY_SIZE(synic->sint))
return -EINVAL;
@@ -636,7 +642,7 @@
struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || config))
return 1;
trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
@@ -660,7 +666,7 @@
struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
- if (!synic->active && !host)
+ if (!synic->active && (!host || count))
return 1;
trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
diff --git a/arch/x86/kvm/kvm_emulate.h b/arch/x86/kvm/kvm_emulate.h
index 7d5be04..aeed6da 100644
--- a/arch/x86/kvm/kvm_emulate.h
+++ b/arch/x86/kvm/kvm_emulate.h
@@ -225,6 +225,7 @@
bool (*guest_has_long_mode)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_movbe)(struct x86_emulate_ctxt *ctxt);
bool (*guest_has_fxsr)(struct x86_emulate_ctxt *ctxt);
+ bool (*guest_has_rdpid)(struct x86_emulate_ctxt *ctxt);
void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 677d210..260727e 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -113,7 +113,8 @@
static bool kvm_can_post_timer_interrupt(struct kvm_vcpu *vcpu)
{
- return pi_inject_timer && kvm_vcpu_apicv_active(vcpu);
+ return pi_inject_timer && kvm_vcpu_apicv_active(vcpu) &&
+ (kvm_mwait_in_guest(vcpu->kvm) || kvm_hlt_in_guest(vcpu->kvm));
}
bool kvm_can_use_hv_timer(struct kvm_vcpu *vcpu)
@@ -296,6 +297,10 @@
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
}
+
+ /* Check if there are APF page ready requests pending */
+ if (enabled)
+ kvm_make_request(KVM_REQ_APF_READY, apic->vcpu);
}
static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
@@ -986,6 +991,10 @@
*r = -1;
if (irq->shorthand == APIC_DEST_SELF) {
+ if (KVM_BUG_ON(!src, kvm)) {
+ *r = 0;
+ return true;
+ }
*r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
return true;
}
@@ -2106,10 +2115,9 @@
break;
case APIC_SELF_IPI:
- if (apic_x2apic_mode(apic)) {
- kvm_lapic_reg_write(apic, APIC_ICR,
- APIC_DEST_SELF | (val & APIC_VECTOR_MASK));
- } else
+ if (apic_x2apic_mode(apic))
+ kvm_apic_send_ipi(apic, APIC_DEST_SELF | (val & APIC_VECTOR_MASK), 0);
+ else
ret = 1;
break;
default:
@@ -2227,10 +2235,7 @@
void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8)
{
- struct kvm_lapic *apic = vcpu->arch.apic;
-
- apic_set_tpr(apic, ((cr8 & 0x0f) << 4)
- | (kvm_lapic_get_reg(apic, APIC_TASKPRI) & 4));
+ apic_set_tpr(vcpu->arch.apic, (cr8 & 0x0f) << 4);
}
u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu)
@@ -2263,6 +2268,8 @@
if (value & MSR_IA32_APICBASE_ENABLE) {
kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_key_slow_dec_deferred(&apic_hw_disabled);
+ /* Check if there are APF page ready requests pending */
+ kvm_make_request(KVM_REQ_APF_READY, vcpu);
} else {
static_key_slow_inc(&apic_hw_disabled.key);
atomic_set_release(&apic->vcpu->kvm->arch.apic_map_dirty, DIRTY);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 20d29ae..13bf319 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3140,6 +3140,8 @@
return;
sp = to_shadow_page(*root_hpa & PT64_BASE_ADDR_MASK);
+ if (WARN_ON(!sp))
+ return;
if (kvm_mmu_put_root(kvm, sp)) {
if (sp->tdp_mmu_page)
@@ -5176,14 +5178,16 @@
uint i;
if (pcid == kvm_get_active_pcid(vcpu)) {
- mmu->invlpg(vcpu, gva, mmu->root_hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->root_hpa);
tlb_flush = true;
}
for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd)) {
- mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
+ if (mmu->invlpg)
+ mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
tlb_flush = true;
}
}
@@ -5373,6 +5377,7 @@
{
struct kvm_mmu_page *sp, *node;
int nr_zapped, batch = 0;
+ bool unstable;
restart:
list_for_each_entry_safe_reverse(sp, node,
@@ -5404,11 +5409,12 @@
goto restart;
}
- if (__kvm_mmu_prepare_zap_page(kvm, sp,
- &kvm->arch.zapped_obsolete_pages, &nr_zapped)) {
- batch += nr_zapped;
+ unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
+ &kvm->arch.zapped_obsolete_pages, &nr_zapped);
+ batch += nr_zapped;
+
+ if (unstable)
goto restart;
- }
}
/*
@@ -5876,12 +5882,24 @@
return 0;
}
-int kvm_mmu_module_init(void)
+/*
+ * nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
+ * its default value of -1 is technically undefined behavior for a boolean.
+ */
+void __init kvm_mmu_x86_module_init(void)
{
- int ret = -ENOMEM;
-
if (nx_huge_pages == -1)
__set_nx_huge_pages(get_nx_auto_mode());
+}
+
+/*
+ * The bulk of the MMU initialization is deferred until the vendor module is
+ * loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
+ * to be reset when a potentially different vendor module is loaded.
+ */
+int kvm_mmu_vendor_module_init(void)
+{
+ int ret = -ENOMEM;
/*
* MMU roles use union aliasing which is, generally speaking, an
@@ -5955,7 +5973,7 @@
mmu_free_memory_caches(vcpu);
}
-void kvm_mmu_module_exit(void)
+void kvm_mmu_vendor_module_exit(void)
{
mmu_destroy_caches();
percpu_counter_destroy(&kvm_total_used_mmu_pages);
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index f882913..c6daeef 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -34,9 +34,8 @@
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgq"
#else
- #define CMPXCHG cmpxchg64
#define PT_MAX_FULL_LEVELS 2
#endif
#elif PTTYPE == 32
@@ -52,7 +51,7 @@
#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
- #define CMPXCHG cmpxchg
+ #define CMPXCHG "cmpxchgl"
#elif PTTYPE == PTTYPE_EPT
#define pt_element_t u64
#define guest_walker guest_walkerEPT
@@ -65,7 +64,9 @@
#define PT_GUEST_DIRTY_SHIFT 9
#define PT_GUEST_ACCESSED_SHIFT 8
#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
- #define CMPXCHG cmpxchg64
+ #ifdef CONFIG_X86_64
+ #define CMPXCHG "cmpxchgq"
+ #endif
#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#else
#error Invalid PTTYPE value
@@ -147,43 +148,39 @@
pt_element_t __user *ptep_user, unsigned index,
pt_element_t orig_pte, pt_element_t new_pte)
{
- int npages;
- pt_element_t ret;
- pt_element_t *table;
- struct page *page;
+ int r = -EFAULT;
- npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
- if (likely(npages == 1)) {
- table = kmap_atomic(page);
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- kunmap_atomic(table);
+ if (!user_access_begin(ptep_user, sizeof(pt_element_t)))
+ return -EFAULT;
- kvm_release_page_dirty(page);
- } else {
- struct vm_area_struct *vma;
- unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
- unsigned long pfn;
- unsigned long paddr;
+#ifdef CMPXCHG
+ asm volatile("1:" LOCK_PREFIX CMPXCHG " %[new], %[ptr]\n"
+ "mov $0, %[r]\n"
+ "setnz %b[r]\n"
+ "2:"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : [ptr] "+m" (*ptep_user),
+ [old] "+a" (orig_pte),
+ [r] "+q" (r)
+ : [new] "r" (new_pte)
+ : "memory");
+#else
+ asm volatile("1:" LOCK_PREFIX "cmpxchg8b %[ptr]\n"
+ "movl $0, %[r]\n"
+ "jz 2f\n"
+ "incl %[r]\n"
+ "2:"
+ _ASM_EXTABLE_UA(1b, 2b)
+ : [ptr] "+m" (*ptep_user),
+ [old] "+A" (orig_pte),
+ [r] "+rm" (r)
+ : [new_lo] "b" ((u32)new_pte),
+ [new_hi] "c" ((u32)(new_pte >> 32))
+ : "memory");
+#endif
- mmap_read_lock(current->mm);
- vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
- if (!vma || !(vma->vm_flags & VM_PFNMAP)) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- paddr = pfn << PAGE_SHIFT;
- table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
- if (!table) {
- mmap_read_unlock(current->mm);
- return -EFAULT;
- }
- ret = CMPXCHG(&table[index], orig_pte, new_pte);
- memunmap(table);
- mmap_read_unlock(current->mm);
- }
-
- return (ret != orig_pte);
+ user_access_end();
+ return r;
}
static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index 7e08efb..073514b 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -902,6 +902,9 @@
if (tdp_mmu_iter_cond_resched(kvm, &iter, false))
continue;
+ if (!is_shadow_present_pte(iter.old_spte))
+ continue;
+
if (spte_ad_need_write_protect(iter.old_spte)) {
if (is_writable_pte(iter.old_spte))
new_spte = iter.old_spte & ~PT_WRITABLE_MASK;
diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index 2f83b5d..e5322a0 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -13,6 +13,8 @@
#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
+#include <linux/bsearch.h>
+#include <linux/sort.h>
#include <asm/perf_event.h>
#include "x86.h"
#include "cpuid.h"
@@ -168,13 +170,21 @@
return true;
}
+static int cmp_u64(const void *pa, const void *pb)
+{
+ u64 a = *(u64 *)pa;
+ u64 b = *(u64 *)pb;
+
+ return (a > b) - (a < b);
+}
+
void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel)
{
u64 config;
u32 type = PERF_TYPE_RAW;
struct kvm *kvm = pmc->vcpu->kvm;
struct kvm_pmu_event_filter *filter;
- int i;
+ struct kvm_pmu *pmu = vcpu_to_pmu(pmc->vcpu);
bool allow_event = true;
if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
@@ -189,16 +199,13 @@
filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
if (filter) {
- for (i = 0; i < filter->nevents; i++)
- if (filter->events[i] ==
- (eventsel & AMD64_RAW_EVENT_MASK_NB))
- break;
- if (filter->action == KVM_PMU_EVENT_ALLOW &&
- i == filter->nevents)
- allow_event = false;
- if (filter->action == KVM_PMU_EVENT_DENY &&
- i < filter->nevents)
- allow_event = false;
+ __u64 key = eventsel & AMD64_RAW_EVENT_MASK_NB;
+
+ if (bsearch(&key, filter->events, filter->nevents,
+ sizeof(__u64), cmp_u64))
+ allow_event = filter->action == KVM_PMU_EVENT_ALLOW;
+ else
+ allow_event = filter->action == KVM_PMU_EVENT_DENY;
}
if (!allow_event)
return;
@@ -214,7 +221,7 @@
}
if (type == PERF_TYPE_RAW)
- config = eventsel & AMD64_RAW_EVENT_MASK;
+ config = eventsel & pmu->raw_event_mask;
if (pmc->current_config == eventsel && pmc_resume_counter(pmc))
return;
@@ -507,6 +514,11 @@
/* Ensure nevents can't be changed between the user copies. */
*filter = tmp;
+ /*
+ * Sort the in-kernel list so that we can search it with bsearch.
+ */
+ sort(&filter->events, filter->nevents, sizeof(__u64), cmp_u64, NULL);
+
mutex_lock(&kvm->lock);
filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
mutex_is_locked(&kvm->lock));
diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c
index a8b5533..3e5cb74 100644
--- a/arch/x86/kvm/svm/avic.c
+++ b/arch/x86/kvm/svm/avic.c
@@ -806,7 +806,7 @@
{
struct kvm_kernel_irq_routing_entry *e;
struct kvm_irq_routing_table *irq_rt;
- int idx, ret = -EINVAL;
+ int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP))
@@ -817,7 +817,13 @@
idx = srcu_read_lock(&kvm->irq_srcu);
irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu);
- WARN_ON(guest_irq >= irq_rt->nr_rt_entries);
+
+ if (guest_irq >= irq_rt->nr_rt_entries ||
+ hlist_empty(&irq_rt->map[guest_irq])) {
+ pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n",
+ guest_irq, irq_rt->nr_rt_entries);
+ goto out;
+ }
hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) {
struct vcpu_data vcpu_info;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 23910e6..e7feaa7 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -1198,8 +1198,8 @@
return -EINVAL;
ret = -ENOMEM;
- ctl = kzalloc(sizeof(*ctl), GFP_KERNEL);
- save = kzalloc(sizeof(*save), GFP_KERNEL);
+ ctl = kzalloc(sizeof(*ctl), GFP_KERNEL_ACCOUNT);
+ save = kzalloc(sizeof(*save), GFP_KERNEL_ACCOUNT);
if (!ctl || !save)
goto out_free;
diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c
index 4e7093b..35da84f 100644
--- a/arch/x86/kvm/svm/pmu.c
+++ b/arch/x86/kvm/svm/pmu.c
@@ -44,6 +44,22 @@
[7] = { 0xd1, 0x00, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
};
+/* duplicated from amd_f17h_perfmon_event_map. */
+static struct kvm_event_hw_type_mapping amd_f17h_event_mapping[] = {
+ [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES },
+ [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS },
+ [2] = { 0x60, 0xff, PERF_COUNT_HW_CACHE_REFERENCES },
+ [3] = { 0x64, 0x09, PERF_COUNT_HW_CACHE_MISSES },
+ [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
+ [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
+ [6] = { 0x87, 0x02, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND },
+ [7] = { 0x87, 0x01, PERF_COUNT_HW_STALLED_CYCLES_BACKEND },
+};
+
+/* amd_pmc_perf_hw_id depends on these being the same size */
+static_assert(ARRAY_SIZE(amd_event_mapping) ==
+ ARRAY_SIZE(amd_f17h_event_mapping));
+
static unsigned int get_msr_base(struct kvm_pmu *pmu, enum pmu_type type)
{
struct kvm_vcpu *vcpu = pmu_to_vcpu(pmu);
@@ -128,19 +144,25 @@
static unsigned int amd_pmc_perf_hw_id(struct kvm_pmc *pmc)
{
+ struct kvm_event_hw_type_mapping *event_mapping;
u8 event_select = pmc->eventsel & ARCH_PERFMON_EVENTSEL_EVENT;
u8 unit_mask = (pmc->eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8;
int i;
+ if (guest_cpuid_family(pmc->vcpu) >= 0x17)
+ event_mapping = amd_f17h_event_mapping;
+ else
+ event_mapping = amd_event_mapping;
+
for (i = 0; i < ARRAY_SIZE(amd_event_mapping); i++)
- if (amd_event_mapping[i].eventsel == event_select
- && amd_event_mapping[i].unit_mask == unit_mask)
+ if (event_mapping[i].eventsel == event_select
+ && event_mapping[i].unit_mask == unit_mask)
break;
if (i == ARRAY_SIZE(amd_event_mapping))
return PERF_COUNT_HW_MAX;
- return amd_event_mapping[i].event_type;
+ return event_mapping[i].event_type;
}
/* return PERF_COUNT_HW_MAX as AMD doesn't have fixed events */
@@ -253,12 +275,10 @@
/* MSR_EVNTSELn */
pmc = get_gp_pmc_amd(pmu, msr, PMU_TYPE_EVNTSEL);
if (pmc) {
- if (data == pmc->eventsel)
- return 0;
- if (!(data & pmu->reserved_bits)) {
+ data &= ~pmu->reserved_bits;
+ if (data != pmc->eventsel)
reprogram_gp_counter(pmc, data);
- return 0;
- }
+ return 0;
}
return 1;
@@ -275,6 +295,7 @@
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << 48) - 1;
pmu->reserved_bits = 0xfffffff000280000ull;
+ pmu->raw_event_mask = AMD64_RAW_EVENT_MASK;
pmu->version = 1;
/* not applicable to AMD; but clean them to prevent any fall out */
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c
index 6c82ef2..c2b3499 100644
--- a/arch/x86/kvm/svm/sev.c
+++ b/arch/x86/kvm/svm/sev.c
@@ -537,7 +537,7 @@
}
ret = -ENOMEM;
- blob = kmalloc(params.len, GFP_KERNEL);
+ blob = kzalloc(params.len, GFP_KERNEL_ACCOUNT);
if (!blob)
goto e_free;
@@ -676,7 +676,7 @@
if (!IS_ALIGNED(dst_paddr, 16) ||
!IS_ALIGNED(paddr, 16) ||
!IS_ALIGNED(size, 16)) {
- tpage = (void *)alloc_page(GFP_KERNEL);
+ tpage = (void *)alloc_page(GFP_KERNEL | __GFP_ZERO);
if (!tpage)
return -ENOMEM;
@@ -1177,6 +1177,14 @@
sev_flush_asids();
}
+void sev_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (!sev_guest(kvm))
+ return;
+
+ wbinvd_on_all_cpus();
+}
+
void pre_sev_run(struct vcpu_svm *svm, int cpu)
{
struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c
index 7773a76..c34ba03 100644
--- a/arch/x86/kvm/svm/svm.c
+++ b/arch/x86/kvm/svm/svm.c
@@ -305,12 +305,6 @@
return 0;
}
-static int is_external_interrupt(u32 info)
-{
- info &= SVM_EVTINJ_TYPE_MASK | SVM_EVTINJ_VALID;
- return info == (SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_INTR);
-}
-
static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
@@ -1357,6 +1351,7 @@
*/
svm_clear_current_vmcb(svm->vmcb);
+ svm_leave_nested(vcpu);
svm_free_nested(svm);
__free_page(pfn_to_page(__sme_clr(svm->vmcb_pa) >> PAGE_SHIFT));
@@ -1692,14 +1687,16 @@
update_cr0_intercept(svm);
}
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ return true;
+}
+
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE;
unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
- if (cr4 & X86_CR4_VMXE)
- return 1;
-
if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
svm_flush_tlb(vcpu);
@@ -1709,7 +1706,6 @@
cr4 |= host_cr4_mce;
to_svm(vcpu)->vmcb->save.cr4 = cr4;
vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
- return 0;
}
static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -2474,9 +2470,9 @@
msr->data = 0;
switch (msr->index) {
- case MSR_F10H_DECFG:
- if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC))
- msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE;
+ case MSR_AMD64_DE_CFG:
+ if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC))
+ msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE;
break;
case MSR_IA32_PERF_CAPABILITIES:
return 0;
@@ -2583,7 +2579,7 @@
msr_info->data = 0x1E;
}
break;
- case MSR_F10H_DECFG:
+ case MSR_AMD64_DE_CFG:
msr_info->data = svm->msr_decfg;
break;
default:
@@ -2763,7 +2759,7 @@
case MSR_VM_IGNNE:
vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data);
break;
- case MSR_F10H_DECFG: {
+ case MSR_AMD64_DE_CFG: {
struct kvm_msr_entry msr_entry;
msr_entry.index = msr->index;
@@ -3114,15 +3110,6 @@
return 0;
}
- if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
- exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
- exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
- exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
- printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
- "exit_code 0x%x\n",
- __func__, svm->vmcb->control.exit_int_info,
- exit_code);
-
if (exit_fastpath != EXIT_FASTPATH_NONE)
return 1;
@@ -3188,8 +3175,6 @@
{
struct vcpu_svm *svm = to_svm(vcpu);
- BUG_ON(!(gif_set(svm)));
-
trace_kvm_inj_virq(vcpu->arch.interrupt.nr);
++vcpu->stat.irq_injections;
@@ -4243,6 +4228,7 @@
.get_cpl = svm_get_cpl,
.get_cs_db_l_bits = kvm_get_cs_db_l_bits,
.set_cr0 = svm_set_cr0,
+ .is_valid_cr4 = svm_is_valid_cr4,
.set_cr4 = svm_set_cr4,
.set_efer = svm_set_efer,
.get_idt = svm_get_idt,
@@ -4325,6 +4311,7 @@
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
+ .guest_memory_reclaimed = sev_guest_memory_reclaimed,
.can_emulate_instruction = svm_can_emulate_instruction,
diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h
index 2c00724..f62d13f 100644
--- a/arch/x86/kvm/svm/svm.h
+++ b/arch/x86/kvm/svm/svm.h
@@ -355,7 +355,7 @@
int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void svm_flush_tlb(struct kvm_vcpu *vcpu);
void disable_nmi_singlestep(struct vcpu_svm *svm);
bool svm_smi_blocked(struct kvm_vcpu *vcpu);
@@ -491,6 +491,8 @@
struct kvm_enc_region *range);
int svm_unregister_enc_region(struct kvm *kvm,
struct kvm_enc_region *range);
+void sev_guest_memory_reclaimed(struct kvm *kvm);
+
void pre_sev_run(struct vcpu_svm *svm, int cpu);
int __init sev_hardware_setup(void);
void sev_hardware_teardown(void);
diff --git a/arch/x86/kvm/svm/vmenter.S b/arch/x86/kvm/svm/vmenter.S
index 1ec1ac4..c18d812 100644
--- a/arch/x86/kvm/svm/vmenter.S
+++ b/arch/x86/kvm/svm/vmenter.S
@@ -129,6 +129,15 @@
#endif
/*
+ * Mitigate RETBleed for AMD/Hygon Zen uarch. RET should be
+ * untrained as soon as we exit the VM and are back to the
+ * kernel. This should be done before re-enabling interrupts
+ * because interrupt handlers won't sanitize 'ret' if the return is
+ * from the kernel.
+ */
+ UNTRAIN_RET
+
+ /*
* Clear all general purpose registers except RSP and RAX to prevent
* speculative use of the guest's values, even those that are reloaded
* via the stack. In theory, an L1 cache miss when restoring registers
@@ -166,5 +175,5 @@
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(__svm_vcpu_run)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a2835d7..3d4988e 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -304,25 +304,29 @@
* Tracepoint for kvm interrupt injection:
*/
TRACE_EVENT(kvm_inj_exception,
- TP_PROTO(unsigned exception, bool has_error, unsigned error_code),
- TP_ARGS(exception, has_error, error_code),
+ TP_PROTO(unsigned exception, bool has_error, unsigned error_code,
+ bool reinjected),
+ TP_ARGS(exception, has_error, error_code, reinjected),
TP_STRUCT__entry(
__field( u8, exception )
__field( u8, has_error )
__field( u32, error_code )
+ __field( bool, reinjected )
),
TP_fast_assign(
__entry->exception = exception;
__entry->has_error = has_error;
__entry->error_code = error_code;
+ __entry->reinjected = reinjected;
),
- TP_printk("%s (0x%x)",
+ TP_printk("%s (0x%x)%s",
__print_symbolic(__entry->exception, kvm_trace_sym_exc),
/* FIXME: don't print error_code if not present */
- __entry->has_error ? __entry->error_code : 0)
+ __entry->has_error ? __entry->error_code : 0,
+ __entry->reinjected ? " [reinjected]" : "")
);
/*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0c2389d..498fed0 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -12,6 +12,7 @@
#include "nested.h"
#include "pmu.h"
#include "trace.h"
+#include "vmx.h"
#include "x86.h"
static bool __read_mostly enable_shadow_vmcs = 1;
@@ -1244,7 +1245,7 @@
BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) |
/* reserved */
BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56);
- u64 vmx_basic = vmx->nested.msrs.basic;
+ u64 vmx_basic = vmcs_config.nested.basic;
if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved))
return -EINVAL;
@@ -1267,36 +1268,42 @@
return 0;
}
-static int
-vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+static void vmx_get_control_msr(struct nested_vmx_msrs *msrs, u32 msr_index,
+ u32 **low, u32 **high)
{
- u64 supported;
- u32 *lowp, *highp;
-
switch (msr_index) {
case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
- lowp = &vmx->nested.msrs.pinbased_ctls_low;
- highp = &vmx->nested.msrs.pinbased_ctls_high;
+ *low = &msrs->pinbased_ctls_low;
+ *high = &msrs->pinbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
- lowp = &vmx->nested.msrs.procbased_ctls_low;
- highp = &vmx->nested.msrs.procbased_ctls_high;
+ *low = &msrs->procbased_ctls_low;
+ *high = &msrs->procbased_ctls_high;
break;
case MSR_IA32_VMX_TRUE_EXIT_CTLS:
- lowp = &vmx->nested.msrs.exit_ctls_low;
- highp = &vmx->nested.msrs.exit_ctls_high;
+ *low = &msrs->exit_ctls_low;
+ *high = &msrs->exit_ctls_high;
break;
case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
- lowp = &vmx->nested.msrs.entry_ctls_low;
- highp = &vmx->nested.msrs.entry_ctls_high;
+ *low = &msrs->entry_ctls_low;
+ *high = &msrs->entry_ctls_high;
break;
case MSR_IA32_VMX_PROCBASED_CTLS2:
- lowp = &vmx->nested.msrs.secondary_ctls_low;
- highp = &vmx->nested.msrs.secondary_ctls_high;
+ *low = &msrs->secondary_ctls_low;
+ *high = &msrs->secondary_ctls_high;
break;
default:
BUG();
}
+}
+
+static int
+vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ u32 *lowp, *highp;
+ u64 supported;
+
+ vmx_get_control_msr(&vmcs_config.nested, msr_index, &lowp, &highp);
supported = vmx_control_msr(*lowp, *highp);
@@ -1308,6 +1315,7 @@
if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32)))
return -EINVAL;
+ vmx_get_control_msr(&vmx->nested.msrs, msr_index, &lowp, &highp);
*lowp = data;
*highp = data >> 32;
return 0;
@@ -1321,10 +1329,8 @@
BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) |
/* reserved */
GENMASK_ULL(13, 9) | BIT_ULL(31);
- u64 vmx_misc;
-
- vmx_misc = vmx_control_msr(vmx->nested.msrs.misc_low,
- vmx->nested.msrs.misc_high);
+ u64 vmx_misc = vmx_control_msr(vmcs_config.nested.misc_low,
+ vmcs_config.nested.misc_high);
if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits))
return -EINVAL;
@@ -1352,10 +1358,8 @@
static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data)
{
- u64 vmx_ept_vpid_cap;
-
- vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.msrs.ept_caps,
- vmx->nested.msrs.vpid_caps);
+ u64 vmx_ept_vpid_cap = vmx_control_msr(vmcs_config.nested.ept_caps,
+ vmcs_config.nested.vpid_caps);
/* Every bit is either reserved or a feature bit. */
if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL))
@@ -1366,20 +1370,21 @@
return 0;
}
-static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+static u64 *vmx_get_fixed0_msr(struct nested_vmx_msrs *msrs, u32 msr_index)
{
- u64 *msr;
-
switch (msr_index) {
case MSR_IA32_VMX_CR0_FIXED0:
- msr = &vmx->nested.msrs.cr0_fixed0;
- break;
+ return &msrs->cr0_fixed0;
case MSR_IA32_VMX_CR4_FIXED0:
- msr = &vmx->nested.msrs.cr4_fixed0;
- break;
+ return &msrs->cr4_fixed0;
default:
BUG();
}
+}
+
+static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data)
+{
+ const u64 *msr = vmx_get_fixed0_msr(&vmcs_config.nested, msr_index);
/*
* 1 bits (which indicates bits which "must-be-1" during VMX operation)
@@ -1388,7 +1393,7 @@
if (!is_bitwise_subset(data, *msr, -1ULL))
return -EINVAL;
- *msr = data;
+ *vmx_get_fixed0_msr(&vmx->nested.msrs, msr_index) = data;
return 0;
}
@@ -1449,7 +1454,7 @@
vmx->nested.msrs.vmcs_enum = data;
return 0;
case MSR_IA32_VMX_VMFUNC:
- if (data & ~vmx->nested.msrs.vmfunc_controls)
+ if (data & ~vmcs_config.nested.vmfunc_controls)
return -EINVAL;
vmx->nested.msrs.vmfunc_controls = data;
return 0;
@@ -2227,7 +2232,8 @@
}
}
-static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12)
+static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01,
+ struct vmcs12 *vmcs12)
{
u32 exec_control, vmcs12_exec_ctrl;
u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12);
@@ -2238,7 +2244,7 @@
/*
* PIN CONTROLS
*/
- exec_control = vmx_pin_based_exec_ctrl(vmx);
+ exec_control = __pin_controls_get(vmcs01);
exec_control |= (vmcs12->pin_based_vm_exec_control &
~PIN_BASED_VMX_PREEMPTION_TIMER);
@@ -2253,7 +2259,7 @@
/*
* EXEC CONTROLS
*/
- exec_control = vmx_exec_control(vmx); /* L0's desires */
+ exec_control = __exec_controls_get(vmcs01); /* L0's desires */
exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING;
exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING;
exec_control &= ~CPU_BASED_TPR_SHADOW;
@@ -2290,17 +2296,20 @@
* SECONDARY EXEC CONTROLS
*/
if (cpu_has_secondary_exec_ctrls()) {
- exec_control = vmx->secondary_exec_control;
+ exec_control = __secondary_exec_controls_get(vmcs01);
/* Take the following fields only from vmcs12 */
exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
+ SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE |
SECONDARY_EXEC_ENABLE_INVPCID |
SECONDARY_EXEC_ENABLE_RDTSCP |
SECONDARY_EXEC_XSAVES |
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE |
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
SECONDARY_EXEC_APIC_REGISTER_VIRT |
- SECONDARY_EXEC_ENABLE_VMFUNC);
+ SECONDARY_EXEC_ENABLE_VMFUNC |
+ SECONDARY_EXEC_DESC);
+
if (nested_cpu_has(vmcs12,
CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) {
vmcs12_exec_ctrl = vmcs12->secondary_vm_exec_control &
@@ -2336,9 +2345,15 @@
* are emulated by vmx_set_efer() in prepare_vmcs02(), but speculate
* on the related bits (if supported by the CPU) in the hope that
* we can avoid VMWrites during vmx_set_efer().
+ *
+ * Similarly, take vmcs01's PERF_GLOBAL_CTRL in the hope that if KVM is
+ * loading PERF_GLOBAL_CTRL via the VMCS for L1, then KVM will want to
+ * do the same for L2.
*/
- exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) &
- ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER;
+ exec_control = __vm_entry_controls_get(vmcs01);
+ exec_control |= (vmcs12->vm_entry_controls &
+ ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL);
+ exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER);
if (cpu_has_load_ia32_efer()) {
if (guest_efer & EFER_LMA)
exec_control |= VM_ENTRY_IA32E_MODE;
@@ -2354,9 +2369,11 @@
* we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER
* bits may be modified by vmx_set_efer() in prepare_vmcs02().
*/
- exec_control = vmx_vmexit_ctrl();
+ exec_control = __vm_exit_controls_get(vmcs01);
if (cpu_has_load_ia32_efer() && guest_efer != host_efer)
exec_control |= VM_EXIT_LOAD_IA32_EFER;
+ else
+ exec_control &= ~VM_EXIT_LOAD_IA32_EFER;
vm_exit_controls_set(vmx, exec_control);
/*
@@ -3075,35 +3092,8 @@
vmx->loaded_vmcs->host_state.cr4 = cr4;
}
- asm(
- "sub $%c[wordsize], %%" _ASM_SP "\n\t" /* temporarily adjust RSP for CALL */
- "cmp %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "je 1f \n\t"
- __ex("vmwrite %%" _ASM_SP ", %[HOST_RSP]") "\n\t"
- "mov %%" _ASM_SP ", %c[host_state_rsp](%[loaded_vmcs]) \n\t"
- "1: \n\t"
- "add $%c[wordsize], %%" _ASM_SP "\n\t" /* un-adjust RSP */
-
- /* Check if vmlaunch or vmresume is needed */
- "cmpb $0, %c[launched](%[loaded_vmcs])\n\t"
-
- /*
- * VMLAUNCH and VMRESUME clear RFLAGS.{CF,ZF} on VM-Exit, set
- * RFLAGS.CF on VM-Fail Invalid and set RFLAGS.ZF on VM-Fail
- * Valid. vmx_vmenter() directly "returns" RFLAGS, and so the
- * results of VM-Enter is captured via CC_{SET,OUT} to vm_fail.
- */
- "call vmx_vmenter\n\t"
-
- CC_SET(be)
- : ASM_CALL_CONSTRAINT, CC_OUT(be) (vm_fail)
- : [HOST_RSP]"r"((unsigned long)HOST_RSP),
- [loaded_vmcs]"r"(vmx->loaded_vmcs),
- [launched]"i"(offsetof(struct loaded_vmcs, launched)),
- [host_state_rsp]"i"(offsetof(struct loaded_vmcs, host_state.rsp)),
- [wordsize]"i"(sizeof(ulong))
- : "memory"
- );
+ vm_fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
+ __vmx_vcpu_run_flags(vmx));
if (vmx->msr_autoload.host.nr)
vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.host.nr);
@@ -3363,10 +3353,12 @@
if (likely(!evaluate_pending_interrupts) && kvm_vcpu_apicv_active(vcpu))
evaluate_pending_interrupts |= vmx_has_apicv_interrupt(vcpu);
- if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
+ if (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
if (kvm_mpx_supported() &&
- !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS))
+ (!vmx->nested.nested_run_pending ||
+ !(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_BNDCFGS)))
vmx->nested.vmcs01_guest_bndcfgs = vmcs_read64(GUEST_BNDCFGS);
/*
@@ -3390,7 +3382,7 @@
vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02);
- prepare_vmcs02_early(vmx, vmcs12);
+ prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12);
if (from_vmentry) {
if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
@@ -3668,12 +3660,34 @@
}
static void vmcs12_save_pending_event(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+ struct vmcs12 *vmcs12,
+ u32 vm_exit_reason, u32 exit_intr_info)
{
u32 idt_vectoring;
unsigned int nr;
- if (vcpu->arch.exception.injected) {
+ /*
+ * Per the SDM, VM-Exits due to double and triple faults are never
+ * considered to occur during event delivery, even if the double/triple
+ * fault is the result of an escalating vectoring issue.
+ *
+ * Note, the SDM qualifies the double fault behavior with "The original
+ * event results in a double-fault exception". It's unclear why the
+ * qualification exists since exits due to double fault can occur only
+ * while vectoring a different exception (injected events are never
+ * subject to interception), i.e. there's _always_ an original event.
+ *
+ * The SDM also uses NMI as a confusing example for the "original event
+ * causes the VM exit directly" clause. NMI isn't special in any way,
+ * the same rule applies to all events that cause an exit directly.
+ * NMI is an odd choice for the example because NMIs can only occur on
+ * instruction boundaries, i.e. they _can't_ occur during vectoring.
+ */
+ if ((u16)vm_exit_reason == EXIT_REASON_TRIPLE_FAULT ||
+ ((u16)vm_exit_reason == EXIT_REASON_EXCEPTION_NMI &&
+ is_double_fault(exit_intr_info))) {
+ vmcs12->idt_vectoring_info_field = 0;
+ } else if (vcpu->arch.exception.injected) {
nr = vcpu->arch.exception.nr;
idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
@@ -3706,6 +3720,8 @@
idt_vectoring |= INTR_TYPE_EXT_INTR;
vmcs12->idt_vectoring_info_field = idt_vectoring;
+ } else {
+ vmcs12->idt_vectoring_info_field = 0;
}
}
@@ -3772,7 +3788,16 @@
u32 intr_info = nr | INTR_INFO_VALID_MASK;
if (vcpu->arch.exception.has_error_code) {
- vmcs12->vm_exit_intr_error_code = vcpu->arch.exception.error_code;
+ /*
+ * Intel CPUs do not generate error codes with bits 31:16 set,
+ * and more importantly VMX disallows setting bits 31:16 in the
+ * injected error code for VM-Entry. Drop the bits to mimic
+ * hardware and avoid inducing failure on nested VM-Entry if L1
+ * chooses to inject the exception back to L2. AMD CPUs _do_
+ * generate "full" 32-bit error codes, so KVM allows userspace
+ * to inject exception error codes with bits 31:16 set.
+ */
+ vmcs12->vm_exit_intr_error_code = (u16)vcpu->arch.exception.error_code;
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
@@ -4143,12 +4168,12 @@
/* update exit information fields: */
vmcs12->vm_exit_reason = vm_exit_reason;
vmcs12->exit_qualification = exit_qualification;
- vmcs12->vm_exit_intr_info = exit_intr_info;
- vmcs12->idt_vectoring_info_field = 0;
- vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
- vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
-
+ /*
+ * On VM-Exit due to a failed VM-Entry, the VMCS isn't marked launched
+ * and only EXIT_REASON and EXIT_QUALIFICATION are updated, all other
+ * exit info fields are unmodified.
+ */
if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
vmcs12->launch_state = 1;
@@ -4160,7 +4185,12 @@
* Transfer the event that L0 or L1 may wanted to inject into
* L2 to IDT_VECTORING_INFO_FIELD.
*/
- vmcs12_save_pending_event(vcpu, vmcs12);
+ vmcs12_save_pending_event(vcpu, vmcs12,
+ vm_exit_reason, exit_intr_info);
+
+ vmcs12->vm_exit_intr_info = exit_intr_info;
+ vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
/*
* According to spec, there's no need to store the guest's
@@ -4174,14 +4204,6 @@
nested_vmx_abort(vcpu,
VMX_ABORT_SAVE_GUEST_MSR_FAIL);
}
-
- /*
- * Drop what we picked up for L2 via vmx_complete_interrupts. It is
- * preserved above and would only end up incorrectly in L1.
- */
- vcpu->arch.nmi_injected = false;
- kvm_clear_exception_queue(vcpu);
- kvm_clear_interrupt_queue(vcpu);
}
/*
@@ -4521,6 +4543,17 @@
WARN_ON_ONCE(nested_early_check);
}
+ /*
+ * Drop events/exceptions that were queued for re-injection to L2
+ * (picked up via vmx_complete_interrupts()), as well as exceptions
+ * that were pending for L2. Note, this must NOT be hoisted above
+ * prepare_vmcs12(), events/exceptions queued for re-injection need to
+ * be captured in vmcs12 (see vmcs12_save_pending_event()).
+ */
+ vcpu->arch.nmi_injected = false;
+ kvm_clear_exception_queue(vcpu);
+ kvm_clear_interrupt_queue(vcpu);
+
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
/* Update any VMCS fields that might have changed while L2 ran */
@@ -4868,20 +4901,25 @@
| FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
/*
- * The Intel VMX Instruction Reference lists a bunch of bits that are
- * prerequisite to running VMXON, most notably cr4.VMXE must be set to
- * 1 (see vmx_set_cr4() for when we allow the guest to set this).
- * Otherwise, we should fail with #UD. But most faulting conditions
- * have already been checked by hardware, prior to the VM-exit for
- * VMXON. We do test guest cr4.VMXE because processor CR4 always has
- * that bit set to 1 in non-root mode.
+ * Note, KVM cannot rely on hardware to perform the CR0/CR4 #UD checks
+ * that have higher priority than VM-Exit (see Intel SDM's pseudocode
+ * for VMXON), as KVM must load valid CR0/CR4 values into hardware while
+ * running the guest, i.e. KVM needs to check the _guest_ values.
+ *
+ * Rely on hardware for the other two pre-VM-Exit checks, !VM86 and
+ * !COMPATIBILITY modes. KVM may run the guest in VM86 to emulate Real
+ * Mode, but KVM will never take the guest out of those modes.
*/
- if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) {
+ if (!nested_host_cr0_valid(vcpu, kvm_read_cr0(vcpu)) ||
+ !nested_host_cr4_valid(vcpu, kvm_read_cr4(vcpu))) {
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- /* CPL=0 must be checked manually. */
+ /*
+ * CPL=0 and all other checks that are lower priority than VM-Exit must
+ * be checked manually.
+ */
if (vmx_get_cpl(vcpu)) {
kvm_inject_gp(vcpu, 0);
return 1;
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index bd70c1d..f938fc9 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -104,6 +104,9 @@
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
+ if (pmu->version < 2)
+ return true;
+
return test_bit(pmc->idx, (unsigned long *)&pmu->global_ctrl);
}
@@ -153,12 +156,17 @@
return &counters[array_index_nospec(idx, num_counters)];
}
-static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+static inline u64 vcpu_get_perf_capabilities(struct kvm_vcpu *vcpu)
{
if (!guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- return false;
+ return 0;
- return vcpu->arch.perf_capabilities & PMU_CAP_FW_WRITES;
+ return vcpu->arch.perf_capabilities;
+}
+
+static inline bool fw_writes_is_enabled(struct kvm_vcpu *vcpu)
+{
+ return (vcpu_get_perf_capabilities(vcpu) & PMU_CAP_FW_WRITES) != 0;
}
static inline struct kvm_pmc *get_fw_gp_pmc(struct kvm_pmu *pmu, u32 msr)
@@ -254,7 +262,7 @@
case MSR_CORE_PERF_FIXED_CTR_CTRL:
if (pmu->fixed_ctr_ctrl == data)
return 0;
- if (!(data & 0xfffffffffffff444ull)) {
+ if (!(data & pmu->fixed_ctr_ctrl_mask)) {
reprogram_fixed_counters(pmu, data);
return 0;
}
@@ -321,6 +329,7 @@
struct kvm_cpuid_entry2 *entry;
union cpuid10_eax eax;
union cpuid10_edx edx;
+ int i;
pmu->nr_arch_gp_counters = 0;
pmu->nr_arch_fixed_counters = 0;
@@ -328,7 +337,10 @@
pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
pmu->version = 0;
pmu->reserved_bits = 0xffffffff00200000ull;
- vcpu->arch.perf_capabilities = 0;
+ pmu->raw_event_mask = X86_RAW_EVENT_MASK;
+ pmu->global_ctrl_mask = ~0ull;
+ pmu->global_ovf_ctrl_mask = ~0ull;
+ pmu->fixed_ctr_ctrl_mask = ~0ull;
entry = kvm_find_cpuid_entry(vcpu, 0xa, 0);
if (!entry)
@@ -341,8 +353,6 @@
return;
perf_get_x86_pmu_capability(&x86_pmu);
- if (guest_cpuid_has(vcpu, X86_FEATURE_PDCM))
- vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
@@ -364,6 +374,8 @@
((u64)1 << edx.split.bit_width_fixed) - 1;
}
+ for (i = 0; i < pmu->nr_arch_fixed_counters; i++)
+ pmu->fixed_ctr_ctrl_mask &= ~(0xbull << (i * 4));
pmu->global_ctrl = ((1ull << pmu->nr_arch_gp_counters) - 1) |
(((1ull << pmu->nr_arch_fixed_counters) - 1) << INTEL_PMC_IDX_FIXED);
pmu->global_ctrl_mask = ~pmu->global_ctrl;
@@ -406,6 +418,8 @@
pmu->fixed_counters[i].idx = i + INTEL_PMC_IDX_FIXED;
pmu->fixed_counters[i].current_config = 0;
}
+
+ vcpu->arch.perf_capabilities = vmx_get_perf_capabilities();
}
static void intel_pmu_reset(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/run_flags.h b/arch/x86/kvm/vmx/run_flags.h
new file mode 100644
index 0000000..edc3f16
--- /dev/null
+++ b/arch/x86/kvm/vmx/run_flags.h
@@ -0,0 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __KVM_X86_VMX_RUN_FLAGS_H
+#define __KVM_X86_VMX_RUN_FLAGS_H
+
+#define VMX_RUN_VMRESUME (1 << 0)
+#define VMX_RUN_SAVE_SPEC_CTRL (1 << 1)
+
+#endif /* __KVM_X86_VMX_RUN_FLAGS_H */
diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h
index 571d9ad..69c147d 100644
--- a/arch/x86/kvm/vmx/vmcs.h
+++ b/arch/x86/kvm/vmx/vmcs.h
@@ -102,6 +102,11 @@
return is_exception_n(intr_info, BP_VECTOR);
}
+static inline bool is_double_fault(u32 intr_info)
+{
+ return is_exception_n(intr_info, DF_VECTOR);
+}
+
static inline bool is_page_fault(u32 intr_info)
{
return is_exception_n(intr_info, PF_VECTOR);
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 90ad7a6..982138b 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -5,6 +5,7 @@
#include <asm/kvm_vcpu_regs.h>
#include <asm/nospec-branch.h>
#include <asm/segment.h>
+#include "run_flags.h"
#define WORD_SIZE (BITS_PER_LONG / 8)
@@ -31,72 +32,11 @@
.section .noinstr.text, "ax"
/**
- * vmx_vmenter - VM-Enter the current loaded VMCS
- *
- * %RFLAGS.ZF: !VMCS.LAUNCHED, i.e. controls VMLAUNCH vs. VMRESUME
- *
- * Returns:
- * %RFLAGS.CF is set on VM-Fail Invalid
- * %RFLAGS.ZF is set on VM-Fail Valid
- * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
- *
- * Note that VMRESUME/VMLAUNCH fall-through and return directly if
- * they VM-Fail, whereas a successful VM-Enter + VM-Exit will jump
- * to vmx_vmexit.
- */
-SYM_FUNC_START(vmx_vmenter)
- /* EFLAGS.ZF is set if VMCS.LAUNCHED == 0 */
- je 2f
-
-1: vmresume
- ret
-
-2: vmlaunch
- ret
-
-3: cmpb $0, kvm_rebooting
- je 4f
- ret
-4: ud2
-
- _ASM_EXTABLE(1b, 3b)
- _ASM_EXTABLE(2b, 3b)
-
-SYM_FUNC_END(vmx_vmenter)
-
-/**
- * vmx_vmexit - Handle a VMX VM-Exit
- *
- * Returns:
- * %RFLAGS.{CF,ZF} are cleared on VM-Success, i.e. VM-Exit
- *
- * This is vmx_vmenter's partner in crime. On a VM-Exit, control will jump
- * here after hardware loads the host's state, i.e. this is the destination
- * referred to by VMCS.HOST_RIP.
- */
-SYM_FUNC_START(vmx_vmexit)
-#ifdef CONFIG_RETPOLINE
- ALTERNATIVE "jmp .Lvmexit_skip_rsb", "", X86_FEATURE_RETPOLINE
- /* Preserve guest's RAX, it's used to stuff the RSB. */
- push %_ASM_AX
-
- /* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
- FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
-
- /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
- or $1, %_ASM_AX
-
- pop %_ASM_AX
-.Lvmexit_skip_rsb:
-#endif
- ret
-SYM_FUNC_END(vmx_vmexit)
-
-/**
* __vmx_vcpu_run - Run a vCPU via a transition to VMX guest mode
- * @vmx: struct vcpu_vmx * (forwarded to vmx_update_host_rsp)
+ * @vmx: struct vcpu_vmx *
* @regs: unsigned long * (to guest registers)
- * @launched: %true if the VMCS has been launched
+ * @flags: VMX_RUN_VMRESUME: use VMRESUME instead of VMLAUNCH
+ * VMX_RUN_SAVE_SPEC_CTRL: save guest SPEC_CTRL into vmx->spec_ctrl
*
* Returns:
* 0 on VM-Exit, 1 on VM-Fail
@@ -115,24 +55,29 @@
#endif
push %_ASM_BX
+ /* Save @vmx for SPEC_CTRL handling */
+ push %_ASM_ARG1
+
+ /* Save @flags for SPEC_CTRL handling */
+ push %_ASM_ARG3
+
/*
* Save @regs, _ASM_ARG2 may be modified by vmx_update_host_rsp() and
* @regs is needed after VM-Exit to save the guest's register values.
*/
push %_ASM_ARG2
- /* Copy @launched to BL, _ASM_ARG3 is volatile. */
+ /* Copy @flags to BL, _ASM_ARG3 is volatile. */
mov %_ASM_ARG3B, %bl
- /* Adjust RSP to account for the CALL to vmx_vmenter(). */
- lea -WORD_SIZE(%_ASM_SP), %_ASM_ARG2
+ lea (%_ASM_SP), %_ASM_ARG2
call vmx_update_host_rsp
/* Load @regs to RAX. */
mov (%_ASM_SP), %_ASM_AX
/* Check if vmlaunch or vmresume is needed */
- cmpb $0, %bl
+ testb $VMX_RUN_VMRESUME, %bl
/* Load guest registers. Don't clobber flags. */
mov VCPU_RCX(%_ASM_AX), %_ASM_CX
@@ -154,11 +99,36 @@
/* Load guest RAX. This kills the @regs pointer! */
mov VCPU_RAX(%_ASM_AX), %_ASM_AX
- /* Enter guest mode */
- call vmx_vmenter
+ /* Check EFLAGS.ZF from 'testb' above */
+ jz .Lvmlaunch
- /* Jump on VM-Fail. */
- jbe 2f
+ /*
+ * After a successful VMRESUME/VMLAUNCH, control flow "magically"
+ * resumes below at 'vmx_vmexit' due to the VMCS HOST_RIP setting.
+ * So this isn't a typical function and objtool needs to be told to
+ * save the unwind state here and restore it below.
+ */
+ UNWIND_HINT_SAVE
+
+/*
+ * If VMRESUME/VMLAUNCH and corresponding vmexit succeed, execution resumes at
+ * the 'vmx_vmexit' label below.
+ */
+.Lvmresume:
+ vmresume
+ jmp .Lvmfail
+
+.Lvmlaunch:
+ vmlaunch
+ jmp .Lvmfail
+
+ _ASM_EXTABLE(.Lvmresume, .Lfixup)
+ _ASM_EXTABLE(.Lvmlaunch, .Lfixup)
+
+SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL)
+
+ /* Restore unwind state from before the VMRESUME/VMLAUNCH. */
+ UNWIND_HINT_RESTORE
/* Temporarily save guest's RAX. */
push %_ASM_AX
@@ -185,21 +155,23 @@
mov %r15, VCPU_R15(%_ASM_AX)
#endif
- /* Clear RAX to indicate VM-Exit (as opposed to VM-Fail). */
- xor %eax, %eax
+ /* Clear return value to indicate VM-Exit (as opposed to VM-Fail). */
+ xor %ebx, %ebx
+.Lclear_regs:
/*
- * Clear all general purpose registers except RSP and RAX to prevent
+ * Clear all general purpose registers except RSP and RBX to prevent
* speculative use of the guest's values, even those that are reloaded
* via the stack. In theory, an L1 cache miss when restoring registers
* could lead to speculative execution with the guest's values.
* Zeroing XORs are dirt cheap, i.e. the extra paranoia is essentially
* free. RSP and RAX are exempt as RSP is restored by hardware during
- * VM-Exit and RAX is explicitly loaded with 0 or 1 to return VM-Fail.
+ * VM-Exit and RBX is explicitly loaded with 0 or 1 to hold the return
+ * value.
*/
-1: xor %ecx, %ecx
+ xor %eax, %eax
+ xor %ecx, %ecx
xor %edx, %edx
- xor %ebx, %ebx
xor %ebp, %ebp
xor %esi, %esi
xor %edi, %edi
@@ -216,8 +188,32 @@
/* "POP" @regs. */
add $WORD_SIZE, %_ASM_SP
- pop %_ASM_BX
+ /*
+ * IMPORTANT: RSB filling and SPEC_CTRL handling must be done before
+ * the first unbalanced RET after vmexit!
+ *
+ * For retpoline or IBRS, RSB filling is needed to prevent poisoned RSB
+ * entries and (in some cases) RSB underflow.
+ *
+ * eIBRS has its own protection against poisoned RSB, so it doesn't
+ * need the RSB filling sequence. But it does need to be enabled, and a
+ * single call to retire, before the first unbalanced RET.
+ */
+
+ FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\
+ X86_FEATURE_RSB_VMEXIT_LITE
+
+
+ pop %_ASM_ARG2 /* @flags */
+ pop %_ASM_ARG1 /* @vmx */
+
+ call vmx_spec_ctrl_restore_host
+
+ /* Put return value in AX */
+ mov %_ASM_BX, %_ASM_AX
+
+ pop %_ASM_BX
#ifdef CONFIG_X86_64
pop %r12
pop %r13
@@ -228,11 +224,17 @@
pop %edi
#endif
pop %_ASM_BP
- ret
+ RET
- /* VM-Fail. Out-of-line to avoid a taken Jcc after VM-Exit. */
-2: mov $1, %eax
- jmp 1b
+.Lfixup:
+ cmpb $0, kvm_rebooting
+ jne .Lvmfail
+ ud2
+.Lvmfail:
+ /* VM-Fail: set return value to 1 */
+ mov $1, %_ASM_BX
+ jmp .Lclear_regs
+
SYM_FUNC_END(__vmx_vcpu_run)
@@ -293,7 +295,7 @@
pop %_ASM_AX
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmread_error_trampoline)
SYM_FUNC_START(vmx_do_interrupt_nmi_irqoff)
@@ -326,5 +328,5 @@
*/
mov %_ASM_BP, %_ASM_SP
pop %_ASM_BP
- ret
+ RET
SYM_FUNC_END(vmx_do_interrupt_nmi_irqoff)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 94f5f21..af6742d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -226,6 +226,9 @@
#define L1D_CACHE_ORDER 4
static void *vmx_l1d_flush_pages;
+/* Control for disabling CPU Fill buffer clear */
+static bool __read_mostly vmx_fb_clear_ctrl_available;
+
static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
{
struct page *page;
@@ -357,6 +360,60 @@
return sprintf(s, "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
}
+static void vmx_setup_fb_clear_ctrl(void)
+{
+ u64 msr;
+
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES) &&
+ !boot_cpu_has_bug(X86_BUG_MDS) &&
+ !boot_cpu_has_bug(X86_BUG_TAA)) {
+ rdmsrl(MSR_IA32_ARCH_CAPABILITIES, msr);
+ if (msr & ARCH_CAP_FB_CLEAR_CTRL)
+ vmx_fb_clear_ctrl_available = true;
+ }
+}
+
+static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
+{
+ u64 msr;
+
+ if (!vmx->disable_fb_clear)
+ return;
+
+ msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
+ msr |= FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
+ /* Cache the MSR value to avoid reading it later */
+ vmx->msr_ia32_mcu_opt_ctrl = msr;
+}
+
+static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
+{
+ if (!vmx->disable_fb_clear)
+ return;
+
+ vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
+ native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
+}
+
+static void vmx_update_fb_clear_dis(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx)
+{
+ vmx->disable_fb_clear = vmx_fb_clear_ctrl_available;
+
+ /*
+ * If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
+ * at VMEntry. Skip the MSR read/write when a guest has no use case to
+ * execute VERW.
+ */
+ if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) ||
+ ((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
+ (vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
+ vmx->disable_fb_clear = false;
+}
+
static const struct kernel_param_ops vmentry_l1d_flush_ops = {
.set = vmentry_l1d_flush_set,
.get = vmentry_l1d_flush_get,
@@ -562,7 +619,7 @@
* evmcs in singe VM shares same assist page.
*/
if (!*p_hv_pa_pg)
- *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL);
+ *p_hv_pa_pg = kzalloc(PAGE_SIZE, GFP_KERNEL_ACCOUNT);
if (!*p_hv_pa_pg)
return -ENOMEM;
@@ -879,6 +936,24 @@
return true;
}
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
+{
+ unsigned int flags = 0;
+
+ if (vmx->loaded_vmcs->launched)
+ flags |= VMX_RUN_VMRESUME;
+
+ /*
+ * If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
+ * to change it directly without causing a vmexit. In that case read
+ * it after vmexit and store it in vmx->spec_ctrl.
+ */
+ if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
+ flags |= VMX_RUN_SAVE_SPEC_CTRL;
+
+ return flags;
+}
+
static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
unsigned long entry, unsigned long exit)
{
@@ -1662,7 +1737,17 @@
kvm_deliver_exception_payload(vcpu);
if (has_error_code) {
- vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, error_code);
+ /*
+ * Despite the error code being architecturally defined as 32
+ * bits, and the VMCS field being 32 bits, Intel CPUs and thus
+ * VMX don't actually supporting setting bits 31:16. Hardware
+ * will (should) never provide a bogus error code, but AMD CPUs
+ * do generate error codes with bits 31:16 set, and so KVM's
+ * ABI lets userspace shove in arbitrary 32-bit values. Drop
+ * the upper bits to avoid VM-Fail, losing information that
+ * does't really exist is preferable to killing the VM.
+ */
+ vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE, (u16)error_code);
intr_info |= INTR_INFO_DELIVER_CODE_MASK;
}
@@ -2211,6 +2296,10 @@
ret = kvm_set_msr_common(vcpu, msr_info);
}
+ /* FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior */
+ if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
+ vmx_update_fb_clear_dis(vcpu, vmx);
+
return ret;
}
@@ -3104,7 +3193,23 @@
vmcs_writel(GUEST_CR3, guest_cr3);
}
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+static bool vmx_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
+{
+ /*
+ * We operate under the default treatment of SMM, so VMX cannot be
+ * enabled under SMM. Note, whether or not VMXE is allowed at all is
+ * handled by kvm_valid_cr4().
+ */
+ if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
+ return false;
+
+ if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
+ return false;
+
+ return true;
+}
+
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
/*
@@ -3132,21 +3237,6 @@
}
}
- if (cr4 & X86_CR4_VMXE) {
- /*
- * To use VMXON (and later other VMX instructions), a guest
- * must first be able to turn on cr4.VMXE (see handle_vmon()).
- * So basically the check on whether to allow nested VMX
- * is here. We operate under the default treatment of SMM,
- * so VMX cannot be enabled under SMM.
- */
- if (!nested_vmx_allowed(vcpu) || is_smm(vcpu))
- return 1;
- }
-
- if (vmx->nested.vmxon && !nested_cr4_valid(vcpu, cr4))
- return 1;
-
vcpu->arch.cr4 = cr4;
kvm_register_mark_available(vcpu, VCPU_EXREG_CR4);
@@ -3177,7 +3267,6 @@
vmcs_writel(CR4_READ_SHADOW, cr4);
vmcs_writel(GUEST_CR4, hw_cr4);
- return 0;
}
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg)
@@ -4483,6 +4572,8 @@
vpid_sync_context(vmx->vpid);
if (init_event)
vmx_clear_hlt(vcpu);
+
+ vmx_update_fb_clear_dis(vcpu, vmx);
}
static void enable_irq_window(struct kvm_vcpu *vcpu)
@@ -6612,6 +6703,31 @@
}
}
+void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
+ unsigned int flags)
+{
+ u64 hostval = this_cpu_read(x86_spec_ctrl_current);
+
+ if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
+ return;
+
+ if (flags & VMX_RUN_SAVE_SPEC_CTRL)
+ vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
+
+ /*
+ * If the guest/host SPEC_CTRL values differ, restore the host value.
+ *
+ * For legacy IBRS, the IBRS bit always needs to be written after
+ * transitioning from a less privileged predictor mode, regardless of
+ * whether the guest/host values differ.
+ */
+ if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) ||
+ vmx->spec_ctrl != hostval)
+ native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
+
+ barrier_nospec();
+}
+
static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu)
{
switch (to_vmx(vcpu)->exit_reason.basic) {
@@ -6624,10 +6740,9 @@
}
}
-bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs, bool launched);
-
static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
- struct vcpu_vmx *vmx)
+ struct vcpu_vmx *vmx,
+ unsigned long flags)
{
/*
* VMENTER enables interrupts (host state), but the kernel state is
@@ -6654,15 +6769,22 @@
vmx_l1d_flush(vcpu);
else if (static_branch_unlikely(&mds_user_clear))
mds_clear_cpu_buffers();
+ else if (static_branch_unlikely(&mmio_stale_data_clear) &&
+ kvm_arch_has_assigned_device(vcpu->kvm))
+ mds_clear_cpu_buffers();
+
+ vmx_disable_fb_clear(vmx);
if (vcpu->arch.cr2 != native_read_cr2())
native_write_cr2(vcpu->arch.cr2);
vmx->fail = __vmx_vcpu_run(vmx, (unsigned long *)&vcpu->arch.regs,
- vmx->loaded_vmcs->launched);
+ flags);
vcpu->arch.cr2 = native_read_cr2();
+ vmx_enable_fb_clear(vmx);
+
/*
* VMEXIT disables interrupts (host state), but tracing and lockdep
* have them in state 'on' as recorded before entering guest mode.
@@ -6756,27 +6878,7 @@
x86_spec_ctrl_set_guest(vmx->spec_ctrl, 0);
/* The actual VMENTER/EXIT is in the .noinstr.text section. */
- vmx_vcpu_enter_exit(vcpu, vmx);
-
- /*
- * We do not use IBRS in the kernel. If this vCPU has used the
- * SPEC_CTRL MSR it may have left it on; save the value and
- * turn it off. This is much more efficient than blindly adding
- * it to the atomic save/restore list. Especially as the former
- * (Saving guest MSRs on vmexit) doesn't even exist in KVM.
- *
- * For non-nested case:
- * If the L01 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- *
- * For nested case:
- * If the L02 MSR bitmap does not intercept the MSR, then we need to
- * save it.
- */
- if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL)))
- vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL);
-
- x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0);
+ vmx_vcpu_enter_exit(vcpu, vmx, __vmx_vcpu_run_flags(vmx));
/* All fields are clean at this point */
if (static_branch_unlikely(&enable_evmcs))
@@ -7660,6 +7762,7 @@
.get_cpl = vmx_get_cpl,
.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
.set_cr0 = vmx_set_cr0,
+ .is_valid_cr4 = vmx_is_valid_cr4,
.set_cr4 = vmx_set_cr4,
.set_efer = vmx_set_efer,
.get_idt = vmx_get_idt,
@@ -8047,6 +8150,8 @@
return r;
}
+ vmx_setup_fb_clear_ctrl();
+
for_each_possible_cpu(cpu) {
INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 5ff2453..ed4b6da 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -13,6 +13,7 @@
#include "vmcs.h"
#include "vmx_ops.h"
#include "cpuid.h"
+#include "run_flags.h"
extern const u32 vmx_msr_index[];
@@ -300,6 +301,8 @@
u64 msr_ia32_feature_control;
u64 msr_ia32_feature_control_valid_bits;
u64 ept_pointer;
+ u64 msr_ia32_mcu_opt_ctrl;
+ bool disable_fb_clear;
struct pt_desc pt_desc;
@@ -344,7 +347,7 @@
void vmx_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask);
int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer);
void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
-int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
+void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx);
void ept_save_pdptrs(struct kvm_vcpu *vcpu);
void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -363,6 +366,10 @@
struct vmx_uret_msr *vmx_find_uret_msr(struct vcpu_vmx *vmx, u32 msr);
void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu);
void vmx_update_host_rsp(struct vcpu_vmx *vmx, unsigned long host_rsp);
+void vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx, unsigned int flags);
+unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx);
+bool __vmx_vcpu_run(struct vcpu_vmx *vmx, unsigned long *regs,
+ unsigned int flags);
int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr);
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu);
@@ -379,9 +386,13 @@
vmx->loaded_vmcs->controls_shadow.lname = val; \
} \
} \
+static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \
+{ \
+ return vmcs->controls_shadow.lname; \
+} \
static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \
{ \
- return vmx->loaded_vmcs->controls_shadow.lname; \
+ return __##lname##_controls_get(vmx->loaded_vmcs); \
} \
static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \
{ \
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index a5d6d79..23d7c56 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -459,6 +459,7 @@
#define EXCPT_TRAP 1
#define EXCPT_ABORT 2
#define EXCPT_INTERRUPT 3
+#define EXCPT_DB 4
static int exception_type(int vector)
{
@@ -469,8 +470,14 @@
mask = 1 << vector;
- /* #DB is trap, as instruction watchpoints are handled elsewhere */
- if (mask & ((1 << DB_VECTOR) | (1 << BP_VECTOR) | (1 << OF_VECTOR)))
+ /*
+ * #DBs can be trap-like or fault-like, the caller must check other CPU
+ * state, e.g. DR6, to determine whether a #DB is a trap or fault.
+ */
+ if (mask & (1 << DB_VECTOR))
+ return EXCPT_DB;
+
+ if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
return EXCPT_TRAP;
if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
@@ -986,6 +993,9 @@
if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
return -EINVAL;
+ if (!kvm_x86_ops.is_valid_cr4(vcpu, cr4))
+ return -EINVAL;
+
return 0;
}
EXPORT_SYMBOL_GPL(kvm_valid_cr4);
@@ -1020,8 +1030,7 @@
return 1;
}
- if (kvm_x86_ops.set_cr4(vcpu, cr4))
- return 1;
+ kvm_x86_ops.set_cr4(vcpu, cr4);
if (((cr4 ^ old_cr4) & mmu_role_bits) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
@@ -1353,7 +1362,7 @@
MSR_IA32_VMX_EPT_VPID_CAP,
MSR_IA32_VMX_VMFUNC,
- MSR_F10H_DECFG,
+ MSR_AMD64_DE_CFG,
MSR_IA32_UCODE_REV,
MSR_IA32_ARCH_CAPABILITIES,
MSR_IA32_PERF_CAPABILITIES,
@@ -1362,12 +1371,32 @@
static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)];
static unsigned int num_msr_based_features;
+/*
+ * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM
+ * does not yet virtualize. These include:
+ * 10 - MISC_PACKAGE_CTRLS
+ * 11 - ENERGY_FILTERING_CTL
+ * 12 - DOITM
+ * 18 - FB_CLEAR_CTRL
+ * 21 - XAPIC_DISABLE_STATUS
+ * 23 - OVERCLOCKING_STATUS
+ */
+
+#define KVM_SUPPORTED_ARCH_CAP \
+ (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \
+ ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \
+ ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \
+ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \
+ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO)
+
static u64 kvm_get_arch_capabilities(void)
{
u64 data = 0;
- if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES))
+ if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) {
rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data);
+ data &= KVM_SUPPORTED_ARCH_CAP;
+ }
/*
* If nx_huge_pages is enabled, KVM's shadow paging will ensure that
@@ -2859,17 +2888,20 @@
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
- * this to avoid an uncatched #GP in the guest
+ * this to avoid an uncatched #GP in the guest.
+ *
+ * UNIXWARE clears bit 0 of MC1_CTL to ignore
+ * correctable, single-bit ECC data errors.
*/
if ((offset & 0x3) == 0 &&
- data != 0 && (data | (1 << 10)) != ~(u64)0)
- return -1;
+ data != 0 && (data | (1 << 10) | 1) != ~(u64)0)
+ return 1;
/* MCi_STATUS */
if (!msr_info->host_initiated &&
(offset & 0x3) == 1 && data != 0) {
if (!can_set_mci_status(vcpu))
- return -1;
+ return 1;
}
vcpu->arch.mce_banks[offset] = data;
@@ -5328,6 +5360,11 @@
r = 0;
break;
case KVM_CAP_X86_USER_SPACE_MSR:
+ r = -EINVAL;
+ if (cap->args[0] & ~(KVM_MSR_EXIT_REASON_INVAL |
+ KVM_MSR_EXIT_REASON_UNKNOWN |
+ KVM_MSR_EXIT_REASON_FILTER))
+ break;
kvm->arch.user_space_msr_mask = cap->args[0];
r = 0;
break;
@@ -5409,23 +5446,22 @@
return r;
}
-static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, void __user *argp)
+static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm,
+ struct kvm_msr_filter *filter)
{
- struct kvm_msr_filter __user *user_msr_filter = argp;
struct kvm_x86_msr_filter *new_filter, *old_filter;
- struct kvm_msr_filter filter;
bool default_allow;
bool empty = true;
int r = 0;
u32 i;
- if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
- return -EFAULT;
+ if (filter->flags & ~KVM_MSR_FILTER_DEFAULT_DENY)
+ return -EINVAL;
- for (i = 0; i < ARRAY_SIZE(filter.ranges); i++)
- empty &= !filter.ranges[i].nmsrs;
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++)
+ empty &= !filter->ranges[i].nmsrs;
- default_allow = !(filter.flags & KVM_MSR_FILTER_DEFAULT_DENY);
+ default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY);
if (empty && !default_allow)
return -EINVAL;
@@ -5433,8 +5469,8 @@
if (!new_filter)
return -ENOMEM;
- for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
- r = kvm_add_msr_filter(new_filter, &filter.ranges[i]);
+ for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) {
+ r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
if (r) {
kvm_free_msr_filter(new_filter);
return r;
@@ -5457,6 +5493,62 @@
return 0;
}
+#ifdef CONFIG_KVM_COMPAT
+/* for KVM_X86_SET_MSR_FILTER */
+struct kvm_msr_filter_range_compat {
+ __u32 flags;
+ __u32 nmsrs;
+ __u32 base;
+ __u32 bitmap;
+};
+
+struct kvm_msr_filter_compat {
+ __u32 flags;
+ struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES];
+};
+
+#define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat)
+
+long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl,
+ unsigned long arg)
+{
+ void __user *argp = (void __user *)arg;
+ struct kvm *kvm = filp->private_data;
+ long r = -ENOTTY;
+
+ switch (ioctl) {
+ case KVM_X86_SET_MSR_FILTER_COMPAT: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter_compat filter_compat;
+ struct kvm_msr_filter filter;
+ int i;
+
+ if (copy_from_user(&filter_compat, user_msr_filter,
+ sizeof(filter_compat)))
+ return -EFAULT;
+
+ filter.flags = filter_compat.flags;
+ for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) {
+ struct kvm_msr_filter_range_compat *cr;
+
+ cr = &filter_compat.ranges[i];
+ filter.ranges[i] = (struct kvm_msr_filter_range) {
+ .flags = cr->flags,
+ .nmsrs = cr->nmsrs,
+ .base = cr->base,
+ .bitmap = (__u8 *)(ulong)cr->bitmap,
+ };
+ }
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
+ break;
+ }
+ }
+
+ return r;
+}
+#endif
+
long kvm_arch_vm_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -5763,9 +5855,16 @@
case KVM_SET_PMU_EVENT_FILTER:
r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp);
break;
- case KVM_X86_SET_MSR_FILTER:
- r = kvm_vm_ioctl_set_msr_filter(kvm, argp);
+ case KVM_X86_SET_MSR_FILTER: {
+ struct kvm_msr_filter __user *user_msr_filter = argp;
+ struct kvm_msr_filter filter;
+
+ if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
+ return -EFAULT;
+
+ r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
break;
+ }
default:
r = -ENOTTY;
}
@@ -6875,6 +6974,11 @@
return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR);
}
+static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt)
+{
+ return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID);
+}
+
static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg)
{
return kvm_register_read(emul_to_vcpu(ctxt), reg);
@@ -6958,6 +7062,7 @@
.guest_has_long_mode = emulator_guest_has_long_mode,
.guest_has_movbe = emulator_guest_has_movbe,
.guest_has_fxsr = emulator_guest_has_fxsr,
+ .guest_has_rdpid = emulator_guest_has_rdpid,
.set_nmi_mask = emulator_set_nmi_mask,
.get_hflags = emulator_get_hflags,
.set_hflags = emulator_set_hflags,
@@ -7289,7 +7394,7 @@
}
EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction);
-static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r)
+static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, int *r)
{
if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) &&
(vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) {
@@ -7358,25 +7463,23 @@
}
/*
- * Decode to be emulated instruction. Return EMULATION_OK if success.
+ * Decode an instruction for emulation. The caller is responsible for handling
+ * code breakpoints. Note, manually detecting code breakpoints is unnecessary
+ * (and wrong) when emulating on an intercepted fault-like exception[*], as
+ * code breakpoints have higher priority and thus have already been done by
+ * hardware.
+ *
+ * [*] Except #MC, which is higher priority, but KVM should never emulate in
+ * response to a machine check.
*/
int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type,
void *insn, int insn_len)
{
- int r = EMULATION_OK;
struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt;
+ int r;
init_emulate_ctxt(vcpu);
- /*
- * We will reenter on the same instruction since we do not set
- * complete_userspace_io. This does not handle watchpoints yet,
- * those would be handled in the emulate_ops.
- */
- if (!(emulation_type & EMULTYPE_SKIP) &&
- kvm_vcpu_check_breakpoint(vcpu, &r))
- return r;
-
ctxt->ud = emulation_type & EMULTYPE_TRAP_UD;
r = x86_decode_insn(ctxt, insn, insn_len);
@@ -7411,6 +7514,15 @@
if (!(emulation_type & EMULTYPE_NO_DECODE)) {
kvm_clear_exception_queue(vcpu);
+ /*
+ * Return immediately if RIP hits a code breakpoint, such #DBs
+ * are fault-like and are higher priority than any faults on
+ * the code fetch itself.
+ */
+ if (!(emulation_type & EMULTYPE_SKIP) &&
+ kvm_vcpu_check_code_breakpoint(vcpu, &r))
+ return r;
+
r = x86_decode_emulated_instruction(vcpu, emulation_type,
insn, insn_len);
if (r != EMULATION_OK) {
@@ -7522,6 +7634,12 @@
unsigned long rflags = kvm_x86_ops.get_rflags(vcpu);
toggle_interruptibility(vcpu, ctxt->interruptibility);
vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
+
+ /*
+ * Note, EXCPT_DB is assumed to be fault-like as the emulator
+ * only supports code breakpoints and general detect #DB, both
+ * of which are fault-like.
+ */
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
@@ -7999,7 +8117,7 @@
goto out_free_x86_emulator_cache;
}
- r = kvm_mmu_module_init();
+ r = kvm_mmu_vendor_module_init();
if (r)
goto out_free_percpu;
@@ -8059,7 +8177,7 @@
cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops.hardware_enable = NULL;
- kvm_mmu_module_exit();
+ kvm_mmu_vendor_module_exit();
free_percpu(user_return_msrs);
kmem_cache_destroy(x86_emulator_cache);
kmem_cache_destroy(x86_fpu_cache);
@@ -8126,15 +8244,17 @@
*/
static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
{
- struct kvm_lapic_irq lapic_irq;
+ /*
+ * All other fields are unused for APIC_DM_REMRD, but may be consumed by
+ * common code, e.g. for tracing. Defer initialization to the compiler.
+ */
+ struct kvm_lapic_irq lapic_irq = {
+ .delivery_mode = APIC_DM_REMRD,
+ .dest_mode = APIC_DEST_PHYSICAL,
+ .shorthand = APIC_DEST_NOSHORT,
+ .dest_id = apicid,
+ };
- lapic_irq.shorthand = APIC_DEST_NOSHORT;
- lapic_irq.dest_mode = APIC_DEST_PHYSICAL;
- lapic_irq.level = 0;
- lapic_irq.dest_id = apicid;
- lapic_irq.msi_redir_hint = false;
-
- lapic_irq.delivery_mode = APIC_DM_REMRD;
kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
}
@@ -8307,6 +8427,11 @@
static void kvm_inject_exception(struct kvm_vcpu *vcpu)
{
+ trace_kvm_inj_exception(vcpu->arch.exception.nr,
+ vcpu->arch.exception.has_error_code,
+ vcpu->arch.exception.error_code,
+ vcpu->arch.exception.injected);
+
if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
vcpu->arch.exception.error_code = false;
kvm_x86_ops.queue_exception(vcpu);
@@ -8364,13 +8489,16 @@
/* try to inject new event if pending */
if (vcpu->arch.exception.pending) {
- trace_kvm_inj_exception(vcpu->arch.exception.nr,
- vcpu->arch.exception.has_error_code,
- vcpu->arch.exception.error_code);
-
- vcpu->arch.exception.pending = false;
- vcpu->arch.exception.injected = true;
-
+ /*
+ * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS
+ * value pushed on the stack. Trap-like exception and all #DBs
+ * leave RF as-is (KVM follows Intel's behavior in this regard;
+ * AMD states that code breakpoint #DBs excplitly clear RF=0).
+ *
+ * Note, most versions of Intel's SDM and AMD's APM incorrectly
+ * describe the behavior of General Detect #DBs, which are
+ * fault-like. They do _not_ set RF, a la code breakpoints.
+ */
if (exception_type(vcpu->arch.exception.nr) == EXCPT_FAULT)
__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
X86_EFLAGS_RF);
@@ -8384,6 +8512,10 @@
}
kvm_inject_exception(vcpu);
+
+ vcpu->arch.exception.pending = false;
+ vcpu->arch.exception.injected = true;
+
can_inject = false;
}
@@ -8835,6 +8967,12 @@
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
}
+void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
+{
+ if (kvm_x86_ops.guest_memory_reclaimed)
+ kvm_x86_ops.guest_memory_reclaimed(kvm);
+}
+
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
{
if (!lapic_in_kernel(vcpu))
@@ -11140,7 +11278,7 @@
if (!kvm_pv_async_pf_enabled(vcpu))
return true;
else
- return apf_pageready_slot_free(vcpu);
+ return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}
void kvm_arch_start_assignment(struct kvm *kvm)
@@ -11155,9 +11293,9 @@
}
EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);
-bool kvm_arch_has_assigned_device(struct kvm *kvm)
+bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
{
- return atomic_read(&kvm->arch.assigned_device_count);
+ return arch_atomic_read(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);
@@ -11420,3 +11558,19 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_update_request);
+
+static int __init kvm_x86_init(void)
+{
+ kvm_mmu_x86_module_init();
+ return 0;
+}
+module_init(kvm_x86_init);
+
+static void __exit kvm_x86_exit(void)
+{
+ /*
+ * If module_init() is implemented, module_exit() must also be
+ * implemented to allow module unload.
+ */
+}
+module_exit(kvm_x86_exit);