Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index f68c0c7..6a8db8e 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -352,6 +352,7 @@
unsigned f_umip = kvm_x86_ops->umip_emulated() ? F(UMIP) : 0;
unsigned f_intel_pt = kvm_x86_ops->pt_supported() ? F(INTEL_PT) : 0;
unsigned f_la57;
+ unsigned f_pku = kvm_x86_ops->pku_supported() ? F(PKU) : 0;
/* cpuid 7.0.ebx */
const u32 kvm_cpuid_7_0_ebx_x86_features =
@@ -363,7 +364,7 @@
/* cpuid 7.0.ecx*/
const u32 kvm_cpuid_7_0_ecx_x86_features =
- F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(RDPID) |
+ F(AVX512VBMI) | F(LA57) | 0 /*PKU*/ | 0 /*OSPKE*/ | F(RDPID) |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | 0 /*WAITPKG*/;
@@ -392,6 +393,7 @@
/* Set LA57 based on hardware capability. */
entry->ecx |= f_la57;
entry->ecx |= f_umip;
+ entry->ecx |= f_pku;
/* PKU is not yet implemented for shadow paging. */
if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
entry->ecx &= ~F(PKU);
@@ -402,7 +404,8 @@
entry->edx |= F(SPEC_CTRL);
if (boot_cpu_has(X86_FEATURE_STIBP))
entry->edx |= F(INTEL_STIBP);
- if (boot_cpu_has(X86_FEATURE_SSBD))
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->edx |= F(SPEC_CTRL_SSBD);
/*
* We emulate ARCH_CAPABILITIES in software even
@@ -504,7 +507,7 @@
r = -E2BIG;
- if (*nent >= maxnent)
+ if (WARN_ON(*nent >= maxnent))
goto out;
do_host_cpuid(entry, function, 0);
@@ -742,8 +745,14 @@
unsigned virt_as = max((entry->eax >> 8) & 0xff, 48U);
unsigned phys_as = entry->eax & 0xff;
- if (!g_phys_as)
+ /*
+ * Use bare metal's MAXPHADDR if the CPU doesn't report guest
+ * MAXPHYADDR separately, or if TDP (NPT) is disabled, as the
+ * guest version "applies only to guests using nested paging".
+ */
+ if (!g_phys_as || !tdp_enabled)
g_phys_as = phys_as;
+
entry->eax = g_phys_as | (virt_as << 8);
entry->edx = 0;
entry->ebx &= kvm_cpuid_8000_0008_ebx_x86_features;
@@ -759,7 +768,8 @@
entry->ebx |= F(AMD_IBRS);
if (boot_cpu_has(X86_FEATURE_STIBP))
entry->ebx |= F(AMD_STIBP);
- if (boot_cpu_has(X86_FEATURE_SSBD))
+ if (boot_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
entry->ebx |= F(AMD_SSBD);
if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS))
entry->ebx |= F(AMD_SSB_NO);
@@ -810,6 +820,9 @@
static int do_cpuid_func(struct kvm_cpuid_entry2 *entry, u32 func,
int *nent, int maxnent, unsigned int type)
{
+ if (*nent >= maxnent)
+ return -E2BIG;
+
if (type == KVM_GET_EMULATED_CPUID)
return __do_cpuid_func_emulated(entry, func, nent, maxnent);
diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a614..7dec43b 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -154,6 +154,20 @@
return x86_stepping(best->eax);
}
+static inline bool guest_has_spec_ctrl_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD));
+}
+
+static inline bool guest_has_pred_cmd_msr(struct kvm_vcpu *vcpu)
+{
+ return (guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) ||
+ guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB));
+}
+
static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu)
{
return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 698efb8..60c8dcb 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -22,6 +22,7 @@
#include "kvm_cache_regs.h"
#include <asm/kvm_emulate.h>
#include <linux/stringify.h>
+#include <asm/fpu/api.h>
#include <asm/debugreg.h>
#include <asm/nospec-branch.h>
@@ -1075,8 +1076,23 @@
}
}
+static void emulator_get_fpu(void)
+{
+ fpregs_lock();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
+}
+
+static void emulator_put_fpu(void)
+{
+ fpregs_unlock();
+}
+
static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %%xmm0, %0" : "=m"(*data)); break;
case 1: asm("movdqa %%xmm1, %0" : "=m"(*data)); break;
@@ -1098,11 +1114,13 @@
#endif
default: BUG();
}
+ emulator_put_fpu();
}
static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movdqa %0, %%xmm0" : : "m"(*data)); break;
case 1: asm("movdqa %0, %%xmm1" : : "m"(*data)); break;
@@ -1124,10 +1142,12 @@
#endif
default: BUG();
}
+ emulator_put_fpu();
}
static void read_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %%mm0, %0" : "=m"(*data)); break;
case 1: asm("movq %%mm1, %0" : "=m"(*data)); break;
@@ -1139,10 +1159,12 @@
case 7: asm("movq %%mm7, %0" : "=m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
static void write_mmx_reg(struct x86_emulate_ctxt *ctxt, u64 *data, int reg)
{
+ emulator_get_fpu();
switch (reg) {
case 0: asm("movq %0, %%mm0" : : "m"(*data)); break;
case 1: asm("movq %0, %%mm1" : : "m"(*data)); break;
@@ -1154,6 +1176,7 @@
case 7: asm("movq %0, %%mm7" : : "m"(*data)); break;
default: BUG();
}
+ emulator_put_fpu();
}
static int em_fninit(struct x86_emulate_ctxt *ctxt)
@@ -1161,7 +1184,9 @@
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fninit");
+ emulator_put_fpu();
return X86EMUL_CONTINUE;
}
@@ -1172,7 +1197,9 @@
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstcw %0": "+m"(fcw));
+ emulator_put_fpu();
ctxt->dst.val = fcw;
@@ -1186,7 +1213,9 @@
if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM))
return emulate_nm(ctxt);
+ emulator_get_fpu();
asm volatile("fnstsw %0": "+m"(fsw));
+ emulator_put_fpu();
ctxt->dst.val = fsw;
@@ -2861,6 +2890,8 @@
ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
*reg_write(ctxt, VCPU_REGS_RSP) = (efer & EFER_LMA) ? msr_data :
(u32)msr_data;
+ if (efer & EFER_LMA)
+ ctxt->mode = X86EMUL_MODE_PROT64;
return X86EMUL_CONTINUE;
}
@@ -3588,7 +3619,7 @@
u64 tsc_aux = 0;
if (ctxt->ops->get_msr(ctxt, MSR_TSC_AUX, &tsc_aux))
- return emulate_gp(ctxt, 0);
+ return emulate_ud(ctxt);
ctxt->dst.val = tsc_aux;
return X86EMUL_CONTINUE;
}
@@ -4021,6 +4052,12 @@
return X86EMUL_CONTINUE;
}
+static int em_clflushopt(struct x86_emulate_ctxt *ctxt)
+{
+ /* emulating clflushopt regardless of cpuid */
+ return X86EMUL_CONTINUE;
+}
+
static int em_movsxd(struct x86_emulate_ctxt *ctxt)
{
ctxt->dst.val = (s32) ctxt->src.val;
@@ -4094,8 +4131,12 @@
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state));
+ emulator_put_fpu();
+
if (rc != X86EMUL_CONTINUE)
return rc;
@@ -4138,6 +4179,8 @@
if (rc != X86EMUL_CONTINUE)
return rc;
+ emulator_get_fpu();
+
if (size < __fxstate_size(16)) {
rc = fxregs_fixup(&fx_state, size);
if (rc != X86EMUL_CONTINUE)
@@ -4153,6 +4196,8 @@
rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state));
out:
+ emulator_put_fpu();
+
return rc;
}
@@ -4555,7 +4600,7 @@
};
static const struct gprefix pfx_0f_ae_7 = {
- I(SrcMem | ByteOp, em_clflush), N, N, N,
+ I(SrcMem | ByteOp, em_clflush), I(SrcMem | ByteOp, em_clflushopt), N, N,
};
static const struct group_dual group15 = { {
@@ -5160,6 +5205,7 @@
ctxt->fetch.ptr = ctxt->fetch.data;
ctxt->fetch.end = ctxt->fetch.data + insn_len;
ctxt->opcode_len = 1;
+ ctxt->intercept = x86_intercept_none;
if (insn_len > 0)
memcpy(ctxt->fetch.data, insn, insn_len);
else {
@@ -5212,16 +5258,28 @@
ctxt->ad_bytes = def_ad_bytes ^ 6;
break;
case 0x26: /* ES override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_ES;
+ break;
case 0x2e: /* CS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_CS;
+ break;
case 0x36: /* SS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_SS;
+ break;
case 0x3e: /* DS override */
has_seg_override = true;
- ctxt->seg_override = (ctxt->b >> 3) & 3;
+ ctxt->seg_override = VCPU_SREG_DS;
break;
case 0x64: /* FS override */
+ has_seg_override = true;
+ ctxt->seg_override = VCPU_SREG_FS;
+ break;
case 0x65: /* GS override */
has_seg_override = true;
- ctxt->seg_override = ctxt->b & 7;
+ ctxt->seg_override = VCPU_SREG_GS;
break;
case 0x40 ... 0x4f: /* REX */
if (mode != X86EMUL_MODE_PROT64)
@@ -5305,10 +5363,15 @@
}
break;
case Escape:
- if (ctxt->modrm > 0xbf)
- opcode = opcode.u.esc->high[ctxt->modrm - 0xc0];
- else
+ if (ctxt->modrm > 0xbf) {
+ size_t size = ARRAY_SIZE(opcode.u.esc->high);
+ u32 index = array_index_nospec(
+ ctxt->modrm - 0xc0, size);
+
+ opcode = opcode.u.esc->high[index];
+ } else {
opcode = opcode.u.esc->op[(ctxt->modrm >> 3) & 7];
+ }
break;
case InstrDual:
if ((ctxt->modrm >> 6) == 3)
@@ -5450,7 +5513,9 @@
{
int rc;
+ emulator_get_fpu();
rc = asm_safe("fwait");
+ emulator_put_fpu();
if (unlikely(rc != X86EMUL_CONTINUE))
return emulate_exception(ctxt, MF_VECTOR, 0, false);
@@ -5779,6 +5844,8 @@
}
ctxt->eip = ctxt->_eip;
+ if (ctxt->mode != X86EMUL_MODE_PROT64)
+ ctxt->eip = (u32)ctxt->_eip;
done:
if (rc == X86EMUL_PROPAGATE_FAULT) {
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 23ff655..2640843 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -809,11 +809,12 @@
u32 index, u64 *pdata)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- *pdata = hv->hv_crash_param[index];
+ *pdata = hv->hv_crash_param[array_index_nospec(index, size)];
return 0;
}
@@ -852,11 +853,12 @@
u32 index, u64 data)
{
struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+ size_t size = ARRAY_SIZE(hv->hv_crash_param);
- if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+ if (WARN_ON_ONCE(index >= size))
return -EINVAL;
- hv->hv_crash_param[index] = data;
+ hv->hv_crash_param[array_index_nospec(index, size)] = data;
return 0;
}
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 8b38bb4..629a09c 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -460,10 +460,14 @@
switch (addr) {
case 0x20:
case 0x21:
+ pic_lock(s);
+ pic_ioport_write(&s->pics[0], addr, data);
+ pic_unlock(s);
+ break;
case 0xa0:
case 0xa1:
pic_lock(s);
- pic_ioport_write(&s->pics[addr >> 7], addr, data);
+ pic_ioport_write(&s->pics[1], addr, data);
pic_unlock(s);
break;
case 0x4d0:
diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c
index d859ae8..642031b 100644
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -36,6 +36,7 @@
#include <linux/io.h>
#include <linux/slab.h>
#include <linux/export.h>
+#include <linux/nospec.h>
#include <asm/processor.h>
#include <asm/page.h>
#include <asm/current.h>
@@ -68,13 +69,14 @@
default:
{
u32 redir_index = (ioapic->ioregsel - 0x10) >> 1;
- u64 redir_content;
+ u64 redir_content = ~0ULL;
- if (redir_index < IOAPIC_NUM_PINS)
- redir_content =
- ioapic->redirtbl[redir_index].bits;
- else
- redir_content = ~0ULL;
+ if (redir_index < IOAPIC_NUM_PINS) {
+ u32 index = array_index_nospec(
+ redir_index, IOAPIC_NUM_PINS);
+
+ redir_content = ioapic->redirtbl[index].bits;
+ }
result = (ioapic->ioregsel & 0x1) ?
(redir_content >> 32) & 0xffffffff :
@@ -89,7 +91,7 @@
static void rtc_irq_eoi_tracking_reset(struct kvm_ioapic *ioapic)
{
ioapic->rtc_status.pending_eoi = 0;
- bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID);
+ bitmap_zero(ioapic->rtc_status.dest_map.map, KVM_MAX_VCPU_ID + 1);
}
static void kvm_rtc_eoi_tracking_restore_all(struct kvm_ioapic *ioapic);
@@ -291,6 +293,7 @@
if (index >= IOAPIC_NUM_PINS)
return;
+ index = array_index_nospec(index, IOAPIC_NUM_PINS);
e = &ioapic->redirtbl[index];
mask_before = e->fields.mask;
/* Preserve read-only fields */
diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h
index ea1a4e0..283f1f4 100644
--- a/arch/x86/kvm/ioapic.h
+++ b/arch/x86/kvm/ioapic.h
@@ -43,13 +43,13 @@
struct dest_map {
/* vcpu bitmap where IRQ has been sent */
- DECLARE_BITMAP(map, KVM_MAX_VCPU_ID);
+ DECLARE_BITMAP(map, KVM_MAX_VCPU_ID + 1);
/*
* Vector sent to a given vcpu, only valid when
* the vcpu's bit in map is set
*/
- u8 vectors[KVM_MAX_VCPU_ID];
+ u8 vectors[KVM_MAX_VCPU_ID + 1];
};
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index e330e7d..896db1a 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -40,29 +40,10 @@
* check if there is pending interrupt from
* non-APIC source without intack.
*/
-static int kvm_cpu_has_extint(struct kvm_vcpu *v)
-{
- u8 accept = kvm_apic_accept_pic_intr(v);
-
- if (accept) {
- if (irqchip_split(v->kvm))
- return pending_userspace_extint(v);
- else
- return v->kvm->arch.vpic->output;
- } else
- return 0;
-}
-
-/*
- * check if there is injectable interrupt:
- * when virtual interrupt delivery enabled,
- * interrupt from apic will handled by hardware,
- * we don't need to check it here.
- */
-int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+int kvm_cpu_has_extint(struct kvm_vcpu *v)
{
/*
- * FIXME: interrupt.injected represents an interrupt that it's
+ * FIXME: interrupt.injected represents an interrupt whose
* side-effects have already been applied (e.g. bit from IRR
* already moved to ISR). Therefore, it is incorrect to rely
* on interrupt.injected to know if there is a pending
@@ -75,6 +56,23 @@
if (!lapic_in_kernel(v))
return v->arch.interrupt.injected;
+ if (!kvm_apic_accept_pic_intr(v))
+ return 0;
+
+ if (irqchip_split(v->kvm))
+ return pending_userspace_extint(v);
+ else
+ return v->kvm->arch.vpic->output;
+}
+
+/*
+ * check if there is injectable interrupt:
+ * when virtual interrupt delivery enabled,
+ * interrupt from apic will handled by hardware,
+ * we don't need to check it here.
+ */
+int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
+{
if (kvm_cpu_has_extint(v))
return 1;
@@ -90,20 +88,6 @@
*/
int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
{
- /*
- * FIXME: interrupt.injected represents an interrupt that it's
- * side-effects have already been applied (e.g. bit from IRR
- * already moved to ISR). Therefore, it is incorrect to rely
- * on interrupt.injected to know if there is a pending
- * interrupt in the user-mode LAPIC.
- * This leads to nVMX/nSVM not be able to distinguish
- * if it should exit from L2 to L1 on EXTERNAL_INTERRUPT on
- * pending interrupt or should re-inject an injected
- * interrupt.
- */
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.injected;
-
if (kvm_cpu_has_extint(v))
return 1;
@@ -117,16 +101,21 @@
*/
static int kvm_cpu_get_extint(struct kvm_vcpu *v)
{
- if (kvm_cpu_has_extint(v)) {
- if (irqchip_split(v->kvm)) {
- int vector = v->arch.pending_external_vector;
-
- v->arch.pending_external_vector = -1;
- return vector;
- } else
- return kvm_pic_read_irq(v->kvm); /* PIC */
- } else
+ if (!kvm_cpu_has_extint(v)) {
+ WARN_ON(!lapic_in_kernel(v));
return -1;
+ }
+
+ if (!lapic_in_kernel(v))
+ return v->arch.interrupt.nr;
+
+ if (irqchip_split(v->kvm)) {
+ int vector = v->arch.pending_external_vector;
+
+ v->arch.pending_external_vector = -1;
+ return vector;
+ } else
+ return kvm_pic_read_irq(v->kvm); /* PIC */
}
/*
@@ -134,13 +123,7 @@
*/
int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
{
- int vector;
-
- if (!lapic_in_kernel(v))
- return v->arch.interrupt.nr;
-
- vector = kvm_cpu_get_extint(v);
-
+ int vector = kvm_cpu_get_extint(v);
if (vector != -1)
return vector; /* PIC */
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index 8ecd48d..5ddcaac 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -416,7 +416,7 @@
kvm_set_msi_irq(vcpu->kvm, entry, &irq);
- if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ if (irq.trig_mode && kvm_apic_match_dest(vcpu, NULL, 0,
irq.dest_id, irq.dest_mode))
__set_bit(irq.vector, ioapic_handled_vectors);
}
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 1cc6c47..341f58a 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -7,7 +7,7 @@
#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
#define KVM_POSSIBLE_CR4_GUEST_BITS \
(X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
- | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE)
+ | X86_CR4_OSXMMEXCPT | X86_CR4_LA57 | X86_CR4_PGE | X86_CR4_TSD)
#define BUILD_KVM_GPR_ACCESSORS(lname, uname) \
static __always_inline unsigned long kvm_##lname##_read(struct kvm_vcpu *vcpu)\
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index b29d00b..eea2d6f 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -637,9 +637,11 @@
static bool pv_eoi_get_pending(struct kvm_vcpu *vcpu)
{
u8 val;
- if (pv_eoi_get_user(vcpu, &val) < 0)
+ if (pv_eoi_get_user(vcpu, &val) < 0) {
printk(KERN_WARNING "Can't read EOI MSR value: 0x%llx\n",
(unsigned long long)vcpu->arch.pv_eoi.msr_val);
+ return false;
+ }
return val & 0x1;
}
@@ -1056,11 +1058,8 @@
apic->regs + APIC_TMR);
}
- if (vcpu->arch.apicv_active)
- kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
- else {
+ if (kvm_x86_ops->deliver_posted_interrupt(vcpu, vector)) {
kvm_lapic_set_irr(vector, apic);
-
kvm_make_request(KVM_REQ_EVENT, vcpu);
kvm_vcpu_kick(vcpu);
}
@@ -1333,6 +1332,9 @@
if (!apic_x2apic_mode(apic))
valid_reg_mask |= APIC_REG_MASK(APIC_ARBPRI);
+ if (alignment + len > 4)
+ return 1;
+
if (offset > 0x3f0 || !(valid_reg_mask & APIC_REG_MASK(offset)))
return 1;
@@ -1685,7 +1687,7 @@
hrtimer_start(&apic->lapic_timer.timer,
apic->lapic_timer.target_expiration,
- HRTIMER_MODE_ABS);
+ HRTIMER_MODE_ABS_HARD);
}
bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
@@ -1926,15 +1928,20 @@
case APIC_LVTTHMR:
case APIC_LVTPC:
case APIC_LVT1:
- case APIC_LVTERR:
+ case APIC_LVTERR: {
/* TODO: Check vector */
+ size_t size;
+ u32 index;
+
if (!kvm_apic_sw_enabled(apic))
val |= APIC_LVT_MASKED;
-
- val &= apic_lvt_mask[(reg - APIC_LVTT) >> 4];
+ size = ARRAY_SIZE(apic_lvt_mask);
+ index = array_index_nospec(
+ (reg - APIC_LVTT) >> 4, size);
+ val &= apic_lvt_mask[index];
kvm_lapic_set_reg(apic, reg, val);
-
break;
+ }
case APIC_LVTT:
if (!kvm_apic_sw_enabled(apic))
@@ -2081,7 +2088,7 @@
{
struct kvm_lapic *apic = vcpu->arch.apic;
- if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) ||
+ if (!kvm_apic_present(vcpu) || apic_lvtt_oneshot(apic) ||
apic_lvtt_period(apic))
return;
@@ -2326,7 +2333,7 @@
struct kvm_lapic *apic = vcpu->arch.apic;
u32 ppr;
- if (!kvm_apic_hw_enabled(apic))
+ if (!kvm_apic_present(vcpu))
return -1;
__apic_update_ppr(apic, &ppr);
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 2ce9da5..d3877dd 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -343,6 +343,8 @@
{
BUG_ON((u64)(unsigned)access_mask != access_mask);
BUG_ON((mmio_mask & mmio_value) != mmio_value);
+ WARN_ON(mmio_value & (shadow_nonpresent_or_rsvd_mask << shadow_nonpresent_or_rsvd_mask_len));
+ WARN_ON(mmio_value & shadow_nonpresent_or_rsvd_lower_gfn_mask);
shadow_mmio_value = mmio_value | SPTE_MMIO_MASK;
shadow_mmio_mask = mmio_mask | SPTE_SPECIAL_MASK;
shadow_mmio_access_mask = access_mask;
@@ -405,11 +407,11 @@
}
/*
- * Due to limited space in PTEs, the MMIO generation is a 19 bit subset of
+ * Due to limited space in PTEs, the MMIO generation is a 18 bit subset of
* the memslots generation and is derived as follows:
*
* Bits 0-8 of the MMIO generation are propagated to spte bits 3-11
- * Bits 9-18 of the MMIO generation are propagated to spte bits 52-61
+ * Bits 9-17 of the MMIO generation are propagated to spte bits 54-62
*
* The KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS flag is intentionally not included in
* the MMIO generation number, as doing so would require stealing a bit from
@@ -418,25 +420,38 @@
* requires a full MMU zap). The flag is instead explicitly queried when
* checking for MMIO spte cache hits.
*/
-#define MMIO_SPTE_GEN_MASK GENMASK_ULL(18, 0)
#define MMIO_SPTE_GEN_LOW_START 3
#define MMIO_SPTE_GEN_LOW_END 11
+
+#define MMIO_SPTE_GEN_HIGH_START PT64_SECOND_AVAIL_BITS_SHIFT
+#define MMIO_SPTE_GEN_HIGH_END 62
+
#define MMIO_SPTE_GEN_LOW_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_END, \
MMIO_SPTE_GEN_LOW_START)
-
-#define MMIO_SPTE_GEN_HIGH_START 52
-#define MMIO_SPTE_GEN_HIGH_END 61
#define MMIO_SPTE_GEN_HIGH_MASK GENMASK_ULL(MMIO_SPTE_GEN_HIGH_END, \
MMIO_SPTE_GEN_HIGH_START)
+
+#define MMIO_SPTE_GEN_LOW_BITS (MMIO_SPTE_GEN_LOW_END - MMIO_SPTE_GEN_LOW_START + 1)
+#define MMIO_SPTE_GEN_HIGH_BITS (MMIO_SPTE_GEN_HIGH_END - MMIO_SPTE_GEN_HIGH_START + 1)
+
+/* remember to adjust the comment above as well if you change these */
+static_assert(MMIO_SPTE_GEN_LOW_BITS == 9 && MMIO_SPTE_GEN_HIGH_BITS == 9);
+
+#define MMIO_SPTE_GEN_LOW_SHIFT (MMIO_SPTE_GEN_LOW_START - 0)
+#define MMIO_SPTE_GEN_HIGH_SHIFT (MMIO_SPTE_GEN_HIGH_START - MMIO_SPTE_GEN_LOW_BITS)
+
+#define MMIO_SPTE_GEN_MASK GENMASK_ULL(MMIO_SPTE_GEN_LOW_BITS + MMIO_SPTE_GEN_HIGH_BITS - 1, 0)
+
static u64 generation_mmio_spte_mask(u64 gen)
{
u64 mask;
WARN_ON(gen & ~MMIO_SPTE_GEN_MASK);
+ BUILD_BUG_ON((MMIO_SPTE_GEN_HIGH_MASK | MMIO_SPTE_GEN_LOW_MASK) & SPTE_SPECIAL_MASK);
- mask = (gen << MMIO_SPTE_GEN_LOW_START) & MMIO_SPTE_GEN_LOW_MASK;
- mask |= (gen << MMIO_SPTE_GEN_HIGH_START) & MMIO_SPTE_GEN_HIGH_MASK;
+ mask = (gen << MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_SPTE_GEN_LOW_MASK;
+ mask |= (gen << MMIO_SPTE_GEN_HIGH_SHIFT) & MMIO_SPTE_GEN_HIGH_MASK;
return mask;
}
@@ -444,10 +459,8 @@
{
u64 gen;
- spte &= ~shadow_mmio_mask;
-
- gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_START;
- gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_START;
+ gen = (spte & MMIO_SPTE_GEN_LOW_MASK) >> MMIO_SPTE_GEN_LOW_SHIFT;
+ gen |= (spte & MMIO_SPTE_GEN_HIGH_MASK) >> MMIO_SPTE_GEN_HIGH_SHIFT;
return gen;
}
@@ -538,16 +551,20 @@
static u8 kvm_get_shadow_phys_bits(void)
{
/*
- * boot_cpu_data.x86_phys_bits is reduced when MKTME is detected
- * in CPU detection code, but MKTME treats those reduced bits as
- * 'keyID' thus they are not reserved bits. Therefore for MKTME
- * we should still return physical address bits reported by CPUID.
+ * boot_cpu_data.x86_phys_bits is reduced when MKTME or SME are detected
+ * in CPU detection code, but the processor treats those reduced bits as
+ * 'keyID' thus they are not reserved bits. Therefore KVM needs to look at
+ * the physical address bits reported by CPUID.
*/
- if (!boot_cpu_has(X86_FEATURE_TME) ||
- WARN_ON_ONCE(boot_cpu_data.extended_cpuid_level < 0x80000008))
- return boot_cpu_data.x86_phys_bits;
+ if (likely(boot_cpu_data.extended_cpuid_level >= 0x80000008))
+ return cpuid_eax(0x80000008) & 0xff;
- return cpuid_eax(0x80000008) & 0xff;
+ /*
+ * Quite weird to have VMX or SVM but not MAXPHYADDR; probably a VM with
+ * custom CPUID. Proceed with whatever the kernel found since these features
+ * aren't virtualizable (SME/SEV also require CPUIDs higher than 0x80000008).
+ */
+ return boot_cpu_data.x86_phys_bits;
}
static void kvm_mmu_reset_all_pte_masks(void)
@@ -576,16 +593,15 @@
* the most significant bits of legal physical address space.
*/
shadow_nonpresent_or_rsvd_mask = 0;
- low_phys_bits = boot_cpu_data.x86_cache_bits;
- if (boot_cpu_data.x86_cache_bits <
- 52 - shadow_nonpresent_or_rsvd_mask_len) {
+ low_phys_bits = boot_cpu_data.x86_phys_bits;
+ if (boot_cpu_has_bug(X86_BUG_L1TF) &&
+ !WARN_ON_ONCE(boot_cpu_data.x86_cache_bits >=
+ 52 - shadow_nonpresent_or_rsvd_mask_len)) {
+ low_phys_bits = boot_cpu_data.x86_cache_bits
+ - shadow_nonpresent_or_rsvd_mask_len;
shadow_nonpresent_or_rsvd_mask =
- rsvd_bits(boot_cpu_data.x86_cache_bits -
- shadow_nonpresent_or_rsvd_mask_len,
- boot_cpu_data.x86_cache_bits - 1);
- low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
- } else
- WARN_ON_ONCE(boot_cpu_has_bug(X86_BUG_L1TF));
+ rsvd_bits(low_phys_bits, boot_cpu_data.x86_cache_bits - 1);
+ }
shadow_nonpresent_or_rsvd_lower_gfn_mask =
GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
@@ -1282,12 +1298,12 @@
return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
}
-static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
+static int host_mapping_level(struct kvm_vcpu *vcpu, gfn_t gfn)
{
unsigned long page_size;
int i, ret = 0;
- page_size = kvm_host_page_size(kvm, gfn);
+ page_size = kvm_host_page_size(vcpu, gfn);
for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
if (page_size >= KVM_HPAGE_SIZE(i))
@@ -1337,7 +1353,7 @@
if (unlikely(*force_pt_level))
return PT_PAGE_TABLE_LEVEL;
- host_level = host_mapping_level(vcpu->kvm, large_gfn);
+ host_level = host_mapping_level(vcpu, large_gfn);
if (host_level == PT_PAGE_TABLE_LEVEL)
return host_level;
@@ -1814,10 +1830,10 @@
* Emulate arch specific page modification logging for the
* nested hypervisor
*/
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa)
{
if (kvm_x86_ops->write_log_dirty)
- return kvm_x86_ops->write_log_dirty(vcpu);
+ return kvm_x86_ops->write_log_dirty(vcpu, l2_gpa);
return 0;
}
@@ -2040,7 +2056,8 @@
return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
}
-int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
+int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end,
+ unsigned flags)
{
return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
}
@@ -2126,7 +2143,7 @@
* aggregate version in order to make the slab shrinker
* faster
*/
-static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, unsigned long nr)
+static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, long nr)
{
kvm->arch.n_used_mmu_pages += nr;
percpu_counter_add(&kvm_total_used_mmu_pages, nr);
@@ -2226,13 +2243,6 @@
{
}
-static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *pte)
-{
- WARN_ON(1);
-}
-
#define KVM_PAGE_ARRAY_NR 16
struct kvm_mmu_pages {
@@ -3528,7 +3538,7 @@
* - true: let the vcpu to access on the same address again.
* - false: let the real page fault path to fix it.
*/
-static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
+static bool fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, int level,
u32 error_code)
{
struct kvm_shadow_walk_iterator iterator;
@@ -3548,7 +3558,7 @@
do {
u64 new_spte;
- for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
+ for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte)
if (!is_shadow_present_pte(spte) ||
iterator.level < level)
break;
@@ -3626,7 +3636,7 @@
} while (true);
- trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
+ trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep,
spte, fault_handled);
walk_shadow_page_lockless_end(vcpu);
@@ -3634,10 +3644,11 @@
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable);
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable);
static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
-static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
+static int nonpaging_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
gfn_t gfn, bool prefault)
{
int r;
@@ -3663,16 +3674,16 @@
gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
}
- if (fast_page_fault(vcpu, v, level, error_code))
+ if (fast_page_fault(vcpu, gpa, level, error_code))
return RET_PF_RETRY;
mmu_seq = vcpu->kvm->mmu_notifier_seq;
smp_rmb();
- if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
+ if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
return RET_PF_RETRY;
- if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
+ if (handle_abnormal_pfn(vcpu, gpa, gfn, pfn, ACC_ALL, &r))
return r;
r = RET_PF_RETRY;
@@ -3683,7 +3694,7 @@
goto out_unlock;
if (likely(!force_pt_level))
transparent_hugepage_adjust(vcpu, gfn, &pfn, &level);
- r = __direct_map(vcpu, v, write, map_writable, level, pfn,
+ r = __direct_map(vcpu, gpa, write, map_writable, level, pfn,
prefault, false);
out_unlock:
spin_unlock(&vcpu->kvm->mmu_lock);
@@ -3981,7 +3992,7 @@
}
EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
-static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access, struct x86_exception *exception)
{
if (exception)
@@ -3989,7 +4000,7 @@
return vaddr;
}
-static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
+static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -4149,13 +4160,14 @@
walk_shadow_page_lockless_end(vcpu);
}
-static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
+static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa,
u32 error_code, bool prefault)
{
- gfn_t gfn = gva >> PAGE_SHIFT;
+ gfn_t gfn = gpa >> PAGE_SHIFT;
int r;
- pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
+ /* Note, paging is disabled, ergo gva == gpa. */
+ pgprintk("%s: gva %lx error %x\n", __func__, gpa, error_code);
if (page_fault_handle_page_track(vcpu, error_code, gfn))
return RET_PF_EMULATE;
@@ -4167,11 +4179,12 @@
MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu->root_hpa));
- return nonpaging_map(vcpu, gva & PAGE_MASK,
+ return nonpaging_map(vcpu, gpa & PAGE_MASK,
error_code, gfn, prefault);
}
-static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
+static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ gfn_t gfn)
{
struct kvm_arch_async_pf arch;
@@ -4180,11 +4193,13 @@
arch.direct_map = vcpu->arch.mmu->direct_map;
arch.cr3 = vcpu->arch.mmu->get_cr3(vcpu);
- return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
+ return kvm_setup_async_pf(vcpu, cr2_or_gpa,
+ kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
}
static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
- gva_t gva, kvm_pfn_t *pfn, bool write, bool *writable)
+ gpa_t cr2_or_gpa, kvm_pfn_t *pfn, bool write,
+ bool *writable)
{
struct kvm_memory_slot *slot;
bool async;
@@ -4204,12 +4219,12 @@
return false; /* *pfn has correct page already */
if (!prefault && kvm_can_do_async_pf(vcpu)) {
- trace_kvm_try_async_get_page(gva, gfn);
+ trace_kvm_try_async_get_page(cr2_or_gpa, gfn);
if (kvm_find_async_pf_gfn(vcpu, gfn)) {
- trace_kvm_async_pf_doublefault(gva, gfn);
+ trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn);
kvm_make_request(KVM_REQ_APF_HALT, vcpu);
return true;
- } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
+ } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn))
return true;
}
@@ -4222,6 +4237,12 @@
{
int r = 1;
+#ifndef CONFIG_X86_64
+ /* A 64-bit CR2 should be impossible on 32-bit KVM. */
+ if (WARN_ON_ONCE(fault_address >> 32))
+ return -EFAULT;
+#endif
+
vcpu->arch.l1tf_flush_l1d = true;
switch (vcpu->arch.apf.host_apf_reason) {
default:
@@ -4259,7 +4280,7 @@
return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
}
-static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
+static int tdp_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code,
bool prefault)
{
kvm_pfn_t pfn;
@@ -4328,7 +4349,6 @@
context->gva_to_gpa = nonpaging_gva_to_gpa;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
- context->update_pte = nonpaging_update_pte;
context->root_level = 0;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = true;
@@ -4564,7 +4584,7 @@
nonleaf_bit8_rsvd | rsvd_bits(7, 7) |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd |
- nonleaf_bit8_rsvd | gbpages_bit_rsvd |
+ gbpages_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd |
rsvd_bits(maxphyaddr, 51);
@@ -4646,7 +4666,15 @@
void
reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
{
- bool uses_nx = context->nx ||
+ /*
+ * KVM uses NX when TDP is disabled to handle a variety of scenarios,
+ * notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
+ * to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
+ * The iTLB multi-hit workaround can be toggled at any time, so assume
+ * NX can be used by any non-nested shadow MMU to avoid having to reset
+ * MMU contexts. Note, KVM forces EFER.NX=1 when TDP is disabled.
+ */
+ bool uses_nx = context->nx || !tdp_enabled ||
context->mmu_role.base.smep_andnot_wp;
struct rsvd_bits_validate *shadow_zero_check;
int i;
@@ -4907,7 +4935,6 @@
context->gva_to_gpa = paging64_gva_to_gpa;
context->sync_page = paging64_sync_page;
context->invlpg = paging64_invlpg;
- context->update_pte = paging64_update_pte;
context->shadow_root_level = level;
context->direct_map = false;
}
@@ -4936,7 +4963,6 @@
context->gva_to_gpa = paging32_gva_to_gpa;
context->sync_page = paging32_sync_page;
context->invlpg = paging32_invlpg;
- context->update_pte = paging32_update_pte;
context->shadow_root_level = PT32E_ROOT_LEVEL;
context->direct_map = false;
}
@@ -5011,7 +5037,6 @@
context->page_fault = tdp_page_fault;
context->sync_page = nonpaging_sync_page;
context->invlpg = nonpaging_invlpg;
- context->update_pte = nonpaging_update_pte;
context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
context->direct_map = true;
context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
@@ -5144,7 +5169,6 @@
context->gva_to_gpa = ept_gva_to_gpa;
context->sync_page = ept_sync_page;
context->invlpg = ept_invlpg;
- context->update_pte = ept_update_pte;
context->root_level = PT64_ROOT_4LEVEL;
context->direct_map = false;
context->mmu_role.as_u64 = new_role.as_u64;
@@ -5284,19 +5308,6 @@
}
EXPORT_SYMBOL_GPL(kvm_mmu_unload);
-static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
- struct kvm_mmu_page *sp, u64 *spte,
- const void *new)
-{
- if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
- ++vcpu->kvm->stat.mmu_pde_zapped;
- return;
- }
-
- ++vcpu->kvm->stat.mmu_pte_updated;
- vcpu->arch.mmu->update_pte(vcpu, sp, spte, new);
-}
-
static bool need_remote_flush(u64 old, u64 new)
{
if (!is_shadow_present_pte(old))
@@ -5462,14 +5473,10 @@
local_flush = true;
while (npte--) {
- u32 base_role = vcpu->arch.mmu->mmu_role.base.word;
-
entry = *spte;
mmu_page_zap_pte(vcpu->kvm, sp, spte);
- if (gentry &&
- !((sp->role.word ^ base_role)
- & mmu_base_role_mask.word) && rmap_can_add(vcpu))
- mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
+ if (gentry && sp->role.level != PG_LEVEL_4K)
+ ++vcpu->kvm->stat.mmu_pde_zapped;
if (need_remote_flush(entry, *spte))
remote_flush = true;
++spte;
@@ -5516,7 +5523,7 @@
return 0;
}
-int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
+int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
void *insn, int insn_len)
{
int r, emulation_type = 0;
@@ -5525,18 +5532,18 @@
/* With shadow page tables, fault_address contains a GVA or nGPA. */
if (vcpu->arch.mmu->direct_map) {
vcpu->arch.gpa_available = true;
- vcpu->arch.gpa_val = cr2;
+ vcpu->arch.gpa_val = cr2_or_gpa;
}
r = RET_PF_INVALID;
if (unlikely(error_code & PFERR_RSVD_MASK)) {
- r = handle_mmio_page_fault(vcpu, cr2, direct);
+ r = handle_mmio_page_fault(vcpu, cr2_or_gpa, direct);
if (r == RET_PF_EMULATE)
goto emulate;
}
if (r == RET_PF_INVALID) {
- r = vcpu->arch.mmu->page_fault(vcpu, cr2,
+ r = vcpu->arch.mmu->page_fault(vcpu, cr2_or_gpa,
lower_32_bits(error_code),
false);
WARN_ON(r == RET_PF_INVALID);
@@ -5556,7 +5563,7 @@
*/
if (vcpu->arch.mmu->direct_map &&
(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
- kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
+ kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2_or_gpa));
return 1;
}
@@ -5571,7 +5578,7 @@
* explicitly shadowing L1's page tables, i.e. unprotecting something
* for L1 isn't going to magically fix whatever issue cause L2 to fail.
*/
- if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
+ if (!mmio_info_in_cache(vcpu, cr2_or_gpa, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
emulate:
/*
@@ -5586,7 +5593,7 @@
return 1;
}
- return x86_emulate_instruction(vcpu, cr2, emulation_type, insn,
+ return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
insn_len);
}
EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
@@ -6232,25 +6239,16 @@
u64 mask;
/*
- * Set the reserved bits and the present bit of an paging-structure
- * entry to generate page fault with PFER.RSV = 1.
+ * Set a reserved PA bit in MMIO SPTEs to generate page faults with
+ * PFEC.RSVD=1 on MMIO accesses. 64-bit PTEs (PAE, x86-64, and EPT
+ * paging) support a maximum of 52 bits of PA, i.e. if the CPU supports
+ * 52-bit physical addresses then there are no reserved PA bits in the
+ * PTEs and so the reserved PA approach must be disabled.
*/
-
- /*
- * Mask the uppermost physical address bit, which would be reserved as
- * long as the supported physical address width is less than 52.
- */
- mask = 1ull << 51;
-
- /* Set the present bit. */
- mask |= 1ull;
-
- /*
- * If reserved bit is not supported, clear the present bit to disable
- * mmio page fault.
- */
- if (IS_ENABLED(CONFIG_X86_64) && shadow_phys_bits == 52)
- mask &= ~1ull;
+ if (shadow_phys_bits < 52)
+ mask = BIT_ULL(51) | PT_PRESENT_MASK;
+ else
+ mask = 0;
kvm_mmu_set_mmio_spte_mask(mask, mask, ACC_WRITE_MASK | ACC_USER_MASK);
}
@@ -6445,6 +6443,7 @@
cond_resched_lock(&kvm->mmu_lock);
}
}
+ kvm_mmu_commit_zap_page(kvm, &invalid_list);
spin_unlock(&kvm->mmu_lock);
srcu_read_unlock(&kvm->srcu, rcu_idx);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index d55674f..ea9945a 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -48,7 +48,7 @@
if (e < s)
return 0;
- return ((1ULL << (e - s + 1)) - 1) << s;
+ return ((2ULL << (e - s)) - 1) << s;
}
void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value, u64 access_mask);
@@ -209,7 +209,7 @@
void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn);
bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
struct kvm_memory_slot *slot, u64 gfn);
-int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu);
+int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu, gpa_t l2_gpa);
int kvm_mmu_post_init_vm(struct kvm *kvm);
void kvm_mmu_pre_destroy_vm(struct kvm *kvm);
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 7ca8831..ffcd96f 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -249,13 +249,13 @@
TRACE_EVENT(
fast_page_fault,
- TP_PROTO(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
+ TP_PROTO(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u32 error_code,
u64 *sptep, u64 old_spte, bool retry),
- TP_ARGS(vcpu, gva, error_code, sptep, old_spte, retry),
+ TP_ARGS(vcpu, cr2_or_gpa, error_code, sptep, old_spte, retry),
TP_STRUCT__entry(
__field(int, vcpu_id)
- __field(gva_t, gva)
+ __field(gpa_t, cr2_or_gpa)
__field(u32, error_code)
__field(u64 *, sptep)
__field(u64, old_spte)
@@ -265,7 +265,7 @@
TP_fast_assign(
__entry->vcpu_id = vcpu->vcpu_id;
- __entry->gva = gva;
+ __entry->cr2_or_gpa = cr2_or_gpa;
__entry->error_code = error_code;
__entry->sptep = sptep;
__entry->old_spte = old_spte;
@@ -273,9 +273,9 @@
__entry->retry = retry;
),
- TP_printk("vcpu %d gva %lx error_code %s sptep %p old %#llx"
+ TP_printk("vcpu %d gva %llx error_code %s sptep %p old %#llx"
" new %llx spurious %d fixed %d", __entry->vcpu_id,
- __entry->gva, __print_flags(__entry->error_code, "|",
+ __entry->cr2_or_gpa, __print_flags(__entry->error_code, "|",
kvm_mmu_trace_pferr_flags), __entry->sptep,
__entry->old_spte, __entry->new_spte,
__spte_satisfied(old_spte), __spte_satisfied(new_spte)
@@ -339,7 +339,7 @@
/* These depend on page entry type, so compute them now. */
__field(bool, r)
__field(bool, x)
- __field(u8, u)
+ __field(signed char, u)
),
TP_fast_assign(
diff --git a/arch/x86/kvm/mtrr.c b/arch/x86/kvm/mtrr.c
index 25ce3ed..7f0059a 100644
--- a/arch/x86/kvm/mtrr.c
+++ b/arch/x86/kvm/mtrr.c
@@ -192,11 +192,15 @@
break;
case MSR_MTRRfix16K_80000 ... MSR_MTRRfix16K_A0000:
*seg = 1;
- *unit = msr - MSR_MTRRfix16K_80000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix16K_80000,
+ MSR_MTRRfix16K_A0000 - MSR_MTRRfix16K_80000 + 1);
break;
case MSR_MTRRfix4K_C0000 ... MSR_MTRRfix4K_F8000:
*seg = 2;
- *unit = msr - MSR_MTRRfix4K_C0000;
+ *unit = array_index_nospec(
+ msr - MSR_MTRRfix4K_C0000,
+ MSR_MTRRfix4K_F8000 - MSR_MTRRfix4K_C0000 + 1);
break;
default:
return false;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 97b21e7..d4a8ad6 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -33,7 +33,7 @@
#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
#define PT_HAVE_ACCESSED_DIRTY(mmu) true
#ifdef CONFIG_X86_64
- #define PT_MAX_FULL_LEVELS 4
+ #define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
#define CMPXCHG cmpxchg
#else
#define CMPXCHG cmpxchg64
@@ -90,8 +90,8 @@
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
- unsigned pt_access;
- unsigned pte_access;
+ unsigned int pt_access[PT_MAX_FULL_LEVELS];
+ unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
@@ -220,7 +220,7 @@
static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
struct kvm_mmu *mmu,
struct guest_walker *walker,
- int write_fault)
+ gpa_t addr, int write_fault)
{
unsigned level, index;
pt_element_t pte, orig_pte;
@@ -245,7 +245,7 @@
!(pte & PT_GUEST_DIRTY_MASK)) {
trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
#if PTTYPE == PTTYPE_EPT
- if (kvm_arch_write_log_dirty(vcpu))
+ if (kvm_arch_write_log_dirty(vcpu, addr))
return -EINVAL;
#endif
pte |= PT_GUEST_DIRTY_MASK;
@@ -291,11 +291,11 @@
}
/*
- * Fetch a guest pte for a guest virtual address
+ * Fetch a guest pte for a guest virtual address, or for an L2's GPA.
*/
static int FNAME(walk_addr_generic)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
- gva_t addr, u32 access)
+ gpa_t addr, u32 access)
{
int ret;
pt_element_t pte;
@@ -406,13 +406,15 @@
}
walker->ptes[walker->level - 1] = pte;
+
+ /* Convert to ACC_*_MASK flags for struct guest_walker. */
+ walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
- walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
@@ -442,7 +444,8 @@
(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
if (unlikely(!accessed_dirty)) {
- ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker, write_fault);
+ ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
+ addr, write_fault);
if (unlikely(ret < 0))
goto error;
else if (ret)
@@ -450,7 +453,8 @@
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
- __func__, (u64)pte, walker->pte_access, walker->pt_access);
+ __func__, (u64)pte, walker->pte_access,
+ walker->pt_access[walker->level - 1]);
return 1;
error:
@@ -496,7 +500,7 @@
}
static int FNAME(walk_addr)(struct guest_walker *walker,
- struct kvm_vcpu *vcpu, gva_t addr, u32 access)
+ struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
{
return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
access);
@@ -611,7 +615,7 @@
* If the guest tries to write a write-protected page, we need to
* emulate this operation, return 1 to indicate this case.
*/
-static int FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
+static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
struct guest_walker *gw,
int write_fault, int hlevel,
kvm_pfn_t pfn, bool map_writable, bool prefault,
@@ -619,7 +623,7 @@
{
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
- unsigned direct_access, access = gw->pt_access;
+ unsigned int direct_access, access;
int top_level, ret;
gfn_t gfn, base_gfn;
@@ -651,6 +655,7 @@
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
+ access = gw->pt_access[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
false, access);
}
@@ -765,7 +770,7 @@
* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
* a negative value on error.
*/
-static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
+static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
bool prefault)
{
int write_fault = error_code & PFERR_WRITE_MASK;
@@ -945,18 +950,19 @@
spin_unlock(&vcpu->kvm->mmu_lock);
}
-static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
+/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
+static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
struct x86_exception *exception)
{
struct guest_walker walker;
gpa_t gpa = UNMAPPED_GVA;
int r;
- r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
+ r = FNAME(walk_addr)(&walker, vcpu, addr, access);
if (r) {
gpa = gfn_to_gpa(walker.gfn);
- gpa |= vaddr & ~PAGE_MASK;
+ gpa |= addr & ~PAGE_MASK;
} else if (exception)
*exception = walker.fault;
@@ -964,7 +970,8 @@
}
#if PTTYPE != PTTYPE_EPT
-static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
+/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
+static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
u32 access,
struct x86_exception *exception)
{
@@ -972,6 +979,11 @@
gpa_t gpa = UNMAPPED_GVA;
int r;
+#ifndef CONFIG_X86_64
+ /* A 64-bit GVA should be impossible on 32-bit KVM. */
+ WARN_ON_ONCE(vaddr >> 32);
+#endif
+
r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
if (r) {
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index 58265f7..3fc98af 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -2,6 +2,8 @@
#ifndef __KVM_X86_PMU_H
#define __KVM_X86_PMU_H
+#include <linux/nospec.h>
+
#define vcpu_to_pmu(vcpu) (&(vcpu)->arch.pmu)
#define pmu_to_vcpu(pmu) (container_of((pmu), struct kvm_vcpu, arch.pmu))
#define pmc_to_pmu(pmc) (&(pmc)->vcpu->arch.pmu)
@@ -86,8 +88,12 @@
static inline struct kvm_pmc *get_gp_pmc(struct kvm_pmu *pmu, u32 msr,
u32 base)
{
- if (msr >= base && msr < base + pmu->nr_arch_gp_counters)
- return &pmu->gp_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_gp_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_gp_counters);
+
+ return &pmu->gp_counters[index];
+ }
return NULL;
}
@@ -97,8 +103,12 @@
{
int base = MSR_CORE_PERF_FIXED_CTR0;
- if (msr >= base && msr < base + pmu->nr_arch_fixed_counters)
- return &pmu->fixed_counters[msr - base];
+ if (msr >= base && msr < base + pmu->nr_arch_fixed_counters) {
+ u32 index = array_index_nospec(msr - base,
+ pmu->nr_arch_fixed_counters);
+
+ return &pmu->fixed_counters[index];
+ }
return NULL;
}
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c5673bd..425444d 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -516,6 +516,9 @@
c->intercept_dr = h->intercept_dr | g->intercept_dr;
c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
c->intercept = h->intercept | g->intercept;
+
+ c->intercept |= (1ULL << INTERCEPT_VMLOAD);
+ c->intercept |= (1ULL << INTERCEPT_VMSAVE);
}
static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
@@ -787,9 +790,6 @@
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
return 0;
} else {
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- pr_err("%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
kvm_rip_write(vcpu, svm->next_rip);
}
svm_set_interrupt_shadow(vcpu, 0);
@@ -892,6 +892,11 @@
return 0;
}
+ if (sev_active()) {
+ pr_info("KVM is unsupported when running as an SEV guest\n");
+ return 0;
+ }
+
return 1;
}
@@ -998,33 +1003,32 @@
static int svm_cpu_init(int cpu)
{
struct svm_cpu_data *sd;
- int r;
sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
if (!sd)
return -ENOMEM;
sd->cpu = cpu;
- r = -ENOMEM;
sd->save_area = alloc_page(GFP_KERNEL);
if (!sd->save_area)
- goto err_1;
+ goto free_cpu_data;
if (svm_sev_enabled()) {
- r = -ENOMEM;
sd->sev_vmcbs = kmalloc_array(max_sev_asid + 1,
sizeof(void *),
GFP_KERNEL);
if (!sd->sev_vmcbs)
- goto err_1;
+ goto free_save_area;
}
per_cpu(svm_data, cpu) = sd;
return 0;
-err_1:
+free_save_area:
+ __free_page(sd->save_area);
+free_cpu_data:
kfree(sd);
- return r;
+ return -ENOMEM;
}
@@ -1298,6 +1302,47 @@
}
}
+/*
+ * The default MMIO mask is a single bit (excluding the present bit),
+ * which could conflict with the memory encryption bit. Check for
+ * memory encryption support and override the default MMIO mask if
+ * memory encryption is enabled.
+ */
+static __init void svm_adjust_mmio_mask(void)
+{
+ unsigned int enc_bit, mask_bit;
+ u64 msr, mask;
+
+ /* If there is no memory encryption support, use existing mask */
+ if (cpuid_eax(0x80000000) < 0x8000001f)
+ return;
+
+ /* If memory encryption is not enabled, use existing mask */
+ rdmsrl(MSR_K8_SYSCFG, msr);
+ if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+ return;
+
+ enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
+ mask_bit = boot_cpu_data.x86_phys_bits;
+
+ /* Increment the mask bit if it is the same as the encryption bit */
+ if (enc_bit == mask_bit)
+ mask_bit++;
+
+ /*
+ * If the mask bit location is below 52, then some bits above the
+ * physical addressing limit will always be reserved, so use the
+ * rsvd_bits() function to generate the mask. This mask, along with
+ * the present bit, will be used to generate a page fault with
+ * PFER.RSV = 1.
+ *
+ * If the mask bit location is 52 (or above), then clear the mask.
+ */
+ mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0;
+
+ kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK);
+}
+
static __init int svm_hardware_setup(void)
{
int cpu;
@@ -1352,6 +1397,8 @@
}
}
+ svm_adjust_mmio_mask();
+
for_each_possible_cpu(cpu) {
r = svm_cpu_init(cpu);
if (r)
@@ -1399,12 +1446,7 @@
}
}
- if (vgif) {
- if (!boot_cpu_has(X86_FEATURE_VGIF))
- vgif = false;
- else
- pr_info("Virtual GIF supported\n");
- }
+ vgif = false; /* Disabled for CVE-2021-3653 */
return 0;
@@ -1739,7 +1781,7 @@
for_each_possible_cpu(cpu) {
sd = per_cpu(svm_data, cpu);
- sd->sev_vmcbs[pos] = NULL;
+ sd->sev_vmcbs[asid] = NULL;
}
}
@@ -1750,9 +1792,25 @@
__sev_asid_free(sev->asid);
}
-static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+static void sev_decommission(unsigned int handle)
{
struct sev_data_decommission *decommission;
+
+ if (!handle)
+ return;
+
+ decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
+ if (!decommission)
+ return;
+
+ decommission->handle = handle;
+ sev_guest_decommission(decommission, NULL);
+
+ kfree(decommission);
+}
+
+static void sev_unbind_asid(struct kvm *kvm, unsigned int handle)
+{
struct sev_data_deactivate *data;
if (!handle)
@@ -1770,15 +1828,7 @@
sev_guest_df_flush(NULL);
kfree(data);
- decommission = kzalloc(sizeof(*decommission), GFP_KERNEL);
- if (!decommission)
- return;
-
- /* decommission handle */
- decommission->handle = handle;
- sev_guest_decommission(decommission, NULL);
-
- kfree(decommission);
+ sev_decommission(handle);
}
static struct page **sev_pin_memory(struct kvm *kvm, unsigned long uaddr,
@@ -1791,6 +1841,8 @@
struct page **pages;
unsigned long first, last;
+ lockdep_assert_held(&kvm->lock);
+
if (ulen == 0 || uaddr + ulen < uaddr)
return NULL;
@@ -1818,7 +1870,7 @@
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
+ npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
@@ -1883,6 +1935,10 @@
struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
GFP_KERNEL_ACCOUNT | __GFP_ZERO,
PAGE_KERNEL);
+
+ if (!kvm_svm)
+ return NULL;
+
return &kvm_svm->kvm;
}
@@ -1910,6 +1966,7 @@
list_for_each_safe(pos, q, head) {
__unregister_enc_region_locked(kvm,
list_entry(pos, struct enc_region, list));
+ cond_resched();
}
}
@@ -3191,8 +3248,8 @@
return NESTED_EXIT_HOST;
break;
case SVM_EXIT_EXCP_BASE + PF_VECTOR:
- /* When we're shadowing, trap PFs, but not async PF */
- if (!npt_enabled && svm->vcpu.arch.apf.host_apf_reason == 0)
+ /* Trap async PF even if not shadowing */
+ if (!npt_enabled || svm->vcpu.arch.apf.host_apf_reason)
return NESTED_EXIT_HOST;
break;
default:
@@ -3281,7 +3338,7 @@
dst->iopm_base_pa = from->iopm_base_pa;
dst->msrpm_base_pa = from->msrpm_base_pa;
dst->tsc_offset = from->tsc_offset;
- dst->asid = from->asid;
+ /* asid not copied, it is handled manually for svm->vmcb. */
dst->tlb_ctl = from->tlb_ctl;
dst->int_ctl = from->int_ctl;
dst->int_vector = from->int_vector;
@@ -3548,7 +3605,13 @@
svm->nested.intercept = nested_vmcb->control.intercept;
svm_flush_tlb(&svm->vcpu, true);
- svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
+
+ svm->vmcb->control.int_ctl &=
+ V_INTR_MASKING_MASK | V_GIF_ENABLE_MASK | V_GIF_MASK;
+
+ svm->vmcb->control.int_ctl |= nested_vmcb->control.int_ctl &
+ (V_TPR_MASK | V_IRQ_INJECTION_BITS_MASK);
+
if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
svm->vcpu.arch.hflags |= HF_VINTR_MASK;
else
@@ -3924,6 +3987,12 @@
return 1;
}
+static int invd_interception(struct vcpu_svm *svm)
+{
+ /* Treat an INVD instruction as a NOP and just skip it. */
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+}
+
static int invlpg_interception(struct vcpu_svm *svm)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
@@ -4001,7 +4070,7 @@
err = 0;
if (cr >= 16) { /* mov to cr */
cr -= 16;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_readl(&svm->vcpu, reg);
switch (cr) {
case 0:
if (!check_selective_cr0_intercepted(svm, val))
@@ -4046,7 +4115,7 @@
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
}
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_writel(&svm->vcpu, reg, val);
}
return kvm_complete_insn_gp(&svm->vcpu, err);
}
@@ -4076,13 +4145,13 @@
if (dr >= 16) { /* mov to DRn */
if (!kvm_require_dr(&svm->vcpu, dr - 16))
return 1;
- val = kvm_register_read(&svm->vcpu, reg);
+ val = kvm_register_readl(&svm->vcpu, reg);
kvm_set_dr(&svm->vcpu, dr - 16, val);
} else {
if (!kvm_require_dr(&svm->vcpu, dr))
return 1;
kvm_get_dr(&svm->vcpu, dr, &val);
- kvm_register_write(&svm->vcpu, reg, val);
+ kvm_register_writel(&svm->vcpu, reg, val);
}
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -4184,8 +4253,7 @@
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = svm->spec_ctrl;
@@ -4269,16 +4337,13 @@
break;
case MSR_IA32_SPEC_CTRL:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
svm->spec_ctrl = data;
-
if (!data)
break;
@@ -4297,18 +4362,17 @@
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ return 1;
if (!data)
break;
wrmsrl(MSR_IA32_PRED_CMD, PRED_CMD_IBPB);
- if (is_guest_mode(vcpu))
- break;
set_msr_interception(svm->msrpm, MSR_IA32_PRED_CMD, 0, 1);
break;
case MSR_AMD64_VIRT_SPEC_CTRL:
@@ -4776,7 +4840,7 @@
[SVM_EXIT_RDPMC] = rdpmc_interception,
[SVM_EXIT_CPUID] = cpuid_interception,
[SVM_EXIT_IRET] = iret_interception,
- [SVM_EXIT_INVD] = emulate_on_interception,
+ [SVM_EXIT_INVD] = invd_interception,
[SVM_EXIT_PAUSE] = pause_interception,
[SVM_EXIT_HLT] = halt_interception,
[SVM_EXIT_INVLPG] = invlpg_interception,
@@ -5141,8 +5205,11 @@
return;
}
-static void svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
+static int svm_deliver_avic_intr(struct kvm_vcpu *vcpu, int vec)
{
+ if (!vcpu->arch.apicv_active)
+ return -1;
+
kvm_lapic_set_irr(vec, vcpu->arch.apic);
smp_mb__after_atomic();
@@ -5154,6 +5221,8 @@
put_cpu();
} else
kvm_vcpu_wake_up(vcpu);
+
+ return 0;
}
static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
@@ -5329,6 +5398,7 @@
* - Tell IOMMU to use legacy mode for this interrupt.
* - Retrieve ga_tag of prior interrupt remapping data.
*/
+ pi.prev_ga_tag = 0;
pi.is_guest_mode = false;
ret = irq_set_vcpu_affinity(host_irq, &pi);
@@ -5986,6 +6056,11 @@
return true;
}
+static bool svm_pku_supported(void)
+{
+ return false;
+}
+
#define PRE_EX(exit) { .exit_code = (exit), \
.stage = X86_ICPT_PRE_EXCEPT, }
#define POST_EX(exit) { .exit_code = (exit), \
@@ -6413,8 +6488,10 @@
/* Bind ASID to this guest */
ret = sev_bind_asid(kvm, start->handle, error);
- if (ret)
+ if (ret) {
+ sev_decommission(start->handle);
goto e_free_session;
+ }
/* return handle to userspace */
params.handle = start->handle;
@@ -7031,12 +7108,20 @@
if (!region)
return -ENOMEM;
+ mutex_lock(&kvm->lock);
region->pages = sev_pin_memory(kvm, range->addr, range->size, ®ion->npages, 1);
if (!region->pages) {
ret = -ENOMEM;
+ mutex_unlock(&kvm->lock);
goto e_free;
}
+ region->uaddr = range->addr;
+ region->size = range->size;
+
+ list_add_tail(®ion->list, &sev->regions_list);
+ mutex_unlock(&kvm->lock);
+
/*
* The guest may change the memory encryption attribute from C=0 -> C=1
* or vice versa for this memory range. Lets make sure caches are
@@ -7045,13 +7130,6 @@
*/
sev_clflush_pages(region->pages, region->npages);
- region->uaddr = range->addr;
- region->size = range->size;
-
- mutex_lock(&kvm->lock);
- list_add_tail(®ion->list, &sev->regions_list);
- mutex_unlock(&kvm->lock);
-
return ret;
e_free:
@@ -7278,6 +7356,7 @@
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
.pt_supported = svm_pt_supported,
+ .pku_supported = svm_pku_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 7c741a0..5fed9c4 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1483,16 +1483,16 @@
TP_ARGS(msg, err),
TP_STRUCT__entry(
- __field(const char *, msg)
+ __string(msg, msg)
__field(u32, err)
),
TP_fast_assign(
- __entry->msg = msg;
+ __assign_str(msg, msg);
__entry->err = err;
),
- TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
+ TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
);
diff --git a/arch/x86/kvm/vmx/capabilities.h b/arch/x86/kvm/vmx/capabilities.h
index 7aa6971..f486e26 100644
--- a/arch/x86/kvm/vmx/capabilities.h
+++ b/arch/x86/kvm/vmx/capabilities.h
@@ -12,6 +12,7 @@
extern bool __read_mostly enable_unrestricted_guest;
extern bool __read_mostly enable_ept_ad_bits;
extern bool __read_mostly enable_pml;
+extern bool __read_mostly enable_apicv;
extern int __read_mostly pt_mode;
#define PT_MODE_SYSTEM 0
@@ -145,6 +146,11 @@
SECONDARY_EXEC_DESC;
}
+static inline bool vmx_pku_supported(void)
+{
+ return boot_cpu_has(X86_FEATURE_PKU);
+}
+
static inline bool cpu_has_vmx_rdtscp(void)
{
return vmcs_config.cpu_based_2nd_exec_ctrl &
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 0e7c930..3041015 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -223,7 +223,7 @@
return;
kvm_vcpu_unmap(vcpu, &vmx->nested.hv_evmcs_map, true);
- vmx->nested.hv_evmcs_vmptr = -1ull;
+ vmx->nested.hv_evmcs_vmptr = 0;
vmx->nested.hv_evmcs = NULL;
}
@@ -302,7 +302,7 @@
cpu = get_cpu();
prev = vmx->loaded_vmcs;
vmx->loaded_vmcs = vmcs;
- vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, prev);
vmx_sync_vmcs_host_state(vmx, prev);
put_cpu();
@@ -1828,7 +1828,8 @@
if (!nested_enlightened_vmentry(vcpu, &evmcs_gpa))
return 1;
- if (unlikely(evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
+ if (unlikely(!vmx->nested.hv_evmcs ||
+ evmcs_gpa != vmx->nested.hv_evmcs_vmptr)) {
if (!vmx->nested.hv_evmcs)
vmx->nested.current_vmptr = -1ull;
@@ -2056,12 +2057,11 @@
~PIN_BASED_VMX_PREEMPTION_TIMER);
/* Posted interrupts setting is only taken from vmcs12. */
- if (nested_cpu_has_posted_intr(vmcs12)) {
+ vmx->nested.pi_pending = false;
+ if (nested_cpu_has_posted_intr(vmcs12))
vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv;
- vmx->nested.pi_pending = false;
- } else {
+ else
exec_control &= ~PIN_BASED_POSTED_INTR;
- }
pin_controls_set(vmx, exec_control);
/*
@@ -2230,6 +2230,8 @@
vmcs_writel(GUEST_TR_BASE, vmcs12->guest_tr_base);
vmcs_writel(GUEST_GDTR_BASE, vmcs12->guest_gdtr_base);
vmcs_writel(GUEST_IDTR_BASE, vmcs12->guest_idtr_base);
+
+ vmx->segment_cache.bitmask = 0;
}
if (!hv_evmcs || !(hv_evmcs->hv_clean_fields &
@@ -2418,6 +2420,16 @@
entry_failure_code))
return -EINVAL;
+ /*
+ * Immediately write vmcs02.GUEST_CR3. It will be propagated to vmcs12
+ * on nested VM-Exit, which can occur without actually running L2 and
+ * thus without hitting vmx_set_cr3(), e.g. if L1 is entering L2 with
+ * vmcs12.GUEST_ACTIVITYSTATE=HLT, in which case KVM will intercept the
+ * transition to HLT instead of running L2.
+ */
+ if (enable_ept)
+ vmcs_writel(GUEST_CR3, vmcs12->guest_cr3);
+
/* Late preparation of GUEST_PDPTRs now that EFER and CRs are set. */
if (load_guest_pdptrs_vmcs12 && nested_cpu_has_ept(vmcs12) &&
is_pae_paging(vcpu)) {
@@ -3083,8 +3095,10 @@
prepare_vmcs02_early(vmx, vmcs12);
if (from_vmentry) {
- if (unlikely(!nested_get_vmcs12_pages(vcpu)))
+ if (unlikely(!nested_get_vmcs12_pages(vcpu))) {
+ vmx_switch_vmcs(vcpu, &vmx->vmcs01);
return NVMX_VMENTRY_KVM_INTERNAL_ERROR;
+ }
if (nested_vmx_check_vmentry_hw(vcpu)) {
vmx_switch_vmcs(vcpu, &vmx->vmcs01);
@@ -3449,7 +3463,7 @@
nested_vmx_vmexit(vcpu, EXIT_REASON_EXCEPTION_NMI, intr_info, exit_qual);
}
-static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr)
+static int vmx_check_nested_events(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qual;
@@ -3496,8 +3510,7 @@
return 0;
}
- if ((kvm_cpu_has_interrupt(vcpu) || external_intr) &&
- nested_exit_on_intr(vcpu)) {
+ if (kvm_cpu_has_interrupt(vcpu) && nested_exit_on_intr(vcpu)) {
if (block_nested_events)
return -EBUSY;
nested_vmx_vmexit(vcpu, EXIT_REASON_EXTERNAL_INTERRUPT, 0, 0);
@@ -4147,17 +4160,8 @@
vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
if (likely(!vmx->fail)) {
- /*
- * TODO: SDM says that with acknowledge interrupt on
- * exit, bit 31 of the VM-exit interrupt information
- * (valid interrupt) is always set to 1 on
- * EXIT_REASON_EXTERNAL_INTERRUPT, so we shouldn't
- * need kvm_cpu_has_interrupt(). See the commit
- * message for details.
- */
- if (nested_exit_intr_ack_set(vcpu) &&
- exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
- kvm_cpu_has_interrupt(vcpu)) {
+ if (exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT &&
+ nested_exit_intr_ack_set(vcpu)) {
int irq = kvm_cpu_get_interrupt(vcpu);
WARN_ON(irq < 0);
vmcs12->vm_exit_intr_info = irq |
@@ -4599,32 +4603,28 @@
{
unsigned long field;
u64 field_value;
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
int len;
gva_t gva = 0;
- struct vmcs12 *vmcs12;
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
struct x86_exception e;
short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (to_vmx(vcpu)->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMREAD sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
- if (!is_guest_mode(vcpu))
- vmcs12 = get_vmcs12(vcpu);
- else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMREAD
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
/* Decode instruction info and find the field to read */
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
@@ -4653,8 +4653,10 @@
vmx_instruction_info, true, len, &gva))
return 1;
/* _system ok, nested_vmx_check_permission has verified cpl=0 */
- if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e))
+ if (kvm_write_guest_virt_system(vcpu, gva, &field_value, len, &e)) {
kvm_inject_page_fault(vcpu, &e);
+ return 1;
+ }
}
return nested_vmx_succeed(vcpu);
@@ -4701,13 +4703,20 @@
*/
u64 field_value = 0;
struct x86_exception e;
- struct vmcs12 *vmcs12;
+ struct vmcs12 *vmcs12 = is_guest_mode(vcpu) ? get_shadow_vmcs12(vcpu)
+ : get_vmcs12(vcpu);
short offset;
if (!nested_vmx_check_permission(vcpu))
return 1;
- if (vmx->nested.current_vmptr == -1ull)
+ /*
+ * In VMX non-root operation, when the VMCS-link pointer is -1ull,
+ * any VMWRITE sets the ALU flags for VMfailInvalid.
+ */
+ if (vmx->nested.current_vmptr == -1ull ||
+ (is_guest_mode(vcpu) &&
+ get_vmcs12(vcpu)->vmcs_link_pointer == -1ull))
return nested_vmx_failInvalid(vcpu);
if (vmx_instruction_info & (1u << 10))
@@ -4726,6 +4735,12 @@
field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+
+ offset = vmcs_field_to_offset(field);
+ if (offset < 0)
+ return nested_vmx_failValid(vcpu,
+ VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+
/*
* If the vCPU supports "VMWRITE to any supported field in the
* VMCS," then the "read-only" fields are actually read/write.
@@ -4735,29 +4750,12 @@
return nested_vmx_failValid(vcpu,
VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT);
- if (!is_guest_mode(vcpu)) {
- vmcs12 = get_vmcs12(vcpu);
-
- /*
- * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
- * vmcs12, else we may crush a field or consume a stale value.
- */
- if (!is_shadow_field_rw(field))
- copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- } else {
- /*
- * When vmcs->vmcs_link_pointer is -1ull, any VMWRITE
- * to shadowed-field sets the ALU flags for VMfailInvalid.
- */
- if (get_vmcs12(vcpu)->vmcs_link_pointer == -1ull)
- return nested_vmx_failInvalid(vcpu);
- vmcs12 = get_shadow_vmcs12(vcpu);
- }
-
- offset = vmcs_field_to_offset(field);
- if (offset < 0)
- return nested_vmx_failValid(vcpu,
- VMXERR_UNSUPPORTED_VMCS_COMPONENT);
+ /*
+ * Ensure vmcs12 is up-to-date before any VMWRITE that dirties
+ * vmcs12, else we may crush a field or consume a stale value.
+ */
+ if (!is_guest_mode(vcpu) && !is_shadow_field_rw(field))
+ copy_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
/*
* Some Intel CPUs intentionally drop the reserved bits of the AR byte
@@ -5100,7 +5098,7 @@
}
vmcs12 = get_vmcs12(vcpu);
- if ((vmcs12->vm_function_control & (1 << function)) == 0)
+ if (!(vmcs12->vm_function_control & BIT_ULL(function)))
goto fail;
switch (function) {
@@ -5120,24 +5118,17 @@
return 1;
}
-
-static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
- struct vmcs12 *vmcs12)
+/*
+ * Return true if an IO instruction with the specified port and size should cause
+ * a VM-exit into L1.
+ */
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size)
{
- unsigned long exit_qualification;
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
gpa_t bitmap, last_bitmap;
- unsigned int port;
- int size;
u8 b;
- if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
- return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
-
- exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
-
- port = exit_qualification >> 16;
- size = (exit_qualification & 7) + 1;
-
last_bitmap = (gpa_t)-1;
b = -1;
@@ -5164,6 +5155,24 @@
return false;
}
+static bool nested_vmx_exit_handled_io(struct kvm_vcpu *vcpu,
+ struct vmcs12 *vmcs12)
+{
+ unsigned long exit_qualification;
+ unsigned short port;
+ int size;
+
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ return nested_cpu_has(vmcs12, CPU_BASED_UNCOND_IO_EXITING);
+
+ exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
+
+ port = exit_qualification >> 16;
+ size = (exit_qualification & 7) + 1;
+
+ return nested_vmx_check_io_bitmaps(vcpu, port, size);
+}
+
/*
* Return 1 if we should exit from L2 to L1 to handle an MSR access access,
* rather than handle it ourselves in L0. I.e., check whether L1 expressed
@@ -5298,7 +5307,7 @@
/* Decode instruction info and find the field to access */
vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
- field = kvm_register_read(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
+ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf));
/* Out-of-range fields always cause a VM exit from L2 to L1 */
if (field >> 15)
@@ -5351,7 +5360,7 @@
vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
KVM_ISA_VMX);
- switch (exit_reason) {
+ switch ((u16)exit_reason) {
case EXIT_REASON_EXCEPTION_NMI:
if (is_nmi(intr_info))
return false;
@@ -5569,11 +5578,14 @@
if (is_guest_mode(vcpu)) {
sync_vmcs02_to_vmcs12(vcpu, vmcs12);
sync_vmcs02_to_vmcs12_rare(vcpu, vmcs12);
- } else if (!vmx->nested.need_vmcs12_to_shadow_sync) {
- if (vmx->nested.hv_evmcs)
- copy_enlightened_to_vmcs12(vmx);
- else if (enable_shadow_vmcs)
- copy_shadow_to_vmcs12(vmx);
+ } else {
+ copy_vmcs02_to_vmcs12_rare(vcpu, get_vmcs12(vcpu));
+ if (!vmx->nested.need_vmcs12_to_shadow_sync) {
+ if (vmx->nested.hv_evmcs)
+ copy_enlightened_to_vmcs12(vmx);
+ else if (enable_shadow_vmcs)
+ copy_shadow_to_vmcs12(vmx);
+ }
}
BUILD_BUG_ON(sizeof(user_vmx_nested_state->vmcs12) < VMCS12_SIZE);
@@ -5784,8 +5796,7 @@
* bit in the high half is on if the corresponding bit in the control field
* may be on. See also vmx_control_verify().
*/
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
- bool apicv)
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps)
{
/*
* Note that as a general rule, the high half of the MSRs (bits in
@@ -5812,7 +5823,7 @@
PIN_BASED_EXT_INTR_MASK |
PIN_BASED_NMI_EXITING |
PIN_BASED_VIRTUAL_NMIS |
- (apicv ? PIN_BASED_POSTED_INTR : 0);
+ (enable_apicv ? PIN_BASED_POSTED_INTR : 0);
msrs->pinbased_ctls_high |=
PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
PIN_BASED_VMX_PREEMPTION_TIMER;
diff --git a/arch/x86/kvm/vmx/nested.h b/arch/x86/kvm/vmx/nested.h
index 6280f33..b8521c4 100644
--- a/arch/x86/kvm/vmx/nested.h
+++ b/arch/x86/kvm/vmx/nested.h
@@ -17,8 +17,7 @@
};
void vmx_leave_nested(struct kvm_vcpu *vcpu);
-void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps,
- bool apicv);
+void nested_vmx_setup_ctls_msrs(struct nested_vmx_msrs *msrs, u32 ept_caps);
void nested_vmx_hardware_unsetup(void);
__init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *));
void nested_vmx_vcpu_setup(void);
@@ -33,6 +32,8 @@
int vmx_get_vmx_msr(struct nested_vmx_msrs *msrs, u32 msr_index, u64 *pdata);
int get_vmx_mem_address(struct kvm_vcpu *vcpu, unsigned long exit_qualification,
u32 vmx_instruction_info, bool wr, int len, gva_t *ret);
+bool nested_vmx_check_io_bitmaps(struct kvm_vcpu *vcpu, unsigned int port,
+ int size);
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
diff --git a/arch/x86/kvm/vmx/ops.h b/arch/x86/kvm/vmx/ops.h
index 45eaede..19717d0 100644
--- a/arch/x86/kvm/vmx/ops.h
+++ b/arch/x86/kvm/vmx/ops.h
@@ -13,6 +13,8 @@
#define __ex(x) __kvm_handle_fault_on_reboot(x)
asmlinkage void vmread_error(unsigned long field, bool fault);
+__attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field,
+ bool fault);
void vmwrite_error(unsigned long field, unsigned long value);
void vmclear_error(struct vmcs *vmcs, u64 phys_addr);
void vmptrld_error(struct vmcs *vmcs, u64 phys_addr);
@@ -70,15 +72,28 @@
asm volatile("1: vmread %2, %1\n\t"
".byte 0x3e\n\t" /* branch taken hint */
"ja 3f\n\t"
- "mov %2, %%" _ASM_ARG1 "\n\t"
- "xor %%" _ASM_ARG2 ", %%" _ASM_ARG2 "\n\t"
- "2: call vmread_error\n\t"
- "xor %k1, %k1\n\t"
+
+ /*
+ * VMREAD failed. Push '0' for @fault, push the failing
+ * @field, and bounce through the trampoline to preserve
+ * volatile registers.
+ */
+ "push $0\n\t"
+ "push %2\n\t"
+ "2:call vmread_error_trampoline\n\t"
+
+ /*
+ * Unwind the stack. Note, the trampoline zeros out the
+ * memory for @fault so that the result is '0' on error.
+ */
+ "pop %2\n\t"
+ "pop %1\n\t"
"3:\n\t"
+ /* VMREAD faulted. As above, except push '1' for @fault. */
".pushsection .fixup, \"ax\"\n\t"
- "4: mov %2, %%" _ASM_ARG1 "\n\t"
- "mov $1, %%" _ASM_ARG2 "\n\t"
+ "4: push $1\n\t"
+ "push %2\n\t"
"jmp 2b\n\t"
".popsection\n\t"
_ASM_EXTABLE(1b, 4b)
diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c
index 3e9c059..181e352 100644
--- a/arch/x86/kvm/vmx/pmu_intel.c
+++ b/arch/x86/kvm/vmx/pmu_intel.c
@@ -26,7 +26,7 @@
[4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES },
[5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS },
[6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES },
- [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES },
+ [7] = { 0x00, 0x03, PERF_COUNT_HW_REF_CPU_CYCLES },
};
/* mapping between fixed pmc index and intel_arch_events array */
@@ -84,10 +84,14 @@
static unsigned intel_find_fixed_event(int idx)
{
- if (idx >= ARRAY_SIZE(fixed_pmc_events))
+ u32 event;
+ size_t size = ARRAY_SIZE(fixed_pmc_events);
+
+ if (idx >= size)
return PERF_COUNT_HW_MAX;
- return intel_arch_events[fixed_pmc_events[idx]].event_type;
+ event = fixed_pmc_events[array_index_nospec(idx, size)];
+ return intel_arch_events[event].event_type;
}
/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
@@ -128,16 +132,20 @@
struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
bool fixed = idx & (1u << 30);
struct kvm_pmc *counters;
+ unsigned int num_counters;
idx &= ~(3u << 30);
- if (!fixed && idx >= pmu->nr_arch_gp_counters)
+ if (fixed) {
+ counters = pmu->fixed_counters;
+ num_counters = pmu->nr_arch_fixed_counters;
+ } else {
+ counters = pmu->gp_counters;
+ num_counters = pmu->nr_arch_gp_counters;
+ }
+ if (idx >= num_counters)
return NULL;
- if (fixed && idx >= pmu->nr_arch_fixed_counters)
- return NULL;
- counters = fixed ? pmu->fixed_counters : pmu->gp_counters;
*mask &= pmu->counter_bitmask[fixed ? KVM_PMC_FIXED : KVM_PMC_GP];
-
- return &counters[idx];
+ return &counters[array_index_nospec(idx, num_counters)];
}
static bool intel_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
@@ -288,7 +296,9 @@
pmu->nr_arch_gp_counters = min_t(int, eax.split.num_counters,
x86_pmu.num_counters_gp);
+ eax.split.bit_width = min_t(int, eax.split.bit_width, x86_pmu.bit_width_gp);
pmu->counter_bitmask[KVM_PMC_GP] = ((u64)1 << eax.split.bit_width) - 1;
+ eax.split.mask_length = min_t(int, eax.split.mask_length, x86_pmu.events_mask_len);
pmu->available_event_types = ~entry->ebx &
((1ull << eax.split.mask_length) - 1);
@@ -298,6 +308,8 @@
pmu->nr_arch_fixed_counters =
min_t(int, edx.split.num_counters_fixed,
x86_pmu.num_counters_fixed);
+ edx.split.bit_width_fixed = min_t(int,
+ edx.split.bit_width_fixed, x86_pmu.bit_width_fixed);
pmu->counter_bitmask[KVM_PMC_FIXED] =
((u64)1 << edx.split.bit_width_fixed) - 1;
}
diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S
index 751a384..ca4252f 100644
--- a/arch/x86/kvm/vmx/vmenter.S
+++ b/arch/x86/kvm/vmx/vmenter.S
@@ -86,6 +86,9 @@
/* IMPORTANT: Stuff the RSB immediately after VM-Exit, before RET! */
FILL_RETURN_BUFFER %_ASM_AX, RSB_CLEAR_LOOPS, X86_FEATURE_RETPOLINE
+ /* Clear RFLAGS.CF and RFLAGS.ZF to preserve VM-Exit, i.e. !VM-Fail. */
+ or $1, %_ASM_AX
+
pop %_ASM_AX
.Lvmexit_skip_rsb:
#endif
@@ -234,3 +237,61 @@
2: mov $1, %eax
jmp 1b
ENDPROC(__vmx_vcpu_run)
+
+/**
+ * vmread_error_trampoline - Trampoline from inline asm to vmread_error()
+ * @field: VMCS field encoding that failed
+ * @fault: %true if the VMREAD faulted, %false if it failed
+
+ * Save and restore volatile registers across a call to vmread_error(). Note,
+ * all parameters are passed on the stack.
+ */
+ENTRY(vmread_error_trampoline)
+ push %_ASM_BP
+ mov %_ASM_SP, %_ASM_BP
+
+ push %_ASM_AX
+ push %_ASM_CX
+ push %_ASM_DX
+#ifdef CONFIG_X86_64
+ push %rdi
+ push %rsi
+ push %r8
+ push %r9
+ push %r10
+ push %r11
+#endif
+#ifdef CONFIG_X86_64
+ /* Load @field and @fault to arg1 and arg2 respectively. */
+ mov 3*WORD_SIZE(%rbp), %_ASM_ARG2
+ mov 2*WORD_SIZE(%rbp), %_ASM_ARG1
+#else
+ /* Parameters are passed on the stack for 32-bit (see asmlinkage). */
+ push 3*WORD_SIZE(%ebp)
+ push 2*WORD_SIZE(%ebp)
+#endif
+
+ call vmread_error
+
+#ifndef CONFIG_X86_64
+ add $8, %esp
+#endif
+
+ /* Zero out @fault, which will be popped into the result register. */
+ _ASM_MOV $0, 3*WORD_SIZE(%_ASM_BP)
+
+#ifdef CONFIG_X86_64
+ pop %r11
+ pop %r10
+ pop %r9
+ pop %r8
+ pop %rsi
+ pop %rdi
+#endif
+ pop %_ASM_DX
+ pop %_ASM_CX
+ pop %_ASM_AX
+ pop %_ASM_BP
+
+ ret
+ENDPROC(vmread_error_trampoline)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index 04a8212..e177848 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -95,7 +95,7 @@
static bool __read_mostly fasteoi = 1;
module_param(fasteoi, bool, S_IRUGO);
-static bool __read_mostly enable_apicv = 1;
+bool __read_mostly enable_apicv = 1;
module_param(enable_apicv, bool, S_IRUGO);
/*
@@ -648,43 +648,15 @@
}
#ifdef CONFIG_KEXEC_CORE
-/*
- * This bitmap is used to indicate whether the vmclear
- * operation is enabled on all cpus. All disabled by
- * default.
- */
-static cpumask_t crash_vmclear_enabled_bitmap = CPU_MASK_NONE;
-
-static inline void crash_enable_local_vmclear(int cpu)
-{
- cpumask_set_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline void crash_disable_local_vmclear(int cpu)
-{
- cpumask_clear_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
-static inline int crash_local_vmclear_enabled(int cpu)
-{
- return cpumask_test_cpu(cpu, &crash_vmclear_enabled_bitmap);
-}
-
static void crash_vmclear_local_loaded_vmcss(void)
{
int cpu = raw_smp_processor_id();
struct loaded_vmcs *v;
- if (!crash_local_vmclear_enabled(cpu))
- return;
-
list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
loaded_vmcss_on_cpu_link)
vmcs_clear(v->vmcs);
}
-#else
-static inline void crash_enable_local_vmclear(int cpu) { }
-static inline void crash_disable_local_vmclear(int cpu) { }
#endif /* CONFIG_KEXEC_CORE */
static void __loaded_vmcs_clear(void *arg)
@@ -696,19 +668,24 @@
return; /* vcpu migration can race with cpu offline */
if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
per_cpu(current_vmcs, cpu) = NULL;
- crash_disable_local_vmclear(cpu);
+
+ vmcs_clear(loaded_vmcs->vmcs);
+ if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
+ vmcs_clear(loaded_vmcs->shadow_vmcs);
+
list_del(&loaded_vmcs->loaded_vmcss_on_cpu_link);
/*
- * we should ensure updating loaded_vmcs->loaded_vmcss_on_cpu_link
- * is before setting loaded_vmcs->vcpu to -1 which is done in
- * loaded_vmcs_init. Otherwise, other cpu can see vcpu = -1 fist
- * then adds the vmcs into percpu list before it is deleted.
+ * Ensure all writes to loaded_vmcs, including deleting it from its
+ * current percpu list, complete before setting loaded_vmcs->vcpu to
+ * -1, otherwise a different cpu can see vcpu == -1 first and add
+ * loaded_vmcs to its percpu list before it's deleted from this cpu's
+ * list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
*/
smp_wmb();
- loaded_vmcs_init(loaded_vmcs);
- crash_enable_local_vmclear(cpu);
+ loaded_vmcs->cpu = -1;
+ loaded_vmcs->launched = 0;
}
void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
@@ -1153,6 +1130,10 @@
vmx->guest_msrs[i].mask);
}
+
+ if (vmx->nested.need_vmcs12_to_shadow_sync)
+ nested_sync_vmcs12_to_shadow(vcpu);
+
if (vmx->guest_state_loaded)
return;
@@ -1309,33 +1290,42 @@
pi_set_on(pi_desc);
}
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu)
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
+ struct vmcs *prev;
if (!already_loaded) {
loaded_vmcs_clear(vmx->loaded_vmcs);
local_irq_disable();
- crash_disable_local_vmclear(cpu);
/*
- * Read loaded_vmcs->cpu should be before fetching
- * loaded_vmcs->loaded_vmcss_on_cpu_link.
- * See the comments in __loaded_vmcs_clear().
+ * Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
+ * this cpu's percpu list, otherwise it may not yet be deleted
+ * from its previous cpu's percpu list. Pairs with the
+ * smb_wmb() in __loaded_vmcs_clear().
*/
smp_rmb();
list_add(&vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
&per_cpu(loaded_vmcss_on_cpu, cpu));
- crash_enable_local_vmclear(cpu);
local_irq_enable();
}
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ prev = per_cpu(current_vmcs, cpu);
+ if (prev != vmx->loaded_vmcs->vmcs) {
per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
vmcs_load(vmx->loaded_vmcs->vmcs);
- indirect_branch_prediction_barrier();
+
+ /*
+ * No indirect branch prediction barrier needed when switching
+ * the active VMCS within a guest, e.g. on nested VM-Enter.
+ * The L1 VMM can protect itself with retpolines, IBPB or IBRS.
+ */
+ if (!buddy || WARN_ON_ONCE(buddy->vmcs != prev))
+ indirect_branch_prediction_barrier();
}
if (!already_loaded) {
@@ -1380,11 +1370,10 @@
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
- vmx_vcpu_load_vmcs(vcpu, cpu);
+ vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
vmx_vcpu_pi_load(vcpu, cpu);
- vmx->host_pkru = read_pkru();
vmx->host_debugctlmsr = get_debugctlmsr();
}
@@ -1552,7 +1541,7 @@
static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
- unsigned long rip;
+ unsigned long rip, orig_rip;
/*
* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
@@ -1564,8 +1553,17 @@
*/
if (!static_cpu_has(X86_FEATURE_HYPERVISOR) ||
to_vmx(vcpu)->exit_reason != EXIT_REASON_EPT_MISCONFIG) {
- rip = kvm_rip_read(vcpu);
- rip += vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+ orig_rip = kvm_rip_read(vcpu);
+ rip = orig_rip + vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
+#ifdef CONFIG_X86_64
+ /*
+ * We need to mask out the high 32 bits of RIP if not in 64-bit
+ * mode, but just finding out that we are in 64-bit mode is
+ * quite expensive. Only do it if there was a carry.
+ */
+ if (unlikely(((rip ^ orig_rip) >> 31) == 3) && !is_64_bit_mode(vcpu))
+ rip = (u32)rip;
+#endif
kvm_rip_write(vcpu, rip);
} else {
if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
@@ -1790,7 +1788,7 @@
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
msr_info->data = to_vmx(vcpu)->spec_ctrl;
@@ -1973,15 +1971,13 @@
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_spec_ctrl_msr(vcpu))
return 1;
- /* The STIBP bit doesn't fault even if it's not advertised */
- if (data & ~(SPEC_CTRL_IBRS | SPEC_CTRL_STIBP | SPEC_CTRL_SSBD))
+ if (kvm_spec_ctrl_test_value(data))
return 1;
vmx->spec_ctrl = data;
-
if (!data)
break;
@@ -2003,12 +1999,13 @@
break;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
- !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+ !guest_has_pred_cmd_msr(vcpu))
return 1;
if (data & ~PRED_CMD_IBPB)
return 1;
-
+ if (!boot_cpu_has(X86_FEATURE_IBPB))
+ return 1;
if (!data)
break;
@@ -2140,6 +2137,8 @@
(index >= 2 * intel_pt_validate_cap(vmx->pt_desc.caps,
PT_CAP_num_address_ranges)))
return 1;
+ if (is_noncanonical_address(data, vcpu))
+ return 1;
if (index % 2)
vmx->pt_desc.guest.addr_b[index / 2] = data;
else
@@ -2250,21 +2249,6 @@
!hv_get_vp_assist_page(cpu))
return -EFAULT;
- INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
- INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
- spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
-
- /*
- * Now we can enable the vmclear operation in kdump
- * since the loaded_vmcss_on_cpu list on this cpu
- * has been initialized.
- *
- * Though the cpu is not in VMX operation now, there
- * is no problem to enable the vmclear operation
- * for the loaded_vmcss_on_cpu list is empty!
- */
- crash_enable_local_vmclear(cpu);
-
rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
test_bits = FEATURE_CONTROL_LOCKED;
@@ -2973,6 +2957,9 @@
static int get_ept_level(struct kvm_vcpu *vcpu)
{
+ /* Nested EPT currently only supports 4-level walks. */
+ if (is_guest_mode(vcpu) && nested_cpu_has_ept(get_vmcs12(vcpu)))
+ return 4;
if (cpu_has_vmx_ept_5levels() && (cpuid_maxphyaddr(vcpu) > 48))
return 5;
return 4;
@@ -2995,6 +2982,7 @@
void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
struct kvm *kvm = vcpu->kvm;
+ bool update_guest_cr3 = true;
unsigned long guest_cr3;
u64 eptp;
@@ -3011,15 +2999,18 @@
spin_unlock(&to_kvm_vmx(kvm)->ept_pointer_lock);
}
- if (enable_unrestricted_guest || is_paging(vcpu) ||
- is_guest_mode(vcpu))
+ /* Loading vmcs02.GUEST_CR3 is handled by nested VM-Enter. */
+ if (is_guest_mode(vcpu))
+ update_guest_cr3 = false;
+ else if (enable_unrestricted_guest || is_paging(vcpu))
guest_cr3 = kvm_read_cr3(vcpu);
else
guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
ept_load_pdptrs(vcpu);
}
- vmcs_writel(GUEST_CR3, guest_cr3);
+ if (update_guest_cr3)
+ vmcs_writel(GUEST_CR3, guest_cr3);
}
int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -3844,24 +3835,29 @@
* 2. If target vcpu isn't running(root mode), kick it to pick up the
* interrupt from PIR in next vmentry.
*/
-static void vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
+static int vmx_deliver_posted_interrupt(struct kvm_vcpu *vcpu, int vector)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int r;
r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
if (!r)
- return;
+ return 0;
+
+ if (!vcpu->arch.apicv_active)
+ return -1;
if (pi_test_and_set_pir(vector, &vmx->pi_desc))
- return;
+ return 0;
/* If a previous notification has sent the IPI, nothing to do. */
if (pi_test_and_set_on(&vmx->pi_desc))
- return;
+ return 0;
if (!kvm_vcpu_trigger_posted_interrupt(vcpu, false))
kvm_vcpu_kick(vcpu);
+
+ return 0;
}
/*
@@ -3929,6 +3925,8 @@
void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
{
+ BUILD_BUG_ON(KVM_CR4_GUEST_OWNED_BITS & ~KVM_POSSIBLE_CR4_GUEST_BITS);
+
vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
if (enable_ept)
vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
@@ -4491,8 +4489,13 @@
static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
{
- return (!to_vmx(vcpu)->nested.nested_run_pending &&
- vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
+ if (to_vmx(vcpu)->nested.nested_run_pending)
+ return false;
+
+ if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
+ return true;
+
+ return (vmcs_readl(GUEST_RFLAGS) & X86_EFLAGS_IF) &&
!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
(GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS));
}
@@ -4586,7 +4589,7 @@
*/
static void kvm_machine_check(void)
{
-#if defined(CONFIG_X86_MCE) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_MCE)
struct pt_regs regs = {
.cs = 3, /* Fake ring 3 no matter what the guest ran on */
.flags = X86_EFLAGS_IF,
@@ -5904,6 +5907,7 @@
(exit_reason != EXIT_REASON_EXCEPTION_NMI &&
exit_reason != EXIT_REASON_EPT_VIOLATION &&
exit_reason != EXIT_REASON_PML_FULL &&
+ exit_reason != EXIT_REASON_APIC_ACCESS &&
exit_reason != EXIT_REASON_TASK_SWITCH)) {
vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
@@ -6438,23 +6442,6 @@
msrs[i].host, false);
}
-static void atomic_switch_umwait_control_msr(struct vcpu_vmx *vmx)
-{
- u32 host_umwait_control;
-
- if (!vmx_has_waitpkg(vmx))
- return;
-
- host_umwait_control = get_umwait_control_msr();
-
- if (vmx->msr_ia32_umwait_control != host_umwait_control)
- add_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL,
- vmx->msr_ia32_umwait_control,
- host_umwait_control, false);
- else
- clear_atomic_switch_msr(vmx, MSR_IA32_UMWAIT_CONTROL);
-}
-
static void vmx_update_hv_timer(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -6511,8 +6498,11 @@
vmcs_write32(PLE_WINDOW, vmx->ple_window);
}
- if (vmx->nested.need_vmcs12_to_shadow_sync)
- nested_sync_vmcs12_to_shadow(vcpu);
+ /*
+ * We did this in prepare_switch_to_guest, because it needs to
+ * be within srcu_read_lock.
+ */
+ WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -6541,15 +6531,9 @@
kvm_load_guest_xcr0(vcpu);
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE) &&
- vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vcpu->arch.pkru);
-
pt_guest_enter(vmx);
atomic_switch_perf_msrs(vmx);
- atomic_switch_umwait_control_msr(vmx);
if (enable_preemption_timer)
vmx_update_hv_timer(vcpu);
@@ -6634,18 +6618,6 @@
pt_guest_exit(vmx);
- /*
- * eager fpu is enabled if PKEY is supported and CR4 is switched
- * back on host, so it is safe to read guest PKRU from current
- * XSAVE.
- */
- if (static_cpu_has(X86_FEATURE_PKU) &&
- kvm_read_cr4_bits(vcpu, X86_CR4_PKE)) {
- vcpu->arch.pkru = rdpkru();
- if (vcpu->arch.pkru != vmx->host_pkru)
- __write_pkru(vmx->host_pkru);
- }
-
kvm_put_guest_xcr0(vcpu);
vmx->nested.nested_run_pending = 0;
@@ -6670,6 +6642,10 @@
struct kvm_vmx *kvm_vmx = __vmalloc(sizeof(struct kvm_vmx),
GFP_KERNEL_ACCOUNT | __GFP_ZERO,
PAGE_KERNEL);
+
+ if (!kvm_vmx)
+ return NULL;
+
return &kvm_vmx->kvm;
}
@@ -6793,8 +6769,7 @@
if (nested)
nested_vmx_setup_ctls_msrs(&vmx->nested.msrs,
- vmx_capability.ept,
- kvm_vcpu_apicv_active(&vmx->vcpu));
+ vmx_capability.ept);
else
memset(&vmx->nested.msrs, 0, sizeof(vmx->nested.msrs));
@@ -6876,8 +6851,7 @@
if (setup_vmcs_config(&vmcs_conf, &vmx_cap) < 0)
return -EIO;
if (nested)
- nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept,
- enable_apicv);
+ nested_vmx_setup_ctls_msrs(&vmcs_conf.nested, vmx_cap.ept);
if (memcmp(&vmcs_config, &vmcs_conf, sizeof(struct vmcs_config)) != 0) {
printk(KERN_ERR "kvm: CPU %d feature inconsistency!\n",
smp_processor_id());
@@ -7123,6 +7097,40 @@
to_vmx(vcpu)->req_immediate_exit = true;
}
+static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
+ struct x86_instruction_info *info)
+{
+ struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
+ unsigned short port;
+ bool intercept;
+ int size;
+
+ if (info->intercept == x86_intercept_in ||
+ info->intercept == x86_intercept_ins) {
+ port = info->src_val;
+ size = info->dst_bytes;
+ } else {
+ port = info->dst_val;
+ size = info->src_bytes;
+ }
+
+ /*
+ * If the 'use IO bitmaps' VM-execution control is 0, IO instruction
+ * VM-exits depend on the 'unconditional IO exiting' VM-execution
+ * control.
+ *
+ * Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
+ */
+ if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
+ intercept = nested_cpu_has(vmcs12,
+ CPU_BASED_UNCOND_IO_EXITING);
+ else
+ intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
+}
+
static int vmx_check_intercept(struct kvm_vcpu *vcpu,
struct x86_instruction_info *info,
enum x86_intercept_stage stage)
@@ -7130,19 +7138,45 @@
struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
+ switch (info->intercept) {
/*
* RDPID causes #UD if disabled through secondary execution controls.
* Because it is marked as EmulateOnUD, we need to intercept it here.
*/
- if (info->intercept == x86_intercept_rdtscp &&
- !nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
- ctxt->exception.vector = UD_VECTOR;
- ctxt->exception.error_code_valid = false;
- return X86EMUL_PROPAGATE_FAULT;
- }
+ case x86_intercept_rdtscp:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDTSCP)) {
+ ctxt->exception.vector = UD_VECTOR;
+ ctxt->exception.error_code_valid = false;
+ return X86EMUL_PROPAGATE_FAULT;
+ }
+ break;
+
+ case x86_intercept_in:
+ case x86_intercept_ins:
+ case x86_intercept_out:
+ case x86_intercept_outs:
+ return vmx_check_intercept_io(vcpu, info);
+
+ case x86_intercept_lgdt:
+ case x86_intercept_lidt:
+ case x86_intercept_lldt:
+ case x86_intercept_ltr:
+ case x86_intercept_sgdt:
+ case x86_intercept_sidt:
+ case x86_intercept_sldt:
+ case x86_intercept_str:
+ if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
+ return X86EMUL_CONTINUE;
+
+ /* FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. */
+ break;
/* TODO: check more intercepts... */
- return X86EMUL_CONTINUE;
+ default:
+ break;
+ }
+
+ return X86EMUL_UNHANDLEABLE;
}
#ifdef CONFIG_X86_64
@@ -7238,11 +7272,11 @@
kvm_flush_pml_buffers(kvm);
}
-static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu)
+static int vmx_write_pml_buffer(struct kvm_vcpu *vcpu, gpa_t gpa)
{
struct vmcs12 *vmcs12;
struct vcpu_vmx *vmx = to_vmx(vcpu);
- gpa_t gpa, dst;
+ gpa_t dst;
if (is_guest_mode(vcpu)) {
WARN_ON_ONCE(vmx->nested.pml_full);
@@ -7261,7 +7295,7 @@
return 1;
}
- gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS) & ~0xFFFull;
+ gpa &= ~0xFFFull;
dst = vmcs12->pml_address + sizeof(u64) * vmcs12->guest_pml_index;
if (kvm_write_guest_page(vcpu->kvm, gpa_to_gfn(dst), &gpa,
@@ -7727,7 +7761,7 @@
if (nested) {
nested_vmx_setup_ctls_msrs(&vmcs_config.nested,
- vmx_capability.ept, enable_apicv);
+ vmx_capability.ept);
r = nested_vmx_hardware_setup(kvm_vmx_exit_handlers);
if (r)
@@ -7861,6 +7895,7 @@
.xsaves_supported = vmx_xsaves_supported,
.umip_emulated = vmx_umip_emulated,
.pt_supported = vmx_pt_supported,
+ .pku_supported = vmx_pku_supported,
.request_immediate_exit = vmx_request_immediate_exit,
@@ -7949,7 +7984,7 @@
static int __init vmx_init(void)
{
- int r;
+ int r, cpu;
#if IS_ENABLED(CONFIG_HYPERV)
/*
@@ -8003,6 +8038,12 @@
return r;
}
+ for_each_possible_cpu(cpu) {
+ INIT_LIST_HEAD(&per_cpu(loaded_vmcss_on_cpu, cpu));
+ INIT_LIST_HEAD(&per_cpu(blocked_vcpu_on_cpu, cpu));
+ spin_lock_init(&per_cpu(blocked_vcpu_on_cpu_lock, cpu));
+ }
+
#ifdef CONFIG_KEXEC_CORE
rcu_assign_pointer(crash_vmclear_loaded_vmcss,
crash_vmclear_local_loaded_vmcss);
diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h
index 5a0f34b..55731dd 100644
--- a/arch/x86/kvm/vmx/vmx.h
+++ b/arch/x86/kvm/vmx/vmx.h
@@ -14,8 +14,6 @@
extern const u32 vmx_msr_index[];
extern u64 host_efer;
-extern u32 get_umwait_control_msr(void);
-
#define MSR_TYPE_R 1
#define MSR_TYPE_W 2
#define MSR_TYPE_RW 3
@@ -304,7 +302,8 @@
};
bool nested_vmx_allowed(struct kvm_vcpu *vcpu);
-void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu);
+void vmx_vcpu_load_vmcs(struct kvm_vcpu *vcpu, int cpu,
+ struct loaded_vmcs *buddy);
void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
int allocate_vpid(void);
void free_vpid(int vpid);
@@ -513,7 +512,7 @@
static inline bool vmx_has_waitpkg(struct vcpu_vmx *vmx)
{
- return vmx->secondary_exec_control &
+ return secondary_exec_controls_get(vmx) &
SECONDARY_EXEC_ENABLE_USR_WAIT_PAUSE;
}
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 5d53052..f1a0eeb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -92,6 +92,8 @@
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif
+static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;
+
#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
@@ -100,6 +102,7 @@
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void process_smi(struct kvm_vcpu *vcpu);
static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
@@ -205,7 +208,6 @@
{ "l1d_flush", VCPU_STAT(l1d_flush) },
{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
- { "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
{ "mmu_flooded", VM_STAT(mmu_flooded) },
{ "mmu_recycled", VM_STAT(mmu_recycled) },
@@ -300,13 +302,14 @@
struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);
int err;
- if (((value ^ smsr->values[slot].curr) & mask) == 0)
+ value = (value & mask) | (smsr->values[slot].host & ~mask);
+ if (value == smsr->values[slot].curr)
return 0;
- smsr->values[slot].curr = value;
err = wrmsrl_safe(shared_msrs_global.msrs[slot], value);
if (err)
return 1;
+ smsr->values[slot].curr = value;
if (!smsr->registered) {
smsr->urn.on_user_return = kvm_on_user_return;
user_return_notifier_register(&smsr->urn);
@@ -442,6 +445,14 @@
* for #DB exceptions under VMX.
*/
vcpu->arch.dr6 ^= payload & DR6_RTM;
+
+ /*
+ * The #DB payload is defined as compatible with the 'pending
+ * debug exceptions' field under VMX, not DR6. While bit 12 is
+ * defined in the 'pending debug exceptions' field (enabled
+ * breakpoint), it is reserved and must be zero in DR6.
+ */
+ vcpu->arch.dr6 &= ~BIT(12);
break;
case PF_VECTOR:
vcpu->arch.cr2 = payload;
@@ -464,8 +475,6 @@
if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
queue:
- if (has_error && !is_protmode(vcpu))
- has_error = false;
if (reinject) {
/*
* On vmentry, vcpu->arch.exception.pending is only
@@ -821,11 +830,25 @@
xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);
vcpu->guest_xcr0_loaded = 1;
}
+
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU)) &&
+ vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xcr0);
void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
{
+ if (static_cpu_has(X86_FEATURE_PKU) &&
+ (kvm_read_cr4_bits(vcpu, X86_CR4_PKE) ||
+ (vcpu->arch.xcr0 & XFEATURE_MASK_PKRU))) {
+ vcpu->arch.pkru = rdpkru();
+ if (vcpu->arch.pkru != vcpu->arch.host_pkru)
+ __write_pkru(vcpu->arch.host_pkru);
+ }
+
if (vcpu->guest_xcr0_loaded) {
if (vcpu->arch.xcr0 != host_xcr0)
xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);
@@ -885,9 +908,38 @@
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);
+static u64 kvm_host_cr4_reserved_bits(struct cpuinfo_x86 *c)
+{
+ u64 reserved_bits = CR4_RESERVED_BITS;
+
+ if (!cpu_has(c, X86_FEATURE_XSAVE))
+ reserved_bits |= X86_CR4_OSXSAVE;
+
+ if (!cpu_has(c, X86_FEATURE_SMEP))
+ reserved_bits |= X86_CR4_SMEP;
+
+ if (!cpu_has(c, X86_FEATURE_SMAP))
+ reserved_bits |= X86_CR4_SMAP;
+
+ if (!cpu_has(c, X86_FEATURE_FSGSBASE))
+ reserved_bits |= X86_CR4_FSGSBASE;
+
+ if (!cpu_has(c, X86_FEATURE_PKU))
+ reserved_bits |= X86_CR4_PKE;
+
+ if (!cpu_has(c, X86_FEATURE_LA57) &&
+ !(cpuid_ecx(0x7) & bit(X86_FEATURE_LA57)))
+ reserved_bits |= X86_CR4_LA57;
+
+ if (!cpu_has(c, X86_FEATURE_UMIP) && !kvm_x86_ops->umip_emulated())
+ reserved_bits |= X86_CR4_UMIP;
+
+ return reserved_bits;
+}
+
static int kvm_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
- if (cr4 & CR4_RESERVED_BITS)
+ if (cr4 & cr4_reserved_bits)
return -EINVAL;
if (!guest_cpuid_has(vcpu, X86_FEATURE_XSAVE) && (cr4 & X86_CR4_OSXSAVE))
@@ -918,7 +970,8 @@
{
unsigned long old_cr4 = kvm_read_cr4(vcpu);
unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE |
- X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE;
+ X86_CR4_SMEP;
+ unsigned long mmu_role_bits = pdptr_bits | X86_CR4_SMAP | X86_CR4_PKE;
if (kvm_valid_cr4(vcpu, cr4))
return 1;
@@ -926,6 +979,8 @@
if (is_long_mode(vcpu)) {
if (!(cr4 & X86_CR4_PAE))
return 1;
+ if ((cr4 ^ old_cr4) & X86_CR4_LA57)
+ return 1;
} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
&& ((cr4 ^ old_cr4) & pdptr_bits)
&& !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
@@ -944,7 +999,7 @@
if (kvm_x86_ops->set_cr4(vcpu, cr4))
return 1;
- if (((cr4 ^ old_cr4) & pdptr_bits) ||
+ if (((cr4 ^ old_cr4) & mmu_role_bits) ||
(!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
kvm_mmu_reset_context(vcpu);
@@ -1053,9 +1108,11 @@
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- vcpu->arch.db[dr] = val;
+ vcpu->arch.db[array_index_nospec(dr, size)] = val;
if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
vcpu->arch.eff_db[dr] = val;
break;
@@ -1092,9 +1149,11 @@
int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
+ size_t size = ARRAY_SIZE(vcpu->arch.db);
+
switch (dr) {
case 0 ... 3:
- *val = vcpu->arch.db[dr];
+ *val = vcpu->arch.db[array_index_nospec(dr, size)];
break;
case 4:
/* fall through */
@@ -1327,10 +1386,15 @@
* If TSX is disabled on the system, guests are also mitigated against
* TAA and clear CPU buffer mitigation is not required for guests.
*/
- if (boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM) &&
- (data & ARCH_CAP_TSX_CTRL_MSR))
+ if (!boot_cpu_has(X86_FEATURE_RTM))
+ data &= ~ARCH_CAP_TAA_NO;
+ else if (!boot_cpu_has_bug(X86_BUG_TAA))
+ data |= ARCH_CAP_TAA_NO;
+ else if (data & ARCH_CAP_TSX_CTRL_MSR)
data &= ~ARCH_CAP_MDS_NO;
+ /* KVM does not emulate MSR_IA32_TSX_CTRL. */
+ data &= ~ARCH_CAP_TSX_CTRL_MSR;
return data;
}
@@ -2484,7 +2548,10 @@
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
/* only 0 or all 1s can be written to IA32_MCi_CTL
* some Linux kernels though clear bit 10 in bank 4 to
* workaround a BIOS/GART TBL issue on AMD K8s, ignore
@@ -2580,45 +2647,47 @@
static void record_steal_time(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time))))
+ /* -EAGAIN is returned in atomic context so we can just return. */
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT,
+ &map, &vcpu->arch.st.cache, false))
return;
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
/*
* Doing a TLB flush here, on the guest's behalf, can avoid
* expensive IPIs.
*/
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
- vcpu->arch.st.steal.preempted & KVM_VCPU_FLUSH_TLB);
- if (xchg(&vcpu->arch.st.steal.preempted, 0) & KVM_VCPU_FLUSH_TLB)
+ st->preempted & KVM_VCPU_FLUSH_TLB);
+ if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb(vcpu, false);
- if (vcpu->arch.st.steal.version & 1)
- vcpu->arch.st.steal.version += 1; /* first time write, random junk */
+ vcpu->arch.st.preempted = 0;
- vcpu->arch.st.steal.version += 1;
+ if (st->version & 1)
+ st->version += 1; /* first time write, random junk */
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ st->version += 1;
smp_wmb();
- vcpu->arch.st.steal.steal += current->sched_info.run_delay -
+ st->steal += current->sched_info.run_delay -
vcpu->arch.st.last_steal;
vcpu->arch.st.last_steal = current->sched_info.run_delay;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
-
smp_wmb();
- vcpu->arch.st.steal.version += 1;
+ st->version += 1;
- kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal, sizeof(struct kvm_steal_time));
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false);
}
int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
@@ -2685,7 +2754,7 @@
return kvm_mtrr_set_msr(vcpu, msr, data);
case MSR_IA32_APICBASE:
return kvm_set_apic_base(vcpu, msr_info);
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_write(vcpu, msr, data);
case MSR_IA32_TSCDEADLINE:
kvm_set_lapic_tscdeadline_msr(vcpu, data);
@@ -2695,6 +2764,10 @@
if (!msr_info->host_initiated) {
s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr;
adjust_tsc_offset_guest(vcpu, adj);
+ /* Before back to guest, tsc_timestamp must be adjusted
+ * as well, otherwise guest's percpu pvclock time could jump.
+ */
+ kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
}
vcpu->arch.ia32_tsc_adjust_msr = data;
}
@@ -2771,11 +2844,6 @@
if (data & KVM_STEAL_RESERVED_MASK)
return 1;
- if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime,
- data & KVM_STEAL_VALID_BITS,
- sizeof(struct kvm_steal_time)))
- return 1;
-
vcpu->arch.st.msr_val = data;
if (!(data & KVM_MSR_ENABLED))
@@ -2911,7 +2979,10 @@
default:
if (msr >= MSR_IA32_MC0_CTL &&
msr < MSR_IA32_MCx_CTL(bank_num)) {
- u32 offset = msr - MSR_IA32_MC0_CTL;
+ u32 offset = array_index_nospec(
+ msr - MSR_IA32_MC0_CTL,
+ MSR_IA32_MCx_CTL(bank_num) - MSR_IA32_MC0_CTL);
+
data = vcpu->arch.mce_banks[offset];
break;
}
@@ -2991,7 +3062,7 @@
case MSR_IA32_APICBASE:
msr_info->data = kvm_get_apic_base(vcpu);
break;
- case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff:
+ case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff:
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
break;
case MSR_IA32_TSCDEADLINE:
@@ -3437,10 +3508,6 @@
kvm_x86_ops->vcpu_load(vcpu, cpu);
- fpregs_assert_state_consistent();
- if (test_thread_flag(TIF_NEED_FPU_LOAD))
- switch_fpu_return();
-
/* Apply any externally detected TSC adjustments (due to suspend) */
if (unlikely(vcpu->arch.tsc_offset_adjustment)) {
adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
@@ -3480,15 +3547,25 @@
static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu)
{
+ struct kvm_host_map map;
+ struct kvm_steal_time *st;
+
if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED))
return;
- vcpu->arch.st.steal.preempted = KVM_VCPU_PREEMPTED;
+ if (vcpu->arch.st.preempted)
+ return;
- kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime,
- &vcpu->arch.st.steal.preempted,
- offsetof(struct kvm_steal_time, preempted),
- sizeof(vcpu->arch.st.steal.preempted));
+ if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map,
+ &vcpu->arch.st.cache, true))
+ return;
+
+ st = map.hva +
+ offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS);
+
+ st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED;
+
+ kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true);
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -3549,22 +3626,33 @@
static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu)
{
+ /*
+ * We can accept userspace's request for interrupt injection
+ * as long as we have a place to store the interrupt number.
+ * The actual injection will happen when the CPU is able to
+ * deliver the interrupt.
+ */
+ if (kvm_cpu_has_extint(vcpu))
+ return false;
+
+ /* Acknowledging ExtINT does not happen if LINT0 is masked. */
return (!lapic_in_kernel(vcpu) ||
kvm_apic_accept_pic_intr(vcpu));
}
-/*
- * if userspace requested an interrupt window, check that the
- * interrupt window is open.
- *
- * No need to exit to userspace if we already have an interrupt queued.
- */
static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
{
- return kvm_arch_interrupt_allowed(vcpu) &&
- !kvm_cpu_has_interrupt(vcpu) &&
+ /*
+ * Do not cause an interrupt window exit if an exception
+ * is pending or an event needs reinjection; userspace
+ * might want to inject the interrupt manually using KVM_SET_REGS
+ * or KVM_SET_SREGS. For that to work, we must be at an
+ * instruction boundary and with no events half-injected.
+ */
+ return (kvm_arch_interrupt_allowed(vcpu) &&
+ kvm_cpu_accept_dm_intr(vcpu) &&
!kvm_event_needs_reinjection(vcpu) &&
- kvm_cpu_accept_dm_intr(vcpu);
+ !vcpu->arch.exception.pending);
}
static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
@@ -3624,7 +3712,7 @@
unsigned bank_num = mcg_cap & 0xff, bank;
r = -EINVAL;
- if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
+ if (!bank_num || bank_num > KVM_MAX_MCE_BANKS)
goto out;
if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
@@ -3695,6 +3783,10 @@
{
process_nmi(vcpu);
+
+ if (kvm_check_request(KVM_REQ_SMI, vcpu))
+ process_smi(vcpu);
+
/*
* The API doesn't provide the instruction length for software
* exceptions, so don't report them. As long as the guest RIP
@@ -4421,6 +4513,7 @@
case KVM_SET_NESTED_STATE: {
struct kvm_nested_state __user *user_kvm_nested_state = argp;
struct kvm_nested_state kvm_state;
+ int idx;
r = -EINVAL;
if (!kvm_x86_ops->set_nested_state)
@@ -4444,7 +4537,9 @@
&& !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE))
break;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvm_x86_ops->set_nested_state(vcpu, user_kvm_nested_state, &kvm_state);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
break;
}
case KVM_GET_SUPPORTED_HV_CPUID: {
@@ -4973,10 +5068,13 @@
r = -EFAULT;
if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit_out;
r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
+set_pit_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_GET_PIT2: {
@@ -4996,10 +5094,13 @@
r = -EFAULT;
if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
goto out;
+ mutex_lock(&kvm->lock);
r = -ENXIO;
if (!kvm->arch.vpit)
- goto out;
+ goto set_pit2_out;
r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
+set_pit2_out:
+ mutex_unlock(&kvm->lock);
break;
}
case KVM_REINJECT_CONTROL: {
@@ -5151,6 +5252,10 @@
if (!kvm_x86_ops->rdtscp_supported())
continue;
break;
+ case MSR_IA32_UMWAIT_CONTROL:
+ if (!boot_cpu_has(X86_FEATURE_WAITPKG))
+ continue;
+ break;
case MSR_IA32_RTIT_CTL:
case MSR_IA32_RTIT_STATUS:
if (!kvm_x86_ops->pt_supported())
@@ -6185,7 +6290,10 @@
static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags)
{
- emul_to_vcpu(ctxt)->arch.hflags = emul_flags;
+ struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
+
+ vcpu->arch.hflags = emul_flags;
+ kvm_mmu_reset_context(vcpu);
}
static int emulator_pre_leave_smm(struct x86_emulate_ctxt *ctxt,
@@ -6356,11 +6464,11 @@
return 1;
}
-static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t cr2,
+static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
bool write_fault_to_shadow_pgtable,
int emulation_type)
{
- gpa_t gpa = cr2;
+ gpa_t gpa = cr2_or_gpa;
kvm_pfn_t pfn;
if (!(emulation_type & EMULTYPE_ALLOW_RETRY))
@@ -6374,7 +6482,7 @@
* Write permission should be allowed since only
* write access need to be emulated.
*/
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
/*
* If the mapping is invalid in guest, let cpu retry
@@ -6431,10 +6539,10 @@
}
static bool retry_instruction(struct x86_emulate_ctxt *ctxt,
- unsigned long cr2, int emulation_type)
+ gpa_t cr2_or_gpa, int emulation_type)
{
struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
- unsigned long last_retry_eip, last_retry_addr, gpa = cr2;
+ unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa;
last_retry_eip = vcpu->arch.last_retry_eip;
last_retry_addr = vcpu->arch.last_retry_addr;
@@ -6463,14 +6571,14 @@
if (x86_page_table_writing_insn(ctxt))
return false;
- if (ctxt->eip == last_retry_eip && last_retry_addr == cr2)
+ if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa)
return false;
vcpu->arch.last_retry_eip = ctxt->eip;
- vcpu->arch.last_retry_addr = cr2;
+ vcpu->arch.last_retry_addr = cr2_or_gpa;
if (!vcpu->arch.mmu->direct_map)
- gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2, NULL);
+ gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL);
kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
@@ -6616,11 +6724,8 @@
return false;
}
-int x86_emulate_instruction(struct kvm_vcpu *vcpu,
- unsigned long cr2,
- int emulation_type,
- void *insn,
- int insn_len)
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ int emulation_type, void *insn, int insn_len)
{
int r;
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
@@ -6666,8 +6771,9 @@
kvm_queue_exception(vcpu, UD_VECTOR);
return 1;
}
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
- emulation_type))
+ if (reexecute_instruction(vcpu, cr2_or_gpa,
+ write_fault_to_spt,
+ emulation_type))
return 1;
if (ctxt->have_exception) {
/*
@@ -6701,7 +6807,7 @@
return 1;
}
- if (retry_instruction(ctxt, cr2, emulation_type))
+ if (retry_instruction(ctxt, cr2_or_gpa, emulation_type))
return 1;
/* this is needed for vmware backdoor interface to work since it
@@ -6713,7 +6819,7 @@
restart:
/* Save the faulting GPA (cr2) in the address field */
- ctxt->exception.address = cr2;
+ ctxt->exception.address = cr2_or_gpa;
r = x86_emulate_insn(ctxt);
@@ -6721,7 +6827,7 @@
return 1;
if (r == EMULATION_FAILED) {
- if (reexecute_instruction(vcpu, cr2, write_fault_to_spt,
+ if (reexecute_instruction(vcpu, cr2_or_gpa, write_fault_to_spt,
emulation_type))
return 1;
@@ -6760,7 +6866,7 @@
if (!ctxt->have_exception ||
exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
kvm_rip_write(vcpu, ctxt->eip);
- if (r && ctxt->tf)
+ if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)))
r = kvm_vcpu_do_singlestep(vcpu);
__kvm_set_rflags(vcpu, ctxt->eflags);
}
@@ -7264,6 +7370,7 @@
cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
#ifdef CONFIG_X86_64
pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
+ cancel_work_sync(&pvclock_gtod_work);
#endif
kvm_x86_ops = NULL;
kvm_mmu_module_exit();
@@ -7496,14 +7603,21 @@
kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
}
-static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
+static void kvm_inject_exception(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.exception.error_code && !is_protmode(vcpu))
+ vcpu->arch.exception.error_code = false;
+ kvm_x86_ops->queue_exception(vcpu);
+}
+
+static int inject_pending_event(struct kvm_vcpu *vcpu)
{
int r;
/* try to reinject previous events if any */
if (vcpu->arch.exception.injected)
- kvm_x86_ops->queue_exception(vcpu);
+ kvm_inject_exception(vcpu);
/*
* Do not inject an NMI or interrupt if there is a pending
* exception. Exceptions and interrupts are recognized at
@@ -7532,7 +7646,7 @@
* from L2 to L1.
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -7569,7 +7683,7 @@
}
}
- kvm_x86_ops->queue_exception(vcpu);
+ kvm_inject_exception(vcpu);
}
/* Don't consider new event if we re-injected an event */
@@ -7594,7 +7708,7 @@
* KVM_REQ_EVENT only on certain events and not unconditionally?
*/
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events) {
- r = kvm_x86_ops->check_nested_events(vcpu, req_int_win);
+ r = kvm_x86_ops->check_nested_events(vcpu);
if (r != 0)
return r;
}
@@ -7905,9 +8019,8 @@
kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap);
}
-int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end,
- bool blockable)
+void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
{
unsigned long apic_address;
@@ -7918,8 +8031,6 @@
apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT);
if (start <= apic_address && apic_address < end)
kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
-
- return 0;
}
void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu)
@@ -8071,7 +8182,7 @@
goto out;
}
- if (inject_pending_event(vcpu, req_int_win) != 0)
+ if (inject_pending_event(vcpu) != 0)
req_immediate_exit = true;
else {
/* Enable SMI/NMI/IRQ window open exits if needed.
@@ -8163,8 +8274,12 @@
trace_kvm_entry(vcpu->vcpu_id);
guest_enter_irqoff();
- /* The preempt notifier should have taken care of the FPU already. */
- WARN_ON_ONCE(test_thread_flag(TIF_NEED_FPU_LOAD));
+ /* Save host pkru register if supported */
+ vcpu->arch.host_pkru = read_pkru();
+
+ fpregs_assert_state_consistent();
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ switch_fpu_return();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -8174,6 +8289,8 @@
set_debugreg(vcpu->arch.eff_db[3], 3);
set_debugreg(vcpu->arch.dr6, 6);
vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD;
+ } else if (unlikely(hw_breakpoint_active())) {
+ set_debugreg(0, 7);
}
kvm_x86_ops->run(vcpu);
@@ -8300,7 +8417,7 @@
static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu) && kvm_x86_ops->check_nested_events)
- kvm_x86_ops->check_nested_events(vcpu, false);
+ kvm_x86_ops->check_nested_events(vcpu);
return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
!vcpu->arch.apf.halted);
@@ -8436,12 +8553,26 @@
return 0;
}
+static void kvm_save_current_fpu(struct fpu *fpu)
+{
+ /*
+ * If the target FPU state is not resident in the CPU registers, just
+ * memcpy() from current, else save CPU state directly to the target.
+ */
+ if (test_thread_flag(TIF_NEED_FPU_LOAD))
+ memcpy(&fpu->state, ¤t->thread.fpu.state,
+ fpu_kernel_xstate_size);
+ else
+ copy_fpregs_to_fpstate(fpu);
+}
+
/* Swap (qemu) user FPU context for the guest FPU context. */
static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.user_fpu);
+ kvm_save_current_fpu(vcpu->arch.user_fpu);
+
/* PKRU is separately restored in kvm_x86_ops->run. */
__copy_kernel_to_fpregs(&vcpu->arch.guest_fpu->state,
~XFEATURE_MASK_PKRU);
@@ -8457,7 +8588,8 @@
{
fpregs_lock();
- copy_fpregs_to_fpstate(vcpu->arch.guest_fpu);
+ kvm_save_current_fpu(vcpu->arch.guest_fpu);
+
copy_kernel_to_fpregs(&vcpu->arch.user_fpu->state);
fpregs_mark_activate();
@@ -8679,6 +8811,8 @@
struct kvm_mp_state *mp_state)
{
vcpu_load(vcpu);
+ if (kvm_mpx_supported())
+ kvm_load_guest_fpu(vcpu);
kvm_apic_accept_events(vcpu);
if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED &&
@@ -8687,6 +8821,8 @@
else
mp_state->mp_state = vcpu->arch.mp_state;
+ if (kvm_mpx_supported())
+ kvm_put_guest_fpu(vcpu);
vcpu_put(vcpu);
return 0;
}
@@ -9046,6 +9182,9 @@
void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
{
void *wbinvd_dirty_mask = vcpu->arch.wbinvd_dirty_mask;
+ struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache;
+
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
kvmclock_reset(vcpu);
@@ -9110,13 +9249,7 @@
void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
{
- vcpu->arch.apf.msr_val = 0;
-
- vcpu_load(vcpu);
- kvm_mmu_unload(vcpu);
- vcpu_put(vcpu);
-
- kvm_x86_ops->vcpu_free(vcpu);
+ kvm_arch_vcpu_free(vcpu);
}
void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
@@ -9308,6 +9441,8 @@
if (r != 0)
return r;
+ cr4_reserved_bits = kvm_host_cr4_reserved_bits(&boot_cpu_data);
+
if (kvm_has_tsc_control) {
/*
* Make sure the user can only configure tsc_khz values that
@@ -9648,6 +9783,13 @@
{
int i;
+ /*
+ * Clear out the previous array pointers for the KVM_MR_MOVE case. The
+ * old arrays will be freed by __kvm_set_memory_region() if installing
+ * the new memslot is successful.
+ */
+ memset(&slot->arch, 0, sizeof(slot->arch));
+
for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) {
struct kvm_lpage_info *linfo;
unsigned long ugfn;
@@ -9710,11 +9852,18 @@
void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
+ struct kvm_vcpu *vcpu;
+ int i;
+
/*
* memslots->generation has been incremented.
* mmio generation may have reached its maximum value.
*/
kvm_mmu_invalidate_mmio_sptes(kvm, gen);
+
+ /* Force re-initialization of steal_time cache */
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ kvm_vcpu_kick(vcpu);
}
int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -9722,6 +9871,10 @@
const struct kvm_userspace_memory_region *mem,
enum kvm_mr_change change)
{
+ if (change == KVM_MR_MOVE)
+ return kvm_arch_create_memslot(kvm, memslot,
+ mem->memory_size >> PAGE_SHIFT);
+
return 0;
}
@@ -9966,7 +10119,7 @@
work->arch.cr3 != vcpu->arch.mmu->get_cr3(vcpu))
return;
- vcpu->arch.mmu->page_fault(vcpu, work->gva, 0, true);
+ vcpu->arch.mmu->page_fault(vcpu, work->cr2_or_gpa, 0, true);
}
static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
@@ -10079,7 +10232,7 @@
{
struct x86_exception fault;
- trace_kvm_async_pf_not_present(work->arch.token, work->gva);
+ trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
if (kvm_can_deliver_async_pf(vcpu) &&
@@ -10114,7 +10267,7 @@
work->arch.token = ~0; /* broadcast wakeup */
else
kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
- trace_kvm_async_pf_ready(work->arch.token, work->gva);
+ trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);
if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED &&
!apf_get_user(vcpu, &val)) {
@@ -10245,6 +10398,32 @@
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
+int kvm_spec_ctrl_test_value(u64 value)
+{
+ /*
+ * test that setting IA32_SPEC_CTRL to given value
+ * is allowed by the host processor
+ */
+
+ u64 saved_value;
+ unsigned long flags;
+ int ret = 0;
+
+ local_irq_save(flags);
+
+ if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
+ ret = 1;
+ else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
+ ret = 1;
+ else
+ wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
+
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index dbf7442..c520d37 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -286,7 +286,7 @@
bool kvm_mtrr_check_gfn_range_consistency(struct kvm_vcpu *vcpu, gfn_t gfn,
int page_num);
bool kvm_vector_hashing_enabled(void);
-int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
+int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
int emulation_type, void *insn, int insn_len);
#define KVM_SUPPORTED_XCR0 (XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
@@ -368,5 +368,6 @@
void kvm_load_guest_xcr0(struct kvm_vcpu *vcpu);
void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu);
+int kvm_spec_ctrl_test_value(u64 value);
#endif