Update Linux to v5.4.2
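
This pulls in the upstream v5.4.2 version of arch/x86/kvm/svm.c.
Notable SVM changes picked up by this update (all visible in the diff
below):

 - replace the GPL boilerplate with an SPDX-License-Identifier
 - add "nrips" and "dump_invalid_vmcb" module parameters
 - make skip_emulated_instruction() report success or failure
 - convert nested VMCB and SMM state accesses from nested_svm_map()
   to kvm_vcpu_map()/kvm_vcpu_unmap()
 - route RDMSR/WRMSR intercepts through kvm_emulate_rdmsr() and
   kvm_emulate_wrmsr()
 - account VM-scoped allocations with GFP_KERNEL_ACCOUNT
 - track AVIC logical IDs with a per-vCPU dfr_reg instead of the
   per-VM ldr_mode
 - intercept RDPRU and inject #UD
 - work around AMD erratum 1096 (DecodeAssist failing under SMAP)
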
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f1d3fe5..c5673bd 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Kernel-based Virtual Machine driver for Linux
*
@@ -9,10 +10,6 @@
* Authors:
* Yaniv Kamay <yaniv@qumranet.com>
* Avi Kivity <avi@qumranet.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- *
*/
#define pr_fmt(fmt) "SVM: " fmt
@@ -71,10 +68,8 @@
#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3
-#define SVM_FEATURE_NPT (1 << 0)
#define SVM_FEATURE_LBRV (1 << 1)
#define SVM_FEATURE_SVML (1 << 2)
-#define SVM_FEATURE_NRIP (1 << 3)
#define SVM_FEATURE_TSC_RATE (1 << 4)
#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
#define SVM_FEATURE_FLUSH_ASID (1 << 6)
@@ -145,7 +140,6 @@
/* Struct members for AVIC */
u32 avic_vm_id;
- u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
struct hlist_node hnode;
@@ -236,6 +230,7 @@
bool nrips_enabled : 1;
u32 ldr_reg;
+ u32 dfr_reg;
struct page *avic_backing_page;
u64 *avic_physical_id_cache;
bool avic_is_running;
@@ -262,6 +257,7 @@
};
#define AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK (0xFF)
+#define AVIC_LOGICAL_ID_ENTRY_VALID_BIT 31
#define AVIC_LOGICAL_ID_ENTRY_VALID_MASK (1 << 31)
#define AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK (0xFFULL)
@@ -366,6 +362,10 @@
module_param(avic, int, S_IRUGO);
#endif
+/* enable/disable Next RIP Save */
+static int nrips = true;
+module_param(nrips, int, 0444);
+
/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);
@@ -378,6 +378,9 @@
static int sev = IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT);
module_param(sev, int, 0444);
+static bool __read_mostly dump_invalid_vmcb = 0;
+module_param(dump_invalid_vmcb, bool, 0644);
+
static u8 rsm_ins_bytes[] = "\x0f\xaa";
static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
@@ -675,11 +678,6 @@
static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
-struct svm_init_data {
- int cpu;
- int r;
-};
-
static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};
#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
@@ -711,17 +709,17 @@
static inline void clgi(void)
{
- asm volatile (__ex(SVM_CLGI));
+ asm volatile (__ex("clgi"));
}
static inline void stgi(void)
{
- asm volatile (__ex(SVM_STGI));
+ asm volatile (__ex("stgi"));
}
static inline void invlpga(unsigned long addr, u32 asid)
{
- asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
+ asm volatile (__ex("invlpga %1, %0") : : "c"(asid), "a"(addr));
}
static int get_npt_level(struct kvm_vcpu *vcpu)
@@ -736,8 +734,14 @@
static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
vcpu->arch.efer = efer;
- if (!npt_enabled && !(efer & EFER_LMA))
- efer &= ~EFER_LME;
+
+ if (!npt_enabled) {
+ /* Shadow paging assumes NX to be available. */
+ efer |= EFER_NX;
+
+ if (!(efer & EFER_LMA))
+ efer &= ~EFER_LME;
+ }
to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
@@ -770,27 +774,27 @@
}
-static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
+static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- if (svm->vmcb->control.next_rip != 0) {
+ if (nrips && svm->vmcb->control.next_rip != 0) {
WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
svm->next_rip = svm->vmcb->control.next_rip;
}
if (!svm->next_rip) {
- if (kvm_emulate_instruction(vcpu, EMULTYPE_SKIP) !=
- EMULATE_DONE)
- printk(KERN_DEBUG "%s: NOP\n", __func__);
- return;
+ if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
+ return 0;
+ } else {
+ if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
+ pr_err("%s: ip 0x%lx next 0x%llx\n",
+ __func__, kvm_rip_read(vcpu), svm->next_rip);
+ kvm_rip_write(vcpu, svm->next_rip);
}
- if (svm->next_rip - kvm_rip_read(vcpu) > MAX_INST_SIZE)
- printk(KERN_ERR "%s: ip 0x%lx next 0x%llx\n",
- __func__, kvm_rip_read(vcpu), svm->next_rip);
-
- kvm_rip_write(vcpu, svm->next_rip);
svm_set_interrupt_shadow(vcpu, 0);
+
+ return 1;
}
static void svm_queue_exception(struct kvm_vcpu *vcpu)
@@ -809,7 +813,9 @@
nested_svm_check_exception(svm, nr, has_error_code, error_code))
return;
- if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
+ kvm_deliver_exception_payload(&svm->vcpu);
+
+ if (nr == BP_VECTOR && !nrips) {
unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
/*
@@ -819,7 +825,7 @@
* raises a fault that is not intercepted. Still better than
* failing in all cases.
*/
- skip_emulated_instruction(&svm->vcpu);
+ (void)skip_emulated_instruction(&svm->vcpu);
rip = kvm_rip_read(&svm->vcpu);
svm->int3_rip = rip + svm->vmcb->save.cs.base;
svm->int3_injected = rip - old_rip;
@@ -1267,11 +1273,11 @@
pause_filter_count_grow,
pause_filter_count_max);
- if (control->pause_filter_count != old)
+ if (control->pause_filter_count != old) {
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- trace_kvm_ple_window_grow(vcpu->vcpu_id,
- control->pause_filter_count, old);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
}
static void shrink_ple_window(struct kvm_vcpu *vcpu)
@@ -1285,11 +1291,11 @@
pause_filter_count,
pause_filter_count_shrink,
pause_filter_count);
- if (control->pause_filter_count != old)
+ if (control->pause_filter_count != old) {
mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-
- trace_kvm_ple_window_shrink(vcpu->vcpu_id,
- control->pause_filter_count, old);
+ trace_kvm_ple_window_update(vcpu->vcpu_id,
+ control->pause_filter_count, old);
+ }
}
static __init int svm_hardware_setup(void)
@@ -1366,6 +1372,11 @@
} else
kvm_disable_tdp();
+ if (nrips) {
+ if (!boot_cpu_has(X86_FEATURE_NRIPS))
+ nrips = false;
+ }
+
if (avic) {
if (!npt_enabled ||
!boot_cpu_has(X86_FEATURE_AVIC) ||
@@ -1454,10 +1465,11 @@
g_tsc_offset = svm->vmcb->control.tsc_offset -
svm->nested.hsave->control.tsc_offset;
svm->nested.hsave->control.tsc_offset = offset;
- } else
- trace_kvm_write_tsc_offset(vcpu->vcpu_id,
- svm->vmcb->control.tsc_offset,
- offset);
+ }
+
+ trace_kvm_write_tsc_offset(vcpu->vcpu_id,
+ svm->vmcb->control.tsc_offset - g_tsc_offset,
+ offset);
svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
@@ -1534,6 +1546,7 @@
set_intercept(svm, INTERCEPT_SKINIT);
set_intercept(svm, INTERCEPT_WBINVD);
set_intercept(svm, INTERCEPT_XSETBV);
+ set_intercept(svm, INTERCEPT_RDPRU);
set_intercept(svm, INTERCEPT_RSM);
if (!kvm_mwait_in_guest(svm->vcpu.kvm)) {
@@ -1706,7 +1719,6 @@
if (!entry)
return -EINVAL;
- new_entry = READ_ONCE(*entry);
new_entry = __sme_set((page_to_phys(svm->avic_backing_page) &
AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) |
AVIC_PHYSICAL_ID_ENTRY_VALID_MASK);
@@ -1797,15 +1809,16 @@
/* Avoid using vmalloc for smaller buffers. */
size = npages * sizeof(struct page *);
if (size > PAGE_SIZE)
- pages = vmalloc(size);
+ pages = __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
else
- pages = kmalloc(size, GFP_KERNEL);
+ pages = kmalloc(size, GFP_KERNEL_ACCOUNT);
if (!pages)
return NULL;
/* Pin the user virtual address. */
- npinned = get_user_pages_fast(uaddr, npages, write ? FOLL_WRITE : 0, pages);
+ npinned = get_user_pages_fast(uaddr, npages, FOLL_WRITE, pages);
if (npinned != npages) {
pr_err("SEV: Failure locking %lu pages.\n", npages);
goto err;
@@ -1867,7 +1880,9 @@
static struct kvm *svm_vm_alloc(void)
{
- struct kvm_svm *kvm_svm = vzalloc(sizeof(struct kvm_svm));
+ struct kvm_svm *kvm_svm = __vmalloc(sizeof(struct kvm_svm),
+ GFP_KERNEL_ACCOUNT | __GFP_ZERO,
+ PAGE_KERNEL);
return &kvm_svm->kvm;
}
@@ -1942,7 +1957,7 @@
return 0;
/* Allocating physical APIC ID table (4KB) */
- p_page = alloc_page(GFP_KERNEL);
+ p_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!p_page)
goto free_avic;
@@ -1950,7 +1965,7 @@
clear_page(page_address(p_page));
/* Allocating logical APIC ID table (4KB) */
- l_page = alloc_page(GFP_KERNEL);
+ l_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!l_page)
goto free_avic;
@@ -2022,7 +2037,11 @@
if (!kvm_vcpu_apicv_active(vcpu))
return;
- if (WARN_ON(h_physical_id >= AVIC_MAX_PHYSICAL_ID_COUNT))
+ /*
+ * Since the host physical APIC id is 8 bits,
+ * we can support host APIC ID up to 255.
+ */
+ if (WARN_ON(h_physical_id > AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK))
return;
entry = READ_ONCE(*(svm->avic_physical_id_cache));
@@ -2089,7 +2108,7 @@
init_vmcb(svm);
kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true);
- kvm_register_write(vcpu, VCPU_REGS_RDX, eax);
+ kvm_rdx_write(vcpu, eax);
if (kvm_vcpu_apicv_active(vcpu) && !init_event)
avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE);
@@ -2108,6 +2127,7 @@
INIT_LIST_HEAD(&svm->ir_list);
spin_lock_init(&svm->ir_list_lock);
+ svm->dfr_reg = APIC_DFR_FLAT;
return ret;
}
@@ -2121,30 +2141,49 @@
struct page *nested_msrpm_pages;
int err;
- svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
+ BUILD_BUG_ON_MSG(offsetof(struct vcpu_svm, vcpu) != 0,
+ "struct kvm_vcpu must be at offset 0 for arch usercopy region");
+
+ svm = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
if (!svm) {
err = -ENOMEM;
goto out;
}
+ svm->vcpu.arch.user_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!svm->vcpu.arch.user_fpu) {
+ printk(KERN_ERR "kvm: failed to allocate kvm userspace's fpu\n");
+ err = -ENOMEM;
+ goto free_partial_svm;
+ }
+
+ svm->vcpu.arch.guest_fpu = kmem_cache_zalloc(x86_fpu_cache,
+ GFP_KERNEL_ACCOUNT);
+ if (!svm->vcpu.arch.guest_fpu) {
+ printk(KERN_ERR "kvm: failed to allocate vcpu's fpu\n");
+ err = -ENOMEM;
+ goto free_user_fpu;
+ }
+
err = kvm_vcpu_init(&svm->vcpu, kvm, id);
if (err)
goto free_svm;
err = -ENOMEM;
- page = alloc_page(GFP_KERNEL);
+ page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!page)
goto uninit;
- msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!msrpm_pages)
goto free_page1;
- nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
+ nested_msrpm_pages = alloc_pages(GFP_KERNEL_ACCOUNT, MSRPM_ALLOC_ORDER);
if (!nested_msrpm_pages)
goto free_page2;
- hsave_page = alloc_page(GFP_KERNEL);
+ hsave_page = alloc_page(GFP_KERNEL_ACCOUNT);
if (!hsave_page)
goto free_page3;
@@ -2186,6 +2225,10 @@
uninit:
kvm_vcpu_uninit(&svm->vcpu);
free_svm:
+ kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
+free_user_fpu:
+ kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
+free_partial_svm:
kmem_cache_free(kvm_vcpu_cache, svm);
out:
return ERR_PTR(err);
@@ -2215,6 +2258,8 @@
__free_page(virt_to_page(svm->nested.hsave));
__free_pages(virt_to_page(svm->nested.msrpm), MSRPM_ALLOC_ORDER);
kvm_vcpu_uninit(vcpu);
+ kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.user_fpu);
+ kmem_cache_free(x86_fpu_cache, svm->vcpu.arch.guest_fpu);
kmem_cache_free(kvm_vcpu_cache, svm);
}
@@ -2679,6 +2724,7 @@
static int db_interception(struct vcpu_svm *svm)
{
struct kvm_run *kvm_run = svm->vcpu.run;
+ struct kvm_vcpu *vcpu = &svm->vcpu;
if (!(svm->vcpu.guest_debug &
(KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
@@ -2689,6 +2735,8 @@
if (svm->nmi_singlestep) {
disable_nmi_singlestep(svm);
+ /* Make sure we check for pending NMIs upon entry */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
if (svm->vcpu.guest_debug &
@@ -2728,17 +2776,18 @@
{
struct kvm_vcpu *vcpu = &svm->vcpu;
u32 error_code = svm->vmcb->control.exit_info_1;
- int er;
WARN_ON_ONCE(!enable_vmware_backdoor);
- er = kvm_emulate_instruction(vcpu,
- EMULTYPE_VMWARE | EMULTYPE_NO_UD_ON_FAIL);
- if (er == EMULATE_USER_EXIT)
- return 0;
- else if (er != EMULATE_DONE)
+ /*
+ * VMware backdoor emulation on #GP interception only handles IN{S},
+ * OUT{S}, and RDPMC, none of which generate a non-zero error code.
+ */
+ if (error_code) {
kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
- return 1;
+ return 1;
+ }
+ return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
}
static bool is_erratum_383(void)
@@ -2836,7 +2885,7 @@
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
if (string)
- return kvm_emulate_instruction(vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(vcpu, 0);
port = io_info >> 16;
size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -2863,13 +2912,11 @@
static int halt_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
return kvm_emulate_halt(&svm->vcpu);
}
static int vmmcall_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_emulate_hypercall(&svm->vcpu);
}
@@ -2935,19 +2982,22 @@
static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
{
WARN_ON(mmu_is_nested(vcpu));
+
+ vcpu->arch.mmu = &vcpu->arch.guest_mmu;
kvm_init_shadow_mmu(vcpu);
- vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
- vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
- vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr;
- vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
- vcpu->arch.mmu.shadow_root_level = get_npt_level(vcpu);
- reset_shadow_zero_bits_mask(vcpu, &vcpu->arch.mmu);
+ vcpu->arch.mmu->set_cr3 = nested_svm_set_tdp_cr3;
+ vcpu->arch.mmu->get_cr3 = nested_svm_get_tdp_cr3;
+ vcpu->arch.mmu->get_pdptr = nested_svm_get_tdp_pdptr;
+ vcpu->arch.mmu->inject_page_fault = nested_svm_inject_npf_exit;
+ vcpu->arch.mmu->shadow_root_level = get_npt_level(vcpu);
+ reset_shadow_zero_bits_mask(vcpu, vcpu->arch.mmu);
vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
}
static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
{
- vcpu->arch.walk_mmu = &vcpu->arch.mmu;
+ vcpu->arch.mmu = &vcpu->arch.root_mmu;
+ vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
}
static int nested_svm_check_permissions(struct vcpu_svm *svm)
@@ -2983,16 +3033,13 @@
svm->vmcb->control.exit_info_1 = error_code;
/*
- * FIXME: we should not write CR2 when L1 intercepts an L2 #PF exception.
- * The fix is to add the ancillary datum (CR2 or DR6) to structs
- * kvm_queued_exception and kvm_vcpu_events, so that CR2 and DR6 can be
- * written only when inject_pending_event runs (DR6 would written here
- * too). This should be conditional on a new capability---if the
- * capability is disabled, kvm_multiple_exception would write the
- * ancillary information to CR2 or DR6, for backwards ABI-compatibility.
+ * EXITINFO2 is undefined for all exception intercepts other
+ * than #PF.
*/
if (svm->vcpu.arch.exception.nested_apf)
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.apf.nested_apf_token;
+ else if (svm->vcpu.arch.exception.has_payload)
+ svm->vmcb->control.exit_info_2 = svm->vcpu.arch.exception.payload;
else
svm->vmcb->control.exit_info_2 = svm->vcpu.arch.cr2;
@@ -3054,32 +3101,6 @@
return false;
}
-static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, struct page **_page)
-{
- struct page *page;
-
- might_sleep();
-
- page = kvm_vcpu_gfn_to_page(&svm->vcpu, gpa >> PAGE_SHIFT);
- if (is_error_page(page))
- goto error;
-
- *_page = page;
-
- return kmap(page);
-
-error:
- kvm_inject_gp(&svm->vcpu, 0);
-
- return NULL;
-}
-
-static void nested_svm_unmap(struct page *page)
-{
- kunmap(page);
- kvm_release_page_dirty(page);
-}
-
static int nested_svm_intercept_ioio(struct vcpu_svm *svm)
{
unsigned port, size, iopm_len;
@@ -3276,14 +3297,17 @@
dst->event_inj_err = from->event_inj_err;
dst->nested_cr3 = from->nested_cr3;
dst->virt_ext = from->virt_ext;
+ dst->pause_filter_count = from->pause_filter_count;
+ dst->pause_filter_thresh = from->pause_filter_thresh;
}
static int nested_svm_vmexit(struct vcpu_svm *svm)
{
+ int rc;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
+ struct kvm_host_map map;
trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
vmcb->control.exit_info_1,
@@ -3292,9 +3316,14 @@
vmcb->control.exit_int_info_err,
KVM_ISA_SVM);
- nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page);
- if (!nested_vmcb)
+ rc = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->nested.vmcb), &map);
+ if (rc) {
+ if (rc == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
+
+ nested_vmcb = map.hva;
/* Exit Guest-Mode */
leave_guest_mode(&svm->vcpu);
@@ -3354,6 +3383,11 @@
nested_vmcb->control.event_inj = 0;
nested_vmcb->control.event_inj_err = 0;
+ nested_vmcb->control.pause_filter_count =
+ svm->vmcb->control.pause_filter_count;
+ nested_vmcb->control.pause_filter_thresh =
+ svm->vmcb->control.pause_filter_thresh;
+
/* We always set V_INTR_MASKING and remember the old value in hflags */
if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
nested_vmcb->control.int_ctl &= ~V_INTR_MASKING_MASK;
@@ -3384,21 +3418,29 @@
} else {
(void)kvm_set_cr3(&svm->vcpu, hsave->save.cr3);
}
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, hsave->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, hsave->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, hsave->save.rip);
+ kvm_rax_write(&svm->vcpu, hsave->save.rax);
+ kvm_rsp_write(&svm->vcpu, hsave->save.rsp);
+ kvm_rip_write(&svm->vcpu, hsave->save.rip);
svm->vmcb->save.dr7 = 0;
svm->vmcb->save.cpl = 0;
svm->vmcb->control.exit_int_info = 0;
mark_all_dirty(svm->vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
nested_svm_uninit_mmu_context(&svm->vcpu);
kvm_mmu_reset_context(&svm->vcpu);
kvm_mmu_load(&svm->vcpu);
+ /*
+ * Drop what we picked up for L2 via svm_complete_interrupts() so it
+ * doesn't end up in L1.
+ */
+ svm->vcpu.arch.nmi_injected = false;
+ kvm_clear_exception_queue(&svm->vcpu);
+ kvm_clear_interrupt_queue(&svm->vcpu);
+
return 0;
}
@@ -3451,7 +3493,7 @@
}
static void enter_svm_guest_mode(struct vcpu_svm *svm, u64 vmcb_gpa,
- struct vmcb *nested_vmcb, struct page *page)
+ struct vmcb *nested_vmcb, struct kvm_host_map *map)
{
if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
svm->vcpu.arch.hflags |= HF_HIF_MASK;
@@ -3459,7 +3501,6 @@
svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
if (nested_vmcb->control.nested_ctl & SVM_NESTED_CTL_NP_ENABLE) {
- kvm_mmu_unload(&svm->vcpu);
svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
nested_svm_init_mmu_context(&svm->vcpu);
}
@@ -3485,9 +3526,9 @@
kvm_mmu_reset_context(&svm->vcpu);
svm->vmcb->save.cr2 = svm->vcpu.arch.cr2 = nested_vmcb->save.cr2;
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX, nested_vmcb->save.rax);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RSP, nested_vmcb->save.rsp);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RIP, nested_vmcb->save.rip);
+ kvm_rax_write(&svm->vcpu, nested_vmcb->save.rax);
+ kvm_rsp_write(&svm->vcpu, nested_vmcb->save.rsp);
+ kvm_rip_write(&svm->vcpu, nested_vmcb->save.rip);
/* In case we don't even reach vcpu_run, the fields are not updated */
svm->vmcb->save.rax = nested_vmcb->save.rax;
@@ -3531,7 +3572,12 @@
svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
- nested_svm_unmap(page);
+ svm->vmcb->control.pause_filter_count =
+ nested_vmcb->control.pause_filter_count;
+ svm->vmcb->control.pause_filter_thresh =
+ nested_vmcb->control.pause_filter_thresh;
+
+ kvm_vcpu_unmap(&svm->vcpu, map, true);
/* Enter Guest-Mode */
enter_guest_mode(&svm->vcpu);
@@ -3549,19 +3595,28 @@
mark_all_dirty(svm->vmcb);
}
-static bool nested_svm_vmrun(struct vcpu_svm *svm)
+static int nested_svm_vmrun(struct vcpu_svm *svm)
{
+ int ret;
struct vmcb *nested_vmcb;
struct vmcb *hsave = svm->nested.hsave;
struct vmcb *vmcb = svm->vmcb;
- struct page *page;
+ struct kvm_host_map map;
u64 vmcb_gpa;
vmcb_gpa = svm->vmcb->save.rax;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
- return false;
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb_gpa), &map);
+ if (ret == -EINVAL) {
+ kvm_inject_gp(&svm->vcpu, 0);
+ return 1;
+ } else if (ret) {
+ return kvm_skip_emulated_instruction(&svm->vcpu);
+ }
+
+ ret = kvm_skip_emulated_instruction(&svm->vcpu);
+
+ nested_vmcb = map.hva;
if (!nested_vmcb_checks(nested_vmcb)) {
nested_vmcb->control.exit_code = SVM_EXIT_ERR;
@@ -3569,9 +3624,9 @@
nested_vmcb->control.exit_info_1 = 0;
nested_vmcb->control.exit_info_2 = 0;
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
- return false;
+ return ret;
}
trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
@@ -3613,9 +3668,18 @@
copy_vmcb_control_area(hsave, vmcb);
- enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, page);
+ enter_svm_guest_mode(svm, vmcb_gpa, nested_vmcb, &map);
- return true;
+ if (!nested_svm_vmrun_msrpm(svm)) {
+ svm->vmcb->control.exit_code = SVM_EXIT_ERR;
+ svm->vmcb->control.exit_code_hi = 0;
+ svm->vmcb->control.exit_info_1 = 0;
+ svm->vmcb->control.exit_info_2 = 0;
+
+ nested_svm_vmexit(svm);
+ }
+
+ return ret;
}
static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
@@ -3637,21 +3701,25 @@
static int vmload_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
+ struct kvm_host_map map;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ nested_vmcb = map.hva;
+
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
return ret;
}
@@ -3659,21 +3727,25 @@
static int vmsave_interception(struct vcpu_svm *svm)
{
struct vmcb *nested_vmcb;
- struct page *page;
+ struct kvm_host_map map;
int ret;
if (nested_svm_check_permissions(svm))
return 1;
- nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
- if (!nested_vmcb)
+ ret = kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
+ if (ret) {
+ if (ret == -EINVAL)
+ kvm_inject_gp(&svm->vcpu, 0);
return 1;
+ }
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
+ nested_vmcb = map.hva;
+
ret = kvm_skip_emulated_instruction(&svm->vcpu);
nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
- nested_svm_unmap(page);
+ kvm_vcpu_unmap(&svm->vcpu, &map, true);
return ret;
}
@@ -3683,27 +3755,7 @@
if (nested_svm_check_permissions(svm))
return 1;
- /* Save rip after vmrun instruction */
- kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
-
- if (!nested_svm_vmrun(svm))
- return 1;
-
- if (!nested_svm_vmrun_msrpm(svm))
- goto failed;
-
- return 1;
-
-failed:
-
- svm->vmcb->control.exit_code = SVM_EXIT_ERR;
- svm->vmcb->control.exit_code_hi = 0;
- svm->vmcb->control.exit_info_1 = 0;
- svm->vmcb->control.exit_info_2 = 0;
-
- nested_svm_vmexit(svm);
-
- return 1;
+ return nested_svm_vmrun(svm);
}
static int stgi_interception(struct vcpu_svm *svm)
@@ -3720,7 +3772,6 @@
if (vgif_enabled(svm))
clr_intercept(svm, INTERCEPT_STGI);
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
@@ -3736,7 +3787,6 @@
if (nested_svm_check_permissions(svm))
return 1;
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
ret = kvm_skip_emulated_instruction(&svm->vcpu);
disable_gif(svm);
@@ -3755,19 +3805,18 @@
{
struct kvm_vcpu *vcpu = &svm->vcpu;
- trace_kvm_invlpga(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RCX),
- kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ trace_kvm_invlpga(svm->vmcb->save.rip, kvm_rcx_read(&svm->vcpu),
+ kvm_rax_read(&svm->vcpu));
/* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
- kvm_mmu_invlpg(vcpu, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ kvm_mmu_invlpg(vcpu, kvm_rax_read(&svm->vcpu));
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
static int skinit_interception(struct vcpu_svm *svm)
{
- trace_kvm_skinit(svm->vmcb->save.rip, kvm_register_read(&svm->vcpu, VCPU_REGS_RAX));
+ trace_kvm_skinit(svm->vmcb->save.rip, kvm_rax_read(&svm->vcpu));
kvm_queue_exception(&svm->vcpu, UD_VECTOR);
return 1;
@@ -3781,16 +3830,21 @@
static int xsetbv_interception(struct vcpu_svm *svm)
{
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
- u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
+ u32 index = kvm_rcx_read(&svm->vcpu);
if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
return kvm_skip_emulated_instruction(&svm->vcpu);
}
return 1;
}
+static int rdpru_interception(struct vcpu_svm *svm)
+{
+ kvm_queue_exception(&svm->vcpu, UD_VECTOR);
+ return 1;
+}
+
static int task_switch_interception(struct vcpu_svm *svm)
{
u16 tss_selector;
@@ -3843,25 +3897,20 @@
if (reason != TASK_SWITCH_GATE ||
int_type == SVM_EXITINTINFO_TYPE_SOFT ||
(int_type == SVM_EXITINTINFO_TYPE_EXEPT &&
- (int_vec == OF_VECTOR || int_vec == BP_VECTOR)))
- skip_emulated_instruction(&svm->vcpu);
+ (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) {
+ if (!skip_emulated_instruction(&svm->vcpu))
+ return 0;
+ }
if (int_type != SVM_EXITINTINFO_TYPE_SOFT)
int_vec = -1;
- if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
- has_error_code, error_code) == EMULATE_FAIL) {
- svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
- svm->vcpu.run->internal.ndata = 0;
- return 0;
- }
- return 1;
+ return kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason,
+ has_error_code, error_code);
}
static int cpuid_interception(struct vcpu_svm *svm)
{
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
return kvm_emulate_cpuid(&svm->vcpu);
}
@@ -3878,7 +3927,7 @@
static int invlpg_interception(struct vcpu_svm *svm)
{
if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
- return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0);
kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
return kvm_skip_emulated_instruction(&svm->vcpu);
@@ -3886,20 +3935,19 @@
static int emulate_on_interception(struct vcpu_svm *svm)
{
- return kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
+ return kvm_emulate_instruction(&svm->vcpu, 0);
}
static int rsm_interception(struct vcpu_svm *svm)
{
- return kvm_emulate_instruction_from_buffer(&svm->vcpu,
- rsm_ins_bytes, 2) == EMULATE_DONE;
+ return kvm_emulate_instruction_from_buffer(&svm->vcpu, rsm_ins_bytes, 2);
}
static int rdpmc_interception(struct vcpu_svm *svm)
{
int err;
- if (!static_cpu_has(X86_FEATURE_NRIPS))
+ if (!nrips)
return emulate_on_interception(svm);
err = kvm_rdpmc(&svm->vcpu);
@@ -4177,25 +4225,7 @@
static int rdmsr_interception(struct vcpu_svm *svm)
{
- u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
- struct msr_data msr_info;
-
- msr_info.index = ecx;
- msr_info.host_initiated = false;
- if (svm_get_msr(&svm->vcpu, &msr_info)) {
- trace_kvm_msr_read_ex(ecx);
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- } else {
- trace_kvm_msr_read(ecx, msr_info.data);
-
- kvm_register_write(&svm->vcpu, VCPU_REGS_RAX,
- msr_info.data & 0xffffffff);
- kvm_register_write(&svm->vcpu, VCPU_REGS_RDX,
- msr_info.data >> 32);
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
+ return kvm_emulate_rdmsr(&svm->vcpu);
}
static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data)
@@ -4376,7 +4406,7 @@
case MSR_IA32_APICBASE:
if (kvm_vcpu_apicv_active(vcpu))
avic_update_vapic_bar(to_svm(vcpu), data);
- /* Follow through */
+ /* Fall through */
default:
return kvm_set_msr_common(vcpu, msr);
}
@@ -4385,23 +4415,7 @@
static int wrmsr_interception(struct vcpu_svm *svm)
{
- struct msr_data msr;
- u32 ecx = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
- u64 data = kvm_read_edx_eax(&svm->vcpu);
-
- msr.data = data;
- msr.index = ecx;
- msr.host_initiated = false;
-
- svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
- if (kvm_set_msr(&svm->vcpu, &msr)) {
- trace_kvm_msr_write_ex(ecx, data);
- kvm_inject_gp(&svm->vcpu, 0);
- return 1;
- } else {
- trace_kvm_msr_write(ecx, data);
- return kvm_skip_emulated_instruction(&svm->vcpu);
- }
+ return kvm_emulate_wrmsr(&svm->vcpu);
}
static int msr_interception(struct vcpu_svm *svm)
@@ -4507,6 +4521,8 @@
break;
}
case AVIC_IPI_FAILURE_INVALID_TARGET:
+ WARN_ONCE(1, "Invalid IPI target: index=%u, vcpu=%d, icr=%#0x:%#0x\n",
+ index, svm->vcpu.vcpu_id, icrh, icrl);
break;
case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE:
WARN_ONCE(1, "Invalid backing page\n");
@@ -4547,8 +4563,7 @@
return &logical_apic_id_table[index];
}
-static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr,
- bool valid)
+static int avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr)
{
bool flat;
u32 *entry, new_entry;
@@ -4561,31 +4576,40 @@
new_entry = READ_ONCE(*entry);
new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK;
new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK);
- if (valid)
- new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
- else
- new_entry &= ~AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
+ new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK;
WRITE_ONCE(*entry, new_entry);
return 0;
}
+static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+ bool flat = svm->dfr_reg == APIC_DFR_FLAT;
+ u32 *entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat);
+
+ if (entry)
+ clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry);
+}
+
static int avic_handle_ldr_update(struct kvm_vcpu *vcpu)
{
- int ret;
+ int ret = 0;
struct vcpu_svm *svm = to_svm(vcpu);
u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR);
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
- if (!ldr)
- return 1;
+ if (ldr == svm->ldr_reg)
+ return 0;
- ret = avic_ldr_write(vcpu, vcpu->vcpu_id, ldr, true);
- if (ret && svm->ldr_reg) {
- avic_ldr_write(vcpu, 0, svm->ldr_reg, false);
- svm->ldr_reg = 0;
- } else {
+ avic_invalidate_logical_id_entry(vcpu);
+
+ if (ldr)
+ ret = avic_ldr_write(vcpu, id, ldr);
+
+ if (!ret)
svm->ldr_reg = ldr;
- }
+
return ret;
}
@@ -4593,8 +4617,7 @@
{
u64 *old, *new;
struct vcpu_svm *svm = to_svm(vcpu);
- u32 apic_id_reg = kvm_lapic_get_reg(vcpu->arch.apic, APIC_ID);
- u32 id = (apic_id_reg >> 24) & 0xff;
+ u32 id = kvm_xapic_id(vcpu->arch.apic);
if (vcpu->vcpu_id == id)
return 0;
@@ -4619,27 +4642,16 @@
return 0;
}
-static int avic_handle_dfr_update(struct kvm_vcpu *vcpu)
+static void avic_handle_dfr_update(struct kvm_vcpu *vcpu)
{
struct vcpu_svm *svm = to_svm(vcpu);
- struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm);
u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR);
- u32 mod = (dfr >> 28) & 0xf;
- /*
- * We assume that all local APICs are using the same type.
- * If this changes, we need to flush the AVIC logical
- * APID id table.
- */
- if (kvm_svm->ldr_mode == mod)
- return 0;
+ if (svm->dfr_reg == dfr)
+ return;
- clear_page(page_address(kvm_svm->avic_logical_id_table_page));
- kvm_svm->ldr_mode = mod;
-
- if (svm->ldr_reg)
- avic_handle_ldr_update(vcpu);
- return 0;
+ avic_invalidate_logical_id_entry(vcpu);
+ svm->dfr_reg = dfr;
}
static int avic_unaccel_trap_write(struct vcpu_svm *svm)
@@ -4717,7 +4729,7 @@
ret = avic_unaccel_trap_write(svm);
} else {
/* Handling Fault */
- ret = (kvm_emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE);
+ ret = kvm_emulate_instruction(&svm->vcpu, 0);
}
return ret;
@@ -4784,6 +4796,7 @@
[SVM_EXIT_MONITOR] = monitor_interception,
[SVM_EXIT_MWAIT] = mwait_interception,
[SVM_EXIT_XSETBV] = xsetbv_interception,
+ [SVM_EXIT_RDPRU] = rdpru_interception,
[SVM_EXIT_NPF] = npf_interception,
[SVM_EXIT_RSM] = rsm_interception,
[SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception,
@@ -4796,6 +4809,11 @@
struct vmcb_control_area *control = &svm->vmcb->control;
struct vmcb_save_area *save = &svm->vmcb->save;
+ if (!dump_invalid_vmcb) {
+ pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
+ return;
+ }
+
pr_err("VMCB Control Area:\n");
pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
@@ -4954,7 +4972,6 @@
kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
kvm_run->fail_entry.hardware_entry_failure_reason
= svm->vmcb->control.exit_code;
- pr_err("KVM: FAILED VMRUN WITH VMCB:\n");
dump_vmcb(vcpu);
return 0;
}
@@ -4970,9 +4987,14 @@
if (exit_code >= ARRAY_SIZE(svm_exit_handlers)
|| !svm_exit_handlers[exit_code]) {
- WARN_ONCE(1, "svm: unexpected exit reason 0x%x\n", exit_code);
- kvm_queue_exception(vcpu, UD_VECTOR);
- return 1;
+ vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%x\n", exit_code);
+ dump_vmcb(vcpu);
+ vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+ vcpu->run->internal.suberror =
+ KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
+ vcpu->run->internal.ndata = 1;
+ vcpu->run->internal.data[0] = exit_code;
+ return 0;
}
return svm_exit_handlers[exit_code](svm);
@@ -5107,11 +5129,11 @@
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *vmcb = svm->vmcb;
- if (!kvm_vcpu_apicv_active(&svm->vcpu))
- return;
-
- vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
- mark_dirty(vmcb, VMCB_INTR);
+ if (kvm_vcpu_apicv_active(vcpu))
+ vmcb->control.int_ctl |= AVIC_ENABLE_MASK;
+ else
+ vmcb->control.int_ctl &= ~AVIC_ENABLE_MASK;
+ mark_dirty(vmcb, VMCB_AVIC);
}
static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
@@ -5124,13 +5146,21 @@
kvm_lapic_set_irr(vec, vcpu->arch.apic);
smp_mb__after_atomic();
- if (avic_vcpu_is_running(vcpu))
- wrmsrl(SVM_AVIC_DOORBELL,
- kvm_cpu_get_apicid(vcpu->cpu));
- else
+ if (avic_vcpu_is_running(vcpu)) {
+ int cpuid = vcpu->cpu;
+
+ if (cpuid != get_cpu())
+ wrmsrl(SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpuid));
+ put_cpu();
+ } else
kvm_vcpu_wake_up(vcpu);
}
+static bool svm_dy_apicv_has_pending_interrupt(struct kvm_vcpu *vcpu)
+{
+ return false;
+}
+
static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi)
{
unsigned long flags;
@@ -5177,7 +5207,7 @@
* Allocating new amd_iommu_pi_data, which will get
* add to the per-vcpu ir_list.
*/
- ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL);
+ ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT);
if (!ir) {
ret = -ENOMEM;
goto out;
@@ -5211,7 +5241,8 @@
kvm_set_msi_irq(kvm, e, &irq);
- if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
+ if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) ||
+ !kvm_irq_is_postable(&irq)) {
pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n",
__func__, irq.vector);
return -1;
@@ -5265,6 +5296,7 @@
* 1. When cannot target interrupt to a specific vcpu.
* 2. Unsetting posted interrupt.
* 3. APIC virtialization is disabled for the vcpu.
+ * 4. IRQ has incompatible delivery mode (SMI, INIT, etc)
*/
if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set &&
kvm_vcpu_apicv_active(&svm->vcpu)) {
@@ -5602,6 +5634,11 @@
svm->vmcb->save.cr2 = vcpu->arch.cr2;
clgi();
+ kvm_load_guest_xcr0(vcpu);
+
+ if (lapic_in_kernel(vcpu) &&
+ vcpu->arch.apic->lapic_timer.timer_advance_ns)
+ kvm_wait_lapic_expire(vcpu);
/*
* If this vCPU has touched SPEC_CTRL, restore the guest's value if
@@ -5635,9 +5672,9 @@
/* Enter guest mode */
"push %%" _ASM_AX " \n\t"
"mov %c[vmcb](%[svm]), %%" _ASM_AX " \n\t"
- __ex(SVM_VMLOAD) "\n\t"
- __ex(SVM_VMRUN) "\n\t"
- __ex(SVM_VMSAVE) "\n\t"
+ __ex("vmload %%" _ASM_AX) "\n\t"
+ __ex("vmrun %%" _ASM_AX) "\n\t"
+ __ex("vmsave %%" _ASM_AX) "\n\t"
"pop %%" _ASM_AX " \n\t"
/* Save guest registers, load host registers */
@@ -5656,26 +5693,24 @@
"mov %%r13, %c[r13](%[svm]) \n\t"
"mov %%r14, %c[r14](%[svm]) \n\t"
"mov %%r15, %c[r15](%[svm]) \n\t"
-#endif
/*
* Clear host registers marked as clobbered to prevent
* speculative use.
*/
- "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t"
- "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t"
- "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t"
- "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t"
- "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t"
-#ifdef CONFIG_X86_64
- "xor %%r8, %%r8 \n\t"
- "xor %%r9, %%r9 \n\t"
- "xor %%r10, %%r10 \n\t"
- "xor %%r11, %%r11 \n\t"
- "xor %%r12, %%r12 \n\t"
- "xor %%r13, %%r13 \n\t"
- "xor %%r14, %%r14 \n\t"
- "xor %%r15, %%r15 \n\t"
+ "xor %%r8d, %%r8d \n\t"
+ "xor %%r9d, %%r9d \n\t"
+ "xor %%r10d, %%r10d \n\t"
+ "xor %%r11d, %%r11d \n\t"
+ "xor %%r12d, %%r12d \n\t"
+ "xor %%r13d, %%r13d \n\t"
+ "xor %%r14d, %%r14d \n\t"
+ "xor %%r15d, %%r15d \n\t"
#endif
+ "xor %%ebx, %%ebx \n\t"
+ "xor %%ecx, %%ecx \n\t"
+ "xor %%edx, %%edx \n\t"
+ "xor %%esi, %%esi \n\t"
+ "xor %%edi, %%edi \n\t"
"pop %%" _ASM_BP
:
: [svm]"a"(svm),
@@ -5749,6 +5784,7 @@
if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
kvm_before_interrupt(&svm->vcpu);
+ kvm_put_guest_xcr0(vcpu);
stgi();
/* Any pending NMI will happen here */
@@ -5825,9 +5861,9 @@
hypercall[2] = 0xd9;
}
-static void svm_check_processor_compat(void *rtn)
+static int __init svm_check_processor_compat(void)
{
- *(int *)rtn = 0;
+ return 0;
}
static bool svm_cpu_has_accelerated_tpr(void)
@@ -5837,6 +5873,14 @@
static bool svm_has_emulated_msr(int index)
{
+ switch (index) {
+ case MSR_IA32_MCG_EXT_CTL:
+ case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
+ return false;
+ default:
+ break;
+ }
+
return true;
}
@@ -5858,6 +5902,8 @@
guest_cpuid_clear(vcpu, X86_FEATURE_X2APIC);
}
+#define F(x) bit(X86_FEATURE_##x)
+
static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
{
switch (func) {
@@ -5869,6 +5915,11 @@
if (nested)
entry->ecx |= (1 << 2); /* Set SVM bit */
break;
+ case 0x80000008:
+ if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) ||
+ boot_cpu_has(X86_FEATURE_AMD_SSBD))
+ entry->ebx |= F(VIRT_SSBD);
+ break;
case 0x8000000A:
entry->eax = 1; /* SVM revision 1 */
entry->ebx = 8; /* Lets support 8 ASIDs in case we add proper
@@ -5879,11 +5930,11 @@
/* Support next_rip if host supports it */
if (boot_cpu_has(X86_FEATURE_NRIPS))
- entry->edx |= SVM_FEATURE_NRIP;
+ entry->edx |= F(NRIPS);
/* Support NPT for the guest if enabled */
if (npt_enabled)
- entry->edx |= SVM_FEATURE_NPT;
+ entry->edx |= F(NPT);
break;
case 0x8000001F:
@@ -5925,6 +5976,11 @@
return false;
}
+static bool svm_pt_supported(void)
+{
+ return false;
+}
+
static bool svm_has_wbinvd_exit(void)
{
return true;
@@ -5987,6 +6043,7 @@
[x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
[x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
[x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
+ [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV),
};
#undef PRE_EX
@@ -6114,15 +6171,9 @@
return ret;
}
-static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
+static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu)
{
- local_irq_enable();
- /*
- * We must have an instruction with interrupts enabled, so
- * the timer interrupt isn't delayed by the interrupt shadow.
- */
- asm("nop");
- local_irq_disable();
+
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
@@ -6135,8 +6186,7 @@
{
if (avic_handle_apic_id_update(vcpu) != 0)
return;
- if (avic_handle_dfr_update(vcpu) != 0)
- return;
+ avic_handle_dfr_update(vcpu);
avic_handle_ldr_update(vcpu);
}
@@ -6187,32 +6237,24 @@
return 0;
}
-static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, u64 smbase)
+static int svm_pre_leave_smm(struct kvm_vcpu *vcpu, const char *smstate)
{
struct vcpu_svm *svm = to_svm(vcpu);
struct vmcb *nested_vmcb;
- struct page *page;
- struct {
- u64 guest;
- u64 vmcb;
- } svm_state_save;
- int ret;
+ struct kvm_host_map map;
+ u64 guest;
+ u64 vmcb;
- ret = kvm_vcpu_read_guest(vcpu, smbase + 0xfed8, &svm_state_save,
- sizeof(svm_state_save));
- if (ret)
- return ret;
+ guest = GET_SMSTATE(u64, smstate, 0x7ed8);
+ vmcb = GET_SMSTATE(u64, smstate, 0x7ee0);
- if (svm_state_save.guest) {
- vcpu->arch.hflags &= ~HF_SMM_MASK;
- nested_vmcb = nested_svm_map(svm, svm_state_save.vmcb, &page);
- if (nested_vmcb)
- enter_svm_guest_mode(svm, svm_state_save.vmcb, nested_vmcb, page);
- else
- ret = 1;
- vcpu->arch.hflags |= HF_SMM_MASK;
+ if (guest) {
+ if (kvm_vcpu_map(&svm->vcpu, gpa_to_gfn(vmcb), &map) == -EINVAL)
+ return 1;
+ nested_vmcb = map.hva;
+ enter_svm_guest_mode(svm, vmcb, nested_vmcb, &map);
}
- return ret;
+ return 0;
}
static int enable_smi_window(struct kvm_vcpu *vcpu)
@@ -6249,6 +6291,9 @@
int asid, ret;
ret = -EBUSY;
+ if (unlikely(sev->active))
+ return ret;
+
asid = sev_asid_new();
if (asid < 0)
return ret;
@@ -6280,7 +6325,7 @@
if (ret)
return ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6330,7 +6375,7 @@
if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- start = kzalloc(sizeof(*start), GFP_KERNEL);
+ start = kzalloc(sizeof(*start), GFP_KERNEL_ACCOUNT);
if (!start)
return -ENOMEM;
@@ -6391,11 +6436,11 @@
return ret;
}
-static int get_num_contig_pages(int idx, struct page **inpages,
- unsigned long npages)
+static unsigned long get_num_contig_pages(unsigned long idx,
+ struct page **inpages, unsigned long npages)
{
unsigned long paddr, next_paddr;
- int i = idx + 1, pages = 1;
+ unsigned long i = idx + 1, pages = 1;
/* find the number of contiguous pages starting from idx */
paddr = __sme_page_pa(inpages[idx]);
@@ -6414,12 +6459,12 @@
static int sev_launch_update_data(struct kvm *kvm, struct kvm_sev_cmd *argp)
{
- unsigned long vaddr, vaddr_end, next_vaddr, npages, size;
+ unsigned long vaddr, vaddr_end, next_vaddr, npages, pages, size, i;
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_launch_update_data params;
struct sev_data_launch_update_data *data;
struct page **inpages;
- int i, ret, pages;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6427,7 +6472,7 @@
if (copy_from_user(¶ms, (void __user *)(uintptr_t)argp->data, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6504,7 +6549,7 @@
if (copy_from_user(¶ms, measure, sizeof(params)))
return -EFAULT;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6566,7 +6611,7 @@
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6587,7 +6632,7 @@
if (!sev_guest(kvm))
return -ENOTTY;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6615,7 +6660,7 @@
struct sev_data_dbg *data;
int ret;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
return -ENOMEM;
@@ -6768,7 +6813,8 @@
struct page **src_p, **dst_p;
struct kvm_sev_dbg debug;
unsigned long n;
- int ret, size;
+ unsigned int size;
+ int ret;
if (!sev_guest(kvm))
return -ENOTTY;
@@ -6776,6 +6822,11 @@
if (copy_from_user(&debug, (void __user *)(uintptr_t)argp->data, sizeof(debug)))
return -EFAULT;
+ if (!debug.len || debug.src_uaddr + debug.len < debug.src_uaddr)
+ return -EINVAL;
+ if (!debug.dst_uaddr)
+ return -EINVAL;
+
vaddr = debug.src_uaddr;
size = debug.len;
vaddr_end = vaddr + size;
@@ -6826,8 +6877,8 @@
dst_vaddr,
len, &argp->error);
- sev_unpin_memory(kvm, src_p, 1);
- sev_unpin_memory(kvm, dst_p, 1);
+ sev_unpin_memory(kvm, src_p, n);
+ sev_unpin_memory(kvm, dst_p, n);
if (ret)
goto err;
@@ -6870,7 +6921,7 @@
}
ret = -ENOMEM;
- data = kzalloc(sizeof(*data), GFP_KERNEL);
+ data = kzalloc(sizeof(*data), GFP_KERNEL_ACCOUNT);
if (!data)
goto e_unpin_memory;
@@ -6976,7 +7027,7 @@
if (range->addr > ULONG_MAX || range->size > ULONG_MAX)
return -EINVAL;
- region = kzalloc(sizeof(*region), GFP_KERNEL);
+ region = kzalloc(sizeof(*region), GFP_KERNEL_ACCOUNT);
if (!region)
return -ENOMEM;
@@ -7054,6 +7105,79 @@
return ret;
}
+static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+{
+ unsigned long cr4 = kvm_read_cr4(vcpu);
+ bool smep = cr4 & X86_CR4_SMEP;
+ bool smap = cr4 & X86_CR4_SMAP;
+ bool is_user = svm_get_cpl(vcpu) == 3;
+
+ /*
+ * Detect and work around Erratum 1096 Fam_17h_00_0Fh.
+ *
+ * Errata:
+ * When the CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is
+ * possible that CPU microcode implementing DecodeAssist will fail
+ * to read bytes of instruction which caused #NPF. In this case,
+ * GuestIntrBytes field of the VMCB on a VMEXIT will incorrectly
+ * return 0 instead of the correct guest instruction bytes.
+ *
+ * This happens because CPU microcode reading instruction bytes
+ * uses a special opcode which attempts to read data using CPL=0
+ * privileges. The microcode reads CS:RIP and if it hits a SMAP
+ * fault, it gives up and returns no instruction bytes.
+ *
+ * Detection:
+ * We reach here in case CPU supports DecodeAssist, raised #NPF and
+ * returned 0 in GuestIntrBytes field of the VMCB.
+ * First, the erratum can only be triggered when vCPU CR4.SMAP=1.
+ * Second, if vCPU CR4.SMEP=1, the erratum can only be triggered
+ * when vCPU CPL==3 (because otherwise the guest would have triggered
+ * a SMEP fault instead of #NPF).
+ * Otherwise, with vCPU CR4.SMEP=0, the erratum can be triggered at any
+ * vCPU CPL.
+ * As most guests enable SMAP if they have also enabled SMEP, use the
+ * above logic in order to minimize false positives when detecting the
+ * erratum while still preserving semantic correctness in all cases.
+ *
+ * Workaround:
+ * To determine what instruction the guest was executing, the hypervisor
+ * will have to decode the instruction at the instruction pointer.
+ *
+ * In non SEV guest, hypervisor will be able to read the guest
+ * memory to decode the instruction pointer when insn_len is zero
+ * so we return true to indicate that decoding is possible.
+ *
+ * But in the SEV guest, the guest memory is encrypted with the
+ * guest specific key and hypervisor will not be able to decode the
+ * instruction pointer, so we will not be able to work around it. Let's
+ * print the error and request to kill the guest.
+ */
+ if (smap && (!smep || is_user)) {
+ if (!sev_guest(vcpu->kvm))
+ return true;
+
+ pr_err_ratelimited("KVM: SEV Guest triggered AMD Erratum 1096\n");
+ kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
+ }
+
+ return false;
+}
+
+static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_svm *svm = to_svm(vcpu);
+
+ /*
+ * TODO: The last condition latches INIT signals on the vCPU when the
+ * vCPU is in guest-mode and vmcb12 defines intercept on INIT.
+ * To properly emulate the INIT intercept, SVM should implement
+ * kvm_x86_ops->check_nested_events() and call nested_svm_vmexit()
+ * there if an INIT signal is pending.
+ */
+ return !gif_set(svm) ||
+ (svm->vmcb->control.intercept & (1ULL << INTERCEPT_INIT));
+}
+
static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.cpu_has_kvm_support = has_svm,
.disabled_by_bios = is_disabled,
@@ -7153,6 +7277,7 @@
.mpx_supported = svm_mpx_supported,
.xsaves_supported = svm_xsaves_supported,
.umip_emulated = svm_umip_emulated,
+ .pt_supported = svm_pt_supported,
.set_supported_cpuid = svm_set_supported_cpuid,
@@ -7164,7 +7289,7 @@
.set_tdp_cr3 = set_tdp_cr3,
.check_intercept = svm_check_intercept,
- .handle_external_intr = svm_handle_external_intr,
+ .handle_exit_irqoff = svm_handle_exit_irqoff,
.request_immediate_exit = __kvm_request_immediate_exit,
@@ -7172,6 +7297,7 @@
.pmu_ops = &amd_pmu_ops,
.deliver_posted_interrupt = svm_deliver_avic_intr,
+ .dy_apicv_has_pending_interrupt = svm_dy_apicv_has_pending_interrupt,
.update_pi_irte = svm_update_pi_irte,
.setup_mce = svm_setup_mce,
@@ -7183,6 +7309,13 @@
.mem_enc_op = svm_mem_enc_op,
.mem_enc_reg_region = svm_register_enc_region,
.mem_enc_unreg_region = svm_unregister_enc_region,
+
+ .nested_enable_evmcs = NULL,
+ .nested_get_evmcs_version = NULL,
+
+ .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+
+ .apic_init_signal_blocked = svm_apic_init_signal_blocked,
};
static int __init svm_init(void)