Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/virt/kvm/arm/aarch32.c b/virt/kvm/arm/aarch32.c
index c4c57ba..f2047fc 100644
--- a/virt/kvm/arm/aarch32.c
+++ b/virt/kvm/arm/aarch32.c
@@ -10,10 +10,15 @@
* Author: Christoffer Dall <c.dall@virtualopensystems.com>
*/
+#include <linux/bits.h>
#include <linux/kvm_host.h>
#include <asm/kvm_emulate.h>
#include <asm/kvm_hyp.h>
+#define DFSR_FSC_EXTABT_LPAE 0x10
+#define DFSR_FSC_EXTABT_nLPAE 0x08
+#define DFSR_LPAE BIT(9)
+
/*
* Table taken from ARMv8 ARM DDI0487B-B, table G1-10.
*/
@@ -28,25 +33,135 @@
[7] = { 4, 4 }, /* FIQ, unused */
};
+static bool pre_fault_synchronize(struct kvm_vcpu *vcpu)
+{
+ preempt_disable();
+ if (kvm_arm_vcpu_loaded(vcpu)) {
+ kvm_arch_vcpu_put(vcpu);
+ return true;
+ }
+
+ preempt_enable();
+ return false;
+}
+
+static void post_fault_synchronize(struct kvm_vcpu *vcpu, bool loaded)
+{
+ if (loaded) {
+ kvm_arch_vcpu_load(vcpu, smp_processor_id());
+ preempt_enable();
+ }
+}
+
+/*
+ * When an exception is taken, most CPSR fields are left unchanged in the
+ * handler. However, some are explicitly overridden (e.g. M[4:0]).
+ *
+ * The SPSR/SPSR_ELx layouts differ, and the below is intended to work with
+ * either format. Note: SPSR.J bit doesn't exist in SPSR_ELx, but this bit was
+ * obsoleted by the ARMv7 virtualization extensions and is RES0.
+ *
+ * For the SPSR layout seen from AArch32, see:
+ * - ARM DDI 0406C.d, page B1-1148
+ * - ARM DDI 0487E.a, page G8-6264
+ *
+ * For the SPSR_ELx layout for AArch32 seen from AArch64, see:
+ * - ARM DDI 0487E.a, page C5-426
+ *
+ * Here we manipulate the fields in order of the AArch32 SPSR_ELx layout, from
+ * MSB to LSB.
+ */
+static unsigned long get_except32_cpsr(struct kvm_vcpu *vcpu, u32 mode)
+{
+ u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
+ unsigned long old, new;
+
+ old = *vcpu_cpsr(vcpu);
+ new = 0;
+
+ new |= (old & PSR_AA32_N_BIT);
+ new |= (old & PSR_AA32_Z_BIT);
+ new |= (old & PSR_AA32_C_BIT);
+ new |= (old & PSR_AA32_V_BIT);
+ new |= (old & PSR_AA32_Q_BIT);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See ARM DDI 0487E.a, section G1.12.3
+ // See ARM DDI 0406C.d, section B1.8.3
+
+ new |= (old & PSR_AA32_DIT_BIT);
+
+ // CPSR.SSBS is set to SCTLR.DSSBS upon any exception
+ // See ARM DDI 0487E.a, page G8-6244
+ if (sctlr & BIT(31))
+ new |= PSR_AA32_SSBS_BIT;
+
+ // CPSR.PAN is unchanged unless SCTLR.SPAN == 0b0
+ // SCTLR.SPAN is RES1 when ARMv8.1-PAN is not implemented
+ // See ARM DDI 0487E.a, page G8-6246
+ new |= (old & PSR_AA32_PAN_BIT);
+ if (!(sctlr & BIT(23)))
+ new |= PSR_AA32_PAN_BIT;
+
+ // SS does not exist in AArch32, so ignore
+
+ // CPSR.IL is set to zero upon any exception
+ // See ARM DDI 0487E.a, page G1-5527
+
+ new |= (old & PSR_AA32_GE_MASK);
+
+ // CPSR.IT[7:0] are set to zero upon any exception
+ // See prior comment above
+
+ // CPSR.E is set to SCTLR.EE upon any exception
+ // See ARM DDI 0487E.a, page G8-6245
+ // See ARM DDI 0406C.d, page B4-1701
+ if (sctlr & BIT(25))
+ new |= PSR_AA32_E_BIT;
+
+ // CPSR.A is unchanged upon an exception to Undefined, Supervisor
+ // CPSR.A is set upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_A_BIT);
+ if (mode != PSR_AA32_MODE_UND && mode != PSR_AA32_MODE_SVC)
+ new |= PSR_AA32_A_BIT;
+
+ // CPSR.I is set upon any exception
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= PSR_AA32_I_BIT;
+
+ // CPSR.F is set upon an exception to FIQ
+ // CPSR.F is unchanged upon an exception to other modes
+ // See ARM DDI 0487E.a, pages G1-5515 to G1-5516
+ // See ARM DDI 0406C.d, page B1-1182
+ new |= (old & PSR_AA32_F_BIT);
+ if (mode == PSR_AA32_MODE_FIQ)
+ new |= PSR_AA32_F_BIT;
+
+ // CPSR.T is set to SCTLR.TE upon any exception
+ // See ARM DDI 0487E.a, page G8-5514
+ // See ARM DDI 0406C.d, page B1-1181
+ if (sctlr & BIT(30))
+ new |= PSR_AA32_T_BIT;
+
+ new |= mode;
+
+ return new;
+}
+
static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset)
{
- unsigned long cpsr;
- unsigned long new_spsr_value = *vcpu_cpsr(vcpu);
- bool is_thumb = (new_spsr_value & PSR_AA32_T_BIT);
+ unsigned long spsr = *vcpu_cpsr(vcpu);
+ bool is_thumb = (spsr & PSR_AA32_T_BIT);
u32 return_offset = return_offsets[vect_offset >> 2][is_thumb];
u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR);
- cpsr = mode | PSR_AA32_I_BIT;
-
- if (sctlr & (1 << 30))
- cpsr |= PSR_AA32_T_BIT;
- if (sctlr & (1 << 25))
- cpsr |= PSR_AA32_E_BIT;
-
- *vcpu_cpsr(vcpu) = cpsr;
+ *vcpu_cpsr(vcpu) = get_except32_cpsr(vcpu, mode);
/* Note: These now point to the banked copies */
- vcpu_write_spsr(vcpu, new_spsr_value);
+ vcpu_write_spsr(vcpu, host_spsr_to_spsr32(spsr));
*vcpu_reg32(vcpu, 14) = *vcpu_pc(vcpu) + return_offset;
/* Branch to exception vector */
@@ -60,7 +175,10 @@
void kvm_inject_undef32(struct kvm_vcpu *vcpu)
{
+ bool loaded = pre_fault_synchronize(vcpu);
+
prepare_fault32(vcpu, PSR_AA32_MODE_UND, 4);
+ post_fault_synchronize(vcpu, loaded);
}
/*
@@ -73,6 +191,9 @@
u32 vect_offset;
u32 *far, *fsr;
bool is_lpae;
+ bool loaded;
+
+ loaded = pre_fault_synchronize(vcpu);
if (is_pabt) {
vect_offset = 12;
@@ -84,16 +205,20 @@
fsr = &vcpu_cp15(vcpu, c5_DFSR);
}
- prepare_fault32(vcpu, PSR_AA32_MODE_ABT | PSR_AA32_A_BIT, vect_offset);
+ prepare_fault32(vcpu, PSR_AA32_MODE_ABT, vect_offset);
*far = addr;
/* Give the guest an IMPLEMENTATION DEFINED exception */
is_lpae = (vcpu_cp15(vcpu, c2_TTBCR) >> 31);
- if (is_lpae)
- *fsr = 1 << 9 | 0x34;
- else
- *fsr = 0x14;
+ if (is_lpae) {
+ *fsr = DFSR_LPAE | DFSR_FSC_EXTABT_LPAE;
+ } else {
+ /* no need to shuffle FS[4] into DFSR[10] as its 0 */
+ *fsr = DFSR_FSC_EXTABT_nLPAE;
+ }
+
+ post_fault_synchronize(vcpu, loaded);
}
void kvm_inject_dabt32(struct kvm_vcpu *vcpu, unsigned long addr)
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index e2bb5bd..6b22210 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -805,6 +805,7 @@
switch (treg) {
case TIMER_REG_TVAL:
val = timer->cnt_cval - kvm_phys_timer_read() + timer->cntvoff;
+ val &= lower_32_bits(val);
break;
case TIMER_REG_CTL:
@@ -850,7 +851,7 @@
{
switch (treg) {
case TIMER_REG_TVAL:
- timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + val;
+ timer->cnt_cval = kvm_phys_timer_read() - timer->cntvoff + (s32)val;
break;
case TIMER_REG_CTL:
diff --git a/virt/kvm/arm/arm.c b/virt/kvm/arm/arm.c
index 86c6aa1..f7150fb 100644
--- a/virt/kvm/arm/arm.c
+++ b/virt/kvm/arm/arm.c
@@ -354,6 +354,16 @@
return kvm_vgic_vcpu_init(vcpu);
}
+#ifdef CONFIG_ARM64
+#define __ptrauth_save_key(regs, key) \
+({ \
+ regs[key ## KEYLO_EL1] = read_sysreg_s(SYS_ ## key ## KEYLO_EL1); \
+ regs[key ## KEYHI_EL1] = read_sysreg_s(SYS_ ## key ## KEYHI_EL1); \
+})
+#else
+#define __ptrauth_save_key(regs, key) do { } while (0)
+#endif
+
void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
int *last_ran;
@@ -363,11 +373,17 @@
cpu_data = this_cpu_ptr(&kvm_host_data);
/*
+ * We guarantee that both TLBs and I-cache are private to each
+ * vcpu. If detecting that a vcpu from the same VM has
+ * previously run on the same physical CPU, call into the
+ * hypervisor code to nuke the relevant contexts.
+ *
+ * We might get preempted before the vCPU actually runs, but
* We might get preempted before the vCPU actually runs, but
* over-invalidation doesn't affect correctness.
*/
if (*last_ran != vcpu->vcpu_id) {
- kvm_call_hyp(__kvm_tlb_flush_local_vmid, vcpu);
+ kvm_call_hyp(__kvm_flush_cpu_context, vcpu);
*last_ran = vcpu->vcpu_id;
}
@@ -386,7 +402,17 @@
else
vcpu_set_wfe_traps(vcpu);
- vcpu_ptrauth_setup_lazy(vcpu);
+ if (vcpu_has_ptrauth(vcpu)) {
+ struct kvm_cpu_context __maybe_unused *ctxt = vcpu->arch.host_cpu_context;
+
+ __ptrauth_save_key(ctxt->sys_regs, APIA);
+ __ptrauth_save_key(ctxt->sys_regs, APIB);
+ __ptrauth_save_key(ctxt->sys_regs, APDA);
+ __ptrauth_save_key(ctxt->sys_regs, APDB);
+ __ptrauth_save_key(ctxt->sys_regs, APGA);
+
+ vcpu_ptrauth_disable(vcpu);
+ }
}
void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
@@ -553,6 +579,8 @@
vcpu->arch.has_run_once = true;
+ kvm_arm_vcpu_init_debug(vcpu);
+
if (likely(irqchip_in_kernel(kvm))) {
/*
* Map the VGIC hardware resources before running a vcpu the
@@ -1113,6 +1141,14 @@
if (copy_from_user(®, argp, sizeof(reg)))
break;
+ /*
+ * We could owe a reset due to PSCI. Handle the pending reset
+ * here to ensure userspace register accesses are ordered after
+ * the reset.
+ */
+ if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu))
+ kvm_reset_vcpu(vcpu);
+
if (ioctl == KVM_SET_ONE_REG)
r = kvm_arm_set_reg(vcpu, ®);
else
diff --git a/virt/kvm/arm/hyp/aarch32.c b/virt/kvm/arm/hyp/aarch32.c
index d31f267..25c0e47 100644
--- a/virt/kvm/arm/hyp/aarch32.c
+++ b/virt/kvm/arm/hyp/aarch32.c
@@ -125,12 +125,16 @@
*/
void __hyp_text kvm_skip_instr32(struct kvm_vcpu *vcpu, bool is_wide_instr)
{
+ u32 pc = *vcpu_pc(vcpu);
bool is_thumb;
is_thumb = !!(*vcpu_cpsr(vcpu) & PSR_AA32_T_BIT);
if (is_thumb && !is_wide_instr)
- *vcpu_pc(vcpu) += 2;
+ pc += 2;
else
- *vcpu_pc(vcpu) += 4;
+ pc += 4;
+
+ *vcpu_pc(vcpu) = pc;
+
kvm_adjust_itstate(vcpu);
}
diff --git a/virt/kvm/arm/mmio.c b/virt/kvm/arm/mmio.c
index 6af5c91..1e9ec87 100644
--- a/virt/kvm/arm/mmio.c
+++ b/virt/kvm/arm/mmio.c
@@ -105,6 +105,9 @@
data = (data ^ mask) - mask;
}
+ if (!vcpu->arch.mmio_decode.sixty_four)
+ data = data & 0xffffffff;
+
trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr,
&data);
data = vcpu_data_host_to_guest(vcpu, data, len);
@@ -125,8 +128,9 @@
unsigned long rt;
int access_size;
bool sign_extend;
+ bool sixty_four;
- if (kvm_vcpu_dabt_iss1tw(vcpu)) {
+ if (kvm_vcpu_abt_iss1tw(vcpu)) {
/* page table accesses IO mem: tell guest to fix its TTBR */
kvm_inject_dabt(vcpu, kvm_vcpu_get_hfar(vcpu));
return 1;
@@ -138,11 +142,13 @@
*is_write = kvm_vcpu_dabt_iswrite(vcpu);
sign_extend = kvm_vcpu_dabt_issext(vcpu);
+ sixty_four = kvm_vcpu_dabt_issf(vcpu);
rt = kvm_vcpu_dabt_get_rd(vcpu);
*len = access_size;
vcpu->arch.mmio_decode.sign_extend = sign_extend;
vcpu->arch.mmio_decode.rt = rt;
+ vcpu->arch.mmio_decode.sixty_four = sixty_four;
return 0;
}
diff --git a/virt/kvm/arm/mmu.c b/virt/kvm/arm/mmu.c
index 38b4c91..c6ba672 100644
--- a/virt/kvm/arm/mmu.c
+++ b/virt/kvm/arm/mmu.c
@@ -38,6 +38,11 @@
#define KVM_S2PTE_FLAG_IS_IOMAP (1UL << 0)
#define KVM_S2_FLAG_LOGGING_ACTIVE (1UL << 1)
+static bool is_iomap(unsigned long flags)
+{
+ return flags & KVM_S2PTE_FLAG_IS_IOMAP;
+}
+
static bool memslot_is_logging(struct kvm_memory_slot *memslot)
{
return memslot->dirty_bitmap && !(memslot->flags & KVM_MEM_READONLY);
@@ -327,7 +332,8 @@
* destroying the VM), otherwise another faulting VCPU may come in and mess
* with things behind our backs.
*/
-static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+static void __unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size,
+ bool may_block)
{
pgd_t *pgd;
phys_addr_t addr = start, end = start + size;
@@ -352,11 +358,16 @@
* If the range is too large, release the kvm->mmu_lock
* to prevent starvation and lockup detector warnings.
*/
- if (next != end)
+ if (may_block && next != end)
cond_resched_lock(&kvm->mmu_lock);
} while (pgd++, addr = next, addr != end);
}
+static void unmap_stage2_range(struct kvm *kvm, phys_addr_t start, u64 size)
+{
+ __unmap_stage2_range(kvm, start, size, true);
+}
+
static void stage2_flush_ptes(struct kvm *kvm, pmd_t *pmd,
phys_addr_t addr, phys_addr_t end)
{
@@ -1194,7 +1205,7 @@
return true;
}
-static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr)
+static bool stage2_is_exec(struct kvm *kvm, phys_addr_t addr, unsigned long sz)
{
pud_t *pudp;
pmd_t *pmdp;
@@ -1206,11 +1217,11 @@
return false;
if (pudp)
- return kvm_s2pud_exec(pudp);
+ return sz <= PUD_SIZE && kvm_s2pud_exec(pudp);
else if (pmdp)
- return kvm_s2pmd_exec(pmdp);
+ return sz <= PMD_SIZE && kvm_s2pmd_exec(pmdp);
else
- return kvm_s2pte_exec(ptep);
+ return sz == PAGE_SIZE && kvm_s2pte_exec(ptep);
}
static int stage2_set_pte(struct kvm *kvm, struct kvm_mmu_memory_cache *cache,
@@ -1679,7 +1690,7 @@
unsigned long vma_pagesize, flags = 0;
write_fault = kvm_is_write_fault(vcpu);
- exec_fault = kvm_vcpu_trap_is_iabt(vcpu);
+ exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
VM_BUG_ON(write_fault && exec_fault);
if (fault_status == FSC_PERM && !write_fault && !exec_fault) {
@@ -1698,6 +1709,7 @@
vma_pagesize = vma_kernel_pagesize(vma);
if (logging_active ||
+ (vma->vm_flags & VM_PFNMAP) ||
!fault_supports_stage2_huge_mapping(memslot, hva, vma_pagesize)) {
force_pte = true;
vma_pagesize = PAGE_SIZE;
@@ -1744,6 +1756,7 @@
if (kvm_is_device_pfn(pfn)) {
mem_type = PAGE_S2_DEVICE;
flags |= KVM_S2PTE_FLAG_IS_IOMAP;
+ force_pte = true;
} else if (logging_active) {
/*
* Faults on pages in a memslot with logging enabled
@@ -1760,6 +1773,9 @@
writable = false;
}
+ if (exec_fault && is_iomap(flags))
+ return -ENOEXEC;
+
spin_lock(&kvm->mmu_lock);
if (mmu_notifier_retry(kvm, mmu_seq))
goto out_unlock;
@@ -1781,7 +1797,7 @@
if (writable)
kvm_set_pfn_dirty(pfn);
- if (fault_status != FSC_PERM)
+ if (fault_status != FSC_PERM && !is_iomap(flags))
clean_dcache_guest_page(pfn, vma_pagesize);
if (exec_fault)
@@ -1796,9 +1812,15 @@
* execute permissions, and we preserve whatever we have.
*/
needs_exec = exec_fault ||
- (fault_status == FSC_PERM && stage2_is_exec(kvm, fault_ipa));
+ (fault_status == FSC_PERM &&
+ stage2_is_exec(kvm, fault_ipa, vma_pagesize));
- if (vma_pagesize == PUD_SIZE) {
+ /*
+ * If PUD_SIZE == PMD_SIZE, there is no real PUD level, and
+ * all we have is a 2-level page table. Trying to map a PUD in
+ * this case would be fatally wrong.
+ */
+ if (PUD_SIZE != PMD_SIZE && vma_pagesize == PUD_SIZE) {
pud_t new_pud = kvm_pfn_pud(pfn, mem_type);
new_pud = kvm_pud_mkhuge(new_pud);
@@ -1948,9 +1970,8 @@
if (kvm_is_error_hva(hva) || (write_fault && !writable)) {
if (is_iabt) {
/* Prefetch Abort on I/O address */
- kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
- ret = 1;
- goto out_unlock;
+ ret = -ENOEXEC;
+ goto out;
}
/*
@@ -1992,6 +2013,11 @@
ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
if (ret == 0)
ret = 1;
+out:
+ if (ret == -ENOEXEC) {
+ kvm_inject_pabt(vcpu, kvm_vcpu_get_hfar(vcpu));
+ ret = 1;
+ }
out_unlock:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
return ret;
@@ -2031,18 +2057,21 @@
static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data)
{
- unmap_stage2_range(kvm, gpa, size);
+ unsigned flags = *(unsigned *)data;
+ bool may_block = flags & MMU_NOTIFIER_RANGE_BLOCKABLE;
+
+ __unmap_stage2_range(kvm, gpa, size, may_block);
return 0;
}
int kvm_unmap_hva_range(struct kvm *kvm,
- unsigned long start, unsigned long end)
+ unsigned long start, unsigned long end, unsigned flags)
{
if (!kvm->arch.pgd)
return 0;
trace_kvm_unmap_hva_range(start, end);
- handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, NULL);
+ handle_hva_to_gpa(kvm, start, end, &kvm_unmap_hva_handler, &flags);
return 0;
}
@@ -2134,7 +2163,8 @@
if (!kvm->arch.pgd)
return 0;
trace_kvm_test_age_hva(hva);
- return handle_hva_to_gpa(kvm, hva, hva, kvm_test_age_hva_handler, NULL);
+ return handle_hva_to_gpa(kvm, hva, hva + PAGE_SIZE,
+ kvm_test_age_hva_handler, NULL);
}
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu)
@@ -2277,8 +2307,7 @@
* Prevent userspace from creating a memory region outside of the IPA
* space addressable by the KVM guest IPA space.
*/
- if (memslot->base_gfn + memslot->npages >=
- (kvm_phys_size(kvm) >> PAGE_SHIFT))
+ if ((memslot->base_gfn + memslot->npages) > (kvm_phys_size(kvm) >> PAGE_SHIFT))
return -EFAULT;
down_read(¤t->mm->mmap_sem);
diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c
index 8731dfe..4c08fd0 100644
--- a/virt/kvm/arm/pmu.c
+++ b/virt/kvm/arm/pmu.c
@@ -480,25 +480,45 @@
*/
void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val)
{
+ struct kvm_pmu *pmu = &vcpu->arch.pmu;
int i;
- u64 type, enable, reg;
- if (val == 0)
+ if (!(__vcpu_sys_reg(vcpu, PMCR_EL0) & ARMV8_PMU_PMCR_E))
return;
- enable = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+ /* Weed out disabled counters */
+ val &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0);
+
for (i = 0; i < ARMV8_PMU_CYCLE_IDX; i++) {
+ u64 type, reg;
+
if (!(val & BIT(i)))
continue;
- type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i)
- & ARMV8_PMU_EVTYPE_EVENT;
- if ((type == ARMV8_PMUV3_PERFCTR_SW_INCR)
- && (enable & BIT(i))) {
- reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+
+ /* PMSWINC only applies to ... SW_INC! */
+ type = __vcpu_sys_reg(vcpu, PMEVTYPER0_EL0 + i);
+ type &= ARMV8_PMU_EVTYPE_EVENT;
+ if (type != ARMV8_PMUV3_PERFCTR_SW_INCR)
+ continue;
+
+ /* increment this even SW_INC counter */
+ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) + 1;
+ reg = lower_32_bits(reg);
+ __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
+
+ if (reg) /* no overflow on the low part */
+ continue;
+
+ if (kvm_pmu_pmc_is_chained(&pmu->pmc[i])) {
+ /* increment the high counter */
+ reg = __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) + 1;
reg = lower_32_bits(reg);
- __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i) = reg;
- if (!reg)
- __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
+ __vcpu_sys_reg(vcpu, PMEVCNTR0_EL0 + i + 1) = reg;
+ if (!reg) /* mark overflow on the high counter */
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i + 1);
+ } else {
+ /* mark overflow on low counter */
+ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) |= BIT(i);
}
}
}
diff --git a/virt/kvm/arm/psci.c b/virt/kvm/arm/psci.c
index 87927f7..48fde38 100644
--- a/virt/kvm/arm/psci.c
+++ b/virt/kvm/arm/psci.c
@@ -408,7 +408,7 @@
val = SMCCC_RET_SUCCESS;
break;
case KVM_BP_HARDEN_NOT_REQUIRED:
- val = SMCCC_RET_NOT_REQUIRED;
+ val = SMCCC_ARCH_WORKAROUND_RET_UNAFFECTED;
break;
}
break;
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 6f50c42..6899101 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -177,6 +177,7 @@
break;
default:
kfree(dist->spis);
+ dist->spis = NULL;
return -EINVAL;
}
}
@@ -357,6 +358,12 @@
{
struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ /*
+ * Retire all pending LPIs on this vcpu anyway as we're
+ * going to destroy it.
+ */
+ vgic_flush_pending_lpis(vcpu);
+
INIT_LIST_HEAD(&vgic_cpu->ap_list_head);
}
@@ -368,10 +375,10 @@
vgic_debug_destroy(kvm);
- kvm_vgic_dist_destroy(kvm);
-
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vgic_vcpu_destroy(vcpu);
+
+ kvm_vgic_dist_destroy(kvm);
}
void kvm_vgic_destroy(struct kvm *kvm)
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
index 2be6b66..35be0e2 100644
--- a/virt/kvm/arm/vgic/vgic-its.c
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -96,14 +96,21 @@
* We "cache" the configuration table entries in our struct vgic_irq's.
* However we only have those structs for mapped IRQs, so we read in
* the respective config data from memory here upon mapping the LPI.
+ *
+ * Should any of these fail, behave as if we couldn't create the LPI
+ * by dropping the refcount and returning the error.
*/
ret = update_lpi_config(kvm, irq, NULL, false);
- if (ret)
+ if (ret) {
+ vgic_put_irq(kvm, irq);
return ERR_PTR(ret);
+ }
ret = vgic_v3_lpi_sync_pending_status(kvm, irq);
- if (ret)
+ if (ret) {
+ vgic_put_irq(kvm, irq);
return ERR_PTR(ret);
+ }
return irq;
}
@@ -2472,7 +2479,8 @@
target_addr = (u32)(val >> KVM_ITS_CTE_RDBASE_SHIFT);
coll_id = val & KVM_ITS_CTE_ICID_MASK;
- if (target_addr >= atomic_read(&kvm->online_vcpus))
+ if (target_addr != COLLECTION_NOT_MAPPED &&
+ target_addr >= atomic_read(&kvm->online_vcpus))
return -EINVAL;
collection = find_collection(its, coll_id);
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 4441967..5eaede3 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -87,8 +87,8 @@
r = vgic_v3_set_redist_base(kvm, 0, *addr, 0);
goto out;
}
- rdreg = list_first_entry(&vgic->rd_regions,
- struct vgic_redist_region, list);
+ rdreg = list_first_entry_or_null(&vgic->rd_regions,
+ struct vgic_redist_region, list);
if (!rdreg)
addr_ptr = &undef_value;
else
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index 5945f06..d63881f 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -422,11 +422,11 @@
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_SET,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_ACTIVE_CLEAR,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ(GIC_DIST_PRI,
vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index 7dfd15d..b1e639e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -223,6 +223,23 @@
return extract_bytes(value, addr & 7, len);
}
+static unsigned long vgic_uaccess_read_v3r_typer(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ unsigned long mpidr = kvm_vcpu_get_mpidr_aff(vcpu);
+ int target_vcpu_id = vcpu->vcpu_id;
+ u64 value;
+
+ value = (u64)(mpidr & GENMASK(23, 0)) << 32;
+ value |= ((target_vcpu_id & 0xffff) << 8);
+
+ if (vgic_has_its(vcpu->kvm))
+ value |= GICR_TYPER_PLPIS;
+
+ /* reporting of the Last bit is not supported for userspace */
+ return extract_bytes(value, addr & 7, len);
+}
+
static unsigned long vgic_mmio_read_v3r_iidr(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -491,11 +508,11 @@
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ISACTIVER,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive, 1,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 1,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_ICACTIVER,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive,
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive,
1, VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_BITS_PER_IRQ_SHARED(GICD_IPRIORITYR,
vgic_mmio_read_priority, vgic_mmio_write_priority, NULL, NULL,
@@ -528,8 +545,9 @@
REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
VGIC_ACCESS_32bit),
- REGISTER_DESC_WITH_LENGTH(GICR_TYPER,
- vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
+ REGISTER_DESC_WITH_LENGTH_UACCESS(GICR_TYPER,
+ vgic_mmio_read_v3r_typer, vgic_mmio_write_wi,
+ vgic_uaccess_read_v3r_typer, vgic_mmio_uaccess_write_wi, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_WAKER,
vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
@@ -563,12 +581,12 @@
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ISACTIVER0,
vgic_mmio_read_active, vgic_mmio_write_sactive,
- NULL, vgic_mmio_uaccess_write_sactive,
- 4, VGIC_ACCESS_32bit),
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_sactive, 4,
+ VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH_UACCESS(SZ_64K + GICR_ICACTIVER0,
vgic_mmio_read_active, vgic_mmio_write_cactive,
- NULL, vgic_mmio_uaccess_write_cactive,
- 4, VGIC_ACCESS_32bit),
+ vgic_uaccess_read_active, vgic_mmio_uaccess_write_cactive, 4,
+ VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(SZ_64K + GICR_IPRIORITYR0,
vgic_mmio_read_priority, vgic_mmio_write_priority, 32,
VGIC_ACCESS_32bit | VGIC_ACCESS_8bit),
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 0d09048..fb1dcd3 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -300,8 +300,39 @@
}
}
-unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
- gpa_t addr, unsigned int len)
+
+/*
+ * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
+ * is not queued on some running VCPU's LRs, because then the change to the
+ * active state can be overwritten when the VCPU's state is synced coming back
+ * from the guest.
+ *
+ * For shared interrupts as well as GICv3 private interrupts, we have to
+ * stop all the VCPUs because interrupts can be migrated while we don't hold
+ * the IRQ locks and we don't want to be chasing moving targets.
+ *
+ * For GICv2 private interrupts we don't have to do anything because
+ * userspace accesses to the VGIC state already require all VCPUs to be
+ * stopped, and only the VCPU itself can modify its private interrupts
+ * active state, which guarantees that the VCPU is not running.
+ */
+static void vgic_access_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_halt_guest(vcpu->kvm);
+}
+
+/* See vgic_access_active_prepare */
+static void vgic_access_active_finish(struct kvm_vcpu *vcpu, u32 intid)
+{
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
+ intid >= VGIC_NR_PRIVATE_IRQS)
+ kvm_arm_resume_guest(vcpu->kvm);
+}
+
+static unsigned long __vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
{
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
u32 value = 0;
@@ -311,6 +342,10 @@
for (i = 0; i < len * 8; i++) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ /*
+ * Even for HW interrupts, don't evaluate the HW state as
+ * all the guest is interested in is the virtual state.
+ */
if (irq->active)
value |= (1U << i);
@@ -320,6 +355,29 @@
return value;
}
+unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
+ u32 val;
+
+ mutex_lock(&vcpu->kvm->lock);
+ vgic_access_active_prepare(vcpu, intid);
+
+ val = __vgic_mmio_read_active(vcpu, addr, len);
+
+ vgic_access_active_finish(vcpu, intid);
+ mutex_unlock(&vcpu->kvm->lock);
+
+ return val;
+}
+
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ return __vgic_mmio_read_active(vcpu, addr, len);
+}
+
/* Must be called with irq->irq_lock held */
static void vgic_hw_irq_change_active(struct kvm_vcpu *vcpu, struct vgic_irq *irq,
bool active, bool is_uaccess)
@@ -371,36 +429,6 @@
raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
}
-/*
- * If we are fiddling with an IRQ's active state, we have to make sure the IRQ
- * is not queued on some running VCPU's LRs, because then the change to the
- * active state can be overwritten when the VCPU's state is synced coming back
- * from the guest.
- *
- * For shared interrupts, we have to stop all the VCPUs because interrupts can
- * be migrated while we don't hold the IRQ locks and we don't want to be
- * chasing moving targets.
- *
- * For private interrupts we don't have to do anything because userspace
- * accesses to the VGIC state already require all VCPUs to be stopped, and
- * only the VCPU itself can modify its private interrupts active state, which
- * guarantees that the VCPU is not running.
- */
-static void vgic_change_active_prepare(struct kvm_vcpu *vcpu, u32 intid)
-{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
- intid > VGIC_NR_PRIVATE_IRQS)
- kvm_arm_halt_guest(vcpu->kvm);
-}
-
-/* See vgic_change_active_prepare */
-static void vgic_change_active_finish(struct kvm_vcpu *vcpu, u32 intid)
-{
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3 ||
- intid > VGIC_NR_PRIVATE_IRQS)
- kvm_arm_resume_guest(vcpu->kvm);
-}
-
static void __vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val)
@@ -422,11 +450,11 @@
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
mutex_lock(&vcpu->kvm->lock);
- vgic_change_active_prepare(vcpu, intid);
+ vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_cactive(vcpu, addr, len, val);
- vgic_change_active_finish(vcpu, intid);
+ vgic_access_active_finish(vcpu, intid);
mutex_unlock(&vcpu->kvm->lock);
}
@@ -459,11 +487,11 @@
u32 intid = VGIC_ADDR_TO_INTID(addr, 1);
mutex_lock(&vcpu->kvm->lock);
- vgic_change_active_prepare(vcpu, intid);
+ vgic_access_active_prepare(vcpu, intid);
__vgic_mmio_write_sactive(vcpu, addr, len, val);
- vgic_change_active_finish(vcpu, intid);
+ vgic_access_active_finish(vcpu, intid);
mutex_unlock(&vcpu->kvm->lock);
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 836f418..b6aff52 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -157,6 +157,9 @@
unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
+unsigned long vgic_uaccess_read_active(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len);
+
void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len,
unsigned long val);
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 8d69f00..b3a97dc 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -363,8 +363,8 @@
int vgic_v3_save_pending_tables(struct kvm *kvm)
{
struct vgic_dist *dist = &kvm->arch.vgic;
- int last_byte_offset = -1;
struct vgic_irq *irq;
+ gpa_t last_ptr = ~(gpa_t)0;
int ret;
u8 val;
@@ -384,11 +384,11 @@
bit_nr = irq->intid % BITS_PER_BYTE;
ptr = pendbase + byte_offset;
- if (byte_offset != last_byte_offset) {
+ if (ptr != last_ptr) {
ret = kvm_read_guest_lock(kvm, ptr, &val, 1);
if (ret)
return ret;
- last_byte_offset = byte_offset;
+ last_ptr = ptr;
}
stored = val & (1U << bit_nr);
diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c
index 35305d6..d8ef708 100644
--- a/virt/kvm/async_pf.c
+++ b/virt/kvm/async_pf.c
@@ -64,7 +64,7 @@
struct mm_struct *mm = apf->mm;
struct kvm_vcpu *vcpu = apf->vcpu;
unsigned long addr = apf->addr;
- gva_t gva = apf->gva;
+ gpa_t cr2_or_gpa = apf->cr2_or_gpa;
int locked = 1;
might_sleep();
@@ -92,7 +92,7 @@
* this point
*/
- trace_kvm_async_pf_completed(addr, gva);
+ trace_kvm_async_pf_completed(addr, cr2_or_gpa);
if (swq_has_sleeper(&vcpu->wq))
swake_up_one(&vcpu->wq);
@@ -165,8 +165,8 @@
}
}
-int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, unsigned long hva,
- struct kvm_arch_async_pf *arch)
+int kvm_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
+ unsigned long hva, struct kvm_arch_async_pf *arch)
{
struct kvm_async_pf *work;
@@ -185,7 +185,7 @@
work->wakeup_all = false;
work->vcpu = vcpu;
- work->gva = gva;
+ work->cr2_or_gpa = cr2_or_gpa;
work->addr = hva;
work->arch = *arch;
work->mm = current->mm;
diff --git a/virt/kvm/coalesced_mmio.c b/virt/kvm/coalesced_mmio.c
index 8ffd07e..8d60699 100644
--- a/virt/kvm/coalesced_mmio.c
+++ b/virt/kvm/coalesced_mmio.c
@@ -178,21 +178,36 @@
struct kvm_coalesced_mmio_zone *zone)
{
struct kvm_coalesced_mmio_dev *dev, *tmp;
+ int r;
if (zone->pio != 1 && zone->pio != 0)
return -EINVAL;
mutex_lock(&kvm->slots_lock);
- list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list)
+ list_for_each_entry_safe(dev, tmp, &kvm->coalesced_zones, list) {
if (zone->pio == dev->zone.pio &&
coalesced_mmio_in_range(dev, zone->addr, zone->size)) {
- kvm_io_bus_unregister_dev(kvm,
+ r = kvm_io_bus_unregister_dev(kvm,
zone->pio ? KVM_PIO_BUS : KVM_MMIO_BUS, &dev->dev);
+
+ /*
+ * On failure, unregister destroys all devices on the
+ * bus _except_ the target device, i.e. coalesced_zones
+ * has been modified. No need to restart the walk as
+ * there aren't any zones left.
+ */
+ if (r)
+ break;
kvm_iodevice_destructor(&dev->dev);
}
+ }
mutex_unlock(&kvm->slots_lock);
+ /*
+ * Ignore the result of kvm_io_bus_unregister_dev(), from userspace's
+ * perspective, the coalesced MMIO is most definitely unregistered.
+ */
return 0;
}
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 13efc29..fc48298 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -157,10 +157,9 @@
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;
-__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
- unsigned long start, unsigned long end, bool blockable)
+__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm,
+ unsigned long start, unsigned long end)
{
- return 0;
}
bool kvm_is_zone_device_pfn(kvm_pfn_t pfn)
@@ -186,6 +185,7 @@
*/
if (pfn_valid(pfn))
return PageReserved(pfn_to_page(pfn)) &&
+ !is_zero_pfn(pfn) &&
!kvm_is_zone_device_pfn(pfn);
return true;
@@ -381,6 +381,18 @@
return container_of(mn, struct kvm, mmu_notifier);
}
+static void kvm_mmu_notifier_invalidate_range(struct mmu_notifier *mn,
+ struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ struct kvm *kvm = mmu_notifier_to_kvm(mn);
+ int idx;
+
+ idx = srcu_read_lock(&kvm->srcu);
+ kvm_arch_mmu_notifier_invalidate_range(kvm, start, end);
+ srcu_read_unlock(&kvm->srcu, idx);
+}
+
static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
struct mm_struct *mm,
unsigned long address,
@@ -405,7 +417,6 @@
{
struct kvm *kvm = mmu_notifier_to_kvm(mn);
int need_tlb_flush = 0, idx;
- int ret;
idx = srcu_read_lock(&kvm->srcu);
spin_lock(&kvm->mmu_lock);
@@ -415,21 +426,16 @@
* count is also read inside the mmu_lock critical section.
*/
kvm->mmu_notifier_count++;
- need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end);
- need_tlb_flush |= kvm->tlbs_dirty;
+ need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
+ range->flags);
/* we've to flush the tlb before the pages can be freed */
- if (need_tlb_flush)
+ if (need_tlb_flush || kvm->tlbs_dirty)
kvm_flush_remote_tlbs(kvm);
spin_unlock(&kvm->mmu_lock);
-
- ret = kvm_arch_mmu_notifier_invalidate_range(kvm, range->start,
- range->end,
- mmu_notifier_range_blockable(range));
-
srcu_read_unlock(&kvm->srcu, idx);
- return ret;
+ return 0;
}
static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
@@ -535,6 +541,7 @@
}
static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
+ .invalidate_range = kvm_mmu_notifier_invalidate_range,
.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
@@ -628,6 +635,8 @@
static int kvm_create_vm_debugfs(struct kvm *kvm, int fd)
{
+ static DEFINE_MUTEX(kvm_debugfs_lock);
+ struct dentry *dent;
char dir_name[ITOA_MAX_LEN * 2];
struct kvm_stat_data *stat_data;
struct kvm_stats_debugfs_item *p;
@@ -636,8 +645,20 @@
return 0;
snprintf(dir_name, sizeof(dir_name), "%d-%d", task_pid_nr(current), fd);
- kvm->debugfs_dentry = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+ mutex_lock(&kvm_debugfs_lock);
+ dent = debugfs_lookup(dir_name, kvm_debugfs_dir);
+ if (dent) {
+ pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
+ dput(dent);
+ mutex_unlock(&kvm_debugfs_lock);
+ return 0;
+ }
+ dent = debugfs_create_dir(dir_name, kvm_debugfs_dir);
+ mutex_unlock(&kvm_debugfs_lock);
+ if (IS_ERR(dent))
+ return 0;
+ kvm->debugfs_dentry = dent;
kvm->debugfs_stat_data = kcalloc(kvm_debugfs_num_entries,
sizeof(*kvm->debugfs_stat_data),
GFP_KERNEL_ACCOUNT);
@@ -1010,6 +1031,7 @@
/* We can read the guest memory with __xxx_user() later on. */
if ((id < KVM_USER_MEM_SLOTS) &&
((mem->userspace_addr & (PAGE_SIZE - 1)) ||
+ (mem->userspace_addr != untagged_addr(mem->userspace_addr)) ||
!access_ok((void __user *)(unsigned long)mem->userspace_addr,
mem->memory_size)))
goto out;
@@ -1394,14 +1416,14 @@
}
EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
-unsigned long kvm_host_page_size(struct kvm *kvm, gfn_t gfn)
+unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
{
struct vm_area_struct *vma;
unsigned long addr, size;
size = PAGE_SIZE;
- addr = gfn_to_hva(kvm, gfn);
+ addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
if (kvm_is_error_hva(addr))
return PAGE_SIZE;
@@ -1585,15 +1607,24 @@
return true;
}
+static int kvm_try_get_pfn(kvm_pfn_t pfn)
+{
+ if (kvm_is_reserved_pfn(pfn))
+ return 1;
+ return get_page_unless_zero(pfn_to_page(pfn));
+}
+
static int hva_to_pfn_remapped(struct vm_area_struct *vma,
unsigned long addr, bool *async,
bool write_fault, bool *writable,
kvm_pfn_t *p_pfn)
{
- unsigned long pfn;
+ kvm_pfn_t pfn;
+ pte_t *ptep;
+ spinlock_t *ptl;
int r;
- r = follow_pfn(vma, addr, &pfn);
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
if (r) {
/*
* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
@@ -1608,14 +1639,19 @@
if (r)
return r;
- r = follow_pfn(vma, addr, &pfn);
+ r = follow_pte(vma->vm_mm, addr, &ptep, &ptl);
if (r)
return r;
+ }
+ if (write_fault && !pte_write(*ptep)) {
+ pfn = KVM_PFN_ERR_RO_FAULT;
+ goto out;
}
if (writable)
- *writable = true;
+ *writable = pte_write(*ptep);
+ pfn = pte_pfn(*ptep);
/*
* Get a reference here because callers of *hva_to_pfn* and
@@ -1627,11 +1663,21 @@
* Whoever called remap_pfn_range is also going to call e.g.
* unmap_mapping_range before the underlying pages are freed,
* causing a call to our MMU notifier.
+ *
+ * Certain IO or PFNMAP mappings can be backed with valid
+ * struct pages, but be allocated without refcounting e.g.,
+ * tail pages of non-compound higher order allocations, which
+ * would then underflow the refcount when the caller does the
+ * required put_page. Don't allow those pages here.
*/
- kvm_get_pfn(pfn);
+ if (!kvm_try_get_pfn(pfn))
+ r = -EFAULT;
+out:
+ pte_unmap_unlock(ptep, ptl);
*p_pfn = pfn;
- return 0;
+
+ return r;
}
/*
@@ -1809,26 +1855,72 @@
}
EXPORT_SYMBOL_GPL(gfn_to_page);
-static int __kvm_map_gfn(struct kvm_memory_slot *slot, gfn_t gfn,
- struct kvm_host_map *map)
+void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache)
+{
+ if (pfn == 0)
+ return;
+
+ if (cache)
+ cache->pfn = cache->gfn = 0;
+
+ if (dirty)
+ kvm_release_pfn_dirty(pfn);
+ else
+ kvm_release_pfn_clean(pfn);
+}
+
+static void kvm_cache_gfn_to_pfn(struct kvm_memory_slot *slot, gfn_t gfn,
+ struct gfn_to_pfn_cache *cache, u64 gen)
+{
+ kvm_release_pfn(cache->pfn, cache->dirty, cache);
+
+ cache->pfn = gfn_to_pfn_memslot(slot, gfn);
+ cache->gfn = gfn;
+ cache->dirty = false;
+ cache->generation = gen;
+}
+
+static int __kvm_map_gfn(struct kvm_memslots *slots, gfn_t gfn,
+ struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache,
+ bool atomic)
{
kvm_pfn_t pfn;
void *hva = NULL;
struct page *page = KVM_UNMAPPED_PAGE;
+ struct kvm_memory_slot *slot = __gfn_to_memslot(slots, gfn);
+ u64 gen = slots->generation;
if (!map)
return -EINVAL;
- pfn = gfn_to_pfn_memslot(slot, gfn);
+ if (cache) {
+ if (!cache->pfn || cache->gfn != gfn ||
+ cache->generation != gen) {
+ if (atomic)
+ return -EAGAIN;
+ kvm_cache_gfn_to_pfn(slot, gfn, cache, gen);
+ }
+ pfn = cache->pfn;
+ } else {
+ if (atomic)
+ return -EAGAIN;
+ pfn = gfn_to_pfn_memslot(slot, gfn);
+ }
if (is_error_noslot_pfn(pfn))
return -EINVAL;
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
- hva = kmap(page);
+ if (atomic)
+ hva = kmap_atomic(page);
+ else
+ hva = kmap(page);
#ifdef CONFIG_HAS_IOMEM
- } else {
+ } else if (!atomic) {
hva = memremap(pfn_to_hpa(pfn), PAGE_SIZE, MEMREMAP_WB);
+ } else {
+ return -EINVAL;
#endif
}
@@ -1843,14 +1935,25 @@
return 0;
}
+int kvm_map_gfn(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache, bool atomic)
+{
+ return __kvm_map_gfn(kvm_memslots(vcpu->kvm), gfn, map,
+ cache, atomic);
+}
+EXPORT_SYMBOL_GPL(kvm_map_gfn);
+
int kvm_vcpu_map(struct kvm_vcpu *vcpu, gfn_t gfn, struct kvm_host_map *map)
{
- return __kvm_map_gfn(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, map);
+ return __kvm_map_gfn(kvm_vcpu_memslots(vcpu), gfn, map,
+ NULL, false);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_map);
-void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
- bool dirty)
+static void __kvm_unmap_gfn(struct kvm_memory_slot *memslot,
+ struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache,
+ bool dirty, bool atomic)
{
if (!map)
return;
@@ -1858,23 +1961,45 @@
if (!map->hva)
return;
- if (map->page != KVM_UNMAPPED_PAGE)
- kunmap(map->page);
+ if (map->page != KVM_UNMAPPED_PAGE) {
+ if (atomic)
+ kunmap_atomic(map->hva);
+ else
+ kunmap(map->page);
+ }
#ifdef CONFIG_HAS_IOMEM
- else
+ else if (!atomic)
memunmap(map->hva);
+ else
+ WARN_ONCE(1, "Unexpected unmapping in atomic context");
#endif
- if (dirty) {
- kvm_vcpu_mark_page_dirty(vcpu, map->gfn);
- kvm_release_pfn_dirty(map->pfn);
- } else {
- kvm_release_pfn_clean(map->pfn);
- }
+ if (dirty)
+ mark_page_dirty_in_slot(memslot, map->gfn);
+
+ if (cache)
+ cache->dirty |= dirty;
+ else
+ kvm_release_pfn(map->pfn, dirty, NULL);
map->hva = NULL;
map->page = NULL;
}
+
+int kvm_unmap_gfn(struct kvm_vcpu *vcpu, struct kvm_host_map *map,
+ struct gfn_to_pfn_cache *cache, bool dirty, bool atomic)
+{
+ __kvm_unmap_gfn(gfn_to_memslot(vcpu->kvm, map->gfn), map,
+ cache, dirty, atomic);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(kvm_unmap_gfn);
+
+void kvm_vcpu_unmap(struct kvm_vcpu *vcpu, struct kvm_host_map *map, bool dirty)
+{
+ __kvm_unmap_gfn(kvm_vcpu_gfn_to_memslot(vcpu, map->gfn), map, NULL,
+ dirty, false);
+}
EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
struct page *kvm_vcpu_gfn_to_page(struct kvm_vcpu *vcpu, gfn_t gfn)
@@ -2196,12 +2321,12 @@
if (slots->generation != ghc->generation)
__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
- if (unlikely(!ghc->memslot))
- return kvm_write_guest(kvm, gpa, data, len);
-
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
+ if (unlikely(!ghc->memslot))
+ return kvm_write_guest(kvm, gpa, data, len);
+
r = __copy_to_user((void __user *)ghc->hva + offset, data, len);
if (r)
return -EFAULT;
@@ -2229,12 +2354,12 @@
if (slots->generation != ghc->generation)
__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len);
- if (unlikely(!ghc->memslot))
- return kvm_read_guest(kvm, ghc->gpa, data, len);
-
if (kvm_is_error_hva(ghc->hva))
return -EFAULT;
+ if (unlikely(!ghc->memslot))
+ return kvm_read_guest(kvm, ghc->gpa, data, len);
+
r = __copy_from_user(data, (void __user *)ghc->hva, len);
if (r)
return -EFAULT;
@@ -3459,6 +3584,16 @@
};
};
+struct compat_kvm_clear_dirty_log {
+ __u32 slot;
+ __u32 num_pages;
+ __u64 first_page;
+ union {
+ compat_uptr_t dirty_bitmap; /* one bit per page */
+ __u64 padding2;
+ };
+};
+
static long kvm_vm_compat_ioctl(struct file *filp,
unsigned int ioctl, unsigned long arg)
{
@@ -3468,6 +3603,24 @@
if (kvm->mm != current->mm)
return -EIO;
switch (ioctl) {
+#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
+ case KVM_CLEAR_DIRTY_LOG: {
+ struct compat_kvm_clear_dirty_log compat_log;
+ struct kvm_clear_dirty_log log;
+
+ if (copy_from_user(&compat_log, (void __user *)arg,
+ sizeof(compat_log)))
+ return -EFAULT;
+ log.slot = compat_log.slot;
+ log.num_pages = compat_log.num_pages;
+ log.first_page = compat_log.first_page;
+ log.padding2 = compat_log.padding2;
+ log.dirty_bitmap = compat_ptr(compat_log.dirty_bitmap);
+
+ r = kvm_vm_ioctl_clear_dirty_log(kvm, &log);
+ break;
+ }
+#endif
case KVM_GET_DIRTY_LOG: {
struct compat_kvm_dirty_log compat_log;
struct kvm_dirty_log log;
@@ -3921,15 +4074,15 @@
}
/* Caller must hold slots_lock. */
-void kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
- struct kvm_io_device *dev)
+int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ struct kvm_io_device *dev)
{
- int i;
+ int i, j;
struct kvm_io_bus *new_bus, *bus;
bus = kvm_get_bus(kvm, bus_idx);
if (!bus)
- return;
+ return 0;
for (i = 0; i < bus->dev_count; i++)
if (bus->range[i].dev == dev) {
@@ -3937,25 +4090,28 @@
}
if (i == bus->dev_count)
- return;
+ return 0;
new_bus = kmalloc(struct_size(bus, range, bus->dev_count - 1),
GFP_KERNEL_ACCOUNT);
- if (!new_bus) {
+ if (new_bus) {
+ memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
+ new_bus->dev_count--;
+ memcpy(new_bus->range + i, bus->range + i + 1,
+ (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
+ } else {
pr_err("kvm: failed to shrink bus, removing it completely\n");
- goto broken;
+ for (j = 0; j < bus->dev_count; j++) {
+ if (j == i)
+ continue;
+ kvm_iodevice_destructor(bus->range[j].dev);
+ }
}
- memcpy(new_bus, bus, sizeof(*bus) + i * sizeof(struct kvm_io_range));
- new_bus->dev_count--;
- memcpy(new_bus->range + i, bus->range + i + 1,
- (new_bus->dev_count - i) * sizeof(struct kvm_io_range));
-
-broken:
rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
synchronize_srcu_expedited(&kvm->srcu);
kfree(bus);
- return;
+ return new_bus ? 0 : -ENOMEM;
}
struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
@@ -4225,7 +4381,7 @@
}
add_uevent_var(env, "PID=%d", kvm->userspace_pid);
- if (!IS_ERR_OR_NULL(kvm->debugfs_dentry)) {
+ if (kvm->debugfs_dentry) {
char *tmp, *p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
if (p) {