Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index 68a0e9d..711fca9 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -20,7 +20,6 @@
config KVM
bool
select PREEMPT_NOTIFIERS
- select ANON_INODES
select HAVE_KVM_EVENTFD
select HAVE_KVM_VCPU_ASYNC_IOCTL
select SRCU
@@ -39,6 +38,7 @@
config KVM_BOOK3S_64_HANDLER
bool
select KVM_BOOK3S_HANDLER
+ select PPC_DAWR_FORCE_ENABLE
config KVM_BOOK3S_PR_POSSIBLE
bool
@@ -184,9 +184,9 @@
select HAVE_KVM_MSI
help
Enable support for emulating MPIC devices inside the
- host kernel, rather than relying on userspace to emulate.
- Currently, support is limited to certain versions of
- Freescale's MPIC implementation.
+ host kernel, rather than relying on userspace to emulate.
+ Currently, support is limited to certain versions of
+ Freescale's MPIC implementation.
config KVM_XICS
bool "KVM in-kernel XICS emulation"
@@ -204,6 +204,6 @@
default y
depends on KVM_XICS && PPC_XIVE_NATIVE && KVM_BOOK3S_HV_POSSIBLE
-source drivers/vhost/Kconfig
+source "drivers/vhost/Kconfig"
endif # VIRTUALIZATION
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index f872c04..4c67cc7 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -3,8 +3,6 @@
# Makefile for Kernel-based Virtual Machine module
#
-subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror
-
ccflags-y := -Ivirt/kvm -Iarch/powerpc/kvm
KVM := ../../../virt/kvm
@@ -12,11 +10,6 @@
common-objs-$(CONFIG_KVM_VFIO) += $(KVM)/vfio.o
common-objs-$(CONFIG_KVM_MMIO) += $(KVM)/coalesced_mmio.o
-CFLAGS_e500_mmu.o := -I.
-CFLAGS_e500_mmu_host.o := -I.
-CFLAGS_emulate.o := -I.
-CFLAGS_emulate_loadstore.o := -I.
-
common-objs-y += powerpc.o emulate_loadstore.o
obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
@@ -75,7 +68,8 @@
book3s_hv.o \
book3s_hv_interrupts.o \
book3s_64_mmu_hv.o \
- book3s_64_mmu_radix.o
+ book3s_64_mmu_radix.o \
+ book3s_hv_nested.o
kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
book3s_hv_tm.o
@@ -100,7 +94,7 @@
kvm-book3s_64-objs-$(CONFIG_KVM_XICS) += \
book3s_xics.o
-kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o
+kvm-book3s_64-objs-$(CONFIG_KVM_XIVE) += book3s_xive.o book3s_xive_native.o
kvm-book3s_64-objs-$(CONFIG_SPAPR_TCE_IOMMU) += book3s_64_vio.o
kvm-book3s_64-module-objs := \
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 87348e4..ec2547c 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
*
@@ -8,10 +9,6 @@
* Description:
* This file is derived from arch/powerpc/kvm/44x.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -39,7 +36,8 @@
#include "book3s.h"
#include "trace.h"
-#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define VM_STAT(x, ...) offsetof(struct kvm, stat.x), KVM_STAT_VM, ## __VA_ARGS__
+#define VCPU_STAT(x, ...) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU, ## __VA_ARGS__
/* #define EXIT_DEBUG */
@@ -71,6 +69,8 @@
{ "pthru_all", VCPU_STAT(pthru_all) },
{ "pthru_host", VCPU_STAT(pthru_host) },
{ "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) },
+ { "largepages_2M", VM_STAT(num_2M_pages, .mode = 0444) },
+ { "largepages_1G", VM_STAT(num_1G_pages, .mode = 0444) },
{ NULL }
};
@@ -78,8 +78,11 @@
{
if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
ulong pc = kvmppc_get_pc(vcpu);
+ ulong lr = kvmppc_get_lr(vcpu);
if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
+ if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
+ kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
}
}
@@ -150,7 +153,6 @@
case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break;
case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break;
case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break;
- case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break;
case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break;
case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break;
case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break;
@@ -190,6 +192,13 @@
}
EXPORT_SYMBOL_GPL(kvmppc_book3s_queue_irqprio);
+void kvmppc_core_queue_machine_check(struct kvm_vcpu *vcpu, ulong flags)
+{
+ /* might as well deliver this straight away */
+ kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_MACHINE_CHECK, flags);
+}
+EXPORT_SYMBOL_GPL(kvmppc_core_queue_machine_check);
+
void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags)
{
/* might as well deliver this straight away */
@@ -236,18 +245,35 @@
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
- unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
+ /*
+ * This case (KVM_INTERRUPT_SET) should never actually arise for
+ * a pseries guest (because pseries guests expect their interrupt
+ * controllers to continue asserting an external interrupt request
+ * until it is acknowledged at the interrupt controller), but is
+ * included to avoid ABI breakage and potentially for other
+ * sorts of guest.
+ *
+ * There is a subtlety here: HV KVM does not test the
+ * external_oneshot flag in the code that synthesizes
+ * external interrupts for the guest just before entering
+ * the guest. That is OK even if userspace did do a
+ * KVM_INTERRUPT_SET on a pseries guest vcpu, because the
+ * caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
+ * which ends up doing a smp_send_reschedule(), which will
+ * pull the guest all the way out to the host, meaning that
+ * we will call kvmppc_core_prepare_to_enter() before entering
+ * the guest again, and that will handle the external_oneshot
+ * flag correctly.
+ */
+ if (irq->irq == KVM_INTERRUPT_SET)
+ vcpu->arch.external_oneshot = 1;
- if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
- vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
-
- kvmppc_book3s_queue_irqprio(vcpu, vec);
+ kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
}
void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
{
kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
- kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
}
void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
@@ -278,7 +304,6 @@
vec = BOOK3S_INTERRUPT_DECREMENTER;
break;
case BOOK3S_IRQPRIO_EXTERNAL:
- case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
vec = BOOK3S_INTERRUPT_EXTERNAL;
break;
@@ -352,8 +377,16 @@
case BOOK3S_IRQPRIO_DECREMENTER:
/* DEC interrupts get cleared by mtdec */
return false;
- case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
- /* External interrupts get cleared by userspace */
+ case BOOK3S_IRQPRIO_EXTERNAL:
+ /*
+ * External interrupts get cleared by userspace
+ * except when set by the KVM_INTERRUPT ioctl with
+ * KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
+ */
+ if (vcpu->arch.external_oneshot) {
+ vcpu->arch.external_oneshot = 0;
+ return true;
+ }
return false;
}
@@ -609,12 +642,24 @@
r = -ENXIO;
break;
}
- if (xive_enabled())
+ if (xics_on_xive())
*val = get_reg_val(id, kvmppc_xive_get_icp(vcpu));
else
*val = get_reg_val(id, kvmppc_xics_get_icp(vcpu));
break;
#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+ case KVM_REG_PPC_VP_STATE:
+ if (!vcpu->arch.xive_vcpu) {
+ r = -ENXIO;
+ break;
+ }
+ if (xive_enabled())
+ r = kvmppc_xive_native_get_vp(vcpu, val);
+ else
+ r = -ENXIO;
+ break;
+#endif /* CONFIG_KVM_XIVE */
case KVM_REG_PPC_FSCR:
*val = get_reg_val(id, vcpu->arch.fscr);
break;
@@ -682,12 +727,24 @@
r = -ENXIO;
break;
}
- if (xive_enabled())
+ if (xics_on_xive())
r = kvmppc_xive_set_icp(vcpu, set_reg_val(id, *val));
else
r = kvmppc_xics_set_icp(vcpu, set_reg_val(id, *val));
break;
#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+ case KVM_REG_PPC_VP_STATE:
+ if (!vcpu->arch.xive_vcpu) {
+ r = -ENXIO;
+ break;
+ }
+ if (xive_enabled())
+ r = kvmppc_xive_native_set_vp(vcpu, val);
+ else
+ r = -ENXIO;
+ break;
+#endif /* CONFIG_KVM_XIVE */
case KVM_REG_PPC_FSCR:
vcpu->arch.fscr = set_reg_val(id, *val);
break;
@@ -804,9 +861,10 @@
void kvmppc_core_commit_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new)
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
- kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new);
+ kvm->arch.kvm_ops->commit_memory_region(kvm, mem, old, new, change);
}
int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
@@ -824,9 +882,10 @@
return kvm->arch.kvm_ops->test_age_hva(kvm, hva);
}
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
kvm->arch.kvm_ops->set_spte_hva(kvm, hva, pte);
+ return 0;
}
void kvmppc_mmu_destroy(struct kvm_vcpu *vcpu)
@@ -840,6 +899,7 @@
#ifdef CONFIG_PPC64
INIT_LIST_HEAD_RCU(&kvm->arch.spapr_tce_tables);
INIT_LIST_HEAD(&kvm->arch.rtas_tokens);
+ mutex_init(&kvm->arch.rtas_token_lock);
#endif
return kvm->arch.kvm_ops->init_vm(kvm);
@@ -853,6 +913,17 @@
kvmppc_rtas_tokens_free(kvm);
WARN_ON(!list_empty(&kvm->arch.spapr_tce_tables));
#endif
+
+#ifdef CONFIG_KVM_XICS
+ /*
+ * Free the XIVE devices which are not directly freed by the
+ * device 'release' method
+ */
+ kfree(kvm->arch.xive_devices.native);
+ kvm->arch.xive_devices.native = NULL;
+ kfree(kvm->arch.xive_devices.xics_on_xive);
+ kvm->arch.xive_devices.xics_on_xive = NULL;
+#endif /* CONFIG_KVM_XICS */
}
int kvmppc_h_logical_ci_load(struct kvm_vcpu *vcpu)
@@ -956,7 +1027,7 @@
int kvm_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level,
bool line_status)
{
- if (xive_enabled())
+ if (xics_on_xive())
return kvmppc_xive_set_irq(kvm, irq_source_id, irq, level,
line_status);
else
@@ -1009,9 +1080,14 @@
#ifdef CONFIG_KVM_XICS
#ifdef CONFIG_KVM_XIVE
- if (xive_enabled()) {
+ if (xics_on_xive()) {
kvmppc_xive_init_module();
kvm_register_device_ops(&kvm_xive_ops, KVM_DEV_TYPE_XICS);
+ if (kvmppc_xive_native_supported()) {
+ kvmppc_xive_native_init_module();
+ kvm_register_device_ops(&kvm_xive_native_ops,
+ KVM_DEV_TYPE_XIVE);
+ }
} else
#endif
kvm_register_device_ops(&kvm_xics_ops, KVM_DEV_TYPE_XICS);
@@ -1022,8 +1098,10 @@
static void kvmppc_book3s_exit(void)
{
#ifdef CONFIG_KVM_XICS
- if (xive_enabled())
+ if (xics_on_xive()) {
kvmppc_xive_exit_module();
+ kvmppc_xive_native_exit_module();
+ }
#endif
#ifdef CONFIG_KVM_BOOK3S_32_HANDLER
kvmppc_book3s_exit_pr();
diff --git a/arch/powerpc/kvm/book3s.h b/arch/powerpc/kvm/book3s.h
index 14ef035..2ef1311 100644
--- a/arch/powerpc/kvm/book3s.h
+++ b/arch/powerpc/kvm/book3s.h
@@ -1,12 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* Copyright IBM Corporation, 2013
* Author Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License or (at your optional) any later version of the license.
- *
*/
#ifndef __POWERPC_KVM_BOOK3S_H__
diff --git a/arch/powerpc/kvm/book3s_32_mmu.c b/arch/powerpc/kvm/book3s_32_mmu.c
index 6121699..18f244a 100644
--- a/arch/powerpc/kvm/book3s_32_mmu.c
+++ b/arch/powerpc/kvm/book3s_32_mmu.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
@@ -250,6 +239,7 @@
case 2:
case 6:
pte->may_write = true;
+ /* fall through */
case 3:
case 5:
case 7:
@@ -425,6 +415,7 @@
mmu->slbmte = NULL;
mmu->slbmfee = NULL;
mmu->slbmfev = NULL;
+ mmu->slbfee = NULL;
mmu->slbie = NULL;
mmu->slbia = NULL;
}
diff --git a/arch/powerpc/kvm/book3s_32_mmu_host.c b/arch/powerpc/kvm/book3s_32_mmu_host.c
index 9991069..d4cb3bc 100644
--- a/arch/powerpc/kvm/book3s_32_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_32_mmu_host.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
*
* Authors:
* Alexander Graf <agraf@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
diff --git a/arch/powerpc/kvm/book3s_32_sr.S b/arch/powerpc/kvm/book3s_32_sr.S
index 7e06a6f..e3ab9df 100644
--- a/arch/powerpc/kvm/book3s_32_sr.S
+++ b/arch/powerpc/kvm/book3s_32_sr.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c
index c92dd25..5f63a5f 100644
--- a/arch/powerpc/kvm/book3s_64_mmu.c
+++ b/arch/powerpc/kvm/book3s_64_mmu.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
@@ -435,6 +424,19 @@
kvmppc_mmu_map_segment(vcpu, esid << SID_SHIFT);
}
+static int kvmppc_mmu_book3s_64_slbfee(struct kvm_vcpu *vcpu, gva_t eaddr,
+ ulong *ret_slb)
+{
+ struct kvmppc_slb *slbe = kvmppc_mmu_book3s_64_find_slbe(vcpu, eaddr);
+
+ if (slbe) {
+ *ret_slb = slbe->origv;
+ return 0;
+ }
+ *ret_slb = 0;
+ return -ENOENT;
+}
+
static u64 kvmppc_mmu_book3s_64_slbmfee(struct kvm_vcpu *vcpu, u64 slb_nr)
{
struct kvmppc_slb *slbe;
@@ -670,6 +672,7 @@
mmu->slbmte = kvmppc_mmu_book3s_64_slbmte;
mmu->slbmfee = kvmppc_mmu_book3s_64_slbmfee;
mmu->slbmfev = kvmppc_mmu_book3s_64_slbmfev;
+ mmu->slbfee = kvmppc_mmu_book3s_64_slbfee;
mmu->slbie = kvmppc_mmu_book3s_64_slbie;
mmu->slbia = kvmppc_mmu_book3s_64_slbia;
mmu->xlate = kvmppc_mmu_book3s_64_xlate;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c
index 9a4614c..044dd49 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_host.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_host.c
@@ -1,22 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2009 SUSE Linux Products GmbH. All rights reserved.
*
* Authors:
* Alexander Graf <agraf@suse.de>
* Kevin Wolf <mail@kevin-wolf.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
index 68e14af..9a75f0e 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
@@ -63,7 +52,7 @@
struct work_struct work;
u32 order;
- /* These fields protected by kvm->lock */
+ /* These fields protected by kvm->arch.mmu_setup_lock */
/* Possible values and their usage:
* <0 an error occurred during allocation,
@@ -73,7 +62,7 @@
int error;
/* Private to the work thread, until error != -EBUSY,
- * then protected by kvm->lock.
+ * then protected by kvm->arch.mmu_setup_lock.
*/
struct kvm_hpt_info hpt;
};
@@ -139,7 +128,7 @@
long err = -EBUSY;
struct kvm_hpt_info info;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0;
/* order mmu_ready vs. vcpus_running */
@@ -183,7 +172,7 @@
/* Ensure that each vcpu will flush its TLB on next entry. */
cpumask_setall(&kvm->arch.need_tlb_flush);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return err;
}
@@ -268,14 +257,13 @@
{
unsigned long host_lpid, rsvd_lpid;
- if (!cpu_has_feature(CPU_FTR_HVMODE))
- return -EINVAL;
-
if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
return -EINVAL;
/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
- host_lpid = mfspr(SPRN_LPID);
+ host_lpid = 0;
+ if (cpu_has_feature(CPU_FTR_HVMODE))
+ host_lpid = mfspr(SPRN_LPID);
rsvd_lpid = LPID_RSVD;
kvmppc_init_lpid(rsvd_lpid + 1);
@@ -443,6 +431,24 @@
u32 last_inst;
/*
+ * Fast path - check if the guest physical address corresponds to a
+ * device on the FAST_MMIO_BUS, if so we can avoid loading the
+ * instruction all together, then we can just handle it and return.
+ */
+ if (is_store) {
+ int idx, ret;
+
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+ ret = kvm_io_bus_write(vcpu, KVM_FAST_MMIO_BUS, (gpa_t) gpa, 0,
+ NULL);
+ srcu_read_unlock(&vcpu->kvm->srcu, idx);
+ if (!ret) {
+ kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
+ return RESUME_GUEST;
+ }
+ }
+
+ /*
* If we fail, we just return to the guest and try executing it again.
*/
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst) !=
@@ -583,7 +589,7 @@
/* If writing != 0, then the HPTE must allow writing, if we get here */
write_ok = writing;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, writing, pages);
+ npages = get_user_pages_fast(hva, 1, writing ? FOLL_WRITE : 0, pages);
if (npages < 1) {
/* Check if it's an I/O mapping */
down_read(&current->mm->mmap_sem);
@@ -744,12 +750,15 @@
srcu_idx = srcu_read_lock(&kvm->srcu);
slots = kvm_memslots(kvm);
kvm_for_each_memslot(memslot, slots) {
+ /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ spin_lock(&kvm->mmu_lock);
/*
* This assumes it is acceptable to lose reference and
* change bits across a reset.
*/
memset(memslot->arch.rmap, 0,
memslot->npages * sizeof(*memslot->arch.rmap));
+ spin_unlock(&kvm->mmu_lock);
}
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
@@ -897,11 +906,12 @@
gfn = memslot->base_gfn;
rmapp = memslot->arch.rmap;
+ if (kvm_is_radix(kvm)) {
+ kvmppc_radix_flush_memslot(kvm, memslot);
+ return;
+ }
+
for (n = memslot->npages; n; --n, ++gfn) {
- if (kvm_is_radix(kvm)) {
- kvm_unmap_radix(kvm, memslot, gfn);
- continue;
- }
/*
* Testing the present bit without locking is OK because
* the memslot has been marked invalid already, and hence
@@ -1172,7 +1182,7 @@
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
goto err;
hva = gfn_to_hva_memslot(memslot, gfn);
- npages = get_user_pages_fast(hva, 1, 1, pages);
+ npages = get_user_pages_fast(hva, 1, FOLL_WRITE, pages);
if (npages < 1)
goto err;
page = pages[0];
@@ -1426,7 +1436,7 @@
static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
{
- if (WARN_ON(!mutex_is_locked(&kvm->lock)))
+ if (WARN_ON(!mutex_is_locked(&kvm->arch.mmu_setup_lock)))
return;
if (!resize)
@@ -1453,14 +1463,14 @@
if (WARN_ON(resize->error != -EBUSY))
return;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
/* Request is still current? */
if (kvm->arch.resize_hpt == resize) {
/* We may request large allocations here:
- * do not sleep with kvm->lock held for a while.
+ * do not sleep with kvm->arch.mmu_setup_lock held for a while.
*/
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
resize->order);
@@ -1473,9 +1483,9 @@
if (WARN_ON(err == -EBUSY))
err = -EINPROGRESS;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
/* It is possible that kvm->arch.resize_hpt != resize
- * after we grab kvm->lock again.
+ * after we grab kvm->arch.mmu_setup_lock again.
*/
}
@@ -1484,7 +1494,7 @@
if (kvm->arch.resize_hpt != resize)
resize_hpt_release(kvm, resize);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
}
long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
@@ -1501,7 +1511,7 @@
if (shift && ((shift < 18) || (shift > 46)))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
resize = kvm->arch.resize_hpt;
@@ -1544,7 +1554,7 @@
ret = 100; /* estimated time in ms */
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return ret;
}
@@ -1567,7 +1577,7 @@
if (shift && ((shift < 18) || (shift > 46)))
return -EINVAL;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
resize = kvm->arch.resize_hpt;
@@ -1604,7 +1614,7 @@
smp_mb();
out_no_hpt:
resize_hpt_release(kvm, resize);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return ret;
}
@@ -1741,7 +1751,7 @@
int first_pass;
unsigned long hpte[2];
- if (!access_ok(VERIFY_WRITE, buf, count))
+ if (!access_ok(buf, count))
return -EFAULT;
if (kvm_is_radix(kvm))
return 0;
@@ -1841,13 +1851,13 @@
int mmu_ready;
int pshift;
- if (!access_ok(VERIFY_READ, buf, count))
+ if (!access_ok(buf, count))
return -EFAULT;
if (kvm_is_radix(kvm))
return -EINVAL;
/* lock out vcpus from running while we're doing this */
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
mmu_ready = kvm->arch.mmu_ready;
if (mmu_ready) {
kvm->arch.mmu_ready = 0; /* temporarily */
@@ -1855,7 +1865,7 @@
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.mmu_ready = 1;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return -EBUSY;
}
}
@@ -1942,7 +1952,7 @@
/* Order HPTE updates vs. mmu_ready */
smp_wmb();
kvm->arch.mmu_ready = mmu_ready;
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
if (err)
return err;
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 998f8d0..2d415c3 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -1,7 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*
* Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
@@ -10,6 +8,9 @@
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/debugfs.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
@@ -26,18 +27,231 @@
*/
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
+unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
+ gva_t eaddr, void *to, void *from,
+ unsigned long n)
+{
+ int uninitialized_var(old_pid), old_lpid;
+ unsigned long quadrant, ret = n;
+ bool is_load = !!to;
+
+ /* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
+ if (kvmhv_on_pseries())
+ return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
+ __pa(to), __pa(from), n);
+
+ quadrant = 1;
+ if (!pid)
+ quadrant = 2;
+ if (is_load)
+ from = (void *) (eaddr | (quadrant << 62));
+ else
+ to = (void *) (eaddr | (quadrant << 62));
+
+ preempt_disable();
+
+ /* switch the lpid first to avoid running host with unallocated pid */
+ old_lpid = mfspr(SPRN_LPID);
+ if (old_lpid != lpid)
+ mtspr(SPRN_LPID, lpid);
+ if (quadrant == 1) {
+ old_pid = mfspr(SPRN_PID);
+ if (old_pid != pid)
+ mtspr(SPRN_PID, pid);
+ }
+ isync();
+
+ pagefault_disable();
+ if (is_load)
+ ret = raw_copy_from_user(to, from, n);
+ else
+ ret = raw_copy_to_user(to, from, n);
+ pagefault_enable();
+
+ /* switch the pid first to avoid running host with unallocated pid */
+ if (quadrant == 1 && pid != old_pid)
+ mtspr(SPRN_PID, old_pid);
+ if (lpid != old_lpid)
+ mtspr(SPRN_LPID, old_lpid);
+ isync();
+
+ preempt_enable();
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__kvmhv_copy_tofrom_guest_radix);
+
+static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
+ void *to, void *from, unsigned long n)
+{
+ int lpid = vcpu->kvm->arch.lpid;
+ int pid = vcpu->arch.pid;
+
+ /* This would cause a data segment intr so don't allow the access */
+ if (eaddr & (0x3FFUL << 52))
+ return -EINVAL;
+
+ /* Should we be using the nested lpid */
+ if (vcpu->arch.nested)
+ lpid = vcpu->arch.nested->shadow_lpid;
+
+ /* If accessing quadrant 3 then pid is expected to be 0 */
+ if (((eaddr >> 62) & 0x3) == 0x3)
+ pid = 0;
+
+ eaddr &= ~(0xFFFUL << 52);
+
+ return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
+}
+
+long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
+ unsigned long n)
+{
+ long ret;
+
+ ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
+ if (ret > 0)
+ memset(to + (n - ret), 0, ret);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(kvmhv_copy_from_guest_radix);
+
+long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
+ unsigned long n)
+{
+ return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
+}
+EXPORT_SYMBOL_GPL(kvmhv_copy_to_guest_radix);
+
+int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
+ struct kvmppc_pte *gpte, u64 root,
+ u64 *pte_ret_p)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret, level, ps;
+ unsigned long rts, bits, offset, index;
+ u64 pte, base, gpa;
+ __be64 rpte;
+
+ rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
+ ((root & RTS2_MASK) >> RTS2_SHIFT);
+ bits = root & RPDS_MASK;
+ base = root & RPDB_MASK;
+
+ offset = rts + 31;
+
+ /* Current implementations only support 52-bit space */
+ if (offset != 52)
+ return -EINVAL;
+
+ /* Walk each level of the radix tree */
+ for (level = 3; level >= 0; --level) {
+ u64 addr;
+ /* Check a valid size */
+ if (level && bits != p9_supported_radix_bits[level])
+ return -EINVAL;
+ if (level == 0 && !(bits == 5 || bits == 9))
+ return -EINVAL;
+ offset -= bits;
+ index = (eaddr >> offset) & ((1UL << bits) - 1);
+ /* Check that low bits of page table base are zero */
+ if (base & ((1UL << (bits + 3)) - 1))
+ return -EINVAL;
+ /* Read the entry from guest memory */
+ addr = base + (index * sizeof(rpte));
+ ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
+ if (ret) {
+ if (pte_ret_p)
+ *pte_ret_p = addr;
+ return ret;
+ }
+ pte = __be64_to_cpu(rpte);
+ if (!(pte & _PAGE_PRESENT))
+ return -ENOENT;
+ /* Check if a leaf entry */
+ if (pte & _PAGE_PTE)
+ break;
+ /* Get ready to walk the next level */
+ base = pte & RPDB_MASK;
+ bits = pte & RPDS_MASK;
+ }
+
+ /* Need a leaf at lowest level; 512GB pages not supported */
+ if (level < 0 || level == 3)
+ return -EINVAL;
+
+ /* We found a valid leaf PTE */
+ /* Offset is now log base 2 of the page size */
+ gpa = pte & 0x01fffffffffff000ul;
+ if (gpa & ((1ul << offset) - 1))
+ return -EINVAL;
+ gpa |= eaddr & ((1ul << offset) - 1);
+ for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
+ if (offset == mmu_psize_defs[ps].shift)
+ break;
+ gpte->page_size = ps;
+ gpte->page_shift = offset;
+
+ gpte->eaddr = eaddr;
+ gpte->raddr = gpa;
+
+ /* Work out permissions */
+ gpte->may_read = !!(pte & _PAGE_READ);
+ gpte->may_write = !!(pte & _PAGE_WRITE);
+ gpte->may_execute = !!(pte & _PAGE_EXEC);
+
+ gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
+
+ if (pte_ret_p)
+ *pte_ret_p = pte;
+
+ return 0;
+}
+
+/*
+ * Used to walk a partition or process table radix tree in guest memory
+ * Note: We exploit the fact that a partition table and a process
+ * table have the same layout, a partition-scoped page table and a
+ * process-scoped page table have the same layout, and the 2nd
+ * doubleword of a partition table entry has the same layout as
+ * the PTCR register.
+ */
+int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
+ struct kvmppc_pte *gpte, u64 table,
+ int table_index, u64 *pte_ret_p)
+{
+ struct kvm *kvm = vcpu->kvm;
+ int ret;
+ unsigned long size, ptbl, root;
+ struct prtb_entry entry;
+
+ if ((table & PRTS_MASK) > 24)
+ return -EINVAL;
+ size = 1ul << ((table & PRTS_MASK) + 12);
+
+ /* Is the table big enough to contain this entry? */
+ if ((table_index * sizeof(entry)) >= size)
+ return -EINVAL;
+
+ /* Read the table to find the root of the radix tree */
+ ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
+ ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
+ if (ret)
+ return ret;
+
+ /* Root is stored in the first double word */
+ root = be64_to_cpu(entry.prtb0);
+
+ return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
+}
+
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite)
{
- struct kvm *kvm = vcpu->kvm;
u32 pid;
- int ret, level, ps;
- __be64 prte, rpte;
- unsigned long ptbl;
- unsigned long root, pte, index;
- unsigned long rts, bits, offset;
- unsigned long gpa;
- unsigned long proc_tbl_size;
+ u64 pte;
+ int ret;
/* Work out effective PID */
switch (eaddr >> 62) {
@@ -50,71 +264,13 @@
default:
return -EINVAL;
}
- proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
- if (pid * 16 >= proc_tbl_size)
- return -EINVAL;
- /* Read partition table to find root of tree for effective PID */
- ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
- ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
+ ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
+ vcpu->kvm->arch.process_table, pid, &pte);
if (ret)
return ret;
- root = be64_to_cpu(prte);
- rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
- ((root & RTS2_MASK) >> RTS2_SHIFT);
- bits = root & RPDS_MASK;
- root = root & RPDB_MASK;
-
- offset = rts + 31;
-
- /* current implementations only support 52-bit space */
- if (offset != 52)
- return -EINVAL;
-
- for (level = 3; level >= 0; --level) {
- if (level && bits != p9_supported_radix_bits[level])
- return -EINVAL;
- if (level == 0 && !(bits == 5 || bits == 9))
- return -EINVAL;
- offset -= bits;
- index = (eaddr >> offset) & ((1UL << bits) - 1);
- /* check that low bits of page table base are zero */
- if (root & ((1UL << (bits + 3)) - 1))
- return -EINVAL;
- ret = kvm_read_guest(kvm, root + index * 8,
- &rpte, sizeof(rpte));
- if (ret)
- return ret;
- pte = __be64_to_cpu(rpte);
- if (!(pte & _PAGE_PRESENT))
- return -ENOENT;
- if (pte & _PAGE_PTE)
- break;
- bits = pte & 0x1f;
- root = pte & 0x0fffffffffffff00ul;
- }
- /* need a leaf at lowest level; 512GB pages not supported */
- if (level < 0 || level == 3)
- return -EINVAL;
-
- /* offset is now log base 2 of the page size */
- gpa = pte & 0x01fffffffffff000ul;
- if (gpa & ((1ul << offset) - 1))
- return -EINVAL;
- gpa += eaddr & ((1ul << offset) - 1);
- for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
- if (offset == mmu_psize_defs[ps].shift)
- break;
- gpte->page_size = ps;
-
- gpte->eaddr = eaddr;
- gpte->raddr = gpa;
-
- /* Work out permissions */
- gpte->may_read = !!(pte & _PAGE_READ);
- gpte->may_write = !!(pte & _PAGE_WRITE);
- gpte->may_execute = !!(pte & _PAGE_EXEC);
+ /* Check privilege (applies only to process scoped translations) */
if (kvmppc_get_msr(vcpu) & MSR_PR) {
if (pte & _PAGE_PRIVILEGED) {
gpte->may_read = 0;
@@ -136,21 +292,47 @@
return 0;
}
-static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
- unsigned int pshift)
+void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
+ unsigned int pshift, unsigned int lpid)
{
unsigned long psize = PAGE_SIZE;
+ int psi;
+ long rc;
+ unsigned long rb;
if (pshift)
psize = 1UL << pshift;
+ else
+ pshift = PAGE_SHIFT;
addr &= ~(psize - 1);
- radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
+
+ if (!kvmhv_on_pseries()) {
+ radix__flush_tlb_lpid_page(lpid, addr, psize);
+ return;
+ }
+
+ psi = shift_to_mmu_psize(pshift);
+ rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
+ lpid, rb);
+ if (rc)
+ pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
-static void kvmppc_radix_flush_pwc(struct kvm *kvm)
+static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
- radix__flush_pwc_lpid(kvm->arch.lpid);
+ long rc;
+
+ if (!kvmhv_on_pseries()) {
+ radix__flush_pwc_lpid(lpid);
+ return;
+ }
+
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
+ lpid, TLBIEL_INVAL_SET_LPID);
+ if (rc)
+ pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}
static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
@@ -179,12 +361,6 @@
kmem_cache_free(kvm_pte_cache, ptep);
}
-/* Like pmd_huge() and pmd_large(), but works regardless of config options */
-static inline int pmd_is_leaf(pmd_t pmd)
-{
- return !!(pmd_val(pmd) & _PAGE_PTE);
-}
-
static pmd_t *kvmppc_pmd_alloc(void)
{
return kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
@@ -195,23 +371,44 @@
kmem_cache_free(kvm_pmd_cache, pmdp);
}
-static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
- unsigned long gpa, unsigned int shift)
+/* Called with kvm->mmu_lock held */
+void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
+ unsigned int shift,
+ const struct kvm_memory_slot *memslot,
+ unsigned int lpid)
{
- unsigned long page_size = 1ul << shift;
unsigned long old;
+ unsigned long gfn = gpa >> PAGE_SHIFT;
+ unsigned long page_size = PAGE_SIZE;
+ unsigned long hpa;
old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
- kvmppc_radix_tlbie_page(kvm, gpa, shift);
- if (old & _PAGE_DIRTY) {
- unsigned long gfn = gpa >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot;
+ kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+ /* The following only applies to L1 entries */
+ if (lpid != kvm->arch.lpid)
+ return;
+
+ if (!memslot) {
memslot = gfn_to_memslot(kvm, gfn);
- if (memslot && memslot->dirty_bitmap)
- kvmppc_update_dirty_map(memslot, gfn, page_size);
+ if (!memslot)
+ return;
}
+ if (shift) { /* 1GB or 2MB page */
+ page_size = 1ul << shift;
+ if (shift == PMD_SHIFT)
+ kvm->stat.num_2M_pages--;
+ else if (shift == PUD_SHIFT)
+ kvm->stat.num_1G_pages--;
+ }
+
+ gpa &= ~(page_size - 1);
+ hpa = old & PTE_RPN_MASK;
+ kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
+
+ if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
+ kvmppc_update_dirty_map(memslot, gfn, page_size);
}
/*
@@ -224,7 +421,8 @@
* and emit a warning if encountered, but there may already be data
* corruption due to the unexpected mappings.
*/
-static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
+static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
+ unsigned int lpid)
{
if (full) {
memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
@@ -238,14 +436,15 @@
WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, p,
pte_pfn(*p) << PAGE_SHIFT,
- PAGE_SHIFT);
+ PAGE_SHIFT, NULL, lpid);
}
}
kvmppc_pte_free(pte);
}
-static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
+static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
+ unsigned int lpid)
{
unsigned long im;
pmd_t *p = pmd;
@@ -260,20 +459,21 @@
WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, (pte_t *)p,
pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
- PMD_SHIFT);
+ PMD_SHIFT, NULL, lpid);
}
} else {
pte_t *pte;
pte = pte_offset_map(p, 0);
- kvmppc_unmap_free_pte(kvm, pte, full);
+ kvmppc_unmap_free_pte(kvm, pte, full, lpid);
pmd_clear(p);
}
}
kvmppc_pmd_free(pmd);
}
-static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
+static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
+ unsigned int lpid)
{
unsigned long iu;
pud_t *p = pud;
@@ -281,42 +481,46 @@
for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
if (!pud_present(*p))
continue;
- if (pud_huge(*p)) {
+ if (pud_is_leaf(*p)) {
pud_clear(p);
} else {
pmd_t *pmd;
pmd = pmd_offset(p, 0);
- kvmppc_unmap_free_pmd(kvm, pmd, true);
+ kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
pud_clear(p);
}
}
pud_free(kvm->mm, pud);
}
-void kvmppc_free_radix(struct kvm *kvm)
+void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
unsigned long ig;
- pgd_t *pgd;
- if (!kvm->arch.pgtable)
- return;
- pgd = kvm->arch.pgtable;
for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
pud_t *pud;
if (!pgd_present(*pgd))
continue;
pud = pud_offset(pgd, 0);
- kvmppc_unmap_free_pud(kvm, pud);
+ kvmppc_unmap_free_pud(kvm, pud, lpid);
pgd_clear(pgd);
}
- pgd_free(kvm->mm, kvm->arch.pgtable);
- kvm->arch.pgtable = NULL;
+}
+
+void kvmppc_free_radix(struct kvm *kvm)
+{
+ if (kvm->arch.pgtable) {
+ kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
+ kvm->arch.lpid);
+ pgd_free(kvm->mm, kvm->arch.pgtable);
+ kvm->arch.pgtable = NULL;
+ }
}
static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
- unsigned long gpa)
+ unsigned long gpa, unsigned int lpid)
{
pte_t *pte = pte_offset_kernel(pmd, 0);
@@ -326,13 +530,13 @@
* flushing the PWC again.
*/
pmd_clear(pmd);
- kvmppc_radix_flush_pwc(kvm);
+ kvmppc_radix_flush_pwc(kvm, lpid);
- kvmppc_unmap_free_pte(kvm, pte, false);
+ kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}
static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
- unsigned long gpa)
+ unsigned long gpa, unsigned int lpid)
{
pmd_t *pmd = pmd_offset(pud, 0);
@@ -342,9 +546,9 @@
* so can be freed without flushing the PWC again.
*/
pud_clear(pud);
- kvmppc_radix_flush_pwc(kvm);
+ kvmppc_radix_flush_pwc(kvm, lpid);
- kvmppc_unmap_free_pmd(kvm, pmd, false);
+ kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}
/*
@@ -356,8 +560,10 @@
*/
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
-static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
- unsigned int level, unsigned long mmu_seq)
+int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
+ unsigned long gpa, unsigned int level,
+ unsigned long mmu_seq, unsigned int lpid,
+ unsigned long *rmapp, struct rmap_nested **n_rmap)
{
pgd_t *pgd;
pud_t *pud, *new_pud = NULL;
@@ -366,7 +572,7 @@
int ret;
/* Traverse the guest's 2nd-level tree, allocate new levels needed */
- pgd = kvm->arch.pgtable + pgd_index(gpa);
+ pgd = pgtable + pgd_index(gpa);
pud = NULL;
if (pgd_present(*pgd))
pud = pud_offset(pgd, gpa);
@@ -374,7 +580,7 @@
new_pud = pud_alloc_one(kvm->mm, gpa);
pmd = NULL;
- if (pud && pud_present(*pud) && !pud_huge(*pud))
+ if (pud && pud_present(*pud) && !pud_is_leaf(*pud))
pmd = pmd_offset(pud, gpa);
else if (level <= 1)
new_pmd = kvmppc_pmd_alloc();
@@ -397,7 +603,7 @@
new_pud = NULL;
}
pud = pud_offset(pgd, gpa);
- if (pud_huge(*pud)) {
+ if (pud_is_leaf(*pud)) {
unsigned long hgpa = gpa & PUD_MASK;
/* Check if we raced and someone else has set the same thing */
@@ -423,7 +629,8 @@
goto out_unlock;
}
/* Valid 1GB page here already, remove it */
- kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
+ kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
+ lpid);
}
if (level == 2) {
if (!pud_none(*pud)) {
@@ -432,9 +639,11 @@
* install a large page, so remove and free the page
* table page.
*/
- kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
+ kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
+ if (rmapp && n_rmap)
+ kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
goto out_unlock;
}
@@ -458,7 +667,7 @@
WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
PTE_BITS_MUST_MATCH);
kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
- 0, pte_val(pte), lgpa, PMD_SHIFT);
+ 0, pte_val(pte), lgpa, PMD_SHIFT);
ret = 0;
goto out_unlock;
}
@@ -472,7 +681,8 @@
goto out_unlock;
}
/* Valid 2MB page here already, remove it */
- kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
+ kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
+ lpid);
}
if (level == 1) {
if (!pmd_none(*pmd)) {
@@ -481,9 +691,11 @@
* install a large page, so remove and free the page
* table page.
*/
- kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
+ kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
+ if (rmapp && n_rmap)
+ kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
goto out_unlock;
}
@@ -508,6 +720,8 @@
goto out_unlock;
}
kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
+ if (rmapp && n_rmap)
+ kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
out_unlock:
@@ -521,95 +735,50 @@
return ret;
}
-int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
- unsigned long ea, unsigned long dsisr)
+bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
+ unsigned long gpa, unsigned int lpid)
+{
+ unsigned long pgflags;
+ unsigned int shift;
+ pte_t *ptep;
+
+ /*
+ * Need to set an R or C bit in the 2nd-level tables;
+ * since we are just helping out the hardware here,
+ * it is sufficient to do what the hardware does.
+ */
+ pgflags = _PAGE_ACCESSED;
+ if (writing)
+ pgflags |= _PAGE_DIRTY;
+ /*
+ * We are walking the secondary (partition-scoped) page table here.
+ * We can do this without disabling irq because the Linux MM
+ * subsystem doesn't do THP splits and collapses on this tree.
+ */
+ ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
+ if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
+ kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
+ return true;
+ }
+ return false;
+}
+
+int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
+ unsigned long gpa,
+ struct kvm_memory_slot *memslot,
+ bool writing, bool kvm_ro,
+ pte_t *inserted_pte, unsigned int *levelp)
{
struct kvm *kvm = vcpu->kvm;
- unsigned long mmu_seq;
- unsigned long gpa, gfn, hva;
- struct kvm_memory_slot *memslot;
struct page *page = NULL;
- long ret;
- bool writing;
+ unsigned long mmu_seq;
+ unsigned long hva, gfn = gpa >> PAGE_SHIFT;
bool upgrade_write = false;
bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
- unsigned long pgflags;
unsigned int shift, level;
-
- /* Check for unusual errors */
- if (dsisr & DSISR_UNSUPP_MMU) {
- pr_err("KVM: Got unsupported MMU fault\n");
- return -EFAULT;
- }
- if (dsisr & DSISR_BADACCESS) {
- /* Reflect to the guest as DSI */
- pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
- kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
- return RESUME_GUEST;
- }
-
- /* Translate the logical address and get the page */
- gpa = vcpu->arch.fault_gpa & ~0xfffUL;
- gpa &= ~0xF000000000000000ul;
- gfn = gpa >> PAGE_SHIFT;
- if (!(dsisr & DSISR_PRTABLE_FAULT))
- gpa |= ea & 0xfff;
- memslot = gfn_to_memslot(kvm, gfn);
-
- /* No memslot means it's an emulated MMIO region */
- if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
- if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
- DSISR_SET_RC)) {
- /*
- * Bad address in guest page table tree, or other
- * unusual error - reflect it to the guest as DSI.
- */
- kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
- return RESUME_GUEST;
- }
- return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
- dsisr & DSISR_ISSTORE);
- }
-
- writing = (dsisr & DSISR_ISSTORE) != 0;
- if (memslot->flags & KVM_MEM_READONLY) {
- if (writing) {
- /* give the guest a DSI */
- dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
- kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
- return RESUME_GUEST;
- }
- upgrade_p = NULL;
- }
-
- if (dsisr & DSISR_SET_RC) {
- /*
- * Need to set an R or C bit in the 2nd-level tables;
- * since we are just helping out the hardware here,
- * it is sufficient to do what the hardware does.
- */
- pgflags = _PAGE_ACCESSED;
- if (writing)
- pgflags |= _PAGE_DIRTY;
- /*
- * We are walking the secondary page table here. We can do this
- * without disabling irq.
- */
- spin_lock(&kvm->mmu_lock);
- ptep = __find_linux_pte(kvm->arch.pgtable,
- gpa, NULL, &shift);
- if (ptep && pte_present(*ptep) &&
- (!writing || pte_write(*ptep))) {
- kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
- gpa, shift);
- dsisr &= ~DSISR_SET_RC;
- }
- spin_unlock(&kvm->mmu_lock);
- if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
- DSISR_PROTFAULT | DSISR_SET_RC)))
- return RESUME_GUEST;
- }
+ int ret;
+ bool large_enable;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
@@ -622,7 +791,7 @@
* is that the page is writable.
*/
hva = gfn_to_hva_memslot(memslot, gfn);
- if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
+ if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
upgrade_write = true;
} else {
unsigned long pfn;
@@ -659,12 +828,15 @@
pte = *ptep;
local_irq_enable();
+ /* If we're logging dirty pages, always map single pages */
+ large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);
+
/* Get pte level from shift/size */
- if (shift == PUD_SHIFT &&
+ if (large_enable && shift == PUD_SHIFT &&
(gpa & (PUD_SIZE - PAGE_SIZE)) ==
(hva & (PUD_SIZE - PAGE_SIZE))) {
level = 2;
- } else if (shift == PMD_SHIFT &&
+ } else if (large_enable && shift == PMD_SHIFT &&
(gpa & (PMD_SIZE - PAGE_SIZE)) ==
(hva & (PMD_SIZE - PAGE_SIZE))) {
level = 1;
@@ -690,7 +862,12 @@
}
/* Allocate space in the tree and write the PTE */
- ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
+ ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
+ mmu_seq, kvm->arch.lpid, NULL, NULL);
+ if (inserted_pte)
+ *inserted_pte = pte;
+ if (levelp)
+ *levelp = level;
if (page) {
if (!ret && (pte_val(pte) & _PAGE_WRITE))
@@ -698,36 +875,111 @@
put_page(page);
}
+ /* Increment number of large pages if we (successfully) inserted one */
+ if (!ret) {
+ if (level == 1)
+ kvm->stat.num_2M_pages++;
+ else if (level == 2)
+ kvm->stat.num_1G_pages++;
+ }
+
+ return ret;
+}
+
+int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
+ unsigned long ea, unsigned long dsisr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long gpa, gfn;
+ struct kvm_memory_slot *memslot;
+ long ret;
+ bool writing = !!(dsisr & DSISR_ISSTORE);
+ bool kvm_ro = false;
+
+ /* Check for unusual errors */
+ if (dsisr & DSISR_UNSUPP_MMU) {
+ pr_err("KVM: Got unsupported MMU fault\n");
+ return -EFAULT;
+ }
+ if (dsisr & DSISR_BADACCESS) {
+ /* Reflect to the guest as DSI */
+ pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+
+ /* Translate the logical address */
+ gpa = vcpu->arch.fault_gpa & ~0xfffUL;
+ gpa &= ~0xF000000000000000ul;
+ gfn = gpa >> PAGE_SHIFT;
+ if (!(dsisr & DSISR_PRTABLE_FAULT))
+ gpa |= ea & 0xfff;
+
+ /* Get the corresponding memslot */
+ memslot = gfn_to_memslot(kvm, gfn);
+
+ /* No memslot means it's an emulated MMIO region */
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+ if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
+ DSISR_SET_RC)) {
+ /*
+ * Bad address in guest page table tree, or other
+ * unusual error - reflect it to the guest as DSI.
+ */
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+ return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+ }
+
+ if (memslot->flags & KVM_MEM_READONLY) {
+ if (writing) {
+ /* give the guest a DSI */
+ kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
+ DSISR_PROTFAULT);
+ return RESUME_GUEST;
+ }
+ kvm_ro = true;
+ }
+
+ /* Failed to set the reference/change bits */
+ if (dsisr & DSISR_SET_RC) {
+ spin_lock(&kvm->mmu_lock);
+ if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
+ writing, gpa, kvm->arch.lpid))
+ dsisr &= ~DSISR_SET_RC;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+ DSISR_PROTFAULT | DSISR_SET_RC)))
+ return RESUME_GUEST;
+ }
+
+ /* Try to insert a pte */
+ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
+ kvm_ro, NULL, NULL);
+
if (ret == 0 || ret == -EAGAIN)
ret = RESUME_GUEST;
return ret;
}
-/* Called with kvm->lock held */
+/* Called with kvm->mmu_lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
- unsigned long old;
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
- if (ptep && pte_present(*ptep)) {
- old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
- gpa, shift);
- kvmppc_radix_tlbie_page(kvm, gpa, shift);
- if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
- unsigned long psize = PAGE_SIZE;
- if (shift)
- psize = 1ul << shift;
- kvmppc_update_dirty_map(memslot, gfn, psize);
- }
- }
+ if (ptep && pte_present(*ptep))
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+ kvm->arch.lpid);
return 0;
}
-/* Called with kvm->lock held */
+/* Called with kvm->mmu_lock held */
int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
@@ -735,18 +987,24 @@
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
int ref = 0;
+ unsigned long old, *rmapp;
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
- kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
- gpa, shift);
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
+ gpa, shift);
/* XXX need to flush tlb here? */
+ /* Also clear bit in ptes in shadow pgtable for nested guests */
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
+ old & PTE_RPN_MASK,
+ 1UL << shift);
ref = 1;
}
return ref;
}
-/* Called with kvm->lock held */
+/* Called with kvm->mmu_lock held */
int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
{
@@ -770,15 +1028,23 @@
pte_t *ptep;
unsigned int shift;
int ret = 0;
+ unsigned long old, *rmapp;
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) {
ret = 1;
if (shift)
ret = 1 << (shift - PAGE_SHIFT);
- kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
- gpa, shift);
- kvmppc_radix_tlbie_page(kvm, gpa, shift);
+ spin_lock(&kvm->mmu_lock);
+ old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
+ gpa, shift);
+ kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
+ /* Also clear bit in ptes in shadow pgtable for nested guests */
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
+ old & PTE_RPN_MASK,
+ 1UL << shift);
+ spin_unlock(&kvm->mmu_lock);
}
return ret;
}
@@ -808,6 +1074,26 @@
return 0;
}
+void kvmppc_radix_flush_memslot(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot)
+{
+ unsigned long n;
+ pte_t *ptep;
+ unsigned long gpa;
+ unsigned int shift;
+
+ gpa = memslot->base_gfn << PAGE_SHIFT;
+ spin_lock(&kvm->mmu_lock);
+ for (n = memslot->npages; n; --n) {
+ ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ if (ptep && pte_present(*ptep))
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
+ kvm->arch.lpid);
+ gpa += PAGE_SIZE;
+ }
+ spin_unlock(&kvm->mmu_lock);
+}
+
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
int psize, int *indexp)
{
@@ -863,6 +1149,215 @@
memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}
+struct debugfs_radix_state {
+ struct kvm *kvm;
+ struct mutex mutex;
+ unsigned long gpa;
+ int lpid;
+ int chars_left;
+ int buf_index;
+ char buf[128];
+ u8 hdr;
+};
+
+static int debugfs_radix_open(struct inode *inode, struct file *file)
+{
+ struct kvm *kvm = inode->i_private;
+ struct debugfs_radix_state *p;
+
+ p = kzalloc(sizeof(*p), GFP_KERNEL);
+ if (!p)
+ return -ENOMEM;
+
+ kvm_get_kvm(kvm);
+ p->kvm = kvm;
+ mutex_init(&p->mutex);
+ file->private_data = p;
+
+ return nonseekable_open(inode, file);
+}
+
+static int debugfs_radix_release(struct inode *inode, struct file *file)
+{
+ struct debugfs_radix_state *p = file->private_data;
+
+ kvm_put_kvm(p->kvm);
+ kfree(p);
+ return 0;
+}
+
+static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ struct debugfs_radix_state *p = file->private_data;
+ ssize_t ret, r;
+ unsigned long n;
+ struct kvm *kvm;
+ unsigned long gpa;
+ pgd_t *pgt;
+ struct kvm_nested_guest *nested;
+ pgd_t pgd, *pgdp;
+ pud_t pud, *pudp;
+ pmd_t pmd, *pmdp;
+ pte_t *ptep;
+ int shift;
+ unsigned long pte;
+
+ kvm = p->kvm;
+ if (!kvm_is_radix(kvm))
+ return 0;
+
+ ret = mutex_lock_interruptible(&p->mutex);
+ if (ret)
+ return ret;
+
+ if (p->chars_left) {
+ n = p->chars_left;
+ if (n > len)
+ n = len;
+ r = copy_to_user(buf, p->buf + p->buf_index, n);
+ n -= r;
+ p->chars_left -= n;
+ p->buf_index += n;
+ buf += n;
+ len -= n;
+ ret = n;
+ if (r) {
+ if (!n)
+ ret = -EFAULT;
+ goto out;
+ }
+ }
+
+ gpa = p->gpa;
+ nested = NULL;
+ pgt = NULL;
+ while (len != 0 && p->lpid >= 0) {
+ if (gpa >= RADIX_PGTABLE_RANGE) {
+ gpa = 0;
+ pgt = NULL;
+ if (nested) {
+ kvmhv_put_nested(nested);
+ nested = NULL;
+ }
+ p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
+ p->hdr = 0;
+ if (p->lpid < 0)
+ break;
+ }
+ if (!pgt) {
+ if (p->lpid == 0) {
+ pgt = kvm->arch.pgtable;
+ } else {
+ nested = kvmhv_get_nested(kvm, p->lpid, false);
+ if (!nested) {
+ gpa = RADIX_PGTABLE_RANGE;
+ continue;
+ }
+ pgt = nested->shadow_pgtable;
+ }
+ }
+ n = 0;
+ if (!p->hdr) {
+ if (p->lpid > 0)
+ n = scnprintf(p->buf, sizeof(p->buf),
+ "\nNested LPID %d: ", p->lpid);
+ n += scnprintf(p->buf + n, sizeof(p->buf) - n,
+ "pgdir: %lx\n", (unsigned long)pgt);
+ p->hdr = 1;
+ goto copy;
+ }
+
+ pgdp = pgt + pgd_index(gpa);
+ pgd = READ_ONCE(*pgdp);
+ if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
+ gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
+ continue;
+ }
+
+ pudp = pud_offset(&pgd, gpa);
+ pud = READ_ONCE(*pudp);
+ if (!(pud_val(pud) & _PAGE_PRESENT)) {
+ gpa = (gpa & PUD_MASK) + PUD_SIZE;
+ continue;
+ }
+ if (pud_val(pud) & _PAGE_PTE) {
+ pte = pud_val(pud);
+ shift = PUD_SHIFT;
+ goto leaf;
+ }
+
+ pmdp = pmd_offset(&pud, gpa);
+ pmd = READ_ONCE(*pmdp);
+ if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+ gpa = (gpa & PMD_MASK) + PMD_SIZE;
+ continue;
+ }
+ if (pmd_val(pmd) & _PAGE_PTE) {
+ pte = pmd_val(pmd);
+ shift = PMD_SHIFT;
+ goto leaf;
+ }
+
+ ptep = pte_offset_kernel(&pmd, gpa);
+ pte = pte_val(READ_ONCE(*ptep));
+ if (!(pte & _PAGE_PRESENT)) {
+ gpa += PAGE_SIZE;
+ continue;
+ }
+ shift = PAGE_SHIFT;
+ leaf:
+ n = scnprintf(p->buf, sizeof(p->buf),
+ " %lx: %lx %d\n", gpa, pte, shift);
+ gpa += 1ul << shift;
+ copy:
+ p->chars_left = n;
+ if (n > len)
+ n = len;
+ r = copy_to_user(buf, p->buf, n);
+ n -= r;
+ p->chars_left -= n;
+ p->buf_index = n;
+ buf += n;
+ len -= n;
+ ret += n;
+ if (r) {
+ if (!ret)
+ ret = -EFAULT;
+ break;
+ }
+ }
+ p->gpa = gpa;
+ if (nested)
+ kvmhv_put_nested(nested);
+
+ out:
+ mutex_unlock(&p->mutex);
+ return ret;
+}
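The bookkeeping above leans on copy_to_user() returning the number of bytes it could not copy, so n -= r yields the count actually delivered, and chars_left/buf_index park the remainder for the next read() call. A minimal user-space model of that carry-over, with hypothetical names:

#include <stdio.h>
#include <string.h>

struct state {
	char buf[128];
	int chars_left;
	int buf_index;
};

/* Pretend destination that accepts at most 'cap' bytes, like a short
 * copy_to_user(); returns the number of bytes NOT copied. */
static size_t copy_out(char *dst, size_t cap, const char *src, size_t n)
{
	size_t done = n < cap ? n : cap;

	memcpy(dst, src, done);
	return n - done;
}

int main(void)
{
	struct state s = { .chars_left = 10, .buf = "0123456789" };
	char user[4];
	size_t n = s.chars_left, r;

	if (n > sizeof(user))
		n = sizeof(user);
	r = copy_out(user, sizeof(user), s.buf + s.buf_index, n);
	n -= r;				/* bytes actually delivered */
	s.chars_left -= n;
	s.buf_index += n;
	printf("delivered %zu, %d left at index %d\n", n, s.chars_left, s.buf_index);
	return 0;
}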
+
+static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ return -EACCES;
+}
+
+static const struct file_operations debugfs_radix_fops = {
+ .owner = THIS_MODULE,
+ .open = debugfs_radix_open,
+ .release = debugfs_radix_release,
+ .read = debugfs_radix_read,
+ .write = debugfs_radix_write,
+ .llseek = generic_file_llseek,
+};
+
+void kvmhv_radix_debugfs_init(struct kvm *kvm)
+{
+ kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
+ kvm->arch.debugfs_dir, kvm,
+ &debugfs_radix_fops);
+}
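A hedged sketch of how the resulting per-VM debugfs file might be consumed; the path is an assumption (it follows the usual kvm debugfs layout of a <pid>-<vm fd> directory under the debugfs mount):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	/* Path assumed: /sys/kernel/debug/kvm/<pid>-<vm fd>/radix */
	int fd = open("/sys/kernel/debug/kvm/1234-11/radix", O_RDONLY);

	if (fd < 0)
		return 1;
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);	/* stream the page-table dump */
	close(fd);
	return 0;
}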
+
int kvmppc_radix_init(void)
{
unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
diff --git a/arch/powerpc/kvm/book3s_64_slb.S b/arch/powerpc/kvm/book3s_64_slb.S
index 066c665..4d958dd 100644
--- a/arch/powerpc/kvm/book3s_64_slb.S
+++ b/arch/powerpc/kvm/book3s_64_slb.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 9a3f264..5834db0 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
* Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
@@ -30,6 +19,7 @@
#include <linux/anon_inodes.h>
#include <linux/iommu.h>
#include <linux/file.h>
+#include <linux/mm.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
@@ -56,43 +46,6 @@
return tce_pages + ALIGN(stt_bytes, PAGE_SIZE) / PAGE_SIZE;
}
-static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc)
-{
- long ret = 0;
-
- if (!current || !current->mm)
- return ret; /* process exited */
-
- down_write(&current->mm->mmap_sem);
-
- if (inc) {
- unsigned long locked, lock_limit;
-
- locked = current->mm->locked_vm + stt_pages;
- lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
- if (locked > lock_limit && !capable(CAP_IPC_LOCK))
- ret = -ENOMEM;
- else
- current->mm->locked_vm += stt_pages;
- } else {
- if (WARN_ON_ONCE(stt_pages > current->mm->locked_vm))
- stt_pages = current->mm->locked_vm;
-
- current->mm->locked_vm -= stt_pages;
- }
-
- pr_debug("[%d] RLIMIT_MEMLOCK KVM %c%ld %ld/%ld%s\n", current->pid,
- inc ? '+' : '-',
- stt_pages << PAGE_SHIFT,
- current->mm->locked_vm << PAGE_SHIFT,
- rlimit(RLIMIT_MEMLOCK),
- ret ? " - exceeded" : "");
-
- up_write(&current->mm->mmap_sem);
-
- return ret;
-}
-
static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head)
{
struct kvmppc_spapr_tce_iommu_table *stit = container_of(head,
@@ -133,7 +86,6 @@
continue;
kref_put(&stit->kref, kvm_spapr_tce_liobn_put);
- return;
}
}
}
@@ -229,11 +181,33 @@
unsigned long i, npages = kvmppc_tce_pages(stt->size);
for (i = 0; i < npages; i++)
- __free_page(stt->pages[i]);
+ if (stt->pages[i])
+ __free_page(stt->pages[i]);
kfree(stt);
}
+static struct page *kvm_spapr_get_tce_page(struct kvmppc_spapr_tce_table *stt,
+ unsigned long sttpage)
+{
+ struct page *page = stt->pages[sttpage];
+
+ if (page)
+ return page;
+
+ mutex_lock(&stt->alloc_lock);
+ page = stt->pages[sttpage];
+ if (!page) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ WARN_ON_ONCE(!page);
+ if (page)
+ stt->pages[sttpage] = page;
+ }
+ mutex_unlock(&stt->alloc_lock);
+
+ return page;
+}
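kvm_spapr_get_tce_page() is the classic double-checked pattern: an unlocked fast path for the already-allocated case, then a re-check under stt->alloc_lock so two racing faults allocate only one backing page. A standalone user-space sketch of the same pattern, with hypothetical names (link with -lpthread):

#include <pthread.h>
#include <stdlib.h>

#define NSLOTS 512

static void *slots[NSLOTS];
static pthread_mutex_t alloc_lock = PTHREAD_MUTEX_INITIALIZER;

static void *get_slot(unsigned long i)
{
	void *p = slots[i];		/* unlocked fast path */

	if (p)
		return p;
	pthread_mutex_lock(&alloc_lock);
	p = slots[i];			/* re-check under the lock */
	if (!p) {
		p = calloc(1, 4096);	/* stands in for alloc_page(__GFP_ZERO) */
		if (p)
			slots[i] = p;
	}
	pthread_mutex_unlock(&alloc_lock);
	return p;
}

int main(void)
{
	return get_slot(3) ? 0 : 1;
}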
+
static vm_fault_t kvm_spapr_tce_fault(struct vm_fault *vmf)
{
struct kvmppc_spapr_tce_table *stt = vmf->vma->vm_file->private_data;
@@ -242,7 +216,10 @@
if (vmf->pgoff >= kvmppc_tce_pages(stt->size))
return VM_FAULT_SIGBUS;
- page = stt->pages[vmf->pgoff];
+ page = kvm_spapr_get_tce_page(stt, vmf->pgoff);
+ if (!page)
+ return VM_FAULT_OOM;
+
get_page(page);
vmf->page = page;
return 0;
@@ -278,7 +255,7 @@
kvm_put_kvm(stt->kvm);
- kvmppc_account_memlimit(
+ account_locked_vm(current->mm,
kvmppc_stt_pages(kvmppc_tce_pages(stt->size)), false);
call_rcu(&stt->rcu, release_spapr_tce_table);
@@ -297,14 +274,13 @@
struct kvmppc_spapr_tce_table *siter;
unsigned long npages, size = args->size;
int ret = -ENOMEM;
- int i;
if (!args->size || args->page_shift < 12 || args->page_shift > 34 ||
(args->offset + args->size > (ULLONG_MAX >> args->page_shift)))
return -EINVAL;
npages = kvmppc_tce_pages(size);
- ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
+ ret = account_locked_vm(current->mm, kvmppc_stt_pages(npages), true);
if (ret)
return ret;
@@ -319,14 +295,9 @@
stt->offset = args->offset;
stt->size = size;
stt->kvm = kvm;
+ mutex_init(&stt->alloc_lock);
INIT_LIST_HEAD_RCU(&stt->iommu_tables);
- for (i = 0; i < npages; i++) {
- stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO);
- if (!stt->pages[i])
- goto fail;
- }
-
mutex_lock(&kvm->lock);
/* Check this LIOBN hasn't been previously allocated */
@@ -338,37 +309,114 @@
}
}
+ kvm_get_kvm(kvm);
if (!ret)
ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
stt, O_RDWR | O_CLOEXEC);
- if (ret >= 0) {
+ if (ret >= 0)
list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
- kvm_get_kvm(kvm);
- }
+ else
+ kvm_put_kvm(kvm);
mutex_unlock(&kvm->lock);
if (ret >= 0)
return ret;
- fail:
- for (i = 0; i < npages; i++)
- if (stt->pages[i])
- __free_page(stt->pages[i]);
-
kfree(stt);
fail_acct:
- kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
+ account_locked_vm(current->mm, kvmppc_stt_pages(npages), false);
return ret;
}
-static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
+static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
+ unsigned long *ua)
+{
+ unsigned long gfn = tce >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+
+ memslot = search_memslots(kvm_memslots(kvm), gfn);
+ if (!memslot)
+ return -EINVAL;
+
+ *ua = __gfn_to_hva_memslot(memslot, gfn) |
+ (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+ return 0;
+}
+
+static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long tce)
+{
+ unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
+ enum dma_data_direction dir = iommu_tce_direction(tce);
+ struct kvmppc_spapr_tce_iommu_table *stit;
+ unsigned long ua = 0;
+
+ /* Allow userspace to poison TCE table */
+ if (dir == DMA_NONE)
+ return H_SUCCESS;
+
+ if (iommu_tce_check_gpa(stt->page_shift, gpa))
+ return H_TOO_HARD;
+
+ if (kvmppc_tce_to_ua(stt->kvm, tce, &ua))
+ return H_TOO_HARD;
+
+ list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
+ unsigned long hpa = 0;
+ struct mm_iommu_table_group_mem_t *mem;
+ long shift = stit->tbl->it_page_shift;
+
+ mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
+ if (!mem)
+ return H_TOO_HARD;
+
+ if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
+ return H_TOO_HARD;
+ }
+
+ return H_SUCCESS;
+}
+
+/*
+ * Handles TCE requests for emulated devices.
+ * Puts guest TCE values to the table and expects user space to convert them.
+ * Cannot fail so kvmppc_tce_validate must be called before it.
+ */
+static void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+ unsigned long idx, unsigned long tce)
+{
+ struct page *page;
+ u64 *tbl;
+ unsigned long sttpage;
+
+ idx -= stt->offset;
+ sttpage = idx / TCES_PER_PAGE;
+ page = stt->pages[sttpage];
+
+ if (!page) {
+ /* We allow any TCE, not just with read|write permissions */
+ if (!tce)
+ return;
+
+ page = kvm_spapr_get_tce_page(stt, sttpage);
+ if (!page)
+ return;
+ }
+ tbl = page_to_virt(page);
+
+ tbl[idx % TCES_PER_PAGE] = tce;
+}
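The index split above works in units of TCES_PER_PAGE = PAGE_SIZE / sizeof(u64), i.e. 8192 entries per 64 KiB page (512 for 4 KiB pages). A worked example of the arithmetic, standalone and assuming 64 KiB pages:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 65536;	/* 64 KiB assumed */
	unsigned long tces_per_page = page_size / sizeof(unsigned long long); /* 8192 */
	unsigned long offset = 0, idx = 20000;	/* hypothetical values */

	idx -= offset;
	printf("backing page %lu, slot %lu\n",
	       idx / tces_per_page, idx % tces_per_page);	/* page 2, slot 3616 */
	return 0;
}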
+
+static void kvmppc_clear_tce(struct mm_struct *mm, struct iommu_table *tbl,
+ unsigned long entry)
{
unsigned long hpa = 0;
enum dma_data_direction dir = DMA_NONE;
- iommu_tce_xchg(tbl, entry, &hpa, &dir);
+ iommu_tce_xchg_no_kill(mm, tbl, entry, &hpa, &dir);
}
static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -376,11 +424,10 @@
{
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
- __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry);
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
if (!pua)
- /* it_userspace allocation might be delayed */
- return H_TOO_HARD;
+ return H_SUCCESS;
mem = mm_iommu_lookup(kvm->mm, be64_to_cpu(*pua), pgsize);
if (!mem)
@@ -400,15 +447,16 @@
unsigned long hpa = 0;
long ret;
- if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
- return H_HARDWARE;
+ if (WARN_ON_ONCE(iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa,
+ &dir)))
+ return H_TOO_HARD;
if (dir == DMA_NONE)
return H_SUCCESS;
ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry);
if (ret != H_SUCCESS)
- iommu_tce_xchg(tbl, entry, &hpa, &dir);
+ iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
return ret;
}
@@ -449,15 +497,15 @@
return H_TOO_HARD;
if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
- return H_HARDWARE;
+ return H_TOO_HARD;
if (mm_iommu_mapped_inc(mem))
- return H_CLOSED;
+ return H_TOO_HARD;
- ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
+ ret = iommu_tce_xchg_no_kill(kvm->mm, tbl, entry, &hpa, &dir);
if (WARN_ON_ONCE(ret)) {
mm_iommu_mapped_dec(mem);
- return H_HARDWARE;
+ return H_TOO_HARD;
}
if (dir != DMA_NONE)
@@ -509,16 +557,15 @@
if (ret != H_SUCCESS)
return ret;
+ idx = srcu_read_lock(&vcpu->kvm->srcu);
+
ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS)
- return ret;
+ goto unlock_exit;
dir = iommu_tce_direction(tce);
- idx = srcu_read_lock(&vcpu->kvm->srcu);
-
- if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
- tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
+ if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
ret = H_PARAMETER;
goto unlock_exit;
}
@@ -533,14 +580,12 @@
ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
entry, ua, dir);
- if (ret == H_SUCCESS)
- continue;
+ iommu_tce_kill(stit->tbl, entry, 1);
- if (ret == H_TOO_HARD)
+ if (ret != H_SUCCESS) {
+ kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
goto unlock_exit;
-
- WARN_ON_ONCE(1);
- kvmppc_clear_tce(stit->tbl, entry);
+ }
}
kvmppc_tce_put(stt, entry, tce);
@@ -583,7 +628,7 @@
return ret;
idx = srcu_read_lock(&vcpu->kvm->srcu);
- if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
+ if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
@@ -599,30 +644,49 @@
ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS)
goto unlock_exit;
+ }
- if (kvmppc_gpa_to_ua(vcpu->kvm,
- tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
- &ua, NULL))
- return H_PARAMETER;
+ for (i = 0; i < npages; ++i) {
+ /*
+ * This looks unsafe, because we validate, then regrab
+ * the TCE from userspace which could have been changed by
+ * another thread.
+ *
+ * But it actually is safe, because the relevant checks will be
+ * re-executed in the following code. If userspace tries to
+ * change this dodgily it will result in a messier failure mode
+ * but won't threaten the host.
+ */
+ if (get_user(tce, tces + i)) {
+ ret = H_TOO_HARD;
+ goto invalidate_exit;
+ }
+ tce = be64_to_cpu(tce);
+
+ if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua)) {
+ ret = H_PARAMETER;
+ goto invalidate_exit;
+ }
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
ret = kvmppc_tce_iommu_map(vcpu->kvm, stt,
stit->tbl, entry + i, ua,
iommu_tce_direction(tce));
- if (ret == H_SUCCESS)
- continue;
-
- if (ret == H_TOO_HARD)
- goto unlock_exit;
-
- WARN_ON_ONCE(1);
- kvmppc_clear_tce(stit->tbl, entry);
+ if (ret != H_SUCCESS) {
+ kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl,
+ entry);
+ goto invalidate_exit;
+ }
}
kvmppc_tce_put(stt, entry + i, tce);
}
+invalidate_exit:
+ list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+ iommu_tce_kill(stit->tbl, entry, npages);
+
unlock_exit:
srcu_read_unlock(&vcpu->kvm->srcu, idx);
@@ -661,16 +725,20 @@
continue;
if (ret == H_TOO_HARD)
- return ret;
+ goto invalidate_exit;
WARN_ON_ONCE(1);
- kvmppc_clear_tce(stit->tbl, entry);
+ kvmppc_clear_tce(vcpu->kvm->mm, stit->tbl, entry);
}
}
for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
- return H_SUCCESS;
+invalidate_exit:
+ list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+ iommu_tce_kill(stit->tbl, ioba >> stt->page_shift, npages);
+
+ return ret;
}
EXPORT_SYMBOL_GPL(kvmppc_h_stuff_tce);
diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c
index 6821ead..ab6eeb8 100644
--- a/arch/powerpc/kvm/book3s_64_vio_hv.c
+++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright 2010 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
* Copyright 2011 David Gibson, IBM Corporation <dwg@au1.ibm.com>
@@ -66,8 +55,6 @@
#endif
-#define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64))
-
/*
* Finds a TCE table descriptor by LIOBN.
*
@@ -87,6 +74,26 @@
}
EXPORT_SYMBOL_GPL(kvmppc_find_table);
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long tce,
+ unsigned long *ua, unsigned long **prmap)
+{
+ unsigned long gfn = tce >> PAGE_SHIFT;
+ struct kvm_memory_slot *memslot;
+
+ memslot = search_memslots(kvm_memslots_raw(kvm), gfn);
+ if (!memslot)
+ return -EINVAL;
+
+ *ua = __gfn_to_hva_memslot(memslot, gfn) |
+ (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+
+ if (prmap)
+ *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
+
+ return 0;
+}
+
/*
* Validates TCE address.
* At the moment flags and page mask are validated.
@@ -94,14 +101,14 @@
* to the table and user space is supposed to process them), we can skip
* checking other things (such as TCE is a guest RAM address or the page
* was actually allocated).
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- * mode on PR KVM
*/
-long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
+static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long tce)
{
unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
enum dma_data_direction dir = iommu_tce_direction(tce);
+ struct kvmppc_spapr_tce_iommu_table *stit;
+ unsigned long ua = 0;
/* Allow userspace to poison TCE table */
if (dir == DMA_NONE)
@@ -110,9 +117,24 @@
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_PARAMETER;
+ if (kvmppc_rm_tce_to_ua(stt->kvm, tce, &ua, NULL))
+ return H_TOO_HARD;
+
+ list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
+ unsigned long hpa = 0;
+ struct mm_iommu_table_group_mem_t *mem;
+ long shift = stit->tbl->it_page_shift;
+
+ mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
+ if (!mem)
+ return H_TOO_HARD;
+
+ if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
+ return H_TOO_HARD;
+ }
+
return H_SUCCESS;
}
-EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
/* Note on the use of page_address() in real mode,
*
@@ -144,13 +166,9 @@
/*
* Handles TCE requests for emulated devices.
* Puts guest TCE values to the table and expects user space to convert them.
- * Called in both real and virtual modes.
- * Cannot fail so kvmppc_tce_validate must be called before it.
- *
- * WARNING: This will be called in real-mode on HV KVM and virtual
- * mode on PR KVM
+ * Cannot fail so kvmppc_rm_tce_validate must be called before it.
*/
-void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
+static void kvmppc_rm_tce_put(struct kvmppc_spapr_tce_table *stt,
unsigned long idx, unsigned long tce)
{
struct page *page;
@@ -158,46 +176,60 @@
idx -= stt->offset;
page = stt->pages[idx / TCES_PER_PAGE];
+ /*
+ * page must not be NULL in real mode,
+ * kvmppc_rm_ioba_validate() must have taken care of this.
+ */
+ WARN_ON_ONCE_RM(!page);
tbl = kvmppc_page_address(page);
tbl[idx % TCES_PER_PAGE] = tce;
}
-EXPORT_SYMBOL_GPL(kvmppc_tce_put);
-long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
- unsigned long *ua, unsigned long **prmap)
+/*
+ * TCEs pages are allocated in kvmppc_rm_tce_put() which won't be able to do so
+ * in real mode.
+ * Check if kvmppc_rm_tce_put() can succeed in real mode, i.e. a TCEs page is
+ * allocated or not required (when clearing a tce entry).
+ */
+static long kvmppc_rm_ioba_validate(struct kvmppc_spapr_tce_table *stt,
+ unsigned long ioba, unsigned long npages, bool clearing)
{
- unsigned long gfn = gpa >> PAGE_SHIFT;
- struct kvm_memory_slot *memslot;
+ unsigned long i, idx, sttpage, sttpages;
+ unsigned long ret = kvmppc_ioba_validate(stt, ioba, npages);
- memslot = search_memslots(kvm_memslots(kvm), gfn);
- if (!memslot)
- return -EINVAL;
+ if (ret)
+ return ret;
+ /*
+ * clearing==true says kvmppc_rm_tce_put won't be allocating pages
+ * for empty tces.
+ */
+ if (clearing)
+ return H_SUCCESS;
- *ua = __gfn_to_hva_memslot(memslot, gfn) |
- (gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
+ idx = (ioba >> stt->page_shift) - stt->offset;
+ sttpage = idx / TCES_PER_PAGE;
+ sttpages = _ALIGN_UP(idx % TCES_PER_PAGE + npages, TCES_PER_PAGE) /
+ TCES_PER_PAGE;
+ for (i = sttpage; i < sttpage + sttpages; ++i)
+ if (!stt->pages[i])
+ return H_TOO_HARD;
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- if (prmap)
- *prmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
-#endif
-
- return 0;
+ return H_SUCCESS;
}
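kvmppc_rm_ioba_validate() must pre-check every backing page the request touches, since a run of npages entries can straddle a page boundary. A standalone sketch of the span arithmetic, with _ALIGN_UP open-coded and 4 KiB pages assumed:

#include <stdio.h>

#define ALIGN_UP(x, a)	(((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
	unsigned long tces_per_page = 512;	/* 4 KiB pages assumed */
	unsigned long idx = 1022, npages = 4;	/* entries 1022..1025 */
	unsigned long first = idx / tces_per_page;
	unsigned long span = ALIGN_UP(idx % tces_per_page + npages,
				      tces_per_page) / tces_per_page;

	printf("check pages %lu..%lu\n", first, first + span - 1);	/* 1..2 */
	return 0;
}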
-EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
-static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
+static long iommu_tce_xchg_no_kill_rm(struct mm_struct *mm,
+ struct iommu_table *tbl,
unsigned long entry, unsigned long *hpa,
enum dma_data_direction *direction)
{
long ret;
- ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction);
+ ret = tbl->it_ops->xchg_no_kill(tbl, entry, hpa, direction, true);
if (!ret && ((*direction == DMA_FROM_DEVICE) ||
(*direction == DMA_BIDIRECTIONAL))) {
- __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
/*
* kvmppc_rm_tce_iommu_do_map() updates the UA cache after
* calling this so we still get here a valid UA.
@@ -209,13 +241,20 @@
return ret;
}
+extern void iommu_tce_kill_rm(struct iommu_table *tbl,
+ unsigned long entry, unsigned long pages)
+{
+ if (tbl->it_ops->tce_kill)
+ tbl->it_ops->tce_kill(tbl, entry, pages, true);
+}
+
static void kvmppc_rm_clear_tce(struct kvm *kvm, struct iommu_table *tbl,
unsigned long entry)
{
unsigned long hpa = 0;
enum dma_data_direction dir = DMA_NONE;
- iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+ iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
}
static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm,
@@ -223,7 +262,7 @@
{
struct mm_iommu_table_group_mem_t *mem = NULL;
const unsigned long pgsize = 1ULL << tbl->it_page_shift;
- __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
if (!pua)
/* it_userspace allocation might be delayed */
@@ -247,7 +286,7 @@
unsigned long hpa = 0;
long ret;
- if (iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir))
+ if (iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir))
/*
* real mode xchg can fail if struct page crosses
* a page boundary
@@ -259,7 +298,7 @@
ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry);
if (ret)
- iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+ iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
return ret;
}
@@ -287,7 +326,7 @@
{
long ret;
unsigned long hpa = 0;
- __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RM(tbl, entry);
+ __be64 *pua = IOMMU_TABLE_USERSPACE_ENTRY_RO(tbl, entry);
struct mm_iommu_table_group_mem_t *mem;
if (!pua)
@@ -300,12 +339,12 @@
if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
&hpa)))
- return H_HARDWARE;
+ return H_TOO_HARD;
if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
- return H_CLOSED;
+ return H_TOO_HARD;
- ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
+ ret = iommu_tce_xchg_no_kill_rm(kvm->mm, tbl, entry, &hpa, &dir);
if (ret) {
mm_iommu_mapped_dec(mem);
/*
@@ -364,17 +403,16 @@
if (!stt)
return H_TOO_HARD;
- ret = kvmppc_ioba_validate(stt, ioba, 1);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, 1, tce == 0);
if (ret != H_SUCCESS)
return ret;
- ret = kvmppc_tce_validate(stt, tce);
+ ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS)
return ret;
dir = iommu_tce_direction(tce);
- if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
- tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
+ if ((dir != DMA_NONE) && kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
return H_PARAMETER;
entry = ioba >> stt->page_shift;
@@ -387,17 +425,15 @@
ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
stit->tbl, entry, ua, dir);
- if (ret == H_SUCCESS)
- continue;
+ iommu_tce_kill_rm(stit->tbl, entry, 1);
- if (ret == H_TOO_HARD)
+ if (ret != H_SUCCESS) {
+ kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
return ret;
-
- WARN_ON_ONCE_RM(1);
- kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+ }
}
- kvmppc_tce_put(stt, entry, tce);
+ kvmppc_rm_tce_put(stt, entry, tce);
return H_SUCCESS;
}
@@ -468,7 +504,7 @@
if (tce_list & (SZ_4K - 1))
return H_PARAMETER;
- ret = kvmppc_ioba_validate(stt, ioba, npages);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, npages, false);
if (ret != H_SUCCESS)
return ret;
@@ -480,7 +516,7 @@
*/
struct mm_iommu_table_group_mem_t *mem;
- if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
return H_TOO_HARD;
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
@@ -496,12 +532,12 @@
* We do not require memory to be preregistered in this case
* so lock rmap and do __find_linux_pte_or_hugepte().
*/
- if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
return H_TOO_HARD;
rmap = (void *) vmalloc_to_phys(rmap);
if (WARN_ON_ONCE_RM(!rmap))
- return H_HARDWARE;
+ return H_TOO_HARD;
/*
* Synchronize with the MMU notifier callbacks in
@@ -521,34 +557,39 @@
for (i = 0; i < npages; ++i) {
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
- ret = kvmppc_tce_validate(stt, tce);
+ ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS)
goto unlock_exit;
+ }
+
+ for (i = 0; i < npages; ++i) {
+ unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ua = 0;
- if (kvmppc_gpa_to_ua(vcpu->kvm,
- tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
- &ua, NULL))
- return H_PARAMETER;
+ if (kvmppc_rm_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
+ ret = H_PARAMETER;
+ goto invalidate_exit;
+ }
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
stit->tbl, entry + i, ua,
iommu_tce_direction(tce));
- if (ret == H_SUCCESS)
- continue;
-
- if (ret == H_TOO_HARD)
- goto unlock_exit;
-
- WARN_ON_ONCE_RM(1);
- kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
+ if (ret != H_SUCCESS) {
+ kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
+ entry);
+ goto invalidate_exit;
+ }
}
- kvmppc_tce_put(stt, entry + i, tce);
+ kvmppc_rm_tce_put(stt, entry + i, tce);
}
+invalidate_exit:
+ list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+ iommu_tce_kill_rm(stit->tbl, entry, npages);
+
unlock_exit:
if (rmap)
unlock_rmap(rmap);
@@ -572,7 +613,7 @@
if (!stt)
return H_TOO_HARD;
- ret = kvmppc_ioba_validate(stt, ioba, npages);
+ ret = kvmppc_rm_ioba_validate(stt, ioba, npages, tce_value == 0);
if (ret != H_SUCCESS)
return ret;
@@ -591,7 +632,7 @@
continue;
if (ret == H_TOO_HARD)
- return ret;
+ goto invalidate_exit;
WARN_ON_ONCE_RM(1);
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
@@ -599,9 +640,13 @@
}
for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift))
- kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value);
+ kvmppc_rm_tce_put(stt, ioba >> stt->page_shift, tce_value);
- return H_SUCCESS;
+invalidate_exit:
+ list_for_each_entry_lockless(stit, &stt->iommu_tables, next)
+ iommu_tce_kill_rm(stit->tbl, ioba >> stt->page_shift, npages);
+
+ return ret;
}
/* This can be called in either virtual mode or real mode */
@@ -624,6 +669,10 @@
idx = (ioba >> stt->page_shift) - stt->offset;
page = stt->pages[idx / TCES_PER_PAGE];
+ if (!page) {
+ vcpu->arch.regs.gpr[4] = 0;
+ return H_SUCCESS;
+ }
tbl = (u64 *)page_address(page);
vcpu->arch.regs.gpr[4] = tbl[idx % TCES_PER_PAGE];
diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c
index 36b11c5..dad71d2 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
@@ -36,7 +25,6 @@
#define OP_31_XOP_MTSR 210
#define OP_31_XOP_MTSRIN 242
#define OP_31_XOP_TLBIEL 274
-#define OP_31_XOP_TLBIE 306
/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
#define OP_31_XOP_FAKE_SC1 308
#define OP_31_XOP_SLBMTE 402
@@ -48,6 +36,7 @@
#define OP_31_XOP_SLBMFEV 851
#define OP_31_XOP_EIOIO 854
#define OP_31_XOP_SLBMFEE 915
+#define OP_31_XOP_SLBFEE 979
#define OP_31_XOP_TBEGIN 654
#define OP_31_XOP_TABORT 910
@@ -110,7 +99,7 @@
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
vcpu->arch.tar_tm = vcpu->arch.tar;
vcpu->arch.lr_tm = vcpu->arch.regs.link;
- vcpu->arch.cr_tm = vcpu->arch.cr;
+ vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
}
@@ -129,7 +118,7 @@
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
vcpu->arch.tar = vcpu->arch.tar_tm;
vcpu->arch.regs.link = vcpu->arch.lr_tm;
- vcpu->arch.cr = vcpu->arch.cr_tm;
+ vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
}
@@ -141,7 +130,7 @@
uint64_t texasr;
/* CR0 = 0 | MSR[TS] | 0 */
- vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT);
@@ -220,7 +209,7 @@
tm_abort(ra_val);
/* CR0 = 0 | MSR[TS] | 0 */
- vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT);
@@ -417,6 +406,23 @@
vcpu->arch.mmu.slbia(vcpu);
break;
+ case OP_31_XOP_SLBFEE:
+ if (!(inst & 1) || !vcpu->arch.mmu.slbfee) {
+ return EMULATE_FAIL;
+ } else {
+ ulong b, t;
+ ulong cr = kvmppc_get_cr(vcpu) & ~CR0_MASK;
+
+ b = kvmppc_get_gpr(vcpu, rb);
+ if (!vcpu->arch.mmu.slbfee(vcpu, b, &t))
+ cr |= 2 << CR0_SHIFT;
+ kvmppc_set_gpr(vcpu, rt, t);
+ /* copy XER[SO] bit to CR0[SO] */
+ cr |= (vcpu->arch.regs.xer & 0x80000000) >>
+ (31 - CR0_SHIFT);
+ kvmppc_set_cr(vcpu, cr);
+ }
+ break;
case OP_31_XOP_SLBMFEE:
if (!vcpu->arch.mmu.slbmfee) {
emulated = EMULATE_FAIL;
@@ -494,8 +500,8 @@
if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
preempt_disable();
- vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
- (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
+ vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
+ (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
diff --git a/arch/powerpc/kvm/book3s_exports.c b/arch/powerpc/kvm/book3s_exports.c
index 0d013fb..f085658 100644
--- a/arch/powerpc/kvm/book3s_exports.c
+++ b/arch/powerpc/kvm/book3s_exports.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 3e3a715..709cf1f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
* Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
@@ -12,10 +13,6 @@
*
* This file is derived from arch/powerpc/kvm/book3s.c,
* by Alexander Graf <agraf@suse.de>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -50,6 +47,7 @@
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
+#include <asm/archrandom.h>
#include <asm/debug.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
@@ -73,6 +71,7 @@
#include <asm/opal.h>
#include <asm/xics.h>
#include <asm/xive.h>
+#include <asm/hw_breakpoint.h>
#include "book3s.h"
@@ -104,6 +103,10 @@
module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
+static bool one_vm_per_core;
+module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
+
#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
.set = param_set_int,
@@ -117,6 +120,16 @@
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif
+/* If set, guests are allowed to create and control nested guests */
+static bool nested = true;
+module_param(nested, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
+
+static inline bool nesting_enabled(struct kvm *kvm)
+{
+ return kvm->arch.nested_enable && kvm_is_radix(kvm);
+}
+
/* If set, the threads on each CPU core have to be in the same MMU mode */
static bool no_mixing_hpt_and_radix;
@@ -173,6 +186,10 @@
{
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+ /* If we're a nested hypervisor, fall back to ordinary IPIs for now */
+ if (kvmhv_on_pseries())
+ return false;
+
/* On POWER9 we can use msgsnd to IPI any cpu */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
@@ -384,8 +401,11 @@
spin_lock(&vc->lock);
vc->arch_compat = arch_compat;
- /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */
- vc->pcr = host_pcr_bit - guest_pcr_bit;
+ /*
+ * Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit
+ * Also set all reserved PCR bits
+ */
+ vc->pcr = (host_pcr_bit - guest_pcr_bit) | PCR_MASK;
spin_unlock(&vc->lock);
return 0;
@@ -410,8 +430,8 @@
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
- pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
- vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
+ pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
+ vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
pr_err("fault dar = %.16lx dsisr = %.8x\n",
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
@@ -426,12 +446,7 @@
static struct kvm_vcpu *kvmppc_find_vcpu(struct kvm *kvm, int id)
{
- struct kvm_vcpu *ret;
-
- mutex_lock(&kvm->lock);
- ret = kvm_get_vcpu_by_id(kvm, id);
- mutex_unlock(&kvm->lock);
- return ret;
+ return kvm_get_vcpu_by_id(kvm, id);
}
static void init_vpa(struct kvm_vcpu *vcpu, struct lppaca *vpa)
@@ -730,8 +745,7 @@
/*
* Ensure that the read of vcore->dpdes comes after the read
* of vcpu->doorbell_request. This barrier matches the
- * lwsync in book3s_hv_rmhandlers.S just before the
- * fast_guest_return label.
+ * smp_wmb() in kvmppc_guest_entry_inject().
*/
smp_rmb();
vc = vcpu->arch.vcore;
@@ -783,6 +797,80 @@
}
}
+/* Copy guest memory in place - must reside within a single memslot */
+static int kvmppc_copy_guest(struct kvm *kvm, gpa_t to, gpa_t from,
+ unsigned long len)
+{
+ struct kvm_memory_slot *to_memslot = NULL;
+ struct kvm_memory_slot *from_memslot = NULL;
+ unsigned long to_addr, from_addr;
+ int r;
+
+ /* Get HPA for from address */
+ from_memslot = gfn_to_memslot(kvm, from >> PAGE_SHIFT);
+ if (!from_memslot)
+ return -EFAULT;
+ if ((from + len) >= ((from_memslot->base_gfn + from_memslot->npages)
+ << PAGE_SHIFT))
+ return -EINVAL;
+ from_addr = gfn_to_hva_memslot(from_memslot, from >> PAGE_SHIFT);
+ if (kvm_is_error_hva(from_addr))
+ return -EFAULT;
+ from_addr |= (from & (PAGE_SIZE - 1));
+
+ /* Get HPA for to address */
+ to_memslot = gfn_to_memslot(kvm, to >> PAGE_SHIFT);
+ if (!to_memslot)
+ return -EFAULT;
+ if ((to + len) >= ((to_memslot->base_gfn + to_memslot->npages)
+ << PAGE_SHIFT))
+ return -EINVAL;
+ to_addr = gfn_to_hva_memslot(to_memslot, to >> PAGE_SHIFT);
+ if (kvm_is_error_hva(to_addr))
+ return -EFAULT;
+ to_addr |= (to & (PAGE_SIZE - 1));
+
+ /* Perform copy */
+ r = raw_copy_in_user((void __user *)to_addr, (void __user *)from_addr,
+ len);
+ if (r)
+ return -EFAULT;
+ mark_page_dirty(kvm, to >> PAGE_SHIFT);
+ return 0;
+}
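kvmppc_copy_guest() forms each host virtual address from the page-aligned gfn_to_hva_memslot() result plus the low in-page bits of the guest address. The composition, standalone, assuming 4 KiB pages and a hypothetical HVA:

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed */
	unsigned long hva_page = 0x7f0000400000UL;	/* hypothetical lookup result */
	unsigned long gpa = 0x40000123UL;

	printf("hva = 0x%lx\n", hva_page | (gpa & (page_size - 1)));	/* ...400123 */
	return 0;
}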
+
+static long kvmppc_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long dest, unsigned long src)
+{
+ u64 pg_sz = SZ_4K; /* 4K page size */
+ u64 pg_mask = SZ_4K - 1;
+ int ret;
+
+ /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
+ if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
+ H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
+ return H_PARAMETER;
+
+ /* dest (and src if copy_page flag set) must be page aligned */
+ if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
+ return H_PARAMETER;
+
+ /* zero and/or copy the page as determined by the flags */
+ if (flags & H_COPY_PAGE) {
+ ret = kvmppc_copy_guest(vcpu->kvm, dest, src, pg_sz);
+ if (ret < 0)
+ return H_PARAMETER;
+ } else if (flags & H_ZERO_PAGE) {
+ ret = kvm_clear_guest(vcpu->kvm, dest, pg_sz);
+ if (ret < 0)
+ return H_PARAMETER;
+ }
+
+ /* We can ignore the remaining flags */
+
+ return H_SUCCESS;
+}
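H_PAGE_INIT rejects any destination (and, with H_COPY_PAGE, any source) that is not 4 KiB aligned. The mask test, shown standalone with hypothetical addresses:

#include <stdio.h>

int main(void)
{
	unsigned long pg_mask = 4096 - 1;
	unsigned long dest_ok = 0x12345000UL, dest_bad = 0x12345100UL;

	printf("%d %d\n", !(dest_ok & pg_mask), !(dest_bad & pg_mask));	/* 1 0 */
	return 0;
}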
+
static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
{
struct kvmppc_vcore *vcore = target->arch.vcore;
@@ -904,7 +992,7 @@
case H_IPOLL:
case H_XIRR_X:
if (kvmppc_xics_enabled(vcpu)) {
- if (xive_enabled()) {
+ if (xics_on_xive()) {
ret = H_NOT_AVAILABLE;
return RESUME_GUEST;
}
@@ -912,6 +1000,20 @@
break;
}
return RESUME_HOST;
+ case H_SET_DABR:
+ ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
+ break;
+ case H_SET_XDABR:
+ ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ break;
+#ifdef CONFIG_SPAPR_TCE_IOMMU
+ case H_GET_TCE:
+ ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5));
+ if (ret == H_TOO_HARD)
+ return RESUME_HOST;
+ break;
case H_PUT_TCE:
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
@@ -935,6 +1037,47 @@
if (ret == H_TOO_HARD)
return RESUME_HOST;
break;
+#endif
+ case H_RANDOM:
+ if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
+ ret = H_HARDWARE;
+ break;
+
+ case H_SET_PARTITION_TABLE:
+ ret = H_FUNCTION;
+ if (nesting_enabled(vcpu->kvm))
+ ret = kvmhv_set_partition_table(vcpu);
+ break;
+ case H_ENTER_NESTED:
+ ret = H_FUNCTION;
+ if (!nesting_enabled(vcpu->kvm))
+ break;
+ ret = kvmhv_enter_nested_guest(vcpu);
+ if (ret == H_INTERRUPT) {
+ kvmppc_set_gpr(vcpu, 3, 0);
+ vcpu->arch.hcall_needed = 0;
+ return -EINTR;
+ } else if (ret == H_TOO_HARD) {
+ kvmppc_set_gpr(vcpu, 3, 0);
+ vcpu->arch.hcall_needed = 0;
+ return RESUME_HOST;
+ }
+ break;
+ case H_TLB_INVALIDATE:
+ ret = H_FUNCTION;
+ if (nesting_enabled(vcpu->kvm))
+ ret = kvmhv_do_nested_tlbie(vcpu);
+ break;
+ case H_COPY_TOFROM_GUEST:
+ ret = H_FUNCTION;
+ if (nesting_enabled(vcpu->kvm))
+ ret = kvmhv_copy_tofrom_guest_nested(vcpu);
+ break;
+ case H_PAGE_INIT:
+ ret = kvmppc_h_page_init(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5),
+ kvmppc_get_gpr(vcpu, 6));
+ break;
default:
return RESUME_HOST;
}
@@ -943,6 +1086,24 @@
return RESUME_GUEST;
}
+/*
+ * Handle H_CEDE in the nested virtualization case where we haven't
+ * called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
+ * This has to be done early, not in kvmppc_pseries_do_hcall(), so
+ * that the cede logic in kvmppc_run_single_vcpu() works properly.
+ */
+static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.shregs.msr |= MSR_EE;
+ vcpu->arch.ceded = 1;
+ smp_mb();
+ if (vcpu->arch.prodded) {
+ vcpu->arch.prodded = 0;
+ smp_mb();
+ vcpu->arch.ceded = 0;
+ }
+}
+
static int kvmppc_hcall_impl_hv(unsigned long cmd)
{
switch (cmd) {
@@ -961,6 +1122,7 @@
case H_IPOLL:
case H_XIRR_X:
#endif
+ case H_PAGE_INIT:
return 1;
}
@@ -1085,7 +1247,6 @@
return RESUME_GUEST;
}
-/* Called with vcpu->arch.vcore->lock held */
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct task_struct *tsk)
{
@@ -1132,6 +1293,22 @@
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /* Print the MCE event to host console. */
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
+
+ /*
+ * If the guest can do FWNMI, exit to userspace so it can
+ * deliver a FWNMI to the guest.
+ * Otherwise we synthesize a machine check for the guest
+ * so that it knows that the machine check occurred.
+ */
+ if (!vcpu->kvm->arch.fwnmi_enabled) {
+ ulong flags = vcpu->arch.shregs.msr & 0x083c0000;
+ kvmppc_core_queue_machine_check(vcpu, flags);
+ r = RESUME_GUEST;
+ break;
+ }
+
/* Exit to guest with KVM_EXIT_NMI as exit reason */
run->exit_reason = KVM_EXIT_NMI;
run->hw.hardware_exit_reason = vcpu->arch.trap;
@@ -1144,8 +1321,6 @@
run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
r = RESUME_HOST;
- /* Print the MCE event to host console. */
- machine_check_print_event_info(&vcpu->arch.mce_evt, false);
break;
case BOOK3S_INTERRUPT_PROGRAM:
{
@@ -1190,7 +1365,10 @@
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
- vcpu->arch.fault_dsisr = 0;
+ vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
+ DSISR_SRR1_MATCH_64S;
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
r = RESUME_PAGE_FAULT;
break;
/*
@@ -1206,10 +1384,7 @@
swab32(vcpu->arch.emul_inst) :
vcpu->arch.emul_inst;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
- /* Need vcore unlocked to call kvmppc_get_last_inst */
- spin_unlock(&vcpu->arch.vcore->lock);
r = kvmppc_emulate_debug_inst(run, vcpu);
- spin_lock(&vcpu->arch.vcore->lock);
} else {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
@@ -1225,12 +1400,8 @@
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
r = EMULATE_FAIL;
if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
- cpu_has_feature(CPU_FTR_ARCH_300)) {
- /* Need vcore unlocked to call kvmppc_get_last_inst */
- spin_unlock(&vcpu->arch.vcore->lock);
+ cpu_has_feature(CPU_FTR_ARCH_300))
r = kvmppc_emulate_doorbell_instr(vcpu);
- spin_lock(&vcpu->arch.vcore->lock);
- }
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
@@ -1265,6 +1436,104 @@
return r;
}
+static int kvmppc_handle_nested_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+ int r;
+ int srcu_idx;
+
+ vcpu->stat.sum_exits++;
+
+ /*
+ * This can happen if an interrupt occurs in the last stages
+ * of guest entry or the first stages of guest exit (i.e. after
+ * setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
+ * and before setting it to KVM_GUEST_MODE_HOST_HV).
+ * That can happen due to a bug, or due to a machine check
+ * occurring at just the wrong time.
+ */
+ if (vcpu->arch.shregs.msr & MSR_HV) {
+ pr_emerg("KVM trap in HV mode while nested!\n");
+ pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
+ vcpu->arch.trap, kvmppc_get_pc(vcpu),
+ vcpu->arch.shregs.msr);
+ kvmppc_dump_regs(vcpu);
+ return RESUME_HOST;
+ }
+ switch (vcpu->arch.trap) {
+ /* We're good on these - the host merely wanted to get our attention */
+ case BOOK3S_INTERRUPT_HV_DECREMENTER:
+ vcpu->stat.dec_exits++;
+ r = RESUME_GUEST;
+ break;
+ case BOOK3S_INTERRUPT_EXTERNAL:
+ vcpu->stat.ext_intr_exits++;
+ r = RESUME_HOST;
+ break;
+ case BOOK3S_INTERRUPT_H_DOORBELL:
+ case BOOK3S_INTERRUPT_H_VIRT:
+ vcpu->stat.ext_intr_exits++;
+ r = RESUME_GUEST;
+ break;
+ /* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
+ case BOOK3S_INTERRUPT_HMI:
+ case BOOK3S_INTERRUPT_PERFMON:
+ case BOOK3S_INTERRUPT_SYSTEM_RESET:
+ r = RESUME_GUEST;
+ break;
+ case BOOK3S_INTERRUPT_MACHINE_CHECK:
+ /* Pass the machine check to the L1 guest */
+ r = RESUME_HOST;
+ /* Print the MCE event to host console. */
+ machine_check_print_event_info(&vcpu->arch.mce_evt, false, true);
+ break;
+ /*
+ * We get these next two if the guest accesses a page which it thinks
+ * it has mapped but which is not actually present, either because
+ * it is for an emulated I/O device or because the corresponding
+ * host page has been paged out.
+ */
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmhv_nested_page_fault(run, vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ break;
+ case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
+ vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
+ DSISR_SRR1_MATCH_64S;
+ if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
+ vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ r = kvmhv_nested_page_fault(run, vcpu);
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ break;
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+ case BOOK3S_INTERRUPT_HV_SOFTPATCH:
+ /*
+ * This occurs for various TM-related instructions that
+ * we need to emulate on POWER9 DD2.2. We have already
+ * handled the cases where the guest was in real-suspend
+ * mode and was transitioning to transactional state.
+ */
+ r = kvmhv_p9_tm_emulation(vcpu);
+ break;
+#endif
+
+ case BOOK3S_INTERRUPT_HV_RM_HARD:
+ vcpu->arch.trap = 0;
+ r = RESUME_GUEST;
+ if (!xics_on_xive())
+ kvmppc_xics_rm_complete(vcpu, 0);
+ break;
+ default:
+ r = RESUME_HOST;
+ break;
+ }
+
+ return r;
+}
+
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
@@ -1309,7 +1578,6 @@
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 mask;
- mutex_lock(&kvm->lock);
spin_lock(&vc->lock);
/*
* If ILE (interrupt little-endian) has changed, update the
@@ -1349,7 +1617,6 @@
mask &= 0xFFFFFFFF;
vc->lpcr = (vc->lpcr & ~mask) | (new_lpcr & mask);
spin_unlock(&vc->lock);
- mutex_unlock(&kvm->lock);
}
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
@@ -1414,7 +1681,14 @@
*val = get_reg_val(id, vcpu->arch.pspb);
break;
case KVM_REG_PPC_DPDES:
- *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
+ /*
+ * On POWER9, where we are emulating msgsndp etc.,
+ * we return 1 bit for each vcpu, which can come from
+ * either vcore->dpdes or doorbell_request.
+ * On POWER8, doorbell_request is 0.
+ */
+ *val = get_reg_val(id, vcpu->arch.vcore->dpdes |
+ vcpu->arch.doorbell_request);
break;
case KVM_REG_PPC_VTB:
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
@@ -1555,6 +1829,9 @@
case KVM_REG_PPC_ONLINE:
*val = get_reg_val(id, vcpu->arch.online);
break;
+ case KVM_REG_PPC_PTCR:
+ *val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
+ break;
default:
r = -EINVAL;
break;
@@ -1786,6 +2063,9 @@
atomic_dec(&vcpu->arch.vcore->online_count);
vcpu->arch.online = i;
break;
+ case KVM_REG_PPC_PTCR:
+ vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
+ break;
default:
r = -EINVAL;
break;
@@ -2019,15 +2299,18 @@
* Set the default HFSCR for the guest from the host value.
* This value is only used on POWER9.
* On POWER9, we want to virtualize the doorbell facility, so we
- * turn off the HFSCR bit, which causes those instructions to trap.
+ * don't set the HFSCR_MSGP bit, and that causes those instructions
+ * to trap and then we emulate them.
*/
- vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
- if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+ vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
+ HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
+ if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+ vcpu->arch.hfscr |= HFSCR_TM;
+ }
+ if (cpu_has_feature(CPU_FTR_TM_COMP))
vcpu->arch.hfscr |= HFSCR_TM;
- else if (!cpu_has_feature(CPU_FTR_TM_COMP))
- vcpu->arch.hfscr &= ~HFSCR_TM;
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- vcpu->arch.hfscr &= ~HFSCR_MSGP;
kvmppc_mmu_book3s_hv_init(vcpu);
@@ -2055,11 +2338,17 @@
pr_devel("KVM: collision on id %u", id);
vcore = NULL;
} else if (!vcore) {
+ /*
+ * Take mmu_setup_lock for mutual exclusion
+ * with kvmppc_update_lpcr().
+ */
err = -ENOMEM;
vcore = kvmppc_vcore_create(kvm,
id & ~(kvm->arch.smt_mode - 1));
+ mutex_lock(&kvm->arch.mmu_setup_lock);
kvm->arch.vcores[core] = vcore;
kvm->arch.online_vcores++;
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
}
}
mutex_unlock(&kvm->lock);
@@ -2160,8 +2449,7 @@
kvmppc_core_prepare_to_enter(vcpu);
return;
}
- dec_nsec = (vcpu->arch.dec_expires - now) * NSEC_PER_SEC
- / tb_ticks_per_sec;
+ dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
vcpu->arch.timer_running = 1;
}
@@ -2242,10 +2530,18 @@
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
+ cpumask_t *cpu_in_guest;
int i;
cpu = cpu_first_thread_sibling(cpu);
- cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+ if (nested) {
+ cpumask_set_cpu(cpu, &nested->need_tlb_flush);
+ cpu_in_guest = &nested->cpu_in_guest;
+ } else {
+ cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
+ cpu_in_guest = &kvm->arch.cpu_in_guest;
+ }
/*
* Make sure setting of bit in need_tlb_flush precedes
* testing of cpu_in_guest bits. The matching barrier on
@@ -2253,13 +2549,23 @@
*/
smp_mb();
for (i = 0; i < threads_per_core; ++i)
- if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
+ if (cpumask_test_cpu(cpu + i, cpu_in_guest))
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
}
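radix_flush_cpu() folds the target CPU down to the first thread of its core because the TLB is shared by all threads there, so one need_tlb_flush bit per core is enough. A standalone sketch of that folding for SMT4, where cpu_first_thread_sibling() reduces to a mask when threads_per_core is a power of two:

#include <stdio.h>

int main(void)
{
	int threads_per_core = 4;	/* SMT4 assumed */
	int cpu = 6;			/* a secondary thread of core 1 */

	printf("flush bit keyed on cpu %d\n", cpu & ~(threads_per_core - 1));	/* 4 */
	return 0;
}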
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
struct kvm *kvm = vcpu->kvm;
+ int prev_cpu;
+
+ if (!cpu_has_feature(CPU_FTR_HVMODE))
+ return;
+
+ if (nested)
+ prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
+ else
+ prev_cpu = vcpu->arch.prev_cpu;
/*
* With radix, the guest can do TLB invalidations itself,
@@ -2273,12 +2579,15 @@
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
- if (vcpu->arch.prev_cpu != pcpu) {
- if (vcpu->arch.prev_cpu >= 0 &&
- cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
+ if (prev_cpu != pcpu) {
+ if (prev_cpu >= 0 &&
+ cpu_first_thread_sibling(prev_cpu) !=
cpu_first_thread_sibling(pcpu))
- radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
- vcpu->arch.prev_cpu = pcpu;
+ radix_flush_cpu(kvm, prev_cpu, vcpu);
+ if (nested)
+ nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
+ else
+ vcpu->arch.prev_cpu = pcpu;
}
}
@@ -2493,6 +2802,10 @@
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false;
+ /* In one_vm_per_core mode, require all vcores to be from the same vm */
+ if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
+ return false;
+
/* Some POWER9 chips require all threads to be in the same MMU mode */
if (no_mixing_hpt_and_radix &&
kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
@@ -2557,7 +2870,7 @@
if (!spin_trylock(&pvc->lock))
continue;
prepare_threads(pvc);
- if (!pvc->n_runnable) {
+ if (!pvc->n_runnable || !pvc->kvm->arch.mmu_ready) {
list_del_init(&pvc->preempt_list);
if (pvc->runner == NULL) {
pvc->vcore_state = VCORE_INACTIVE;
@@ -2578,15 +2891,20 @@
spin_unlock(&lp->lock);
}
-static bool recheck_signals(struct core_info *cip)
+static bool recheck_signals_and_mmu(struct core_info *cip)
{
int sub, i;
struct kvm_vcpu *vcpu;
+ struct kvmppc_vcore *vc;
- for (sub = 0; sub < cip->n_subcores; ++sub)
- for_each_runnable_thread(i, vcpu, cip->vc[sub])
+ for (sub = 0; sub < cip->n_subcores; ++sub) {
+ vc = cip->vc[sub];
+ if (!vc->kvm->arch.mmu_ready)
+ return true;
+ for_each_runnable_thread(i, vcpu, vc)
if (signal_pending(vcpu->arch.run_task))
return true;
+ }
return false;
}
@@ -2600,6 +2918,14 @@
spin_lock(&vc->lock);
now = get_tb();
for_each_runnable_thread(i, vcpu, vc) {
+ /*
+ * It's safe to unlock the vcore in the loop here, because
+ * for_each_runnable_thread() is safe against removal of
+ * the vcpu, and the vcore state is VCORE_EXITING here,
+ * so any vcpus becoming runnable will have their arch.trap
+ * set to zero and can't actually run in the guest.
+ */
+ spin_unlock(&vc->lock);
/* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires &&
kvmppc_core_pending_dec(vcpu))
@@ -2615,6 +2941,7 @@
vcpu->arch.ret = ret;
vcpu->arch.trap = 0;
+ spin_lock(&vc->lock);
if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
if (vcpu->arch.pending_exceptions)
kvmppc_core_prepare_to_enter(vcpu);
@@ -2807,7 +3134,7 @@
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
- recheck_signals(&core_info) || !vc->kvm->arch.mmu_ready) {
+ recheck_signals_and_mmu(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
@@ -2962,31 +3289,11 @@
for (sub = 0; sub < core_info.n_subcores; ++sub)
spin_unlock(&core_info.vc[sub]->lock);
- if (kvm_is_radix(vc->kvm)) {
- int tmp = pcpu;
+ guest_enter_irqoff();
- /*
- * Do we need to flush the process scoped TLB for the LPAR?
- *
- * On POWER9, individual threads can come in here, but the
- * TLB is shared between the 4 threads in a core, hence
- * invalidating on one thread invalidates for all.
- * Thus we make all 4 threads use the same bit here.
- *
- * Hash must be flushed in realmode in order to use tlbiel.
- */
- mtspr(SPRN_LPID, vc->kvm->arch.lpid);
- isync();
+ srcu_idx = srcu_read_lock(&vc->kvm->srcu);
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- tmp &= ~0x3UL;
-
- if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
- radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
- /* Clear the bit after the TLB flush */
- cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
- }
- }
+ this_cpu_disable_ftrace();
/*
* Interrupts will be enabled once we get into the guest,
@@ -2994,19 +3301,14 @@
*/
trace_hardirqs_on();
- guest_enter_irqoff();
-
- srcu_idx = srcu_read_lock(&vc->kvm->srcu);
-
- this_cpu_disable_ftrace();
-
trap = __kvmppc_vcore_entry();
+ trace_hardirqs_off();
+
this_cpu_enable_ftrace();
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
- trace_hardirqs_off();
set_irq_happened(trap);
spin_lock(&vc->lock);
@@ -3080,6 +3382,323 @@
}
/*
+ * Load up hypervisor-mode registers on P9.
+ */
+static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ s64 hdec;
+ u64 tb, purr, spurr;
+ int trap;
+ unsigned long host_hfscr = mfspr(SPRN_HFSCR);
+ unsigned long host_ciabr = mfspr(SPRN_CIABR);
+ unsigned long host_dawr = mfspr(SPRN_DAWR);
+ unsigned long host_dawrx = mfspr(SPRN_DAWRX);
+ unsigned long host_psscr = mfspr(SPRN_PSSCR);
+ unsigned long host_pidr = mfspr(SPRN_PID);
+
+ hdec = time_limit - mftb();
+ if (hdec < 0)
+ return BOOK3S_INTERRUPT_HV_DECREMENTER;
+ mtspr(SPRN_HDEC, hdec);
+
+ if (vc->tb_offset) {
+ u64 new_tb = mftb() + vc->tb_offset;
+ mtspr(SPRN_TBU40, new_tb);
+ tb = mftb();
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
+ vc->tb_offset_applied = vc->tb_offset;
+ }
+
+ if (vc->pcr)
+ mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
+ mtspr(SPRN_DPDES, vc->dpdes);
+ mtspr(SPRN_VTB, vc->vtb);
+
+ local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+ local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+ mtspr(SPRN_PURR, vcpu->arch.purr);
+ mtspr(SPRN_SPURR, vcpu->arch.spurr);
+
+ if (dawr_enabled()) {
+ mtspr(SPRN_DAWR, vcpu->arch.dawr);
+ mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
+ }
+ mtspr(SPRN_CIABR, vcpu->arch.ciabr);
+ mtspr(SPRN_IC, vcpu->arch.ic);
+ mtspr(SPRN_PID, vcpu->arch.pid);
+
+ mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+ mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
+
+ mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
+ mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
+ mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
+ mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
+
+ mtspr(SPRN_AMOR, ~0UL);
+
+ mtspr(SPRN_LPCR, lpcr);
+ isync();
+
+ kvmppc_xive_push_vcpu(vcpu);
+
+ mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
+ mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
+
+ trap = __kvmhv_vcpu_entry_p9(vcpu);
+
+ /* Advance host PURR/SPURR by the amount used by guest */
+ purr = mfspr(SPRN_PURR);
+ spurr = mfspr(SPRN_SPURR);
+ mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
+ purr - vcpu->arch.purr);
+ mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
+ spurr - vcpu->arch.spurr);
+ vcpu->arch.purr = purr;
+ vcpu->arch.spurr = spurr;
+
+ vcpu->arch.ic = mfspr(SPRN_IC);
+ vcpu->arch.pid = mfspr(SPRN_PID);
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+
+ vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
+ vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
+ vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
+ vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
+
+ /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
+ mtspr(SPRN_PSSCR, host_psscr |
+ (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+ mtspr(SPRN_HFSCR, host_hfscr);
+ mtspr(SPRN_CIABR, host_ciabr);
+ mtspr(SPRN_DAWR, host_dawr);
+ mtspr(SPRN_DAWRX, host_dawrx);
+ mtspr(SPRN_PID, host_pidr);
+
+ /*
+ * Since this is radix, do an eieio; tlbsync; ptesync sequence in
+ * case we interrupted the guest between a tlbie and a ptesync.
+ */
+ asm volatile("eieio; tlbsync; ptesync");
+
+ mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
+ isync();
+
+ vc->dpdes = mfspr(SPRN_DPDES);
+ vc->vtb = mfspr(SPRN_VTB);
+ mtspr(SPRN_DPDES, 0);
+ if (vc->pcr)
+ mtspr(SPRN_PCR, PCR_MASK);
+
+ if (vc->tb_offset_applied) {
+ u64 new_tb = mftb() - vc->tb_offset_applied;
+ mtspr(SPRN_TBU40, new_tb);
+ tb = mftb();
+ if ((tb & 0xffffff) < (new_tb & 0xffffff))
+ mtspr(SPRN_TBU40, new_tb + 0x1000000);
+ vc->tb_offset_applied = 0;
+ }
+
+ mtspr(SPRN_HDEC, 0x7fffffff);
+ mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
+
+ return trap;
+}
+
+/*
+ * Virtual-mode guest entry for POWER9 and later when the host and
+ * guest are both using the radix MMU. The LPIDR has already been set.
+ */
+int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ unsigned long host_dscr = mfspr(SPRN_DSCR);
+ unsigned long host_tidr = mfspr(SPRN_TIDR);
+ unsigned long host_iamr = mfspr(SPRN_IAMR);
+ unsigned long host_amr = mfspr(SPRN_AMR);
+ s64 dec;
+ u64 tb;
+ int trap, save_pmu;
+
+ dec = mfspr(SPRN_DEC);
+ tb = mftb();
+ if (dec < 512)
+ return BOOK3S_INTERRUPT_HV_DECREMENTER;
+ local_paca->kvm_hstate.dec_expires = dec + tb;
+ if (local_paca->kvm_hstate.dec_expires < time_limit)
+ time_limit = local_paca->kvm_hstate.dec_expires;
+
+ vcpu->arch.ceded = 0;
+
+ kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
+
+ kvmppc_subcore_enter_guest();
+
+ vc->entry_exit_map = 1;
+ vc->in_guest = 1;
+
+ if (vcpu->arch.vpa.pinned_addr) {
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+ u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+ lp->yield_count = cpu_to_be32(yield_count);
+ vcpu->arch.vpa.dirty = 1;
+ }
+
+ if (cpu_has_feature(CPU_FTR_TM) ||
+ cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+ kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+ kvmhv_load_guest_pmu(vcpu);
+
+ msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+ load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+ load_vr_state(&vcpu->arch.vr);
+#endif
+ mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+
+ mtspr(SPRN_DSCR, vcpu->arch.dscr);
+ mtspr(SPRN_IAMR, vcpu->arch.iamr);
+ mtspr(SPRN_PSPB, vcpu->arch.pspb);
+ mtspr(SPRN_FSCR, vcpu->arch.fscr);
+ mtspr(SPRN_TAR, vcpu->arch.tar);
+ mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+ mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+ mtspr(SPRN_BESCR, vcpu->arch.bescr);
+ mtspr(SPRN_WORT, vcpu->arch.wort);
+ mtspr(SPRN_TIDR, vcpu->arch.tid);
+ mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+ mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+ mtspr(SPRN_AMR, vcpu->arch.amr);
+ mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+
+ if (!(vcpu->arch.ctrl & 1))
+ mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+
+ mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+
+ if (kvmhv_on_pseries()) {
+ /*
+ * We need to save and restore the guest visible part of the
+ * psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
+ * doesn't do this for us. Note this is only required on pseries,
+ * since otherwise it is done in kvmhv_load_hv_regs_and_go() below.
+ */
+ unsigned long host_psscr;
+ /* call our hypervisor to load up HV regs and go */
+ struct hv_guest_state hvregs;
+
+ host_psscr = mfspr(SPRN_PSSCR_PR);
+ mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+ kvmhv_save_hv_regs(vcpu, &hvregs);
+ hvregs.lpcr = lpcr;
+ vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+ hvregs.version = HV_GUEST_STATE_VERSION;
+ if (vcpu->arch.nested) {
+ hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+ hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+ } else {
+ hvregs.lpid = vcpu->kvm->arch.lpid;
+ hvregs.vcpu_token = vcpu->vcpu_id;
+ }
+ hvregs.hdec_expiry = time_limit;
+ trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+ __pa(&vcpu->arch.regs));
+ kvmhv_restore_hv_return_state(vcpu, &hvregs);
+ vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+ vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+ vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+ vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
+ mtspr(SPRN_PSSCR_PR, host_psscr);
+
+ /* H_CEDE has to be handled now, not later */
+ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
+ kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
+ kvmppc_nested_cede(vcpu);
+ trap = 0;
+ }
+ } else {
+ trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
+ }
+
+ vcpu->arch.slb_max = 0;
+ dec = mfspr(SPRN_DEC);
+ if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+ dec = (s32) dec;
+ tb = mftb();
+ vcpu->arch.dec_expires = dec + tb;
+ vcpu->cpu = -1;
+ vcpu->arch.thread_cpu = -1;
+ vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
+
+ vcpu->arch.iamr = mfspr(SPRN_IAMR);
+ vcpu->arch.pspb = mfspr(SPRN_PSPB);
+ vcpu->arch.fscr = mfspr(SPRN_FSCR);
+ vcpu->arch.tar = mfspr(SPRN_TAR);
+ vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+ vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+ vcpu->arch.bescr = mfspr(SPRN_BESCR);
+ vcpu->arch.wort = mfspr(SPRN_WORT);
+ vcpu->arch.tid = mfspr(SPRN_TIDR);
+ vcpu->arch.amr = mfspr(SPRN_AMR);
+ vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+ vcpu->arch.dscr = mfspr(SPRN_DSCR);
+
+ mtspr(SPRN_PSPB, 0);
+ mtspr(SPRN_WORT, 0);
+ mtspr(SPRN_UAMOR, 0);
+ mtspr(SPRN_DSCR, host_dscr);
+ mtspr(SPRN_TIDR, host_tidr);
+ mtspr(SPRN_IAMR, host_iamr);
+ mtspr(SPRN_PSPB, 0);
+
+ if (host_amr != vcpu->arch.amr)
+ mtspr(SPRN_AMR, host_amr);
+
+ msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
+ store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+ store_vr_state(&vcpu->arch.vr);
+#endif
+ vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
+
+ if (cpu_has_feature(CPU_FTR_TM) ||
+ cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+ kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+
+ save_pmu = 1;
+ if (vcpu->arch.vpa.pinned_addr) {
+ struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+ u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+ lp->yield_count = cpu_to_be32(yield_count);
+ vcpu->arch.vpa.dirty = 1;
+ save_pmu = lp->pmcregs_in_use;
+ }
+ /* Must save pmu if this guest is capable of running nested guests */
+ save_pmu |= nesting_enabled(vcpu->kvm);
+
+ kvmhv_save_guest_pmu(vcpu, save_pmu);
+
+ vc->entry_exit_map = 0x101;
+ vc->in_guest = 0;
+
+ mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+ mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
+
+ kvmhv_load_host_pmu();
+
+ kvmppc_subcore_exit_guest();
+
+ return trap;
+}
+
+/*
* Wait for some other vcpu thread to execute us, and
* wake us up when we need to handle something in the host.
*/
@@ -3099,11 +3718,12 @@
static void grow_halt_poll_ns(struct kvmppc_vcore *vc)
{
- /* 10us base */
- if (vc->halt_poll_ns == 0 && halt_poll_ns_grow)
- vc->halt_poll_ns = 10000;
- else
- vc->halt_poll_ns *= halt_poll_ns_grow;
+ if (!halt_poll_ns_grow)
+ return;
+
+ vc->halt_poll_ns *= halt_poll_ns_grow;
+ if (vc->halt_poll_ns < halt_poll_ns_grow_start)
+ vc->halt_poll_ns = halt_poll_ns_grow_start;
}
static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
@@ -3117,7 +3737,7 @@
#ifdef CONFIG_KVM_XICS
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{
- if (!xive_enabled())
+ if (!xics_on_xive())
return false;
return vcpu->arch.irq_pending || vcpu->arch.xive_saved_state.pipr <
vcpu->arch.xive_saved_state.cppr;
@@ -3256,12 +3876,17 @@
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
+/*
+ * This never fails for a radix guest, as none of the operations it does
+ * for a radix guest can fail or have a way to report failure.
+ * kvmhv_run_single_vcpu() relies on this fact.
+ */
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
int r = 0;
struct kvm *kvm = vcpu->kvm;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.mmu_setup_lock);
if (!kvm->arch.mmu_ready) {
if (!kvm_is_radix(kvm))
r = kvmppc_hv_setup_htab_rma(vcpu);
@@ -3271,7 +3896,7 @@
kvm->arch.mmu_ready = 1;
}
}
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return r;
}
@@ -3405,6 +4030,185 @@
return vcpu->arch.ret;
}
+int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
+ struct kvm_vcpu *vcpu, u64 time_limit,
+ unsigned long lpcr)
+{
+ int trap, r, pcpu;
+ int srcu_idx, lpid;
+ struct kvmppc_vcore *vc;
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *nested = vcpu->arch.nested;
+
+ trace_kvmppc_run_vcpu_enter(vcpu);
+
+ kvm_run->exit_reason = 0;
+ vcpu->arch.ret = RESUME_GUEST;
+ vcpu->arch.trap = 0;
+
+ vc = vcpu->arch.vcore;
+ vcpu->arch.ceded = 0;
+ vcpu->arch.run_task = current;
+ vcpu->arch.kvm_run = kvm_run;
+ vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
+ vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
+ vcpu->arch.busy_preempt = TB_NIL;
+ vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
+ vc->runnable_threads[0] = vcpu;
+ vc->n_runnable = 1;
+ vc->runner = vcpu;
+
+ /* See if the MMU is ready to go */
+ if (!kvm->arch.mmu_ready)
+ kvmhv_setup_mmu(vcpu);
+
+ if (need_resched())
+ cond_resched();
+
+ kvmppc_update_vpas(vcpu);
+
+ init_vcore_to_run(vc);
+ vc->preempt_tb = TB_NIL;
+
+ preempt_disable();
+ pcpu = smp_processor_id();
+ vc->pcpu = pcpu;
+ kvmppc_prepare_radix_vcpu(vcpu, pcpu);
+
+ local_irq_disable();
+ hard_irq_disable();
+ if (signal_pending(current))
+ goto sigpend;
+ if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
+ goto out;
+
+ if (!nested) {
+ kvmppc_core_prepare_to_enter(vcpu);
+ if (vcpu->arch.doorbell_request) {
+ vc->dpdes = 1;
+ smp_wmb();
+ vcpu->arch.doorbell_request = 0;
+ }
+ if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
+ &vcpu->arch.pending_exceptions))
+ lpcr |= LPCR_MER;
+ } else if (vcpu->arch.pending_exceptions ||
+ vcpu->arch.doorbell_request ||
+ xive_interrupt_pending(vcpu)) {
+ vcpu->arch.ret = RESUME_HOST;
+ goto out;
+ }
+
+ kvmppc_clear_host_core(pcpu);
+
+ local_paca->kvm_hstate.tid = 0;
+ local_paca->kvm_hstate.napping = 0;
+ local_paca->kvm_hstate.kvm_split_mode = NULL;
+ kvmppc_start_thread(vcpu, vc);
+ kvmppc_create_dtl_entry(vcpu, vc);
+ trace_kvm_guest_enter(vcpu);
+
+ vc->vcore_state = VCORE_RUNNING;
+ trace_kvmppc_run_core(vc, 0);
+
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
+ mtspr(SPRN_LPID, lpid);
+ isync();
+ kvmppc_check_need_tlb_flush(kvm, pcpu, nested);
+ }
+
+ guest_enter_irqoff();
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ this_cpu_disable_ftrace();
+
+ /* Tell lockdep that we're about to enable interrupts */
+ trace_hardirqs_on();
+
+ trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
+ vcpu->arch.trap = trap;
+
+ trace_hardirqs_off();
+
+ this_cpu_enable_ftrace();
+
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ mtspr(SPRN_LPID, kvm->arch.host_lpid);
+ isync();
+ }
+
+ set_irq_happened(trap);
+
+ kvmppc_set_host_core(pcpu);
+
+ local_irq_enable();
+ guest_exit();
+
+ cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
+
+ preempt_enable();
+
+ /*
+ * cancel pending decrementer exception if DEC is now positive, or if
+ * entering a nested guest in which case the decrementer is now owned
+ * by L2 and the L1 decrementer is provided in hdec_expires
+ */
+ if (kvmppc_core_pending_dec(vcpu) &&
+ ((get_tb() < vcpu->arch.dec_expires) ||
+ (trap == BOOK3S_INTERRUPT_SYSCALL &&
+ kvmppc_get_gpr(vcpu, 3) == H_ENTER_NESTED)))
+ kvmppc_core_dequeue_dec(vcpu);
+
+ trace_kvm_guest_exit(vcpu);
+ r = RESUME_GUEST;
+ if (trap) {
+ if (!nested)
+ r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
+ else
+ r = kvmppc_handle_nested_exit(kvm_run, vcpu);
+ }
+ vcpu->arch.ret = r;
+
+ if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
+ !kvmppc_vcpu_woken(vcpu)) {
+ kvmppc_set_timer(vcpu);
+ while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
+ if (signal_pending(current)) {
+ vcpu->stat.signal_exits++;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->arch.ret = -EINTR;
+ break;
+ }
+ spin_lock(&vc->lock);
+ kvmppc_vcore_blocked(vc);
+ spin_unlock(&vc->lock);
+ }
+ }
+ vcpu->arch.ceded = 0;
+
+ vc->vcore_state = VCORE_INACTIVE;
+ trace_kvmppc_run_core(vc, 1);
+
+ done:
+ kvmppc_remove_runnable(vc, vcpu);
+ trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
+
+ return vcpu->arch.ret;
+
+ sigpend:
+ vcpu->stat.signal_exits++;
+ kvm_run->exit_reason = KVM_EXIT_INTR;
+ vcpu->arch.ret = -EINTR;
+ out:
+ local_irq_enable();
+ preempt_enable();
+ goto done;
+}
+
static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
@@ -3480,7 +4284,20 @@
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
- r = kvmppc_run_vcpu(run, vcpu);
+ /*
+ * The early POWER9 chips that can't mix radix and HPT threads
+ * on the same core also need the workaround for the problem
+ * where the TLB would prefetch entries in the guest exit path
+ * for radix guests using the guest PIDR value and LPID 0.
+ * The workaround is in the old path (kvmppc_run_vcpu())
+ * but not the new path (kvmhv_run_single_vcpu()).
+ */
+ if (kvm->arch.threads_indep && kvm_is_radix(kvm) &&
+ !no_mixing_hpt_and_radix)
+ r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
+ vcpu->arch.vcore->lpcr);
+ else
+ r = kvmppc_run_vcpu(run, vcpu);
if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
!(vcpu->arch.shregs.msr & MSR_PR)) {
@@ -3494,7 +4311,7 @@
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
srcu_read_unlock(&kvm->srcu, srcu_idx);
} else if (r == RESUME_PASSTHROUGH) {
- if (WARN_ON(xive_enabled()))
+ if (WARN_ON(xics_on_xive()))
r = H_SUCCESS;
else
r = kvmppc_xics_rm_complete(vcpu, 0);
@@ -3559,6 +4376,10 @@
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
+ /* If running as a nested hypervisor, we don't support HPT guests */
+ if (kvmhv_on_pseries())
+ info->flags |= KVM_PPC_NO_HASH;
+
return 0;
}
@@ -3660,7 +4481,8 @@
static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new)
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
@@ -3672,11 +4494,29 @@
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
+
+ /*
+ * For change == KVM_MR_MOVE or KVM_MR_DELETE, higher levels
+ * have already called kvm_arch_flush_shadow_memslot() to
+ * flush shadow mappings. For KVM_MR_CREATE we have no
+ * previous mappings. So the only case to handle is
+ * KVM_MR_FLAGS_ONLY when the KVM_MEM_LOG_DIRTY_PAGES bit
+ * has been changed.
+ * For radix guests, we flush on setting KVM_MEM_LOG_DIRTY_PAGES
+ * to get rid of any THP PTEs in the partition-scoped page tables
+ * so we can track dirtiness at the page level; we flush when
+ * clearing KVM_MEM_LOG_DIRTY_PAGES so that we can go back to
+ * using THP PTEs.
+ */
+ if (change == KVM_MR_FLAGS_ONLY && kvm_is_radix(kvm) &&
+ ((new->flags ^ old->flags) & KVM_MEM_LOG_DIRTY_PAGES))
+ kvmppc_radix_flush_memslot(kvm, old);
}
/*
* Update LPCR values in kvm->arch and in vcores.
- * Caller must hold kvm->lock.
+ * Caller must hold kvm->arch.mmu_setup_lock (for mutual exclusion
+ * of kvm->arch.lpcr update).
*/
void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask)
{
@@ -3723,13 +4563,12 @@
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
dw1 = PATB_GR | kvm->arch.process_table;
}
-
- mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
+ kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
}
/*
* Set up HPT (hashed page table) and RMA (real-mode area).
- * Must be called with kvm->lock held.
+ * Must be called with kvm->arch.mmu_setup_lock held.
*/
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
@@ -3817,19 +4656,30 @@
goto out_srcu;
}
-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{
+ if (nesting_enabled(kvm))
+ kvmhv_release_all_nested(kvm);
+ kvmppc_rmap_reset(kvm);
+ kvm->arch.process_table = 0;
+ /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ spin_lock(&kvm->mmu_lock);
+ kvm->arch.radix = 0;
+ spin_unlock(&kvm->mmu_lock);
kvmppc_free_radix(kvm);
kvmppc_update_lpcr(kvm, LPCR_VPM1,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
- kvmppc_rmap_reset(kvm);
- kvm->arch.radix = 0;
- kvm->arch.process_table = 0;
return 0;
}
-/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
+/*
+ * Must be called with kvm->arch.mmu_setup_lock held and
+ * mmu_ready = 0 and no vcpus running.
+ */
int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
{
int err;
@@ -3837,11 +4687,14 @@
err = kvmppc_init_vm_radix(kvm);
if (err)
return err;
-
+ kvmppc_rmap_reset(kvm);
+ /* Mutual exclusion with kvm_unmap_hva_range etc. */
+ spin_lock(&kvm->mmu_lock);
+ kvm->arch.radix = 1;
+ spin_unlock(&kvm->mmu_lock);
kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
- kvm->arch.radix = 1;
return 0;
}
@@ -3931,6 +4784,8 @@
char buf[32];
int ret;
+ mutex_init(&kvm->arch.mmu_setup_lock);
+
/* Allocate the guest's logical partition ID */
lpid = kvmppc_alloc_lpid();
@@ -3940,6 +4795,8 @@
kvmppc_alloc_host_rm_ops();
+ kvmhv_vm_nested_init(kvm);
+
/*
* Since we don't flush the TLB when tearing down a VM,
* and this lpid might have previously been used,
@@ -3958,9 +4815,13 @@
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
/* Init LPCR for virtual RMA mode */
- kvm->arch.host_lpid = mfspr(SPRN_LPID);
- kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
- lpcr &= LPCR_PECE | LPCR_LPES;
+ if (cpu_has_feature(CPU_FTR_HVMODE)) {
+ kvm->arch.host_lpid = mfspr(SPRN_LPID);
+ kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
+ lpcr &= LPCR_PECE | LPCR_LPES;
+ } else {
+ lpcr = 0;
+ }
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
LPCR_VPM0 | LPCR_VPM1;
kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
@@ -3983,7 +4844,7 @@
* If xive is enabled, we route 0x500 interrupts directly
* to the guest.
*/
- if (xive_enabled())
+ if (xics_on_xive())
lpcr |= LPCR_LPES;
}
@@ -4027,8 +4888,14 @@
* On POWER9, we only need to do this if the "indep_threads_mode"
* module parameter has been set to N.
*/
- if (cpu_has_feature(CPU_FTR_ARCH_300))
- kvm->arch.threads_indep = indep_threads_mode;
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
+ pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
+ kvm->arch.threads_indep = true;
+ } else {
+ kvm->arch.threads_indep = indep_threads_mode;
+ }
+ }
if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
@@ -4051,6 +4918,8 @@
snprintf(buf, sizeof(buf), "vm%d", current->pid);
kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
kvmppc_mmu_debugfs_init(kvm);
+ if (radix_enabled())
+ kvmhv_radix_debugfs_init(kvm);
return 0;
}
@@ -4073,13 +4942,21 @@
kvmppc_free_vcores(kvm);
- kvmppc_free_lpid(kvm->arch.lpid);
if (kvm_is_radix(kvm))
kvmppc_free_radix(kvm);
else
kvmppc_free_hpt(&kvm->arch.hpt);
+ /* Perform global invalidation and return lpid to the pool */
+ if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+ if (nesting_enabled(kvm))
+ kvmhv_release_all_nested(kvm);
+ kvm->arch.process_table = 0;
+ kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
+ }
+ kvmppc_free_lpid(kvm->arch.lpid);
+
kvmppc_free_pimap(kvm);
}
@@ -4104,11 +4981,15 @@
static int kvmppc_core_check_processor_compat_hv(void)
{
- if (!cpu_has_feature(CPU_FTR_HVMODE) ||
- !cpu_has_feature(CPU_FTR_ARCH_206))
- return -EIO;
+ if (cpu_has_feature(CPU_FTR_HVMODE) &&
+ cpu_has_feature(CPU_FTR_ARCH_206))
+ return 0;
- return 0;
+ /* POWER9 in radix mode is capable of being a nested hypervisor. */
+ if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
+ return 0;
+
+ return -EIO;
}
#ifdef CONFIG_KVM_XICS
@@ -4199,7 +5080,7 @@
if (i == pimap->n_mapped)
pimap->n_mapped++;
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_set_mapped(kvm, guest_gsi, desc);
else
kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
@@ -4240,7 +5121,7 @@
return -ENODEV;
}
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_clr_mapped(kvm, guest_gsi, pimap->mapped[i].desc);
else
kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
@@ -4426,7 +5307,11 @@
if (radix && !radix_enabled())
return -EINVAL;
- mutex_lock(&kvm->lock);
+ /* If we're a nested hypervisor, we currently only support radix */
+ if (kvmhv_on_pseries() && !radix)
+ return -EINVAL;
+
+ mutex_lock(&kvm->arch.mmu_setup_lock);
if (radix != kvm_is_radix(kvm)) {
if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0;
@@ -4454,10 +5339,61 @@
err = 0;
out_unlock:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.mmu_setup_lock);
return err;
}
+static int kvmhv_enable_nested(struct kvm *kvm)
+{
+ if (!nested)
+ return -EPERM;
+ if (!cpu_has_feature(CPU_FTR_ARCH_300) || no_mixing_hpt_and_radix)
+ return -ENODEV;
+
+ /* kvm == NULL means the caller is testing if the capability exists */
+ if (kvm)
+ kvm->arch.nested_enable = true;
+ return 0;
+}
+
+static int kvmhv_load_from_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+ int size)
+{
+ int rc = -EINVAL;
+
+ if (kvmhv_vcpu_is_radix(vcpu)) {
+ rc = kvmhv_copy_from_guest_radix(vcpu, *eaddr, ptr, size);
+
+ if (rc > 0)
+ rc = -EINVAL;
+ }
+
+ /* For now quadrants are the only way to access nested guest memory */
+ if (rc && vcpu->arch.nested)
+ rc = -EAGAIN;
+
+ return rc;
+}
+
+static int kvmhv_store_to_eaddr(struct kvm_vcpu *vcpu, ulong *eaddr, void *ptr,
+ int size)
+{
+ int rc = -EINVAL;
+
+ if (kvmhv_vcpu_is_radix(vcpu)) {
+ rc = kvmhv_copy_to_guest_radix(vcpu, *eaddr, ptr, size);
+
+ if (rc > 0)
+ rc = -EINVAL;
+ }
+
+ /* For now quadrants are the only way to access nested guest memory */
+ if (rc && vcpu->arch.nested)
+ rc = -EAGAIN;
+
+ return rc;
+}
+
static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -4497,6 +5433,9 @@
.configure_mmu = kvmhv_configure_mmu,
.get_rmmu_info = kvmhv_get_rmmu_info,
.set_smt_mode = kvmhv_set_smt_mode,
+ .enable_nested = kvmhv_enable_nested,
+ .load_from_eaddr = kvmhv_load_from_eaddr,
+ .store_to_eaddr = kvmhv_store_to_eaddr,
};
static int kvm_init_subcore_bitmap(void)
@@ -4514,13 +5453,11 @@
continue;
sibling_subcore_state =
- kmalloc_node(sizeof(struct sibling_subcore_state),
+ kzalloc_node(sizeof(struct sibling_subcore_state),
GFP_KERNEL, node);
if (!sibling_subcore_state)
return -ENOMEM;
- memset(sibling_subcore_state, 0,
- sizeof(struct sibling_subcore_state));
for (j = 0; j < threads_per_core; j++) {
int cpu = first_cpu + j;
@@ -4540,6 +5477,12 @@
static int kvmppc_book3s_init_hv(void)
{
int r;
+
+ if (!tlbie_capable) {
+ pr_err("KVM-HV: Host does not support TLBIE\n");
+ return -ENODEV;
+ }
+
/*
* FIXME!! Do we need to check on all cpus ?
*/
@@ -4547,6 +5490,10 @@
if (r < 0)
return -ENODEV;
+ r = kvmhv_nested_init();
+ if (r)
+ return r;
+
r = kvm_init_subcore_bitmap();
if (r)
return r;
@@ -4557,7 +5504,8 @@
* indirectly, via OPAL.
*/
#ifdef CONFIG_SMP
- if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
+ if (!xics_on_xive() && !kvmhv_on_pseries() &&
+ !local_paca->kvm_hstate.xics_phys) {
struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
@@ -4605,6 +5553,7 @@
if (kvmppc_radix_possible())
kvmppc_radix_exit();
kvmppc_hv_ops = NULL;
+ kvmhv_nested_exit();
}
module_init(kvmppc_book3s_init_hv);
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c
index fc6bb96..7c19096 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/cpu.h>
@@ -231,6 +228,15 @@
void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
+ /* For a nested hypervisor, use the XICS via hcall */
+ if (kvmhv_on_pseries()) {
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
+ IPI_PRIORITY);
+ return;
+ }
+
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
@@ -248,7 +254,7 @@
}
/* We should never reach this */
- if (WARN_ON_ONCE(xive_enabled()))
+ if (WARN_ON_ONCE(xics_on_xive()))
return;
/* Else poke the target with an IPI */
@@ -460,12 +466,19 @@
return 1;
/* Now read the interrupt from the ICP */
- xics_phys = local_paca->kvm_hstate.xics_phys;
- rc = 0;
- if (!xics_phys)
- rc = opal_int_get_xirr(&xirr, false);
- else
- xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+ if (kvmhv_on_pseries()) {
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
+ xirr = cpu_to_be32(retbuf[0]);
+ } else {
+ xics_phys = local_paca->kvm_hstate.xics_phys;
+ rc = 0;
+ if (!xics_phys)
+ rc = opal_int_get_xirr(&xirr, false);
+ else
+ xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
+ }
if (rc < 0)
return 1;
@@ -494,7 +507,13 @@
*/
if (xisr == XICS_IPI) {
rc = 0;
- if (xics_phys) {
+ if (kvmhv_on_pseries()) {
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ plpar_hcall_raw(H_IPI, retbuf,
+ hard_smp_processor_id(), 0xff);
+ plpar_hcall_raw(H_EOI, retbuf, h_xirr);
+ } else if (xics_phys) {
__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
@@ -520,7 +539,13 @@
/* We raced with the host,
* we need to resend that IPI, bummer
*/
- if (xics_phys)
+ if (kvmhv_on_pseries()) {
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ plpar_hcall_raw(H_IPI, retbuf,
+ hard_smp_processor_id(),
+ IPI_PRIORITY);
+ } else if (xics_phys)
__raw_rm_writeb(IPI_PRIORITY,
xics_phys + XICS_MFRR);
else
@@ -549,7 +574,7 @@
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
if (unlikely(!__xive_vm_h_xirr))
@@ -564,7 +589,7 @@
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
vcpu->arch.regs.gpr[5] = get_tb();
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
if (unlikely(!__xive_vm_h_xirr))
@@ -578,7 +603,7 @@
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_ipoll(vcpu, server);
if (unlikely(!__xive_vm_h_ipoll))
@@ -593,7 +618,7 @@
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_ipi(vcpu, server, mfrr);
if (unlikely(!__xive_vm_h_ipi))
@@ -607,7 +632,7 @@
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_cppr(vcpu, cppr);
if (unlikely(!__xive_vm_h_cppr))
@@ -621,7 +646,7 @@
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
- if (xive_enabled()) {
+ if (xics_on_xive()) {
if (is_rm())
return xive_rm_h_eoi(vcpu, xirr);
if (unlikely(!__xive_vm_h_eoi))
@@ -729,3 +754,111 @@
smp_mb();
local_paca->kvm_hstate.kvm_split_mode = NULL;
}
+
+/*
+ * Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
+ * Can we inject a Decrementer or an External interrupt?
+ */
+void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
+{
+ int ext;
+ unsigned long vec = 0;
+ unsigned long lpcr;
+
+ /* Insert EXTERNAL bit into LPCR at the MER bit position */
+ ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
+ lpcr = mfspr(SPRN_LPCR);
+ lpcr |= ext << LPCR_MER_SH;
+ mtspr(SPRN_LPCR, lpcr);
+ isync();
+
+ if (vcpu->arch.shregs.msr & MSR_EE) {
+ if (ext) {
+ vec = BOOK3S_INTERRUPT_EXTERNAL;
+ } else {
+ long int dec = mfspr(SPRN_DEC);
+ if (!(lpcr & LPCR_LD))
+ dec = (int) dec;
+ if (dec < 0)
+ vec = BOOK3S_INTERRUPT_DECREMENTER;
+ }
+ }
+ if (vec) {
+ unsigned long msr, old_msr = vcpu->arch.shregs.msr;
+
+ kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
+ kvmppc_set_srr1(vcpu, old_msr);
+ kvmppc_set_pc(vcpu, vec);
+ msr = vcpu->arch.intr_msr;
+ if (MSR_TM_ACTIVE(old_msr))
+ msr |= MSR_TS_S;
+ vcpu->arch.shregs.msr = msr;
+ }
+
+ if (vcpu->arch.doorbell_request) {
+ mtspr(SPRN_DPDES, 1);
+ vcpu->arch.vcore->dpdes = 1;
+ smp_wmb();
+ vcpu->arch.doorbell_request = 0;
+ }
+}
+
+static void flush_guest_tlb(struct kvm *kvm)
+{
+ unsigned long rb, set;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ if (kvm_is_radix(kvm)) {
+ /* R=1 PRS=1 RIC=2 */
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r" (rb), "i" (1), "i" (1), "i" (2),
+ "r" (0) : "memory");
+ for (set = 1; set < kvm->arch.tlb_sets; ++set) {
+ rb += PPC_BIT(51); /* increment set number */
+ /* R=1 PRS=1 RIC=0 */
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r" (rb), "i" (1), "i" (1), "i" (0),
+ "r" (0) : "memory");
+ }
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_RADIX_INVALIDATE_ERAT_GUEST : : :"memory");
+ } else {
+ for (set = 0; set < kvm->arch.tlb_sets; ++set) {
+ /* R=0 PRS=0 RIC=0 */
+ asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
+ : : "r" (rb), "i" (0), "i" (0), "i" (0),
+ "r" (0) : "memory");
+ rb += PPC_BIT(51); /* increment set number */
+ }
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_ISA_3_0_INVALIDATE_ERAT : : :"memory");
+ }
+}
+
+void kvmppc_check_need_tlb_flush(struct kvm *kvm, int pcpu,
+ struct kvm_nested_guest *nested)
+{
+ cpumask_t *need_tlb_flush;
+
+ /*
+ * On POWER9, individual threads can come in here, but the
+ * TLB is shared between the 4 threads in a core, hence
+ * invalidating on one thread invalidates for all.
+ * Thus we make all 4 threads use the same bit.
+ */
+ if (cpu_has_feature(CPU_FTR_ARCH_300))
+ pcpu = cpu_first_thread_sibling(pcpu);
+
+ if (nested)
+ need_tlb_flush = &nested->need_tlb_flush;
+ else
+ need_tlb_flush = &kvm->arch.need_tlb_flush;
+
+ if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
+ flush_guest_tlb(kvm);
+
+ /* Clear the bit after the TLB flush */
+ cpumask_clear_cpu(pcpu, need_tlb_flush);
+ }
+}
+EXPORT_SYMBOL_GPL(kvmppc_check_need_tlb_flush);
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index e3f738e..9af6604 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -1,19 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Hypervisor Maintenance Interrupt (HMI) handling.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program.
- *
* Copyright 2015 IBM Corporation
* Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
*/
@@ -24,6 +12,7 @@
#include <linux/compiler.h>
#include <asm/paca.h>
#include <asm/hmi.h>
+#include <asm/processor.h>
void wait_for_subcore_guest_exit(void)
{
diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S
index 666b91c..63fd81f 100644
--- a/arch/powerpc/kvm/book3s_hv_interrupts.S
+++ b/arch/powerpc/kvm/book3s_hv_interrupts.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
@@ -64,52 +53,7 @@
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Save host PMU registers */
-BEGIN_FTR_SECTION
- /* Work around P8 PMAE bug */
- li r3, -1
- clrrdi r3, r3, 10
- mfspr r8, SPRN_MMCR2
- mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
- isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- li r3, 1
- sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
- mfspr r7, SPRN_MMCR0 /* save MMCR0 */
- mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
- mfspr r6, SPRN_MMCRA
- /* Clear MMCRA in order to disable SDAR updates */
- li r5, 0
- mtspr SPRN_MMCRA, r5
- isync
- lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
- cmpwi r5, 0
- beq 31f /* skip if not */
- mfspr r5, SPRN_MMCR1
- mfspr r9, SPRN_SIAR
- mfspr r10, SPRN_SDAR
- std r7, HSTATE_MMCR0(r13)
- std r5, HSTATE_MMCR1(r13)
- std r6, HSTATE_MMCRA(r13)
- std r9, HSTATE_SIAR(r13)
- std r10, HSTATE_SDAR(r13)
-BEGIN_FTR_SECTION
- mfspr r9, SPRN_SIER
- std r8, HSTATE_MMCR2(r13)
- std r9, HSTATE_SIER(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- mfspr r3, SPRN_PMC1
- mfspr r5, SPRN_PMC2
- mfspr r6, SPRN_PMC3
- mfspr r7, SPRN_PMC4
- mfspr r8, SPRN_PMC5
- mfspr r9, SPRN_PMC6
- stw r3, HSTATE_PMC1(r13)
- stw r5, HSTATE_PMC2(r13)
- stw r6, HSTATE_PMC3(r13)
- stw r7, HSTATE_PMC4(r13)
- stw r8, HSTATE_PMC5(r13)
- stw r9, HSTATE_PMC6(r13)
-31:
+ bl kvmhv_save_host_pmu
/*
* Put whatever is in the decrementer into the
@@ -161,3 +105,51 @@
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
+
+_GLOBAL(kvmhv_save_host_pmu)
+BEGIN_FTR_SECTION
+ /* Work around P8 PMAE bug */
+ li r3, -1
+ clrrdi r3, r3, 10
+ mfspr r8, SPRN_MMCR2
+ mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
+ isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ li r3, 1
+ sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
+ mfspr r7, SPRN_MMCR0 /* save MMCR0 */
+ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
+ mfspr r6, SPRN_MMCRA
+ /* Clear MMCRA in order to disable SDAR updates */
+ li r5, 0
+ mtspr SPRN_MMCRA, r5
+ isync
+ lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+ cmpwi r5, 0
+ beq 31f /* skip if not */
+ mfspr r5, SPRN_MMCR1
+ mfspr r9, SPRN_SIAR
+ mfspr r10, SPRN_SDAR
+ std r7, HSTATE_MMCR0(r13)
+ std r5, HSTATE_MMCR1(r13)
+ std r6, HSTATE_MMCRA(r13)
+ std r9, HSTATE_SIAR(r13)
+ std r10, HSTATE_SDAR(r13)
+BEGIN_FTR_SECTION
+ mfspr r9, SPRN_SIER
+ std r8, HSTATE_MMCR2(r13)
+ std r9, HSTATE_SIER(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ mfspr r3, SPRN_PMC1
+ mfspr r5, SPRN_PMC2
+ mfspr r6, SPRN_PMC3
+ mfspr r7, SPRN_PMC4
+ mfspr r8, SPRN_PMC5
+ mfspr r9, SPRN_PMC6
+ stw r3, HSTATE_PMC1(r13)
+ stw r5, HSTATE_PMC2(r13)
+ stw r6, HSTATE_PMC3(r13)
+ stw r7, HSTATE_PMC4(r13)
+ stw r8, HSTATE_PMC5(r13)
+ stw r9, HSTATE_PMC6(r13)
+31: blr
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c
new file mode 100644
index 0000000..cdf30c6
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -0,0 +1,1455 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2018
+ * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
+ * Paul Mackerras <paulus@ozlabs.org>
+ *
+ * Description: KVM functions specific to running nested KVM-HV guests
+ * on Book3S processors (specifically POWER9 and later).
+ */
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/llist.h>
+
+#include <asm/kvm_ppc.h>
+#include <asm/kvm_book3s.h>
+#include <asm/mmu.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/pte-walk.h>
+#include <asm/reg.h>
+
+static struct patb_entry *pseries_partition_tb;
+
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
+
+void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ hr->pcr = vc->pcr | PCR_MASK;
+ hr->dpdes = vc->dpdes;
+ hr->hfscr = vcpu->arch.hfscr;
+ hr->tb_offset = vc->tb_offset;
+ hr->dawr0 = vcpu->arch.dawr;
+ hr->dawrx0 = vcpu->arch.dawrx;
+ hr->ciabr = vcpu->arch.ciabr;
+ hr->purr = vcpu->arch.purr;
+ hr->spurr = vcpu->arch.spurr;
+ hr->ic = vcpu->arch.ic;
+ hr->vtb = vc->vtb;
+ hr->srr0 = vcpu->arch.shregs.srr0;
+ hr->srr1 = vcpu->arch.shregs.srr1;
+ hr->sprg[0] = vcpu->arch.shregs.sprg0;
+ hr->sprg[1] = vcpu->arch.shregs.sprg1;
+ hr->sprg[2] = vcpu->arch.shregs.sprg2;
+ hr->sprg[3] = vcpu->arch.shregs.sprg3;
+ hr->pidr = vcpu->arch.pid;
+ hr->cfar = vcpu->arch.cfar;
+ hr->ppr = vcpu->arch.ppr;
+}
+
+static void byteswap_pt_regs(struct pt_regs *regs)
+{
+ unsigned long *addr = (unsigned long *) regs;
+
+ for (; addr < ((unsigned long *) (regs + 1)); addr++)
+ *addr = swab64(*addr);
+}
+
+static void byteswap_hv_regs(struct hv_guest_state *hr)
+{
+ hr->version = swab64(hr->version);
+ hr->lpid = swab32(hr->lpid);
+ hr->vcpu_token = swab32(hr->vcpu_token);
+ hr->lpcr = swab64(hr->lpcr);
+ hr->pcr = swab64(hr->pcr) | PCR_MASK;
+ hr->amor = swab64(hr->amor);
+ hr->dpdes = swab64(hr->dpdes);
+ hr->hfscr = swab64(hr->hfscr);
+ hr->tb_offset = swab64(hr->tb_offset);
+ hr->dawr0 = swab64(hr->dawr0);
+ hr->dawrx0 = swab64(hr->dawrx0);
+ hr->ciabr = swab64(hr->ciabr);
+ hr->hdec_expiry = swab64(hr->hdec_expiry);
+ hr->purr = swab64(hr->purr);
+ hr->spurr = swab64(hr->spurr);
+ hr->ic = swab64(hr->ic);
+ hr->vtb = swab64(hr->vtb);
+ hr->hdar = swab64(hr->hdar);
+ hr->hdsisr = swab64(hr->hdsisr);
+ hr->heir = swab64(hr->heir);
+ hr->asdr = swab64(hr->asdr);
+ hr->srr0 = swab64(hr->srr0);
+ hr->srr1 = swab64(hr->srr1);
+ hr->sprg[0] = swab64(hr->sprg[0]);
+ hr->sprg[1] = swab64(hr->sprg[1]);
+ hr->sprg[2] = swab64(hr->sprg[2]);
+ hr->sprg[3] = swab64(hr->sprg[3]);
+ hr->pidr = swab64(hr->pidr);
+ hr->cfar = swab64(hr->cfar);
+ hr->ppr = swab64(hr->ppr);
+}
+
+static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
+ struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ hr->dpdes = vc->dpdes;
+ hr->hfscr = vcpu->arch.hfscr;
+ hr->purr = vcpu->arch.purr;
+ hr->spurr = vcpu->arch.spurr;
+ hr->ic = vcpu->arch.ic;
+ hr->vtb = vc->vtb;
+ hr->srr0 = vcpu->arch.shregs.srr0;
+ hr->srr1 = vcpu->arch.shregs.srr1;
+ hr->sprg[0] = vcpu->arch.shregs.sprg0;
+ hr->sprg[1] = vcpu->arch.shregs.sprg1;
+ hr->sprg[2] = vcpu->arch.shregs.sprg2;
+ hr->sprg[3] = vcpu->arch.shregs.sprg3;
+ hr->pidr = vcpu->arch.pid;
+ hr->cfar = vcpu->arch.cfar;
+ hr->ppr = vcpu->arch.ppr;
+ switch (trap) {
+ case BOOK3S_INTERRUPT_H_DATA_STORAGE:
+ hr->hdar = vcpu->arch.fault_dar;
+ hr->hdsisr = vcpu->arch.fault_dsisr;
+ hr->asdr = vcpu->arch.fault_gpa;
+ break;
+ case BOOK3S_INTERRUPT_H_INST_STORAGE:
+ hr->asdr = vcpu->arch.fault_gpa;
+ break;
+ case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
+ hr->heir = vcpu->arch.emul_inst;
+ break;
+ }
+}
+
+static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ /*
+ * Don't let L1 enable features for L2 which we've disabled for L1,
+ * but preserve the interrupt cause field.
+ */
+ hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
+
+ /* Don't let data address watchpoint match in hypervisor state */
+ hr->dawrx0 &= ~DAWRX_HYP;
+
+ /* Don't let completed instruction address breakpt match in HV state */
+ if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
+ hr->ciabr &= ~CIABR_PRIV;
+}
+
+static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ vc->pcr = hr->pcr | PCR_MASK;
+ vc->dpdes = hr->dpdes;
+ vcpu->arch.hfscr = hr->hfscr;
+ vcpu->arch.dawr = hr->dawr0;
+ vcpu->arch.dawrx = hr->dawrx0;
+ vcpu->arch.ciabr = hr->ciabr;
+ vcpu->arch.purr = hr->purr;
+ vcpu->arch.spurr = hr->spurr;
+ vcpu->arch.ic = hr->ic;
+ vc->vtb = hr->vtb;
+ vcpu->arch.shregs.srr0 = hr->srr0;
+ vcpu->arch.shregs.srr1 = hr->srr1;
+ vcpu->arch.shregs.sprg0 = hr->sprg[0];
+ vcpu->arch.shregs.sprg1 = hr->sprg[1];
+ vcpu->arch.shregs.sprg2 = hr->sprg[2];
+ vcpu->arch.shregs.sprg3 = hr->sprg[3];
+ vcpu->arch.pid = hr->pidr;
+ vcpu->arch.cfar = hr->cfar;
+ vcpu->arch.ppr = hr->ppr;
+}
+
+void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
+ struct hv_guest_state *hr)
+{
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+
+ vc->dpdes = hr->dpdes;
+ vcpu->arch.hfscr = hr->hfscr;
+ vcpu->arch.purr = hr->purr;
+ vcpu->arch.spurr = hr->spurr;
+ vcpu->arch.ic = hr->ic;
+ vc->vtb = hr->vtb;
+ vcpu->arch.fault_dar = hr->hdar;
+ vcpu->arch.fault_dsisr = hr->hdsisr;
+ vcpu->arch.fault_gpa = hr->asdr;
+ vcpu->arch.emul_inst = hr->heir;
+ vcpu->arch.shregs.srr0 = hr->srr0;
+ vcpu->arch.shregs.srr1 = hr->srr1;
+ vcpu->arch.shregs.sprg0 = hr->sprg[0];
+ vcpu->arch.shregs.sprg1 = hr->sprg[1];
+ vcpu->arch.shregs.sprg2 = hr->sprg[2];
+ vcpu->arch.shregs.sprg3 = hr->sprg[3];
+ vcpu->arch.pid = hr->pidr;
+ vcpu->arch.cfar = hr->cfar;
+ vcpu->arch.ppr = hr->ppr;
+}
+
+static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
+{
+ /* No need to reflect the page fault to L1, we've handled it */
+ vcpu->arch.trap = 0;
+
+ /*
+ * Since the L2 gprs have already been written back into L1 memory when
+ * we complete the mmio, store the L1 memory location of the L2 gpr
+ * being loaded into by the mmio so that the loaded value can be
+ * written there in kvmppc_complete_mmio_load()
+ */
+ if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR)
+ && (vcpu->mmio_is_write == 0)) {
+ vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr +
+ offsetof(struct pt_regs,
+ gpr[vcpu->arch.io_gpr]);
+ vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR;
+ }
+}
+
+long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
+{
+ long int err, r;
+ struct kvm_nested_guest *l2;
+ struct pt_regs l2_regs, saved_l1_regs;
+ struct hv_guest_state l2_hv, saved_l1_hv;
+ struct kvmppc_vcore *vc = vcpu->arch.vcore;
+ u64 hv_ptr, regs_ptr;
+ u64 hdec_exp;
+ s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
+ u64 mask;
+ unsigned long lpcr;
+
+ if (vcpu->kvm->arch.l1_ptcr == 0)
+ return H_NOT_AVAILABLE;
+
+ /* copy parameters in */
+ hv_ptr = kvmppc_get_gpr(vcpu, 4);
+ err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
+ sizeof(struct hv_guest_state));
+ if (err)
+ return H_PARAMETER;
+ if (kvmppc_need_byteswap(vcpu))
+ byteswap_hv_regs(&l2_hv);
+ if (l2_hv.version != HV_GUEST_STATE_VERSION)
+ return H_P2;
+
+ regs_ptr = kvmppc_get_gpr(vcpu, 5);
+ err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
+ sizeof(struct pt_regs));
+ if (err)
+ return H_PARAMETER;
+ if (kvmppc_need_byteswap(vcpu))
+ byteswap_pt_regs(&l2_regs);
+ if (l2_hv.vcpu_token >= NR_CPUS)
+ return H_PARAMETER;
+
+ /* translate lpid */
+ l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
+ if (!l2)
+ return H_PARAMETER;
+ if (!l2->l1_gr_to_hr) {
+ mutex_lock(&l2->tlb_lock);
+ kvmhv_update_ptbl_cache(l2);
+ mutex_unlock(&l2->tlb_lock);
+ }
+
+ /* save l1 values of things */
+ vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+ saved_l1_regs = vcpu->arch.regs;
+ kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
+
+ /* convert TB values/offsets to host (L0) values */
+ hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
+ vc->tb_offset += l2_hv.tb_offset;
+
+ /* set L1 state to L2 state */
+ vcpu->arch.nested = l2;
+ vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
+ vcpu->arch.regs = l2_regs;
+ vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
+ LPCR_LPES | LPCR_MER;
+ lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
+ sanitise_hv_regs(vcpu, &l2_hv);
+ restore_hv_regs(vcpu, &l2_hv);
+
+ vcpu->arch.ret = RESUME_GUEST;
+ vcpu->arch.trap = 0;
+ do {
+ if (mftb() >= hdec_exp) {
+ vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+ r = RESUME_HOST;
+ break;
+ }
+ r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
+ lpcr);
+ } while (is_kvmppc_resume_guest(r));
+
+ /* save L2 state for return */
+ l2_regs = vcpu->arch.regs;
+ l2_regs.msr = vcpu->arch.shregs.msr;
+ delta_purr = vcpu->arch.purr - l2_hv.purr;
+ delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
+ delta_ic = vcpu->arch.ic - l2_hv.ic;
+ delta_vtb = vc->vtb - l2_hv.vtb;
+ save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
+
+ /* restore L1 state */
+ vcpu->arch.nested = NULL;
+ vcpu->arch.regs = saved_l1_regs;
+ vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
+ /* set L1 MSR TS field according to L2 transaction state */
+ if (l2_regs.msr & MSR_TS_MASK)
+ vcpu->arch.shregs.msr |= MSR_TS_S;
+ vc->tb_offset = saved_l1_hv.tb_offset;
+ restore_hv_regs(vcpu, &saved_l1_hv);
+ vcpu->arch.purr += delta_purr;
+ vcpu->arch.spurr += delta_spurr;
+ vcpu->arch.ic += delta_ic;
+ vc->vtb += delta_vtb;
+
+ kvmhv_put_nested(l2);
+
+ /* copy l2_hv_state and regs back to guest */
+ if (kvmppc_need_byteswap(vcpu)) {
+ byteswap_hv_regs(&l2_hv);
+ byteswap_pt_regs(&l2_regs);
+ }
+ err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
+ sizeof(struct hv_guest_state));
+ if (err)
+ return H_AUTHORITY;
+ err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
+ sizeof(struct pt_regs));
+ if (err)
+ return H_AUTHORITY;
+
+ if (r == -EINTR)
+ return H_INTERRUPT;
+
+ if (vcpu->mmio_needed) {
+ kvmhv_nested_mmio_needed(vcpu, regs_ptr);
+ return H_TOO_HARD;
+ }
+
+ return vcpu->arch.trap;
+}
+
+long kvmhv_nested_init(void)
+{
+ long int ptb_order;
+ unsigned long ptcr;
+ long rc;
+
+ if (!kvmhv_on_pseries())
+ return 0;
+ if (!radix_enabled())
+ return -ENODEV;
+
+ /* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
+ ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
+ if (ptb_order < 8)
+ ptb_order = 8;
+ pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
+ GFP_KERNEL);
+ if (!pseries_partition_tb) {
+ pr_err("kvm-hv: failed to allocated nested partition table\n");
+ return -ENOMEM;
+ }
+
+ ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
+ rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
+ if (rc != H_SUCCESS) {
+ pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
+ rc);
+ kfree(pseries_partition_tb);
+ pseries_partition_tb = NULL;
+ return -ENODEV;
+ }
+
+ return 0;
+}
+
+void kvmhv_nested_exit(void)
+{
+ /*
+ * N.B. the kvmhv_on_pseries() test is there because it enables
+ * the compiler to remove the call to plpar_hcall_norets()
+ * when CONFIG_PPC_PSERIES=n.
+ */
+ if (kvmhv_on_pseries() && pseries_partition_tb) {
+ plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
+ kfree(pseries_partition_tb);
+ pseries_partition_tb = NULL;
+ }
+}
+
+static void kvmhv_flush_lpid(unsigned int lpid)
+{
+ long rc;
+
+ if (!kvmhv_on_pseries()) {
+ radix__flush_all_lpid(lpid);
+ return;
+ }
+
+ rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
+ lpid, TLBIEL_INVAL_SET_LPID);
+ if (rc)
+ pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
+}
+
+void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
+{
+ if (!kvmhv_on_pseries()) {
+ mmu_partition_table_set_entry(lpid, dw0, dw1, true);
+ return;
+ }
+
+ pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
+ pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
+ /* L0 will do the necessary barriers */
+ kvmhv_flush_lpid(lpid);
+}
+
+static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
+{
+ unsigned long dw0;
+
+ dw0 = PATB_HR | radix__get_tree_size() |
+ __pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
+ kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
+}
+
+void kvmhv_vm_nested_init(struct kvm *kvm)
+{
+ kvm->arch.max_nested_lpid = -1;
+}
+
+/*
+ * Handle the H_SET_PARTITION_TABLE hcall.
+ * r4 = guest real address of partition table + log_2(size) - 12
+ * (formatted as for the PTCR).
+ */
+long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
+{
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
+ int srcu_idx;
+ long ret = H_SUCCESS;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ /*
+ * Limit the partition table to 4096 entries (because that's what
+ * hardware supports), and check the base address.
+ */
+ if ((ptcr & PRTS_MASK) > 12 - 8 ||
+ !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
+ ret = H_PARAMETER;
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ if (ret == H_SUCCESS)
+ kvm->arch.l1_ptcr = ptcr;
+ return ret;
+}
+
+/*
+ * Handle the H_COPY_TOFROM_GUEST hcall.
+ * r4 = L1 lpid of nested guest
+ * r5 = pid
+ * r6 = eaddr to access
+ * r7 = to buffer (L1 gpa)
+ * r8 = from buffer (L1 gpa)
+ * r9 = n bytes to copy
+ */
+long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
+{
+ struct kvm_nested_guest *gp;
+ int l1_lpid = kvmppc_get_gpr(vcpu, 4);
+ int pid = kvmppc_get_gpr(vcpu, 5);
+ gva_t eaddr = kvmppc_get_gpr(vcpu, 6);
+ gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7);
+ gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8);
+ void *buf;
+ unsigned long n = kvmppc_get_gpr(vcpu, 9);
+ bool is_load = !!gp_to;
+ long rc;
+
+ if (gp_to && gp_from) /* One must be NULL to determine the direction */
+ return H_PARAMETER;
+
+ if (eaddr & (0xFFFUL << 52))
+ return H_PARAMETER;
+
+ buf = kzalloc(n, GFP_KERNEL);
+ if (!buf)
+ return H_NO_MEM;
+
+ gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
+ if (!gp) {
+ rc = H_PARAMETER;
+ goto out_free;
+ }
+
+ mutex_lock(&gp->tlb_lock);
+
+ if (is_load) {
+ /* Load from the nested guest into our buffer */
+ rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
+ eaddr, buf, NULL, n);
+ if (rc)
+ goto not_found;
+
+ /* Write what was loaded into our buffer back to the L1 guest */
+ rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
+ if (rc)
+ goto not_found;
+ } else {
+ /* Load the data to be stored from the L1 guest into our buf */
+ rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
+ if (rc)
+ goto not_found;
+
+ /* Store from our buffer into the nested guest */
+ rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
+ eaddr, NULL, buf, n);
+ if (rc)
+ goto not_found;
+ }
+
+out_unlock:
+ mutex_unlock(&gp->tlb_lock);
+ kvmhv_put_nested(gp);
+out_free:
+ kfree(buf);
+ return rc;
+not_found:
+ rc = H_NOT_FOUND;
+ goto out_unlock;
+}
+
+/*
+ * Reload the partition table entry for a guest.
+ * Caller must hold gp->tlb_lock.
+ */
+static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
+{
+ int ret;
+ struct patb_entry ptbl_entry;
+ unsigned long ptbl_addr;
+ struct kvm *kvm = gp->l1_host;
+
+ ret = -EFAULT;
+ ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
+ if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
+ ret = kvm_read_guest(kvm, ptbl_addr,
+ &ptbl_entry, sizeof(ptbl_entry));
+ if (ret) {
+ gp->l1_gr_to_hr = 0;
+ gp->process_table = 0;
+ } else {
+ gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
+ gp->process_table = be64_to_cpu(ptbl_entry.patb1);
+ }
+ kvmhv_set_nested_ptbl(gp);
+}
+
+struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
+{
+ struct kvm_nested_guest *gp;
+ long shadow_lpid;
+
+ gp = kzalloc(sizeof(*gp), GFP_KERNEL);
+ if (!gp)
+ return NULL;
+ gp->l1_host = kvm;
+ gp->l1_lpid = lpid;
+ mutex_init(&gp->tlb_lock);
+ gp->shadow_pgtable = pgd_alloc(kvm->mm);
+ if (!gp->shadow_pgtable)
+ goto out_free;
+ shadow_lpid = kvmppc_alloc_lpid();
+ if (shadow_lpid < 0)
+ goto out_free2;
+ gp->shadow_lpid = shadow_lpid;
+ gp->radix = 1;
+
+ memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
+
+ return gp;
+
+ out_free2:
+ pgd_free(kvm->mm, gp->shadow_pgtable);
+ out_free:
+ kfree(gp);
+ return NULL;
+}
+
+/*
+ * Free up any resources allocated for a nested guest.
+ */
+static void kvmhv_release_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+
+ if (gp->shadow_pgtable) {
+ /*
+ * No vcpu is using this struct and no call to
+ * kvmhv_get_nested can find this struct,
+ * so we don't need to hold kvm->mmu_lock.
+ */
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+ gp->shadow_lpid);
+ pgd_free(kvm->mm, gp->shadow_pgtable);
+ }
+ kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
+ kvmppc_free_lpid(gp->shadow_lpid);
+ kfree(gp);
+}
+
+static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+ int lpid = gp->l1_lpid;
+ long ref;
+
+ spin_lock(&kvm->mmu_lock);
+ if (gp == kvm->arch.nested_guests[lpid]) {
+ kvm->arch.nested_guests[lpid] = NULL;
+ if (lpid == kvm->arch.max_nested_lpid) {
+ while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
+ ;
+ kvm->arch.max_nested_lpid = lpid;
+ }
+ --gp->refcnt;
+ }
+ ref = gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+ if (ref == 0)
+ kvmhv_release_nested(gp);
+}
+
+/*
+ * Free up all nested resources allocated for this guest.
+ * This is called with no vcpus of the guest running, when
+ * switching the guest to HPT mode or when destroying the
+ * guest.
+ */
+void kvmhv_release_all_nested(struct kvm *kvm)
+{
+ int i;
+ struct kvm_nested_guest *gp;
+ struct kvm_nested_guest *freelist = NULL;
+ struct kvm_memory_slot *memslot;
+ int srcu_idx;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+ gp = kvm->arch.nested_guests[i];
+ if (!gp)
+ continue;
+ kvm->arch.nested_guests[i] = NULL;
+ if (--gp->refcnt == 0) {
+ gp->next = freelist;
+ freelist = gp;
+ }
+ }
+ kvm->arch.max_nested_lpid = -1;
+ spin_unlock(&kvm->mmu_lock);
+ while ((gp = freelist) != NULL) {
+ freelist = gp->next;
+ kvmhv_release_nested(gp);
+ }
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ kvm_for_each_memslot(memslot, kvm_memslots(kvm))
+ kvmhv_free_memslot_nest_rmap(memslot);
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+}
+
+/* caller must hold gp->tlb_lock */
+static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+
+ spin_lock(&kvm->mmu_lock);
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
+ spin_unlock(&kvm->mmu_lock);
+ kvmhv_flush_lpid(gp->shadow_lpid);
+ kvmhv_update_ptbl_cache(gp);
+ if (gp->l1_gr_to_hr == 0)
+ kvmhv_remove_nested(gp);
+}
+
+struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
+ bool create)
+{
+ struct kvm_nested_guest *gp, *newgp;
+
+ if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
+ l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
+ return NULL;
+
+ spin_lock(&kvm->mmu_lock);
+ gp = kvm->arch.nested_guests[l1_lpid];
+ if (gp)
+ ++gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (gp || !create)
+ return gp;
+
+ newgp = kvmhv_alloc_nested(kvm, l1_lpid);
+ if (!newgp)
+ return NULL;
+ spin_lock(&kvm->mmu_lock);
+ if (kvm->arch.nested_guests[l1_lpid]) {
+ /* someone else beat us to it */
+ gp = kvm->arch.nested_guests[l1_lpid];
+ } else {
+ kvm->arch.nested_guests[l1_lpid] = newgp;
+ ++newgp->refcnt;
+ gp = newgp;
+ newgp = NULL;
+ if (l1_lpid > kvm->arch.max_nested_lpid)
+ kvm->arch.max_nested_lpid = l1_lpid;
+ }
+ ++gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (newgp)
+ kvmhv_release_nested(newgp);
+
+ return gp;
+}
+
+void kvmhv_put_nested(struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = gp->l1_host;
+ long ref;
+
+ spin_lock(&kvm->mmu_lock);
+ ref = --gp->refcnt;
+ spin_unlock(&kvm->mmu_lock);
+ if (ref == 0)
+ kvmhv_release_nested(gp);
+}
+
+static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
+{
+ if (lpid > kvm->arch.max_nested_lpid)
+ return NULL;
+ return kvm->arch.nested_guests[lpid];
+}
+
+static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
+{
+ return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
+ RMAP_NESTED_GPA_MASK));
+}
+
+void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
+ struct rmap_nested **n_rmap)
+{
+ struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+ struct rmap_nested *cursor;
+ u64 rmap, new_rmap = (*n_rmap)->rmap;
+
+ /* Are there any existing entries? */
+ if (!(*rmapp)) {
+ /* No -> use the rmap as a single entry */
+ *rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
+ return;
+ }
+
+ /* Do any entries match what we're trying to insert? */
+ for_each_nest_rmap_safe(cursor, entry, &rmap) {
+ if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
+ return;
+ }
+
+ /* Do we need to create a list or just add the new entry? */
+ rmap = *rmapp;
+ if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+ *rmapp = 0UL;
+ llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
+ if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
+ (*n_rmap)->list.next = (struct llist_node *) rmap;
+
+ /* Set NULL so not freed by caller */
+ *n_rmap = NULL;
+}
+
+static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap,
+ unsigned long clr, unsigned long set,
+ unsigned long hpa, unsigned long mask)
+{
+ struct kvm_nested_guest *gp;
+ unsigned long gpa;
+ unsigned int shift, lpid;
+ pte_t *ptep;
+
+ gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+ lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+ gp = kvmhv_find_nested(kvm, lpid);
+ if (!gp)
+ return;
+
+ /* Find the pte */
+ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ /*
+ * If the pte is present and the pfn is still the same, update the pte.
+ * If the pfn has changed then this is a stale rmap entry, the nested
+ * gpa actually points somewhere else now, and there is nothing to do.
+ * XXX A future optimisation would be to remove the rmap entry here.
+ */
+ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) {
+ __radix_pte_update(ptep, clr, set);
+ kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
+ }
+}
+
+/*
+ * For a given list of rmap entries, update the rc bits in all ptes in shadow
+ * page tables for nested guests which are referenced by the rmap list.
+ */
+void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long clr, unsigned long set,
+ unsigned long hpa, unsigned long nbytes)
+{
+ struct llist_node *entry = ((struct llist_head *) rmapp)->first;
+ struct rmap_nested *cursor;
+ unsigned long rmap, mask;
+
+ if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED))
+ return;
+
+ mask = PTE_RPN_MASK & ~(nbytes - 1);
+ hpa &= mask;
+
+ for_each_nest_rmap_safe(cursor, entry, &rmap)
+ kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask);
+}
+
+static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
+ unsigned long hpa, unsigned long mask)
+{
+ struct kvm_nested_guest *gp;
+ unsigned long gpa;
+ unsigned int shift, lpid;
+ pte_t *ptep;
+
+ gpa = n_rmap & RMAP_NESTED_GPA_MASK;
+ lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
+ gp = kvmhv_find_nested(kvm, lpid);
+ if (!gp)
+ return;
+
+ /* Find and invalidate the pte */
+ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ /* Don't spuriously invalidate ptes if the pfn has changed */
+ if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+}
+
+static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
+ unsigned long hpa, unsigned long mask)
+{
+ struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
+ struct rmap_nested *cursor;
+ unsigned long rmap;
+
+ for_each_nest_rmap_safe(cursor, entry, &rmap) {
+ kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
+ kfree(cursor);
+ }
+}
+
+/* called with kvm->mmu_lock held */
+void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
+ const struct kvm_memory_slot *memslot,
+ unsigned long gpa, unsigned long hpa,
+ unsigned long nbytes)
+{
+ unsigned long gfn, end_gfn;
+ unsigned long addr_mask;
+
+ if (!memslot)
+ return;
+ gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
+ end_gfn = gfn + (nbytes >> PAGE_SHIFT);
+
+ addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
+ hpa &= addr_mask;
+
+ for (; gfn < end_gfn; gfn++) {
+ unsigned long *rmap = &memslot->arch.rmap[gfn];
+ kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
+ }
+}
+
+static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
+{
+ unsigned long page;
+
+ for (page = 0; page < free->npages; page++) {
+ unsigned long rmap, *rmapp = &free->arch.rmap[page];
+ struct rmap_nested *cursor;
+ struct llist_node *entry;
+
+ entry = llist_del_all((struct llist_head *) rmapp);
+ for_each_nest_rmap_safe(cursor, entry, &rmap)
+ kfree(cursor);
+ }
+}
+
+static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ long gpa, int *shift_ret)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool ret = false;
+ pte_t *ptep;
+ int shift;
+
+ spin_lock(&kvm->mmu_lock);
+ ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
+ if (!shift)
+ shift = PAGE_SHIFT;
+ if (ptep && pte_present(*ptep)) {
+ kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
+ ret = true;
+ }
+ spin_unlock(&kvm->mmu_lock);
+
+ if (shift_ret)
+ *shift_ret = shift;
+ return ret;
+}
+
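+/*
+ * Field decode helpers for an emulated tlbie: RIC/PRS/R come from the
+ * instruction image, the lpid from the RS value, and IS/AP/EPN from RB.
+ */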
+static inline int get_ric(unsigned int instr)
+{
+ return (instr >> 18) & 0x3;
+}
+
+static inline int get_prs(unsigned int instr)
+{
+ return (instr >> 17) & 0x1;
+}
+
+static inline int get_r(unsigned int instr)
+{
+ return (instr >> 16) & 0x1;
+}
+
+static inline int get_lpid(unsigned long r_val)
+{
+ return r_val & 0xffffffff;
+}
+
+static inline int get_is(unsigned long r_val)
+{
+ return (r_val >> 10) & 0x3;
+}
+
+static inline int get_ap(unsigned long r_val)
+{
+ return (r_val >> 5) & 0x7;
+}
+
+static inline long get_epn(unsigned long r_val)
+{
+ return r_val >> 12;
+}
+
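+/*
+ * Handle a tlbie from L1 that targets a single page of a nested guest:
+ * invalidate any shadow pte(s) currently backing that address range.
+ */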
+static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
+ int ap, long epn)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ long npages;
+ int shift, shadow_shift;
+ unsigned long addr;
+
+ shift = ap_to_shift(ap);
+ addr = epn << 12;
+ if (shift < 0)
+ /* Invalid ap encoding */
+ return -EINVAL;
+
+ addr &= ~((1UL << shift) - 1);
+ npages = 1UL << (shift - PAGE_SHIFT);
+
+ gp = kvmhv_get_nested(kvm, lpid, false);
+ if (!gp) /* No such guest -> nothing to do */
+ return 0;
+ mutex_lock(&gp->tlb_lock);
+
+ /* There may be more than one host page backing this single guest pte */
+ do {
+ kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
+
+ npages -= 1UL << (shadow_shift - PAGE_SHIFT);
+ addr += 1UL << shadow_shift;
+ } while (npages > 0);
+
+ mutex_unlock(&gp->tlb_lock);
+ kvmhv_put_nested(gp);
+ return 0;
+}
+
+static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp, int ric)
+{
+ struct kvm *kvm = vcpu->kvm;
+
+ mutex_lock(&gp->tlb_lock);
+ switch (ric) {
+ case 0:
+ /* Invalidate TLB */
+ spin_lock(&kvm->mmu_lock);
+ kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
+ gp->shadow_lpid);
+ kvmhv_flush_lpid(gp->shadow_lpid);
+ spin_unlock(&kvm->mmu_lock);
+ break;
+ case 1:
+ /*
+ * Invalidate PWC
+ * We don't cache this -> nothing to do
+ */
+ break;
+ case 2:
+ /* Invalidate TLB, PWC and caching of partition table entries */
+ kvmhv_flush_nested(gp);
+ break;
+ default:
+ break;
+ }
+ mutex_unlock(&gp->tlb_lock);
+}
+
+static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ int i;
+
+ spin_lock(&kvm->mmu_lock);
+ for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
+ gp = kvm->arch.nested_guests[i];
+ if (gp) {
+ spin_unlock(&kvm->mmu_lock);
+ kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+ spin_lock(&kvm->mmu_lock);
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+}
+
+static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
+ unsigned long rsval, unsigned long rbval)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_nested_guest *gp;
+ int r, ric, prs, is, ap;
+ int lpid;
+ long epn;
+ int ret = 0;
+
+ ric = get_ric(instr);
+ prs = get_prs(instr);
+ r = get_r(instr);
+ lpid = get_lpid(rsval);
+ is = get_is(rbval);
+
+ /*
+ * These cases are invalid and are not handled:
+ * r != 1 -> Only radix supported
+ * prs == 1 -> Not HV privileged
+ * ric == 3 -> No cluster bombs for radix
+ * is == 1 -> Partition scoped translations not associated with pid
+ * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
+ */
+ if ((!r) || (prs) || (ric == 3) || (is == 1) ||
+ ((!is) && (ric == 1 || ric == 2)))
+ return -EINVAL;
+
+ switch (is) {
+ case 0:
+ /*
+ * We know ric == 0
+ * Invalidate TLB for a given target address
+ */
+ epn = get_epn(rbval);
+ ap = get_ap(rbval);
+ ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
+ break;
+ case 2:
+ /* Invalidate matching LPID */
+ gp = kvmhv_get_nested(kvm, lpid, false);
+ if (gp) {
+ kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
+ kvmhv_put_nested(gp);
+ }
+ break;
+ case 3:
+ /* Invalidate ALL LPIDs */
+ kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+
+ return ret;
+}
+
+/*
+ * This handles the H_TLB_INVALIDATE hcall.
+ * Parameters are (r4) tlbie instruction code, (r5) rS contents,
+ * (r6) rB contents.
+ */
+long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
+{
+ int ret;
+
+ ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
+ kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
+ if (ret)
+ return H_PARAMETER;
+ return H_SUCCESS;
+}
+
+/* Used to convert a nested guest real address to a L1 guest real address */
+static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ unsigned long n_gpa, unsigned long dsisr,
+ struct kvmppc_pte *gpte_p)
+{
+ u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
+ int ret;
+
+ ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
+ &fault_addr);
+
+ if (ret) {
+ /* We didn't find a pte */
+ if (ret == -EINVAL) {
+ /* Unsupported mmu config */
+ flags |= DSISR_UNSUPP_MMU;
+ } else if (ret == -ENOENT) {
+ /* No translation found */
+ flags |= DSISR_NOHPTE;
+ } else if (ret == -EFAULT) {
+ /* Couldn't access L1 real address */
+ flags |= DSISR_PRTABLE_FAULT;
+ vcpu->arch.fault_gpa = fault_addr;
+ } else {
+ /* Unknown error */
+ return ret;
+ }
+ goto forward_to_l1;
+ } else {
+ /* We found a pte -> check permissions */
+ if (dsisr & DSISR_ISSTORE) {
+ /* Can we write? */
+ if (!gpte_p->may_write) {
+ flags |= DSISR_PROTFAULT;
+ goto forward_to_l1;
+ }
+ } else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+ /* Can we execute? */
+ if (!gpte_p->may_execute) {
+ flags |= SRR1_ISI_N_OR_G;
+ goto forward_to_l1;
+ }
+ } else {
+ /* Can we read? */
+ if (!gpte_p->may_read && !gpte_p->may_write) {
+ flags |= DSISR_PROTFAULT;
+ goto forward_to_l1;
+ }
+ }
+ }
+
+ return 0;
+
+forward_to_l1:
+ vcpu->arch.fault_dsisr = flags;
+ if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
+ vcpu->arch.shregs.msr &= ~0x783f0000ul;
+ vcpu->arch.shregs.msr |= flags;
+ }
+ return RESUME_HOST;
+}
+
+static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp,
+ unsigned long n_gpa,
+ struct kvmppc_pte gpte,
+ unsigned long dsisr)
+{
+ struct kvm *kvm = vcpu->kvm;
+ bool writing = !!(dsisr & DSISR_ISSTORE);
+ u64 pgflags;
+ long ret;
+
+ /* Are the rc bits set in the L1 partition scoped pte? */
+ pgflags = _PAGE_ACCESSED;
+ if (writing)
+ pgflags |= _PAGE_DIRTY;
+ if (pgflags & ~gpte.rc)
+ return RESUME_HOST;
+
+ spin_lock(&kvm->mmu_lock);
+ /* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
+ ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
+ gpte.raddr, kvm->arch.lpid);
+ if (!ret) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+	/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
+ ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
+ gp->shadow_lpid);
+ if (!ret)
+ ret = -EINVAL;
+ else
+ ret = 0;
+
+out_unlock:
+ spin_unlock(&kvm->mmu_lock);
+ return ret;
+}
+
+static inline int kvmppc_radix_level_to_shift(int level)
+{
+ switch (level) {
+ case 2:
+ return PUD_SHIFT;
+ case 1:
+ return PMD_SHIFT;
+ default:
+ return PAGE_SHIFT;
+ }
+}
+
+static inline int kvmppc_radix_shift_to_level(int shift)
+{
+ if (shift == PUD_SHIFT)
+ return 2;
+ if (shift == PMD_SHIFT)
+ return 1;
+ if (shift == PAGE_SHIFT)
+ return 0;
+ WARN_ON_ONCE(1);
+ return 0;
+}
+
+/* called with gp->tlb_lock held */
+static long int __kvmhv_nested_page_fault(struct kvm_run *run,
+ struct kvm_vcpu *vcpu,
+ struct kvm_nested_guest *gp)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memory_slot *memslot;
+ struct rmap_nested *n_rmap;
+ struct kvmppc_pte gpte;
+ pte_t pte, *pte_p;
+ unsigned long mmu_seq;
+ unsigned long dsisr = vcpu->arch.fault_dsisr;
+ unsigned long ea = vcpu->arch.fault_dar;
+ unsigned long *rmapp;
+ unsigned long n_gpa, gpa, gfn, perm = 0UL;
+ unsigned int shift, l1_shift, level;
+ bool writing = !!(dsisr & DSISR_ISSTORE);
+ bool kvm_ro = false;
+ long int ret;
+
+ if (!gp->l1_gr_to_hr) {
+ kvmhv_update_ptbl_cache(gp);
+ if (!gp->l1_gr_to_hr)
+ return RESUME_HOST;
+ }
+
+ /* Convert the nested guest real address into a L1 guest real address */
+
+ n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
+ if (!(dsisr & DSISR_PRTABLE_FAULT))
+ n_gpa |= ea & 0xFFF;
+ ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
+
+ /*
+ * If the hardware found a translation but we don't now have a usable
+ * translation in the l1 partition-scoped tree, remove the shadow pte
+ * and let the guest retry.
+ */
+ if (ret == RESUME_HOST &&
+ (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
+ DSISR_BAD_COPYPASTE)))
+ goto inval;
+ if (ret)
+ return ret;
+
+ /* Failed to set the reference/change bits */
+ if (dsisr & DSISR_SET_RC) {
+ ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
+ if (ret == RESUME_HOST)
+ return ret;
+ if (ret)
+ goto inval;
+ dsisr &= ~DSISR_SET_RC;
+ if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
+ DSISR_PROTFAULT)))
+ return RESUME_GUEST;
+ }
+
+ /*
+ * We took an HISI or HDSI while we were running a nested guest which
+ * means we have no partition scoped translation for that. This means
+ * we need to insert a pte for the mapping into our shadow_pgtable.
+ */
+
+ l1_shift = gpte.page_shift;
+ if (l1_shift < PAGE_SHIFT) {
+ /* We don't support l1 using a page size smaller than our own */
+ pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
+ l1_shift, PAGE_SHIFT);
+ return -EINVAL;
+ }
+ gpa = gpte.raddr;
+ gfn = gpa >> PAGE_SHIFT;
+
+ /* 1. Get the corresponding host memslot */
+
+ memslot = gfn_to_memslot(kvm, gfn);
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
+ if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
+ /* unusual error -> reflect to the guest as a DSI */
+ kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
+ return RESUME_GUEST;
+ }
+
+ /* passthrough of emulated MMIO case */
+ return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
+ }
+ if (memslot->flags & KVM_MEM_READONLY) {
+ if (writing) {
+ /* Give the guest a DSI */
+ kvmppc_core_queue_data_storage(vcpu, ea,
+ DSISR_ISSTORE | DSISR_PROTFAULT);
+ return RESUME_GUEST;
+ }
+ kvm_ro = true;
+ }
+
+ /* 2. Find the host pte for this L1 guest real address */
+
+ /* Used to check for invalidations in progress */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
+	/* See if we can find a translation in our partition scoped tables for L1 */
+ pte = __pte(0);
+ spin_lock(&kvm->mmu_lock);
+ pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
+ if (!shift)
+ shift = PAGE_SHIFT;
+ if (pte_p)
+ pte = *pte_p;
+ spin_unlock(&kvm->mmu_lock);
+
+ if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
+ /* No suitable pte found -> try to insert a mapping */
+ ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
+ writing, kvm_ro, &pte, &level);
+ if (ret == -EAGAIN)
+ return RESUME_GUEST;
+ else if (ret)
+ return ret;
+ shift = kvmppc_radix_level_to_shift(level);
+ }
+ /* Align gfn to the start of the page */
+ gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT;
+
+ /* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
+
+	/* The permissions are the combination of the host and l1 guest ptes */
+ perm |= gpte.may_read ? 0UL : _PAGE_READ;
+ perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
+ perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
+ /* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */
+ perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED;
+ perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY;
+ pte = __pte(pte_val(pte) & ~perm);
+
+ /* What size pte can we insert? */
+ if (shift > l1_shift) {
+ u64 mask;
+ unsigned int actual_shift = PAGE_SHIFT;
+ if (PMD_SHIFT < l1_shift)
+ actual_shift = PMD_SHIFT;
+ mask = (1UL << shift) - (1UL << actual_shift);
+ pte = __pte(pte_val(pte) | (gpa & mask));
+ shift = actual_shift;
+ }
+ level = kvmppc_radix_shift_to_level(shift);
+ n_gpa &= ~((1UL << shift) - 1);
+
+ /* 4. Insert the pte into our shadow_pgtable */
+
+ n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
+ if (!n_rmap)
+ return RESUME_GUEST; /* Let the guest try again */
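+	/* Encode the nested lpid and gpa so this shadow pte can be invalidated later */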
+ n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
+ (((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
+ rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
+ ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
+ mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
+ if (n_rmap)
+ kfree(n_rmap);
+ if (ret == -EAGAIN)
+ ret = RESUME_GUEST; /* Let the guest try again */
+
+ return ret;
+
+ inval:
+ kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
+ return RESUME_GUEST;
+}
+
+long int kvmhv_nested_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu)
+{
+ struct kvm_nested_guest *gp = vcpu->arch.nested;
+ long int ret;
+
+ mutex_lock(&gp->tlb_lock);
+ ret = __kvmhv_nested_page_fault(run, vcpu, gp);
+ mutex_unlock(&gp->tlb_lock);
+ return ret;
+}
+
+int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
+{
+ int ret = -1;
+
+ spin_lock(&kvm->mmu_lock);
+ while (++lpid <= kvm->arch.max_nested_lpid) {
+ if (kvm->arch.nested_guests[lpid]) {
+ ret = lpid;
+ break;
+ }
+ }
+ spin_unlock(&kvm->mmu_lock);
+ return ret;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index b11043b..79f7d07 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -1,7 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*
* Copyright 2012 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
@@ -66,10 +64,8 @@
/*
* On POWER7, see if we can handle a machine check that occurred inside
* the guest in real mode, without switching to the host partition.
- *
- * Returns: 0 => exit guest, 1 => deliver machine check to guest
*/
-static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
+static void kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
{
unsigned long srr1 = vcpu->arch.shregs.msr;
struct machine_check_event mce_evt;
@@ -111,52 +107,24 @@
}
/*
- * See if we have already handled the condition in the linux host.
- * We assume that if the condition is recovered then linux host
- * will have generated an error log event that we will pick
- * up and log later.
- * Don't release mce event now. We will queue up the event so that
- * we can log the MCE event info on host console.
+ * Now get the event and stash it in the vcpu struct so it can
+ * be handled by the primary thread in virtual mode. We can't
+ * call machine_check_queue_event() here if we are running on
+ * an offline secondary thread.
*/
- if (!get_mce_event(&mce_evt, MCE_EVENT_DONTRELEASE))
- goto out;
+ if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
+ if (handled && mce_evt.version == MCE_V1)
+ mce_evt.disposition = MCE_DISPOSITION_RECOVERED;
+ } else {
+ memset(&mce_evt, 0, sizeof(mce_evt));
+ }
- if (mce_evt.version == MCE_V1 &&
- (mce_evt.severity == MCE_SEV_NO_ERROR ||
- mce_evt.disposition == MCE_DISPOSITION_RECOVERED))
- handled = 1;
-
-out:
- /*
- * For guest that supports FWNMI capability, hook the MCE event into
- * vcpu structure. We are going to exit the guest with KVM_EXIT_NMI
- * exit reason. On our way to exit we will pull this event from vcpu
- * structure and print it from thread 0 of the core/subcore.
- *
- * For guest that does not support FWNMI capability (old QEMU):
- * We are now going enter guest either through machine check
- * interrupt (for unhandled errors) or will continue from
- * current HSRR0 (for handled errors) in guest. Hence
- * queue up the event so that we can log it from host console later.
- */
- if (vcpu->kvm->arch.fwnmi_enabled) {
- /*
- * Hook up the mce event on to vcpu structure.
- * First clear the old event.
- */
- memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
- if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
- vcpu->arch.mce_evt = mce_evt;
- }
- } else
- machine_check_queue_event();
-
- return handled;
+ vcpu->arch.mce_evt = mce_evt;
}
-long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
+void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
{
- return kvmppc_realmode_mc_power7(vcpu);
+ kvmppc_realmode_mc_power7(vcpu);
}
/* Check if dynamic split is in force and return subcore size accordingly. */
@@ -177,6 +145,7 @@
local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
}
+EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
void kvmppc_subcore_exit_guest(void)
{
@@ -187,6 +156,7 @@
local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
}
+EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
static bool kvmppc_tb_resync_required(void)
{
@@ -331,5 +301,13 @@
} else {
wait_for_tb_resync();
}
+
+ /*
+ * Reset tb_offset_applied so the guest exit code won't try
+ * to subtract the previous timebase offset from the timebase.
+ */
+ if (local_paca->kvm_hstate.kvm_vcore)
+ local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
+
return 0;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index a67cf1c..2203054 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -1,7 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*
* Copyright 2010-2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*/
@@ -13,6 +11,7 @@
#include <linux/hugetlb.h>
#include <linux/module.h>
#include <linux/log2.h>
+#include <linux/sizes.h>
#include <asm/trace.h>
#include <asm/kvm_ppc.h>
@@ -100,14 +99,14 @@
} else {
rev->forw = rev->back = pte_index;
*rmap = (*rmap & ~KVMPPC_RMAP_INDEX) |
- pte_index | KVMPPC_RMAP_PRESENT;
+ pte_index | KVMPPC_RMAP_PRESENT | KVMPPC_RMAP_HPT;
}
unlock_rmap(rmap);
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
/* Update the dirty bitmap of a memslot */
-void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
+void kvmppc_update_dirty_map(const struct kvm_memory_slot *memslot,
unsigned long gfn, unsigned long psize)
{
unsigned long npages;
@@ -434,6 +433,37 @@
(HPTE_R_KEY_HI | HPTE_R_KEY_LO));
}
+static inline void fixup_tlbie_lpid(unsigned long rb_value, unsigned long lpid)
+{
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_ERAT_BUG)) {
+ /* Radix flush for a hash guest */
+
+		unsigned long rb, rs, prs, r, ric;
+
+ rb = PPC_BIT(52); /* IS = 2 */
+ rs = 0; /* lpid = 0 */
+ prs = 0; /* partition scoped */
+ r = 1; /* radix format */
+		ric = 0;  /* RIC_FLUSH_TLB */
+
+ /*
+ * Need the extra ptesync to make sure we don't
+ * re-order the tlbie
+ */
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
+ : : "r"(rb), "i"(r), "i"(prs),
+ "i"(ric), "r"(rs) : "memory");
+ }
+
+ if (cpu_has_feature(CPU_FTR_P9_TLBIE_STQ_BUG)) {
+ asm volatile("ptesync": : :"memory");
+ asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
+ "r" (rb_value), "r" (lpid));
+ }
+}
+
static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
long npages, int global, bool need_sync)
{
@@ -452,16 +482,7 @@
"r" (rbvalues[i]), "r" (kvm->arch.lpid));
}
- if (cpu_has_feature(CPU_FTR_P9_TLBIE_BUG)) {
- /*
- * Need the extra ptesync to make sure we don't
- * re-order the tlbie
- */
- asm volatile("ptesync": : :"memory");
- asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : :
- "r" (rbvalues[0]), "r" (kvm->arch.lpid));
- }
-
+ fixup_tlbie_lpid(rbvalues[i - 1], kvm->arch.lpid);
asm volatile("eieio; tlbsync; ptesync" : : : "memory");
} else {
if (need_sync)
@@ -867,6 +888,149 @@
return ret;
}
+static int kvmppc_get_hpa(struct kvm_vcpu *vcpu, unsigned long gpa,
+ int writing, unsigned long *hpa,
+ struct kvm_memory_slot **memslot_p)
+{
+ struct kvm *kvm = vcpu->kvm;
+ struct kvm_memory_slot *memslot;
+ unsigned long gfn, hva, pa, psize = PAGE_SHIFT;
+ unsigned int shift;
+ pte_t *ptep, pte;
+
+ /* Find the memslot for this address */
+ gfn = gpa >> PAGE_SHIFT;
+ memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
+ if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID))
+ return H_PARAMETER;
+
+ /* Translate to host virtual address */
+ hva = __gfn_to_hva_memslot(memslot, gfn);
+
+ /* Try to find the host pte for that virtual address */
+ ptep = __find_linux_pte(vcpu->arch.pgdir, hva, NULL, &shift);
+ if (!ptep)
+ return H_TOO_HARD;
+ pte = kvmppc_read_update_linux_pte(ptep, writing);
+ if (!pte_present(pte))
+ return H_TOO_HARD;
+
+ /* Convert to a physical address */
+ if (shift)
+ psize = 1UL << shift;
+ pa = pte_pfn(pte) << PAGE_SHIFT;
+ pa |= hva & (psize - 1);
+ pa |= gpa & ~PAGE_MASK;
+
+ if (hpa)
+ *hpa = pa;
+ if (memslot_p)
+ *memslot_p = memslot;
+
+ return H_SUCCESS;
+}
+
+static long kvmppc_do_h_page_init_zero(struct kvm_vcpu *vcpu,
+ unsigned long dest)
+{
+ struct kvm_memory_slot *memslot;
+ struct kvm *kvm = vcpu->kvm;
+ unsigned long pa, mmu_seq;
+ long ret = H_SUCCESS;
+ int i;
+
+ /* Used later to detect if we might have been invalidated */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ ret = kvmppc_get_hpa(vcpu, dest, 1, &pa, &memslot);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ /* Check if we've been invalidated */
+ raw_spin_lock(&kvm->mmu_lock.rlock);
+ if (mmu_notifier_retry(kvm, mmu_seq)) {
+ ret = H_TOO_HARD;
+ goto out_unlock;
+ }
+
+ /* Zero the page */
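+	/* dcbz clears one L1 cache line at a time, so walk the whole 4K page */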
+ for (i = 0; i < SZ_4K; i += L1_CACHE_BYTES, pa += L1_CACHE_BYTES)
+ dcbz((void *)pa);
+ kvmppc_update_dirty_map(memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
+
+out_unlock:
+ raw_spin_unlock(&kvm->mmu_lock.rlock);
+ return ret;
+}
+
+static long kvmppc_do_h_page_init_copy(struct kvm_vcpu *vcpu,
+ unsigned long dest, unsigned long src)
+{
+ unsigned long dest_pa, src_pa, mmu_seq;
+ struct kvm_memory_slot *dest_memslot;
+ struct kvm *kvm = vcpu->kvm;
+ long ret = H_SUCCESS;
+
+ /* Used later to detect if we might have been invalidated */
+ mmu_seq = kvm->mmu_notifier_seq;
+ smp_rmb();
+
+ ret = kvmppc_get_hpa(vcpu, dest, 1, &dest_pa, &dest_memslot);
+ if (ret != H_SUCCESS)
+ return ret;
+ ret = kvmppc_get_hpa(vcpu, src, 0, &src_pa, NULL);
+ if (ret != H_SUCCESS)
+ return ret;
+
+ /* Check if we've been invalidated */
+ raw_spin_lock(&kvm->mmu_lock.rlock);
+ if (mmu_notifier_retry(kvm, mmu_seq)) {
+ ret = H_TOO_HARD;
+ goto out_unlock;
+ }
+
+ /* Copy the page */
+ memcpy((void *)dest_pa, (void *)src_pa, SZ_4K);
+
+ kvmppc_update_dirty_map(dest_memslot, dest >> PAGE_SHIFT, PAGE_SIZE);
+
+out_unlock:
+ raw_spin_unlock(&kvm->mmu_lock.rlock);
+ return ret;
+}
+
+long kvmppc_rm_h_page_init(struct kvm_vcpu *vcpu, unsigned long flags,
+ unsigned long dest, unsigned long src)
+{
+ struct kvm *kvm = vcpu->kvm;
+ u64 pg_mask = SZ_4K - 1; /* 4K page size */
+ long ret = H_SUCCESS;
+
+ /* Don't handle radix mode here, go up to the virtual mode handler */
+ if (kvm_is_radix(kvm))
+ return H_TOO_HARD;
+
+ /* Check for invalid flags (H_PAGE_SET_LOANED covers all CMO flags) */
+ if (flags & ~(H_ICACHE_INVALIDATE | H_ICACHE_SYNCHRONIZE |
+ H_ZERO_PAGE | H_COPY_PAGE | H_PAGE_SET_LOANED))
+ return H_PARAMETER;
+
+ /* dest (and src if copy_page flag set) must be page aligned */
+ if ((dest & pg_mask) || ((flags & H_COPY_PAGE) && (src & pg_mask)))
+ return H_PARAMETER;
+
+ /* zero and/or copy the page as determined by the flags */
+ if (flags & H_COPY_PAGE)
+ ret = kvmppc_do_h_page_init_copy(vcpu, dest, src);
+ else if (flags & H_ZERO_PAGE)
+ ret = kvmppc_do_h_page_init_zero(vcpu, dest);
+
+ /* We can ignore the other flags */
+
+ return ret;
+}
+
void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index)
{
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 758d1d2..287d591 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2012 Michael Ellerman, IBM Corporation.
* Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -61,7 +58,7 @@
hcpu = hcore << threads_shift;
kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu;
smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION);
- kvmppc_set_host_ipi(hcpu, 1);
+ kvmppc_set_host_ipi(hcpu);
smp_mb();
kvmhv_rm_send_ipi(hcpu);
}
@@ -136,7 +133,7 @@
/* Mark the target VCPU as having an interrupt pending */
vcpu->stat.queue_intr++;
- set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
+ set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
/* Kick self ? Just set MER and return */
if (vcpu == this_vcpu) {
@@ -144,6 +141,13 @@
return;
}
+ if (xive_enabled() && kvmhv_on_pseries()) {
+ /* No XICS access or hypercalls available, too hard */
+ this_icp->rm_action |= XICS_RM_KICK_VCPU;
+ this_icp->rm_kick_target = vcpu;
+ return;
+ }
+
/*
* Check if the core is loaded,
* if not, find an available host core to post to wake the VCPU,
@@ -170,8 +174,7 @@
static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
{
/* Note: Only called on self ! */
- clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
- &vcpu->arch.pending_exceptions);
+ clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
}
@@ -768,6 +771,14 @@
void __iomem *xics_phys;
int64_t rc;
+ if (kvmhv_on_pseries()) {
+ unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
+
+ iosync();
+ plpar_hcall_raw(H_EOI, retbuf, hwirq);
+ return;
+ }
+
rc = pnv_opal_pci_msi_eoi(c, hwirq);
if (rc)
@@ -808,7 +819,7 @@
raddr = per_cpu_ptr(addr, cpu);
l = (unsigned long)raddr;
- if (REGION_ID(l) == VMALLOC_REGION_ID) {
+ if (get_region_id(l) == VMALLOC_REGION_ID) {
l = vmalloc_to_phys(raddr);
raddr = (unsigned int *)l;
}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 1d14046..0496e66 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1,12 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
@@ -18,6 +11,7 @@
*/
#include <asm/ppc_asm.h>
+#include <asm/code-patching-asm.h>
#include <asm/kvm_asm.h>
#include <asm/reg.h>
#include <asm/mmu.h>
@@ -28,12 +22,15 @@
#include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h>
+#include <asm/export.h>
#include <asm/tm.h>
#include <asm/opal.h>
#include <asm/xive-regs.h>
#include <asm/thread_info.h>
#include <asm/asm-compat.h>
#include <asm/feature-fixups.h>
+#include <asm/cpuidle.h>
+#include <asm/ultravisor-api.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
@@ -44,10 +41,12 @@
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
#define NAPPING_NOVCPU 2
+#define NAPPING_UNSPLIT 3
/* Stack frame offsets for kvmppc_hv_entry */
-#define SFS 160
+#define SFS 208
#define STACK_SLOT_TRAP (SFS-4)
+#define STACK_SLOT_SHORT_PATH (SFS-8)
#define STACK_SLOT_TID (SFS-16)
#define STACK_SLOT_PSSCR (SFS-24)
#define STACK_SLOT_PID (SFS-32)
@@ -56,6 +55,10 @@
#define STACK_SLOT_DAWR (SFS-56)
#define STACK_SLOT_DAWRX (SFS-64)
#define STACK_SLOT_HFSCR (SFS-72)
+#define STACK_SLOT_AMR (SFS-80)
+#define STACK_SLOT_UAMOR (SFS-88)
+/* the following is used by the P9 short path */
+#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
/*
* Call kvmppc_hv_entry in real mode.
@@ -113,45 +116,7 @@
mtspr SPRN_SPRG_VDSO_WRITE,r3
/* Reload the host's PMU registers */
- lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
- cmpwi r4, 0
- beq 23f /* skip if not */
-BEGIN_FTR_SECTION
- ld r3, HSTATE_MMCR0(r13)
- andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
- cmpwi r4, MMCR0_PMAO
- beql kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
- lwz r3, HSTATE_PMC1(r13)
- lwz r4, HSTATE_PMC2(r13)
- lwz r5, HSTATE_PMC3(r13)
- lwz r6, HSTATE_PMC4(r13)
- lwz r8, HSTATE_PMC5(r13)
- lwz r9, HSTATE_PMC6(r13)
- mtspr SPRN_PMC1, r3
- mtspr SPRN_PMC2, r4
- mtspr SPRN_PMC3, r5
- mtspr SPRN_PMC4, r6
- mtspr SPRN_PMC5, r8
- mtspr SPRN_PMC6, r9
- ld r3, HSTATE_MMCR0(r13)
- ld r4, HSTATE_MMCR1(r13)
- ld r5, HSTATE_MMCRA(r13)
- ld r6, HSTATE_SIAR(r13)
- ld r7, HSTATE_SDAR(r13)
- mtspr SPRN_MMCR1, r4
- mtspr SPRN_MMCRA, r5
- mtspr SPRN_SIAR, r6
- mtspr SPRN_SDAR, r7
-BEGIN_FTR_SECTION
- ld r8, HSTATE_MMCR2(r13)
- ld r9, HSTATE_SIER(r13)
- mtspr SPRN_MMCR2, r8
- mtspr SPRN_SIER, r9
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- mtspr SPRN_MMCR0, r3
- isync
-23:
+ bl kvmhv_load_host_pmu
/*
* Reload DEC. HDEC interrupts were disabled when
@@ -322,17 +287,19 @@
b kvmhv_switch_to_host
/*
- * We come in here when wakened from nap mode.
- * Relocation is off and most register values are lost.
- * r13 points to the PACA.
+ * We come in here when wakened from Linux offline idle code.
+ * Relocation is off
* r3 contains the SRR1 wakeup value, SRR1 is trashed.
*/
- .globl kvm_start_guest
-kvm_start_guest:
- /* Set runlatch bit the minute you wake up from nap */
- mfspr r0, SPRN_CTRLF
- ori r0, r0, 1
- mtspr SPRN_CTRLT, r0
+_GLOBAL(idle_kvm_start_guest)
+ ld r4,PACAEMERGSP(r13)
+ mfcr r5
+ mflr r0
+ std r1,0(r4)
+ std r5,8(r4)
+ std r0,16(r4)
+ subi r1,r4,STACK_FRAME_OVERHEAD
+ SAVE_NVGPRS(r1)
/*
* Could avoid this and pass it through in r3. For now,
@@ -340,27 +307,23 @@
*/
mtspr SPRN_SRR1,r3
- ld r2,PACATOC(r13)
-
li r0,0
stb r0,PACA_FTRACE_ENABLED(r13)
li r0,KVM_HWTHREAD_IN_KVM
stb r0,HSTATE_HWTHREAD_STATE(r13)
- /* NV GPR values from power7_idle() will no longer be valid */
- li r0,1
- stb r0,PACA_NAPSTATELOST(r13)
-
- /* were we napping due to cede? */
+ /* kvm cede / napping does not come through here */
lbz r0,HSTATE_NAPPING(r13)
- cmpwi r0,NAPPING_CEDE
- beq kvm_end_cede
- cmpwi r0,NAPPING_NOVCPU
- beq kvm_novcpu_wakeup
+ twnei r0,0
- ld r1,PACAEMERGSP(r13)
- subi r1,r1,STACK_FRAME_OVERHEAD
+ b 1f
+
+kvm_unsplit_wakeup:
+ li r0, 0
+ stb r0, HSTATE_NAPPING(r13)
+
+1:
/*
* We weren't napping due to cede, so this must be a secondary
@@ -469,19 +432,25 @@
lbz r3, HSTATE_HWTHREAD_REQ(r13)
cmpwi r3, 0
bne 54f
-/*
- * We jump to pnv_wakeup_loss, which will return to the caller
- * of power7_nap in the powernv cpu offline loop. The value we
- * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss
- * requires SRR1 in r12.
- */
+
+ /*
+ * Jump to idle_return_gpr_loss, which returns to the
+ * idle_kvm_start_guest caller.
+ */
li r3, LPCR_PECE0
mfspr r4, SPRN_LPCR
rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1
mtspr SPRN_LPCR, r4
- li r3, 0
- mfspr r12,SPRN_SRR1
- b pnv_wakeup_loss
+ /* set up r3 for return */
+ mfspr r3,SPRN_SRR1
+ REST_NVGPRS(r1)
+ addi r1, r1, STACK_FRAME_OVERHEAD
+ ld r0, 16(r1)
+ ld r5, 8(r1)
+ ld r1, 0(r1)
+ mtlr r0
+ mtcr r5
+ blr
53: HMT_LOW
ld r5, HSTATE_KVM_VCORE(r13)
@@ -566,6 +535,8 @@
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq 57f
+ li r3, NAPPING_UNSPLIT
+ stb r3, HSTATE_NAPPING(r13)
li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4
mfspr r5, SPRN_LPCR
rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1)
@@ -613,11 +584,8 @@
1:
#endif
- /* Use cr7 as an indication of radix mode */
ld r5, HSTATE_KVM_VCORE(r13)
ld r9, VCORE_KVM(r5) /* pointer to struct kvm */
- lbz r0, KVM_RADIX(r9)
- cmpwi cr7, r0, 0
/*
* POWER7/POWER8 host -> guest partition switch code.
@@ -640,9 +608,6 @@
cmpwi r6,0
bne 10f
- /* Radix has already switched LPID and flushed core TLB */
- bne cr7, 22f
-
lwz r7,KVM_LPID(r9)
BEGIN_FTR_SECTION
ld r6,KVM_SDR1(r9)
@@ -654,41 +619,13 @@
mtspr SPRN_LPID,r7
isync
- /* See if we need to flush the TLB. Hash has to be done in RM */
- lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */
-BEGIN_FTR_SECTION
- /*
- * On POWER9, individual threads can come in here, but the
- * TLB is shared between the 4 threads in a core, hence
- * invalidating on one thread invalidates for all.
- * Thus we make all 4 threads use the same bit here.
- */
- clrrdi r6,r6,2
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
- clrldi r7,r6,64-6 /* extract bit number (6 bits) */
- srdi r6,r6,6 /* doubleword number */
- sldi r6,r6,3 /* address offset */
- add r6,r6,r9
- addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */
- li r8,1
- sld r8,r8,r7
- ld r7,0(r6)
- and. r7,r7,r8
- beq 22f
- /* Flush the TLB of any entries for this LPID */
- lwz r0,KVM_TLB_SETS(r9)
- mtctr r0
- li r7,0x800 /* IS field = 0b10 */
- ptesync
- li r0,0 /* RS for P9 version of tlbiel */
-28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */
- addi r7,r7,0x1000
- bdnz 28b
- ptesync
-23: ldarx r7,0,r6 /* clear the bit after TLB flushed */
- andc r7,r7,r8
- stdcx. r7,0,r6
- bne 23b
+ /* See if we need to flush the TLB. */
+ mr r3, r9 /* kvm pointer */
+ lhz r4, PACAPACAINDEX(r13) /* physical cpu number */
+ li r5, 0 /* nested vcpu pointer */
+ bl kvmppc_check_need_tlb_flush
+ nop
+ ld r5, HSTATE_KVM_VCORE(r13)
/* Add timebase offset onto timebase */
22: ld r8,VCORE_TB_OFFSET(r5)
@@ -708,8 +645,10 @@
/* Load guest PCR value to select appropriate compat mode */
37: ld r7, VCORE_PCR(r5)
- cmpdi r7, 0
+ LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+ cmpld r7, r6
beq 38f
+ or r7, r7, r6
mtspr SPRN_PCR, r7
38:
@@ -760,11 +699,9 @@
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
mfspr r7, SPRN_PID
- mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
std r7, STACK_SLOT_PID(r1)
- std r8, STACK_SLOT_IAMR(r1)
mfspr r5, SPRN_HFSCR
std r5, STACK_SLOT_HFSCR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
@@ -772,11 +709,18 @@
mfspr r5, SPRN_CIABR
mfspr r6, SPRN_DAWR
mfspr r7, SPRN_DAWRX
+ mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_CIABR(r1)
std r6, STACK_SLOT_DAWR(r1)
std r7, STACK_SLOT_DAWRX(r1)
+ std r8, STACK_SLOT_IAMR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ mfspr r5, SPRN_AMR
+ std r5, STACK_SLOT_AMR(r1)
+ mfspr r6, SPRN_UAMOR
+ std r6, STACK_SLOT_UAMOR(r1)
+
BEGIN_FTR_SECTION
/* Set partition DABR */
/* Do this before re-enabling PMU to avoid P7 DABR corruption bug */
@@ -796,66 +740,23 @@
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
- * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r4
ld r4, VCPU_MSR(r3)
+ li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv
+ nop
ld r4, HSTATE_KVM_VCPU(r13)
91:
#endif
- /* Load guest PMU registers */
- /* R4 is live here (vcpu pointer) */
- li r3, 1
- sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
- mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
- isync
-BEGIN_FTR_SECTION
- ld r3, VCPU_MMCR(r4)
- andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
- cmpwi r5, MMCR0_PMAO
- beql kvmppc_fix_pmao
-END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
- lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
- lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
- lwz r6, VCPU_PMC + 8(r4)
- lwz r7, VCPU_PMC + 12(r4)
- lwz r8, VCPU_PMC + 16(r4)
- lwz r9, VCPU_PMC + 20(r4)
- mtspr SPRN_PMC1, r3
- mtspr SPRN_PMC2, r5
- mtspr SPRN_PMC3, r6
- mtspr SPRN_PMC4, r7
- mtspr SPRN_PMC5, r8
- mtspr SPRN_PMC6, r9
- ld r3, VCPU_MMCR(r4)
- ld r5, VCPU_MMCR + 8(r4)
- ld r6, VCPU_MMCR + 16(r4)
- ld r7, VCPU_SIAR(r4)
- ld r8, VCPU_SDAR(r4)
- mtspr SPRN_MMCR1, r5
- mtspr SPRN_MMCRA, r6
- mtspr SPRN_SIAR, r7
- mtspr SPRN_SDAR, r8
-BEGIN_FTR_SECTION
- ld r5, VCPU_MMCR + 24(r4)
- ld r6, VCPU_SIER(r4)
- mtspr SPRN_MMCR2, r5
- mtspr SPRN_SIER, r6
-BEGIN_FTR_SECTION_NESTED(96)
- lwz r7, VCPU_PMC + 24(r4)
- lwz r8, VCPU_PMC + 28(r4)
- ld r9, VCPU_MMCR + 32(r4)
- mtspr SPRN_SPMC1, r7
- mtspr SPRN_SPMC2, r8
- mtspr SPRN_MMCRS, r9
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- mtspr SPRN_MMCR0, r3
- isync
+ /* Load guest PMU registers; r4 = vcpu pointer here */
+ mr r3, r4
+ bl kvmhv_load_guest_pmu
/* Load up FP, VMX and VSX registers */
+ ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_load_fp
ld r14, VCPU_GPR(R14)(r4)
@@ -892,18 +793,21 @@
mtspr SPRN_IAMR, r5
mtspr SPRN_PSPB, r6
mtspr SPRN_FSCR, r7
- ld r5, VCPU_DAWR(r4)
- ld r6, VCPU_DAWRX(r4)
- ld r7, VCPU_CIABR(r4)
- ld r8, VCPU_TAR(r4)
/*
* Handle broken DAWR case by not writing it. This means we
* can still store the DAWR register for migration.
*/
-BEGIN_FTR_SECTION
+ LOAD_REG_ADDR(r5, dawr_force_enable)
+ lbz r5, 0(r5)
+ cmpdi r5, 0
+ beq 1f
+ ld r5, VCPU_DAWR(r4)
+ ld r6, VCPU_DAWRX(r4)
mtspr SPRN_DAWR, r5
mtspr SPRN_DAWRX, r6
-END_FTR_SECTION_IFSET(CPU_FTR_DAWR)
+1:
+ ld r7, VCPU_CIABR(r4)
+ ld r8, VCPU_TAR(r4)
mtspr SPRN_CIABR, r7
mtspr SPRN_TAR, r8
ld r5, VCPU_IC(r4)
@@ -1039,17 +943,29 @@
#ifdef CONFIG_KVM_XICS
/* We are entering the guest on that thread, push VCPU to XIVE */
- ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
- cmpldi cr0, r10, 0
- beq no_xive
ld r11, VCPU_XIVE_SAVED_STATE(r4)
li r9, TM_QW1_OS
+ lwz r8, VCPU_XIVE_CAM_WORD(r4)
+ cmpwi r8, 0
+ beq no_xive
+ li r7, TM_QW1_OS + TM_WORD2
+ mfmsr r0
+ andi. r0, r0, MSR_DR /* in real mode? */
+ beq 2f
+ ld r10, HSTATE_XIVE_TIMA_VIRT(r13)
+ cmpldi cr1, r10, 0
+ beq cr1, no_xive
+ eieio
+ stdx r11,r9,r10
+ stwx r8,r7,r10
+ b 3f
+2: ld r10, HSTATE_XIVE_TIMA_PHYS(r13)
+ cmpldi cr1, r10, 0
+ beq cr1, no_xive
eieio
stdcix r11,r9,r10
- lwz r11, VCPU_XIVE_CAM_WORD(r4)
- li r9, TM_QW1_OS + TM_WORD2
- stwcix r11,r9,r10
- li r9, 1
+ stwcix r8,r7,r10
+3: li r9, 1
stb r9, VCPU_XIVE_PUSHED(r4)
eieio
@@ -1068,12 +984,16 @@
* on, we mask it.
*/
lbz r0, VCPU_XIVE_ESC_ON(r4)
- cmpwi r0,0
- beq 1f
- ld r10, VCPU_XIVE_ESC_RADDR(r4)
+ cmpwi cr1, r0,0
+ beq cr1, 1f
li r9, XIVE_ESB_SET_PQ_01
+ beq 4f /* in real mode? */
+ ld r10, VCPU_XIVE_ESC_VADDR(r4)
+ ldx r0, r10, r9
+ b 5f
+4: ld r10, VCPU_XIVE_ESC_RADDR(r4)
ldcix r0, r10, r9
- sync
+5: sync
/* We have a possible subtle race here: The escalation interrupt might
* have fired and be on its way to the host queue while we mask it,
@@ -1100,73 +1020,40 @@
no_xive:
#endif /* CONFIG_KVM_XICS */
-deliver_guest_interrupt:
- ld r6, VCPU_CTR(r4)
- ld r7, VCPU_XER(r4)
+ li r0, 0
+ stw r0, STACK_SLOT_SHORT_PATH(r1)
- mtctr r6
- mtxer r7
-
-kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
- ld r10, VCPU_PC(r4)
- ld r11, VCPU_MSR(r4)
+deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
+ /* Check if we can deliver an external or decrementer interrupt now */
+ ld r0, VCPU_PENDING_EXC(r4)
+BEGIN_FTR_SECTION
+ /* On POWER9, also check for emulated doorbell interrupt */
+ lbz r3, VCPU_DBELL_REQ(r4)
+ or r0, r0, r3
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ cmpdi r0, 0
+ beq 71f
+ mr r3, r4
+ bl kvmppc_guest_entry_inject_int
+ ld r4, HSTATE_KVM_VCPU(r13)
+71:
ld r6, VCPU_SRR0(r4)
ld r7, VCPU_SRR1(r4)
mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
+fast_guest_entry_c:
+ ld r10, VCPU_PC(r4)
+ ld r11, VCPU_MSR(r4)
/* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG
ori r11, r11, MSR_ME
- /* Check if we can deliver an external or decrementer interrupt now */
- ld r0, VCPU_PENDING_EXC(r4)
- rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
- cmpdi cr1, r0, 0
- andi. r8, r11, MSR_EE
- mfspr r8, SPRN_LPCR
- /* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
- rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
- mtspr SPRN_LPCR, r8
- isync
- beq 5f
- li r0, BOOK3S_INTERRUPT_EXTERNAL
- bne cr1, 12f
- mfspr r0, SPRN_DEC
-BEGIN_FTR_SECTION
- /* On POWER9 check whether the guest has large decrementer enabled */
- andis. r8, r8, LPCR_LD@h
- bne 15f
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
- extsw r0, r0
-15: cmpdi r0, 0
- li r0, BOOK3S_INTERRUPT_DECREMENTER
- bge 5f
-
-12: mtspr SPRN_SRR0, r10
- mr r10,r0
- mtspr SPRN_SRR1, r11
- mr r9, r4
- bl kvmppc_msr_interrupt
-5:
-BEGIN_FTR_SECTION
- b fast_guest_return
-END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
- /* On POWER9, check for pending doorbell requests */
- lbz r0, VCPU_DBELL_REQ(r4)
- cmpwi r0, 0
- beq fast_guest_return
- ld r5, HSTATE_KVM_VCORE(r13)
- /* Set DPDES register so the CPU will take a doorbell interrupt */
- li r0, 1
- mtspr SPRN_DPDES, r0
- std r0, VCORE_DPDES(r5)
- /* Make sure other cpus see vcore->dpdes set before dbell req clear */
- lwsync
- /* Clear the pending doorbell request */
- li r0, 0
- stb r0, VCPU_DBELL_REQ(r4)
+ ld r6, VCPU_CTR(r4)
+ ld r7, VCPU_XER(r4)
+ mtctr r6
+ mtxer r7
/*
* Required state:
@@ -1202,16 +1089,10 @@
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r5, VCPU_LR(r4)
- lwz r6, VCPU_CR(r4)
mtlr r5
- mtcr r6
ld r1, VCPU_GPR(R1)(r4)
- ld r2, VCPU_GPR(R2)(r4)
- ld r3, VCPU_GPR(R3)(r4)
ld r5, VCPU_GPR(R5)(r4)
- ld r6, VCPU_GPR(R6)(r4)
- ld r7, VCPU_GPR(R7)(r4)
ld r8, VCPU_GPR(R8)(r4)
ld r9, VCPU_GPR(R9)(r4)
ld r10, VCPU_GPR(R10)(r4)
@@ -1229,10 +1110,119 @@
mtspr SPRN_HDSISR, r0
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+ ld r6, VCPU_KVM(r4)
+ lbz r7, KVM_SECURE_GUEST(r6)
+ cmpdi r7, 0
+ ld r6, VCPU_GPR(R6)(r4)
+ ld r7, VCPU_GPR(R7)(r4)
+ bne ret_to_ultra
+
+ lwz r0, VCPU_CR(r4)
+ mtcr r0
+
ld r0, VCPU_GPR(R0)(r4)
+ ld r2, VCPU_GPR(R2)(r4)
+ ld r3, VCPU_GPR(R3)(r4)
ld r4, VCPU_GPR(R4)(r4)
HRFI_TO_GUEST
b .
+/*
+ * Use the UV_RETURN ultracall to return control to the Ultravisor after
+ * processing a hypercall or interrupt that was forwarded (a.k.a. reflected)
+ * to the Hypervisor.
+ *
+ * All registers have already been loaded, except:
+ * R0 = hcall result
+ * R2 = SRR1, so UV can detect a synthesized interrupt (if any)
+ * R3 = UV_RETURN
+ */
+ret_to_ultra:
+ lwz r0, VCPU_CR(r4)
+ mtcr r0
+
+ ld r0, VCPU_GPR(R3)(r4)
+ mfspr r2, SPRN_SRR1
+ li r3, 0
+ ori r3, r3, UV_RETURN
+ ld r4, VCPU_GPR(R4)(r4)
+ sc 2
+
+/*
+ * Enter the guest on a P9 or later system where we have exactly
+ * one vcpu per vcore and we don't need to go to real mode
+ * (which implies that host and guest are both using radix MMU mode).
+ * r3 = vcpu pointer
+ * Most SPRs and all the VSRs have been loaded already.
+ */
+_GLOBAL(__kvmhv_vcpu_entry_p9)
+EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -SFS(r1)
+
+ li r0, 1
+ stw r0, STACK_SLOT_SHORT_PATH(r1)
+
+ std r3, HSTATE_KVM_VCPU(r13)
+ mfcr r4
+ stw r4, SFS+8(r1)
+
+ std r1, HSTATE_HOST_R1(r13)
+
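+	/* Save the host's non-volatile GPRs (r14-r31) into the stack frame */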
+ reg = 14
+ .rept 18
+ std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+ reg = reg + 1
+ .endr
+
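+	/* Load the guest's non-volatile GPRs (r14-r31) from the vcpu struct */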
+ reg = 14
+ .rept 18
+ ld reg, __VCPU_GPR(reg)(r3)
+ reg = reg + 1
+ .endr
+
+ mfmsr r10
+ std r10, HSTATE_HOST_MSR(r13)
+
+ mr r4, r3
+ b fast_guest_entry_c
+guest_exit_short_path:
+
+ li r0, KVM_GUEST_MODE_NONE
+ stb r0, HSTATE_IN_GUEST(r13)
+
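+	/* Save the guest's non-volatile GPRs (r14-r31) back to the vcpu struct */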
+ reg = 14
+ .rept 18
+ std reg, __VCPU_GPR(reg)(r9)
+ reg = reg + 1
+ .endr
+
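+	/* Restore the host's non-volatile GPRs (r14-r31) from the stack frame */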
+ reg = 14
+ .rept 18
+ ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
+ reg = reg + 1
+ .endr
+
+ lwz r4, SFS+8(r1)
+ mtcr r4
+
+ mr r3, r12 /* trap number */
+
+ addi r1, r1, SFS
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
+
+ /* If we are in real mode, do a rfid to get back to the caller */
+ mfmsr r4
+ andi. r5, r4, MSR_IR
+ bnelr
+ rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
+ mtspr SPRN_SRR0, r0
+ ld r10, HSTATE_HOST_MSR(r13)
+ rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+ mtspr SPRN_SRR1, r10
+ RFI_TO_KERNEL
+ b .
secondary_too_late:
li r12, 0
@@ -1313,7 +1303,7 @@
std r3, VCPU_GPR(R12)(r9)
/* CR is in the high half of r12 */
srdi r4, r12, 32
- stw r4, VCPU_CR(r9)
+ std r4, VCPU_CR(r9)
BEGIN_FTR_SECTION
ld r3, HSTATE_CFAR(r13)
std r3, VCPU_CFAR(r9)
@@ -1387,18 +1377,26 @@
std r3, VCPU_CTR(r9)
std r4, VCPU_XER(r9)
+ /* Save more register state */
+ mfdar r3
+ mfdsisr r4
+ std r3, VCPU_DAR(r9)
+ stw r4, VCPU_DSISR(r9)
+
+ /* If this is a page table miss then see if it's theirs or ours */
+ cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
+ beq kvmppc_hdsi
+ std r3, VCPU_FAULT_DAR(r9)
+ stw r4, VCPU_FAULT_DSISR(r9)
+ cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
+ beq kvmppc_hisi
+
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* For softpatch interrupt, go off and do TM instruction emulation */
cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
beq kvmppc_tm_emul
#endif
- /* If this is a page table miss then see if it's theirs or ours */
- cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
- beq kvmppc_hdsi
- cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
- beq kvmppc_hisi
-
/* See if this is a leftover HDEC interrupt */
cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
bne 2f
@@ -1418,10 +1416,14 @@
BEGIN_FTR_SECTION
PPC_MSGSYNC
lwsync
+ /* always exit if we're running a nested guest */
+ ld r0, VCPU_NESTED(r9)
+ cmpdi r0, 0
+ bne guest_exit_cont
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
- beq 4f
+ beq maybe_reenter_guest
b guest_exit_cont
3:
/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
@@ -1433,82 +1435,16 @@
14:
/* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
- bne+ guest_exit_cont
-
- /* External interrupt, first check for host_ipi. If this is
- * set, we know the host wants us out so let's do it now
- */
- bl kvmppc_read_intr
-
- /*
- * Restore the active volatile registers after returning from
- * a C function.
- */
- ld r9, HSTATE_KVM_VCPU(r13)
- li r12, BOOK3S_INTERRUPT_EXTERNAL
-
- /*
- * kvmppc_read_intr return codes:
- *
- * Exit to host (r3 > 0)
- * 1 An interrupt is pending that needs to be handled by the host
- * Exit guest and return to host by branching to guest_exit_cont
- *
- * 2 Passthrough that needs completion in the host
- * Exit guest and return to host by branching to guest_exit_cont
- * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
- * to indicate to the host to complete handling the interrupt
- *
- * Before returning to guest, we check if any CPU is heading out
- * to the host and if so, we head out also. If no CPUs are heading
- * check return values <= 0.
- *
- * Return to guest (r3 <= 0)
- * 0 No external interrupt is pending
- * -1 A guest wakeup IPI (which has now been cleared)
- * In either case, we return to guest to deliver any pending
- * guest interrupts.
- *
- * -2 A PCI passthrough external interrupt was handled
- * (interrupt was delivered directly to guest)
- * Return to guest to deliver any pending guest interrupts.
- */
-
- cmpdi r3, 1
- ble 1f
-
- /* Return code = 2 */
- li r12, BOOK3S_INTERRUPT_HV_RM_HARD
- stw r12, VCPU_TRAP(r9)
- b guest_exit_cont
-
-1: /* Return code <= 1 */
- cmpdi r3, 0
- bgt guest_exit_cont
-
- /* Return code <= 0 */
-4: ld r5, HSTATE_KVM_VCORE(r13)
- lwz r0, VCORE_ENTRY_EXIT(r5)
- cmpwi r0, 0x100
- mr r4, r9
- blt deliver_guest_interrupt
-
-guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
- /* Save more register state */
- mfdar r6
- mfdsisr r7
- std r6, VCPU_DAR(r9)
- stw r7, VCPU_DSISR(r9)
- /* don't overwrite fault_dar/fault_dsisr if HDSI */
- cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
- beq mc_cont
- std r6, VCPU_FAULT_DAR(r9)
- stw r7, VCPU_FAULT_DSISR(r9)
-
+ beq kvmppc_guest_external
/* See if it is a machine check */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_realmode
-mc_cont:
+ /* Or a hypervisor maintenance interrupt */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ beq hmi_realmode
+
+guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
+
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMEXIT
mr r4, r9
@@ -1552,6 +1488,18 @@
1:
#endif /* CONFIG_KVM_XICS */
+ /*
+ * Possibly flush the link stack here, before we do a blr in
+ * guest_exit_short_path.
+ */
+1: nop
+ patch_site 1b patch__call_kvm_flush_link_stack
+
+ /* If we came in through the P9 short path, go back out to C now */
+ lwz r0, STACK_SLOT_SHORT_PATH(r1)
+ cmpwi r0, 0
+ bne guest_exit_short_path
+
/* For hash guest, read the guest SLB and save it away */
ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5)
@@ -1713,22 +1661,25 @@
mtspr SPRN_PSPB, r0
mtspr SPRN_WORT, r0
BEGIN_FTR_SECTION
- mtspr SPRN_IAMR, r0
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
sldi r0, r0, 31
mtspr SPRN_MMCRS, r0
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
-8:
- /* Save and reset AMR and UAMOR before turning on the MMU */
+ /* Save and restore AMR, IAMR and UAMOR before turning on the MMU */
+ ld r8, STACK_SLOT_IAMR(r1)
+ mtspr SPRN_IAMR, r8
+
+8: /* Power7 jumps back in here */
mfspr r5,SPRN_AMR
mfspr r6,SPRN_UAMOR
std r5,VCPU_AMR(r9)
std r6,VCPU_UAMOR(r9)
- li r6,0
- mtspr SPRN_AMR,r6
+ ld r5,STACK_SLOT_AMR(r1)
+ ld r6,STACK_SLOT_UAMOR(r1)
+ mtspr SPRN_AMR, r5
mtspr SPRN_UAMOR, r6
/* Switch DSCR back to host value */
@@ -1780,11 +1731,13 @@
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
- * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r9
ld r4, VCPU_MSR(r3)
+ li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv
+ nop
ld r9, HSTATE_KVM_VCPU(r13)
91:
#endif
@@ -1802,83 +1755,12 @@
25:
/* Save PMU registers if requested */
/* r8 and cr0.eq are live here */
-BEGIN_FTR_SECTION
- /*
- * POWER8 seems to have a hardware bug where setting
- * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
- * when some counters are already negative doesn't seem
- * to cause a performance monitor alert (and hence interrupt).
- * The effect of this is that when saving the PMU state,
- * if there is no PMU alert pending when we read MMCR0
- * before freezing the counters, but one becomes pending
- * before we read the counters, we lose it.
- * To work around this, we need a way to freeze the counters
- * before reading MMCR0. Normally, freezing the counters
- * is done by writing MMCR0 (to set MMCR0[FC]) which
- * unavoidably writes MMCR0[PMA0] as well. On POWER8,
- * we can also freeze the counters using MMCR2, by writing
- * 1s to all the counter freeze condition bits (there are
- * 9 bits each for 6 counters).
- */
- li r3, -1 /* set all freeze bits */
- clrrdi r3, r3, 10
- mfspr r10, SPRN_MMCR2
- mtspr SPRN_MMCR2, r3
- isync
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- li r3, 1
- sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
- mfspr r4, SPRN_MMCR0 /* save MMCR0 */
- mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
- mfspr r6, SPRN_MMCRA
- /* Clear MMCRA in order to disable SDAR updates */
- li r7, 0
- mtspr SPRN_MMCRA, r7
- isync
+ mr r3, r9
+ li r4, 1
beq 21f /* if no VPA, save PMU stuff anyway */
- lbz r7, LPPACA_PMCINUSE(r8)
- cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */
- bne 21f
- std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
- b 22f
-21: mfspr r5, SPRN_MMCR1
- mfspr r7, SPRN_SIAR
- mfspr r8, SPRN_SDAR
- std r4, VCPU_MMCR(r9)
- std r5, VCPU_MMCR + 8(r9)
- std r6, VCPU_MMCR + 16(r9)
-BEGIN_FTR_SECTION
- std r10, VCPU_MMCR + 24(r9)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- std r7, VCPU_SIAR(r9)
- std r8, VCPU_SDAR(r9)
- mfspr r3, SPRN_PMC1
- mfspr r4, SPRN_PMC2
- mfspr r5, SPRN_PMC3
- mfspr r6, SPRN_PMC4
- mfspr r7, SPRN_PMC5
- mfspr r8, SPRN_PMC6
- stw r3, VCPU_PMC(r9)
- stw r4, VCPU_PMC + 4(r9)
- stw r5, VCPU_PMC + 8(r9)
- stw r6, VCPU_PMC + 12(r9)
- stw r7, VCPU_PMC + 16(r9)
- stw r8, VCPU_PMC + 20(r9)
-BEGIN_FTR_SECTION
- mfspr r5, SPRN_SIER
- std r5, VCPU_SIER(r9)
-BEGIN_FTR_SECTION_NESTED(96)
- mfspr r6, SPRN_SPMC1
- mfspr r7, SPRN_SPMC2
- mfspr r8, SPRN_MMCRS
- stw r6, VCPU_PMC + 24(r9)
- stw r7, VCPU_PMC + 28(r9)
- std r8, VCPU_MMCR + 32(r9)
- lis r4, 0x8000
- mtspr SPRN_MMCRS, r4
-END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
-22:
+ lbz r4, LPPACA_PMCINUSE(r8)
+21: bl kvmhv_save_guest_pmu
+ ld r9, HSTATE_KVM_VCPU(r13)
/* Restore host values of some registers */
BEGIN_FTR_SECTION
@@ -1897,11 +1779,9 @@
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
ld r7, STACK_SLOT_PID(r1)
- ld r8, STACK_SLOT_IAMR(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
mtspr SPRN_PID, r7
- mtspr SPRN_IAMR, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
#ifdef CONFIG_PPC_RADIX_MMU
@@ -2010,24 +1890,6 @@
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
- /* If HMI, call kvmppc_realmode_hmi_handler() */
- lwz r12, STACK_SLOT_TRAP(r1)
- cmpwi r12, BOOK3S_INTERRUPT_HMI
- bne 27f
- bl kvmppc_realmode_hmi_handler
- nop
- cmpdi r3, 0
- /*
- * At this point kvmppc_realmode_hmi_handler may have resync-ed
- * the TB, and if it has, we must not subtract the guest timebase
- * offset from the timebase. So, skip it.
- *
- * Also, do not call kvmppc_subcore_exit_guest() because it has
- * been invoked as part of kvmppc_realmode_hmi_handler().
- */
- beq 30f
-
-27:
/* Subtract timebase offset from timebase */
ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpdi r8,0
@@ -2045,19 +1907,29 @@
addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8
-17: bl kvmppc_subcore_exit_guest
+17:
+ /*
+ * If this is an HMI, we called kvmppc_realmode_hmi_handler
+ * above, which may or may not have already called
+ * kvmppc_subcore_exit_guest. Fortunately, all that
+ * kvmppc_subcore_exit_guest does is clear a flag, so calling
+ * it again here is benign even if kvmppc_realmode_hmi_handler
+ * has already called it.
+ */
+ bl kvmppc_subcore_exit_guest
nop
30: ld r5,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
/* Reset PCR */
ld r0, VCORE_PCR(r5)
- cmpdi r0, 0
+ LOAD_REG_IMMEDIATE(r6, PCR_MASK)
+ cmpld r0, r6
beq 18f
- li r0, 0
- mtspr SPRN_PCR, r0
+ mtspr SPRN_PCR, r6
18:
/* Signal secondary CPUs to continue */
+ li r0, 0
stb r0,VCORE_IN_GUEST(r5)
19: lis r8,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC,r8
@@ -2099,6 +1971,89 @@
mtlr r0
blr
+.balign 32
+.global kvm_flush_link_stack
+kvm_flush_link_stack:
+ /* Save LR into r0 */
+ mflr r0
+
+ /* Flush the link stack. On Power8 it's up to 32 entries in size. */
+ .rept 32
+ bl .+4
+ .endr
+
+ /* And on Power9 it's up to 64. */
+BEGIN_FTR_SECTION
+ .rept 32
+ bl .+4
+ .endr
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+
+ /* Restore LR */
+ mtlr r0
+ blr
+
+kvmppc_guest_external:
+ /* External interrupt, first check for host_ipi. If this is
+ * set, we know the host wants us out so let's do it now
+ */
+ bl kvmppc_read_intr
+
+ /*
+ * Restore the active volatile registers after returning from
+ * a C function.
+ */
+ ld r9, HSTATE_KVM_VCPU(r13)
+ li r12, BOOK3S_INTERRUPT_EXTERNAL
+
+ /*
+ * kvmppc_read_intr return codes:
+ *
+ * Exit to host (r3 > 0)
+ * 1 An interrupt is pending that needs to be handled by the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ *
+ * 2 Passthrough that needs completion in the host
+ * Exit guest and return to host by branching to guest_exit_cont
+ * However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
+ * to indicate to the host to complete handling the interrupt
+ *
+ * Before returning to guest, we check if any CPU is heading out
+ * to the host and if so, we head out also. If no CPU is heading
+ * out, we handle return values <= 0 as described below.
+ *
+ * Return to guest (r3 <= 0)
+ * 0 No external interrupt is pending
+ * -1 A guest wakeup IPI (which has now been cleared)
+ * In either case, we return to guest to deliver any pending
+ * guest interrupts.
+ *
+ * -2 A PCI passthrough external interrupt was handled
+ * (interrupt was delivered directly to guest)
+ * Return to guest to deliver any pending guest interrupts.
+ */
+
+ cmpdi r3, 1
+ ble 1f
+
+ /* Return code = 2 */
+ li r12, BOOK3S_INTERRUPT_HV_RM_HARD
+ stw r12, VCPU_TRAP(r9)
+ b guest_exit_cont
+
+1: /* Return code <= 1 */
+ cmpdi r3, 0
+ bgt guest_exit_cont
+
+ /* Return code <= 0 */
+maybe_reenter_guest:
+ ld r5, HSTATE_KVM_VCORE(r13)
+ lwz r0, VCORE_ENTRY_EXIT(r5)
+ cmpwi r0, 0x100
+ mr r4, r9
+ blt deliver_guest_interrupt
+ b guest_exit_cont
+
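+
As a rough sketch (not part of the applied patch), the return-code protocol documented above can be restated in C; sketch_handle_read_intr() is a hypothetical name, while kvmppc_read_intr() and the BOOK3S_INTERRUPT_* values are the real ones used by the assembly above.

	/* Hedged sketch of how the kvmppc_read_intr() result is acted upon. */
	static int sketch_handle_read_intr(long rc, int *trap)
	{
		if (rc == 2) {			/* passthrough: host must complete it */
			*trap = BOOK3S_INTERRUPT_HV_RM_HARD;
			return 1;		/* exit to host */
		}
		if (rc == 1)			/* host interrupt pending */
			return 1;		/* exit to host, trap stays EXTERNAL */
		return 0;			/* 0, -1 or -2: try to re-enter guest */
	}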
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Softpatch interrupt for transactional memory emulation cases
@@ -2302,6 +2257,10 @@
andi. r0,r11,MSR_PR
/* sc 1 from userspace - reflect to guest syscall */
bne sc_1_fast_return
+ /* sc 1 from nested guest - give it to L1 to handle */
+ ld r0, VCPU_NESTED(r9)
+ cmpdi r0, 0
+ bne guest_exit_cont
clrrdi r3,r3,2
cmpldi r3,hcall_real_table_end - hcall_real_table
bge guest_exit_cont
@@ -2359,11 +2318,16 @@
.long DOTSYM(kvmppc_h_clear_mod) - hcall_real_table
.long DOTSYM(kvmppc_h_clear_ref) - hcall_real_table
.long DOTSYM(kvmppc_h_protect) - hcall_real_table
+#ifdef CONFIG_SPAPR_TCE_IOMMU
.long DOTSYM(kvmppc_h_get_tce) - hcall_real_table
.long DOTSYM(kvmppc_rm_h_put_tce) - hcall_real_table
+#else
+ .long 0 /* 0x1c */
+ .long 0 /* 0x20 */
+#endif
.long 0 /* 0x24 - H_SET_SPRG0 */
.long DOTSYM(kvmppc_h_set_dabr) - hcall_real_table
- .long 0 /* 0x2c */
+ .long DOTSYM(kvmppc_rm_h_page_init) - hcall_real_table
.long 0 /* 0x30 */
.long 0 /* 0x34 */
.long 0 /* 0x38 */
@@ -2438,8 +2402,13 @@
.long 0 /* 0x12c */
.long 0 /* 0x130 */
.long DOTSYM(kvmppc_h_set_xdabr) - hcall_real_table
+#ifdef CONFIG_SPAPR_TCE_IOMMU
.long DOTSYM(kvmppc_rm_h_stuff_tce) - hcall_real_table
.long DOTSYM(kvmppc_rm_h_put_tce_indirect) - hcall_real_table
+#else
+ .long 0 /* 0x138 */
+ .long 0 /* 0x13c */
+#endif
.long 0 /* 0x140 */
.long 0 /* 0x144 */
.long 0 /* 0x148 */
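As a hedged aside (not part of the patch): hcall_real_table is indexed directly by the hcall number (entries are 4 bytes apart, matching the 4-apart hcall numbers), which is why the entries disabled by CONFIG_SPAPR_TCE_IOMMU above are replaced with ".long 0" placeholders rather than removed. Roughly, in C, with sketch_hcall_lookup() as a hypothetical name:

	static unsigned long sketch_hcall_lookup(unsigned int hcall_nr,
						 const int *table, unsigned long base,
						 unsigned int nentries)
	{
		unsigned int idx = hcall_nr / 4;	/* hcall numbers advance by 4 */

		if (idx >= nentries || table[idx] == 0)
			return 0;			/* not handled in real mode */
		return base + table[idx];		/* entries are offsets from base */
	}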
@@ -2561,6 +2530,7 @@
hcall_real_table_end:
_GLOBAL(kvmppc_h_set_xdabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
andi. r0, r5, DABRX_USER | DABRX_KERNEL
beq 6f
li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
@@ -2570,6 +2540,7 @@
blr
_GLOBAL(kvmppc_h_set_dabr)
+EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
li r5, DABRX_USER | DABRX_KERNEL
3:
BEGIN_FTR_SECTION
@@ -2588,20 +2559,31 @@
blr
2:
-BEGIN_FTR_SECTION
- /* POWER9 with disabled DAWR */
+ LOAD_REG_ADDR(r11, dawr_force_enable)
+ lbz r11, 0(r11)
+ cmpdi r11, 0
+ bne 3f
li r3, H_HARDWARE
blr
-END_FTR_SECTION_IFCLR(CPU_FTR_DAWR)
+3:
/* Emulate H_SET_DABR/X on P8 for the sake of compat mode guests */
rlwimi r5, r4, 5, DAWRX_DR | DAWRX_DW
rlwimi r5, r4, 2, DAWRX_WT
clrrdi r4, r4, 3
std r4, VCPU_DAWR(r3)
std r5, VCPU_DAWRX(r3)
+ /*
+ * If we came in through the real mode hcall handler then it is necessary
+ * to write the registers since the return path won't. Otherwise it is
+ * sufficient to store them in the vcpu struct as they will be loaded
+ * next time the vcpu is run.
+ */
+ mfmsr r6
+ andi. r6, r6, MSR_DR /* in real mode? */
+ bne 4f
mtspr SPRN_DAWR, r4
mtspr SPRN_DAWRX, r5
- li r3, 0
+4: li r3, 0
blr
_GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
@@ -2682,11 +2664,13 @@
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
- * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
ld r3, HSTATE_KVM_VCPU(r13)
ld r4, VCPU_MSR(r3)
+ li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv
+ nop
91:
#endif
@@ -2727,6 +2711,9 @@
lis r3, LPCR_PECEDP@h /* Do wake on privileged doorbell */
+ /* Go back to host stack */
+ ld r1, HSTATE_HOST_R1(r13)
+
/*
* Take a nap until a decrementer or external or doorbell interrupt
* occurs, with PECE1 and PECE0 set in LPCR.
@@ -2755,26 +2742,42 @@
* requested level = 0 (just stop dispatching)
*/
lis r3, (PSSCR_EC | PSSCR_ESL)@h
- mtspr SPRN_PSSCR, r3
/* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */
li r4, LPCR_PECE_HVEE@higher
sldi r4, r4, 32
or r5, r5, r4
-END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
+FTR_SECTION_ELSE
+ li r3, PNV_THREAD_NAP
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mtspr SPRN_LPCR,r5
isync
- li r0, 0
- std r0, HSTATE_SCRATCH0(r13)
- ptesync
- ld r0, HSTATE_SCRATCH0(r13)
-1: cmpd r0, r0
- bne 1b
+
BEGIN_FTR_SECTION
- nap
+ bl isa300_idle_stop_mayloss
FTR_SECTION_ELSE
- PPC_STOP
-ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
- b .
+ bl isa206_idle_insn_mayloss
+ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
+
+ mfspr r0, SPRN_CTRLF
+ ori r0, r0, 1
+ mtspr SPRN_CTRLT, r0
+
+ mtspr SPRN_SRR1, r3
+
+ li r0, 0
+ stb r0, PACA_FTRACE_ENABLED(r13)
+
+ li r0, KVM_HWTHREAD_IN_KVM
+ stb r0, HSTATE_HWTHREAD_STATE(r13)
+
+ lbz r0, HSTATE_NAPPING(r13)
+ cmpwi r0, NAPPING_CEDE
+ beq kvm_end_cede
+ cmpwi r0, NAPPING_NOVCPU
+ beq kvm_novcpu_wakeup
+ cmpwi r0, NAPPING_UNSPLIT
+ beq kvm_unsplit_wakeup
+ twi 31,0,0 /* Nap state must not be zero */
33: mr r4, r3
li r3, 0
@@ -2782,12 +2785,11 @@
b 34f
kvm_end_cede:
+ /* Woken by external or decrementer interrupt */
+
/* get vcpu pointer */
ld r4, HSTATE_KVM_VCPU(r13)
- /* Woken by external or decrementer interrupt */
- ld r1, HSTATE_HOST_R1(r13)
-
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r4, VCPU_TB_RMINTR
bl kvmhv_accumulate_time
@@ -2802,11 +2804,13 @@
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
- * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
+ * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r4
ld r4, VCPU_MSR(r3)
+ li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv
+ nop
ld r4, HSTATE_KVM_VCPU(r13)
91:
#endif
@@ -2874,13 +2878,7 @@
mr r9, r4
cmpdi r3, 0
bgt guest_exit_cont
-
- /* see if any other thread is already exiting */
- lwz r0,VCORE_ENTRY_EXIT(r5)
- cmpwi r0,0x100
- bge guest_exit_cont
-
- b kvmppc_cede_reentry /* if not go back to guest */
+ b maybe_reenter_guest
/* cede when already previously prodded case */
kvm_cede_prodded:
@@ -2895,75 +2893,66 @@
kvm_cede_exit:
ld r9, HSTATE_KVM_VCPU(r13)
#ifdef CONFIG_KVM_XICS
- /* Abort if we still have a pending escalation */
- lbz r5, VCPU_XIVE_ESC_ON(r9)
- cmpwi r5, 0
- beq 1f
- li r0, 0
- stb r0, VCPU_CEDED(r9)
-1: /* Enable XIVE escalation */
- li r5, XIVE_ESB_SET_PQ_00
- mfmsr r0
- andi. r0, r0, MSR_DR /* in real mode? */
- beq 1f
+ /* are we using XIVE with single escalation? */
ld r10, VCPU_XIVE_ESC_VADDR(r9)
cmpdi r10, 0
beq 3f
- ldx r0, r10, r5
+ li r6, XIVE_ESB_SET_PQ_00
+ /*
+ * If we still have a pending escalation, abort the cede,
+ * and we must set PQ to 10 rather than 00 so that we don't
+ * potentially end up with two entries for the escalation
+ * interrupt in the XIVE interrupt queue. In that case
+ * we also don't want to set xive_esc_on to 1 here in
+ * case we race with xive_esc_irq().
+ */
+ lbz r5, VCPU_XIVE_ESC_ON(r9)
+ cmpwi r5, 0
+ beq 4f
+ li r0, 0
+ stb r0, VCPU_CEDED(r9)
+ li r6, XIVE_ESB_SET_PQ_10
+ b 5f
+4: li r0, 1
+ stb r0, VCPU_XIVE_ESC_ON(r9)
+ /* make sure store to xive_esc_on is seen before xive_esc_irq runs */
+ sync
+5: /* Enable XIVE escalation */
+ mfmsr r0
+ andi. r0, r0, MSR_DR /* in real mode? */
+ beq 1f
+ ldx r0, r10, r6
b 2f
1: ld r10, VCPU_XIVE_ESC_RADDR(r9)
- cmpdi r10, 0
- beq 3f
- ldcix r0, r10, r5
+ ldcix r0, r10, r6
2: sync
- li r0, 1
- stb r0, VCPU_XIVE_ESC_ON(r9)
#endif /* CONFIG_KVM_XICS */
3: b guest_exit_cont
- /* Try to handle a machine check in real mode */
+ /* Try to do machine check recovery in real mode */
machine_check_realmode:
mr r3, r9 /* get vcpu pointer */
bl kvmppc_realmode_machine_check
nop
+ /* all machine checks go to virtual mode for further handling */
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
- /*
- * For the guest that is FWNMI capable, deliver all the MCE errors
- * (handled/unhandled) by exiting the guest with KVM_EXIT_NMI exit
- * reason. This new approach injects machine check errors in guest
- * address space to guest with additional information in the form
- * of RTAS event, thus enabling guest kernel to suitably handle
- * such errors.
- *
- * For the guest that is not FWNMI capable (old QEMU) fallback
- * to old behaviour for backward compatibility:
- * Deliver unhandled/fatal (e.g. UE) MCE errors to guest either
- * through machine check interrupt (set HSRR0 to 0x200).
- * For handled errors (no-fatal), just go back to guest execution
- * with current HSRR0.
- * if we receive machine check with MSR(RI=0) then deliver it to
- * guest as machine check causing guest to crash.
- */
- ld r11, VCPU_MSR(r9)
- rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
- bne mc_cont /* if so, exit to host */
- /* Check if guest is capable of handling NMI exit */
- ld r10, VCPU_KVM(r9)
- lbz r10, KVM_FWNMI(r10)
- cmpdi r10, 1 /* FWNMI capable? */
- beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
+ b guest_exit_cont
- /* if not, fall through for backward compatibility. */
- andi. r10, r11, MSR_RI /* check for unrecoverable exception */
- beq 1f /* Deliver a machine check to guest */
- ld r10, VCPU_PC(r9)
- cmpdi r3, 0 /* Did we handle MCE ? */
- bne 2f /* Continue guest execution. */
- /* If not, deliver a machine check. SRR0/1 are already set */
-1: li r10, BOOK3S_INTERRUPT_MACHINE_CHECK
- bl kvmppc_msr_interrupt
-2: b fast_interrupt_c_return
+/*
+ * Call C code to handle an HMI in real mode.
+ * Only the primary thread does the call; secondary threads are handled
+ * by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
+ * r9 points to the vcpu on entry
+ */
+hmi_realmode:
+ lbz r0, HSTATE_PTID(r13)
+ cmpwi r0, 0
+ bne guest_exit_cont
+ bl kvmppc_realmode_hmi_handler
+ ld r9, HSTATE_KVM_VCPU(r13)
+ li r12, BOOK3S_INTERRUPT_HMI
+ b guest_exit_cont
/*
* Check the reason we woke from nap, and take appropriate action.
@@ -3130,10 +3119,12 @@
* Save transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct and r4 containing
* the guest MSR value.
- * This can modify all checkpointed registers, but
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
+ * If r5 == 0, this can modify all checkpointed registers, but
* restores r1 and r2 before exit.
*/
-kvmppc_save_tm_hv:
+_GLOBAL_TOC(kvmppc_save_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
/* See if we need to handle fake suspend mode */
BEGIN_FTR_SECTION
b __kvmppc_save_tm
@@ -3161,12 +3152,6 @@
END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop
- std r1, HSTATE_HOST_R1(r13)
-
- /* Clear the MSR RI since r1, r13 may be foobar. */
- li r5, 0
- mtmsrd r5, 1
-
/* We have to treclaim here because that's the only way to do S->N */
li r3, TM_CAUSE_KVM_RESCHED
TRECLAIM(R3)
@@ -3175,22 +3160,13 @@
* We were in fake suspend, so we are not going to save the
* register state as the guest checkpointed state (since
* we already have it), therefore we can now use any volatile GPR.
+ * In fact treclaim in fake suspend state doesn't modify
+ * any registers.
*/
- /* Reload PACA pointer, stack pointer and TOC. */
- GET_PACA(r13)
- ld r1, HSTATE_HOST_R1(r13)
- ld r2, PACATOC(r13)
- /* Set MSR RI now we have r1 and r13 back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-
- HMT_MEDIUM
- ld r6, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r6
-BEGIN_FTR_SECTION_NESTED(96)
+BEGIN_FTR_SECTION
bl pnv_power9_force_smt4_release
-END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop
4:
@@ -3216,10 +3192,12 @@
* Restore transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct
* and r4 containing the guest MSR value.
+ * r5 is non-zero iff non-volatile register state needs to be maintained.
* This potentially modifies all checkpointed registers.
* It restores r1 and r2 from the PACA.
*/
-kvmppc_restore_tm_hv:
+_GLOBAL_TOC(kvmppc_restore_tm_hv)
+EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
/*
* If we are doing TM emulation for the guest on a POWER9 DD2,
* then we don't actually do a trechkpt -- we either set up
@@ -3424,6 +3402,194 @@
blr
/*
+ * Load up guest PMU state. R3 points to the vcpu struct.
+ */
+_GLOBAL(kvmhv_load_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
+ mr r4, r3
+ mflr r0
+ li r3, 1
+ sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
+ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
+ isync
+BEGIN_FTR_SECTION
+ ld r3, VCPU_MMCR(r4)
+ andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+ cmpwi r5, MMCR0_PMAO
+ beql kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+ lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
+ lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
+ lwz r6, VCPU_PMC + 8(r4)
+ lwz r7, VCPU_PMC + 12(r4)
+ lwz r8, VCPU_PMC + 16(r4)
+ lwz r9, VCPU_PMC + 20(r4)
+ mtspr SPRN_PMC1, r3
+ mtspr SPRN_PMC2, r5
+ mtspr SPRN_PMC3, r6
+ mtspr SPRN_PMC4, r7
+ mtspr SPRN_PMC5, r8
+ mtspr SPRN_PMC6, r9
+ ld r3, VCPU_MMCR(r4)
+ ld r5, VCPU_MMCR + 8(r4)
+ ld r6, VCPU_MMCR + 16(r4)
+ ld r7, VCPU_SIAR(r4)
+ ld r8, VCPU_SDAR(r4)
+ mtspr SPRN_MMCR1, r5
+ mtspr SPRN_MMCRA, r6
+ mtspr SPRN_SIAR, r7
+ mtspr SPRN_SDAR, r8
+BEGIN_FTR_SECTION
+ ld r5, VCPU_MMCR + 24(r4)
+ ld r6, VCPU_SIER(r4)
+ mtspr SPRN_MMCR2, r5
+ mtspr SPRN_SIER, r6
+BEGIN_FTR_SECTION_NESTED(96)
+ lwz r7, VCPU_PMC + 24(r4)
+ lwz r8, VCPU_PMC + 28(r4)
+ ld r9, VCPU_MMCR + 32(r4)
+ mtspr SPRN_SPMC1, r7
+ mtspr SPRN_SPMC2, r8
+ mtspr SPRN_MMCRS, r9
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ mtspr SPRN_MMCR0, r3
+ isync
+ mtlr r0
+ blr
+
+/*
+ * Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
+ */
+_GLOBAL(kvmhv_load_host_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
+ mflr r0
+ lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
+ cmpwi r4, 0
+ beq 23f /* skip if not */
+BEGIN_FTR_SECTION
+ ld r3, HSTATE_MMCR0(r13)
+ andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
+ cmpwi r4, MMCR0_PMAO
+ beql kvmppc_fix_pmao
+END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
+ lwz r3, HSTATE_PMC1(r13)
+ lwz r4, HSTATE_PMC2(r13)
+ lwz r5, HSTATE_PMC3(r13)
+ lwz r6, HSTATE_PMC4(r13)
+ lwz r8, HSTATE_PMC5(r13)
+ lwz r9, HSTATE_PMC6(r13)
+ mtspr SPRN_PMC1, r3
+ mtspr SPRN_PMC2, r4
+ mtspr SPRN_PMC3, r5
+ mtspr SPRN_PMC4, r6
+ mtspr SPRN_PMC5, r8
+ mtspr SPRN_PMC6, r9
+ ld r3, HSTATE_MMCR0(r13)
+ ld r4, HSTATE_MMCR1(r13)
+ ld r5, HSTATE_MMCRA(r13)
+ ld r6, HSTATE_SIAR(r13)
+ ld r7, HSTATE_SDAR(r13)
+ mtspr SPRN_MMCR1, r4
+ mtspr SPRN_MMCRA, r5
+ mtspr SPRN_SIAR, r6
+ mtspr SPRN_SDAR, r7
+BEGIN_FTR_SECTION
+ ld r8, HSTATE_MMCR2(r13)
+ ld r9, HSTATE_SIER(r13)
+ mtspr SPRN_MMCR2, r8
+ mtspr SPRN_SIER, r9
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ mtspr SPRN_MMCR0, r3
+ isync
+ mtlr r0
+23: blr
+
+/*
+ * Save guest PMU state into the vcpu struct.
+ * r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
+ */
+_GLOBAL(kvmhv_save_guest_pmu)
+EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
+ mr r9, r3
+ mr r8, r4
+BEGIN_FTR_SECTION
+ /*
+ * POWER8 seems to have a hardware bug where setting
+ * MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
+ * when some counters are already negative doesn't seem
+ * to cause a performance monitor alert (and hence interrupt).
+ * The effect of this is that when saving the PMU state,
+ * if there is no PMU alert pending when we read MMCR0
+ * before freezing the counters, but one becomes pending
+ * before we read the counters, we lose it.
+ * To work around this, we need a way to freeze the counters
+ * before reading MMCR0. Normally, freezing the counters
+ * is done by writing MMCR0 (to set MMCR0[FC]) which
+ * unavoidably writes MMCR0[PMAO] as well. On POWER8,
+ * we can also freeze the counters using MMCR2, by writing
+ * 1s to all the counter freeze condition bits (there are
+ * 9 bits each for 6 counters).
+ */
+ li r3, -1 /* set all freeze bits */
+ clrrdi r3, r3, 10
+ mfspr r10, SPRN_MMCR2
+ mtspr SPRN_MMCR2, r3
+ isync
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ li r3, 1
+ sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
+ mfspr r4, SPRN_MMCR0 /* save MMCR0 */
+ mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
+ mfspr r6, SPRN_MMCRA
+ /* Clear MMCRA in order to disable SDAR updates */
+ li r7, 0
+ mtspr SPRN_MMCRA, r7
+ isync
+ cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */
+ bne 21f
+ std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
+ b 22f
+21: mfspr r5, SPRN_MMCR1
+ mfspr r7, SPRN_SIAR
+ mfspr r8, SPRN_SDAR
+ std r4, VCPU_MMCR(r9)
+ std r5, VCPU_MMCR + 8(r9)
+ std r6, VCPU_MMCR + 16(r9)
+BEGIN_FTR_SECTION
+ std r10, VCPU_MMCR + 24(r9)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ std r7, VCPU_SIAR(r9)
+ std r8, VCPU_SDAR(r9)
+ mfspr r3, SPRN_PMC1
+ mfspr r4, SPRN_PMC2
+ mfspr r5, SPRN_PMC3
+ mfspr r6, SPRN_PMC4
+ mfspr r7, SPRN_PMC5
+ mfspr r8, SPRN_PMC6
+ stw r3, VCPU_PMC(r9)
+ stw r4, VCPU_PMC + 4(r9)
+ stw r5, VCPU_PMC + 8(r9)
+ stw r6, VCPU_PMC + 12(r9)
+ stw r7, VCPU_PMC + 16(r9)
+ stw r8, VCPU_PMC + 20(r9)
+BEGIN_FTR_SECTION
+ mfspr r5, SPRN_SIER
+ std r5, VCPU_SIER(r9)
+BEGIN_FTR_SECTION_NESTED(96)
+ mfspr r6, SPRN_SPMC1
+ mfspr r7, SPRN_SPMC2
+ mfspr r8, SPRN_MMCRS
+ stw r6, VCPU_PMC + 24(r9)
+ stw r7, VCPU_PMC + 28(r9)
+ std r8, VCPU_MMCR + 32(r9)
+ lis r4, 0x8000
+ mtspr SPRN_MMCRS, r4
+END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+22: blr
+
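As a sketch (not part of the patch) of the ordering kvmhv_save_guest_pmu uses for the POWER8 quirk described above: freeze via MMCR2 first, so a late counter overflow still sets PMAO in MMCR0 before MMCR0 itself is read and frozen. sketch_freeze_and_read_mmcr0() is a hypothetical name; the SPR accessors and MMCR0_FC are the usual kernel definitions.

	static void sketch_freeze_and_read_mmcr0(u64 *mmcr0, u64 *mmcr2)
	{
		*mmcr2 = mfspr(SPRN_MMCR2);
		/* 54 freeze-condition bits: 9 for each of the 6 counters */
		mtspr(SPRN_MMCR2, ~0x3ffULL);
		isync();
		/* only now read MMCR0 (keeping any pending PMAO) and freeze it */
		*mmcr0 = mfspr(SPRN_MMCR0);
		mtspr(SPRN_MMCR0, MMCR0_FC);
		isync();
	}
+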
+/*
* This works around a hardware bug on POWER8E processors, where
* writing a 1 to the MMCR0[PMAO] bit doesn't generate a
* performance monitor interrupt. Instead, when we need to have
diff --git a/arch/powerpc/kvm/book3s_hv_tm.c b/arch/powerpc/kvm/book3s_hv_tm.c
index 0082850..0db9374 100644
--- a/arch/powerpc/kvm/book3s_hv_tm.c
+++ b/arch/powerpc/kvm/book3s_hv_tm.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -130,8 +127,8 @@
return RESUME_GUEST;
}
/* Set CR0 to indicate previous transactional state */
- vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
- (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+ (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
/* L=1 => tresume, L=0 => tsuspend */
if (instr & (1 << 21)) {
if (MSR_TM_SUSPENDED(msr))
@@ -174,8 +171,8 @@
copy_from_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */
- vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
- (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+ (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
return RESUME_GUEST;
@@ -204,8 +201,8 @@
copy_to_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */
- vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
- (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+ (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 29);
vcpu->arch.shregs.msr = msr | MSR_TS_S;
return RESUME_GUEST;
}
diff --git a/arch/powerpc/kvm/book3s_hv_tm_builtin.c b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
index b2c7c6f..2172462 100644
--- a/arch/powerpc/kvm/book3s_hv_tm_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_tm_builtin.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2017 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -89,7 +86,8 @@
if (instr & (1 << 21))
vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
/* Set CR0 to 0b0010 */
- vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
+ 0x20000000;
return 1;
}
@@ -105,5 +103,5 @@
vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */
vcpu->arch.regs.nip = vcpu->arch.tfhar;
copy_from_checkpoint(vcpu);
- vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
+ vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
}
diff --git a/arch/powerpc/kvm/book3s_interrupts.S b/arch/powerpc/kvm/book3s_interrupts.S
index d71dab1..f7ad99d 100644
--- a/arch/powerpc/kvm/book3s_interrupts.S
+++ b/arch/powerpc/kvm/book3s_interrupts.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
diff --git a/arch/powerpc/kvm/book3s_mmu_hpte.c b/arch/powerpc/kvm/book3s_mmu_hpte.c
index 905a934..ce79ac3 100644
--- a/arch/powerpc/kvm/book3s_mmu_hpte.c
+++ b/arch/powerpc/kvm/book3s_mmu_hpte.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010 SUSE Linux Products GmbH. All rights reserved.
*
* Authors:
* Alexander Graf <agraf@suse.de>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*/
#include <linux/kvm_host.h>
diff --git a/arch/powerpc/kvm/book3s_paired_singles.c b/arch/powerpc/kvm/book3s_paired_singles.c
index eab96cf..bf02827 100644
--- a/arch/powerpc/kvm/book3s_paired_singles.c
+++ b/arch/powerpc/kvm/book3s_paired_singles.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright Novell Inc 2010
*
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 614ebb4..cc65af8 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2009. SUSE Linux Products GmbH. All rights reserved.
*
@@ -13,10 +14,6 @@
*
* This file is derived from arch/powerpc/kvm/44x.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -167,7 +164,7 @@
svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
- svcpu->cr = vcpu->arch.cr;
+ svcpu->cr = vcpu->arch.regs.ccr;
svcpu->xer = vcpu->arch.regs.xer;
svcpu->ctr = vcpu->arch.regs.ctr;
svcpu->lr = vcpu->arch.regs.link;
@@ -249,7 +246,7 @@
vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
- vcpu->arch.cr = svcpu->cr;
+ vcpu->arch.regs.ccr = svcpu->cr;
vcpu->arch.regs.xer = svcpu->xer;
vcpu->arch.regs.ctr = svcpu->ctr;
vcpu->arch.regs.link = svcpu->lr;
@@ -587,6 +584,7 @@
case PVR_POWER8:
case PVR_POWER8E:
case PVR_POWER8NVL:
+ case PVR_POWER9:
vcpu->arch.hflags |= BOOK3S_HFLAG_MULTI_PGSIZE |
BOOK3S_HFLAG_NEW_TLBIE;
break;
@@ -1246,7 +1244,6 @@
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_EXTERNAL:
- case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
case BOOK3S_INTERRUPT_EXTERNAL_HV:
case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
@@ -1914,7 +1911,8 @@
static void kvmppc_core_commit_memory_region_pr(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new)
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
return;
}
diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c
index dae3be5..031c801 100644
--- a/arch/powerpc/kvm/book3s_pr_papr.c
+++ b/arch/powerpc/kvm/book3s_pr_papr.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2011. Freescale Inc. All rights reserved.
*
@@ -9,10 +10,6 @@
*
* Hypercall handling for running PAPR guests in PR KVM on Book 3S
* processors.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/anon_inodes.h>
diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S
index b0089e0..3dc129a 100644
--- a/arch/powerpc/kvm/book3s_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_rmhandlers.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2009
*
diff --git a/arch/powerpc/kvm/book3s_rtas.c b/arch/powerpc/kvm/book3s_rtas.c
index 2d3b2b1..26b2599 100644
--- a/arch/powerpc/kvm/book3s_rtas.c
+++ b/arch/powerpc/kvm/book3s_rtas.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2012 Michael Ellerman, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -33,7 +30,7 @@
server = be32_to_cpu(args->args[1]);
priority = be32_to_cpu(args->args[2]);
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_set_xive(vcpu->kvm, irq, server, priority);
else
rc = kvmppc_xics_set_xive(vcpu->kvm, irq, server, priority);
@@ -56,7 +53,7 @@
irq = be32_to_cpu(args->args[0]);
server = priority = 0;
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_get_xive(vcpu->kvm, irq, &server, &priority);
else
rc = kvmppc_xics_get_xive(vcpu->kvm, irq, &server, &priority);
@@ -83,7 +80,7 @@
irq = be32_to_cpu(args->args[0]);
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_int_off(vcpu->kvm, irq);
else
rc = kvmppc_xics_int_off(vcpu->kvm, irq);
@@ -105,7 +102,7 @@
irq = be32_to_cpu(args->args[0]);
- if (xive_enabled())
+ if (xics_on_xive())
rc = kvmppc_xive_int_on(vcpu->kvm, irq);
else
rc = kvmppc_xics_int_on(vcpu->kvm, irq);
@@ -146,7 +143,7 @@
{
struct rtas_token_definition *d, *tmp;
- lockdep_assert_held(&kvm->lock);
+ lockdep_assert_held(&kvm->arch.rtas_token_lock);
list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
if (rtas_name_matches(d->handler->name, name)) {
@@ -167,7 +164,7 @@
bool found;
int i;
- lockdep_assert_held(&kvm->lock);
+ lockdep_assert_held(&kvm->arch.rtas_token_lock);
list_for_each_entry(d, &kvm->arch.rtas_tokens, list) {
if (d->token == token)
@@ -206,14 +203,14 @@
if (copy_from_user(&args, argp, sizeof(args)))
return -EFAULT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&kvm->arch.rtas_token_lock);
if (args.token)
rc = rtas_token_define(kvm, args.name, args.token);
else
rc = rtas_token_undefine(kvm, args.name);
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&kvm->arch.rtas_token_lock);
return rc;
}
@@ -245,7 +242,7 @@
orig_rets = args.rets;
args.rets = &args.args[be32_to_cpu(args.nargs)];
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&vcpu->kvm->arch.rtas_token_lock);
rc = -ENOENT;
list_for_each_entry(d, &vcpu->kvm->arch.rtas_tokens, list) {
@@ -256,7 +253,7 @@
}
}
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&vcpu->kvm->arch.rtas_token_lock);
if (rc == 0) {
args.rets = orig_rets;
@@ -282,8 +279,6 @@
{
struct rtas_token_definition *d, *tmp;
- lockdep_assert_held(&kvm->lock);
-
list_for_each_entry_safe(d, tmp, &kvm->arch.rtas_tokens, list) {
list_del(&d->list);
kfree(d);
diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S
index e5c542a..0169bab 100644
--- a/arch/powerpc/kvm/book3s_segment.S
+++ b/arch/powerpc/kvm/book3s_segment.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright SUSE Linux Products GmbH 2010
*
diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index b8356cd..381bf8d 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -1,10 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2012 Michael Ellerman, IBM Corporation.
* Copyright 2012 Benjamin Herrenschmidt, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -310,7 +307,7 @@
*/
if (new.out_ee) {
kvmppc_book3s_queue_irqprio(icp->vcpu,
- BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+ BOOK3S_INTERRUPT_EXTERNAL);
if (!change_self)
kvmppc_fast_vcpu_kick(icp->vcpu);
}
@@ -593,8 +590,7 @@
u32 xirr;
/* First, remove EE from the processor */
- kvmppc_book3s_dequeue_irqprio(icp->vcpu,
- BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
/*
* ICP State: Accept_Interrupt
@@ -754,8 +750,7 @@
* We can remove EE from the current processor, the update
* transaction will set it again if needed
*/
- kvmppc_book3s_dequeue_irqprio(icp->vcpu,
- BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
do {
old_state = new_state = READ_ONCE(icp->state);
@@ -832,7 +827,7 @@
*
* Note: If EOI is incorrectly used by SW to lower the CPPR
* value (ie more favored), we do not check for rejection of
- * a pending interrupt, this is a SW error and PAPR sepcifies
+ * a pending interrupt, this is a SW error and PAPR specifies
* that we don't have to deal with it.
*
* The sending of an EOI to the ICS is handled after the
@@ -1017,17 +1012,7 @@
return 0;
}
-static int xics_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xics_debug_show, inode->i_private);
-}
-
-static const struct file_operations xics_debug_fops = {
- .open = xics_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xics_debug);
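For reference, DEFINE_SHOW_ATTRIBUTE(xics_debug) from <linux/seq_file.h> generates roughly the open helper and file_operations that the removed lines spelled out by hand; a sketch of the expansion:

	static int xics_debug_open(struct inode *inode, struct file *file)
	{
		return single_open(file, xics_debug_show, inode->i_private);
	}

	static const struct file_operations xics_debug_fops = {
		.owner		= THIS_MODULE,
		.open		= xics_debug_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};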
static void xics_debugfs_init(struct kvmppc_xics *xics)
{
@@ -1167,8 +1152,7 @@
* Deassert the CPU interrupt request.
* icp_try_update will reassert it if necessary.
*/
- kvmppc_book3s_dequeue_irqprio(icp->vcpu,
- BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
+ kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
/*
* Note that if we displace an interrupt from old_state.xisr,
@@ -1393,7 +1377,8 @@
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- if (cpu_has_feature(CPU_FTR_ARCH_206)) {
+ if (cpu_has_feature(CPU_FTR_ARCH_206) &&
+ cpu_has_feature(CPU_FTR_HVMODE)) {
/* Enable real mode support */
xics->real_mode = ENABLE_REALMODE;
xics->real_mode_dbg = DEBUG_REALMODE;
diff --git a/arch/powerpc/kvm/book3s_xics.h b/arch/powerpc/kvm/book3s_xics.h
index 453c9e5..6231f76 100644
--- a/arch/powerpc/kvm/book3s_xics.h
+++ b/arch/powerpc/kvm/book3s_xics.h
@@ -1,10 +1,7 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2012 Michael Ellerman, IBM Corporation.
* Copyright 2012 Benjamin Herrenschmidt, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#ifndef _KVM_PPC_BOOK3S_XICS_H
diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c
index 30c2eb7..a3f9c66 100644
--- a/arch/powerpc/kvm/book3s_xive.c
+++ b/arch/powerpc/kvm/book3s_xive.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2017 Benjamin Herrenschmidt, IBM Corporation.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#define pr_fmt(fmt) "xive-kvm: " fmt
@@ -62,6 +59,75 @@
#define XIVE_Q_GAP 2
/*
+ * Push a vcpu's context to the XIVE on guest entry.
+ * This assumes we are in virtual mode (MMU on)
+ */
+void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
+{
+ void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
+ u64 pq;
+
+ /*
+ * Nothing to do if the platform doesn't have a XIVE
+ * or this vCPU doesn't have its own XIVE context
+ * (e.g. because it's not using an in-kernel interrupt controller).
+ */
+ if (!tima || !vcpu->arch.xive_cam_word)
+ return;
+
+ eieio();
+ __raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
+ __raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
+ vcpu->arch.xive_pushed = 1;
+ eieio();
+
+ /*
+ * We clear the irq_pending flag. There is a small chance of a
+ * race vs. the escalation interrupt happening on another
+ * processor setting it again, but the only consequence is to
+ * cause a spurious wakeup on the next H_CEDE, which is not an
+ * issue.
+ */
+ vcpu->arch.irq_pending = 0;
+
+ /*
+ * In single escalation mode, if the escalation interrupt is
+ * on, we mask it.
+ */
+ if (vcpu->arch.xive_esc_on) {
+ pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+ XIVE_ESB_SET_PQ_01));
+ mb();
+
+ /*
+ * We have a possible subtle race here: The escalation
+ * interrupt might have fired and be on its way to the
+ * host queue while we mask it, and if we unmask it
+ * early enough (re-cede right away), there is a
+ * theoretical possibility that it fires again, thus
+ * landing in the target queue more than once which is
+ * a big no-no.
+ *
+ * Fortunately, solving this is rather easy. If the
+ * above load setting PQ to 01 returns a previous
+ * value where P is set, then we know the escalation
+ * interrupt is somewhere on its way to the host. In
+ * that case we simply don't clear the xive_esc_on
+ * flag below. It will be eventually cleared by the
+ * handler for the escalation interrupt.
+ *
+ * Then, when doing a cede, we check that flag again
+ * before re-enabling the escalation interrupt, and if
+ * set, we abort the cede.
+ */
+ if (!(pq & XIVE_ESB_VAL_P))
+ /* Now P is 0, we can clear the flag */
+ vcpu->arch.xive_esc_on = 0;
+ }
+}
+EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
+
+/*
* This is a simple trigger for a generic XIVE IRQ. This must
* only be called for interrupts that support a trigger page
*/
@@ -100,10 +166,14 @@
*/
vcpu->arch.xive_esc_on = false;
+ /* This orders xive_esc_on = false vs. subsequent stale_p = true */
+ smp_wmb(); /* goes with smp_mb() in cleanup_single_escalation */
+
return IRQ_HANDLED;
}
-static int xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio)
+int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
+ bool single_escalation)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct xive_q *q = &xc->queues[prio];
@@ -122,7 +192,7 @@
return -EIO;
}
- if (xc->xive->single_escalation)
+ if (single_escalation)
name = kasprintf(GFP_KERNEL, "kvm-%d-%d",
vcpu->kvm->arch.lpid, xc->server_num);
else
@@ -154,7 +224,7 @@
* interrupt, thus leaving it effectively masked after
* it fires once.
*/
- if (xc->xive->single_escalation) {
+ if (single_escalation) {
struct irq_data *d = irq_get_irq_data(xc->esc_virq[prio]);
struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
@@ -207,14 +277,14 @@
return rc;
}
-/* Called with kvm_lock held */
+/* Called with xive->lock held */
static int xive_check_provisioning(struct kvm *kvm, u8 prio)
{
struct kvmppc_xive *xive = kvm->arch.xive;
struct kvm_vcpu *vcpu;
int i, rc;
- lockdep_assert_held(&kvm->lock);
+ lockdep_assert_held(&xive->lock);
/* Already provisioned ? */
if (xive->qmap & (1 << prio))
@@ -228,7 +298,8 @@
continue;
rc = xive_provision_queue(vcpu, prio);
if (rc == 0 && !xive->single_escalation)
- xive_attach_escalation(vcpu, prio);
+ kvmppc_xive_attach_escalation(vcpu, prio,
+ xive->single_escalation);
if (rc)
return rc;
}
@@ -279,7 +350,7 @@
return atomic_add_unless(&q->count, 1, max) ? 0 : -EBUSY;
}
-static int xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
+int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio)
{
struct kvm_vcpu *vcpu;
int i, rc;
@@ -317,11 +388,6 @@
return -EBUSY;
}
-static u32 xive_vp(struct kvmppc_xive *xive, u32 server)
-{
- return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
-}
-
static u8 xive_lock_and_mask(struct kvmppc_xive *xive,
struct kvmppc_xive_src_block *sb,
struct kvmppc_xive_irq_state *state)
@@ -367,8 +433,8 @@
*/
if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
xive_native_configure_irq(hw_num,
- xive_vp(xive, state->act_server),
- MASKED, state->number);
+ kvmppc_xive_vp(xive, state->act_server),
+ MASKED, state->number);
/* set old_p so we can track if an H_EOI was done */
state->old_p = true;
state->old_q = false;
@@ -423,8 +489,8 @@
*/
if (xd->flags & OPAL_XIVE_IRQ_MASK_VIA_FW) {
xive_native_configure_irq(hw_num,
- xive_vp(xive, state->act_server),
- state->act_priority, state->number);
+ kvmppc_xive_vp(xive, state->act_server),
+ state->act_priority, state->number);
/* If an EOI is needed, do it here */
if (!state->old_p)
xive_vm_source_eoi(hw_num, xd);
@@ -472,7 +538,7 @@
* priority. The count for that new target will have
* already been incremented.
*/
- rc = xive_select_target(kvm, &server, prio);
+ rc = kvmppc_xive_select_target(kvm, &server, prio);
/*
* We failed to find a target ? Not much we can do
@@ -500,7 +566,7 @@
kvmppc_xive_select_irq(state, &hw_num, NULL);
return xive_native_configure_irq(hw_num,
- xive_vp(xive, server),
+ kvmppc_xive_vp(xive, server),
prio, state->number);
}
@@ -561,9 +627,12 @@
irq, server, priority);
/* First, check provisioning of queues */
- if (priority != MASKED)
+ if (priority != MASKED) {
+ mutex_lock(&xive->lock);
rc = xive_check_provisioning(xive->kvm,
xive_prio_from_guest(priority));
+ mutex_unlock(&xive->lock);
+ }
if (rc) {
pr_devel(" provisioning failure %d !\n", rc);
return rc;
@@ -786,7 +855,8 @@
/*
* We can't update the state of a "pushed" VCPU, but that
- * shouldn't happen.
+ * shouldn't happen because the vcpu->mutex makes running a
+ * vcpu mutually exclusive with doing one_reg get/set on it.
*/
if (WARN_ON(vcpu->arch.xive_pushed))
return -EIO;
@@ -877,6 +947,13 @@
/* Turn the IPI hard off */
xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+ /*
+ * Reset ESB guest mapping. Needed when ESB pages are exposed
+ * to the guest in XIVE native mode
+ */
+ if (xive->ops && xive->ops->reset_mapped)
+ xive->ops->reset_mapped(kvm, guest_irq);
+
/* Grab info about irq */
state->pt_number = hw_irq;
state->pt_data = irq_data_get_irq_handler_data(host_data);
@@ -888,7 +965,7 @@
* which is fine for a never started interrupt.
*/
xive_native_configure_irq(hw_irq,
- xive_vp(xive, state->act_server),
+ kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/*
@@ -962,9 +1039,17 @@
state->pt_number = 0;
state->pt_data = NULL;
+ /*
+ * Reset ESB guest mapping. Needed when ESB pages are exposed
+ * to the guest in XIVE native mode
+ */
+ if (xive->ops && xive->ops->reset_mapped) {
+ xive->ops->reset_mapped(kvm, guest_irq);
+ }
+
/* Reconfigure the IPI */
xive_native_configure_irq(state->ipi_number,
- xive_vp(xive, state->act_server),
+ kvmppc_xive_vp(xive, state->act_server),
state->act_priority, state->number);
/*
@@ -986,7 +1071,7 @@
}
EXPORT_SYMBOL_GPL(kvmppc_xive_clr_mapped);
-static void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
+void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
struct kvm *kvm = vcpu->kvm;
@@ -1020,14 +1105,60 @@
arch_spin_unlock(&sb->lock);
}
}
+
+ /* Disable vcpu's escalation interrupt */
+ if (vcpu->arch.xive_esc_on) {
+ __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
+ XIVE_ESB_SET_PQ_01));
+ vcpu->arch.xive_esc_on = false;
+ }
+
+ /*
+ * Clear pointers to escalation interrupt ESB.
+ * This is safe because the vcpu->mutex is held, preventing
+ * any other CPU from concurrently executing a KVM_RUN ioctl.
+ */
+ vcpu->arch.xive_esc_vaddr = 0;
+ vcpu->arch.xive_esc_raddr = 0;
+}
+
+/*
+ * In single escalation mode, the escalation interrupt is marked so
+ * that EOI doesn't re-enable it, but just sets the stale_p flag to
+ * indicate that the P bit has already been dealt with. However, the
+ * assembly code that enters the guest sets PQ to 00 without clearing
+ * stale_p (because it has no easy way to address it). Hence we have
+ * to adjust stale_p before shutting down the interrupt.
+ */
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
+
+ /*
+ * This slightly odd sequence gives the right result
+ * (i.e. stale_p set if xive_esc_on is false) even if
+ * we race with xive_esc_irq() and xive_irq_eoi().
+ */
+ xd->stale_p = false;
+ smp_mb(); /* paired with smp_wmb() in xive_esc_irq */
+ if (!vcpu->arch.xive_esc_on)
+ xd->stale_p = true;
}
void kvmppc_xive_cleanup_vcpu(struct kvm_vcpu *vcpu)
{
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
- struct kvmppc_xive *xive = xc->xive;
+ struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
int i;
+ if (!kvmppc_xics_enabled(vcpu))
+ return;
+
+ if (!xc)
+ return;
+
pr_devel("cleanup_vcpu(cpu=%d)\n", xc->server_num);
/* Ensure no interrupt is still routed to that VP */
@@ -1037,20 +1168,28 @@
/* Mask the VP IPI */
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_01);
- /* Disable the VP */
- xive_native_disable_vp(xc->vp_id);
-
- /* Free the queues & associated interrupts */
+ /* Free escalations */
for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
-
- /* Free the escalation irq */
if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
free_irq(xc->esc_virq[i], vcpu);
irq_dispose_mapping(xc->esc_virq[i]);
kfree(xc->esc_virq_names[i]);
}
- /* Free the queue */
+ }
+
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
+ /* Free the queues */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+
xive_native_disable_queue(xc->vp_id, q, i);
if (q->qpage) {
free_pages((unsigned long)q->qpage,
@@ -1066,6 +1205,10 @@
}
/* Free the VP */
kfree(xc);
+
+ /* Cleanup the vcpu */
+ vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+ vcpu->arch.xive_vcpu = NULL;
}
int kvmppc_xive_connect_vcpu(struct kvm_device *dev,
@@ -1074,6 +1217,7 @@
struct kvmppc_xive *xive = dev->private;
struct kvmppc_xive_vcpu *xc;
int i, r = -EBUSY;
+ u32 vp_id;
pr_devel("connect_vcpu(cpu=%d)\n", cpu);
@@ -1083,27 +1227,34 @@
}
if (xive->kvm != vcpu->kvm)
return -EPERM;
- if (vcpu->arch.irq_type)
+ if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
return -EBUSY;
- if (kvmppc_xive_find_server(vcpu->kvm, cpu)) {
- pr_devel("Duplicate !\n");
- return -EEXIST;
- }
if (cpu >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
pr_devel("Out of bounds !\n");
return -EINVAL;
}
- xc = kzalloc(sizeof(*xc), GFP_KERNEL);
- if (!xc)
- return -ENOMEM;
/* We need to synchronize with queue provisioning */
- mutex_lock(&vcpu->kvm->lock);
+ mutex_lock(&xive->lock);
+
+ vp_id = kvmppc_xive_vp(xive, cpu);
+ if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+ pr_devel("Duplicate !\n");
+ r = -EEXIST;
+ goto bail;
+ }
+
+ xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+ if (!xc) {
+ r = -ENOMEM;
+ goto bail;
+ }
+
vcpu->arch.xive_vcpu = xc;
xc->xive = xive;
xc->vcpu = vcpu;
xc->server_num = cpu;
- xc->vp_id = xive_vp(xive, cpu);
+ xc->vp_id = vp_id;
xc->mfrr = 0xff;
xc->valid = true;
@@ -1156,7 +1307,8 @@
if (xive->qmap & (1 << i)) {
r = xive_provision_queue(vcpu, i);
if (r == 0 && !xive->single_escalation)
- xive_attach_escalation(vcpu, i);
+ kvmppc_xive_attach_escalation(
+ vcpu, i, xive->single_escalation);
if (r)
goto bail;
} else {
@@ -1171,7 +1323,7 @@
}
/* If not done above, attach priority 0 escalation */
- r = xive_attach_escalation(vcpu, 0);
+ r = kvmppc_xive_attach_escalation(vcpu, 0, xive->single_escalation);
if (r)
goto bail;
@@ -1181,7 +1333,7 @@
xive_vm_esb_load(&xc->vp_ipi_data, XIVE_ESB_SET_PQ_00);
bail:
- mutex_unlock(&vcpu->kvm->lock);
+ mutex_unlock(&xive->lock);
if (r) {
kvmppc_xive_cleanup_vcpu(vcpu);
return r;
@@ -1422,16 +1574,15 @@
return 0;
}
-static struct kvmppc_xive_src_block *xive_create_src_block(struct kvmppc_xive *xive,
- int irq)
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+ struct kvmppc_xive *xive, int irq)
{
- struct kvm *kvm = xive->kvm;
struct kvmppc_xive_src_block *sb;
int i, bid;
bid = irq >> KVMPPC_XICS_ICS_SHIFT;
- mutex_lock(&kvm->lock);
+ mutex_lock(&xive->lock);
/* block already exists - somebody else got here first */
if (xive->src_blocks[bid])
@@ -1446,6 +1597,7 @@
for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
sb->irq_state[i].number = (bid << KVMPPC_XICS_ICS_SHIFT) | i;
+ sb->irq_state[i].eisn = 0;
sb->irq_state[i].guest_priority = MASKED;
sb->irq_state[i].saved_priority = MASKED;
sb->irq_state[i].act_priority = MASKED;
@@ -1457,7 +1609,7 @@
xive->max_sbid = bid;
out:
- mutex_unlock(&kvm->lock);
+ mutex_unlock(&xive->lock);
return xive->src_blocks[bid];
}
@@ -1502,7 +1654,7 @@
sb = kvmppc_xive_find_source(xive, irq, &idx);
if (!sb) {
pr_devel("No source, creating source block...\n");
- sb = xive_create_src_block(xive, irq);
+ sb = kvmppc_xive_create_src_block(xive, irq);
if (!sb) {
pr_devel("Failed to create block...\n");
return -ENOMEM;
@@ -1567,9 +1719,9 @@
/* If we have a priority target the interrupt */
if (act_prio != MASKED) {
/* First, check provisioning of queues */
- mutex_lock(&xive->kvm->lock);
+ mutex_lock(&xive->lock);
rc = xive_check_provisioning(xive->kvm, act_prio);
- mutex_unlock(&xive->kvm->lock);
+ mutex_unlock(&xive->lock);
/* Target interrupt */
if (rc == 0)
@@ -1723,10 +1875,9 @@
{
xive_vm_esb_load(xd, XIVE_ESB_SET_PQ_01);
xive_native_configure_irq(hw_num, 0, MASKED, 0);
- xive_cleanup_irq_data(xd);
}
-static void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb)
{
int i;
@@ -1737,9 +1888,10 @@
continue;
kvmppc_xive_cleanup_irq(state->ipi_number, &state->ipi_data);
+ xive_cleanup_irq_data(&state->ipi_data);
xive_native_free_irq(state->ipi_number);
- /* Pass-through, cleanup too */
+ /* Pass-through, cleanup too but keep IRQ hw data */
if (state->pt_number)
kvmppc_xive_cleanup_irq(state->pt_number, state->pt_data);
@@ -1747,16 +1899,53 @@
}
}
-static void kvmppc_xive_free(struct kvm_device *dev)
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xive_release(struct kvm_device *dev)
{
struct kvmppc_xive *xive = dev->private;
struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
int i;
+ pr_devel("Releasing xive device\n");
+
+ /*
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd referring to the
+ * device. Therefore none of the device attribute set/get
+ * functions can be executing concurrently, and similarly the
+ * connect_vcpu and set/clr_mapped functions cannot be
+ * executing either.
+ */
+
debugfs_remove(xive->dentry);
- if (kvm)
- kvm->arch.xive = NULL;
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xive_[gs]et_icp) can be done concurrently.
+ * Holding the vcpu->mutex also means that the vcpu cannot
+ * be executing the KVM_RUN ioctl, and therefore it cannot
+ * be executing the XIVE push or pull code or accessing
+ * the XIVE MMIO regions.
+ */
+ mutex_lock(&vcpu->mutex);
+ kvmppc_xive_cleanup_vcpu(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
+
+ /*
+ * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+ * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+ * against xive code getting called during vcpu execution or
+ * set/get one_reg operations.
+ */
+ kvm->arch.xive = NULL;
/* Mask and free interrupts */
for (i = 0; i <= xive->max_sbid; i++) {
@@ -1769,11 +1958,45 @@
if (xive->vp_base != XIVE_INVALID_VP)
xive_native_free_vp_block(xive->vp_base);
+ /*
+ * A reference to the kvmppc_xive pointer is now kept under
+ * the xive_devices struct of the machine for reuse. It is
+ * freed when the VM is destroyed for now until we fix all the
+ * execution paths.
+ */
- kfree(xive);
kfree(dev);
}
+/*
+ * When the guest chooses the interrupt mode (XICS legacy or XIVE
+ * native), the VM will switch KVM devices. The previous device will
+ * be "released" before the new one is created.
+ *
+ * Until we are sure all execution paths are well protected, provide a
+ * fail safe (transitional) method for device destruction, in which
+ * the XIVE device pointer is recycled and not directly freed.
+ */
+struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type)
+{
+ struct kvmppc_xive **kvm_xive_device = type == KVM_DEV_TYPE_XIVE ?
+ &kvm->arch.xive_devices.native :
+ &kvm->arch.xive_devices.xics_on_xive;
+ struct kvmppc_xive *xive = *kvm_xive_device;
+
+ if (!xive) {
+ xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+ *kvm_xive_device = xive;
+ } else {
+ memset(xive, 0, sizeof(*xive));
+ }
+
+ return xive;
+}
+
+/*
+ * Create a XICS device with XIVE backend. kvm->lock is held.
+ */
static int kvmppc_xive_create(struct kvm_device *dev, u32 type)
{
struct kvmppc_xive *xive;
@@ -1782,13 +2005,14 @@
pr_devel("Creating xive for partition\n");
- xive = kzalloc(sizeof(*xive), GFP_KERNEL);
+ xive = kvmppc_xive_get_device(kvm, type);
if (!xive)
return -ENOMEM;
dev->private = xive;
xive->dev = dev;
xive->kvm = kvm;
+ mutex_init(&xive->lock);
/* Already there ? */
if (kvm->arch.xive)
@@ -1812,14 +2036,49 @@
xive->single_escalation = xive_native_has_single_escalation();
- if (ret) {
- kfree(xive);
+ if (ret)
return ret;
- }
return 0;
}
+int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ unsigned int i;
+
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ struct xive_q *q = &xc->queues[i];
+ u32 i0, i1, idx;
+
+ if (!q->qpage && !xc->esc_virq[i])
+ continue;
+
+ seq_printf(m, " [q%d]: ", i);
+
+ if (q->qpage) {
+ idx = q->idx;
+ i0 = be32_to_cpup(q->qpage + idx);
+ idx = (idx + 1) & q->msk;
+ i1 = be32_to_cpup(q->qpage + idx);
+ seq_printf(m, "T=%d %08x %08x...\n", q->toggle,
+ i0, i1);
+ }
+ if (xc->esc_virq[i]) {
+ struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
+ struct xive_irq_data *xd =
+ irq_data_get_irq_handler_data(d);
+ u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
+
+ seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
+ (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
+ (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
+ xc->esc_virq[i], pq, xd->eoi_page);
+ seq_puts(m, "\n");
+ }
+ }
+ return 0;
+}
static int xive_debug_show(struct seq_file *m, void *private)
{
@@ -1845,7 +2104,6 @@
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
- unsigned int i;
if (!xc)
continue;
@@ -1855,33 +2113,8 @@
xc->server_num, xc->cppr, xc->hw_cppr,
xc->mfrr, xc->pending,
xc->stat_rm_h_xirr, xc->stat_vm_h_xirr);
- for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
- struct xive_q *q = &xc->queues[i];
- u32 i0, i1, idx;
- if (!q->qpage && !xc->esc_virq[i])
- continue;
-
- seq_printf(m, " [q%d]: ", i);
-
- if (q->qpage) {
- idx = q->idx;
- i0 = be32_to_cpup(q->qpage + idx);
- idx = (idx + 1) & q->msk;
- i1 = be32_to_cpup(q->qpage + idx);
- seq_printf(m, "T=%d %08x %08x... \n", q->toggle, i0, i1);
- }
- if (xc->esc_virq[i]) {
- struct irq_data *d = irq_get_irq_data(xc->esc_virq[i]);
- struct xive_irq_data *xd = irq_data_get_irq_handler_data(d);
- u64 pq = xive_vm_esb_load(xd, XIVE_ESB_GET);
- seq_printf(m, "E:%c%c I(%d:%llx:%llx)",
- (pq & XIVE_ESB_VAL_P) ? 'P' : 'p',
- (pq & XIVE_ESB_VAL_Q) ? 'Q' : 'q',
- xc->esc_virq[i], pq, xd->eoi_page);
- seq_printf(m, "\n");
- }
- }
+ kvmppc_xive_debug_show_queues(m, vcpu);
t_rm_h_xirr += xc->stat_rm_h_xirr;
t_rm_h_ipoll += xc->stat_rm_h_ipoll;
@@ -1905,17 +2138,7 @@
return 0;
}
-static int xive_debug_open(struct inode *inode, struct file *file)
-{
- return single_open(file, xive_debug_show, inode->i_private);
-}
-
-static const struct file_operations xive_debug_fops = {
- .open = xive_debug_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
-};
+DEFINE_SHOW_ATTRIBUTE(xive_debug);
static void xive_debugfs_init(struct kvmppc_xive *xive)
{
@@ -1946,7 +2169,7 @@
.name = "kvm-xive",
.create = kvmppc_xive_create,
.init = kvmppc_xive_init,
- .destroy = kvmppc_xive_free,
+ .release = kvmppc_xive_release,
.set_attr = xive_set_attr,
.get_attr = xive_get_attr,
.has_attr = xive_has_attr,
diff --git a/arch/powerpc/kvm/book3s_xive.h b/arch/powerpc/kvm/book3s_xive.h
index a08ae6f..fe3ed50 100644
--- a/arch/powerpc/kvm/book3s_xive.h
+++ b/arch/powerpc/kvm/book3s_xive.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#ifndef _KVM_PPC_BOOK3S_XIVE_H
@@ -13,6 +10,13 @@
#include "book3s_xics.h"
/*
+ * The XIVE Interrupt source numbers are within the range 0 to
+ * KVMPPC_XICS_NR_IRQS.
+ */
+#define KVMPPC_XIVE_FIRST_IRQ 0
+#define KVMPPC_XIVE_NR_IRQS KVMPPC_XICS_NR_IRQS
+
+/*
* State for one guest irq source.
*
* For each guest source we allocate a HW interrupt in the XIVE
@@ -54,6 +58,9 @@
bool saved_p;
bool saved_q;
u8 saved_scan_prio;
+
+ /* Xive native */
+ u32 eisn; /* Guest Effective IRQ number */
};
/* Select the "right" interrupt (IPI vs. passthrough) */
@@ -84,6 +91,11 @@
struct kvmppc_xive_irq_state irq_state[KVMPPC_XICS_IRQ_PER_ICS];
};
+struct kvmppc_xive;
+
+struct kvmppc_xive_ops {
+ int (*reset_mapped)(struct kvm *kvm, unsigned long guest_irq);
+};
struct kvmppc_xive {
struct kvm *kvm;
@@ -122,6 +134,11 @@
/* Flags */
u8 single_escalation;
+
+ struct kvmppc_xive_ops *ops;
+ struct address_space *mapping;
+ struct mutex mapping_lock;
+ struct mutex lock;
};
#define KVMPPC_XIVE_Q_COUNT 8
@@ -198,6 +215,23 @@
return xive->src_blocks[bid];
}
+static inline u32 kvmppc_xive_vp(struct kvmppc_xive *xive, u32 server)
+{
+ return xive->vp_base + kvmppc_pack_vcpu_id(xive->kvm, server);
+}
+
+static inline bool kvmppc_xive_vp_in_use(struct kvm *kvm, u32 vp_id)
+{
+ struct kvm_vcpu *vcpu = NULL;
+ int i;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ if (vcpu->arch.xive_vcpu && vp_id == vcpu->arch.xive_vcpu->vp_id)
+ return true;
+ }
+ return false;
+}
+
/*
* Mapping between guest priorities and host priorities
* is as follow.
@@ -248,5 +282,20 @@
extern int (*__xive_vm_h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr);
extern int (*__xive_vm_h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr);
+/*
+ * Common Xive routines for XICS-over-XIVE and XIVE native
+ */
+void kvmppc_xive_disable_vcpu_interrupts(struct kvm_vcpu *vcpu);
+int kvmppc_xive_debug_show_queues(struct seq_file *m, struct kvm_vcpu *vcpu);
+struct kvmppc_xive_src_block *kvmppc_xive_create_src_block(
+ struct kvmppc_xive *xive, int irq);
+void kvmppc_xive_free_sources(struct kvmppc_xive_src_block *sb);
+int kvmppc_xive_select_target(struct kvm *kvm, u32 *server, u8 prio);
+int kvmppc_xive_attach_escalation(struct kvm_vcpu *vcpu, u8 prio,
+ bool single_escalation);
+struct kvmppc_xive *kvmppc_xive_get_device(struct kvm *kvm, u32 type);
+void xive_cleanup_single_escalation(struct kvm_vcpu *vcpu,
+ struct kvmppc_xive_vcpu *xc, int irq);
+
#endif /* CONFIG_KVM_XICS */
#endif /* _KVM_PPC_BOOK3S_XICS_H */
diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c
new file mode 100644
index 0000000..78b906f
--- /dev/null
+++ b/arch/powerpc/kvm/book3s_xive_native.c
@@ -0,0 +1,1278 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright (c) 2017-2019, IBM Corporation.
+ */
+
+#define pr_fmt(fmt) "xive-kvm: " fmt
+
+#include <linux/kernel.h>
+#include <linux/kvm_host.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/spinlock.h>
+#include <linux/delay.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+#include <asm/kvm_book3s.h>
+#include <asm/kvm_ppc.h>
+#include <asm/hvcall.h>
+#include <asm/xive.h>
+#include <asm/xive-regs.h>
+#include <asm/debug.h>
+#include <asm/debugfs.h>
+#include <asm/opal.h>
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "book3s_xive.h"
+
+static u8 xive_vm_esb_load(struct xive_irq_data *xd, u32 offset)
+{
+ u64 val;
+
+ if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG)
+ offset |= offset << 4;
+
+ val = in_be64(xd->eoi_mmio + offset);
+ return (u8)val;
+}
+
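+/*
+ * Disable the event queue of the given priority for this vCPU and
+ * release the page backing it, if any.
+ */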
+static void kvmppc_xive_native_cleanup_queue(struct kvm_vcpu *vcpu, int prio)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ struct xive_q *q = &xc->queues[prio];
+
+ xive_native_disable_queue(xc->vp_id, q, prio);
+ if (q->qpage) {
+ put_page(virt_to_page(q->qpage));
+ q->qpage = NULL;
+ }
+}
+
+void kvmppc_xive_native_cleanup_vcpu(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ int i;
+
+ if (!kvmppc_xive_enabled(vcpu))
+ return;
+
+ if (!xc)
+ return;
+
+ pr_devel("native_cleanup_vcpu(cpu=%d)\n", xc->server_num);
+
+ /* Ensure no interrupt is still routed to that VP */
+ xc->valid = false;
+ kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+ /* Free escalations */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ /* Free the escalation irq */
+ if (xc->esc_virq[i]) {
+ if (xc->xive->single_escalation)
+ xive_cleanup_single_escalation(vcpu, xc,
+ xc->esc_virq[i]);
+ free_irq(xc->esc_virq[i], vcpu);
+ irq_dispose_mapping(xc->esc_virq[i]);
+ kfree(xc->esc_virq_names[i]);
+ xc->esc_virq[i] = 0;
+ }
+ }
+
+ /* Disable the VP */
+ xive_native_disable_vp(xc->vp_id);
+
+ /* Clear the cam word so guest entry won't try to push context */
+ vcpu->arch.xive_cam_word = 0;
+
+ /* Free the queues */
+ for (i = 0; i < KVMPPC_XIVE_Q_COUNT; i++) {
+ kvmppc_xive_native_cleanup_queue(vcpu, i);
+ }
+
+ /* Free the VP */
+ kfree(xc);
+
+ /* Cleanup the vcpu */
+ vcpu->arch.irq_type = KVMPPC_IRQ_DEFAULT;
+ vcpu->arch.xive_vcpu = NULL;
+}
+
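+/*
+ * Connect a vCPU to the XIVE native device: allocate its
+ * kvmppc_xive_vcpu state, enable the matching XIVE VP in OPAL and
+ * initialize the fields used by the assembly context push/pull code.
+ */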
+int kvmppc_xive_native_connect_vcpu(struct kvm_device *dev,
+ struct kvm_vcpu *vcpu, u32 server_num)
+{
+ struct kvmppc_xive *xive = dev->private;
+ struct kvmppc_xive_vcpu *xc = NULL;
+ int rc;
+ u32 vp_id;
+
+ pr_devel("native_connect_vcpu(server=%d)\n", server_num);
+
+ if (dev->ops != &kvm_xive_native_ops) {
+ pr_devel("Wrong ops !\n");
+ return -EPERM;
+ }
+ if (xive->kvm != vcpu->kvm)
+ return -EPERM;
+ if (vcpu->arch.irq_type != KVMPPC_IRQ_DEFAULT)
+ return -EBUSY;
+ if (server_num >= (KVM_MAX_VCPUS * vcpu->kvm->arch.emul_smt_mode)) {
+ pr_devel("Out of bounds !\n");
+ return -EINVAL;
+ }
+
+ mutex_lock(&xive->lock);
+
+ vp_id = kvmppc_xive_vp(xive, server_num);
+ if (kvmppc_xive_vp_in_use(xive->kvm, vp_id)) {
+ pr_devel("Duplicate !\n");
+ rc = -EEXIST;
+ goto bail;
+ }
+
+ xc = kzalloc(sizeof(*xc), GFP_KERNEL);
+ if (!xc) {
+ rc = -ENOMEM;
+ goto bail;
+ }
+
+ vcpu->arch.xive_vcpu = xc;
+ xc->xive = xive;
+ xc->vcpu = vcpu;
+ xc->server_num = server_num;
+
+ xc->vp_id = vp_id;
+ xc->valid = true;
+ vcpu->arch.irq_type = KVMPPC_IRQ_XIVE;
+
+ rc = xive_native_get_vp_info(xc->vp_id, &xc->vp_cam, &xc->vp_chip_id);
+ if (rc) {
+ pr_err("Failed to get VP info from OPAL: %d\n", rc);
+ goto bail;
+ }
+
+ /*
+ * Enable the VP first as the single escalation mode will
+ * affect the numbering of the escalation interrupts
+ */
+ rc = xive_native_enable_vp(xc->vp_id, xive->single_escalation);
+ if (rc) {
+ pr_err("Failed to enable VP in OPAL: %d\n", rc);
+ goto bail;
+ }
+
+ /* Configure VCPU fields for use by assembly push/pull */
+ vcpu->arch.xive_saved_state.w01 = cpu_to_be64(0xff000000);
+ vcpu->arch.xive_cam_word = cpu_to_be32(xc->vp_cam | TM_QW1W2_VO);
+
+ /* TODO: reset all queues to a clean state ? */
+bail:
+ mutex_unlock(&xive->lock);
+ if (rc)
+ kvmppc_xive_native_cleanup_vcpu(vcpu);
+
+ return rc;
+}
+
+/*
+ * Device passthrough support
+ */
+static int kvmppc_xive_native_reset_mapped(struct kvm *kvm, unsigned long irq)
+{
+ struct kvmppc_xive *xive = kvm->arch.xive;
+ pgoff_t esb_pgoff = KVM_XIVE_ESB_PAGE_OFFSET + irq * 2;
+
+ if (irq >= KVMPPC_XIVE_NR_IRQS)
+ return -EINVAL;
+
+ /*
+ * Clear the ESB pages of the IRQ number being mapped (or
+ * unmapped) into the guest and let the VM fault handler
+ * repopulate with the appropriate ESB pages (device or IC)
+ */
+ pr_debug("clearing esb pages for girq 0x%lx\n", irq);
+ mutex_lock(&xive->mapping_lock);
+ if (xive->mapping)
+ unmap_mapping_range(xive->mapping,
+ esb_pgoff << PAGE_SHIFT,
+ 2ull << PAGE_SHIFT, 1);
+ mutex_unlock(&xive->mapping_lock);
+ return 0;
+}
+
+static struct kvmppc_xive_ops kvmppc_xive_native_ops = {
+ .reset_mapped = kvmppc_xive_native_reset_mapped,
+};
+
+static vm_fault_t xive_native_esb_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct kvm_device *dev = vma->vm_file->private_data;
+ struct kvmppc_xive *xive = dev->private;
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ struct xive_irq_data *xd;
+ u32 hw_num;
+ u16 src;
+ u64 page;
+ unsigned long irq;
+ u64 page_offset;
+
+ /*
+ * Linux/KVM uses a two-page ESB setting, one page for trigger and
+ * one for EOI
+ */
+ page_offset = vmf->pgoff - vma->vm_pgoff;
+ irq = page_offset / 2;
+
+ sb = kvmppc_xive_find_source(xive, irq, &src);
+ if (!sb) {
+ pr_devel("%s: source %lx not found !\n", __func__, irq);
+ return VM_FAULT_SIGBUS;
+ }
+
+ state = &sb->irq_state[src];
+ kvmppc_xive_select_irq(state, &hw_num, &xd);
+
+ arch_spin_lock(&sb->lock);
+
+ /*
+ * first/even page is for trigger
+ * second/odd page is for EOI and management.
+ */
+ page = page_offset % 2 ? xd->eoi_page : xd->trig_page;
+ arch_spin_unlock(&sb->lock);
+
+ if (WARN_ON(!page)) {
+ pr_err("%s: accessing invalid ESB page for source %lx !\n",
+ __func__, irq);
+ return VM_FAULT_SIGBUS;
+ }
+
+ vmf_insert_pfn(vma, vmf->address, page >> PAGE_SHIFT);
+ return VM_FAULT_NOPAGE;
+}
+
+static const struct vm_operations_struct xive_native_esb_vmops = {
+ .fault = xive_native_esb_fault,
+};
+
+static vm_fault_t xive_native_tima_fault(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ switch (vmf->pgoff - vma->vm_pgoff) {
+ case 0: /* HW - forbid access */
+ case 1: /* HV - forbid access */
+ return VM_FAULT_SIGBUS;
+ case 2: /* OS */
+ vmf_insert_pfn(vma, vmf->address, xive_tima_os >> PAGE_SHIFT);
+ return VM_FAULT_NOPAGE;
+ case 3: /* USER - TODO */
+ default:
+ return VM_FAULT_SIGBUS;
+ }
+}
+
+static const struct vm_operations_struct xive_native_tima_vmops = {
+ .fault = xive_native_tima_fault,
+};
+
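+/*
+ * The device fd supports two fixed mappings: the thread interrupt
+ * management area (TIMA) pages at KVM_XIVE_TIMA_PAGE_OFFSET and the
+ * ESB pages of the guest interrupts at KVM_XIVE_ESB_PAGE_OFFSET.
+ */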
+static int kvmppc_xive_native_mmap(struct kvm_device *dev,
+ struct vm_area_struct *vma)
+{
+ struct kvmppc_xive *xive = dev->private;
+
+ /* We only allow mappings at fixed offset for now */
+ if (vma->vm_pgoff == KVM_XIVE_TIMA_PAGE_OFFSET) {
+ if (vma_pages(vma) > 4)
+ return -EINVAL;
+ vma->vm_ops = &xive_native_tima_vmops;
+ } else if (vma->vm_pgoff == KVM_XIVE_ESB_PAGE_OFFSET) {
+ if (vma_pages(vma) > KVMPPC_XIVE_NR_IRQS * 2)
+ return -EINVAL;
+ vma->vm_ops = &xive_native_esb_vmops;
+ } else {
+ return -EINVAL;
+ }
+
+ vma->vm_flags |= VM_IO | VM_PFNMAP;
+ vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
+
+ /*
+ * Grab the KVM device file address_space to be able to clear
+ * the ESB page mappings when a device is passed through into
+ * the guest.
+ */
+ xive->mapping = vma->vm_file->f_mapping;
+ return 0;
+}
+
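+/*
+ * KVM_DEV_XIVE_GRP_SOURCE: create or restore a guest interrupt
+ * source. An IPI is allocated on first use and the source starts
+ * out masked.
+ */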
+static int kvmppc_xive_native_set_source(struct kvmppc_xive *xive, long irq,
+ u64 addr)
+{
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ u64 __user *ubufp = (u64 __user *) addr;
+ u64 val;
+ u16 idx;
+ int rc;
+
+ pr_devel("%s irq=0x%lx\n", __func__, irq);
+
+ if (irq < KVMPPC_XIVE_FIRST_IRQ || irq >= KVMPPC_XIVE_NR_IRQS)
+ return -E2BIG;
+
+ sb = kvmppc_xive_find_source(xive, irq, &idx);
+ if (!sb) {
+ pr_debug("No source, creating source block...\n");
+ sb = kvmppc_xive_create_src_block(xive, irq);
+ if (!sb) {
+ pr_err("Failed to create block...\n");
+ return -ENOMEM;
+ }
+ }
+ state = &sb->irq_state[idx];
+
+ if (get_user(val, ubufp)) {
+ pr_err("fault getting user info !\n");
+ return -EFAULT;
+ }
+
+ arch_spin_lock(&sb->lock);
+
+ /*
+ * If the source doesn't already have an IPI, allocate
+ * one and get the corresponding data
+ */
+ if (!state->ipi_number) {
+ state->ipi_number = xive_native_alloc_irq();
+ if (state->ipi_number == 0) {
+ pr_err("Failed to allocate IRQ !\n");
+ rc = -ENXIO;
+ goto unlock;
+ }
+ xive_native_populate_irq_data(state->ipi_number,
+ &state->ipi_data);
+ pr_debug("%s allocated hw_irq=0x%x for irq=0x%lx\n", __func__,
+ state->ipi_number, irq);
+ }
+
+ /* Restore LSI state */
+ if (val & KVM_XIVE_LEVEL_SENSITIVE) {
+ state->lsi = true;
+ if (val & KVM_XIVE_LEVEL_ASSERTED)
+ state->asserted = true;
+ pr_devel(" LSI ! Asserted=%d\n", state->asserted);
+ }
+
+ /* Mask IRQ to start with */
+ state->act_server = 0;
+ state->act_priority = MASKED;
+ xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+ xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+
+ /* Increment the number of valid sources and mark this one valid */
+ if (!state->valid)
+ xive->src_count++;
+ state->valid = true;
+
+ rc = 0;
+
+unlock:
+ arch_spin_unlock(&sb->lock);
+
+ return rc;
+}
+
+static int kvmppc_xive_native_update_source_config(struct kvmppc_xive *xive,
+ struct kvmppc_xive_src_block *sb,
+ struct kvmppc_xive_irq_state *state,
+ u32 server, u8 priority, bool masked,
+ u32 eisn)
+{
+ struct kvm *kvm = xive->kvm;
+ u32 hw_num;
+ int rc = 0;
+
+ arch_spin_lock(&sb->lock);
+
+ if (state->act_server == server && state->act_priority == priority &&
+ state->eisn == eisn)
+ goto unlock;
+
+ pr_devel("new_act_prio=%d new_act_server=%d mask=%d act_server=%d act_prio=%d\n",
+ priority, server, masked, state->act_server,
+ state->act_priority);
+
+ kvmppc_xive_select_irq(state, &hw_num, NULL);
+
+ if (priority != MASKED && !masked) {
+ rc = kvmppc_xive_select_target(kvm, &server, priority);
+ if (rc)
+ goto unlock;
+
+ state->act_priority = priority;
+ state->act_server = server;
+ state->eisn = eisn;
+
+ rc = xive_native_configure_irq(hw_num,
+ kvmppc_xive_vp(xive, server),
+ priority, eisn);
+ } else {
+ state->act_priority = MASKED;
+ state->act_server = 0;
+ state->eisn = 0;
+
+ rc = xive_native_configure_irq(hw_num, 0, MASKED, 0);
+ }
+
+unlock:
+ arch_spin_unlock(&sb->lock);
+ return rc;
+}
+
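+/*
+ * KVM_DEV_XIVE_GRP_SOURCE_CONFIG: route a source to a server and
+ * priority and record the guest EISN, or mask it.
+ */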
+static int kvmppc_xive_native_set_source_config(struct kvmppc_xive *xive,
+ long irq, u64 addr)
+{
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ u64 __user *ubufp = (u64 __user *) addr;
+ u16 src;
+ u64 kvm_cfg;
+ u32 server;
+ u8 priority;
+ bool masked;
+ u32 eisn;
+
+ sb = kvmppc_xive_find_source(xive, irq, &src);
+ if (!sb)
+ return -ENOENT;
+
+ state = &sb->irq_state[src];
+
+ if (!state->valid)
+ return -EINVAL;
+
+ if (get_user(kvm_cfg, ubufp))
+ return -EFAULT;
+
+ pr_devel("%s irq=0x%lx cfg=%016llx\n", __func__, irq, kvm_cfg);
+
+ priority = (kvm_cfg & KVM_XIVE_SOURCE_PRIORITY_MASK) >>
+ KVM_XIVE_SOURCE_PRIORITY_SHIFT;
+ server = (kvm_cfg & KVM_XIVE_SOURCE_SERVER_MASK) >>
+ KVM_XIVE_SOURCE_SERVER_SHIFT;
+ masked = (kvm_cfg & KVM_XIVE_SOURCE_MASKED_MASK) >>
+ KVM_XIVE_SOURCE_MASKED_SHIFT;
+ eisn = (kvm_cfg & KVM_XIVE_SOURCE_EISN_MASK) >>
+ KVM_XIVE_SOURCE_EISN_SHIFT;
+
+ if (priority != xive_prio_from_guest(priority)) {
+ pr_err("invalid priority for queue %d for VCPU %d\n",
+ priority, server);
+ return -EINVAL;
+ }
+
+ return kvmppc_xive_native_update_source_config(xive, sb, state, server,
+ priority, masked, eisn);
+}
+
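+/*
+ * KVM_DEV_XIVE_GRP_SOURCE_SYNC: sync a valid source in the XIVE
+ * interrupt controller.
+ */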
+static int kvmppc_xive_native_sync_source(struct kvmppc_xive *xive,
+ long irq, u64 addr)
+{
+ struct kvmppc_xive_src_block *sb;
+ struct kvmppc_xive_irq_state *state;
+ struct xive_irq_data *xd;
+ u32 hw_num;
+ u16 src;
+ int rc = 0;
+
+ pr_devel("%s irq=0x%lx", __func__, irq);
+
+ sb = kvmppc_xive_find_source(xive, irq, &src);
+ if (!sb)
+ return -ENOENT;
+
+ state = &sb->irq_state[src];
+
+ rc = -EINVAL;
+
+ arch_spin_lock(&sb->lock);
+
+ if (state->valid) {
+ kvmppc_xive_select_irq(state, &hw_num, &xd);
+ xive_native_sync_source(hw_num);
+ rc = 0;
+ }
+
+ arch_spin_unlock(&sb->lock);
+ return rc;
+}
+
+static int xive_native_validate_queue_size(u32 qshift)
+{
+ /*
+ * We only support 64K pages for the moment. This is also
+ * advertised in the DT property "ibm,xive-eq-sizes"
+ */
+ switch (qshift) {
+ case 0: /* EQ reset */
+ case 16:
+ return 0;
+ case 12:
+ case 21:
+ case 24:
+ default:
+ return -EINVAL;
+ }
+}
+
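+/*
+ * KVM_DEV_XIVE_GRP_EQ_CONFIG (set): configure or reset the event
+ * queue of a server/priority pair, using a page of guest memory as
+ * the queue backing store.
+ */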
+static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive,
+ long eq_idx, u64 addr)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ struct kvmppc_xive_vcpu *xc;
+ void __user *ubufp = (void __user *) addr;
+ u32 server;
+ u8 priority;
+ struct kvm_ppc_xive_eq kvm_eq;
+ int rc;
+ __be32 *qaddr = 0;
+ struct page *page;
+ struct xive_q *q;
+ gfn_t gfn;
+ unsigned long page_size;
+ int srcu_idx;
+
+ /*
+ * Demangle priority/server tuple from the EQ identifier
+ */
+ priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
+ KVM_XIVE_EQ_PRIORITY_SHIFT;
+ server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
+ KVM_XIVE_EQ_SERVER_SHIFT;
+
+ if (copy_from_user(&kvm_eq, ubufp, sizeof(kvm_eq)))
+ return -EFAULT;
+
+ vcpu = kvmppc_xive_find_server(kvm, server);
+ if (!vcpu) {
+ pr_err("Can't find server %d\n", server);
+ return -ENOENT;
+ }
+ xc = vcpu->arch.xive_vcpu;
+
+ if (priority != xive_prio_from_guest(priority)) {
+ pr_err("Trying to restore invalid queue %d for VCPU %d\n",
+ priority, server);
+ return -EINVAL;
+ }
+ q = &xc->queues[priority];
+
+ pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
+ __func__, server, priority, kvm_eq.flags,
+ kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
+
+ /* reset queue and disable queueing */
+ if (!kvm_eq.qshift) {
+ q->guest_qaddr = 0;
+ q->guest_qshift = 0;
+
+ rc = xive_native_configure_queue(xc->vp_id, q, priority,
+ NULL, 0, true);
+ if (rc) {
+ pr_err("Failed to reset queue %d for VCPU %d: %d\n",
+ priority, xc->server_num, rc);
+ return rc;
+ }
+
+ if (q->qpage) {
+ put_page(virt_to_page(q->qpage));
+ q->qpage = NULL;
+ }
+
+ return 0;
+ }
+
+ /*
+ * sPAPR specifies an "Unconditional Notify (n)" flag for the
+ * H_INT_SET_QUEUE_CONFIG hcall which forces notification
+ * without using the coalescing mechanisms provided by the
+ * XIVE END ESBs. This is required on KVM as notification
+ * using the END ESBs is not supported.
+ */
+ if (kvm_eq.flags != KVM_XIVE_EQ_ALWAYS_NOTIFY) {
+ pr_err("invalid flags %d\n", kvm_eq.flags);
+ return -EINVAL;
+ }
+
+ rc = xive_native_validate_queue_size(kvm_eq.qshift);
+ if (rc) {
+ pr_err("invalid queue size %d\n", kvm_eq.qshift);
+ return rc;
+ }
+
+ if (kvm_eq.qaddr & ((1ull << kvm_eq.qshift) - 1)) {
+ pr_err("queue page is not aligned %llx/%llx\n", kvm_eq.qaddr,
+ 1ull << kvm_eq.qshift);
+ return -EINVAL;
+ }
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ gfn = gpa_to_gfn(kvm_eq.qaddr);
+ page = gfn_to_page(kvm, gfn);
+ if (is_error_page(page)) {
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ pr_err("Couldn't get queue page %llx!\n", kvm_eq.qaddr);
+ return -EINVAL;
+ }
+
+ page_size = kvm_host_page_size(kvm, gfn);
+ if (1ull << kvm_eq.qshift > page_size) {
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ pr_warn("Incompatible host page size %lx!\n", page_size);
+ return -EINVAL;
+ }
+
+ qaddr = page_to_virt(page) + (kvm_eq.qaddr & ~PAGE_MASK);
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ /*
+ * Back up the queue page guest address so that the EQ page can
+ * be marked dirty for migration.
+ */
+ q->guest_qaddr = kvm_eq.qaddr;
+ q->guest_qshift = kvm_eq.qshift;
+
+ /*
+ * Unconditional Notification is forced by default at the
+ * OPAL level because the use of END ESBs is not supported by
+ * Linux.
+ */
+ rc = xive_native_configure_queue(xc->vp_id, q, priority,
+ (__be32 *) qaddr, kvm_eq.qshift, true);
+ if (rc) {
+ pr_err("Failed to configure queue %d for VCPU %d: %d\n",
+ priority, xc->server_num, rc);
+ put_page(page);
+ return rc;
+ }
+
+ /*
+ * Only restore the queue state when needed. When doing the
+ * H_INT_SET_SOURCE_CONFIG hcall, it should not be.
+ */
+ if (kvm_eq.qtoggle != 1 || kvm_eq.qindex != 0) {
+ rc = xive_native_set_queue_state(xc->vp_id, priority,
+ kvm_eq.qtoggle,
+ kvm_eq.qindex);
+ if (rc)
+ goto error;
+ }
+
+ rc = kvmppc_xive_attach_escalation(vcpu, priority,
+ xive->single_escalation);
+error:
+ if (rc)
+ kvmppc_xive_native_cleanup_queue(vcpu, priority);
+ return rc;
+}
+
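+/*
+ * KVM_DEV_XIVE_GRP_EQ_CONFIG (get): retrieve the current
+ * configuration and state of the event queue of a server/priority
+ * pair.
+ */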
+static int kvmppc_xive_native_get_queue_config(struct kvmppc_xive *xive,
+ long eq_idx, u64 addr)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ struct kvmppc_xive_vcpu *xc;
+ struct xive_q *q;
+ void __user *ubufp = (u64 __user *) addr;
+ u32 server;
+ u8 priority;
+ struct kvm_ppc_xive_eq kvm_eq;
+ u64 qaddr;
+ u64 qshift;
+ u64 qeoi_page;
+ u32 escalate_irq;
+ u64 qflags;
+ int rc;
+
+ /*
+ * Demangle priority/server tuple from the EQ identifier
+ */
+ priority = (eq_idx & KVM_XIVE_EQ_PRIORITY_MASK) >>
+ KVM_XIVE_EQ_PRIORITY_SHIFT;
+ server = (eq_idx & KVM_XIVE_EQ_SERVER_MASK) >>
+ KVM_XIVE_EQ_SERVER_SHIFT;
+
+ vcpu = kvmppc_xive_find_server(kvm, server);
+ if (!vcpu) {
+ pr_err("Can't find server %d\n", server);
+ return -ENOENT;
+ }
+ xc = vcpu->arch.xive_vcpu;
+
+ if (priority != xive_prio_from_guest(priority)) {
+ pr_err("invalid priority for queue %d for VCPU %d\n",
+ priority, server);
+ return -EINVAL;
+ }
+ q = &xc->queues[priority];
+
+ memset(&kvm_eq, 0, sizeof(kvm_eq));
+
+ if (!q->qpage)
+ return 0;
+
+ rc = xive_native_get_queue_info(xc->vp_id, priority, &qaddr, &qshift,
+ &qeoi_page, &escalate_irq, &qflags);
+ if (rc)
+ return rc;
+
+ kvm_eq.flags = 0;
+ if (qflags & OPAL_XIVE_EQ_ALWAYS_NOTIFY)
+ kvm_eq.flags |= KVM_XIVE_EQ_ALWAYS_NOTIFY;
+
+ kvm_eq.qshift = q->guest_qshift;
+ kvm_eq.qaddr = q->guest_qaddr;
+
+ rc = xive_native_get_queue_state(xc->vp_id, priority, &kvm_eq.qtoggle,
+ &kvm_eq.qindex);
+ if (rc)
+ return rc;
+
+ pr_devel("%s VCPU %d priority %d fl:%x shift:%d addr:%llx g:%d idx:%d\n",
+ __func__, server, priority, kvm_eq.flags,
+ kvm_eq.qshift, kvm_eq.qaddr, kvm_eq.qtoggle, kvm_eq.qindex);
+
+ if (copy_to_user(ubufp, &kvm_eq, sizeof(kvm_eq)))
+ return -EFAULT;
+
+ return 0;
+}
+
+static void kvmppc_xive_reset_sources(struct kvmppc_xive_src_block *sb)
+{
+ int i;
+
+ for (i = 0; i < KVMPPC_XICS_IRQ_PER_ICS; i++) {
+ struct kvmppc_xive_irq_state *state = &sb->irq_state[i];
+
+ if (!state->valid)
+ continue;
+
+ if (state->act_priority == MASKED)
+ continue;
+
+ state->eisn = 0;
+ state->act_server = 0;
+ state->act_priority = MASKED;
+ xive_vm_esb_load(&state->ipi_data, XIVE_ESB_SET_PQ_01);
+ xive_native_configure_irq(state->ipi_number, 0, MASKED, 0);
+ if (state->pt_number) {
+ xive_vm_esb_load(state->pt_data, XIVE_ESB_SET_PQ_01);
+ xive_native_configure_irq(state->pt_number,
+ 0, MASKED, 0);
+ }
+ }
+}
+
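+/*
+ * KVM_DEV_XIVE_RESET: mask all sources, free the escalation
+ * interrupts and clean up the event queues of every vCPU.
+ */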
+static int kvmppc_xive_reset(struct kvmppc_xive *xive)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned int i;
+
+ pr_devel("%s\n", __func__);
+
+ mutex_lock(&xive->lock);
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ unsigned int prio;
+
+ if (!xc)
+ continue;
+
+ kvmppc_xive_disable_vcpu_interrupts(vcpu);
+
+ for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+
+ /* Single escalation, no queue 7 */
+ if (prio == 7 && xive->single_escalation)
+ break;
+
+ if (xc->esc_virq[prio]) {
+ free_irq(xc->esc_virq[prio], vcpu);
+ irq_dispose_mapping(xc->esc_virq[prio]);
+ kfree(xc->esc_virq_names[prio]);
+ xc->esc_virq[prio] = 0;
+ }
+
+ kvmppc_xive_native_cleanup_queue(vcpu, prio);
+ }
+ }
+
+ for (i = 0; i <= xive->max_sbid; i++) {
+ struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+ if (sb) {
+ arch_spin_lock(&sb->lock);
+ kvmppc_xive_reset_sources(sb);
+ arch_spin_unlock(&sb->lock);
+ }
+ }
+
+ mutex_unlock(&xive->lock);
+
+ return 0;
+}
+
+static void kvmppc_xive_native_sync_sources(struct kvmppc_xive_src_block *sb)
+{
+ int j;
+
+ for (j = 0; j < KVMPPC_XICS_IRQ_PER_ICS; j++) {
+ struct kvmppc_xive_irq_state *state = &sb->irq_state[j];
+ struct xive_irq_data *xd;
+ u32 hw_num;
+
+ if (!state->valid)
+ continue;
+
+ /*
+ * The struct kvmppc_xive_irq_state reflects the state
+ * of the EAS configuration and not the state of the
+ * source. The source is masked by setting the PQ bits to
+ * '-Q', which is what is done before calling
+ * the KVM_DEV_XIVE_EQ_SYNC control.
+ *
+ * If a source EAS is configured, OPAL syncs the XIVE
+ * IC of the source and the XIVE IC of the previous
+ * target if any.
+ *
+ * So it should be fine ignoring MASKED sources as
+ * they have been synced already.
+ */
+ if (state->act_priority == MASKED)
+ continue;
+
+ kvmppc_xive_select_irq(state, &hw_num, &xd);
+ xive_native_sync_source(hw_num);
+ xive_native_sync_queue(hw_num);
+ }
+}
+
+static int kvmppc_xive_native_vcpu_eq_sync(struct kvm_vcpu *vcpu)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ unsigned int prio;
+ int srcu_idx;
+
+ if (!xc)
+ return -ENOENT;
+
+ for (prio = 0; prio < KVMPPC_XIVE_Q_COUNT; prio++) {
+ struct xive_q *q = &xc->queues[prio];
+
+ if (!q->qpage)
+ continue;
+
+ /* Mark EQ page dirty for migration */
+ srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+ mark_page_dirty(vcpu->kvm, gpa_to_gfn(q->guest_qaddr));
+ srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
+ }
+ return 0;
+}
+
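+/*
+ * KVM_DEV_XIVE_EQ_SYNC: sync all sources and queues and mark the EQ
+ * pages dirty for migration.
+ */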
+static int kvmppc_xive_native_eq_sync(struct kvmppc_xive *xive)
+{
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned int i;
+
+ pr_devel("%s\n", __func__);
+
+ mutex_lock(&xive->lock);
+ for (i = 0; i <= xive->max_sbid; i++) {
+ struct kvmppc_xive_src_block *sb = xive->src_blocks[i];
+
+ if (sb) {
+ arch_spin_lock(&sb->lock);
+ kvmppc_xive_native_sync_sources(sb);
+ arch_spin_unlock(&sb->lock);
+ }
+ }
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvmppc_xive_native_vcpu_eq_sync(vcpu);
+ }
+ mutex_unlock(&xive->lock);
+
+ return 0;
+}
+
+static int kvmppc_xive_native_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct kvmppc_xive *xive = dev->private;
+
+ switch (attr->group) {
+ case KVM_DEV_XIVE_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XIVE_RESET:
+ return kvmppc_xive_reset(xive);
+ case KVM_DEV_XIVE_EQ_SYNC:
+ return kvmppc_xive_native_eq_sync(xive);
+ }
+ break;
+ case KVM_DEV_XIVE_GRP_SOURCE:
+ return kvmppc_xive_native_set_source(xive, attr->attr,
+ attr->addr);
+ case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
+ return kvmppc_xive_native_set_source_config(xive, attr->attr,
+ attr->addr);
+ case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+ return kvmppc_xive_native_set_queue_config(xive, attr->attr,
+ attr->addr);
+ case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
+ return kvmppc_xive_native_sync_source(xive, attr->attr,
+ attr->addr);
+ }
+ return -ENXIO;
+}
+
+static int kvmppc_xive_native_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct kvmppc_xive *xive = dev->private;
+
+ switch (attr->group) {
+ case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+ return kvmppc_xive_native_get_queue_config(xive, attr->attr,
+ attr->addr);
+ }
+ return -ENXIO;
+}
+
+static int kvmppc_xive_native_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_XIVE_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_XIVE_RESET:
+ case KVM_DEV_XIVE_EQ_SYNC:
+ return 0;
+ }
+ break;
+ case KVM_DEV_XIVE_GRP_SOURCE:
+ case KVM_DEV_XIVE_GRP_SOURCE_CONFIG:
+ case KVM_DEV_XIVE_GRP_SOURCE_SYNC:
+ if (attr->attr >= KVMPPC_XIVE_FIRST_IRQ &&
+ attr->attr < KVMPPC_XIVE_NR_IRQS)
+ return 0;
+ break;
+ case KVM_DEV_XIVE_GRP_EQ_CONFIG:
+ return 0;
+ }
+ return -ENXIO;
+}
+
+/*
+ * Called when device fd is closed. kvm->lock is held.
+ */
+static void kvmppc_xive_native_release(struct kvm_device *dev)
+{
+ struct kvmppc_xive *xive = dev->private;
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ int i;
+
+ pr_devel("Releasing xive native device\n");
+
+ /*
+ * Clear the KVM device file address_space, which is used to
+ * unmap the ESB pages when a device is passed through.
+ */
+ mutex_lock(&xive->mapping_lock);
+ xive->mapping = NULL;
+ mutex_unlock(&xive->mapping_lock);
+
+ /*
+ * Since this is the device release function, we know that
+ * userspace does not have any open fd or mmap referring to
+ * the device. Therefore none of the device attribute set/get,
+ * mmap, or page fault functions can be executing concurrently,
+ * and similarly, the connect_vcpu and set/clr_mapped functions
+ * cannot be running either.
+ */
+
+ debugfs_remove(xive->dentry);
+
+ /*
+ * We should clean up the vCPU interrupt presenters first.
+ */
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ /*
+ * Take vcpu->mutex to ensure that no one_reg get/set ioctl
+ * (i.e. kvmppc_xive_native_[gs]et_vp) can be in progress.
+ * Holding the vcpu->mutex also means that the vcpu cannot
+ * be executing the KVM_RUN ioctl, and therefore it cannot
+ * be executing the XIVE push or pull code or accessing
+ * the XIVE MMIO regions.
+ */
+ mutex_lock(&vcpu->mutex);
+ kvmppc_xive_native_cleanup_vcpu(vcpu);
+ mutex_unlock(&vcpu->mutex);
+ }
+
+ /*
+ * Now that we have cleared vcpu->arch.xive_vcpu, vcpu->arch.irq_type
+ * and vcpu->arch.xive_esc_[vr]addr on each vcpu, we are safe
+ * against xive code getting called during vcpu execution or
+ * set/get one_reg operations.
+ */
+ kvm->arch.xive = NULL;
+
+ for (i = 0; i <= xive->max_sbid; i++) {
+ if (xive->src_blocks[i])
+ kvmppc_xive_free_sources(xive->src_blocks[i]);
+ kfree(xive->src_blocks[i]);
+ xive->src_blocks[i] = NULL;
+ }
+
+ if (xive->vp_base != XIVE_INVALID_VP)
+ xive_native_free_vp_block(xive->vp_base);
+
+ /*
+ * A reference to the kvmppc_xive structure is now kept under
+ * the xive_devices struct of the machine for reuse. For now it
+ * is only freed when the VM is destroyed, until we fix all the
+ * execution paths.
+ */
+
+ kfree(dev);
+}
+
+/*
+ * Create a XIVE device. kvm->lock is held.
+ */
+static int kvmppc_xive_native_create(struct kvm_device *dev, u32 type)
+{
+ struct kvmppc_xive *xive;
+ struct kvm *kvm = dev->kvm;
+ int ret = 0;
+
+ pr_devel("Creating xive native device\n");
+
+ if (kvm->arch.xive)
+ return -EEXIST;
+
+ xive = kvmppc_xive_get_device(kvm, type);
+ if (!xive)
+ return -ENOMEM;
+
+ dev->private = xive;
+ xive->dev = dev;
+ xive->kvm = kvm;
+ kvm->arch.xive = xive;
+ mutex_init(&xive->mapping_lock);
+ mutex_init(&xive->lock);
+
+ /*
+ * Allocate a bunch of VPs. KVM_MAX_VCPUS is a large value for
+ * a default. Getting the max number of CPUs the VM was
+ * configured with would improve our usage of the XIVE VP space.
+ */
+ xive->vp_base = xive_native_alloc_vp_block(KVM_MAX_VCPUS);
+ pr_devel("VP_Base=%x\n", xive->vp_base);
+
+ if (xive->vp_base == XIVE_INVALID_VP)
+ ret = -ENXIO;
+
+ xive->single_escalation = xive_native_has_single_escalation();
+ xive->ops = &kvmppc_xive_native_ops;
+
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+/*
+ * Interrupt Pending Buffer (IPB) offset
+ */
+#define TM_IPB_SHIFT 40
+#define TM_IPB_MASK (((u64) 0xFF) << TM_IPB_SHIFT)
+
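+/*
+ * One_reg get accessor: return the vCPU thread context (IPB, CPPR)
+ * merged with the backup of the IPB register held by the XIVE NVT
+ * in OPAL.
+ */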
+int kvmppc_xive_native_get_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ u64 opal_state;
+ int rc;
+
+ if (!kvmppc_xive_enabled(vcpu))
+ return -EPERM;
+
+ if (!xc)
+ return -ENOENT;
+
+ /* Thread context registers. We only care about IPB and CPPR */
+ val->xive_timaval[0] = vcpu->arch.xive_saved_state.w01;
+
+ /* Get the VP state from OPAL */
+ rc = xive_native_get_vp_state(xc->vp_id, &opal_state);
+ if (rc)
+ return rc;
+
+ /*
+ * Capture the backup of the IPB register from the NVT structure
+ * and merge it into our KVM VP state.
+ */
+ val->xive_timaval[0] |= cpu_to_be64(opal_state & TM_IPB_MASK);
+
+ pr_devel("%s NSR=%02x CPPR=%02x IBP=%02x PIPR=%02x w01=%016llx w2=%08x opal=%016llx\n",
+ __func__,
+ vcpu->arch.xive_saved_state.nsr,
+ vcpu->arch.xive_saved_state.cppr,
+ vcpu->arch.xive_saved_state.ipb,
+ vcpu->arch.xive_saved_state.pipr,
+ vcpu->arch.xive_saved_state.w01,
+ (u32) vcpu->arch.xive_cam_word, opal_state);
+
+ return 0;
+}
+
+int kvmppc_xive_native_set_vp(struct kvm_vcpu *vcpu, union kvmppc_one_reg *val)
+{
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+ struct kvmppc_xive *xive = vcpu->kvm->arch.xive;
+
+ pr_devel("%s w01=%016llx vp=%016llx\n", __func__,
+ val->xive_timaval[0], val->xive_timaval[1]);
+
+ if (!kvmppc_xive_enabled(vcpu))
+ return -EPERM;
+
+ if (!xc || !xive)
+ return -ENOENT;
+
+ /* We can't update the state of a "pushed" VCPU */
+ if (WARN_ON(vcpu->arch.xive_pushed))
+ return -EBUSY;
+
+ /*
+ * Restore the thread context registers. IPB and CPPR should
+ * be the only ones that matter.
+ */
+ vcpu->arch.xive_saved_state.w01 = val->xive_timaval[0];
+
+ /*
+ * There is no need to restore the XIVE internal state (IPB
+ * stored in the NVT) as the IPB register was merged into the
+ * KVM VP state when it was captured.
+ */
+ return 0;
+}
+
+bool kvmppc_xive_native_supported(void)
+{
+ return xive_native_has_queue_state_support();
+}
+
+static int xive_native_debug_show(struct seq_file *m, void *private)
+{
+ struct kvmppc_xive *xive = m->private;
+ struct kvm *kvm = xive->kvm;
+ struct kvm_vcpu *vcpu;
+ unsigned int i;
+
+ if (!kvm)
+ return 0;
+
+ seq_puts(m, "=========\nVCPU state\n=========\n");
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu;
+
+ if (!xc)
+ continue;
+
+ seq_printf(m, "cpu server %#x NSR=%02x CPPR=%02x IPB=%02x PIPR=%02x w01=%016llx w2=%08x\n",
+ xc->server_num,
+ vcpu->arch.xive_saved_state.nsr,
+ vcpu->arch.xive_saved_state.cppr,
+ vcpu->arch.xive_saved_state.ipb,
+ vcpu->arch.xive_saved_state.pipr,
+ vcpu->arch.xive_saved_state.w01,
+ (u32) vcpu->arch.xive_cam_word);
+
+ kvmppc_xive_debug_show_queues(m, vcpu);
+ }
+
+ return 0;
+}
+
+static int xive_native_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, xive_native_debug_show, inode->i_private);
+}
+
+static const struct file_operations xive_native_debug_fops = {
+ .open = xive_native_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static void xive_native_debugfs_init(struct kvmppc_xive *xive)
+{
+ char *name;
+
+ name = kasprintf(GFP_KERNEL, "kvm-xive-%p", xive);
+ if (!name) {
+ pr_err("%s: no memory for name\n", __func__);
+ return;
+ }
+
+ xive->dentry = debugfs_create_file(name, 0444, powerpc_debugfs_root,
+ xive, &xive_native_debug_fops);
+
+ pr_debug("%s: created %s\n", __func__, name);
+ kfree(name);
+}
+
+static void kvmppc_xive_native_init(struct kvm_device *dev)
+{
+ struct kvmppc_xive *xive = (struct kvmppc_xive *)dev->private;
+
+ /* Register some debug interfaces */
+ xive_native_debugfs_init(xive);
+}
+
+struct kvm_device_ops kvm_xive_native_ops = {
+ .name = "kvm-xive-native",
+ .create = kvmppc_xive_native_create,
+ .init = kvmppc_xive_native_init,
+ .release = kvmppc_xive_native_release,
+ .set_attr = kvmppc_xive_native_set_attr,
+ .get_attr = kvmppc_xive_native_get_attr,
+ .has_attr = kvmppc_xive_native_has_attr,
+ .mmap = kvmppc_xive_native_mmap,
+};
+
+void kvmppc_xive_native_init_module(void)
+{
+ ;
+}
+
+void kvmppc_xive_native_exit_module(void)
+{
+ ;
+}
diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 4171ede..a8a900a 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -1,9 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright 2017 Benjamin Herrenschmidt, IBM Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
/* File to be included by other .c files */
@@ -130,25 +127,15 @@
*/
prio = ffs(pending) - 1;
- /*
- * If the most favoured prio we found pending is less
- * favored (or equal) than a pending IPI, we return
- * the IPI instead.
- *
- * Note: If pending was 0 and mfrr is 0xff, we will
- * not spurriously take an IPI because mfrr cannot
- * then be smaller than cppr.
- */
- if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
- prio = xc->mfrr;
- hirq = XICS_IPI;
+ /* Don't scan past the guest cppr */
+ if (prio >= xc->cppr || prio > 7) {
+ if (xc->mfrr < xc->cppr) {
+ prio = xc->mfrr;
+ hirq = XICS_IPI;
+ }
break;
}
- /* Don't scan past the guest cppr */
- if (prio >= xc->cppr || prio > 7)
- break;
-
/* Grab queue and pointers */
q = &xc->queues[prio];
idx = q->idx;
@@ -184,9 +171,12 @@
* been set and another occurrence of the IPI will trigger.
*/
if (hirq == XICS_IPI || (prio == 0 && !qpage)) {
- if (scan_type == scan_fetch)
+ if (scan_type == scan_fetch) {
GLUE(X_PFX,source_eoi)(xc->vp_ipi,
&xc->vp_ipi_data);
+ q->idx = idx;
+ q->toggle = toggle;
+ }
/* Loop back on same queue with updated idx/toggle */
#ifdef XIVE_RUNTIME_CHECKS
WARN_ON(hirq && hirq != XICS_IPI);
@@ -199,32 +189,41 @@
if (hirq == XICS_DUMMY)
goto skip_ipi;
+ /* Clear the pending bit if the queue is now empty */
+ if (!hirq) {
+ pending &= ~(1 << prio);
+
+ /*
+ * Check if the queue count needs adjusting due to
+ * interrupts being moved away.
+ */
+ if (atomic_read(&q->pending_count)) {
+ int p = atomic_xchg(&q->pending_count, 0);
+ if (p) {
+#ifdef XIVE_RUNTIME_CHECKS
+ WARN_ON(p > atomic_read(&q->count));
+#endif
+ atomic_sub(p, &q->count);
+ }
+ }
+ }
+
+ /*
+ * If the most favoured prio we found pending is less
+ * favoured than (or equal to) a pending IPI, we return
+ * the IPI instead.
+ */
+ if (prio >= xc->mfrr && xc->mfrr < xc->cppr) {
+ prio = xc->mfrr;
+ hirq = XICS_IPI;
+ break;
+ }
+
/* If fetching, update queue pointers */
if (scan_type == scan_fetch) {
q->idx = idx;
q->toggle = toggle;
}
-
- /* Something found, stop searching */
- if (hirq)
- break;
-
- /* Clear the pending bit on the now empty queue */
- pending &= ~(1 << prio);
-
- /*
- * Check if the queue count needs adjusting due to
- * interrupts being moved away.
- */
- if (atomic_read(&q->pending_count)) {
- int p = atomic_xchg(&q->pending_count, 0);
- if (p) {
-#ifdef XIVE_RUNTIME_CHECKS
- WARN_ON(p > atomic_read(&q->count));
-#endif
- atomic_sub(p, &q->count);
- }
- }
}
/* If we are just taking a "peek", do nothing else */
@@ -280,14 +279,6 @@
/* First collect pending bits from HW */
GLUE(X_PFX,ack_pending)(xc);
- /*
- * Cleanup the old-style bits if needed (they may have been
- * set by pull or an escalation interrupts).
- */
- if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
- clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
- &vcpu->arch.pending_exceptions);
-
pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
xc->pending, xc->hw_cppr, xc->cppr);
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index a9ca016..be9a458 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2007
* Copyright 2010-2011 Freescale Semiconductor, Inc.
@@ -1833,7 +1822,8 @@
void kvmppc_core_commit_memory_region(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
const struct kvm_memory_slot *old,
- const struct kvm_memory_slot *new)
+ const struct kvm_memory_slot *new,
+ enum kvm_mr_change change)
{
}
diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
index 22ba08e..9d3169f 100644
--- a/arch/powerpc/kvm/booke.h
+++ b/arch/powerpc/kvm/booke.h
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2008
*
diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
index d23e582..689ff5f 100644
--- a/arch/powerpc/kvm/booke_emulate.c
+++ b/arch/powerpc/kvm/booke_emulate.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2008
* Copyright 2011 Freescale Semiconductor, Inc.
diff --git a/arch/powerpc/kvm/booke_interrupts.S b/arch/powerpc/kvm/booke_interrupts.S
index 84c308a..2e56ab5 100644
--- a/arch/powerpc/kvm/booke_interrupts.S
+++ b/arch/powerpc/kvm/booke_interrupts.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2007
* Copyright 2011 Freescale Semiconductor, Inc.
diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
index 81bd8a0..c577ba4 100644
--- a/arch/powerpc/kvm/bookehv_interrupts.S
+++ b/arch/powerpc/kvm/bookehv_interrupts.S
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright (C) 2010-2011 Freescale Semiconductor, Inc.
*
@@ -75,6 +64,10 @@
PPC_LL r1, VCPU_HOST_STACK(r4)
PPC_LL r2, HOST_R2(r1)
+START_BTB_FLUSH_SECTION
+ BTB_FLUSH(r10)
+END_BTB_FLUSH_SECTION
+
mfspr r10, SPRN_PID
lwz r8, VCPU_HOST_PID(r4)
PPC_LL r11, VCPU_SHARED(r4)
@@ -182,7 +175,7 @@
*/
PPC_LL r4, PACACURRENT(r13)
PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
- stw r10, VCPU_CR(r4)
+ PPC_STL r10, VCPU_CR(r4)
PPC_STL r11, VCPU_GPR(R4)(r4)
PPC_STL r5, VCPU_GPR(R5)(r4)
PPC_STL r6, VCPU_GPR(R6)(r4)
@@ -292,7 +285,7 @@
PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, THREAD_NORMSAVE(0)(r10)
PPC_STL r5, VCPU_GPR(R5)(r11)
- stw r13, VCPU_CR(r11)
+ PPC_STL r13, VCPU_CR(r11)
mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R10)(r11)
PPC_LL r3, THREAD_NORMSAVE(2)(r10)
@@ -319,7 +312,7 @@
PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, GPR9(r8)
PPC_STL r5, VCPU_GPR(R5)(r11)
- stw r9, VCPU_CR(r11)
+ PPC_STL r9, VCPU_CR(r11)
mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R8)(r11)
PPC_LL r3, GPR10(r8)
@@ -643,7 +636,7 @@
PPC_LL r3, VCPU_LR(r4)
PPC_LL r5, VCPU_XER(r4)
PPC_LL r6, VCPU_CTR(r4)
- lwz r7, VCPU_CR(r4)
+ PPC_LL r7, VCPU_CR(r4)
PPC_LL r8, VCPU_PC(r4)
PPC_LD(r9, VCPU_SHARED_MSR, r11)
PPC_LL r0, VCPU_GPR(R0)(r4)
diff --git a/arch/powerpc/kvm/e500.c b/arch/powerpc/kvm/e500.c
index afd3c25..00649ca 100644
--- a/arch/powerpc/kvm/e500.c
+++ b/arch/powerpc/kvm/e500.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -6,10 +7,6 @@
* Description:
* This file is derived from arch/powerpc/kvm/44x.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
@@ -443,6 +440,9 @@
struct kvm_vcpu *vcpu;
int err;
+ BUILD_BUG_ON_MSG(offsetof(struct kvmppc_vcpu_e500, vcpu) != 0,
+ "struct kvm_vcpu must be at offset 0 for arch usercopy region");
+
vcpu_e500 = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!vcpu_e500) {
err = -ENOMEM;
diff --git a/arch/powerpc/kvm/e500.h b/arch/powerpc/kvm/e500.h
index 94f04fc..c3ef751 100644
--- a/arch/powerpc/kvm/e500.h
+++ b/arch/powerpc/kvm/e500.h
@@ -1,3 +1,4 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -10,17 +11,13 @@
* This file is based on arch/powerpc/kvm/44x_tlb.h and
* arch/powerpc/include/asm/kvm_44x.h by Hollis Blanchard <hollisb@us.ibm.com>,
* Copyright IBM Corp. 2007-2008
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#ifndef KVM_E500_H
#define KVM_E500_H
#include <linux/kvm_host.h>
-#include <asm/mmu-book3e.h>
+#include <asm/nohash/mmu-book3e.h>
#include <asm/tlb.h>
#include <asm/cputhreads.h>
diff --git a/arch/powerpc/kvm/e500_emulate.c b/arch/powerpc/kvm/e500_emulate.c
index 3f8189e..3d0d3ec 100644
--- a/arch/powerpc/kvm/e500_emulate.c
+++ b/arch/powerpc/kvm/e500_emulate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008-2011 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -6,10 +7,6 @@
* Description:
* This file is derived from arch/powerpc/kvm/44x_emulate.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <asm/kvm_ppc.h>
@@ -277,6 +274,13 @@
vcpu->arch.pwrmgtcr0 = spr_val;
break;
+ case SPRN_BUCSR:
+ /*
+ * If we are here, it means that we have already flushed the
+ * branch predictor, so just return to guest.
+ */
+ break;
+
/* extra exceptions */
#ifdef CONFIG_SPE_POSSIBLE
case SPRN_IVOR32:
diff --git a/arch/powerpc/kvm/e500_mmu.c b/arch/powerpc/kvm/e500_mmu.c
index 24296f4..2d910b8 100644
--- a/arch/powerpc/kvm/e500_mmu.c
+++ b/arch/powerpc/kvm/e500_mmu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -10,10 +11,6 @@
* Description:
* This file is based on arch/powerpc/kvm/44x_tlb.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -783,7 +780,7 @@
if (!pages)
return -ENOMEM;
- ret = get_user_pages_fast(cfg->array, num_pages, 1, pages);
+ ret = get_user_pages_fast(cfg->array, num_pages, FOLL_WRITE, pages);
if (ret < 0)
goto free_pages;
diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c
index 8f2985e..321db0f 100644
--- a/arch/powerpc/kvm/e500_mmu_host.c
+++ b/arch/powerpc/kvm/e500_mmu_host.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -10,10 +11,6 @@
* Description:
* This file is based on arch/powerpc/kvm/44x_tlb.c,
* by Hollis Blanchard <hollisb@us.ibm.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kernel.h>
@@ -757,10 +754,11 @@
return 0;
}
-void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
+int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
{
/* The page will get remapped properly on its next fault */
kvm_unmap_hva(kvm, hva);
+ return 0;
}
/*****************************************/
diff --git a/arch/powerpc/kvm/e500_mmu_host.h b/arch/powerpc/kvm/e500_mmu_host.h
index 7624835..d8178cc 100644
--- a/arch/powerpc/kvm/e500_mmu_host.h
+++ b/arch/powerpc/kvm/e500_mmu_host.h
@@ -1,9 +1,6 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2008-2013 Freescale Semiconductor, Inc. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#ifndef KVM_E500_MMU_HOST_H
diff --git a/arch/powerpc/kvm/e500mc.c b/arch/powerpc/kvm/e500mc.c
index d316454..318e65c 100644
--- a/arch/powerpc/kvm/e500mc.c
+++ b/arch/powerpc/kvm/e500mc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2010,2012 Freescale Semiconductor, Inc. All rights reserved.
*
@@ -6,10 +7,6 @@
* Description:
* This file is derived from arch/powerpc/kvm/e500.c,
* by Yu Liu <yu.liu@freescale.com>.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
*/
#include <linux/kvm_host.h>
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index fa888bf..6fca38c 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2007
* Copyright 2011 Freescale Semiconductor, Inc.
@@ -61,11 +50,10 @@
dec_time = vcpu->arch.dec;
/*
- * Guest timebase ticks at the same frequency as host decrementer.
- * So use the host decrementer calculations for decrementer emulation.
+ * Guest timebase ticks at the same frequency as host timebase.
+ * So use the host timebase calculations for decrementer emulation.
*/
- dec_time = dec_time << decrementer_clockevent.shift;
- do_div(dec_time, decrementer_clockevent.mult);
+ dec_time = tb_to_ns(dec_time);
dec_nsec = do_div(dec_time, NSEC_PER_SEC);
hrtimer_start(&vcpu->arch.dec_timer,
ktime_set(dec_time, dec_nsec), HRTIMER_MODE_REL);
@@ -283,6 +271,7 @@
*/
if (inst == KVMPPC_INST_SW_BREAKPOINT) {
run->exit_reason = KVM_EXIT_DEBUG;
+ run->debug.arch.status = 0;
run->debug.arch.address = kvmppc_get_pc(vcpu);
emulated = EMULATE_EXIT_USER;
advance = 0;
diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c
index 75dce1e..2e496eb 100644
--- a/arch/powerpc/kvm/emulate_loadstore.c
+++ b/arch/powerpc/kvm/emulate_loadstore.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2007
* Copyright 2011 Freescale Semiconductor, Inc.
@@ -100,12 +89,6 @@
rs = get_rs(inst);
rt = get_rt(inst);
- /*
- * if mmio_vsx_tx_sx_enabled == 0, copy data between
- * VSR[0..31] and memory
- * if mmio_vsx_tx_sx_enabled == 1, copy data between
- * VSR[32..63] and memory
- */
vcpu->arch.mmio_vsx_copy_nums = 0;
vcpu->arch.mmio_vsx_offset = 0;
vcpu->arch.mmio_copy_type = KVMPPC_VSX_COPY_NONE;
@@ -117,7 +100,6 @@
emulated = EMULATE_FAIL;
vcpu->arch.regs.msr = vcpu->arch.shared->msr;
- vcpu->arch.regs.ccr = vcpu->arch.cr;
if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
int type = op.type & INSTR_TYPE_MASK;
int size = GETSIZE(op.type);
diff --git a/arch/powerpc/kvm/fpu.S b/arch/powerpc/kvm/fpu.S
index bf68d59..3dfae0c 100644
--- a/arch/powerpc/kvm/fpu.S
+++ b/arch/powerpc/kvm/fpu.S
@@ -1,13 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
* FPU helper code to use FPU operations from inside the kernel
*
* Copyright (C) 2010 Alexander Graf (agraf@suse.de)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
*/
#include <asm/reg.h>
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index eba5756..3a77bb6 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2007
*
@@ -61,6 +50,11 @@
return !!(v->arch.pending_exceptions) || kvm_request_pending(v);
}
+bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
+{
+ return kvm_arch_vcpu_runnable(vcpu);
+}
+
bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
return false;
@@ -331,10 +325,17 @@
{
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
- int r;
+ int r = -EINVAL;
vcpu->stat.st++;
+ if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->store_to_eaddr)
+ r = vcpu->kvm->arch.kvm_ops->store_to_eaddr(vcpu, eaddr, ptr,
+ size);
+
+ if ((!r) || (r == -EAGAIN))
+ return r;
+
r = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
XLATE_WRITE, &pte);
if (r < 0)
@@ -367,10 +368,17 @@
{
ulong mp_pa = vcpu->arch.magic_page_pa & KVM_PAM & PAGE_MASK;
struct kvmppc_pte pte;
- int rc;
+ int rc = -EINVAL;
vcpu->stat.ld++;
+ if (vcpu->kvm->arch.kvm_ops && vcpu->kvm->arch.kvm_ops->load_from_eaddr)
+ rc = vcpu->kvm->arch.kvm_ops->load_from_eaddr(vcpu, eaddr, ptr,
+ size);
+
+ if ((!rc) || (rc == -EAGAIN))
+ return rc;
+
rc = kvmppc_xlate(vcpu, *eaddr, data ? XLATE_DATA : XLATE_INST,
XLATE_READ, &pte);
if (rc)
@@ -411,9 +419,9 @@
return 0;
}
-void kvm_arch_check_processor_compat(void *rtn)
+int kvm_arch_check_processor_compat(void)
{
- *(int *)rtn = kvmppc_core_check_processor_compat();
+ return kvmppc_core_check_processor_compat();
}
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
@@ -449,16 +457,6 @@
return -EINVAL;
}
-bool kvm_arch_has_vcpu_debugfs(void)
-{
- return false;
-}
-
-int kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
-{
- return 0;
-}
-
void kvm_arch_destroy_vm(struct kvm *kvm)
{
unsigned int i;
@@ -518,7 +516,6 @@
case KVM_CAP_PPC_UNSET_IRQ:
case KVM_CAP_PPC_IRQ_LEVEL:
case KVM_CAP_ENABLE_CAP:
- case KVM_CAP_ENABLE_CAP_VM:
case KVM_CAP_ONE_REG:
case KVM_CAP_IOEVENTFD:
case KVM_CAP_DEVICE_CTRL:
@@ -543,8 +540,11 @@
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_SPAPR_TCE:
case KVM_CAP_SPAPR_TCE_64:
- /* fallthrough */
+ r = 1;
+ break;
case KVM_CAP_SPAPR_TCE_VFIO:
+ r = !!cpu_has_feature(CPU_FTR_HVMODE);
+ break;
case KVM_CAP_PPC_RTAS:
case KVM_CAP_PPC_FIXUP_HCALL:
case KVM_CAP_PPC_ENABLE_HCALL:
@@ -554,6 +554,17 @@
case KVM_CAP_PPC_GET_CPU_CHAR:
r = 1;
break;
+#ifdef CONFIG_KVM_XIVE
+ case KVM_CAP_PPC_IRQ_XIVE:
+ /*
+ * We need XIVE to be enabled on the platform (implies
+ * a POWER9 processor) and the PowerNV platform, as
+ * nested is not yet supported.
+ */
+ r = xive_enabled() && !!cpu_has_feature(CPU_FTR_HVMODE) &&
+ kvmppc_xive_native_supported();
+ break;
+#endif
case KVM_CAP_PPC_ALLOC_HTAB:
r = hv_enabled;
@@ -594,7 +605,12 @@
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
- r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
+ r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
+ cpu_has_feature(CPU_FTR_HVMODE));
+ break;
+ case KVM_CAP_PPC_NESTED_HV:
+ r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
+ !kvmppc_hv_ops->enable_nested(NULL));
break;
#endif
case KVM_CAP_SYNC_MMU:
@@ -623,12 +639,12 @@
else
r = num_online_cpus();
break;
- case KVM_CAP_NR_MEMSLOTS:
- r = KVM_USER_MEM_SLOTS;
- break;
case KVM_CAP_MAX_VCPUS:
r = KVM_MAX_VCPUS;
break;
+ case KVM_CAP_MAX_VCPU_ID:
+ r = KVM_MAX_VCPU_ID;
+ break;
#ifdef CONFIG_PPC_BOOK3S_64
case KVM_CAP_PPC_GET_SMMU_INFO:
r = 1;
@@ -691,7 +707,7 @@
const struct kvm_memory_slot *new,
enum kvm_mr_change change)
{
- kvmppc_core_commit_memory_region(kvm, mem, old, new);
+ kvmppc_core_commit_memory_region(kvm, mem, old, new, change);
}
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
@@ -727,11 +743,14 @@
kvmppc_mpic_disconnect_vcpu(vcpu->arch.mpic, vcpu);
break;
case KVMPPC_IRQ_XICS:
- if (xive_enabled())
+ if (xics_on_xive())
kvmppc_xive_cleanup_vcpu(vcpu);
else
kvmppc_xics_free_icp(vcpu);
break;
+ case KVMPPC_IRQ_XIVE:
+ kvmppc_xive_native_cleanup_vcpu(vcpu);
+ break;
}
kvmppc_core_vcpu_free(vcpu);
@@ -1187,6 +1206,14 @@
kvmppc_set_vmx_byte(vcpu, gpr);
break;
#endif
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+ case KVM_MMIO_REG_NESTED_GPR:
+ if (kvmppc_need_byteswap(vcpu))
+ gpr = swab64(gpr);
+ kvm_vcpu_write_guest(vcpu, vcpu->arch.nested_io_gpr, &gpr,
+ sizeof(gpr));
+ break;
+#endif
default:
BUG();
}
@@ -1902,7 +1929,7 @@
r = -EPERM;
dev = kvm_device_from_filp(f.file);
if (dev) {
- if (xive_enabled())
+ if (xics_on_xive())
r = kvmppc_xive_connect_vcpu(dev, vcpu, cap->args[1]);
else
r = kvmppc_xics_connect_vcpu(dev, vcpu, cap->args[1]);
@@ -1912,6 +1939,30 @@
break;
}
#endif /* CONFIG_KVM_XICS */
+#ifdef CONFIG_KVM_XIVE
+ case KVM_CAP_PPC_IRQ_XIVE: {
+ struct fd f;
+ struct kvm_device *dev;
+
+ r = -EBADF;
+ f = fdget(cap->args[0]);
+ if (!f.file)
+ break;
+
+ r = -ENXIO;
+ if (!xive_enabled())
+ break;
+
+ r = -EPERM;
+ dev = kvm_device_from_filp(f.file);
+ if (dev)
+ r = kvmppc_xive_native_connect_vcpu(dev, vcpu,
+ cap->args[1]);
+
+ fdput(f);
+ break;
+ }
+#endif /* CONFIG_KVM_XIVE */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_FWNMI:
r = -EINVAL;
@@ -2079,8 +2130,8 @@
}
-static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
- struct kvm_enable_cap *cap)
+int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
+ struct kvm_enable_cap *cap)
{
int r;
@@ -2114,6 +2165,14 @@
r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
break;
}
+
+ case KVM_CAP_PPC_NESTED_HV:
+ r = -EINVAL;
+ if (!is_kvmppc_hv_enabled(kvm) ||
+ !kvm->arch.kvm_ops->enable_nested)
+ break;
+ r = kvm->arch.kvm_ops->enable_nested(kvm);
+ break;
#endif
default:
r = -EINVAL;
@@ -2152,10 +2211,12 @@
KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
KVM_PPC_CPU_CHAR_BR_HINT_HONOURED |
KVM_PPC_CPU_CHAR_MTTRIG_THR_RECONF |
- KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+ KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
+ KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
- KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+ KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
+ KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
}
return 0;
}
@@ -2214,12 +2275,16 @@
if (have_fw_feat(fw_features, "enabled",
"fw-count-cache-disabled"))
cp->character |= KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+ if (have_fw_feat(fw_features, "enabled",
+ "fw-count-cache-flush-bcctr2,0,0"))
+ cp->character |= KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
cp->character_mask = KVM_PPC_CPU_CHAR_SPEC_BAR_ORI31 |
KVM_PPC_CPU_CHAR_BCCTRL_SERIALISED |
KVM_PPC_CPU_CHAR_L1D_FLUSH_ORI30 |
KVM_PPC_CPU_CHAR_L1D_FLUSH_TRIG2 |
KVM_PPC_CPU_CHAR_L1D_THREAD_PRIV |
- KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS;
+ KVM_PPC_CPU_CHAR_COUNT_CACHE_DIS |
+ KVM_PPC_CPU_CHAR_BCCTR_FLUSH_ASSIST;
if (have_fw_feat(fw_features, "enabled",
"speculation-policy-favor-security"))
@@ -2230,9 +2295,13 @@
if (!have_fw_feat(fw_features, "disabled",
"needs-spec-barrier-for-bound-checks"))
cp->behaviour |= KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+ if (have_fw_feat(fw_features, "enabled",
+ "needs-count-cache-flush-on-context-switch"))
+ cp->behaviour |= KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
cp->behaviour_mask = KVM_PPC_CPU_BEHAV_FAVOUR_SECURITY |
KVM_PPC_CPU_BEHAV_L1D_FLUSH_PR |
- KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR;
+ KVM_PPC_CPU_BEHAV_BNDS_CHK_SPEC_BAR |
+ KVM_PPC_CPU_BEHAV_FLUSH_COUNT_CACHE;
of_node_put(fw_features);
}
@@ -2260,15 +2329,6 @@
break;
}
- case KVM_ENABLE_CAP:
- {
- struct kvm_enable_cap cap;
- r = -EFAULT;
- if (copy_from_user(&cap, argp, sizeof(cap)))
- goto out;
- r = kvm_vm_ioctl_enable_cap(kvm, &cap);
- break;
- }
#ifdef CONFIG_SPAPR_TCE_IOMMU
case KVM_CREATE_SPAPR_TCE_64: {
struct kvm_create_spapr_tce_64 create_tce_64;
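For context: the KVM_CAP_PPC_IRQ_XIVE vcpu capability introduced above is enabled from userspace through the generic KVM_ENABLE_CAP vcpu ioctl, passing the XIVE native device fd in args[0] and the guest server number in args[1]; the kernel side returns -ENXIO when xive_enabled() is false, mirroring the capability check added to kvm_vm_ioctl_check_extension(). A minimal userspace sketch under those assumptions (connect_vcpu_to_xive, xive_fd and vcpu_fd are illustrative names, not part of this change):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Illustrative sketch: attach a vcpu to an in-kernel XIVE native device.
 * xive_fd is a device fd obtained via KVM_CREATE_DEVICE with
 * KVM_DEV_TYPE_XIVE; vcpu_id is the server number the guest will address.
 */
static int connect_vcpu_to_xive(int vcpu_fd, int xive_fd, unsigned long vcpu_id)
{
	struct kvm_enable_cap cap = {
		.cap  = KVM_CAP_PPC_IRQ_XIVE,
		.args = { (unsigned long)xive_fd, vcpu_id },
	};

	/* Follows the fdget()/kvmppc_xive_native_connect_vcpu() path above. */
	return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
}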
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index 1c03c97..bfe4f10 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -1,16 +1,5 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2008
*
diff --git a/arch/powerpc/kvm/timing.h b/arch/powerpc/kvm/timing.h
index 3123690..ace65f9 100644
--- a/arch/powerpc/kvm/timing.h
+++ b/arch/powerpc/kvm/timing.h
@@ -1,16 +1,5 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* Copyright IBM Corp. 2008
*
diff --git a/arch/powerpc/kvm/tm.S b/arch/powerpc/kvm/tm.S
index 90e330f..3bf17c8 100644
--- a/arch/powerpc/kvm/tm.S
+++ b/arch/powerpc/kvm/tm.S
@@ -1,17 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License, version 2, as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*
* Derived from book3s_hv_rmhandlers.S, which is:
*
* Copyright 2011 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
- *
*/
#include <asm/reg.h>
@@ -28,17 +20,25 @@
* Save transactional state and TM-related registers.
* Called with:
* - r3 pointing to the vcpu struct
- * - r4 points to the MSR with current TS bits:
+ * - r4 containing the MSR with current TS bits:
* (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
- * This can modify all checkpointed registers, but
- * restores r1, r2 before exit.
+ * - r5 containing a flag indicating that non-volatile registers
+ * must be preserved.
+ * If r5 == 0, this can modify all checkpointed registers, but
+ * restores r1, r2 before exit. If r5 != 0, this restores the
+ * MSR TM/FP/VEC/VSX bits to their state on entry.
*/
_GLOBAL(__kvmppc_save_tm)
mflr r0
std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -SWITCH_FRAME_SIZE(r1)
+
+ mr r9, r3
+ cmpdi cr7, r5, 0
/* Turn on TM. */
mfmsr r8
+ mr r10, r8
li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
ori r8, r8, MSR_FP
@@ -51,6 +51,27 @@
std r1, HSTATE_SCRATCH2(r13)
std r3, HSTATE_SCRATCH1(r13)
+ /* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
+ mfcr r6
+ SAVE_GPR(6, r1)
+
+ /* Save DSCR so we can restore it to avoid running with user value */
+ mfspr r7, SPRN_DSCR
+ SAVE_GPR(7, r1)
+
+ /*
+ * We are going to do treclaim., which will modify all checkpointed
+ * registers. Save the non-volatile registers on the stack if
+ * preservation of non-volatile state has been requested.
+ */
+ beq cr7, 3f
+ SAVE_NVGPRS(r1)
+
+ /* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
+ li r0, 0
+ rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+ SAVE_GPR(10, r1) /* final MSR value */
+3:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
BEGIN_FTR_SECTION
/* Emulation of the treclaim instruction needs TEXASR before treclaim */
@@ -74,22 +95,25 @@
std r9, PACATMSCRATCH(r13)
ld r9, HSTATE_SCRATCH1(r13)
- /* Get a few more GPRs free. */
- std r29, VCPU_GPRS_TM(29)(r9)
- std r30, VCPU_GPRS_TM(30)(r9)
- std r31, VCPU_GPRS_TM(31)(r9)
-
- /* Save away PPR and DSCR soon so don't run with user values. */
- mfspr r31, SPRN_PPR
+ /* Save away PPR soon so we don't run with user value. */
+ std r0, VCPU_GPRS_TM(0)(r9)
+ mfspr r0, SPRN_PPR
HMT_MEDIUM
- mfspr r30, SPRN_DSCR
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
-#endif
- /* Save all but r9, r13 & r29-r31 */
- reg = 0
+ /* Reload stack pointer. */
+ std r1, VCPU_GPRS_TM(1)(r9)
+ ld r1, HSTATE_SCRATCH2(r13)
+
+ /* Set MSR RI now we have r1 and r13 back. */
+ std r2, VCPU_GPRS_TM(2)(r9)
+ li r2, MSR_RI
+ mtmsrd r2, 1
+
+ /* Reload TOC pointer. */
+ ld r2, PACATOC(r13)
+
+ /* Save all but r0-r2, r9 & r13 */
+ reg = 3
.rept 29
.if (reg != 9) && (reg != 13)
std reg, VCPU_GPRS_TM(reg)(r9)
@@ -103,33 +127,29 @@
ld r4, PACATMSCRATCH(r13)
std r4, VCPU_GPRS_TM(9)(r9)
- /* Reload stack pointer and TOC. */
- ld r1, HSTATE_SCRATCH2(r13)
- ld r2, PACATOC(r13)
-
- /* Set MSR RI now we have r1 and r13 back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-
- /* Save away checkpinted SPRs. */
- std r31, VCPU_PPR_TM(r9)
- std r30, VCPU_DSCR_TM(r9)
- mflr r5
+ /* Restore host DSCR and CR values, after saving guest values */
mfcr r6
+ mfspr r7, SPRN_DSCR
+ stw r6, VCPU_CR_TM(r9)
+ std r7, VCPU_DSCR_TM(r9)
+ REST_GPR(6, r1)
+ REST_GPR(7, r1)
+ mtcr r6
+ mtspr SPRN_DSCR, r7
+
+ /* Save away checkpointed SPRs. */
+ std r0, VCPU_PPR_TM(r9)
+ mflr r5
mfctr r7
mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR
mfxer r11
std r5, VCPU_LR_TM(r9)
- stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9)
std r11, VCPU_XER_TM(r9)
- /* Restore r12 as trap number. */
- lwz r12, VCPU_TRAP(r9)
-
/* Save FP/VSX. */
addi r3, r9, VCPU_FPRS_TM
bl store_fp_state
@@ -137,6 +157,11 @@
bl store_vr_state
mfspr r6, SPRN_VRSAVE
stw r6, VCPU_VRSAVE_TM(r9)
+
+ /* Restore non-volatile registers if requested to */
+ beq cr7, 1f
+ REST_NVGPRS(r1)
+ REST_GPR(10, r1)
1:
/*
* We need to save these SPRs after the treclaim so that the software
@@ -146,12 +171,16 @@
*/
mfspr r7, SPRN_TEXASR
std r7, VCPU_TEXASR(r9)
-11:
mfspr r5, SPRN_TFHAR
mfspr r6, SPRN_TFIAR
std r5, VCPU_TFHAR(r9)
std r6, VCPU_TFIAR(r9)
+ /* Restore MSR state if requested */
+ beq cr7, 2f
+ mtmsrd r10, 0
+2:
+ addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
@@ -161,49 +190,22 @@
* be invoked from C function by PR KVM only.
*/
_GLOBAL(_kvmppc_save_tm_pr)
- mflr r5
- std r5, PPC_LR_STKOFF(r1)
- stdu r1, -SWITCH_FRAME_SIZE(r1)
- SAVE_NVGPRS(r1)
-
- /* save MSR since TM/math bits might be impacted
- * by __kvmppc_save_tm().
- */
- mfmsr r5
- SAVE_GPR(5, r1)
-
- /* also save DSCR/CR/TAR so that it can be recovered later */
- mfspr r6, SPRN_DSCR
- SAVE_GPR(6, r1)
-
- mfcr r7
- stw r7, _CCR(r1)
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -PPC_MIN_STKFRM(r1)
mfspr r8, SPRN_TAR
- SAVE_GPR(8, r1)
+ std r8, PPC_MIN_STKFRM-8(r1)
+ li r5, 1 /* preserve non-volatile registers */
bl __kvmppc_save_tm
- REST_GPR(8, r1)
+ ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8
- ld r7, _CCR(r1)
- mtcr r7
-
- REST_GPR(6, r1)
- mtspr SPRN_DSCR, r6
-
- /* need preserve current MSR's MSR_TS bits */
- REST_GPR(5, r1)
- mfmsr r6
- rldicl r6, r6, 64 - MSR_TS_S_LG, 62
- rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
- mtmsrd r5
-
- REST_NVGPRS(r1)
- addi r1, r1, SWITCH_FRAME_SIZE
- ld r5, PPC_LR_STKOFF(r1)
- mtlr r5
+ addi r1, r1, PPC_MIN_STKFRM
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
blr
EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
@@ -215,15 +217,21 @@
* - r4 is the guest MSR with desired TS bits:
* For HV KVM, it is VCPU_MSR
* For PR KVM, it is provided by caller
- * This potentially modifies all checkpointed registers.
- * It restores r1, r2 from the PACA.
+ * - r5 containing a flag indicating that non-volatile registers
+ * must be preserved.
+ * If r5 == 0, this potentially modifies all checkpointed registers, but
+ * restores r1, r2 from the PACA before exit.
+ * If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
*/
_GLOBAL(__kvmppc_restore_tm)
mflr r0
std r0, PPC_LR_STKOFF(r1)
+ cmpdi cr7, r5, 0
+
/* Turn on TM/FP/VSX/VMX so we can restore them. */
mfmsr r5
+ mr r10, r5
li r6, MSR_TM >> 32
sldi r6, r6, 32
or r5, r5, r6
@@ -244,8 +252,7 @@
mr r5, r4
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
- beqlr /* TM not active in guest */
- std r1, HSTATE_SCRATCH2(r13)
+ beq 9f /* TM not active in guest */
/* Make sure the failure summary is set, otherwise we'll program check
* when we trechkpt. It's possible that this might have been not set
@@ -256,6 +263,26 @@
mtspr SPRN_TEXASR, r7
/*
+ * Make a stack frame and save non-volatile registers if requested.
+ */
+ stdu r1, -SWITCH_FRAME_SIZE(r1)
+ std r1, HSTATE_SCRATCH2(r13)
+
+ mfcr r6
+ mfspr r7, SPRN_DSCR
+ SAVE_GPR(2, r1)
+ SAVE_GPR(6, r1)
+ SAVE_GPR(7, r1)
+
+ beq cr7, 4f
+ SAVE_NVGPRS(r1)
+
+ /* MSR[TS] will be 1 (suspended) once we do trechkpt */
+ li r0, 1
+ rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
+ SAVE_GPR(10, r1) /* final MSR value */
+4:
+ /*
* We need to load up the checkpointed state for the guest.
* We need to do this early as it will blow away any GPRs, VSRs and
* some SPRs.
@@ -291,8 +318,6 @@
ld r29, VCPU_DSCR_TM(r3)
ld r30, VCPU_PPR_TM(r3)
- std r2, PACATMSCRATCH(r13) /* Save TOC */
-
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
@@ -318,18 +343,31 @@
/* Now let's get back the state we need. */
HMT_MEDIUM
GET_PACA(r13)
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
-#endif
ld r1, HSTATE_SCRATCH2(r13)
- ld r2, PACATMSCRATCH(r13)
+ REST_GPR(7, r1)
+ mtspr SPRN_DSCR, r7
/* Set the MSR RI since we have our registers back. */
li r5, MSR_RI
mtmsrd r5, 1
+
+ /* Restore TOC pointer and CR */
+ REST_GPR(2, r1)
+ REST_GPR(6, r1)
+ mtcr r6
+
+ /* Restore non-volatile registers if requested to. */
+ beq cr7, 5f
+ REST_GPR(10, r1)
+ REST_NVGPRS(r1)
+
+5: addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
+
+9: /* Restore MSR bits if requested */
+ beqlr cr7
+ mtmsrd r10, 0
blr
/*
@@ -337,47 +375,23 @@
* can be invoked from C function by PR KVM only.
*/
_GLOBAL(_kvmppc_restore_tm_pr)
- mflr r5
- std r5, PPC_LR_STKOFF(r1)
- stdu r1, -SWITCH_FRAME_SIZE(r1)
- SAVE_NVGPRS(r1)
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+ stdu r1, -PPC_MIN_STKFRM(r1)
- /* save MSR to avoid TM/math bits change */
- mfmsr r5
- SAVE_GPR(5, r1)
-
- /* also save DSCR/CR/TAR so that it can be recovered later */
- mfspr r6, SPRN_DSCR
- SAVE_GPR(6, r1)
-
- mfcr r7
- stw r7, _CCR(r1)
-
+ /* save TAR so that it can be recovered later */
mfspr r8, SPRN_TAR
- SAVE_GPR(8, r1)
+ std r8, PPC_MIN_STKFRM-8(r1)
+ li r5, 1
bl __kvmppc_restore_tm
- REST_GPR(8, r1)
+ ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8
- ld r7, _CCR(r1)
- mtcr r7
-
- REST_GPR(6, r1)
- mtspr SPRN_DSCR, r6
-
- /* need preserve current MSR's MSR_TS bits */
- REST_GPR(5, r1)
- mfmsr r6
- rldicl r6, r6, 64 - MSR_TS_S_LG, 62
- rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
- mtmsrd r5
-
- REST_NVGPRS(r1)
- addi r1, r1, SWITCH_FRAME_SIZE
- ld r5, PPC_LR_STKOFF(r1)
- mtlr r5
+ addi r1, r1, PPC_MIN_STKFRM
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
blr
EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
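For context: __kvmppc_save_tm and __kvmppc_restore_tm now take a flag in r5; when it is non-zero they preserve the non-volatile GPRs and restore the MSR TM/FP/VEC/VSX bits on exit, which is what lets the PR wrappers _kvmppc_save_tm_pr/_kvmppc_restore_tm_pr shrink to saving only TAR around the call (they pass r5 = 1 themselves). A hedged C-side sketch of a caller, assuming the void (struct kvm_vcpu *, u64) prototypes exported for these wrappers in asm/asm-prototypes.h; the helper name save_guest_tm_state is made up for illustration and is not part of this change:

#include <linux/kvm_host.h>
#include <linux/preempt.h>
#include <asm/asm-prototypes.h>
#include <asm/reg.h>

/* Illustrative sketch: checkpoint the guest's transactional state from C.
 * The PR wrapper sets r5 = 1 internally, so the caller does not have to
 * save non-volatile GPRs or the MSR TM/FP/VEC/VSX bits around the call.
 */
static void save_guest_tm_state(struct kvm_vcpu *vcpu)
{
	u64 msr = mfmsr();

	if (!MSR_TM_ACTIVE(msr))
		return;			/* no transaction to checkpoint */

	preempt_disable();
	_kvmppc_save_tm_pr(vcpu, msr);	/* TS bits come from the live MSR */
	preempt_enable();
}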
diff --git a/arch/powerpc/kvm/trace_book3s.h b/arch/powerpc/kvm/trace_book3s.h
index f3b2375..372a82f 100644
--- a/arch/powerpc/kvm/trace_book3s.h
+++ b/arch/powerpc/kvm/trace_book3s.h
@@ -14,7 +14,6 @@
{0x400, "INST_STORAGE"}, \
{0x480, "INST_SEGMENT"}, \
{0x500, "EXTERNAL"}, \
- {0x501, "EXTERNAL_LEVEL"}, \
{0x502, "EXTERNAL_HV"}, \
{0x600, "ALIGNMENT"}, \
{0x700, "PROGRAM"}, \