Blame - arch/x86/kernel/kvmclock.c - hafnium/third_party/linux

blob: 013fe3d21dbb3f4d5f834f74e0ca3d791a6f0b06 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/* KVM paravirtual clock driver. A clocksource implementation
				2	Copyright (C) 2008 Glauber de Oliveira Costa, Red Hat Inc.
				3
				4	This program is free software; you can redistribute it and/or modify
				5	it under the terms of the GNU General Public License as published by
				6	the Free Software Foundation; either version 2 of the License, or
				7	(at your option) any later version.
				8
				9	This program is distributed in the hope that it will be useful,
				10	but WITHOUT ANY WARRANTY; without even the implied warranty of
				11	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
				12	GNU General Public License for more details.
				13
				14	You should have received a copy of the GNU General Public License
				15	along with this program; if not, write to the Free Software
				16	Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
				17	*/
				18
				19	#include <linux/clocksource.h>
				20	#include <linux/kvm_para.h>
				21	#include <asm/pvclock.h>
				22	#include <asm/msr.h>
				23	#include <asm/apic.h>
				24	#include <linux/percpu.h>
				25	#include <linux/hardirq.h>
				26	#include <linux/cpuhotplug.h>
				27	#include <linux/sched.h>
				28	#include <linux/sched/clock.h>
				29	#include <linux/mm.h>
				30	#include <linux/slab.h>
				31	#include <linux/set_memory.h>
				32
				33	#include <asm/hypervisor.h>
				34	#include <asm/mem_encrypt.h>
				35	#include <asm/x86_init.h>
				36	#include <asm/reboot.h>
				37	#include <asm/kvmclock.h>
				38
				/*
				 * Boot-time switches and MSR selection for the KVM paravirtual clock.
				 *
				 * kvmclock:               cleared by the "no-kvmclock" early parameter;
				 *                         kvmclock_init() returns early when it is zero.
				 * kvmclock_vsyscall:      cleared by "no-kvmclock-vsyscall"; checked in
				 *                         kvm_setup_vsyscall_timeinfo().
				 * msr_kvm_system_time,
				 * msr_kvm_wall_clock:     MSR numbers used for the system-time and
				 *                         wall-clock registration; kvmclock_init() replaces
				 *                         them with the *_NEW variants when
				 *                         KVM_FEATURE_CLOCKSOURCE2 is reported.
				 * kvm_sched_clock_offset: clock reading captured by kvm_sched_clock_init()
				 *                         and subtracted in kvm_sched_clock_read().
				 */
				39	static int kvmclock __initdata = 1;
				40	static int kvmclock_vsyscall __initdata = 1;
				41	static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME;
				42	static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK;
				43	static u64 kvm_sched_clock_offset __ro_after_init;
				44
				45	static int __init parse_no_kvmclock(char *arg)
				46	{
				47	kvmclock = 0;
				48	return 0;
				49	}
				50	early_param("no-kvmclock", parse_no_kvmclock);
				51
				52	static int __init parse_no_kvmclock_vsyscall(char *arg)
				53	{
				54	kvmclock_vsyscall = 0;
				55	return 0;
				56	}
				57	early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
				58
				/*
				 * HV_CLOCK_SIZE is the size of one time-info entry per NR_CPUS; it is not
				 * referenced in the part of the file visible here.
				 * HVC_BOOT_ARRAY_SIZE is the number of time-info entries that fit in a
				 * single page; it sizes the static boot array below.
				 */
				59	/* Aligned to page sizes to match what's mapped via vsyscalls to userspace */
				60	#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS)
				61	#define HVC_BOOT_ARRAY_SIZE \
				62	(PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info))
				63
				/*
				 * hv_clock_boot:    page-aligned static array; kvmclock_setup_percpu()
				 *                   hands its entries to CPUs below HVC_BOOT_ARRAY_SIZE.
				 * wall_clock:       buffer whose physical address kvm_get_wallclock()
				 *                   writes to the wall-clock MSR.
				 * hv_clock_per_cpu: per-CPU pointer to that CPU's time-info entry.
				 * hvclock_mem:      extra entries allocated by kvmclock_init_mem() for
				 *                   CPUs at or beyond HVC_BOOT_ARRAY_SIZE; NULL until then.
				 */
				64	static struct pvclock_vsyscall_time_info
				65	hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __bss_decrypted __aligned(PAGE_SIZE);
				66	static struct pvclock_wall_clock wall_clock __bss_decrypted;
				67	static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu);
				68	static struct pvclock_vsyscall_time_info *hvclock_mem;
				69
				70	static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void)
				71	{
				72	return &this_cpu_read(hv_clock_per_cpu)->pvti;
				73	}
				74
				75	static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void)
				76	{
				77	return this_cpu_read(hv_clock_per_cpu);
				78	}
				79
				80	/*
				81	* The wallclock is the time of day when we booted. Since then, some time may
				82	* have elapsed since the hypervisor wrote the data. So we try to account for
				83	* that with system time
				84	*/
				85	static void kvm_get_wallclock(struct timespec64 *now)
				86	{
				87	wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock));
				88	preempt_disable();
				89	pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now);
				90	preempt_enable();
				91	}
				92
				93	static int kvm_set_wallclock(const struct timespec64 *now)
				94	{
				95	return -ENODEV;
				96	}
				97
				98	static u64 kvm_clock_read(void)
				99	{
				100	u64 ret;
				101
				102	preempt_disable_notrace();
				103	ret = pvclock_clocksource_read(this_cpu_pvti());
				104	preempt_enable_notrace();
				105	return ret;
				106	}
				107
				108	static u64 kvm_clock_get_cycles(struct clocksource *cs)
				109	{
				110	return kvm_clock_read();
				111	}
				112
				113	static u64 kvm_sched_clock_read(void)
				114	{
				115	return kvm_clock_read() - kvm_sched_clock_offset;
				116	}
				117
				118	static inline void kvm_sched_clock_init(bool stable)
				119	{
				120	if (!stable) {
				121	pv_time_ops.sched_clock = kvm_clock_read;
				122	clear_sched_clock_stable();
				123	return;
				124	}
				125
				126	kvm_sched_clock_offset = kvm_clock_read();
				127	pv_time_ops.sched_clock = kvm_sched_clock_read;
				128
				129	pr_info("kvm-clock: using sched offset of %llu cycles",
				130	kvm_sched_clock_offset);
				131
				132	BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) >
				133	sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time));
				134	}
				135
				136	/*
				137	* If we don't do that, there is the possibility that the guest
				138	* will calibrate under heavy load - thus, getting a lower lpj -
				139	* and execute the delays themselves without load. This is wrong,
				140	* because no delay loop can finish beforehand.
				141	* Any heuristics is subject to fail, because ultimately, a large
				142	* poll of guests can be running and trouble each other. So we preset
				143	* lpj here
				144	*/
				145	static unsigned long kvm_get_tsc_khz(void)
				146	{
				147	setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ);
				148	return pvclock_tsc_khz(this_cpu_pvti());
				149	}
				150
				151	static void __init kvm_get_preset_lpj(void)
				152	{
				153	unsigned long khz;
				154	u64 lpj;
				155
				156	khz = kvm_get_tsc_khz();
				157
				158	lpj = ((u64)khz * 1000);
				159	do_div(lpj, HZ);
				160	preset_lpj = lpj;
				161	}
				162
				163	bool kvm_check_and_clear_guest_paused(void)
				164	{
				165	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
				166	bool ret = false;
				167
				168	if (!src)
				169	return ret;
				170
				171	if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) {
				172	src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED;
				173	pvclock_touch_watchdogs();
				174	ret = true;
				175	}
				176	return ret;
				177	}
				178
				/*
				 * Clocksource descriptor for the KVM paravirtual clock. Its ->read()
				 * callback is kvm_clock_get_cycles(); kvmclock_init() registers it with
				 * clocksource_register_hz(). Exported for GPL modules.
				 */
				179	struct clocksource kvm_clock = {
				180	.name = "kvm-clock",
				181	.read = kvm_clock_get_cycles,
				182	.rating = 400,
				183	.mask = CLOCKSOURCE_MASK(64),
				184	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
				185	};
				186	EXPORT_SYMBOL_GPL(kvm_clock);
				187
				188	static void kvm_register_clock(char *txt)
				189	{
				190	struct pvclock_vsyscall_time_info *src = this_cpu_hvclock();
				191	u64 pa;
				192
				193	if (!src)
				194	return;
				195
				196	pa = slow_virt_to_phys(&src->pvti) \| 0x01ULL;
				197	wrmsrl(msr_kvm_system_time, pa);
				198	pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt);
				199	}
				200
				/*
				 * x86_platform.save_sched_clock_state hook. Intentionally empty: nothing
				 * is saved here; kvm_restore_sched_clock_state() simply re-registers the
				 * clock.
				 */
				static void kvm_save_sched_clock_state(void)
				{
				}
				204
				/*
				 * x86_platform.restore_sched_clock_state hook: register the clock again
				 * for the executing CPU.
				 */
				static void kvm_restore_sched_clock_state(void)
				{
					kvm_register_clock("primary cpu clock, resume");
				}
				209
				#ifdef CONFIG_X86_LOCAL_APIC
				/*
				 * Installed by kvmclock_init() as x86_cpuinit.early_percpu_clock_init:
				 * registers the clock for the CPU it runs on.
				 */
				static void kvm_setup_secondary_clock(void)
				{
					kvm_register_clock("secondary cpu clock");
				}
				#endif
				216
				/*
				 * After the clock is registered, the host keeps writing to the registered
				 * memory location. If the guest shuts down, that memory is no longer
				 * valid. In cases like kexec, where a new kernel is installed, this means
				 * a random memory location would keep being written. So before any kind
				 * of shutdown from our side, we unregister the clock by writing a value
				 * that does not have the 'enable' bit set to the MSR.
				 */
				#ifdef CONFIG_KEXEC_CORE
				/*
				 * machine_ops.crash_shutdown hook: unregister the clock, disable steal
				 * time, then continue with the native crash shutdown for @regs.
				 */
				static void kvm_crash_shutdown(struct pt_regs *regs)
				{
					native_write_msr(msr_kvm_system_time, 0, 0);
					kvm_disable_steal_time();

					native_machine_crash_shutdown(regs);
				}
				#endif
				233
				234	static void kvm_shutdown(void)
				235	{
				236	native_write_msr(msr_kvm_system_time, 0, 0);
				237	kvm_disable_steal_time();
				238	native_machine_shutdown();
				239	}
				240
				241	static void __init kvmclock_init_mem(void)
				242	{
				243	unsigned long ncpus;
				244	unsigned int order;
				245	struct page *p;
				246	int r;
				247
				248	if (HVC_BOOT_ARRAY_SIZE >= num_possible_cpus())
				249	return;
				250
				251	ncpus = num_possible_cpus() - HVC_BOOT_ARRAY_SIZE;
				252	order = get_order(ncpus * sizeof(*hvclock_mem));
				253
				254	p = alloc_pages(GFP_KERNEL, order);
				255	if (!p) {
				256	pr_warn("%s: failed to alloc %d pages", __func__, (1U << order));
				257	return;
				258	}
				259
				260	hvclock_mem = page_address(p);
				261
				262	/*
				263	* hvclock is shared between the guest and the hypervisor, must
				264	* be mapped decrypted.
				265	*/
				266	if (sev_active()) {
				267	r = set_memory_decrypted((unsigned long) hvclock_mem,
				268	1UL << order);
				269	if (r) {
				270	__free_pages(p, order);
				271	hvclock_mem = NULL;
				272	pr_warn("kvmclock: set_memory_decrypted() failed. Disabling\n");
				273	return;
				274	}
				275	}
				276
				277	memset(hvclock_mem, 0, PAGE_SIZE << order);
				278	}
				279
				280	static int __init kvm_setup_vsyscall_timeinfo(void)
				281	{
				282	#ifdef CONFIG_X86_64
				283	u8 flags;
				284
				285	if (!per_cpu(hv_clock_per_cpu, 0) \|\| !kvmclock_vsyscall)
				286	return 0;
				287
				288	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
				289	if (!(flags & PVCLOCK_TSC_STABLE_BIT))
				290	return 0;
				291
				292	kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK;
				293	#endif
				294
				295	kvmclock_init_mem();
				296
				297	return 0;
				298	}
				299	early_initcall(kvm_setup_vsyscall_timeinfo);
				300
				301	static int kvmclock_setup_percpu(unsigned int cpu)
				302	{
				303	struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu);
				304
				305	/*
				306	* The per cpu area setup replicates CPU0 data to all cpu
				307	* pointers. So carefully check. CPU0 has been set up in init
				308	* already.
				309	*/
				310	if (!cpu \|\| (p && p != per_cpu(hv_clock_per_cpu, 0)))
				311	return 0;
				312
				313	/* Use the static page for the first CPUs, allocate otherwise */
				314	if (cpu < HVC_BOOT_ARRAY_SIZE)
				315	p = &hv_clock_boot[cpu];
				316	else if (hvclock_mem)
				317	p = hvclock_mem + cpu - HVC_BOOT_ARRAY_SIZE;
				318	else
				319	return -ENOMEM;
				320
				321	per_cpu(hv_clock_per_cpu, cpu) = p;
				322	return p ? 0 : -ENOMEM;
				323	}
				324
				325	void __init kvmclock_init(void)
				326	{
				327	u8 flags;
				328
				329	if (!kvm_para_available() \|\| !kvmclock)
				330	return;
				331
				332	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
				333	msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
				334	msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
				335	} else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
				336	return;
				337	}
				338
				339	if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu",
				340	kvmclock_setup_percpu, NULL) < 0) {
				341	return;
				342	}
				343
				344	pr_info("kvm-clock: Using msrs %x and %x",
				345	msr_kvm_system_time, msr_kvm_wall_clock);
				346
				347	this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]);
				348	kvm_register_clock("primary cpu clock");
				349	pvclock_set_pvti_cpu0_va(hv_clock_boot);
				350
				351	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
				352	pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
				353
				354	flags = pvclock_read_flags(&hv_clock_boot[0].pvti);
				355	kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT);
				356
				357	x86_platform.calibrate_tsc = kvm_get_tsc_khz;
				358	x86_platform.calibrate_cpu = kvm_get_tsc_khz;
				359	x86_platform.get_wallclock = kvm_get_wallclock;
				360	x86_platform.set_wallclock = kvm_set_wallclock;
				361	#ifdef CONFIG_X86_LOCAL_APIC
				362	x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock;
				363	#endif
				364	x86_platform.save_sched_clock_state = kvm_save_sched_clock_state;
				365	x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state;
				366	machine_ops.shutdown = kvm_shutdown;
				367	#ifdef CONFIG_KEXEC_CORE
				368	machine_ops.crash_shutdown = kvm_crash_shutdown;
				369	#endif
				370	kvm_get_preset_lpj();
				371	clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
				372	pv_info.name = "KVM";
				373	}