Blame - arch/x86/kvm/mmu.c - hafnium/third_party/linux.git

blob: 1b82bc7c3ccaac9946581e14e81379415c97c4a9 [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	/*
				2	* Kernel-based Virtual Machine driver for Linux
				3	*
				4	* This module enables machines with Intel VT-x extensions to run virtual
				5	* machines without emulation or binary translation.
				6	*
				7	* MMU support
				8	*
				9	* Copyright (C) 2006 Qumranet, Inc.
				10	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				11	*
				12	* Authors:
				13	* Yaniv Kamay <yaniv@qumranet.com>
				14	* Avi Kivity <avi@qumranet.com>
				15	*
				16	* This work is licensed under the terms of the GNU GPL, version 2. See
				17	* the COPYING file in the top-level directory.
				18	*
				19	*/
				20
				21	#include "irq.h"
				22	#include "mmu.h"
				23	#include "x86.h"
				24	#include "kvm_cache_regs.h"
				25	#include "cpuid.h"
				26
				27	#include <linux/kvm_host.h>
				28	#include <linux/types.h>
				29	#include <linux/string.h>
				30	#include <linux/mm.h>
				31	#include <linux/highmem.h>
				32	#include <linux/moduleparam.h>
				33	#include <linux/export.h>
				34	#include <linux/swap.h>
				35	#include <linux/hugetlb.h>
				36	#include <linux/compiler.h>
				37	#include <linux/srcu.h>
				38	#include <linux/slab.h>
				39	#include <linux/sched/signal.h>
				40	#include <linux/uaccess.h>
				41	#include <linux/hash.h>
				42	#include <linux/kern_levels.h>
				43
				44	#include <asm/page.h>
				45	#include <asm/pat.h>
				46	#include <asm/cmpxchg.h>
				47	#include <asm/io.h>
				48	#include <asm/vmx.h>
				49	#include <asm/kvm_page_track.h>
				50	#include "trace.h"
				51
				52	/*
				53	* When setting this variable to true it enables Two-Dimensional-Paging
				54	* where the hardware walks 2 page tables:
				55	* 1. the guest-virtual to guest-physical
				56	* 2. while doing 1. it walks guest-physical to host-physical
				57	* If the hardware supports that we don't need to do shadow paging.
				58	*/
				59	bool tdp_enabled = false;
				60
				61	enum {
				62	AUDIT_PRE_PAGE_FAULT,
				63	AUDIT_POST_PAGE_FAULT,
				64	AUDIT_PRE_PTE_WRITE,
				65	AUDIT_POST_PTE_WRITE,
				66	AUDIT_PRE_SYNC,
				67	AUDIT_POST_SYNC
				68	};
				69
				70	#undef MMU_DEBUG
				71
				72	#ifdef MMU_DEBUG
				73	static bool dbg = 0;
				74	module_param(dbg, bool, 0644);
				75
				76	#define pgprintk(x...) do { if (dbg) printk(x); } while (0)
				77	#define rmap_printk(x...) do { if (dbg) printk(x); } while (0)
				78	#define MMU_WARN_ON(x) WARN_ON(x)
				79	#else
				80	#define pgprintk(x...) do { } while (0)
				81	#define rmap_printk(x...) do { } while (0)
				82	#define MMU_WARN_ON(x) do { } while (0)
				83	#endif
				84
				85	#define PTE_PREFETCH_NUM 8
				86
				87	#define PT_FIRST_AVAIL_BITS_SHIFT 10
				88	#define PT64_SECOND_AVAIL_BITS_SHIFT 52
				89
				/* Number of address bits translated by one level of a 64-bit page table. */
				#define PT64_LEVEL_BITS 9

				/*
				 * Position of the lowest address bit translated at @level (level 1 starts
				 * at PAGE_SHIFT).  @level is parenthesized so that an expression argument
				 * is evaluated as a whole before 1 is subtracted from it.
				 */
				#define PT64_LEVEL_SHIFT(level) \
					(PAGE_SHIFT + ((level) - 1) * PT64_LEVEL_BITS)

				/* Index of @address within the page table at @level. */
				#define PT64_INDEX(address, level)\
					(((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
				97
				98
				/* Number of address bits translated by one level of a 32-bit page table. */
				#define PT32_LEVEL_BITS 10

				/*
				 * Position of the lowest address bit translated at @level (level 1 starts
				 * at PAGE_SHIFT).  @level is parenthesized so that an expression argument
				 * is evaluated as a whole before 1 is subtracted from it.
				 */
				#define PT32_LEVEL_SHIFT(level) \
					(PAGE_SHIFT + ((level) - 1) * PT32_LEVEL_BITS)

				/* Bits of PT32_BASE_ADDR_MASK that lie below the shift of @level. */
				#define PT32_LVL_OFFSET_MASK(level) \
					(PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
									    * PT32_LEVEL_BITS))) - 1))

				/* Index of @address within the page table at @level. */
				#define PT32_INDEX(address, level)\
					(((address) >> PT32_LEVEL_SHIFT(level)) & ((1 << PT32_LEVEL_BITS) - 1))
				110
				111
				112	#define PT64_BASE_ADDR_MASK __sme_clr((((1ULL << 52) - 1) & ~(u64)(PAGE_SIZE-1)))
				113	#define PT64_DIR_BASE_ADDR_MASK \
				114	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + PT64_LEVEL_BITS)) - 1))
				115	#define PT64_LVL_ADDR_MASK(level) \
				116	(PT64_BASE_ADDR_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
				117	* PT64_LEVEL_BITS))) - 1))
				118	#define PT64_LVL_OFFSET_MASK(level) \
				119	(PT64_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
				120	* PT64_LEVEL_BITS))) - 1))
				121
				122	#define PT32_BASE_ADDR_MASK PAGE_MASK
				123	#define PT32_DIR_BASE_ADDR_MASK \
				124	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + PT32_LEVEL_BITS)) - 1))
				125	#define PT32_LVL_ADDR_MASK(level) \
				126	(PAGE_MASK & ~((1ULL << (PAGE_SHIFT + (((level) - 1) \
				127	* PT32_LEVEL_BITS))) - 1))
				128
				/*
				 * Union of the present, writable, user, execute/no-execute and
				 * memory-encryption bits of a shadow PTE.  The shadow_* terms are runtime
				 * variables, so this is not a compile-time constant.
				 */
				#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | shadow_user_mask \
						| shadow_x_mask | shadow_nx_mask | shadow_me_mask)
				131
				/* Access-permission flags; write and user reuse the PTE bit positions. */
				#define ACC_EXEC_MASK 1
				#define ACC_WRITE_MASK PT_WRITABLE_MASK
				#define ACC_USER_MASK PT_USER_MASK
				#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
				136
				137	/* The mask for the R/X bits in EPT PTEs */
				138	#define PT64_EPT_READABLE_MASK 0x1ull
				139	#define PT64_EPT_EXECUTABLE_MASK 0x4ull
				140
				141	#include <trace/events/kvm.h>
				142
				143	#define CREATE_TRACE_POINTS
				144	#include "mmutrace.h"
				145
				146	#define SPTE_HOST_WRITEABLE (1ULL << PT_FIRST_AVAIL_BITS_SHIFT)
				147	#define SPTE_MMU_WRITEABLE (1ULL << (PT_FIRST_AVAIL_BITS_SHIFT + 1))
				148
				149	#define SHADOW_PT_INDEX(addr, level) PT64_INDEX(addr, level)
				150
				151	/* make pte_list_desc fit well in cache line */
				152	#define PTE_LIST_EXT 3
				153
				154	/*
				155	* Return values of handle_mmio_page_fault and mmu.page_fault:
				156	* RET_PF_RETRY: let CPU fault again on the address.
				157	* RET_PF_EMULATE: mmio page fault, emulate the instruction directly.
				158	*
				159	* For handle_mmio_page_fault only:
				160	* RET_PF_INVALID: the spte is invalid, let the real page fault path update it.
				161	*/
				162	enum {
				163	RET_PF_RETRY = 0,
				164	RET_PF_EMULATE = 1,
				165	RET_PF_INVALID = 2,
				166	};
				167
				168	struct pte_list_desc {
				169	u64 *sptes[PTE_LIST_EXT];
				170	struct pte_list_desc *more;
				171	};
				172
				173	struct kvm_shadow_walk_iterator {
				174	u64 addr;
				175	hpa_t shadow_addr;
				176	u64 *sptep;
				177	int level;
				178	unsigned index;
				179	};
				180
				181	static const union kvm_mmu_page_role mmu_base_role_mask = {
				182	.cr0_wp = 1,
				183	.cr4_pae = 1,
				184	.nxe = 1,
				185	.smep_andnot_wp = 1,
				186	.smap_andnot_wp = 1,
				187	.smm = 1,
				188	.guest_mode = 1,
				189	.ad_disabled = 1,
				190	};
				191
				/*
				 * Iterate over the shadow page-table entries that translate @_addr,
				 * starting from an explicit @_root.  @_walker is a
				 * struct kvm_shadow_walk_iterator lvalue that the loop body may inspect.
				 */
				#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker) \
					for (shadow_walk_init_using_root(&(_walker), (_vcpu), \
									 (_root), (_addr)); \
					     shadow_walk_okay(&(_walker)); \
					     shadow_walk_next(&(_walker)))

				/* Same walk, with the starting point chosen by shadow_walk_init(). */
				#define for_each_shadow_entry(_vcpu, _addr, _walker) \
					for (shadow_walk_init(&(_walker), (_vcpu), (_addr)); \
					     shadow_walk_okay(&(_walker)); \
					     shadow_walk_next(&(_walker)))

				/*
				 * Lockless variant: on every iteration the current entry is read with
				 * mmu_spte_get_lockless() into @spte, and that snapshot (not a re-read of
				 * the entry) is passed to __shadow_walk_next() to step the walk.
				 */
				#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte) \
					for (shadow_walk_init(&(_walker), (_vcpu), (_addr)); \
					     shadow_walk_okay(&(_walker)) && \
						({ spte = mmu_spte_get_lockless((_walker).sptep); 1; }); \
					     __shadow_walk_next(&(_walker), spte))
				208
				209	static struct kmem_cache *pte_list_desc_cache;
				210	static struct kmem_cache *mmu_page_header_cache;
				211	static struct percpu_counter kvm_total_used_mmu_pages;
				212
				213	static u64 __read_mostly shadow_nx_mask;
				214	static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
				215	static u64 __read_mostly shadow_user_mask;
				216	static u64 __read_mostly shadow_accessed_mask;
				217	static u64 __read_mostly shadow_dirty_mask;
				218	static u64 __read_mostly shadow_mmio_mask;
				219	static u64 __read_mostly shadow_mmio_value;
				220	static u64 __read_mostly shadow_present_mask;
				221	static u64 __read_mostly shadow_me_mask;
				222
				223	/*
				224	* SPTEs used by MMUs without A/D bits are marked with shadow_acc_track_value.
				225	* Non-present SPTEs with shadow_acc_track_value set are in place for access
				226	* tracking.
				227	*/
				228	static u64 __read_mostly shadow_acc_track_mask;
				229	static const u64 shadow_acc_track_value = SPTE_SPECIAL_MASK;
				230
				231	/*
				232	* The mask/shift to use for saving the original R/X bits when marking the PTE
				233	* as not-present for access tracking purposes. We do not save the W bit as the
				234	* PTEs being access tracked also need to be dirty tracked, so the W bit will be
				235	* restored only when a write is attempted to the page.
				236	*/
				237	static const u64 shadow_acc_track_saved_bits_mask = PT64_EPT_READABLE_MASK \|
				238	PT64_EPT_EXECUTABLE_MASK;
				239	static const u64 shadow_acc_track_saved_bits_shift = PT64_SECOND_AVAIL_BITS_SHIFT;
				240
				241	/*
				242	* This mask must be set on all non-zero Non-Present or Reserved SPTEs in order
				243	* to guard against L1TF attacks.
				244	*/
				245	static u64 __read_mostly shadow_nonpresent_or_rsvd_mask;
				246
				247	/*
				248	* The number of high-order 1 bits to use in the mask above.
				249	*/
				250	static const u64 shadow_nonpresent_or_rsvd_mask_len = 5;
				251
				252	/*
				253	* In some cases, we need to preserve the GFN of a non-present or reserved
				254	* SPTE when we usurp the upper five bits of the physical address space to
				255	* defend against L1TF, e.g. for MMIO SPTEs. To preserve the GFN, we'll
				256	* shift bits of the GFN that overlap with shadow_nonpresent_or_rsvd_mask
				257	* left into the reserved bits, i.e. the GFN in the SPTE will be split into
				258	* high and low parts. This mask covers the lower bits of the GFN.
				259	*/
				260	static u64 __read_mostly shadow_nonpresent_or_rsvd_lower_gfn_mask;
				261
				262
				263	static void mmu_spte_set(u64 *sptep, u64 spte);
				264	static union kvm_mmu_page_role
				265	kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu);
				266
				267	void kvm_mmu_set_mmio_spte_mask(u64 mmio_mask, u64 mmio_value)
				268	{
				269	BUG_ON((mmio_mask & mmio_value) != mmio_value);
				270	shadow_mmio_value = mmio_value \| SPTE_SPECIAL_MASK;
				271	shadow_mmio_mask = mmio_mask \| SPTE_SPECIAL_MASK;
				272	}
				273	EXPORT_SYMBOL_GPL(kvm_mmu_set_mmio_spte_mask);
				274
				275	static inline bool sp_ad_disabled(struct kvm_mmu_page *sp)
				276	{
				277	return sp->role.ad_disabled;
				278	}
				279
				280	static inline bool spte_ad_enabled(u64 spte)
				281	{
				282	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				283	return !(spte & shadow_acc_track_value);
				284	}
				285
				286	static inline u64 spte_shadow_accessed_mask(u64 spte)
				287	{
				288	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				289	return spte_ad_enabled(spte) ? shadow_accessed_mask : 0;
				290	}
				291
				292	static inline u64 spte_shadow_dirty_mask(u64 spte)
				293	{
				294	MMU_WARN_ON((spte & shadow_mmio_mask) == shadow_mmio_value);
				295	return spte_ad_enabled(spte) ? shadow_dirty_mask : 0;
				296	}
				297
				298	static inline bool is_access_track_spte(u64 spte)
				299	{
				300	return !spte_ad_enabled(spte) && (spte & shadow_acc_track_mask) == 0;
				301	}
				302
				303	/*
				304	* the low bit of the generation number is always presumed to be zero.
				305	* This disables mmio caching during memslot updates. The concept is
				306	* similar to a seqcount but instead of retrying the access we just punt
				307	* and ignore the cache.
				308	*
				309	* spte bits 3-11 are used as bits 1-9 of the generation number,
				310	* the bits 52-61 are used as bits 10-19 of the generation number.
				311	*/
				312	#define MMIO_SPTE_GEN_LOW_SHIFT 2
				313	#define MMIO_SPTE_GEN_HIGH_SHIFT 52
				314
				315	#define MMIO_GEN_SHIFT 20
				316	#define MMIO_GEN_LOW_SHIFT 10
				317	#define MMIO_GEN_LOW_MASK ((1 << MMIO_GEN_LOW_SHIFT) - 2)
				318	#define MMIO_GEN_MASK ((1 << MMIO_GEN_SHIFT) - 1)
				319
				320	static u64 generation_mmio_spte_mask(unsigned int gen)
				321	{
				322	u64 mask;
				323
				324	WARN_ON(gen & ~MMIO_GEN_MASK);
				325
				326	mask = (gen & MMIO_GEN_LOW_MASK) << MMIO_SPTE_GEN_LOW_SHIFT;
				327	mask \|= ((u64)gen >> MMIO_GEN_LOW_SHIFT) << MMIO_SPTE_GEN_HIGH_SHIFT;
				328	return mask;
				329	}
				330
				331	static unsigned int get_mmio_spte_generation(u64 spte)
				332	{
				333	unsigned int gen;
				334
				335	spte &= ~shadow_mmio_mask;
				336
				337	gen = (spte >> MMIO_SPTE_GEN_LOW_SHIFT) & MMIO_GEN_LOW_MASK;
				338	gen \|= (spte >> MMIO_SPTE_GEN_HIGH_SHIFT) << MMIO_GEN_LOW_SHIFT;
				339	return gen;
				340	}
				341
				342	static unsigned int kvm_current_mmio_generation(struct kvm_vcpu *vcpu)
				343	{
				344	return kvm_vcpu_memslots(vcpu)->generation & MMIO_GEN_MASK;
				345	}
				346
				347	static void mark_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, u64 gfn,
				348	unsigned access)
				349	{
				350	unsigned int gen = kvm_current_mmio_generation(vcpu);
				351	u64 mask = generation_mmio_spte_mask(gen);
				352	u64 gpa = gfn << PAGE_SHIFT;
				353
				354	access &= ACC_WRITE_MASK \| ACC_USER_MASK;
				355	mask \|= shadow_mmio_value \| access;
				356	mask \|= gpa \| shadow_nonpresent_or_rsvd_mask;
				357	mask \|= (gpa & shadow_nonpresent_or_rsvd_mask)
				358	<< shadow_nonpresent_or_rsvd_mask_len;
				359
				360	trace_mark_mmio_spte(sptep, gfn, access, gen);
				361	mmu_spte_set(sptep, mask);
				362	}
				363
				364	static bool is_mmio_spte(u64 spte)
				365	{
				366	return (spte & shadow_mmio_mask) == shadow_mmio_value;
				367	}
				368
				369	static gfn_t get_mmio_spte_gfn(u64 spte)
				370	{
				371	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
				372
				373	gpa \|= (spte >> shadow_nonpresent_or_rsvd_mask_len)
				374	& shadow_nonpresent_or_rsvd_mask;
				375
				376	return gpa >> PAGE_SHIFT;
				377	}
				378
				379	static unsigned get_mmio_spte_access(u64 spte)
				380	{
				381	u64 mask = generation_mmio_spte_mask(MMIO_GEN_MASK) \| shadow_mmio_mask;
				382	return (spte & ~mask) & ~PAGE_MASK;
				383	}
				384
				385	static bool set_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				386	kvm_pfn_t pfn, unsigned access)
				387	{
				388	if (unlikely(is_noslot_pfn(pfn))) {
				389	mark_mmio_spte(vcpu, sptep, gfn, access);
				390	return true;
				391	}
				392
				393	return false;
				394	}
				395
				396	static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
				397	{
				398	unsigned int kvm_gen, spte_gen;
				399
				400	kvm_gen = kvm_current_mmio_generation(vcpu);
				401	spte_gen = get_mmio_spte_generation(spte);
				402
				403	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
				404	return likely(kvm_gen == spte_gen);
				405	}
				406
				407	/*
				408	* Sets the shadow PTE masks used by the MMU.
				409	*
				410	* Assumptions:
				411	* - Setting either @accessed_mask or @dirty_mask requires setting both
				412	* - At least one of @accessed_mask or @acc_track_mask must be set
				413	*/
				414	void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
				415	u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask,
				416	u64 acc_track_mask, u64 me_mask)
				417	{
				418	BUG_ON(!dirty_mask != !accessed_mask);
				419	BUG_ON(!accessed_mask && !acc_track_mask);
				420	BUG_ON(acc_track_mask & shadow_acc_track_value);
				421
				422	shadow_user_mask = user_mask;
				423	shadow_accessed_mask = accessed_mask;
				424	shadow_dirty_mask = dirty_mask;
				425	shadow_nx_mask = nx_mask;
				426	shadow_x_mask = x_mask;
				427	shadow_present_mask = p_mask;
				428	shadow_acc_track_mask = acc_track_mask;
				429	shadow_me_mask = me_mask;
				430	}
				431	EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
				432
				433	static void kvm_mmu_reset_all_pte_masks(void)
				434	{
				435	u8 low_phys_bits;
				436
				437	shadow_user_mask = 0;
				438	shadow_accessed_mask = 0;
				439	shadow_dirty_mask = 0;
				440	shadow_nx_mask = 0;
				441	shadow_x_mask = 0;
				442	shadow_mmio_mask = 0;
				443	shadow_present_mask = 0;
				444	shadow_acc_track_mask = 0;
				445
				446	/*
				447	* If the CPU has 46 or less physical address bits, then set an
				448	* appropriate mask to guard against L1TF attacks. Otherwise, it is
				449	* assumed that the CPU is not vulnerable to L1TF.
				450	*/
				451	low_phys_bits = boot_cpu_data.x86_phys_bits;
				452	if (boot_cpu_data.x86_phys_bits <
				453	52 - shadow_nonpresent_or_rsvd_mask_len) {
				454	shadow_nonpresent_or_rsvd_mask =
				455	rsvd_bits(boot_cpu_data.x86_phys_bits -
				456	shadow_nonpresent_or_rsvd_mask_len,
				457	boot_cpu_data.x86_phys_bits - 1);
				458	low_phys_bits -= shadow_nonpresent_or_rsvd_mask_len;
				459	}
				460	shadow_nonpresent_or_rsvd_lower_gfn_mask =
				461	GENMASK_ULL(low_phys_bits - 1, PAGE_SHIFT);
				462	}
				463
				464	static int is_cpuid_PSE36(void)
				465	{
				466	return 1;
				467	}
				468
				469	static int is_nx(struct kvm_vcpu *vcpu)
				470	{
				471	return vcpu->arch.efer & EFER_NX;
				472	}
				473
				474	static int is_shadow_present_pte(u64 pte)
				475	{
				476	return (pte != 0) && !is_mmio_spte(pte);
				477	}
				478
				479	static int is_large_pte(u64 pte)
				480	{
				481	return pte & PT_PAGE_SIZE_MASK;
				482	}
				483
				484	static int is_last_spte(u64 pte, int level)
				485	{
				486	if (level == PT_PAGE_TABLE_LEVEL)
				487	return 1;
				488	if (is_large_pte(pte))
				489	return 1;
				490	return 0;
				491	}
				492
				493	static bool is_executable_pte(u64 spte)
				494	{
				495	return (spte & (shadow_x_mask \| shadow_nx_mask)) == shadow_x_mask;
				496	}
				497
				498	static kvm_pfn_t spte_to_pfn(u64 pte)
				499	{
				500	return (pte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
				501	}
				502
				503	static gfn_t pse36_gfn_delta(u32 gpte)
				504	{
				505	int shift = 32 - PT32_DIR_PSE36_SHIFT - PAGE_SHIFT;
				506
				507	return (gpte & PT32_DIR_PSE36_MASK) << shift;
				508	}
				509
				510	#ifdef CONFIG_X86_64
				511	static void __set_spte(u64 *sptep, u64 spte)
				512	{
				513	WRITE_ONCE(*sptep, spte);
				514	}
				515
				516	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
				517	{
				518	WRITE_ONCE(*sptep, spte);
				519	}
				520
				521	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
				522	{
				523	return xchg(sptep, spte);
				524	}
				525
				526	static u64 __get_spte_lockless(u64 *sptep)
				527	{
				528	return READ_ONCE(*sptep);
				529	}
				530	#else
				531	union split_spte {
				532	struct {
				533	u32 spte_low;
				534	u32 spte_high;
				535	};
				536	u64 spte;
				537	};
				538
				539	static void count_spte_clear(u64 *sptep, u64 spte)
				540	{
				541	struct kvm_mmu_page *sp = page_header(__pa(sptep));
				542
				543	if (is_shadow_present_pte(spte))
				544	return;
				545
				546	/* Ensure the spte is completely set before we increase the count */
				547	smp_wmb();
				548	sp->clear_spte_count++;
				549	}
				550
				551	static void __set_spte(u64 *sptep, u64 spte)
				552	{
				553	union split_spte *ssptep, sspte;
				554
				555	ssptep = (union split_spte *)sptep;
				556	sspte = (union split_spte)spte;
				557
				558	ssptep->spte_high = sspte.spte_high;
				559
				560	/*
				561	* If we map the spte from nonpresent to present, We should store
				562	* the high bits firstly, then set present bit, so cpu can not
				563	* fetch this spte while we are setting the spte.
				564	*/
				565	smp_wmb();
				566
				567	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
				568	}
				569
				570	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
				571	{
				572	union split_spte *ssptep, sspte;
				573
				574	ssptep = (union split_spte *)sptep;
				575	sspte = (union split_spte)spte;
				576
				577	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
				578
				579	/*
				580	* If we map the spte from present to nonpresent, we should clear
				581	* present bit firstly to avoid vcpu fetch the old high bits.
				582	*/
				583	smp_wmb();
				584
				585	ssptep->spte_high = sspte.spte_high;
				586	count_spte_clear(sptep, spte);
				587	}
				588
				589	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
				590	{
				591	union split_spte *ssptep, sspte, orig;
				592
				593	ssptep = (union split_spte *)sptep;
				594	sspte = (union split_spte)spte;
				595
				596	/* xchg acts as a barrier before the setting of the high bits */
				597	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
				598	orig.spte_high = ssptep->spte_high;
				599	ssptep->spte_high = sspte.spte_high;
				600	count_spte_clear(sptep, spte);
				601
				602	return orig.spte;
				603	}
				604
				605	/*
				606	* The idea using the light way get the spte on x86_32 guest is from
				607	* gup_get_pte(arch/x86/mm/gup.c).
				608	*
				609	* An spte tlb flush may be pending, because kvm_set_pte_rmapp
				610	* coalesces them and we are running out of the MMU lock. Therefore
				611	* we need to protect against in-progress updates of the spte.
				612	*
				613	* Reading the spte while an update is in progress may get the old value
				614	* for the high part of the spte. The race is fine for a present->non-present
				615	* change (because the high part of the spte is ignored for non-present spte),
				616	* but for a present->present change we must reread the spte.
				617	*
				618	* All such changes are done in two steps (present->non-present and
				619	* non-present->present), hence it is enough to count the number of
				620	* present->non-present updates: if it changed while reading the spte,
				621	* we might have hit the race. This is done using clear_spte_count.
				622	*/
				623	static u64 __get_spte_lockless(u64 *sptep)
				624	{
				625	struct kvm_mmu_page *sp = page_header(__pa(sptep));
				626	union split_spte spte, orig = (union split_spte )sptep;
				627	int count;
				628
				629	retry:
				630	count = sp->clear_spte_count;
				631	smp_rmb();
				632
				633	spte.spte_low = orig->spte_low;
				634	smp_rmb();
				635
				636	spte.spte_high = orig->spte_high;
				637	smp_rmb();
				638
				639	if (unlikely(spte.spte_low != orig->spte_low \|\|
				640	count != sp->clear_spte_count))
				641	goto retry;
				642
				643	return spte.spte;
				644	}
				645	#endif
				646
				647	static bool spte_can_locklessly_be_made_writable(u64 spte)
				648	{
				649	return (spte & (SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE)) ==
				650	(SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE);
				651	}
				652
				653	static bool spte_has_volatile_bits(u64 spte)
				654	{
				655	if (!is_shadow_present_pte(spte))
				656	return false;
				657
				658	/*
				659	* Always atomically update spte if it can be updated
				660	* out of mmu-lock, it can ensure dirty bit is not lost,
				661	* also, it can help us to get a stable is_writable_pte()
				662	* to ensure tlb flush is not missed.
				663	*/
				664	if (spte_can_locklessly_be_made_writable(spte) \|\|
				665	is_access_track_spte(spte))
				666	return true;
				667
				668	if (spte_ad_enabled(spte)) {
				669	if ((spte & shadow_accessed_mask) == 0 \|\|
				670	(is_writable_pte(spte) && (spte & shadow_dirty_mask) == 0))
				671	return true;
				672	}
				673
				674	return false;
				675	}
				676
				677	static bool is_accessed_spte(u64 spte)
				678	{
				679	u64 accessed_mask = spte_shadow_accessed_mask(spte);
				680
				681	return accessed_mask ? spte & accessed_mask
				682	: !is_access_track_spte(spte);
				683	}
				684
				685	static bool is_dirty_spte(u64 spte)
				686	{
				687	u64 dirty_mask = spte_shadow_dirty_mask(spte);
				688
				689	return dirty_mask ? spte & dirty_mask : spte & PT_WRITABLE_MASK;
				690	}
				691
				692	/* Rules for using mmu_spte_set:
				693	* Set the sptep from nonpresent to present.
				694	* Note: the sptep being assigned must be either not present
				695	* or in a state where the hardware will not attempt to update
				696	* the spte.
				697	*/
				698	static void mmu_spte_set(u64 *sptep, u64 new_spte)
				699	{
				700	WARN_ON(is_shadow_present_pte(*sptep));
				701	__set_spte(sptep, new_spte);
				702	}
				703
				704	/*
				705	* Update the SPTE (excluding the PFN), but do not track changes in its
				706	* accessed/dirty status.
				707	*/
				708	static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
				709	{
				710	u64 old_spte = *sptep;
				711
				712	WARN_ON(!is_shadow_present_pte(new_spte));
				713
				714	if (!is_shadow_present_pte(old_spte)) {
				715	mmu_spte_set(sptep, new_spte);
				716	return old_spte;
				717	}
				718
				719	if (!spte_has_volatile_bits(old_spte))
				720	__update_clear_spte_fast(sptep, new_spte);
				721	else
				722	old_spte = __update_clear_spte_slow(sptep, new_spte);
				723
				724	WARN_ON(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
				725
				726	return old_spte;
				727	}
				728
				729	/* Rules for using mmu_spte_update:
				730	* Update the state bits, it means the mapped pfn is not changed.
				731	*
				732	* Whenever we overwrite a writable spte with a read-only one we
				733	* should flush remote TLBs. Otherwise rmap_write_protect
				734	* will find a read-only spte, even though the writable spte
				735	* might be cached on a CPU's TLB, the return value indicates this
				736	* case.
				737	*
				738	* Returns true if the TLB needs to be flushed
				739	*/
				740	static bool mmu_spte_update(u64 *sptep, u64 new_spte)
				741	{
				742	bool flush = false;
				743	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
				744
				745	if (!is_shadow_present_pte(old_spte))
				746	return false;
				747
				748	/*
				749	* For the spte updated out of mmu-lock is safe, since
				750	* we always atomically update it, see the comments in
				751	* spte_has_volatile_bits().
				752	*/
				753	if (spte_can_locklessly_be_made_writable(old_spte) &&
				754	!is_writable_pte(new_spte))
				755	flush = true;
				756
				757	/*
				758	* Flush TLB when accessed/dirty states are changed in the page tables,
				759	* to guarantee consistency between TLB and page tables.
				760	*/
				761
				762	if (is_accessed_spte(old_spte) && !is_accessed_spte(new_spte)) {
				763	flush = true;
				764	kvm_set_pfn_accessed(spte_to_pfn(old_spte));
				765	}
				766
				767	if (is_dirty_spte(old_spte) && !is_dirty_spte(new_spte)) {
				768	flush = true;
				769	kvm_set_pfn_dirty(spte_to_pfn(old_spte));
				770	}
				771
				772	return flush;
				773	}
				774
				775	/*
				776	* Rules for using mmu_spte_clear_track_bits:
				777	* It sets the sptep from present to nonpresent, and track the
				778	* state bits, it is used to clear the last level sptep.
				779	* Returns non-zero if the PTE was previously valid.
				780	*/
				781	static int mmu_spte_clear_track_bits(u64 *sptep)
				782	{
				783	kvm_pfn_t pfn;
				784	u64 old_spte = *sptep;
				785
				786	if (!spte_has_volatile_bits(old_spte))
				787	__update_clear_spte_fast(sptep, 0ull);
				788	else
				789	old_spte = __update_clear_spte_slow(sptep, 0ull);
				790
				791	if (!is_shadow_present_pte(old_spte))
				792	return 0;
				793
				794	pfn = spte_to_pfn(old_spte);
				795
				796	/*
				797	* KVM does not hold the refcount of the page used by
				798	* kvm mmu, before reclaiming the page, we should
				799	* unmap it from mmu first.
				800	*/
				801	WARN_ON(!kvm_is_reserved_pfn(pfn) && !page_count(pfn_to_page(pfn)));
				802
				803	if (is_accessed_spte(old_spte))
				804	kvm_set_pfn_accessed(pfn);
				805
				806	if (is_dirty_spte(old_spte))
				807	kvm_set_pfn_dirty(pfn);
				808
				809	return 1;
				810	}
				811
				812	/*
				813	* Rules for using mmu_spte_clear_no_track:
				814	* Directly clear spte without caring the state bits of sptep,
				815	* it is used to set the upper level spte.
				816	*/
				817	static void mmu_spte_clear_no_track(u64 *sptep)
				818	{
				819	__update_clear_spte_fast(sptep, 0ull);
				820	}
				821
				822	static u64 mmu_spte_get_lockless(u64 *sptep)
				823	{
				824	return __get_spte_lockless(sptep);
				825	}
				826
				827	static u64 mark_spte_for_access_track(u64 spte)
				828	{
				829	if (spte_ad_enabled(spte))
				830	return spte & ~shadow_accessed_mask;
				831
				832	if (is_access_track_spte(spte))
				833	return spte;
				834
				835	/*
				836	* Making an Access Tracking PTE will result in removal of write access
				837	* from the PTE. So, verify that we will be able to restore the write
				838	* access in the fast page fault path later on.
				839	*/
				840	WARN_ONCE((spte & PT_WRITABLE_MASK) &&
				841	!spte_can_locklessly_be_made_writable(spte),
				842	"kvm: Writable SPTE is not locklessly dirty-trackable\n");
				843
				844	WARN_ONCE(spte & (shadow_acc_track_saved_bits_mask <<
				845	shadow_acc_track_saved_bits_shift),
				846	"kvm: Access Tracking saved bit locations are not zero\n");
				847
				848	spte \|= (spte & shadow_acc_track_saved_bits_mask) <<
				849	shadow_acc_track_saved_bits_shift;
				850	spte &= ~shadow_acc_track_mask;
				851
				852	return spte;
				853	}
				854
				855	/* Restore an acc-track PTE back to a regular PTE */
				856	static u64 restore_acc_track_spte(u64 spte)
				857	{
				858	u64 new_spte = spte;
				859	u64 saved_bits = (spte >> shadow_acc_track_saved_bits_shift)
				860	& shadow_acc_track_saved_bits_mask;
				861
				862	WARN_ON_ONCE(spte_ad_enabled(spte));
				863	WARN_ON_ONCE(!is_access_track_spte(spte));
				864
				865	new_spte &= ~shadow_acc_track_mask;
				866	new_spte &= ~(shadow_acc_track_saved_bits_mask <<
				867	shadow_acc_track_saved_bits_shift);
				868	new_spte \|= saved_bits;
				869
				870	return new_spte;
				871	}
				872
				873	/* Returns the Accessed status of the PTE and resets it at the same time. */
				874	static bool mmu_spte_age(u64 *sptep)
				875	{
				876	u64 spte = mmu_spte_get_lockless(sptep);
				877
				878	if (!is_accessed_spte(spte))
				879	return false;
				880
				881	if (spte_ad_enabled(spte)) {
				882	clear_bit((ffs(shadow_accessed_mask) - 1),
				883	(unsigned long *)sptep);
				884	} else {
				885	/*
				886	* Capture the dirty status of the page, so that it doesn't get
				887	* lost when the SPTE is marked for access tracking.
				888	*/
				889	if (is_writable_pte(spte))
				890	kvm_set_pfn_dirty(spte_to_pfn(spte));
				891
				892	spte = mark_spte_for_access_track(spte);
				893	mmu_spte_update_no_track(sptep, spte);
				894	}
				895
				896	return true;
				897	}
				898
				899	static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
				900	{
				901	/*
				902	* Prevent page table teardown by making any free-er wait during
				903	* kvm_flush_remote_tlbs() IPI to all active vcpus.
				904	*/
				905	local_irq_disable();
				906
				907	/*
				908	* Make sure a following spte read is not reordered ahead of the write
				909	* to vcpu->mode.
				910	*/
				911	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
				912	}
				913
				914	static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
				915	{
				916	/*
				917	* Make sure the write to vcpu->mode is not reordered in front of
				918	* reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
				919	* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
				920	*/
				921	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
				922	local_irq_enable();
				923	}
				924
				925	static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
				926	struct kmem_cache *base_cache, int min)
				927	{
				928	void *obj;
				929
				930	if (cache->nobjs >= min)
				931	return 0;
				932	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				933	obj = kmem_cache_zalloc(base_cache, GFP_KERNEL);
				934	if (!obj)
				935	return -ENOMEM;
				936	cache->objects[cache->nobjs++] = obj;
				937	}
				938	return 0;
				939	}
				940
				941	static int mmu_memory_cache_free_objects(struct kvm_mmu_memory_cache *cache)
				942	{
				943	return cache->nobjs;
				944	}
				945
				946	static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
				947	struct kmem_cache *cache)
				948	{
				949	while (mc->nobjs)
				950	kmem_cache_free(cache, mc->objects[--mc->nobjs]);
				951	}
				952
				953	static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
				954	int min)
				955	{
				956	void *page;
				957
				958	if (cache->nobjs >= min)
				959	return 0;
				960	while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
				961	page = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
				962	if (!page)
				963	return -ENOMEM;
				964	cache->objects[cache->nobjs++] = page;
				965	}
				966	return 0;
				967	}
				968
				969	static void mmu_free_memory_cache_page(struct kvm_mmu_memory_cache *mc)
				970	{
				971	while (mc->nobjs)
				972	free_page((unsigned long)mc->objects[--mc->nobjs]);
				973	}
				974
				975	static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
				976	{
				977	int r;
				978
				979	r = mmu_topup_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				980	pte_list_desc_cache, 8 + PTE_PREFETCH_NUM);
				981	if (r)
				982	goto out;
				983	r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
				984	if (r)
				985	goto out;
				986	r = mmu_topup_memory_cache(&vcpu->arch.mmu_page_header_cache,
				987	mmu_page_header_cache, 4);
				988	out:
				989	return r;
				990	}
				991
				992	static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
				993	{
				994	mmu_free_memory_cache(&vcpu->arch.mmu_pte_list_desc_cache,
				995	pte_list_desc_cache);
				996	mmu_free_memory_cache_page(&vcpu->arch.mmu_page_cache);
				997	mmu_free_memory_cache(&vcpu->arch.mmu_page_header_cache,
				998	mmu_page_header_cache);
				999	}
				1000
				1001	static void mmu_memory_cache_alloc(struct kvm_mmu_memory_cache mc)
				1002	{
				1003	void *p;
				1004
				1005	BUG_ON(!mc->nobjs);
				1006	p = mc->objects[--mc->nobjs];
				1007	return p;
				1008	}
				1009
				1010	static struct pte_list_desc mmu_alloc_pte_list_desc(struct kvm_vcpu vcpu)
				1011	{
				1012	return mmu_memory_cache_alloc(&vcpu->arch.mmu_pte_list_desc_cache);
				1013	}
				1014
				1015	static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
				1016	{
				1017	kmem_cache_free(pte_list_desc_cache, pte_list_desc);
				1018	}
				1019
				1020	static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page *sp, int index)
				1021	{
				1022	if (!sp->role.direct)
				1023	return sp->gfns[index];
				1024
				1025	return sp->gfn + (index << ((sp->role.level - 1) * PT64_LEVEL_BITS));
				1026	}
				1027
				1028	static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
				1029	{
				1030	if (sp->role.direct)
				1031	BUG_ON(gfn != kvm_mmu_page_get_gfn(sp, index));
				1032	else
				1033	sp->gfns[index] = gfn;
				1034	}
				1035
				1036	/*
				1037	* Return the pointer to the large page information for a given gfn,
				1038	* handling slots that are not large page aligned.
				1039	*/
				1040	static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
				1041	struct kvm_memory_slot *slot,
				1042	int level)
				1043	{
				1044	unsigned long idx;
				1045
				1046	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1047	return &slot->arch.lpage_info[level - 2][idx];
				1048	}
				1049
				1050	static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot,
				1051	gfn_t gfn, int count)
				1052	{
				1053	struct kvm_lpage_info *linfo;
				1054	int i;
				1055
				1056	for (i = PT_DIRECTORY_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1057	linfo = lpage_info_slot(gfn, slot, i);
				1058	linfo->disallow_lpage += count;
				1059	WARN_ON(linfo->disallow_lpage < 0);
				1060	}
				1061	}
				1062
				1063	void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1064	{
				1065	update_gfn_disallow_lpage_count(slot, gfn, 1);
				1066	}
				1067
				1068	void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn)
				1069	{
				1070	update_gfn_disallow_lpage_count(slot, gfn, -1);
				1071	}
				1072
				1073	static void account_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1074	{
				1075	struct kvm_memslots *slots;
				1076	struct kvm_memory_slot *slot;
				1077	gfn_t gfn;
				1078
				1079	kvm->arch.indirect_shadow_pages++;
				1080	gfn = sp->gfn;
				1081	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1082	slot = __gfn_to_memslot(slots, gfn);
				1083
				1084	/* the non-leaf shadow pages are keeping readonly. */
				1085	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1086	return kvm_slot_page_track_add_page(kvm, slot, gfn,
				1087	KVM_PAGE_TRACK_WRITE);
				1088
				1089	kvm_mmu_gfn_disallow_lpage(slot, gfn);
				1090	}
				1091
				1092	static void unaccount_shadowed(struct kvm kvm, struct kvm_mmu_page sp)
				1093	{
				1094	struct kvm_memslots *slots;
				1095	struct kvm_memory_slot *slot;
				1096	gfn_t gfn;
				1097
				1098	kvm->arch.indirect_shadow_pages--;
				1099	gfn = sp->gfn;
				1100	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1101	slot = __gfn_to_memslot(slots, gfn);
				1102	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				1103	return kvm_slot_page_track_remove_page(kvm, slot, gfn,
				1104	KVM_PAGE_TRACK_WRITE);
				1105
				1106	kvm_mmu_gfn_allow_lpage(slot, gfn);
				1107	}
				1108
				1109	static bool __mmu_gfn_lpage_is_disallowed(gfn_t gfn, int level,
				1110	struct kvm_memory_slot *slot)
				1111	{
				1112	struct kvm_lpage_info *linfo;
				1113
				1114	if (slot) {
				1115	linfo = lpage_info_slot(gfn, slot, level);
				1116	return !!linfo->disallow_lpage;
				1117	}
				1118
				1119	return true;
				1120	}
				1121
				1122	static bool mmu_gfn_lpage_is_disallowed(struct kvm_vcpu *vcpu, gfn_t gfn,
				1123	int level)
				1124	{
				1125	struct kvm_memory_slot *slot;
				1126
				1127	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1128	return __mmu_gfn_lpage_is_disallowed(gfn, level, slot);
				1129	}
				1130
				1131	static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
				1132	{
				1133	unsigned long page_size;
				1134	int i, ret = 0;
				1135
				1136	page_size = kvm_host_page_size(kvm, gfn);
				1137
				1138	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1139	if (page_size >= KVM_HPAGE_SIZE(i))
				1140	ret = i;
				1141	else
				1142	break;
				1143	}
				1144
				1145	return ret;
				1146	}
				1147
				1148	static inline bool memslot_valid_for_gpte(struct kvm_memory_slot *slot,
				1149	bool no_dirty_log)
				1150	{
				1151	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
				1152	return false;
				1153	if (no_dirty_log && slot->dirty_bitmap)
				1154	return false;
				1155
				1156	return true;
				1157	}
				1158
				1159	static struct kvm_memory_slot *
				1160	gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
				1161	bool no_dirty_log)
				1162	{
				1163	struct kvm_memory_slot *slot;
				1164
				1165	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1166	if (!memslot_valid_for_gpte(slot, no_dirty_log))
				1167	slot = NULL;
				1168
				1169	return slot;
				1170	}
				1171
				1172	static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn,
				1173	bool *force_pt_level)
				1174	{
				1175	int host_level, level, max_level;
				1176	struct kvm_memory_slot *slot;
				1177
				1178	if (unlikely(*force_pt_level))
				1179	return PT_PAGE_TABLE_LEVEL;
				1180
				1181	slot = kvm_vcpu_gfn_to_memslot(vcpu, large_gfn);
				1182	*force_pt_level = !memslot_valid_for_gpte(slot, true);
				1183	if (unlikely(*force_pt_level))
				1184	return PT_PAGE_TABLE_LEVEL;
				1185
				1186	host_level = host_mapping_level(vcpu->kvm, large_gfn);
				1187
				1188	if (host_level == PT_PAGE_TABLE_LEVEL)
				1189	return host_level;
				1190
				1191	max_level = min(kvm_x86_ops->get_lpage_level(), host_level);
				1192
				1193	for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
				1194	if (__mmu_gfn_lpage_is_disallowed(large_gfn, level, slot))
				1195	break;
				1196
				1197	return level - 1;
				1198	}
				1199
				1200	/*
				1201	* About rmap_head encoding:
				1202	*
				1203	* If the bit zero of rmap_head->val is clear, then it points to the only spte
				1204	* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
				1205	* pte_list_desc containing more mappings.
				1206	*/
				1207
				1208	/*
				1209	* Returns the number of pointers in the rmap chain, not counting the new one.
				1210	*/
				1211	static int pte_list_add(struct kvm_vcpu vcpu, u64 spte,
				1212	struct kvm_rmap_head *rmap_head)
				1213	{
				1214	struct pte_list_desc *desc;
				1215	int i, count = 0;
				1216
				1217	if (!rmap_head->val) {
				1218	rmap_printk("pte_list_add: %p %llx 0->1\n", spte, *spte);
				1219	rmap_head->val = (unsigned long)spte;
				1220	} else if (!(rmap_head->val & 1)) {
				1221	rmap_printk("pte_list_add: %p %llx 1->many\n", spte, *spte);
				1222	desc = mmu_alloc_pte_list_desc(vcpu);
				1223	desc->sptes[0] = (u64 *)rmap_head->val;
				1224	desc->sptes[1] = spte;
				1225	rmap_head->val = (unsigned long)desc \| 1;
				1226	++count;
				1227	} else {
				1228	rmap_printk("pte_list_add: %p %llx many->many\n", spte, *spte);
				1229	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1230	while (desc->sptes[PTE_LIST_EXT-1] && desc->more) {
				1231	desc = desc->more;
				1232	count += PTE_LIST_EXT;
				1233	}
				1234	if (desc->sptes[PTE_LIST_EXT-1]) {
				1235	desc->more = mmu_alloc_pte_list_desc(vcpu);
				1236	desc = desc->more;
				1237	}
				1238	for (i = 0; desc->sptes[i]; ++i)
				1239	++count;
				1240	desc->sptes[i] = spte;
				1241	}
				1242	return count;
				1243	}
				1244
				1245	static void
				1246	pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head,
				1247	struct pte_list_desc *desc, int i,
				1248	struct pte_list_desc *prev_desc)
				1249	{
				1250	int j;
				1251
				1252	for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j)
				1253	;
				1254	desc->sptes[i] = desc->sptes[j];
				1255	desc->sptes[j] = NULL;
				1256	if (j != 0)
				1257	return;
				1258	if (!prev_desc && !desc->more)
				1259	rmap_head->val = (unsigned long)desc->sptes[0];
				1260	else
				1261	if (prev_desc)
				1262	prev_desc->more = desc->more;
				1263	else
				1264	rmap_head->val = (unsigned long)desc->more \| 1;
				1265	mmu_free_pte_list_desc(desc);
				1266	}
				1267
				1268	static void pte_list_remove(u64 spte, struct kvm_rmap_head rmap_head)
				1269	{
				1270	struct pte_list_desc *desc;
				1271	struct pte_list_desc *prev_desc;
				1272	int i;
				1273
				1274	if (!rmap_head->val) {
				1275	printk(KERN_ERR "pte_list_remove: %p 0->BUG\n", spte);
				1276	BUG();
				1277	} else if (!(rmap_head->val & 1)) {
				1278	rmap_printk("pte_list_remove: %p 1->0\n", spte);
				1279	if ((u64 *)rmap_head->val != spte) {
				1280	printk(KERN_ERR "pte_list_remove: %p 1->BUG\n", spte);
				1281	BUG();
				1282	}
				1283	rmap_head->val = 0;
				1284	} else {
				1285	rmap_printk("pte_list_remove: %p many->many\n", spte);
				1286	desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1287	prev_desc = NULL;
				1288	while (desc) {
				1289	for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) {
				1290	if (desc->sptes[i] == spte) {
				1291	pte_list_desc_remove_entry(rmap_head,
				1292	desc, i, prev_desc);
				1293	return;
				1294	}
				1295	}
				1296	prev_desc = desc;
				1297	desc = desc->more;
				1298	}
				1299	pr_err("pte_list_remove: %p many->many\n", spte);
				1300	BUG();
				1301	}
				1302	}
				1303
				1304	static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level,
				1305	struct kvm_memory_slot *slot)
				1306	{
				1307	unsigned long idx;
				1308
				1309	idx = gfn_to_index(gfn, slot->base_gfn, level);
				1310	return &slot->arch.rmap[level - PT_PAGE_TABLE_LEVEL][idx];
				1311	}
				1312
				1313	static struct kvm_rmap_head gfn_to_rmap(struct kvm kvm, gfn_t gfn,
				1314	struct kvm_mmu_page *sp)
				1315	{
				1316	struct kvm_memslots *slots;
				1317	struct kvm_memory_slot *slot;
				1318
				1319	slots = kvm_memslots_for_spte_role(kvm, sp->role);
				1320	slot = __gfn_to_memslot(slots, gfn);
				1321	return __gfn_to_rmap(gfn, sp->role.level, slot);
				1322	}
				1323
				1324	static bool rmap_can_add(struct kvm_vcpu *vcpu)
				1325	{
				1326	struct kvm_mmu_memory_cache *cache;
				1327
				1328	cache = &vcpu->arch.mmu_pte_list_desc_cache;
				1329	return mmu_memory_cache_free_objects(cache);
				1330	}
				1331
				1332	static int rmap_add(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1333	{
				1334	struct kvm_mmu_page *sp;
				1335	struct kvm_rmap_head *rmap_head;
				1336
				1337	sp = page_header(__pa(spte));
				1338	kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn);
				1339	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1340	return pte_list_add(vcpu, spte, rmap_head);
				1341	}
				1342
				1343	static void rmap_remove(struct kvm kvm, u64 spte)
				1344	{
				1345	struct kvm_mmu_page *sp;
				1346	gfn_t gfn;
				1347	struct kvm_rmap_head *rmap_head;
				1348
				1349	sp = page_header(__pa(spte));
				1350	gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
				1351	rmap_head = gfn_to_rmap(kvm, gfn, sp);
				1352	pte_list_remove(spte, rmap_head);
				1353	}
				1354
				/*
				 * Used by the following functions to iterate through the sptes linked by a
				 * rmap.  All fields are private and not assumed to be used outside.
				 */
				struct rmap_iterator {
					/* private fields */
					struct pte_list_desc *desc;	/* holds the sptep if not NULL */
					int pos;			/* index of the sptep */
				};
				1364
				1365	/*
				1366	* Iteration must be started by this function. This should also be used after
				1367	* removing/dropping sptes from the rmap link because in such cases the
				1368	* information in the itererator may not be valid.
				1369	*
				1370	* Returns sptep if found, NULL otherwise.
				1371	*/
				1372	static u64 rmap_get_first(struct kvm_rmap_head rmap_head,
				1373	struct rmap_iterator *iter)
				1374	{
				1375	u64 *sptep;
				1376
				1377	if (!rmap_head->val)
				1378	return NULL;
				1379
				1380	if (!(rmap_head->val & 1)) {
				1381	iter->desc = NULL;
				1382	sptep = (u64 *)rmap_head->val;
				1383	goto out;
				1384	}
				1385
				1386	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~1ul);
				1387	iter->pos = 0;
				1388	sptep = iter->desc->sptes[iter->pos];
				1389	out:
				1390	BUG_ON(!is_shadow_present_pte(*sptep));
				1391	return sptep;
				1392	}
				1393
				1394	/*
				1395	* Must be used with a valid iterator: e.g. after rmap_get_first().
				1396	*
				1397	* Returns sptep if found, NULL otherwise.
				1398	*/
				1399	static u64 rmap_get_next(struct rmap_iterator iter)
				1400	{
				1401	u64 *sptep;
				1402
				1403	if (iter->desc) {
				1404	if (iter->pos < PTE_LIST_EXT - 1) {
				1405	++iter->pos;
				1406	sptep = iter->desc->sptes[iter->pos];
				1407	if (sptep)
				1408	goto out;
				1409	}
				1410
				1411	iter->desc = iter->desc->more;
				1412
				1413	if (iter->desc) {
				1414	iter->pos = 0;
				1415	/* desc->sptes[0] cannot be NULL */
				1416	sptep = iter->desc->sptes[iter->pos];
				1417	goto out;
				1418	}
				1419	}
				1420
				1421	return NULL;
				1422	out:
				1423	BUG_ON(!is_shadow_present_pte(*sptep));
				1424	return sptep;
				1425	}
				1426
				/*
				 * Walk every spte linked from an rmap head.  @_it is a pointer to a
				 * struct rmap_iterator; @_sptep receives each spte pointer in turn and
				 * is NULL once the walk is over.
				 */
				#define for_each_rmap_spte(_head, _it, _sptep)			\
					for (_sptep = rmap_get_first(_head, _it);		\
					     _sptep; _sptep = rmap_get_next(_it))
				1430
				1431	static void drop_spte(struct kvm kvm, u64 sptep)
				1432	{
				1433	if (mmu_spte_clear_track_bits(sptep))
				1434	rmap_remove(kvm, sptep);
				1435	}
				1436
				1437
				1438	static bool __drop_large_spte(struct kvm kvm, u64 sptep)
				1439	{
				1440	if (is_large_pte(*sptep)) {
				1441	WARN_ON(page_header(__pa(sptep))->role.level ==
				1442	PT_PAGE_TABLE_LEVEL);
				1443	drop_spte(kvm, sptep);
				1444	--kvm->stat.lpages;
				1445	return true;
				1446	}
				1447
				1448	return false;
				1449	}
				1450
				1451	static void drop_large_spte(struct kvm_vcpu vcpu, u64 sptep)
				1452	{
				1453	if (__drop_large_spte(vcpu->kvm, sptep))
				1454	kvm_flush_remote_tlbs(vcpu->kvm);
				1455	}
				1456
				1457	/*
				1458	* Write-protect on the specified @sptep, @pt_protect indicates whether
				1459	* spte write-protection is caused by protecting shadow page table.
				1460	*
				1461	* Note: write protection is difference between dirty logging and spte
				1462	* protection:
				1463	* - for dirty logging, the spte can be set to writable at anytime if
				1464	* its dirty bitmap is properly set.
				1465	* - for spte protection, the spte can be writable only after unsync-ing
				1466	* shadow page.
				1467	*
				1468	* Return true if tlb need be flushed.
				1469	*/
				1470	static bool spte_write_protect(u64 *sptep, bool pt_protect)
				1471	{
				1472	u64 spte = *sptep;
				1473
				1474	if (!is_writable_pte(spte) &&
				1475	!(pt_protect && spte_can_locklessly_be_made_writable(spte)))
				1476	return false;
				1477
				1478	rmap_printk("rmap_write_protect: spte %p %llx\n", sptep, *sptep);
				1479
				1480	if (pt_protect)
				1481	spte &= ~SPTE_MMU_WRITEABLE;
				1482	spte = spte & ~PT_WRITABLE_MASK;
				1483
				1484	return mmu_spte_update(sptep, spte);
				1485	}
				1486
				1487	static bool __rmap_write_protect(struct kvm *kvm,
				1488	struct kvm_rmap_head *rmap_head,
				1489	bool pt_protect)
				1490	{
				1491	u64 *sptep;
				1492	struct rmap_iterator iter;
				1493	bool flush = false;
				1494
				1495	for_each_rmap_spte(rmap_head, &iter, sptep)
				1496	flush \|= spte_write_protect(sptep, pt_protect);
				1497
				1498	return flush;
				1499	}
				1500
				1501	static bool spte_clear_dirty(u64 *sptep)
				1502	{
				1503	u64 spte = *sptep;
				1504
				1505	rmap_printk("rmap_clear_dirty: spte %p %llx\n", sptep, *sptep);
				1506
				1507	spte &= ~shadow_dirty_mask;
				1508
				1509	return mmu_spte_update(sptep, spte);
				1510	}
				1511
				1512	static bool wrprot_ad_disabled_spte(u64 *sptep)
				1513	{
				1514	bool was_writable = test_and_clear_bit(PT_WRITABLE_SHIFT,
				1515	(unsigned long *)sptep);
				1516	if (was_writable)
				1517	kvm_set_pfn_dirty(spte_to_pfn(*sptep));
				1518
				1519	return was_writable;
				1520	}
				1521
				1522	/*
				1523	* Gets the GFN ready for another round of dirty logging by clearing the
				1524	* - D bit on ad-enabled SPTEs, and
				1525	* - W bit on ad-disabled SPTEs.
				1526	* Returns true iff any D or W bits were cleared.
				1527	*/
				1528	static bool __rmap_clear_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1529	{
				1530	u64 *sptep;
				1531	struct rmap_iterator iter;
				1532	bool flush = false;
				1533
				1534	for_each_rmap_spte(rmap_head, &iter, sptep)
				1535	if (spte_ad_enabled(*sptep))
				1536	flush \|= spte_clear_dirty(sptep);
				1537	else
				1538	flush \|= wrprot_ad_disabled_spte(sptep);
				1539
				1540	return flush;
				1541	}
				1542
				1543	static bool spte_set_dirty(u64 *sptep)
				1544	{
				1545	u64 spte = *sptep;
				1546
				1547	rmap_printk("rmap_set_dirty: spte %p %llx\n", sptep, *sptep);
				1548
				1549	spte \|= shadow_dirty_mask;
				1550
				1551	return mmu_spte_update(sptep, spte);
				1552	}
				1553
				1554	static bool __rmap_set_dirty(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1555	{
				1556	u64 *sptep;
				1557	struct rmap_iterator iter;
				1558	bool flush = false;
				1559
				1560	for_each_rmap_spte(rmap_head, &iter, sptep)
				1561	if (spte_ad_enabled(*sptep))
				1562	flush \|= spte_set_dirty(sptep);
				1563
				1564	return flush;
				1565	}
				1566
				1567	/**
				1568	* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
				1569	* @kvm: kvm instance
				1570	* @slot: slot to protect
				1571	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1572	* @mask: indicates which pages we should protect
				1573	*
				1574	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1575	* logging we do not have any such mappings.
				1576	*/
				1577	static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
				1578	struct kvm_memory_slot *slot,
				1579	gfn_t gfn_offset, unsigned long mask)
				1580	{
				1581	struct kvm_rmap_head *rmap_head;
				1582
				1583	while (mask) {
				1584	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1585	PT_PAGE_TABLE_LEVEL, slot);
				1586	__rmap_write_protect(kvm, rmap_head, false);
				1587
				1588	/* clear the first set bit */
				1589	mask &= mask - 1;
				1590	}
				1591	}
				1592
				1593	/**
				1594	* kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
				1595	* protect the page if the D-bit isn't supported.
				1596	* @kvm: kvm instance
				1597	* @slot: slot to clear D-bit
				1598	* @gfn_offset: start of the BITS_PER_LONG pages we care about
				1599	* @mask: indicates which pages we should clear D-bit
				1600	*
				1601	* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
				1602	*/
				1603	void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
				1604	struct kvm_memory_slot *slot,
				1605	gfn_t gfn_offset, unsigned long mask)
				1606	{
				1607	struct kvm_rmap_head *rmap_head;
				1608
				1609	while (mask) {
				1610	rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask),
				1611	PT_PAGE_TABLE_LEVEL, slot);
				1612	__rmap_clear_dirty(kvm, rmap_head);
				1613
				1614	/* clear the first set bit */
				1615	mask &= mask - 1;
				1616	}
				1617	}
				1618	EXPORT_SYMBOL_GPL(kvm_mmu_clear_dirty_pt_masked);
				1619
				1620	/**
				1621	* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
				1622	* PT level pages.
				1623	*
				1624	* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
				1625	* enable dirty logging for them.
				1626	*
				1627	* Used when we do not need to care about huge page mappings: e.g. during dirty
				1628	* logging we do not have any such mappings.
				1629	*/
				1630	void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
				1631	struct kvm_memory_slot *slot,
				1632	gfn_t gfn_offset, unsigned long mask)
				1633	{
				1634	if (kvm_x86_ops->enable_log_dirty_pt_masked)
				1635	kvm_x86_ops->enable_log_dirty_pt_masked(kvm, slot, gfn_offset,
				1636	mask);
				1637	else
				1638	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
				1639	}
				1640
				1641	/**
				1642	* kvm_arch_write_log_dirty - emulate dirty page logging
				1643	* @vcpu: Guest mode vcpu
				1644	*
				1645	* Emulate arch specific page modification logging for the
				1646	* nested hypervisor
				1647	*/
				1648	int kvm_arch_write_log_dirty(struct kvm_vcpu *vcpu)
				1649	{
				1650	if (kvm_x86_ops->write_log_dirty)
				1651	return kvm_x86_ops->write_log_dirty(vcpu);
				1652
				1653	return 0;
				1654	}
				1655
				1656	bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
				1657	struct kvm_memory_slot *slot, u64 gfn)
				1658	{
				1659	struct kvm_rmap_head *rmap_head;
				1660	int i;
				1661	bool write_protected = false;
				1662
				1663	for (i = PT_PAGE_TABLE_LEVEL; i <= PT_MAX_HUGEPAGE_LEVEL; ++i) {
				1664	rmap_head = __gfn_to_rmap(gfn, i, slot);
				1665	write_protected \|= __rmap_write_protect(kvm, rmap_head, true);
				1666	}
				1667
				1668	return write_protected;
				1669	}
				1670
				1671	static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn)
				1672	{
				1673	struct kvm_memory_slot *slot;
				1674
				1675	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				1676	return kvm_mmu_slot_gfn_write_protect(vcpu->kvm, slot, gfn);
				1677	}
				1678
				1679	static bool kvm_zap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head)
				1680	{
				1681	u64 *sptep;
				1682	struct rmap_iterator iter;
				1683	bool flush = false;
				1684
				1685	while ((sptep = rmap_get_first(rmap_head, &iter))) {
				1686	rmap_printk("%s: spte %p %llx.\n", __func__, sptep, *sptep);
				1687
				1688	drop_spte(kvm, sptep);
				1689	flush = true;
				1690	}
				1691
				1692	return flush;
				1693	}
				1694
				1695	static int kvm_unmap_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1696	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1697	unsigned long data)
				1698	{
				1699	return kvm_zap_rmapp(kvm, rmap_head);
				1700	}
				1701
				1702	static int kvm_set_pte_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1703	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1704	unsigned long data)
				1705	{
				1706	u64 *sptep;
				1707	struct rmap_iterator iter;
				1708	int need_flush = 0;
				1709	u64 new_spte;
				1710	pte_t ptep = (pte_t )data;
				1711	kvm_pfn_t new_pfn;
				1712
				1713	WARN_ON(pte_huge(*ptep));
				1714	new_pfn = pte_pfn(*ptep);
				1715
				1716	restart:
				1717	for_each_rmap_spte(rmap_head, &iter, sptep) {
				1718	rmap_printk("kvm_set_pte_rmapp: spte %p %llx gfn %llx (%d)\n",
				1719	sptep, *sptep, gfn, level);
				1720
				1721	need_flush = 1;
				1722
				1723	if (pte_write(*ptep)) {
				1724	drop_spte(kvm, sptep);
				1725	goto restart;
				1726	} else {
				1727	new_spte = *sptep & ~PT64_BASE_ADDR_MASK;
				1728	new_spte \|= (u64)new_pfn << PAGE_SHIFT;
				1729
				1730	new_spte &= ~PT_WRITABLE_MASK;
				1731	new_spte &= ~SPTE_HOST_WRITEABLE;
				1732
				1733	new_spte = mark_spte_for_access_track(new_spte);
				1734
				1735	mmu_spte_clear_track_bits(sptep);
				1736	mmu_spte_set(sptep, new_spte);
				1737	}
				1738	}
				1739
				1740	if (need_flush)
				1741	kvm_flush_remote_tlbs(kvm);
				1742
				1743	return 0;
				1744	}
				1745
				1746	struct slot_rmap_walk_iterator {
				1747	/* input fields. */
				1748	struct kvm_memory_slot *slot;
				1749	gfn_t start_gfn;
				1750	gfn_t end_gfn;
				1751	int start_level;
				1752	int end_level;
				1753
				1754	/* output fields. */
				1755	gfn_t gfn;
				1756	struct kvm_rmap_head *rmap;
				1757	int level;
				1758
				1759	/* private field. */
				1760	struct kvm_rmap_head *end_rmap;
				1761	};
				1762
				1763	static void
				1764	rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level)
				1765	{
				1766	iterator->level = level;
				1767	iterator->gfn = iterator->start_gfn;
				1768	iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot);
				1769	iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level,
				1770	iterator->slot);
				1771	}
				1772
				1773	static void
				1774	slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
				1775	struct kvm_memory_slot *slot, int start_level,
				1776	int end_level, gfn_t start_gfn, gfn_t end_gfn)
				1777	{
				1778	iterator->slot = slot;
				1779	iterator->start_level = start_level;
				1780	iterator->end_level = end_level;
				1781	iterator->start_gfn = start_gfn;
				1782	iterator->end_gfn = end_gfn;
				1783
				1784	rmap_walk_init_level(iterator, iterator->start_level);
				1785	}
				1786
				1787	static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
				1788	{
				1789	return !!iterator->rmap;
				1790	}
				1791
				1792	static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
				1793	{
				1794	if (++iterator->rmap <= iterator->end_rmap) {
				1795	iterator->gfn += (1UL << KVM_HPAGE_GFN_SHIFT(iterator->level));
				1796	return;
				1797	}
				1798
				1799	if (++iterator->level > iterator->end_level) {
				1800	iterator->rmap = NULL;
				1801	return;
				1802	}
				1803
				1804	rmap_walk_init_level(iterator, iterator->level);
				1805	}
				1806
				/*
				 * Visit every rmap head of @_slot for gfns [@_first_gfn, @_last_gfn] at
				 * each level in [@_first_level, @_last_level].  @_it points to a
				 * struct slot_rmap_walk_iterator whose output fields describe the
				 * current position.
				 */
				#define for_each_slot_rmap_range(_slot, _first_level, _last_level,	\
								 _first_gfn, _last_gfn, _it)		\
					for (slot_rmap_walk_init(_it, _slot, _first_level,		\
								 _last_level, _first_gfn, _last_gfn);	\
					     slot_rmap_walk_okay(_it);					\
					     slot_rmap_walk_next(_it))
				1813
				1814	static int kvm_handle_hva_range(struct kvm *kvm,
				1815	unsigned long start,
				1816	unsigned long end,
				1817	unsigned long data,
				1818	int (handler)(struct kvm kvm,
				1819	struct kvm_rmap_head *rmap_head,
				1820	struct kvm_memory_slot *slot,
				1821	gfn_t gfn,
				1822	int level,
				1823	unsigned long data))
				1824	{
				1825	struct kvm_memslots *slots;
				1826	struct kvm_memory_slot *memslot;
				1827	struct slot_rmap_walk_iterator iterator;
				1828	int ret = 0;
				1829	int i;
				1830
				1831	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				1832	slots = __kvm_memslots(kvm, i);
				1833	kvm_for_each_memslot(memslot, slots) {
				1834	unsigned long hva_start, hva_end;
				1835	gfn_t gfn_start, gfn_end;
				1836
				1837	hva_start = max(start, memslot->userspace_addr);
				1838	hva_end = min(end, memslot->userspace_addr +
				1839	(memslot->npages << PAGE_SHIFT));
				1840	if (hva_start >= hva_end)
				1841	continue;
				1842	/*
				1843	* {gfn(page) \| page intersects with [hva_start, hva_end)} =
				1844	* {gfn_start, gfn_start+1, ..., gfn_end-1}.
				1845	*/
				1846	gfn_start = hva_to_gfn_memslot(hva_start, memslot);
				1847	gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot);
				1848
				1849	for_each_slot_rmap_range(memslot, PT_PAGE_TABLE_LEVEL,
				1850	PT_MAX_HUGEPAGE_LEVEL,
				1851	gfn_start, gfn_end - 1,
				1852	&iterator)
				1853	ret \|= handler(kvm, iterator.rmap, memslot,
				1854	iterator.gfn, iterator.level, data);
				1855	}
				1856	}
				1857
				1858	return ret;
				1859	}
				1860
				1861	static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
				1862	unsigned long data,
				1863	int (handler)(struct kvm kvm,
				1864	struct kvm_rmap_head *rmap_head,
				1865	struct kvm_memory_slot *slot,
				1866	gfn_t gfn, int level,
				1867	unsigned long data))
				1868	{
				1869	return kvm_handle_hva_range(kvm, hva, hva + 1, data, handler);
				1870	}
				1871
				1872	int kvm_unmap_hva_range(struct kvm *kvm, unsigned long start, unsigned long end)
				1873	{
				1874	return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp);
				1875	}
				1876
				1877	void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte)
				1878	{
				1879	kvm_handle_hva(kvm, hva, (unsigned long)&pte, kvm_set_pte_rmapp);
				1880	}
				1881
				1882	static int kvm_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1883	struct kvm_memory_slot *slot, gfn_t gfn, int level,
				1884	unsigned long data)
				1885	{
				1886	u64 *sptep;
				1887	struct rmap_iterator uninitialized_var(iter);
				1888	int young = 0;
				1889
				1890	for_each_rmap_spte(rmap_head, &iter, sptep)
				1891	young \|= mmu_spte_age(sptep);
				1892
				1893	trace_kvm_age_page(gfn, level, slot, young);
				1894	return young;
				1895	}
				1896
				1897	static int kvm_test_age_rmapp(struct kvm kvm, struct kvm_rmap_head rmap_head,
				1898	struct kvm_memory_slot *slot, gfn_t gfn,
				1899	int level, unsigned long data)
				1900	{
				1901	u64 *sptep;
				1902	struct rmap_iterator iter;
				1903
				1904	for_each_rmap_spte(rmap_head, &iter, sptep)
				1905	if (is_accessed_spte(*sptep))
				1906	return 1;
				1907	return 0;
				1908	}
				1909
				/*
				 * NOTE(review): not referenced in this part of the file; presumably the
				 * rmap chain length above which rmap_recycle() is invoked - confirm at
				 * the point of use.
				 */
				#define RMAP_RECYCLE_THRESHOLD	1000
				1911
				1912	static void rmap_recycle(struct kvm_vcpu vcpu, u64 spte, gfn_t gfn)
				1913	{
				1914	struct kvm_rmap_head *rmap_head;
				1915	struct kvm_mmu_page *sp;
				1916
				1917	sp = page_header(__pa(spte));
				1918
				1919	rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp);
				1920
				1921	kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, 0);
				1922	kvm_flush_remote_tlbs(vcpu->kvm);
				1923	}
				1924
				1925	int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end)
				1926	{
				1927	return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp);
				1928	}
				1929
				1930	int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
				1931	{
				1932	return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
				1933	}
				1934
				1935	#ifdef MMU_DEBUG
				1936	static int is_empty_shadow_page(u64 *spt)
				1937	{
				1938	u64 *pos;
				1939	u64 *end;
				1940
				1941	for (pos = spt, end = pos + PAGE_SIZE / sizeof(u64); pos != end; pos++)
				1942	if (is_shadow_present_pte(*pos)) {
				1943	printk(KERN_ERR "%s: %p %llx\n", __func__,
				1944	pos, *pos);
				1945	return 0;
				1946	}
				1947	return 1;
				1948	}
				1949	#endif
				1950
				1951	/*
				1952	* This value is the sum of all of the kvm instances's
				1953	* kvm->arch.n_used_mmu_pages values. We need a global,
				1954	* aggregate version in order to make the slab shrinker
				1955	* faster
				1956	*/
				1957	static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
				1958	{
				1959	kvm->arch.n_used_mmu_pages += nr;
				1960	percpu_counter_add(&kvm_total_used_mmu_pages, nr);
				1961	}
				1962
				1963	static void kvm_mmu_free_page(struct kvm_mmu_page *sp)
				1964	{
				1965	MMU_WARN_ON(!is_empty_shadow_page(sp->spt));
				1966	hlist_del(&sp->hash_link);
				1967	list_del(&sp->link);
				1968	free_page((unsigned long)sp->spt);
				1969	if (!sp->role.direct)
				1970	free_page((unsigned long)sp->gfns);
				1971	kmem_cache_free(mmu_page_header_cache, sp);
				1972	}
				1973
				1974	static unsigned kvm_page_table_hashfn(gfn_t gfn)
				1975	{
				1976	return hash_64(gfn, KVM_MMU_HASH_SHIFT);
				1977	}
				1978
				1979	static void mmu_page_add_parent_pte(struct kvm_vcpu *vcpu,
				1980	struct kvm_mmu_page sp, u64 parent_pte)
				1981	{
				1982	if (!parent_pte)
				1983	return;
				1984
				1985	pte_list_add(vcpu, parent_pte, &sp->parent_ptes);
				1986	}
				1987
				1988	static void mmu_page_remove_parent_pte(struct kvm_mmu_page *sp,
				1989	u64 *parent_pte)
				1990	{
				1991	pte_list_remove(parent_pte, &sp->parent_ptes);
				1992	}
				1993
				1994	static void drop_parent_pte(struct kvm_mmu_page *sp,
				1995	u64 *parent_pte)
				1996	{
				1997	mmu_page_remove_parent_pte(sp, parent_pte);
				1998	mmu_spte_clear_no_track(parent_pte);
				1999	}
				2000
				2001	static struct kvm_mmu_page kvm_mmu_alloc_page(struct kvm_vcpu vcpu, int direct)
				2002	{
				2003	struct kvm_mmu_page *sp;
				2004
				2005	sp = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_header_cache);
				2006	sp->spt = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2007	if (!direct)
				2008	sp->gfns = mmu_memory_cache_alloc(&vcpu->arch.mmu_page_cache);
				2009	set_page_private(virt_to_page(sp->spt), (unsigned long)sp);
				2010
				2011	/*
				2012	* The active_mmu_pages list is the FIFO list, do not move the
				2013	* page until it is zapped. kvm_zap_obsolete_pages depends on
				2014	* this feature. See the comments in kvm_zap_obsolete_pages().
				2015	*/
				2016	list_add(&sp->link, &vcpu->kvm->arch.active_mmu_pages);
				2017	kvm_mod_used_mmu_pages(vcpu->kvm, +1);
				2018	return sp;
				2019	}
				2020
				2021	static void mark_unsync(u64 *spte);
				2022	static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
				2023	{
				2024	u64 *sptep;
				2025	struct rmap_iterator iter;
				2026
				2027	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
				2028	mark_unsync(sptep);
				2029	}
				2030	}
				2031
				2032	static void mark_unsync(u64 *spte)
				2033	{
				2034	struct kvm_mmu_page *sp;
				2035	unsigned int index;
				2036
				2037	sp = page_header(__pa(spte));
				2038	index = spte - sp->spt;
				2039	if (__test_and_set_bit(index, sp->unsync_child_bitmap))
				2040	return;
				2041	if (sp->unsync_children++)
				2042	return;
				2043	kvm_mmu_mark_parents_unsync(sp);
				2044	}
				2045
				2046	static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
				2047	struct kvm_mmu_page *sp)
				2048	{
				2049	return 0;
				2050	}
				2051
				2052	static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root)
				2053	{
				2054	}
				2055
				2056	static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
				2057	struct kvm_mmu_page sp, u64 spte,
				2058	const void *pte)
				2059	{
				2060	WARN_ON(1);
				2061	}
				2062
				2063	#define KVM_PAGE_ARRAY_NR 16
				2064
				2065	struct kvm_mmu_pages {
				2066	struct mmu_page_and_offset {
				2067	struct kvm_mmu_page *sp;
				2068	unsigned int idx;
				2069	} page[KVM_PAGE_ARRAY_NR];
				2070	unsigned int nr;
				2071	};
				2072
				2073	static int mmu_pages_add(struct kvm_mmu_pages pvec, struct kvm_mmu_page sp,
				2074	int idx)
				2075	{
				2076	int i;
				2077
				2078	if (sp->unsync)
				2079	for (i=0; i < pvec->nr; i++)
				2080	if (pvec->page[i].sp == sp)
				2081	return 0;
				2082
				2083	pvec->page[pvec->nr].sp = sp;
				2084	pvec->page[pvec->nr].idx = idx;
				2085	pvec->nr++;
				2086	return (pvec->nr == KVM_PAGE_ARRAY_NR);
				2087	}
				2088
				2089	static inline void clear_unsync_child_bit(struct kvm_mmu_page *sp, int idx)
				2090	{
				2091	--sp->unsync_children;
				2092	WARN_ON((int)sp->unsync_children < 0);
				2093	__clear_bit(idx, sp->unsync_child_bitmap);
				2094	}
				2095
				2096	static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
				2097	struct kvm_mmu_pages *pvec)
				2098	{
				2099	int i, ret, nr_unsync_leaf = 0;
				2100
				2101	for_each_set_bit(i, sp->unsync_child_bitmap, 512) {
				2102	struct kvm_mmu_page *child;
				2103	u64 ent = sp->spt[i];
				2104
				2105	if (!is_shadow_present_pte(ent) \|\| is_large_pte(ent)) {
				2106	clear_unsync_child_bit(sp, i);
				2107	continue;
				2108	}
				2109
				2110	child = page_header(ent & PT64_BASE_ADDR_MASK);
				2111
				2112	if (child->unsync_children) {
				2113	if (mmu_pages_add(pvec, child, i))
				2114	return -ENOSPC;
				2115
				2116	ret = __mmu_unsync_walk(child, pvec);
				2117	if (!ret) {
				2118	clear_unsync_child_bit(sp, i);
				2119	continue;
				2120	} else if (ret > 0) {
				2121	nr_unsync_leaf += ret;
				2122	} else
				2123	return ret;
				2124	} else if (child->unsync) {
				2125	nr_unsync_leaf++;
				2126	if (mmu_pages_add(pvec, child, i))
				2127	return -ENOSPC;
				2128	} else
				2129	clear_unsync_child_bit(sp, i);
				2130	}
				2131
				2132	return nr_unsync_leaf;
				2133	}
				2134
				2135	#define INVALID_INDEX (-1)
				2136
				2137	static int mmu_unsync_walk(struct kvm_mmu_page *sp,
				2138	struct kvm_mmu_pages *pvec)
				2139	{
				2140	pvec->nr = 0;
				2141	if (!sp->unsync_children)
				2142	return 0;
				2143
				2144	mmu_pages_add(pvec, sp, INVALID_INDEX);
				2145	return __mmu_unsync_walk(sp, pvec);
				2146	}
				2147
				2148	static void kvm_unlink_unsync_page(struct kvm kvm, struct kvm_mmu_page sp)
				2149	{
				2150	WARN_ON(!sp->unsync);
				2151	trace_kvm_mmu_sync_page(sp);
				2152	sp->unsync = 0;
				2153	--kvm->stat.mmu_unsync;
				2154	}
				2155
				2156	static int kvm_mmu_prepare_zap_page(struct kvm kvm, struct kvm_mmu_page sp,
				2157	struct list_head *invalid_list);
				2158	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				2159	struct list_head *invalid_list);
				2160
				2161	/*
				2162	* NOTE: we should pay more attention on the zapped-obsolete page
				2163	* (is_obsolete_sp(sp) && sp->role.invalid) when you do hash list walk
				2164	* since it has been deleted from active_mmu_pages but still can be found
				2165	* at hast list.
				2166	*
				2167	* for_each_valid_sp() has skipped that kind of pages.
				2168	*/
				2169	#define for_each_valid_sp(_kvm, _sp, _gfn) \
				2170	hlist_for_each_entry(_sp, \
				2171	&(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)], hash_link) \
				2172	if (is_obsolete_sp((_kvm), (_sp)) \|\| (_sp)->role.invalid) { \
				2173	} else
				2174
				2175	#define for_each_gfn_indirect_valid_sp(_kvm, _sp, _gfn) \
				2176	for_each_valid_sp(_kvm, _sp, _gfn) \
				2177	if ((_sp)->gfn != (_gfn) \|\| (_sp)->role.direct) {} else
				2178
				2179	/* @sp->gfn should be write-protected at the call site */
				2180	static bool __kvm_sync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				2181	struct list_head *invalid_list)
				2182	{
				2183	if (sp->role.cr4_pae != !!is_pae(vcpu)
				2184	\|\| vcpu->arch.mmu.sync_page(vcpu, sp) == 0) {
				2185	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
				2186	return false;
				2187	}
				2188
				2189	return true;
				2190	}
				2191
				2192	static void kvm_mmu_flush_or_zap(struct kvm_vcpu *vcpu,
				2193	struct list_head *invalid_list,
				2194	bool remote_flush, bool local_flush)
				2195	{
				2196	if (!list_empty(invalid_list)) {
				2197	kvm_mmu_commit_zap_page(vcpu->kvm, invalid_list);
				2198	return;
				2199	}
				2200
				2201	if (remote_flush)
				2202	kvm_flush_remote_tlbs(vcpu->kvm);
				2203	else if (local_flush)
				2204	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2205	}
				2206
				2207	#ifdef CONFIG_KVM_MMU_AUDIT
				2208	#include "mmu_audit.c"
				2209	#else
				2210	static void kvm_mmu_audit(struct kvm_vcpu *vcpu, int point) { }
				2211	static void mmu_audit_disable(void) { }
				2212	#endif
				2213
				2214	static bool is_obsolete_sp(struct kvm kvm, struct kvm_mmu_page sp)
				2215	{
				2216	return unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
				2217	}
				2218
				2219	static bool kvm_sync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				2220	struct list_head *invalid_list)
				2221	{
				2222	kvm_unlink_unsync_page(vcpu->kvm, sp);
				2223	return __kvm_sync_page(vcpu, sp, invalid_list);
				2224	}
				2225
				2226	/* @gfn should be write-protected at the call site */
				2227	static bool kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn,
				2228	struct list_head *invalid_list)
				2229	{
				2230	struct kvm_mmu_page *s;
				2231	bool ret = false;
				2232
				2233	for_each_gfn_indirect_valid_sp(vcpu->kvm, s, gfn) {
				2234	if (!s->unsync)
				2235	continue;
				2236
				2237	WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
				2238	ret \|= kvm_sync_page(vcpu, s, invalid_list);
				2239	}
				2240
				2241	return ret;
				2242	}
				2243
				2244	struct mmu_page_path {
				2245	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
				2246	unsigned int idx[PT64_ROOT_MAX_LEVEL];
				2247	};
				2248
				2249	#define for_each_sp(pvec, sp, parents, i) \
				2250	for (i = mmu_pages_first(&pvec, &parents); \
				2251	i < pvec.nr && ({ sp = pvec.page[i].sp; 1;}); \
				2252	i = mmu_pages_next(&pvec, &parents, i))
				2253
				2254	static int mmu_pages_next(struct kvm_mmu_pages *pvec,
				2255	struct mmu_page_path *parents,
				2256	int i)
				2257	{
				2258	int n;
				2259
				2260	for (n = i+1; n < pvec->nr; n++) {
				2261	struct kvm_mmu_page *sp = pvec->page[n].sp;
				2262	unsigned idx = pvec->page[n].idx;
				2263	int level = sp->role.level;
				2264
				2265	parents->idx[level-1] = idx;
				2266	if (level == PT_PAGE_TABLE_LEVEL)
				2267	break;
				2268
				2269	parents->parent[level-2] = sp;
				2270	}
				2271
				2272	return n;
				2273	}
				2274
				2275	static int mmu_pages_first(struct kvm_mmu_pages *pvec,
				2276	struct mmu_page_path *parents)
				2277	{
				2278	struct kvm_mmu_page *sp;
				2279	int level;
				2280
				2281	if (pvec->nr == 0)
				2282	return 0;
				2283
				2284	WARN_ON(pvec->page[0].idx != INVALID_INDEX);
				2285
				2286	sp = pvec->page[0].sp;
				2287	level = sp->role.level;
				2288	WARN_ON(level == PT_PAGE_TABLE_LEVEL);
				2289
				2290	parents->parent[level-2] = sp;
				2291
				2292	/* Also set up a sentinel. Further entries in pvec are all
				2293	* children of sp, so this element is never overwritten.
				2294	*/
				2295	parents->parent[level-1] = NULL;
				2296	return mmu_pages_next(pvec, parents, 0);
				2297	}
				2298
				2299	static void mmu_pages_clear_parents(struct mmu_page_path *parents)
				2300	{
				2301	struct kvm_mmu_page *sp;
				2302	unsigned int level = 0;
				2303
				2304	do {
				2305	unsigned int idx = parents->idx[level];
				2306	sp = parents->parent[level];
				2307	if (!sp)
				2308	return;
				2309
				2310	WARN_ON(idx == INVALID_INDEX);
				2311	clear_unsync_child_bit(sp, idx);
				2312	level++;
				2313	} while (!sp->unsync_children);
				2314	}
				2315
				2316	static void mmu_sync_children(struct kvm_vcpu *vcpu,
				2317	struct kvm_mmu_page *parent)
				2318	{
				2319	int i;
				2320	struct kvm_mmu_page *sp;
				2321	struct mmu_page_path parents;
				2322	struct kvm_mmu_pages pages;
				2323	LIST_HEAD(invalid_list);
				2324	bool flush = false;
				2325
				2326	while (mmu_unsync_walk(parent, &pages)) {
				2327	bool protected = false;
				2328
				2329	for_each_sp(pages, sp, parents, i)
				2330	protected \|= rmap_write_protect(vcpu, sp->gfn);
				2331
				2332	if (protected) {
				2333	kvm_flush_remote_tlbs(vcpu->kvm);
				2334	flush = false;
				2335	}
				2336
				2337	for_each_sp(pages, sp, parents, i) {
				2338	flush \|= kvm_sync_page(vcpu, sp, &invalid_list);
				2339	mmu_pages_clear_parents(&parents);
				2340	}
				2341	if (need_resched() \|\| spin_needbreak(&vcpu->kvm->mmu_lock)) {
				2342	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2343	cond_resched_lock(&vcpu->kvm->mmu_lock);
				2344	flush = false;
				2345	}
				2346	}
				2347
				2348	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2349	}
				2350
				2351	static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
				2352	{
				2353	atomic_set(&sp->write_flooding_count, 0);
				2354	}
				2355
				2356	static void clear_sp_write_flooding_count(u64 *spte)
				2357	{
				2358	struct kvm_mmu_page *sp = page_header(__pa(spte));
				2359
				2360	__clear_sp_write_flooding_count(sp);
				2361	}
				2362
				2363	static struct kvm_mmu_page kvm_mmu_get_page(struct kvm_vcpu vcpu,
				2364	gfn_t gfn,
				2365	gva_t gaddr,
				2366	unsigned level,
				2367	int direct,
				2368	unsigned access)
				2369	{
				2370	union kvm_mmu_page_role role;
				2371	unsigned quadrant;
				2372	struct kvm_mmu_page *sp;
				2373	bool need_sync = false;
				2374	bool flush = false;
				2375	int collisions = 0;
				2376	LIST_HEAD(invalid_list);
				2377
				2378	role = vcpu->arch.mmu.base_role;
				2379	role.level = level;
				2380	role.direct = direct;
				2381	if (role.direct)
				2382	role.cr4_pae = 0;
				2383	role.access = access;
				2384	if (!vcpu->arch.mmu.direct_map
				2385	&& vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
				2386	quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
				2387	quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
				2388	role.quadrant = quadrant;
				2389	}
				2390	for_each_valid_sp(vcpu->kvm, sp, gfn) {
				2391	if (sp->gfn != gfn) {
				2392	collisions++;
				2393	continue;
				2394	}
				2395
				2396	if (!need_sync && sp->unsync)
				2397	need_sync = true;
				2398
				2399	if (sp->role.word != role.word)
				2400	continue;
				2401
				2402	if (sp->unsync) {
				2403	/* The page is good, but __kvm_sync_page might still end
				2404	* up zapping it. If so, break in order to rebuild it.
				2405	*/
				2406	if (!__kvm_sync_page(vcpu, sp, &invalid_list))
				2407	break;
				2408
				2409	WARN_ON(!list_empty(&invalid_list));
				2410	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2411	}
				2412
				2413	if (sp->unsync_children)
				2414	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				2415
				2416	__clear_sp_write_flooding_count(sp);
				2417	trace_kvm_mmu_get_page(sp, false);
				2418	goto out;
				2419	}
				2420
				2421	++vcpu->kvm->stat.mmu_cache_miss;
				2422
				2423	sp = kvm_mmu_alloc_page(vcpu, direct);
				2424
				2425	sp->gfn = gfn;
				2426	sp->role = role;
				2427	hlist_add_head(&sp->hash_link,
				2428	&vcpu->kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)]);
				2429	if (!direct) {
				2430	/*
				2431	* we should do write protection before syncing pages
				2432	* otherwise the content of the synced shadow page may
				2433	* be inconsistent with guest page table.
				2434	*/
				2435	account_shadowed(vcpu->kvm, sp);
				2436	if (level == PT_PAGE_TABLE_LEVEL &&
				2437	rmap_write_protect(vcpu, gfn))
				2438	kvm_flush_remote_tlbs(vcpu->kvm);
				2439
				2440	if (level > PT_PAGE_TABLE_LEVEL && need_sync)
				2441	flush \|= kvm_sync_pages(vcpu, gfn, &invalid_list);
				2442	}
				2443	sp->mmu_valid_gen = vcpu->kvm->arch.mmu_valid_gen;
				2444	clear_page(sp->spt);
				2445	trace_kvm_mmu_get_page(sp, true);
				2446
				2447	kvm_mmu_flush_or_zap(vcpu, &invalid_list, false, flush);
				2448	out:
				2449	if (collisions > vcpu->kvm->stat.max_mmu_page_hash_collisions)
				2450	vcpu->kvm->stat.max_mmu_page_hash_collisions = collisions;
				2451	return sp;
				2452	}
				2453
				2454	static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
				2455	struct kvm_vcpu *vcpu, hpa_t root,
				2456	u64 addr)
				2457	{
				2458	iterator->addr = addr;
				2459	iterator->shadow_addr = root;
				2460	iterator->level = vcpu->arch.mmu.shadow_root_level;
				2461
				2462	if (iterator->level == PT64_ROOT_4LEVEL &&
				2463	vcpu->arch.mmu.root_level < PT64_ROOT_4LEVEL &&
				2464	!vcpu->arch.mmu.direct_map)
				2465	--iterator->level;
				2466
				2467	if (iterator->level == PT32E_ROOT_LEVEL) {
				2468	/*
				2469	* prev_root is currently only used for 64-bit hosts. So only
				2470	* the active root_hpa is valid here.
				2471	*/
				2472	BUG_ON(root != vcpu->arch.mmu.root_hpa);
				2473
				2474	iterator->shadow_addr
				2475	= vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
				2476	iterator->shadow_addr &= PT64_BASE_ADDR_MASK;
				2477	--iterator->level;
				2478	if (!iterator->shadow_addr)
				2479	iterator->level = 0;
				2480	}
				2481	}
				2482
				2483	static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
				2484	struct kvm_vcpu *vcpu, u64 addr)
				2485	{
				2486	shadow_walk_init_using_root(iterator, vcpu, vcpu->arch.mmu.root_hpa,
				2487	addr);
				2488	}
				2489
				2490	static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
				2491	{
				2492	if (iterator->level < PT_PAGE_TABLE_LEVEL)
				2493	return false;
				2494
				2495	iterator->index = SHADOW_PT_INDEX(iterator->addr, iterator->level);
				2496	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
				2497	return true;
				2498	}
				2499
				2500	static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
				2501	u64 spte)
				2502	{
				2503	if (is_last_spte(spte, iterator->level)) {
				2504	iterator->level = 0;
				2505	return;
				2506	}
				2507
				2508	iterator->shadow_addr = spte & PT64_BASE_ADDR_MASK;
				2509	--iterator->level;
				2510	}
				2511
				2512	static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
				2513	{
				2514	__shadow_walk_next(iterator, *iterator->sptep);
				2515	}
				2516
				2517	static void link_shadow_page(struct kvm_vcpu vcpu, u64 sptep,
				2518	struct kvm_mmu_page *sp)
				2519	{
				2520	u64 spte;
				2521
				2522	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
				2523
				2524	spte = __pa(sp->spt) \| shadow_present_mask \| PT_WRITABLE_MASK \|
				2525	shadow_user_mask \| shadow_x_mask \| shadow_me_mask;
				2526
				2527	if (sp_ad_disabled(sp))
				2528	spte \|= shadow_acc_track_value;
				2529	else
				2530	spte \|= shadow_accessed_mask;
				2531
				2532	mmu_spte_set(sptep, spte);
				2533
				2534	mmu_page_add_parent_pte(vcpu, sp, sptep);
				2535
				2536	if (sp->unsync_children \|\| sp->unsync)
				2537	mark_unsync(sptep);
				2538	}
				2539
				2540	static void validate_direct_spte(struct kvm_vcpu vcpu, u64 sptep,
				2541	unsigned direct_access)
				2542	{
				2543	if (is_shadow_present_pte(sptep) && !is_large_pte(sptep)) {
				2544	struct kvm_mmu_page *child;
				2545
				2546	/*
				2547	* For the direct sp, if the guest pte's dirty bit
				2548	* changed form clean to dirty, it will corrupt the
				2549	* sp's access: allow writable in the read-only sp,
				2550	* so we should update the spte at this point to get
				2551	* a new sp with the correct access.
				2552	*/
				2553	child = page_header(*sptep & PT64_BASE_ADDR_MASK);
				2554	if (child->role.access == direct_access)
				2555	return;
				2556
				2557	drop_parent_pte(child, sptep);
				2558	kvm_flush_remote_tlbs(vcpu->kvm);
				2559	}
				2560	}
				2561
				2562	static bool mmu_page_zap_pte(struct kvm kvm, struct kvm_mmu_page sp,
				2563	u64 *spte)
				2564	{
				2565	u64 pte;
				2566	struct kvm_mmu_page *child;
				2567
				2568	pte = *spte;
				2569	if (is_shadow_present_pte(pte)) {
				2570	if (is_last_spte(pte, sp->role.level)) {
				2571	drop_spte(kvm, spte);
				2572	if (is_large_pte(pte))
				2573	--kvm->stat.lpages;
				2574	} else {
				2575	child = page_header(pte & PT64_BASE_ADDR_MASK);
				2576	drop_parent_pte(child, spte);
				2577	}
				2578	return true;
				2579	}
				2580
				2581	if (is_mmio_spte(pte))
				2582	mmu_spte_clear_no_track(spte);
				2583
				2584	return false;
				2585	}
				2586
				2587	static void kvm_mmu_page_unlink_children(struct kvm *kvm,
				2588	struct kvm_mmu_page *sp)
				2589	{
				2590	unsigned i;
				2591
				2592	for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
				2593	mmu_page_zap_pte(kvm, sp, sp->spt + i);
				2594	}
				2595
				2596	static void kvm_mmu_unlink_parents(struct kvm kvm, struct kvm_mmu_page sp)
				2597	{
				2598	u64 *sptep;
				2599	struct rmap_iterator iter;
				2600
				2601	while ((sptep = rmap_get_first(&sp->parent_ptes, &iter)))
				2602	drop_parent_pte(sp, sptep);
				2603	}
				2604
				2605	static int mmu_zap_unsync_children(struct kvm *kvm,
				2606	struct kvm_mmu_page *parent,
				2607	struct list_head *invalid_list)
				2608	{
				2609	int i, zapped = 0;
				2610	struct mmu_page_path parents;
				2611	struct kvm_mmu_pages pages;
				2612
				2613	if (parent->role.level == PT_PAGE_TABLE_LEVEL)
				2614	return 0;
				2615
				2616	while (mmu_unsync_walk(parent, &pages)) {
				2617	struct kvm_mmu_page *sp;
				2618
				2619	for_each_sp(pages, sp, parents, i) {
				2620	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2621	mmu_pages_clear_parents(&parents);
				2622	zapped++;
				2623	}
				2624	}
				2625
				2626	return zapped;
				2627	}
				2628
				2629	static int kvm_mmu_prepare_zap_page(struct kvm kvm, struct kvm_mmu_page sp,
				2630	struct list_head *invalid_list)
				2631	{
				2632	int ret;
				2633
				2634	trace_kvm_mmu_prepare_zap_page(sp);
				2635	++kvm->stat.mmu_shadow_zapped;
				2636	ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
				2637	kvm_mmu_page_unlink_children(kvm, sp);
				2638	kvm_mmu_unlink_parents(kvm, sp);
				2639
				2640	if (!sp->role.invalid && !sp->role.direct)
				2641	unaccount_shadowed(kvm, sp);
				2642
				2643	if (sp->unsync)
				2644	kvm_unlink_unsync_page(kvm, sp);
				2645	if (!sp->root_count) {
				2646	/* Count self */
				2647	ret++;
				2648	list_move(&sp->link, invalid_list);
				2649	kvm_mod_used_mmu_pages(kvm, -1);
				2650	} else {
				2651	list_move(&sp->link, &kvm->arch.active_mmu_pages);
				2652
				2653	/*
				2654	* The obsolete pages can not be used on any vcpus.
				2655	* See the comments in kvm_mmu_invalidate_zap_all_pages().
				2656	*/
				2657	if (!sp->role.invalid && !is_obsolete_sp(kvm, sp))
				2658	kvm_reload_remote_mmus(kvm);
				2659	}
				2660
				2661	sp->role.invalid = 1;
				2662	return ret;
				2663	}
				2664
				2665	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
				2666	struct list_head *invalid_list)
				2667	{
				2668	struct kvm_mmu_page sp, nsp;
				2669
				2670	if (list_empty(invalid_list))
				2671	return;
				2672
				2673	/*
				2674	* We need to make sure everyone sees our modifications to
				2675	* the page tables and see changes to vcpu->mode here. The barrier
				2676	* in the kvm_flush_remote_tlbs() achieves this. This pairs
				2677	* with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
				2678	*
				2679	* In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
				2680	* guest mode and/or lockless shadow page table walks.
				2681	*/
				2682	kvm_flush_remote_tlbs(kvm);
				2683
				2684	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
				2685	WARN_ON(!sp->role.invalid \|\| sp->root_count);
				2686	kvm_mmu_free_page(sp);
				2687	}
				2688	}
				2689
				2690	static bool prepare_zap_oldest_mmu_page(struct kvm *kvm,
				2691	struct list_head *invalid_list)
				2692	{
				2693	struct kvm_mmu_page *sp;
				2694
				2695	if (list_empty(&kvm->arch.active_mmu_pages))
				2696	return false;
				2697
				2698	sp = list_last_entry(&kvm->arch.active_mmu_pages,
				2699	struct kvm_mmu_page, link);
				2700	return kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				2701	}
				2702
				2703	/*
				2704	* Changing the number of mmu pages allocated to the vm
				2705	* Note: if goal_nr_mmu_pages is too small, you will get dead lock
				2706	*/
				2707	void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
				2708	{
				2709	LIST_HEAD(invalid_list);
				2710
				2711	spin_lock(&kvm->mmu_lock);
				2712
				2713	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
				2714	/* Need to free some mmu pages to achieve the goal. */
				2715	while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages)
				2716	if (!prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				2717	break;
				2718
				2719	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2720	goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
				2721	}
				2722
				2723	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
				2724
				2725	spin_unlock(&kvm->mmu_lock);
				2726	}
				2727
				2728	int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
				2729	{
				2730	struct kvm_mmu_page *sp;
				2731	LIST_HEAD(invalid_list);
				2732	int r;
				2733
				2734	pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
				2735	r = 0;
				2736	spin_lock(&kvm->mmu_lock);
				2737	for_each_gfn_indirect_valid_sp(kvm, sp, gfn) {
				2738	pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
				2739	sp->role.word);
				2740	r = 1;
				2741	kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
				2742	}
				2743	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				2744	spin_unlock(&kvm->mmu_lock);
				2745
				2746	return r;
				2747	}
				2748	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page);
				2749
				2750	static void kvm_unsync_page(struct kvm_vcpu vcpu, struct kvm_mmu_page sp)
				2751	{
				2752	trace_kvm_mmu_unsync_page(sp);
				2753	++vcpu->kvm->stat.mmu_unsync;
				2754	sp->unsync = 1;
				2755
				2756	kvm_mmu_mark_parents_unsync(sp);
				2757	}
				2758
				2759	static bool mmu_need_write_protect(struct kvm_vcpu *vcpu, gfn_t gfn,
				2760	bool can_unsync)
				2761	{
				2762	struct kvm_mmu_page *sp;
				2763
				2764	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				2765	return true;
				2766
				2767	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				2768	if (!can_unsync)
				2769	return true;
				2770
				2771	if (sp->unsync)
				2772	continue;
				2773
				2774	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
				2775	kvm_unsync_page(vcpu, sp);
				2776	}
				2777
				2778	/*
				2779	* We need to ensure that the marking of unsync pages is visible
				2780	* before the SPTE is updated to allow writes because
				2781	* kvm_mmu_sync_roots() checks the unsync flags without holding
				2782	* the MMU lock and so can race with this. If the SPTE was updated
				2783	* before the page had been marked as unsync-ed, something like the
				2784	* following could happen:
				2785	*
				2786	* CPU 1 CPU 2
				2787	* ---------------------------------------------------------------------
				2788	* 1.2 Host updates SPTE
				2789	* to be writable
				2790	* 2.1 Guest writes a GPTE for GVA X.
				2791	* (GPTE being in the guest page table shadowed
				2792	* by the SP from CPU 1.)
				2793	* This reads SPTE during the page table walk.
				2794	* Since SPTE.W is read as 1, there is no
				2795	* fault.
				2796	*
				2797	* 2.2 Guest issues TLB flush.
				2798	* That causes a VM Exit.
				2799	*
				2800	* 2.3 kvm_mmu_sync_pages() reads sp->unsync.
				2801	* Since it is false, so it just returns.
				2802	*
				2803	* 2.4 Guest accesses GVA X.
				2804	* Since the mapping in the SP was not updated,
				2805	* so the old mapping for GVA X incorrectly
				2806	* gets used.
				2807	* 1.1 Host marks SP
				2808	* as unsync
				2809	* (sp->unsync = true)
				2810	*
				2811	* The write barrier below ensures that 1.1 happens before 1.2 and thus
				2812	* the situation in 2.4 does not arise. The implicit barrier in 2.2
				2813	* pairs with this write barrier.
				2814	*/
				2815	smp_wmb();
				2816
				2817	return false;
				2818	}
				2819
				2820	static bool kvm_is_mmio_pfn(kvm_pfn_t pfn)
				2821	{
				2822	if (pfn_valid(pfn))
				2823	return !is_zero_pfn(pfn) && PageReserved(pfn_to_page(pfn)) &&
				2824	/*
				2825	* Some reserved pages, such as those from NVDIMM
				2826	* DAX devices, are not for MMIO, and can be mapped
				2827	* with cached memory type for better performance.
				2828	* However, the above check misconceives those pages
				2829	* as MMIO, and results in KVM mapping them with UC
				2830	* memory type, which would hurt the performance.
				2831	* Therefore, we check the host memory type in addition
				2832	* and only treat UC/UC-/WC pages as MMIO.
				2833	*/
				2834	(!pat_enabled() \|\| pat_pfn_immune_to_uc_mtrr(pfn));
				2835
				2836	return true;
				2837	}
				2838
				2839	/* Bits which may be returned by set_spte() */
				2840	#define SET_SPTE_WRITE_PROTECTED_PT BIT(0)
				2841	#define SET_SPTE_NEED_REMOTE_TLB_FLUSH BIT(1)
				2842
				2843	static int set_spte(struct kvm_vcpu vcpu, u64 sptep,
				2844	unsigned pte_access, int level,
				2845	gfn_t gfn, kvm_pfn_t pfn, bool speculative,
				2846	bool can_unsync, bool host_writable)
				2847	{
				2848	u64 spte = 0;
				2849	int ret = 0;
				2850	struct kvm_mmu_page *sp;
				2851
				2852	if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
				2853	return 0;
				2854
				2855	sp = page_header(__pa(sptep));
				2856	if (sp_ad_disabled(sp))
				2857	spte \|= shadow_acc_track_value;
				2858
				2859	/*
				2860	* For the EPT case, shadow_present_mask is 0 if hardware
				2861	* supports exec-only page table entries. In that case,
				2862	* ACC_USER_MASK and shadow_user_mask are used to represent
				2863	* read access. See FNAME(gpte_access) in paging_tmpl.h.
				2864	*/
				2865	spte \|= shadow_present_mask;
				2866	if (!speculative)
				2867	spte \|= spte_shadow_accessed_mask(spte);
				2868
				2869	if (pte_access & ACC_EXEC_MASK)
				2870	spte \|= shadow_x_mask;
				2871	else
				2872	spte \|= shadow_nx_mask;
				2873
				2874	if (pte_access & ACC_USER_MASK)
				2875	spte \|= shadow_user_mask;
				2876
				2877	if (level > PT_PAGE_TABLE_LEVEL)
				2878	spte \|= PT_PAGE_SIZE_MASK;
				2879	if (tdp_enabled)
				2880	spte \|= kvm_x86_ops->get_mt_mask(vcpu, gfn,
				2881	kvm_is_mmio_pfn(pfn));
				2882
				2883	if (host_writable)
				2884	spte \|= SPTE_HOST_WRITEABLE;
				2885	else
				2886	pte_access &= ~ACC_WRITE_MASK;
				2887
				2888	if (!kvm_is_mmio_pfn(pfn))
				2889	spte \|= shadow_me_mask;
				2890
				2891	spte \|= (u64)pfn << PAGE_SHIFT;
				2892
				2893	if (pte_access & ACC_WRITE_MASK) {
				2894
				2895	/*
				2896	* Other vcpu creates new sp in the window between
				2897	* mapping_level() and acquiring mmu-lock. We can
				2898	* allow guest to retry the access, the mapping can
				2899	* be fixed if guest refault.
				2900	*/
				2901	if (level > PT_PAGE_TABLE_LEVEL &&
				2902	mmu_gfn_lpage_is_disallowed(vcpu, gfn, level))
				2903	goto done;
				2904
				2905	spte \|= PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE;
				2906
				2907	/*
				2908	* Optimization: for pte sync, if spte was writable the hash
				2909	* lookup is unnecessary (and expensive). Write protection
				2910	* is responsibility of mmu_get_page / kvm_sync_page.
				2911	* Same reasoning can be applied to dirty page accounting.
				2912	*/
				2913	if (!can_unsync && is_writable_pte(*sptep))
				2914	goto set_pte;
				2915
				2916	if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
				2917	pgprintk("%s: found shadow page for %llx, marking ro\n",
				2918	__func__, gfn);
				2919	ret \|= SET_SPTE_WRITE_PROTECTED_PT;
				2920	pte_access &= ~ACC_WRITE_MASK;
				2921	spte &= ~(PT_WRITABLE_MASK \| SPTE_MMU_WRITEABLE);
				2922	}
				2923	}
				2924
				2925	if (pte_access & ACC_WRITE_MASK) {
				2926	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				2927	spte \|= spte_shadow_dirty_mask(spte);
				2928	}
				2929
				2930	if (speculative)
				2931	spte = mark_spte_for_access_track(spte);
				2932
				2933	set_pte:
				2934	if (mmu_spte_update(sptep, spte))
				2935	ret \|= SET_SPTE_NEED_REMOTE_TLB_FLUSH;
				2936	done:
				2937	return ret;
				2938	}
				2939
				2940	static int mmu_set_spte(struct kvm_vcpu vcpu, u64 sptep, unsigned pte_access,
				2941	int write_fault, int level, gfn_t gfn, kvm_pfn_t pfn,
				2942	bool speculative, bool host_writable)
				2943	{
				2944	int was_rmapped = 0;
				2945	int rmap_count;
				2946	int set_spte_ret;
				2947	int ret = RET_PF_RETRY;
				2948	bool flush = false;
				2949
				2950	pgprintk("%s: spte %llx write_fault %d gfn %llx\n", __func__,
				2951	*sptep, write_fault, gfn);
				2952
				2953	if (is_shadow_present_pte(*sptep)) {
				2954	/*
				2955	* If we overwrite a PTE page pointer with a 2MB PMD, unlink
				2956	* the parent of the now unreachable PTE.
				2957	*/
				2958	if (level > PT_PAGE_TABLE_LEVEL &&
				2959	!is_large_pte(*sptep)) {
				2960	struct kvm_mmu_page *child;
				2961	u64 pte = *sptep;
				2962
				2963	child = page_header(pte & PT64_BASE_ADDR_MASK);
				2964	drop_parent_pte(child, sptep);
				2965	flush = true;
				2966	} else if (pfn != spte_to_pfn(*sptep)) {
				2967	pgprintk("hfn old %llx new %llx\n",
				2968	spte_to_pfn(*sptep), pfn);
				2969	drop_spte(vcpu->kvm, sptep);
				2970	flush = true;
				2971	} else
				2972	was_rmapped = 1;
				2973	}
				2974
				2975	set_spte_ret = set_spte(vcpu, sptep, pte_access, level, gfn, pfn,
				2976	speculative, true, host_writable);
				2977	if (set_spte_ret & SET_SPTE_WRITE_PROTECTED_PT) {
				2978	if (write_fault)
				2979	ret = RET_PF_EMULATE;
				2980	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
				2981	}
				2982	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH \|\| flush)
				2983	kvm_flush_remote_tlbs(vcpu->kvm);
				2984
				2985	if (unlikely(is_mmio_spte(*sptep)))
				2986	ret = RET_PF_EMULATE;
				2987
				2988	pgprintk("%s: setting spte %llx\n", __func__, *sptep);
				2989	pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
				2990	is_large_pte(*sptep)? "2MB" : "4kB",
				2991	*sptep & PT_WRITABLE_MASK ? "RW" : "R", gfn,
				2992	*sptep, sptep);
				2993	if (!was_rmapped && is_large_pte(*sptep))
				2994	++vcpu->kvm->stat.lpages;
				2995
				2996	if (is_shadow_present_pte(*sptep)) {
				2997	if (!was_rmapped) {
				2998	rmap_count = rmap_add(vcpu, sptep, gfn);
				2999	if (rmap_count > RMAP_RECYCLE_THRESHOLD)
				3000	rmap_recycle(vcpu, sptep, gfn);
				3001	}
				3002	}
				3003
				3004	kvm_release_pfn_clean(pfn);
				3005
				3006	return ret;
				3007	}
				3008
				3009	static kvm_pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
				3010	bool no_dirty_log)
				3011	{
				3012	struct kvm_memory_slot *slot;
				3013
				3014	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
				3015	if (!slot)
				3016	return KVM_PFN_ERR_FAULT;
				3017
				3018	return gfn_to_pfn_memslot_atomic(slot, gfn);
				3019	}
				3020
				3021	static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
				3022	struct kvm_mmu_page *sp,
				3023	u64 start, u64 end)
				3024	{
				3025	struct page *pages[PTE_PREFETCH_NUM];
				3026	struct kvm_memory_slot *slot;
				3027	unsigned access = sp->role.access;
				3028	int i, ret;
				3029	gfn_t gfn;
				3030
				3031	gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
				3032	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK);
				3033	if (!slot)
				3034	return -1;
				3035
				3036	ret = gfn_to_page_many_atomic(slot, gfn, pages, end - start);
				3037	if (ret <= 0)
				3038	return -1;
				3039
				3040	for (i = 0; i < ret; i++, gfn++, start++)
				3041	mmu_set_spte(vcpu, start, access, 0, sp->role.level, gfn,
				3042	page_to_pfn(pages[i]), true, true);
				3043
				3044	return 0;
				3045	}
				3046
				3047	static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
				3048	struct kvm_mmu_page sp, u64 sptep)
				3049	{
				3050	u64 spte, start = NULL;
				3051	int i;
				3052
				3053	WARN_ON(!sp->role.direct);
				3054
				3055	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
				3056	spte = sp->spt + i;
				3057
				3058	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
				3059	if (is_shadow_present_pte(*spte) \|\| spte == sptep) {
				3060	if (!start)
				3061	continue;
				3062	if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
				3063	break;
				3064	start = NULL;
				3065	} else if (!start)
				3066	start = spte;
				3067	}
				3068	}
				3069
				3070	static void direct_pte_prefetch(struct kvm_vcpu vcpu, u64 sptep)
				3071	{
				3072	struct kvm_mmu_page *sp;
				3073
				3074	sp = page_header(__pa(sptep));
				3075
				3076	/*
				3077	* Without accessed bits, there's no way to distinguish between
				3078	* actually accessed translations and prefetched, so disable pte
				3079	* prefetch if accessed bits aren't available.
				3080	*/
				3081	if (sp_ad_disabled(sp))
				3082	return;
				3083
				3084	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3085	return;
				3086
				3087	__direct_pte_prefetch(vcpu, sp, sptep);
				3088	}
				3089
				3090	static int __direct_map(struct kvm_vcpu *vcpu, int write, int map_writable,
				3091	int level, gfn_t gfn, kvm_pfn_t pfn, bool prefault)
				3092	{
				3093	struct kvm_shadow_walk_iterator iterator;
				3094	struct kvm_mmu_page *sp;
				3095	int emulate = 0;
				3096	gfn_t pseudo_gfn;
				3097
				3098	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3099	return 0;
				3100
				3101	for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
				3102	if (iterator.level == level) {
				3103	emulate = mmu_set_spte(vcpu, iterator.sptep, ACC_ALL,
				3104	write, level, gfn, pfn, prefault,
				3105	map_writable);
				3106	direct_pte_prefetch(vcpu, iterator.sptep);
				3107	++vcpu->stat.pf_fixed;
				3108	break;
				3109	}
				3110
				3111	drop_large_spte(vcpu, iterator.sptep);
				3112	if (!is_shadow_present_pte(*iterator.sptep)) {
				3113	u64 base_addr = iterator.addr;
				3114
				3115	base_addr &= PT64_LVL_ADDR_MASK(iterator.level);
				3116	pseudo_gfn = base_addr >> PAGE_SHIFT;
				3117	sp = kvm_mmu_get_page(vcpu, pseudo_gfn, iterator.addr,
				3118	iterator.level - 1, 1, ACC_ALL);
				3119
				3120	link_shadow_page(vcpu, iterator.sptep, sp);
				3121	}
				3122	}
				3123	return emulate;
				3124	}
				3125
				3126	static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
				3127	{
				3128	siginfo_t info;
				3129
				3130	clear_siginfo(&info);
				3131	info.si_signo = SIGBUS;
				3132	info.si_errno = 0;
				3133	info.si_code = BUS_MCEERR_AR;
				3134	info.si_addr = (void __user *)address;
				3135	info.si_addr_lsb = PAGE_SHIFT;
				3136
				3137	send_sig_info(SIGBUS, &info, tsk);
				3138	}
				3139
				3140	static int kvm_handle_bad_page(struct kvm_vcpu *vcpu, gfn_t gfn, kvm_pfn_t pfn)
				3141	{
				3142	/*
				3143	* Do not cache the mmio info caused by writing the readonly gfn
				3144	* into the spte otherwise read access on readonly gfn also can
				3145	* caused mmio page fault and treat it as mmio access.
				3146	*/
				3147	if (pfn == KVM_PFN_ERR_RO_FAULT)
				3148	return RET_PF_EMULATE;
				3149
				3150	if (pfn == KVM_PFN_ERR_HWPOISON) {
				3151	kvm_send_hwpoison_signal(kvm_vcpu_gfn_to_hva(vcpu, gfn), current);
				3152	return RET_PF_RETRY;
				3153	}
				3154
				3155	return -EFAULT;
				3156	}
				3157
				3158	static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
				3159	gfn_t gfnp, kvm_pfn_t pfnp,
				3160	int *levelp)
				3161	{
				3162	kvm_pfn_t pfn = *pfnp;
				3163	gfn_t gfn = *gfnp;
				3164	int level = *levelp;
				3165
				3166	/*
				3167	* Check if it's a transparent hugepage. If this would be an
				3168	* hugetlbfs page, level wouldn't be set to
				3169	* PT_PAGE_TABLE_LEVEL and there would be no adjustment done
				3170	* here.
				3171	*/
				3172	if (!is_error_noslot_pfn(pfn) && !kvm_is_reserved_pfn(pfn) &&
				3173	level == PT_PAGE_TABLE_LEVEL &&
				3174	PageTransCompoundMap(pfn_to_page(pfn)) &&
				3175	!mmu_gfn_lpage_is_disallowed(vcpu, gfn, PT_DIRECTORY_LEVEL)) {
				3176	unsigned long mask;
				3177	/*
				3178	* mmu_notifier_retry was successful and we hold the
				3179	* mmu_lock here, so the pmd can't become splitting
				3180	* from under us, and in turn
				3181	* __split_huge_page_refcount() can't run from under
				3182	* us and we can safely transfer the refcount from
				3183	* PG_tail to PG_head as we switch the pfn to tail to
				3184	* head.
				3185	*/
				3186	*levelp = level = PT_DIRECTORY_LEVEL;
				3187	mask = KVM_PAGES_PER_HPAGE(level) - 1;
				3188	VM_BUG_ON((gfn & mask) != (pfn & mask));
				3189	if (pfn & mask) {
				3190	gfn &= ~mask;
				3191	*gfnp = gfn;
				3192	kvm_release_pfn_clean(pfn);
				3193	pfn &= ~mask;
				3194	kvm_get_pfn(pfn);
				3195	*pfnp = pfn;
				3196	}
				3197	}
				3198	}
				3199
				3200	static bool handle_abnormal_pfn(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn,
				3201	kvm_pfn_t pfn, unsigned access, int *ret_val)
				3202	{
				3203	/* The pfn is invalid, report the error! */
				3204	if (unlikely(is_error_pfn(pfn))) {
				3205	*ret_val = kvm_handle_bad_page(vcpu, gfn, pfn);
				3206	return true;
				3207	}
				3208
				3209	if (unlikely(is_noslot_pfn(pfn)))
				3210	vcpu_cache_mmio_info(vcpu, gva, gfn, access);
				3211
				3212	return false;
				3213	}
				3214
				3215	static bool page_fault_can_be_fast(u32 error_code)
				3216	{
				3217	/*
				3218	* Do not fix the mmio spte with invalid generation number which
				3219	* need to be updated by slow page fault path.
				3220	*/
				3221	if (unlikely(error_code & PFERR_RSVD_MASK))
				3222	return false;
				3223
				3224	/* See if the page fault is due to an NX violation */
				3225	if (unlikely(((error_code & (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))
				3226	== (PFERR_FETCH_MASK \| PFERR_PRESENT_MASK))))
				3227	return false;
				3228
				3229	/*
				3230	* #PF can be fast if:
				3231	* 1. The shadow page table entry is not present, which could mean that
				3232	* the fault is potentially caused by access tracking (if enabled).
				3233	* 2. The shadow page table entry is present and the fault
				3234	* is caused by write-protect, that means we just need change the W
				3235	* bit of the spte which can be done out of mmu-lock.
				3236	*
				3237	* However, if access tracking is disabled we know that a non-present
				3238	* page must be a genuine page fault where we have to create a new SPTE.
				3239	* So, if access tracking is disabled, we return true only for write
				3240	* accesses to a present page.
				3241	*/
				3242
				3243	return shadow_acc_track_mask != 0 \|\|
				3244	((error_code & (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK))
				3245	== (PFERR_WRITE_MASK \| PFERR_PRESENT_MASK));
				3246	}
				3247
				3248	/*
				3249	* Returns true if the SPTE was fixed successfully. Otherwise,
				3250	* someone else modified the SPTE from its original value.
				3251	*/
				3252	static bool
				3253	fast_pf_fix_direct_spte(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				3254	u64 *sptep, u64 old_spte, u64 new_spte)
				3255	{
				3256	gfn_t gfn;
				3257
				3258	WARN_ON(!sp->role.direct);
				3259
				3260	/*
				3261	* Theoretically we could also set dirty bit (and flush TLB) here in
				3262	* order to eliminate unnecessary PML logging. See comments in
				3263	* set_spte. But fast_page_fault is very unlikely to happen with PML
				3264	* enabled, so we do not do this. This might result in the same GPA
				3265	* to be logged in PML buffer again when the write really happens, and
				3266	* eventually to be called by mark_page_dirty twice. But it's also no
				3267	* harm. This also avoids the TLB flush needed after setting dirty bit
				3268	* so non-PML cases won't be impacted.
				3269	*
				3270	* Compare with set_spte where instead shadow_dirty_mask is set.
				3271	*/
				3272	if (cmpxchg64(sptep, old_spte, new_spte) != old_spte)
				3273	return false;
				3274
				3275	if (is_writable_pte(new_spte) && !is_writable_pte(old_spte)) {
				3276	/*
				3277	* The gfn of direct spte is stable since it is
				3278	* calculated by sp->gfn.
				3279	*/
				3280	gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
				3281	kvm_vcpu_mark_page_dirty(vcpu, gfn);
				3282	}
				3283
				3284	return true;
				3285	}
				3286
				3287	static bool is_access_allowed(u32 fault_err_code, u64 spte)
				3288	{
				3289	if (fault_err_code & PFERR_FETCH_MASK)
				3290	return is_executable_pte(spte);
				3291
				3292	if (fault_err_code & PFERR_WRITE_MASK)
				3293	return is_writable_pte(spte);
				3294
				3295	/* Fault was on Read access */
				3296	return spte & PT_PRESENT_MASK;
				3297	}
				3298
				3299	/*
				3300	* Return value:
				3301	* - true: let the vcpu to access on the same address again.
				3302	* - false: let the real page fault path to fix it.
				3303	*/
				3304	static bool fast_page_fault(struct kvm_vcpu *vcpu, gva_t gva, int level,
				3305	u32 error_code)
				3306	{
				3307	struct kvm_shadow_walk_iterator iterator;
				3308	struct kvm_mmu_page *sp;
				3309	bool fault_handled = false;
				3310	u64 spte = 0ull;
				3311	uint retry_count = 0;
				3312
				3313	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3314	return false;
				3315
				3316	if (!page_fault_can_be_fast(error_code))
				3317	return false;
				3318
				3319	walk_shadow_page_lockless_begin(vcpu);
				3320
				3321	do {
				3322	u64 new_spte;
				3323
				3324	for_each_shadow_entry_lockless(vcpu, gva, iterator, spte)
				3325	if (!is_shadow_present_pte(spte) \|\|
				3326	iterator.level < level)
				3327	break;
				3328
				3329	sp = page_header(__pa(iterator.sptep));
				3330	if (!is_last_spte(spte, sp->role.level))
				3331	break;
				3332
				3333	/*
				3334	* Check whether the memory access that caused the fault would
				3335	* still cause it if it were to be performed right now. If not,
				3336	* then this is a spurious fault caused by TLB lazily flushed,
				3337	* or some other CPU has already fixed the PTE after the
				3338	* current CPU took the fault.
				3339	*
				3340	* Need not check the access of upper level table entries since
				3341	* they are always ACC_ALL.
				3342	*/
				3343	if (is_access_allowed(error_code, spte)) {
				3344	fault_handled = true;
				3345	break;
				3346	}
				3347
				3348	new_spte = spte;
				3349
				3350	if (is_access_track_spte(spte))
				3351	new_spte = restore_acc_track_spte(new_spte);
				3352
				3353	/*
				3354	* Currently, to simplify the code, write-protection can
				3355	* be removed in the fast path only if the SPTE was
				3356	* write-protected for dirty-logging or access tracking.
				3357	*/
				3358	if ((error_code & PFERR_WRITE_MASK) &&
				3359	spte_can_locklessly_be_made_writable(spte))
				3360	{
				3361	new_spte \|= PT_WRITABLE_MASK;
				3362
				3363	/*
				3364	* Do not fix write-permission on the large spte. Since
				3365	* we only dirty the first page into the dirty-bitmap in
				3366	* fast_pf_fix_direct_spte(), other pages are missed
				3367	* if its slot has dirty logging enabled.
				3368	*
				3369	* Instead, we let the slow page fault path create a
				3370	* normal spte to fix the access.
				3371	*
				3372	* See the comments in kvm_arch_commit_memory_region().
				3373	*/
				3374	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				3375	break;
				3376	}
				3377
				3378	/* Verify that the fault can be handled in the fast path */
				3379	if (new_spte == spte \|\|
				3380	!is_access_allowed(error_code, new_spte))
				3381	break;
				3382
				3383	/*
				3384	* Currently, fast page fault only works for direct mapping
				3385	* since the gfn is not stable for indirect shadow page. See
				3386	* Documentation/virtual/kvm/locking.txt to get more detail.
				3387	*/
				3388	fault_handled = fast_pf_fix_direct_spte(vcpu, sp,
				3389	iterator.sptep, spte,
				3390	new_spte);
				3391	if (fault_handled)
				3392	break;
				3393
				3394	if (++retry_count > 4) {
				3395	printk_once(KERN_WARNING
				3396	"kvm: Fast #PF retrying more than 4 times.\n");
				3397	break;
				3398	}
				3399
				3400	} while (true);
				3401
				3402	trace_fast_page_fault(vcpu, gva, error_code, iterator.sptep,
				3403	spte, fault_handled);
				3404	walk_shadow_page_lockless_end(vcpu);
				3405
				3406	return fault_handled;
				3407	}
				3408
				3409	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				3410	gva_t gva, kvm_pfn_t pfn, bool write, bool writable);
				3411	static int make_mmu_pages_available(struct kvm_vcpu *vcpu);
				3412
				3413	static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, u32 error_code,
				3414	gfn_t gfn, bool prefault)
				3415	{
				3416	int r;
				3417	int level;
				3418	bool force_pt_level = false;
				3419	kvm_pfn_t pfn;
				3420	unsigned long mmu_seq;
				3421	bool map_writable, write = error_code & PFERR_WRITE_MASK;
				3422
				3423	level = mapping_level(vcpu, gfn, &force_pt_level);
				3424	if (likely(!force_pt_level)) {
				3425	/*
				3426	* This path builds a PAE pagetable - so we can map
				3427	* 2mb pages at maximum. Therefore check if the level
				3428	* is larger than that.
				3429	*/
				3430	if (level > PT_DIRECTORY_LEVEL)
				3431	level = PT_DIRECTORY_LEVEL;
				3432
				3433	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				3434	}
				3435
				3436	if (fast_page_fault(vcpu, v, level, error_code))
				3437	return RET_PF_RETRY;
				3438
				3439	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				3440	smp_rmb();
				3441
				3442	if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
				3443	return RET_PF_RETRY;
				3444
				3445	if (handle_abnormal_pfn(vcpu, v, gfn, pfn, ACC_ALL, &r))
				3446	return r;
				3447
				3448	spin_lock(&vcpu->kvm->mmu_lock);
				3449	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				3450	goto out_unlock;
				3451	if (make_mmu_pages_available(vcpu) < 0)
				3452	goto out_unlock;
				3453	if (likely(!force_pt_level))
				3454	transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
				3455	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
				3456	spin_unlock(&vcpu->kvm->mmu_lock);
				3457
				3458	return r;
				3459
				3460	out_unlock:
				3461	spin_unlock(&vcpu->kvm->mmu_lock);
				3462	kvm_release_pfn_clean(pfn);
				3463	return RET_PF_RETRY;
				3464	}
				3465
				3466	static void mmu_free_root_page(struct kvm kvm, hpa_t root_hpa,
				3467	struct list_head *invalid_list)
				3468	{
				3469	struct kvm_mmu_page *sp;
				3470
				3471	if (!VALID_PAGE(*root_hpa))
				3472	return;
				3473
				3474	sp = page_header(*root_hpa & PT64_BASE_ADDR_MASK);
				3475	--sp->root_count;
				3476	if (!sp->root_count && sp->role.invalid)
				3477	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
				3478
				3479	*root_hpa = INVALID_PAGE;
				3480	}
				3481
				3482	/* roots_to_free must be some combination of the KVM_MMU_ROOT_* flags */
				3483	void kvm_mmu_free_roots(struct kvm_vcpu *vcpu, ulong roots_to_free)
				3484	{
				3485	int i;
				3486	LIST_HEAD(invalid_list);
				3487	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				3488	bool free_active_root = roots_to_free & KVM_MMU_ROOT_CURRENT;
				3489
				3490	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
				3491
				3492	/* Before acquiring the MMU lock, see if we need to do any real work. */
				3493	if (!(free_active_root && VALID_PAGE(mmu->root_hpa))) {
				3494	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				3495	if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
				3496	VALID_PAGE(mmu->prev_roots[i].hpa))
				3497	break;
				3498
				3499	if (i == KVM_MMU_NUM_PREV_ROOTS)
				3500	return;
				3501	}
				3502
				3503	spin_lock(&vcpu->kvm->mmu_lock);
				3504
				3505	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				3506	if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
				3507	mmu_free_root_page(vcpu->kvm, &mmu->prev_roots[i].hpa,
				3508	&invalid_list);
				3509
				3510	if (free_active_root) {
				3511	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
				3512	(mmu->root_level >= PT64_ROOT_4LEVEL \|\| mmu->direct_map)) {
				3513	mmu_free_root_page(vcpu->kvm, &mmu->root_hpa,
				3514	&invalid_list);
				3515	} else {
				3516	for (i = 0; i < 4; ++i)
				3517	if (mmu->pae_root[i] != 0)
				3518	mmu_free_root_page(vcpu->kvm,
				3519	&mmu->pae_root[i],
				3520	&invalid_list);
				3521	mmu->root_hpa = INVALID_PAGE;
				3522	}
				3523	}
				3524
				3525	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				3526	spin_unlock(&vcpu->kvm->mmu_lock);
				3527	}
				3528	EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
				3529
				3530	static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
				3531	{
				3532	int ret = 0;
				3533
				3534	if (!kvm_is_visible_gfn(vcpu->kvm, root_gfn)) {
				3535	kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
				3536	ret = 1;
				3537	}
				3538
				3539	return ret;
				3540	}
				3541
				3542	static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
				3543	{
				3544	struct kvm_mmu_page *sp;
				3545	unsigned i;
				3546
				3547	if (vcpu->arch.mmu.shadow_root_level >= PT64_ROOT_4LEVEL) {
				3548	spin_lock(&vcpu->kvm->mmu_lock);
				3549	if(make_mmu_pages_available(vcpu) < 0) {
				3550	spin_unlock(&vcpu->kvm->mmu_lock);
				3551	return -ENOSPC;
				3552	}
				3553	sp = kvm_mmu_get_page(vcpu, 0, 0,
				3554	vcpu->arch.mmu.shadow_root_level, 1, ACC_ALL);
				3555	++sp->root_count;
				3556	spin_unlock(&vcpu->kvm->mmu_lock);
				3557	vcpu->arch.mmu.root_hpa = __pa(sp->spt);
				3558	} else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
				3559	for (i = 0; i < 4; ++i) {
				3560	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3561
				3562	MMU_WARN_ON(VALID_PAGE(root));
				3563	spin_lock(&vcpu->kvm->mmu_lock);
				3564	if (make_mmu_pages_available(vcpu) < 0) {
				3565	spin_unlock(&vcpu->kvm->mmu_lock);
				3566	return -ENOSPC;
				3567	}
				3568	sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
				3569	i << 30, PT32_ROOT_LEVEL, 1, ACC_ALL);
				3570	root = __pa(sp->spt);
				3571	++sp->root_count;
				3572	spin_unlock(&vcpu->kvm->mmu_lock);
				3573	vcpu->arch.mmu.pae_root[i] = root \| PT_PRESENT_MASK;
				3574	}
				3575	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3576	} else
				3577	BUG();
				3578
				3579	return 0;
				3580	}
				3581
				3582	static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
				3583	{
				3584	struct kvm_mmu_page *sp;
				3585	u64 pdptr, pm_mask;
				3586	gfn_t root_gfn;
				3587	int i;
				3588
				3589	root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
				3590
				3591	if (mmu_check_root(vcpu, root_gfn))
				3592	return 1;
				3593
				3594	/*
				3595	* Do we shadow a long mode page table? If so we need to
				3596	* write-protect the guests page table root.
				3597	*/
				3598	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3599	hpa_t root = vcpu->arch.mmu.root_hpa;
				3600
				3601	MMU_WARN_ON(VALID_PAGE(root));
				3602
				3603	spin_lock(&vcpu->kvm->mmu_lock);
				3604	if (make_mmu_pages_available(vcpu) < 0) {
				3605	spin_unlock(&vcpu->kvm->mmu_lock);
				3606	return -ENOSPC;
				3607	}
				3608	sp = kvm_mmu_get_page(vcpu, root_gfn, 0,
				3609	vcpu->arch.mmu.shadow_root_level, 0, ACC_ALL);
				3610	root = __pa(sp->spt);
				3611	++sp->root_count;
				3612	spin_unlock(&vcpu->kvm->mmu_lock);
				3613	vcpu->arch.mmu.root_hpa = root;
				3614	return 0;
				3615	}
				3616
				3617	/*
				3618	* We shadow a 32 bit page table. This may be a legacy 2-level
				3619	* or a PAE 3-level page table. In either case we need to be aware that
				3620	* the shadow page table may be a PAE or a long mode page table.
				3621	*/
				3622	pm_mask = PT_PRESENT_MASK;
				3623	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL)
				3624	pm_mask \|= PT_ACCESSED_MASK \| PT_WRITABLE_MASK \| PT_USER_MASK;
				3625
				3626	for (i = 0; i < 4; ++i) {
				3627	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3628
				3629	MMU_WARN_ON(VALID_PAGE(root));
				3630	if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
				3631	pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
				3632	if (!(pdptr & PT_PRESENT_MASK)) {
				3633	vcpu->arch.mmu.pae_root[i] = 0;
				3634	continue;
				3635	}
				3636	root_gfn = pdptr >> PAGE_SHIFT;
				3637	if (mmu_check_root(vcpu, root_gfn))
				3638	return 1;
				3639	}
				3640	spin_lock(&vcpu->kvm->mmu_lock);
				3641	if (make_mmu_pages_available(vcpu) < 0) {
				3642	spin_unlock(&vcpu->kvm->mmu_lock);
				3643	return -ENOSPC;
				3644	}
				3645	sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, PT32_ROOT_LEVEL,
				3646	0, ACC_ALL);
				3647	root = __pa(sp->spt);
				3648	++sp->root_count;
				3649	spin_unlock(&vcpu->kvm->mmu_lock);
				3650
				3651	vcpu->arch.mmu.pae_root[i] = root \| pm_mask;
				3652	}
				3653	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
				3654
				3655	/*
				3656	* If we shadow a 32 bit page table with a long mode page
				3657	* table we enter this path.
				3658	*/
				3659	if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_4LEVEL) {
				3660	if (vcpu->arch.mmu.lm_root == NULL) {
				3661	/*
				3662	* The additional page necessary for this is only
				3663	* allocated on demand.
				3664	*/
				3665
				3666	u64 *lm_root;
				3667
				3668	lm_root = (void*)get_zeroed_page(GFP_KERNEL);
				3669	if (lm_root == NULL)
				3670	return 1;
				3671
				3672	lm_root[0] = __pa(vcpu->arch.mmu.pae_root) \| pm_mask;
				3673
				3674	vcpu->arch.mmu.lm_root = lm_root;
				3675	}
				3676
				3677	vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
				3678	}
				3679
				3680	return 0;
				3681	}
				3682
				3683	static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
				3684	{
				3685	if (vcpu->arch.mmu.direct_map)
				3686	return mmu_alloc_direct_roots(vcpu);
				3687	else
				3688	return mmu_alloc_shadow_roots(vcpu);
				3689	}
				3690
				3691	void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
				3692	{
				3693	int i;
				3694	struct kvm_mmu_page *sp;
				3695
				3696	if (vcpu->arch.mmu.direct_map)
				3697	return;
				3698
				3699	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3700	return;
				3701
				3702	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
				3703
				3704	if (vcpu->arch.mmu.root_level >= PT64_ROOT_4LEVEL) {
				3705	hpa_t root = vcpu->arch.mmu.root_hpa;
				3706
				3707	sp = page_header(root);
				3708
				3709	/*
				3710	* Even if another CPU was marking the SP as unsync-ed
				3711	* simultaneously, any guest page table changes are not
				3712	* guaranteed to be visible anyway until this VCPU issues a TLB
				3713	* flush strictly after those changes are made. We only need to
				3714	* ensure that the other CPU sets these flags before any actual
				3715	* changes to the page tables are made. The comments in
				3716	* mmu_need_write_protect() describe what could go wrong if this
				3717	* requirement isn't satisfied.
				3718	*/
				3719	if (!smp_load_acquire(&sp->unsync) &&
				3720	!smp_load_acquire(&sp->unsync_children))
				3721	return;
				3722
				3723	spin_lock(&vcpu->kvm->mmu_lock);
				3724	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
				3725
				3726	mmu_sync_children(vcpu, sp);
				3727
				3728	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3729	spin_unlock(&vcpu->kvm->mmu_lock);
				3730	return;
				3731	}
				3732
				3733	spin_lock(&vcpu->kvm->mmu_lock);
				3734	kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
				3735
				3736	for (i = 0; i < 4; ++i) {
				3737	hpa_t root = vcpu->arch.mmu.pae_root[i];
				3738
				3739	if (root && VALID_PAGE(root)) {
				3740	root &= PT64_BASE_ADDR_MASK;
				3741	sp = page_header(root);
				3742	mmu_sync_children(vcpu, sp);
				3743	}
				3744	}
				3745
				3746	kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
				3747	spin_unlock(&vcpu->kvm->mmu_lock);
				3748	}
				3749	EXPORT_SYMBOL_GPL(kvm_mmu_sync_roots);
				3750
				3751	static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
				3752	u32 access, struct x86_exception *exception)
				3753	{
				3754	if (exception)
				3755	exception->error_code = 0;
				3756	return vaddr;
				3757	}
				3758
				3759	static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
				3760	u32 access,
				3761	struct x86_exception *exception)
				3762	{
				3763	if (exception)
				3764	exception->error_code = 0;
				3765	return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access, exception);
				3766	}
				3767
				3768	static bool
				3769	__is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check, u64 pte, int level)
				3770	{
				3771	int bit7 = (pte >> 7) & 1, low6 = pte & 0x3f;
				3772
				3773	return (pte & rsvd_check->rsvd_bits_mask[bit7][level-1]) \|
				3774	((rsvd_check->bad_mt_xwr & (1ull << low6)) != 0);
				3775	}
				3776
				3777	static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
				3778	{
				3779	return __is_rsvd_bits_set(&mmu->guest_rsvd_check, gpte, level);
				3780	}
				3781
				3782	static bool is_shadow_zero_bits_set(struct kvm_mmu *mmu, u64 spte, int level)
				3783	{
				3784	return __is_rsvd_bits_set(&mmu->shadow_zero_check, spte, level);
				3785	}
				3786
				3787	static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3788	{
				3789	/*
				3790	* A nested guest cannot use the MMIO cache if it is using nested
				3791	* page tables, because cr2 is a nGPA while the cache stores GPAs.
				3792	*/
				3793	if (mmu_is_nested(vcpu))
				3794	return false;
				3795
				3796	if (direct)
				3797	return vcpu_match_mmio_gpa(vcpu, addr);
				3798
				3799	return vcpu_match_mmio_gva(vcpu, addr);
				3800	}
				3801
				3802	/* return true if reserved bit is detected on spte. */
				3803	static bool
				3804	walk_shadow_page_get_mmio_spte(struct kvm_vcpu vcpu, u64 addr, u64 sptep)
				3805	{
				3806	struct kvm_shadow_walk_iterator iterator;
				3807	u64 sptes[PT64_ROOT_MAX_LEVEL], spte = 0ull;
				3808	int root, leaf;
				3809	bool reserved = false;
				3810
				3811	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3812	goto exit;
				3813
				3814	walk_shadow_page_lockless_begin(vcpu);
				3815
				3816	for (shadow_walk_init(&iterator, vcpu, addr),
				3817	leaf = root = iterator.level;
				3818	shadow_walk_okay(&iterator);
				3819	__shadow_walk_next(&iterator, spte)) {
				3820	spte = mmu_spte_get_lockless(iterator.sptep);
				3821
				3822	sptes[leaf - 1] = spte;
				3823	leaf--;
				3824
				3825	if (!is_shadow_present_pte(spte))
				3826	break;
				3827
				3828	reserved \|= is_shadow_zero_bits_set(&vcpu->arch.mmu, spte,
				3829	iterator.level);
				3830	}
				3831
				3832	walk_shadow_page_lockless_end(vcpu);
				3833
				3834	if (reserved) {
				3835	pr_err("%s: detect reserved bits on spte, addr 0x%llx, dump hierarchy:\n",
				3836	__func__, addr);
				3837	while (root > leaf) {
				3838	pr_err("------ spte 0x%llx level %d.\n",
				3839	sptes[root - 1], root);
				3840	root--;
				3841	}
				3842	}
				3843	exit:
				3844	*sptep = spte;
				3845	return reserved;
				3846	}
				3847
				3848	static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
				3849	{
				3850	u64 spte;
				3851	bool reserved;
				3852
				3853	if (mmio_info_in_cache(vcpu, addr, direct))
				3854	return RET_PF_EMULATE;
				3855
				3856	reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte);
				3857	if (WARN_ON(reserved))
				3858	return -EINVAL;
				3859
				3860	if (is_mmio_spte(spte)) {
				3861	gfn_t gfn = get_mmio_spte_gfn(spte);
				3862	unsigned access = get_mmio_spte_access(spte);
				3863
				3864	if (!check_mmio_spte(vcpu, spte))
				3865	return RET_PF_INVALID;
				3866
				3867	if (direct)
				3868	addr = 0;
				3869
				3870	trace_handle_mmio_page_fault(addr, gfn, access);
				3871	vcpu_cache_mmio_info(vcpu, addr, gfn, access);
				3872	return RET_PF_EMULATE;
				3873	}
				3874
				3875	/*
				3876	* If the page table is zapped by other cpus, let CPU fault again on
				3877	* the address.
				3878	*/
				3879	return RET_PF_RETRY;
				3880	}
				3881
				3882	static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
				3883	u32 error_code, gfn_t gfn)
				3884	{
				3885	if (unlikely(error_code & PFERR_RSVD_MASK))
				3886	return false;
				3887
				3888	if (!(error_code & PFERR_PRESENT_MASK) \|\|
				3889	!(error_code & PFERR_WRITE_MASK))
				3890	return false;
				3891
				3892	/*
				3893	* guest is writing the page which is write tracked which can
				3894	* not be fixed by page fault handler.
				3895	*/
				3896	if (kvm_page_track_is_active(vcpu, gfn, KVM_PAGE_TRACK_WRITE))
				3897	return true;
				3898
				3899	return false;
				3900	}
				3901
				3902	static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
				3903	{
				3904	struct kvm_shadow_walk_iterator iterator;
				3905	u64 spte;
				3906
				3907	if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
				3908	return;
				3909
				3910	walk_shadow_page_lockless_begin(vcpu);
				3911	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte) {
				3912	clear_sp_write_flooding_count(iterator.sptep);
				3913	if (!is_shadow_present_pte(spte))
				3914	break;
				3915	}
				3916	walk_shadow_page_lockless_end(vcpu);
				3917	}
				3918
				3919	static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
				3920	u32 error_code, bool prefault)
				3921	{
				3922	gfn_t gfn = gva >> PAGE_SHIFT;
				3923	int r;
				3924
				3925	pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code);
				3926
				3927	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				3928	return RET_PF_EMULATE;
				3929
				3930	r = mmu_topup_memory_caches(vcpu);
				3931	if (r)
				3932	return r;
				3933
				3934	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				3935
				3936
				3937	return nonpaging_map(vcpu, gva & PAGE_MASK,
				3938	error_code, gfn, prefault);
				3939	}
				3940
				3941	static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
				3942	{
				3943	struct kvm_arch_async_pf arch;
				3944
				3945	arch.token = (vcpu->arch.apf.id++ << 12) \| vcpu->vcpu_id;
				3946	arch.gfn = gfn;
				3947	arch.direct_map = vcpu->arch.mmu.direct_map;
				3948	arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
				3949
				3950	return kvm_setup_async_pf(vcpu, gva, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch);
				3951	}
				3952
				3953	bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
				3954	{
				3955	if (unlikely(!lapic_in_kernel(vcpu) \|\|
				3956	kvm_event_needs_reinjection(vcpu) \|\|
				3957	vcpu->arch.exception.pending))
				3958	return false;
				3959
				3960	if (!vcpu->arch.apf.delivery_as_pf_vmexit && is_guest_mode(vcpu))
				3961	return false;
				3962
				3963	return kvm_x86_ops->interrupt_allowed(vcpu);
				3964	}
				3965
				3966	static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
				3967	gva_t gva, kvm_pfn_t pfn, bool write, bool writable)
				3968	{
				3969	struct kvm_memory_slot *slot;
				3970	bool async;
				3971
				3972	/*
				3973	* Don't expose private memslots to L2.
				3974	*/
				3975	if (is_guest_mode(vcpu) && !kvm_is_visible_gfn(vcpu->kvm, gfn)) {
				3976	*pfn = KVM_PFN_NOSLOT;
				3977	return false;
				3978	}
				3979
				3980	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
				3981	async = false;
				3982	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, &async, write, writable);
				3983	if (!async)
				3984	return false; /* pfn has correct page already /
				3985
				3986	if (!prefault && kvm_can_do_async_pf(vcpu)) {
				3987	trace_kvm_try_async_get_page(gva, gfn);
				3988	if (kvm_find_async_pf_gfn(vcpu, gfn)) {
				3989	trace_kvm_async_pf_doublefault(gva, gfn);
				3990	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
				3991	return true;
				3992	} else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
				3993	return true;
				3994	}
				3995
				3996	*pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable);
				3997	return false;
				3998	}
				3999
				4000	int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
				4001	u64 fault_address, char *insn, int insn_len)
				4002	{
				4003	int r = 1;
				4004
				4005	vcpu->arch.l1tf_flush_l1d = true;
				4006	switch (vcpu->arch.apf.host_apf_reason) {
				4007	default:
				4008	trace_kvm_page_fault(fault_address, error_code);
				4009
				4010	if (kvm_event_needs_reinjection(vcpu))
				4011	kvm_mmu_unprotect_page_virt(vcpu, fault_address);
				4012	r = kvm_mmu_page_fault(vcpu, fault_address, error_code, insn,
				4013	insn_len);
				4014	break;
				4015	case KVM_PV_REASON_PAGE_NOT_PRESENT:
				4016	vcpu->arch.apf.host_apf_reason = 0;
				4017	local_irq_disable();
				4018	kvm_async_pf_task_wait(fault_address, 0);
				4019	local_irq_enable();
				4020	break;
				4021	case KVM_PV_REASON_PAGE_READY:
				4022	vcpu->arch.apf.host_apf_reason = 0;
				4023	local_irq_disable();
				4024	kvm_async_pf_task_wake(fault_address);
				4025	local_irq_enable();
				4026	break;
				4027	}
				4028	return r;
				4029	}
				4030	EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
				4031
				4032	static bool
				4033	check_hugepage_cache_consistency(struct kvm_vcpu *vcpu, gfn_t gfn, int level)
				4034	{
				4035	int page_num = KVM_PAGES_PER_HPAGE(level);
				4036
				4037	gfn &= ~(page_num - 1);
				4038
				4039	return kvm_mtrr_check_gfn_range_consistency(vcpu, gfn, page_num);
				4040	}
				4041
				4042	static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
				4043	bool prefault)
				4044	{
				4045	kvm_pfn_t pfn;
				4046	int r;
				4047	int level;
				4048	bool force_pt_level;
				4049	gfn_t gfn = gpa >> PAGE_SHIFT;
				4050	unsigned long mmu_seq;
				4051	int write = error_code & PFERR_WRITE_MASK;
				4052	bool map_writable;
				4053
				4054	MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4055
				4056	if (page_fault_handle_page_track(vcpu, error_code, gfn))
				4057	return RET_PF_EMULATE;
				4058
				4059	r = mmu_topup_memory_caches(vcpu);
				4060	if (r)
				4061	return r;
				4062
				4063	force_pt_level = !check_hugepage_cache_consistency(vcpu, gfn,
				4064	PT_DIRECTORY_LEVEL);
				4065	level = mapping_level(vcpu, gfn, &force_pt_level);
				4066	if (likely(!force_pt_level)) {
				4067	if (level > PT_DIRECTORY_LEVEL &&
				4068	!check_hugepage_cache_consistency(vcpu, gfn, level))
				4069	level = PT_DIRECTORY_LEVEL;
				4070	gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
				4071	}
				4072
				4073	if (fast_page_fault(vcpu, gpa, level, error_code))
				4074	return RET_PF_RETRY;
				4075
				4076	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				4077	smp_rmb();
				4078
				4079	if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
				4080	return RET_PF_RETRY;
				4081
				4082	if (handle_abnormal_pfn(vcpu, 0, gfn, pfn, ACC_ALL, &r))
				4083	return r;
				4084
				4085	spin_lock(&vcpu->kvm->mmu_lock);
				4086	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				4087	goto out_unlock;
				4088	if (make_mmu_pages_available(vcpu) < 0)
				4089	goto out_unlock;
				4090	if (likely(!force_pt_level))
				4091	transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
				4092	r = __direct_map(vcpu, write, map_writable, level, gfn, pfn, prefault);
				4093	spin_unlock(&vcpu->kvm->mmu_lock);
				4094
				4095	return r;
				4096
				4097	out_unlock:
				4098	spin_unlock(&vcpu->kvm->mmu_lock);
				4099	kvm_release_pfn_clean(pfn);
				4100	return RET_PF_RETRY;
				4101	}
				4102
				4103	static void nonpaging_init_context(struct kvm_vcpu *vcpu,
				4104	struct kvm_mmu *context)
				4105	{
				4106	context->page_fault = nonpaging_page_fault;
				4107	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4108	context->sync_page = nonpaging_sync_page;
				4109	context->invlpg = nonpaging_invlpg;
				4110	context->update_pte = nonpaging_update_pte;
				4111	context->root_level = 0;
				4112	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4113	context->direct_map = true;
				4114	context->nx = false;
				4115	}
				4116
				4117	/*
				4118	* Find out if a previously cached root matching the new CR3/role is available.
				4119	* The current root is also inserted into the cache.
				4120	* If a matching root was found, it is assigned to kvm_mmu->root_hpa and true is
				4121	* returned.
				4122	* Otherwise, the LRU root from the cache is assigned to kvm_mmu->root_hpa and
				4123	* false is returned. This root should now be freed by the caller.
				4124	*/
				4125	static bool cached_root_available(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4126	union kvm_mmu_page_role new_role)
				4127	{
				4128	uint i;
				4129	struct kvm_mmu_root_info root;
				4130	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				4131
				4132	root.cr3 = mmu->get_cr3(vcpu);
				4133	root.hpa = mmu->root_hpa;
				4134
				4135	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
				4136	swap(root, mmu->prev_roots[i]);
				4137
				4138	if (new_cr3 == root.cr3 && VALID_PAGE(root.hpa) &&
				4139	page_header(root.hpa) != NULL &&
				4140	new_role.word == page_header(root.hpa)->role.word)
				4141	break;
				4142	}
				4143
				4144	mmu->root_hpa = root.hpa;
				4145
				4146	return i < KVM_MMU_NUM_PREV_ROOTS;
				4147	}
				4148
				4149	static bool fast_cr3_switch(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4150	union kvm_mmu_page_role new_role,
				4151	bool skip_tlb_flush)
				4152	{
				4153	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				4154
				4155	/*
				4156	* For now, limit the fast switch to 64-bit hosts+VMs in order to avoid
				4157	* having to deal with PDPTEs. We may add support for 32-bit hosts/VMs
				4158	* later if necessary.
				4159	*/
				4160	if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL &&
				4161	mmu->root_level >= PT64_ROOT_4LEVEL) {
				4162	if (mmu_check_root(vcpu, new_cr3 >> PAGE_SHIFT))
				4163	return false;
				4164
				4165	if (cached_root_available(vcpu, new_cr3, new_role)) {
				4166	/*
				4167	* It is possible that the cached previous root page is
				4168	* obsolete because of a change in the MMU
				4169	* generation number. However, that is accompanied by
				4170	* KVM_REQ_MMU_RELOAD, which will free the root that we
				4171	* have set here and allocate a new one.
				4172	*/
				4173
				4174	kvm_make_request(KVM_REQ_LOAD_CR3, vcpu);
				4175	if (!skip_tlb_flush) {
				4176	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
				4177	kvm_x86_ops->tlb_flush(vcpu, true);
				4178	}
				4179
				4180	/*
				4181	* The last MMIO access's GVA and GPA are cached in the
				4182	* VCPU. When switching to a new CR3, that GVA->GPA
				4183	* mapping may no longer be valid. So clear any cached
				4184	* MMIO info even when we don't need to sync the shadow
				4185	* page tables.
				4186	*/
				4187	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
				4188
				4189	__clear_sp_write_flooding_count(
				4190	page_header(mmu->root_hpa));
				4191
				4192	return true;
				4193	}
				4194	}
				4195
				4196	return false;
				4197	}
				4198
				4199	static void __kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3,
				4200	union kvm_mmu_page_role new_role,
				4201	bool skip_tlb_flush)
				4202	{
				4203	if (!fast_cr3_switch(vcpu, new_cr3, new_role, skip_tlb_flush))
				4204	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOT_CURRENT);
				4205	}
				4206
				4207	void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu, gpa_t new_cr3, bool skip_tlb_flush)
				4208	{
				4209	__kvm_mmu_new_cr3(vcpu, new_cr3, kvm_mmu_calc_root_page_role(vcpu),
				4210	skip_tlb_flush);
				4211	}
				4212	EXPORT_SYMBOL_GPL(kvm_mmu_new_cr3);
				4213
				4214	static unsigned long get_cr3(struct kvm_vcpu *vcpu)
				4215	{
				4216	return kvm_read_cr3(vcpu);
				4217	}
				4218
				4219	static void inject_page_fault(struct kvm_vcpu *vcpu,
				4220	struct x86_exception *fault)
				4221	{
				4222	vcpu->arch.mmu.inject_page_fault(vcpu, fault);
				4223	}
				4224
				4225	static bool sync_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
				4226	unsigned access, int *nr_present)
				4227	{
				4228	if (unlikely(is_mmio_spte(*sptep))) {
				4229	if (gfn != get_mmio_spte_gfn(*sptep)) {
				4230	mmu_spte_clear_no_track(sptep);
				4231	return true;
				4232	}
				4233
				4234	(*nr_present)++;
				4235	mark_mmio_spte(vcpu, sptep, gfn, access);
				4236	return true;
				4237	}
				4238
				4239	return false;
				4240	}
				4241
				4242	static inline bool is_last_gpte(struct kvm_mmu *mmu,
				4243	unsigned level, unsigned gpte)
				4244	{
				4245	/*
				4246	* The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
				4247	* If it is clear, there are no large pages at this level, so clear
				4248	* PT_PAGE_SIZE_MASK in gpte if that is the case.
				4249	*/
				4250	gpte &= level - mmu->last_nonleaf_level;
				4251
				4252	/*
				4253	* PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set
				4254	* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
				4255	* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
				4256	*/
				4257	gpte \|= level - PT_PAGE_TABLE_LEVEL - 1;
				4258
				4259	return gpte & PT_PAGE_SIZE_MASK;
				4260	}
				4261
				4262	#define PTTYPE_EPT 18 /* arbitrary */
				4263	#define PTTYPE PTTYPE_EPT
				4264	#include "paging_tmpl.h"
				4265	#undef PTTYPE
				4266
				4267	#define PTTYPE 64
				4268	#include "paging_tmpl.h"
				4269	#undef PTTYPE
				4270
				4271	#define PTTYPE 32
				4272	#include "paging_tmpl.h"
				4273	#undef PTTYPE
				4274
				4275	static void
				4276	__reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4277	struct rsvd_bits_validate *rsvd_check,
				4278	int maxphyaddr, int level, bool nx, bool gbpages,
				4279	bool pse, bool amd)
				4280	{
				4281	u64 exb_bit_rsvd = 0;
				4282	u64 gbpages_bit_rsvd = 0;
				4283	u64 nonleaf_bit8_rsvd = 0;
				4284
				4285	rsvd_check->bad_mt_xwr = 0;
				4286
				4287	if (!nx)
				4288	exb_bit_rsvd = rsvd_bits(63, 63);
				4289	if (!gbpages)
				4290	gbpages_bit_rsvd = rsvd_bits(7, 7);
				4291
				4292	/*
				4293	* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
				4294	* leaf entries) on AMD CPUs only.
				4295	*/
				4296	if (amd)
				4297	nonleaf_bit8_rsvd = rsvd_bits(8, 8);
				4298
				4299	switch (level) {
				4300	case PT32_ROOT_LEVEL:
				4301	/* no rsvd bits for 2 level 4K page table entries */
				4302	rsvd_check->rsvd_bits_mask[0][1] = 0;
				4303	rsvd_check->rsvd_bits_mask[0][0] = 0;
				4304	rsvd_check->rsvd_bits_mask[1][0] =
				4305	rsvd_check->rsvd_bits_mask[0][0];
				4306
				4307	if (!pse) {
				4308	rsvd_check->rsvd_bits_mask[1][1] = 0;
				4309	break;
				4310	}
				4311
				4312	if (is_cpuid_PSE36())
				4313	/* 36bits PSE 4MB page */
				4314	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
				4315	else
				4316	/* 32 bits PSE 4MB page */
				4317	rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
				4318	break;
				4319	case PT32E_ROOT_LEVEL:
				4320	rsvd_check->rsvd_bits_mask[0][2] =
				4321	rsvd_bits(maxphyaddr, 63) \|
				4322	rsvd_bits(5, 8) \| rsvd_bits(1, 2); /* PDPTE */
				4323	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4324	rsvd_bits(maxphyaddr, 62); /* PDE */
				4325	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4326	rsvd_bits(maxphyaddr, 62); /* PTE */
				4327	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4328	rsvd_bits(maxphyaddr, 62) \|
				4329	rsvd_bits(13, 20); /* large page */
				4330	rsvd_check->rsvd_bits_mask[1][0] =
				4331	rsvd_check->rsvd_bits_mask[0][0];
				4332	break;
				4333	case PT64_ROOT_5LEVEL:
				4334	rsvd_check->rsvd_bits_mask[0][4] = exb_bit_rsvd \|
				4335	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4336	rsvd_bits(maxphyaddr, 51);
				4337	rsvd_check->rsvd_bits_mask[1][4] =
				4338	rsvd_check->rsvd_bits_mask[0][4];
				4339	case PT64_ROOT_4LEVEL:
				4340	rsvd_check->rsvd_bits_mask[0][3] = exb_bit_rsvd \|
				4341	nonleaf_bit8_rsvd \| rsvd_bits(7, 7) \|
				4342	rsvd_bits(maxphyaddr, 51);
				4343	rsvd_check->rsvd_bits_mask[0][2] = exb_bit_rsvd \|
				4344	nonleaf_bit8_rsvd \| gbpages_bit_rsvd \|
				4345	rsvd_bits(maxphyaddr, 51);
				4346	rsvd_check->rsvd_bits_mask[0][1] = exb_bit_rsvd \|
				4347	rsvd_bits(maxphyaddr, 51);
				4348	rsvd_check->rsvd_bits_mask[0][0] = exb_bit_rsvd \|
				4349	rsvd_bits(maxphyaddr, 51);
				4350	rsvd_check->rsvd_bits_mask[1][3] =
				4351	rsvd_check->rsvd_bits_mask[0][3];
				4352	rsvd_check->rsvd_bits_mask[1][2] = exb_bit_rsvd \|
				4353	gbpages_bit_rsvd \| rsvd_bits(maxphyaddr, 51) \|
				4354	rsvd_bits(13, 29);
				4355	rsvd_check->rsvd_bits_mask[1][1] = exb_bit_rsvd \|
				4356	rsvd_bits(maxphyaddr, 51) \|
				4357	rsvd_bits(13, 20); /* large page */
				4358	rsvd_check->rsvd_bits_mask[1][0] =
				4359	rsvd_check->rsvd_bits_mask[0][0];
				4360	break;
				4361	}
				4362	}
				4363
				4364	static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
				4365	struct kvm_mmu *context)
				4366	{
				4367	__reset_rsvds_bits_mask(vcpu, &context->guest_rsvd_check,
				4368	cpuid_maxphyaddr(vcpu), context->root_level,
				4369	context->nx,
				4370	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4371	is_pse(vcpu), guest_cpuid_is_amd(vcpu));
				4372	}
				4373
				4374	static void
				4375	__reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
				4376	int maxphyaddr, bool execonly)
				4377	{
				4378	u64 bad_mt_xwr;
				4379
				4380	rsvd_check->rsvd_bits_mask[0][4] =
				4381	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4382	rsvd_check->rsvd_bits_mask[0][3] =
				4383	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 7);
				4384	rsvd_check->rsvd_bits_mask[0][2] =
				4385	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4386	rsvd_check->rsvd_bits_mask[0][1] =
				4387	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(3, 6);
				4388	rsvd_check->rsvd_bits_mask[0][0] = rsvd_bits(maxphyaddr, 51);
				4389
				4390	/* large page */
				4391	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
				4392	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
				4393	rsvd_check->rsvd_bits_mask[1][2] =
				4394	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 29);
				4395	rsvd_check->rsvd_bits_mask[1][1] =
				4396	rsvd_bits(maxphyaddr, 51) \| rsvd_bits(12, 20);
				4397	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
				4398
				4399	bad_mt_xwr = 0xFFull << (2 * 8); /* bits 3..5 must not be 2 */
				4400	bad_mt_xwr \|= 0xFFull << (3 * 8); /* bits 3..5 must not be 3 */
				4401	bad_mt_xwr \|= 0xFFull << (7 * 8); /* bits 3..5 must not be 7 */
				4402	bad_mt_xwr \|= REPEAT_BYTE(1ull << 2); /* bits 0..2 must not be 010 */
				4403	bad_mt_xwr \|= REPEAT_BYTE(1ull << 6); /* bits 0..2 must not be 110 */
				4404	if (!execonly) {
				4405	/* bits 0..2 must not be 100 unless VMX capabilities allow it */
				4406	bad_mt_xwr \|= REPEAT_BYTE(1ull << 4);
				4407	}
				4408	rsvd_check->bad_mt_xwr = bad_mt_xwr;
				4409	}
				4410
				4411	static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
				4412	struct kvm_mmu *context, bool execonly)
				4413	{
				4414	__reset_rsvds_bits_mask_ept(&context->guest_rsvd_check,
				4415	cpuid_maxphyaddr(vcpu), execonly);
				4416	}
				4417
				4418	/*
				4419	* the page table on host is the shadow page table for the page
				4420	* table in guest or amd nested guest, its mmu features completely
				4421	* follow the features in guest.
				4422	*/
				4423	void
				4424	reset_shadow_zero_bits_mask(struct kvm_vcpu vcpu, struct kvm_mmu context)
				4425	{
				4426	bool uses_nx = context->nx \|\| context->base_role.smep_andnot_wp;
				4427	struct rsvd_bits_validate *shadow_zero_check;
				4428	int i;
				4429
				4430	/*
				4431	* Passing "true" to the last argument is okay; it adds a check
				4432	* on bit 8 of the SPTEs which KVM doesn't use anyway.
				4433	*/
				4434	shadow_zero_check = &context->shadow_zero_check;
				4435	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4436	boot_cpu_data.x86_phys_bits,
				4437	context->shadow_root_level, uses_nx,
				4438	guest_cpuid_has(vcpu, X86_FEATURE_GBPAGES),
				4439	is_pse(vcpu), true);
				4440
				4441	if (!shadow_me_mask)
				4442	return;
				4443
				4444	for (i = context->shadow_root_level; --i >= 0;) {
				4445	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4446	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4447	}
				4448
				4449	}
				4450	EXPORT_SYMBOL_GPL(reset_shadow_zero_bits_mask);
				4451
				4452	static inline bool boot_cpu_is_amd(void)
				4453	{
				4454	WARN_ON_ONCE(!tdp_enabled);
				4455	return shadow_x_mask == 0;
				4456	}
				4457
				4458	/*
				4459	* the direct page table on host, use as much mmu features as
				4460	* possible, however, kvm currently does not do execution-protection.
				4461	*/
				4462	static void
				4463	reset_tdp_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4464	struct kvm_mmu *context)
				4465	{
				4466	struct rsvd_bits_validate *shadow_zero_check;
				4467	int i;
				4468
				4469	shadow_zero_check = &context->shadow_zero_check;
				4470
				4471	if (boot_cpu_is_amd())
				4472	__reset_rsvds_bits_mask(vcpu, shadow_zero_check,
				4473	boot_cpu_data.x86_phys_bits,
				4474	context->shadow_root_level, false,
				4475	boot_cpu_has(X86_FEATURE_GBPAGES),
				4476	true, true);
				4477	else
				4478	__reset_rsvds_bits_mask_ept(shadow_zero_check,
				4479	boot_cpu_data.x86_phys_bits,
				4480	false);
				4481
				4482	if (!shadow_me_mask)
				4483	return;
				4484
				4485	for (i = context->shadow_root_level; --i >= 0;) {
				4486	shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
				4487	shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
				4488	}
				4489	}
				4490
				4491	/*
				4492	* as the comments in reset_shadow_zero_bits_mask() except it
				4493	* is the shadow page table for intel nested guest.
				4494	*/
				4495	static void
				4496	reset_ept_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
				4497	struct kvm_mmu *context, bool execonly)
				4498	{
				4499	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
				4500	boot_cpu_data.x86_phys_bits, execonly);
				4501	}
				4502
				4503	#define BYTE_MASK(access) \
				4504	((1 & (access) ? 2 : 0) \| \
				4505	(2 & (access) ? 4 : 0) \| \
				4506	(3 & (access) ? 8 : 0) \| \
				4507	(4 & (access) ? 16 : 0) \| \
				4508	(5 & (access) ? 32 : 0) \| \
				4509	(6 & (access) ? 64 : 0) \| \
				4510	(7 & (access) ? 128 : 0))
				4511
				4512
				4513	static void update_permission_bitmask(struct kvm_vcpu *vcpu,
				4514	struct kvm_mmu *mmu, bool ept)
				4515	{
				4516	unsigned byte;
				4517
				4518	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
				4519	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
				4520	const u8 u = BYTE_MASK(ACC_USER_MASK);
				4521
				4522	bool cr4_smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP) != 0;
				4523	bool cr4_smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP) != 0;
				4524	bool cr0_wp = is_write_protection(vcpu);
				4525
				4526	for (byte = 0; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
				4527	unsigned pfec = byte << 1;
				4528
				4529	/*
				4530	* Each "*f" variable has a 1 bit for each UWX value
				4531	* that causes a fault with the given PFEC.
				4532	*/
				4533
				4534	/* Faults from writes to non-writable pages */
				4535	u8 wf = (pfec & PFERR_WRITE_MASK) ? ~w : 0;
				4536	/* Faults from user mode accesses to supervisor pages */
				4537	u8 uf = (pfec & PFERR_USER_MASK) ? ~u : 0;
				4538	/* Faults from fetches of non-executable pages*/
				4539	u8 ff = (pfec & PFERR_FETCH_MASK) ? ~x : 0;
				4540	/* Faults from kernel mode fetches of user pages */
				4541	u8 smepf = 0;
				4542	/* Faults from kernel mode accesses of user pages */
				4543	u8 smapf = 0;
				4544
				4545	if (!ept) {
				4546	/* Faults from kernel mode accesses to user pages */
				4547	u8 kf = (pfec & PFERR_USER_MASK) ? 0 : u;
				4548
				4549	/* Not really needed: !nx will cause pte.nx to fault */
				4550	if (!mmu->nx)
				4551	ff = 0;
				4552
				4553	/* Allow supervisor writes if !cr0.wp */
				4554	if (!cr0_wp)
				4555	wf = (pfec & PFERR_USER_MASK) ? wf : 0;
				4556
				4557	/* Disallow supervisor fetches of user code if cr4.smep */
				4558	if (cr4_smep)
				4559	smepf = (pfec & PFERR_FETCH_MASK) ? kf : 0;
				4560
				4561	/*
				4562	* SMAP:kernel-mode data accesses from user-mode
				4563	* mappings should fault. A fault is considered
				4564	* as a SMAP violation if all of the following
				4565	* conditions are ture:
				4566	* - X86_CR4_SMAP is set in CR4
				4567	* - A user page is accessed
				4568	* - The access is not a fetch
				4569	* - Page fault in kernel mode
				4570	* - if CPL = 3 or X86_EFLAGS_AC is clear
				4571	*
				4572	* Here, we cover the first three conditions.
				4573	* The fourth is computed dynamically in permission_fault();
				4574	* PFERR_RSVD_MASK bit will be set in PFEC if the access is
				4575	* not subject to SMAP restrictions.
				4576	*/
				4577	if (cr4_smap)
				4578	smapf = (pfec & (PFERR_RSVD_MASK\|PFERR_FETCH_MASK)) ? 0 : kf;
				4579	}
				4580
				4581	mmu->permissions[byte] = ff \| uf \| wf \| smepf \| smapf;
				4582	}
				4583	}
				4584
				4585	/*
				4586	* PKU is an additional mechanism by which the paging controls access to
				4587	* user-mode addresses based on the value in the PKRU register. Protection
				4588	* key violations are reported through a bit in the page fault error code.
				4589	* Unlike other bits of the error code, the PK bit is not known at the
				4590	* call site of e.g. gva_to_gpa; it must be computed directly in
				4591	* permission_fault based on two bits of PKRU, on some machine state (CR4,
				4592	* CR0, EFER, CPL), and on other bits of the error code and the page tables.
				4593	*
				4594	* In particular the following conditions come from the error code, the
				4595	* page tables and the machine state:
				4596	* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
				4597	* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
				4598	* - PK is always zero if U=0 in the page tables
				4599	* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
				4600	*
				4601	* The PKRU bitmask caches the result of these four conditions. The error
				4602	* code (minus the P bit) and the page table's U bit form an index into the
				4603	* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
				4604	* with the two bits of the PKRU register corresponding to the protection key.
				4605	* For the first three conditions above the bits will be 00, thus masking
				4606	* away both AD and WD. For all reads or if the last condition holds, WD
				4607	* only will be masked away.
				4608	*/
				4609	static void update_pkru_bitmask(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				4610	bool ept)
				4611	{
				4612	unsigned bit;
				4613	bool wp;
				4614
				4615	if (ept) {
				4616	mmu->pkru_mask = 0;
				4617	return;
				4618	}
				4619
				4620	/* PKEY is enabled only if CR4.PKE and EFER.LMA are both set. */
				4621	if (!kvm_read_cr4_bits(vcpu, X86_CR4_PKE) \|\| !is_long_mode(vcpu)) {
				4622	mmu->pkru_mask = 0;
				4623	return;
				4624	}
				4625
				4626	wp = is_write_protection(vcpu);
				4627
				4628	for (bit = 0; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
				4629	unsigned pfec, pkey_bits;
				4630	bool check_pkey, check_write, ff, uf, wf, pte_user;
				4631
				4632	pfec = bit << 1;
				4633	ff = pfec & PFERR_FETCH_MASK;
				4634	uf = pfec & PFERR_USER_MASK;
				4635	wf = pfec & PFERR_WRITE_MASK;
				4636
				4637	/* PFEC.RSVD is replaced by ACC_USER_MASK. */
				4638	pte_user = pfec & PFERR_RSVD_MASK;
				4639
				4640	/*
				4641	* Only need to check the access which is not an
				4642	* instruction fetch and is to a user page.
				4643	*/
				4644	check_pkey = (!ff && pte_user);
				4645	/*
				4646	* write access is controlled by PKRU if it is a
				4647	* user access or CR0.WP = 1.
				4648	*/
				4649	check_write = check_pkey && wf && (uf \|\| wp);
				4650
				4651	/* PKRU.AD stops both read and write access. */
				4652	pkey_bits = !!check_pkey;
				4653	/* PKRU.WD stops write access. */
				4654	pkey_bits \|= (!!check_write) << 1;
				4655
				4656	mmu->pkru_mask \|= (pkey_bits & 3) << pfec;
				4657	}
				4658	}
				4659
				4660	static void update_last_nonleaf_level(struct kvm_vcpu vcpu, struct kvm_mmu mmu)
				4661	{
				4662	unsigned root_level = mmu->root_level;
				4663
				4664	mmu->last_nonleaf_level = root_level;
				4665	if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
				4666	mmu->last_nonleaf_level++;
				4667	}
				4668
				4669	static void paging64_init_context_common(struct kvm_vcpu *vcpu,
				4670	struct kvm_mmu *context,
				4671	int level)
				4672	{
				4673	context->nx = is_nx(vcpu);
				4674	context->root_level = level;
				4675
				4676	reset_rsvds_bits_mask(vcpu, context);
				4677	update_permission_bitmask(vcpu, context, false);
				4678	update_pkru_bitmask(vcpu, context, false);
				4679	update_last_nonleaf_level(vcpu, context);
				4680
				4681	MMU_WARN_ON(!is_pae(vcpu));
				4682	context->page_fault = paging64_page_fault;
				4683	context->gva_to_gpa = paging64_gva_to_gpa;
				4684	context->sync_page = paging64_sync_page;
				4685	context->invlpg = paging64_invlpg;
				4686	context->update_pte = paging64_update_pte;
				4687	context->shadow_root_level = level;
				4688	context->direct_map = false;
				4689	}
				4690
				4691	static void paging64_init_context(struct kvm_vcpu *vcpu,
				4692	struct kvm_mmu *context)
				4693	{
				4694	int root_level = is_la57_mode(vcpu) ?
				4695	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4696
				4697	paging64_init_context_common(vcpu, context, root_level);
				4698	}
				4699
				4700	static void paging32_init_context(struct kvm_vcpu *vcpu,
				4701	struct kvm_mmu *context)
				4702	{
				4703	context->nx = false;
				4704	context->root_level = PT32_ROOT_LEVEL;
				4705
				4706	reset_rsvds_bits_mask(vcpu, context);
				4707	update_permission_bitmask(vcpu, context, false);
				4708	update_pkru_bitmask(vcpu, context, false);
				4709	update_last_nonleaf_level(vcpu, context);
				4710
				4711	context->page_fault = paging32_page_fault;
				4712	context->gva_to_gpa = paging32_gva_to_gpa;
				4713	context->sync_page = paging32_sync_page;
				4714	context->invlpg = paging32_invlpg;
				4715	context->update_pte = paging32_update_pte;
				4716	context->shadow_root_level = PT32E_ROOT_LEVEL;
				4717	context->direct_map = false;
				4718	}
				4719
				4720	static void paging32E_init_context(struct kvm_vcpu *vcpu,
				4721	struct kvm_mmu *context)
				4722	{
				4723	paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
				4724	}
				4725
				4726	static union kvm_mmu_page_role
				4727	kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu)
				4728	{
				4729	union kvm_mmu_page_role role = {0};
				4730
				4731	role.guest_mode = is_guest_mode(vcpu);
				4732	role.smm = is_smm(vcpu);
				4733	role.ad_disabled = (shadow_accessed_mask == 0);
				4734	role.level = kvm_x86_ops->get_tdp_level(vcpu);
				4735	role.direct = true;
				4736	role.access = ACC_ALL;
				4737
				4738	return role;
				4739	}
				4740
				4741	static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
				4742	{
				4743	struct kvm_mmu *context = &vcpu->arch.mmu;
				4744
				4745	context->base_role.word = mmu_base_role_mask.word &
				4746	kvm_calc_tdp_mmu_root_page_role(vcpu).word;
				4747	context->page_fault = tdp_page_fault;
				4748	context->sync_page = nonpaging_sync_page;
				4749	context->invlpg = nonpaging_invlpg;
				4750	context->update_pte = nonpaging_update_pte;
				4751	context->shadow_root_level = kvm_x86_ops->get_tdp_level(vcpu);
				4752	context->direct_map = true;
				4753	context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
				4754	context->get_cr3 = get_cr3;
				4755	context->get_pdptr = kvm_pdptr_read;
				4756	context->inject_page_fault = kvm_inject_page_fault;
				4757
				4758	if (!is_paging(vcpu)) {
				4759	context->nx = false;
				4760	context->gva_to_gpa = nonpaging_gva_to_gpa;
				4761	context->root_level = 0;
				4762	} else if (is_long_mode(vcpu)) {
				4763	context->nx = is_nx(vcpu);
				4764	context->root_level = is_la57_mode(vcpu) ?
				4765	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4766	reset_rsvds_bits_mask(vcpu, context);
				4767	context->gva_to_gpa = paging64_gva_to_gpa;
				4768	} else if (is_pae(vcpu)) {
				4769	context->nx = is_nx(vcpu);
				4770	context->root_level = PT32E_ROOT_LEVEL;
				4771	reset_rsvds_bits_mask(vcpu, context);
				4772	context->gva_to_gpa = paging64_gva_to_gpa;
				4773	} else {
				4774	context->nx = false;
				4775	context->root_level = PT32_ROOT_LEVEL;
				4776	reset_rsvds_bits_mask(vcpu, context);
				4777	context->gva_to_gpa = paging32_gva_to_gpa;
				4778	}
				4779
				4780	update_permission_bitmask(vcpu, context, false);
				4781	update_pkru_bitmask(vcpu, context, false);
				4782	update_last_nonleaf_level(vcpu, context);
				4783	reset_tdp_shadow_zero_bits_mask(vcpu, context);
				4784	}
				4785
				4786	static union kvm_mmu_page_role
				4787	kvm_calc_shadow_mmu_root_page_role(struct kvm_vcpu *vcpu)
				4788	{
				4789	union kvm_mmu_page_role role = {0};
				4790	bool smep = kvm_read_cr4_bits(vcpu, X86_CR4_SMEP);
				4791	bool smap = kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
				4792
				4793	role.nxe = is_nx(vcpu);
				4794	role.cr4_pae = !!is_pae(vcpu);
				4795	role.cr0_wp = is_write_protection(vcpu);
				4796	role.smep_andnot_wp = smep && !is_write_protection(vcpu);
				4797	role.smap_andnot_wp = smap && !is_write_protection(vcpu);
				4798	role.guest_mode = is_guest_mode(vcpu);
				4799	role.smm = is_smm(vcpu);
				4800	role.direct = !is_paging(vcpu);
				4801	role.access = ACC_ALL;
				4802
				4803	if (!is_long_mode(vcpu))
				4804	role.level = PT32E_ROOT_LEVEL;
				4805	else if (is_la57_mode(vcpu))
				4806	role.level = PT64_ROOT_5LEVEL;
				4807	else
				4808	role.level = PT64_ROOT_4LEVEL;
				4809
				4810	return role;
				4811	}
				4812
				4813	void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu)
				4814	{
				4815	struct kvm_mmu *context = &vcpu->arch.mmu;
				4816
				4817	if (!is_paging(vcpu))
				4818	nonpaging_init_context(vcpu, context);
				4819	else if (is_long_mode(vcpu))
				4820	paging64_init_context(vcpu, context);
				4821	else if (is_pae(vcpu))
				4822	paging32E_init_context(vcpu, context);
				4823	else
				4824	paging32_init_context(vcpu, context);
				4825
				4826	context->base_role.word = mmu_base_role_mask.word &
				4827	kvm_calc_shadow_mmu_root_page_role(vcpu).word;
				4828	reset_shadow_zero_bits_mask(vcpu, context);
				4829	}
				4830	EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
				4831
				4832	static union kvm_mmu_page_role
				4833	kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty)
				4834	{
				4835	union kvm_mmu_page_role role = vcpu->arch.mmu.base_role;
				4836
				4837	role.level = PT64_ROOT_4LEVEL;
				4838	role.direct = false;
				4839	role.ad_disabled = !accessed_dirty;
				4840	role.guest_mode = true;
				4841	role.access = ACC_ALL;
				4842
				4843	return role;
				4844	}
				4845
				4846	void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
				4847	bool accessed_dirty, gpa_t new_eptp)
				4848	{
				4849	struct kvm_mmu *context = &vcpu->arch.mmu;
				4850	union kvm_mmu_page_role root_page_role =
				4851	kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty);
				4852
				4853	__kvm_mmu_new_cr3(vcpu, new_eptp, root_page_role, false);
				4854	context->shadow_root_level = PT64_ROOT_4LEVEL;
				4855
				4856	context->nx = true;
				4857	context->ept_ad = accessed_dirty;
				4858	context->page_fault = ept_page_fault;
				4859	context->gva_to_gpa = ept_gva_to_gpa;
				4860	context->sync_page = ept_sync_page;
				4861	context->invlpg = ept_invlpg;
				4862	context->update_pte = ept_update_pte;
				4863	context->root_level = PT64_ROOT_4LEVEL;
				4864	context->direct_map = false;
				4865	context->base_role.word = root_page_role.word & mmu_base_role_mask.word;
				4866	update_permission_bitmask(vcpu, context, true);
				4867	update_pkru_bitmask(vcpu, context, true);
				4868	update_last_nonleaf_level(vcpu, context);
				4869	reset_rsvds_bits_mask_ept(vcpu, context, execonly);
				4870	reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
				4871	}
				4872	EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
				4873
				4874	static void init_kvm_softmmu(struct kvm_vcpu *vcpu)
				4875	{
				4876	struct kvm_mmu *context = &vcpu->arch.mmu;
				4877
				4878	kvm_init_shadow_mmu(vcpu);
				4879	context->set_cr3 = kvm_x86_ops->set_cr3;
				4880	context->get_cr3 = get_cr3;
				4881	context->get_pdptr = kvm_pdptr_read;
				4882	context->inject_page_fault = kvm_inject_page_fault;
				4883	}
				4884
				4885	static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
				4886	{
				4887	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
				4888
				4889	g_context->get_cr3 = get_cr3;
				4890	g_context->get_pdptr = kvm_pdptr_read;
				4891	g_context->inject_page_fault = kvm_inject_page_fault;
				4892
				4893	/*
				4894	* Note that arch.mmu.gva_to_gpa translates l2_gpa to l1_gpa using
				4895	* L1's nested page tables (e.g. EPT12). The nested translation
				4896	* of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
				4897	* L2's page tables as the first level of translation and L1's
				4898	* nested page tables as the second level of translation. Basically
				4899	* the gva_to_gpa functions between mmu and nested_mmu are swapped.
				4900	*/
				4901	if (!is_paging(vcpu)) {
				4902	g_context->nx = false;
				4903	g_context->root_level = 0;
				4904	g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
				4905	} else if (is_long_mode(vcpu)) {
				4906	g_context->nx = is_nx(vcpu);
				4907	g_context->root_level = is_la57_mode(vcpu) ?
				4908	PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
				4909	reset_rsvds_bits_mask(vcpu, g_context);
				4910	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				4911	} else if (is_pae(vcpu)) {
				4912	g_context->nx = is_nx(vcpu);
				4913	g_context->root_level = PT32E_ROOT_LEVEL;
				4914	reset_rsvds_bits_mask(vcpu, g_context);
				4915	g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
				4916	} else {
				4917	g_context->nx = false;
				4918	g_context->root_level = PT32_ROOT_LEVEL;
				4919	reset_rsvds_bits_mask(vcpu, g_context);
				4920	g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
				4921	}
				4922
				4923	update_permission_bitmask(vcpu, g_context, false);
				4924	update_pkru_bitmask(vcpu, g_context, false);
				4925	update_last_nonleaf_level(vcpu, g_context);
				4926	}
				4927
				4928	void kvm_init_mmu(struct kvm_vcpu *vcpu, bool reset_roots)
				4929	{
				4930	if (reset_roots) {
				4931	uint i;
				4932
				4933	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				4934
				4935	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				4936	vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
				4937	}
				4938
				4939	if (mmu_is_nested(vcpu))
				4940	init_kvm_nested_mmu(vcpu);
				4941	else if (tdp_enabled)
				4942	init_kvm_tdp_mmu(vcpu);
				4943	else
				4944	init_kvm_softmmu(vcpu);
				4945	}
				4946	EXPORT_SYMBOL_GPL(kvm_init_mmu);
				4947
				4948	static union kvm_mmu_page_role
				4949	kvm_mmu_calc_root_page_role(struct kvm_vcpu *vcpu)
				4950	{
				4951	if (tdp_enabled)
				4952	return kvm_calc_tdp_mmu_root_page_role(vcpu);
				4953	else
				4954	return kvm_calc_shadow_mmu_root_page_role(vcpu);
				4955	}
				4956
				4957	void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
				4958	{
				4959	kvm_mmu_unload(vcpu);
				4960	kvm_init_mmu(vcpu, true);
				4961	}
				4962	EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
				4963
				4964	int kvm_mmu_load(struct kvm_vcpu *vcpu)
				4965	{
				4966	int r;
				4967
				4968	r = mmu_topup_memory_caches(vcpu);
				4969	if (r)
				4970	goto out;
				4971	r = mmu_alloc_roots(vcpu);
				4972	kvm_mmu_sync_roots(vcpu);
				4973	if (r)
				4974	goto out;
				4975	kvm_mmu_load_cr3(vcpu);
				4976	kvm_x86_ops->tlb_flush(vcpu, true);
				4977	out:
				4978	return r;
				4979	}
				4980	EXPORT_SYMBOL_GPL(kvm_mmu_load);
				4981
				4982	void kvm_mmu_unload(struct kvm_vcpu *vcpu)
				4983	{
				4984	kvm_mmu_free_roots(vcpu, KVM_MMU_ROOTS_ALL);
				4985	WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				4986	}
				4987	EXPORT_SYMBOL_GPL(kvm_mmu_unload);
				4988
				4989	static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
				4990	struct kvm_mmu_page sp, u64 spte,
				4991	const void *new)
				4992	{
				4993	if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
				4994	++vcpu->kvm->stat.mmu_pde_zapped;
				4995	return;
				4996	}
				4997
				4998	++vcpu->kvm->stat.mmu_pte_updated;
				4999	vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
				5000	}
				5001
				5002	static bool need_remote_flush(u64 old, u64 new)
				5003	{
				5004	if (!is_shadow_present_pte(old))
				5005	return false;
				5006	if (!is_shadow_present_pte(new))
				5007	return true;
				5008	if ((old ^ new) & PT64_BASE_ADDR_MASK)
				5009	return true;
				5010	old ^= shadow_nx_mask;
				5011	new ^= shadow_nx_mask;
				5012	return (old & ~new & PT64_PERM_MASK) != 0;
				5013	}
				5014
				5015	static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu vcpu, gpa_t gpa,
				5016	int *bytes)
				5017	{
				5018	u64 gentry = 0;
				5019	int r;
				5020
				5021	/*
				5022	* Assume that the pte write on a page table of the same type
				5023	* as the current vcpu paging mode since we update the sptes only
				5024	* when they have the same mode.
				5025	*/
				5026	if (is_pae(vcpu) && *bytes == 4) {
				5027	/* Handle a 32-bit guest writing two halves of a 64-bit gpte */
				5028	*gpa &= ~(gpa_t)7;
				5029	*bytes = 8;
				5030	}
				5031
				5032	if (bytes == 4 \|\| bytes == 8) {
				5033	r = kvm_vcpu_read_guest_atomic(vcpu, gpa, &gentry, bytes);
				5034	if (r)
				5035	gentry = 0;
				5036	}
				5037
				5038	return gentry;
				5039	}
				5040
				5041	/*
				5042	* If we're seeing too many writes to a page, it may no longer be a page table,
				5043	* or we may be forking, in which case it is better to unmap the page.
				5044	*/
				5045	static bool detect_write_flooding(struct kvm_mmu_page *sp)
				5046	{
				5047	/*
				5048	* Skip write-flooding detected for the sp whose level is 1, because
				5049	* it can become unsync, then the guest page is not write-protected.
				5050	*/
				5051	if (sp->role.level == PT_PAGE_TABLE_LEVEL)
				5052	return false;
				5053
				5054	atomic_inc(&sp->write_flooding_count);
				5055	return atomic_read(&sp->write_flooding_count) >= 3;
				5056	}
				5057
				5058	/*
				5059	* Misaligned accesses are too much trouble to fix up; also, they usually
				5060	* indicate a page is not used as a page table.
				5061	*/
				5062	static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
				5063	int bytes)
				5064	{
				5065	unsigned offset, pte_size, misaligned;
				5066
				5067	pgprintk("misaligned: gpa %llx bytes %d role %x\n",
				5068	gpa, bytes, sp->role.word);
				5069
				5070	offset = offset_in_page(gpa);
				5071	pte_size = sp->role.cr4_pae ? 8 : 4;
				5072
				5073	/*
				5074	* Sometimes, the OS only writes the last one bytes to update status
				5075	* bits, for example, in linux, andb instruction is used in clear_bit().
				5076	*/
				5077	if (!(offset & (pte_size - 1)) && bytes == 1)
				5078	return false;
				5079
				5080	misaligned = (offset ^ (offset + bytes - 1)) & ~(pte_size - 1);
				5081	misaligned \|= bytes < 4;
				5082
				5083	return misaligned;
				5084	}
				5085
				5086	static u64 get_written_sptes(struct kvm_mmu_page sp, gpa_t gpa, int *nspte)
				5087	{
				5088	unsigned page_offset, quadrant;
				5089	u64 *spte;
				5090	int level;
				5091
				5092	page_offset = offset_in_page(gpa);
				5093	level = sp->role.level;
				5094	*nspte = 1;
				5095	if (!sp->role.cr4_pae) {
				5096	page_offset <<= 1; /* 32->64 */
				5097	/*
				5098	* A 32-bit pde maps 4MB while the shadow pdes map
				5099	* only 2MB. So we need to double the offset again
				5100	* and zap two pdes instead of one.
				5101	*/
				5102	if (level == PT32_ROOT_LEVEL) {
				5103	page_offset &= ~7; /* kill rounding error */
				5104	page_offset <<= 1;
				5105	*nspte = 2;
				5106	}
				5107	quadrant = page_offset >> PAGE_SHIFT;
				5108	page_offset &= ~PAGE_MASK;
				5109	if (quadrant != sp->role.quadrant)
				5110	return NULL;
				5111	}
				5112
				5113	spte = &sp->spt[page_offset / sizeof(*spte)];
				5114	return spte;
				5115	}
				5116
				5117	static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
				5118	const u8 *new, int bytes,
				5119	struct kvm_page_track_notifier_node *node)
				5120	{
				5121	gfn_t gfn = gpa >> PAGE_SHIFT;
				5122	struct kvm_mmu_page *sp;
				5123	LIST_HEAD(invalid_list);
				5124	u64 entry, gentry, *spte;
				5125	int npte;
				5126	bool remote_flush, local_flush;
				5127
				5128	/*
				5129	* If we don't have indirect shadow pages, it means no page is
				5130	* write-protected, so we can exit simply.
				5131	*/
				5132	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
				5133	return;
				5134
				5135	remote_flush = local_flush = false;
				5136
				5137	pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
				5138
				5139	/*
				5140	* No need to care whether allocation memory is successful
				5141	* or not since pte prefetch is skiped if it does not have
				5142	* enough objects in the cache.
				5143	*/
				5144	mmu_topup_memory_caches(vcpu);
				5145
				5146	spin_lock(&vcpu->kvm->mmu_lock);
				5147
				5148	gentry = mmu_pte_write_fetch_gpte(vcpu, &gpa, &bytes);
				5149
				5150	++vcpu->kvm->stat.mmu_pte_write;
				5151	kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
				5152
				5153	for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn) {
				5154	if (detect_write_misaligned(sp, gpa, bytes) \|\|
				5155	detect_write_flooding(sp)) {
				5156	kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
				5157	++vcpu->kvm->stat.mmu_flooded;
				5158	continue;
				5159	}
				5160
				5161	spte = get_written_sptes(sp, gpa, &npte);
				5162	if (!spte)
				5163	continue;
				5164
				5165	local_flush = true;
				5166	while (npte--) {
				5167	entry = *spte;
				5168	mmu_page_zap_pte(vcpu->kvm, sp, spte);
				5169	if (gentry &&
				5170	!((sp->role.word ^ vcpu->arch.mmu.base_role.word)
				5171	& mmu_base_role_mask.word) && rmap_can_add(vcpu))
				5172	mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
				5173	if (need_remote_flush(entry, *spte))
				5174	remote_flush = true;
				5175	++spte;
				5176	}
				5177	}
				5178	kvm_mmu_flush_or_zap(vcpu, &invalid_list, remote_flush, local_flush);
				5179	kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
				5180	spin_unlock(&vcpu->kvm->mmu_lock);
				5181	}
				5182
				5183	int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
				5184	{
				5185	gpa_t gpa;
				5186	int r;
				5187
				5188	if (vcpu->arch.mmu.direct_map)
				5189	return 0;
				5190
				5191	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
				5192
				5193	r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
				5194
				5195	return r;
				5196	}
				5197	EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
				5198
				5199	static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
				5200	{
				5201	LIST_HEAD(invalid_list);
				5202
				5203	if (likely(kvm_mmu_available_pages(vcpu->kvm) >= KVM_MIN_FREE_MMU_PAGES))
				5204	return 0;
				5205
				5206	while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES) {
				5207	if (!prepare_zap_oldest_mmu_page(vcpu->kvm, &invalid_list))
				5208	break;
				5209
				5210	++vcpu->kvm->stat.mmu_recycled;
				5211	}
				5212	kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
				5213
				5214	if (!kvm_mmu_available_pages(vcpu->kvm))
				5215	return -ENOSPC;
				5216	return 0;
				5217	}
				5218
				5219	int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code,
				5220	void *insn, int insn_len)
				5221	{
				5222	int r, emulation_type = 0;
				5223	enum emulation_result er;
				5224	bool direct = vcpu->arch.mmu.direct_map;
				5225
				5226	/* With shadow page tables, fault_address contains a GVA or nGPA. */
				5227	if (vcpu->arch.mmu.direct_map) {
				5228	vcpu->arch.gpa_available = true;
				5229	vcpu->arch.gpa_val = cr2;
				5230	}
				5231
				5232	r = RET_PF_INVALID;
				5233	if (unlikely(error_code & PFERR_RSVD_MASK)) {
				5234	r = handle_mmio_page_fault(vcpu, cr2, direct);
				5235	if (r == RET_PF_EMULATE)
				5236	goto emulate;
				5237	}
				5238
				5239	if (r == RET_PF_INVALID) {
				5240	r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code),
				5241	false);
				5242	WARN_ON(r == RET_PF_INVALID);
				5243	}
				5244
				5245	if (r == RET_PF_RETRY)
				5246	return 1;
				5247	if (r < 0)
				5248	return r;
				5249
				5250	/*
				5251	* Before emulating the instruction, check if the error code
				5252	* was due to a RO violation while translating the guest page.
				5253	* This can occur when using nested virtualization with nested
				5254	* paging in both guests. If true, we simply unprotect the page
				5255	* and resume the guest.
				5256	*/
				5257	if (vcpu->arch.mmu.direct_map &&
				5258	(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
				5259	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2));
				5260	return 1;
				5261	}
				5262
				5263	/*
				5264	* vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
				5265	* optimistically try to just unprotect the page and let the processor
				5266	* re-execute the instruction that caused the page fault. Do not allow
				5267	* retrying MMIO emulation, as it's not only pointless but could also
				5268	* cause us to enter an infinite loop because the processor will keep
				5269	* faulting on the non-existent MMIO address. Retrying an instruction
				5270	* from a nested guest is also pointless and dangerous as we are only
				5271	* explicitly shadowing L1's page tables, i.e. unprotecting something
				5272	* for L1 isn't going to magically fix whatever issue cause L2 to fail.
				5273	*/
				5274	if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
				5275	emulation_type = EMULTYPE_ALLOW_RETRY;
				5276	emulate:
				5277	/*
				5278	* On AMD platforms, under certain conditions insn_len may be zero on #NPF.
				5279	* This can happen if a guest gets a page-fault on data access but the HW
				5280	* table walker is not able to read the instruction page (e.g instruction
				5281	* page is not present in memory). In those cases we simply restart the
				5282	* guest.
				5283	*/
				5284	if (unlikely(insn && !insn_len))
				5285	return 1;
				5286
				5287	er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
				5288
				5289	switch (er) {
				5290	case EMULATE_DONE:
				5291	return 1;
				5292	case EMULATE_USER_EXIT:
				5293	++vcpu->stat.mmio_exits;
				5294	/* fall through */
				5295	case EMULATE_FAIL:
				5296	return 0;
				5297	default:
				5298	BUG();
				5299	}
				5300	}
				5301	EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
				5302
				5303	void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
				5304	{
				5305	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				5306	int i;
				5307
				5308	/* INVLPG on a * non-canonical address is a NOP according to the SDM. */
				5309	if (is_noncanonical_address(gva, vcpu))
				5310	return;
				5311
				5312	mmu->invlpg(vcpu, gva, mmu->root_hpa);
				5313
				5314	/*
				5315	* INVLPG is required to invalidate any global mappings for the VA,
				5316	* irrespective of PCID. Since it would take us roughly similar amount
				5317	* of work to determine whether any of the prev_root mappings of the VA
				5318	* is marked global, or to just sync it blindly, so we might as well
				5319	* just always sync it.
				5320	*
				5321	* Mappings not reachable via the current cr3 or the prev_roots will be
				5322	* synced when switching to that cr3, so nothing needs to be done here
				5323	* for them.
				5324	*/
				5325	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				5326	if (VALID_PAGE(mmu->prev_roots[i].hpa))
				5327	mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
				5328
				5329	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
				5330	++vcpu->stat.invlpg;
				5331	}
				5332	EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
				5333
				5334	void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
				5335	{
				5336	struct kvm_mmu *mmu = &vcpu->arch.mmu;
				5337	bool tlb_flush = false;
				5338	uint i;
				5339
				5340	if (pcid == kvm_get_active_pcid(vcpu)) {
				5341	mmu->invlpg(vcpu, gva, mmu->root_hpa);
				5342	tlb_flush = true;
				5343	}
				5344
				5345	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
				5346	if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
				5347	pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].cr3)) {
				5348	mmu->invlpg(vcpu, gva, mmu->prev_roots[i].hpa);
				5349	tlb_flush = true;
				5350	}
				5351	}
				5352
				5353	if (tlb_flush)
				5354	kvm_x86_ops->tlb_flush_gva(vcpu, gva);
				5355
				5356	++vcpu->stat.invlpg;
				5357
				5358	/*
				5359	* Mappings not reachable via the current cr3 or the prev_roots will be
				5360	* synced when switching to that cr3, so nothing needs to be done here
				5361	* for them.
				5362	*/
				5363	}
				5364	EXPORT_SYMBOL_GPL(kvm_mmu_invpcid_gva);
				5365
				5366	void kvm_enable_tdp(void)
				5367	{
				5368	tdp_enabled = true;
				5369	}
				5370	EXPORT_SYMBOL_GPL(kvm_enable_tdp);
				5371
				5372	void kvm_disable_tdp(void)
				5373	{
				5374	tdp_enabled = false;
				5375	}
				5376	EXPORT_SYMBOL_GPL(kvm_disable_tdp);
				5377
				5378	static void free_mmu_pages(struct kvm_vcpu *vcpu)
				5379	{
				5380	free_page((unsigned long)vcpu->arch.mmu.pae_root);
				5381	free_page((unsigned long)vcpu->arch.mmu.lm_root);
				5382	}
				5383
				5384	static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
				5385	{
				5386	struct page *page;
				5387	int i;
				5388
				5389	if (tdp_enabled)
				5390	return 0;
				5391
				5392	/*
				5393	* When emulating 32-bit mode, cr3 is only 32 bits even on x86_64.
				5394	* Therefore we need to allocate shadow page tables in the first
				5395	* 4GB of memory, which happens to fit the DMA32 zone.
				5396	*/
				5397	page = alloc_page(GFP_KERNEL \| __GFP_DMA32);
				5398	if (!page)
				5399	return -ENOMEM;
				5400
				5401	vcpu->arch.mmu.pae_root = page_address(page);
				5402	for (i = 0; i < 4; ++i)
				5403	vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
				5404
				5405	return 0;
				5406	}
				5407
				5408	int kvm_mmu_create(struct kvm_vcpu *vcpu)
				5409	{
				5410	uint i;
				5411
				5412	vcpu->arch.walk_mmu = &vcpu->arch.mmu;
				5413	vcpu->arch.mmu.root_hpa = INVALID_PAGE;
				5414	vcpu->arch.mmu.translate_gpa = translate_gpa;
				5415	vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
				5416
				5417	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
				5418	vcpu->arch.mmu.prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
				5419
				5420	return alloc_mmu_pages(vcpu);
				5421	}
				5422
				5423	void kvm_mmu_setup(struct kvm_vcpu *vcpu)
				5424	{
				5425	MMU_WARN_ON(VALID_PAGE(vcpu->arch.mmu.root_hpa));
				5426
				5427	/*
				5428	* kvm_mmu_setup() is called only on vCPU initialization.
				5429	* Therefore, no need to reset mmu roots as they are not yet
				5430	* initialized.
				5431	*/
				5432	kvm_init_mmu(vcpu, false);
				5433	}
				5434
				5435	static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm,
				5436	struct kvm_memory_slot *slot,
				5437	struct kvm_page_track_notifier_node *node)
				5438	{
				5439	kvm_mmu_invalidate_zap_all_pages(kvm);
				5440	}
				5441
				5442	void kvm_mmu_init_vm(struct kvm *kvm)
				5443	{
				5444	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5445
				5446	node->track_write = kvm_mmu_pte_write;
				5447	node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot;
				5448	kvm_page_track_register_notifier(kvm, node);
				5449	}
				5450
				5451	void kvm_mmu_uninit_vm(struct kvm *kvm)
				5452	{
				5453	struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker;
				5454
				5455	kvm_page_track_unregister_notifier(kvm, node);
				5456	}
				5457
				5458	/* The return value indicates if tlb flush on all vcpus is needed. */
				5459	typedef bool (slot_level_handler) (struct kvm kvm, struct kvm_rmap_head *rmap_head);
				5460
				5461	/* The caller should hold mmu-lock before calling this function. */
				5462	static __always_inline bool
				5463	slot_handle_level_range(struct kvm kvm, struct kvm_memory_slot memslot,
				5464	slot_level_handler fn, int start_level, int end_level,
				5465	gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb)
				5466	{
				5467	struct slot_rmap_walk_iterator iterator;
				5468	bool flush = false;
				5469
				5470	for_each_slot_rmap_range(memslot, start_level, end_level, start_gfn,
				5471	end_gfn, &iterator) {
				5472	if (iterator.rmap)
				5473	flush \|= fn(kvm, iterator.rmap);
				5474
				5475	if (need_resched() \|\| spin_needbreak(&kvm->mmu_lock)) {
				5476	if (flush && lock_flush_tlb) {
				5477	kvm_flush_remote_tlbs(kvm);
				5478	flush = false;
				5479	}
				5480	cond_resched_lock(&kvm->mmu_lock);
				5481	}
				5482	}
				5483
				5484	if (flush && lock_flush_tlb) {
				5485	kvm_flush_remote_tlbs(kvm);
				5486	flush = false;
				5487	}
				5488
				5489	return flush;
				5490	}
				5491
				5492	static __always_inline bool
				5493	slot_handle_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5494	slot_level_handler fn, int start_level, int end_level,
				5495	bool lock_flush_tlb)
				5496	{
				5497	return slot_handle_level_range(kvm, memslot, fn, start_level,
				5498	end_level, memslot->base_gfn,
				5499	memslot->base_gfn + memslot->npages - 1,
				5500	lock_flush_tlb);
				5501	}
				5502
				5503	static __always_inline bool
				5504	slot_handle_all_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5505	slot_level_handler fn, bool lock_flush_tlb)
				5506	{
				5507	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5508	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5509	}
				5510
				5511	static __always_inline bool
				5512	slot_handle_large_level(struct kvm kvm, struct kvm_memory_slot memslot,
				5513	slot_level_handler fn, bool lock_flush_tlb)
				5514	{
				5515	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL + 1,
				5516	PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb);
				5517	}
				5518
				5519	static __always_inline bool
				5520	slot_handle_leaf(struct kvm kvm, struct kvm_memory_slot memslot,
				5521	slot_level_handler fn, bool lock_flush_tlb)
				5522	{
				5523	return slot_handle_level(kvm, memslot, fn, PT_PAGE_TABLE_LEVEL,
				5524	PT_PAGE_TABLE_LEVEL, lock_flush_tlb);
				5525	}
				5526
				5527	void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
				5528	{
				5529	struct kvm_memslots *slots;
				5530	struct kvm_memory_slot *memslot;
				5531	int i;
				5532
				5533	spin_lock(&kvm->mmu_lock);
				5534	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				5535	slots = __kvm_memslots(kvm, i);
				5536	kvm_for_each_memslot(memslot, slots) {
				5537	gfn_t start, end;
				5538
				5539	start = max(gfn_start, memslot->base_gfn);
				5540	end = min(gfn_end, memslot->base_gfn + memslot->npages);
				5541	if (start >= end)
				5542	continue;
				5543
				5544	slot_handle_level_range(kvm, memslot, kvm_zap_rmapp,
				5545	PT_PAGE_TABLE_LEVEL, PT_MAX_HUGEPAGE_LEVEL,
				5546	start, end - 1, true);
				5547	}
				5548	}
				5549
				5550	spin_unlock(&kvm->mmu_lock);
				5551	}
				5552
				5553	static bool slot_rmap_write_protect(struct kvm *kvm,
				5554	struct kvm_rmap_head *rmap_head)
				5555	{
				5556	return __rmap_write_protect(kvm, rmap_head, false);
				5557	}
				5558
				5559	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
				5560	struct kvm_memory_slot *memslot)
				5561	{
				5562	bool flush;
				5563
				5564	spin_lock(&kvm->mmu_lock);
				5565	flush = slot_handle_all_level(kvm, memslot, slot_rmap_write_protect,
				5566	false);
				5567	spin_unlock(&kvm->mmu_lock);
				5568
				5569	/*
				5570	* kvm_mmu_slot_remove_write_access() and kvm_vm_ioctl_get_dirty_log()
				5571	* which do tlb flush out of mmu-lock should be serialized by
				5572	* kvm->slots_lock otherwise tlb flush would be missed.
				5573	*/
				5574	lockdep_assert_held(&kvm->slots_lock);
				5575
				5576	/*
				5577	* We can flush all the TLBs out of the mmu lock without TLB
				5578	* corruption since we just change the spte from writable to
				5579	* readonly so that we only need to care the case of changing
				5580	* spte from present to present (changing the spte from present
				5581	* to nonpresent will flush all the TLBs immediately), in other
				5582	* words, the only case we care is mmu_spte_update() where we
				5583	* haved checked SPTE_HOST_WRITEABLE \| SPTE_MMU_WRITEABLE
				5584	* instead of PT_WRITABLE_MASK, that means it does not depend
				5585	* on PT_WRITABLE_MASK anymore.
				5586	*/
				5587	if (flush)
				5588	kvm_flush_remote_tlbs(kvm);
				5589	}
				5590
				5591	static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
				5592	struct kvm_rmap_head *rmap_head)
				5593	{
				5594	u64 *sptep;
				5595	struct rmap_iterator iter;
				5596	int need_tlb_flush = 0;
				5597	kvm_pfn_t pfn;
				5598	struct kvm_mmu_page *sp;
				5599
				5600	restart:
				5601	for_each_rmap_spte(rmap_head, &iter, sptep) {
				5602	sp = page_header(__pa(sptep));
				5603	pfn = spte_to_pfn(*sptep);
				5604
				5605	/*
				5606	* We cannot do huge page mapping for indirect shadow pages,
				5607	* which are found on the last rmap (level = 1) when not using
				5608	* tdp; such shadow pages are synced with the page table in
				5609	* the guest, and the guest page table is using 4K page size
				5610	* mapping if the indirect sp has level = 1.
				5611	*/
				5612	if (sp->role.direct &&
				5613	!kvm_is_reserved_pfn(pfn) &&
				5614	PageTransCompoundMap(pfn_to_page(pfn))) {
				5615	drop_spte(kvm, sptep);
				5616	need_tlb_flush = 1;
				5617	goto restart;
				5618	}
				5619	}
				5620
				5621	return need_tlb_flush;
				5622	}
				5623
				5624	void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
				5625	const struct kvm_memory_slot *memslot)
				5626	{
				5627	/* FIXME: const-ify all uses of struct kvm_memory_slot. */
				5628	spin_lock(&kvm->mmu_lock);
				5629	slot_handle_leaf(kvm, (struct kvm_memory_slot *)memslot,
				5630	kvm_mmu_zap_collapsible_spte, true);
				5631	spin_unlock(&kvm->mmu_lock);
				5632	}
				5633
				5634	void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
				5635	struct kvm_memory_slot *memslot)
				5636	{
				5637	bool flush;
				5638
				5639	spin_lock(&kvm->mmu_lock);
				5640	flush = slot_handle_leaf(kvm, memslot, __rmap_clear_dirty, false);
				5641	spin_unlock(&kvm->mmu_lock);
				5642
				5643	lockdep_assert_held(&kvm->slots_lock);
				5644
				5645	/*
				5646	* It's also safe to flush TLBs out of mmu lock here as currently this
				5647	* function is only used for dirty logging, in which case flushing TLB
				5648	* out of mmu lock also guarantees no dirty pages will be lost in
				5649	* dirty_bitmap.
				5650	*/
				5651	if (flush)
				5652	kvm_flush_remote_tlbs(kvm);
				5653	}
				5654	EXPORT_SYMBOL_GPL(kvm_mmu_slot_leaf_clear_dirty);
				5655
				5656	void kvm_mmu_slot_largepage_remove_write_access(struct kvm *kvm,
				5657	struct kvm_memory_slot *memslot)
				5658	{
				5659	bool flush;
				5660
				5661	spin_lock(&kvm->mmu_lock);
				5662	flush = slot_handle_large_level(kvm, memslot, slot_rmap_write_protect,
				5663	false);
				5664	spin_unlock(&kvm->mmu_lock);
				5665
				5666	/* see kvm_mmu_slot_remove_write_access */
				5667	lockdep_assert_held(&kvm->slots_lock);
				5668
				5669	if (flush)
				5670	kvm_flush_remote_tlbs(kvm);
				5671	}
				5672	EXPORT_SYMBOL_GPL(kvm_mmu_slot_largepage_remove_write_access);
				5673
				5674	void kvm_mmu_slot_set_dirty(struct kvm *kvm,
				5675	struct kvm_memory_slot *memslot)
				5676	{
				5677	bool flush;
				5678
				5679	spin_lock(&kvm->mmu_lock);
				5680	flush = slot_handle_all_level(kvm, memslot, __rmap_set_dirty, false);
				5681	spin_unlock(&kvm->mmu_lock);
				5682
				5683	lockdep_assert_held(&kvm->slots_lock);
				5684
				5685	/* see kvm_mmu_slot_leaf_clear_dirty */
				5686	if (flush)
				5687	kvm_flush_remote_tlbs(kvm);
				5688	}
				5689	EXPORT_SYMBOL_GPL(kvm_mmu_slot_set_dirty);
				5690
				5691	#define BATCH_ZAP_PAGES 10
				5692	static void kvm_zap_obsolete_pages(struct kvm *kvm)
				5693	{
				5694	struct kvm_mmu_page sp, node;
				5695	int batch = 0;
				5696
				5697	restart:
				5698	list_for_each_entry_safe_reverse(sp, node,
				5699	&kvm->arch.active_mmu_pages, link) {
				5700	int ret;
				5701
				5702	/*
				5703	* No obsolete page exists before new created page since
				5704	* active_mmu_pages is the FIFO list.
				5705	*/
				5706	if (!is_obsolete_sp(kvm, sp))
				5707	break;
				5708
				5709	/*
				5710	* Since we are reversely walking the list and the invalid
				5711	* list will be moved to the head, skip the invalid page
				5712	* can help us to avoid the infinity list walking.
				5713	*/
				5714	if (sp->role.invalid)
				5715	continue;
				5716
				5717	/*
				5718	* Need not flush tlb since we only zap the sp with invalid
				5719	* generation number.
				5720	*/
				5721	if (batch >= BATCH_ZAP_PAGES &&
				5722	cond_resched_lock(&kvm->mmu_lock)) {
				5723	batch = 0;
				5724	goto restart;
				5725	}
				5726
				5727	ret = kvm_mmu_prepare_zap_page(kvm, sp,
				5728	&kvm->arch.zapped_obsolete_pages);
				5729	batch += ret;
				5730
				5731	if (ret)
				5732	goto restart;
				5733	}
				5734
				5735	/*
				5736	* Should flush tlb before free page tables since lockless-walking
				5737	* may use the pages.
				5738	*/
				5739	kvm_mmu_commit_zap_page(kvm, &kvm->arch.zapped_obsolete_pages);
				5740	}
				5741
				5742	/*
				5743	* Fast invalidate all shadow pages and use lock-break technique
				5744	* to zap obsolete pages.
				5745	*
				5746	* It's required when memslot is being deleted or VM is being
				5747	* destroyed, in these cases, we should ensure that KVM MMU does
				5748	* not use any resource of the being-deleted slot or all slots
				5749	* after calling the function.
				5750	*/
				5751	void kvm_mmu_invalidate_zap_all_pages(struct kvm *kvm)
				5752	{
				5753	spin_lock(&kvm->mmu_lock);
				5754	trace_kvm_mmu_invalidate_zap_all_pages(kvm);
				5755	kvm->arch.mmu_valid_gen++;
				5756
				5757	/*
				5758	* Notify all vcpus to reload its shadow page table
				5759	* and flush TLB. Then all vcpus will switch to new
				5760	* shadow page table with the new mmu_valid_gen.
				5761	*
				5762	* Note: we should do this under the protection of
				5763	* mmu-lock, otherwise, vcpu would purge shadow page
				5764	* but miss tlb flush.
				5765	*/
				5766	kvm_reload_remote_mmus(kvm);
				5767
				5768	kvm_zap_obsolete_pages(kvm);
				5769	spin_unlock(&kvm->mmu_lock);
				5770	}
				5771
				5772	static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
				5773	{
				5774	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
				5775	}
				5776
				5777	void kvm_mmu_invalidate_mmio_sptes(struct kvm kvm, struct kvm_memslots slots)
				5778	{
				5779	/*
				5780	* The very rare case: if the generation-number is round,
				5781	* zap all shadow pages.
				5782	*/
				5783	if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) {
				5784	kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n");
				5785	kvm_mmu_invalidate_zap_all_pages(kvm);
				5786	}
				5787	}
				5788
				5789	static unsigned long
				5790	mmu_shrink_scan(struct shrinker shrink, struct shrink_control sc)
				5791	{
				5792	struct kvm *kvm;
				5793	int nr_to_scan = sc->nr_to_scan;
				5794	unsigned long freed = 0;
				5795
				5796	spin_lock(&kvm_lock);
				5797
				5798	list_for_each_entry(kvm, &vm_list, vm_list) {
				5799	int idx;
				5800	LIST_HEAD(invalid_list);
				5801
				5802	/*
				5803	* Never scan more than sc->nr_to_scan VM instances.
				5804	* Will not hit this condition practically since we do not try
				5805	* to shrink more than one VM and it is very unlikely to see
				5806	* !n_used_mmu_pages so many times.
				5807	*/
				5808	if (!nr_to_scan--)
				5809	break;
				5810	/*
				5811	* n_used_mmu_pages is accessed without holding kvm->mmu_lock
				5812	* here. We may skip a VM instance errorneosly, but we do not
				5813	* want to shrink a VM that only started to populate its MMU
				5814	* anyway.
				5815	*/
				5816	if (!kvm->arch.n_used_mmu_pages &&
				5817	!kvm_has_zapped_obsolete_pages(kvm))
				5818	continue;
				5819
				5820	idx = srcu_read_lock(&kvm->srcu);
				5821	spin_lock(&kvm->mmu_lock);
				5822
				5823	if (kvm_has_zapped_obsolete_pages(kvm)) {
				5824	kvm_mmu_commit_zap_page(kvm,
				5825	&kvm->arch.zapped_obsolete_pages);
				5826	goto unlock;
				5827	}
				5828
				5829	if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
				5830	freed++;
				5831	kvm_mmu_commit_zap_page(kvm, &invalid_list);
				5832
				5833	unlock:
				5834	spin_unlock(&kvm->mmu_lock);
				5835	srcu_read_unlock(&kvm->srcu, idx);
				5836
				5837	/*
				5838	* unfair on small ones
				5839	* per-vm shrinkers cry out
				5840	* sadness comes quickly
				5841	*/
				5842	list_move_tail(&kvm->vm_list, &vm_list);
				5843	break;
				5844	}
				5845
				5846	spin_unlock(&kvm_lock);
				5847	return freed;
				5848	}
				5849
				5850	static unsigned long
				5851	mmu_shrink_count(struct shrinker shrink, struct shrink_control sc)
				5852	{
				5853	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
				5854	}
				5855
				5856	static struct shrinker mmu_shrinker = {
				5857	.count_objects = mmu_shrink_count,
				5858	.scan_objects = mmu_shrink_scan,
				5859	.seeks = DEFAULT_SEEKS * 10,
				5860	};
				5861
				5862	static void mmu_destroy_caches(void)
				5863	{
				5864	kmem_cache_destroy(pte_list_desc_cache);
				5865	kmem_cache_destroy(mmu_page_header_cache);
				5866	}
				5867
				5868	int kvm_mmu_module_init(void)
				5869	{
				5870	int ret = -ENOMEM;
				5871
				5872	kvm_mmu_reset_all_pte_masks();
				5873
				5874	pte_list_desc_cache = kmem_cache_create("pte_list_desc",
				5875	sizeof(struct pte_list_desc),
				5876	0, SLAB_ACCOUNT, NULL);
				5877	if (!pte_list_desc_cache)
				5878	goto out;
				5879
				5880	mmu_page_header_cache = kmem_cache_create("kvm_mmu_page_header",
				5881	sizeof(struct kvm_mmu_page),
				5882	0, SLAB_ACCOUNT, NULL);
				5883	if (!mmu_page_header_cache)
				5884	goto out;
				5885
				5886	if (percpu_counter_init(&kvm_total_used_mmu_pages, 0, GFP_KERNEL))
				5887	goto out;
				5888
				5889	ret = register_shrinker(&mmu_shrinker);
				5890	if (ret)
				5891	goto out;
				5892
				5893	return 0;
				5894
				5895	out:
				5896	mmu_destroy_caches();
				5897	return ret;
				5898	}
				5899
				5900	/*
				5901	* Caculate mmu pages needed for kvm.
				5902	*/
				5903	unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
				5904	{
				5905	unsigned int nr_mmu_pages;
				5906	unsigned int nr_pages = 0;
				5907	struct kvm_memslots *slots;
				5908	struct kvm_memory_slot *memslot;
				5909	int i;
				5910
				5911	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
				5912	slots = __kvm_memslots(kvm, i);
				5913
				5914	kvm_for_each_memslot(memslot, slots)
				5915	nr_pages += memslot->npages;
				5916	}
				5917
				5918	nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
				5919	nr_mmu_pages = max(nr_mmu_pages,
				5920	(unsigned int) KVM_MIN_ALLOC_MMU_PAGES);
				5921
				5922	return nr_mmu_pages;
				5923	}
				5924
				/*
				 * Tear down the MMU state of @vcpu: unload the MMU, then free its MMU
				 * pages, then free its MMU memory caches.
				 */
				void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
				{
					kvm_mmu_unload(vcpu);

					free_mmu_pages(vcpu);

					mmu_free_memory_caches(vcpu);
				}
				5931
				5932	void kvm_mmu_module_exit(void)
				5933	{
				5934	mmu_destroy_caches();
				5935	percpu_counter_destroy(&kvm_total_used_mmu_pages);
				5936	unregister_shrinker(&mmu_shrinker);
				5937	mmu_audit_disable();
				5938	}