Blame - arch/x86/kvm/paging_tmpl.h - hafnium/third_party/linux.git

blob: d4a8ad6c6a4bb59ab75c3c4bc45568c152a49288 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	/* SPDX-License-Identifier: GPL-2.0-only */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* Kernel-based Virtual Machine driver for Linux
				4	*
				5	* This module enables machines with Intel VT-x extensions to run virtual
				6	* machines without emulation or binary translation.
				7	*
				8	* MMU support
				9	*
				10	* Copyright (C) 2006 Qumranet, Inc.
				11	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
				12	*
				13	* Authors:
				14	* Yaniv Kamay <yaniv@qumranet.com>
				15	* Avi Kivity <avi@qumranet.com>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	16	*/
				17
				18	/*
				19	* We need the mmu code to access both 32-bit and 64-bit guest ptes,
				20	* so the code in this file is compiled twice, once per pte size.
				21	*/
				22
					/*
					* Per-PTTYPE configuration. Each branch picks the guest pte type
					* (pt_element_t), the name prefix produced by FNAME(), the address
					* and index helpers, the accessed/dirty bit positions, the
					* compare-exchange primitive and the maximum number of levels.
					*/
					/* 64-bit guest ptes: FNAME(x) expands to paging64_x. */
				23	#if PTTYPE == 64
				24	#define pt_element_t u64
				25	#define guest_walker guest_walker64
				26	#define FNAME(name) paging##64_##name
				27	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
				28	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
				29	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
				30	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
				31	#define PT_LEVEL_BITS PT64_LEVEL_BITS
				32	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
				33	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
				34	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
					/* The level count and cmpxchg flavour depend on the host word size. */
				35	#ifdef CONFIG_X86_64
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	36	#define PT_MAX_FULL_LEVELS PT64_ROOT_MAX_LEVEL
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	37	#define CMPXCHG cmpxchg
				38	#else
				39	#define CMPXCHG cmpxchg64
				40	#define PT_MAX_FULL_LEVELS 2
				41	#endif
					/* 32-bit guest ptes: FNAME(x) expands to paging32_x. */
				42	#elif PTTYPE == 32
				43	#define pt_element_t u32
				44	#define guest_walker guest_walker32
				45	#define FNAME(name) paging##32_##name
				46	#define PT_BASE_ADDR_MASK PT32_BASE_ADDR_MASK
				47	#define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
				48	#define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
				49	#define PT_INDEX(addr, level) PT32_INDEX(addr, level)
				50	#define PT_LEVEL_BITS PT32_LEVEL_BITS
				51	#define PT_MAX_FULL_LEVELS 2
				52	#define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT
				53	#define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT
				54	#define PT_HAVE_ACCESSED_DIRTY(mmu) true
				55	#define CMPXCHG cmpxchg
					/*
					* EPT entries: FNAME(x) expands to ept_x. The accessed/dirty bits
					* sit at fixed positions 8 and 9 and are only honoured when the
					* mmu has ept_ad set.
					*/
				56	#elif PTTYPE == PTTYPE_EPT
				57	#define pt_element_t u64
				58	#define guest_walker guest_walkerEPT
				59	#define FNAME(name) ept_##name
				60	#define PT_BASE_ADDR_MASK PT64_BASE_ADDR_MASK
				61	#define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
				62	#define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
				63	#define PT_INDEX(addr, level) PT64_INDEX(addr, level)
				64	#define PT_LEVEL_BITS PT64_LEVEL_BITS
				65	#define PT_GUEST_DIRTY_SHIFT 9
				66	#define PT_GUEST_ACCESSED_SHIFT 8
				67	#define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad)
				68	#define CMPXCHG cmpxchg64
				69	#define PT_MAX_FULL_LEVELS 4
				70	#else
				71	#error Invalid PTTYPE value
				72	#endif
				73
					/* Single-bit masks derived from the shifts selected above. */
				74	#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT)
				75	#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT)
				76
					/*
					* gpte_to_gfn_lvl gets a per-PTTYPE name through FNAME();
					* gpte_to_gfn() is the same lookup at PT_PAGE_TABLE_LEVEL.
					*/
				77	#define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl)
				78	#define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL)
				79
				80	/*
				81	* The guest_walker structure emulates the behavior of the hardware page
				82	* table walker.
				83	*/
				84	struct guest_walker {
					/* Level the walk is currently at; on success, the level of the final gpte. */
				85	int level;
					/* Level the walk started from (recorded once the starting level is known). */
				86	unsigned max_level;
					/* The per-level arrays below are indexed by (level - 1). */
					/* Frame number of the guest page table visited at each level. */
				87	gfn_t table_gfn[PT_MAX_FULL_LEVELS];
					/* Guest pte value read at each level. */
				88	pt_element_t ptes[PT_MAX_FULL_LEVELS];
					/* Buffer of PTE_PREFETCH_NUM gptes, filled by FNAME(gpte_changed). */
				89	pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
					/* Guest physical address of the gpte at each level. */
				90	gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
					/* Host userspace address of the gpte at each level. */
				91	pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
					/* Writability reported by kvm_vcpu_gfn_to_hva_prot() for each level. */
				92	bool pte_writable[PT_MAX_FULL_LEVELS];
					/* ACC_*_MASK permissions accumulated above each level. */
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	93	unsigned int pt_access[PT_MAX_FULL_LEVELS];
					/* ACC_*_MASK permissions accumulated over the whole walk. */
				94	unsigned int pte_access;
					/* Frame number the walked address finally translates to. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	95	gfn_t gfn;
					/* Page fault description, filled in when the walk fails. */
				96	struct x86_exception fault;
				97	};
				98
					/*
					* Return the frame number stored in @gpte for a mapping at level
					* @lvl: the address bits selected by PT_LVL_ADDR_MASK(lvl), shifted
					* down by PAGE_SHIFT.
					*/
				99	static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
				100	{
				101	return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
				102	}
				103
				104	static inline void FNAME(protect_clean_gpte)(struct kvm_mmu mmu, unsigned access,
				105	unsigned gpte)
				106	{
				107	unsigned mask;
				108
				109	/* dirty bit is not supported, so no need to track it */
				110	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
				111	return;
				112
				113	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
				114
				115	mask = (unsigned)~ACC_WRITE_MASK;
				116	/* Allow write access to dirty gptes */
				117	mask \|= (gpte >> (PT_GUEST_DIRTY_SHIFT - PT_WRITABLE_SHIFT)) &
				118	PT_WRITABLE_MASK;
				119	*access &= mask;
				120	}
				121
					/*
					* Return non-zero when @pte is present. Non-EPT formats test
					* PT_PRESENT_MASK; the EPT format tests the low three bits (7),
					* presumably the read/write/execute permission bits - confirm
					* against the EPT entry layout.
					*/
				122	static inline int FNAME(is_present_gpte)(unsigned long pte)
				123	{
				124	#if PTTYPE != PTTYPE_EPT
				125	return pte & PT_PRESENT_MASK;
				126	#else
				127	return pte & 7;
				128	#endif
				129	}
				130
				131	static int FNAME(cmpxchg_gpte)(struct kvm_vcpu vcpu, struct kvm_mmu mmu,
				132	pt_element_t __user *ptep_user, unsigned index,
				133	pt_element_t orig_pte, pt_element_t new_pte)
				134	{
				135	int npages;
				136	pt_element_t ret;
				137	pt_element_t *table;
				138	struct page *page;
				139
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	140	npages = get_user_pages_fast((unsigned long)ptep_user, 1, FOLL_WRITE, &page);
				141	if (likely(npages == 1)) {
				142	table = kmap_atomic(page);
				143	ret = CMPXCHG(&table[index], orig_pte, new_pte);
				144	kunmap_atomic(table);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	145
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	146	kvm_release_page_dirty(page);
				147	} else {
				148	struct vm_area_struct *vma;
				149	unsigned long vaddr = (unsigned long)ptep_user & PAGE_MASK;
				150	unsigned long pfn;
				151	unsigned long paddr;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	152
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	153	down_read(&current->mm->mmap_sem);
				154	vma = find_vma_intersection(current->mm, vaddr, vaddr + PAGE_SIZE);
				155	if (!vma \|\| !(vma->vm_flags & VM_PFNMAP)) {
				156	up_read(&current->mm->mmap_sem);
				157	return -EFAULT;
				158	}
				159	pfn = ((vaddr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
				160	paddr = pfn << PAGE_SHIFT;
				161	table = memremap(paddr, PAGE_SIZE, MEMREMAP_WB);
				162	if (!table) {
				163	up_read(&current->mm->mmap_sem);
				164	return -EFAULT;
				165	}
				166	ret = CMPXCHG(&table[index], orig_pte, new_pte);
				167	memunmap(table);
				168	up_read(&current->mm->mmap_sem);
				169	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	170
				171	return (ret != orig_pte);
				172	}
				173
				174	static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
				175	struct kvm_mmu_page sp, u64 spte,
				176	u64 gpte)
				177	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	178	if (is_rsvd_bits_set(vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	179	goto no_present;
				180
				181	if (!FNAME(is_present_gpte)(gpte))
				182	goto no_present;
				183
				184	/* if accessed bit is not supported prefetch non accessed gpte */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	185	if (PT_HAVE_ACCESSED_DIRTY(vcpu->arch.mmu) &&
				186	!(gpte & PT_GUEST_ACCESSED_MASK))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	187	goto no_present;
				188
				189	return false;
				190
				191	no_present:
				192	drop_spte(vcpu->kvm, spte);
				193	return true;
				194	}
				195
				196	/*
				197	* For PTTYPE_EPT, a page table can be executable but not readable
				198	* on supported processors. Therefore, set_spte does not automatically
				199	* set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
				200	* to signify readability since it isn't used in the EPT case
				201	*/
				202	static inline unsigned FNAME(gpte_access)(u64 gpte)
				203	{
				204	unsigned access;
				205	#if PTTYPE == PTTYPE_EPT
					/* EPT: translate each permission bit into its ACC_*_MASK flag. */
				206	access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
				207	((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
				208	((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
				209	#else
					/* The XOR below only works if ACC_EXEC_MASK is bit 0, the present bit. */
				210	BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
				211	BUILD_BUG_ON(ACC_EXEC_MASK != 1);
				212	access = gpte & (PT_WRITABLE_MASK | PT_USER_MASK | PT_PRESENT_MASK);
				213	/* Combine NX with P (which is set here) to get ACC_EXEC_MASK. */
				214	access ^= (gpte >> PT64_NX_SHIFT);
				215	#endif
				216
				217	return access;
				218	}
				219
				220	static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu,
				221	struct kvm_mmu *mmu,
				222	struct guest_walker *walker,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	223	gpa_t addr, int write_fault)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	224	{
				225	unsigned level, index;
				226	pt_element_t pte, orig_pte;
				227	pt_element_t __user *ptep_user;
				228	gfn_t table_gfn;
				229	int ret;
				230
				231	/* dirty/accessed bits are not supported, so no need to update them */
				232	if (!PT_HAVE_ACCESSED_DIRTY(mmu))
				233	return 0;
				234
				235	for (level = walker->max_level; level >= walker->level; --level) {
				236	pte = orig_pte = walker->ptes[level - 1];
				237	table_gfn = walker->table_gfn[level - 1];
				238	ptep_user = walker->ptep_user[level - 1];
				239	index = offset_in_page(ptep_user) / sizeof(pt_element_t);
				240	if (!(pte & PT_GUEST_ACCESSED_MASK)) {
				241	trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte));
				242	pte \|= PT_GUEST_ACCESSED_MASK;
				243	}
				244	if (level == walker->level && write_fault &&
				245	!(pte & PT_GUEST_DIRTY_MASK)) {
				246	trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
				247	#if PTTYPE == PTTYPE_EPT
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	248	if (kvm_arch_write_log_dirty(vcpu, addr))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	249	return -EINVAL;
				250	#endif
				251	pte \|= PT_GUEST_DIRTY_MASK;
				252	}
				253	if (pte == orig_pte)
				254	continue;
				255
				256	/*
				257	* If the slot is read-only, simply do not process the accessed
				258	* and dirty bits. This is the correct thing to do if the slot
				259	* is ROM, and page tables in read-as-ROM/write-as-MMIO slots
				260	* are only supported if the accessed and dirty bits are already
				261	* set in the ROM (so that MMIO writes are never needed).
				262	*
				263	* Note that NPT does not allow this at all and faults, since
				264	* it always wants nested page table entries for the guest
				265	* page tables to be writable. And EPT works but will simply
				266	* overwrite the read-only memory to set the accessed and dirty
				267	* bits.
				268	*/
				269	if (unlikely(!walker->pte_writable[level - 1]))
				270	continue;
				271
				272	ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, orig_pte, pte);
				273	if (ret)
				274	return ret;
				275
				276	kvm_vcpu_mark_page_dirty(vcpu, table_gfn);
				277	walker->ptes[level - 1] = pte;
				278	}
				279	return 0;
				280	}
				281
					/*
					* Return the protection key carried by @gpte. Only the 64-bit
					* paging variant extracts one (via pte_flags_pkey()); every other
					* PTTYPE returns 0. @vcpu is not used.
					*/
				282	static inline unsigned FNAME(gpte_pkeys)(struct kvm_vcpu *vcpu, u64 gpte)
				283	{
				284	unsigned pkeys = 0;
				285	#if PTTYPE == 64
					/* Wrap the raw value in a pte_t so the generic pte helpers apply. */
				286	pte_t pte = {.pte = gpte};
				287
				288	pkeys = pte_flags_pkey(pte_flags(pte));
				289	#endif
				290	return pkeys;
				291	}
				292
				293	/*
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	294	* Fetch a guest pte for a guest virtual address, or for an L2's GPA.
					*
					* Walks the guest page tables of @mmu for @addr, starting at
					* mmu->root_level, and records what it sees at every level in
					* @walker. @access is a set of PFERR_* bits describing the access
					* being performed.
					*
					* Returns 1 when the translation succeeded (walker->gfn and
					* walker->pte_access are valid) and 0 when it failed. On the
					* "error" path walker->fault is filled in with the page fault to
					* report; when mmu->translate_gpa() fails, walker->fault is left as
					* that callback set it.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	295	*/
				296	static int FNAME(walk_addr_generic)(struct guest_walker *walker,
				297	struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	298	gpa_t addr, u32 access)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	299	{
				300	int ret;
				301	pt_element_t pte;
				302	pt_element_t __user *uninitialized_var(ptep_user);
				303	gfn_t table_gfn;
				304	u64 pt_access, pte_access;
				305	unsigned index, accessed_dirty, pte_pkey;
				306	unsigned nested_access;
				307	gpa_t pte_gpa;
				308	bool have_ad;
				309	int offset;
				310	u64 walk_nx_mask = 0;
				311	const int write_fault = access & PFERR_WRITE_MASK;
				312	const int user_fault = access & PFERR_USER_MASK;
				313	const int fetch_fault = access & PFERR_FETCH_MASK;
				314	u16 errcode = 0;
				315	gpa_t real_gpa;
				316	gfn_t gfn;
				317
				318	trace_kvm_mmu_pagetable_walk(addr, access);
					/* Re-entered when the accessed/dirty update finds a gpte changed under us. */
				319	retry_walk:
				320	walker->level = mmu->root_level;
				321	pte = mmu->get_cr3(vcpu);
				322	have_ad = PT_HAVE_ACCESSED_DIRTY(mmu);
				323
				324	#if PTTYPE == 64
				325	walk_nx_mask = 1ULL << PT64_NX_SHIFT;
					/* PAE: the top level comes from the PDPTRs, not from memory. */
				326	if (walker->level == PT32E_ROOT_LEVEL) {
				327	pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3);
				328	trace_kvm_mmu_paging_element(pte, walker->level);
				329	if (!FNAME(is_present_gpte)(pte))
				330	goto error;
				331	--walker->level;
				332	}
				333	#endif
				334	walker->max_level = walker->level;
				335	ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu)));
				336
				337	/*
				338	* FIXME: on Intel processors, loads of the PDPTE registers for PAE paging
				339	* by the MOV to CR instruction are treated as reads and do not cause the
				340	* processor to set the dirty flag in any EPT paging-structure entry.
				341	*/
				342	nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK;
				343
					/* Start with every permission granted; each level narrows it. */
				344	pte_access = ~0;
					/* Pre-increment so the first loop iteration lands on the starting level. */
				345	++walker->level;
				346
				347	do {
				348	gfn_t real_gfn;
				349	unsigned long host_addr;
				350
				351	pt_access = pte_access;
				352	--walker->level;
				353
				354	index = PT_INDEX(addr, walker->level);
				355	table_gfn = gpte_to_gfn(pte);
				356	offset = index * sizeof(pt_element_t);
				357	pte_gpa = gfn_to_gpa(table_gfn) + offset;
				358
				359	BUG_ON(walker->level < 1);
				360	walker->table_gfn[walker->level - 1] = table_gfn;
				361	walker->pte_gpa[walker->level - 1] = pte_gpa;
				362
					/* Translate the table's own address through mmu->translate_gpa(). */
				363	real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
				364	nested_access,
				365	&walker->fault);
				366
				367	/*
				368	* FIXME: This can happen if emulation (e.g. of an INS/OUTS
				369	* instruction) triggers a nested page fault. The exit
				370	* qualification / exit info field will incorrectly have
				371	* "guest page access" as the nested page fault's cause,
				372	* instead of "guest page structure access". To fix this,
				373	* the x86_exception struct should be augmented with enough
				374	* information to fix the exit_qualification or exit_info_1
				375	* fields.
				376	*/
				377	if (unlikely(real_gfn == UNMAPPED_GVA))
				378	return 0;
				379
				380	real_gfn = gpa_to_gfn(real_gfn);
				381
				382	host_addr = kvm_vcpu_gfn_to_hva_prot(vcpu, real_gfn,
				383	&walker->pte_writable[walker->level - 1]);
				384	if (unlikely(kvm_is_error_hva(host_addr)))
				385	goto error;
				386
					/* Read the gpte from host userspace memory. */
				387	ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
				388	if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte))))
				389	goto error;
				390	walker->ptep_user[walker->level - 1] = ptep_user;
				391
				392	trace_kvm_mmu_paging_element(pte, walker->level);
				393
				394	/*
				395	* Inverting the NX bit lets us AND it like other
				396	* permission bits.
				397	*/
				398	pte_access = pt_access & (pte ^ walk_nx_mask);
				399
				400	if (unlikely(!FNAME(is_present_gpte)(pte)))
				401	goto error;
				402
				403	if (unlikely(is_rsvd_bits_set(mmu, pte, walker->level))) {
				404	errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
				405	goto error;
				406	}
				407
				408	walker->ptes[walker->level - 1] = pte;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	409
				410	/* Convert to ACC_*_MASK flags for struct guest_walker. */
				411	walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	412	} while (!is_last_gpte(mmu, walker->level, pte));
				413
				414	pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
				415	accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
				416
				417	/* Convert to ACC_*_MASK flags for struct guest_walker. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	418	walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
				419	errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
				420	if (unlikely(errcode))
				421	goto error;
				422
					/* Frame from the final gpte plus the offset of @addr inside a large page. */
				423	gfn = gpte_to_gfn_lvl(pte, walker->level);
				424	gfn += (addr & PT_LVL_OFFSET_MASK(walker->level)) >> PAGE_SHIFT;
				425
				426	if (PTTYPE == 32 && walker->level == PT_DIRECTORY_LEVEL && is_cpuid_PSE36())
				427	gfn += pse36_gfn_delta(pte);
				428
				429	real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn), access, &walker->fault);
				430	if (real_gpa == UNMAPPED_GVA)
				431	return 0;
				432
				433	walker->gfn = real_gpa >> PAGE_SHIFT;
				434
				435	if (!write_fault)
				436	FNAME(protect_clean_gpte)(mmu, &walker->pte_access, pte);
				437	else
				438	/*
				439	* On a write fault, fold the dirty bit into accessed_dirty.
				440	* For modes without A/D bits support accessed_dirty will be
				441	* always clear.
				442	*/
				443	accessed_dirty &= pte >>
				444	(PT_GUEST_DIRTY_SHIFT - PT_GUEST_ACCESSED_SHIFT);
				445
					/* Some accessed/dirty bit is missing: write the bits back to guest memory. */
				446	if (unlikely(!accessed_dirty)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	447	ret = FNAME(update_accessed_dirty_bits)(vcpu, mmu, walker,
				448	addr, write_fault);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449	if (unlikely(ret < 0))
				450	goto error;
				451	else if (ret)
				452	goto retry_walk;
				453	}
				454
				455	pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	456	__func__, (u64)pte, walker->pte_access,
				457	walker->pt_access[walker->level - 1]);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	458	return 1;
				459
				460	error:
					/* Build the #PF error code from the access type and what the walk found. */
				461	errcode |= write_fault | user_fault;
				462	if (fetch_fault && (mmu->nx ||
				463	kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)))
				464	errcode |= PFERR_FETCH_MASK;
				465
				466	walker->fault.vector = PF_VECTOR;
				467	walker->fault.error_code_valid = true;
				468	walker->fault.error_code = errcode;
				469
				470	#if PTTYPE == PTTYPE_EPT
				471	/*
				472	* Use PFERR_RSVD_MASK in error_code to tell if EPT
				473	* misconfiguration requires to be injected. The detection is
				474	* done by is_rsvd_bits_set() above.
				475	*
				476	* We set up the value of exit_qualification to inject:
				477	* [2:0] - Derive from the access bits. The exit_qualification might be
				478	* out of date if it is serving an EPT misconfiguration.
				479	* [5:3] - Calculated by the page walk of the guest EPT page tables
				480	* [8:7] - Derived from [8:7] of real exit_qualification
				481	*
				482	* The other bits are set to 0.
				483	*/
				484	if (!(errcode & PFERR_RSVD_MASK)) {
					/* 0x180 keeps only bits 8:7 of the real exit qualification. */
				485	vcpu->arch.exit_qualification &= 0x180;
				486	if (write_fault)
				487	vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_WRITE;
				488	if (user_fault)
				489	vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_READ;
				490	if (fetch_fault)
				491	vcpu->arch.exit_qualification |= EPT_VIOLATION_ACC_INSTR;
				492	vcpu->arch.exit_qualification |= (pte_access & 0x7) << 3;
				493	}
				494	#endif
				495	walker->fault.address = addr;
				496	walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
				497
				498	trace_kvm_mmu_walker_error(walker->fault.error_code);
				499	return 0;
				500	}
				501
					/*
					* Walk the guest page tables for @addr using the vcpu's current mmu
					* (vcpu->arch.mmu). Returns what FNAME(walk_addr_generic) returns.
					*/
				502	static int FNAME(walk_addr)(struct guest_walker *walker,
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	503	struct kvm_vcpu *vcpu, gpa_t addr, u32 access)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	504	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	505	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu, addr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	506	access);
				507	}
				508
					/*
					* Same walk as FNAME(walk_addr), but through vcpu->arch.nested_mmu.
					* Not built for the EPT variant.
					*/
				509	#if PTTYPE != PTTYPE_EPT
				510	static int FNAME(walk_addr_nested)(struct guest_walker *walker,
				511	struct kvm_vcpu *vcpu, gva_t addr,
				512	u32 access)
				513	{
				514	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
				515	addr, access);
				516	}
				517	#endif
				518
				519	static bool
				520	FNAME(prefetch_gpte)(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				521	u64 *spte, pt_element_t gpte, bool no_dirty_log)
				522	{
				523	unsigned pte_access;
				524	gfn_t gfn;
				525	kvm_pfn_t pfn;
				526
				527	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
				528	return false;
				529
				530	pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
				531
				532	gfn = gpte_to_gfn(gpte);
				533	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	534	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	535	pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
				536	no_dirty_log && (pte_access & ACC_WRITE_MASK));
				537	if (is_error_pfn(pfn))
				538	return false;
				539
				540	/*
				541	* we call mmu_set_spte() with host_writable = true because
				542	* pte_prefetch_gfn_to_pfn always gets a writable pfn.
				543	*/
				544	mmu_set_spte(vcpu, spte, pte_access, 0, PT_PAGE_TABLE_LEVEL, gfn, pfn,
				545	true, true);
				546
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	547	kvm_release_pfn_clean(pfn);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	548	return true;
				549	}
				550
				551	static void FNAME(update_pte)(struct kvm_vcpu vcpu, struct kvm_mmu_page sp,
				552	u64 spte, const void pte)
				553	{
				554	pt_element_t gpte = (const pt_element_t )pte;
				555
				556	FNAME(prefetch_gpte)(vcpu, sp, spte, gpte, false);
				557	}
				558
				559	static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
				560	struct guest_walker *gw, int level)
				561	{
				562	pt_element_t curr_pte;
				563	gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
				564	u64 mask;
				565	int r, index;
				566
				567	if (level == PT_PAGE_TABLE_LEVEL) {
				568	mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
				569	base_gpa = pte_gpa & ~mask;
				570	index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
				571
				572	r = kvm_vcpu_read_guest_atomic(vcpu, base_gpa,
				573	gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
				574	curr_pte = gw->prefetch_ptes[index];
				575	} else
				576	r = kvm_vcpu_read_guest_atomic(vcpu, pte_gpa,
				577	&curr_pte, sizeof(curr_pte));
				578
				579	return r \|\| curr_pte != gw->ptes[level - 1];
				580	}
				581
				582	static void FNAME(pte_prefetch)(struct kvm_vcpu vcpu, struct guest_walker gw,
				583	u64 *sptep)
				584	{
				585	struct kvm_mmu_page *sp;
				586	pt_element_t *gptep = gw->prefetch_ptes;
				587	u64 *spte;
				588	int i;
				589
				590	sp = page_header(__pa(sptep));
				591
				592	if (sp->role.level > PT_PAGE_TABLE_LEVEL)
				593	return;
				594
				595	if (sp->role.direct)
				596	return __direct_pte_prefetch(vcpu, sp, sptep);
				597
				598	i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
				599	spte = sp->spt + i;
				600
				601	for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
				602	if (spte == sptep)
				603	continue;
				604
				605	if (is_shadow_present_pte(*spte))
				606	continue;
				607
				608	if (!FNAME(prefetch_gpte)(vcpu, sp, spte, gptep[i], true))
				609	break;
				610	}
				611	}
				612
				613	/*
				614	* Fetch a shadow pte for a specific level in the paging hierarchy.
				615	* If the guest tries to write a write-protected page, we need to
				616	* emulate this operation, return 1 to indicate this case.
					*
					* @addr:             faulting address.
					* @gw:               result of the guest page table walk for @addr.
					* @hlevel:           level at which the final mapping is installed;
					*                    may be changed by disallowed_hugepage_adjust().
					* @pfn:              host frame to map.
					* @lpage_disallowed: when set, each newly created direct shadow
					*                    page is passed to account_huge_nx_page().
					*
					* Returns RET_PF_RETRY when a guest pte changed during the fetch or
					* the root is invalid, otherwise the result of mmu_set_spte().
				617	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	618	static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	619	struct guest_walker *gw,
				620	int write_fault, int hlevel,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	621	kvm_pfn_t pfn, bool map_writable, bool prefault,
				622	bool lpage_disallowed)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	623	{
				624	struct kvm_mmu_page *sp = NULL;
				625	struct kvm_shadow_walk_iterator it;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	626	unsigned int direct_access, access;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	627	int top_level, ret;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	628	gfn_t gfn, base_gfn;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	629
				630	direct_access = gw->pte_access;
				631
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	632	top_level = vcpu->arch.mmu->root_level;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	633	if (top_level == PT32E_ROOT_LEVEL)
				634	top_level = PT32_ROOT_LEVEL;
				635	/*
				636	* Verify that the top-level gpte is still there. Since the page
				637	* is a root page, it is either write protected (and cannot be
				638	* changed from now on) or it is invalid (in which case, we don't
				639	* really care if it changes underneath us after this point).
				640	*/
				641	if (FNAME(gpte_changed)(vcpu, gw, top_level))
				642	goto out_gpte_changed;
				643
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	644	if (!VALID_PAGE(vcpu->arch.mmu->root_hpa))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	645	goto out_gpte_changed;
				646
					/* Phase 1: shadow the guest page tables down to the guest's mapping level. */
				647	for (shadow_walk_init(&it, vcpu, addr);
				648	shadow_walk_okay(&it) && it.level > gw->level;
				649	shadow_walk_next(&it)) {
				650	gfn_t table_gfn;
				651
				652	clear_sp_write_flooding_count(it.sptep);
				653	drop_large_spte(vcpu, it.sptep);
				654
				655	sp = NULL;
				656	if (!is_shadow_present_pte(*it.sptep)) {
					/* Index (it.level - 2) is the walker slot of the next lower level. */
				657	table_gfn = gw->table_gfn[it.level - 2];
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	658	access = gw->pt_access[it.level - 2];
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	659	sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
				660	false, access);
				661	}
				662
				663	/*
				664	* Verify that the gpte in the page we've just write
				665	* protected is still there.
				666	*/
				667	if (FNAME(gpte_changed)(vcpu, gw, it.level - 1))
				668	goto out_gpte_changed;
				669
				670	if (sp)
				671	link_shadow_page(vcpu, it.sptep, sp);
				672	}
				673
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	674	/*
				675	* FNAME(page_fault) might have clobbered the bottom bits of
				676	* gw->gfn, restore them from the virtual address.
				677	*/
				678	gfn = gw->gfn | ((addr & PT_LVL_OFFSET_MASK(gw->level)) >> PAGE_SHIFT);
				679	base_gfn = gfn;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	680
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	681	trace_kvm_mmu_spte_requested(addr, gw->level, pfn);
				682
					/* Phase 2: build direct shadow pages from the guest level down to hlevel. */
				683	for (; shadow_walk_okay(&it); shadow_walk_next(&it)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	684	clear_sp_write_flooding_count(it.sptep);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	685
				686	/*
				687	* We cannot overwrite existing page tables with an NX
				688	* large page, as the leaf could be executable.
				689	*/
				690	disallowed_hugepage_adjust(it, gfn, &pfn, &hlevel);
				691
				692	base_gfn = gfn & ~(KVM_PAGES_PER_HPAGE(it.level) - 1);
				693	if (it.level == hlevel)
				694	break;
				695
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	696	validate_direct_spte(vcpu, it.sptep, direct_access);
				697
				698	drop_large_spte(vcpu, it.sptep);
				699
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	700	if (!is_shadow_present_pte(*it.sptep)) {
				701	sp = kvm_mmu_get_page(vcpu, base_gfn, addr,
				702	it.level - 1, true, direct_access);
				703	link_shadow_page(vcpu, it.sptep, sp);
				704	if (lpage_disallowed)
				705	account_huge_nx_page(vcpu->kvm, sp);
				706	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	707	}
				708
					/* Install the final mapping, then opportunistically map its neighbours. */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	709	ret = mmu_set_spte(vcpu, it.sptep, gw->pte_access, write_fault,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	710	it.level, base_gfn, pfn, prefault, map_writable);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	711	FNAME(pte_prefetch)(vcpu, gw, it.sptep);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	712	++vcpu->stat.pf_fixed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	713	return ret;
				714
				715	out_gpte_changed:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	716	return RET_PF_RETRY;
				717	}
				718
				719	/*
				720	* To see whether the mapped gfn can write its page table in the current
				721	* mapping.
				722	*
				723	* It is the helper function of FNAME(page_fault). When guest uses large page
				724	* size to map the writable gfn which is used as current page table, we should
				725	* force kvm to use small page size to map it because new shadow page will be
				726	* created when kvm establishes shadow page table that stop kvm using large
				727	* page size. Do it early can avoid unnecessary #PF and emulation.
				728	*
				729	* @write_fault_to_shadow_pgtable will return true if the fault gfn is
				730	* currently used as its page table.
				731	*
				732	* Note: the PDPT page table is not checked for PAE-32 bit guest. It is ok
				733	* since the PDPT is always shadowed, that means, we can not use large page
				734	* size to map the gfn which is used as PDPT.
				735	*/
				736	static bool
				737	FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu,
				738	struct guest_walker *walker, int user_fault,
				739	bool *write_fault_to_shadow_pgtable)
				740	{
				741	int level;
					/* Keeps the gfn bits that identify the large page at walker->level. */
				742	gfn_t mask = ~(KVM_PAGES_PER_HPAGE(walker->level) - 1);
				743	bool self_changed = false;
				744
					/*
					* Only continue when the mapping can be written through: either
					* the gpte grants write access, or write protection is off and
					* this is not a user-mode fault.
					*/
				745	if (!(walker->pte_access & ACC_WRITE_MASK ||
				746	(!is_write_protection(vcpu) && !user_fault)))
				747	return false;
				748
				749	for (level = walker->level; level <= walker->max_level; level++) {
					/* XOR is zero exactly where the two frame numbers agree. */
				750	gfn_t gfn = walker->gfn ^ walker->table_gfn[level - 1];
				751
					/* Same large page as a page table used by this walk. */
				752	self_changed |= !(gfn & mask);
					/* Exactly the same frame as a page table used by this walk. */
				753	*write_fault_to_shadow_pgtable |= !gfn;
				754	}
				755
				756	return self_changed;
				757	}
				758
				759	/*
				760	* Page fault handler. There are several causes for a page fault:
				761	* - there is no shadow pte for the guest pte
				762	* - write access through a shadow pte marked read only so that we can set
				763	* the dirty bit
				764	* - write access to a shadow pte marked read only so we can update the page
				765	* dirty bitmap, when userspace requests it
				766	* - mmio access; in this case we will never install a present shadow pte
				767	* - normal guest page fault due to the guest pte marked not present, not
				768	* writable, or not executable
				769	*
				770	* Returns: 1 if we need to emulate the instruction, 0 otherwise, or
				771	* a negative value on error.
				772	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	773	static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	774	bool prefault)
				775	{
				776	int write_fault = error_code & PFERR_WRITE_MASK;
				777	int user_fault = error_code & PFERR_USER_MASK;
				778	struct guest_walker walker;
				779	int r;
				780	kvm_pfn_t pfn;
				781	int level = PT_PAGE_TABLE_LEVEL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	782	unsigned long mmu_seq;
				783	bool map_writable, is_self_change_mapping;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	784	bool lpage_disallowed = (error_code & PFERR_FETCH_MASK) &&
				785	is_nx_huge_page_enabled();
				786	bool force_pt_level = lpage_disallowed;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	787
				788	pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
				789
				790	r = mmu_topup_memory_caches(vcpu);
				791	if (r)
				792	return r;
				793
				794	/*
				795	* If PFEC.RSVD is set, this is a shadow page fault.
				796	* The bit needs to be cleared before walking guest page tables.
				797	*/
				798	error_code &= ~PFERR_RSVD_MASK;
				799
				800	/*
				801	* Look up the guest pte for the faulting address.
				802	*/
				803	r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
				804
				805	/*
				806	* The page is not mapped by the guest. Let the guest handle it.
				807	*/
				808	if (!r) {
				809	pgprintk("%s: guest page fault\n", __func__);
				810	if (!prefault)
				811	inject_page_fault(vcpu, &walker.fault);
				812
				813	return RET_PF_RETRY;
				814	}
				815
				816	if (page_fault_handle_page_track(vcpu, error_code, walker.gfn)) {
				817	shadow_page_table_clear_flood(vcpu, addr);
				818	return RET_PF_EMULATE;
				819	}
				820
				821	vcpu->arch.write_fault_to_shadow_pgtable = false;
				822
				823	is_self_change_mapping = FNAME(is_self_change_mapping)(vcpu,
				824	&walker, user_fault, &vcpu->arch.write_fault_to_shadow_pgtable);
				825
				826	if (walker.level >= PT_DIRECTORY_LEVEL && !is_self_change_mapping) {
				827	level = mapping_level(vcpu, walker.gfn, &force_pt_level);
				828	if (likely(!force_pt_level)) {
				829	level = min(walker.level, level);
				830	walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
				831	}
				832	} else
				833	force_pt_level = true;
				834
				835	mmu_seq = vcpu->kvm->mmu_notifier_seq;
				836	smp_rmb();
				837
				838	if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
				839	&map_writable))
				840	return RET_PF_RETRY;
				841
				842	if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r))
				843	return r;
				844
				845	/*
				846	* Do not change pte_access if the pfn is a mmio page, otherwise
				847	* we will cache the incorrect access into mmio spte.
				848	*/
				849	if (write_fault && !(walker.pte_access & ACC_WRITE_MASK) &&
				850	!is_write_protection(vcpu) && !user_fault &&
				851	!is_noslot_pfn(pfn)) {
				852	walker.pte_access \|= ACC_WRITE_MASK;
				853	walker.pte_access &= ~ACC_USER_MASK;
				854
				855	/*
				856	* If we converted a user page to a kernel page,
				857	* so that the kernel can write to it when cr0.wp=0,
				858	* then we should prevent the kernel from executing it
				859	* if SMEP is enabled.
				860	*/
				861	if (kvm_read_cr4_bits(vcpu, X86_CR4_SMEP))
				862	walker.pte_access &= ~ACC_EXEC_MASK;
				863	}
				864
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	865	r = RET_PF_RETRY;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	866	spin_lock(&vcpu->kvm->mmu_lock);
				867	if (mmu_notifier_retry(vcpu->kvm, mmu_seq))
				868	goto out_unlock;
				869
				870	kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
				871	if (make_mmu_pages_available(vcpu) < 0)
				872	goto out_unlock;
				873	if (!force_pt_level)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	874	transparent_hugepage_adjust(vcpu, walker.gfn, &pfn, &level);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	875	r = FNAME(fetch)(vcpu, addr, &walker, write_fault,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	876	level, pfn, map_writable, prefault, lpage_disallowed);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	877	kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	878
				879	out_unlock:
				880	spin_unlock(&vcpu->kvm->mmu_lock);
				881	kvm_release_pfn_clean(pfn);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	882	return r;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	883	}
				884
				885	static gpa_t FNAME(get_level1_sp_gpa)(struct kvm_mmu_page *sp)
				886	{
				887	int offset = 0;
				888
				889	WARN_ON(sp->role.level != PT_PAGE_TABLE_LEVEL);
				890
				891	if (PTTYPE == 32)
				892	offset = sp->role.quadrant << PT64_LEVEL_BITS;
				893
				894	return gfn_to_gpa(sp->gfn) + offset * sizeof(pt_element_t);
				895	}
				896
				897	static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva, hpa_t root_hpa)
				898	{
				899	struct kvm_shadow_walk_iterator iterator;
				900	struct kvm_mmu_page *sp;
				901	int level;
				902	u64 *sptep;
				903
				904	vcpu_clear_mmio_info(vcpu, gva);
				905
				906	/*
				907	* No need to check return value here, rmap_can_add() can
				908	* help us to skip pte prefetch later.
				909	*/
				910	mmu_topup_memory_caches(vcpu);
				911
				912	if (!VALID_PAGE(root_hpa)) {
				913	WARN_ON(1);
				914	return;
				915	}
				916
				917	spin_lock(&vcpu->kvm->mmu_lock);
				918	for_each_shadow_entry_using_root(vcpu, root_hpa, gva, iterator) {
				919	level = iterator.level;
				920	sptep = iterator.sptep;
				921
				922	sp = page_header(__pa(sptep));
				923	if (is_last_spte(*sptep, level)) {
				924	pt_element_t gpte;
				925	gpa_t pte_gpa;
				926
				927	if (!sp->unsync)
				928	break;
				929
				930	pte_gpa = FNAME(get_level1_sp_gpa)(sp);
				931	pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
				932
				933	if (mmu_page_zap_pte(vcpu->kvm, sp, sptep))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	934	kvm_flush_remote_tlbs_with_address(vcpu->kvm,
				935	sp->gfn, KVM_PAGES_PER_HPAGE(sp->role.level));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	936
				937	if (!rmap_can_add(vcpu))
				938	break;
				939
				940	if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
				941	sizeof(pt_element_t)))
				942	break;
				943
				944	FNAME(update_pte)(vcpu, sp, sptep, &gpte);
				945	}
				946
				947	if (!is_shadow_present_pte(*sptep) \|\| !sp->unsync_children)
				948	break;
				949	}
				950	spin_unlock(&vcpu->kvm->mmu_lock);
				951	}
				952
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	953	/* Note, @addr is a GPA when gva_to_gpa() translates an L2 GPA to an L1 GPA. */
				954	static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gpa_t addr, u32 access,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	955	struct x86_exception *exception)
				956	{
				957	struct guest_walker walker;
				958	gpa_t gpa = UNMAPPED_GVA;
				959	int r;
				960
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	961	r = FNAME(walk_addr)(&walker, vcpu, addr, access);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	962
				963	if (r) {
				964	gpa = gfn_to_gpa(walker.gfn);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	965	gpa \|= addr & ~PAGE_MASK;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	966	} else if (exception)
				967	*exception = walker.fault;
				968
				969	return gpa;
				970	}
				971
				972	#if PTTYPE != PTTYPE_EPT
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	973	/* Note, gva_to_gpa_nested() is only used to translate L2 GVAs. */
				974	static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gpa_t vaddr,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	975	u32 access,
				976	struct x86_exception *exception)
				977	{
				978	struct guest_walker walker;
				979	gpa_t gpa = UNMAPPED_GVA;
				980	int r;
				981
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	982	#ifndef CONFIG_X86_64
				983	/* A 64-bit GVA should be impossible on 32-bit KVM. */
				984	WARN_ON_ONCE(vaddr >> 32);
				985	#endif
				986
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	987	r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
				988
				989	if (r) {
				990	gpa = gfn_to_gpa(walker.gfn);
				991	gpa \|= vaddr & ~PAGE_MASK;
				992	} else if (exception)
				993	*exception = walker.fault;
				994
				995	return gpa;
				996	}
				997	#endif
				998
				999	/*
				1000	* Using the cached information from sp->gfns is safe because:
				1001	* - The spte has a reference to the struct page, so the pfn for a given gfn
				1002	* can't change unless all sptes pointing to it are nuked first.
				1003	*
				1004	* Note:
				1005	* We should flush all tlbs if spte is dropped even though guest is
				1006	* responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
				1007	* and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
				1008	* used by guest then tlbs are not flushed, so guest is allowed to access the
				1009	* freed pages.
				1010	* And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
				1011	*/
				1012	static int FNAME(sync_page)(struct kvm_vcpu vcpu, struct kvm_mmu_page sp)
				1013	{
				1014	int i, nr_present = 0;
				1015	bool host_writable;
				1016	gpa_t first_pte_gpa;
				1017	int set_spte_ret = 0;
				1018
				1019	/* direct kvm_mmu_page can not be unsync. */
				1020	BUG_ON(sp->role.direct);
				1021
				1022	first_pte_gpa = FNAME(get_level1_sp_gpa)(sp);
				1023
				1024	for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
				1025	unsigned pte_access;
				1026	pt_element_t gpte;
				1027	gpa_t pte_gpa;
				1028	gfn_t gfn;
				1029
				1030	if (!sp->spt[i])
				1031	continue;
				1032
				1033	pte_gpa = first_pte_gpa + i * sizeof(pt_element_t);
				1034
				1035	if (kvm_vcpu_read_guest_atomic(vcpu, pte_gpa, &gpte,
				1036	sizeof(pt_element_t)))
				1037	return 0;
				1038
				1039	if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
				1040	/*
				1041	* Update spte before increasing tlbs_dirty to make
				1042	* sure no tlb flush is lost after spte is zapped; see
				1043	* the comments in kvm_flush_remote_tlbs().
				1044	*/
				1045	smp_wmb();
				1046	vcpu->kvm->tlbs_dirty++;
				1047	continue;
				1048	}
				1049
				1050	gfn = gpte_to_gfn(gpte);
				1051	pte_access = sp->role.access;
				1052	pte_access &= FNAME(gpte_access)(gpte);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1053	FNAME(protect_clean_gpte)(vcpu->arch.mmu, &pte_access, gpte);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1054
				1055	if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access,
				1056	&nr_present))
				1057	continue;
				1058
				1059	if (gfn != sp->gfns[i]) {
				1060	drop_spte(vcpu->kvm, &sp->spt[i]);
				1061	/*
				1062	* The same as above where we are doing
				1063	* prefetch_invalid_gpte().
				1064	*/
				1065	smp_wmb();
				1066	vcpu->kvm->tlbs_dirty++;
				1067	continue;
				1068	}
				1069
				1070	nr_present++;
				1071
				1072	host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
				1073
				1074	set_spte_ret \|= set_spte(vcpu, &sp->spt[i],
				1075	pte_access, PT_PAGE_TABLE_LEVEL,
				1076	gfn, spte_to_pfn(sp->spt[i]),
				1077	true, false, host_writable);
				1078	}
				1079
				1080	if (set_spte_ret & SET_SPTE_NEED_REMOTE_TLB_FLUSH)
				1081	kvm_flush_remote_tlbs(vcpu->kvm);
				1082
				1083	return nr_present;
				1084	}
				1085
				1086	#undef pt_element_t
				1087	#undef guest_walker
				1088	#undef FNAME
				1089	#undef PT_BASE_ADDR_MASK
				1090	#undef PT_INDEX
				1091	#undef PT_LVL_ADDR_MASK
				1092	#undef PT_LVL_OFFSET_MASK
				1093	#undef PT_LEVEL_BITS
				1094	#undef PT_MAX_FULL_LEVELS
				1095	#undef gpte_to_gfn
				1096	#undef gpte_to_gfn_lvl
				1097	#undef CMPXCHG
				1098	#undef PT_GUEST_ACCESSED_MASK
				1099	#undef PT_GUEST_DIRTY_MASK
				1100	#undef PT_GUEST_DIRTY_SHIFT
				1101	#undef PT_GUEST_ACCESSED_SHIFT
				1102	#undef PT_HAVE_ACCESSED_DIRTY