Blame - arch/x86/mm/fault.c - hafnium/third_party/linux.git

blob: 9ceacd1156dbc06c801f86c66e9aa79d104a0cde [file] [log] [blame]

Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0
				2	/*
				3	* Copyright (C) 1995 Linus Torvalds
				4	* Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
				5	* Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
				6	*/
				7	#include <linux/sched.h> /* test_thread_flag(), ... */
				8	#include <linux/sched/task_stack.h> /* task_stack_*(), ... */
				9	#include <linux/kdebug.h> /* oops_begin/end, ... */
				10	#include <linux/extable.h> /* search_exception_tables */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	11	#include <linux/memblock.h> /* max_low_pfn */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	12	#include <linux/kprobes.h> /* NOKPROBE_SYMBOL, ... */
				13	#include <linux/mmiotrace.h> /* kmmio_handler, ... */
				14	#include <linux/perf_event.h> /* perf_sw_event */
				15	#include <linux/hugetlb.h> /* hstate_index_to_shift */
				16	#include <linux/prefetch.h> /* prefetchw */
				17	#include <linux/context_tracking.h> /* exception_enter(), ... */
				18	#include <linux/uaccess.h> /* faulthandler_disabled() */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	19	#include <linux/efi.h> /* efi_recover_from_page_fault()*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	20	#include <linux/mm_types.h>
				21
				22	#include <asm/cpufeature.h> /* boot_cpu_has, ... */
				23	#include <asm/traps.h> /* dotraplinkage, ... */
				24	#include <asm/pgalloc.h> /* pgd_*(), ... */
				25	#include <asm/fixmap.h> /* VSYSCALL_ADDR */
				26	#include <asm/vsyscall.h> /* emulate_vsyscall */
				27	#include <asm/vm86.h> /* struct vm86 */
				28	#include <asm/mmu_context.h> /* vma_pkey() */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	29	#include <asm/efi.h> /* efi_recover_from_page_fault()*/
				30	#include <asm/desc.h> /* store_idt(), ... */
				31	#include <asm/cpu_entry_area.h> /* exception stack */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	32
				33	#define CREATE_TRACE_POINTS
				34	#include <asm/trace/exceptions.h>
				35
				36	/*
				37	* Returns 0 if mmiotrace is disabled, or if the fault is not
				38	* handled by mmiotrace:
				39	*/
				40	static nokprobe_inline int
				41	kmmio_fault(struct pt_regs *regs, unsigned long addr)
				42	{
				43	if (unlikely(is_kmmio_active()))
				44	if (kmmio_handler(regs, addr) == 1)
				45	return -1;
				46	return 0;
				47	}
				48
/*
 * Prefetch quirks:
 *
 * 32-bit mode:
 *
 *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * 64-bit mode:
 *
 *   Sometimes the CPU reports invalid exceptions on prefetch.
 *   Check that here and ignore it.
 *
 * Opcode checker based on code by Richard Brunner.
 *
 * @regs:     register state of the faulting context (used to tell REX
 *            prefixes from ordinary opcodes on 64-bit)
 * @instr:    address of the byte following @opcode
 * @opcode:   the instruction byte being classified
 * @prefetch: set to non-zero when @opcode is 0x0F and the byte at @instr
 *            is 0x0D or 0x18, i.e. a prefetch instruction
 *
 * Returns non-zero when @opcode is an instruction prefix, so the caller
 * should keep scanning; returns 0 when scanning should stop.
 */
static inline int
check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
		      unsigned char opcode, int *prefetch)
{
	unsigned char instr_hi = opcode & 0xf0;
	unsigned char instr_lo = opcode & 0x0f;

	switch (instr_hi) {
	case 0x20:
	case 0x30:
		/*
		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
		 * In X86_64 long mode, the CPU will signal invalid
		 * opcode if some of these prefixes are present so
		 * X86_64 will never get here anyway
		 */
		return ((instr_lo & 7) == 0x6);
#ifdef CONFIG_X86_64
	case 0x40:
		/*
		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
		 * Need to figure out under what instruction mode the
		 * instruction was issued. Could check the LDT for lm,
		 * but for now it's good enough to assume that long
		 * mode only uses well known segments or kernel.
		 */
		return (!user_mode(regs) || user_64bit_mode(regs));
#endif
	case 0x60:
		/* 0x64 thru 0x67 are valid prefixes in all modes. */
		return (instr_lo & 0xC) == 0x4;
	case 0xF0:
		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
		return !instr_lo || (instr_lo>>1) == 1;
	case 0x00:
		/* Prefetch instruction is 0x0F0D or 0x0F18 */
		if (probe_kernel_address(instr, opcode))
			return 0;

		*prefetch = (instr_lo == 0xF) &&
			(opcode == 0x0D || opcode == 0x18);
		return 0;
	default:
		return 0;
	}
}
				110
				111	static int
				112	is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
				113	{
				114	unsigned char *max_instr;
				115	unsigned char *instr;
				116	int prefetch = 0;
				117
				118	/*
				119	* If it was a exec (instruction fetch) fault on NX page, then
				120	* do not ignore the fault:
				121	*/
				122	if (error_code & X86_PF_INSTR)
				123	return 0;
				124
				125	instr = (void *)convert_ip_to_linear(current, regs);
				126	max_instr = instr + 15;
				127
				128	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
				129	return 0;
				130
				131	while (instr < max_instr) {
				132	unsigned char opcode;
				133
				134	if (probe_kernel_address(instr, opcode))
				135	break;
				136
				137	instr++;
				138
				139	if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
				140	break;
				141	}
				142	return prefetch;
				143	}
				144
/*
 * pgd_lock is held while walking pgd_list (see vmalloc_sync_all(), which
 * iterates the list's pages through their ->lru linkage under this lock).
 */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	145	DEFINE_SPINLOCK(pgd_lock);
				146	LIST_HEAD(pgd_list);
				147
				148	#ifdef CONFIG_X86_32
				149	static inline pmd_t vmalloc_sync_one(pgd_t pgd, unsigned long address)
				150	{
				151	unsigned index = pgd_index(address);
				152	pgd_t *pgd_k;
				153	p4d_t p4d, p4d_k;
				154	pud_t pud, pud_k;
				155	pmd_t pmd, pmd_k;
				156
				157	pgd += index;
				158	pgd_k = init_mm.pgd + index;
				159
				160	if (!pgd_present(*pgd_k))
				161	return NULL;
				162
				163	/*
				164	* set_pgd(pgd, *pgd_k); here would be useless on PAE
				165	* and redundant with the set_pmd() on non-PAE. As would
				166	* set_p4d/set_pud.
				167	*/
				168	p4d = p4d_offset(pgd, address);
				169	p4d_k = p4d_offset(pgd_k, address);
				170	if (!p4d_present(*p4d_k))
				171	return NULL;
				172
				173	pud = pud_offset(p4d, address);
				174	pud_k = pud_offset(p4d_k, address);
				175	if (!pud_present(*pud_k))
				176	return NULL;
				177
				178	pmd = pmd_offset(pud, address);
				179	pmd_k = pmd_offset(pud_k, address);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	180
				181	if (pmd_present(pmd) != pmd_present(pmd_k))
				182	set_pmd(pmd, *pmd_k);
				183
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	184	if (!pmd_present(*pmd_k))
				185	return NULL;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	186	else
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	187	BUG_ON(pmd_pfn(pmd) != pmd_pfn(pmd_k));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	188
				189	return pmd_k;
				190	}
				191
				192	void vmalloc_sync_all(void)
				193	{
				194	unsigned long address;
				195
				196	if (SHARED_KERNEL_PMD)
				197	return;
				198
				199	for (address = VMALLOC_START & PMD_MASK;
				200	address >= TASK_SIZE_MAX && address < FIXADDR_TOP;
				201	address += PMD_SIZE) {
				202	struct page *page;
				203
				204	spin_lock(&pgd_lock);
				205	list_for_each_entry(page, &pgd_list, lru) {
				206	spinlock_t *pgt_lock;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	207
				208	/* the pgt_lock only for Xen */
				209	pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
				210
				211	spin_lock(pgt_lock);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	212	vmalloc_sync_one(page_address(page), address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	213	spin_unlock(pgt_lock);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	214	}
				215	spin_unlock(&pgd_lock);
				216	}
				217	}
				218
				219	/*
				220	* 32-bit:
				221	*
				222	* Handle a fault on the vmalloc or module mapping area
				223	*/
				224	static noinline int vmalloc_fault(unsigned long address)
				225	{
				226	unsigned long pgd_paddr;
				227	pmd_t *pmd_k;
				228	pte_t *pte_k;
				229
				230	/* Make sure we are in vmalloc area: */
				231	if (!(address >= VMALLOC_START && address < VMALLOC_END))
				232	return -1;
				233
				234	/*
				235	* Synchronize this task's top level page-table
				236	* with the 'reference' page table.
				237	*
				238	* Do _not_ use "current" here. We might be inside
				239	* an interrupt in the middle of a task switch..
				240	*/
				241	pgd_paddr = read_cr3_pa();
				242	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
				243	if (!pmd_k)
				244	return -1;
				245
				246	if (pmd_large(*pmd_k))
				247	return 0;
				248
				249	pte_k = pte_offset_kernel(pmd_k, address);
				250	if (!pte_present(*pte_k))
				251	return -1;
				252
				253	return 0;
				254	}
				255	NOKPROBE_SYMBOL(vmalloc_fault);
				256
				257	/*
				258	* Did it hit the DOS screen memory VA from vm86 mode?
				259	*/
				260	static inline void
				261	check_v8086_mode(struct pt_regs *regs, unsigned long address,
				262	struct task_struct *tsk)
				263	{
				264	#ifdef CONFIG_VM86
				265	unsigned long bit;
				266
				267	if (!v8086_mode(regs) \|\| !tsk->thread.vm86)
				268	return;
				269
				270	bit = (address - 0xA0000) >> PAGE_SHIFT;
				271	if (bit < 32)
				272	tsk->thread.vm86->screen_bitmap \|= 1 << bit;
				273	#endif
				274	}
				275
				276	static bool low_pfn(unsigned long pfn)
				277	{
				278	return pfn < max_low_pfn;
				279	}
				280
				281	static void dump_pagetable(unsigned long address)
				282	{
				283	pgd_t *base = __va(read_cr3_pa());
				284	pgd_t *pgd = &base[pgd_index(address)];
				285	p4d_t *p4d;
				286	pud_t *pud;
				287	pmd_t *pmd;
				288	pte_t *pte;
				289
				290	#ifdef CONFIG_X86_PAE
				291	pr_info("pdpt = %016Lx ", pgd_val(pgd));
				292	if (!low_pfn(pgd_val(pgd) >> PAGE_SHIFT) \|\| !pgd_present(pgd))
				293	goto out;
				294	#define pr_pde pr_cont
				295	#else
				296	#define pr_pde pr_info
				297	#endif
				298	p4d = p4d_offset(pgd, address);
				299	pud = pud_offset(p4d, address);
				300	pmd = pmd_offset(pud, address);
				301	pr_pde("pde = %0Lx ", sizeof(pmd) 2, (u64)pmd_val(*pmd));
				302	#undef pr_pde
				303
				304	/*
				305	* We must not directly access the pte in the highpte
				306	* case if the page table is located in highmem.
				307	* And let's rather not kmap-atomic the pte, just in case
				308	* it's allocated already:
				309	*/
				310	if (!low_pfn(pmd_pfn(pmd)) \|\| !pmd_present(pmd) \|\| pmd_large(*pmd))
				311	goto out;
				312
				313	pte = pte_offset_kernel(pmd, address);
				314	pr_cont("pte = %0Lx ", sizeof(pte) 2, (u64)pte_val(*pte));
				315	out:
				316	pr_cont("\n");
				317	}
				318
				319	#else /* CONFIG_X86_64: */
				320
				321	void vmalloc_sync_all(void)
				322	{
				323	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
				324	}
				325
				326	/*
				327	* 64-bit:
				328	*
				329	* Handle a fault on the vmalloc area
				330	*/
				331	static noinline int vmalloc_fault(unsigned long address)
				332	{
				333	pgd_t pgd, pgd_k;
				334	p4d_t p4d, p4d_k;
				335	pud_t *pud;
				336	pmd_t *pmd;
				337	pte_t *pte;
				338
				339	/* Make sure we are in vmalloc area: */
				340	if (!(address >= VMALLOC_START && address < VMALLOC_END))
				341	return -1;
				342
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	343	/*
				344	* Copy kernel mappings over when needed. This can also
				345	* happen within a race in page table update. In the later
				346	* case just flush:
				347	*/
				348	pgd = (pgd_t *)__va(read_cr3_pa()) + pgd_index(address);
				349	pgd_k = pgd_offset_k(address);
				350	if (pgd_none(*pgd_k))
				351	return -1;
				352
				353	if (pgtable_l5_enabled()) {
				354	if (pgd_none(*pgd)) {
				355	set_pgd(pgd, *pgd_k);
				356	arch_flush_lazy_mmu_mode();
				357	} else {
				358	BUG_ON(pgd_page_vaddr(pgd) != pgd_page_vaddr(pgd_k));
				359	}
				360	}
				361
				362	/* With 4-level paging, copying happens on the p4d level. */
				363	p4d = p4d_offset(pgd, address);
				364	p4d_k = p4d_offset(pgd_k, address);
				365	if (p4d_none(*p4d_k))
				366	return -1;
				367
				368	if (p4d_none(*p4d) && !pgtable_l5_enabled()) {
				369	set_p4d(p4d, *p4d_k);
				370	arch_flush_lazy_mmu_mode();
				371	} else {
				372	BUG_ON(p4d_pfn(p4d) != p4d_pfn(p4d_k));
				373	}
				374
				375	BUILD_BUG_ON(CONFIG_PGTABLE_LEVELS < 4);
				376
				377	pud = pud_offset(p4d, address);
				378	if (pud_none(*pud))
				379	return -1;
				380
				381	if (pud_large(*pud))
				382	return 0;
				383
				384	pmd = pmd_offset(pud, address);
				385	if (pmd_none(*pmd))
				386	return -1;
				387
				388	if (pmd_large(*pmd))
				389	return 0;
				390
				391	pte = pte_offset_kernel(pmd, address);
				392	if (!pte_present(*pte))
				393	return -1;
				394
				395	return 0;
				396	}
				397	NOKPROBE_SYMBOL(vmalloc_fault);
				398
				399	#ifdef CONFIG_CPU_SUP_AMD
/*
 * Message printed once (via printk_once() in is_errata93()) when the
 * K8 erratum #93 workaround triggers.  The KERN_ERR log level is part
 * of the string itself.
 */
				400	static const char errata93_warning[] =
				401	KERN_ERR
				402	"******* Your BIOS seems to not contain a fix for K8 errata #93\n"
				403	"******* Working around it, but it may cause SEGVs or burn power.\n"
				404	"******* Please consider a BIOS update.\n"
				405	"******* Disabling USB legacy in the BIOS may also help.\n";
				406	#endif
				407
				408	/*
				409	* No vm86 mode in 64-bit mode:
				410	*/
				411	static inline void
				412	check_v8086_mode(struct pt_regs *regs, unsigned long address,
				413	struct task_struct *tsk)
				414	{
				415	}
				416
				417	static int bad_address(void *p)
				418	{
				419	unsigned long dummy;
				420
				421	return probe_kernel_address((unsigned long *)p, dummy);
				422	}
				423
				424	static void dump_pagetable(unsigned long address)
				425	{
				426	pgd_t *base = __va(read_cr3_pa());
				427	pgd_t *pgd = base + pgd_index(address);
				428	p4d_t *p4d;
				429	pud_t *pud;
				430	pmd_t *pmd;
				431	pte_t *pte;
				432
				433	if (bad_address(pgd))
				434	goto bad;
				435
				436	pr_info("PGD %lx ", pgd_val(*pgd));
				437
				438	if (!pgd_present(*pgd))
				439	goto out;
				440
				441	p4d = p4d_offset(pgd, address);
				442	if (bad_address(p4d))
				443	goto bad;
				444
				445	pr_cont("P4D %lx ", p4d_val(*p4d));
				446	if (!p4d_present(p4d) \|\| p4d_large(p4d))
				447	goto out;
				448
				449	pud = pud_offset(p4d, address);
				450	if (bad_address(pud))
				451	goto bad;
				452
				453	pr_cont("PUD %lx ", pud_val(*pud));
				454	if (!pud_present(pud) \|\| pud_large(pud))
				455	goto out;
				456
				457	pmd = pmd_offset(pud, address);
				458	if (bad_address(pmd))
				459	goto bad;
				460
				461	pr_cont("PMD %lx ", pmd_val(*pmd));
				462	if (!pmd_present(pmd) \|\| pmd_large(pmd))
				463	goto out;
				464
				465	pte = pte_offset_kernel(pmd, address);
				466	if (bad_address(pte))
				467	goto bad;
				468
				469	pr_cont("PTE %lx", pte_val(*pte));
				470	out:
				471	pr_cont("\n");
				472	return;
				473	bad:
				474	pr_info("BAD\n");
				475	}
				476
				477	#endif /* CONFIG_X86_64 */
				478
				479	/*
				480	* Workaround for K8 erratum #93 & buggy BIOS.
				481	*
				482	* BIOS SMM functions are required to use a specific workaround
				483	* to avoid corruption of the 64bit RIP register on C stepping K8.
				484	*
				485	* A lot of BIOS that didn't get tested properly miss this.
				486	*
				487	* The OS sees this as a page fault with the upper 32bits of RIP cleared.
				488	* Try to work around it here.
				489	*
				490	* Note we only handle faults in kernel here.
				491	* Does nothing on 32-bit.
				492	*/
				493	static int is_errata93(struct pt_regs *regs, unsigned long address)
				494	{
				495	#if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
				496	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
				497	\|\| boot_cpu_data.x86 != 0xf)
				498	return 0;
				499
				500	if (address != regs->ip)
				501	return 0;
				502
				503	if ((address >> 32) != 0)
				504	return 0;
				505
				506	address \|= 0xffffffffUL << 32;
				507	if ((address >= (u64)_stext && address <= (u64)_etext) \|\|
				508	(address >= MODULES_VADDR && address <= MODULES_END)) {
				509	printk_once(errata93_warning);
				510	regs->ip = address;
				511	return 1;
				512	}
				513	#endif
				514	return 0;
				515	}
				516
				517	/*
				518	* Work around K8 erratum #100 K8 in compat mode occasionally jumps
				519	* to illegal addresses >4GB.
				520	*
				521	* We catch this in the page fault handler because these addresses
				522	* are not reachable. Just detect this case and return. Any code
				523	* segment in LDT is compatibility mode.
				524	*/
				525	static int is_errata100(struct pt_regs *regs, unsigned long address)
				526	{
				527	#ifdef CONFIG_X86_64
				528	if ((regs->cs == __USER32_CS \|\| (regs->cs & (1<<2))) && (address >> 32))
				529	return 1;
				530	#endif
				531	return 0;
				532	}
				533
				534	static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
				535	{
				536	#ifdef CONFIG_X86_F00F_BUG
				537	unsigned long nr;
				538
				539	/*
				540	* Pentium F0 0F C7 C8 bug workaround:
				541	*/
				542	if (boot_cpu_has_bug(X86_BUG_F00F)) {
				543	nr = (address - idt_descr.address) >> 3;
				544
				545	if (nr == 6) {
				546	do_invalid_op(regs, 0);
				547	return 1;
				548	}
				549	}
				550	#endif
				551	return 0;
				552	}
				553
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	554	static void show_ldttss(const struct desc_ptr gdt, const char name, u16 index)
				555	{
				556	u32 offset = (index >> 3) * sizeof(struct desc_struct);
				557	unsigned long addr;
				558	struct ldttss_desc desc;
				559
				560	if (index == 0) {
				561	pr_alert("%s: NULL\n", name);
				562	return;
				563	}
				564
				565	if (offset + sizeof(struct ldttss_desc) >= gdt->size) {
				566	pr_alert("%s: 0x%hx -- out of bounds\n", name, index);
				567	return;
				568	}
				569
				570	if (probe_kernel_read(&desc, (void *)(gdt->address + offset),
				571	sizeof(struct ldttss_desc))) {
				572	pr_alert("%s: 0x%hx -- GDT entry is not readable\n",
				573	name, index);
				574	return;
				575	}
				576
				577	addr = desc.base0 \| (desc.base1 << 16) \| ((unsigned long)desc.base2 << 24);
				578	#ifdef CONFIG_X86_64
				579	addr \|= ((u64)desc.base3 << 32);
				580	#endif
				581	pr_alert("%s: 0x%hx -- base=0x%lx limit=0x%x\n",
				582	name, index, addr, (desc.limit0 \| (desc.limit1 << 16)));
				583	}
				584
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	585	static void
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	586	show_fault_oops(struct pt_regs *regs, unsigned long error_code, unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	587	{
				588	if (!oops_may_print())
				589	return;
				590
				591	if (error_code & X86_PF_INSTR) {
				592	unsigned int level;
				593	pgd_t *pgd;
				594	pte_t *pte;
				595
				596	pgd = __va(read_cr3_pa());
				597	pgd += pgd_index(address);
				598
				599	pte = lookup_address_in_pgd(pgd, address, &level);
				600
				601	if (pte && pte_present(pte) && !pte_exec(pte))
				602	pr_crit("kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n",
				603	from_kuid(&init_user_ns, current_uid()));
				604	if (pte && pte_present(pte) && pte_exec(pte) &&
				605	(pgd_flags(*pgd) & _PAGE_USER) &&
				606	(__read_cr4() & X86_CR4_SMEP))
				607	pr_crit("unable to execute userspace code (SMEP?) (uid: %d)\n",
				608	from_kuid(&init_user_ns, current_uid()));
				609	}
				610
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	611	if (address < PAGE_SIZE && !user_mode(regs))
				612	pr_alert("BUG: kernel NULL pointer dereference, address: %px\n",
				613	(void *)address);
				614	else
				615	pr_alert("BUG: unable to handle page fault for address: %px\n",
				616	(void *)address);
				617
				618	pr_alert("#PF: %s %s in %s mode\n",
				619	(error_code & X86_PF_USER) ? "user" : "supervisor",
				620	(error_code & X86_PF_INSTR) ? "instruction fetch" :
				621	(error_code & X86_PF_WRITE) ? "write access" :
				622	"read access",
				623	user_mode(regs) ? "user" : "kernel");
				624	pr_alert("#PF: error_code(0x%04lx) - %s\n", error_code,
				625	!(error_code & X86_PF_PROT) ? "not-present page" :
				626	(error_code & X86_PF_RSVD) ? "reserved bit violation" :
				627	(error_code & X86_PF_PK) ? "protection keys violation" :
				628	"permissions violation");
				629
				630	if (!(error_code & X86_PF_USER) && user_mode(regs)) {
				631	struct desc_ptr idt, gdt;
				632	u16 ldtr, tr;
				633
				634	/*
				635	* This can happen for quite a few reasons. The more obvious
				636	* ones are faults accessing the GDT, or LDT. Perhaps
				637	* surprisingly, if the CPU tries to deliver a benign or
				638	* contributory exception from user code and gets a page fault
				639	* during delivery, the page fault can be delivered as though
				640	* it originated directly from user code. This could happen
				641	* due to wrong permissions on the IDT, GDT, LDT, TSS, or
				642	* kernel or IST stack.
				643	*/
				644	store_idt(&idt);
				645
				646	/* Usable even on Xen PV -- it's just slow. */
				647	native_store_gdt(&gdt);
				648
				649	pr_alert("IDT: 0x%lx (limit=0x%hx) GDT: 0x%lx (limit=0x%hx)\n",
				650	idt.address, idt.size, gdt.address, gdt.size);
				651
				652	store_ldt(ldtr);
				653	show_ldttss(&gdt, "LDTR", ldtr);
				654
				655	store_tr(tr);
				656	show_ldttss(&gdt, "TR", tr);
				657	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	658
				659	dump_pagetable(address);
				660	}
				661
				662	static noinline void
				663	pgtable_bad(struct pt_regs *regs, unsigned long error_code,
				664	unsigned long address)
				665	{
				666	struct task_struct *tsk;
				667	unsigned long flags;
				668	int sig;
				669
				670	flags = oops_begin();
				671	tsk = current;
				672	sig = SIGKILL;
				673
				674	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
				675	tsk->comm, address);
				676	dump_pagetable(address);
				677
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	678	if (__die("Bad pagetable", regs, error_code))
				679	sig = 0;
				680
				681	oops_end(flags, regs, sig);
				682	}
				683
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	684	static void set_signal_archinfo(unsigned long address,
				685	unsigned long error_code)
				686	{
				687	struct task_struct *tsk = current;
				688
				689	/*
				690	* To avoid leaking information about the kernel page
				691	* table layout, pretend that user-mode accesses to
				692	* kernel addresses are always protection faults.
				693	*
				694	* NB: This means that failed vsyscalls with vsyscall=none
				695	* will have the PROT bit. This doesn't leak any
				696	* information and does not appear to cause any problems.
				697	*/
				698	if (address >= TASK_SIZE_MAX)
				699	error_code \|= X86_PF_PROT;
				700
				701	tsk->thread.trap_nr = X86_TRAP_PF;
				702	tsk->thread.error_code = error_code \| X86_PF_USER;
				703	tsk->thread.cr2 = address;
				704	}
				705
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	706	static noinline void
				707	no_context(struct pt_regs *regs, unsigned long error_code,
				708	unsigned long address, int signal, int si_code)
				709	{
				710	struct task_struct *tsk = current;
				711	unsigned long flags;
				712	int sig;
				713
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	714	if (user_mode(regs)) {
				715	/*
				716	* This is an implicit supervisor-mode access from user
				717	* mode. Bypass all the kernel-mode recovery code and just
				718	* OOPS.
				719	*/
				720	goto oops;
				721	}
				722
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	723	/* Are we prepared to handle this kernel fault? */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	724	if (fixup_exception(regs, X86_TRAP_PF, error_code, address)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	725	/*
				726	* Any interrupt that takes a fault gets the fixup. This makes
				727	* the below recursive fault logic only apply to a faults from
				728	* task context.
				729	*/
				730	if (in_interrupt())
				731	return;
				732
				733	/*
				734	* Per the above we're !in_interrupt(), aka. task context.
				735	*
				736	* In this case we need to make sure we're not recursively
				737	* faulting through the emulate_vsyscall() logic.
				738	*/
				739	if (current->thread.sig_on_uaccess_err && signal) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	740	set_signal_archinfo(address, error_code);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	741
				742	/* XXX: hwpoison faults will set the wrong code. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	743	force_sig_fault(signal, si_code, (void __user *)address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	744	}
				745
				746	/*
				747	* Barring that, we can do the fixup and be happy.
				748	*/
				749	return;
				750	}
				751
				752	#ifdef CONFIG_VMAP_STACK
				753	/*
				754	* Stack overflow? During boot, we can fault near the initial
				755	* stack in the direct map, but that's not an overflow -- check
				756	* that we're in vmalloc space to avoid this.
				757	*/
				758	if (is_vmalloc_addr((void *)address) &&
				759	(((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) \|\|
				760	address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	761	unsigned long stack = __this_cpu_ist_top_va(DF) - sizeof(void *);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	762	/*
				763	* We're likely to be running with very little stack space
				764	* left. It's plausible that we'd hit this condition but
				765	* double-fault even before we get this far, in which case
				766	* we're fine: the double-fault handler will deal with it.
				767	*
				768	* We don't want to make it all the way into the oops code
				769	* and then double-fault, though, because we're likely to
				770	* break the console driver and lose most of the stack dump.
				771	*/
				772	asm volatile ("movq %[stack], %%rsp\n\t"
				773	"call handle_stack_overflow\n\t"
				774	"1: jmp 1b"
				775	: ASM_CALL_CONSTRAINT
				776	: "D" ("kernel stack overflow (page fault)"),
				777	"S" (regs), "d" (address),
				778	[stack] "rm" (stack));
				779	unreachable();
				780	}
				781	#endif
				782
				783	/*
				784	* 32-bit:
				785	*
				786	* Valid to do another page fault here, because if this fault
				787	* had been triggered by is_prefetch fixup_exception would have
				788	* handled it.
				789	*
				790	* 64-bit:
				791	*
				792	* Hall of shame of CPU/BIOS bugs.
				793	*/
				794	if (is_prefetch(regs, error_code, address))
				795	return;
				796
				797	if (is_errata93(regs, address))
				798	return;
				799
				800	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	801	* Buggy firmware could access regions which might page fault, try to
				802	* recover from such faults.
				803	*/
				804	if (IS_ENABLED(CONFIG_EFI))
				805	efi_recover_from_page_fault(address);
				806
				807	oops:
				808	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	809	* Oops. The kernel tried to access some bad page. We'll have to
				810	* terminate things with extreme prejudice:
				811	*/
				812	flags = oops_begin();
				813
				814	show_fault_oops(regs, error_code, address);
				815
				816	if (task_stack_end_corrupted(tsk))
				817	printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
				818
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	819	sig = SIGKILL;
				820	if (__die("Oops", regs, error_code))
				821	sig = 0;
				822
				823	/* Executive summary in case the body of the oops scrolled away */
				824	printk(KERN_DEFAULT "CR2: %016lx\n", address);
				825
				826	oops_end(flags, regs, sig);
				827	}
				828
				829	/*
				830	* Print out info about fatal segfaults, if the show_unhandled_signals
				831	* sysctl is set:
				832	*/
				833	static inline void
				834	show_signal_msg(struct pt_regs *regs, unsigned long error_code,
				835	unsigned long address, struct task_struct *tsk)
				836	{
				837	const char *loglvl = task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG;
				838
				839	if (!unhandled_signal(tsk, SIGSEGV))
				840	return;
				841
				842	if (!printk_ratelimit())
				843	return;
				844
				845	printk("%s%s[%d]: segfault at %lx ip %px sp %px error %lx",
				846	loglvl, tsk->comm, task_pid_nr(tsk), address,
				847	(void )regs->ip, (void )regs->sp, error_code);
				848
				849	print_vma_addr(KERN_CONT " in ", regs->ip);
				850
				851	printk(KERN_CONT "\n");
				852
				853	show_opcodes(regs, loglvl);
				854	}
				855
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	856	/*
				857	* The (legacy) vsyscall page is the long page in the kernel portion
				858	* of the address space that has user-accessible permissions.
				859	*/
				860	static bool is_vsyscall_vaddr(unsigned long vaddr)
				861	{
				862	return unlikely((vaddr & PAGE_MASK) == VSYSCALL_ADDR);
				863	}
				864
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	865	static void
				866	__bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	867	unsigned long address, u32 pkey, int si_code)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	868	{
				869	struct task_struct *tsk = current;
				870
				871	/* User mode accesses just cause a SIGSEGV */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	872	if (user_mode(regs) && (error_code & X86_PF_USER)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	873	/*
				874	* It's possible to have interrupts off here:
				875	*/
				876	local_irq_enable();
				877
				878	/*
				879	* Valid to do another page fault here because this one came
				880	* from user space:
				881	*/
				882	if (is_prefetch(regs, error_code, address))
				883	return;
				884
				885	if (is_errata100(regs, address))
				886	return;
				887
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	888	/*
				889	* To avoid leaking information about the kernel page table
				890	* layout, pretend that user-mode accesses to kernel addresses
				891	* are always protection faults.
				892	*/
				893	if (address >= TASK_SIZE_MAX)
				894	error_code \|= X86_PF_PROT;
				895
				896	if (likely(show_unhandled_signals))
				897	show_signal_msg(regs, error_code, address, tsk);
				898
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	899	set_signal_archinfo(address, error_code);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	900
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	901	if (si_code == SEGV_PKUERR)
				902	force_sig_pkuerr((void __user *)address, pkey);
				903
				904	force_sig_fault(SIGSEGV, si_code, (void __user *)address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	905
				906	return;
				907	}
				908
				909	if (is_f00f_bug(regs, address))
				910	return;
				911
				912	no_context(regs, error_code, address, SIGSEGV, si_code);
				913	}
				914
				915	static noinline void
				916	bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	917	unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	918	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	919	__bad_area_nosemaphore(regs, error_code, address, 0, SEGV_MAPERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	920	}
				921
				922	static void
				923	__bad_area(struct pt_regs *regs, unsigned long error_code,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	924	unsigned long address, u32 pkey, int si_code)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	925	{
				926	struct mm_struct *mm = current->mm;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	927	/*
				928	* Something tried to access memory that isn't in our memory map..
				929	* Fix it, but check if it's kernel or user first..
				930	*/
				931	up_read(&mm->mmap_sem);
				932
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	933	__bad_area_nosemaphore(regs, error_code, address, pkey, si_code);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	934	}
				935
				936	static noinline void
				937	bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
				938	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	939	__bad_area(regs, error_code, address, 0, SEGV_MAPERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	940	}
				941
				942	static inline bool bad_area_access_from_pkeys(unsigned long error_code,
				943	struct vm_area_struct *vma)
				944	{
				945	/* This code is always called on the current mm */
				946	bool foreign = false;
				947
				948	if (!boot_cpu_has(X86_FEATURE_OSPKE))
				949	return false;
				950	if (error_code & X86_PF_PK)
				951	return true;
				952	/* this checks permission keys on the VMA: */
				953	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				954	(error_code & X86_PF_INSTR), foreign))
				955	return true;
				956	return false;
				957	}
				958
				959	static noinline void
				960	bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
				961	unsigned long address, struct vm_area_struct *vma)
				962	{
				963	/*
				964	* This OSPKE check is not strictly necessary at runtime.
				965	* But, doing it this way allows compiler optimizations
				966	* if pkeys are compiled out.
				967	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	968	if (bad_area_access_from_pkeys(error_code, vma)) {
				969	/*
				970	* A protection key fault means that the PKRU value did not allow
				971	* access to some PTE. Userspace can figure out what PKRU was
				972	* from the XSAVE state. This function captures the pkey from
				973	* the vma and passes it to userspace so userspace can discover
				974	* which protection key was set on the PTE.
				975	*
				976	* If we get here, we know that the hardware signaled a X86_PF_PK
				977	* fault and that there was a VMA once we got in the fault
				978	* handler. It does not guarantee that the VMA we find here
				979	* was the one that we faulted on.
				980	*
				981	* 1. T1 : mprotect_key(foo, PAGE_SIZE, pkey=4);
				982	* 2. T1 : set PKRU to deny access to pkey=4, touches page
				983	* 3. T1 : faults...
				984	* 4. T2: mprotect_key(foo, PAGE_SIZE, pkey=5);
				985	* 5. T1 : enters fault handler, takes mmap_sem, etc...
				986	* 6. T1 : reaches here, sees vma_pkey(vma)=5, when we really
				987	* faulted on a pte with its pkey=4.
				988	*/
				989	u32 pkey = vma_pkey(vma);
				990
				991	__bad_area(regs, error_code, address, pkey, SEGV_PKUERR);
				992	} else {
				993	__bad_area(regs, error_code, address, 0, SEGV_ACCERR);
				994	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	995	}
				996
				997	static void
				998	do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	999	vm_fault_t fault)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1000	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1001	/* Kernel mode? Handle exceptions or die: */
				1002	if (!(error_code & X86_PF_USER)) {
				1003	no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
				1004	return;
				1005	}
				1006
				1007	/* User-space => ok to do another page fault: */
				1008	if (is_prefetch(regs, error_code, address))
				1009	return;
				1010
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1011	set_signal_archinfo(address, error_code);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1012
				1013	#ifdef CONFIG_MEMORY_FAILURE
				1014	if (fault & (VM_FAULT_HWPOISON\|VM_FAULT_HWPOISON_LARGE)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1015	struct task_struct *tsk = current;
				1016	unsigned lsb = 0;
				1017
				1018	pr_err(
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1019	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
				1020	tsk->comm, tsk->pid, address);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1021	if (fault & VM_FAULT_HWPOISON_LARGE)
				1022	lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
				1023	if (fault & VM_FAULT_HWPOISON)
				1024	lsb = PAGE_SHIFT;
				1025	force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
				1026	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1027	}
				1028	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1029	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1030	}
				1031
				1032	static noinline void
				1033	mm_fault_error(struct pt_regs *regs, unsigned long error_code,
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1034	unsigned long address, vm_fault_t fault)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1035	{
				1036	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
				1037	no_context(regs, error_code, address, 0, 0);
				1038	return;
				1039	}
				1040
				1041	if (fault & VM_FAULT_OOM) {
				1042	/* Kernel mode? Handle exceptions or die: */
				1043	if (!(error_code & X86_PF_USER)) {
				1044	no_context(regs, error_code, address,
				1045	SIGSEGV, SEGV_MAPERR);
				1046	return;
				1047	}
				1048
				1049	/*
				1050	* We ran out of memory, call the OOM killer, and return the
				1051	* userspace (which will retry the fault, or kill us if we got
				1052	* oom-killed):
				1053	*/
				1054	pagefault_out_of_memory();
				1055	} else {
				1056	if (fault & (VM_FAULT_SIGBUS\|VM_FAULT_HWPOISON\|
				1057	VM_FAULT_HWPOISON_LARGE))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1058	do_sigbus(regs, error_code, address, fault);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1059	else if (fault & VM_FAULT_SIGSEGV)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1060	bad_area_nosemaphore(regs, error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1061	else
				1062	BUG();
				1063	}
				1064	}
				1065
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1066	static int spurious_kernel_fault_check(unsigned long error_code, pte_t *pte)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1067	{
				1068	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
				1069	return 0;
				1070
				1071	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
				1072	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1073
				1074	return 1;
				1075	}
				1076
				1077	/*
				1078	* Handle a spurious fault caused by a stale TLB entry.
				1079	*
				1080	* This allows us to lazily refresh the TLB when increasing the
				1081	* permissions of a kernel page (RO -> RW or NX -> X). Doing it
				1082	* eagerly is very expensive since that implies doing a full
				1083	* cross-processor TLB flush, even if no stale TLB entries exist
				1084	* on other processors.
				1085	*
				1086	* Spurious faults may only occur if the TLB contains an entry with
				1087	* fewer permission than the page table entry. Non-present (P = 0)
				1088	* and reserved bit (R = 1) faults are never spurious.
				1089	*
				1090	* There are no security implications to leaving a stale TLB when
				1091	* increasing the permissions on a page.
				1092	*
				1093	* Returns non-zero if a spurious fault was handled, zero otherwise.
				1094	*
				1095	* See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
				1096	* (Optional Invalidation).
				1097	*/
				1098	static noinline int
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1099	spurious_kernel_fault(unsigned long error_code, unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1100	{
				1101	pgd_t *pgd;
				1102	p4d_t *p4d;
				1103	pud_t *pud;
				1104	pmd_t *pmd;
				1105	pte_t *pte;
				1106	int ret;
				1107
				1108	/*
				1109	* Only writes to RO or instruction fetches from NX may cause
				1110	* spurious faults.
				1111	*
				1112	* These could be from user or supervisor accesses but the TLB
				1113	* is only lazily flushed after a kernel mapping protection
				1114	* change, so user accesses are not expected to cause spurious
				1115	* faults.
				1116	*/
				1117	if (error_code != (X86_PF_WRITE \| X86_PF_PROT) &&
				1118	error_code != (X86_PF_INSTR \| X86_PF_PROT))
				1119	return 0;
				1120
				1121	pgd = init_mm.pgd + pgd_index(address);
				1122	if (!pgd_present(*pgd))
				1123	return 0;
				1124
				1125	p4d = p4d_offset(pgd, address);
				1126	if (!p4d_present(*p4d))
				1127	return 0;
				1128
				1129	if (p4d_large(*p4d))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1130	return spurious_kernel_fault_check(error_code, (pte_t *) p4d);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1131
				1132	pud = pud_offset(p4d, address);
				1133	if (!pud_present(*pud))
				1134	return 0;
				1135
				1136	if (pud_large(*pud))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1137	return spurious_kernel_fault_check(error_code, (pte_t *) pud);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1138
				1139	pmd = pmd_offset(pud, address);
				1140	if (!pmd_present(*pmd))
				1141	return 0;
				1142
				1143	if (pmd_large(*pmd))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1144	return spurious_kernel_fault_check(error_code, (pte_t *) pmd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1145
				1146	pte = pte_offset_kernel(pmd, address);
				1147	if (!pte_present(*pte))
				1148	return 0;
				1149
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1150	ret = spurious_kernel_fault_check(error_code, pte);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1151	if (!ret)
				1152	return 0;
				1153
				1154	/*
				1155	* Make sure we have permissions in PMD.
				1156	* If not, then there's a bug in the page tables:
				1157	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1158	ret = spurious_kernel_fault_check(error_code, (pte_t *) pmd);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1159	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
				1160
				1161	return ret;
				1162	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1163	NOKPROBE_SYMBOL(spurious_kernel_fault);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1164
				/* When non-zero, the user-mode bad-area path calls show_signal_msg() before forcing the signal. */
				1165	int show_unhandled_signals = 1;
				1166
				1167	static inline int
				1168	access_error(unsigned long error_code, struct vm_area_struct *vma)
				1169	{
				1170	/* This is only called for the current mm, so: */
				1171	bool foreign = false;
				1172
				1173	/*
				1174	* Read or write was blocked by protection keys. This is
				1175	* always an unconditional error and can never result in
				1176	* a follow-up action to resolve the fault, like a COW.
				1177	*/
				1178	if (error_code & X86_PF_PK)
				1179	return 1;
				1180
				1181	/*
				1182	* Make sure to check the VMA so that we do not perform
				1183	* faults just to hit a X86_PF_PK as soon as we fill in a
				1184	* page.
				1185	*/
				1186	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
				1187	(error_code & X86_PF_INSTR), foreign))
				1188	return 1;
				1189
				1190	if (error_code & X86_PF_WRITE) {
				1191	/* write, present and write, not present: */
				1192	if (unlikely(!(vma->vm_flags & VM_WRITE)))
				1193	return 1;
				1194	return 0;
				1195	}
				1196
				1197	/* read, present: */
				1198	if (unlikely(error_code & X86_PF_PROT))
				1199	return 1;
				1200
				1201	/* read, not present: */
				1202	if (unlikely(!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE))))
				1203	return 1;
				1204
				1205	return 0;
				1206	}
				1207
				1208	static int fault_in_kernel_space(unsigned long address)
				1209	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1210	/*
				1211	* On 64-bit systems, the vsyscall page is at an address above
				1212	* TASK_SIZE_MAX, but is not considered part of the kernel
				1213	* address space.
				1214	*/
				1215	if (IS_ENABLED(CONFIG_X86_64) && is_vsyscall_vaddr(address))
				1216	return false;
				1217
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1218	return address >= TASK_SIZE_MAX;
				1219	}
				1220
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1221	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1222	* Called for all faults where 'address' is part of the kernel address
				1223	* space. Might get called for faults that originate from code that
				1224	* ran in userspace or the kernel.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1225	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1226	static void
				1227	do_kern_addr_fault(struct pt_regs *regs, unsigned long hw_error_code,
				1228	unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1229	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1230	/*
				1231	* Protection keys exceptions only happen on user pages. We
				1232	* have no user pages in the kernel portion of the address
				1233	* space, so do not expect them here.
				1234	*/
				1235	WARN_ON_ONCE(hw_error_code & X86_PF_PK);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1236
				1237	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1238	* We can fault-in kernel-space virtual memory on-demand. The
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1239	* 'reference' page table is init_mm.pgd.
				1240	*
				1241	* NOTE! We MUST NOT take any locks for this case. We may
				1242	* be in an interrupt or a critical region, and should
				1243	* only copy the information from the master page table,
				1244	* nothing more.
				1245	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1246	* Before doing this on-demand faulting, ensure that the
				1247	* fault is not any of the following:
				1248	* 1. A fault on a PTE with a reserved bit set.
				1249	* 2. A fault caused by a user-mode access. (Do not demand-
				1250	* fault kernel memory due to user-mode accesses).
				1251	* 3. A fault caused by a page-level protection violation.
				1252	* (A demand fault would be on a non-present page which
				1253	* would have X86_PF_PROT==0).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1254	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1255	if (!(hw_error_code & (X86_PF_RSVD \| X86_PF_USER \| X86_PF_PROT))) {
				1256	if (vmalloc_fault(address) >= 0)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1257	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1258	}
				1259
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1260	/* Was the fault spurious, caused by lazy TLB invalidation? */
				1261	if (spurious_kernel_fault(hw_error_code, address))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1262	return;
				1263
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1264	/* kprobes don't want to hook the spurious faults: */
				1265	if (kprobe_page_fault(regs, X86_TRAP_PF))
				1266	return;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1267
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1268	/*
				1269	* Note, despite being a "bad area", there are quite a few
				1270	* acceptable reasons to get here, such as erratum fixups
				1271	* and handling kernel code that can fault, like get_user().
				1272	*
				1273	* Don't take the mm semaphore here. If we fixup a prefetch
				1274	* fault we could otherwise deadlock:
				1275	*/
				1276	bad_area_nosemaphore(regs, hw_error_code, address);
				1277	}
				1278	NOKPROBE_SYMBOL(do_kern_addr_fault);
				1279
				1280	/* Handle faults in the user portion of the address space */
				1281	static inline
				1282	void do_user_addr_fault(struct pt_regs *regs,
				1283	unsigned long hw_error_code,
				1284	unsigned long address)
				1285	{
				1286	struct vm_area_struct *vma;
				1287	struct task_struct *tsk;
				1288	struct mm_struct *mm;
				1289	vm_fault_t fault, major = 0;
				1290	unsigned int flags = FAULT_FLAG_ALLOW_RETRY \| FAULT_FLAG_KILLABLE;
				1291
				1292	tsk = current;
				1293	mm = tsk->mm;
				1294
				1295	/* kprobes don't want to hook the spurious faults: */
				1296	if (unlikely(kprobe_page_fault(regs, X86_TRAP_PF)))
				1297	return;
				1298
				1299	/*
				1300	* Reserved bits are never expected to be set on
				1301	* entries in the user portion of the page tables.
				1302	*/
				1303	if (unlikely(hw_error_code & X86_PF_RSVD))
				1304	pgtable_bad(regs, hw_error_code, address);
				1305
				1306	/*
				1307	* If SMAP is on, check for invalid kernel (supervisor) access to user
				1308	* pages in the user address space. The odd case here is WRUSS,
				1309	* which, according to the preliminary documentation, does not respect
				1310	* SMAP and will have the USER bit set so, in all cases, SMAP
				1311	* enforcement appears to be consistent with the USER bit.
				1312	*/
				1313	if (unlikely(cpu_feature_enabled(X86_FEATURE_SMAP) &&
				1314	!(hw_error_code & X86_PF_USER) &&
				1315	!(regs->flags & X86_EFLAGS_AC)))
				1316	{
				1317	bad_area_nosemaphore(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1318	return;
				1319	}
				1320
				1321	/*
				1322	* If we're in an interrupt, have no user context or are running
				1323	* in a region with pagefaults disabled then we must not take the fault
				1324	*/
				1325	if (unlikely(faulthandler_disabled() \|\| !mm)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1326	bad_area_nosemaphore(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1327	return;
				1328	}
				1329
				1330	/*
				1331	* It's safe to allow irq's after cr2 has been saved and the
				1332	* vmalloc fault has been handled.
				1333	*
				1334	* User-mode registers count as a user access even for any
				1335	* potential system fault or CPU buglet:
				1336	*/
				1337	if (user_mode(regs)) {
				1338	local_irq_enable();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1339	flags \|= FAULT_FLAG_USER;
				1340	} else {
				1341	if (regs->flags & X86_EFLAGS_IF)
				1342	local_irq_enable();
				1343	}
				1344
				1345	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
				1346
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1347	if (hw_error_code & X86_PF_WRITE)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1348	flags \|= FAULT_FLAG_WRITE;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1349	if (hw_error_code & X86_PF_INSTR)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1350	flags \|= FAULT_FLAG_INSTRUCTION;
				1351
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1352	#ifdef CONFIG_X86_64
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1353	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1354	* Faults in the vsyscall page might need emulation. The
				1355	* vsyscall page is at a high address (>PAGE_OFFSET), but is
				1356	* considered to be part of the user address space.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1357	*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1358	* The vsyscall page does not have a "real" VMA, so do this
				1359	* emulation before we go searching for VMAs.
				1360	*
				1361	* PKRU never rejects instruction fetches, so we don't need
				1362	* to consider the PF_PK bit.
				1363	*/
				1364	if (is_vsyscall_vaddr(address)) {
				1365	if (emulate_vsyscall(hw_error_code, regs, address))
				1366	return;
				1367	}
				1368	#endif
				1369
				1370	/*
				1371	* Kernel-mode access to the user address space should only occur
				1372	* on well-defined single instructions listed in the exception
				1373	* tables. But, an erroneous kernel fault occurring outside one of
				1374	* those areas which also holds mmap_sem might deadlock attempting
				1375	* to validate the fault against the address space.
				1376	*
				1377	* Only do the expensive exception table search when we might be at
				1378	* risk of a deadlock. This happens if we
				1379	* 1. Failed to acquire mmap_sem, and
				1380	* 2. The access did not originate in userspace.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1381	*/
				1382	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1383	if (!user_mode(regs) && !search_exception_tables(regs->ip)) {
				1384	/*
				1385	* Fault from code in kernel from
				1386	* which we do not expect faults.
				1387	*/
				1388	bad_area_nosemaphore(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1389	return;
				1390	}
				1391	retry:
				1392	down_read(&mm->mmap_sem);
				1393	} else {
				1394	/*
				1395	* The above down_read_trylock() might have succeeded in
				1396	* which case we'll have missed the might_sleep() from
				1397	* down_read():
				1398	*/
				1399	might_sleep();
				1400	}
				1401
				1402	vma = find_vma(mm, address);
				1403	if (unlikely(!vma)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1404	bad_area(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1405	return;
				1406	}
				1407	if (likely(vma->vm_start <= address))
				1408	goto good_area;
				1409	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1410	bad_area(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1411	return;
				1412	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1413	if (unlikely(expand_stack(vma, address))) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1414	bad_area(regs, hw_error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1415	return;
				1416	}
				1417
				1418	/*
				1419	* Ok, we have a good vm_area for this memory access, so
				1420	* we can handle it..
				1421	*/
				1422	good_area:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1423	if (unlikely(access_error(hw_error_code, vma))) {
				1424	bad_area_access_error(regs, hw_error_code, address, vma);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1425	return;
				1426	}
				1427
				1428	/*
				1429	* If for any reason at all we couldn't handle the fault,
				1430	* make sure we exit gracefully rather than endlessly redo
				1431	* the fault. Since we never set FAULT_FLAG_RETRY_NOWAIT, if
				1432	* we get VM_FAULT_RETRY back, the mmap_sem has been unlocked.
				1433	*
				1434	* Note that handle_userfault() may also release and reacquire mmap_sem
				1435	* (and not return with VM_FAULT_RETRY), when returning to userland to
				1436	* repeat the page fault later with a VM_FAULT_NOPAGE retval
				1437	* (potentially after handling any pending signal during the return to
				1438	* userland). The return to userland is identified whenever
				1439	* FAULT_FLAG_USER\|FAULT_FLAG_KILLABLE are both set in flags.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1440	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1441	fault = handle_mm_fault(vma, address, flags);
				1442	major \|= fault & VM_FAULT_MAJOR;
				1443
				1444	/*
				1445	* If we need to retry the mmap_sem has already been released,
				1446	* and if there is a fatal signal pending there is no guarantee
				1447	* that we made any progress. Handle this case first.
				1448	*/
				1449	if (unlikely(fault & VM_FAULT_RETRY)) {
				1450	/* Retry at most once */
				1451	if (flags & FAULT_FLAG_ALLOW_RETRY) {
				1452	flags &= ~FAULT_FLAG_ALLOW_RETRY;
				1453	flags \|= FAULT_FLAG_TRIED;
				1454	if (!fatal_signal_pending(tsk))
				1455	goto retry;
				1456	}
				1457
				1458	/* User mode? Just return to handle the fatal exception */
				1459	if (flags & FAULT_FLAG_USER)
				1460	return;
				1461
				1462	/* Not returning to user mode? Handle exceptions or die: */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1463	no_context(regs, hw_error_code, address, SIGBUS, BUS_ADRERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1464	return;
				1465	}
				1466
				1467	up_read(&mm->mmap_sem);
				1468	if (unlikely(fault & VM_FAULT_ERROR)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1469	mm_fault_error(regs, hw_error_code, address, fault);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1470	return;
				1471	}
				1472
				1473	/*
				1474	* Major/minor page fault accounting. If any of the events
				1475	* returned VM_FAULT_MAJOR, we account it as a major fault.
				1476	*/
				1477	if (major) {
				1478	tsk->maj_flt++;
				1479	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
				1480	} else {
				1481	tsk->min_flt++;
				1482	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
				1483	}
				1484
				1485	check_v8086_mode(regs, address, tsk);
				1486	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1487	NOKPROBE_SYMBOL(do_user_addr_fault);
				1488
				1489	/*
				1490	* Explicitly marked noinline such that the function tracer sees this as the
				1491	* page_fault entry point.
				1492	*/
				1493	static noinline void
				1494	__do_page_fault(struct pt_regs *regs, unsigned long hw_error_code,
				1495	unsigned long address)
				1496	{
				1497	prefetchw(&current->mm->mmap_sem);
				1498
				1499	if (unlikely(kmmio_fault(regs, address)))
				1500	return;
				1501
				1502	/* Was the fault on kernel-controlled part of the address space? */
				1503	if (unlikely(fault_in_kernel_space(address)))
				1504	do_kern_addr_fault(regs, hw_error_code, address);
				1505	else
				1506	do_user_addr_fault(regs, hw_error_code, address);
				1507	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1508	NOKPROBE_SYMBOL(__do_page_fault);
				1509
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1510	static __always_inline void
				1511	trace_page_fault_entries(struct pt_regs *regs, unsigned long error_code,
				1512	unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1513	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1514	if (!trace_pagefault_enabled())
				1515	return;
				1516
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1517	if (user_mode(regs))
				1518	trace_page_fault_user(address, regs, error_code);
				1519	else
				1520	trace_page_fault_kernel(address, regs, error_code);
				1521	}
				1522
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1523	dotraplinkage void
				1524	do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1525	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1526	enum ctx_state prev_state;
				1527
				1528	prev_state = exception_enter();
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame^]	1529	trace_page_fault_entries(regs, error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1530	__do_page_fault(regs, error_code, address);
				1531	exception_exit(prev_state);
				1532	}
				1533	NOKPROBE_SYMBOL(do_page_fault);