Blame - arch/powerpc/mm/fault.c - hafnium/third_party/linux

blob: 9f4a78e3cde9e2453cea457275e4985e00e75ca5 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-or-later
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* PowerPC version
				4	* Copyright (C) 1995-1996 Gary Thomas (gdt@linuxppc.org)
				5	*
				6	* Derived from "arch/i386/mm/fault.c"
				7	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				8	*
				9	* Modified by Cort Dougan and Paul Mackerras.
				10	*
				11	* Modified for PPC64 by Dave Engebretsen (engebret@ibm.com)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	12	*/
				13
				14	#include <linux/signal.h>
				15	#include <linux/sched.h>
				16	#include <linux/sched/task_stack.h>
				17	#include <linux/kernel.h>
				18	#include <linux/errno.h>
				19	#include <linux/string.h>
				20	#include <linux/types.h>
				21	#include <linux/pagemap.h>
				22	#include <linux/ptrace.h>
				23	#include <linux/mman.h>
				24	#include <linux/mm.h>
				25	#include <linux/interrupt.h>
				26	#include <linux/highmem.h>
				27	#include <linux/extable.h>
				28	#include <linux/kprobes.h>
				29	#include <linux/kdebug.h>
				30	#include <linux/perf_event.h>
				31	#include <linux/ratelimit.h>
				32	#include <linux/context_tracking.h>
				33	#include <linux/hugetlb.h>
				34	#include <linux/uaccess.h>
				35
				36	#include <asm/firmware.h>
				37	#include <asm/page.h>
				38	#include <asm/pgtable.h>
				39	#include <asm/mmu.h>
				40	#include <asm/mmu_context.h>
				41	#include <asm/siginfo.h>
				42	#include <asm/debug.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	43	#include <asm/kup.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	44
				45	/*
				46	* Check whether the instruction inst is a store using
				47	* an update addressing form which will update r1.
				48	*/
				49	static bool store_updates_sp(unsigned int inst)
				50	{
				51	/* check for 1 in the rA field */
				52	if (((inst >> 16) & 0x1f) != 1)
				53	return false;
				54	/* check major opcode */
				55	switch (inst >> 26) {
				56	case OP_STWU:
				57	case OP_STBU:
				58	case OP_STHU:
				59	case OP_STFSU:
				60	case OP_STFDU:
				61	return true;
				62	case OP_STD: /* std or stdu */
				63	return (inst & 3) == 1;
				64	case OP_31:
				65	/* check minor opcode */
				66	switch ((inst >> 1) & 0x3ff) {
				67	case OP_31_XOP_STDUX:
				68	case OP_31_XOP_STWUX:
				69	case OP_31_XOP_STBUX:
				70	case OP_31_XOP_STHUX:
				71	case OP_31_XOP_STFSUX:
				72	case OP_31_XOP_STFDUX:
				73	return true;
				74	}
				75	}
				76	return false;
				77	}
				78	/*
				79	* do_page_fault error handling helpers
				80	*/
				81
				82	static int
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	83	__bad_area_nosemaphore(struct pt_regs *regs, unsigned long address, int si_code)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	84	{
				85	/*
				86	* If we are in kernel mode, bail out with a SEGV, this will
				87	* be caught by the assembly which will restore the non-volatile
				88	* registers before calling bad_page_fault()
				89	*/
				90	if (!user_mode(regs))
				91	return SIGSEGV;
				92
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	93	_exception(SIGSEGV, regs, si_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	94
				95	return 0;
				96	}
				97
				98	static noinline int bad_area_nosemaphore(struct pt_regs *regs, unsigned long address)
				99	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	100	return __bad_area_nosemaphore(regs, address, SEGV_MAPERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	101	}
				102
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	103	static int __bad_area(struct pt_regs *regs, unsigned long address, int si_code)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	104	{
				105	struct mm_struct *mm = current->mm;
				106
				107	/*
				108	* Something tried to access memory that isn't in our memory map..
				109	* Fix it, but check if it's kernel or user first..
				110	*/
				111	up_read(&mm->mmap_sem);
				112
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	113	return __bad_area_nosemaphore(regs, address, si_code);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	114	}
				115
				116	static noinline int bad_area(struct pt_regs *regs, unsigned long address)
				117	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	118	return __bad_area(regs, address, SEGV_MAPERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	119	}
				120
				121	static int bad_key_fault_exception(struct pt_regs *regs, unsigned long address,
				122	int pkey)
				123	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	124	/*
				125	* If we are in kernel mode, bail out with a SEGV, this will
				126	* be caught by the assembly which will restore the non-volatile
				127	* registers before calling bad_page_fault()
				128	*/
				129	if (!user_mode(regs))
				130	return SIGSEGV;
				131
				132	_exception_pkey(regs, address, pkey);
				133
				134	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	135	}
				136
				137	static noinline int bad_access(struct pt_regs *regs, unsigned long address)
				138	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	139	return __bad_area(regs, address, SEGV_ACCERR);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	140	}
				141
				142	static int do_sigbus(struct pt_regs *regs, unsigned long address,
				143	vm_fault_t fault)
				144	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	145	if (!user_mode(regs))
				146	return SIGBUS;
				147
				148	current->thread.trap_nr = BUS_ADRERR;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	149	#ifdef CONFIG_MEMORY_FAILURE
				150	if (fault & (VM_FAULT_HWPOISON\|VM_FAULT_HWPOISON_LARGE)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	151	unsigned int lsb = 0; /* shutup gcc */
				152
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	153	pr_err("MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
				154	current->comm, current->pid, address);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	155
				156	if (fault & VM_FAULT_HWPOISON_LARGE)
				157	lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
				158	if (fault & VM_FAULT_HWPOISON)
				159	lsb = PAGE_SHIFT;
				160
				161	force_sig_mceerr(BUS_MCEERR_AR, (void __user *)address, lsb);
				162	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	163	}
				164
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	165	#endif
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	166	force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	167	return 0;
				168	}
				169
				170	static int mm_fault_error(struct pt_regs *regs, unsigned long addr,
				171	vm_fault_t fault)
				172	{
				173	/*
				174	* Kernel page fault interrupted by SIGKILL. We have no reason to
				175	* continue processing.
				176	*/
				177	if (fatal_signal_pending(current) && !user_mode(regs))
				178	return SIGKILL;
				179
				180	/* Out of memory */
				181	if (fault & VM_FAULT_OOM) {
				182	/*
				183	* We ran out of memory, or some other thing happened to us that
				184	* made us unable to handle the page fault gracefully.
				185	*/
				186	if (!user_mode(regs))
				187	return SIGSEGV;
				188	pagefault_out_of_memory();
				189	} else {
				190	if (fault & (VM_FAULT_SIGBUS\|VM_FAULT_HWPOISON\|
				191	VM_FAULT_HWPOISON_LARGE))
				192	return do_sigbus(regs, addr, fault);
				193	else if (fault & VM_FAULT_SIGSEGV)
				194	return bad_area_nosemaphore(regs, addr);
				195	else
				196	BUG();
				197	}
				198	return 0;
				199	}
				200
				201	/* Is this a bad kernel fault ? */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	202	static bool bad_kernel_fault(struct pt_regs *regs, unsigned long error_code,
				203	unsigned long address, bool is_write)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	204	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	205	int is_exec = TRAP(regs) == 0x400;
				206
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	207	if (is_exec) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	208	pr_crit_ratelimited("kernel tried to execute %s page (%lx) - exploit attempt? (uid: %d)\n",
				209	address >= TASK_SIZE ? "exec-protected" : "user",
				210	address,
				211	from_kuid(&init_user_ns, current_uid()));
				212
				213	// Kernel exec fault is always bad
				214	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	215	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	216
				217	if (!is_exec && address < TASK_SIZE && (error_code & DSISR_PROTFAULT) &&
				218	!search_exception_tables(regs->nip)) {
				219	pr_crit_ratelimited("Kernel attempted to access user page (%lx) - exploit attempt? (uid: %d)\n",
				220	address,
				221	from_kuid(&init_user_ns, current_uid()));
				222	}
				223
				224	// Kernel fault on kernel address is bad
				225	if (address >= TASK_SIZE)
				226	return true;
				227
				228	// Fault on user outside of certain regions (eg. copy_tofrom_user()) is bad
				229	if (!search_exception_tables(regs->nip))
				230	return true;
				231
				232	// Read/write fault in a valid region (the exception table search passed
				233	// above), but blocked by KUAP is bad, it can never succeed.
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	234	if (bad_kuap_fault(regs, address, is_write))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	235	return true;
				236
				237	// What's left? Kernel fault on user in well defined regions (extable
				238	// matched), and allowed by KUAP in the faulting context.
				239	return false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	240	}
				241
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	242	// This comes from 64-bit struct rt_sigframe + __SIGNAL_FRAMESIZE
				243	#define SIGFRAME_MAX_SIZE (4096 + 128)
				244
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	245	static bool bad_stack_expansion(struct pt_regs *regs, unsigned long address,
				246	struct vm_area_struct *vma, unsigned int flags,
				247	bool *must_retry)
				248	{
				249	/*
				250	* N.B. The POWER/Open ABI allows programs to access up to
				251	* 288 bytes below the stack pointer.
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	252	* The kernel signal delivery code writes a bit over 4KB
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	253	* below the stack pointer (r1) before decrementing it.
				254	* The exec code can write slightly over 640kB to the stack
				255	* before setting the user r1. Thus we allow the stack to
				256	* expand to 1MB without further checks.
				257	*/
				258	if (address + 0x100000 < vma->vm_end) {
				259	unsigned int __user nip = (unsigned int __user )regs->nip;
				260	/* get user regs even if this fault is in kernel mode */
				261	struct pt_regs *uregs = current->thread.regs;
				262	if (uregs == NULL)
				263	return true;
				264
				265	/*
				266	* A user-mode access to an address a long way below
				267	* the stack pointer is only valid if the instruction
				268	* is one which would update the stack pointer to the
				269	* address accessed if the instruction completed,
				270	* i.e. either stwu rs,n(r1) or stwux rs,r1,rb
				271	* (or the byte, halfword, float or double forms).
				272	*
				273	* If we don't check this then any write to the area
				274	* between the last mapped region and the stack will
				275	* expand the stack rather than segfaulting.
				276	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	277	if (address + SIGFRAME_MAX_SIZE >= uregs->gpr[1])
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	278	return false;
				279
				280	if ((flags & FAULT_FLAG_WRITE) && (flags & FAULT_FLAG_USER) &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	281	access_ok(nip, sizeof(*nip))) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	282	unsigned int inst;
				283	int res;
				284
				285	pagefault_disable();
				286	res = __get_user_inatomic(inst, nip);
				287	pagefault_enable();
				288	if (!res)
				289	return !store_updates_sp(inst);
				290	*must_retry = true;
				291	}
				292	return true;
				293	}
				294	return false;
				295	}
				296
				297	static bool access_error(bool is_write, bool is_exec,
				298	struct vm_area_struct *vma)
				299	{
				300	/*
				301	* Allow execution from readable areas if the MMU does not
				302	* provide separate controls over reading and executing.
				303	*
				304	* Note: That code used to not be enabled for 4xx/BookE.
				305	* It is now as I/D cache coherency for these is done at
				306	* set_pte_at() time and I see no reason why the test
				307	* below wouldn't be valid on those processors. This -may-
				308	* break programs compiled with a really old ABI though.
				309	*/
				310	if (is_exec) {
				311	return !(vma->vm_flags & VM_EXEC) &&
				312	(cpu_has_feature(CPU_FTR_NOEXECUTE) \|\|
				313	!(vma->vm_flags & (VM_READ \| VM_WRITE)));
				314	}
				315
				316	if (is_write) {
				317	if (unlikely(!(vma->vm_flags & VM_WRITE)))
				318	return true;
				319	return false;
				320	}
				321
				322	if (unlikely(!(vma->vm_flags & (VM_READ \| VM_EXEC \| VM_WRITE))))
				323	return true;
				324	/*
				325	* We should ideally do the vma pkey access check here. But in the
				326	* fault path, handle_mm_fault() also does the same check. To avoid
				327	* these multiple checks, we skip it here and handle access error due
				328	* to pkeys later.
				329	*/
				330	return false;
				331	}
				332
				333	#ifdef CONFIG_PPC_SMLPAR
				334	static inline void cmo_account_page_fault(void)
				335	{
				336	if (firmware_has_feature(FW_FEATURE_CMO)) {
				337	u32 page_ins;
				338
				339	preempt_disable();
				340	page_ins = be32_to_cpu(get_lppaca()->page_ins);
				341	page_ins += 1 << PAGE_FACTOR;
				342	get_lppaca()->page_ins = cpu_to_be32(page_ins);
				343	preempt_enable();
				344	}
				345	}
				346	#else
				347	static inline void cmo_account_page_fault(void) { }
				348	#endif /* CONFIG_PPC_SMLPAR */
				349
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	350	static void sanity_check_fault(bool is_write, bool is_user,
				351	unsigned long error_code, unsigned long address)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	352	{
				353	/*
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	354	* Userspace trying to access kernel address, we get PROTFAULT for that.
				355	*/
				356	if (is_user && address >= TASK_SIZE) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	357	if ((long)address == -1)
				358	return;
				359
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	360	pr_crit_ratelimited("%s[%d]: User access of kernel address (%lx) - exploit attempt? (uid: %d)\n",
				361	current->comm, current->pid, address,
				362	from_kuid(&init_user_ns, current_uid()));
				363	return;
				364	}
				365
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame^]	366	if (!IS_ENABLED(CONFIG_PPC_BOOK3S))
				367	return;
				368
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	369	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	370	* For hash translation mode, we should never get a
				371	* PROTFAULT. Any update to pte to reduce access will result in us
				372	* removing the hash page table entry, thus resulting in a DSISR_NOHPTE
				373	* fault instead of DSISR_PROTFAULT.
				374	*
				375	* A pte update to relax the access will not result in a hash page table
				376	* entry invalidate and hence can result in DSISR_PROTFAULT.
				377	* ptep_set_access_flags() doesn't do a hpte flush. This is why we have
				378	* the special !is_write in the below conditional.
				379	*
				380	* For platforms that doesn't supports coherent icache and do support
				381	* per page noexec bit, we do setup things such that we do the
				382	* sync between D/I cache via fault. But that is handled via low level
				383	* hash fault code (hash_page_do_lazy_icache()) and we should not reach
				384	* here in such case.
				385	*
				386	* For wrong access that can result in PROTFAULT, the above vma->vm_flags
				387	* check should handle those and hence we should fall to the bad_area
				388	* handling correctly.
				389	*
				390	* For embedded with per page exec support that doesn't support coherent
				391	* icache we do get PROTFAULT and we handle that D/I cache sync in
				392	* set_pte_at while taking the noexec/prot fault. Hence this is WARN_ON
				393	* is conditional for server MMU.
				394	*
				395	* For radix, we can get prot fault for autonuma case, because radix
				396	* page table will have them marked noaccess for user.
				397	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	398	if (radix_enabled() \|\| is_write)
				399	return;
				400
				401	WARN_ON_ONCE(error_code & DSISR_PROTFAULT);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	402	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	403
				404	/*
				405	* Define the correct "is_write" bit in error_code based
				406	* on the processor family
				407	*/
				408	#if (defined(CONFIG_4xx) \|\| defined(CONFIG_BOOKE))
				409	#define page_fault_is_write(__err) ((__err) & ESR_DST)
				410	#define page_fault_is_bad(__err) (0)
				411	#else
				412	#define page_fault_is_write(__err) ((__err) & DSISR_ISSTORE)
				413	#if defined(CONFIG_PPC_8xx)
				414	#define page_fault_is_bad(__err) ((__err) & DSISR_NOEXEC_OR_G)
				415	#elif defined(CONFIG_PPC64)
				416	#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_64S)
				417	#else
				418	#define page_fault_is_bad(__err) ((__err) & DSISR_BAD_FAULT_32S)
				419	#endif
				420	#endif
				421
				422	/*
				423	* For 600- and 800-family processors, the error_code parameter is DSISR
				424	* for a data fault, SRR1 for an instruction fault. For 400-family processors
				425	* the error_code parameter is ESR for a data fault, 0 for an instruction
				426	* fault.
				427	* For 64-bit processors, the error_code parameter is
				428	* - DSISR for a non-SLB data access fault,
				429	* - SRR1 & 0x08000000 for a non-SLB instruction access fault
				430	* - 0 any SLB fault.
				431	*
				432	* The return value is 0 if the fault was handled, or the signal
				433	* number if this is a kernel fault that can't be handled here.
				434	*/
				435	static int __do_page_fault(struct pt_regs *regs, unsigned long address,
				436	unsigned long error_code)
				437	{
				438	struct vm_area_struct * vma;
				439	struct mm_struct *mm = current->mm;
				440	unsigned int flags = FAULT_FLAG_ALLOW_RETRY \| FAULT_FLAG_KILLABLE;
				441	int is_exec = TRAP(regs) == 0x400;
				442	int is_user = user_mode(regs);
				443	int is_write = page_fault_is_write(error_code);
				444	vm_fault_t fault, major = 0;
				445	bool must_retry = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	446	bool kprobe_fault = kprobe_page_fault(regs, 11);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	447
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	448	if (unlikely(debugger_fault_handler(regs) \|\| kprobe_fault))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449	return 0;
				450
				451	if (unlikely(page_fault_is_bad(error_code))) {
				452	if (is_user) {
				453	_exception(SIGBUS, regs, BUS_OBJERR, address);
				454	return 0;
				455	}
				456	return SIGBUS;
				457	}
				458
				459	/* Additional sanity check(s) */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	460	sanity_check_fault(is_write, is_user, error_code, address);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	461
				462	/*
				463	* The kernel should never take an execute fault nor should it
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	464	* take a page fault to a kernel address or a page fault to a user
				465	* address outside of dedicated places
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	466	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	467	if (unlikely(!is_user && bad_kernel_fault(regs, error_code, address, is_write)))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	468	return SIGSEGV;
				469
				470	/*
				471	* If we're in an interrupt, have no user context or are running
				472	* in a region with pagefaults disabled then we must not take the fault
				473	*/
				474	if (unlikely(faulthandler_disabled() \|\| !mm)) {
				475	if (is_user)
				476	printk_ratelimited(KERN_ERR "Page fault in user mode"
				477	" with faulthandler_disabled()=%d"
				478	" mm=%p\n",
				479	faulthandler_disabled(), mm);
				480	return bad_area_nosemaphore(regs, address);
				481	}
				482
				483	/* We restore the interrupt state now */
				484	if (!arch_irq_disabled_regs(regs))
				485	local_irq_enable();
				486
				487	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
				488
				489	if (error_code & DSISR_KEYFAULT)
				490	return bad_key_fault_exception(regs, address,
				491	get_mm_addr_key(mm, address));
				492
				493	/*
				494	* We want to do this outside mmap_sem, because reading code around nip
				495	* can result in fault, which will cause a deadlock when called with
				496	* mmap_sem held
				497	*/
				498	if (is_user)
				499	flags \|= FAULT_FLAG_USER;
				500	if (is_write)
				501	flags \|= FAULT_FLAG_WRITE;
				502	if (is_exec)
				503	flags \|= FAULT_FLAG_INSTRUCTION;
				504
				505	/* When running in the kernel we expect faults to occur only to
				506	* addresses in user space. All other faults represent errors in the
				507	* kernel and should generate an OOPS. Unfortunately, in the case of an
				508	* erroneous fault occurring in a code path which already holds mmap_sem
				509	* we will deadlock attempting to validate the fault against the
				510	* address space. Luckily the kernel only validly references user
				511	* space from well defined areas of code, which are listed in the
				512	* exceptions table.
				513	*
				514	* As the vast majority of faults will be valid we will only perform
				515	* the source reference check when there is a possibility of a deadlock.
				516	* Attempt to lock the address space, if we cannot we then validate the
				517	* source. If this is invalid we can skip the address space check,
				518	* thus avoiding the deadlock.
				519	*/
				520	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
				521	if (!is_user && !search_exception_tables(regs->nip))
				522	return bad_area_nosemaphore(regs, address);
				523
				524	retry:
				525	down_read(&mm->mmap_sem);
				526	} else {
				527	/*
				528	* The above down_read_trylock() might have succeeded in
				529	* which case we'll have missed the might_sleep() from
				530	* down_read():
				531	*/
				532	might_sleep();
				533	}
				534
				535	vma = find_vma(mm, address);
				536	if (unlikely(!vma))
				537	return bad_area(regs, address);
				538	if (likely(vma->vm_start <= address))
				539	goto good_area;
				540	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN)))
				541	return bad_area(regs, address);
				542
				543	/* The stack is being expanded, check if it's valid */
				544	if (unlikely(bad_stack_expansion(regs, address, vma, flags,
				545	&must_retry))) {
				546	if (!must_retry)
				547	return bad_area(regs, address);
				548
				549	up_read(&mm->mmap_sem);
				550	if (fault_in_pages_readable((const char __user *)regs->nip,
				551	sizeof(unsigned int)))
				552	return bad_area_nosemaphore(regs, address);
				553	goto retry;
				554	}
				555
				556	/* Try to expand it */
				557	if (unlikely(expand_stack(vma, address)))
				558	return bad_area(regs, address);
				559
				560	good_area:
				561	if (unlikely(access_error(is_write, is_exec, vma)))
				562	return bad_access(regs, address);
				563
				564	/*
				565	* If for any reason at all we couldn't handle the fault,
				566	* make sure we exit gracefully rather than endlessly redo
				567	* the fault.
				568	*/
				569	fault = handle_mm_fault(vma, address, flags);
				570
				571	#ifdef CONFIG_PPC_MEM_KEYS
				572	/*
				573	* we skipped checking for access error due to key earlier.
				574	* Check that using handle_mm_fault error return.
				575	*/
				576	if (unlikely(fault & VM_FAULT_SIGSEGV) &&
				577	!arch_vma_access_permitted(vma, is_write, is_exec, 0)) {
				578
				579	int pkey = vma_pkey(vma);
				580
				581	up_read(&mm->mmap_sem);
				582	return bad_key_fault_exception(regs, address, pkey);
				583	}
				584	#endif /* CONFIG_PPC_MEM_KEYS */
				585
				586	major \|= fault & VM_FAULT_MAJOR;
				587
				588	/*
				589	* Handle the retry right now, the mmap_sem has been released in that
				590	* case.
				591	*/
				592	if (unlikely(fault & VM_FAULT_RETRY)) {
				593	/* We retry only once */
				594	if (flags & FAULT_FLAG_ALLOW_RETRY) {
				595	/*
				596	* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
				597	* of starvation.
				598	*/
				599	flags &= ~FAULT_FLAG_ALLOW_RETRY;
				600	flags \|= FAULT_FLAG_TRIED;
				601	if (!fatal_signal_pending(current))
				602	goto retry;
				603	}
				604
				605	/*
				606	* User mode? Just return to handle the fatal exception otherwise
				607	* return to bad_page_fault
				608	*/
				609	return is_user ? 0 : SIGBUS;
				610	}
				611
				612	up_read(&current->mm->mmap_sem);
				613
				614	if (unlikely(fault & VM_FAULT_ERROR))
				615	return mm_fault_error(regs, address, fault);
				616
				617	/*
				618	* Major/minor page fault accounting.
				619	*/
				620	if (major) {
				621	current->maj_flt++;
				622	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
				623	cmo_account_page_fault();
				624	} else {
				625	current->min_flt++;
				626	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
				627	}
				628	return 0;
				629	}
				630	NOKPROBE_SYMBOL(__do_page_fault);
				631
				632	int do_page_fault(struct pt_regs *regs, unsigned long address,
				633	unsigned long error_code)
				634	{
				635	enum ctx_state prev_state = exception_enter();
				636	int rc = __do_page_fault(regs, address, error_code);
				637	exception_exit(prev_state);
				638	return rc;
				639	}
				640	NOKPROBE_SYMBOL(do_page_fault);
				641
				642	/*
				643	* bad_page_fault is called when we have a bad access from the kernel.
				644	* It is called from the DSI and ISI handlers in head.S and from some
				645	* of the procedures in traps.c.
				646	*/
				647	void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
				648	{
				649	const struct exception_table_entry *entry;
				650
				651	/* Are we prepared to handle this fault? */
				652	if ((entry = search_exception_tables(regs->nip)) != NULL) {
				653	regs->nip = extable_fixup(entry);
				654	return;
				655	}
				656
				657	/* kernel has accessed a bad area */
				658
				659	switch (TRAP(regs)) {
				660	case 0x300:
				661	case 0x380:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	662	case 0xe00:
				663	pr_alert("BUG: %s at 0x%08lx\n",
				664	regs->dar < PAGE_SIZE ? "Kernel NULL pointer dereference" :
				665	"Unable to handle kernel data access", regs->dar);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	666	break;
				667	case 0x400:
				668	case 0x480:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	669	pr_alert("BUG: Unable to handle kernel instruction fetch%s",
				670	regs->nip < PAGE_SIZE ? " (NULL pointer?)\n" : "\n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	671	break;
				672	case 0x600:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	673	pr_alert("BUG: Unable to handle kernel unaligned access at 0x%08lx\n",
				674	regs->dar);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	675	break;
				676	default:
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	677	pr_alert("BUG: Unable to handle unknown paging fault at 0x%08lx\n",
				678	regs->dar);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	679	break;
				680	}
				681	printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
				682	regs->nip);
				683
				684	if (task_stack_end_corrupted(current))
				685	printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
				686
				687	die("Kernel access of bad area", regs, sig);
				688	}