Blame - mm/oom_kill.c - hafnium/third_party/linux

blob: 419a814f467e0f4f96907db9621273aaa10dea6a [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-only
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* linux/mm/oom_kill.c
				4	*
				5	* Copyright (C) 1998,2000 Rik van Riel
				6	* Thanks go out to Claus Fischer for some serious inspiration and
				7	* for goading me into coding this file...
				8	* Copyright (C) 2010 Google, Inc.
				9	* Rewritten by David Rientjes
				10	*
				11	* The routines in this file are used to kill a process when
				12	* we're seriously out of memory. This gets called from __alloc_pages()
				13	* in mm/page_alloc.c when we really run out of memory.
				14	*
				15	* Since we won't call these routines often (on a well-configured
				16	* machine) this file will double as a 'coding guide' and a signpost
				17	* for newbie kernel hackers. It features several pointers to major
				18	* kernel subsystems and hints as to where to find out what things do.
				19	*/
				20
				21	#include <linux/oom.h>
				22	#include <linux/mm.h>
				23	#include <linux/err.h>
				24	#include <linux/gfp.h>
				25	#include <linux/sched.h>
				26	#include <linux/sched/mm.h>
				27	#include <linux/sched/coredump.h>
				28	#include <linux/sched/task.h>
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	29	#include <linux/sched/debug.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	30	#include <linux/swap.h>
				31	#include <linux/timex.h>
				32	#include <linux/jiffies.h>
				33	#include <linux/cpuset.h>
				34	#include <linux/export.h>
				35	#include <linux/notifier.h>
				36	#include <linux/memcontrol.h>
				37	#include <linux/mempolicy.h>
				38	#include <linux/security.h>
				39	#include <linux/ptrace.h>
				40	#include <linux/freezer.h>
				41	#include <linux/ftrace.h>
				42	#include <linux/ratelimit.h>
				43	#include <linux/kthread.h>
				44	#include <linux/init.h>
				45	#include <linux/mmu_notifier.h>
				46
				47	#include <asm/tlb.h>
				48	#include "internal.h"
				49	#include "slab.h"
				50
				51	#define CREATE_TRACE_POINTS
				52	#include <trace/events/oom.h>
				53
				54	int sysctl_panic_on_oom;
				55	int sysctl_oom_kill_allocating_task;
				56	int sysctl_oom_dump_tasks = 1;
				57
				58	/*
				59	* Serializes oom killer invocations (out_of_memory()) from all contexts to
				60	* prevent from over eager oom killing (e.g. when the oom killer is invoked
				61	* from different domains).
				62	*
				63	* oom_killer_disable() relies on this lock to stabilize oom_killer_disabled
				64	* and mark_oom_victim
				65	*/
				66	DEFINE_MUTEX(oom_lock);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	67	/* Serializes oom_score_adj and oom_score_adj_min updates */
				68	DEFINE_MUTEX(oom_adj_mutex);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	69
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	70	static inline bool is_memcg_oom(struct oom_control *oc)
				71	{
				72	return oc->memcg != NULL;
				73	}
				74
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	75	#ifdef CONFIG_NUMA
				76	/**
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	77	* oom_cpuset_eligible() - check task eligiblity for kill
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	78	* @start: task struct of which task to consider
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	79	* @oc: pointer to struct oom_control
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	80	*
				81	* Task eligibility is determined by whether or not a candidate task, @tsk,
				82	* shares the same mempolicy nodes as current if it is bound by such a policy
				83	* and whether or not it has the same set of allowed cpuset nodes.
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	84	*
				85	* This function is assuming oom-killer context and 'current' has triggered
				86	* the oom-killer.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	87	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	88	static bool oom_cpuset_eligible(struct task_struct *start,
				89	struct oom_control *oc)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	90	{
				91	struct task_struct *tsk;
				92	bool ret = false;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	93	const nodemask_t *mask = oc->nodemask;
				94
				95	if (is_memcg_oom(oc))
				96	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	97
				98	rcu_read_lock();
				99	for_each_thread(start, tsk) {
				100	if (mask) {
				101	/*
				102	* If this is a mempolicy constrained oom, tsk's
				103	* cpuset is irrelevant. Only return true if its
				104	* mempolicy intersects current, otherwise it may be
				105	* needlessly killed.
				106	*/
				107	ret = mempolicy_nodemask_intersects(tsk, mask);
				108	} else {
				109	/*
				110	* This is not a mempolicy constrained oom, so only
				111	* check the mems of tsk's cpuset.
				112	*/
				113	ret = cpuset_mems_allowed_intersects(current, tsk);
				114	}
				115	if (ret)
				116	break;
				117	}
				118	rcu_read_unlock();
				119
				120	return ret;
				121	}
				122	#else
/*
 * !CONFIG_NUMA variant: no nodemask or cpuset check is performed, so every
 * task is reported as eligible. Both arguments are pointers, matching the
 * CONFIG_NUMA variant and the callers in this file.
 */
static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
	return true;
}
				127	#endif /* CONFIG_NUMA */
				128
				129	/*
				130	* The process p may have detached its own ->mm while exiting or through
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	131	* kthread_use_mm(), but one or more of its subthreads may still have a valid
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	132	* pointer. Return p, or any of its subthreads with a valid ->mm, with
				133	* task_lock() held.
				134	*/
				135	struct task_struct find_lock_task_mm(struct task_struct p)
				136	{
				137	struct task_struct *t;
				138
				139	rcu_read_lock();
				140
				141	for_each_thread(p, t) {
				142	task_lock(t);
				143	if (likely(t->mm))
				144	goto found;
				145	task_unlock(t);
				146	}
				147	t = NULL;
				148	found:
				149	rcu_read_unlock();
				150
				151	return t;
				152	}
				153
				154	/*
				155	* order == -1 means the oom kill is required by sysrq, otherwise only
				156	* for display purposes.
				157	*/
				158	static inline bool is_sysrq_oom(struct oom_control *oc)
				159	{
				160	return oc->order == -1;
				161	}
				162
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	163	/* return true if the task is not adequate as candidate victim task. */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	164	static bool oom_unkillable_task(struct task_struct *p)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	165	{
				166	if (is_global_init(p))
				167	return true;
				168	if (p->flags & PF_KTHREAD)
				169	return true;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	170	return false;
				171	}
				172
				173	/*
				174	* Print out unreclaimble slabs info when unreclaimable slabs amount is greater
				175	* than all user memory (LRU pages)
				176	*/
				177	static bool is_dump_unreclaim_slabs(void)
				178	{
				179	unsigned long nr_lru;
				180
				181	nr_lru = global_node_page_state(NR_ACTIVE_ANON) +
				182	global_node_page_state(NR_INACTIVE_ANON) +
				183	global_node_page_state(NR_ACTIVE_FILE) +
				184	global_node_page_state(NR_INACTIVE_FILE) +
				185	global_node_page_state(NR_ISOLATED_ANON) +
				186	global_node_page_state(NR_ISOLATED_FILE) +
				187	global_node_page_state(NR_UNEVICTABLE);
				188
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	189	return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	190	}
				191
				192	/**
				193	* oom_badness - heuristic function to determine which candidate task to kill
				194	* @p: task struct of which task we should calculate
				195	* @totalpages: total present RAM allowed for page allocation
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	196	*
				197	* The heuristic for determining which task to kill is made to be as simple and
				198	* predictable as possible. The goal is to return the highest value for the
				199	* task consuming the most memory to avoid subsequent oom failures.
				200	*/
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	201	long oom_badness(struct task_struct *p, unsigned long totalpages)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	202	{
				203	long points;
				204	long adj;
				205
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	206	if (oom_unkillable_task(p))
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	207	return LONG_MIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	208
				209	p = find_lock_task_mm(p);
				210	if (!p)
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	211	return LONG_MIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	212
				213	/*
				214	* Do not even consider tasks which are explicitly marked oom
				215	* unkillable or have been already oom reaped or the are in
				216	* the middle of vfork
				217	*/
				218	adj = (long)p->signal->oom_score_adj;
				219	if (adj == OOM_SCORE_ADJ_MIN \|\|
				220	test_bit(MMF_OOM_SKIP, &p->mm->flags) \|\|
				221	in_vfork(p)) {
				222	task_unlock(p);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	223	return LONG_MIN;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	224	}
				225
				226	/*
				227	* The baseline for the badness score is the proportion of RAM that each
				228	* task's rss, pagetable and swap space use.
				229	*/
				230	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
				231	mm_pgtables_bytes(p->mm) / PAGE_SIZE;
				232	task_unlock(p);
				233
				234	/* Normalize to oom_score_adj units */
				235	adj *= totalpages / 1000;
				236	points += adj;
				237
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	238	return points;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	239	}
				240
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	241	static const char * const oom_constraint_text[] = {
				242	[CONSTRAINT_NONE] = "CONSTRAINT_NONE",
				243	[CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
				244	[CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
				245	[CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	246	};
				247
				248	/*
				249	* Determine the type of allocation constraint.
				250	*/
				251	static enum oom_constraint constrained_alloc(struct oom_control *oc)
				252	{
				253	struct zone *zone;
				254	struct zoneref *z;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	255	enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	256	bool cpuset_limited = false;
				257	int nid;
				258
				259	if (is_memcg_oom(oc)) {
				260	oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1;
				261	return CONSTRAINT_MEMCG;
				262	}
				263
				264	/* Default to all available memory */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	265	oc->totalpages = totalram_pages() + total_swap_pages;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	266
				267	if (!IS_ENABLED(CONFIG_NUMA))
				268	return CONSTRAINT_NONE;
				269
				270	if (!oc->zonelist)
				271	return CONSTRAINT_NONE;
				272	/*
				273	* Reach here only when __GFP_NOFAIL is used. So, we should avoid
				274	* to kill current.We have to random task kill in this case.
				275	* Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now.
				276	*/
				277	if (oc->gfp_mask & __GFP_THISNODE)
				278	return CONSTRAINT_NONE;
				279
				280	/*
				281	* This is not a __GFP_THISNODE allocation, so a truncated nodemask in
				282	* the page allocator means a mempolicy is in effect. Cpuset policy
				283	* is enforced in get_page_from_freelist().
				284	*/
				285	if (oc->nodemask &&
				286	!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
				287	oc->totalpages = total_swap_pages;
				288	for_each_node_mask(nid, *oc->nodemask)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	289	oc->totalpages += node_present_pages(nid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	290	return CONSTRAINT_MEMORY_POLICY;
				291	}
				292
				293	/* Check this allocation failure is caused by cpuset's wall function */
				294	for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	295	highest_zoneidx, oc->nodemask)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	296	if (!cpuset_zone_allowed(zone, oc->gfp_mask))
				297	cpuset_limited = true;
				298
				299	if (cpuset_limited) {
				300	oc->totalpages = total_swap_pages;
				301	for_each_node_mask(nid, cpuset_current_mems_allowed)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	302	oc->totalpages += node_present_pages(nid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	303	return CONSTRAINT_CPUSET;
				304	}
				305	return CONSTRAINT_NONE;
				306	}
				307
				308	static int oom_evaluate_task(struct task_struct task, void arg)
				309	{
				310	struct oom_control *oc = arg;
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	311	long points;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	312
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	313	if (oom_unkillable_task(task))
				314	goto next;
				315
				316	/* p may not have freeable memory in nodemask */
				317	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	318	goto next;
				319
				320	/*
				321	* This task already has access to memory reserves and is being killed.
				322	* Don't allow any other task to have access to the reserves unless
				323	* the task has MMF_OOM_SKIP because chances that it would release
				324	* any memory is quite low.
				325	*/
				326	if (!is_sysrq_oom(oc) && tsk_is_oom_victim(task)) {
				327	if (test_bit(MMF_OOM_SKIP, &task->signal->oom_mm->flags))
				328	goto next;
				329	goto abort;
				330	}
				331
				332	/*
				333	* If task is allocating a lot of memory and has been marked to be
				334	* killed first if it triggers an oom, then select it.
				335	*/
				336	if (oom_task_origin(task)) {
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	337	points = LONG_MAX;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	338	goto select;
				339	}
				340
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	341	points = oom_badness(task, oc->totalpages);
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	342	if (points == LONG_MIN \|\| points < oc->chosen_points)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	343	goto next;
				344
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	345	select:
				346	if (oc->chosen)
				347	put_task_struct(oc->chosen);
				348	get_task_struct(task);
				349	oc->chosen = task;
				350	oc->chosen_points = points;
				351	next:
				352	return 0;
				353	abort:
				354	if (oc->chosen)
				355	put_task_struct(oc->chosen);
				356	oc->chosen = (void *)-1UL;
				357	return 1;
				358	}
				359
				360	/*
				361	* Simple selection loop. We choose the process with the highest number of
				362	* 'points'. In case scan was aborted, oc->chosen is set to -1.
				363	*/
				364	static void select_bad_process(struct oom_control *oc)
				365	{
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	366	oc->chosen_points = LONG_MIN;
				367
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	368	if (is_memcg_oom(oc))
				369	mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
				370	else {
				371	struct task_struct *p;
				372
				373	rcu_read_lock();
				374	for_each_process(p)
				375	if (oom_evaluate_task(p, oc))
				376	break;
				377	rcu_read_unlock();
				378	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	379	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	380
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	381	static int dump_task(struct task_struct p, void arg)
				382	{
				383	struct oom_control *oc = arg;
				384	struct task_struct *task;
				385
				386	if (oom_unkillable_task(p))
				387	return 0;
				388
				389	/* p may not have freeable memory in nodemask */
				390	if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
				391	return 0;
				392
				393	task = find_lock_task_mm(p);
				394	if (!task) {
				395	/*
				396	* This is a kthread or all of p's threads have already
				397	* detached their mm's. There's no need to report
				398	* them; they can't be oom killed anyway.
				399	*/
				400	return 0;
				401	}
				402
				403	pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
				404	task->pid, from_kuid(&init_user_ns, task_uid(task)),
				405	task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
				406	mm_pgtables_bytes(task->mm),
				407	get_mm_counter(task->mm, MM_SWAPENTS),
				408	task->signal->oom_score_adj, task->comm);
				409	task_unlock(task);
				410
				411	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	412	}
				413
				414	/**
				415	* dump_tasks - dump current memory state of all system tasks
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	416	* @oc: pointer to struct oom_control
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	417	*
				418	* Dumps the current memory state of all eligible tasks. Tasks not in the same
				419	* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
				420	* are not shown.
				421	* State information includes task's pid, uid, tgid, vm size, rss,
				422	* pgtables_bytes, swapents, oom_score_adj value, and name.
				423	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	424	static void dump_tasks(struct oom_control *oc)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	425	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	426	pr_info("Tasks state (memory values in pages):\n");
				427	pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	428
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	429	if (is_memcg_oom(oc))
				430	mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
				431	else {
				432	struct task_struct *p;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	433
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	434	rcu_read_lock();
				435	for_each_process(p)
				436	dump_task(p, oc);
				437	rcu_read_unlock();
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	438	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	439	}
				440
				441	static void dump_oom_summary(struct oom_control oc, struct task_struct victim)
				442	{
				443	/* one line summary of the oom killer context. */
				444	pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
				445	oom_constraint_text[oc->constraint],
				446	nodemask_pr_args(oc->nodemask));
				447	cpuset_print_current_mems_allowed();
				448	mem_cgroup_print_oom_context(oc->memcg, victim);
				449	pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
				450	from_kuid(&init_user_ns, task_uid(victim)));
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	451	}
				452
				453	static void dump_header(struct oom_control oc, struct task_struct p)
				454	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	455	pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
				456	current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	457	current->signal->oom_score_adj);
				458	if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
				459	pr_warn("COMPACTION is disabled!!!\n");
				460
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	461	dump_stack();
				462	if (is_memcg_oom(oc))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	463	mem_cgroup_print_oom_meminfo(oc->memcg);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	464	else {
				465	show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
				466	if (is_dump_unreclaim_slabs())
				467	dump_unreclaimable_slab();
				468	}
				469	if (sysctl_oom_dump_tasks)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	470	dump_tasks(oc);
				471	if (p)
				472	dump_oom_summary(oc, p);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	473	}
				474
				475	/*
				476	* Number of OOM victims in flight
				477	*/
				478	static atomic_t oom_victims = ATOMIC_INIT(0);
				479	static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
				480
				481	static bool oom_killer_disabled __read_mostly;
				482
				483	#define K(x) ((x) << (PAGE_SHIFT-10))
				484
				485	/*
				486	* task->mm can be NULL if the task is the exited group leader. So to
				487	* determine whether the task is using a particular mm, we examine all the
				488	* task's threads: if one of those is using this mm then this task was also
				489	* using it.
				490	*/
				491	bool process_shares_mm(struct task_struct p, struct mm_struct mm)
				492	{
				493	struct task_struct *t;
				494
				495	for_each_thread(p, t) {
				496	struct mm_struct *t_mm = READ_ONCE(t->mm);
				497	if (t_mm)
				498	return t_mm == mm;
				499	}
				500	return false;
				501	}
				502
				503	#ifdef CONFIG_MMU
				504	/*
				505	* OOM Reaper kernel thread which tries to reap the memory used by the OOM
				506	* victim (if that is possible) to help the OOM killer to move on.
				507	*/
				508	static struct task_struct *oom_reaper_th;
				509	static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait);
				510	static struct task_struct *oom_reaper_list;
				511	static DEFINE_SPINLOCK(oom_reaper_lock);
				512
				513	bool __oom_reap_task_mm(struct mm_struct *mm)
				514	{
				515	struct vm_area_struct *vma;
				516	bool ret = true;
				517
				518	/*
				519	* Tell all users of get_user/copy_from_user etc... that the content
				520	* is no longer stable. No barriers really needed because unmapping
				521	* should imply barriers already and the reader would hit a page fault
				522	* if it stumbled over a reaped memory.
				523	*/
				524	set_bit(MMF_UNSTABLE, &mm->flags);
				525
				526	for (vma = mm->mmap ; vma; vma = vma->vm_next) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	527	if (!can_madv_lru_vma(vma))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	528	continue;
				529
				530	/*
				531	* Only anonymous pages have a good chance to be dropped
				532	* without additional steps which we cannot afford as we
				533	* are OOM already.
				534	*
				535	* We do not even care about fs backed pages because all
				536	* which are reclaimable have already been reclaimed and
				537	* we do not want to block exit_mmap by keeping mm ref
				538	* count elevated without a good reason.
				539	*/
				540	if (vma_is_anonymous(vma) \|\| !(vma->vm_flags & VM_SHARED)) {
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	541	struct mmu_notifier_range range;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	542	struct mmu_gather tlb;
				543
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	544	mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
				545	vma, mm, vma->vm_start,
				546	vma->vm_end);
				547	tlb_gather_mmu(&tlb, mm, range.start, range.end);
				548	if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
				549	tlb_finish_mmu(&tlb, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	550	ret = false;
				551	continue;
				552	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	553	unmap_page_range(&tlb, vma, range.start, range.end, NULL);
				554	mmu_notifier_invalidate_range_end(&range);
				555	tlb_finish_mmu(&tlb, range.start, range.end);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	556	}
				557	}
				558
				559	return ret;
				560	}
				561
				562	/*
				563	* Reaps the address space of the give task.
				564	*
				565	* Returns true on success and false if none or part of the address space
				566	* has been reclaimed and the caller should retry later.
				567	*/
				568	static bool oom_reap_task_mm(struct task_struct tsk, struct mm_struct mm)
				569	{
				570	bool ret = true;
				571
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	572	if (!mmap_read_trylock(mm)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	573	trace_skip_task_reaping(tsk->pid);
				574	return false;
				575	}
				576
				577	/*
				578	* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
				579	* work on the mm anymore. The check for MMF_OOM_SKIP must run
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	580	* under mmap_lock for reading because it serializes against the
				581	* mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	582	*/
				583	if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
				584	trace_skip_task_reaping(tsk->pid);
				585	goto out_unlock;
				586	}
				587
				588	trace_start_task_reaping(tsk->pid);
				589
				590	/* failed to reap part of the address space. Try again later */
				591	ret = __oom_reap_task_mm(mm);
				592	if (!ret)
				593	goto out_finish;
				594
				595	pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
				596	task_pid_nr(tsk), tsk->comm,
				597	K(get_mm_counter(mm, MM_ANONPAGES)),
				598	K(get_mm_counter(mm, MM_FILEPAGES)),
				599	K(get_mm_counter(mm, MM_SHMEMPAGES)));
				600	out_finish:
				601	trace_finish_task_reaping(tsk->pid);
				602	out_unlock:
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	603	mmap_read_unlock(mm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	604
				605	return ret;
				606	}
				607
				608	#define MAX_OOM_REAP_RETRIES 10
				609	static void oom_reap_task(struct task_struct *tsk)
				610	{
				611	int attempts = 0;
				612	struct mm_struct *mm = tsk->signal->oom_mm;
				613
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	614	/* Retry the mmap_read_trylock(mm) a few times */
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	615	while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
				616	schedule_timeout_idle(HZ/10);
				617
				618	if (attempts <= MAX_OOM_REAP_RETRIES \|\|
				619	test_bit(MMF_OOM_SKIP, &mm->flags))
				620	goto done;
				621
				622	pr_info("oom_reaper: unable to reap pid:%d (%s)\n",
				623	task_pid_nr(tsk), tsk->comm);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	624	sched_show_task(tsk);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	625	debug_show_all_locks();
				626
				627	done:
				628	tsk->oom_reaper_list = NULL;
				629
				630	/*
				631	* Hide this mm from OOM killer because it has been either reaped or
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	632	* somebody can't call mmap_write_unlock(mm).
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	633	*/
				634	set_bit(MMF_OOM_SKIP, &mm->flags);
				635
				636	/* Drop a reference taken by wake_oom_reaper */
				637	put_task_struct(tsk);
				638	}
				639
				640	static int oom_reaper(void *unused)
				641	{
				642	while (true) {
				643	struct task_struct *tsk = NULL;
				644
				645	wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
				646	spin_lock(&oom_reaper_lock);
				647	if (oom_reaper_list != NULL) {
				648	tsk = oom_reaper_list;
				649	oom_reaper_list = tsk->oom_reaper_list;
				650	}
				651	spin_unlock(&oom_reaper_lock);
				652
				653	if (tsk)
				654	oom_reap_task(tsk);
				655	}
				656
				657	return 0;
				658	}
				659
				660	static void wake_oom_reaper(struct task_struct *tsk)
				661	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	662	/* mm is already queued? */
				663	if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	664	return;
				665
				666	get_task_struct(tsk);
				667
				668	spin_lock(&oom_reaper_lock);
				669	tsk->oom_reaper_list = oom_reaper_list;
				670	oom_reaper_list = tsk;
				671	spin_unlock(&oom_reaper_lock);
				672	trace_wake_reaper(tsk->pid);
				673	wake_up(&oom_reaper_wait);
				674	}
				675
				676	static int __init oom_init(void)
				677	{
				678	oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
				679	return 0;
				680	}
				681	subsys_initcall(oom_init)
				682	#else
				683	static inline void wake_oom_reaper(struct task_struct *tsk)
				684	{
				685	}
				686	#endif /* CONFIG_MMU */
				687
				688	/**
				689	* mark_oom_victim - mark the given task as OOM victim
				690	* @tsk: task to mark
				691	*
				692	* Has to be called with oom_lock held and never after
				693	* oom has been disabled already.
				694	*
				695	* tsk->mm has to be non NULL and caller has to guarantee it is stable (either
				696	* under task_lock or operate on the current).
				697	*/
				698	static void mark_oom_victim(struct task_struct *tsk)
				699	{
				700	struct mm_struct *mm = tsk->mm;
				701
				702	WARN_ON(oom_killer_disabled);
				703	/* OOM killer might race with memcg OOM */
				704	if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
				705	return;
				706
				707	/* oom_mm is bound to the signal struct life time. */
				708	if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) {
				709	mmgrab(tsk->signal->oom_mm);
				710	set_bit(MMF_OOM_VICTIM, &mm->flags);
				711	}
				712
				713	/*
				714	* Make sure that the task is woken up from uninterruptible sleep
				715	* if it is frozen because OOM killer wouldn't be able to free
				716	* any memory and livelock. freezing_slow_path will tell the freezer
				717	* that TIF_MEMDIE tasks should be ignored.
				718	*/
				719	__thaw_task(tsk);
				720	atomic_inc(&oom_victims);
				721	trace_mark_victim(tsk->pid);
				722	}
				723
				724	/**
				725	* exit_oom_victim - note the exit of an OOM victim
				726	*/
				727	void exit_oom_victim(void)
				728	{
				729	clear_thread_flag(TIF_MEMDIE);
				730
				731	if (!atomic_dec_return(&oom_victims))
				732	wake_up_all(&oom_victims_wait);
				733	}
				734
				735	/**
				736	* oom_killer_enable - enable OOM killer
				737	*/
				738	void oom_killer_enable(void)
				739	{
				740	oom_killer_disabled = false;
				741	pr_info("OOM killer enabled.\n");
				742	}
				743
				744	/**
				745	* oom_killer_disable - disable OOM killer
				746	* @timeout: maximum timeout to wait for oom victims in jiffies
				747	*
				748	* Forces all page allocations to fail rather than trigger OOM killer.
				749	* Will block and wait until all OOM victims are killed or the given
				750	* timeout expires.
				751	*
				752	* The function cannot be called when there are runnable user tasks because
				753	* the userspace would see unexpected allocation failures as a result. Any
				754	* new usage of this function should be consulted with MM people.
				755	*
				756	* Returns true if successful and false if the OOM killer cannot be
				757	* disabled.
				758	*/
				759	bool oom_killer_disable(signed long timeout)
				760	{
				761	signed long ret;
				762
				763	/*
				764	* Make sure to not race with an ongoing OOM killer. Check that the
				765	* current is not killed (possibly due to sharing the victim's memory).
				766	*/
				767	if (mutex_lock_killable(&oom_lock))
				768	return false;
				769	oom_killer_disabled = true;
				770	mutex_unlock(&oom_lock);
				771
				772	ret = wait_event_interruptible_timeout(oom_victims_wait,
				773	!atomic_read(&oom_victims), timeout);
				774	if (ret <= 0) {
				775	oom_killer_enable();
				776	return false;
				777	}
				778	pr_info("OOM killer disabled.\n");
				779
				780	return true;
				781	}
				782
				783	static inline bool __task_will_free_mem(struct task_struct *task)
				784	{
				785	struct signal_struct *sig = task->signal;
				786
				787	/*
				788	* A coredumping process may sleep for an extended period in exit_mm(),
				789	* so the oom killer cannot assume that the process will promptly exit
				790	* and release memory.
				791	*/
				792	if (sig->flags & SIGNAL_GROUP_COREDUMP)
				793	return false;
				794
				795	if (sig->flags & SIGNAL_GROUP_EXIT)
				796	return true;
				797
				798	if (thread_group_empty(task) && (task->flags & PF_EXITING))
				799	return true;
				800
				801	return false;
				802	}
				803
				804	/*
				805	* Checks whether the given task is dying or exiting and likely to
				806	* release its address space. This means that all threads and processes
				807	* sharing the same mm have to be killed or exiting.
				808	* Caller has to make sure that task->mm is stable (hold task_lock or
				809	* it operates on the current).
				810	*/
				811	static bool task_will_free_mem(struct task_struct *task)
				812	{
				813	struct mm_struct *mm = task->mm;
				814	struct task_struct *p;
				815	bool ret = true;
				816
				817	/*
				818	* Skip tasks without mm because it might have passed its exit_mm and
				819	* exit_oom_victim. oom_reaper could have rescued that but do not rely
				820	* on that for now. We can consider find_lock_task_mm in future.
				821	*/
				822	if (!mm)
				823	return false;
				824
				825	if (!__task_will_free_mem(task))
				826	return false;
				827
				828	/*
				829	* This task has already been drained by the oom reaper so there are
				830	* only small chances it will free some more
				831	*/
				832	if (test_bit(MMF_OOM_SKIP, &mm->flags))
				833	return false;
				834
				835	if (atomic_read(&mm->mm_users) <= 1)
				836	return true;
				837
				838	/*
				839	* Make sure that all tasks which share the mm with the given tasks
				840	* are dying as well to make sure that a) nobody pins its mm and
				841	* b) the task is also reapable by the oom reaper.
				842	*/
				843	rcu_read_lock();
				844	for_each_process(p) {
				845	if (!process_shares_mm(p, mm))
				846	continue;
				847	if (same_thread_group(task, p))
				848	continue;
				849	ret = __task_will_free_mem(p);
				850	if (!ret)
				851	break;
				852	}
				853	rcu_read_unlock();
				854
				855	return ret;
				856	}
				857
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	858	static void __oom_kill_process(struct task_struct victim, const char message)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	859	{
				860	struct task_struct *p;
				861	struct mm_struct *mm;
				862	bool can_oom_reap = true;
				863
				864	p = find_lock_task_mm(victim);
				865	if (!p) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	866	pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
				867	message, task_pid_nr(victim), victim->comm);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	868	put_task_struct(victim);
				869	return;
				870	} else if (victim != p) {
				871	get_task_struct(p);
				872	put_task_struct(victim);
				873	victim = p;
				874	}
				875
				876	/* Get a reference to safely compare mm after task_unlock(victim) */
				877	mm = victim->mm;
				878	mmgrab(mm);
				879
				880	/* Raise event before sending signal: task reaper must see this */
				881	count_vm_event(OOM_KILL);
				882	memcg_memory_event_mm(mm, MEMCG_OOM_KILL);
				883
				884	/*
				885	* We should send SIGKILL before granting access to memory reserves
				886	* in order to prevent the OOM victim from depleting the memory
				887	* reserves from the user space under its control.
				888	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	889	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	890	mark_oom_victim(victim);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	891	pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
				892	message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
				893	K(get_mm_counter(mm, MM_ANONPAGES)),
				894	K(get_mm_counter(mm, MM_FILEPAGES)),
				895	K(get_mm_counter(mm, MM_SHMEMPAGES)),
				896	from_kuid(&init_user_ns, task_uid(victim)),
Olivier Deprez	0e64123	2021-09-23 10:07:05 +0200	[diff] [blame]	897	mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	898	task_unlock(victim);
				899
				900	/*
				901	* Kill all user processes sharing victim->mm in other thread groups, if
				902	* any. They don't get access to memory reserves, though, to avoid
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	903	* depletion of all memory. This prevents mm->mmap_lock livelock when an
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	904	* oom killed thread cannot exit because it requires the semaphore and
				905	* its contended by another thread trying to allocate memory itself.
				906	* That thread will now get access to memory reserves since it has a
				907	* pending fatal signal.
				908	*/
				909	rcu_read_lock();
				910	for_each_process(p) {
				911	if (!process_shares_mm(p, mm))
				912	continue;
				913	if (same_thread_group(p, victim))
				914	continue;
				915	if (is_global_init(p)) {
				916	can_oom_reap = false;
				917	set_bit(MMF_OOM_SKIP, &mm->flags);
				918	pr_info("oom killer %d (%s) has mm pinned by %d (%s)\n",
				919	task_pid_nr(victim), victim->comm,
				920	task_pid_nr(p), p->comm);
				921	continue;
				922	}
				923	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	924	* No kthead_use_mm() user needs to read from the userspace so
				925	* we are ok to reap it.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	926	*/
				927	if (unlikely(p->flags & PF_KTHREAD))
				928	continue;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	929	do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	930	}
				931	rcu_read_unlock();
				932
				933	if (can_oom_reap)
				934	wake_oom_reaper(victim);
				935
				936	mmdrop(mm);
				937	put_task_struct(victim);
				938	}
				939	#undef K
				940
				941	/*
				942	* Kill provided task unless it's secured by setting
				943	* oom_score_adj to OOM_SCORE_ADJ_MIN.
				944	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	945	static int oom_kill_memcg_member(struct task_struct task, void message)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	946	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	947	if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
				948	!is_global_init(task)) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	949	get_task_struct(task);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	950	__oom_kill_process(task, message);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	951	}
				952	return 0;
				953	}
				954
				955	static void oom_kill_process(struct oom_control oc, const char message)
				956	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	957	struct task_struct *victim = oc->chosen;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	958	struct mem_cgroup *oom_group;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	959	static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
				960	DEFAULT_RATELIMIT_BURST);
				961
				962	/*
				963	* If the task is already exiting, don't alarm the sysadmin or kill
				964	* its children or threads, just give it access to memory reserves
				965	* so it can die quickly
				966	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	967	task_lock(victim);
				968	if (task_will_free_mem(victim)) {
				969	mark_oom_victim(victim);
				970	wake_oom_reaper(victim);
				971	task_unlock(victim);
				972	put_task_struct(victim);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	973	return;
				974	}
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	975	task_unlock(victim);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	976
				977	if (__ratelimit(&oom_rs))
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	978	dump_header(oc, victim);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	979
				980	/*
				981	* Do we need to kill the entire memory cgroup?
				982	* Or even one of the ancestor memory cgroups?
				983	* Check this out before killing the victim task.
				984	*/
				985	oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
				986
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	987	__oom_kill_process(victim, message);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	988
				989	/*
				990	* If necessary, kill all tasks in the selected memory cgroup.
				991	*/
				992	if (oom_group) {
				993	mem_cgroup_print_oom_group(oom_group);
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	994	mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
				995	(void*)message);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	996	mem_cgroup_put(oom_group);
				997	}
				998	}
				999
				1000	/*
				1001	* Determines whether the kernel must panic because of the panic_on_oom sysctl.
				1002	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1003	static void check_panic_on_oom(struct oom_control *oc)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1004	{
				1005	if (likely(!sysctl_panic_on_oom))
				1006	return;
				1007	if (sysctl_panic_on_oom != 2) {
				1008	/*
				1009	* panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel
				1010	* does not panic for cpuset, mempolicy, or memcg allocation
				1011	* failures.
				1012	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1013	if (oc->constraint != CONSTRAINT_NONE)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1014	return;
				1015	}
				1016	/* Do not panic for oom kills triggered by sysrq */
				1017	if (is_sysrq_oom(oc))
				1018	return;
				1019	dump_header(oc, NULL);
				1020	panic("Out of memory: %s panic_on_oom is enabled\n",
				1021	sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
				1022	}
				1023
				1024	static BLOCKING_NOTIFIER_HEAD(oom_notify_list);
				1025
				1026	int register_oom_notifier(struct notifier_block *nb)
				1027	{
				1028	return blocking_notifier_chain_register(&oom_notify_list, nb);
				1029	}
				1030	EXPORT_SYMBOL_GPL(register_oom_notifier);
				1031
				1032	int unregister_oom_notifier(struct notifier_block *nb)
				1033	{
				1034	return blocking_notifier_chain_unregister(&oom_notify_list, nb);
				1035	}
				1036	EXPORT_SYMBOL_GPL(unregister_oom_notifier);
				1037
				1038	/**
				1039	* out_of_memory - kill the "best" process when we run out of memory
				1040	* @oc: pointer to struct oom_control
				1041	*
				1042	* If we run out of memory, we have the choice between either
				1043	* killing a random task (bad), letting the system crash (worse)
				1044	* OR try to be smart about which process to kill. Note that we
				1045	* don't have to be perfect here, we just have to be good.
				1046	*/
				1047	bool out_of_memory(struct oom_control *oc)
				1048	{
				1049	unsigned long freed = 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1050
				1051	if (oom_killer_disabled)
				1052	return false;
				1053
				1054	if (!is_memcg_oom(oc)) {
				1055	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
				1056	if (freed > 0)
				1057	/* Got some memory back in the last second. */
				1058	return true;
				1059	}
				1060
				1061	/*
				1062	* If current has a pending SIGKILL or is exiting, then automatically
				1063	* select it. The goal is to allow it to allocate so that it may
				1064	* quickly exit and free its memory.
				1065	*/
				1066	if (task_will_free_mem(current)) {
				1067	mark_oom_victim(current);
				1068	wake_oom_reaper(current);
				1069	return true;
				1070	}
				1071
				1072	/*
				1073	* The OOM killer does not compensate for IO-less reclaim.
				1074	* pagefault_out_of_memory lost its gfp context so we have to
				1075	* make sure exclude 0 mask - all other users should have at least
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1076	* ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
				1077	* invoke the OOM killer even if it is a GFP_NOFS allocation.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1078	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1079	if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1080	return true;
				1081
				1082	/*
				1083	* Check if there were limitations on the allocation (only relevant for
				1084	* NUMA and memcg) that may require different handling.
				1085	*/
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1086	oc->constraint = constrained_alloc(oc);
				1087	if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1088	oc->nodemask = NULL;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1089	check_panic_on_oom(oc);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1090
				1091	if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1092	current->mm && !oom_unkillable_task(current) &&
				1093	oom_cpuset_eligible(current, oc) &&
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1094	current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
				1095	get_task_struct(current);
				1096	oc->chosen = current;
				1097	oom_kill_process(oc, "Out of memory (oom_kill_allocating_task)");
				1098	return true;
				1099	}
				1100
				1101	select_bad_process(oc);
				1102	/* Found nothing?!?! */
				1103	if (!oc->chosen) {
				1104	dump_header(oc, NULL);
				1105	pr_warn("Out of memory and no killable processes...\n");
				1106	/*
				1107	* If we got here due to an actual allocation at the
				1108	* system level, we cannot survive this and will enter
				1109	* an endless loop in the allocator. Bail out now.
				1110	*/
				1111	if (!is_sysrq_oom(oc) && !is_memcg_oom(oc))
				1112	panic("System is deadlocked on memory\n");
				1113	}
				1114	if (oc->chosen && oc->chosen != (void *)-1UL)
				1115	oom_kill_process(oc, !is_memcg_oom(oc) ? "Out of memory" :
				1116	"Memory cgroup out of memory");
				1117	return !!oc->chosen;
				1118	}
				1119
				1120	/*
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1121	* The pagefault handler calls here because some allocation has failed. We have
				1122	* to take care of the memcg OOM here because this is the only safe context without
				1123	* any locks held but let the oom killer triggered from the allocation context care
				1124	* about the global OOM.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1125	*/
				1126	void pagefault_out_of_memory(void)
				1127	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1128	static DEFINE_RATELIMIT_STATE(pfoom_rs, DEFAULT_RATELIMIT_INTERVAL,
				1129	DEFAULT_RATELIMIT_BURST);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1130
				1131	if (mem_cgroup_oom_synchronize(true))
				1132	return;
				1133
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1134	if (fatal_signal_pending(current))
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1135	return;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1136
				1137	if (__ratelimit(&pfoom_rs))
				1138	pr_warn("Huh VM_FAULT_OOM leaked out to the #PF handler. Retrying PF\n");
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1139	}