Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f10aa53..71e3ace 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
*
@@ -63,21 +64,33 @@
*/
DEFINE_MUTEX(oom_lock);
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+ return oc->memcg != NULL;
+}
+
#ifdef CONFIG_NUMA
/**
- * has_intersects_mems_allowed() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function assumes oom-killer context and that 'current' has triggered
+ * the oom-killer.
*/
-static bool has_intersects_mems_allowed(struct task_struct *start,
- const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *start,
+ struct oom_control *oc)
{
struct task_struct *tsk;
bool ret = false;
+ const nodemask_t *mask = oc->nodemask;
+
+ if (is_memcg_oom(oc))
+ return true;
rcu_read_lock();
for_each_thread(start, tsk) {
@@ -104,8 +117,7 @@
return ret;
}
#else
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
- const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
return true;
}
@@ -145,28 +157,13 @@
return oc->order == -1;
}
-static inline bool is_memcg_oom(struct oom_control *oc)
-{
- return oc->memcg != NULL;
-}
-
/* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p)
{
if (is_global_init(p))
return true;
if (p->flags & PF_KTHREAD)
return true;
-
- /* When mem_cgroup_out_of_memory() and p is not member of the group */
- if (memcg && !task_in_mem_cgroup(p, memcg))
- return true;
-
- /* p may not have freeable memory in nodemask */
- if (!has_intersects_mems_allowed(p, nodemask))
- return true;
-
return false;
}
@@ -193,20 +190,17 @@
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
* @totalpages: total present RAM allowed for page allocation
- * @memcg: task's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
- if (oom_unkillable_task(p, memcg, nodemask))
+ if (oom_unkillable_task(p))
return 0;
p = find_lock_task_mm(p);
@@ -245,11 +239,11 @@
return points > 0 ? points : 1;
}
-enum oom_constraint {
- CONSTRAINT_NONE,
- CONSTRAINT_CPUSET,
- CONSTRAINT_MEMORY_POLICY,
- CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+ [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+ [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+ [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+ [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
/*
@@ -269,7 +263,7 @@
}
/* Default to all available memory */
- oc->totalpages = totalram_pages + total_swap_pages;
+ oc->totalpages = totalram_pages() + total_swap_pages;
if (!IS_ENABLED(CONFIG_NUMA))
return CONSTRAINT_NONE;
@@ -293,7 +287,7 @@
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
@@ -306,7 +300,7 @@
if (cpuset_limited) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
@@ -317,7 +311,11 @@
struct oom_control *oc = arg;
unsigned long points;
- if (oom_unkillable_task(task, NULL, oc->nodemask))
+ if (oom_unkillable_task(task))
+ goto next;
+
+ /* task may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;
/*
@@ -341,13 +339,10 @@
goto select;
}
- points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+ points = oom_badness(task, oc->totalpages);
if (!points || points < oc->chosen_points)
goto next;
- /* Prefer thread group leaders for display purposes */
- if (points == oc->chosen_points && thread_group_leader(oc->chosen))
- goto next;
select:
if (oc->chosen)
put_task_struct(oc->chosen);
@@ -380,14 +375,44 @@
break;
rcu_read_unlock();
}
+}
- oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
+static int dump_task(struct task_struct *p, void *arg)
+{
+ struct oom_control *oc = arg;
+ struct task_struct *task;
+
+ if (oom_unkillable_task(p))
+ return 0;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ return 0;
+
+ task = find_lock_task_mm(p);
+ if (!task) {
+ /*
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
+ */
+ return 0;
+ }
+
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ task->pid, from_kuid(&init_user_ns, task_uid(task)),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ mm_pgtables_bytes(task->mm),
+ get_mm_counter(task->mm, MM_SWAPENTS),
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+
+ return 0;
}
/**
* dump_tasks - dump current memory state of all system tasks
- * @memcg: current's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -395,59 +420,55 @@
* State information includes task's pid, uid, tgid, vm size, rss,
* pgtables_bytes, swapents, oom_score_adj value, and name.
*/
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct oom_control *oc)
{
- struct task_struct *p;
- struct task_struct *task;
-
pr_info("Tasks state (memory values in pages):\n");
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
- rcu_read_lock();
- for_each_process(p) {
- if (oom_unkillable_task(p, memcg, nodemask))
- continue;
- task = find_lock_task_mm(p);
- if (!task) {
- /*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
- */
- continue;
- }
+ if (is_memcg_oom(oc))
+ mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+ else {
+ struct task_struct *p;
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
- task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- mm_pgtables_bytes(task->mm),
- get_mm_counter(task->mm, MM_SWAPENTS),
- task->signal->oom_score_adj, task->comm);
- task_unlock(task);
+ rcu_read_lock();
+ for_each_process(p)
+ dump_task(p, oc);
+ rcu_read_unlock();
}
- rcu_read_unlock();
+}
+
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+ /* one line summary of the oom killer context. */
+ pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+ oom_constraint_text[oc->constraint],
+ nodemask_pr_args(oc->nodemask));
+ cpuset_print_current_mems_allowed();
+ mem_cgroup_print_oom_context(oc->memcg, victim);
+ pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+ from_kuid(&init_user_ns, task_uid(victim)));
}
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, &oc->gfp_mask,
- nodemask_pr_args(oc->nodemask), oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
- cpuset_print_current_mems_allowed();
dump_stack();
if (is_memcg_oom(oc))
- mem_cgroup_print_oom_info(oc->memcg, p);
+ mem_cgroup_print_oom_meminfo(oc->memcg);
else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
if (is_dump_unreclaim_slabs())
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
- dump_tasks(oc->memcg, oc->nodemask);
+ dump_tasks(oc);
+ if (p)
+ dump_oom_summary(oc, p);
}
/*
@@ -502,7 +523,7 @@
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
@@ -516,19 +537,21 @@
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- tlb_gather_mmu(&tlb, mm, start, end);
- if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
+ vma->vm_end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+ tlb_finish_mmu(&tlb, range.start, range.end);
ret = false;
continue;
}
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
}
}
@@ -634,8 +657,8 @@
static void wake_oom_reaper(struct task_struct *tsk)
{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -830,7 +853,7 @@
return ret;
}
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
{
struct task_struct *p;
struct mm_struct *mm;
@@ -859,13 +882,15 @@
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)),
- K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
task_unlock(victim);
/*
@@ -897,7 +922,7 @@
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
}
rcu_read_unlock();
@@ -913,24 +938,20 @@
* Kill provided task unless it's secured by setting
* oom_score_adj to OOM_SCORE_ADJ_MIN.
*/
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
- if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
get_task_struct(task);
- __oom_kill_process(task);
+ __oom_kill_process(task, message);
}
return 0;
}
static void oom_kill_process(struct oom_control *oc, const char *message)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *victim = oc->chosen;
struct mem_cgroup *oom_group;
- unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -939,49 +960,18 @@
* its children or threads, just give it access to memory reserves
* so it can die quickly
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ task_lock(victim);
+ if (task_will_free_mem(victim)) {
+ mark_oom_victim(victim);
+ wake_oom_reaper(victim);
+ task_unlock(victim);
+ put_task_struct(victim);
return;
}
- task_unlock(p);
+ task_unlock(victim);
if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
+ dump_header(oc, victim);
/*
* Do we need to kill the entire memory cgroup?
@@ -990,14 +980,15 @@
*/
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
- __oom_kill_process(victim);
+ __oom_kill_process(victim, message);
/*
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
- mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+ (void*)message);
mem_cgroup_put(oom_group);
}
}
@@ -1005,8 +996,7 @@
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-static void check_panic_on_oom(struct oom_control *oc,
- enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -1016,7 +1006,7 @@
* does not panic for cpuset, mempolicy, or memcg allocation
* failures.
*/
- if (constraint != CONSTRAINT_NONE)
+ if (oc->constraint != CONSTRAINT_NONE)
return;
}
/* Do not panic for oom kills triggered by sysrq */
@@ -1053,7 +1043,6 @@
bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
- enum oom_constraint constraint = CONSTRAINT_NONE;
if (oom_killer_disabled)
return false;
@@ -1080,22 +1069,24 @@
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA and memcg) that may require different handling.
*/
- constraint = constrained_alloc(oc);
- if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->constraint = constrained_alloc(oc);
+ if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
- check_panic_on_oom(oc, constraint);
+ check_panic_on_oom(oc);
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
- current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+ current->mm && !oom_unkillable_task(current) &&
+ oom_cpuset_eligible(current, oc) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
oc->chosen = current;