Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59..46ad252 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -65,6 +57,8 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -94,6 +88,10 @@
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -248,6 +246,12 @@
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+static inline bool should_force_charge(void)
+{
+ return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
+ (current->flags & PF_EXITING);
+}
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -314,6 +318,7 @@
EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -437,14 +442,6 @@
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -487,7 +484,10 @@
unsigned long ino = 0;
rcu_read_lock();
- memcg = READ_ONCE(page->mem_cgroup);
+ if (PageSlab(page) && !PageTail(page))
+ memcg = memcg_from_slab_page(page);
+ else
+ memcg = READ_ONCE(page->mem_cgroup);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -680,10 +680,153 @@
return mz;
}
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- return atomic_long_read(&memcg->events[event]);
+ long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+ struct mem_cgroup *parent;
+
+ parent = parent_mem_cgroup(pn->memcg);
+ if (!parent)
+ return NULL;
+ return mem_cgroup_nodeinfo(parent, nid);
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates the all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ long x;
+
+ /* Update node */
+ __mod_node_page_state(pgdat, idx, val);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
+ /* Update memcg */
+ __mod_memcg_state(memcg, idx, val);
+
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup_per_node *pi;
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
+ x = 0;
+ }
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+ struct page *page = virt_to_head_page(p);
+ pg_data_t *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = memcg_from_slab_page(page);
+
+ /* Untracked pages have no memcg, no lruvec. Update only the node */
+ if (!memcg || memcg == root_mem_cgroup) {
+ __mod_node_page_state(pgdat, idx, val);
+ } else {
+ lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ __mod_lruvec_state(lruvec, idx, val);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occured
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count)
+{
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[idx]);
+ x = 0;
+ }
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+ return atomic_long_read(&memcg->vmevents[event]);
+}
+
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -715,35 +858,7 @@
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -751,8 +866,8 @@
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -768,7 +883,7 @@
default:
break;
}
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
return true;
}
return false;
@@ -845,7 +960,7 @@
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1031,26 +1146,45 @@
css_put(&prev->css);
}
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
{
- struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
int i;
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgruop1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1076,7 +1210,7 @@
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, 0, &it);
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1168,32 +1302,6 @@
*lru_size += nr_pages;
}
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1269,85 +1377,174 @@
return false;
}
-static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- NR_SHMEM,
- NR_FILE_MAPPED,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
- MEMCG_SWAP,
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
-static const char *const memcg1_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "shmem",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
-};
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_buf_printf(&s, "anon %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "kernel_stack %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ 1024);
+ seq_buf_printf(&s, "slab %llu\n",
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "sock %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "shmem %llu\n",
+ (u64)memcg_page_state(memcg, NR_SHMEM) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_mapped %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_dirty %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_writeback %llu\n",
+ (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+ PAGE_SIZE);
+
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_buf_printf(&s, "anon_thp %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "slab_reclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+ PAGE_SIZE);
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+ seq_buf_printf(&s, "workingset_refault %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT));
+ seq_buf_printf(&s, "workingset_activate %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+ memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+ seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct mem_cgroup *iter;
- unsigned int i;
-
rcu_read_lock();
+ if (memcg) {
+ pr_cont(",oom_memcg=");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ } else
+ pr_cont(",global_oom");
if (p) {
- pr_info("Task in ");
+ pr_cont(",task_memcg=");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
- } else {
- pr_info("Memory limit reached of cgroup ");
}
-
- pr_cont_cgroup_path(memcg->css.cgroup);
- pr_cont("\n");
-
rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+ char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
K((u64)memcg->memory.max), memcg->memory.failcnt);
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
-
- pr_cont("\n");
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)memcg->swap.max), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
}
/*
@@ -1370,6 +1567,11 @@
return max;
}
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -1382,8 +1584,13 @@
};
bool ret;
- mutex_lock(&oom_lock);
- ret = out_of_memory(&oc);
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = should_force_charge() || out_of_memory(&oc);
mutex_unlock(&oom_lock);
return ret;
}
@@ -1403,11 +1610,15 @@
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -1666,9 +1877,14 @@
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
+ enum oom_status ret;
+ bool locked;
+
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+ memcg_memory_event(memcg, MEMCG_OOM);
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
@@ -1698,10 +1914,23 @@
return OOM_ASYNC;
}
- if (mem_cgroup_out_of_memory(memcg, mask, order))
- return OOM_SUCCESS;
+ mem_cgroup_mark_under_oom(memcg);
- return OOM_FAILED;
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
+
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
+
+ return ret;
}
/**
@@ -2040,21 +2269,22 @@
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
@@ -2063,7 +2293,7 @@
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
@@ -2075,9 +2305,10 @@
int nid;
long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
if (x)
- atomic_long_add(x, &memcg->stat[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmstats[i]);
if (i >= NR_VM_NODE_STAT_ITEMS)
continue;
@@ -2088,16 +2319,19 @@
pn = mem_cgroup_nodeinfo(memcg, nid);
x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
if (x)
- atomic_long_add(x, &pn->lruvec_stat[i]);
+ do {
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
}
}
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
long x;
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
if (x)
- atomic_long_add(x, &memcg->events[i]);
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &memcg->vmevents[i]);
}
}
@@ -2125,11 +2359,67 @@
}
/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these either side of the exponentiation to
+ * maintain precision and scale to a reasonable number of jiffies (see the table
+ * below.
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaviour cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
@@ -2138,8 +2428,75 @@
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if it was a
+ * threshold of 1 page
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge patch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2152,7 +2509,6 @@
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
@@ -2179,14 +2535,21 @@
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(tsk_is_oom_victim(current) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(should_force_charge()))
goto force;
/*
@@ -2241,7 +2604,7 @@
if (nr_retries--)
goto retry;
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
@@ -2250,8 +2613,6 @@
if (fatal_signal_pending(current))
goto force;
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
@@ -2262,7 +2623,6 @@
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- oomed = true;
goto retry;
case OOM_FAILED:
goto force;
@@ -2329,13 +2689,13 @@
static void lock_page_lru(struct page *page, int *isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
if (PageLRU(page)) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
*isolated = 1;
@@ -2345,17 +2705,17 @@
static void unlock_page_lru(struct page *page, int isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
if (isolated) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2460,17 +2820,18 @@
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
+ if (!css_tryget_online(&memcg->css))
+ return;
+
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
- css_get(&memcg->css);
-
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2478,25 +2839,6 @@
queue_work(memcg_kmem_cache_wq, &cw->work);
}
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2524,6 +2866,7 @@
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ struct memcg_cache_array *arr;
int kmemcg_id;
VM_BUG_ON(!is_root_cache(cachep));
@@ -2531,17 +2874,28 @@
if (memcg_kmem_bypass())
return cachep;
- if (current->memcg_kmem_skip_account)
- return cachep;
+ rcu_read_lock();
- memcg = get_mem_cgroup_from_current();
+ if (unlikely(current->active_memcg))
+ memcg = current->active_memcg;
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ if (!memcg || memcg == root_mem_cgroup)
+ goto out_unlock;
+
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
- goto out;
+ goto out_unlock;
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
- return memcg_cachep;
+ arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match the data dependency
+ * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+ */
+ memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2554,10 +2908,20 @@
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
+ *
+ * If the memcg is dying or memcg_cache is about to be released,
+ * don't bother creating new kmem_caches. Because memcg_cachep
+ * is ZEROed as the fist step of kmem offlining, we don't need
+ * percpu_ref_tryget_live() here. css_tryget_online() check in
+ * memcg_schedule_kmem_cache_create() will prevent us from
+ * creation of a new kmem_cache.
*/
- memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
- css_put(&memcg->css);
+ if (unlikely(!memcg_cachep))
+ memcg_schedule_kmem_cache_create(memcg, cachep);
+ else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+ cachep = memcg_cachep;
+out_unlock:
+ rcu_read_unlock();
return cachep;
}
@@ -2568,11 +2932,11 @@
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
+ percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
@@ -2580,7 +2944,7 @@
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
@@ -2593,24 +2957,31 @@
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
@@ -2620,19 +2991,37 @@
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ }
}
css_put(&memcg->css);
return ret;
}
+
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+/**
+ * __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2641,14 +3030,7 @@
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
+ __memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -2663,7 +3045,7 @@
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone_lru_lock and migration entries setup in all page mappings.
+ * pgdat->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -2953,50 +3335,15 @@
return retval;
}
-struct accumulated_stats {
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
- unsigned long lru_pages[NR_LRU_LISTS];
- const unsigned int *stats_array;
- const unsigned int *events_array;
- int stats_size;
- int events_size;
-};
-
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
- struct accumulated_stats *acc)
-{
- struct mem_cgroup *mi;
- int i;
-
- for_each_mem_cgroup_tree(mi, memcg) {
- for (i = 0; i < acc->stats_size; i++)
- acc->stat[i] += memcg_page_state(mi,
- acc->stats_array ? acc->stats_array[i] : i);
-
- for (i = 0; i < acc->events_size; i++)
- acc->events[i] += memcg_sum_events(mi,
- acc->events_array ? acc->events_array[i] : i);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
- }
-}
-
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val = 0;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
- if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
- }
+ val = memcg_page_state(memcg, MEMCG_CACHE) +
+ memcg_page_state(memcg, MEMCG_RSS);
+ if (swap)
+ val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3057,6 +3404,72 @@
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3102,16 +3515,23 @@
*/
memcg->kmem_state = KMEM_ALLOCATED;
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
/*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
+ memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ /*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
* to ones of the parent. After we have finished, all list_lrus
@@ -3141,9 +3561,8 @@
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
+ WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -3235,6 +3654,9 @@
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
@@ -3320,6 +3742,42 @@
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3336,7 +3794,7 @@
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3370,6 +3828,28 @@
}
#endif /* CONFIG_NUMA */
+static const unsigned int memcg1_stats[] = {
+ MEMCG_CACHE,
+ MEMCG_RSS,
+ MEMCG_RSS_HUGE,
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+ "rss_huge",
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
+
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
PGPGIN,
@@ -3387,11 +3867,10 @@
static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
- struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3400,17 +3879,18 @@
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
+ memcg_page_state_local(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "%s %lu\n", memcg1_event_names[i],
- memcg_sum_events(memcg, memcg1_events[i]));
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3424,27 +3904,22 @@
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
- acc.stats_array = memcg1_stats;
- acc.events_size = ARRAY_SIZE(memcg1_events);
- acc.events_array = memcg1_events;
- accumulate_memcg_tree(memcg, &acc);
-
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)acc.stat[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)acc.events[i]);
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -3625,8 +4100,7 @@
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -3820,7 +4294,7 @@
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -3847,6 +4321,8 @@
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -3872,6 +4348,22 @@
return &memcg->cgwb_domain;
}
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = atomic_long_read(&memcg->vmstats[idx]);
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
+ if (x < 0)
+ x = 0;
+ return x;
+}
+
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -3897,12 +4389,12 @@
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
@@ -3914,6 +4406,130 @@
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * trackes ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underyling IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4321,14 +4937,12 @@
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
@@ -4336,11 +4950,6 @@
}
}
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
- mem_cgroup_id_get_many(memcg, 1);
-}
-
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
mem_cgroup_id_put_many(memcg, 1);
@@ -4376,8 +4985,15 @@
if (!pn)
return 1;
+ pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat_local) {
+ kfree(pn);
+ return 1;
+ }
+
pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
if (!pn->lruvec_stat_cpu) {
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
return 1;
}
@@ -4399,6 +5015,7 @@
return;
free_percpu(pn->lruvec_stat_cpu);
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
}
@@ -4408,21 +5025,29 @@
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat_cpu);
+ free_percpu(memcg->vmstats_percpu);
+ free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4437,8 +5062,12 @@
if (memcg->id.id < 0)
goto fail;
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat_cpu)
+ memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+ if (!memcg->vmstats_local)
+ goto fail;
+
+ memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ if (!memcg->vmstats_percpu)
goto fail;
for_each_node(node)
@@ -4462,6 +5091,14 @@
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4512,6 +5149,9 @@
/* The following stuff does not apply to the root */
if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+ INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -4545,7 +5185,7 @@
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
@@ -4573,6 +5213,8 @@
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
@@ -4586,7 +5228,12 @@
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
@@ -4669,7 +5316,7 @@
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
+ struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
@@ -4750,7 +5397,7 @@
/* shmem/tmpfs may report page out on swap: account for that too. */
if (shmem_mapping(mapping)) {
page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
@@ -4782,6 +5429,8 @@
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -4805,11 +5454,15 @@
anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(pgdat, from);
+ to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -4821,16 +5474,24 @@
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && !list_empty(page_deferred_list(page))) {
+ spin_lock(&from->deferred_split_queue.split_queue_lock);
+ list_del_init(page_deferred_list(page));
+ from->deferred_split_queue.split_queue_len--;
+ spin_unlock(&from->deferred_split_queue.split_queue_lock);
+ }
+#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -4839,6 +5500,17 @@
/* caller should have done css_get */
page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && list_empty(page_deferred_list(page))) {
+ spin_lock(&to->deferred_split_queue.split_queue_lock);
+ list_add_tail(page_deferred_list(page),
+ &to->deferred_split_queue.split_queue);
+ to->deferred_split_queue.split_queue_len++;
+ spin_unlock(&to->deferred_split_queue.split_queue_lock);
+ }
+#endif
+
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -4870,8 +5542,8 @@
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -4905,8 +5577,7 @@
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_public_page(page))
+ if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -4977,8 +5648,8 @@
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
- * MEMORY_DEVICE_PRIVATE but this might change.
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
@@ -4998,17 +5669,16 @@
return 0;
}
+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -5277,13 +5947,12 @@
return ret;
}
+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
@@ -5309,7 +5978,8 @@
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
@@ -5353,6 +6023,16 @@
root_mem_cgroup->use_hierarchy = false;
}
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5363,15 +6043,8 @@
static int memory_min_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long min = READ_ONCE(memcg->memory.min);
-
- if (min == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5393,15 +6066,8 @@
static int memory_low_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->memory.low);
-
- if (low == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5423,15 +6089,7 @@
static int memory_high_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
-
- if (high == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5460,15 +6118,8 @@
static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5520,104 +6171,48 @@
return nbytes;
}
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "low %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
- seq_printf(m, "high %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
- seq_printf(m, "max %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
- seq_printf(m, "oom %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+ __memory_events_show(m, memcg->memory_events);
+ return 0;
+}
+static int memory_events_local_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ __memory_events_show(m, memcg->memory_events_local);
return 0;
}
static int memory_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- struct accumulated_stats acc;
- int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
- /*
- * Provide statistics on the state of the memory subsystem as
- * well as cumulative event counters that show past behavior.
- *
- * This list is ordered following a combination of these gradients:
- * 1) generic big picture -> specifics and details
- * 2) reflecting userspace activity -> reflecting kernel heuristics
- *
- * Current memory state:
- */
-
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = MEMCG_NR_STAT;
- acc.events_size = NR_VM_EVENT_ITEMS;
- accumulate_memcg_tree(memcg, &acc);
-
- seq_printf(m, "anon %llu\n",
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
- seq_printf(m, "file %llu\n",
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
- seq_printf(m, "kernel_stack %llu\n",
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
- seq_printf(m, "slab %llu\n",
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
- seq_printf(m, "sock %llu\n",
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
-
- seq_printf(m, "shmem %llu\n",
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
- seq_printf(m, "file_mapped %llu\n",
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
- seq_printf(m, "file_dirty %llu\n",
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
- seq_printf(m, "file_writeback %llu\n",
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
-
- seq_printf(m, "slab_reclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
- seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
-
- /* Accumulated memory events */
-
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
-
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
- acc.events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
- acc.events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
-
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
return 0;
}
static int memory_oom_group_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
@@ -5683,6 +6278,12 @@
.seq_show = memory_events_show,
},
{
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
@@ -5746,7 +6347,7 @@
*
* | memory.current, if memory.current < memory.low
* low_usage = |
- | 0, otherwise.
+ * | 0, otherwise.
*
*
* Such definition of the effective memory.low provides the expected
@@ -6045,7 +6646,7 @@
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -6080,7 +6681,7 @@
unsigned int nr_pages = 1;
if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
+ nr_pages = compound_nr(page);
ug->nr_huge += nr_pages;
}
if (PageAnon(page))
@@ -6092,7 +6693,7 @@
}
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += compound_nr(page);
__ClearPageKmemcg(page);
}
@@ -6377,7 +6978,7 @@
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
@@ -6600,15 +7201,8 @@
static int swap_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6630,7 +7224,7 @@
static int swap_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));