Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 961401c..ee4eecc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,9 +46,11 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
+#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
+#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -126,6 +128,9 @@
unsigned int file_taken;
unsigned int taken;
} nr;
+
+ /* for recording the reclaimed slab by now */
+ struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
@@ -166,11 +171,22 @@
*/
unsigned long vm_total_pages;
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
/*
* We allow subsystems to populate their shrinker-related
* LRU lists before register_shrinker_prepared() is called
@@ -222,18 +238,7 @@
idr_remove(&shrinker_idr, id);
up_write(&shrinker_rwsem);
}
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return 0;
-}
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
@@ -288,6 +293,15 @@
}
#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
static bool global_reclaim(struct scan_control *sc)
{
return true;
@@ -337,12 +351,13 @@
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size;
+ unsigned long lru_size = 0;
int zid;
- if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
- else
+ if (!mem_cgroup_disabled()) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ } else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
@@ -369,7 +384,7 @@
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- size_t size = sizeof(*shrinker->nr_deferred);
+ unsigned int size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -473,23 +488,22 @@
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
-
- /*
- * Make sure we apply some minimal pressure on default priority
- * even on small cgroups. Stale objects are not only consuming memory
- * by themselves, but can also hold a reference to a dying cgroup,
- * preventing it from being reclaimed. A dying cgroup with all
- * corresponding structures like per-cpu stats and kmem caches
- * can be really big, so it may lead to a significant waste of memory.
- */
- delta = max_t(unsigned long long, delta, min(freeable, batch_size));
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
total_scan += delta;
if (total_scan < 0) {
- pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+ pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
next_deferred = nr;
@@ -575,7 +589,7 @@
return freed;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
@@ -583,7 +597,7 @@
unsigned long ret, freed = 0;
int i;
- if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
@@ -609,6 +623,11 @@
continue;
}
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, map->map);
@@ -645,13 +664,13 @@
up_read(&shrinker_rwsem);
return freed;
}
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
@@ -680,7 +699,14 @@
unsigned long ret, freed = 0;
struct shrinker *shrinker;
- if (!mem_cgroup_is_root(memcg))
+ /*
+ * The root memcg might be allocated even though memcg is disabled
+ * via "cgroup_disable=memory" boot parameter. This could make
+ * mem_cgroup_is_root() return false, then just run memcg slab
+ * shrink, but skip global shrink. This may result in premature
+ * oom.
+ */
+ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
@@ -741,12 +767,12 @@
{
/*
* A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache radix tree and
- * optional buffer heads at page->private.
+ * that isolated the page, the page cache and optional buffer
+ * heads at page->private.
*/
- int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
HPAGE_PMD_NR : 1;
- return page_count(page) - page_has_private(page) == 1 + radix_pins;
+ return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -907,10 +933,7 @@
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
- refcount = 1 + HPAGE_PMD_NR;
- else
- refcount = 2;
+ refcount = 1 + compound_nr(page);
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
@@ -922,7 +945,7 @@
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
@@ -948,7 +971,7 @@
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(mapping, page);
+ shadow = workingset_eviction(page);
__delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
@@ -1098,28 +1121,23 @@
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- int pgactivate = 0;
- unsigned nr_unqueued_dirty = 0;
- unsigned nr_dirty = 0;
- unsigned nr_congested = 0;
unsigned nr_reclaimed = 0;
- unsigned nr_writeback = 0;
- unsigned nr_immediate = 0;
- unsigned nr_ref_keep = 0;
- unsigned nr_unmap_fail = 0;
+ unsigned pgactivate = 0;
+ memset(stat, 0, sizeof(*stat));
cond_resched();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
+ unsigned int nr_pages;
cond_resched();
@@ -1131,7 +1149,10 @@
VM_BUG_ON_PAGE(PageActive(page), page);
- sc->nr_scanned++;
+ nr_pages = compound_nr(page);
+
+ /* Account the number of base pages even though THP */
+ sc->nr_scanned += nr_pages;
if (unlikely(!page_evictable(page)))
goto activate_locked;
@@ -1139,11 +1160,6 @@
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
- /* Double the slab pressure for mapped and swapcache pages */
- if ((page_mapped(page) || PageSwapCache(page)) &&
- !(PageAnon(page) && !PageSwapBacked(page)))
- sc->nr_scanned++;
-
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -1155,10 +1171,10 @@
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
- nr_dirty++;
+ stat->nr_dirty++;
if (dirty && !writeback)
- nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
@@ -1170,7 +1186,7 @@
if (((dirty || writeback) && mapping &&
inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
- nr_congested++;
+ stat->nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
@@ -1219,7 +1235,7 @@
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- nr_immediate++;
+ stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
@@ -1237,7 +1253,7 @@
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
- nr_writeback++;
+ stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
@@ -1250,14 +1266,14 @@
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
- nr_ref_keep++;
+ stat->nr_ref_keep += nr_pages;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1289,7 +1305,7 @@
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
- goto activate_locked;
+ goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_huge_page_to_list(page,
page_list))
@@ -1298,7 +1314,7 @@
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(page))
- goto activate_locked;
+ goto activate_locked_split;
}
may_enter_fs = 1;
@@ -1313,6 +1329,18 @@
}
/*
+ * THP may get split above, need minus tail pages and update
+ * nr_pages to avoid accounting tail pages twice.
+ *
+ * The tail pages that are added into swap cache successfully
+ * reach here.
+ */
+ if ((nr_pages > 1) && !PageTransHuge(page)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
+
+ /*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
@@ -1322,7 +1350,7 @@
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
- nr_unmap_fail++;
+ stat->nr_unmap_fail += nr_pages;
goto activate_locked;
}
}
@@ -1446,28 +1474,34 @@
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
- /*
- * At this point, we have no other references and there is
- * no way to pick any more up (removed from LRU, removed
- * from pagecache). Can use non-atomic bitops now (and
- * we obviously don't have to worry about waking up a process
- * waiting on the page lock, because there are no references.
- */
- __ClearPageLocked(page);
+
+ unlock_page(page);
free_it:
- nr_reclaimed++;
+ /*
+ * THP may get swapped out in a whole, need account
+ * all base pages.
+ */
+ nr_reclaimed += nr_pages;
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page))) {
- mem_cgroup_uncharge(page);
+ if (unlikely(PageTransHuge(page)))
(*get_compound_page_dtor(page))(page);
- } else
+ else
list_add(&page->lru, &free_pages);
continue;
+activate_locked_split:
+ /*
+ * The tail pages that are failed to add into swap cache
+ * reach here. Fixup nr_scanned and nr_pages.
+ */
+ if (nr_pages > 1) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
@@ -1475,8 +1509,9 @@
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
+ int type = page_is_file_cache(page);
SetPageActive(page);
- pgactivate++;
+ stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1486,6 +1521,8 @@
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_unref_page_list(&free_pages);
@@ -1493,16 +1530,6 @@
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- if (stat) {
- stat->nr_dirty = nr_dirty;
- stat->nr_congested = nr_congested;
- stat->nr_unqueued_dirty = nr_unqueued_dirty;
- stat->nr_writeback = nr_writeback;
- stat->nr_immediate = nr_immediate;
- stat->nr_activate = pgactivate;
- stat->nr_ref_keep = nr_ref_keep;
- stat->nr_unmap_fail = nr_unmap_fail;
- }
return nr_reclaimed;
}
@@ -1514,20 +1541,21 @@
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
+ struct reclaim_stat dummy_stat;
unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1632,8 +1660,8 @@
}
-/*
- * zone_lru_lock is heavily contended. Some of the functions that
+/**
+ * pgdat->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
@@ -1655,7 +1683,7 @@
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, enum lru_list lru)
+ enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
@@ -1664,11 +1692,11 @@
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
+ isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+ total_scan = 0;
scan = 0;
- for (total_scan = 0;
- scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
- total_scan++) {
+ while (scan < nr_to_scan && !list_empty(src)) {
struct page *page;
page = lru_to_page(src);
@@ -1676,9 +1704,12 @@
VM_BUG_ON_PAGE(!PageLRU(page), page);
+ nr_pages = compound_nr(page);
+ total_scan += nr_pages;
+
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
- nr_skipped[page_zonenum(page)]++;
+ nr_skipped[page_zonenum(page)] += nr_pages;
continue;
}
@@ -1687,11 +1718,14 @@
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
+ *
+ * Account all tail pages of THP. This would not cause
+ * premature OOM since __isolate_lru_page() returns -EBUSY
+ * only when the page is being freed somewhere else.
*/
- scan++;
+ scan += nr_pages;
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
@@ -1767,11 +1801,11 @@
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
- spin_lock_irq(zone_lru_lock(zone));
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ spin_lock_irq(&pgdat->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
@@ -1779,7 +1813,7 @@
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
return ret;
}
@@ -1821,40 +1855,54 @@
return isolated > inactive;
}
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold zone_lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop zone_lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- int lru;
-
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
continue;
}
-
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
- }
+ nr_pages = hpage_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1862,18 +1910,21 @@
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, page_list);
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
}
/*
@@ -1901,9 +1952,9 @@
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- struct reclaim_stat stat = {};
- isolate_mode_t isolate_mode = 0;
+ struct reclaim_stat stat;
int file = is_file_lru(lru);
+ enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
@@ -1923,28 +1974,18 @@
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
- nr_scanned);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_DIRECT, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
- nr_scanned);
- }
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1955,19 +1996,14 @@
spin_lock_irq(&pgdat->lru_lock);
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
- nr_reclaimed);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
- nr_reclaimed);
- }
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
- putback_inactive_pages(lruvec, &page_list);
+ move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -2004,73 +2040,6 @@
return nr_reclaimed;
}
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list,
- struct list_head *pages_to_free,
- enum lru_list lru)
-{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct page *page;
- int nr_pages;
- int nr_moved = 0;
-
- while (!list_empty(list)) {
- page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
-
- nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
-
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
- (*get_compound_page_dtor(page))(page);
- spin_lock_irq(&pgdat->lru_lock);
- } else
- list_add(&page->lru, pages_to_free);
- } else {
- nr_moved += nr_pages;
- }
- }
-
- if (!is_active_lru(lru)) {
- __count_vm_events(PGDEACTIVATE, nr_moved);
- count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_moved);
- }
-
- return nr_moved;
-}
-
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2086,25 +2055,21 @@
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2145,6 +2110,7 @@
}
ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
@@ -2160,17 +2126,79 @@
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge_list(&l_hold);
- free_unref_page_list(&l_hold);
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = -1;
+ unsigned long nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == -1) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = -1;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
@@ -2200,8 +2228,7 @@
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct mem_cgroup *memcg,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2221,17 +2248,13 @@
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
/*
* When refaults are being observed, it means a new workingset
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2241,7 +2264,7 @@
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -2251,12 +2274,10 @@
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc)
+ struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru),
- memcg, sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2356,7 +2377,7 @@
* anonymous pages on the LRU in eligible zones.
* Otherwise, the small LRU gets thrashed.
*/
- if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, false, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
>> sc->priority) {
scan_balance = SCAN_ANON;
@@ -2374,7 +2395,7 @@
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2436,17 +2457,70 @@
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decremeting
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
+
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2466,7 +2540,7 @@
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2475,7 +2549,7 @@
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2527,7 +2601,7 @@
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, memcg, sc);
+ lruvec, sc);
}
}
@@ -2594,7 +2668,7 @@
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2619,7 +2693,6 @@
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
- unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
@@ -2630,40 +2703,18 @@
if (!in_reclaim_compaction(sc))
return false;
- /* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
- /*
- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
- * full LRU list has been scanned and we are still failing
- * to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
- } else {
- /*
- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
- * fail without consequence, stop if we failed to reclaim
- * any pages from the last SWAP_CLUSTER_MAX number of
- * pages that were scanned. This will return to the
- * caller faster at the risk reclaim/compaction and
- * the resulting allocation attempt fails
- */
- if (!nr_reclaimed)
- return false;
- }
-
/*
- * If we have not reclaimed enough pages for compaction and the
- * inactive lists are large enough, continue reclaiming
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk reclaim/compaction and the resulting allocation attempt
+ * fails. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
*/
- pages_for_compaction = compact_gap(sc->order);
- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed < pages_for_compaction &&
- inactive_lru_pages > pages_for_compaction)
- return true;
+ if (!nr_reclaimed)
+ return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2680,7 +2731,17 @@
;
}
}
- return true;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
}
static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
@@ -2697,10 +2758,6 @@
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .pgdat = pgdat,
- .priority = sc->priority,
- };
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
@@ -2709,7 +2766,7 @@
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ memcg = mem_cgroup_iter(root, NULL, NULL);
do {
unsigned long lru_pages;
unsigned long reclaimed;
@@ -2736,6 +2793,13 @@
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on by how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
@@ -2744,30 +2808,15 @@
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- /*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * node.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!global_reclaim(sc) &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2844,7 +2893,7 @@
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ sc));
/*
* Kswapd gives up on balancing particular nodes after too
@@ -2992,12 +3041,8 @@
unsigned long refaults;
struct lruvec *lruvec;
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
@@ -3245,20 +3290,20 @@
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
- trace_mm_vmscan_direct_reclaim_begin(order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
#ifdef CONFIG_MEMCG
+/* Only used by soft limit reclaim. Do not reuse for anything else. */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
@@ -3274,13 +3319,13 @@
};
unsigned long lru_pages;
+ WARN_ON_ONCE(!current->reclaim_state);
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3294,6 +3339,7 @@
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
+
return sc.nr_reclaimed;
}
@@ -3304,6 +3350,7 @@
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long pflags;
int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
@@ -3318,6 +3365,7 @@
.may_swap = may_swap,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -3327,16 +3375,18 @@
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3354,7 +3404,7 @@
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3362,6 +3412,30 @@
} while (memcg);
}
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+ int i;
+ struct zone *zone;
+
+ /*
+ * Check for watermark boosts top-down as the higher zones
+ * are more likely to be boosted. Both watermarks and boosts
+ * should not be checked at the time time as reclaim would
+ * start prematurely when there is no boosting and a lower
+ * zone is balanced.
+ */
+ for (i = classzone_idx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ if (zone->watermark_boost)
+ return true;
+ }
+
+ return false;
+}
+
/*
* Returns true if there is an eligible zone balanced for the request order
* and classzone_idx
@@ -3372,6 +3446,10 @@
unsigned long mark = -1;
struct zone *zone;
+ /*
+ * Check watermarks bottom-up as lower zones are more likely to
+ * meet watermarks.
+ */
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
@@ -3490,7 +3568,7 @@
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
@@ -3499,23 +3577,45 @@
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long pflags;
+ unsigned long nr_boost_reclaim;
+ unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+ bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
- .priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
+ /*
+ * Account for the reclaim boost. Note that the zone boost is left in
+ * place so that parallel allocations that are near the watermark will
+ * stall or direct reclaim until kswapd is finished.
+ */
+ nr_boost_reclaim = 0;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ nr_boost_reclaim += zone->watermark_boost;
+ zone_boosts[i] = zone->watermark_boost;
+ }
+ boosted = nr_boost_reclaim;
+
+restart:
+ sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool balanced;
bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3542,13 +3642,39 @@
}
/*
- * Only reclaim if there are no eligible zones. Note that
- * sc.reclaim_idx is not used as buffer_heads_over_limit may
- * have adjusted it.
+ * If the pgdat is imbalanced then ignore boosting and preserve
+ * the watermarks for a later time and restart. Note that the
+ * zone watermarks will be still reset at the end of balancing
+ * on the grounds that the normal reclaim should be enough to
+ * re-evaluate if boosting is required when kswapd next wakes.
*/
- if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ if (!balanced && nr_boost_reclaim) {
+ nr_boost_reclaim = 0;
+ goto restart;
+ }
+
+ /*
+ * If boosting is not active then only reclaim if there are no
+ * eligible zones. Note that sc.reclaim_idx is not used as
+ * buffer_heads_over_limit may have adjusted it.
+ */
+ if (!nr_boost_reclaim && balanced)
goto out;
+ /* Limit the priority of boosting to avoid reclaim writeback */
+ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+ raise_priority = false;
+
+ /*
+ * Do not writeback or swap pages for boosted reclaim. The
+ * intent is to relieve pressure not issue sub-optimal IO
+ * from reclaim context. If no pages are reclaimed, the
+ * reclaim will be aborted.
+ */
+ sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_swap = !nr_boost_reclaim;
+
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
@@ -3600,6 +3726,16 @@
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+ /*
+ * If reclaim made no progress for a boost, stop reclaim as
+ * IO cannot be queued and it could be an infinite loop in
+ * extreme circumstances.
+ */
+ if (nr_boost_reclaim && !nr_reclaimed)
+ break;
+
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
@@ -3608,8 +3744,33 @@
pgdat->kswapd_failures++;
out:
+ /* If reclaim was boosted, account for the reclaim done in this pass */
+ if (boosted) {
+ unsigned long flags;
+
+ for (i = 0; i <= classzone_idx; i++) {
+ if (!zone_boosts[i])
+ continue;
+
+ /* Increments are under the zone lock */
+ zone = pgdat->node_zones + i;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /*
+ * As there is now likely space, wakeup kcompact to defragment
+ * pageblocks.
+ */
+ wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ }
+
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
+ psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3620,19 +3781,18 @@
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
+ * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
+ * a valid index then either kswapd runs for first time or kswapd couldn't sleep
+ * after previous reclaim attempt (node is still unbalanced). In that case
+ * return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3734,15 +3894,10 @@
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3773,7 +3928,7 @@
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
@@ -3804,7 +3959,6 @@
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
return 0;
}
@@ -3827,15 +3981,20 @@
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- pgdat_balanced(pgdat, order, classzone_idx)) {
+ (pgdat_balanced(pgdat, order, classzone_idx) &&
+ !pgdat_watermark_boosted(pgdat, classzone_idx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -3864,7 +4023,6 @@
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3876,18 +4034,16 @@
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- struct task_struct *p = current;
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
+ set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
@@ -4052,7 +4208,6 @@
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4065,6 +4220,9 @@
.reclaim_idx = gfp_zone(gfp_mask),
};
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
@@ -4074,8 +4232,7 @@
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
@@ -4087,10 +4244,13 @@
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
+ set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed >= nr_pages;
}
@@ -4163,17 +4323,16 @@
return ret;
}
-#ifdef CONFIG_SHMEM
/**
- * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
- * @pages: array of pages to check
- * @nr_pages: number of pages to check
+ * check_move_unevictable_pages - check pages for evictability and move to
+ * appropriate zone lru list
+ * @pvec: pagevec with lru pages to check
*
- * Checks pages for evictability and moves them to the appropriate lru list.
- *
- * This function is only used for SysV IPC SHM_UNLOCK.
+ * Checks pages for evictability, if an evictable page is in the unevictable
+ * lru list, moves it to the appropriate evictable lru list. This function
+ * should be only used for lru pages.
*/
-void check_move_unevictable_pages(struct page **pages, int nr_pages)
+void check_move_unevictable_pages(struct pagevec *pvec)
{
struct lruvec *lruvec;
struct pglist_data *pgdat = NULL;
@@ -4181,8 +4340,8 @@
int pgrescued = 0;
int i;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pages[i];
+ for (i = 0; i < pvec->nr; i++) {
+ struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
pgscanned++;
@@ -4214,4 +4373,4 @@
spin_unlock_irq(&pgdat->lru_lock);
}
}
-#endif /* CONFIG_SHMEM */
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);