Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/mm/Kconfig b/mm/Kconfig
index de64ea6..a5dae9a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
menu "Memory Management options"
@@ -11,23 +12,24 @@
default DISCONTIGMEM_MANUAL if ARCH_DISCONTIGMEM_DEFAULT
default SPARSEMEM_MANUAL if ARCH_SPARSEMEM_DEFAULT
default FLATMEM_MANUAL
+ help
+ This option allows you to change some of the ways that
+ Linux manages its memory internally. Most users will
+ only have one option here selected by the architecture
+ configuration. This is normal.
config FLATMEM_MANUAL
bool "Flat Memory"
depends on !(ARCH_DISCONTIGMEM_ENABLE || ARCH_SPARSEMEM_ENABLE) || ARCH_FLATMEM_ENABLE
help
- This option allows you to change some of the ways that
- Linux manages its memory internally. Most users will
- only have one option here: FLATMEM. This is normal
- and a correct option.
+ This option is best suited for non-NUMA systems with
+ flat address space. FLATMEM is the most efficient
+ system in terms of performance and resource consumption
+ and it is the best option for smaller systems.
- Some users of more advanced features like NUMA and
- memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
- but is incompatible with memory hotplug and may suffer
- decreased performance over SPARSEMEM. If unsure between
- "Sparse Memory" and "Discontiguous Memory", choose
- "Discontiguous Memory".
+ For systems that have holes in their physical address
+ spaces and for features like NUMA and memory hotplug,
+ choose "Sparse Memory".
If unsure, choose this option (Flat Memory) over any other.
@@ -38,29 +40,26 @@
This option provides enhanced support for discontiguous
memory systems, over FLATMEM. These systems have holes
in their physical address spaces, and this option provides
- more efficient handling of these holes. However, the vast
- majority of hardware has quite flat address spaces, and
- can have degraded performance from the extra overhead that
- this option imposes.
+ more efficient handling of these holes.
- Many NUMA configurations will have this as the only option.
+ Although "Discontiguous Memory" is still used by several
+ architectures, it is considered deprecated in favor of
+ "Sparse Memory".
- If unsure, choose "Flat Memory" over this option.
+ If unsure, choose "Sparse Memory" over this option.
config SPARSEMEM_MANUAL
bool "Sparse Memory"
depends on ARCH_SPARSEMEM_ENABLE
help
This will be the only option for some systems, including
- memory hotplug systems. This is normal.
+ memory hot-plug systems. This is normal.
- For many other systems, this will be an alternative to
- "Discontiguous Memory". This option provides some potential
- performance benefits, along with decreased code complexity,
- but it is newer, and more experimental.
+ This option provides efficient support for systems with
+ holes in their physical address space and allows memory
+ hot-plug and hot-remove.
- If unsure, choose "Discontiguous Memory" or "Flat Memory"
- over this option.
+ If unsure, choose "Flat Memory" over this option.
endchoice
@@ -127,22 +126,17 @@
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
-config HAVE_MEMBLOCK
- bool
-
config HAVE_MEMBLOCK_NODE_MAP
bool
config HAVE_MEMBLOCK_PHYS_MAP
bool
-config HAVE_GENERIC_GUP
+config HAVE_FAST_GUP
+ depends on MMU
bool
-config ARCH_DISCARD_MEMBLOCK
- bool
-
-config NO_BOOTMEM
+config ARCH_KEEP_MEMBLOCK
bool
config MEMORY_ISOLATION
@@ -167,14 +161,13 @@
config MEMORY_HOTPLUG_DEFAULT_ONLINE
bool "Online the newly added memory blocks by default"
- default n
depends on MEMORY_HOTPLUG
help
This option sets the default policy setting for memory hotplug
onlining policy (/sys/devices/system/memory/auto_online_blocks) which
determines what happens to newly added memory regions. Policy setting
can always be changed at runtime.
- See Documentation/memory-hotplug.txt for more information.
+ See Documentation/admin-guide/mm/memory-hotplug.rst for more information.
Say Y here if you want all hot-plugged memory blocks to appear in
'online' state by default.
@@ -264,6 +257,9 @@
config ARCH_ENABLE_THP_MIGRATION
bool
+config CONTIG_ALLOC
+ def_bool (MEMORY_ISOLATION && COMPACTION) || CMA
+
config PHYS_ADDR_T_64BIT
def_bool 64BIT
@@ -277,11 +273,6 @@
by default when ZONE_DMA or HIGHMEM is selected, but you
may say n to override this.
-config NR_QUICK
- int
- depends on QUICKLIST
- default "1"
-
config VIRT_TO_BUS
bool
help
@@ -297,6 +288,7 @@
config KSM
bool "Enable KSM for page merging"
depends on MMU
+ select XXHASH
help
Enable Kernel Samepage Merging: KSM periodically scans those areas
of an application's address space that an app has advised may be
@@ -379,7 +371,7 @@
bool "Transparent Hugepage Support"
depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE
select COMPACTION
- select RADIX_TREE_MULTIORDER
+ select XARRAY_MULTI
help
Transparent Hugepages allows the kernel to use huge pages and
huge tlb transparently to the applications whenever possible.
@@ -441,7 +433,6 @@
config CLEANCACHE
bool "Enable cleancache driver to cache clean pages if tmem is present"
- default n
help
Cleancache can be thought of as a page-granularity victim cache
for clean pages that the kernel's pageframe replacement algorithm
@@ -465,7 +456,6 @@
config FRONTSWAP
bool "Enable frontswap to cache swap pages if tmem is present"
depends on SWAP
- default n
help
Frontswap is so named because it can be thought of as the opposite
of a "backing" store for a swap device. The data is stored into
@@ -481,7 +471,7 @@
config CMA
bool "Contiguous Memory Allocator"
- depends on HAVE_MEMBLOCK && MMU
+ depends on MMU
select MIGRATION
select MEMORY_ISOLATION
help
@@ -537,7 +527,6 @@
depends on FRONTSWAP && CRYPTO=y
select CRYPTO_LZO
select ZPOOL
- default n
help
A lightweight compressed cache for swap pages. It takes
pages that are in the process of being swapped out and attempts to
@@ -554,14 +543,12 @@
config ZPOOL
tristate "Common API for compressed memory storage"
- default n
help
Compressed memory storage API. This allows using either zbud or
zsmalloc.
config ZBUD
tristate "Low (Up to 2x) density storage for compressed pages"
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to two compressed pages per physical
@@ -572,7 +559,6 @@
config Z3FOLD
tristate "Up to 3x density storage for compressed pages"
depends on ZPOOL
- default n
help
A special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical
@@ -582,7 +568,6 @@
config ZSMALLOC
tristate "Memory allocator for compressed pages"
depends on MMU
- default n
help
zsmalloc is a slab-based memory allocator designed to store
compressed RAM pages. zsmalloc uses virtual memory mapping
@@ -633,8 +618,6 @@
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
- default n
- depends on NO_BOOTMEM
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
@@ -661,8 +644,7 @@
See Documentation/admin-guide/mm/idle_page_tracking.rst for
more details.
-# arch_add_memory() comprehends device memory
-config ARCH_HAS_ZONE_DEVICE
+config ARCH_HAS_PTE_DEVMAP
bool
config ZONE_DEVICE
@@ -670,8 +652,8 @@
depends on MEMORY_HOTPLUG
depends on MEMORY_HOTREMOVE
depends on SPARSEMEM_VMEMMAP
- depends on ARCH_HAS_ZONE_DEVICE
- select RADIX_TREE_MULTIORDER
+ depends on ARCH_HAS_PTE_DEVMAP
+ select XARRAY_MULTI
help
Device memory hotplug support allows for establishing pmem,
@@ -682,42 +664,21 @@
If FS_DAX is enabled, then say Y.
-config ARCH_HAS_HMM
- bool
- default y
- depends on (X86_64 || PPC64)
- depends on ZONE_DEVICE
- depends on MMU && 64BIT
- depends on MEMORY_HOTPLUG
- depends on MEMORY_HOTREMOVE
- depends on SPARSEMEM_VMEMMAP
-
-config MIGRATE_VMA_HELPER
- bool
-
config DEV_PAGEMAP_OPS
bool
-config HMM
- bool
- select MIGRATE_VMA_HELPER
-
+#
+# Helpers to mirror a range of the CPU page tables of a process into device
+# page tables.
+#
config HMM_MIRROR
- bool "HMM mirror CPU page table into a device page table"
- depends on ARCH_HAS_HMM
- select MMU_NOTIFIER
- select HMM
- help
- Select HMM_MIRROR if you want to mirror range of the CPU page table of a
- process into a device page table. Here, mirror means "keep synchronized".
- Prerequisites: the device must provide the ability to write-protect its
- page tables (at PAGE_SIZE granularity), and must be able to recover from
- the resulting potential page faults.
+ bool
+ depends on MMU
+ depends on MMU_NOTIFIER
config DEVICE_PRIVATE
bool "Unaddressable device memory (GPU memory, ...)"
- depends on ARCH_HAS_HMM
- select HMM
+ depends on ZONE_DEVICE
select DEV_PAGEMAP_OPS
help
@@ -725,17 +686,6 @@
memory; i.e., memory that is only accessible from the device (or
group of devices). You likely also want to select HMM_MIRROR.
-config DEVICE_PUBLIC
- bool "Addressable device memory (like GPU memory)"
- depends on ARCH_HAS_HMM
- select HMM
- select DEV_PAGEMAP_OPS
-
- help
- Allows creation of struct pages to represent addressable device
- memory; i.e., memory that is accessible from both the device and
- the CPU
-
config FRAME_VECTOR
bool
@@ -746,7 +696,6 @@
config PERCPU_STATS
bool "Collect percpu memory statistics"
- default n
help
This feature collects and exposes statistics via debugfs. The
information includes global and per chunk statistics, which can
@@ -754,14 +703,37 @@
config GUP_BENCHMARK
bool "Enable infrastructure for get_user_pages_fast() benchmarking"
- default n
help
Provides /sys/kernel/debug/gup_benchmark that helps with testing
performance of get_user_pages_fast().
See tools/testing/selftests/vm/gup_benchmark.c
+config GUP_GET_PTE_LOW_HIGH
+ bool
+
+config READ_ONLY_THP_FOR_FS
+ bool "Read-only THP for filesystems (EXPERIMENTAL)"
+ depends on TRANSPARENT_HUGE_PAGECACHE && SHMEM
+
+ help
+ Allow khugepaged to put read-only file-backed pages in THP.
+
+ This is marked experimental because it is a new feature. Write
+ support of file THPs will be developed in the next few release
+ cycles.
+
config ARCH_HAS_PTE_SPECIAL
bool
+#
+# Some architectures require a special hugepage directory format that is
+# required to support multiple hugepage sizes. For example a4fe3ce76
+# "powerpc/mm: Allow more flexible layouts for hugepage pagetables"
+# introduced it on powerpc. This allows for more flexible hugepage
+# pagetable layouts.
+#
+config ARCH_HAS_HUGEPD
+ bool
+
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 9a7b8b0..327b3eb 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,3 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
config PAGE_EXTENSION
bool "Extend memmap on extra space for more information on page"
---help---
@@ -11,19 +12,25 @@
bool "Debug page memory allocations"
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
- select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
Depending on runtime enablement, this results in a small or large
slowdown, but helps to find certain types of memory corruption.
+ Also, the state of page tracking structures is checked more often as
+ pages are being allocated and freed, as unexpected state changes
+ often happen for the same reasons as memory corruption (e.g. double free,
+ use-after-free). The error reports for these checks can be augmented
+ with stack traces of the last allocation and freeing of the page, when
+ PAGE_OWNER is also selected and enabled on boot.
+
For architectures which don't enable ARCH_SUPPORTS_DEBUG_PAGEALLOC,
fill the pages with poison patterns after free_pages() and verify
- the patterns before alloc_pages(). Additionally,
- this option cannot be enabled in combination with hibernation as
- that would result in incorrect warnings of memory corruption after
- a resume because free pages are not saved to the suspend image.
+ the patterns before alloc_pages(). Additionally, this option cannot
+ be enabled in combination with hibernation as that would result in
+ incorrect warnings of memory corruption after a resume because free
+ pages are not saved to the suspend image.
By default this option will have a small overhead, e.g. by not
allowing the kernel mapping to be backed by large pages on some
@@ -33,12 +40,28 @@
config DEBUG_PAGEALLOC_ENABLE_DEFAULT
bool "Enable debug page memory allocations by default?"
- default n
depends on DEBUG_PAGEALLOC
---help---
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
+config PAGE_OWNER
+ bool "Track page owner"
+ depends on DEBUG_KERNEL && STACKTRACE_SUPPORT
+ select DEBUG_FS
+ select STACKTRACE
+ select STACKDEPOT
+ select PAGE_EXTENSION
+ help
+ This keeps track of which call chain is the owner of a page, and may
+ help to find bare alloc_page(s) leaks. Even if you include this
+ feature in your build, it is disabled by default. You should pass
+ "page_owner=on" as a boot parameter in order to enable it. Eats
+ a fair amount of memory if enabled. See tools/vm/page_owner_sort.c
+ for user-space helper.
+
+ If unsure, say N.
+
config PAGE_POISONING
bool "Poison pages after freeing"
select PAGE_POISONING_NO_SANITY if HIBERNATION
diff --git a/mm/Makefile b/mm/Makefile
index 26ef77a..d996846 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,11 +21,14 @@
KCOV_INSTRUMENT_mmzone.o := n
KCOV_INSTRUMENT_vmstat.o := n
+CFLAGS_init-mm.o += $(call cc-disable-warning, override-init)
+CFLAGS_init-mm.o += $(call cc-disable-warning, initializer-overrides)
+
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
- mlock.o mmap.o mprotect.o mremap.o msync.o \
- page_vma_mapped.o pagewalk.o pgtable-generic.o \
- rmap.o vmalloc.o
+mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
+ mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
+ msync.o page_vma_mapped.o pagewalk.o \
+ pgtable-generic.o rmap.o vmalloc.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -33,26 +36,25 @@
endif
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
- maccess.o page_alloc.o page-writeback.o \
+ maccess.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o mmu_context.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o $(mmu-y)
+ debug.o gup.o $(mmu-y)
+# Give 'page_alloc' its own module-parameter namespace
+page-alloc-y := page_alloc.o
+page-alloc-$(CONFIG_SHUFFLE_PAGE_ALLOCATOR) += shuffle.o
+
+obj-y += page-alloc.o
obj-y += init-mm.o
-
-ifdef CONFIG_NO_BOOTMEM
- obj-y += nobootmem.o
-else
- obj-y += bootmem.o
-endif
+obj-y += memblock.o
ifdef CONFIG_MMU
obj-$(CONFIG_ADVISE_SYSCALLS) += madvise.o
endif
-obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o swap_slots.o
obj-$(CONFIG_FRONTSWAP) += frontswap.o
@@ -73,7 +75,6 @@
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_MEMTEST) += memtest.o
obj-$(CONFIG_MIGRATION) += migrate.o
-obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
@@ -103,5 +104,6 @@
obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
-obj-$(CONFIG_HMM) += hmm.o
+obj-$(CONFIG_ZONE_DEVICE) += memremap.o
+obj-$(CONFIG_HMM_MIRROR) += hmm.o
obj-$(CONFIG_MEMFD_CREATE) += memfd.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 8a8bb87..c360f6a 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,5 +1,7 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/wait.h>
+#include <linux/rbtree.h>
#include <linux/backing-dev.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
@@ -21,10 +23,12 @@
static struct class *bdi_class;
/*
- * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side
- * locking.
+ * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
+ * reader side locking.
*/
DEFINE_SPINLOCK(bdi_lock);
+static u64 bdi_id_cursor;
+static struct rb_root bdi_tree = RB_ROOT;
LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
@@ -102,39 +106,25 @@
}
DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats);
-static int bdi_debug_register(struct backing_dev_info *bdi, const char *name)
+static void bdi_debug_register(struct backing_dev_info *bdi, const char *name)
{
- if (!bdi_debug_root)
- return -ENOMEM;
-
bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root);
- if (!bdi->debug_dir)
- return -ENOMEM;
- bdi->debug_stats = debugfs_create_file("stats", 0444, bdi->debug_dir,
- bdi, &bdi_debug_stats_fops);
- if (!bdi->debug_stats) {
- debugfs_remove(bdi->debug_dir);
- bdi->debug_dir = NULL;
- return -ENOMEM;
- }
-
- return 0;
+ debugfs_create_file("stats", 0444, bdi->debug_dir, bdi,
+ &bdi_debug_stats_fops);
}
static void bdi_debug_unregister(struct backing_dev_info *bdi)
{
- debugfs_remove(bdi->debug_stats);
- debugfs_remove(bdi->debug_dir);
+ debugfs_remove_recursive(bdi->debug_dir);
}
#else
static inline void bdi_debug_init(void)
{
}
-static inline int bdi_debug_register(struct backing_dev_info *bdi,
+static inline void bdi_debug_register(struct backing_dev_info *bdi,
const char *name)
{
- return 0;
}
static inline void bdi_debug_unregister(struct backing_dev_info *bdi)
{
@@ -249,8 +239,8 @@
{
int err;
- bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
+ bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND |
+ WQ_SYSFS, 0);
if (!bdi_wq)
return -ENOMEM;
@@ -628,13 +618,12 @@
}
/**
- * wb_get_create - get wb for a given memcg, create if necessary
+ * wb_get_lookup - get wb for a given memcg
* @bdi: target bdi
* @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
- * @gfp: allocation mask to use
*
- * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
- * create one. The returned wb has its refcount incremented.
+ * Try to get the wb for @memcg_css on @bdi. The returned wb has its
+ * refcount incremented.
*
* This function uses css_get() on @memcg_css and thus expects its refcnt
* to be positive on invocation. IOW, rcu_read_lock() protection on
@@ -651,6 +640,39 @@
* each lookup. On mismatch, the existing wb is discarded and a new one is
* created.
*/
+struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi,
+ struct cgroup_subsys_state *memcg_css)
+{
+ struct bdi_writeback *wb;
+
+ if (!memcg_css->parent)
+ return &bdi->wb;
+
+ rcu_read_lock();
+ wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
+ if (wb) {
+ struct cgroup_subsys_state *blkcg_css;
+
+ /* see whether the blkcg association has changed */
+ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
+ if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb)))
+ wb = NULL;
+ css_put(blkcg_css);
+ }
+ rcu_read_unlock();
+
+ return wb;
+}
+
+/**
+ * wb_get_create - get wb for a given memcg, create if necessary
+ * @bdi: target bdi
+ * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref)
+ * @gfp: allocation mask to use
+ *
+ * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to
+ * create one. See wb_get_lookup() for more details.
+ */
struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi,
struct cgroup_subsys_state *memcg_css,
gfp_t gfp)
@@ -663,20 +685,7 @@
return &bdi->wb;
do {
- rcu_read_lock();
- wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id);
- if (wb) {
- struct cgroup_subsys_state *blkcg_css;
-
- /* see whether the blkcg association has changed */
- blkcg_css = cgroup_get_e_css(memcg_css->cgroup,
- &io_cgrp_subsys);
- if (unlikely(wb->blkcg_css != blkcg_css ||
- !wb_tryget(wb)))
- wb = NULL;
- css_put(blkcg_css);
- }
- rcu_read_unlock();
+ wb = wb_get_lookup(bdi, memcg_css);
} while (!wb && !cgwb_create(bdi, memcg_css, gfp));
return wb;
@@ -689,6 +698,7 @@
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
+ init_rwsem(&bdi->wb_switch_rwsem);
ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
if (!ret) {
@@ -871,9 +881,58 @@
}
EXPORT_SYMBOL(bdi_alloc_node);
+static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
+{
+ struct rb_node **p = &bdi_tree.rb_node;
+ struct rb_node *parent = NULL;
+ struct backing_dev_info *bdi;
+
+ lockdep_assert_held(&bdi_lock);
+
+ while (*p) {
+ parent = *p;
+ bdi = rb_entry(parent, struct backing_dev_info, rb_node);
+
+ if (bdi->id > id)
+ p = &(*p)->rb_left;
+ else if (bdi->id < id)
+ p = &(*p)->rb_right;
+ else
+ break;
+ }
+
+ if (parentp)
+ *parentp = parent;
+ return p;
+}
+
+/**
+ * bdi_get_by_id - lookup and get bdi from its id
+ * @id: bdi id to lookup
+ *
+ * Find bdi matching @id and get it. Returns NULL if the matching bdi
+ * doesn't exist or is already unregistered.
+ */
+struct backing_dev_info *bdi_get_by_id(u64 id)
+{
+ struct backing_dev_info *bdi = NULL;
+ struct rb_node **p;
+
+ spin_lock_bh(&bdi_lock);
+ p = bdi_lookup_rb_node(id, NULL);
+ if (*p) {
+ bdi = rb_entry(*p, struct backing_dev_info, rb_node);
+ bdi_get(bdi);
+ }
+ spin_unlock_bh(&bdi_lock);
+
+ return bdi;
+}
+
int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
{
struct device *dev;
+ struct rb_node *parent, **p;
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
@@ -889,7 +948,15 @@
set_bit(WB_registered, &bdi->wb.state);
spin_lock_bh(&bdi_lock);
+
+ bdi->id = ++bdi_id_cursor;
+
+ p = bdi_lookup_rb_node(bdi->id, &parent);
+ rb_link_node(&bdi->rb_node, parent, p);
+ rb_insert_color(&bdi->rb_node, &bdi_tree);
+
list_add_tail_rcu(&bdi->bdi_list, &bdi_list);
+
spin_unlock_bh(&bdi_lock);
trace_writeback_bdi_register(bdi);
@@ -930,6 +997,7 @@
static void bdi_remove_from_list(struct backing_dev_info *bdi)
{
spin_lock_bh(&bdi_lock);
+ rb_erase(&bdi->rb_node, &bdi_tree);
list_del_rcu(&bdi->bdi_list);
spin_unlock_bh(&bdi_lock);
diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c
index ef858d5..26de020 100644
--- a/mm/balloon_compaction.c
+++ b/mm/balloon_compaction.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/balloon_compaction.c
*
@@ -10,101 +11,183 @@
#include <linux/export.h>
#include <linux/balloon_compaction.h>
+static void balloon_page_enqueue_one(struct balloon_dev_info *b_dev_info,
+ struct page *page)
+{
+ /*
+ * Block others from accessing the 'page' when we get around to
+ * establishing additional references. We should be the only one
+ * holding a reference to the 'page' at this point. If we are not, then
+ * memory corruption is possible and we should stop execution.
+ */
+ BUG_ON(!trylock_page(page));
+ balloon_page_insert(b_dev_info, page);
+ unlock_page(page);
+ __count_vm_event(BALLOON_INFLATE);
+}
+
+/**
+ * balloon_page_list_enqueue() - inserts a list of pages into the balloon page
+ * list.
+ * @b_dev_info: balloon device descriptor where we will insert a new page to
+ * @pages: pages to enqueue - allocated using balloon_page_alloc.
+ *
+ * Driver must call this function to properly enqueue balloon pages before
+ * definitively removing them from the guest system.
+ *
+ * Return: number of pages that were enqueued.
+ */
+size_t balloon_page_list_enqueue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ size_t n_pages = 0;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_for_each_entry_safe(page, tmp, pages, lru) {
+ list_del(&page->lru);
+ balloon_page_enqueue_one(b_dev_info, page);
+ n_pages++;
+ }
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_enqueue);
+
+/**
+ * balloon_page_list_dequeue() - removes pages from balloon's page list and
+ * returns a list of the pages.
+ * @b_dev_info: balloon device descriptor where we will grab a page from.
+ * @pages: pointer to the list of pages that would be returned to the caller.
+ * @n_req_pages: number of requested pages.
+ *
+ * Driver must call this function to properly de-allocate previously enlisted
+ * balloon pages before definitively releasing them back to the guest system.
+ * This function tries to remove @n_req_pages from the ballooned pages and
+ * return them to the caller in the @pages list.
+ *
+ * Note that this function may fail to dequeue some pages even if the balloon
+ * isn't empty - since the page list can be temporarily empty due to compaction
+ * of isolated pages.
+ *
+ * Return: number of pages that were added to the @pages list.
+ */
+size_t balloon_page_list_dequeue(struct balloon_dev_info *b_dev_info,
+ struct list_head *pages, size_t n_req_pages)
+{
+ struct page *page, *tmp;
+ unsigned long flags;
+ size_t n_pages = 0;
+
+ spin_lock_irqsave(&b_dev_info->pages_lock, flags);
+ list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
+ if (n_pages == n_req_pages)
+ break;
+
+ /*
+ * Block others from accessing the 'page' while we get around to
+ * establishing additional references and preparing the 'page'
+ * to be released by the balloon driver.
+ */
+ if (!trylock_page(page))
+ continue;
+
+ if (IS_ENABLED(CONFIG_BALLOON_COMPACTION) &&
+ PageIsolated(page)) {
+ /* raced with isolation */
+ unlock_page(page);
+ continue;
+ }
+ balloon_page_delete(page);
+ __count_vm_event(BALLOON_DEFLATE);
+ list_add(&page->lru, pages);
+ unlock_page(page);
+ n_pages++;
+ }
+ spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+
+ return n_pages;
+}
+EXPORT_SYMBOL_GPL(balloon_page_list_dequeue);
+
/*
* balloon_page_alloc - allocates a new page for insertion into the balloon
- * page list.
+ * page list.
*
- * Driver must call it to properly allocate a new enlisted balloon page.
- * Driver must call balloon_page_enqueue before definitively removing it from
- * the guest system. This function returns the page address for the recently
- * allocated page or NULL in the case we fail to allocate a new page this turn.
+ * Driver must call this function to properly allocate a new balloon page.
+ * Driver must call balloon_page_enqueue before definitively removing the page
+ * from the guest system.
+ *
+ * Return: struct page for the allocated page or NULL on allocation failure.
*/
struct page *balloon_page_alloc(void)
{
struct page *page = alloc_page(balloon_mapping_gfp_mask() |
- __GFP_NOMEMALLOC | __GFP_NORETRY);
+ __GFP_NOMEMALLOC | __GFP_NORETRY |
+ __GFP_NOWARN);
return page;
}
EXPORT_SYMBOL_GPL(balloon_page_alloc);
/*
- * balloon_page_enqueue - allocates a new page and inserts it into the balloon
- * page list.
- * @b_dev_info: balloon device descriptor where we will insert a new page to
+ * balloon_page_enqueue - inserts a new page into the balloon page list.
+ *
+ * @b_dev_info: balloon device descriptor where we will insert a new page
* @page: new page to enqueue - allocated using balloon_page_alloc.
*
- * Driver must call it to properly enqueue a new allocated balloon page
- * before definitively removing it from the guest system.
- * This function returns the page address for the recently enqueued page or
- * NULL in the case we fail to allocate a new page this turn.
+ * Drivers must call this function to properly enqueue a newly allocated
+ * balloon page before definitively removing the page from the guest system.
+ *
+ * Drivers must not call balloon_page_enqueue on pages that have been pushed to
+ * a list with balloon_page_push before removing them with balloon_page_pop. To
+ * enqueue a list of pages, use balloon_page_list_enqueue instead.
*/
void balloon_page_enqueue(struct balloon_dev_info *b_dev_info,
struct page *page)
{
unsigned long flags;
- /*
- * Block others from accessing the 'page' when we get around to
- * establishing additional references. We should be the only one
- * holding a reference to the 'page' at this point.
- */
- BUG_ON(!trylock_page(page));
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- balloon_page_insert(b_dev_info, page);
- __count_vm_event(BALLOON_INFLATE);
+ balloon_page_enqueue_one(b_dev_info, page);
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
- unlock_page(page);
}
EXPORT_SYMBOL_GPL(balloon_page_enqueue);
/*
* balloon_page_dequeue - removes a page from balloon's page list and returns
- * the its address to allow the driver release the page.
+ * its address to allow the driver to release the page.
* @b_dev_info: balloon device decriptor where we will grab a page from.
*
- * Driver must call it to properly de-allocate a previous enlisted balloon page
- * before definetively releasing it back to the guest system.
- * This function returns the page address for the recently dequeued page or
- * NULL in the case we find balloon's page list temporarily empty due to
- * compaction isolated pages.
+ * Driver must call this function to properly dequeue a previously enqueued page
+ * before definitively releasing it back to the guest system.
+ *
+ * Caller must perform its own accounting to ensure that this
+ * function is called only if some pages are actually enqueued.
+ *
+ * Note that this function may fail to dequeue some pages even if there are
+ * some enqueued pages - since the page list can be temporarily empty due to
+ * the compaction of isolated pages.
+ *
+ * TODO: remove the caller accounting requirements, and allow caller to wait
+ * until all pages can be dequeued.
+ *
+ * Return: struct page for the dequeued page, or NULL if no page was dequeued.
*/
struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info)
{
- struct page *page, *tmp;
unsigned long flags;
- bool dequeued_page;
+ LIST_HEAD(pages);
+ int n_pages;
- dequeued_page = false;
- spin_lock_irqsave(&b_dev_info->pages_lock, flags);
- list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) {
- /*
- * Block others from accessing the 'page' while we get around
- * establishing additional references and preparing the 'page'
- * to be released by the balloon driver.
- */
- if (trylock_page(page)) {
-#ifdef CONFIG_BALLOON_COMPACTION
- if (PageIsolated(page)) {
- /* raced with isolation */
- unlock_page(page);
- continue;
- }
-#endif
- balloon_page_delete(page);
- __count_vm_event(BALLOON_DEFLATE);
- unlock_page(page);
- dequeued_page = true;
- break;
- }
- }
- spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
+ n_pages = balloon_page_list_dequeue(b_dev_info, &pages, 1);
- if (!dequeued_page) {
+ if (n_pages != 1) {
/*
* If we are unable to dequeue a balloon page because the page
- * list is empty and there is no isolated pages, then something
+ * list is empty and there are no isolated pages, then something
* went out of track and some balloon pages are lost.
- * BUG() here, otherwise the balloon driver may get stuck into
+ * BUG() here, otherwise the balloon driver may get stuck in
* an infinite loop while attempting to release all its pages.
*/
spin_lock_irqsave(&b_dev_info->pages_lock, flags);
@@ -112,9 +195,9 @@
!b_dev_info->isolated_pages))
BUG();
spin_unlock_irqrestore(&b_dev_info->pages_lock, flags);
- page = NULL;
+ return NULL;
}
- return page;
+ return list_first_entry(&pages, struct page, lru);
}
EXPORT_SYMBOL_GPL(balloon_page_dequeue);
@@ -155,8 +238,8 @@
/*
* We can not easily support the no copy case here so ignore it as it
- * is unlikely to be use with ballon pages. See include/linux/hmm.h for
- * user of the MIGRATE_SYNC_NO_COPY mode.
+ * is unlikely to be used with balloon pages. See include/linux/hmm.h
+ * for a user of the MIGRATE_SYNC_NO_COPY mode.
*/
if (mode == MIGRATE_SYNC_NO_COPY)
return -EINVAL;
diff --git a/mm/bootmem.c b/mm/bootmem.c
deleted file mode 100644
index 97db0e8..0000000
--- a/mm/bootmem.c
+++ /dev/null
@@ -1,811 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bootmem - A boot-time physical memory allocator and configurator
- *
- * Copyright (C) 1999 Ingo Molnar
- * 1999 Kanoj Sarcar, SGI
- * 2008 Johannes Weiner
- *
- * Access to this subsystem has to be serialized externally (which is true
- * for the boot process anyway).
- */
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/kmemleak.h>
-#include <linux/range.h>
-#include <linux/bug.h>
-#include <linux/io.h>
-#include <linux/bootmem.h>
-
-#include "internal.h"
-
-/**
- * DOC: bootmem overview
- *
- * Bootmem is a boot-time physical memory allocator and configurator.
- *
- * It is used early in the boot process before the page allocator is
- * set up.
- *
- * Bootmem is based on the most basic of allocators, a First Fit
- * allocator which uses a bitmap to represent memory. If a bit is 1,
- * the page is allocated and 0 if unallocated. To satisfy allocations
- * of sizes smaller than a page, the allocator records the Page Frame
- * Number (PFN) of the last allocation and the offset the allocation
- * ended at. Subsequent small allocations are merged together and
- * stored on the same page.
- *
- * The information used by the bootmem allocator is represented by
- * :c:type:`struct bootmem_data`. An array to hold up to %MAX_NUMNODES
- * such structures is statically allocated and then it is discarded
- * when the system initialization completes. Each entry in this array
- * corresponds to a node with memory. For UMA systems only entry 0 is
- * used.
- *
- * The bootmem allocator is initialized during early architecture
- * specific setup. Each architecture is required to supply a
- * :c:func:`setup_arch` function which, among other tasks, is
- * responsible for acquiring the necessary parameters to initialise
- * the boot memory allocator. These parameters define limits of usable
- * physical memory:
- *
- * * @min_low_pfn - the lowest PFN that is available in the system
- * * @max_low_pfn - the highest PFN that may be addressed by low
- * memory (%ZONE_NORMAL)
- * * @max_pfn - the last PFN available to the system.
- *
- * After those limits are determined, the :c:func:`init_bootmem` or
- * :c:func:`init_bootmem_node` function should be called to initialize
- * the bootmem allocator. The UMA case should use the `init_bootmem`
- * function. It will initialize ``contig_page_data`` structure that
- * represents the only memory node in the system. In the NUMA case the
- * `init_bootmem_node` function should be called to initialize the
- * bootmem allocator for each node.
- *
- * Once the allocator is set up, it is possible to use either single
- * node or NUMA variant of the allocation APIs.
- */
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data = {
- .bdata = &bootmem_node_data[0]
-};
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
-unsigned long max_low_pfn;
-unsigned long min_low_pfn;
-unsigned long max_pfn;
-unsigned long long max_possible_pfn;
-
-bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata;
-
-static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list);
-
-static int bootmem_debug;
-
-static int __init bootmem_debug_setup(char *buf)
-{
- bootmem_debug = 1;
- return 0;
-}
-early_param("bootmem_debug", bootmem_debug_setup);
-
-#define bdebug(fmt, args...) ({ \
- if (unlikely(bootmem_debug)) \
- pr_info("bootmem::%s " fmt, \
- __func__, ## args); \
-})
-
-static unsigned long __init bootmap_bytes(unsigned long pages)
-{
- unsigned long bytes = DIV_ROUND_UP(pages, BITS_PER_BYTE);
-
- return ALIGN(bytes, sizeof(long));
-}
-
-/**
- * bootmem_bootmap_pages - calculate bitmap size in pages
- * @pages: number of pages the bitmap has to represent
- *
- * Return: the number of pages needed to hold the bitmap.
- */
-unsigned long __init bootmem_bootmap_pages(unsigned long pages)
-{
- unsigned long bytes = bootmap_bytes(pages);
-
- return PAGE_ALIGN(bytes) >> PAGE_SHIFT;
-}
-
-/*
- * link bdata in order
- */
-static void __init link_bootmem(bootmem_data_t *bdata)
-{
- bootmem_data_t *ent;
-
- list_for_each_entry(ent, &bdata_list, list) {
- if (bdata->node_min_pfn < ent->node_min_pfn) {
- list_add_tail(&bdata->list, &ent->list);
- return;
- }
- }
-
- list_add_tail(&bdata->list, &bdata_list);
-}
-
-/*
- * Called once to set up the allocator itself.
- */
-static unsigned long __init init_bootmem_core(bootmem_data_t *bdata,
- unsigned long mapstart, unsigned long start, unsigned long end)
-{
- unsigned long mapsize;
-
- mminit_validate_memmodel_limits(&start, &end);
- bdata->node_bootmem_map = phys_to_virt(PFN_PHYS(mapstart));
- bdata->node_min_pfn = start;
- bdata->node_low_pfn = end;
- link_bootmem(bdata);
-
- /*
- * Initially all pages are reserved - setup_arch() has to
- * register free RAM areas explicitly.
- */
- mapsize = bootmap_bytes(end - start);
- memset(bdata->node_bootmem_map, 0xff, mapsize);
-
- bdebug("nid=%td start=%lx map=%lx end=%lx mapsize=%lx\n",
- bdata - bootmem_node_data, start, mapstart, end, mapsize);
-
- return mapsize;
-}
-
-/**
- * init_bootmem_node - register a node as boot memory
- * @pgdat: node to register
- * @freepfn: pfn where the bitmap for this node is to be placed
- * @startpfn: first pfn on the node
- * @endpfn: first pfn after the node
- *
- * Return: the number of bytes needed to hold the bitmap for this node.
- */
-unsigned long __init init_bootmem_node(pg_data_t *pgdat, unsigned long freepfn,
- unsigned long startpfn, unsigned long endpfn)
-{
- return init_bootmem_core(pgdat->bdata, freepfn, startpfn, endpfn);
-}
-
-/**
- * init_bootmem - register boot memory
- * @start: pfn where the bitmap is to be placed
- * @pages: number of available physical pages
- *
- * Return: the number of bytes needed to hold the bitmap.
- */
-unsigned long __init init_bootmem(unsigned long start, unsigned long pages)
-{
- max_low_pfn = pages;
- min_low_pfn = start;
- return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages);
-}
-
-void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
-{
- unsigned long cursor, end;
-
- kmemleak_free_part_phys(physaddr, size);
-
- cursor = PFN_UP(physaddr);
- end = PFN_DOWN(physaddr + size);
-
- for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
- totalram_pages++;
- }
-}
-
-static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
-{
- struct page *page;
- unsigned long *map, start, end, pages, cur, count = 0;
-
- if (!bdata->node_bootmem_map)
- return 0;
-
- map = bdata->node_bootmem_map;
- start = bdata->node_min_pfn;
- end = bdata->node_low_pfn;
-
- bdebug("nid=%td start=%lx end=%lx\n",
- bdata - bootmem_node_data, start, end);
-
- while (start < end) {
- unsigned long idx, vec;
- unsigned shift;
-
- idx = start - bdata->node_min_pfn;
- shift = idx & (BITS_PER_LONG - 1);
- /*
- * vec holds at most BITS_PER_LONG map bits,
- * bit 0 corresponds to start.
- */
- vec = ~map[idx / BITS_PER_LONG];
-
- if (shift) {
- vec >>= shift;
- if (end - start >= BITS_PER_LONG)
- vec |= ~map[idx / BITS_PER_LONG + 1] <<
- (BITS_PER_LONG - shift);
- }
- /*
- * If we have a properly aligned and fully unreserved
- * BITS_PER_LONG block of pages in front of us, free
- * it in one go.
- */
- if (IS_ALIGNED(start, BITS_PER_LONG) && vec == ~0UL) {
- int order = ilog2(BITS_PER_LONG);
-
- __free_pages_bootmem(pfn_to_page(start), start, order);
- count += BITS_PER_LONG;
- start += BITS_PER_LONG;
- } else {
- cur = start;
-
- start = ALIGN(start + 1, BITS_PER_LONG);
- while (vec && cur != start) {
- if (vec & 1) {
- page = pfn_to_page(cur);
- __free_pages_bootmem(page, cur, 0);
- count++;
- }
- vec >>= 1;
- ++cur;
- }
- }
- }
-
- cur = bdata->node_min_pfn;
- page = virt_to_page(bdata->node_bootmem_map);
- pages = bdata->node_low_pfn - bdata->node_min_pfn;
- pages = bootmem_bootmap_pages(pages);
- count += pages;
- while (pages--)
- __free_pages_bootmem(page++, cur++, 0);
- bdata->node_bootmem_map = NULL;
-
- bdebug("nid=%td released=%lx\n", bdata - bootmem_node_data, count);
-
- return count;
-}
-
-static int reset_managed_pages_done __initdata;
-
-void reset_node_managed_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->managed_pages = 0;
-}
-
-void __init reset_all_zones_managed_pages(void)
-{
- struct pglist_data *pgdat;
-
- if (reset_managed_pages_done)
- return;
-
- for_each_online_pgdat(pgdat)
- reset_node_managed_pages(pgdat);
-
- reset_managed_pages_done = 1;
-}
-
-unsigned long __init free_all_bootmem(void)
-{
- unsigned long total_pages = 0;
- bootmem_data_t *bdata;
-
- reset_all_zones_managed_pages();
-
- list_for_each_entry(bdata, &bdata_list, list)
- total_pages += free_all_bootmem_core(bdata);
-
- totalram_pages += total_pages;
-
- return total_pages;
-}
-
-static void __init __free(bootmem_data_t *bdata,
- unsigned long sidx, unsigned long eidx)
-{
- unsigned long idx;
-
- bdebug("nid=%td start=%lx end=%lx\n", bdata - bootmem_node_data,
- sidx + bdata->node_min_pfn,
- eidx + bdata->node_min_pfn);
-
- if (WARN_ON(bdata->node_bootmem_map == NULL))
- return;
-
- if (bdata->hint_idx > sidx)
- bdata->hint_idx = sidx;
-
- for (idx = sidx; idx < eidx; idx++)
- if (!test_and_clear_bit(idx, bdata->node_bootmem_map))
- BUG();
-}
-
-static int __init __reserve(bootmem_data_t *bdata, unsigned long sidx,
- unsigned long eidx, int flags)
-{
- unsigned long idx;
- int exclusive = flags & BOOTMEM_EXCLUSIVE;
-
- bdebug("nid=%td start=%lx end=%lx flags=%x\n",
- bdata - bootmem_node_data,
- sidx + bdata->node_min_pfn,
- eidx + bdata->node_min_pfn,
- flags);
-
- if (WARN_ON(bdata->node_bootmem_map == NULL))
- return 0;
-
- for (idx = sidx; idx < eidx; idx++)
- if (test_and_set_bit(idx, bdata->node_bootmem_map)) {
- if (exclusive) {
- __free(bdata, sidx, idx);
- return -EBUSY;
- }
- bdebug("silent double reserve of PFN %lx\n",
- idx + bdata->node_min_pfn);
- }
- return 0;
-}
-
-static int __init mark_bootmem_node(bootmem_data_t *bdata,
- unsigned long start, unsigned long end,
- int reserve, int flags)
-{
- unsigned long sidx, eidx;
-
- bdebug("nid=%td start=%lx end=%lx reserve=%d flags=%x\n",
- bdata - bootmem_node_data, start, end, reserve, flags);
-
- BUG_ON(start < bdata->node_min_pfn);
- BUG_ON(end > bdata->node_low_pfn);
-
- sidx = start - bdata->node_min_pfn;
- eidx = end - bdata->node_min_pfn;
-
- if (reserve)
- return __reserve(bdata, sidx, eidx, flags);
- else
- __free(bdata, sidx, eidx);
- return 0;
-}
-
-static int __init mark_bootmem(unsigned long start, unsigned long end,
- int reserve, int flags)
-{
- unsigned long pos;
- bootmem_data_t *bdata;
-
- pos = start;
- list_for_each_entry(bdata, &bdata_list, list) {
- int err;
- unsigned long max;
-
- if (pos < bdata->node_min_pfn ||
- pos >= bdata->node_low_pfn) {
- BUG_ON(pos != start);
- continue;
- }
-
- max = min(bdata->node_low_pfn, end);
-
- err = mark_bootmem_node(bdata, pos, max, reserve, flags);
- if (reserve && err) {
- mark_bootmem(start, pos, 0, 0);
- return err;
- }
-
- if (max == end)
- return 0;
- pos = bdata->node_low_pfn;
- }
- BUG();
-}
-
-void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
-{
- unsigned long start, end;
-
- kmemleak_free_part_phys(physaddr, size);
-
- start = PFN_UP(physaddr);
- end = PFN_DOWN(physaddr + size);
-
- mark_bootmem_node(pgdat->bdata, start, end, 0, 0);
-}
-
-void __init free_bootmem(unsigned long physaddr, unsigned long size)
-{
- unsigned long start, end;
-
- kmemleak_free_part_phys(physaddr, size);
-
- start = PFN_UP(physaddr);
- end = PFN_DOWN(physaddr + size);
-
- mark_bootmem(start, end, 0, 0);
-}
-
-/**
- * reserve_bootmem_node - mark a page range as reserved
- * @pgdat: node the range resides on
- * @physaddr: starting address of the range
- * @size: size of the range in bytes
- * @flags: reservation flags (see linux/bootmem.h)
- *
- * Partial pages will be reserved.
- *
- * The range must reside completely on the specified node.
- *
- * Return: 0 on success, -errno on failure.
- */
-int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size, int flags)
-{
- unsigned long start, end;
-
- start = PFN_DOWN(physaddr);
- end = PFN_UP(physaddr + size);
-
- return mark_bootmem_node(pgdat->bdata, start, end, 1, flags);
-}
-
-/**
- * reserve_bootmem - mark a page range as reserved
- * @addr: starting address of the range
- * @size: size of the range in bytes
- * @flags: reservation flags (see linux/bootmem.h)
- *
- * Partial pages will be reserved.
- *
- * The range must be contiguous but may span node boundaries.
- *
- * Return: 0 on success, -errno on failure.
- */
-int __init reserve_bootmem(unsigned long addr, unsigned long size,
- int flags)
-{
- unsigned long start, end;
-
- start = PFN_DOWN(addr);
- end = PFN_UP(addr + size);
-
- return mark_bootmem(start, end, 1, flags);
-}
-
-static unsigned long __init align_idx(struct bootmem_data *bdata,
- unsigned long idx, unsigned long step)
-{
- unsigned long base = bdata->node_min_pfn;
-
- /*
- * Align the index with respect to the node start so that the
- * combination of both satisfies the requested alignment.
- */
-
- return ALIGN(base + idx, step) - base;
-}
-
-static unsigned long __init align_off(struct bootmem_data *bdata,
- unsigned long off, unsigned long align)
-{
- unsigned long base = PFN_PHYS(bdata->node_min_pfn);
-
- /* Same as align_idx for byte offsets */
-
- return ALIGN(base + off, align) - base;
-}
-
-static void * __init alloc_bootmem_bdata(struct bootmem_data *bdata,
- unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- unsigned long fallback = 0;
- unsigned long min, max, start, sidx, midx, step;
-
- bdebug("nid=%td size=%lx [%lu pages] align=%lx goal=%lx limit=%lx\n",
- bdata - bootmem_node_data, size, PAGE_ALIGN(size) >> PAGE_SHIFT,
- align, goal, limit);
-
- BUG_ON(!size);
- BUG_ON(align & (align - 1));
- BUG_ON(limit && goal + size > limit);
-
- if (!bdata->node_bootmem_map)
- return NULL;
-
- min = bdata->node_min_pfn;
- max = bdata->node_low_pfn;
-
- goal >>= PAGE_SHIFT;
- limit >>= PAGE_SHIFT;
-
- if (limit && max > limit)
- max = limit;
- if (max <= min)
- return NULL;
-
- step = max(align >> PAGE_SHIFT, 1UL);
-
- if (goal && min < goal && goal < max)
- start = ALIGN(goal, step);
- else
- start = ALIGN(min, step);
-
- sidx = start - bdata->node_min_pfn;
- midx = max - bdata->node_min_pfn;
-
- if (bdata->hint_idx > sidx) {
- /*
- * Handle the valid case of sidx being zero and still
- * catch the fallback below.
- */
- fallback = sidx + 1;
- sidx = align_idx(bdata, bdata->hint_idx, step);
- }
-
- while (1) {
- int merge;
- void *region;
- unsigned long eidx, i, start_off, end_off;
-find_block:
- sidx = find_next_zero_bit(bdata->node_bootmem_map, midx, sidx);
- sidx = align_idx(bdata, sidx, step);
- eidx = sidx + PFN_UP(size);
-
- if (sidx >= midx || eidx > midx)
- break;
-
- for (i = sidx; i < eidx; i++)
- if (test_bit(i, bdata->node_bootmem_map)) {
- sidx = align_idx(bdata, i, step);
- if (sidx == i)
- sidx += step;
- goto find_block;
- }
-
- if (bdata->last_end_off & (PAGE_SIZE - 1) &&
- PFN_DOWN(bdata->last_end_off) + 1 == sidx)
- start_off = align_off(bdata, bdata->last_end_off, align);
- else
- start_off = PFN_PHYS(sidx);
-
- merge = PFN_DOWN(start_off) < sidx;
- end_off = start_off + size;
-
- bdata->last_end_off = end_off;
- bdata->hint_idx = PFN_UP(end_off);
-
- /*
- * Reserve the area now:
- */
- if (__reserve(bdata, PFN_DOWN(start_off) + merge,
- PFN_UP(end_off), BOOTMEM_EXCLUSIVE))
- BUG();
-
- region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) +
- start_off);
- memset(region, 0, size);
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(region, size, 0, 0);
- return region;
- }
-
- if (fallback) {
- sidx = align_idx(bdata, fallback - 1, step);
- fallback = 0;
- goto find_block;
- }
-
- return NULL;
-}
-
-static void * __init alloc_bootmem_core(unsigned long size,
- unsigned long align,
- unsigned long goal,
- unsigned long limit)
-{
- bootmem_data_t *bdata;
- void *region;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
- list_for_each_entry(bdata, &bdata_list, list) {
- if (goal && bdata->node_low_pfn <= PFN_DOWN(goal))
- continue;
- if (limit && bdata->node_min_pfn >= PFN_DOWN(limit))
- break;
-
- region = alloc_bootmem_bdata(bdata, size, align, goal, limit);
- if (region)
- return region;
- }
-
- return NULL;
-}
-
-static void * __init ___alloc_bootmem_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal,
- unsigned long limit)
-{
- void *ptr;
-
-restart:
- ptr = alloc_bootmem_core(size, align, goal, limit);
- if (ptr)
- return ptr;
- if (goal) {
- goal = 0;
- goto restart;
- }
-
- return NULL;
-}
-
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- unsigned long limit = 0;
-
- return ___alloc_bootmem_nopanic(size, align, goal, limit);
-}
-
-static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
-
- if (mem)
- return mem;
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- pr_alert("bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of memory");
- return NULL;
-}
-
-void * __init __alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- unsigned long limit = 0;
-
- return ___alloc_bootmem(size, align, goal, limit);
-}
-
-void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
- unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-again:
-
- /* do not panic in alloc_bootmem_bdata() */
- if (limit && goal + size > limit)
- limit = 0;
-
- ptr = alloc_bootmem_bdata(pgdat->bdata, size, align, goal, limit);
- if (ptr)
- return ptr;
-
- ptr = alloc_bootmem_core(size, align, goal, limit);
- if (ptr)
- return ptr;
-
- if (goal) {
- goal = 0;
- goto again;
- }
-
- return NULL;
-}
-
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
-}
-
-void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal,
- unsigned long limit)
-{
- void *ptr;
-
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
- if (ptr)
- return ptr;
-
- pr_alert("bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of memory");
- return NULL;
-}
-
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
-}
-
-void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
-#ifdef MAX_DMA32_PFN
- unsigned long end_pfn;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- /* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat_end_pfn(pgdat);
-
- if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
- (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
- void *ptr;
- unsigned long new_goal;
-
- new_goal = MAX_DMA32_PFN << PAGE_SHIFT;
- ptr = alloc_bootmem_bdata(pgdat->bdata, size, align,
- new_goal, 0);
- if (ptr)
- return ptr;
- }
-#endif
-
- return __alloc_bootmem_node(pgdat, size, align, goal);
-
-}
-
-void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
-}
-
-void * __init __alloc_bootmem_low_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem_nopanic(size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
-}
-
-void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- return ___alloc_bootmem_node(pgdat, size, align,
- goal, ARCH_LOW_ADDRESS_LIMIT);
-}
diff --git a/mm/cleancache.c b/mm/cleancache.c
index 2bf12da..db7eee9 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Cleancache frontend
*
@@ -7,8 +8,6 @@
*
* Copyright (C) 2009-2010 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
*/
#include <linux/module.h>
@@ -305,8 +304,7 @@
{
#ifdef CONFIG_DEBUG_FS
struct dentry *root = debugfs_create_dir("cleancache", NULL);
- if (root == NULL)
- return -ENXIO;
+
debugfs_create_u64("succ_gets", 0444, root, &cleancache_succ_gets);
debugfs_create_u64("failed_gets", 0444, root, &cleancache_failed_gets);
debugfs_create_u64("puts", 0444, root, &cleancache_puts);
diff --git a/mm/cma.c b/mm/cma.c
index 4cb7612..7fe0b83 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Contiguous Memory Allocator
*
@@ -9,11 +10,6 @@
* Michal Nazarewicz <mina86@mina86.com>
* Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
* Joonsoo Kim <iamjoonsoo.kim@lge.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License or (at your optional) any later version of the license.
*/
#define pr_fmt(fmt) "cma: " fmt
@@ -106,8 +102,10 @@
cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
- if (!cma->bitmap)
+ if (!cma->bitmap) {
+ cma->count = 0;
return -ENOMEM;
+ }
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -280,6 +278,12 @@
*/
alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
+ if (fixed && base & (alignment - 1)) {
+ ret = -EINVAL;
+ pr_err("Region at %pa must be aligned to %pa bytes\n",
+ &base, &alignment);
+ goto err;
+ }
base = ALIGN(base, alignment);
size = ALIGN(size, alignment);
limit &= ~(alignment - 1);
@@ -310,6 +314,13 @@
if (limit == 0 || limit > memblock_end)
limit = memblock_end;
+ if (base + size > limit) {
+ ret = -EINVAL;
+ pr_err("Size (%pa) of region at %pa exceeds limit (%pa)\n",
+ &size, &base, &limit);
+ goto err;
+ }
+
/* Reserve memory */
if (fixed) {
if (memblock_is_region_reserved(base, size) ||
@@ -327,16 +338,14 @@
* memory in case of failure.
*/
if (base < highmem_start && limit > highmem_start) {
- addr = memblock_alloc_range(size, alignment,
- highmem_start, limit,
- MEMBLOCK_NONE);
+ addr = memblock_phys_alloc_range(size, alignment,
+ highmem_start, limit);
limit = highmem_start;
}
if (!addr) {
- addr = memblock_alloc_range(size, alignment, base,
- limit,
- MEMBLOCK_NONE);
+ addr = memblock_phys_alloc_range(size, alignment, base,
+ limit);
if (!addr) {
ret = -ENOMEM;
goto err;
@@ -353,12 +362,14 @@
ret = cma_init_reserved_mem(base, size, order_per_bit, name, res_cma);
if (ret)
- goto err;
+ goto free_mem;
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
+free_mem:
+ memblock_free(base, size);
err:
pr_err("Failed to reserve %ld MiB\n", (unsigned long)size / SZ_1M);
return ret;
@@ -367,23 +378,26 @@
#ifdef CONFIG_CMA_DEBUG
static void cma_debug_show_areas(struct cma *cma)
{
- unsigned long next_zero_bit, next_set_bit;
+ unsigned long next_zero_bit, next_set_bit, nr_zero;
unsigned long start = 0;
- unsigned int nr_zero, nr_total = 0;
+ unsigned long nr_part, nr_total = 0;
+ unsigned long nbits = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
pr_info("number of available pages: ");
for (;;) {
- next_zero_bit = find_next_zero_bit(cma->bitmap, cma->count, start);
- if (next_zero_bit >= cma->count)
+ next_zero_bit = find_next_zero_bit(cma->bitmap, nbits, start);
+ if (next_zero_bit >= nbits)
break;
- next_set_bit = find_next_bit(cma->bitmap, cma->count, next_zero_bit);
+ next_set_bit = find_next_bit(cma->bitmap, nbits, next_zero_bit);
nr_zero = next_set_bit - next_zero_bit;
- pr_cont("%s%u@%lu", nr_total ? "+" : "", nr_zero, next_zero_bit);
- nr_total += nr_zero;
+ nr_part = nr_zero << cma->order_per_bit;
+ pr_cont("%s%lu@%lu", nr_total ? "+" : "", nr_part,
+ next_zero_bit);
+ nr_total += nr_part;
start = next_zero_bit + nr_zero;
}
- pr_cont("=> %u free of %lu total pages\n", nr_total, cma->count);
+ pr_cont("=> %lu free of %lu total pages\n", nr_total, cma->count);
mutex_unlock(&cma->lock);
}
#else
@@ -407,6 +421,7 @@
unsigned long pfn = -1;
unsigned long start = 0;
unsigned long bitmap_maxno, bitmap_no, bitmap_count;
+ size_t i;
struct page *page = NULL;
int ret = -ENOMEM;
@@ -466,6 +481,16 @@
trace_cma_alloc(pfn, page, count, align);
+ /*
+ * CMA can allocate multiple page blocks, which results in different
+ * blocks being marked with different tags. Reset the tags to ignore
+ * those page blocks.
+ */
+ if (page) {
+ for (i = 0; i < count; i++)
+ page_kasan_tag_reset(page + i);
+ }
+
if (ret && !no_warn) {
pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n",
__func__, count, ret);
@@ -482,7 +507,7 @@
* @pages: Allocated pages.
* @count: Number of allocated pages.
*
- * This function releases memory allocated by alloc_cma().
+ * This function releases memory allocated by cma_alloc().
* It returns false when provided pages do not belong to contiguous area and
* true otherwise.
*/
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index ad6723e..a7dd9e8 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -21,8 +21,6 @@
unsigned long n;
};
-static struct dentry *cma_debugfs_root;
-
static int cma_debugfs_get(void *data, u64 *val)
{
unsigned long *p = data;
@@ -58,7 +56,7 @@
mutex_lock(&cma->lock);
for (;;) {
start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
- if (start >= cma->count)
+ if (start >= bitmap_maxno)
break;
end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
@@ -162,7 +160,7 @@
}
DEFINE_SIMPLE_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
-static void cma_debugfs_add_one(struct cma *cma, int idx)
+static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
char name[16];
@@ -170,7 +168,7 @@
scnprintf(name, sizeof(name), "cma-%s", cma->name);
- tmp = debugfs_create_dir(name, cma_debugfs_root);
+ tmp = debugfs_create_dir(name, root_dentry);
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
@@ -188,14 +186,13 @@
static int __init cma_debugfs_init(void)
{
+ struct dentry *cma_debugfs_root;
int i;
cma_debugfs_root = debugfs_create_dir("cma", NULL);
- if (!cma_debugfs_root)
- return -ENOMEM;
for (i = 0; i < cma_area_count; i++)
- cma_debugfs_add_one(&cma_areas[i], i);
+ cma_debugfs_add_one(&cma_areas[i], cma_debugfs_root);
return 0;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index faca45e..672d3c7 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -22,6 +22,7 @@
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/page_owner.h>
+#include <linux/psi.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -65,7 +66,7 @@
return high_pfn;
}
-static void map_pages(struct list_head *list)
+static void split_map_pages(struct list_head *list)
{
unsigned int i, order, nr_pages;
struct page *page, *next;
@@ -236,6 +237,78 @@
return false;
}
+static bool
+__reset_isolation_pfn(struct zone *zone, unsigned long pfn, bool check_source,
+ bool check_target)
+{
+ struct page *page = pfn_to_online_page(pfn);
+ struct page *block_page;
+ struct page *end_page;
+ unsigned long block_pfn;
+
+ if (!page)
+ return false;
+ if (zone != page_zone(page))
+ return false;
+ if (pageblock_skip_persistent(page))
+ return false;
+
+ /*
+ * If skip is already cleared do no further checking once the
+ * restart points have been set.
+ */
+ if (check_source && check_target && !get_pageblock_skip(page))
+ return true;
+
+ /*
+ * If clearing skip for the target scanner, do not select a
+ * non-movable pageblock as the starting point.
+ */
+ if (!check_source && check_target &&
+ get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ return false;
+
+ /* Ensure the start of the pageblock or zone is online and valid */
+ block_pfn = pageblock_start_pfn(pfn);
+ block_pfn = max(block_pfn, zone->zone_start_pfn);
+ block_page = pfn_to_online_page(block_pfn);
+ if (block_page) {
+ page = block_page;
+ pfn = block_pfn;
+ }
+
+ /* Ensure the end of the pageblock or zone is online and valid */
+ block_pfn = pageblock_end_pfn(pfn) - 1;
+ block_pfn = min(block_pfn, zone_end_pfn(zone) - 1);
+ end_page = pfn_to_online_page(block_pfn);
+ if (!end_page)
+ return false;
+
+ /*
+ * Only clear the hint if a sample indicates there is either a
+ * free page or an LRU page in the block. One or other condition
+ * is necessary for the block to be a migration source/target.
+ */
+ do {
+ if (pfn_valid_within(pfn)) {
+ if (check_source && PageLRU(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+
+ if (check_target && PageBuddy(page)) {
+ clear_pageblock_skip(page);
+ return true;
+ }
+ }
+
+ page += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ pfn += (1 << PAGE_ALLOC_COSTLY_ORDER);
+ } while (page <= end_page);
+
+ return false;
+}
+
/*
* This function is called to clear all cached information on pageblocks that
* should be skipped for page isolation when the migrate and free page scanner
@@ -243,30 +316,54 @@
*/
static void __reset_isolation_suitable(struct zone *zone)
{
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
- unsigned long pfn;
+ unsigned long migrate_pfn = zone->zone_start_pfn;
+ unsigned long free_pfn = zone_end_pfn(zone) - 1;
+ unsigned long reset_migrate = free_pfn;
+ unsigned long reset_free = migrate_pfn;
+ bool source_set = false;
+ bool free_set = false;
+
+ if (!zone->compact_blockskip_flush)
+ return;
zone->compact_blockskip_flush = false;
- /* Walk the zone and mark every pageblock as suitable for isolation */
- for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
- struct page *page;
-
+ /*
+ * Walk the zone and update pageblock skip information. Source looks
+ * for PageLRU while target looks for PageBuddy. When the scanner
+ * is found, both PageBuddy and PageLRU are checked as the pageblock
+ * is suitable as both source and target.
+ */
+ for (; migrate_pfn < free_pfn; migrate_pfn += pageblock_nr_pages,
+ free_pfn -= pageblock_nr_pages) {
cond_resched();
- page = pfn_to_online_page(pfn);
- if (!page)
- continue;
- if (zone != page_zone(page))
- continue;
- if (pageblock_skip_persistent(page))
- continue;
+ /* Update the migrate PFN */
+ if (__reset_isolation_pfn(zone, migrate_pfn, true, source_set) &&
+ migrate_pfn < reset_migrate) {
+ source_set = true;
+ reset_migrate = migrate_pfn;
+ zone->compact_init_migrate_pfn = reset_migrate;
+ zone->compact_cached_migrate_pfn[0] = reset_migrate;
+ zone->compact_cached_migrate_pfn[1] = reset_migrate;
+ }
- clear_pageblock_skip(page);
+ /* Update the free PFN */
+ if (__reset_isolation_pfn(zone, free_pfn, free_set, true) &&
+ free_pfn > reset_free) {
+ free_set = true;
+ reset_free = free_pfn;
+ zone->compact_init_free_pfn = reset_free;
+ zone->compact_cached_free_pfn = reset_free;
+ }
}
- reset_cached_positions(zone);
+ /* Leave no distance if no suitable block was reset */
+ if (reset_migrate >= reset_free) {
+ zone->compact_cached_migrate_pfn[0] = migrate_pfn;
+ zone->compact_cached_migrate_pfn[1] = migrate_pfn;
+ zone->compact_cached_free_pfn = free_pfn;
+ }
}
void reset_isolation_suitable(pg_data_t *pgdat)
@@ -285,15 +382,53 @@
}
/*
+ * Sets the pageblock skip bit if it was clear. Note that this is a hint as
+ * locks are not required for read/writers. Returns true if it was already set.
+ */
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ bool skip;
+
+ /* Do not update if skip hint is being ignored */
+ if (cc->ignore_skip_hint)
+ return false;
+
+ if (!IS_ALIGNED(pfn, pageblock_nr_pages))
+ return false;
+
+ skip = get_pageblock_skip(page);
+ if (!skip && !cc->no_set_skip_hint)
+ set_pageblock_skip(page);
+
+ return skip;
+}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+ struct zone *zone = cc->zone;
+
+ pfn = pageblock_end_pfn(pfn);
+
+ /* Set for isolation rather than compaction */
+ if (cc->no_set_skip_hint)
+ return;
+
+ if (pfn > zone->compact_cached_migrate_pfn[0])
+ zone->compact_cached_migrate_pfn[0] = pfn;
+ if (cc->mode != MIGRATE_ASYNC &&
+ pfn > zone->compact_cached_migrate_pfn[1])
+ zone->compact_cached_migrate_pfn[1] = pfn;
+}
+
+/*
* If no pages were isolated then mark this pageblock to be skipped in the
* future. The information is later cleared by __reset_isolation_suitable().
*/
static void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
struct zone *zone = cc->zone;
- unsigned long pfn;
if (cc->no_set_skip_hint)
return;
@@ -301,24 +436,11 @@
if (!page)
return;
- if (nr_isolated)
- return;
-
set_pageblock_skip(page);
- pfn = page_to_pfn(page);
-
/* Update where async and sync compaction should restart */
- if (migrate_scanner) {
- if (pfn > zone->compact_cached_migrate_pfn[0])
- zone->compact_cached_migrate_pfn[0] = pfn;
- if (cc->mode != MIGRATE_ASYNC &&
- pfn > zone->compact_cached_migrate_pfn[1])
- zone->compact_cached_migrate_pfn[1] = pfn;
- } else {
- if (pfn < zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = pfn;
- }
+ if (pfn < zone->compact_cached_free_pfn)
+ zone->compact_cached_free_pfn = pfn;
}
#else
static inline bool isolation_suitable(struct compact_control *cc,
@@ -333,32 +455,42 @@
}
static inline void update_pageblock_skip(struct compact_control *cc,
- struct page *page, unsigned long nr_isolated,
- bool migrate_scanner)
+ struct page *page, unsigned long pfn)
{
}
+
+static void update_cached_migrate(struct compact_control *cc, unsigned long pfn)
+{
+}
+
+static bool test_and_set_skip(struct compact_control *cc, struct page *page,
+ unsigned long pfn)
+{
+ return false;
+}
#endif /* CONFIG_COMPACTION */
/*
* Compaction requires the taking of some coarse locks that are potentially
- * very heavily contended. For async compaction, back out if the lock cannot
- * be taken immediately. For sync compaction, spin on the lock if needed.
+ * very heavily contended. For async compaction, trylock and record if the
+ * lock is contended. The lock will still be acquired but compaction will
+ * abort when the current block is finished regardless of success rate.
+ * Sync compaction acquires the lock.
*
- * Returns true if the lock is held
- * Returns false if the lock is not held and compaction should abort
+ * Always returns true which makes it easier to track lock state in callers.
*/
-static bool compact_trylock_irqsave(spinlock_t *lock, unsigned long *flags,
+static bool compact_lock_irqsave(spinlock_t *lock, unsigned long *flags,
struct compact_control *cc)
{
- if (cc->mode == MIGRATE_ASYNC) {
- if (!spin_trylock_irqsave(lock, *flags)) {
- cc->contended = true;
- return false;
- }
- } else {
- spin_lock_irqsave(lock, *flags);
+ /* Track if the lock is contended in async mode */
+ if (cc->mode == MIGRATE_ASYNC && !cc->contended) {
+ if (spin_trylock_irqsave(lock, *flags))
+ return true;
+
+ cc->contended = true;
}
+ spin_lock_irqsave(lock, *flags);
return true;
}
@@ -390,37 +522,7 @@
return true;
}
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
- cond_resched();
- }
-
- return false;
-}
-
-/*
- * Aside from avoiding lock contention, compaction also periodically checks
- * need_resched() and either schedules in sync compaction or aborts async
- * compaction. This is similar to what compact_unlock_should_abort() does, but
- * is used where no lock is concerned.
- *
- * Returns false when no scheduling was needed, or sync compaction scheduled.
- * Returns true when async compaction should abort.
- */
-static inline bool compact_should_abort(struct compact_control *cc)
-{
- /* async compaction aborts if contended */
- if (need_resched()) {
- if (cc->mode == MIGRATE_ASYNC) {
- cc->contended = true;
- return true;
- }
-
- cond_resched();
- }
+ cond_resched();
return false;
}
@@ -434,19 +536,24 @@
unsigned long *start_pfn,
unsigned long end_pfn,
struct list_head *freelist,
+ unsigned int stride,
bool strict)
{
int nr_scanned = 0, total_isolated = 0;
- struct page *cursor, *valid_page = NULL;
+ struct page *cursor;
unsigned long flags = 0;
bool locked = false;
unsigned long blockpfn = *start_pfn;
unsigned int order;
+ /* Strict mode is for isolation, speed is secondary */
+ if (strict)
+ stride = 1;
+
cursor = pfn_to_page(blockpfn);
/* Isolate free pages. */
- for (; blockpfn < end_pfn; blockpfn++, cursor++) {
+ for (; blockpfn < end_pfn; blockpfn += stride, cursor += stride) {
int isolated;
struct page *page = cursor;
@@ -464,9 +571,6 @@
if (!pfn_valid_within(blockpfn))
goto isolate_fail;
- if (!valid_page)
- valid_page = page;
-
/*
* For compound pages such as THP and hugetlbfs, we can save
* potentially a lot of iterations if we skip them at once.
@@ -494,18 +598,8 @@
* recheck as well.
*/
if (!locked) {
- /*
- * The zone lock must be held to isolate freepages.
- * Unfortunately this is a very coarse lock and can be
- * heavily contended if there are parallel allocations
- * or parallel compactions. For async compaction do not
- * spin on the lock and we acquire the lock as late as
- * possible.
- */
- locked = compact_trylock_irqsave(&cc->zone->lock,
+ locked = compact_lock_irqsave(&cc->zone->lock,
&flags, cc);
- if (!locked)
- break;
/* Recheck this is a buddy page under lock */
if (!PageBuddy(page))
@@ -564,10 +658,6 @@
if (strict && blockpfn < end_pfn)
total_isolated = 0;
- /* Update the pageblock-skip if the whole pageblock was scanned */
- if (blockpfn == end_pfn)
- update_pageblock_skip(cc, valid_page, total_isolated, false);
-
cc->total_free_scanned += nr_scanned;
if (total_isolated)
count_compact_events(COMPACTISOLATED, total_isolated);
@@ -625,7 +715,7 @@
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, &freelist, true);
+ block_end_pfn, &freelist, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -643,7 +733,7 @@
}
/* __isolate_free_page() does not map the pages */
- map_pages(&freelist);
+ split_map_pages(&freelist);
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
@@ -656,16 +746,16 @@
}
/* Similar to reclaim, but different enough that they don't share logic */
-static bool too_many_isolated(struct zone *zone)
+static bool too_many_isolated(pg_data_t *pgdat)
{
unsigned long active, inactive, isolated;
- inactive = node_page_state(zone->zone_pgdat, NR_INACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_INACTIVE_ANON);
- active = node_page_state(zone->zone_pgdat, NR_ACTIVE_FILE) +
- node_page_state(zone->zone_pgdat, NR_ACTIVE_ANON);
- isolated = node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE) +
- node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON);
+ inactive = node_page_state(pgdat, NR_INACTIVE_FILE) +
+ node_page_state(pgdat, NR_INACTIVE_ANON);
+ active = node_page_state(pgdat, NR_ACTIVE_FILE) +
+ node_page_state(pgdat, NR_ACTIVE_ANON);
+ isolated = node_page_state(pgdat, NR_ISOLATED_FILE) +
+ node_page_state(pgdat, NR_ISOLATED_ANON);
return isolated > (inactive + active) / 2;
}
@@ -692,7 +782,7 @@
isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long end_pfn, isolate_mode_t isolate_mode)
{
- struct zone *zone = cc->zone;
+ pg_data_t *pgdat = cc->zone->zone_pgdat;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
@@ -701,13 +791,14 @@
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
unsigned long next_skip_pfn = 0;
+ bool skip_updated = false;
/*
* Ensure that there are not too many pages isolated from the LRU
* list by either parallel reclaimers or compaction. If there are,
* delay for some time until fewer pages are isolated
*/
- while (unlikely(too_many_isolated(zone))) {
+ while (unlikely(too_many_isolated(pgdat))) {
/* async migration should just abort */
if (cc->mode == MIGRATE_ASYNC)
return 0;
@@ -718,8 +809,7 @@
return 0;
}
- if (compact_should_abort(cc))
- return 0;
+ cond_resched();
if (cc->direct_compaction && (cc->mode == MIGRATE_ASYNC)) {
skip_on_failure = true;
@@ -753,13 +843,15 @@
/*
* Periodically drop the lock (if held) regardless of its
- * contention, to give chance to IRQs. Abort async compaction
- * if contended.
+ * contention, to give chance to IRQs. Abort completely if
+ * a fatal signal is pending.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(zone_lru_lock(zone), flags,
- &locked, cc))
- break;
+ && compact_unlock_should_abort(&pgdat->lru_lock,
+ flags, &locked, cc)) {
+ low_pfn = 0;
+ goto fatal_pending;
+ }
if (!pfn_valid_within(low_pfn))
goto isolate_fail;
@@ -767,8 +859,19 @@
page = pfn_to_page(low_pfn);
- if (!valid_page)
+ /*
+ * Check if the pageblock has already been marked skipped.
+ * Only the aligned PFN is checked as the caller isolates
+ * COMPACT_CLUSTER_MAX at a time so the second call must
+ * not falsely conclude that the block should be skipped.
+ */
+ if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
+ if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
+ low_pfn = end_pfn;
+ goto isolate_abort;
+ }
valid_page = page;
+ }
/*
* Skip if free. We read page order here without zone lock
@@ -817,7 +920,7 @@
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone),
+ spin_unlock_irqrestore(&pgdat->lru_lock,
flags);
locked = false;
}
@@ -847,10 +950,15 @@
/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
- locked = compact_trylock_irqsave(zone_lru_lock(zone),
+ locked = compact_lock_irqsave(&pgdat->lru_lock,
&flags, cc);
- if (!locked)
- break;
+
+ /* Try to get exclusive access under lock */
+ if (!skip_updated) {
+ skip_updated = true;
+ if (test_and_set_skip(cc, page, low_pfn))
+ goto isolate_abort;
+ }
/* Recheck PageLRU and PageCompound under lock */
if (!PageLRU(page))
@@ -862,12 +970,12 @@
* is safe to read and it's 0 for tail pages.
*/
if (unlikely(PageCompound(page))) {
- low_pfn += (1UL << compound_order(page)) - 1;
+ low_pfn += compound_nr(page) - 1;
goto isolate_fail;
}
}
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
/* Try isolate the page */
if (__isolate_lru_page(page, isolate_mode) != 0)
@@ -886,16 +994,13 @@
nr_isolated++;
/*
- * Record where we could have freed pages by migration and not
- * yet flushed them to buddy allocator.
- * - this is the lowest page that was isolated and likely be
- * then freed by migration.
+ * Avoid isolating too much unless this block is being
+ * rescanned (e.g. dirty/writeback pages, parallel allocation)
+ * or a lock is contended. For contention, isolate quickly to
+ * potentially remove one source of contention.
*/
- if (!cc->last_migrated_pfn)
- cc->last_migrated_pfn = low_pfn;
-
- /* Avoid isolating too much */
- if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) {
+ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX &&
+ !cc->rescan && !cc->contended) {
++low_pfn;
break;
}
@@ -912,12 +1017,11 @@
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
locked = false;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
- cc->last_migrated_pfn = 0;
nr_isolated = 0;
}
@@ -938,19 +1042,28 @@
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+isolate_abort:
if (locked)
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
/*
- * Update the pageblock-skip information and cached scanner pfn,
- * if the whole pageblock was scanned without isolating any page.
+ * Update the cached scanner pfn once the pageblock has been scanned.
+ * Pages will either be migrated in which case there is no point
+ * scanning in the near future or migration failed in which case the
+ * failure reason may persist. The block is marked for skipping if
+ * there were no pages isolated in the block or if the block is
+ * rescanned twice in a row.
*/
- if (low_pfn == end_pfn)
- update_pageblock_skip(cc, valid_page, nr_isolated, true);
+ if (low_pfn == end_pfn && (!nr_isolated || cc->rescan)) {
+ if (valid_page && !skip_updated)
+ set_pageblock_skip(valid_page);
+ update_cached_migrate(cc, low_pfn);
+ }
trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
nr_scanned, nr_isolated);
+fatal_pending:
cc->total_migrate_scanned += nr_scanned;
if (nr_isolated)
count_compact_events(COMPACTISOLATED, nr_isolated);
@@ -1012,6 +1125,9 @@
{
int block_mt;
+ if (pageblock_skip_persistent(page))
+ return false;
+
if ((cc->mode != MIGRATE_ASYNC) || !cc->direct_compaction)
return true;
@@ -1049,6 +1165,14 @@
return false;
}
+static inline unsigned int
+freelist_scan_limit(struct compact_control *cc)
+{
+ unsigned short shift = BITS_PER_LONG - 1;
+
+ return (COMPACT_CLUSTER_MAX >> min(shift, cc->fast_search_fail)) + 1;
+}
+
/*
* Test whether the free scanner has reached the same or lower pageblock than
* the migration scanner, and compaction should thus terminate.
@@ -1060,6 +1184,248 @@
}
/*
+ * Used when scanning for a suitable migration target which scans freelists
+ * in reverse. Reorders the list such that the unscanned pages are scanned
+ * first on the next iteration of the free scanner
+ */
+static void
+move_freelist_head(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_last(freelist, &freepage->lru)) {
+ list_cut_before(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+/*
+ * Similar to move_freelist_head except used by the migration scanner
+ * when scanning forward. It's possible for these list operations to
+ * move against each other if they search the free list exactly in
+ * lockstep.
+ */
+static void
+move_freelist_tail(struct list_head *freelist, struct page *freepage)
+{
+ LIST_HEAD(sublist);
+
+ if (!list_is_first(freelist, &freepage->lru)) {
+ list_cut_position(&sublist, freelist, &freepage->lru);
+ if (!list_empty(&sublist))
+ list_splice_tail(&sublist, freelist);
+ }
+}
+
+static void
+fast_isolate_around(struct compact_control *cc, unsigned long pfn, unsigned long nr_isolated)
+{
+ unsigned long start_pfn, end_pfn;
+ struct page *page = pfn_to_page(pfn);
+
+ /* Do not search around if there are enough pages already */
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+
+ /* Minimise scanning during async compaction */
+ if (cc->direct_compaction && cc->mode == MIGRATE_ASYNC)
+ return;
+
+ /* Pageblock boundaries */
+ start_pfn = pageblock_start_pfn(pfn);
+ end_pfn = min(pageblock_end_pfn(pfn), zone_end_pfn(cc->zone)) - 1;
+
+ /* Scan before */
+ if (start_pfn != pfn) {
+ isolate_freepages_block(cc, &start_pfn, pfn, &cc->freepages, 1, false);
+ if (cc->nr_freepages >= cc->nr_migratepages)
+ return;
+ }
+
+ /* Scan after */
+ start_pfn = pfn + nr_isolated;
+ if (start_pfn < end_pfn)
+ isolate_freepages_block(cc, &start_pfn, end_pfn, &cc->freepages, 1, false);
+
+ /* Skip this pageblock in the future as it's full or nearly full */
+ if (cc->nr_freepages < cc->nr_migratepages)
+ set_pageblock_skip(page);
+}
+
+/* Search orders in round-robin fashion */
+static int next_search_order(struct compact_control *cc, int order)
+{
+ order--;
+ if (order < 0)
+ order = cc->order - 1;
+
+ /* Search wrapped around? */
+ if (order == cc->search_order) {
+ cc->search_order--;
+ if (cc->search_order < 0)
+ cc->search_order = cc->order - 1;
+ return -1;
+ }
+
+ return order;
+}
+
+static unsigned long
+fast_isolate_freepages(struct compact_control *cc)
+{
+ unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
+ unsigned int nr_scanned = 0;
+ unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long nr_isolated = 0;
+ unsigned long distance;
+ struct page *page = NULL;
+ bool scan_start = false;
+ int order;
+
+ /* Full compaction passes in a negative order */
+ if (cc->order <= 0)
+ return cc->free_pfn;
+
+ /*
+ * If starting the scan, use a deeper search and use the highest
+ * PFN found if a suitable one is not found.
+ */
+ if (cc->free_pfn >= cc->zone->compact_init_free_pfn) {
+ limit = pageblock_nr_pages >> 1;
+ scan_start = true;
+ }
+
+ /*
+ * Preferred point is in the top quarter of the scan space but take
+ * a pfn from the top half if the search is problematic.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn);
+ low_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 2));
+ min_pfn = pageblock_start_pfn(cc->free_pfn - (distance >> 1));
+
+ if (WARN_ON_ONCE(min_pfn > low_pfn))
+ low_pfn = min_pfn;
+
+ /*
+ * Search starts from the last successful isolation order or the next
+ * order to search after a previous failure
+ */
+ cc->search_order = min_t(unsigned int, cc->order - 1, cc->search_order);
+
+ for (order = cc->search_order;
+ !page && order >= 0;
+ order = next_search_order(cc, order)) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ struct page *freepage;
+ unsigned long flags;
+ unsigned int order_scanned = 0;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry_reverse(freepage, freelist, lru) {
+ unsigned long pfn;
+
+ order_scanned++;
+ nr_scanned++;
+ pfn = page_to_pfn(freepage);
+
+ if (pfn >= highest)
+ highest = pageblock_start_pfn(pfn);
+
+ if (pfn >= low_pfn) {
+ cc->fast_search_fail = 0;
+ cc->search_order = order;
+ page = freepage;
+ break;
+ }
+
+ if (pfn >= min_pfn && pfn > high_pfn) {
+ high_pfn = pfn;
+
+ /* Shorten the scan if a candidate is found */
+ limit >>= 1;
+ }
+
+ if (order_scanned >= limit)
+ break;
+ }
+
+ /* Use a minimum pfn if a preferred one was not found */
+ if (!page && high_pfn) {
+ page = pfn_to_page(high_pfn);
+
+ /* Update freepage for the list reorder below */
+ freepage = page;
+ }
+
+ /* Reorder so a future search skips recent pages */
+ move_freelist_head(freelist, freepage);
+
+ /* Isolate the page if available */
+ if (page) {
+ if (__isolate_free_page(page, order)) {
+ set_page_private(page, order);
+ nr_isolated = 1 << order;
+ cc->nr_freepages += nr_isolated;
+ list_add_tail(&page->lru, &cc->freepages);
+ count_compact_events(COMPACTISOLATED, nr_isolated);
+ } else {
+ /* If isolation fails, abort the search */
+ order = cc->search_order + 1;
+ page = NULL;
+ }
+ }
+
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+
+ /*
+ * Smaller scan on next order so the total scan is related
+ * to freelist_scan_limit.
+ */
+ if (order_scanned >= limit)
+ limit = min(1U, limit >> 1);
+ }
+
+ if (!page) {
+ cc->fast_search_fail++;
+ if (scan_start) {
+ /*
+ * Use the highest PFN found above min. If one was
+ * not found, be pessimistic for direct compaction
+ * and use the min mark.
+ */
+ if (highest) {
+ page = pfn_to_page(highest);
+ cc->free_pfn = highest;
+ } else {
+ if (cc->direct_compaction && pfn_valid(min_pfn)) {
+ page = pfn_to_page(min_pfn);
+ cc->free_pfn = min_pfn;
+ }
+ }
+ }
+ }
+
+ if (highest && highest >= cc->zone->compact_cached_free_pfn) {
+ highest -= pageblock_nr_pages;
+ cc->zone->compact_cached_free_pfn = highest;
+ }
+
+ cc->total_free_scanned += nr_scanned;
+ if (!page)
+ return cc->free_pfn;
+
+ low_pfn = page_to_pfn(page);
+ fast_isolate_around(cc, low_pfn, nr_isolated);
+ return low_pfn;
+}
+
+/*
* Based on information in the current compact_control, find blocks
* suitable for isolating free pages from and then isolate them.
*/
@@ -1072,6 +1438,12 @@
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
struct list_head *freelist = &cc->freepages;
+ unsigned int stride;
+
+ /* Try a small search of the free lists for a candidate */
+ isolate_start_pfn = fast_isolate_freepages(cc);
+ if (cc->nr_freepages)
+ goto splitmap;
/*
* Initialise the free scanner. The starting point is where we last
@@ -1085,10 +1457,11 @@
* is using.
*/
isolate_start_pfn = cc->free_pfn;
- block_start_pfn = pageblock_start_pfn(cc->free_pfn);
+ block_start_pfn = pageblock_start_pfn(isolate_start_pfn);
block_end_pfn = min(block_start_pfn + pageblock_nr_pages,
zone_end_pfn(zone));
low_pfn = pageblock_end_pfn(cc->migrate_pfn);
+ stride = cc->mode == MIGRATE_ASYNC ? COMPACT_CLUSTER_MAX : 1;
/*
* Isolate free pages until enough are available to migrate the
@@ -1099,14 +1472,14 @@
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
+ unsigned long nr_isolated;
+
/*
* This can iterate a massively long zone without finding any
- * suitable migration targets, so periodically check if we need
- * to schedule, or even abort async compaction.
+ * suitable migration targets, so periodically check resched.
*/
- if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
zone);
@@ -1122,15 +1495,15 @@
continue;
/* Found a block suitable for isolating free pages from. */
- isolate_freepages_block(cc, &isolate_start_pfn, block_end_pfn,
- freelist, false);
+ nr_isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ block_end_pfn, freelist, stride, false);
- /*
- * If we isolated enough freepages, or aborted due to lock
- * contention, terminate.
- */
- if ((cc->nr_freepages >= cc->nr_migratepages)
- || cc->contended) {
+ /* Update the skip hint if the full pageblock was scanned */
+ if (isolate_start_pfn == block_end_pfn)
+ update_pageblock_skip(cc, page, block_start_pfn);
+
+ /* Are enough freepages isolated? */
+ if (cc->nr_freepages >= cc->nr_migratepages) {
if (isolate_start_pfn >= block_end_pfn) {
/*
* Restart at previous pageblock if more
@@ -1147,10 +1520,14 @@
*/
break;
}
- }
- /* __isolate_free_page() does not map the pages */
- map_pages(freelist);
+ /* Adjust stride depending on isolation */
+ if (nr_isolated) {
+ stride = 1;
+ continue;
+ }
+ stride = min_t(unsigned int, COMPACT_CLUSTER_MAX, stride << 1);
+ }
/*
* Record where the free scanner will restart next time. Either we
@@ -1159,6 +1536,10 @@
* and the loop terminated due to isolate_start_pfn < low_pfn
*/
cc->free_pfn = isolate_start_pfn;
+
+splitmap:
+ /* __isolate_free_page() does not map the pages */
+ split_map_pages(freelist);
}
/*
@@ -1171,13 +1552,8 @@
struct compact_control *cc = (struct compact_control *)data;
struct page *freepage;
- /*
- * Isolate free pages if necessary, and if we are not aborting due to
- * contention.
- */
if (list_empty(&cc->freepages)) {
- if (!cc->contended)
- isolate_freepages(cc);
+ isolate_freepages(cc);
if (list_empty(&cc->freepages))
return NULL;
@@ -1216,13 +1592,153 @@
*/
int sysctl_compact_unevictable_allowed __read_mostly = 1;
+static inline void
+update_fast_start_pfn(struct compact_control *cc, unsigned long pfn)
+{
+ if (cc->fast_start_pfn == ULONG_MAX)
+ return;
+
+ if (!cc->fast_start_pfn)
+ cc->fast_start_pfn = pfn;
+
+ cc->fast_start_pfn = min(cc->fast_start_pfn, pfn);
+}
+
+static inline unsigned long
+reinit_migrate_pfn(struct compact_control *cc)
+{
+ if (!cc->fast_start_pfn || cc->fast_start_pfn == ULONG_MAX)
+ return cc->migrate_pfn;
+
+ cc->migrate_pfn = cc->fast_start_pfn;
+ cc->fast_start_pfn = ULONG_MAX;
+
+ return cc->migrate_pfn;
+}
+
+/*
+ * Briefly search the free lists for a migration source that already has
+ * some free pages to reduce the number of pages that need migration
+ * before a pageblock is free.
+ */
+static unsigned long fast_find_migrateblock(struct compact_control *cc)
+{
+ unsigned int limit = freelist_scan_limit(cc);
+ unsigned int nr_scanned = 0;
+ unsigned long distance;
+ unsigned long pfn = cc->migrate_pfn;
+ unsigned long high_pfn;
+ int order;
+
+ /* Skip hints are relied on to avoid repeats on the fast search */
+ if (cc->ignore_skip_hint)
+ return pfn;
+
+ /*
+ * If the migrate_pfn is not at the start of a zone or the start
+ * of a pageblock then assume this is a continuation of a previous
+ * scan restarted due to COMPACT_CLUSTER_MAX.
+ */
+ if (pfn != cc->zone->zone_start_pfn && pfn != pageblock_start_pfn(pfn))
+ return pfn;
+
+ /*
+ * For smaller orders, just linearly scan as the number of pages
+ * to migrate should be relatively small and does not necessarily
+ * justify freeing up a large block for a small allocation.
+ */
+ if (cc->order <= PAGE_ALLOC_COSTLY_ORDER)
+ return pfn;
+
+ /*
+ * Only allow kcompactd and direct requests for movable pages to
+ * quickly clear out a MOVABLE pageblock for allocation. This
+ * reduces the risk that a large movable pageblock is freed for
+ * an unmovable/reclaimable small allocation.
+ */
+ if (cc->direct_compaction && cc->migratetype != MIGRATE_MOVABLE)
+ return pfn;
+
+ /*
+ * When starting the migration scanner, pick any pageblock within the
+ * first half of the search space. Otherwise try and pick a pageblock
+ * within the first eighth to reduce the chances that a migration
+ * target later becomes a source.
+ */
+ distance = (cc->free_pfn - cc->migrate_pfn) >> 1;
+ if (cc->migrate_pfn != cc->zone->zone_start_pfn)
+ distance >>= 2;
+ high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
+
+ for (order = cc->order - 1;
+ order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order--) {
+ struct free_area *area = &cc->zone->free_area[order];
+ struct list_head *freelist;
+ unsigned long flags;
+ struct page *freepage;
+
+ if (!area->nr_free)
+ continue;
+
+ spin_lock_irqsave(&cc->zone->lock, flags);
+ freelist = &area->free_list[MIGRATE_MOVABLE];
+ list_for_each_entry(freepage, freelist, lru) {
+ unsigned long free_pfn;
+
+ nr_scanned++;
+ free_pfn = page_to_pfn(freepage);
+ if (free_pfn < high_pfn) {
+ /*
+ * Avoid if skipped recently. Ideally it would
+ * move to the tail but even safe iteration of
+ * the list assumes an entry is deleted, not
+ * reordered.
+ */
+ if (get_pageblock_skip(freepage)) {
+ if (list_is_last(freelist, &freepage->lru))
+ break;
+
+ continue;
+ }
+
+ /* Reorder so a future search skips recent pages */
+ move_freelist_tail(freelist, freepage);
+
+ update_fast_start_pfn(cc, free_pfn);
+ pfn = pageblock_start_pfn(free_pfn);
+ cc->fast_search_fail = 0;
+ set_pageblock_skip(freepage);
+ break;
+ }
+
+ if (nr_scanned >= limit) {
+ cc->fast_search_fail++;
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+ }
+ spin_unlock_irqrestore(&cc->zone->lock, flags);
+ }
+
+ cc->total_migrate_scanned += nr_scanned;
+
+ /*
+ * If fast scanning failed then use a cached entry for a page block
+ * that had free pages as the basis for starting a linear scan.
+ */
+ if (pfn == cc->migrate_pfn)
+ pfn = reinit_migrate_pfn(cc);
+
+ return pfn;
+}
+
/*
* Isolate all pages that can be migrated from the first suitable block,
* starting at the block pointed to by the migrate scanner pfn within
* compact_control.
*/
-static isolate_migrate_t isolate_migratepages(struct zone *zone,
- struct compact_control *cc)
+static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
{
unsigned long block_start_pfn;
unsigned long block_end_pfn;
@@ -1231,15 +1747,24 @@
const isolate_mode_t isolate_mode =
(sysctl_compact_unevictable_allowed ? ISOLATE_UNEVICTABLE : 0) |
(cc->mode != MIGRATE_SYNC ? ISOLATE_ASYNC_MIGRATE : 0);
+ bool fast_find_block;
/*
* Start at where we last stopped, or beginning of the zone as
- * initialized by compact_zone()
+ * initialized by compact_zone(). The first failure will use
+ * the lowest PFN as the starting point for linear scanning.
*/
- low_pfn = cc->migrate_pfn;
+ low_pfn = fast_find_migrateblock(cc);
block_start_pfn = pageblock_start_pfn(low_pfn);
- if (block_start_pfn < zone->zone_start_pfn)
- block_start_pfn = zone->zone_start_pfn;
+ if (block_start_pfn < cc->zone->zone_start_pfn)
+ block_start_pfn = cc->zone->zone_start_pfn;
+
+ /*
+ * fast_find_migrateblock marks a pageblock skipped so to avoid
+ * the isolation_suitable check below, check whether the fast
+ * search was successful.
+ */
+ fast_find_block = low_pfn != cc->migrate_pfn && !cc->fast_search_fail;
/* Only scan within a pageblock boundary */
block_end_pfn = pageblock_end_pfn(low_pfn);
@@ -1249,6 +1774,7 @@
* Do not cross the free scanner.
*/
for (; block_end_pfn <= cc->free_pfn;
+ fast_find_block = false,
low_pfn = block_end_pfn,
block_start_pfn = block_end_pfn,
block_end_pfn += pageblock_nr_pages) {
@@ -1256,34 +1782,45 @@
/*
* This can potentially iterate a massively long zone with
* many pageblocks unsuitable, so periodically check if we
- * need to schedule, or even abort async compaction.
+ * need to schedule.
*/
- if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages))
- && compact_should_abort(cc))
- break;
+ if (!(low_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)))
+ cond_resched();
- page = pageblock_pfn_to_page(block_start_pfn, block_end_pfn,
- zone);
+ page = pageblock_pfn_to_page(block_start_pfn,
+ block_end_pfn, cc->zone);
if (!page)
continue;
- /* If isolation recently failed, do not retry */
- if (!isolation_suitable(cc, page))
+ /*
+ * If isolation recently failed, do not retry. Only check the
+ * pageblock once. COMPACT_CLUSTER_MAX causes a pageblock
+ * to be visited multiple times. Assume skip was checked
+ * before making it "skip" so other compaction instances do
+ * not scan the same block.
+ */
+ if (IS_ALIGNED(low_pfn, pageblock_nr_pages) &&
+ !fast_find_block && !isolation_suitable(cc, page))
continue;
/*
- * For async compaction, also only scan in MOVABLE blocks.
- * Async compaction is optimistic to see if the minimum amount
- * of work satisfies the allocation.
+ * For async compaction, also only scan in MOVABLE blocks
+ * without huge pages. Async compaction is optimistic to see
+ * if the minimum amount of work satisfies the allocation.
+ * The cached PFN is updated as it's possible that all
+ * remaining blocks between source and target are unsuitable
+ * and the compaction scanners fail to meet.
*/
- if (!suitable_migration_source(cc, page))
+ if (!suitable_migration_source(cc, page)) {
+ update_cached_migrate(cc, block_end_pfn);
continue;
+ }
/* Perform the isolation */
low_pfn = isolate_migratepages_block(cc, low_pfn,
block_end_pfn, isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn)
return ISOLATE_ABORT;
/*
@@ -1309,19 +1846,16 @@
return order == -1;
}
-static enum compact_result __compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
const int migratetype = cc->migratetype;
-
- if (cc->contended || fatal_signal_pending(current))
- return COMPACT_CONTENDED;
+ int ret;
/* Compaction run completes if the migrate and free scanner meet */
if (compact_scanners_met(cc)) {
/* Let the next compaction start anew. */
- reset_cached_positions(zone);
+ reset_cached_positions(cc->zone);
/*
* Mark that the PG_migrate_skip information should be cleared
@@ -1330,7 +1864,7 @@
* based on an allocation request.
*/
if (cc->direct_compaction)
- zone->compact_blockskip_flush = true;
+ cc->zone->compact_blockskip_flush = true;
if (cc->whole_zone)
return COMPACT_COMPLETE;
@@ -1341,30 +1875,29 @@
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
- if (cc->finishing_block) {
- /*
- * We have finished the pageblock, but better check again that
- * we really succeeded.
- */
- if (IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
- cc->finishing_block = false;
- else
- return COMPACT_CONTINUE;
- }
+ /*
+ * Always finish scanning a pageblock to reduce the possibility of
+ * fallbacks in the future. This is particularly important when
+ * migration source is unmovable/reclaimable but it's not worth
+ * special casing.
+ */
+ if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages))
+ return COMPACT_CONTINUE;
/* Direct compactor: Is a suitable page free? */
+ ret = COMPACT_NO_SUITABLE_PAGE;
for (order = cc->order; order < MAX_ORDER; order++) {
- struct free_area *area = &zone->free_area[order];
+ struct free_area *area = &cc->zone->free_area[order];
bool can_steal;
/* Job done if page is free of the right migratetype */
- if (!list_empty(&area->free_list[migratetype]))
+ if (!free_area_empty(area, migratetype))
return COMPACT_SUCCESS;
#ifdef CONFIG_CMA
/* MIGRATE_MOVABLE can fallback on MIGRATE_CMA */
if (migratetype == MIGRATE_MOVABLE &&
- !list_empty(&area->free_list[MIGRATE_CMA]))
+ !free_area_empty(area, MIGRATE_CMA))
return COMPACT_SUCCESS;
#endif
/*
@@ -1392,21 +1925,23 @@
return COMPACT_SUCCESS;
}
- cc->finishing_block = true;
- return COMPACT_CONTINUE;
+ ret = COMPACT_CONTINUE;
+ break;
}
}
- return COMPACT_NO_SUITABLE_PAGE;
+ if (cc->contended || fatal_signal_pending(current))
+ ret = COMPACT_CONTENDED;
+
+ return ret;
}
-static enum compact_result compact_finished(struct zone *zone,
- struct compact_control *cc)
+static enum compact_result compact_finished(struct compact_control *cc)
{
int ret;
- ret = __compact_finished(zone, cc);
- trace_mm_compaction_finished(zone, cc->order, ret);
+ ret = __compact_finished(cc);
+ trace_mm_compaction_finished(cc->zone, cc->order, ret);
if (ret == COMPACT_NO_SUITABLE_PAGE)
ret = COMPACT_CONTINUE;
@@ -1430,7 +1965,7 @@
if (is_via_compact_memory(order))
return COMPACT_CONTINUE;
- watermark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ watermark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
/*
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
@@ -1533,15 +2068,29 @@
return false;
}
-static enum compact_result compact_zone(struct zone *zone, struct compact_control *cc)
+static enum compact_result
+compact_zone(struct compact_control *cc, struct capture_control *capc)
{
enum compact_result ret;
- unsigned long start_pfn = zone->zone_start_pfn;
- unsigned long end_pfn = zone_end_pfn(zone);
+ unsigned long start_pfn = cc->zone->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(cc->zone);
+ unsigned long last_migrated_pfn;
const bool sync = cc->mode != MIGRATE_ASYNC;
+ bool update_cached;
+
+ /*
+ * These counters track activities during zone compaction. Initialize
+ * them before compacting a new zone.
+ */
+ cc->total_migrate_scanned = 0;
+ cc->total_free_scanned = 0;
+ cc->nr_migratepages = 0;
+ cc->nr_freepages = 0;
+ INIT_LIST_HEAD(&cc->freepages);
+ INIT_LIST_HEAD(&cc->migratepages);
cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
- ret = compaction_suitable(zone, cc->order, cc->alloc_flags,
+ ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
cc->classzone_idx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
@@ -1554,8 +2103,8 @@
* Clear pageblock skip if there were failures recently and compaction
* is about to be retried after being deferred.
*/
- if (compaction_restarting(zone, cc->order))
- __reset_isolation_suitable(zone);
+ if (compaction_restarting(cc->zone, cc->order))
+ __reset_isolation_suitable(cc->zone);
/*
* Setup to move all movable pages to the end of the zone. Used cached
@@ -1563,43 +2112,76 @@
* want to compact the whole zone), but check that it is initialised
* by ensuring the values are within zone boundaries.
*/
+ cc->fast_start_pfn = 0;
if (cc->whole_zone) {
cc->migrate_pfn = start_pfn;
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
} else {
- cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync];
- cc->free_pfn = zone->compact_cached_free_pfn;
+ cc->migrate_pfn = cc->zone->compact_cached_migrate_pfn[sync];
+ cc->free_pfn = cc->zone->compact_cached_free_pfn;
if (cc->free_pfn < start_pfn || cc->free_pfn >= end_pfn) {
cc->free_pfn = pageblock_start_pfn(end_pfn - 1);
- zone->compact_cached_free_pfn = cc->free_pfn;
+ cc->zone->compact_cached_free_pfn = cc->free_pfn;
}
if (cc->migrate_pfn < start_pfn || cc->migrate_pfn >= end_pfn) {
cc->migrate_pfn = start_pfn;
- zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
- zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn;
+ cc->zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- if (cc->migrate_pfn == start_pfn)
+ if (cc->migrate_pfn <= cc->zone->compact_init_migrate_pfn)
cc->whole_zone = true;
}
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
+
+ /*
+ * Migrate has separate cached PFNs for ASYNC and SYNC* migration on
+ * the basis that some migrations will fail in ASYNC mode. However,
+ * if the cached PFNs match and pageblocks are skipped due to having
+ * no isolation candidates, then the sync state does not matter.
+ * Until a pageblock with isolation candidates is found, keep the
+ * cached PFNs in sync to avoid revisiting the same blocks.
+ */
+ update_cached = !sync &&
+ cc->zone->compact_cached_migrate_pfn[0] == cc->zone->compact_cached_migrate_pfn[1];
trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
cc->free_pfn, end_pfn, sync);
migrate_prep_local();
- while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) {
+ while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
+ unsigned long start_pfn = cc->migrate_pfn;
- switch (isolate_migratepages(zone, cc)) {
+ /*
+ * Avoid multiple rescans which can happen if a page cannot be
+ * isolated (dirty/writeback in async mode) or if the migrated
+ * pages are being allocated before the pageblock is cleared.
+ * The first rescan will capture the entire pageblock for
+ * migration. If it fails, it'll be marked skip and scanning
+ * will proceed as normal.
+ */
+ cc->rescan = false;
+ if (pageblock_start_pfn(last_migrated_pfn) ==
+ pageblock_start_pfn(start_pfn)) {
+ cc->rescan = true;
+ }
+
+ switch (isolate_migratepages(cc)) {
case ISOLATE_ABORT:
ret = COMPACT_CONTENDED;
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
+ last_migrated_pfn = 0;
goto out;
case ISOLATE_NONE:
+ if (update_cached) {
+ cc->zone->compact_cached_migrate_pfn[1] =
+ cc->zone->compact_cached_migrate_pfn[0];
+ }
+
/*
* We haven't isolated and migrated anything, but
* there might still be unflushed migrations from
@@ -1607,6 +2189,8 @@
*/
goto check_drain;
case ISOLATE_SUCCESS:
+ update_cached = false;
+ last_migrated_pfn = start_pfn;
;
}
@@ -1638,8 +2222,7 @@
cc->migrate_pfn = block_end_pfn(
cc->migrate_pfn - 1, cc->order);
/* Draining pcplists is useless in this case */
- cc->last_migrated_pfn = 0;
-
+ last_migrated_pfn = 0;
}
}
@@ -1651,21 +2234,26 @@
* compact_finished() can detect immediately if allocation
* would succeed.
*/
- if (cc->order > 0 && cc->last_migrated_pfn) {
+ if (cc->order > 0 && last_migrated_pfn) {
int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
- if (cc->last_migrated_pfn < current_block_start) {
+ if (last_migrated_pfn < current_block_start) {
cpu = get_cpu();
lru_add_drain_cpu(cpu);
- drain_local_pages(zone);
+ drain_local_pages(cc->zone);
put_cpu();
/* No more flushing until we migrate again */
- cc->last_migrated_pfn = 0;
+ last_migrated_pfn = 0;
}
}
+ /* Stop if a page has been captured */
+ if (capc && capc->page) {
+ ret = COMPACT_SUCCESS;
+ break;
+ }
}
out:
@@ -1684,8 +2272,8 @@
* Only go back, not forward. The cached pfn might have been
* already reset to zone end in compact_finished()
*/
- if (free_pfn > zone->compact_cached_free_pfn)
- zone->compact_cached_free_pfn = free_pfn;
+ if (free_pfn > cc->zone->compact_cached_free_pfn)
+ cc->zone->compact_cached_free_pfn = free_pfn;
}
count_compact_events(COMPACTMIGRATE_SCANNED, cc->total_migrate_scanned);
@@ -1699,15 +2287,13 @@
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx)
+ unsigned int alloc_flags, int classzone_idx,
+ struct page **capture)
{
enum compact_result ret;
struct compact_control cc = {
- .nr_freepages = 0,
- .nr_migratepages = 0,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.order = order,
+ .search_order = order,
.gfp_mask = gfp_mask,
.zone = zone,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
@@ -1719,14 +2305,22 @@
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
.ignore_block_suitable = (prio == MIN_COMPACT_PRIORITY)
};
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
+ struct capture_control capc = {
+ .cc = &cc,
+ .page = NULL,
+ };
- ret = compact_zone(zone, &cc);
+ if (capture)
+ current->capture_control = &capc;
+
+ ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
+ *capture = capc.page;
+ current->capture_control = NULL;
+
return ret;
}
@@ -1744,7 +2338,7 @@
*/
enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
unsigned int alloc_flags, const struct alloc_context *ac,
- enum compact_priority prio)
+ enum compact_priority prio, struct page **capture)
{
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1772,7 +2366,7 @@
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac));
+ alloc_flags, ac_classzone_idx(ac), capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -1819,8 +2413,6 @@
struct zone *zone;
struct compact_control cc = {
.order = -1,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
.mode = MIGRATE_SYNC,
.ignore_skip_hint = true,
.whole_zone = true,
@@ -1834,13 +2426,9 @@
if (!populated_zone(zone))
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
- compact_zone(zone, &cc);
+ compact_zone(&cc, NULL);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
@@ -1875,14 +2463,6 @@
return 0;
}
-int sysctl_extfrag_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- proc_dointvec_minmax(table, write, buffer, length, ppos);
-
- return 0;
-}
-
#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA)
static ssize_t sysfs_compact_node(struct device *dev,
struct device_attribute *attr,
@@ -1947,8 +2527,7 @@
struct zone *zone;
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
- .total_migrate_scanned = 0,
- .total_free_scanned = 0,
+ .search_order = pgdat->kcompactd_max_order,
.classzone_idx = pgdat->kcompactd_classzone_idx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
@@ -1972,17 +2551,11 @@
COMPACT_CONTINUE)
continue;
- cc.nr_freepages = 0;
- cc.nr_migratepages = 0;
- cc.total_migrate_scanned = 0;
- cc.total_free_scanned = 0;
- cc.zone = zone;
- INIT_LIST_HEAD(&cc.freepages);
- INIT_LIST_HEAD(&cc.migratepages);
-
if (kthread_should_stop())
return;
- status = compact_zone(zone, &cc);
+
+ cc.zone = zone;
+ status = compact_zone(&cc, NULL);
if (status == COMPACT_SUCCESS) {
compaction_defer_reset(zone, cc.order, false);
@@ -2068,11 +2641,15 @@
pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
+ unsigned long pflags;
+
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
wait_event_freezable(pgdat->kcompactd_wait,
kcompactd_work_requested(pgdat));
+ psi_memstall_enter(&pflags);
kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
}
return 0;
diff --git a/mm/debug.c b/mm/debug.c
index bd10aad..0461df1 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -13,10 +13,11 @@
#include <trace/events/mmflags.h>
#include <linux/migrate.h>
#include <linux/page_owner.h>
+#include <linux/ctype.h>
#include "internal.h"
-char *migrate_reason_names[MR_TYPES] = {
+const char *migrate_reason_names[MR_TYPES] = {
"compaction",
"memory_failure",
"memory_hotplug",
@@ -43,6 +44,7 @@
void __dump_page(struct page *page, const char *reason)
{
+ struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
int mapcount;
@@ -52,10 +54,12 @@
* dump_page() when detected.
*/
if (page_poisoned) {
- pr_emerg("page:%px is uninitialized and poisoned", page);
+ pr_warn("page:%px is uninitialized and poisoned", page);
goto hex_only;
}
+ mapping = page_mapping(page);
+
/*
* Avoid VM_BUG_ON() in page_mapcount().
* page->_mapcount space in struct page is used by sl[aou]b pages to
@@ -63,27 +67,42 @@
*/
mapcount = PageSlab(page) ? 0 : page_mapcount(page);
- pr_emerg("page:%px count:%d mapcount:%d mapping:%px index:%#lx",
- page, page_ref_count(page), mapcount,
- page->mapping, page_to_pgoff(page));
if (PageCompound(page))
- pr_cont(" compound_mapcount: %d", compound_mapcount(page));
- pr_cont("\n");
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px "
+ "index:%#lx compound_mapcount: %d\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page),
+ compound_mapcount(page));
+ else
+ pr_warn("page:%px refcount:%d mapcount:%d mapping:%px index:%#lx\n",
+ page, page_ref_count(page), mapcount,
+ page->mapping, page_to_pgoff(page));
+ if (PageKsm(page))
+ pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ else if (PageAnon(page))
+ pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ else if (mapping) {
+ if (mapping->host && mapping->host->i_dentry.first) {
+ struct dentry *dentry;
+ dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
+ pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
+ } else
+ pr_warn("%ps\n", mapping->a_ops);
+ pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ }
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags);
-
hex_only:
- print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32,
+ print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
sizeof(struct page), false);
if (reason)
- pr_alert("page dumped because: %s\n", reason);
+ pr_warn("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
if (!page_poisoned && page->mem_cgroup)
- pr_alert("page->mem_cgroup:%px\n", page->mem_cgroup);
+ pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
#endif
}
@@ -121,7 +140,7 @@
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
"pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
- "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n"
+ "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
"start_brk %lx brk %lx start_stack %lx\n"
"arg_start %lx arg_end %lx env_start %lx env_end %lx\n"
@@ -152,7 +171,8 @@
mm_pgtables_bytes(mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
- mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm,
+ (u64)atomic64_read(&mm->pinned_vm),
+ mm->data_vm, mm->exec_vm, mm->stack_vm,
mm->start_code, mm->end_code, mm->start_data, mm->end_data,
mm->start_brk, mm->brk, mm->start_stack,
mm->arg_start, mm->arg_end, mm->env_start, mm->env_end,
@@ -175,4 +195,49 @@
);
}
+/* Gate checked by page_init_poison(); poisoning is enabled by default. */
+static bool page_init_poisoning __read_mostly = true;
+
+/*
+ * Handler registered via __setup() for the "vm_debug" command line option.
+ *
+ * If the option carries no "=<letters>" suffix, every debug option this
+ * function controls stays enabled. With a suffix, all options start out
+ * disabled: a leading '-' stops parsing and leaves them disabled, otherwise
+ * each letter (compared case-insensitively) re-enables one option. Only 'p'
+ * (page struct poisoning) is recognised; any other letter is reported with
+ * pr_err() and skipped.
+ *
+ * A warning is printed only when poisoning goes from enabled to disabled.
+ * Always returns 1.
+ */
+static int __init setup_vm_debug(char *str)
+{
+ bool __page_init_poisoning = true;
+
+ /*
+ * Calling vm_debug with no arguments is equivalent to requesting
+ * to enable all debugging options we can control.
+ */
+ if (*str++ != '=' || !*str)
+ goto out;
+
+ __page_init_poisoning = false;
+ if (*str == '-')
+ goto out;
+
+ while (*str) {
+ switch (tolower(*str)) {
+ case'p':
+ __page_init_poisoning = true;
+ break;
+ default:
+ pr_err("vm_debug option '%c' unknown. skipped\n",
+ *str);
+ }
+
+ str++;
+ }
+out:
+ if (page_init_poisoning && !__page_init_poisoning)
+ pr_warn("Page struct poisoning disabled by kernel command line option 'vm_debug'\n");
+
+ page_init_poisoning = __page_init_poisoning;
+
+ return 1;
+}
+__setup("vm_debug", setup_vm_debug);
+
+/*
+ * Fill @size bytes starting at @page with PAGE_POISON_PATTERN.
+ * Does nothing when page_init_poisoning is false.
+ */
+void page_init_poison(struct page *page, size_t size)
+{
+ if (page_init_poisoning)
+ memset(page, PAGE_POISON_PATTERN, size);
+}
+EXPORT_SYMBOL_GPL(page_init_poison);
#endif /* CONFIG_DEBUG_VM */
diff --git a/mm/dmapool.c b/mm/dmapool.c
index 6d4b97e..fe5d330 100644
--- a/mm/dmapool.c
+++ b/mm/dmapool.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* DMA Pool allocator
*
@@ -5,10 +6,6 @@
* Copyright 2007 Intel Corporation
* Author: Matthew Wilcox <willy@linux.intel.com>
*
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 as published by the
- * Free Software Foundation.
- *
* This allocator returns small blocks of a given size which are DMA-able by
* the given device. It uses the dma_alloc_coherent page allocator to get
* new pages, then splits them up into blocks of the required size.
@@ -114,10 +111,9 @@
* @size: size of the blocks in this pool.
* @align: alignment requirement for blocks; must be a power of two
* @boundary: returned blocks won't cross this power of two boundary
- * Context: !in_interrupt()
+ * Context: not in_interrupt()
*
- * Returns a dma allocation pool with the requested characteristics, or
- * null if one can't be created. Given one of these pools, dma_pool_alloc()
+ * Given one of these pools, dma_pool_alloc()
* may be used to allocate memory. Such memory will all have "consistent"
* DMA mappings, accessible by the device and its driver without using
* cache flushing primitives. The actual size of blocks allocated may be
@@ -127,6 +123,9 @@
* cross that size boundary. This is useful for devices which have
* addressing restrictions on individual DMA transfers, such as not crossing
* boundaries of 4KBytes.
+ *
+ * Return: a dma allocation pool with the requested characteristics, or
+ * %NULL if one can't be created.
*/
struct dma_pool *dma_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t boundary)
@@ -313,7 +312,7 @@
* @mem_flags: GFP_* bitmask
* @handle: pointer to dma address of block
*
- * This returns the kernel virtual address of a currently unused block,
+ * Return: the kernel virtual address of a currently unused block,
* and reports its dma address through the handle.
* If such a memory block can't be allocated, %NULL is returned.
*/
@@ -379,7 +378,7 @@
#endif
spin_unlock_irqrestore(&pool->lock, flags);
- if (mem_flags & __GFP_ZERO)
+ if (want_init_on_alloc(mem_flags))
memset(retval, 0, pool->size);
return retval;
@@ -429,6 +428,8 @@
}
offset = vaddr - page->vaddr;
+ if (want_init_on_free())
+ memset(vaddr, 0, pool->size);
#ifdef DMAPOOL_DEBUG
if ((dma - page->dma) != offset) {
spin_unlock_irqrestore(&pool->lock, flags);
@@ -498,6 +499,9 @@
*
* Managed dma_pool_create(). DMA pool created with this function is
* automatically destroyed on driver detach.
+ *
+ * Return: a managed dma allocation pool with the requested
+ * characteristics, or %NULL if one can't be created.
*/
struct dma_pool *dmam_pool_create(const char *name, struct device *dev,
size_t size, size_t align, size_t allocation)
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 467bcd0..4f17c83 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -27,8 +27,7 @@
* deactivate the pages and clear PG_Referenced.
*/
-static int generic_fadvise(struct file *file, loff_t offset, loff_t len,
- int advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
struct inode *inode;
struct address_space *mapping;
@@ -178,6 +177,7 @@
}
return 0;
}
+EXPORT_SYMBOL(generic_fadvise);
int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
{
diff --git a/mm/failslab.c b/mm/failslab.c
index b135ebb..f92fed9 100644
--- a/mm/failslab.c
+++ b/mm/failslab.c
@@ -23,7 +23,8 @@
if (gfpflags & __GFP_NOFAIL)
return false;
- if (failslab.ignore_gfp_reclaim && (gfpflags & __GFP_RECLAIM))
+ if (failslab.ignore_gfp_reclaim &&
+ (gfpflags & __GFP_DIRECT_RECLAIM))
return false;
if (failslab.cache_filter && !(s->flags & SLAB_FAILSLAB))
@@ -48,18 +49,12 @@
if (IS_ERR(dir))
return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &failslab.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("cache-filter", mode, dir,
- &failslab.cache_filter))
- goto fail;
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &failslab.ignore_gfp_reclaim);
+ debugfs_create_bool("cache-filter", mode, dir,
+ &failslab.cache_filter);
return 0;
-fail:
- debugfs_remove_recursive(dir);
-
- return -ENOMEM;
}
late_initcall(failslab_debugfs_init);
diff --git a/mm/filemap.c b/mm/filemap.c
index 52517f2..85b7d08 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/filemap.c
*
@@ -24,6 +25,7 @@
#include <linux/pagemap.h>
#include <linux/file.h>
#include <linux/uio.h>
+#include <linux/error-injection.h>
#include <linux/hash.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
@@ -36,6 +38,9 @@
#include <linux/cleancache.h>
#include <linux/shmem_fs.h>
#include <linux/rmap.h>
+#include <linux/delayacct.h>
+#include <linux/psi.h>
+#include <linux/ramfs.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -96,8 +101,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
- * ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
+ * ->pgdat->lru_lock (follow_page->mark_page_accessed)
+ * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -111,60 +116,26 @@
* ->tasklist_lock (memory_failure, collect_procs_ao)
*/
-static int page_cache_tree_insert(struct address_space *mapping,
- struct page *page, void **shadowp)
-{
- struct radix_tree_node *node;
- void **slot;
- int error;
-
- error = __radix_tree_create(&mapping->i_pages, page->index, 0,
- &node, &slot);
- if (error)
- return error;
- if (*slot) {
- void *p;
-
- p = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (!radix_tree_exceptional_entry(p))
- return -EEXIST;
-
- mapping->nrexceptional--;
- if (shadowp)
- *shadowp = p;
- }
- __radix_tree_replace(&mapping->i_pages, node, slot, page,
- workingset_lookup_update(mapping));
- mapping->nrpages++;
- return 0;
-}
-
-static void page_cache_tree_delete(struct address_space *mapping,
+static void page_cache_delete(struct address_space *mapping,
struct page *page, void *shadow)
{
- int i, nr;
+ XA_STATE(xas, &mapping->i_pages, page->index);
+ unsigned int nr = 1;
- /* hugetlb pages are represented by one entry in the radix tree */
- nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
+ mapping_set_update(&xas, mapping);
+
+ /* hugetlb pages are represented by a single entry in the xarray */
+ if (!PageHuge(page)) {
+ xas_set_order(&xas, page->index, compound_order(page));
+ nr = compound_nr(page);
+ }
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(nr != 1 && shadow, page);
- for (i = 0; i < nr; i++) {
- struct radix_tree_node *node;
- void **slot;
-
- __radix_tree_lookup(&mapping->i_pages, page->index + i,
- &node, &slot);
-
- VM_BUG_ON_PAGE(!node && nr != 1, page);
-
- radix_tree_clear_tags(&mapping->i_pages, node, slot);
- __radix_tree_replace(&mapping->i_pages, node, slot, shadow,
- workingset_lookup_update(mapping));
- }
+ xas_store(&xas, shadow);
+ xas_init_marks(&xas);
page->mapping = NULL;
/* Leave page->index set: truncation lookup relies upon it */
@@ -233,8 +204,9 @@
__mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
- } else {
- VM_BUG_ON_PAGE(PageTransHuge(page), page);
+ } else if (PageTransHuge(page)) {
+ __dec_node_page_state(page, NR_FILE_THPS);
+ filemap_nr_thps_dec(mapping);
}
/*
@@ -263,7 +235,7 @@
trace_mm_filemap_delete_from_page_cache(page);
unaccount_page_cache_page(mapping, page);
- page_cache_tree_delete(mapping, page, shadow);
+ page_cache_delete(mapping, page, shadow);
}
static void page_cache_free_page(struct address_space *mapping,
@@ -306,61 +278,62 @@
EXPORT_SYMBOL(delete_from_page_cache);
/*
- * page_cache_tree_delete_batch - delete several pages from page cache
+ * page_cache_delete_batch - delete several pages from page cache
* @mapping: the mapping to which pages belong
* @pvec: pagevec with pages to delete
*
* The function walks over mapping->i_pages and removes pages passed in @pvec
- * from the mapping. The function expects @pvec to be sorted by page index.
+ * from the mapping. The function expects @pvec to be sorted by page index
+ * and is optimised for it to be dense.
* It tolerates holes in @pvec (mapping entries at those indices are not
* modified). The function expects only THP head pages to be present in the
- * @pvec and takes care to delete all corresponding tail pages from the
- * mapping as well.
+ * @pvec.
*
* The function expects the i_pages lock to be held.
*/
-static void
-page_cache_tree_delete_batch(struct address_space *mapping,
+static void page_cache_delete_batch(struct address_space *mapping,
struct pagevec *pvec)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, pvec->pages[0]->index);
int total_pages = 0;
- int i = 0, tail_pages = 0;
+ int i = 0;
struct page *page;
- pgoff_t start;
- start = pvec->pages[0]->index;
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (i >= pagevec_count(pvec) && !tail_pages)
+ mapping_set_update(&xas, mapping);
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (i >= pagevec_count(pvec))
break;
- page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (radix_tree_exceptional_entry(page))
+
+ /* A swap/dax/shadow entry got inserted? Skip it. */
+ if (xa_is_value(page))
continue;
- if (!tail_pages) {
- /*
- * Some page got inserted in our range? Skip it. We
- * have our pages locked so they are protected from
- * being removed.
- */
- if (page != pvec->pages[i])
- continue;
- WARN_ON_ONCE(!PageLocked(page));
- if (PageTransHuge(page) && !PageHuge(page))
- tail_pages = HPAGE_PMD_NR - 1;
- page->mapping = NULL;
- /*
- * Leave page->index set: truncation lookup relies
- * upon it
- */
- i++;
- } else {
- tail_pages--;
+ /*
+ * A page got inserted in our range? Skip it. We have our
+ * pages locked so they are protected from being removed.
+ * If we see a page whose index is higher than ours, it
+ * means our page has been removed, which shouldn't be
+ * possible because we're holding the PageLock.
+ */
+ if (page != pvec->pages[i]) {
+ VM_BUG_ON_PAGE(page->index > pvec->pages[i]->index,
+ page);
+ continue;
}
- radix_tree_clear_tags(&mapping->i_pages, iter.node, slot);
- __radix_tree_replace(&mapping->i_pages, iter.node, slot, NULL,
- workingset_lookup_update(mapping));
+
+ WARN_ON_ONCE(!PageLocked(page));
+
+ if (page->index == xas.xa_index)
+ page->mapping = NULL;
+ /* Leave page->index set: truncation lookup relies on it */
+
+ /*
+ * Move to the next page in the vector if this is a regular
+ * page or the index is of the last sub-page of this compound
+ * page.
+ */
+ if (page->index + compound_nr(page) - 1 == xas.xa_index)
+ i++;
+ xas_store(&xas, NULL);
total_pages++;
}
mapping->nrpages -= total_pages;
@@ -381,7 +354,7 @@
unaccount_page_cache_page(mapping, pvec->pages[i]);
}
- page_cache_tree_delete_batch(mapping, pvec);
+ page_cache_delete_batch(mapping, pvec);
xa_unlock_irqrestore(&mapping->i_pages, flags);
for (i = 0; i < pagevec_count(pvec); i++)
@@ -426,6 +399,8 @@
* opposed to a regular memory cleansing writeback. The difference between
* these two operations is that if a dirty page/buffer is encountered, it must
* be waited upon, and not just skipped over.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode)
@@ -438,7 +413,8 @@
.range_end = end,
};
- if (!mapping_cap_writeback_dirty(mapping))
+ if (!mapping_cap_writeback_dirty(mapping) ||
+ !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
wbc_attach_fdatawrite_inode(&wbc, mapping->host);
@@ -472,6 +448,8 @@
*
* This is a mostly non-blocking flush. Not suitable for data-integrity
* purposes - I/O may not be started against all dirty pages.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int filemap_flush(struct address_space *mapping)
{
@@ -487,24 +465,38 @@
*
* Find at least one page in the range supplied, usually used to check if
* direct writing in this range will trigger a writeback.
+ *
+ * Return: %true if at least one page exists in the specified range,
+ * %false otherwise.
*/
bool filemap_range_has_page(struct address_space *mapping,
loff_t start_byte, loff_t end_byte)
{
- pgoff_t index = start_byte >> PAGE_SHIFT;
- pgoff_t end = end_byte >> PAGE_SHIFT;
struct page *page;
+ XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT);
+ pgoff_t max = end_byte >> PAGE_SHIFT; /* page index of end_byte; search limit for xas_find() */
if (end_byte < start_byte)
return false;
- if (mapping->nrpages == 0)
- return false;
+ rcu_read_lock();
+ for (;;) {
+ page = xas_find(&xas, max);
+ if (xas_retry(&xas, page))
+ continue;
+ /* Shadow entries don't count */
+ if (xa_is_value(page))
+ continue;
+ /*
+ * We don't need to try to pin this page; we're about to
+ * release the RCU lock anyway. It is enough to know that
+ * there was a page here recently.
+ *
+ * page may also be NULL at this point, in which case the
+ * function returns false below.
+ */
+ break;
+ }
+ rcu_read_unlock();
- if (!find_get_pages_range(mapping, &index, end, 1, &page))
- return false;
- put_page(page);
- return true;
+ return page != NULL;
}
EXPORT_SYMBOL(filemap_range_has_page);
@@ -552,6 +544,8 @@
* Since the error status of the address space is cleared by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte,
loff_t end_byte)
@@ -562,6 +556,28 @@
EXPORT_SYMBOL(filemap_fdatawait_range);
/**
+ * filemap_fdatawait_range_keep_errors - wait for writeback to complete
+ * @mapping: address space structure to wait for
+ * @start_byte: offset in bytes where the range starts
+ * @end_byte: offset in bytes where the range ends (inclusive)
+ *
+ * Walk the list of under-writeback pages of the given address space in the
+ * given range and wait for all of them. Unlike filemap_fdatawait_range(),
+ * this function does not clear error status of the address space.
+ *
+ * Use this function if callers don't handle errors themselves. Expected
+ * call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
+ * fsfreeze(8)
+ *
+ * Return: error status of the address space.
+ */
+int filemap_fdatawait_range_keep_errors(struct address_space *mapping,
+ loff_t start_byte, loff_t end_byte)
+{
+ __filemap_fdatawait_range(mapping, start_byte, end_byte);
+ return filemap_check_and_keep_errors(mapping);
+}
+EXPORT_SYMBOL(filemap_fdatawait_range_keep_errors);
+
+/**
* file_fdatawait_range - wait for writeback to complete
* @file: file pointing to address space structure to wait for
* @start_byte: offset in bytes where the range starts
@@ -574,6 +590,8 @@
* Since the error status of the file is advanced by this function,
* callers are responsible for checking the return value and handling and/or
* reporting the error.
+ *
+ * Return: error status of the address space vs. the file->f_wb_err cursor.
*/
int file_fdatawait_range(struct file *file, loff_t start_byte, loff_t end_byte)
{
@@ -595,6 +613,8 @@
* Use this function if callers don't handle errors themselves. Expected
* call sites are system-wide / filesystem-wide data flushers: e.g. sync(2),
* fsfreeze(8)
+ *
+ * Return: error status of the address space.
*/
int filemap_fdatawait_keep_errors(struct address_space *mapping)
{
@@ -603,10 +623,13 @@
}
EXPORT_SYMBOL(filemap_fdatawait_keep_errors);
+/* Returns true if writeback might be needed or already in progress. */
static bool mapping_needs_writeback(struct address_space *mapping)
{
- return (!dax_mapping(mapping) && mapping->nrpages) ||
- (dax_mapping(mapping) && mapping->nrexceptional);
+ if (dax_mapping(mapping))
+ return mapping->nrexceptional;
+
+ return mapping->nrpages;
}
int filemap_write_and_wait(struct address_space *mapping)
@@ -646,6 +669,8 @@
*
* Note that @lend is inclusive (describes the last byte to be written) so
* that this function can be used to write to the very end-of-file (end = -1).
+ *
+ * Return: error status of the address space.
*/
int filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend)
@@ -701,6 +726,8 @@
* While we handle mapping->wb_err with atomic operations, the f_wb_err
* value is protected by the f_lock since we must ensure that it reflects
* the latest value swapped in for this file descriptor.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_check_and_advance_wb_err(struct file *file)
{
@@ -743,6 +770,8 @@
*
* After writing out and waiting on the data, we check and advance the
* f_wb_err cursor to the latest value, and return any errors detected there.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
{
@@ -775,51 +804,46 @@
* locked. This function does not add the new page to the LRU, the
* caller must do that.
*
- * The remove + add is atomic. The only way this function can fail is
- * memory allocation failure.
+ * The remove + add is atomic. This function cannot fail.
+ *
+ * Return: %0
*/
int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
{
- int error;
+ struct address_space *mapping = old->mapping;
+ void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ pgoff_t offset = old->index;
+ XA_STATE(xas, &mapping->i_pages, offset);
+ unsigned long flags;
VM_BUG_ON_PAGE(!PageLocked(old), old);
VM_BUG_ON_PAGE(!PageLocked(new), new);
VM_BUG_ON_PAGE(new->mapping, new);
- error = radix_tree_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (!error) {
- struct address_space *mapping = old->mapping;
- void (*freepage)(struct page *);
- unsigned long flags;
+ get_page(new);
+ new->mapping = mapping;
+ new->index = offset;
- pgoff_t offset = old->index;
- freepage = mapping->a_ops->freepage;
+ xas_lock_irqsave(&xas, flags);
+ xas_store(&xas, new);
- get_page(new);
- new->mapping = mapping;
- new->index = offset;
+ old->mapping = NULL;
+ /* hugetlb pages do not participate in page cache accounting. */
+ if (!PageHuge(old))
+ __dec_node_page_state(new, NR_FILE_PAGES);
+ if (!PageHuge(new))
+ __inc_node_page_state(new, NR_FILE_PAGES);
+ if (PageSwapBacked(old))
+ __dec_node_page_state(new, NR_SHMEM);
+ if (PageSwapBacked(new))
+ __inc_node_page_state(new, NR_SHMEM);
+ xas_unlock_irqrestore(&xas, flags);
+ mem_cgroup_migrate(old, new);
+ if (freepage)
+ freepage(old);
+ put_page(old);
- xa_lock_irqsave(&mapping->i_pages, flags);
- __delete_from_page_cache(old, NULL);
- error = page_cache_tree_insert(mapping, new, NULL);
- BUG_ON(error);
-
- /*
- * hugetlb pages do not participate in page cache accounting.
- */
- if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
- if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
- mem_cgroup_migrate(old, new);
- radix_tree_preload_end();
- if (freepage)
- freepage(old);
- put_page(old);
- }
-
- return error;
+ return 0;
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
@@ -828,12 +852,15 @@
pgoff_t offset, gfp_t gfp_mask,
void **shadowp)
{
+ XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
struct mem_cgroup *memcg;
int error;
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
+ mapping_set_update(&xas, mapping);
if (!huge) {
error = mem_cgroup_try_charge(page, current->mm,
@@ -842,40 +869,49 @@
return error;
}
- error = radix_tree_maybe_preload(gfp_mask & GFP_RECLAIM_MASK);
- if (error) {
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
- xa_lock_irq(&mapping->i_pages);
- error = page_cache_tree_insert(mapping, page, shadowp);
- radix_tree_preload_end();
- if (unlikely(error))
- goto err_insert;
+ do {
+ xas_lock_irq(&xas);
+ old = xas_load(&xas);
+ if (old && !xa_is_value(old))
+ xas_set_err(&xas, -EEXIST);
+ xas_store(&xas, page);
+ if (xas_error(&xas))
+ goto unlock;
- /* hugetlb pages do not participate in page cache accounting. */
- if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
- xa_unlock_irq(&mapping->i_pages);
+ if (xa_is_value(old)) {
+ mapping->nrexceptional--;
+ if (shadowp)
+ *shadowp = old;
+ }
+ mapping->nrpages++;
+
+ /* hugetlb pages do not participate in page cache accounting */
+ if (!huge)
+ __inc_node_page_state(page, NR_FILE_PAGES);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+
+ if (xas_error(&xas))
+ goto error;
+
if (!huge)
mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
-err_insert:
+error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- xa_unlock_irq(&mapping->i_pages);
if (!huge)
mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return error;
+ return xas_error(&xas);
}
+ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
/**
* add_to_page_cache_locked - add a locked page to the pagecache
@@ -886,6 +922,8 @@
*
* This function is used to add a page to the pagecache. It must be locked.
* This function does not add the page to the LRU. The caller must do that.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int add_to_page_cache_locked(struct page *page, struct address_space *mapping,
pgoff_t offset, gfp_t gfp_mask)
@@ -915,12 +953,9 @@
* data from the working set, only to cache data that will
* get overwritten with something else, is a waste of memory.
*/
- if (!(gfp_mask & __GFP_WRITE) &&
- shadow && workingset_refault(shadow)) {
- SetPageActive(page);
- workingset_activation(page);
- } else
- ClearPageActive(page);
+ WARN_ON_ONCE(PageActive(page));
+ if (!(gfp_mask & __GFP_WRITE) && shadow)
+ workingset_refault(page, shadow);
lru_cache_add(page);
}
return ret;
@@ -1003,7 +1038,14 @@
if (wait_page->bit_nr != key->bit_nr)
return 0;
- /* Stop walking if it's locked */
+ /*
+ * Stop walking if it's locked.
+ * Is this safe if put_and_wait_on_page_locked() is in use?
+ * Yes: the waker must hold a reference to this page, and if PG_locked
+ * has now already been set by another task, that task must also hold
+ * a reference to the *same usage* of this page; so there is no need
+ * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ */
if (test_bit(key->bit_nr, &key->page->flags))
return -1;
@@ -1071,15 +1113,44 @@
wake_up_page_bit(page, bit);
}
+/*
+ * A choice of three behaviors for wait_on_page_bit_common():
+ */
+enum behavior {
+ EXCLUSIVE, /* Hold ref to page and take the bit when woken, like
+ * __lock_page() waiting on then setting PG_locked.
+ */
+ SHARED, /* Hold ref to page and check the bit when woken, like
+ * wait_on_page_writeback() waiting on PG_writeback.
+ */
+ DROP, /* Drop ref to page before wait, no check when woken,
+ * like put_and_wait_on_page_locked() on PG_locked.
+ */
+};
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
- struct page *page, int bit_nr, int state, bool lock)
+ struct page *page, int bit_nr, int state, enum behavior behavior)
{
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
+ bool bit_is_set;
+ bool thrashing = false;
+ bool delayacct = false;
+ unsigned long pflags;
int ret = 0;
+ if (bit_nr == PG_locked &&
+ !PageUptodate(page) && PageWorkingset(page)) {
+ if (!PageSwapBacked(page)) {
+ delayacct_thrashing_start();
+ delayacct = true;
+ }
+ psi_memstall_enter(&pflags);
+ thrashing = true;
+ }
+
init_wait(wait);
- wait->flags = lock ? WQ_FLAG_EXCLUSIVE : 0;
+ wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
@@ -1096,26 +1167,46 @@
spin_unlock_irq(&q->lock);
- if (likely(test_bit(bit_nr, &page->flags))) {
- io_schedule();
- }
+ bit_is_set = test_bit(bit_nr, &page->flags);
+ if (behavior == DROP)
+ put_page(page);
- if (lock) {
+ if (likely(bit_is_set))
+ io_schedule();
+
+ if (behavior == EXCLUSIVE) {
if (!test_and_set_bit_lock(bit_nr, &page->flags))
break;
- } else {
+ } else if (behavior == SHARED) {
if (!test_bit(bit_nr, &page->flags))
break;
}
- if (unlikely(signal_pending_state(state, current))) {
+ if (signal_pending_state(state, current)) {
ret = -EINTR;
break;
}
+
+ if (behavior == DROP) {
+ /*
+ * We can no longer safely access page->flags:
+ * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
+ * there is a risk of waiting forever on a page reused
+ * for something that keeps it locked indefinitely.
+ * But best check for -EINTR above before breaking.
+ */
+ break;
+ }
}
finish_wait(q, wait);
+ if (thrashing) {
+ if (delayacct)
+ delayacct_thrashing_end();
+ psi_memstall_leave(&pflags);
+ }
+
/*
* A signal could leave PageWaiters set. Clearing it here if
* !waitqueue_active would be possible (by open-coding finish_wait),
@@ -1130,18 +1221,37 @@
void wait_on_page_bit(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, false);
+ wait_on_page_bit_common(q, page, bit_nr, TASK_UNINTERRUPTIBLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit);
int wait_on_page_bit_killable(struct page *page, int bit_nr)
{
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, false);
+ return wait_on_page_bit_common(q, page, bit_nr, TASK_KILLABLE, SHARED);
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
/**
+ * put_and_wait_on_page_locked - Drop a reference and wait for the page to be unlocked
+ * @page: The page to wait for.
+ *
+ * The caller should hold a reference on @page. They expect the page to
+ * become unlocked relatively soon, but do not wish to hold up migration
+ * (for example) by holding the reference while waiting for the page to
+ * come unlocked. After this function returns, the caller should not
+ * dereference @page.
+ */
+void put_and_wait_on_page_locked(struct page *page)
+{
+ wait_queue_head_t *q;
+
+ page = compound_head(page);
+ q = page_waitqueue(page);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+}
+
+/**
* add_page_wait_queue - Add an arbitrary waiter to a page's wait queue
* @page: Page defining the wait queue of interest
* @waiter: Waiter to add to the queue
@@ -1270,7 +1380,8 @@
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, true);
+ wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL(__lock_page);
@@ -1278,7 +1389,8 @@
{
struct page *page = compound_head(__page);
wait_queue_head_t *q = page_waitqueue(page);
- return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE, true);
+ return wait_on_page_bit_common(q, page, PG_locked, TASK_KILLABLE,
+ EXCLUSIVE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
@@ -1326,86 +1438,76 @@
}
/**
- * page_cache_next_hole - find the next hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
+ * page_cache_next_miss() - Find the next gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
*
- * Search the set [index, min(index+max_scan-1, MAX_INDEX)] for the
- * lowest indexed hole.
+ * Search the range [index, min(index + max_scan - 1, ULONG_MAX)] for the
+ * gap with the lowest index.
*
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'return - index >=
- * max_scan' will be true). In rare cases of index wrap-around, 0 will
- * be returned.
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 5, then subsequently a gap is
+ * created at index 10, page_cache_next_miss covering both indices may
+ * return 10 if called under the rcu_read_lock.
*
- * page_cache_next_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 5, then subsequently a hole is created at
- * index 10, page_cache_next_hole covering both indexes may return 10
- * if called under rcu_read_lock.
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'return - index >= max_scan' will be true).
+ * In the rare case of index wrap-around, 0 will be returned.
*/
-pgoff_t page_cache_next_hole(struct address_space *mapping,
+pgoff_t page_cache_next_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
+ XA_STATE(xas, &mapping->i_pages, index);
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_next(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index++;
- if (index == 0)
+ if (xas.xa_index == 0)
break;
}
- return index;
+ return xas.xa_index;
}
-EXPORT_SYMBOL(page_cache_next_hole);
+EXPORT_SYMBOL(page_cache_next_miss);
/**
- * page_cache_prev_hole - find the prev hole (not-present entry)
- * @mapping: mapping
- * @index: index
- * @max_scan: maximum range to search
+ * page_cache_prev_miss() - Find the previous gap in the page cache.
+ * @mapping: Mapping.
+ * @index: Index.
+ * @max_scan: Maximum range to search.
*
- * Search backwards in the range [max(index-max_scan+1, 0), index] for
- * the first hole.
+ * Search the range [max(index - max_scan + 1, 0), index] for the
+ * gap with the highest index.
*
- * Returns: the index of the hole if found, otherwise returns an index
- * outside of the set specified (in which case 'index - return >=
- * max_scan' will be true). In rare cases of wrap-around, ULONG_MAX
- * will be returned.
+ * This function may be called under the rcu_read_lock. However, this will
+ * not atomically search a snapshot of the cache at a single point in time.
+ * For example, if a gap is created at index 10, then subsequently a gap is
+ * created at index 5, page_cache_prev_miss() covering both indices may
+ * return 5 if called under the rcu_read_lock.
*
- * page_cache_prev_hole may be called under rcu_read_lock. However,
- * like radix_tree_gang_lookup, this will not atomically search a
- * snapshot of the tree at a single point in time. For example, if a
- * hole is created at index 10, then subsequently a hole is created at
- * index 5, page_cache_prev_hole covering both indexes may return 5 if
- * called under rcu_read_lock.
+ * Return: The index of the gap if found, otherwise an index outside the
+ * range specified (in which case 'index - return >= max_scan' will be true).
+ * In the rare case of wrap-around, ULONG_MAX will be returned.
*/
-pgoff_t page_cache_prev_hole(struct address_space *mapping,
+pgoff_t page_cache_prev_miss(struct address_space *mapping,
pgoff_t index, unsigned long max_scan)
{
- unsigned long i;
+ XA_STATE(xas, &mapping->i_pages, index);
- for (i = 0; i < max_scan; i++) {
- struct page *page;
-
- page = radix_tree_lookup(&mapping->i_pages, index);
- if (!page || radix_tree_exceptional_entry(page))
+ while (max_scan--) {
+ void *entry = xas_prev(&xas);
+ if (!entry || xa_is_value(entry))
break;
- index--;
- if (index == ULONG_MAX)
+ if (xas.xa_index == ULONG_MAX)
break;
}
- return index;
+ return xas.xa_index;
}
-EXPORT_SYMBOL(page_cache_prev_hole);
+EXPORT_SYMBOL(page_cache_prev_miss);
/**
* find_get_entry - find and get a page cache entry
@@ -1418,52 +1520,39 @@
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
{
- void **pagep;
- struct page *head, *page;
+ XA_STATE(xas, &mapping->i_pages, offset);
+ struct page *page;
rcu_read_lock();
repeat:
- page = NULL;
- pagep = radix_tree_lookup_slot(&mapping->i_pages, offset);
- if (pagep) {
- page = radix_tree_deref_slot(pagep);
- if (unlikely(!page))
- goto out;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page))
- goto repeat;
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Return
- * it without attempting to raise page count.
- */
- goto out;
- }
+ xas_reset(&xas);
+ page = xas_load(&xas);
+ if (xas_retry(&xas, page))
+ goto repeat;
+ /*
+ * A shadow entry of a recently evicted page, or a swap entry from
+ * shmem/tmpfs. Return it without attempting to raise page count.
+ */
+ if (!page || xa_is_value(page))
+ goto out;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ if (!page_cache_get_speculative(page))
+ goto repeat;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /*
- * Has the page moved?
- * This is part of the lockless pagecache protocol. See
- * include/linux/pagemap.h for details.
- */
- if (unlikely(page != *pagep)) {
- put_page(head);
- goto repeat;
- }
+ /*
+ * Has the page moved or been split?
+ * This is part of the lockless pagecache protocol. See
+ * include/linux/pagemap.h for details.
+ */
+ if (unlikely(page != xas_reload(&xas))) {
+ put_page(page);
+ goto repeat;
}
+ page = find_subpage(page, offset);
out:
rcu_read_unlock();
@@ -1483,9 +1572,9 @@
* If the slot holds a shadow entry of a previously evicted page, or a
* swap entry from shmem/tmpfs, it is returned.
*
- * Otherwise, %NULL is returned.
- *
* find_lock_entry() may sleep.
+ *
+ * Return: the found page or shadow entry, %NULL if nothing is found.
*/
struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
{
@@ -1493,7 +1582,7 @@
repeat:
page = find_get_entry(mapping, offset);
- if (page && !radix_tree_exception(page)) {
+ if (page && !xa_is_value(page)) {
lock_page(page);
/* Has the page been truncated? */
if (unlikely(page_mapping(page) != mapping)) {
@@ -1525,12 +1614,17 @@
* - FGP_CREAT: If page is not present then a new page is allocated using
* @gfp_mask and added to the page cache and the VM's LRU
* list. The page is returned locked and with an increased
- * refcount. Otherwise, NULL is returned.
+ * refcount.
+ * - FGP_FOR_MMAP: Similar to FGP_CREAT, except that the caller is left to do
+ * its own locking if the page is already in the cache, and the page is
+ * unlocked before returning if it had to be added to the page cache.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
*
* If there is a page cache page, it is returned with an increased refcount.
+ *
+ * Return: the found page or %NULL otherwise.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
int fgp_flags, gfp_t gfp_mask)
@@ -1539,7 +1633,7 @@
repeat:
page = find_get_entry(mapping, offset);
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
page = NULL;
if (!page)
goto no_page;
@@ -1555,7 +1649,7 @@
}
/* Has the page been truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto repeat;
@@ -1563,7 +1657,7 @@
VM_BUG_ON_PAGE(page->index != offset, page);
}
- if (page && (fgp_flags & FGP_ACCESSED))
+ if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
no_page:
@@ -1578,7 +1672,7 @@
if (!page)
return NULL;
- if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK)))
+ if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
/* Init accessed so avoid atomic mark_page_accessed later */
@@ -1592,6 +1686,13 @@
if (err == -EEXIST)
goto repeat;
}
+
+ /*
+ * add_to_page_cache_lru locks the page, and for mmap we expect
+ * an unlocked page.
+ */
+ if (page && (fgp_flags & FGP_FOR_MMAP))
+ unlock_page(page);
}
return page;
@@ -1618,60 +1719,49 @@
* Any shadow entries of evicted pages, or swap entries from
* shmem/tmpfs, are included in the returned array.
*
- * find_get_entries() returns the number of pages and shadow entries
- * which were found.
+ * Return: the number of pages and shadow entries which were found.
*/
unsigned find_get_entries(struct address_space *mapping,
pgoff_t start, unsigned int nr_entries,
struct page **entries, pgoff_t *indices)
{
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
unsigned int ret = 0;
- struct radix_tree_iter iter;
if (!nr_entries)
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
+ /*
+ * A shadow entry of a recently evicted page, a swap
+ * entry from shmem/tmpfs or a DAX entry. Return it
+ * without attempting to raise page count.
+ */
+ if (xa_is_value(page))
goto export;
- }
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ if (!page_cache_get_speculative(page))
+ goto retry;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+ page = find_subpage(page, xas.xa_index);
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
export:
- indices[ret] = iter.index;
+ indices[ret] = xas.xa_index;
entries[ret] = page;
if (++ret == nr_entries)
break;
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -1694,72 +1784,52 @@
* indexes. There may be holes in the indices due to not-present pages.
* We also update @start to index the next page for the traversal.
*
- * find_get_pages_range() returns the number of pages which were found. If this
- * number is smaller than @nr_pages, the end of specified range has been
+ * Return: the number of pages which were found. If this number is
+ * smaller than @nr_pages, the end of specified range has been
* reached.
*/
unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start,
pgoff_t end, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, *start) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, end) {
+ if (xas_retry(&xas, page))
+ continue;
+ /* Skip over shadow, swap and DAX entries */
+ if (xa_is_value(page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Skip
- * over it.
- */
- continue;
- }
+ if (!page_cache_get_speculative(page))
+ goto retry;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
-
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
- *start = pages[ret - 1]->index + 1;
+ *start = xas.xa_index + 1;
goto out;
}
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
}
/*
* We come here when there is no page beyond @end. We take care to not
* overflow the index @start as it confuses some of the callers. This
- * breaks the iteration when there is page at index -1 but that is
+ * breaks the iteration when there is a page at index -1 but that is
* already broken anyway.
*/
if (end == (pgoff_t)-1)
@@ -1782,69 +1852,44 @@
* find_get_pages_contig() works exactly like find_get_pages(), except
* that the returned number of pages are guaranteed to be contiguous.
*
- * find_get_pages_contig() returns the number of pages which were found.
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
unsigned int nr_pages, struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, index);
+ struct page *page;
unsigned int ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_contig(slot, &mapping->i_pages, &iter, index) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- /* The hole, there no reason to continue */
- if (unlikely(!page))
- break;
-
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page,
- * or a swap entry from shmem/tmpfs. Stop
- * looking for contiguous pages.
- */
- break;
- }
-
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
-
+ for (page = xas_load(&xas); page; page = xas_next(&xas)) {
+ if (xas_retry(&xas, page))
+ continue;
/*
- * must check mapping and index after taking the ref.
- * otherwise we can get both false positives and false
- * negatives, which is just confusing to the caller.
+ * If the entry has been swapped out, we can stop looking.
+ * No current caller is looking for DAX entries.
*/
- if (page->mapping == NULL || page_to_pgoff(page) != iter.index) {
- put_page(page);
+ if (xa_is_value(page))
break;
- }
- pages[ret] = page;
+ if (!page_cache_get_speculative(page))
+ goto retry;
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
+
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages)
break;
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
}
rcu_read_unlock();
return ret;
@@ -1862,76 +1907,56 @@
*
* Like find_get_pages, except we only return pages which are tagged with
* @tag. We update @index to index the next page for the traversal.
+ *
+ * Return: the number of pages which were found.
*/
unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index,
- pgoff_t end, int tag, unsigned int nr_pages,
+ pgoff_t end, xa_mark_t tag, unsigned int nr_pages,
struct page **pages)
{
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, *index);
+ struct page *page;
unsigned ret = 0;
if (unlikely(!nr_pages))
return 0;
rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, *index, tag) {
- struct page *head, *page;
-
- if (iter.index > end)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each_marked(&xas, page, end, tag) {
+ if (xas_retry(&xas, page))
+ continue;
+ /*
+ * Shadow entries should never be tagged, but this iteration
+ * is lockless so there is a window for page reclaim to evict
+ * a page we saw tagged. Skip over it.
+ */
+ if (xa_is_value(page))
continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- /*
- * A shadow entry of a recently evicted page.
- *
- * Those entries should never be tagged, but
- * this tree walk is lockless and the tags are
- * looked up in bulk, one radix tree node at a
- * time, so there is a sizable window for page
- * reclaim to evict a page we saw tagged.
- *
- * Skip over it.
- */
- continue;
- }
+ if (!page_cache_get_speculative(page))
+ goto retry;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto put_page;
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
-
- pages[ret] = page;
+ pages[ret] = find_subpage(page, xas.xa_index);
if (++ret == nr_pages) {
- *index = pages[ret - 1]->index + 1;
+ *index = xas.xa_index + 1;
goto out;
}
+ continue;
+put_page:
+ put_page(page);
+retry:
+ xas_reset(&xas);
}
/*
- * We come here when we got at @end. We take care to not overflow the
+ * We come here when we got to @end. We take care to not overflow the
* index @index as it confuses some of the callers. This breaks the
- * iteration when there is page at index -1 but that is already broken
- * anyway.
+ * iteration when there is a page at index -1 but that is already
+ * broken anyway.
*/
if (end == (pgoff_t)-1)
*index = (pgoff_t)-1;
@@ -1944,76 +1969,6 @@
}
EXPORT_SYMBOL(find_get_pages_range_tag);
-/**
- * find_get_entries_tag - find and return entries that match @tag
- * @mapping: the address_space to search
- * @start: the starting page cache index
- * @tag: the tag index
- * @nr_entries: the maximum number of entries
- * @entries: where the resulting entries are placed
- * @indices: the cache indices corresponding to the entries in @entries
- *
- * Like find_get_entries, except we only return entries which are tagged with
- * @tag.
- */
-unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start,
- int tag, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices)
-{
- void **slot;
- unsigned int ret = 0;
- struct radix_tree_iter iter;
-
- if (!nr_entries)
- return 0;
-
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start, tag) {
- struct page *head, *page;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
- continue;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- /*
- * A shadow entry of a recently evicted page, a swap
- * entry from shmem/tmpfs or a DAX entry. Return it
- * without attempting to raise page count.
- */
- goto export;
- }
-
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
-export:
- indices[ret] = iter.index;
- entries[ret] = page;
- if (++ret == nr_entries)
- break;
- }
- rcu_read_unlock();
- return ret;
-}
-EXPORT_SYMBOL(find_get_entries_tag);
-
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2046,6 +2001,10 @@
*
* This is really ugly. But the goto's actually try to clarify some
* of the logic when it comes to error handling etc.
+ *
+ * Return:
+ * * total number of bytes copied, including those that were already @written
+ * * negative error code if nothing was copied
*/
static ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
@@ -2122,7 +2081,7 @@
!mapping->a_ops->is_partially_uptodate)
goto page_not_up_to_date;
/* pipes can't handle partially uptodate pages */
- if (unlikely(iter->type & ITER_PIPE))
+ if (unlikely(iov_iter_is_pipe(iter)))
goto page_not_up_to_date;
if (!trylock_page(page))
goto page_not_up_to_date;
@@ -2307,6 +2266,9 @@
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
+ * Return:
+ * * number of bytes copied, even for partial reads
+ * * negative error code if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -2366,62 +2328,98 @@
EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
-/**
- * page_cache_read - adds requested page to the page cache if not already there
- * @file: file to read
- * @offset: page index
- * @gfp_mask: memory allocation flags
- *
- * This adds the requested page to the page cache if it isn't already there,
- * and schedules an I/O to read in its contents from disk.
- */
-static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
+#define MMAP_LOTSAMISS (100)
+static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
{
- struct address_space *mapping = file->f_mapping;
- struct page *page;
- int ret;
+ int flags = vmf->flags;
- do {
- page = __page_cache_alloc(gfp_mask);
- if (!page)
- return -ENOMEM;
+ if (fpin)
+ return fpin;
- ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
- if (ret == 0)
- ret = mapping->a_ops->readpage(file, page);
- else if (ret == -EEXIST)
- ret = 0; /* losing race to add is OK */
-
- put_page(page);
-
- } while (ret == AOP_TRUNCATED_PAGE);
-
- return ret;
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
}
-#define MMAP_LOTSAMISS (100)
+/*
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * @vmf - the vm_fault for this fault.
+ * @page - the page to lock.
+ * @fpin - the pointer to the file we may pin (or is already pinned).
+ *
+ * This works similarly to lock_page_or_retry in that it can drop the mmap_sem.
+ * It differs in that it actually returns the page locked if it returns 1 and 0
+ * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * will point to the pinned file and needs to be fput()'ed at a later point.
+ */
+static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
+ struct file **fpin)
+{
+ if (trylock_page(page))
+ return 1;
+
+ /*
+ * NOTE! This will make us return with VM_FAULT_RETRY, but with
+ * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * is supposed to work. We have way too many special cases..
+ */
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
+ return 0;
+
+ *fpin = maybe_unlock_mmap_for_io(vmf, *fpin);
+ if (vmf->flags & FAULT_FLAG_KILLABLE) {
+ if (__lock_page_killable(page)) {
+ /*
+ * We didn't have the right flags to drop the mmap_sem,
+ * but all fault_handlers only check for fatal signals
+ * if we return VM_FAULT_RETRY, so we need to drop the
+ * mmap_sem here and return 0 if we don't have a fpin.
+ */
+ if (*fpin == NULL)
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ return 0;
+ }
+ } else
+ __lock_page(page);
+ return 1;
+}
+
/*
- * Synchronous readahead happens when we don't even find
- * a page in the page cache at all.
+ * Synchronous readahead happens when we don't even find a page in the page
+ * cache at all. We don't want to perform IO under the mmap sem, so if we have
+ * to drop the mmap sem we return the file that was pinned in order for us to do
+ * that. If we didn't pin a file then we return NULL. The file that is
+ * returned needs to be fput()'ed when we're done with it.
*/
-static void do_sync_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- pgoff_t offset)
+static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (!ra->ra_pages)
- return;
+ return fpin;
- if (vma->vm_flags & VM_SEQ_READ) {
+ if (vmf->vma->vm_flags & VM_SEQ_READ) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
- return;
+ return fpin;
}
/* Avoid banging the cache line if not needed */
@@ -2433,37 +2431,44 @@
* stop bothering with read-ahead. It will only hurt.
*/
if (ra->mmap_miss > MMAP_LOTSAMISS)
- return;
+ return fpin;
/*
* mmap read-around
*/
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
ra->start = max_t(long, 0, offset - ra->ra_pages / 2);
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ra_submit(ra, mapping, file);
+ return fpin;
}
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
- * so we want to possibly extend the readahead further..
+ * so we want to possibly extend the readahead further. We return the file that
+ * was pinned if we have to drop the mmap_sem in order to do IO.
*/
-static void do_async_mmap_readahead(struct vm_area_struct *vma,
- struct file_ra_state *ra,
- struct file *file,
- struct page *page,
- pgoff_t offset)
+static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
+ struct page *page)
{
+ struct file *file = vmf->vma->vm_file;
+ struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
+ struct file *fpin = NULL;
+ pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vma->vm_flags & VM_RAND_READ)
- return;
+ if (vmf->vma->vm_flags & VM_RAND_READ)
+ return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
- if (PageReadahead(page))
+ if (PageReadahead(page)) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
page, offset, ra->ra_pages);
+ }
+ return fpin;
}
/**
@@ -2479,20 +2484,21 @@
*
* vma->vm_mm->mmap_sem must be held on entry.
*
- * If our return value has VM_FAULT_RETRY set, it's because
- * lock_page_or_retry() returned 0.
- * The mmap_sem has usually been released in this case.
- * See __lock_page_or_retry() for the exception.
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
* If our return value does not have VM_FAULT_RETRY set, the mmap_sem
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
+ *
+ * Return: bitwise-OR of %VM_FAULT_ codes.
*/
vm_fault_t filemap_fault(struct vm_fault *vmf)
{
int error;
struct file *file = vmf->vma->vm_file;
+ struct file *fpin = NULL;
struct address_space *mapping = file->f_mapping;
struct file_ra_state *ra = &file->f_ra;
struct inode *inode = mapping->host;
@@ -2514,31 +2520,34 @@
* We found the page, so try async readahead before
* waiting for the lock.
*/
- do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
+ fpin = do_async_mmap_readahead(vmf, page);
} else if (!page) {
/* No page in the page cache at all */
- do_sync_mmap_readahead(vmf->vma, ra, file, offset);
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
ret = VM_FAULT_MAJOR;
+ fpin = do_sync_mmap_readahead(vmf);
retry_find:
- page = find_get_page(mapping, offset);
- if (!page)
- goto no_cached_page;
+ page = pagecache_get_page(mapping, offset,
+ FGP_CREAT|FGP_FOR_MMAP,
+ vmf->gfp_mask);
+ if (!page) {
+ if (fpin)
+ goto out_retry;
+ return vmf_error(-ENOMEM);
+ }
}
- if (!lock_page_or_retry(page, vmf->vma->vm_mm, vmf->flags)) {
- put_page(page);
- return ret | VM_FAULT_RETRY;
- }
+ if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
+ goto out_retry;
/* Did it get truncated? */
- if (unlikely(page->mapping != mapping)) {
+ if (unlikely(compound_head(page)->mapping != mapping)) {
unlock_page(page);
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page->index != offset, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
/*
* We have a locked page in the page cache, now we need to check
@@ -2548,6 +2557,16 @@
goto page_not_uptodate;
/*
+ * We've made it this far and we had to drop our mmap_sem, now is the
+ * time to return to the upper layer and have it re-find the vma and
+ * redo the fault.
+ */
+ if (fpin) {
+ unlock_page(page);
+ goto out_retry;
+ }
+
+ /*
* Found the page and have a reference on it.
* We must recheck i_size under page lock.
*/
@@ -2561,30 +2580,6 @@
vmf->page = page;
return ret | VM_FAULT_LOCKED;
-no_cached_page:
- /*
- * We're only likely to ever get here if MADV_RANDOM is in
- * effect.
- */
- error = page_cache_read(file, offset, vmf->gfp_mask);
-
- /*
- * The page we want has now been added to the page cache.
- * In the unlikely event that someone removed it in the
- * meantime, we'll just come back here and read it again.
- */
- if (error >= 0)
- goto retry_find;
-
- /*
- * An error return from page_cache_read can result if the
- * system is low on memory, or a problem occurs while trying
- * to schedule I/O.
- */
- if (error == -ENOMEM)
- return VM_FAULT_OOM;
- return VM_FAULT_SIGBUS;
-
page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
@@ -2593,12 +2588,15 @@
* and we need to check for errors.
*/
ClearPageError(page);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
error = mapping->a_ops->readpage(file, page);
if (!error) {
wait_on_page_locked(page);
if (!PageUptodate(page))
error = -EIO;
}
+ if (fpin)
+ goto out_retry;
put_page(page);
if (!error || error == AOP_TRUNCATED_PAGE)
@@ -2607,51 +2605,51 @@
/* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(file, ra);
return VM_FAULT_SIGBUS;
+
+out_retry:
+ /*
+ * We dropped the mmap_sem, we need to return to the fault handler to
+ * re-find the vma and come back and find our hopefully still populated
+ * page.
+ */
+ if (page)
+ put_page(page);
+ if (fpin)
+ fput(fpin);
+ return ret | VM_FAULT_RETRY;
}
EXPORT_SYMBOL(filemap_fault);
void filemap_map_pages(struct vm_fault *vmf,
pgoff_t start_pgoff, pgoff_t end_pgoff)
{
- struct radix_tree_iter iter;
- void **slot;
struct file *file = vmf->vma->vm_file;
struct address_space *mapping = file->f_mapping;
pgoff_t last_pgoff = start_pgoff;
unsigned long max_idx;
- struct page *head, *page;
+ XA_STATE(xas, &mapping->i_pages, start_pgoff);
+ struct page *page;
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start_pgoff) {
- if (iter.index > end_pgoff)
- break;
-repeat:
- page = radix_tree_deref_slot(slot);
- if (unlikely(!page))
+ xas_for_each(&xas, page, end_pgoff) {
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
goto next;
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
+
+ /*
+ * Check for a locked page first, as a speculative
+ * reference may adversely influence page migration.
+ */
+ if (PageLocked(page))
goto next;
- }
+ if (!page_cache_get_speculative(page))
+ goto next;
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
- goto repeat;
-
- /* The page was split under us? */
- if (compound_head(page) != head) {
- put_page(head);
- goto repeat;
- }
-
- /* Has the page moved? */
- if (unlikely(page != *slot)) {
- put_page(head);
- goto repeat;
- }
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ goto skip;
+ page = find_subpage(page, xas.xa_index);
if (!PageUptodate(page) ||
PageReadahead(page) ||
@@ -2670,10 +2668,10 @@
if (file->f_ra.mmap_miss > 0)
file->f_ra.mmap_miss--;
- vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT;
+ vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
- vmf->pte += iter.index - last_pgoff;
- last_pgoff = iter.index;
+ vmf->pte += xas.xa_index - last_pgoff;
+ last_pgoff = xas.xa_index;
if (alloc_set_pte(vmf, NULL, page))
goto unlock;
unlock_page(page);
@@ -2686,8 +2684,6 @@
/* Huge page is mapped? No need to proceed. */
if (pmd_trans_huge(*vmf->pmd))
break;
- if (iter.index == end_pgoff)
- break;
}
rcu_read_unlock();
}
@@ -2748,9 +2744,9 @@
return generic_file_mmap(file, vma);
}
#else
-int filemap_page_mkwrite(struct vm_fault *vmf)
+vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
- return -ENOSYS;
+ return VM_FAULT_SIGBUS;
}
int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
{
@@ -2797,12 +2793,16 @@
put_page(page);
if (err == -EEXIST)
goto repeat;
- /* Presumably ENOMEM for radix tree node */
+ /* Presumably ENOMEM for xarray node */
return ERR_PTR(err);
}
filler:
- err = filler(data, page);
+ if (filler)
+ err = filler(data, page);
+ else
+ err = mapping->a_ops->readpage(data, page);
+
if (err < 0) {
put_page(page);
return ERR_PTR(err);
@@ -2884,13 +2884,16 @@
* not set, try to fill the page and wait for it to become unlocked.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page(struct address_space *mapping,
pgoff_t index,
int (*filler)(void *, struct page *),
void *data)
{
- return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping));
+ return do_read_cache_page(mapping, index, filler, data,
+ mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
@@ -2904,18 +2907,49 @@
* any new page allocations done using the specified allocation flags.
*
* If the page does not get brought uptodate, return -EIO.
+ *
+ * Return: up to date page on success, ERR_PTR() on failure.
*/
struct page *read_cache_page_gfp(struct address_space *mapping,
pgoff_t index,
gfp_t gfp)
{
- filler_t *filler = (filler_t *)mapping->a_ops->readpage;
-
- return do_read_cache_page(mapping, index, filler, NULL, gfp);
+ return do_read_cache_page(mapping, index, NULL, NULL, gfp);
}
EXPORT_SYMBOL(read_cache_page_gfp);
/*
+ * Don't operate on ranges the page cache doesn't support, and don't exceed the
+ * LFS limits. If pos is under the limit it becomes a short access. If it
+ * exceeds the limit we return -EFBIG.
+ */
+static int generic_write_check_limits(struct file *file, loff_t pos,
+ loff_t *count)
+{
+ struct inode *inode = file->f_mapping->host;
+ loff_t max_size = inode->i_sb->s_maxbytes;
+ loff_t limit = rlimit(RLIMIT_FSIZE);
+
+ if (limit != RLIM_INFINITY) {
+ if (pos >= limit) {
+ send_sig(SIGXFSZ, current, 0);
+ return -EFBIG;
+ }
+ *count = min(*count, limit - pos);
+ }
+
+ if (!(file->f_flags & O_LARGEFILE))
+ max_size = MAX_NON_LFS;
+
+ if (unlikely(pos >= max_size))
+ return -EFBIG;
+
+ *count = min(*count, max_size - pos);
+
+ return 0;
+}
+
+/*
* Performs necessary checks before doing a write
*
* Can adjust writing position or amount of bytes to write.
@@ -2926,8 +2960,11 @@
{
struct file *file = iocb->ki_filp;
struct inode *inode = file->f_mapping->host;
- unsigned long limit = rlimit(RLIMIT_FSIZE);
- loff_t pos;
+ loff_t count;
+ int ret;
+
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
if (!iov_iter_count(from))
return 0;
@@ -2936,44 +2973,173 @@
if (iocb->ki_flags & IOCB_APPEND)
iocb->ki_pos = i_size_read(inode);
- pos = iocb->ki_pos;
-
if ((iocb->ki_flags & IOCB_NOWAIT) && !(iocb->ki_flags & IOCB_DIRECT))
return -EINVAL;
- if (limit != RLIM_INFINITY) {
- if (iocb->ki_pos >= limit) {
- send_sig(SIGXFSZ, current, 0);
- return -EFBIG;
- }
- iov_iter_truncate(from, limit - (unsigned long)pos);
- }
+ count = iov_iter_count(from);
+ ret = generic_write_check_limits(file, iocb->ki_pos, &count);
+ if (ret)
+ return ret;
- /*
- * LFS rule
- */
- if (unlikely(pos + iov_iter_count(from) > MAX_NON_LFS &&
- !(file->f_flags & O_LARGEFILE))) {
- if (pos >= MAX_NON_LFS)
- return -EFBIG;
- iov_iter_truncate(from, MAX_NON_LFS - (unsigned long)pos);
- }
-
- /*
- * Are we about to exceed the fs block limit ?
- *
- * If we have written data it becomes a short write. If we have
- * exceeded without writing data we send a signal and return EFBIG.
- * Linus frestrict idea will clean these up nicely..
- */
- if (unlikely(pos >= inode->i_sb->s_maxbytes))
- return -EFBIG;
-
- iov_iter_truncate(from, inode->i_sb->s_maxbytes - pos);
+ iov_iter_truncate(from, count);
return iov_iter_count(from);
}
EXPORT_SYMBOL(generic_write_checks);
+/*
+ * Performs necessary checks before doing a clone.
+ *
+ * Can adjust amount of bytes to clone via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the clone should be allowed.
+ */
+int generic_remap_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ loff_t *req_count, unsigned int remap_flags)
+{
+ struct inode *inode_in = file_in->f_mapping->host;
+ struct inode *inode_out = file_out->f_mapping->host;
+ uint64_t count = *req_count;
+ uint64_t bcount;
+ loff_t size_in, size_out;
+ loff_t bs = inode_out->i_sb->s_blocksize;
+ int ret;
+
+ /* The start of both ranges must be aligned to an fs block. */
+ if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_out, bs))
+ return -EINVAL;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EINVAL;
+
+ size_in = i_size_read(inode_in);
+ size_out = i_size_read(inode_out);
+
+ /* Dedupe requires both ranges to be within EOF. */
+ if ((remap_flags & REMAP_FILE_DEDUP) &&
+ (pos_in >= size_in || pos_in + count > size_in ||
+ pos_out >= size_out || pos_out + count > size_out))
+ return -EINVAL;
+
+ /* Ensure the infile range is within the infile. */
+ if (pos_in >= size_in)
+ return -EINVAL;
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /*
+ * If the user wanted us to link to the infile's EOF, round up to the
+ * next block boundary for this check.
+ *
+ * Otherwise, make sure the count is also block-aligned, having
+ * already confirmed the starting offsets' block alignment.
+ */
+ if (pos_in + count == size_in) {
+ bcount = ALIGN(size_in, bs) - pos_in;
+ } else {
+ if (!IS_ALIGNED(count, bs))
+ count = ALIGN_DOWN(count, bs);
+ bcount = count;
+ }
+
+ /* Don't allow overlapped cloning within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + bcount > pos_in &&
+ pos_out < pos_in + bcount)
+ return -EINVAL;
+
+ /*
+ * We shortened the request but the caller can't deal with that, so
+ * bounce the request back to userspace.
+ */
+ if (*req_count != count && !(remap_flags & REMAP_FILE_CAN_SHORTEN))
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+}
+
+
+/*
+ * Performs common checks before doing a file copy/clone
+ * from @file_in to @file_out.
+ */
+int generic_file_rw_checks(struct file *file_in, struct file *file_out)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+
+ /* Don't copy dirs, pipes, sockets... */
+ if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+ return -EISDIR;
+ if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+ return -EINVAL;
+
+ if (!(file_in->f_mode & FMODE_READ) ||
+ !(file_out->f_mode & FMODE_WRITE) ||
+ (file_out->f_flags & O_APPEND))
+ return -EBADF;
+
+ return 0;
+}
+
+/*
+ * Performs necessary checks before doing a file copy
+ *
+ * Can adjust amount of bytes to copy via @req_count argument.
+ * Returns appropriate error code that caller should return or
+ * zero in case the copy should be allowed.
+ */
+int generic_copy_file_checks(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out,
+ size_t *req_count, unsigned int flags)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct inode *inode_out = file_inode(file_out);
+ uint64_t count = *req_count;
+ loff_t size_in;
+ int ret;
+
+ ret = generic_file_rw_checks(file_in, file_out);
+ if (ret)
+ return ret;
+
+ /* Don't touch certain kinds of inodes */
+ if (IS_IMMUTABLE(inode_out))
+ return -EPERM;
+
+ if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out))
+ return -ETXTBSY;
+
+ /* Ensure offsets don't wrap. */
+ if (pos_in + count < pos_in || pos_out + count < pos_out)
+ return -EOVERFLOW;
+
+ /* Shorten the copy to EOF */
+ size_in = i_size_read(inode_in);
+ if (pos_in >= size_in)
+ count = 0;
+ else
+ count = min(count, size_in - (uint64_t)pos_in);
+
+ ret = generic_write_check_limits(file_out, pos_out, &count);
+ if (ret)
+ return ret;
+
+ /* Don't allow overlapped copying within the same file. */
+ if (inode_in == inode_out &&
+ pos_out + count > pos_in &&
+ pos_out < pos_in + count)
+ return -EINVAL;
+
+ *req_count = count;
+ return 0;
+}
+
int pagecache_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
@@ -3012,7 +3178,7 @@
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
if (filemap_range_has_page(inode->i_mapping, pos,
- pos + iov_iter_count(from)))
+ pos + write_len - 1))
return -EAGAIN;
} else {
written = filemap_write_and_wait_range(mapping, pos,
@@ -3195,6 +3361,10 @@
* This function does *not* take care of syncing data in case of O_SYNC write.
* A caller has to handle it. This is mainly due to the fact that we want to
* avoid syncing under i_mutex.
+ *
+ * Return:
+ * * number of bytes written, even for truncated writes
+ * * negative error code if no data has been written at all
*/
ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3279,6 +3449,10 @@
* This is a wrapper around __generic_file_write_iter() to be used by most
* filesystems. It takes care of syncing the file in case of O_SYNC file
* and acquires i_mutex as needed.
+ * Return:
+ * * negative error code if no data has been written at all or
+ * vfs_fsync_range() failed for a synchronous write
+ * * number of bytes written, even for truncated writes
*/
ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
@@ -3305,8 +3479,7 @@
* @gfp_mask: memory allocation flags (and I/O mode)
*
* The address_space is to try to release any data against the page
- * (presumably at page->private). If the release was successful, return '1'.
- * Otherwise return zero.
+ * (presumably at page->private).
*
* This may also be called if PG_fscache is set on a page, indicating that the
* page is known to the local caching routines.
@@ -3314,6 +3487,7 @@
* The @gfp_mask argument specifies whether I/O may be performed to release
* this page (__GFP_IO), and whether the call may block (__GFP_RECLAIM & __GFP_FS).
*
+ * Return: %1 if the release was successful, otherwise return zero.
*/
int try_to_release_page(struct page *page, gfp_t gfp_mask)
{
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c64dca6..c431ca8 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -46,6 +46,8 @@
if (WARN_ON_ONCE(nr_frames > vec->nr_allocated))
nr_frames = vec->nr_allocated;
+ start = untagged_addr(start);
+
down_read(&mm->mmap_sem);
locked = 1;
vma = find_vma_intersection(mm, start, start + 1);
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 157e5bf..60bb20e 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Frontswap frontend
*
@@ -7,8 +8,6 @@
*
* Copyright (C) 2009-2012 Oracle Corp. All rights reserved.
* Author: Dan Magenheimer
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
*/
#include <linux/mman.h>
diff --git a/mm/gup.c b/mm/gup.c
index 1abc8b4..8f236a3 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/err.h>
@@ -13,6 +14,9 @@
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
#include <linux/hugetlb.h>
+#include <linux/migrate.h>
+#include <linux/mm_inline.h>
+#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
@@ -20,6 +24,102 @@
#include "internal.h"
+struct follow_page_context {
+ struct dev_pagemap *pgmap;
+ unsigned int page_mask;
+};
+
+/**
+ * put_user_pages_dirty_lock() - release and optionally dirty gup-pinned pages
+ * @pages: array of pages to be maybe marked dirty, and definitely released.
+ * @npages: number of pages in the @pages array.
+ * @make_dirty: whether to mark the pages dirty
+ *
+ * "gup-pinned page" refers to a page that has had one of the get_user_pages()
+ * variants called on that page.
+ *
+ * For each page in the @pages array, make that page (or its head page, if a
+ * compound page) dirty, if @make_dirty is true, and if the page was previously
+ * listed as clean. In any case, releases all pages using put_user_page(),
+ * possibly via put_user_pages(), for the non-dirty case.
+ *
+ * Please see the put_user_page() documentation for details.
+ *
+ * set_page_dirty_lock() is used internally. If instead, set_page_dirty() is
+ * required, then the caller should a) verify that this is really correct,
+ * because _lock() is usually required, and b) hand code it:
+ * set_page_dirty(), put_user_page().
+ *
+ */
+void put_user_pages_dirty_lock(struct page **pages, unsigned long npages,
+ bool make_dirty)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+
+ if (!make_dirty) {
+ put_user_pages(pages, npages);
+ return;
+ }
+
+ for (index = 0; index < npages; index++) {
+ struct page *page = compound_head(pages[index]);
+ /*
+ * Checking PageDirty at this point may race with
+ * clear_page_dirty_for_io(), but that's OK. Two key
+ * cases:
+ *
+ * 1) This code sees the page as already dirty, so it
+ * skips the call to set_page_dirty(). That could happen
+ * because clear_page_dirty_for_io() called
+ * page_mkclean(), followed by set_page_dirty().
+ * However, now the page is going to get written back,
+ * which meets the original intention of setting it
+ * dirty, so all is well: clear_page_dirty_for_io() goes
+ * on to call TestClearPageDirty(), and write the page
+ * back.
+ *
+ * 2) This code sees the page as clean, so it calls
+ * set_page_dirty(). The page stays dirty, despite being
+ * written back, so it gets written back again in the
+ * next writeback cycle. This is harmless.
+ */
+ if (!PageDirty(page))
+ set_page_dirty_lock(page);
+ put_user_page(page);
+ }
+}
+EXPORT_SYMBOL(put_user_pages_dirty_lock);
+
+/**
+ * put_user_pages() - release an array of gup-pinned pages.
+ * @pages: array of pages to be released.
+ * @npages: number of pages in the @pages array.
+ *
+ * For each page in the @pages array, release the page using put_user_page().
+ *
+ * Please see the put_user_page() documentation for details.
+ */
+void put_user_pages(struct page **pages, unsigned long npages)
+{
+ unsigned long index;
+
+ /*
+ * TODO: this can be optimized for huge pages: if a series of pages is
+ * physically contiguous and part of the same compound page, then a
+ * single operation to the head page should suffice.
+ */
+ for (index = 0; index < npages; index++)
+ put_user_page(pages[index]);
+}
+EXPORT_SYMBOL(put_user_pages);
+
+#ifdef CONFIG_MMU
static struct page *no_page_table(struct vm_area_struct *vma,
unsigned int flags)
{
@@ -71,10 +171,10 @@
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
- unsigned long address, pmd_t *pmd, unsigned int flags)
+ unsigned long address, pmd_t *pmd, unsigned int flags,
+ struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap = NULL;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -116,8 +216,8 @@
* Only return device mapping pages in the FOLL_GET case since
* they are only valid while holding the pgmap reference.
*/
- pgmap = get_dev_pagemap(pte_pfn(pte), NULL);
- if (pgmap)
+ *pgmap = get_dev_pagemap(pte_pfn(pte), *pgmap);
+ if (*pgmap)
page = pte_page(pte);
else
goto no_page;
@@ -153,12 +253,9 @@
}
if (flags & FOLL_GET) {
- get_page(page);
-
- /* drop the pgmap reference now that we hold the page */
- if (pgmap) {
- put_dev_pagemap(pgmap);
- pgmap = NULL;
+ if (unlikely(!try_get_page(page))) {
+ page = ERR_PTR(-ENOMEM);
+ goto out;
}
}
if (flags & FOLL_TOUCH) {
@@ -210,7 +307,8 @@
static struct page *follow_pmd_mask(struct vm_area_struct *vma,
unsigned long address, pud_t *pudp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pmd_t *pmd, pmdval;
spinlock_t *ptl;
@@ -258,13 +356,13 @@
}
if (pmd_devmap(pmdval)) {
ptl = pmd_lock(mm, pmd);
- page = follow_devmap_pmd(vma, address, pmd, flags);
+ page = follow_devmap_pmd(vma, address, pmd, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
}
if (likely(!pmd_trans_huge(pmdval)))
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
if ((flags & FOLL_NUMA) && pmd_protnone(pmdval))
return no_page_table(vma, flags);
@@ -284,9 +382,9 @@
}
if (unlikely(!pmd_trans_huge(*pmd))) {
spin_unlock(ptl);
- return follow_page_pte(vma, address, pmd, flags);
+ return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
- if (flags & FOLL_SPLIT) {
+ if (flags & (FOLL_SPLIT | FOLL_SPLIT_PMD)) {
int ret;
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
@@ -295,8 +393,11 @@
split_huge_pmd(vma, pmd, address);
if (pmd_trans_unstable(pmd))
ret = -EBUSY;
- } else {
- get_page(page);
+ } else if (flags & FOLL_SPLIT) {
+ if (unlikely(!try_get_page(page))) {
+ spin_unlock(ptl);
+ return ERR_PTR(-ENOMEM);
+ }
spin_unlock(ptl);
lock_page(page);
ret = split_huge_page(page);
@@ -304,21 +405,25 @@
put_page(page);
if (pmd_none(*pmd))
return no_page_table(vma, flags);
+ } else { /* flags & FOLL_SPLIT_PMD */
+ spin_unlock(ptl);
+ split_huge_pmd(vma, pmd, address);
+ ret = pte_alloc(mm, pmd) ? -ENOMEM : 0;
}
return ret ? ERR_PTR(ret) :
- follow_page_pte(vma, address, pmd, flags);
+ follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
}
page = follow_trans_huge_pmd(vma, address, pmd, flags);
spin_unlock(ptl);
- *page_mask = HPAGE_PMD_NR - 1;
+ ctx->page_mask = HPAGE_PMD_NR - 1;
return page;
}
-
static struct page *follow_pud_mask(struct vm_area_struct *vma,
unsigned long address, p4d_t *p4dp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
pud_t *pud;
spinlock_t *ptl;
@@ -344,7 +449,7 @@
}
if (pud_devmap(*pud)) {
ptl = pud_lock(mm, pud);
- page = follow_devmap_pud(vma, address, pud, flags);
+ page = follow_devmap_pud(vma, address, pud, flags, &ctx->pgmap);
spin_unlock(ptl);
if (page)
return page;
@@ -352,13 +457,13 @@
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
- return follow_pmd_mask(vma, address, pud, flags, page_mask);
+ return follow_pmd_mask(vma, address, pud, flags, ctx);
}
-
static struct page *follow_p4d_mask(struct vm_area_struct *vma,
unsigned long address, pgd_t *pgdp,
- unsigned int flags, unsigned int *page_mask)
+ unsigned int flags,
+ struct follow_page_context *ctx)
{
p4d_t *p4d;
struct page *page;
@@ -378,7 +483,7 @@
return page;
return no_page_table(vma, flags);
}
- return follow_pud_mask(vma, address, p4d, flags, page_mask);
+ return follow_pud_mask(vma, address, p4d, flags, ctx);
}
/**
@@ -386,23 +491,29 @@
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
- * @page_mask: on output, *page_mask is set according to the size of the page
+ * @ctx: contains dev_pagemap for %ZONE_DEVICE memory pinning and a
+ * pointer to output page_mask
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
- * Returns the mapped (struct page *), %NULL if no mapping exists, or
+ * When getting pages from ZONE_DEVICE memory, the @ctx->pgmap caches
+ * the device's dev_pagemap metadata to avoid repeating expensive lookups.
+ *
+ * On output, the @ctx->page_mask is set according to the size of the page.
+ *
+ * Return: the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
-struct page *follow_page_mask(struct vm_area_struct *vma,
+static struct page *follow_page_mask(struct vm_area_struct *vma,
unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+ struct follow_page_context *ctx)
{
pgd_t *pgd;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
- *page_mask = 0;
+ ctx->page_mask = 0;
/* make this handle hugepd */
page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
@@ -431,7 +542,19 @@
return no_page_table(vma, flags);
}
- return follow_p4d_mask(vma, address, pgd, flags, page_mask);
+ return follow_p4d_mask(vma, address, pgd, flags, ctx);
+}
+
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
+{
+ struct follow_page_context ctx = { NULL };
+ struct page *page;
+
+ page = follow_page_mask(vma, address, foll_flags, &ctx);
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return page;
}
static int get_gate_page(struct mm_struct *mm, unsigned long address,
@@ -452,11 +575,14 @@
pgd = pgd_offset_k(address);
else
pgd = pgd_offset_gate(mm, address);
- BUG_ON(pgd_none(*pgd));
+ if (pgd_none(*pgd))
+ return -EFAULT;
p4d = p4d_offset(pgd, address);
- BUG_ON(p4d_none(*p4d));
+ if (p4d_none(*p4d))
+ return -EFAULT;
pud = pud_offset(p4d, address);
- BUG_ON(pud_none(*pud));
+ if (pud_none(*pud))
+ return -EFAULT;
pmd = pmd_offset(pud, address);
if (!pmd_present(*pmd))
return -EFAULT;
@@ -472,15 +598,11 @@
if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte)))
goto unmap;
*page = pte_page(*pte);
-
- /*
- * This should never happen (a device public page in the gate
- * area).
- */
- if (is_device_public_page(*page))
- goto unmap;
}
- get_page(*page);
+ if (unlikely(!try_get_page(*page))) {
+ ret = -ENOMEM;
+ goto unmap;
+ }
out:
ret = 0;
unmap:
@@ -659,13 +781,15 @@
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
- long i = 0;
- unsigned int page_mask;
+ long ret = 0, i = 0;
struct vm_area_struct *vma = NULL;
+ struct follow_page_context ctx = { NULL };
if (!nr_pages)
return 0;
+ start = untagged_addr(start);
+
VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));
/*
@@ -685,18 +809,19 @@
if (!vma || start >= vma->vm_end) {
vma = find_extend_vma(mm, start);
if (!vma && in_gate_area(mm, start)) {
- int ret;
ret = get_gate_page(mm, start & PAGE_MASK,
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret)
- return i ? : ret;
- page_mask = 0;
+ goto out;
+ ctx.page_mask = 0;
goto next_page;
}
- if (!vma || check_vma_flags(vma, gup_flags))
- return i ? : -EFAULT;
+ if (!vma || check_vma_flags(vma, gup_flags)) {
+ ret = -EFAULT;
+ goto out;
+ }
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
@@ -709,23 +834,26 @@
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current)))
- return i ? i : -ERESTARTSYS;
+ if (fatal_signal_pending(current)) {
+ ret = -ERESTARTSYS;
+ goto out;
+ }
cond_resched();
- page = follow_page_mask(vma, start, foll_flags, &page_mask);
+
+ page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking);
switch (ret) {
case 0:
goto retry;
+ case -EBUSY:
+ ret = 0;
+ /* FALLTHRU */
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
- return i ? i : ret;
- case -EBUSY:
- return i;
+ goto out;
case -ENOENT:
goto next_page;
}
@@ -737,27 +865,31 @@
*/
goto next_page;
} else if (IS_ERR(page)) {
- return i ? i : PTR_ERR(page);
+ ret = PTR_ERR(page);
+ goto out;
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
- page_mask = 0;
+ ctx.page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
- page_mask = 0;
+ ctx.page_mask = 0;
}
- page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
+ page_increm = 1 + (~(start >> PAGE_SHIFT) & ctx.page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
- return i;
+out:
+ if (ctx.pgmap)
+ put_dev_pagemap(ctx.pgmap);
+ return i ? i : ret;
}
static bool vma_permits_fault(struct vm_area_struct *vma,
@@ -820,6 +952,8 @@
struct vm_area_struct *vma;
vm_fault_t ret, major = 0;
+ address = untagged_addr(address);
+
if (unlocked)
fault_flags |= FAULT_FLAG_ALLOW_RETRY;
@@ -898,10 +1032,6 @@
BUG_ON(ret >= nr_pages);
}
- if (!pages)
- /* If it's a prefault don't insist harder */
- return ret;
-
if (ret > 0) {
nr_pages -= ret;
pages_done += ret;
@@ -917,8 +1047,12 @@
pages_done = ret;
break;
}
- /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
- pages += ret;
+ /*
+ * VM_FAULT_RETRY triggered, so seek to the faulting offset.
+ * For the prefault case (!pages) we only update counts.
+ */
+ if (likely(pages))
+ pages += ret;
start += ret << PAGE_SHIFT;
/*
@@ -941,7 +1075,8 @@
pages_done++;
if (!nr_pages)
break;
- pages++;
+ if (likely(pages))
+ pages++;
start += PAGE_SIZE;
}
if (lock_dropped && *locked) {
@@ -956,68 +1091,6 @@
}
/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
- * get_user_pages_locked() is suitable to replace the form:
- *
- * down_read(&mm->mmap_sem);
- * do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
- *
- * to:
- *
- * int locked = 1;
- * down_read(&mm->mmap_sem);
- * do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
- * if (locked)
- * up_read(&mm->mmap_sem);
- */
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, NULL, locked,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
-/*
- * get_user_pages_unlocked() is suitable to replace the form:
- *
- * down_read(&mm->mmap_sem);
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
- *
- * with:
- *
- * get_user_pages_unlocked(tsk, mm, ..., pages);
- *
- * It is functionally equivalent to get_user_pages_fast so
- * get_user_pages_fast should be used instead if specific gup_flags
- * (e.g. FOLL_FORCE) are not required.
- */
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
-{
- struct mm_struct *mm = current->mm;
- int locked = 1;
- long ret;
-
- down_read(&mm->mmap_sem);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
- &locked, gup_flags | FOLL_TOUCH);
- if (locked)
- up_read(&mm->mmap_sem);
- return ret;
-}
-EXPORT_SYMBOL(get_user_pages_unlocked);
-
-/*
* get_user_pages_remote() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
* NULL if faults are not to be recorded.
@@ -1078,93 +1151,21 @@
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
EXPORT_SYMBOL(get_user_pages_remote);
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
- pages, vmas, NULL,
- gup_flags | FOLL_TOUCH);
-}
-EXPORT_SYMBOL(get_user_pages);
-
-#ifdef CONFIG_FS_DAX
-/*
- * This is the same as get_user_pages() in that it assumes we are
- * operating on the current task's mm, but it goes further to validate
- * that the vmas associated with the address range are suitable for
- * longterm elevated page reference counts. For example, filesystem-dax
- * mappings are subject to the lifetime enforced by the filesystem and
- * we need guarantees that longterm users like RDMA and V4L2 only
- * establish mappings that have a kernel enforced revocation mechanism.
- *
- * "longterm" == userspace controlled elevated page count lifetime.
- * Contrast this to iov_iter_get_pages() usages which are transient.
- */
-long get_user_pages_longterm(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas_arg)
-{
- struct vm_area_struct **vmas = vmas_arg;
- struct vm_area_struct *vma_prev = NULL;
- long rc, i;
-
- if (!pages)
- return -EINVAL;
-
- if (!vmas) {
- vmas = kcalloc(nr_pages, sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas)
- return -ENOMEM;
- }
-
- rc = get_user_pages(start, nr_pages, gup_flags, pages, vmas);
-
- for (i = 0; i < rc; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma == vma_prev)
- continue;
-
- vma_prev = vma;
-
- if (vma_is_fsdax(vma))
- break;
- }
-
- /*
- * Either get_user_pages() failed, or the vma validation
- * succeeded, in either case we don't need to put_page() before
- * returning.
- */
- if (i >= rc)
- goto out;
-
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
-out:
- if (vmas != vmas_arg)
- kfree(vmas);
- return rc;
-}
-EXPORT_SYMBOL(get_user_pages_longterm);
-#endif /* CONFIG_FS_DAX */
-
/**
* populate_vma_page_range() - populate a range of pages in the vma.
* @vma: target vma
@@ -1311,9 +1312,389 @@
return page;
}
#endif /* CONFIG_ELF_CORE */
+#else /* CONFIG_MMU */
+static long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long nr_pages, struct page **pages,
+ struct vm_area_struct **vmas, int *locked,
+ unsigned int foll_flags)
+{
+ struct vm_area_struct *vma;
+ unsigned long vm_flags;
+ int i;
+
+ /* calculate required read or write permissions.
+ * If FOLL_FORCE is set, we only require the "MAY" flags.
+ */
+ vm_flags = (foll_flags & FOLL_WRITE) ?
+ (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
+ vm_flags &= (foll_flags & FOLL_FORCE) ?
+ (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
+
+ for (i = 0; i < nr_pages; i++) {
+ vma = find_vma(mm, start);
+ if (!vma)
+ goto finish_or_fault;
+
+ /* protect what we can, including chardevs */
+ if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
+ !(vm_flags & vma->vm_flags))
+ goto finish_or_fault;
+
+ if (pages) {
+ pages[i] = virt_to_page(start);
+ if (pages[i])
+ get_page(pages[i]);
+ }
+ if (vmas)
+ vmas[i] = vma;
+ start = (start + PAGE_SIZE) & PAGE_MASK;
+ }
+
+ return i;
+
+finish_or_fault:
+ return i ? : -EFAULT;
+}
+#endif /* !CONFIG_MMU */
+
+#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
+static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
+{
+ long i;
+ struct vm_area_struct *vma_prev = NULL;
+
+ for (i = 0; i < nr_pages; i++) {
+ struct vm_area_struct *vma = vmas[i];
+
+ if (vma == vma_prev)
+ continue;
+
+ vma_prev = vma;
+
+ if (vma_is_fsdax(vma))
+ return true;
+ }
+ return false;
+}
+
+#ifdef CONFIG_CMA
+static struct page *new_non_cma_page(struct page *page, unsigned long private)
+{
+ /*
+ * We want to make sure we allocate the new page from the same node
+ * as the source page.
+ */
+ int nid = page_to_nid(page);
+ /*
+ * Trying to allocate a page for migration. Ignore allocation
+ * failure warnings. We don't force __GFP_THISNODE here because
+ * this node is the node where we have the CMA reservation and
+ * in some cases these nodes will have very little non-movable
+ * allocation memory.
+ */
+ gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
+
+ if (PageHighMem(page))
+ gfp_mask |= __GFP_HIGHMEM;
+
+#ifdef CONFIG_HUGETLB_PAGE
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(page);
+ /*
+ * We don't want to dequeue from the pool because pool pages will
+ * mostly be from the CMA region.
+ */
+ return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
+ }
+#endif
+ if (PageTransHuge(page)) {
+ struct page *thp;
+ /*
+ * ignore allocation failure warnings
+ */
+ gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
+
+ /*
+ * Remove the movable mask so that we don't allocate from
+ * CMA area again.
+ */
+ thp_gfpmask &= ~__GFP_MOVABLE;
+ thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
+ if (!thp)
+ return NULL;
+ prep_transhuge_page(thp);
+ return thp;
+ }
+
+ return __alloc_pages_node(nid, gfp_mask, 0);
+}
+
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ unsigned long i;
+ unsigned long step;
+ bool drain_allow = true;
+ bool migrate_allow = true;
+ LIST_HEAD(cma_page_list);
+
+check_again:
+ for (i = 0; i < nr_pages;) {
+
+ struct page *head = compound_head(pages[i]);
+
+ /*
+ * gup may start from a tail page. Advance step by the remaining
+ * part of the compound page.
+ */
+ step = compound_nr(head) - (pages[i] - head);
+ /*
+ * If we get a page from the CMA zone, since we are going to
+ * be pinning these entries, we might as well move them out
+ * of the CMA zone if possible.
+ */
+ if (is_migrate_cma_page(head)) {
+ if (PageHuge(head))
+ isolate_huge_page(head, &cma_page_list);
+ else {
+ if (!PageLRU(head) && drain_allow) {
+ lru_add_drain_all();
+ drain_allow = false;
+ }
+
+ if (!isolate_lru_page(head)) {
+ list_add_tail(&head->lru, &cma_page_list);
+ mod_node_page_state(page_pgdat(head),
+ NR_ISOLATED_ANON +
+ page_is_file_cache(head),
+ hpage_nr_pages(head));
+ }
+ }
+ }
+
+ i += step;
+ }
+
+ if (!list_empty(&cma_page_list)) {
+ /*
+ * drop the above get_user_pages reference.
+ */
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ if (migrate_pages(&cma_page_list, new_non_cma_page,
+ NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ /*
+ * some of the pages failed migration. Do get_user_pages
+ * without migration.
+ */
+ migrate_allow = false;
+
+ if (!list_empty(&cma_page_list))
+ putback_movable_pages(&cma_page_list);
+ }
+ /*
+ * Migration was attempted. Take the page references again and,
+ * if it succeeded, recheck so that any new CMA pages which we
+ * failed to isolate earlier get migrated too.
+ */
+ nr_pages = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ pages, vmas, NULL,
+ gup_flags);
+
+ if ((nr_pages > 0) && migrate_allow) {
+ drain_allow = true;
+ goto check_again;
+ }
+ }
+
+ return nr_pages;
+}
+#else
+static long check_and_migrate_cma_pages(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ return nr_pages;
+}
+#endif /* CONFIG_CMA */
/*
- * Generic Fast GUP
+ * __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
+ * allows us to process the FOLL_LONGTERM flag.
+ */
+static long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int gup_flags)
+{
+ struct vm_area_struct **vmas_tmp = vmas;
+ unsigned long flags = 0;
+ long rc, i;
+
+ if (gup_flags & FOLL_LONGTERM) {
+ if (!pages)
+ return -EINVAL;
+
+ if (!vmas_tmp) {
+ vmas_tmp = kcalloc(nr_pages,
+ sizeof(struct vm_area_struct *),
+ GFP_KERNEL);
+ if (!vmas_tmp)
+ return -ENOMEM;
+ }
+ flags = memalloc_nocma_save();
+ }
+
+ rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ vmas_tmp, NULL, gup_flags);
+
+ if (gup_flags & FOLL_LONGTERM) {
+ memalloc_nocma_restore(flags);
+ if (rc < 0)
+ goto out;
+
+ if (check_dax_vmas(vmas_tmp, rc)) {
+ for (i = 0; i < rc; i++)
+ put_page(pages[i]);
+ rc = -EOPNOTSUPP;
+ goto out;
+ }
+
+ rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ vmas_tmp, gup_flags);
+ }
+
+out:
+ if (vmas_tmp != vmas)
+ kfree(vmas_tmp);
+ return rc;
+}
+#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
+static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ unsigned int flags)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ NULL, flags);
+}
+#endif /* CONFIG_FS_DAX || CONFIG_CMA */
+
+/*
+ * This is the same as get_user_pages_remote(), just with a
+ * less-flexible calling convention where we assume that the task
+ * and mm being operated on are the current task's and don't allow
+ * passing of a locked parameter. We also obviously don't pass
+ * FOLL_REMOTE in here.
+ */
+long get_user_pages(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ struct vm_area_struct **vmas)
+{
+ return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ pages, vmas, gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages);
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * to:
+ *
+ * int locked = 1;
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * if (locked)
+ * up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked)
+{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ pages, NULL, locked,
+ gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * with:
+ *
+ * get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead if specific gup_flags
+ * (e.g. FOLL_FORCE) are not required.
+ */
+long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ struct mm_struct *mm = current->mm;
+ int locked = 1;
+ long ret;
+
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ &locked, gup_flags | FOLL_TOUCH);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
+/*
+ * Fast GUP
*
* get_user_pages_fast attempts to pin user pages by walking the page
* tables directly and avoids taking locks. Thus the walker needs to be
@@ -1345,20 +1726,64 @@
*
* This code is based heavily on the PowerPC implementation by Nick Piggin.
*/
-#ifdef CONFIG_HAVE_GENERIC_GUP
-
-#ifndef gup_get_pte
+#ifdef CONFIG_HAVE_FAST_GUP
+#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
/*
- * We assume that the PTE can be read atomically. If this is not the case for
- * your architecture, please provide the helper.
+ * WARNING: only to be used in the get_user_pages_fast() implementation.
+ *
+ * With get_user_pages_fast(), we walk down the pagetables without taking any
+ * locks. For this we would like to load the pointers atomically, but sometimes
+ * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
+ * we do have is the guarantee that a PTE will only either go from not present
+ * to present, or present to not present or both -- it will not switch to a
+ * completely different present page without a TLB flush in between; something
+ * that we are blocking by holding interrupts off.
+ *
+ * Setting ptes from not present to present goes:
+ *
+ * ptep->pte_high = h;
+ * smp_wmb();
+ * ptep->pte_low = l;
+ *
+ * And present to not present goes:
+ *
+ * ptep->pte_low = 0;
+ * smp_wmb();
+ * ptep->pte_high = 0;
+ *
+ * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
+ * We load pte_high *after* loading pte_low, which ensures we don't see an older
+ * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
+ * picked up a changed pte high. We might have gotten rubbish values from
+ * pte_low and pte_high, but we are guaranteed that pte_low will not have the
+ * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
+ * operates on present ptes we're safe.
+ */
+static inline pte_t gup_get_pte(pte_t *ptep)
+{
+ pte_t pte;
+
+ do {
+ pte.pte_low = ptep->pte_low;
+ smp_rmb();
+ pte.pte_high = ptep->pte_high;
+ smp_rmb();
+ } while (unlikely(pte.pte_low != ptep->pte_low));
+
+ return pte;
+}
+#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
+/*
+ * We require that the PTE can be read atomically.
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
return READ_ONCE(*ptep);
}
-#endif
+#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages)
+static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
+ struct page **pages)
{
while ((*nr) - nr_start) {
struct page *page = pages[--(*nr)];
@@ -1368,9 +1793,23 @@
}
}
+/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -1388,10 +1827,13 @@
if (pte_protnone(pte))
goto pte_unmap;
- if (!pte_access_permitted(pte, write))
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
goto pte_unmap;
if (pte_devmap(pte)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ goto pte_unmap;
+
pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
if (unlikely(!pgmap)) {
undo_dev_pagemap(nr, nr_start, pages);
@@ -1402,9 +1844,9 @@
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- head = compound_head(page);
- if (!page_cache_get_speculative(head))
+ head = try_get_compound_head(page, 1);
+ if (!head)
goto pte_unmap;
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
@@ -1440,13 +1882,13 @@
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
return 0;
}
#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */
-#if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
static int __gup_device_huge(unsigned long pfn, unsigned long addr,
unsigned long end, struct page **pages, int *nr)
{
@@ -1522,17 +1964,106 @@
}
#endif
+#ifdef CONFIG_ARCH_HAS_HUGEPD
+static unsigned long hugepte_addr_end(unsigned long addr, unsigned long end,
+ unsigned long sz)
+{
+ unsigned long __boundary = (addr + sz) & ~(sz-1);
+ return (__boundary - 1 < end - 1) ? __boundary : end;
+}
+
+static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ unsigned long pte_end;
+ struct page *head, *page;
+ pte_t pte;
+ int refs;
+
+ pte_end = (addr + sz) & ~(sz-1);
+ if (pte_end < end)
+ end = pte_end;
+
+ pte = READ_ONCE(*ptep);
+
+ if (!pte_access_permitted(pte, flags & FOLL_WRITE))
+ return 0;
+
+ /* hugepages are never "special" */
+ VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+
+ refs = 0;
+ head = pte_page(pte);
+
+ page = head + ((addr & (sz-1)) >> PAGE_SHIFT);
+ do {
+ VM_BUG_ON(compound_head(page) != head);
+ pages[*nr] = page;
+ (*nr)++;
+ page++;
+ refs++;
+ } while (addr += PAGE_SIZE, addr != end);
+
+ head = try_get_compound_head(head, refs);
+ if (!head) {
+ *nr -= refs;
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ /* Could be optimized better */
+ *nr -= refs;
+ while (refs--)
+ put_page(head);
+ return 0;
+ }
+
+ SetPageReferenced(head);
+ return 1;
+}
+
+static int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ pte_t *ptep;
+ unsigned long sz = 1UL << hugepd_shift(hugepd);
+ unsigned long next;
+
+ ptep = hugepte_offset(hugepd, addr, pdshift);
+ do {
+ next = hugepte_addr_end(addr, end, sz);
+ if (!gup_hugepte(ptep, sz, addr, end, flags, pages, nr))
+ return 0;
+ } while (ptep++, addr = next, addr != end);
+
+ return 1;
+}
+#else
+static inline int gup_huge_pd(hugepd_t hugepd, unsigned long addr,
+ unsigned int pdshift, unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
+{
+ return 0;
+}
+#endif /* CONFIG_ARCH_HAS_HUGEPD */
+
static int gup_huge_pmd(pmd_t orig, pmd_t *pmdp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pmd_access_permitted(orig, write))
+ if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pmd_devmap(orig))
+ if (pmd_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pmd(orig, pmdp, addr, end, pages, nr);
+ }
refs = 0;
page = pmd_page(orig) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1543,8 +2074,8 @@
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pmd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pmd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1561,16 +2092,19 @@
}
static int gup_huge_pud(pud_t orig, pud_t *pudp, unsigned long addr,
- unsigned long end, int write, struct page **pages, int *nr)
+ unsigned long end, unsigned int flags, struct page **pages, int *nr)
{
struct page *head, *page;
int refs;
- if (!pud_access_permitted(orig, write))
+ if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
- if (pud_devmap(orig))
+ if (pud_devmap(orig)) {
+ if (unlikely(flags & FOLL_LONGTERM))
+ return 0;
return __gup_device_huge_pud(orig, pudp, addr, end, pages, nr);
+ }
refs = 0;
page = pud_page(orig) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
@@ -1581,8 +2115,8 @@
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pud_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pud_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1599,13 +2133,13 @@
}
static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
- unsigned long end, int write,
+ unsigned long end, unsigned int flags,
struct page **pages, int *nr)
{
int refs;
struct page *head, *page;
- if (!pgd_access_permitted(orig, write))
+ if (!pgd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
BUILD_BUG_ON(pgd_devmap(orig));
@@ -1618,8 +2152,8 @@
refs++;
} while (addr += PAGE_SIZE, addr != end);
- head = compound_head(pgd_page(orig));
- if (!page_cache_add_speculative(head, refs)) {
+ head = try_get_compound_head(pgd_page(orig), refs);
+ if (!head) {
*nr -= refs;
return 0;
}
@@ -1636,7 +2170,7 @@
}
static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
@@ -1649,7 +2183,8 @@
if (!pmd_present(pmd))
return 0;
- if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd))) {
+ if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
+ pmd_devmap(pmd))) {
/*
* NUMA hinting faults need to be handled in the GUP
* slowpath for accounting purposes and so that they
@@ -1658,7 +2193,7 @@
if (pmd_protnone(pmd))
return 0;
- if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
+ if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
pages, nr))
return 0;
@@ -1668,9 +2203,9 @@
* pmd format and THP pmd format
*/
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
- PMD_SHIFT, next, write, pages, nr))
+ PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, write, pages, nr))
+ } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
@@ -1678,7 +2213,7 @@
}
static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
@@ -1691,14 +2226,14 @@
if (pud_none(pud))
return 0;
if (unlikely(pud_huge(pud))) {
- if (!gup_huge_pud(pud, pudp, addr, next, write,
+ if (!gup_huge_pud(pud, pudp, addr, next, flags,
pages, nr))
return 0;
} else if (unlikely(is_hugepd(__hugepd(pud_val(pud))))) {
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
- PUD_SHIFT, next, write, pages, nr))
+ PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, write, pages, nr))
+ } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
@@ -1706,7 +2241,7 @@
}
static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
@@ -1721,9 +2256,9 @@
BUILD_BUG_ON(p4d_huge(p4d));
if (unlikely(is_hugepd(__hugepd(p4d_val(p4d))))) {
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
- P4D_SHIFT, next, write, pages, nr))
+ P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, write, pages, nr))
+ } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -1731,7 +2266,7 @@
}
static void gup_pgd_range(unsigned long addr, unsigned long end,
- int write, struct page **pages, int *nr)
+ unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pgd_t *pgdp;
@@ -1744,30 +2279,32 @@
if (pgd_none(pgd))
return;
if (unlikely(pgd_huge(pgd))) {
- if (!gup_huge_pgd(pgd, pgdp, addr, next, write,
+ if (!gup_huge_pgd(pgd, pgdp, addr, next, flags,
pages, nr))
return;
} else if (unlikely(is_hugepd(__hugepd(pgd_val(pgd))))) {
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
- PGDIR_SHIFT, next, write, pages, nr))
+ PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, write, pages, nr))
+ } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
+#else
+static inline void gup_pgd_range(unsigned long addr, unsigned long end,
+ unsigned int flags, struct page **pages, int *nr)
+{
+}
+#endif /* CONFIG_HAVE_FAST_GUP */
#ifndef gup_fast_permitted
/*
* Check if it's allowed to use __get_user_pages_fast() for the range, or
* we need to fall back to the slow version:
*/
-bool gup_fast_permitted(unsigned long start, int nr_pages, int write)
+static bool gup_fast_permitted(unsigned long start, unsigned long end)
{
- unsigned long len, end;
-
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
- return end >= start;
+ return true;
}
#endif
@@ -1776,21 +2313,24 @@
* the regular GUP.
* Note a difference with get_user_pages_fast: this always returns the
* number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
{
- unsigned long addr, len, end;
+ unsigned long len, end;
unsigned long flags;
int nr = 0;
- start &= PAGE_MASK;
- addr = start;
+ start = untagged_addr(start) & PAGE_MASK;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (end <= start)
+ return 0;
+ if (unlikely(!access_ok((void __user *)start, len)))
return 0;
/*
@@ -1798,27 +2338,52 @@
* interrupts disabled by get_futex_key.
*
* With interrupts disabled, we block page table pages from being
- * freed from under us. See mmu_gather_tlb in asm-generic/tlb.h
- * for more details.
+ * freed from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
*
* We do not adopt an rcu_read_lock(.) here as we also want to
* block IPIs that come from THPs splitting.
*/
- if (gup_fast_permitted(start, nr_pages, write)) {
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
+ gup_fast_permitted(start, end)) {
local_irq_save(flags);
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(start, end, write ? FOLL_WRITE : 0, pages, &nr);
local_irq_restore(flags);
}
return nr;
}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
+static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int ret;
+
+ /*
+ * FIXME: FOLL_LONGTERM does not work with
+ * get_user_pages_unlocked() (see comments in that function)
+ */
+ if (gup_flags & FOLL_LONGTERM) {
+ down_read(&current->mm->mmap_sem);
+ ret = __gup_longterm_locked(current, current->mm,
+ start, nr_pages,
+ pages, NULL, gup_flags);
+ up_read(&current->mm->mmap_sem);
+ } else {
+ ret = get_user_pages_unlocked(start, nr_pages,
+ pages, gup_flags);
+ }
+
+ return ret;
+}
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
* @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
+ * @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
@@ -1830,27 +2395,29 @@
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno.
*/
-int get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
+int get_user_pages_fast(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
{
unsigned long addr, len, end;
int nr = 0, ret = 0;
- start &= PAGE_MASK;
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+ return -EINVAL;
+
+ start = untagged_addr(start) & PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
end = start + len;
- if (nr_pages <= 0)
+ if (end <= start)
return 0;
-
- if (unlikely(!access_ok(write ? VERIFY_WRITE : VERIFY_READ,
- (void __user *)start, len)))
+ if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- if (gup_fast_permitted(start, nr_pages, write)) {
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
+ gup_fast_permitted(start, end)) {
local_irq_disable();
- gup_pgd_range(addr, end, write, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags, pages, &nr);
local_irq_enable();
ret = nr;
}
@@ -1860,8 +2427,8 @@
start += nr << PAGE_SHIFT;
pages += nr;
- ret = get_user_pages_unlocked(start, nr_pages - nr, pages,
- write ? FOLL_WRITE : 0);
+ ret = __gup_longterm_unlocked(start, nr_pages - nr,
+ gup_flags, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
@@ -1874,5 +2441,4 @@
return ret;
}
-
-#endif /* CONFIG_HAVE_GENERIC_GUP */
+EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7405c9d..7dd602d 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -6,13 +6,17 @@
#include <linux/debugfs.h>
#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
+#define GUP_LONGTERM_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
+#define GUP_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
struct gup_benchmark {
- __u64 delta_usec;
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
__u64 addr;
__u64 size;
__u32 nr_pages_per_call;
__u32 flags;
+ __u64 expansion[10]; /* For future use */
};
static int __gup_benchmark_ioctl(unsigned int cmd,
@@ -23,6 +27,9 @@
int nr;
struct page **pages;
+ if (gup->size > ULONG_MAX)
+ return -EINVAL;
+
nr_pages = gup->size / PAGE_SIZE;
pages = kvcalloc(nr_pages, sizeof(void *), GFP_KERNEL);
if (!pages)
@@ -41,21 +48,41 @@
nr = (next - addr) / PAGE_SIZE;
}
- nr = get_user_pages_fast(addr, nr, gup->flags & 1, pages + i);
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ nr = get_user_pages_fast(addr, nr, gup->flags & 1,
+ pages + i);
+ break;
+ case GUP_LONGTERM_BENCHMARK:
+ nr = get_user_pages(addr, nr,
+ (gup->flags & 1) | FOLL_LONGTERM,
+ pages + i, NULL);
+ break;
+ case GUP_BENCHMARK:
+ nr = get_user_pages(addr, nr, gup->flags & 1, pages + i,
+ NULL);
+ break;
+ default:
+ return -1;
+ }
+
if (nr <= 0)
break;
i += nr;
}
end_time = ktime_get();
- gup->delta_usec = ktime_us_delta(end_time, start_time);
+ gup->get_delta_usec = ktime_us_delta(end_time, start_time);
gup->size = addr - gup->addr;
+ start_time = ktime_get();
for (i = 0; i < nr_pages; i++) {
if (!pages[i])
break;
put_page(pages[i]);
}
+ end_time = ktime_get();
+ gup->put_delta_usec = ktime_us_delta(end_time, start_time);
kvfree(pages);
return 0;
@@ -67,8 +94,14 @@
struct gup_benchmark gup;
int ret;
- if (cmd != GUP_FAST_BENCHMARK)
+ switch (cmd) {
+ case GUP_FAST_BENCHMARK:
+ case GUP_LONGTERM_BENCHMARK:
+ case GUP_BENCHMARK:
+ break;
+ default:
return -EINVAL;
+ }
if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
return -EFAULT;
@@ -90,12 +123,8 @@
static int gup_benchmark_init(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
- if (!ret)
- pr_warn("Failed to create gup_benchmark in debugfs");
+ debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
+ &gup_benchmark_fops);
return 0;
}
diff --git a/mm/highmem.c b/mm/highmem.c
index 59db322..107b10f 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -105,9 +105,8 @@
}
#endif
-unsigned long totalhigh_pages __read_mostly;
-EXPORT_SYMBOL(totalhigh_pages);
-
+atomic_long_t _totalhigh_pages __read_mostly;
+EXPORT_SYMBOL(_totalhigh_pages);
EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
diff --git a/mm/hmm.c b/mm/hmm.c
index 90193a7..902f5fa 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -1,23 +1,14 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Copyright 2013 Red Hat Inc.
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * Authors: Jérôme Glisse <jglisse@redhat.com>
+ * Authors: Jérôme Glisse <jglisse@redhat.com>
*/
/*
* Refer to include/linux/hmm.h for information about heterogeneous memory
* management or HMM for short.
*/
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hmm.h>
#include <linux/init.h>
#include <linux/rmap.h>
@@ -29,190 +20,140 @@
#include <linux/swapops.h>
#include <linux/hugetlb.h>
#include <linux/memremap.h>
+#include <linux/sched/mm.h>
#include <linux/jump_label.h>
+#include <linux/dma-mapping.h>
#include <linux/mmu_notifier.h>
#include <linux/memory_hotplug.h>
-#define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT)
-
-#if IS_ENABLED(CONFIG_HMM_MIRROR)
-static const struct mmu_notifier_ops hmm_mmu_notifier_ops;
-
-/*
- * struct hmm - HMM per mm struct
- *
- * @mm: mm struct this HMM struct is bound to
- * @lock: lock protecting ranges list
- * @sequence: we track updates to the CPU page table with a sequence number
- * @ranges: list of range being snapshotted
- * @mirrors: list of mirrors for this mm
- * @mmu_notifier: mmu notifier to track updates to CPU page table
- * @mirrors_sem: read/write semaphore protecting the mirrors list
- */
-struct hmm {
- struct mm_struct *mm;
- spinlock_t lock;
- atomic_t sequence;
- struct list_head ranges;
- struct list_head mirrors;
- struct mmu_notifier mmu_notifier;
- struct rw_semaphore mirrors_sem;
-};
-
-/*
- * hmm_register - register HMM against an mm (HMM internal)
- *
- * @mm: mm struct to attach to
- *
- * This is not intended to be used directly by device drivers. It allocates an
- * HMM struct if mm does not have one, and initializes it.
- */
-static struct hmm *hmm_register(struct mm_struct *mm)
+static struct mmu_notifier *hmm_alloc_notifier(struct mm_struct *mm)
{
- struct hmm *hmm = READ_ONCE(mm->hmm);
- bool cleanup = false;
+ struct hmm *hmm;
- /*
- * The hmm struct can only be freed once the mm_struct goes away,
- * hence we should always have pre-allocated an new hmm struct
- * above.
- */
- if (hmm)
- return hmm;
-
- hmm = kmalloc(sizeof(*hmm), GFP_KERNEL);
+ hmm = kzalloc(sizeof(*hmm), GFP_KERNEL);
if (!hmm)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+
+ init_waitqueue_head(&hmm->wq);
INIT_LIST_HEAD(&hmm->mirrors);
init_rwsem(&hmm->mirrors_sem);
- atomic_set(&hmm->sequence, 0);
- hmm->mmu_notifier.ops = NULL;
INIT_LIST_HEAD(&hmm->ranges);
- spin_lock_init(&hmm->lock);
- hmm->mm = mm;
+ spin_lock_init(&hmm->ranges_lock);
+ hmm->notifiers = 0;
+ return &hmm->mmu_notifier;
+}
- spin_lock(&mm->page_table_lock);
- if (!mm->hmm)
- mm->hmm = hmm;
- else
- cleanup = true;
- spin_unlock(&mm->page_table_lock);
+static void hmm_free_notifier(struct mmu_notifier *mn)
+{
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- if (cleanup)
- goto error;
-
- /*
- * We should only get here if hold the mmap_sem in write mode ie on
- * registration of first mirror through hmm_mirror_register()
- */
- hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops;
- if (__mmu_notifier_register(&hmm->mmu_notifier, mm))
- goto error_mm;
-
- return mm->hmm;
-
-error_mm:
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-error:
+ WARN_ON(!list_empty(&hmm->ranges));
+ WARN_ON(!list_empty(&hmm->mirrors));
kfree(hmm);
- return NULL;
-}
-
-void hmm_mm_destroy(struct mm_struct *mm)
-{
- kfree(mm->hmm);
-}
-
-static void hmm_invalidate_range(struct hmm *hmm,
- enum hmm_update_type action,
- unsigned long start,
- unsigned long end)
-{
- struct hmm_mirror *mirror;
- struct hmm_range *range;
-
- spin_lock(&hmm->lock);
- list_for_each_entry(range, &hmm->ranges, list) {
- unsigned long addr, idx, npages;
-
- if (end < range->start || start >= range->end)
- continue;
-
- range->valid = false;
- addr = max(start, range->start);
- idx = (addr - range->start) >> PAGE_SHIFT;
- npages = (min(range->end, end) - addr) >> PAGE_SHIFT;
- memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages);
- }
- spin_unlock(&hmm->lock);
-
- down_read(&hmm->mirrors_sem);
- list_for_each_entry(mirror, &hmm->mirrors, list)
- mirror->ops->sync_cpu_device_pagetables(mirror, action,
- start, end);
- up_read(&hmm->mirrors_sem);
}
static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm)
{
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
struct hmm_mirror *mirror;
- struct hmm *hmm = mm->hmm;
- down_write(&hmm->mirrors_sem);
- mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror,
- list);
- while (mirror) {
- list_del_init(&mirror->list);
- if (mirror->ops->release) {
- /*
- * Drop mirrors_sem so callback can wait on any pending
- * work that might itself trigger mmu_notifier callback
- * and thus would deadlock with us.
- */
- up_write(&hmm->mirrors_sem);
+ /*
+ * Since hmm_range_register() holds the mmget() lock hmm_release() is
+ * prevented as long as a range exists.
+ */
+ WARN_ON(!list_empty_careful(&hmm->ranges));
+
+ down_read(&hmm->mirrors_sem);
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ /*
+ * Note: The driver is not allowed to trigger
+ * hmm_mirror_unregister() from this thread.
+ */
+ if (mirror->ops->release)
mirror->ops->release(mirror);
- down_write(&hmm->mirrors_sem);
- }
- mirror = list_first_entry_or_null(&hmm->mirrors,
- struct hmm_mirror, list);
}
- up_write(&hmm->mirrors_sem);
+ up_read(&hmm->mirrors_sem);
+}
+
+static void notifiers_decrement(struct hmm *hmm)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
+ hmm->notifiers--;
+ if (!hmm->notifiers) {
+ struct hmm_range *range;
+
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (range->valid)
+ continue;
+ range->valid = true;
+ }
+ wake_up_all(&hmm->wq);
+ }
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
}
static int hmm_invalidate_range_start(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
- bool blockable)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm *hmm = mm->hmm;
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
+ struct hmm_mirror *mirror;
+ struct hmm_range *range;
+ unsigned long flags;
+ int ret = 0;
- VM_BUG_ON(!hmm);
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
+ hmm->notifiers++;
+ list_for_each_entry(range, &hmm->ranges, list) {
+ if (nrange->end < range->start || nrange->start >= range->end)
+ continue;
- atomic_inc(&hmm->sequence);
+ range->valid = false;
+ }
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
- return 0;
+ if (mmu_notifier_range_blockable(nrange))
+ down_read(&hmm->mirrors_sem);
+ else if (!down_read_trylock(&hmm->mirrors_sem)) {
+ ret = -EAGAIN;
+ goto out;
+ }
+
+ list_for_each_entry(mirror, &hmm->mirrors, list) {
+ int rc;
+
+ rc = mirror->ops->sync_cpu_device_pagetables(mirror, nrange);
+ if (rc) {
+ if (WARN_ON(mmu_notifier_range_blockable(nrange) ||
+ rc != -EAGAIN))
+ continue;
+ ret = -EAGAIN;
+ break;
+ }
+ }
+ up_read(&hmm->mirrors_sem);
+
+out:
+ if (ret)
+ notifiers_decrement(hmm);
+ return ret;
}
static void hmm_invalidate_range_end(struct mmu_notifier *mn,
- struct mm_struct *mm,
- unsigned long start,
- unsigned long end)
+ const struct mmu_notifier_range *nrange)
{
- struct hmm *hmm = mm->hmm;
+ struct hmm *hmm = container_of(mn, struct hmm, mmu_notifier);
- VM_BUG_ON(!hmm);
-
- hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end);
+ notifiers_decrement(hmm);
}
static const struct mmu_notifier_ops hmm_mmu_notifier_ops = {
.release = hmm_release,
.invalidate_range_start = hmm_invalidate_range_start,
.invalidate_range_end = hmm_invalidate_range_end,
+ .alloc_notifier = hmm_alloc_notifier,
+ .free_notifier = hmm_free_notifier,
};
/*
@@ -220,36 +161,35 @@
*
* @mirror: new mirror struct to register
* @mm: mm to register against
+ * Return: 0 on success, -ENOMEM if no memory, -EINVAL if invalid arguments
*
* To start mirroring a process address space, the device driver must register
* an HMM mirror struct.
*
- * THE mm->mmap_sem MUST BE HELD IN WRITE MODE !
+ * The caller cannot unregister the hmm_mirror while any ranges are
+ * registered.
+ *
+ * Callers using this function must put a call to mmu_notifier_synchronize()
+ * in their module exit functions.
*/
int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm)
{
+ struct mmu_notifier *mn;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
/* Sanity check */
if (!mm || !mirror || !mirror->ops)
return -EINVAL;
-again:
- mirror->hmm = hmm_register(mm);
- if (!mirror->hmm)
- return -ENOMEM;
+ mn = mmu_notifier_get_locked(&hmm_mmu_notifier_ops, mm);
+ if (IS_ERR(mn))
+ return PTR_ERR(mn);
+ mirror->hmm = container_of(mn, struct hmm, mmu_notifier);
down_write(&mirror->hmm->mirrors_sem);
- if (mirror->hmm->mm == NULL) {
- /*
- * A racing hmm_mirror_unregister() is about to destroy the hmm
- * struct. Try again to allocate a new one.
- */
- up_write(&mirror->hmm->mirrors_sem);
- mirror->hmm = NULL;
- goto again;
- } else {
- list_add(&mirror->list, &mirror->hmm->mirrors);
- up_write(&mirror->hmm->mirrors_sem);
- }
+ list_add(&mirror->list, &mirror->hmm->mirrors);
+ up_write(&mirror->hmm->mirrors_sem);
return 0;
}
@@ -258,69 +198,58 @@
/*
* hmm_mirror_unregister() - unregister a mirror
*
- * @mirror: new mirror struct to register
+ * @mirror: mirror struct to unregister
*
* Stop mirroring a process address space, and cleanup.
*/
void hmm_mirror_unregister(struct hmm_mirror *mirror)
{
- bool should_unregister = false;
- struct mm_struct *mm;
- struct hmm *hmm;
+ struct hmm *hmm = mirror->hmm;
- if (mirror->hmm == NULL)
- return;
-
- hmm = mirror->hmm;
down_write(&hmm->mirrors_sem);
- list_del_init(&mirror->list);
- should_unregister = list_empty(&hmm->mirrors);
- mirror->hmm = NULL;
- mm = hmm->mm;
- hmm->mm = NULL;
+ list_del(&mirror->list);
up_write(&hmm->mirrors_sem);
-
- if (!should_unregister || mm == NULL)
- return;
-
- mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm);
-
- spin_lock(&mm->page_table_lock);
- if (mm->hmm == hmm)
- mm->hmm = NULL;
- spin_unlock(&mm->page_table_lock);
-
- kfree(hmm);
+ mmu_notifier_put(&hmm->mmu_notifier);
}
EXPORT_SYMBOL(hmm_mirror_unregister);
struct hmm_vma_walk {
struct hmm_range *range;
+ struct dev_pagemap *pgmap;
unsigned long last;
- bool fault;
- bool block;
+ unsigned int flags;
};
static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr,
bool write_fault, uint64_t *pfn)
{
- unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE;
+ unsigned int flags = FAULT_FLAG_REMOTE;
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
vm_fault_t ret;
- flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY;
- flags |= write_fault ? FAULT_FLAG_WRITE : 0;
- ret = handle_mm_fault(vma, addr, flags);
- if (ret & VM_FAULT_RETRY)
- return -EBUSY;
- if (ret & VM_FAULT_ERROR) {
- *pfn = range->values[HMM_PFN_ERROR];
- return -EFAULT;
- }
+ if (!vma)
+ goto err;
- return -EAGAIN;
+ if (hmm_vma_walk->flags & HMM_FAULT_ALLOW_RETRY)
+ flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (write_fault)
+ flags |= FAULT_FLAG_WRITE;
+
+ ret = handle_mm_fault(vma, addr, flags);
+ if (ret & VM_FAULT_RETRY) {
+ /* Note, handle_mm_fault did up_read(&mm->mmap_sem)) */
+ return -EAGAIN;
+ }
+ if (ret & VM_FAULT_ERROR)
+ goto err;
+
+ return -EBUSY;
+
+err:
+ *pfn = range->values[HMM_PFN_ERROR];
+ return -EFAULT;
}
static int hmm_pfns_bad(unsigned long addr,
@@ -340,13 +269,13 @@
}
/*
- * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s)
- * @start: range virtual start address (inclusive)
+ * hmm_vma_walk_hole_() - handle a range lacking valid pmd or pte(s)
+ * @addr: range virtual start address (inclusive)
* @end: range virtual end address (exclusive)
* @fault: should we fault or not ?
* @write_fault: write fault ?
* @walk: mm_walk structure
- * Returns: 0 on success, -EAGAIN after page fault, or page fault error
+ * Return: 0 on success, -EBUSY after page fault, or page fault error
*
* This function will be called whenever pmd_none() or pte_none() returns true,
* or whenever there is no page directory covering the virtual address range.
@@ -362,6 +291,10 @@
hmm_vma_walk->last = addr;
i = (addr - range->start) >> PAGE_SHIFT;
+
+ if (write_fault && walk->vma && !(walk->vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
for (; addr < end; addr += PAGE_SIZE, i++) {
pfns[i] = range->values[HMM_PFN_NONE];
if (fault || write_fault) {
@@ -369,12 +302,12 @@
ret = hmm_vma_do_fault(walk, addr, write_fault,
&pfns[i]);
- if (ret != -EAGAIN)
+ if (ret != -EBUSY)
return ret;
}
}
- return (fault || write_fault) ? -EAGAIN : 0;
+ return (fault || write_fault) ? -EBUSY : 0;
}
static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
@@ -383,14 +316,25 @@
{
struct hmm_range *range = hmm_vma_walk->range;
- *fault = *write_fault = false;
- if (!hmm_vma_walk->fault)
+ if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT)
return;
+ /*
+ * So we not only consider the individual per page request we also
+ * consider the default flags requested for the range. The API can
+ * be used 2 ways. The first one where the HMM user coalesces
+ * multiple page faults into one request and sets flags per pfn for
+ * those faults. The second one where the HMM user wants to pre-
+ * fault a range with specific flags. For the latter one it is a
+ * waste to have the user pre-fill the pfn arrays with a default
+ * flags value.
+ */
+ pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+
/* We aren't ask to do anything ... */
if (!(pfns & range->flags[HMM_PFN_VALID]))
return;
- /* If this is device memory than only fault if explicitly requested */
+ /* If this is device memory then only fault if explicitly requested */
if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) {
/* Do we fault on device memory ? */
if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) {
@@ -417,15 +361,16 @@
{
unsigned long i;
- if (!hmm_vma_walk->fault) {
+ if (hmm_vma_walk->flags & HMM_FAULT_SNAPSHOT) {
*fault = *write_fault = false;
return;
}
+ *fault = *write_fault = false;
for (i = 0; i < npages; ++i) {
hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags,
fault, write_fault);
- if ((*fault) || (*write_fault))
+ if ((*write_fault))
return;
}
}
@@ -456,11 +401,9 @@
range->flags[HMM_PFN_VALID];
}
-static int hmm_vma_handle_pmd(struct mm_walk *walk,
- unsigned long addr,
- unsigned long end,
- uint64_t *pfns,
- pmd_t pmd)
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, uint64_t *pfns, pmd_t pmd)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
@@ -476,16 +419,32 @@
if (pmd_protnone(pmd) || fault || write_fault)
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
- pfn = pmd_pfn(pmd) + pte_index(addr);
- for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags;
+ pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
+ for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) {
+ if (pmd_devmap(pmd)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ }
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
hmm_vma_walk->last = end;
return 0;
}
+#else /* CONFIG_TRANSPARENT_HUGEPAGE */
+/* stub to allow the code below to compile */
+int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
+ unsigned long end, uint64_t *pfns, pmd_t pmd);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
{
- if (pte_none(pte) || !pte_present(pte))
+ if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
return 0;
return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
range->flags[HMM_PFN_WRITE] :
@@ -498,18 +457,17 @@
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- struct vm_area_struct *vma = walk->vma;
bool fault, write_fault;
uint64_t cpu_flags;
pte_t pte = *ptep;
uint64_t orig_pfn = *pfn;
*pfn = range->values[HMM_PFN_NONE];
- cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
- &fault, &write_fault);
+ fault = write_fault = false;
if (pte_none(pte)) {
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -519,6 +477,9 @@
swp_entry_t entry = pte_to_swp_entry(pte);
if (!non_swap_entry(entry)) {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
if (fault || write_fault)
goto fault;
return 0;
@@ -537,7 +498,8 @@
&fault, &write_fault);
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, swp_offset(entry));
+ *pfn = hmm_device_entry_from_pfn(range,
+ swp_offset(entry));
*pfn |= cpu_flags;
return 0;
}
@@ -546,9 +508,8 @@
if (fault || write_fault) {
pte_unmap(ptep);
hmm_vma_walk->last = addr;
- migration_entry_wait(vma->vm_mm,
- pmdp, addr);
- return -EAGAIN;
+ migration_entry_wait(walk->mm, pmdp, addr);
+ return -EBUSY;
}
return 0;
}
@@ -556,15 +517,33 @@
/* Report error for everything else */
*pfn = range->values[HMM_PFN_ERROR];
return -EFAULT;
+ } else {
+ cpu_flags = pte_to_hmm_pfn_flags(range, pte);
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
}
if (fault || write_fault)
goto fault;
- *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ if (pte_devmap(pte)) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pte_pfn(pte),
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ } else if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && pte_special(pte)) {
+ *pfn = range->values[HMM_PFN_SPECIAL];
+ return -EFAULT;
+ }
+
+ *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
return 0;
fault:
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep);
/* Fault any virtual address we were asked to fault */
return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
@@ -580,25 +559,40 @@
uint64_t *pfns = range->pfns;
unsigned long addr = start, i;
pte_t *ptep;
-
- i = (addr - range->start) >> PAGE_SHIFT;
+ pmd_t pmd;
again:
- if (pmd_none(*pmdp))
+ pmd = READ_ONCE(*pmdp);
+ if (pmd_none(pmd))
return hmm_vma_walk_hole(start, end, walk);
- if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB))
+ if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
+ bool fault, write_fault;
+ unsigned long npages;
+ uint64_t *pfns;
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ 0, &fault, &write_fault);
+ if (fault || write_fault) {
+ hmm_vma_walk->last = addr;
+ pmd_migration_entry_wait(walk->mm, pmdp);
+ return -EBUSY;
+ }
+ return 0;
+ } else if (!pmd_present(pmd))
return hmm_pfns_bad(start, end, walk);
- if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) {
- pmd_t pmd;
-
+ if (pmd_devmap(pmd) || pmd_trans_huge(pmd)) {
/*
- * No need to take pmd_lock here, even if some other threads
+ * No need to take pmd_lock here, even if some other thread
* is splitting the huge pmd we will get that event through
* mmu_notifier callback.
*
- * So just read pmd value and check again its a transparent
+ * So just read pmd value and check again it's a transparent
* huge or device mapping one and compute corresponding pfn
* values.
*/
@@ -607,13 +601,21 @@
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
+ i = (addr - range->start) >> PAGE_SHIFT;
return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd);
}
- if (pmd_bad(*pmdp))
+ /*
+ * We have handled all the valid cases above ie either none, migration,
+ * huge or transparent huge. At this point either it is a valid pmd
+ * entry pointing to pte directory or it is a bad pmd that will not
+ * recover.
+ */
+ if (pmd_bad(pmd))
return hmm_pfns_bad(start, end, walk);
ptep = pte_offset_map(pmdp, addr);
+ i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, ptep++, i++) {
int r;
@@ -624,12 +626,150 @@
return r;
}
}
+ if (hmm_vma_walk->pgmap) {
+ /*
+ * We do put_dev_pagemap() here and not in hmm_vma_handle_pte()
+ * so that we can leverage get_dev_pagemap() optimization which
+ * will not re-take a reference on a pgmap if we already have
+ * one.
+ */
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
pte_unmap(ptep - 1);
hmm_vma_walk->last = addr;
return 0;
}
+#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+{
+ if (!pud_present(pud))
+ return 0;
+ return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
+ range->flags[HMM_PFN_WRITE] :
+ range->flags[HMM_PFN_VALID];
+}
+
+static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ unsigned long addr = start, next;
+ pmd_t *pmdp;
+ pud_t pud;
+ int ret;
+
+again:
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ if (pud_huge(pud) && pud_devmap(pud)) {
+ unsigned long i, npages, pfn;
+ uint64_t *pfns, cpu_flags;
+ bool fault, write_fault;
+
+ if (!pud_present(pud))
+ return hmm_vma_walk_hole(start, end, walk);
+
+ i = (addr - range->start) >> PAGE_SHIFT;
+ npages = (end - addr) >> PAGE_SHIFT;
+ pfns = &range->pfns[i];
+
+ cpu_flags = pud_to_hmm_pfn_flags(range, pud);
+ hmm_range_need_fault(hmm_vma_walk, pfns, npages,
+ cpu_flags, &fault, &write_fault);
+ if (fault || write_fault)
+ return hmm_vma_walk_hole_(addr, end, fault,
+ write_fault, walk);
+
+ pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
+ for (i = 0; i < npages; ++i, ++pfn) {
+ hmm_vma_walk->pgmap = get_dev_pagemap(pfn,
+ hmm_vma_walk->pgmap);
+ if (unlikely(!hmm_vma_walk->pgmap))
+ return -EBUSY;
+ pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ }
+ if (hmm_vma_walk->pgmap) {
+ put_dev_pagemap(hmm_vma_walk->pgmap);
+ hmm_vma_walk->pgmap = NULL;
+ }
+ hmm_vma_walk->last = end;
+ return 0;
+ }
+
+ split_huge_pud(walk->vma, pudp, addr);
+ if (pud_none(*pudp))
+ goto again;
+
+ pmdp = pmd_offset(pudp, addr);
+ do {
+ next = pmd_addr_end(addr, end);
+ ret = hmm_vma_walk_pmd(pmdp, addr, next, walk);
+ if (ret)
+ return ret;
+ } while (pmdp++, addr = next, addr != end);
+
+ return 0;
+}
+#else
+#define hmm_vma_walk_pud NULL
+#endif
+
+#ifdef CONFIG_HUGETLB_PAGE
+static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
+ unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ unsigned long addr = start, i, pfn;
+ struct hmm_vma_walk *hmm_vma_walk = walk->private;
+ struct hmm_range *range = hmm_vma_walk->range;
+ struct vm_area_struct *vma = walk->vma;
+ uint64_t orig_pfn, cpu_flags;
+ bool fault, write_fault;
+ spinlock_t *ptl;
+ pte_t entry;
+ int ret = 0;
+
+ ptl = huge_pte_lock(hstate_vma(vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
+
+ i = (start - range->start) >> PAGE_SHIFT;
+ orig_pfn = range->pfns[i];
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry);
+ fault = write_fault = false;
+ hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags,
+ &fault, &write_fault);
+ if (fault || write_fault) {
+ ret = -ENOENT;
+ goto unlock;
+ }
+
+ pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
+ for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
+ range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
+ cpu_flags;
+ hmm_vma_walk->last = end;
+
+unlock:
+ spin_unlock(ptl);
+
+ if (ret == -ENOENT)
+ return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk);
+
+ return ret;
+}
+#else
+#define hmm_vma_walk_hugetlb_entry NULL
+#endif /* CONFIG_HUGETLB_PAGE */
+
static void hmm_pfns_clear(struct hmm_range *range,
uint64_t *pfns,
unsigned long addr,
@@ -639,789 +779,320 @@
*pfns = range->values[HMM_PFN_NONE];
}
-static void hmm_pfns_special(struct hmm_range *range)
-{
- unsigned long addr = range->start, i = 0;
-
- for (; addr < range->end; addr += PAGE_SIZE, i++)
- range->pfns[i] = range->values[HMM_PFN_SPECIAL];
-}
-
/*
- * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses
- * @range: range being snapshotted
- * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid
- * vma permission, 0 success
+ * hmm_range_register() - start tracking change to CPU page table over a range
+ * @range: range
+ * @mirror: the HMM mirror whose address space (mm) the range belongs to
*
- * This snapshots the CPU page table for a range of virtual addresses. Snapshot
- * validity is tracked by range struct. See hmm_vma_range_done() for further
- * information.
+ * Return: 0 on success, -EINVAL for a bad range, -EFAULT if the mm is gone
*
- * The range struct is initialized here. It tracks the CPU page table, but only
- * if the function returns success (0), in which case the caller must then call
- * hmm_vma_range_done() to stop CPU page table update tracking on this range.
- *
- * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS
- * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED !
+ * Track updates to the CPU page table; see include/linux/hmm.h.
*/
-int hmm_vma_get_pfns(struct hmm_range *range)
+int hmm_range_register(struct hmm_range *range, struct hmm_mirror *mirror)
{
- struct vm_area_struct *vma = range->vma;
- struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ struct hmm *hmm = mirror->hmm;
+ unsigned long flags;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
+ range->valid = false;
+ range->hmm = NULL;
+
+ if ((range->start & (PAGE_SIZE - 1)) || (range->end & (PAGE_SIZE - 1)))
return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
+ if (range->start >= range->end)
return -EINVAL;
- hmm = hmm_register(vma->vm_mm);
- if (!hmm)
- return -ENOMEM;
- /* Caller must have registered a mirror, via hmm_mirror_register() ! */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
+ /* Prevent hmm_release() from running while the range is valid */
+ if (!mmget_not_zero(hmm->mmu_notifier.mm))
+ return -EFAULT;
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
+ /* Initialize range to track CPU page table updates. */
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
- }
+ range->hmm = hmm;
+ list_add(&range->list, &hmm->ranges);
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
+ /*
+ * If there are any concurrent notifiers we have to wait for them for
+ * the range to be valid (see hmm_range_wait_until_valid()).
+ */
+ if (!hmm->notifiers)
+ range->valid = true;
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
- hmm_vma_walk.fault = false;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
-
- walk_page_range(range->start, range->end, &mm_walk);
return 0;
}
-EXPORT_SYMBOL(hmm_vma_get_pfns);
+EXPORT_SYMBOL(hmm_range_register);
/*
- * hmm_vma_range_done() - stop tracking change to CPU page table over a range
- * @range: range being tracked
- * Returns: false if range data has been invalidated, true otherwise
+ * hmm_range_unregister() - stop tracking change to CPU page table over a range
+ * @range: range
*
* Range struct is used to track updates to the CPU page table after a call to
- * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done
- * using the data, or wants to lock updates to the data it got from those
- * functions, it must call the hmm_vma_range_done() function, which will then
- * stop tracking CPU page table updates.
- *
- * Note that device driver must still implement general CPU page table update
- * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using
- * the mmu_notifier API directly.
- *
- * CPU page table update tracking done through hmm_range is only temporary and
- * to be used while trying to duplicate CPU page table contents for a range of
- * virtual addresses.
- *
- * There are two ways to use this :
- * again:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * trans = device_build_page_table_update_transaction(pfns);
- * device_page_table_lock();
- * if (!hmm_vma_range_done(range)) {
- * device_page_table_unlock();
- * goto again;
- * }
- * device_commit_transaction(trans);
- * device_page_table_unlock();
- *
- * Or:
- * hmm_vma_get_pfns(range); or hmm_vma_fault(...);
- * device_page_table_lock();
- * hmm_vma_range_done(range);
- * device_update_page_table(range->pfns);
- * device_page_table_unlock();
+ * hmm_range_register(). See include/linux/hmm.h for how to use it.
*/
-bool hmm_vma_range_done(struct hmm_range *range)
+void hmm_range_unregister(struct hmm_range *range)
{
- unsigned long npages = (range->end - range->start) >> PAGE_SHIFT;
- struct hmm *hmm;
+ struct hmm *hmm = range->hmm;
+ unsigned long flags;
- if (range->end <= range->start) {
- BUG();
- return false;
- }
+ spin_lock_irqsave(&hmm->ranges_lock, flags);
+ list_del_init(&range->list);
+ spin_unlock_irqrestore(&hmm->ranges_lock, flags);
- hmm = hmm_register(range->vma->vm_mm);
- if (!hmm) {
- memset(range->pfns, 0, sizeof(*range->pfns) * npages);
- return false;
- }
+ /* Drop reference taken by hmm_range_register() */
+ mmput(hmm->mmu_notifier.mm);
- spin_lock(&hmm->lock);
- list_del_rcu(&range->list);
- spin_unlock(&hmm->lock);
-
- return range->valid;
+ /*
+ * The range is now invalid and the ref on the hmm is dropped, so
+ * poison the pointer. Leave other fields in place, for the caller's
+ * use.
+ */
+ range->valid = false;
+ memset(&range->hmm, POISON_INUSE, sizeof(range->hmm));
}
-EXPORT_SYMBOL(hmm_vma_range_done);
+EXPORT_SYMBOL(hmm_range_unregister);
-/*
- * hmm_vma_fault() - try to fault some address in a virtual address range
- * @range: range being faulted
- * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem)
- * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop)
+static const struct mm_walk_ops hmm_walk_ops = {
+ .pud_entry = hmm_vma_walk_pud,
+ .pmd_entry = hmm_vma_walk_pmd,
+ .pte_hole = hmm_vma_walk_hole,
+ .hugetlb_entry = hmm_vma_walk_hugetlb_entry,
+};
+
+/**
+ * hmm_range_fault - try to fault some address in a virtual address range
+ * @range: range being faulted
+ * @flags: HMM_FAULT_* flags
+ *
+ * Return: the number of valid pages in range->pfns[] (from range start
+ * address), which may be zero. On error one of the following status codes
+ * can be returned:
+ *
+ * -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
+ * (e.g., device file vma).
+ * -ENOMEM: Out of memory.
+ * -EPERM: Invalid permission (e.g., asking for write and range is read
+ * only).
+ * -EAGAIN: A page fault needs to be retried and mmap_sem was dropped.
+ * -EBUSY: The range has been invalidated and the caller needs to wait for
+ * the invalidation to finish.
+ * -EFAULT: Invalid range (i.e., either there is no valid vma or it is
+ * illegal to access that range, e.g., a VM_IO, VM_PFNMAP or
+ * VM_MIXEDMAP vma).
*
* This is similar to a regular CPU page fault except that it will not trigger
- * any memory migration if the memory being faulted is not accessible by CPUs.
+ * any memory migration if the memory being faulted is not accessible by CPUs
+ * and caller does not ask for migration.
*
* On error, for one virtual address in the range, the function will mark the
* corresponding HMM pfn entry with an error flag.
- *
- * Expected use pattern:
- * retry:
- * down_read(&mm->mmap_sem);
- * // Find vma and address device wants to fault, initialize hmm_pfn_t
- * // array accordingly
- * ret = hmm_vma_fault(range, write, block);
- * switch (ret) {
- * case -EAGAIN:
- * hmm_vma_range_done(range);
- * // You might want to rate limit or yield to play nicely, you may
- * // also commit any valid pfn in the array assuming that you are
- * // getting true from hmm_vma_range_monitor_end()
- * goto retry;
- * case 0:
- * break;
- * case -ENOMEM:
- * case -EINVAL:
- * case -EPERM:
- * default:
- * // Handle error !
- * up_read(&mm->mmap_sem)
- * return;
- * }
- * // Take device driver lock that serialize device page table update
- * driver_lock_device_page_table_update();
- * hmm_vma_range_done(range);
- * // Commit pfns we got from hmm_vma_fault()
- * driver_unlock_device_page_table_update();
- * up_read(&mm->mmap_sem)
- *
- * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0)
- * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION !
- *
- * YOU HAVE BEEN WARNED !
*/
-int hmm_vma_fault(struct hmm_range *range, bool block)
+long hmm_range_fault(struct hmm_range *range, unsigned int flags)
{
- struct vm_area_struct *vma = range->vma;
- unsigned long start = range->start;
+ const unsigned long device_vma = VM_IO | VM_PFNMAP | VM_MIXEDMAP;
+ unsigned long start = range->start, end;
struct hmm_vma_walk hmm_vma_walk;
- struct mm_walk mm_walk;
- struct hmm *hmm;
+ struct hmm *hmm = range->hmm;
+ struct vm_area_struct *vma;
int ret;
- /* Sanity check, this really should not happen ! */
- if (range->start < vma->vm_start || range->start >= vma->vm_end)
- return -EINVAL;
- if (range->end < vma->vm_start || range->end > vma->vm_end)
- return -EINVAL;
-
- hmm = hmm_register(vma->vm_mm);
- if (!hmm) {
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -ENOMEM;
- }
- /* Caller must have registered a mirror using hmm_mirror_register() */
- if (!hmm->mmu_notifier.ops)
- return -EINVAL;
-
- /* FIXME support hugetlb fs */
- if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma)) {
- hmm_pfns_special(range);
- return -EINVAL;
- }
-
- if (!(vma->vm_flags & VM_READ)) {
- /*
- * If vma do not allow read access, then assume that it does
- * not allow write access, either. Architecture that allow
- * write without read access are not supported by HMM, because
- * operations such has atomic access would not work.
- */
- hmm_pfns_clear(range, range->pfns, range->start, range->end);
- return -EPERM;
- }
-
- /* Initialize range to track CPU page table update */
- spin_lock(&hmm->lock);
- range->valid = true;
- list_add_rcu(&range->list, &hmm->ranges);
- spin_unlock(&hmm->lock);
-
- hmm_vma_walk.fault = true;
- hmm_vma_walk.block = block;
- hmm_vma_walk.range = range;
- mm_walk.private = &hmm_vma_walk;
- hmm_vma_walk.last = range->start;
-
- mm_walk.vma = vma;
- mm_walk.mm = vma->vm_mm;
- mm_walk.pte_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.pmd_entry = hmm_vma_walk_pmd;
- mm_walk.pte_hole = hmm_vma_walk_hole;
+ lockdep_assert_held(&hmm->mmu_notifier.mm->mmap_sem);
do {
- ret = walk_page_range(start, range->end, &mm_walk);
- start = hmm_vma_walk.last;
- } while (ret == -EAGAIN);
+ /* If range is no longer valid force retry. */
+ if (!range->valid)
+ return -EBUSY;
- if (ret) {
- unsigned long i;
+ vma = find_vma(hmm->mmu_notifier.mm, start);
+ if (vma == NULL || (vma->vm_flags & device_vma))
+ return -EFAULT;
- i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
- hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last,
- range->end);
- hmm_vma_range_done(range);
- }
- return ret;
-}
-EXPORT_SYMBOL(hmm_vma_fault);
-#endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */
-
-
-#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC)
-struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma,
- unsigned long addr)
-{
- struct page *page;
-
- page = alloc_page_vma(GFP_HIGHUSER, vma, addr);
- if (!page)
- return NULL;
- lock_page(page);
- return page;
-}
-EXPORT_SYMBOL(hmm_vma_alloc_locked_page);
-
-
-static void hmm_devmem_ref_release(struct percpu_ref *ref)
-{
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
- complete(&devmem->completion);
-}
-
-static void hmm_devmem_ref_exit(void *data)
-{
- struct percpu_ref *ref = data;
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
- percpu_ref_exit(ref);
- devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data);
-}
-
-static void hmm_devmem_ref_kill(void *data)
-{
- struct percpu_ref *ref = data;
- struct hmm_devmem *devmem;
-
- devmem = container_of(ref, struct hmm_devmem, ref);
- percpu_ref_kill(ref);
- wait_for_completion(&devmem->completion);
- devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data);
-}
-
-static int hmm_devmem_fault(struct vm_area_struct *vma,
- unsigned long addr,
- const struct page *page,
- unsigned int flags,
- pmd_t *pmdp)
-{
- struct hmm_devmem *devmem = page->pgmap->data;
-
- return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp);
-}
-
-static void hmm_devmem_free(struct page *page, void *data)
-{
- struct hmm_devmem *devmem = data;
-
- page->mapping = NULL;
-
- devmem->ops->free(devmem, page);
-}
-
-static DEFINE_MUTEX(hmm_devmem_lock);
-static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL);
-
-static void hmm_devmem_radix_release(struct resource *resource)
-{
- resource_size_t key;
-
- mutex_lock(&hmm_devmem_lock);
- for (key = resource->start;
- key <= resource->end;
- key += PA_SECTION_SIZE)
- radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT);
- mutex_unlock(&hmm_devmem_lock);
-}
-
-static void hmm_devmem_release(struct device *dev, void *data)
-{
- struct hmm_devmem *devmem = data;
- struct resource *resource = devmem->resource;
- unsigned long start_pfn, npages;
- struct zone *zone;
- struct page *page;
-
- if (percpu_ref_tryget_live(&devmem->ref)) {
- dev_WARN(dev, "%s: page mapping is still live!\n", __func__);
- percpu_ref_put(&devmem->ref);
- }
-
- /* pages are dead and unused, undo the arch mapping */
- start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT;
- npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT;
-
- page = pfn_to_page(start_pfn);
- zone = page_zone(page);
-
- mem_hotplug_begin();
- if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY)
- __remove_pages(zone, start_pfn, npages, NULL);
- else
- arch_remove_memory(start_pfn << PAGE_SHIFT,
- npages << PAGE_SHIFT, NULL);
- mem_hotplug_done();
-
- hmm_devmem_radix_release(resource);
-}
-
-static int hmm_devmem_pages_create(struct hmm_devmem *devmem)
-{
- resource_size_t key, align_start, align_size, align_end;
- struct device *device = devmem->device;
- int ret, nid, is_ram;
- unsigned long pfn;
-
- align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1);
- align_size = ALIGN(devmem->resource->start +
- resource_size(devmem->resource),
- PA_SECTION_SIZE) - align_start;
-
- is_ram = region_intersects(align_start, align_size,
- IORESOURCE_SYSTEM_RAM,
- IORES_DESC_NONE);
- if (is_ram == REGION_MIXED) {
- WARN_ONCE(1, "%s attempted on mixed region %pr\n",
- __func__, devmem->resource);
- return -ENXIO;
- }
- if (is_ram == REGION_INTERSECTS)
- return -ENXIO;
-
- if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY)
- devmem->pagemap.type = MEMORY_DEVICE_PUBLIC;
- else
- devmem->pagemap.type = MEMORY_DEVICE_PRIVATE;
-
- devmem->pagemap.res = *devmem->resource;
- devmem->pagemap.page_fault = hmm_devmem_fault;
- devmem->pagemap.page_free = hmm_devmem_free;
- devmem->pagemap.dev = devmem->device;
- devmem->pagemap.ref = &devmem->ref;
- devmem->pagemap.data = devmem;
-
- mutex_lock(&hmm_devmem_lock);
- align_end = align_start + align_size - 1;
- for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) {
- struct hmm_devmem *dup;
-
- dup = radix_tree_lookup(&hmm_devmem_radix,
- key >> PA_SECTION_SHIFT);
- if (dup) {
- dev_err(device, "%s: collides with mapping for %s\n",
- __func__, dev_name(dup->device));
- mutex_unlock(&hmm_devmem_lock);
- ret = -EBUSY;
- goto error;
+ if (!(vma->vm_flags & VM_READ)) {
+ /*
+ * If the vma does not allow read access, then assume that
+ * it does not allow write access either. HMM does not
+ * support architectures that allow write without read.
+ */
+ hmm_pfns_clear(range, range->pfns,
+ range->start, range->end);
+ return -EPERM;
}
- ret = radix_tree_insert(&hmm_devmem_radix,
- key >> PA_SECTION_SHIFT,
- devmem);
+
+ hmm_vma_walk.pgmap = NULL;
+ hmm_vma_walk.last = start;
+ hmm_vma_walk.flags = flags;
+ hmm_vma_walk.range = range;
+ end = min(range->end, vma->vm_end);
+
+ walk_page_range(vma->vm_mm, start, end, &hmm_walk_ops,
+ &hmm_vma_walk);
+
+ do {
+ ret = walk_page_range(vma->vm_mm, start, end,
+ &hmm_walk_ops, &hmm_vma_walk);
+ start = hmm_vma_walk.last;
+
+ /* Keep trying while the range is valid. */
+ } while (ret == -EBUSY && range->valid);
+
if (ret) {
- dev_err(device, "%s: failed: %d\n", __func__, ret);
- mutex_unlock(&hmm_devmem_lock);
- goto error_radix;
+ unsigned long i;
+
+ i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ hmm_pfns_clear(range, &range->pfns[i],
+ hmm_vma_walk.last, range->end);
+ return ret;
}
- }
- mutex_unlock(&hmm_devmem_lock);
+ start = end;
- nid = dev_to_node(device);
- if (nid < 0)
- nid = numa_mem_id();
+ } while (start < range->end);
- mem_hotplug_begin();
- /*
- * For device private memory we call add_pages() as we only need to
- * allocate and initialize struct page for the device memory. More-
- * over the device memory is un-accessible thus we do not want to
- * create a linear mapping for the memory like arch_add_memory()
- * would do.
- *
- * For device public memory, which is accesible by the CPU, we do
- * want the linear mapping and thus use arch_add_memory().
- */
- if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC)
- ret = arch_add_memory(nid, align_start, align_size, NULL,
- false);
- else
- ret = add_pages(nid, align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL, false);
- if (ret) {
- mem_hotplug_done();
- goto error_add_memory;
- }
- move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
- align_start >> PAGE_SHIFT,
- align_size >> PAGE_SHIFT, NULL);
- mem_hotplug_done();
-
- for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) {
- struct page *page = pfn_to_page(pfn);
-
- page->pgmap = &devmem->pagemap;
- }
- return 0;
-
-error_add_memory:
- untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
-error_radix:
- hmm_devmem_radix_release(devmem->resource);
-error:
- return ret;
+ return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
}
+EXPORT_SYMBOL(hmm_range_fault);
-static int hmm_devmem_match(struct device *dev, void *data, void *match_data)
-{
- struct hmm_devmem *devmem = data;
-
- return devmem->resource == match_data;
-}
-
-static void hmm_devmem_pages_remove(struct hmm_devmem *devmem)
-{
- devres_release(devmem->device, &hmm_devmem_release,
- &hmm_devmem_match, devmem->resource);
-}
-
-/*
- * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory
+/**
+ * hmm_range_dma_map - hmm_range_fault() and dma map page all in one.
+ * @range: range being faulted
+ * @device: device to map page to
+ * @daddrs: array of dma addresses for the mapped pages
+ * @flags: HMM_FAULT_*
*
- * @ops: memory event device driver callback (see struct hmm_devmem_ops)
- * @device: device struct to bind the resource too
- * @size: size in bytes of the device memory to add
- * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise
- *
- * This function first finds an empty range of physical address big enough to
- * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which
- * in turn allocates struct pages. It does not do anything beyond that; all
- * events affecting the memory will go through the various callbacks provided
- * by hmm_devmem_ops struct.
- *
- * Device driver should call this function during device initialization and
- * is then responsible of memory management. HMM only provides helpers.
+ * Return: the number of pages mapped on success (including zero), or any
+ * status return from hmm_range_fault() otherwise.
*/
-struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops,
- struct device *device,
- unsigned long size)
+long hmm_range_dma_map(struct hmm_range *range, struct device *device,
+ dma_addr_t *daddrs, unsigned int flags)
{
- struct hmm_devmem *devmem;
- resource_size_t addr;
- int ret;
+ unsigned long i, npages, mapped;
+ long ret;
- dev_pagemap_get_ops();
+ ret = hmm_range_fault(range, flags);
+ if (ret <= 0)
+ return ret ? ret : -EBUSY;
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
- if (!devmem)
- return ERR_PTR(-ENOMEM);
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0, mapped = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- init_completion(&devmem->completion);
- devmem->pfn_first = -1UL;
- devmem->pfn_last = -1UL;
- devmem->resource = NULL;
- devmem->device = device;
- devmem->ops = ops;
+ /*
+ * FIXME: the DMA API should provide an invalid DMA address
+ * value instead of a function to test a DMA address. This
+ * would remove a lot of code duplicated across many archs.
+ *
+ * For now setting it to 0 here is good enough, as the pfns[]
+ * value is what is used to check what is valid and what isn't.
+ */
+ daddrs[i] = 0;
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
- 0, GFP_KERNEL);
- if (ret)
- goto error_percpu_ref;
-
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
- if (ret)
- goto error_devm_add_action;
-
- size = ALIGN(size, PA_SECTION_SIZE);
- addr = min((unsigned long)iomem_resource.end,
- (1UL << MAX_PHYSMEM_BITS) - 1);
- addr = addr - size + 1UL;
-
- /*
- * FIXME add a new helper to quickly walk resource tree and find free
- * range
- *
- * FIXME what about ioport_resource resource ?
- */
- for (; addr > size && addr >= iomem_resource.start; addr -= size) {
- ret = region_intersects(addr, size, 0, IORES_DESC_NONE);
- if (ret != REGION_DISJOINT)
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
continue;
- devmem->resource = devm_request_mem_region(device, addr, size,
- dev_name(device));
- if (!devmem->resource) {
- ret = -ENOMEM;
- goto error_no_resource;
+ /* Check if range is being invalidated */
+ if (!range->valid) {
+ ret = -EBUSY;
+ goto unmap;
}
- break;
- }
- if (!devmem->resource) {
- ret = -ERANGE;
- goto error_no_resource;
+
+ /* If it is read and write then map bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ daddrs[i] = dma_map_page(device, page, 0, PAGE_SIZE, dir);
+ if (dma_mapping_error(device, daddrs[i])) {
+ ret = -EFAULT;
+ goto unmap;
+ }
+
+ mapped++;
}
- devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY;
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
- devmem->pfn_last = devmem->pfn_first +
- (resource_size(devmem->resource) >> PAGE_SHIFT);
+ return mapped;
- ret = hmm_devmem_pages_create(devmem);
- if (ret)
- goto error_pages;
+unmap:
+ for (npages = i, i = 0; (i < npages) && mapped; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- devres_add(device, devmem);
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
- return ERR_PTR(ret);
+ if (dma_mapping_error(device, daddrs[i]))
+ continue;
+
+ /* If it is read and write then it was mapped bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE])
+ dir = DMA_BIDIRECTIONAL;
+
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ mapped--;
}
- return devmem;
-
-error_pages:
- devm_release_mem_region(device, devmem->resource->start,
- resource_size(devmem->resource));
-error_no_resource:
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
+ return ret;
}
-EXPORT_SYMBOL(hmm_devmem_add);
+EXPORT_SYMBOL(hmm_range_dma_map);
-struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops,
- struct device *device,
- struct resource *res)
-{
- struct hmm_devmem *devmem;
- int ret;
-
- if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY)
- return ERR_PTR(-EINVAL);
-
- dev_pagemap_get_ops();
-
- devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem),
- GFP_KERNEL, dev_to_node(device));
- if (!devmem)
- return ERR_PTR(-ENOMEM);
-
- init_completion(&devmem->completion);
- devmem->pfn_first = -1UL;
- devmem->pfn_last = -1UL;
- devmem->resource = res;
- devmem->device = device;
- devmem->ops = ops;
-
- ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release,
- 0, GFP_KERNEL);
- if (ret)
- goto error_percpu_ref;
-
- ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref);
- if (ret)
- goto error_devm_add_action;
-
-
- devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT;
- devmem->pfn_last = devmem->pfn_first +
- (resource_size(devmem->resource) >> PAGE_SHIFT);
-
- ret = hmm_devmem_pages_create(devmem);
- if (ret)
- goto error_devm_add_action;
-
- devres_add(device, devmem);
-
- ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref);
- if (ret) {
- hmm_devmem_remove(devmem);
- return ERR_PTR(ret);
- }
-
- return devmem;
-
-error_devm_add_action:
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
-error_percpu_ref:
- devres_free(devmem);
- return ERR_PTR(ret);
-}
-EXPORT_SYMBOL(hmm_devmem_add_resource);
-
-/*
- * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE)
+/**
+ * hmm_range_dma_unmap() - unmap a range that was mapped with hmm_range_dma_map()
+ * @range: range being unmapped
+ * @device: device against which dma map was done
+ * @daddrs: dma address of mapped pages
+ * @dirty: dirty page if it had the write flag set
+ * Return: number of pages unmapped on success, -EINVAL otherwise
*
- * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory
- *
- * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf
- * of the device driver. It will free struct page and remove the resource that
- * reserved the physical address range for this device memory.
+ * Note that the caller MUST abide by mmu notifiers, or use an HMM mirror and
+ * abide by its sync_cpu_device_pagetables() callback, so that it is safe to
+ * call set_page_dirty() here. The caller must also hold appropriate locks so
+ * that no concurrent mmu notifier or sync_cpu_device_pagetables() can progress.
*/
-void hmm_devmem_remove(struct hmm_devmem *devmem)
+long hmm_range_dma_unmap(struct hmm_range *range,
+ struct device *device,
+ dma_addr_t *daddrs,
+ bool dirty)
{
- resource_size_t start, size;
- struct device *device;
- bool cdm = false;
+ unsigned long i, npages;
+ long cpages = 0;
- if (!devmem)
- return;
+ /* Sanity check. */
+ if (range->end <= range->start)
+ return -EINVAL;
+ if (!daddrs)
+ return -EINVAL;
+ if (!range->pfns)
+ return -EINVAL;
- device = devmem->device;
- start = devmem->resource->start;
- size = resource_size(devmem->resource);
+ npages = (range->end - range->start) >> PAGE_SHIFT;
+ for (i = 0; i < npages; ++i) {
+ enum dma_data_direction dir = DMA_TO_DEVICE;
+ struct page *page;
- cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY;
- hmm_devmem_ref_kill(&devmem->ref);
- hmm_devmem_ref_exit(&devmem->ref);
- hmm_devmem_pages_remove(devmem);
+ page = hmm_device_entry_to_page(range, range->pfns[i]);
+ if (page == NULL)
+ continue;
- if (!cdm)
- devm_release_mem_region(device, start, size);
-}
-EXPORT_SYMBOL(hmm_devmem_remove);
+ /* If it is read and write then it was mapped bi-directional. */
+ if (range->pfns[i] & range->flags[HMM_PFN_WRITE]) {
+ dir = DMA_BIDIRECTIONAL;
-/*
- * A device driver that wants to handle multiple devices memory through a
- * single fake device can use hmm_device to do so. This is purely a helper
- * and it is not needed to make use of any HMM functionality.
- */
-#define HMM_DEVICE_MAX 256
+ /*
+ * See comments in function description on why it is
+ * safe here to call set_page_dirty()
+ */
+ if (dirty)
+ set_page_dirty(page);
+ }
-static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX);
-static DEFINE_SPINLOCK(hmm_device_lock);
-static struct class *hmm_device_class;
-static dev_t hmm_device_devt;
-
-static void hmm_device_release(struct device *device)
-{
- struct hmm_device *hmm_device;
-
- hmm_device = container_of(device, struct hmm_device, device);
- spin_lock(&hmm_device_lock);
- clear_bit(hmm_device->minor, hmm_device_mask);
- spin_unlock(&hmm_device_lock);
-
- kfree(hmm_device);
-}
-
-struct hmm_device *hmm_device_new(void *drvdata)
-{
- struct hmm_device *hmm_device;
-
- hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL);
- if (!hmm_device)
- return ERR_PTR(-ENOMEM);
-
- spin_lock(&hmm_device_lock);
- hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX);
- if (hmm_device->minor >= HMM_DEVICE_MAX) {
- spin_unlock(&hmm_device_lock);
- kfree(hmm_device);
- return ERR_PTR(-EBUSY);
+ /* Unmap and clear pfns/dma address */
+ dma_unmap_page(device, daddrs[i], PAGE_SIZE, dir);
+ range->pfns[i] = range->values[HMM_PFN_NONE];
+ /* FIXME see comments in hmm_range_dma_map() */
+ daddrs[i] = 0;
+ cpages++;
}
- set_bit(hmm_device->minor, hmm_device_mask);
- spin_unlock(&hmm_device_lock);
- dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor);
- hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt),
- hmm_device->minor);
- hmm_device->device.release = hmm_device_release;
- dev_set_drvdata(&hmm_device->device, drvdata);
- hmm_device->device.class = hmm_device_class;
- device_initialize(&hmm_device->device);
-
- return hmm_device;
+ return cpages;
}
-EXPORT_SYMBOL(hmm_device_new);
-
-void hmm_device_put(struct hmm_device *hmm_device)
-{
- put_device(&hmm_device->device);
-}
-EXPORT_SYMBOL(hmm_device_put);
-
-static int __init hmm_init(void)
-{
- int ret;
-
- ret = alloc_chrdev_region(&hmm_device_devt, 0,
- HMM_DEVICE_MAX,
- "hmm_device");
- if (ret)
- return ret;
-
- hmm_device_class = class_create(THIS_MODULE, "hmm_device");
- if (IS_ERR(hmm_device_class)) {
- unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX);
- return PTR_ERR(hmm_device_class);
- }
- return 0;
-}
-
-device_initcall(hmm_init);
-#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */
+EXPORT_SYMBOL(hmm_range_dma_unmap);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index d2cd70c..13cc937 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1,8 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2009 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -33,6 +31,8 @@
#include <linux/page_idle.h>
#include <linux/shmem_fs.h>
#include <linux/oom.h>
+#include <linux/numa.h>
+#include <linux/page_owner.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -62,6 +62,21 @@
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+{
+ /* The addr is used to check if the vma size fits */
+ unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
+
+ if (!transhuge_vma_suitable(vma, addr))
+ return false;
+ if (vma_is_anonymous(vma))
+ return __transparent_hugepage_enabled(vma);
+ if (vma_is_shmem(vma))
+ return shmem_huge_enabled(vma);
+
+ return false;
+}
+
static struct page *get_huge_zero_page(void)
{
struct page *zero_page;
@@ -420,7 +435,7 @@
* where the extra memory used could hurt more than TLB overhead
* is likely to save. The admin can still enable it through /sys.
*/
- if (totalram_pages < (512 << (20 - PAGE_SHIFT))) {
+ if (totalram_pages() < (512 << (20 - PAGE_SHIFT))) {
transparent_hugepage_flags = 0;
return 0;
}
@@ -481,11 +496,25 @@
return pmd;
}
-static inline struct list_head *page_deferred_list(struct page *page)
+#ifdef CONFIG_MEMCG
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- /* ->lru in the tail pages is occupied by compound_head. */
- return &page[2].deferred_list;
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ if (memcg)
+ return &memcg->deferred_split_queue;
+ else
+ return &pgdat->deferred_split_queue;
}
+#else
+static inline struct deferred_split *get_deferred_split_queue(struct page *page)
+{
+ struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
+
+ return &pgdat->deferred_split_queue;
+}
+#endif
void prep_transhuge_page(struct page *page)
{
@@ -498,7 +527,7 @@
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
unsigned long addr;
@@ -558,7 +587,7 @@
return VM_FAULT_FALLBACK;
}
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
ret = VM_FAULT_OOM;
goto release;
@@ -606,6 +635,7 @@
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
}
return 0;
@@ -633,16 +663,25 @@
{
const bool vma_madvised = !!(vma->vm_flags & VM_HUGEPAGE);
+ /* Always do synchronous compaction */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE | (vma_madvised ? 0 : __GFP_NORETRY);
+
+ /* Kick kcompactd and fail quickly */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
return GFP_TRANSHUGE_LIGHT | __GFP_KSWAPD_RECLAIM;
+
+ /* Synchronous compaction if madvised, otherwise kick kcompactd */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- __GFP_KSWAPD_RECLAIM);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM :
+ __GFP_KSWAPD_RECLAIM);
+
+ /* Only do synchronous compaction if madvised */
if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return GFP_TRANSHUGE_LIGHT | (vma_madvised ? __GFP_DIRECT_RECLAIM :
- 0);
+ return GFP_TRANSHUGE_LIGHT |
+ (vma_madvised ? __GFP_DIRECT_RECLAIM : 0);
+
return GFP_TRANSHUGE_LIGHT;
}
@@ -670,7 +709,7 @@
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
+ if (!transhuge_vma_suitable(vma, haddr))
return VM_FAULT_FALLBACK;
if (unlikely(anon_vma_prepare(vma)))
return VM_FAULT_OOM;
@@ -683,7 +722,7 @@
struct page *zero_page;
bool set;
vm_fault_t ret;
- pgtable = pte_alloc_one(vma->vm_mm, haddr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
return VM_FAULT_OOM;
zero_page = mm_get_huge_zero_page(vma->vm_mm);
@@ -734,6 +773,21 @@
spinlock_t *ptl;
ptl = pmd_lock(mm, pmd);
+ if (!pmd_none(*pmd)) {
+ if (write) {
+ if (pmd_pfn(*pmd) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pmd(*pmd));
+ goto out_unlock;
+ }
+ entry = pmd_mkyoung(*pmd);
+ entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
+ if (pmdp_set_access_flags(vma, addr, pmd, entry, 1))
+ update_mmu_cache_pmd(vma, addr, pmd);
+ }
+
+ goto out_unlock;
+ }
+
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pmd_mkdevmap(entry);
@@ -745,18 +799,25 @@
if (pgtable) {
pgtable_trans_huge_deposit(mm, pmd, pgtable);
mm_inc_nr_ptes(mm);
+ pgtable = NULL;
}
set_pmd_at(mm, addr, pmd, entry);
update_mmu_cache_pmd(vma, addr, pmd);
+
+out_unlock:
spin_unlock(ptl);
+ if (pgtable)
+ pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PMD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
+
/*
* If we had pmd_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -772,14 +833,14 @@
return VM_FAULT_SIGBUS;
if (arch_needs_pgtable_deposit()) {
- pgtable = pte_alloc_one(vma->vm_mm, addr);
+ pgtable = pte_alloc_one(vma->vm_mm);
if (!pgtable)
return VM_FAULT_OOM;
}
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pmd(vma, addr, pmd, pfn, pgprot, write, pgtable);
+ insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
@@ -800,6 +861,20 @@
spinlock_t *ptl;
ptl = pud_lock(mm, pud);
+ if (!pud_none(*pud)) {
+ if (write) {
+ if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+ goto out_unlock;
+ }
+ entry = pud_mkyoung(*pud);
+ entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
+ if (pudp_set_access_flags(vma, addr, pud, entry, 1))
+ update_mmu_cache_pud(vma, addr, pud);
+ }
+ goto out_unlock;
+ }
+
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pud_mkdevmap(entry);
@@ -809,13 +884,17 @@
}
set_pud_at(mm, addr, pud, entry);
update_mmu_cache_pud(vma, addr, pud);
+
+out_unlock:
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, pfn_t pfn, bool write)
+vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
{
+ unsigned long addr = vmf->address & PUD_MASK;
+ struct vm_area_struct *vma = vmf->vma;
pgprot_t pgprot = vma->vm_page_prot;
+
/*
* If we had pud_special, we could avoid all these restrictions,
* but we need to be consistent with PTEs and architectures that
@@ -832,7 +911,7 @@
track_pfn_insert(vma, &pgprot, pfn);
- insert_pfn_pud(vma, addr, pud, pfn, pgprot, write);
+ insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
@@ -852,11 +931,10 @@
}
struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
- pmd_t *pmd, int flags)
+ pmd_t *pmd, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -886,12 +964,11 @@
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -910,7 +987,7 @@
if (!vma_is_anonymous(vma))
return 0;
- pgtable = pte_alloc_one(dst_mm, addr);
+ pgtable = pte_alloc_one(dst_mm);
if (unlikely(!pgtable))
goto out;
@@ -1000,11 +1077,10 @@
}
struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
- pud_t *pud, int flags)
+ pud_t *pud, int flags, struct dev_pagemap **pgmap)
{
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
- struct dev_pagemap *pgmap;
struct page *page;
assert_spin_locked(pud_lockptr(mm, pud));
@@ -1028,12 +1104,11 @@
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
- pgmap = get_dev_pagemap(pfn, NULL);
- if (!pgmap)
+ *pgmap = get_dev_pagemap(pfn, *pgmap);
+ if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
get_page(page);
- put_dev_pagemap(pgmap);
return page;
}
@@ -1129,8 +1204,7 @@
int i;
vm_fault_t ret = 0;
struct page **pages;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
GFP_KERNEL);
@@ -1168,9 +1242,9 @@
cond_resched();
}
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
@@ -1215,8 +1289,7 @@
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pmdp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
ret |= VM_FAULT_WRITE;
put_page(page);
@@ -1226,7 +1299,7 @@
out_free_pages:
spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
for (i = 0; i < HPAGE_PMD_NR; i++) {
memcg = (void *)page_private(pages[i]);
set_page_private(pages[i], 0);
@@ -1243,8 +1316,7 @@
struct page *page = NULL, *new_page;
struct mem_cgroup *memcg;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
gfp_t huge_gfp; /* for allocation and charge */
vm_fault_t ret = 0;
@@ -1288,7 +1360,7 @@
get_page(page);
spin_unlock(vmf->ptl);
alloc:
- if (transparent_hugepage_enabled(vma) &&
+ if (__transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow()) {
huge_gfp = alloc_hugepage_direct_gfpmask(vma);
new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
@@ -1325,6 +1397,7 @@
}
count_vm_event(THP_FAULT_ALLOC);
+ count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
if (!page)
clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
@@ -1333,9 +1406,9 @@
vma, HPAGE_PMD_NR);
__SetPageUptodate(new_page);
- mmun_start = haddr;
- mmun_end = haddr + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ haddr, haddr + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
spin_lock(vmf->ptl);
if (page)
@@ -1370,8 +1443,7 @@
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pmdp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start,
- mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
out:
return ret;
out_unlock:
@@ -1464,7 +1536,7 @@
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- int page_nid = -1, this_nid = numa_node_id();
+ int page_nid = NUMA_NO_NODE, this_nid = numa_node_id();
int target_nid, last_cpupid = -1;
bool page_locked;
bool migrated = false;
@@ -1485,8 +1557,7 @@
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -1510,7 +1581,7 @@
*/
page_locked = trylock_page(page);
target_nid = mpol_misplaced(page, vma, haddr);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
/* If the page was locked, there are no parallel migrations */
if (page_locked)
goto clear_pmdnuma;
@@ -1518,12 +1589,11 @@
/* Migration could have started since the pmd_trans_migrating check */
if (!page_locked) {
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
if (!get_page_unless_zero(page))
goto out_unlock;
spin_unlock(vmf->ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
goto out;
}
@@ -1540,14 +1610,14 @@
if (unlikely(!pmd_same(pmd, *vmf->pmd))) {
unlock_page(page);
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto out_unlock;
}
/* Bail if we fail to protect against THP splits for any reason */
if (unlikely(!anon_vma)) {
put_page(page);
- page_nid = -1;
+ page_nid = NUMA_NO_NODE;
goto clear_pmdnuma;
}
@@ -1562,8 +1632,20 @@
* We are not sure a pending tlb flush here is for a huge page
* mapping or not. Hence use the tlb range variant
*/
- if (mm_tlb_flush_pending(vma->vm_mm))
+ if (mm_tlb_flush_pending(vma->vm_mm)) {
flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
+ /*
+ * change_huge_pmd() released the pmd lock before
+ * invalidating the secondary MMUs sharing the primary
+ * MMU pagetables (with ->invalidate_range()). The
+ * mmu_notifier_invalidate_range_end() (which
+ * internally calls ->invalidate_range()) in
+ * change_pmd_range() will run after us, so we can't
+ * rely on it here and we need an explicit invalidate.
+ */
+ mmu_notifier_invalidate_range(vma->vm_mm, haddr,
+ haddr + HPAGE_PMD_SIZE);
+ }
/*
* Migrate the THP to the requested node, returns with page unlocked
@@ -1597,7 +1679,7 @@
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR,
flags);
@@ -1617,7 +1699,7 @@
struct mm_struct *mm = tlb->mm;
bool ret = false;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1693,7 +1775,7 @@
pmd_t orig_pmd;
spinlock_t *ptl;
- tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE);
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1958,7 +2040,6 @@
int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
- pud_t orig_pud;
spinlock_t *ptl;
ptl = __pud_trans_huge_lock(pud, vma);
@@ -1970,8 +2051,7 @@
* pgtable_trans_huge_withdraw after finishing pudp related
* operations.
*/
- orig_pud = pudp_huge_get_and_clear_full(tlb->mm, addr, pud,
- tlb->fullmm);
+ pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_dax(vma)) {
spin_unlock(ptl);
@@ -2000,14 +2080,16 @@
unsigned long address)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PUD_MASK;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PUD_SIZE);
- ptl = pud_lock(mm, pud);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PUD_MASK,
+ (address & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pud_lock(vma->vm_mm, pud);
if (unlikely(!pud_trans_huge(*pud) && !pud_devmap(*pud)))
goto out;
- __split_huge_pud_locked(vma, pud, haddr);
+ __split_huge_pud_locked(vma, pud, range.start);
out:
spin_unlock(ptl);
@@ -2015,8 +2097,7 @@
* No need to double call mmu_notifier->invalidate_range() callback as
* the above pudp_huge_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PUD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
@@ -2218,11 +2299,13 @@
unsigned long address, bool freeze, struct page *page)
{
spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long haddr = address & HPAGE_PMD_MASK;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, haddr, haddr + HPAGE_PMD_SIZE);
- ptl = pmd_lock(mm, pmd);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address & HPAGE_PMD_MASK,
+ (address & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+ ptl = pmd_lock(vma->vm_mm, pmd);
/*
* If caller asks to setup a migration entries, we need a page to check
@@ -2238,7 +2321,7 @@
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
goto out;
- __split_huge_pmd_locked(vma, pmd, haddr, freeze);
+ __split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
/*
@@ -2254,8 +2337,7 @@
* any further changes to individual pte will notify. So no need
* to call mmu_notifier->invalidate_range()
*/
- mmu_notifier_invalidate_range_only_end(mm, haddr, haddr +
- HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_only_end(&range);
}
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
@@ -2371,6 +2453,7 @@
(1L << PG_mlocked) |
(1L << PG_uptodate) |
(1L << PG_active) |
+ (1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
(1L << PG_dirty)));
@@ -2415,15 +2498,25 @@
pgoff_t end, unsigned long flags)
{
struct page *head = compound_head(page);
- struct zone *zone = page_zone(head);
+ pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
+ struct address_space *swap_cache = NULL;
+ unsigned long offset = 0;
int i;
- lruvec = mem_cgroup_page_lruvec(head, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(head, pgdat);
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
+ if (PageAnon(head) && PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ offset = swp_offset(entry);
+ swap_cache = swap_address_space(entry);
+ xa_lock(&swap_cache->i_pages);
+ }
+
for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2433,24 +2526,35 @@
if (IS_ENABLED(CONFIG_SHMEM) && PageSwapBacked(head))
shmem_uncharge(head->mapping->host, 1);
put_page(head + i);
+ } else if (!PageAnon(page)) {
+ __xa_store(&head->mapping->i_pages, head[i].index,
+ head + i, 0);
+ } else if (swap_cache) {
+ __xa_store(&swap_cache->i_pages, offset + i,
+ head + i, 0);
}
}
ClearPageCompound(head);
+
+ split_page_owner(head, HPAGE_PMD_ORDER);
+
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
- /* Additional pin to radix tree of swap cache */
- if (PageSwapCache(head))
+ /* Additional pin to swap cache */
+ if (PageSwapCache(head)) {
page_ref_add(head, 2);
- else
+ xa_unlock(&swap_cache->i_pages);
+ } else {
page_ref_inc(head);
+ }
} else {
- /* Additional pin to radix tree */
+ /* Additional pin to page cache */
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
remap_page(head);
@@ -2557,7 +2661,7 @@
{
int extra_pins;
- /* Additional pins from radix tree */
+ /* Additional pins from page cache */
if (PageAnon(page))
extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
else
@@ -2590,6 +2694,7 @@
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
@@ -2661,35 +2766,37 @@
lru_add_drain();
/* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);
+ spin_lock_irqsave(&pgdata->lru_lock, flags);
if (mapping) {
- void **pslot;
+ XA_STATE(xas, &mapping->i_pages, page_index(head));
- xa_lock(&mapping->i_pages);
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(head));
/*
- * Check if the head page is present in radix tree.
+ * Check if the head page is present in page cache.
* We assume all tail are present too, if head is there.
*/
- if (radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != head)
+ xa_lock(&mapping->i_pages);
+ if (xas_load(&xas) != head)
goto fail;
}
/* Prevent deferred_split_scan() touching ->_refcount */
- spin_lock(&pgdata->split_queue_lock);
+ spin_lock(&ds_queue->split_queue_lock);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
- if (mapping)
- __dec_node_page_state(page, NR_SHMEM_THPS);
- spin_unlock(&pgdata->split_queue_lock);
+ if (mapping) {
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_THPS);
+ else
+ __dec_node_page_state(page, NR_FILE_THPS);
+ }
+
+ spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
if (PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2706,10 +2813,10 @@
dump_page(page, "total_mapcount(head) > 0");
BUG();
}
- spin_unlock(&pgdata->split_queue_lock);
+ spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
+ spin_unlock_irqrestore(&pgdata->lru_lock, flags);
remap_page(head);
ret = -EBUSY;
}
@@ -2728,53 +2835,86 @@
void free_transhuge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
unsigned long flags;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
+ struct deferred_split *ds_queue = get_deferred_split_queue(page);
+#ifdef CONFIG_MEMCG
+ struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+#endif
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ /*
+ * The try_to_unmap() in page reclaim path might reach here too,
+ * this may cause a race condition to corrupt deferred split queue.
+ * And, if page reclaim is already handling the same page, it is
+ * unnecessary to handle it again in shrinker.
+ *
+ * Check PageSwapCache to determine if the page is being
+ * handled by page reclaim since THP swap would add the page into
+ * swap cache before calling try_to_unmap().
+ */
+ if (PageSwapCache(page))
+ return;
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- list_add_tail(page_deferred_list(page), &pgdata->split_queue);
- pgdata->split_queue_len++;
+ list_add_tail(page_deferred_list(page), &ds_queue->split_queue);
+ ds_queue->split_queue_len++;
+#ifdef CONFIG_MEMCG
+ if (memcg)
+ memcg_set_shrinker_bit(memcg, page_to_nid(page),
+ deferred_split_shrinker.id);
+#endif
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
- return READ_ONCE(pgdata->split_queue_len);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
+
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+ return READ_ONCE(ds_queue->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ struct deferred_split *ds_queue = &pgdata->deferred_split_queue;
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+#ifdef CONFIG_MEMCG
+ if (sc->memcg)
+ ds_queue = &sc->memcg->deferred_split_queue;
+#endif
+
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &pgdata->split_queue) {
+ list_for_each_safe(pos, next, &ds_queue->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
@@ -2782,12 +2922,12 @@
} else {
/* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- pgdata->split_queue_len--;
+ ds_queue->split_queue_len--;
}
if (!--sc->nr_to_scan)
break;
}
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -2801,15 +2941,15 @@
put_page(page);
}
- spin_lock_irqsave(&pgdata->split_queue_lock, flags);
- list_splice_tail(&list, &pgdata->split_queue);
- spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
+ spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ list_splice_tail(&list, &ds_queue->split_queue);
+ spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
/*
* Stop shrinker if we didn't split any page, but the queue is empty.
* This can happen if pages were freed under us.
*/
- if (!split && list_empty(&pgdata->split_queue))
+ if (!split && list_empty(&ds_queue->split_queue))
return SHRINK_STOP;
return split;
}
@@ -2818,7 +2958,8 @@
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
- .flags = SHRINKER_NUMA_AWARE,
+ .flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE |
+ SHRINKER_NONSLAB,
};
#ifdef CONFIG_DEBUG_FS
@@ -2867,12 +3008,8 @@
static int __init split_huge_pages_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
- &split_huge_pages_fops);
- if (!ret)
- pr_warn("Failed to create split_huge_pages in debugfs");
+ debugfs_create_file("split_huge_pages", 0200, NULL, NULL,
+ &split_huge_pages_fops);
return 0;
}
late_initcall(split_huge_pages_debugfs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 309fb8c..b45a953 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004
@@ -15,7 +16,7 @@
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
@@ -25,6 +26,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -739,7 +741,15 @@
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
- return inode->i_mapping->private_data;
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+ * The VERY common case is inode->i_mapping == &inode->i_data, but
+ * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
@@ -887,7 +897,7 @@
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +929,7 @@
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1058,6 +1068,7 @@
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_CONTIG_ALLOC
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1073,11 +1084,10 @@
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
@@ -1142,11 +1152,20 @@
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask) { return NULL; }
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
@@ -1156,7 +1175,7 @@
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
@@ -1248,20 +1267,32 @@
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
- set_page_private(page, 0);
- page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservation, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -1373,12 +1404,25 @@
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
+ bool alloc_try_hard = true;
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1387,6 +1431,22 @@
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page this
+ * indicates an overall state change. Clear bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
return page;
}
@@ -1395,7 +1455,8 @@
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
@@ -1403,7 +1464,7 @@
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
- nid, nmask);
+ nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
@@ -1418,14 +1479,16 @@
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
if (page)
break;
}
@@ -1478,16 +1541,29 @@
/*
* Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
- * dissolution fails because a give page is not a free hugepage, or because
- * free hugepages are fully reserved.
+ * nothing for in-use hugepages and non-hugepages.
+ * This function returns values like below:
+ *
+ * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ /* Not to disrupt normal path by vainly holding hugetlb_lock */
+ if (!PageHuge(page))
+ return 0;
+
spin_lock(&hugetlb_lock);
- if (PageHuge(page) && !page_count(page)) {
+ if (!PageHuge(page)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
@@ -1532,11 +1608,9 @@
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
page = pfn_to_page(pfn);
- if (PageHuge(page) && !page_count(page)) {
- rc = dissolve_free_huge_page(page);
- if (rc)
- break;
- }
+ rc = dissolve_free_huge_page(page);
+ if (rc)
+ break;
}
return rc;
@@ -1558,7 +1632,7 @@
goto out_unlock;
spin_unlock(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -1572,8 +1646,9 @@
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
put_page(page);
- page = NULL;
+ return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -1585,15 +1660,15 @@
return page;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2100,9 +2175,9 @@
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_raw(
+ addr = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -2163,13 +2238,33 @@
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY]))
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
break;
cond_resched();
}
@@ -2181,6 +2276,8 @@
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
+
+ kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
@@ -2275,13 +2372,59 @@
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
- nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we can not allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+ * Gigantic pages runtime allocation depends on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
/*
* Increase the pool size
@@ -2294,7 +2437,6 @@
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
- spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
@@ -2311,7 +2453,8 @@
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2349,9 +2492,12 @@
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ NODEMASK_FREE(node_alloc_noretry);
+
+ return 0;
}
#define HSTATE_ATTR_RO(_name) \
@@ -2401,41 +2547,32 @@
unsigned long count, size_t len)
{
int err;
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- } else if (nodes_allowed) {
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
/*
- * per node hstate attribute: adjust count to global,
- * but restrict alloc/free to the specified node.
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
*/
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
- init_nodemask_of_node(nodes_allowed, nid);
- } else
- nodes_allowed = &node_states[N_MEMORY];
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ err = set_max_huge_pages(h, count, nid, n_mask);
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
-
- return len;
-out:
- NODEMASK_FREE(nodes_allowed);
- return err;
+ return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3239,16 +3376,17 @@
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- mmun_start = vma->vm_start;
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+ if (cow) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
+ vma->vm_end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
@@ -3324,7 +3462,7 @@
}
if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -3341,8 +3479,7 @@
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start = start; /* For mmu_notifiers */
- unsigned long mmun_end = end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3352,14 +3489,16 @@
* This is a hugetlb vma, all the pte entries should point
* to huge page.
*/
- tlb_remove_check_page_size_change(tlb, sz);
+ tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
@@ -3427,7 +3566,7 @@
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
}
@@ -3545,9 +3684,8 @@
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
unsigned long haddr = address & huge_page_mask(h);
+ struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3624,11 +3762,10 @@
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
- mmun_start = haddr;
- mmun_end = mmun_start + huge_page_size(h);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
+ mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
@@ -3641,16 +3778,17 @@
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out_release_all:
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -3730,6 +3868,7 @@
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3776,8 +3915,7 @@
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3786,12 +3924,31 @@
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
+ /*
+ * Returning error will result in faulting task being
+ * sent SIGBUS. The hugetlb fault mutex prevents two
+ * tasks from racing to fault in the same page which
+ * could result in false unable to allocate errors.
+ * Page migration does not take the fault mutex, but
+ * does a clear then write of pte's under page table
+ * lock. Page fault code could race with migration,
+ * notice the clear pte and try to allocate a page
+ * here. Before returning error, get ptl and make
+ * sure there really is no pte entry.
+ */
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ ret = 0;
+ spin_unlock(ptl);
+ goto out;
+ }
+ spin_unlock(ptl);
ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3862,6 +4019,15 @@
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -3876,21 +4042,14 @@
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3901,9 +4060,7 @@
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
@@ -3948,7 +4105,7 @@
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4096,7 +4253,6 @@
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4164,6 +4320,7 @@
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4199,7 +4356,7 @@
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
@@ -4269,7 +4426,8 @@
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4288,6 +4446,19 @@
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * Instead of doing 'try_get_page()' below in the same_page
+ * loop, just check the count once here.
+ */
+ if (unlikely(page_count(page) <= 0)) {
+ if (pages) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
@@ -4339,21 +4510,22 @@
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
- unsigned long f_start = start;
- unsigned long f_end = end;
bool shared_pmd = false;
+ struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
- * start/end. Set f_start/f_end to cover the maximum possible
+ * start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
- flush_cache_range(vma, f_start, f_end);
+ flush_cache_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4388,10 +4560,12 @@
continue;
}
if (!huge_pte_none(pte)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
- set_huge_pte_at(mm, address, ptep, pte);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
@@ -4404,7 +4578,7 @@
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
- flush_hugetlb_tlb_range(vma, f_start, f_end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
@@ -4414,7 +4588,7 @@
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}
@@ -4451,6 +4625,11 @@
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * resv_map can not be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
@@ -4542,6 +4721,10 @@
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
if (resv_map) {
chg = region_del(resv_map, start, end);
/*
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 68c2f2f..2ac38bd 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -139,7 +139,7 @@
if (!page_hcg || page_hcg != h_cg)
goto out;
- nr_pages = 1 << compound_order(page);
+ nr_pages = compound_nr(page);
if (!parent) {
parent = root_h_cgroup;
/* root has no limit */
@@ -196,7 +196,7 @@
again:
rcu_read_lock();
h_cg = hugetlb_cgroup_from_task(current);
- if (!css_tryget_online(&h_cg->css)) {
+ if (!css_tryget(&h_cg->css)) {
rcu_read_unlock();
goto again;
}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index b6ac706..5b7430b 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/* Inject a hwpoison memory failure on a arbitrary pfn */
#include <linux/module.h>
#include <linux/debugfs.h>
@@ -76,63 +77,40 @@
static int pfn_inject_init(void)
{
- struct dentry *dentry;
-
hwpoison_dir = debugfs_create_dir("hwpoison", NULL);
- if (hwpoison_dir == NULL)
- return -ENOMEM;
/*
* Note that the below poison/unpoison interfaces do not involve
* hardware status change, hence do not require hardware support.
* They are mainly for testing hwpoison in software level.
*/
- dentry = debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir,
- NULL, &hwpoison_fops);
- if (!dentry)
- goto fail;
+ debugfs_create_file("corrupt-pfn", 0200, hwpoison_dir, NULL,
+ &hwpoison_fops);
- dentry = debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir,
- NULL, &unpoison_fops);
- if (!dentry)
- goto fail;
+ debugfs_create_file("unpoison-pfn", 0200, hwpoison_dir, NULL,
+ &unpoison_fops);
- dentry = debugfs_create_u32("corrupt-filter-enable", 0600,
- hwpoison_dir, &hwpoison_filter_enable);
- if (!dentry)
- goto fail;
+ debugfs_create_u32("corrupt-filter-enable", 0600, hwpoison_dir,
+ &hwpoison_filter_enable);
- dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600,
- hwpoison_dir, &hwpoison_filter_dev_major);
- if (!dentry)
- goto fail;
+ debugfs_create_u32("corrupt-filter-dev-major", 0600, hwpoison_dir,
+ &hwpoison_filter_dev_major);
- dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600,
- hwpoison_dir, &hwpoison_filter_dev_minor);
- if (!dentry)
- goto fail;
+ debugfs_create_u32("corrupt-filter-dev-minor", 0600, hwpoison_dir,
+ &hwpoison_filter_dev_minor);
- dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600,
- hwpoison_dir, &hwpoison_filter_flags_mask);
- if (!dentry)
- goto fail;
+ debugfs_create_u64("corrupt-filter-flags-mask", 0600, hwpoison_dir,
+ &hwpoison_filter_flags_mask);
- dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600,
- hwpoison_dir, &hwpoison_filter_flags_value);
- if (!dentry)
- goto fail;
+ debugfs_create_u64("corrupt-filter-flags-value", 0600, hwpoison_dir,
+ &hwpoison_filter_flags_value);
#ifdef CONFIG_MEMCG
- dentry = debugfs_create_u64("corrupt-filter-memcg", 0600,
- hwpoison_dir, &hwpoison_filter_memcg);
- if (!dentry)
- goto fail;
+ debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir,
+ &hwpoison_filter_memcg);
#endif
return 0;
-fail:
- pfn_inject_exit();
- return -ENOMEM;
}
module_init(pfn_inject_init);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a31..1960330 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/cpumask.h>
+#include <linux/mman.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
@@ -35,6 +36,6 @@
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns = &init_user_ns,
- .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
+ .cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
diff --git a/mm/internal.h b/mm/internal.h
index 87256ae..0d5f720 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1,12 +1,8 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
/* internal.h: mm/ internal definitions
*
* Copyright (C) 2004 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#ifndef __MM_INTERNAL_H
#define __MM_INTERNAL_H
@@ -43,7 +39,7 @@
void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
unsigned long floor, unsigned long ceiling);
-static inline bool can_madv_dontneed_vma(struct vm_area_struct *vma)
+static inline bool can_madv_lru_vma(struct vm_area_struct *vma)
{
return !(vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP));
}
@@ -161,8 +157,9 @@
}
extern int __isolate_free_page(struct page *page, unsigned int order);
-extern void __free_pages_bootmem(struct page *page, unsigned long pfn,
+extern void memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order);
+extern void __free_pages_core(struct page *page, unsigned int order);
extern void prep_compound_page(struct page *page, unsigned int order);
extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
@@ -183,14 +180,16 @@
struct compact_control {
struct list_head freepages; /* List of free pages to migrate to */
struct list_head migratepages; /* List of pages being migrated */
- struct zone *zone;
- unsigned long nr_freepages; /* Number of isolated free pages */
- unsigned long nr_migratepages; /* Number of pages to migrate */
- unsigned long total_migrate_scanned;
- unsigned long total_free_scanned;
+ unsigned int nr_freepages; /* Number of isolated free pages */
+ unsigned int nr_migratepages; /* Number of pages to migrate */
unsigned long free_pfn; /* isolate_freepages search base */
unsigned long migrate_pfn; /* isolate_migratepages search base */
- unsigned long last_migrated_pfn;/* Not yet flushed page being freed */
+ unsigned long fast_start_pfn; /* a pfn to start linear scan from */
+ struct zone *zone;
+ unsigned long total_migrate_scanned;
+ unsigned long total_free_scanned;
+ unsigned short fast_search_fail;/* failures to use free list searches */
+ short search_order; /* order to start a fast search at */
const gfp_t gfp_mask; /* gfp mask of a direct compactor */
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
@@ -203,7 +202,16 @@
bool direct_compaction; /* False from kcompactd or /proc/... */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
- bool finishing_block; /* Finishing current pageblock */
+ bool rescan; /* Rescanning the same pageblock */
+};
+
+/*
+ * Used in direct compaction when a page should be taken from the freelists
+ * immediately when one is created during the free path.
+ */
+struct capture_control {
+ struct compact_control *cc;
+ struct page *page;
};
unsigned long
@@ -444,6 +452,16 @@
#define NODE_RECLAIM_SOME 0
#define NODE_RECLAIM_SUCCESS 1
+#ifdef CONFIG_NUMA
+extern int node_reclaim(struct pglist_data *, gfp_t, unsigned int);
+#else
+static inline int node_reclaim(struct pglist_data *pgdat, gfp_t mask,
+ unsigned int order)
+{
+ return NODE_RECLAIM_NOSCAN;
+}
+#endif
+
extern int hwpoison_filter(struct page *p);
extern u32 hwpoison_filter_dev_major;
@@ -480,10 +498,16 @@
#define ALLOC_OOM ALLOC_NO_WATERMARKS
#endif
-#define ALLOC_HARDER 0x10 /* try to alloc harder */
-#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
-#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#define ALLOC_HARDER 0x10 /* try to alloc harder */
+#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
+#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#define ALLOC_CMA 0x80 /* allow allocations from CMA areas */
+#ifdef CONFIG_ZONE_DMA32
+#define ALLOC_NOFRAGMENT 0x100 /* avoid mixing pageblock types */
+#else
+#define ALLOC_NOFRAGMENT 0x0
+#endif
+#define ALLOC_KSWAPD 0x200 /* allow waking of kswapd */
enum ttu_flags;
struct tlbflush_unmap_batch;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 27ddfd2..11c75fb 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -1,9 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/interval_tree.c - interval tree for mapping->i_mmap
*
* Copyright (C) 2012, Michel Lespinasse <walken@google.com>
- *
- * This file is released under the GPL v2.
*/
#include <linux/mm.h>
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 3289db3..08b43de 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -1,11 +1,24 @@
# SPDX-License-Identifier: GPL-2.0
KASAN_SANITIZE := n
-UBSAN_SANITIZE_kasan.o := n
+UBSAN_SANITIZE_common.o := n
+UBSAN_SANITIZE_generic.o := n
+UBSAN_SANITIZE_generic_report.o := n
+UBSAN_SANITIZE_tags.o := n
KCOV_INSTRUMENT := n
-CFLAGS_REMOVE_kasan.o = -pg
+CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
+
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
-CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-obj-y := kasan.o report.o kasan_init.o quarantine.o
+CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-$(CONFIG_KASAN) := common.o init.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
+obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
new file mode 100644
index 0000000..6814d6d
--- /dev/null
+++ b/mm/kasan/common.c
@@ -0,0 +1,746 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains common generic and tag-based KASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+#include <linux/uaccess.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+static inline int in_irqentry_text(unsigned long ptr)
+{
+ return (ptr >= (unsigned long)&__irqentry_text_start &&
+ ptr < (unsigned long)&__irqentry_text_end) ||
+ (ptr >= (unsigned long)&__softirqentry_text_start &&
+ ptr < (unsigned long)&__softirqentry_text_end);
+}
+
+static inline unsigned int filter_irq_stacks(unsigned long *entries,
+ unsigned int nr_entries)
+{
+ unsigned int i;
+
+ for (i = 0; i < nr_entries; i++) {
+ if (in_irqentry_text(entries[i])) {
+ /* Include the irqentry function into the stack. */
+ return i + 1;
+ }
+ }
+ return nr_entries;
+}
+
+static inline depot_stack_handle_t save_stack(gfp_t flags)
+{
+ unsigned long entries[KASAN_STACK_DEPTH];
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ nr_entries = filter_irq_stacks(entries, nr_entries);
+ return stack_depot_save(entries, nr_entries, flags);
+}
+
+static inline void set_track(struct kasan_track *track, gfp_t flags)
+{
+ track->pid = current->pid;
+ track->stack = save_stack(flags);
+}
+
+void kasan_enable_current(void)
+{
+ current->kasan_depth++;
+}
+
+void kasan_disable_current(void)
+{
+ current->kasan_depth--;
+}
+
+bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_read);
+
+bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+
+ return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+ return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ check_memory_region((unsigned long)src, len, false, _RET_IP_);
+ check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+ return __memcpy(dest, src, len);
+}
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ __memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+ u8 tag = get_tag(address);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = reset_tag(address);
+
+ kasan_poison_shadow(address, size, tag);
+
+ if (size & KASAN_SHADOW_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ *shadow = tag;
+ else
+ *shadow = size & KASAN_SHADOW_MASK;
+ }
+}
+
+static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
+{
+ void *base = task_stack_page(task);
+ size_t size = sp - base;
+
+ kasan_unpoison_shadow(base, size);
+}
+
+/* Unpoison the entire stack for a task. */
+void kasan_unpoison_task_stack(struct task_struct *task)
+{
+ __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+}
+
+/* Unpoison the stack for the current task beyond a watermark sp value. */
+asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
+{
+ /*
+ * Calculate the task stack base address. Avoid using 'current'
+ * because this function is called by early resume code which hasn't
+ * yet set up the percpu register (%gs).
+ */
+ void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
+
+ kasan_unpoison_shadow(base, watermark - base);
+}
+
+/*
+ * Clear all poison for the region between the current SP and a provided
+ * watermark value, as is sometimes required prior to hand-crafted asm function
+ * returns in the middle of functions.
+ */
+void kasan_unpoison_stack_above_sp_to(const void *watermark)
+{
+ const void *sp = __builtin_frame_address(0);
+ size_t size = watermark - sp;
+
+ if (WARN_ON(sp > watermark))
+ return;
+ kasan_unpoison_shadow(sp, size);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+ u8 tag;
+ unsigned long i;
+
+ if (unlikely(PageHighMem(page)))
+ return;
+
+ tag = random_tag();
+ for (i = 0; i < (1 << order); i++)
+ page_kasan_tag_set(page + i, tag);
+ kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
+}
+
+/*
+ * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
+ * For larger allocations larger redzones are used.
+ */
+static inline unsigned int optimal_redzone(unsigned int object_size)
+{
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return 0;
+
+ return
+ object_size <= 64 - 16 ? 16 :
+ object_size <= 128 - 32 ? 32 :
+ object_size <= 512 - 64 ? 64 :
+ object_size <= 4096 - 128 ? 128 :
+ object_size <= (1 << 14) - 256 ? 256 :
+ object_size <= (1 << 15) - 512 ? 512 :
+ object_size <= (1 << 16) - 1024 ? 1024 : 2048;
+}
+
+void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
+{
+ unsigned int orig_size = *size;
+ unsigned int redzone_size;
+ int redzone_adjust;
+
+ /* Add alloc meta. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
+
+ /* Add free meta. */
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
+ (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta))) {
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+ }
+
+ redzone_size = optimal_redzone(cache->object_size);
+ redzone_adjust = redzone_size - (*size - cache->object_size);
+ if (redzone_adjust > 0)
+ *size += redzone_adjust;
+
+ *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
+ max(*size, cache->object_size + redzone_size));
+
+ /*
+ * If the metadata doesn't fit, don't enable KASAN at all.
+ */
+ if (*size <= cache->kasan_info.alloc_meta_offset ||
+ *size <= cache->kasan_info.free_meta_offset) {
+ cache->kasan_info.alloc_meta_offset = 0;
+ cache->kasan_info.free_meta_offset = 0;
+ *size = orig_size;
+ return;
+ }
+
+ *flags |= SLAB_KASAN;
+}
+
+size_t kasan_metadata_size(struct kmem_cache *cache)
+{
+ return (cache->kasan_info.alloc_meta_offset ?
+ sizeof(struct kasan_alloc_meta) : 0) +
+ (cache->kasan_info.free_meta_offset ?
+ sizeof(struct kasan_free_meta) : 0);
+}
+
+struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
+ const void *object)
+{
+ return (void *)object + cache->kasan_info.alloc_meta_offset;
+}
+
+struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
+ const void *object)
+{
+ BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
+ return (void *)object + cache->kasan_info.free_meta_offset;
+}
+
+
+static void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+void kasan_poison_slab(struct page *page)
+{
+ unsigned long i;
+
+ for (i = 0; i < compound_nr(page); i++)
+ page_kasan_tag_reset(page + i);
+ kasan_poison_shadow(page_address(page), page_size(page),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_poison_shadow(object,
+ round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+ KASAN_KMALLOC_REDZONE);
+}
+
+/*
+ * This function assigns a tag to an object considering the following:
+ * 1. A cache might have a constructor, which might save a pointer to a slab
+ * object somewhere (e.g. in the object itself). We preassign a tag for
+ * each object in caches with constructors during slab creation and reuse
+ * the same tag each time a particular object is allocated.
+ * 2. A cache might be SLAB_TYPESAFE_BY_RCU, which means objects can be
+ * accessed after being freed. We preassign tags for objects in these
+ * caches as well.
+ * 3. For SLAB allocator we can't preassign tags randomly since the freelist
+ * is stored as an array of indexes instead of a linked list. Assign tags
+ * based on objects indexes, so that objects that are next to each other
+ * get different tags.
+ */
+static u8 assign_tag(struct kmem_cache *cache, const void *object,
+ bool init, bool keep_tag)
+{
+ /*
+ * 1. When an object is kmalloc()'ed, two hooks are called:
+ * kasan_slab_alloc() and kasan_kmalloc(). We assign the
+ * tag only in the first one.
+ * 2. We reuse the same tag for krealloc'ed objects.
+ */
+ if (keep_tag)
+ return get_tag(object);
+
+ /*
+ * If the cache neither has a constructor nor has SLAB_TYPESAFE_BY_RCU
+ * set, assign a tag when the object is being allocated (init == false).
+ */
+ if (!cache->ctor && !(cache->flags & SLAB_TYPESAFE_BY_RCU))
+ return init ? KASAN_TAG_KERNEL : random_tag();
+
+ /* For caches that either have a constructor or SLAB_TYPESAFE_BY_RCU: */
+#ifdef CONFIG_SLAB
+ /* For SLAB assign tags based on the object index in the freelist. */
+ return (u8)obj_to_index(cache, virt_to_page(object), (void *)object);
+#else
+ /*
+ * For SLUB assign a random tag during slab creation, otherwise reuse
+ * the already assigned tag.
+ */
+ return init ? random_tag() : get_tag(object);
+#endif
+}
+
+void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+ const void *object)
+{
+ struct kasan_alloc_meta *alloc_info;
+
+ if (!(cache->flags & SLAB_KASAN))
+ return (void *)object;
+
+ alloc_info = get_alloc_info(cache, object);
+ __memset(alloc_info, 0, sizeof(*alloc_info));
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ object = set_tag(object,
+ assign_tag(cache, object, true, false));
+
+ return (void *)object;
+}
+
+static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
+{
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return shadow_byte < 0 ||
+ shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
+
+ /* else CONFIG_KASAN_SW_TAGS: */
+ if ((u8)shadow_byte == KASAN_TAG_INVALID)
+ return true;
+ if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
+ return true;
+
+ return false;
+}
+
+static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+ unsigned long ip, bool quarantine)
+{
+ s8 shadow_byte;
+ u8 tag;
+ void *tagged_object;
+ unsigned long rounded_up_size;
+
+ tag = get_tag(object);
+ tagged_object = object;
+ object = reset_tag(object);
+
+ if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
+ object)) {
+ kasan_report_invalid_free(tagged_object, ip);
+ return true;
+ }
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
+ return false;
+
+ shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
+ if (shadow_invalid(tag, shadow_byte)) {
+ kasan_report_invalid_free(tagged_object, ip);
+ return true;
+ }
+
+ rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
+ unlikely(!(cache->flags & SLAB_KASAN)))
+ return false;
+
+ kasan_set_free_info(cache, object, tag);
+
+ quarantine_put(get_free_info(cache, object), cache);
+
+ return IS_ENABLED(CONFIG_KASAN_GENERIC);
+}
+
+bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+{
+ return __kasan_slab_free(cache, object, ip, true);
+}
+
+static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags, bool keep_tag)
+{
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+ u8 tag = 0xff;
+
+ if (gfpflags_allow_blocking(flags))
+ quarantine_reduce();
+
+ if (unlikely(object == NULL))
+ return NULL;
+
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = round_up((unsigned long)object + cache->object_size,
+ KASAN_SHADOW_SCALE_SIZE);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ tag = assign_tag(cache, object, false, keep_tag);
+
+ /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
+ kasan_unpoison_shadow(set_tag(object, tag), size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
+
+ if (cache->flags & SLAB_KASAN)
+ set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+
+ return set_tag(object, tag);
+}
+
+void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
+ gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+}
+
+void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
+{
+ return __kasan_kmalloc(cache, object, size, flags, true);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
+ gfp_t flags)
+{
+ struct page *page;
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (gfpflags_allow_blocking(flags))
+ quarantine_reduce();
+
+ if (unlikely(ptr == NULL))
+ return NULL;
+
+ page = virt_to_page(ptr);
+ redzone_start = round_up((unsigned long)(ptr + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = (unsigned long)ptr + page_size(page);
+
+ kasan_unpoison_shadow(ptr, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
+
+ return (void *)ptr;
+}
+
+void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return (void *)object;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page)))
+ return kasan_kmalloc_large(object, size, flags);
+ else
+ return __kasan_kmalloc(page->slab_cache, object, size,
+ flags, true);
+}
+
+void kasan_poison_kfree(void *ptr, unsigned long ip)
+{
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+
+ if (unlikely(!PageSlab(page))) {
+ if (ptr != page_address(page)) {
+ kasan_report_invalid_free(ptr, ip);
+ return;
+ }
+ kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
+ } else {
+ __kasan_slab_free(page->slab_cache, ptr, ip, false);
+ }
+}
+
+void kasan_kfree_large(void *ptr, unsigned long ip)
+{
+ if (ptr != page_address(virt_to_head_page(ptr)))
+ kasan_report_invalid_free(ptr, ip);
+ /* The object will be poisoned by page_alloc. */
+}
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t scaled_size;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (ret) {
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+ find_vm_area(addr)->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+ if (vm->flags & VM_KASAN)
+ vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
+
+void kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ __kasan_report(addr, size, is_write, ip);
+ user_access_restore(flags);
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one is
+ * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
+ * pud_bad(), if pud is bad then it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
+ WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ /*
+ * If shadow is mapped already then it must have been mapped
+ * during boot. This can happen if we are onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+core_initcall(kasan_memhotplug_init);
+#endif
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
new file mode 100644
index 0000000..616f9dd
--- /dev/null
+++ b/mm/kasan/generic.c
@@ -0,0 +1,326 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core generic KASAN code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * All functions below are always inlined so that the compiler can
+ * perform better optimizations in each of __asan_loadX/__asan_storeX
+ * depending on memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(shadow_value)) {
+ s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ return unlikely(last_accessible_byte >= shadow_value);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
+ unsigned long size)
+{
+ u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
+
+ /*
+ * Access crosses 8(shadow size)-byte boundary. Such access maps
+ * into 2 shadow bytes, so we need to check them both.
+ */
+ if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
+
+ return memory_is_poisoned_1(addr + size - 1);
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ /* Unaligned 16-bytes access maps into 3 shadow bytes. */
+ if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ return *shadow_addr || memory_is_poisoned_1(addr + 15);
+
+ return *shadow_addr;
+}
+
+static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
+ size_t size)
+{
+ while (size) {
+ if (unlikely(*start))
+ return (unsigned long)start;
+ start++;
+ size--;
+ }
+
+ return 0;
+}
+
+static __always_inline unsigned long memory_is_nonzero(const void *start,
+ const void *end)
+{
+ unsigned int words;
+ unsigned long ret;
+ unsigned int prefix = (unsigned long)start % 8;
+
+ if (end - start <= 16)
+ return bytes_is_nonzero(start, end - start);
+
+ if (prefix) {
+ prefix = 8 - prefix;
+ ret = bytes_is_nonzero(start, prefix);
+ if (unlikely(ret))
+ return ret;
+ start += prefix;
+ }
+
+ words = (end - start) / 8;
+ while (words) {
+ if (unlikely(*(u64 *)start))
+ return bytes_is_nonzero(start, 8);
+ start += 8;
+ words--;
+ }
+
+ return bytes_is_nonzero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+ size_t size)
+{
+ unsigned long ret;
+
+ ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
+ kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+ if (unlikely(ret)) {
+ unsigned long last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+ if (unlikely(ret != (unsigned long)last_shadow ||
+ ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+ if (__builtin_constant_p(size)) {
+ switch (size) {
+ case 1:
+ return memory_is_poisoned_1(addr);
+ case 2:
+ case 4:
+ case 8:
+ return memory_is_poisoned_2_4_8(addr, size);
+ case 16:
+ return memory_is_poisoned_16(addr);
+ default:
+ BUILD_BUG();
+ }
+ }
+
+ return memory_is_poisoned_n(addr, size);
+}
+
+static __always_inline bool check_memory_region_inline(unsigned long addr,
+ size_t size, bool write,
+ unsigned long ret_ip)
+{
+ if (unlikely(size == 0))
+ return true;
+
+ if (unlikely((void *)addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ kasan_report(addr, size, write, ret_ip);
+ return false;
+ }
+
+ if (likely(!memory_is_poisoned(addr, size)))
+ return true;
+
+ kasan_report(addr, size, write, ret_ip);
+ return false;
+}
+
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ return check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_cache_shrink(struct kmem_cache *cache)
+{
+ quarantine_remove_cache(cache);
+}
+
+void kasan_cache_shutdown(struct kmem_cache *cache)
+{
+ if (!__kmem_cache_empty(cache))
+ quarantine_remove_cache(cache);
+}
+
+static void register_global(struct kasan_global *global)
+{
+ size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(global->beg, global->size);
+
+ kasan_poison_shadow(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+ size_t i; /* same type as 'size': exact compare, no int overflow */
+ /* Pass each of the 'size' descriptors in 'globals' to register_global(). */
+ for (i = 0; i < size; i++)
+ register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, false, _RET_IP_);\
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region_inline(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+/* Emitted by compiler to poison alloca()ed objects. */
+void __asan_alloca_poison(unsigned long addr, size_t size)
+{
+ size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
+ rounded_up_size;
+ size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+
+ const void *left_redzone = (const void *)(addr -
+ KASAN_ALLOCA_REDZONE_SIZE);
+ const void *right_redzone = (const void *)(addr + rounded_up_size);
+
+ WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
+
+ kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ kasan_poison_shadow(right_redzone,
+ padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
+}
+EXPORT_SYMBOL(__asan_alloca_poison);
+
+/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
+void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
+{
+ if (unlikely(!stack_top || stack_top > stack_bottom))
+ return;
+
+ kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+}
+EXPORT_SYMBOL(__asan_allocas_unpoison);
+
+/* Emitted by the compiler to [un]poison local variables. */
+#define DEFINE_ASAN_SET_SHADOW(byte) \
+ void __asan_set_shadow_##byte(const void *addr, size_t size) \
+ { \
+ __memset((void *)addr, 0x##byte, size); \
+ } \
+ EXPORT_SYMBOL(__asan_set_shadow_##byte)
+
+DEFINE_ASAN_SET_SHADOW(00);
+DEFINE_ASAN_SET_SHADOW(f1);
+DEFINE_ASAN_SET_SHADOW(f2);
+DEFINE_ASAN_SET_SHADOW(f3);
+DEFINE_ASAN_SET_SHADOW(f5);
+DEFINE_ASAN_SET_SHADOW(f8);
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
new file mode 100644
index 0000000..36c6459
--- /dev/null
+++ b/mm/kasan/generic_report.c
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ void *p = addr;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in (0, KASAN_SHADOW_SCALE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ if (addr_has_shadow(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/kasan_init.c b/mm/kasan/init.c
similarity index 79%
rename from mm/kasan/kasan_init.c
rename to mm/kasan/init.c
index 7a2a2f1..ce45c49 100644
--- a/mm/kasan/kasan_init.c
+++ b/mm/kasan/init.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* This file contains some kasan initialization code.
*
@@ -10,11 +11,10 @@
*
*/
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
-#include <linux/memblock.h>
#include <linux/mm.h>
#include <linux/pfn.h>
#include <linux/slab.h>
@@ -31,60 +31,66 @@
* - Latter it reused it as zero shadow to cover large ranges of memory
* that allowed to access, but not handled by kasan (vmalloc/vmemmap ...).
*/
-unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+unsigned char kasan_early_shadow_page[PAGE_SIZE] __page_aligned_bss;
#if CONFIG_PGTABLE_LEVELS > 4
-p4d_t kasan_zero_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
+p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D] __page_aligned_bss;
static inline bool kasan_p4d_table(pgd_t pgd)
{
- return pgd_page(pgd) == virt_to_page(lm_alias(kasan_zero_p4d));
+ return pgd_page(pgd) == virt_to_page(lm_alias(kasan_early_shadow_p4d));
}
#else
static inline bool kasan_p4d_table(pgd_t pgd)
{
- return 0;
+ return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 3
-pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+pud_t kasan_early_shadow_pud[PTRS_PER_PUD] __page_aligned_bss;
static inline bool kasan_pud_table(p4d_t p4d)
{
- return p4d_page(p4d) == virt_to_page(lm_alias(kasan_zero_pud));
+ return p4d_page(p4d) == virt_to_page(lm_alias(kasan_early_shadow_pud));
}
#else
static inline bool kasan_pud_table(p4d_t p4d)
{
- return 0;
+ return false;
}
#endif
#if CONFIG_PGTABLE_LEVELS > 2
-pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD] __page_aligned_bss;
static inline bool kasan_pmd_table(pud_t pud)
{
- return pud_page(pud) == virt_to_page(lm_alias(kasan_zero_pmd));
+ return pud_page(pud) == virt_to_page(lm_alias(kasan_early_shadow_pmd));
}
#else
static inline bool kasan_pmd_table(pud_t pud)
{
- return 0;
+ return false;
}
#endif
-pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss;
static inline bool kasan_pte_table(pmd_t pmd)
{
- return pmd_page(pmd) == virt_to_page(lm_alias(kasan_zero_pte));
+ return pmd_page(pmd) == virt_to_page(lm_alias(kasan_early_shadow_pte));
}
-static inline bool kasan_zero_page_entry(pte_t pte)
+static inline bool kasan_early_shadow_page_entry(pte_t pte)
{
- return pte_page(pte) == virt_to_page(lm_alias(kasan_zero_page));
+ return pte_page(pte) == virt_to_page(lm_alias(kasan_early_shadow_page));
}
static __init void *early_alloc(size_t size, int node)
{
- return memblock_virt_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ void *ptr = memblock_alloc_try_nid(size, size, __pa(MAX_DMA_ADDRESS),
+ MEMBLOCK_ALLOC_ACCESSIBLE, node);
+
+ if (!ptr)
+ panic("%s: Failed to allocate %zu bytes align=%zx nid=%d from=%llx\n",
+ __func__, size, size, node, (u64)__pa(MAX_DMA_ADDRESS));
+
+ return ptr;
}
static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr,
@@ -93,7 +99,8 @@
pte_t *pte = pte_offset_kernel(pmd, addr);
pte_t zero_pte;
- zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_zero_page)), PAGE_KERNEL);
+ zero_pte = pfn_pte(PFN_DOWN(__pa_symbol(kasan_early_shadow_page)),
+ PAGE_KERNEL);
zero_pte = pte_wrprotect(zero_pte);
while (addr + PAGE_SIZE <= end) {
@@ -113,7 +120,8 @@
next = pmd_addr_end(addr, end);
if (IS_ALIGNED(addr, PMD_SIZE) && end - addr >= PMD_SIZE) {
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -121,7 +129,7 @@
pte_t *p;
if (slab_is_available())
- p = pte_alloc_one_kernel(&init_mm, addr);
+ p = pte_alloc_one_kernel(&init_mm);
else
p = early_alloc(PAGE_SIZE, NUMA_NO_NODE);
if (!p)
@@ -146,9 +154,11 @@
if (IS_ALIGNED(addr, PUD_SIZE) && end - addr >= PUD_SIZE) {
pmd_t *pmd;
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -182,12 +192,14 @@
pud_t *pud;
pmd_t *pmd;
- p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
pmd_populate_kernel(&init_mm, pmd,
- lm_alias(kasan_zero_pte));
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -210,13 +222,13 @@
}
/**
- * kasan_populate_zero_shadow - populate shadow memory region with
- * kasan_zero_page
+ * kasan_populate_early_shadow - populate shadow memory region with
+ * kasan_early_shadow_page
* @shadow_start - start of the memory range to populate
* @shadow_end - end of the memory range to populate
*/
-int __ref kasan_populate_zero_shadow(const void *shadow_start,
- const void *shadow_end)
+int __ref kasan_populate_early_shadow(const void *shadow_start,
+ const void *shadow_end)
{
unsigned long addr = (unsigned long)shadow_start;
unsigned long end = (unsigned long)shadow_end;
@@ -232,7 +244,7 @@
pmd_t *pmd;
/*
- * kasan_zero_pud should be populated with pmds
+ * kasan_early_shadow_pud should be populated with pmds
* at this moment.
* [pud,pmd]_populate*() below needed only for
* 3,2 - level page tables where we don't have
@@ -242,21 +254,25 @@
* The ifndef is required to avoid build breakage.
*
* With 5level-fixup.h, pgd_populate() is not nop and
- * we reference kasan_zero_p4d. It's not defined
+ * we reference kasan_early_shadow_p4d. It's not defined
* unless 5-level paging enabled.
*
* The ifndef can be dropped once all KASAN-enabled
* architectures will switch to pgtable-nop4d.h.
*/
#ifndef __ARCH_HAS_5LEVEL_HACK
- pgd_populate(&init_mm, pgd, lm_alias(kasan_zero_p4d));
+ pgd_populate(&init_mm, pgd,
+ lm_alias(kasan_early_shadow_p4d));
#endif
p4d = p4d_offset(pgd, addr);
- p4d_populate(&init_mm, p4d, lm_alias(kasan_zero_pud));
+ p4d_populate(&init_mm, p4d,
+ lm_alias(kasan_early_shadow_pud));
pud = pud_offset(p4d, addr);
- pud_populate(&init_mm, pud, lm_alias(kasan_zero_pmd));
+ pud_populate(&init_mm, pud,
+ lm_alias(kasan_early_shadow_pmd));
pmd = pmd_offset(pud, addr);
- pmd_populate_kernel(&init_mm, pmd, lm_alias(kasan_zero_pte));
+ pmd_populate_kernel(&init_mm, pmd,
+ lm_alias(kasan_early_shadow_pte));
continue;
}
@@ -351,7 +367,7 @@
if (!pte_present(*pte))
continue;
- if (WARN_ON(!kasan_zero_page_entry(*pte)))
+ if (WARN_ON(!kasan_early_shadow_page_entry(*pte)))
continue;
pte_clear(&init_mm, addr, pte);
}
@@ -481,7 +497,7 @@
WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
return -EINVAL;
- ret = kasan_populate_zero_shadow(shadow_start, shadow_end);
+ ret = kasan_populate_early_shadow(shadow_start, shadow_end);
if (ret)
kasan_remove_zero_shadow(shadow_start,
size >> KASAN_SHADOW_SCALE_SHIFT);
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
deleted file mode 100644
index c3bd520..0000000
--- a/mm/kasan/kasan.c
+++ /dev/null
@@ -1,903 +0,0 @@
-/*
- * This file contains shadow memory manipulation code.
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * Some code borrowed from https://github.com/xairy/kasan-prototype by
- * Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
-
-#include <linux/export.h>
-#include <linux/interrupt.h>
-#include <linux/init.h>
-#include <linux/kasan.h>
-#include <linux/kernel.h>
-#include <linux/kmemleak.h>
-#include <linux/linkage.h>
-#include <linux/memblock.h>
-#include <linux/memory.h>
-#include <linux/mm.h>
-#include <linux/module.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/sched/task_stack.h>
-#include <linux/slab.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/vmalloc.h>
-#include <linux/bug.h>
-
-#include "kasan.h"
-#include "../slab.h"
-
-void kasan_enable_current(void)
-{
- current->kasan_depth++;
-}
-
-void kasan_disable_current(void)
-{
- current->kasan_depth--;
-}
-
-/*
- * Poisons the shadow memory for 'size' bytes starting from 'addr'.
- * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
- */
-static void kasan_poison_shadow(const void *address, size_t size, u8 value)
-{
- void *shadow_start, *shadow_end;
-
- shadow_start = kasan_mem_to_shadow(address);
- shadow_end = kasan_mem_to_shadow(address + size);
-
- memset(shadow_start, value, shadow_end - shadow_start);
-}
-
-void kasan_unpoison_shadow(const void *address, size_t size)
-{
- kasan_poison_shadow(address, size, 0);
-
- if (size & KASAN_SHADOW_MASK) {
- u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
- *shadow = size & KASAN_SHADOW_MASK;
- }
-}
-
-static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
-{
- void *base = task_stack_page(task);
- size_t size = sp - base;
-
- kasan_unpoison_shadow(base, size);
-}
-
-/* Unpoison the entire stack for a task. */
-void kasan_unpoison_task_stack(struct task_struct *task)
-{
- __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
-}
-
-/* Unpoison the stack for the current task beyond a watermark sp value. */
-asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
-{
- /*
- * Calculate the task stack base address. Avoid using 'current'
- * because this function is called by early resume code which hasn't
- * yet set up the percpu register (%gs).
- */
- void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
-
- kasan_unpoison_shadow(base, watermark - base);
-}
-
-/*
- * Clear all poison for the region between the current SP and a provided
- * watermark value, as is sometimes required prior to hand-crafted asm function
- * returns in the middle of functions.
- */
-void kasan_unpoison_stack_above_sp_to(const void *watermark)
-{
- const void *sp = __builtin_frame_address(0);
- size_t size = watermark - sp;
-
- if (WARN_ON(sp > watermark))
- return;
- kasan_unpoison_shadow(sp, size);
-}
-
-/*
- * All functions below always inlined so compiler could
- * perform better optimizations in each of __asan_loadX/__assn_storeX
- * depending on memory access size X.
- */
-
-static __always_inline bool memory_is_poisoned_1(unsigned long addr)
-{
- s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
-
- if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
- return unlikely(last_accessible_byte >= shadow_value);
- }
-
- return false;
-}
-
-static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
- unsigned long size)
-{
- u8 *shadow_addr = (u8 *)kasan_mem_to_shadow((void *)addr);
-
- /*
- * Access crosses 8(shadow size)-byte boundary. Such access maps
- * into 2 shadow bytes, so we need to check them both.
- */
- if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
- return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
-
- return memory_is_poisoned_1(addr + size - 1);
-}
-
-static __always_inline bool memory_is_poisoned_16(unsigned long addr)
-{
- u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
-
- /* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
- return *shadow_addr || memory_is_poisoned_1(addr + 15);
-
- return *shadow_addr;
-}
-
-static __always_inline unsigned long bytes_is_nonzero(const u8 *start,
- size_t size)
-{
- while (size) {
- if (unlikely(*start))
- return (unsigned long)start;
- start++;
- size--;
- }
-
- return 0;
-}
-
-static __always_inline unsigned long memory_is_nonzero(const void *start,
- const void *end)
-{
- unsigned int words;
- unsigned long ret;
- unsigned int prefix = (unsigned long)start % 8;
-
- if (end - start <= 16)
- return bytes_is_nonzero(start, end - start);
-
- if (prefix) {
- prefix = 8 - prefix;
- ret = bytes_is_nonzero(start, prefix);
- if (unlikely(ret))
- return ret;
- start += prefix;
- }
-
- words = (end - start) / 8;
- while (words) {
- if (unlikely(*(u64 *)start))
- return bytes_is_nonzero(start, 8);
- start += 8;
- words--;
- }
-
- return bytes_is_nonzero(start, (end - start) % 8);
-}
-
-static __always_inline bool memory_is_poisoned_n(unsigned long addr,
- size_t size)
-{
- unsigned long ret;
-
- ret = memory_is_nonzero(kasan_mem_to_shadow((void *)addr),
- kasan_mem_to_shadow((void *)addr + size - 1) + 1);
-
- if (unlikely(ret)) {
- unsigned long last_byte = addr + size - 1;
- s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
-
- if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
- return true;
- }
- return false;
-}
-
-static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
-{
- if (__builtin_constant_p(size)) {
- switch (size) {
- case 1:
- return memory_is_poisoned_1(addr);
- case 2:
- case 4:
- case 8:
- return memory_is_poisoned_2_4_8(addr, size);
- case 16:
- return memory_is_poisoned_16(addr);
- default:
- BUILD_BUG();
- }
- }
-
- return memory_is_poisoned_n(addr, size);
-}
-
-static __always_inline void check_memory_region_inline(unsigned long addr,
- size_t size, bool write,
- unsigned long ret_ip)
-{
- if (unlikely(size == 0))
- return;
-
- if (unlikely((void *)addr <
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
- kasan_report(addr, size, write, ret_ip);
- return;
- }
-
- if (likely(!memory_is_poisoned(addr, size)))
- return;
-
- kasan_report(addr, size, write, ret_ip);
-}
-
-static void check_memory_region(unsigned long addr,
- size_t size, bool write,
- unsigned long ret_ip)
-{
- check_memory_region_inline(addr, size, write, ret_ip);
-}
-
-void kasan_check_read(const volatile void *p, unsigned int size)
-{
- check_memory_region((unsigned long)p, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(kasan_check_read);
-
-void kasan_check_write(const volatile void *p, unsigned int size)
-{
- check_memory_region((unsigned long)p, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(kasan_check_write);
-
-#undef memset
-void *memset(void *addr, int c, size_t len)
-{
- check_memory_region((unsigned long)addr, len, true, _RET_IP_);
-
- return __memset(addr, c, len);
-}
-
-#undef memmove
-void *memmove(void *dest, const void *src, size_t len)
-{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
-
- return __memmove(dest, src, len);
-}
-
-#undef memcpy
-void *memcpy(void *dest, const void *src, size_t len)
-{
- check_memory_region((unsigned long)src, len, false, _RET_IP_);
- check_memory_region((unsigned long)dest, len, true, _RET_IP_);
-
- return __memcpy(dest, src, len);
-}
-
-void kasan_alloc_pages(struct page *page, unsigned int order)
-{
- if (likely(!PageHighMem(page)))
- kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
-}
-
-void kasan_free_pages(struct page *page, unsigned int order)
-{
- if (likely(!PageHighMem(page)))
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << order,
- KASAN_FREE_PAGE);
-}
-
-/*
- * Adaptive redzone policy taken from the userspace AddressSanitizer runtime.
- * For larger allocations larger redzones are used.
- */
-static unsigned int optimal_redzone(unsigned int object_size)
-{
- return
- object_size <= 64 - 16 ? 16 :
- object_size <= 128 - 32 ? 32 :
- object_size <= 512 - 64 ? 64 :
- object_size <= 4096 - 128 ? 128 :
- object_size <= (1 << 14) - 256 ? 256 :
- object_size <= (1 << 15) - 512 ? 512 :
- object_size <= (1 << 16) - 1024 ? 1024 : 2048;
-}
-
-void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
-{
- unsigned int orig_size = *size;
- int redzone_adjust;
-
- /* Add alloc meta. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
-
- /* Add free meta. */
- if (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta)) {
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
- }
- redzone_adjust = optimal_redzone(cache->object_size) -
- (*size - cache->object_size);
-
- if (redzone_adjust > 0)
- *size += redzone_adjust;
-
- *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
- max(*size, cache->object_size +
- optimal_redzone(cache->object_size)));
-
- /*
- * If the metadata doesn't fit, don't enable KASAN at all.
- */
- if (*size <= cache->kasan_info.alloc_meta_offset ||
- *size <= cache->kasan_info.free_meta_offset) {
- cache->kasan_info.alloc_meta_offset = 0;
- cache->kasan_info.free_meta_offset = 0;
- *size = orig_size;
- return;
- }
-
- *flags |= SLAB_KASAN;
-}
-
-void kasan_cache_shrink(struct kmem_cache *cache)
-{
- quarantine_remove_cache(cache);
-}
-
-void kasan_cache_shutdown(struct kmem_cache *cache)
-{
- if (!__kmem_cache_empty(cache))
- quarantine_remove_cache(cache);
-}
-
-size_t kasan_metadata_size(struct kmem_cache *cache)
-{
- return (cache->kasan_info.alloc_meta_offset ?
- sizeof(struct kasan_alloc_meta) : 0) +
- (cache->kasan_info.free_meta_offset ?
- sizeof(struct kasan_free_meta) : 0);
-}
-
-void kasan_poison_slab(struct page *page)
-{
- kasan_poison_shadow(page_address(page),
- PAGE_SIZE << compound_order(page),
- KASAN_KMALLOC_REDZONE);
-}
-
-void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
-{
- kasan_unpoison_shadow(object, cache->object_size);
-}
-
-void kasan_poison_object_data(struct kmem_cache *cache, void *object)
-{
- kasan_poison_shadow(object,
- round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_KMALLOC_REDZONE);
-}
-
-static inline int in_irqentry_text(unsigned long ptr)
-{
- return (ptr >= (unsigned long)&__irqentry_text_start &&
- ptr < (unsigned long)&__irqentry_text_end) ||
- (ptr >= (unsigned long)&__softirqentry_text_start &&
- ptr < (unsigned long)&__softirqentry_text_end);
-}
-
-static inline void filter_irq_stacks(struct stack_trace *trace)
-{
- int i;
-
- if (!trace->nr_entries)
- return;
- for (i = 0; i < trace->nr_entries; i++)
- if (in_irqentry_text(trace->entries[i])) {
- /* Include the irqentry function into the stack. */
- trace->nr_entries = i + 1;
- break;
- }
-}
-
-static inline depot_stack_handle_t save_stack(gfp_t flags)
-{
- unsigned long entries[KASAN_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = KASAN_STACK_DEPTH,
- .skip = 0
- };
-
- save_stack_trace(&trace);
- filter_irq_stacks(&trace);
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries-1] == ULONG_MAX)
- trace.nr_entries--;
-
- return depot_save_stack(&trace, flags);
-}
-
-static inline void set_track(struct kasan_track *track, gfp_t flags)
-{
- track->pid = current->pid;
- track->stack = save_stack(flags);
-}
-
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_alloc_meta) > 32);
- return (void *)object + cache->kasan_info.alloc_meta_offset;
-}
-
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object)
-{
- BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- return (void *)object + cache->kasan_info.free_meta_offset;
-}
-
-void kasan_init_slab_obj(struct kmem_cache *cache, const void *object)
-{
- struct kasan_alloc_meta *alloc_info;
-
- if (!(cache->flags & SLAB_KASAN))
- return;
-
- alloc_info = get_alloc_info(cache, object);
- __memset(alloc_info, 0, sizeof(*alloc_info));
-}
-
-void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags)
-{
- kasan_kmalloc(cache, object, cache->object_size, flags);
-}
-
-static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
- unsigned long ip, bool quarantine)
-{
- s8 shadow_byte;
- unsigned long rounded_up_size;
-
- if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
- object)) {
- kasan_report_invalid_free(object, ip);
- return true;
- }
-
- /* RCU slabs could be legally used after free within the RCU period */
- if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
- return false;
-
- shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
- if (shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE) {
- kasan_report_invalid_free(object, ip);
- return true;
- }
-
- rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
-
- if (!quarantine || unlikely(!(cache->flags & SLAB_KASAN)))
- return false;
-
- set_track(&get_alloc_info(cache, object)->free_track, GFP_NOWAIT);
- quarantine_put(get_free_info(cache, object), cache);
- return true;
-}
-
-bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
-{
- return __kasan_slab_free(cache, object, ip, true);
-}
-
-void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size,
- gfp_t flags)
-{
- unsigned long redzone_start;
- unsigned long redzone_end;
-
- if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
-
- if (unlikely(object == NULL))
- return;
-
- redzone_start = round_up((unsigned long)(object + size),
- KASAN_SHADOW_SCALE_SIZE);
- redzone_end = round_up((unsigned long)object + cache->object_size,
- KASAN_SHADOW_SCALE_SIZE);
-
- kasan_unpoison_shadow(object, size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
-
- if (cache->flags & SLAB_KASAN)
- set_track(&get_alloc_info(cache, object)->alloc_track, flags);
-}
-EXPORT_SYMBOL(kasan_kmalloc);
-
-void kasan_kmalloc_large(const void *ptr, size_t size, gfp_t flags)
-{
- struct page *page;
- unsigned long redzone_start;
- unsigned long redzone_end;
-
- if (gfpflags_allow_blocking(flags))
- quarantine_reduce();
-
- if (unlikely(ptr == NULL))
- return;
-
- page = virt_to_page(ptr);
- redzone_start = round_up((unsigned long)(ptr + size),
- KASAN_SHADOW_SCALE_SIZE);
- redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
-
- kasan_unpoison_shadow(ptr, size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
-}
-
-void kasan_krealloc(const void *object, size_t size, gfp_t flags)
-{
- struct page *page;
-
- if (unlikely(object == ZERO_SIZE_PTR))
- return;
-
- page = virt_to_head_page(object);
-
- if (unlikely(!PageSlab(page)))
- kasan_kmalloc_large(object, size, flags);
- else
- kasan_kmalloc(page->slab_cache, object, size, flags);
-}
-
-void kasan_poison_kfree(void *ptr, unsigned long ip)
-{
- struct page *page;
-
- page = virt_to_head_page(ptr);
-
- if (unlikely(!PageSlab(page))) {
- if (ptr != page_address(page)) {
- kasan_report_invalid_free(ptr, ip);
- return;
- }
- kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
- KASAN_FREE_PAGE);
- } else {
- __kasan_slab_free(page->slab_cache, ptr, ip, false);
- }
-}
-
-void kasan_kfree_large(void *ptr, unsigned long ip)
-{
- if (ptr != page_address(virt_to_head_page(ptr)))
- kasan_report_invalid_free(ptr, ip);
- /* The object will be poisoned by page_alloc. */
-}
-
-int kasan_module_alloc(void *addr, size_t size)
-{
- void *ret;
- size_t scaled_size;
- size_t shadow_size;
- unsigned long shadow_start;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
- scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
- shadow_size = round_up(scaled_size, PAGE_SIZE);
-
- if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
- return -EINVAL;
-
- ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
- shadow_start + shadow_size,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
- __builtin_return_address(0));
-
- if (ret) {
- find_vm_area(addr)->flags |= VM_KASAN;
- kmemleak_ignore(ret);
- return 0;
- }
-
- return -ENOMEM;
-}
-
-void kasan_free_shadow(const struct vm_struct *vm)
-{
- if (vm->flags & VM_KASAN)
- vfree(kasan_mem_to_shadow(vm->addr));
-}
-
-static void register_global(struct kasan_global *global)
-{
- size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
-
- kasan_unpoison_shadow(global->beg, global->size);
-
- kasan_poison_shadow(global->beg + aligned_size,
- global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
-}
-
-void __asan_register_globals(struct kasan_global *globals, size_t size)
-{
- int i;
-
- for (i = 0; i < size; i++)
- register_global(&globals[i]);
-}
-EXPORT_SYMBOL(__asan_register_globals);
-
-void __asan_unregister_globals(struct kasan_global *globals, size_t size)
-{
-}
-EXPORT_SYMBOL(__asan_unregister_globals);
-
-#define DEFINE_ASAN_LOAD_STORE(size) \
- void __asan_load##size(unsigned long addr) \
- { \
- check_memory_region_inline(addr, size, false, _RET_IP_);\
- } \
- EXPORT_SYMBOL(__asan_load##size); \
- __alias(__asan_load##size) \
- void __asan_load##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_load##size##_noabort); \
- void __asan_store##size(unsigned long addr) \
- { \
- check_memory_region_inline(addr, size, true, _RET_IP_); \
- } \
- EXPORT_SYMBOL(__asan_store##size); \
- __alias(__asan_store##size) \
- void __asan_store##size##_noabort(unsigned long); \
- EXPORT_SYMBOL(__asan_store##size##_noabort)
-
-DEFINE_ASAN_LOAD_STORE(1);
-DEFINE_ASAN_LOAD_STORE(2);
-DEFINE_ASAN_LOAD_STORE(4);
-DEFINE_ASAN_LOAD_STORE(8);
-DEFINE_ASAN_LOAD_STORE(16);
-
-void __asan_loadN(unsigned long addr, size_t size)
-{
- check_memory_region(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_loadN);
-
-__alias(__asan_loadN)
-void __asan_loadN_noabort(unsigned long, size_t);
-EXPORT_SYMBOL(__asan_loadN_noabort);
-
-void __asan_storeN(unsigned long addr, size_t size)
-{
- check_memory_region(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_storeN);
-
-__alias(__asan_storeN)
-void __asan_storeN_noabort(unsigned long, size_t);
-EXPORT_SYMBOL(__asan_storeN_noabort);
-
-/* to shut up compiler complaints */
-void __asan_handle_no_return(void) {}
-EXPORT_SYMBOL(__asan_handle_no_return);
-
-/* Emitted by compiler to poison large objects when they go out of scope. */
-void __asan_poison_stack_memory(const void *addr, size_t size)
-{
- /*
- * Addr is KASAN_SHADOW_SCALE_SIZE-aligned and the object is surrounded
- * by redzones, so we simply round up size to simplify logic.
- */
- kasan_poison_shadow(addr, round_up(size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_USE_AFTER_SCOPE);
-}
-EXPORT_SYMBOL(__asan_poison_stack_memory);
-
-/* Emitted by compiler to unpoison large objects when they go into scope. */
-void __asan_unpoison_stack_memory(const void *addr, size_t size)
-{
- kasan_unpoison_shadow(addr, size);
-}
-EXPORT_SYMBOL(__asan_unpoison_stack_memory);
-
-/* Emitted by compiler to poison alloca()ed objects. */
-void __asan_alloca_poison(unsigned long addr, size_t size)
-{
- size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
- size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
- rounded_up_size;
- size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
-
- const void *left_redzone = (const void *)(addr -
- KASAN_ALLOCA_REDZONE_SIZE);
- const void *right_redzone = (const void *)(addr + rounded_up_size);
-
- WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
-
- kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
- kasan_poison_shadow(right_redzone,
- padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
-}
-EXPORT_SYMBOL(__asan_alloca_poison);
-
-/* Emitted by compiler to unpoison alloca()ed areas when the stack unwinds. */
-void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
-{
- if (unlikely(!stack_top || stack_top > stack_bottom))
- return;
-
- kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
-}
-EXPORT_SYMBOL(__asan_allocas_unpoison);
-
-/* Emitted by the compiler to [un]poison local variables. */
-#define DEFINE_ASAN_SET_SHADOW(byte) \
- void __asan_set_shadow_##byte(const void *addr, size_t size) \
- { \
- __memset((void *)addr, 0x##byte, size); \
- } \
- EXPORT_SYMBOL(__asan_set_shadow_##byte)
-
-DEFINE_ASAN_SET_SHADOW(00);
-DEFINE_ASAN_SET_SHADOW(f1);
-DEFINE_ASAN_SET_SHADOW(f2);
-DEFINE_ASAN_SET_SHADOW(f3);
-DEFINE_ASAN_SET_SHADOW(f5);
-DEFINE_ASAN_SET_SHADOW(f8);
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static bool shadow_mapped(unsigned long addr)
-{
- pgd_t *pgd = pgd_offset_k(addr);
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (pgd_none(*pgd))
- return false;
- p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
- return false;
- pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
- return false;
-
- /*
- * We can't use pud_large() or pud_huge(), the first one is
- * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
- * pud_bad(), if pud is bad then it's bad because it's huge.
- */
- if (pud_bad(*pud))
- return true;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return false;
-
- if (pmd_bad(*pmd))
- return true;
- pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
-}
-
-static int __meminit kasan_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct memory_notify *mem_data = data;
- unsigned long nr_shadow_pages, start_kaddr, shadow_start;
- unsigned long shadow_end, shadow_size;
-
- nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
- start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
- shadow_size = nr_shadow_pages << PAGE_SHIFT;
- shadow_end = shadow_start + shadow_size;
-
- if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
- WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
- return NOTIFY_BAD;
-
- switch (action) {
- case MEM_GOING_ONLINE: {
- void *ret;
-
- /*
- * If shadow is mapped already than it must have been mapped
- * during the boot. This could happen if we onlining previously
- * offlined memory.
- */
- if (shadow_mapped(shadow_start))
- return NOTIFY_OK;
-
- ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
- shadow_end, GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD,
- pfn_to_nid(mem_data->start_pfn),
- __builtin_return_address(0));
- if (!ret)
- return NOTIFY_BAD;
-
- kmemleak_ignore(ret);
- return NOTIFY_OK;
- }
- case MEM_CANCEL_ONLINE:
- case MEM_OFFLINE: {
- struct vm_struct *vm;
-
- /*
- * shadow_start was either mapped during boot by kasan_init()
- * or during memory online by __vmalloc_node_range().
- * In the latter case we can use vfree() to free shadow.
- * Non-NULL result of the find_vm_area() will tell us if
- * that was the second case.
- *
- * Currently it's not possible to free shadow mapped
- * during boot by kasan_init(). It's because the code
- * to do that hasn't been written yet. So we'll just
- * leak the memory.
- */
- vm = find_vm_area((void *)shadow_start);
- if (vm)
- vfree((void *)shadow_start);
- }
- }
-
- return NOTIFY_OK;
-}
-
-static int __init kasan_memhotplug_init(void)
-{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
-
- return 0;
-}
-
-core_initcall(kasan_memhotplug_init);
-#endif
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index c12dcfd..35cff6b 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -8,10 +8,22 @@
#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
+#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
+#define KASAN_TAG_MAX 0xFD /* maximum value for random tags */
+
+#ifdef CONFIG_KASAN_GENERIC
#define KASAN_FREE_PAGE 0xFF /* page was freed */
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#else
+#define KASAN_FREE_PAGE KASAN_TAG_INVALID
+#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#endif
+
#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
/*
@@ -22,7 +34,6 @@
#define KASAN_STACK_MID 0xF2
#define KASAN_STACK_RIGHT 0xF3
#define KASAN_STACK_PARTIAL 0xF4
-#define KASAN_USE_AFTER_SCOPE 0xF8
/*
* alloca redzone shadow values
@@ -32,6 +43,11 @@
#define KASAN_ALLOCA_REDZONE_SIZE 32
+/*
+ * Stack frame marker (compiler ABI).
+ */
+#define KASAN_CURRENT_STACK_FRAME_MAGIC 0x41B58AB3
+
/* Don't break randconfig/all*config builds */
#ifndef KASAN_ABI_VERSION
#define KASAN_ABI_VERSION 1
@@ -79,9 +95,19 @@
depot_stack_handle_t stack;
};
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+#define KASAN_NR_FREE_STACKS 5
+#else
+#define KASAN_NR_FREE_STACKS 1
+#endif
+
struct kasan_alloc_meta {
struct kasan_track alloc_track;
- struct kasan_track free_track;
+ struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
+ u8 free_track_idx;
+#endif
};
struct qlist_node {
@@ -105,11 +131,35 @@
<< KASAN_SHADOW_SCALE_SHIFT);
}
+/*
+ * Return true if @addr is not below kasan_shadow_to_mem(KASAN_SHADOW_START),
+ * i.e. the memory address that corresponds to the start of the shadow region.
+ */
+static inline bool addr_has_shadow(const void *addr)
+{
+ return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+}
+
+void kasan_poison_shadow(const void *address, size_t size, u8 value);
+
+/**
+ * check_memory_region - Check memory region, and report if invalid access.
+ * @addr: the accessed address
+ * @size: the accessed size
+ * @write: true if access is a write access
+ * @ret_ip: return address
+ * @return: true if access was valid, false if invalid
+ */
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip);
+
+void *find_first_bad_addr(void *addr, size_t size);
+const char *get_bug_type(struct kasan_access_info *info);
+
void kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
void kasan_report_invalid_free(void *object, unsigned long ip);
-#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB)
+struct page *kasan_addr_to_page(const void *addr);
+
+#if defined(CONFIG_KASAN_GENERIC) && \
+ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache);
@@ -120,6 +170,40 @@
static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
+/*
+ * Tag helpers: real implementations are declared when CONFIG_KASAN_SW_TAGS is
+ * set; otherwise print_tags() is an empty stub and random_tag() returns 0.
+ */
+#ifdef CONFIG_KASAN_SW_TAGS
+
+void print_tags(u8 addr_tag, const void *addr);
+
+u8 random_tag(void);
+
+#else
+
+static inline void print_tags(u8 addr_tag, const void *addr) { }
+
+static inline u8 random_tag(void)
+{
+ return 0;
+}
+
+#endif
+
+/*
+ * Fallback tag accessors, used only when the corresponding arch_kasan_*
+ * macro is not already defined: setting or resetting a tag yields the
+ * address unchanged and the tag of any address reads as 0.
+ */
+#ifndef arch_kasan_set_tag
+static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+{
+ return addr;
+}
+#endif
+#ifndef arch_kasan_reset_tag
+#define arch_kasan_reset_tag(addr) ((void *)(addr))
+#endif
+#ifndef arch_kasan_get_tag
+#define arch_kasan_get_tag(addr) 0
+#endif
+
+/* Wrappers over the arch_kasan_* accessors; set_tag/reset_tag yield void *. */
+#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
+#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
+#define get_tag(addr) arch_kasan_get_tag(addr)
+
/*
* Exported functions for interfaces called from assembly or from generated
* code. Declarations here to avoid warning about missing declarations.
@@ -130,8 +214,6 @@
void __asan_loadN(unsigned long addr, size_t size);
void __asan_storeN(unsigned long addr, size_t size);
void __asan_handle_no_return(void);
-void __asan_poison_stack_memory(const void *addr, size_t size);
-void __asan_unpoison_stack_memory(const void *addr, size_t size);
void __asan_alloca_poison(unsigned long addr, size_t size);
void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 3a8ddf8..978bc4a 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
/*
* KASAN quarantine.
*
@@ -103,7 +104,7 @@
static int quarantine_tail;
/* Total size of all objects in global_quarantine across all batches. */
static unsigned long quarantine_size;
-static DEFINE_SPINLOCK(quarantine_lock);
+static DEFINE_RAW_SPINLOCK(quarantine_lock);
DEFINE_STATIC_SRCU(remove_cache_srcu);
/* Maximum size of the global queue. */
@@ -190,7 +191,7 @@
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
- spin_lock(&quarantine_lock);
+ raw_spin_lock(&quarantine_lock);
WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes);
qlist_move_all(&temp, &global_quarantine[quarantine_tail]);
if (global_quarantine[quarantine_tail].bytes >=
@@ -203,7 +204,7 @@
if (new_tail != quarantine_head)
quarantine_tail = new_tail;
}
- spin_unlock(&quarantine_lock);
+ raw_spin_unlock(&quarantine_lock);
}
local_irq_restore(flags);
@@ -230,13 +231,13 @@
* expected case).
*/
srcu_idx = srcu_read_lock(&remove_cache_srcu);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
/*
* Update quarantine size in case of hotplug. Allocate a fraction of
* the installed memory to quarantine minus per-cpu queue limits.
*/
- total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) /
+ total_size = (totalram_pages() << PAGE_SHIFT) /
QUARANTINE_FRACTION;
percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus();
new_quarantine_size = (total_size < percpu_quarantines) ?
@@ -254,7 +255,7 @@
quarantine_head = 0;
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, NULL);
srcu_read_unlock(&remove_cache_srcu, srcu_idx);
@@ -310,17 +311,17 @@
*/
on_each_cpu(per_cpu_remove_cache, cache, 1);
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
for (i = 0; i < QUARANTINE_BATCHES; i++) {
if (qlist_empty(&global_quarantine[i]))
continue;
qlist_move_cache(&global_quarantine[i], &to_free, cache);
/* Scanning whole quarantine can take a while. */
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
cond_resched();
- spin_lock_irqsave(&quarantine_lock, flags);
+ raw_spin_lock_irqsave(&quarantine_lock, flags);
}
- spin_unlock_irqrestore(&quarantine_lock, flags);
+ raw_spin_unlock_irqrestore(&quarantine_lock, flags);
qlist_free_all(&to_free, cache);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 5c169aa..6217821 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,5 +1,6 @@
+// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains error reporting code.
+ * This file contains common generic and tag-based KASAN error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
@@ -27,6 +28,7 @@
#include <linux/types.h>
#include <linux/kasan.h>
#include <linux/module.h>
+#include <linux/sched/task_stack.h>
#include <asm/sections.h>
@@ -39,129 +41,43 @@
#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
#define SHADOW_ROWS_AROUND_ADDR 2
-static const void *find_first_bad_addr(const void *addr, size_t size)
+static unsigned long kasan_flags;
+
+#define KASAN_BIT_REPORTED 0
+#define KASAN_BIT_MULTI_SHOT 1
+
+bool kasan_save_enable_multi_shot(void)
{
- u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
- const void *first_bad_addr = addr;
-
- while (!shadow_val && first_bad_addr < addr + size) {
- first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
- shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
- }
- return first_bad_addr;
+ return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
+EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
-static bool addr_has_shadow(struct kasan_access_info *info)
+void kasan_restore_multi_shot(bool enabled)
{
- return (info->access_addr >=
- kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
+ if (!enabled)
+ clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
}
+EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
+static int __init kasan_set_multi_shot(char *str)
{
- const char *bug_type = "unknown-crash";
- u8 *shadow_addr;
-
- info->first_bad_addr = find_first_bad_addr(info->access_addr,
- info->access_size);
-
- shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
-
- /*
- * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
- * at the next shadow byte to determine the type of the bad access.
- */
- if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
- shadow_addr++;
-
- switch (*shadow_addr) {
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- /*
- * In theory it's still possible to see these shadow values
- * due to a data race in the kernel code.
- */
- bug_type = "out-of-bounds";
- break;
- case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
- bug_type = "slab-out-of-bounds";
- break;
- case KASAN_GLOBAL_REDZONE:
- bug_type = "global-out-of-bounds";
- break;
- case KASAN_STACK_LEFT:
- case KASAN_STACK_MID:
- case KASAN_STACK_RIGHT:
- case KASAN_STACK_PARTIAL:
- bug_type = "stack-out-of-bounds";
- break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- bug_type = "use-after-free";
- break;
- case KASAN_USE_AFTER_SCOPE:
- bug_type = "use-after-scope";
- break;
- case KASAN_ALLOCA_LEFT:
- case KASAN_ALLOCA_RIGHT:
- bug_type = "alloca-out-of-bounds";
- break;
- }
-
- return bug_type;
+ set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
+ return 1;
}
-
-static const char *get_wild_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
-
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
-
- return bug_type;
-}
-
-static const char *get_bug_type(struct kasan_access_info *info)
-{
- if (addr_has_shadow(info))
- return get_shadow_bug_type(info);
- return get_wild_bug_type(info);
-}
+__setup("kasan_multi_shot", kasan_set_multi_shot);
static void print_error_description(struct kasan_access_info *info)
{
- const char *bug_type = get_bug_type(info);
-
pr_err("BUG: KASAN: %s in %pS\n",
- bug_type, (void *)info->ip);
+ get_bug_type(info), (void *)info->ip);
pr_err("%s of size %zu at addr %px by task %s/%d\n",
info->is_write ? "Write" : "Read", info->access_size,
info->access_addr, current->comm, task_pid_nr(current));
}
-static inline bool kernel_or_module_addr(const void *addr)
-{
- if (addr >= (void *)_stext && addr < (void *)_end)
- return true;
- if (is_module_address((unsigned long)addr))
- return true;
- return false;
-}
-
-static inline bool init_task_stack_addr(const void *addr)
-{
- return addr >= (void *)&init_thread_union.stack &&
- (addr <= (void *)&init_thread_union.stack +
- sizeof(init_thread_union.stack));
-}
-
static DEFINE_SPINLOCK(report_lock);
-static void kasan_start_report(unsigned long *flags)
+static void start_report(unsigned long *flags)
{
/*
* Make sure we don't end up in loop.
@@ -171,7 +87,7 @@
pr_err("==================================================================\n");
}
-static void kasan_end_report(unsigned long *flags)
+static void end_report(unsigned long *flags)
{
pr_err("==================================================================\n");
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
@@ -185,16 +101,17 @@
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- struct stack_trace trace;
+ unsigned long *entries;
+ unsigned int nr_entries;
- depot_fetch_stack(track->stack, &trace);
- print_stack_trace(&trace, 0);
+ nr_entries = stack_depot_fetch(track->stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
} else {
pr_err("(stack is not available)\n");
}
}
-static struct page *addr_to_page(const void *addr)
+struct page *kasan_addr_to_page(const void *addr)
{
if ((addr >= (void *)PAGE_OFFSET) &&
(addr < high_memory))
@@ -234,24 +151,225 @@
(void *)(object_addr + cache->object_size));
}
+/*
+ * Select which free-track slot of @object's allocation metadata to report.
+ *
+ * Without CONFIG_KASAN_SW_TAGS_IDENTIFY slot 0 is always returned. With it,
+ * the first slot whose recorded free_pointer_tag equals @tag is returned; if
+ * no slot matches, the slot indexed by free_track_idx is returned instead.
+ */
+static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ /* No slot recorded this tag: fall back to free_track_idx. */
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
+
static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr)
+ const void *addr, u8 tag)
{
struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
if (cache->flags & SLAB_KASAN) {
+ struct kasan_track *free_track;
+
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
- print_track(&alloc_info->free_track, "Freed");
+ free_track = kasan_get_free_track(cache, object, tag);
+ print_track(free_track, "Freed");
pr_err("\n");
}
describe_object_addr(cache, object, addr);
}
-static void print_address_description(void *addr)
+static inline bool kernel_or_module_addr(const void *addr)
{
- struct page *page = addr_to_page(addr);
+ if (addr >= (void *)_stext && addr < (void *)_end)
+ return true;
+ if (is_module_address((unsigned long)addr))
+ return true;
+ return false;
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+ return addr >= (void *)&init_thread_union.stack &&
+ (addr <= (void *)&init_thread_union.stack +
+ sizeof(init_thread_union.stack));
+}
+
+/*
+ * Consume one space-separated token from *@frame_descr.
+ *
+ * @frame_descr: in/out cursor into the frame description string. It is
+ *               advanced past the token (and its separator, if any) unless
+ *               the token was rejected as too long.
+ * @token: buffer receiving a NUL-terminated copy of the token, or NULL to
+ *         skip the token without copying it.
+ * @max_tok_len: size of @token in bytes, including the terminating '\0'.
+ * @value: if non-NULL, receives @token parsed as a base-10 number; callers
+ *         passing @value must also pass a non-NULL @token.
+ *
+ * Returns true on success, false (after logging an error) when the token
+ * does not fit in @token or is not a valid number.
+ */
+static bool __must_check tokenize_frame_descr(const char **frame_descr,
+ char *token, size_t max_tok_len,
+ unsigned long *value)
+{
+ const char *sep = strchr(*frame_descr, ' ');
+
+ /* No separator: the token runs to the end of the string. */
+ if (sep == NULL)
+ sep = *frame_descr + strlen(*frame_descr);
+
+ if (token != NULL) {
+ const size_t tok_len = sep - *frame_descr;
+
+ if (tok_len + 1 > max_tok_len) {
+ pr_err("KASAN internal error: frame description too long: %s\n",
+ *frame_descr);
+ return false;
+ }
+
+ /* Copy token (+ 1 byte for '\0'). */
+ strlcpy(token, *frame_descr, tok_len + 1);
+ }
+
+ /*
+ * Advance frame_descr past the separator. At the end of the string there
+ * is no separator to skip: stay on the terminating '\0' so that a later
+ * call sees an empty token instead of reading beyond the string.
+ */
+ *frame_descr = (*sep == '\0') ? sep : sep + 1;
+
+ if (value != NULL && kstrtoul(token, 10, value)) {
+ pr_err("KASAN internal error: not a valid number: %s\n", token);
+ return false;
+ }
+
+ return true;
+}
+
+/*
+ * Parse @frame_descr and log the number of objects in the frame, followed by
+ * one pr_err() per object giving its [start, end) offsets and its name.
+ * Parsing stops at the first token that tokenize_frame_descr() rejects.
+ */
+static void print_decoded_frame_descr(const char *frame_descr)
+{
+ /*
+ * We need to parse the following string:
+ * "n alloc_1 alloc_2 ... alloc_n"
+ * where alloc_i looks like
+ * "offset size len name"
+ * or "offset size len name:line".
+ */
+
+ char token[64];
+ unsigned long num_objects;
+
+ /* Leading token: the object count "n". */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &num_objects))
+ return;
+
+ pr_err("\n");
+ pr_err("this frame has %lu %s:\n", num_objects,
+ num_objects == 1 ? "object" : "objects");
+
+ while (num_objects--) {
+ unsigned long offset;
+ unsigned long size;
+
+ /* access offset */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &offset))
+ return;
+ /* access size */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &size))
+ return;
+ /* name length (unused) */
+ if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
+ return;
+ /* object name */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ NULL))
+ return;
+
+ /* Strip line number; without filename it's not very helpful. */
+ strreplace(token, ':', '\0');
+
+ /* Finally, print object information. */
+ pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
+ }
+}
+
+/*
+ * Locate the stack frame record covering @addr by scanning shadow memory
+ * downwards for the frame's KASAN_STACK_LEFT redzone.
+ *
+ * @addr: address to look up.
+ * @offset: receives @addr minus the address of the frame record.
+ * @frame_descr: receives the second word of the frame record.
+ * @frame_pc: receives the third word of the frame record.
+ *
+ * Returns false if object_is_on_stack() rejects @addr, if the scan goes below
+ * the shadow of end_of_stack(current), or if the first word of the frame
+ * record is not KASAN_CURRENT_STACK_FRAME_MAGIC; true otherwise.
+ */
+static bool __must_check get_address_stack_frame_info(const void *addr,
+ unsigned long *offset,
+ const char **frame_descr,
+ const void **frame_pc)
+{
+ unsigned long aligned_addr;
+ unsigned long mem_ptr;
+ const u8 *shadow_bottom;
+ const u8 *shadow_ptr;
+ const unsigned long *frame;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
+
+ /*
+ * NOTE: We currently only support printing frame information for
+ * accesses to the task's own stack.
+ */
+ if (!object_is_on_stack(addr))
+ return false;
+
+ aligned_addr = round_down((unsigned long)addr, sizeof(long));
+ mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE);
+ shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
+ shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
+
+ /* Walk down until a KASAN_STACK_LEFT shadow byte is reached. */
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ }
+
+ /* Then walk down past the whole run of KASAN_STACK_LEFT bytes. */
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
+ }
+
+ if (shadow_ptr < shadow_bottom)
+ return false;
+
+ /*
+ * mem_ptr is now one granule below the lowest KASAN_STACK_LEFT byte, so
+ * the frame record starts one granule above it.
+ */
+ frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE);
+ if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
+ pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ frame[0]);
+ return false;
+ }
+
+ *offset = (unsigned long)addr - (unsigned long)frame;
+ *frame_descr = (const char *)frame[1];
+ *frame_pc = (void *)frame[2];
+
+ return true;
+}
+
+/*
+ * Log stack-frame information for @addr.
+ *
+ * Does nothing when CONFIG_KASAN_SW_TAGS is enabled or when
+ * get_address_stack_frame_info() fails. Otherwise prints the task, the offset
+ * of @addr within the frame and the frame's PC, followed by the decoded frame
+ * description when one is present.
+ */
+static void print_address_stack_frame(const void *addr)
+{
+ unsigned long offset;
+ const char *frame_descr;
+ const void *frame_pc;
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ return;
+
+ if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
+ &frame_pc))
+ return;
+
+ /*
+ * get_address_stack_frame_info only returns true if the given addr is
+ * on the current task's stack.
+ */
+ pr_err("\n");
+ pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
+ addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" %pS\n", frame_pc);
+
+ if (!frame_descr)
+ return;
+
+ print_decoded_frame_descr(frame_descr);
+}
+
+static void print_address_description(void *addr, u8 tag)
+{
+ struct page *page = kasan_addr_to_page(addr);
dump_stack();
pr_err("\n");
@@ -260,7 +378,7 @@
struct kmem_cache *cache = page->slab_cache;
void *object = nearest_obj(cache, page, addr);
- describe_object(cache, object, addr);
+ describe_object(cache, object, addr, tag);
}
if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr)) {
@@ -272,6 +390,8 @@
pr_err("The buggy address belongs to the page:\n");
dump_page(page, "kasan: bad access detected");
}
+
+ print_address_stack_frame(addr);
}
static bool row_is_guilty(const void *row, const void *guilty)
@@ -326,65 +446,7 @@
}
}
-void kasan_report_invalid_free(void *object, unsigned long ip)
-{
- unsigned long flags;
-
- kasan_start_report(&flags);
- pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
- pr_err("\n");
- print_address_description(object);
- pr_err("\n");
- print_shadow_for_address(object);
- kasan_end_report(&flags);
-}
-
-static void kasan_report_error(struct kasan_access_info *info)
-{
- unsigned long flags;
-
- kasan_start_report(&flags);
-
- print_error_description(info);
- pr_err("\n");
-
- if (!addr_has_shadow(info)) {
- dump_stack();
- } else {
- print_address_description((void *)info->access_addr);
- pr_err("\n");
- print_shadow_for_address(info->first_bad_addr);
- }
-
- kasan_end_report(&flags);
-}
-
-static unsigned long kasan_flags;
-
-#define KASAN_BIT_REPORTED 0
-#define KASAN_BIT_MULTI_SHOT 1
-
-bool kasan_save_enable_multi_shot(void)
-{
- return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
-}
-EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot);
-
-void kasan_restore_multi_shot(bool enabled)
-{
- if (!enabled)
- clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
-}
-EXPORT_SYMBOL_GPL(kasan_restore_multi_shot);
-
-static int __init kasan_set_multi_shot(char *str)
-{
- set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags);
- return 1;
-}
-__setup("kasan_multi_shot", kasan_set_multi_shot);
-
-static inline bool kasan_report_enabled(void)
+static bool report_enabled(void)
{
if (current->kasan_depth)
return false;
@@ -393,59 +455,60 @@
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
}
-void kasan_report(unsigned long addr, size_t size,
- bool is_write, unsigned long ip)
+/*
+ * Print a "double-free or invalid-free" report for @object, attributing it to
+ * the code location @ip.
+ *
+ * The pointer tag is read first; @object is then replaced by its reset_tag()
+ * form, which is what every subsequent print helper receives.
+ */
+void kasan_report_invalid_free(void *object, unsigned long ip)
+{
+ unsigned long flags;
+ u8 tag = get_tag(object);
+
+ object = reset_tag(object);
+ start_report(&flags);
+ pr_err("BUG: KASAN: double-free or invalid-free in %pS\n", (void *)ip);
+ print_tags(tag, object);
+ pr_err("\n");
+ print_address_description(object, tag);
+ pr_err("\n");
+ print_shadow_for_address(object);
+ end_report(&flags);
+}
+
+void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
{
struct kasan_access_info info;
+ void *tagged_addr;
+ void *untagged_addr;
+ unsigned long flags;
- if (likely(!kasan_report_enabled()))
+ if (likely(!report_enabled()))
return;
disable_trace_on_warning();
- info.access_addr = (void *)addr;
- info.first_bad_addr = (void *)addr;
+ tagged_addr = (void *)addr;
+ untagged_addr = reset_tag(tagged_addr);
+
+ info.access_addr = tagged_addr;
+ if (addr_has_shadow(untagged_addr))
+ info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
+ else
+ info.first_bad_addr = untagged_addr;
info.access_size = size;
info.is_write = is_write;
info.ip = ip;
- kasan_report_error(&info);
+ start_report(&flags);
+
+ print_error_description(&info);
+ if (addr_has_shadow(untagged_addr))
+ print_tags(get_tag(tagged_addr), info.first_bad_addr);
+ pr_err("\n");
+
+ if (addr_has_shadow(untagged_addr)) {
+ print_address_description(untagged_addr, get_tag(tagged_addr));
+ pr_err("\n");
+ print_shadow_for_address(info.first_bad_addr);
+ } else {
+ dump_stack();
+ }
+
+ end_report(&flags);
}
-
-
-#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, false, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_load##size##_noabort)
-
-#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, true, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_store##size##_noabort)
-
-DEFINE_ASAN_REPORT_LOAD(1);
-DEFINE_ASAN_REPORT_LOAD(2);
-DEFINE_ASAN_REPORT_LOAD(4);
-DEFINE_ASAN_REPORT_LOAD(8);
-DEFINE_ASAN_REPORT_LOAD(16);
-DEFINE_ASAN_REPORT_STORE(1);
-DEFINE_ASAN_REPORT_STORE(2);
-DEFINE_ASAN_REPORT_STORE(4);
-DEFINE_ASAN_REPORT_STORE(8);
-DEFINE_ASAN_REPORT_STORE(16);
-
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_load_n_noabort);
-
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
new file mode 100644
index 0000000..0e987c9
--- /dev/null
+++ b/mm/kasan/tags.c
@@ -0,0 +1,163 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core tag-based KASAN code.
+ *
+ * Copyright (c) 2018 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/linkage.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/bug.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+static DEFINE_PER_CPU(u32, prng_state);
+
+/*
+ * Seed the per-CPU prng_state of every possible CPU with the current
+ * get_cycles() value cast to u32.
+ */
+void kasan_init_tags(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ per_cpu(prng_state, cpu) = (u32)get_cycles();
+}
+
+/*
+ * Advance this CPU's prng_state by one linear congruential step and return
+ * the new state reduced modulo (KASAN_TAG_MAX + 1).
+ *
+ * If a preemption happens between this_cpu_read and this_cpu_write, the only
+ * side effect is that we'll give a few objects allocated in different
+ * contexts the same tag. Since tag-based KASAN is meant to be used as a
+ * probabilistic bug-detection debug feature, this doesn't have a significant
+ * negative impact.
+ *
+ * Ideally the tags use strong randomness to prevent any attempts to predict
+ * them during explicit exploit attempts. But strong randomness is expensive,
+ * and we did an intentional trade-off to use a PRNG. This non-atomic RMW
+ * sequence has in fact a positive effect, since interrupts that randomly skew
+ * the PRNG at unpredictable points do only good.
+ */
+u8 random_tag(void)
+{
+ u32 state = this_cpu_read(prng_state);
+
+ state = 1664525 * state + 1013904223;
+ this_cpu_write(prng_state, state);
+
+ return (u8)(state % (KASAN_TAG_MAX + 1));
+}
+
+/* Out-of-line wrapper around the reset_tag() macro. */
+void *kasan_reset_tag(const void *addr)
+{
+ return reset_tag(addr);
+}
+
+/*
+ * Tag-based check of the access [@addr, @addr + @size): the access is valid
+ * when every shadow byte covering the untagged range equals the tag carried
+ * by @addr. On any failure kasan_report() is called and false is returned.
+ * Zero-sized accesses and pointers tagged KASAN_TAG_KERNEL are always valid.
+ */
+bool check_memory_region(unsigned long addr, size_t size, bool write,
+ unsigned long ret_ip)
+{
+ u8 tag;
+ u8 *shadow_first, *shadow_last, *shadow;
+ void *untagged_addr;
+
+ if (unlikely(size == 0))
+ return true;
+
+ tag = get_tag((const void *)addr);
+
+ /*
+ * Ignore accesses for pointers tagged with 0xff (native kernel
+ * pointer tag) to suppress false positives caused by kmap.
+ *
+ * Some kernel code was written to account for archs that don't keep
+ * high memory mapped all the time, but rather map and unmap particular
+ * pages when needed. Instead of storing a pointer to the kernel memory,
+ * this code saves the address of the page structure and offset within
+ * that page for later use. Those pages are then mapped and unmapped
+ * with kmap/kunmap when necessary and virt_to_page is used to get the
+ * virtual address of the page. For arm64 (that keeps the high memory
+ * mapped all the time), kmap is turned into a page_address call.
+ *
+ * The issue is that with use of the page_address + virt_to_page
+ * sequence the top byte value of the original pointer gets lost (gets
+ * set to KASAN_TAG_KERNEL (0xFF)).
+ */
+ if (tag == KASAN_TAG_KERNEL)
+ return true;
+
+ untagged_addr = reset_tag((const void *)addr);
+ /* Addresses below the start of shadowed memory are reported outright. */
+ if (unlikely(untagged_addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ kasan_report(addr, size, write, ret_ip);
+ return false;
+ }
+ shadow_first = kasan_mem_to_shadow(untagged_addr);
+ shadow_last = kasan_mem_to_shadow(untagged_addr + size - 1);
+ /* Report on the first shadow byte that differs from the pointer tag. */
+ for (shadow = shadow_first; shadow <= shadow_last; shadow++) {
+ if (*shadow != tag) {
+ kasan_report(addr, size, write, ret_ip);
+ return false;
+ }
+ }
+
+ return true;
+}
+
+/*
+ * Define and export the fixed-size __hwasan_load<size>_noabort() and
+ * __hwasan_store<size>_noabort() entry points. Each forwards to
+ * check_memory_region() with the given size, the access direction and
+ * _RET_IP_ as ret_ip.
+ * NOTE(review): presumably these are the hooks emitted by compiler
+ * instrumentation -- confirm against the toolchain documentation.
+ */
+#define DEFINE_HWASAN_LOAD_STORE(size) \
+ void __hwasan_load##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, false, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_load##size##_noabort); \
+ void __hwasan_store##size##_noabort(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, true, _RET_IP_); \
+ } \
+ EXPORT_SYMBOL(__hwasan_store##size##_noabort)
+
+DEFINE_HWASAN_LOAD_STORE(1);
+DEFINE_HWASAN_LOAD_STORE(2);
+DEFINE_HWASAN_LOAD_STORE(4);
+DEFINE_HWASAN_LOAD_STORE(8);
+DEFINE_HWASAN_LOAD_STORE(16);
+
+/* Variable-size read check: forwards to check_memory_region() as a read. */
+void __hwasan_loadN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_loadN_noabort);
+
+/* Variable-size write check: forwards to check_memory_region() as a write. */
+void __hwasan_storeN_noabort(unsigned long addr, unsigned long size)
+{
+ check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__hwasan_storeN_noabort);
+
+/*
+ * Pass @tag to kasan_poison_shadow() as the shadow value for the @size bytes
+ * starting at @addr.
+ */
+void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
+{
+ kasan_poison_shadow((void *)addr, size, tag);
+}
+EXPORT_SYMBOL(__hwasan_tag_memory);
diff --git a/mm/kasan/tags_report.c b/mm/kasan/tags_report.c
new file mode 100644
index 0000000..969ae08
--- /dev/null
+++ b/mm/kasan/tags_report.c
@@ -0,0 +1,82 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * Classify the bad access described by @info for tag-based KASAN.
+ *
+ * With CONFIG_KASAN_SW_TAGS_IDENTIFY, an access to a slab page is reported as
+ * "use-after-free" when the pointer tag equals one of the object's recorded
+ * free_pointer_tag entries and as "out-of-bounds" otherwise. In every other
+ * case the result is "invalid-access".
+ */
+const char *get_bug_type(struct kasan_access_info *info)
+{
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ struct kasan_alloc_meta *alloc_meta;
+ struct kmem_cache *cache;
+ struct page *page;
+ const void *addr;
+ void *object;
+ u8 tag;
+ int i;
+
+ tag = get_tag(info->access_addr);
+ addr = reset_tag(info->access_addr);
+ page = kasan_addr_to_page(addr);
+ if (page && PageSlab(page)) {
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, (void *)addr);
+ alloc_meta = get_alloc_info(cache, object);
+
+ /* A tag that matches a recorded free means the object was freed. */
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ return "out-of-bounds";
+ }
+
+#endif
+ return "invalid-access";
+}
+
+/*
+ * Starting at the untagged @addr and stepping by KASAN_SHADOW_SCALE_SIZE,
+ * return the first address whose shadow byte differs from the tag of @addr.
+ * If no such address lies below @addr + @size, the first stepped address at
+ * or past that end is returned.
+ */
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ u8 tag = get_tag(addr);
+ void *p = reset_tag(addr);
+ void *end = p + size;
+
+ while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
+ p += KASAN_SHADOW_SCALE_SIZE;
+ return p;
+}
+
+/*
+ * Log @addr_tag next to the shadow byte that kasan_mem_to_shadow() maps @addr
+ * to (the "memory tag").
+ */
+void print_tags(u8 addr_tag, const void *addr)
+{
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n", addr_tag, *shadow);
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index fde5820..a8a57be 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -48,6 +48,7 @@
SCAN_CGROUP_CHARGE_FAIL,
SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
+ SCAN_PAGE_HAS_PRIVATE,
};
#define CREATE_TRACE_POINTS
@@ -76,6 +77,8 @@
static struct kmem_cache *mm_slot_cache __read_mostly;
+#define MAX_PTE_MAPPED_THP 8
+
/**
* struct mm_slot - hash lookup from mm to mm_slot
* @hash: hash collision list
@@ -86,6 +89,10 @@
struct hlist_node hash;
struct list_head mm_node;
struct mm_struct *mm;
+
+ /* pte-mapped THP in this mm */
+ int nr_pte_mapped_thp;
+ unsigned long pte_mapped_thp[MAX_PTE_MAPPED_THP];
};
/**
@@ -404,7 +411,11 @@
(vm_flags & VM_NOHUGEPAGE) ||
test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
return false;
- if (shmem_file(vma->vm_file)) {
+
+ if (shmem_file(vma->vm_file) ||
+ (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
+ vma->vm_file &&
+ (vm_flags & VM_DENYWRITE))) {
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return false;
return IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff,
@@ -456,8 +467,9 @@
unsigned long hstart, hend;
/*
- * khugepaged does not yet work on non-shmem files or special
- * mappings. And file-private shmem THP is not supported.
+ * khugepaged only supports read-only files for non-shmem files.
+ * khugepaged does not yet work on special mappings. And
+ * file-private shmem THP is not supported.
*/
if (!hugepage_vma_check(vma, vm_flags))
return 0;
@@ -710,7 +722,7 @@
for (i = 0; i < MAX_NUMNODES; i++) {
if (!khugepaged_node_load[i])
continue;
- if (node_distance(nid, i) > RECLAIM_DISTANCE)
+ if (node_distance(nid, i) > node_reclaim_distance)
return true;
}
return false;
@@ -944,8 +956,7 @@
int isolated = 0, result = 0;
struct mem_cgroup *memcg;
struct vm_area_struct *vma;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
gfp_t gfp;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -1005,6 +1016,9 @@
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
+ result = SCAN_ANY_PROCESS;
+ if (!mmget_still_valid(mm))
+ goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
@@ -1014,12 +1028,13 @@
anon_vma_lock_write(vma->anon_vma);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address, address + HPAGE_PMD_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
+
pte = pte_offset_map(pmd, address);
pte_ptl = pte_lockptr(mm, pmd);
- mmun_start = address;
- mmun_end = address + HPAGE_PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
@@ -1029,7 +1044,7 @@
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
@@ -1076,6 +1091,7 @@
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
mem_cgroup_commit_charge(new_page, memcg, false, true);
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
lru_cache_add_active_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
@@ -1225,7 +1241,7 @@
{
struct mm_struct *mm = mm_slot->mm;
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+ lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_test_exit(mm)) {
/* free mm_slot */
@@ -1245,6 +1261,159 @@
}
#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+/*
+ * Record that @addr in @mm is covered by a pte-mapped THP so that khugepaged
+ * can retry collapsing its page table later.  The address is queued in the
+ * mm's mm_slot under khugepaged_mm_lock; it is silently dropped when the mm
+ * has no slot or the slot already holds MAX_PTE_MAPPED_THP entries.
+ * Always returns 0.
+ */
+static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
+					 unsigned long addr)
+{
+	struct mm_slot *slot;
+
+	VM_BUG_ON(addr & ~HPAGE_PMD_MASK);
+
+	spin_lock(&khugepaged_mm_lock);
+	slot = get_mm_slot(mm);
+	if (unlikely(!slot))
+		goto unlock;
+	if (unlikely(slot->nr_pte_mapped_thp >= MAX_PTE_MAPPED_THP))
+		goto unlock;
+
+	/* append to the per-mm queue */
+	slot->pte_mapped_thp[slot->nr_pte_mapped_thp] = addr;
+	slot->nr_pte_mapped_thp++;
+unlock:
+	spin_unlock(&khugepaged_mm_lock);
+	return 0;
+}
+
+/**
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at addr.
+ * @mm: address space that maps the THP through individual PTEs
+ * @addr: address inside the range; rounded down to the PMD boundary (haddr)
+ *
+ * This function checks whether all the PTEs in the PMD are pointing to the
+ * right THP. If so, retract the page table so the THP can refault in
+ * as pmd-mapped.
+ *
+ * The caller in this file, khugepaged_collapse_pte_mapped_thps(), invokes
+ * this with mm->mmap_sem held for write.
+ */
+void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
+{
+	unsigned long haddr = addr & HPAGE_PMD_MASK;
+	struct vm_area_struct *vma = find_vma(mm, haddr);
+	struct page *hpage = NULL;
+	pte_t *start_pte, *pte;
+	pmd_t *pmd, _pmd;
+	spinlock_t *ptl;
+	int count = 0;
+	int i;
+
+	/* the vma must be file-backed and span the whole PMD-sized range */
+	if (!vma || !vma->vm_file ||
+	    vma->vm_start > haddr || vma->vm_end < haddr + HPAGE_PMD_SIZE)
+		return;
+
+	/*
+	 * This vm_flags may not have VM_HUGEPAGE if the page was not
+	 * collapsed by this mm. But we can still collapse if the page is
+	 * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check()
+	 * will not fail the vma for missing VM_HUGEPAGE
+	 */
+	if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
+		return;
+
+	pmd = mm_find_pmd(mm, haddr);
+	if (!pmd)
+		return;
+
+	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
+
+	/* step 1: check all mapped PTEs are to the right huge page */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		/* empty pte, skip */
+		if (pte_none(*pte))
+			continue;
+
+		/* page swapped out, abort */
+		if (!pte_present(*pte))
+			goto abort;
+
+		page = vm_normal_page(vma, addr, *pte);
+
+		if (!page || !PageCompound(page))
+			goto abort;
+
+		if (!hpage) {
+			hpage = compound_head(page);
+			/*
+			 * The mapping of the THP should not change.
+			 *
+			 * Note that uprobe, debugger, or MAP_PRIVATE may
+			 * change the page table, but the new page will
+			 * not pass PageCompound() check.
+			 */
+			if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
+				goto abort;
+		}
+
+		/*
+		 * Confirm the page maps to the correct subpage.
+		 *
+		 * Note that uprobe, debugger, or MAP_PRIVATE may change
+		 * the page table, but the new page will not pass
+		 * PageCompound() check.
+		 */
+		if (WARN_ON(hpage + i != page))
+			goto abort;
+		count++;
+	}
+
+	/* step 2: adjust rmap */
+	for (i = 0, addr = haddr, pte = start_pte;
+	     i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) {
+		struct page *page;
+
+		if (pte_none(*pte))
+			continue;
+		page = vm_normal_page(vma, addr, *pte);
+		page_remove_rmap(page, false);
+	}
+
+	pte_unmap_unlock(start_pte, ptl);
+
+	/* step 3: set proper refcount and mm_counters. */
+	if (hpage) {
+		page_ref_sub(hpage, count);
+		add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
+	}
+
+	/*
+	 * step 4: collapse pmd
+	 *
+	 * Flush at haddr, the start of the huge-page range. addr was reused
+	 * as the cursor of the loops above and by now points one PMD-sized
+	 * extent past haddr, so it must not be used here.
+	 */
+	ptl = pmd_lock(vma->vm_mm, pmd);
+	_pmd = pmdp_collapse_flush(vma, haddr, pmd);
+	spin_unlock(ptl);
+	mm_dec_nr_ptes(mm);
+	pte_free(mm, pmd_pgtable(_pmd));
+	return;
+
+abort:
+	pte_unmap_unlock(start_pte, ptl);
+}
+
+/*
+ * Retry collapsing every pte-mapped THP address queued in @mm_slot.
+ *
+ * Returns -EBUSY, leaving the queue untouched, when mmap_sem cannot be taken
+ * for write without blocking; otherwise the queue ends up empty and 0 is
+ * returned.
+ */
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+	struct mm_struct *mm = mm_slot->mm;
+	int idx;
+
+	if (likely(!mm_slot->nr_pte_mapped_thp))
+		return 0;
+
+	if (!down_write_trylock(&mm->mmap_sem))
+		return -EBUSY;
+
+	/* An exiting mm is not collapsed, but its queue is still discarded. */
+	if (likely(!khugepaged_test_exit(mm))) {
+		idx = 0;
+		while (idx < mm_slot->nr_pte_mapped_thp) {
+			collapse_pte_mapped_thp(mm,
+						mm_slot->pte_mapped_thp[idx]);
+			idx++;
+		}
+	}
+
+	mm_slot->nr_pte_mapped_thp = 0;
+	up_write(&mm->mmap_sem);
+	return 0;
+}
+
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
@@ -1253,7 +1422,22 @@
i_mmap_lock_write(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- /* probably overkill */
+ /*
+ * Check vma->anon_vma to exclude MAP_PRIVATE mappings that
+ * got written to. These VMAs are likely not worth investing
+ * down_write(mmap_sem) as PMD-mapping is likely to be split
+ * later.
+ *
+ * Note that vma->anon_vma check is racy: it can be set up after
+ * the check but before we took mmap_sem by the fault path.
+ * But page lock would prevent establishing any new ptes of the
+ * page, so we are safe.
+ *
+ * An alternative would be to drop the check, but check that page
+ * table is clear before calling pmdp_collapse_flush() under
+ * ptl. It has higher chance to recover THP for the VMA, but
+ * has higher cost too.
+ */
if (vma->anon_vma)
continue;
addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
@@ -1266,9 +1450,10 @@
continue;
/*
* We need exclusive mmap_sem to retract page table.
- * If trylock fails we would end up with pte-mapped THP after
- * re-fault. Not ideal, but it's more important to not disturb
- * the system too much.
+ *
+ * We use trylock due to lock inversion: we need to acquire
+ * mmap_sem while holding page lock. Fault path does it in
+ * reverse order. Trylock is a way to avoid deadlock.
*/
if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
@@ -1278,42 +1463,47 @@
up_write(&vma->vm_mm->mmap_sem);
mm_dec_nr_ptes(vma->vm_mm);
pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ } else {
+ /* Try again later */
+ khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
}
}
i_mmap_unlock_write(mapping);
}
/**
- * collapse_shmem - collapse small tmpfs/shmem pages into huge one.
+ * collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
- * - scan over radix tree replacing old pages the new one
- * + swap in pages if necessary;
+ * - scan page cache replacing old pages with the new one
+ * + swap/gup in pages if necessary;
* + fill in gaps;
- * + keep old pages around in case if rollback is required;
- * - if replacing succeed:
+ * + keep old pages around in case rollback is required;
+ * - if replacing succeeds:
* + copy data over;
* + free old pages;
* + unlock huge page;
* - if replacing failed;
* + put all pages back and unfreeze them;
- * + restore gaps in the radix-tree;
+ * + restore gaps in the page cache;
* + unlock and free huge page;
*/
-static void collapse_shmem(struct mm_struct *mm,
- struct address_space *mapping, pgoff_t start,
+static void collapse_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start,
struct page **hpage, int node)
{
+ struct address_space *mapping = file->f_mapping;
gfp_t gfp;
- struct page *page, *new_page, *tmp;
+ struct page *new_page;
struct mem_cgroup *memcg;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
int nr_none = 0, result = SCAN_SUCCEED;
+ bool is_shmem = shmem_file(file);
+ VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem);
VM_BUG_ON(start & (HPAGE_PMD_NR - 1));
/* Only allocate from the target node */
@@ -1330,8 +1520,23 @@
goto out;
}
+ /* This will be less messy when we use multi-index entries */
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (!xas_error(&xas))
+ break;
+ xas_unlock_irq(&xas);
+ if (!xas_nomem(&xas, GFP_KERNEL)) {
+ mem_cgroup_cancel_charge(new_page, memcg, true);
+ result = SCAN_FAIL;
+ goto out;
+ }
+ } while (1);
+
__SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
+ if (is_shmem)
+ __SetPageSwapBacked(new_page);
new_page->index = start;
new_page->mapping = mapping;
@@ -1341,55 +1546,69 @@
* be able to map it or use it in another way until we unlock it.
*/
- index = start;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- int n = min(iter.index, end) - index;
+ xas_set(&xas, start);
+ for (index = start; index < end; index++) {
+ struct page *page = xas_next(&xas);
- /*
- * Stop if extent has been hole-punched, and is now completely
- * empty (the more obvious i_size_read() check would take an
- * irq-unsafe seqlock on 32-bit).
- */
- if (n >= HPAGE_PMD_NR) {
- result = SCAN_TRUNCATED;
- goto tree_locked;
- }
-
- /*
- * Handle holes in the radix tree: charge it from shmem and
- * insert relevant subpage of new_page into the radix-tree.
- */
- if (n && !shmem_charge(mapping->host, n)) {
- result = SCAN_FAIL;
- goto tree_locked;
- }
- for (; index < min(iter.index, end); index++) {
- radix_tree_insert(&mapping->i_pages, index,
- new_page + (index % HPAGE_PMD_NR));
- }
- nr_none += n;
-
- /* We are done. */
- if (index >= end)
- break;
-
- page = radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock);
- if (radix_tree_exceptional_entry(page) || !PageUptodate(page)) {
- xa_unlock_irq(&mapping->i_pages);
- /* swap in or instantiate fallocated page */
- if (shmem_getpage(mapping->host, index, &page,
- SGP_NOHUGE)) {
- result = SCAN_FAIL;
- goto tree_unlocked;
+ VM_BUG_ON(index != xas.xa_index);
+ if (is_shmem) {
+ if (!page) {
+ /*
+ * Stop if extent has been truncated or
+ * hole-punched, and is now completely
+ * empty.
+ */
+ if (index == start) {
+ if (!xas_next_entry(&xas, end - 1)) {
+ result = SCAN_TRUNCATED;
+ goto xa_locked;
+ }
+ xas_set(&xas, index);
+ }
+ if (!shmem_charge(mapping->host, 1)) {
+ result = SCAN_FAIL;
+ goto xa_locked;
+ }
+ xas_store(&xas, new_page);
+ nr_none++;
+ continue;
}
- } else if (trylock_page(page)) {
- get_page(page);
- xa_unlock_irq(&mapping->i_pages);
- } else {
- result = SCAN_PAGE_LOCK;
- goto tree_locked;
+
+ if (xa_is_value(page) || !PageUptodate(page)) {
+ xas_unlock_irq(&xas);
+ /* swap in or instantiate fallocated page */
+ if (shmem_getpage(mapping->host, index, &page,
+ SGP_NOHUGE)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
+ }
+ } else { /* !is_shmem */
+ if (!page || xa_is_value(page)) {
+ xas_unlock_irq(&xas);
+ page_cache_sync_readahead(mapping, &file->f_ra,
+ file, index,
+ PAGE_SIZE);
+ /* drain pagevecs to help isolate_lru_page() */
+ lru_add_drain();
+ page = find_lock_page(mapping, index);
+ if (unlikely(page == NULL)) {
+ result = SCAN_FAIL;
+ goto xa_unlocked;
+ }
+ } else if (trylock_page(page)) {
+ get_page(page);
+ xas_unlock_irq(&xas);
+ } else {
+ result = SCAN_PAGE_LOCK;
+ goto xa_locked;
+ }
}
/*
@@ -1397,7 +1616,12 @@
* without racing with truncate.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageUptodate(page), page);
+
+ /* make sure the page is up to date */
+ if (unlikely(!PageUptodate(page))) {
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
/*
* If file was truncated then extended, or hole-punched, before
@@ -1413,30 +1637,45 @@
goto out_unlock;
}
+ if (!is_shmem && PageDirty(page)) {
+ /*
+ * khugepaged only works on read-only fd, so this
+ * page is dirty because it hasn't been flushed
+ * since first write.
+ */
+ result = SCAN_FAIL;
+ goto out_unlock;
+ }
+
if (isolate_lru_page(page)) {
result = SCAN_DEL_PAGE_LRU;
goto out_unlock;
}
+ if (page_has_private(page) &&
+ !try_to_release_page(page, GFP_KERNEL)) {
+ result = SCAN_PAGE_HAS_PRIVATE;
+ goto out_unlock;
+ }
+
if (page_mapped(page))
unmap_mapping_pages(mapping, index, 1, false);
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
+ xas_set(&xas, index);
- slot = radix_tree_lookup_slot(&mapping->i_pages, index);
- VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot,
- &mapping->i_pages.xa_lock), page);
+ VM_BUG_ON_PAGE(page != xas_load(&xas), page);
VM_BUG_ON_PAGE(page_mapped(page), page);
/*
* The page is expected to have page_count() == 3:
* - we hold a pin on it;
- * - one reference from radix tree;
+ * - one reference from page cache;
* - one from isolate_lru_page;
*/
if (!page_ref_freeze(page, 3)) {
result = SCAN_PAGE_COUNT;
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
putback_lru_page(page);
goto out_unlock;
}
@@ -1448,58 +1687,40 @@
list_add_tail(&page->lru, &pagelist);
/* Finally, replace with the new page. */
- radix_tree_replace_slot(&mapping->i_pages, slot,
- new_page + (index % HPAGE_PMD_NR));
-
- slot = radix_tree_iter_resume(slot, &iter);
- index++;
+ xas_store(&xas, new_page);
continue;
out_unlock:
unlock_page(page);
put_page(page);
- goto tree_unlocked;
+ goto xa_unlocked;
}
- /*
- * Handle hole in radix tree at the end of the range.
- * This code only triggers if there's nothing in radix tree
- * beyond 'end'.
- */
- if (index < end) {
- int n = end - index;
-
- /* Stop if extent has been truncated, and is now empty */
- if (n >= HPAGE_PMD_NR) {
- result = SCAN_TRUNCATED;
- goto tree_locked;
- }
- if (!shmem_charge(mapping->host, n)) {
- result = SCAN_FAIL;
- goto tree_locked;
- }
- for (; index < end; index++) {
- radix_tree_insert(&mapping->i_pages, index,
- new_page + (index % HPAGE_PMD_NR));
- }
- nr_none += n;
+ if (is_shmem)
+ __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ else {
+ __inc_node_page_state(new_page, NR_FILE_THPS);
+ filemap_nr_thps_inc(mapping);
}
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
if (nr_none) {
struct zone *zone = page_zone(new_page);
__mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
- __mod_node_page_state(zone->zone_pgdat, NR_SHMEM, nr_none);
+ if (is_shmem)
+ __mod_node_page_state(zone->zone_pgdat,
+ NR_SHMEM, nr_none);
}
-tree_locked:
- xa_unlock_irq(&mapping->i_pages);
-tree_unlocked:
+xa_locked:
+ xas_unlock_irq(&xas);
+xa_unlocked:
if (result == SCAN_SUCCEED) {
+ struct page *page, *tmp;
+
/*
- * Replacing old pages with new one has succeed, now we need to
- * copy the content and free old pages.
+ * Replacing old pages with new one has succeeded, now we
+ * need to copy the content and free the old pages.
*/
index = start;
list_for_each_entry_safe(page, tmp, &pagelist, lru) {
@@ -1525,9 +1746,15 @@
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- set_page_dirty(new_page);
mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_anon(new_page);
+
+ if (is_shmem) {
+ set_page_dirty(new_page);
+ lru_cache_add_anon(new_page);
+ } else {
+ lru_cache_add_file(new_page);
+ }
+ count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1537,39 +1764,42 @@
khugepaged_pages_collapsed++;
} else {
- /* Something went wrong: rollback changes to the radix-tree */
- xa_lock_irq(&mapping->i_pages);
- mapping->nrpages -= nr_none;
- shmem_uncharge(mapping->host, nr_none);
+ struct page *page;
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= end)
- break;
+ /* Something went wrong: roll back page cache changes */
+ xas_lock_irq(&xas);
+ mapping->nrpages -= nr_none;
+
+ if (is_shmem)
+ shmem_uncharge(mapping->host, nr_none);
+
+ xas_set(&xas, start);
+ xas_for_each(&xas, page, end - 1) {
page = list_first_entry_or_null(&pagelist,
struct page, lru);
- if (!page || iter.index < page->index) {
+ if (!page || xas.xa_index < page->index) {
if (!nr_none)
break;
nr_none--;
/* Put holes back where they were */
- radix_tree_delete(&mapping->i_pages, iter.index);
+ xas_store(&xas, NULL);
continue;
}
- VM_BUG_ON_PAGE(page->index != iter.index, page);
+ VM_BUG_ON_PAGE(page->index != xas.xa_index, page);
/* Unfreeze the page. */
list_del(&page->lru);
page_ref_unfreeze(page, 2);
- radix_tree_replace_slot(&mapping->i_pages, slot, page);
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+ xas_store(&xas, page);
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
unlock_page(page);
putback_lru_page(page);
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
}
VM_BUG_ON(nr_none);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
mem_cgroup_cancel_charge(new_page, memcg, true);
new_page->mapping = NULL;
@@ -1581,13 +1811,12 @@
/* TODO: tracepoints */
}
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
struct page *page = NULL;
- struct radix_tree_iter iter;
- void **slot;
+ struct address_space *mapping = file->f_mapping;
+ XA_STATE(xas, &mapping->i_pages, start);
int present, swap;
int node = NUMA_NO_NODE;
int result = SCAN_SUCCEED;
@@ -1596,17 +1825,11 @@
swap = 0;
memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= start + HPAGE_PMD_NR)
- break;
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) {
+ if (xas_retry(&xas, page))
continue;
- }
- if (radix_tree_exception(page)) {
+ if (xa_is_value(page)) {
if (++swap > khugepaged_max_ptes_swap) {
result = SCAN_EXCEED_SWAP_PTE;
break;
@@ -1631,7 +1854,8 @@
break;
}
- if (page_count(page) != 1 + page_mapcount(page)) {
+ if (page_count(page) !=
+ 1 + page_mapcount(page) + page_has_private(page)) {
result = SCAN_PAGE_COUNT;
break;
}
@@ -1645,7 +1869,7 @@
present++;
if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
}
@@ -1656,19 +1880,23 @@
result = SCAN_EXCEED_NONE_PTE;
} else {
node = khugepaged_find_target_node();
- collapse_shmem(mm, mapping, start, hpage, node);
+ collapse_file(mm, file, start, hpage, node);
}
}
/* TODO: tracepoints */
}
#else
-static void khugepaged_scan_shmem(struct mm_struct *mm,
- struct address_space *mapping,
- pgoff_t start, struct page **hpage)
+static void khugepaged_scan_file(struct mm_struct *mm,
+ struct file *file, pgoff_t start, struct page **hpage)
{
BUILD_BUG();
}
+
+static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
+{
+ return 0; /* #else stub: no pte-mapped THP collapse in this configuration, report success */
+}
#endif
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
@@ -1682,7 +1910,7 @@
int progress = 0;
VM_BUG_ON(!pages);
- VM_BUG_ON(NR_CPUS != 1 && !spin_is_locked(&khugepaged_mm_lock));
+ lockdep_assert_held(&khugepaged_mm_lock);
if (khugepaged_scan.mm_slot)
mm_slot = khugepaged_scan.mm_slot;
@@ -1693,6 +1921,7 @@
khugepaged_scan.mm_slot = mm_slot;
}
spin_unlock(&khugepaged_mm_lock);
+ khugepaged_collapse_pte_mapped_thps(mm_slot);
mm = mm_slot->mm;
/*
@@ -1738,17 +1967,18 @@
VM_BUG_ON(khugepaged_scan.address < hstart ||
khugepaged_scan.address + HPAGE_PMD_SIZE >
hend);
- if (shmem_file(vma->vm_file)) {
+ if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) {
struct file *file;
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- if (!shmem_huge_enabled(vma))
+
+ if (shmem_file(vma->vm_file)
+ && !shmem_huge_enabled(vma))
goto skip;
file = get_file(vma->vm_file);
up_read(&mm->mmap_sem);
ret = 1;
- khugepaged_scan_shmem(mm, file->f_mapping,
- pgoff, hpage);
+ khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
} else {
ret = khugepaged_scan_pmd(mm, vma,
diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c
index dd3c23a..e19279f 100644
--- a/mm/kmemleak-test.c
+++ b/mm/kmemleak-test.c
@@ -1,21 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/kmemleak-test.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#define pr_fmt(fmt) "kmemleak: " fmt
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 17dd883..2446076 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1,23 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/kmemleak.c
*
* Copyright (C) 2008 ARM Limited
* Written by Catalin Marinas <catalin.marinas@arm.com>
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- *
* For more information on the algorithm and kmemleak usage, please see
* Documentation/dev-tools/kmemleak.rst.
*
@@ -86,12 +73,13 @@
#include <linux/seq_file.h>
#include <linux/cpumask.h>
#include <linux/spinlock.h>
+#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/rcupdate.h>
#include <linux/stacktrace.h>
#include <linux/cache.h>
#include <linux/percpu.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/pfn.h>
#include <linux/mmzone.h>
#include <linux/slab.h>
@@ -126,7 +114,7 @@
/* GFP bitmask for kmemleak internal allocations */
#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \
__GFP_NORETRY | __GFP_NOMEMALLOC | \
- __GFP_NOWARN | __GFP_NOFAIL)
+ __GFP_NOWARN)
/* scanning area inside a memory block */
struct kmemleak_scan_area {
@@ -180,7 +168,10 @@
#define OBJECT_REPORTED (1 << 1)
/* flag set to not scan the object */
#define OBJECT_NO_SCAN (1 << 2)
+/* flag set to fully scan the object when scan_area allocation failed */
+#define OBJECT_FULL_SCAN (1 << 3)
+#define HEX_PREFIX " "
/* number of bytes to print per line; must be 16 or 32 */
#define HEX_ROW_SIZE 16
/* number of bytes to print at a time (1, 2, 4, 8) */
@@ -194,6 +185,10 @@
static LIST_HEAD(object_list);
/* the list of gray-colored objects (see color_gray comment below) */
static LIST_HEAD(gray_list);
+/* memory pool allocation */
+static struct kmemleak_object mem_pool[CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE];
+static int mem_pool_free_count = ARRAY_SIZE(mem_pool);
+static LIST_HEAD(mem_pool_free_list);
/* search tree for object boundaries */
static struct rb_root object_tree_root = RB_ROOT;
/* rw_lock protecting the access to object_list and object_tree_root */
@@ -204,13 +199,11 @@
static struct kmem_cache *scan_area_cache;
/* set if tracing memory operations is enabled */
-static int kmemleak_enabled;
+static int kmemleak_enabled = 1;
/* same as above but only for the kmemleak_free() callback */
-static int kmemleak_free_enabled;
+static int kmemleak_free_enabled = 1;
/* set in the late_initcall if there were no errors */
static int kmemleak_initialized;
-/* enables or disables early logging of the memory operations */
-static int kmemleak_early_log = 1;
/* set if a kmemleak warning was issued */
static int kmemleak_warning;
/* set if a fatal kmemleak error has occurred */
@@ -235,48 +228,8 @@
/* If there are leaks that can be reported */
static bool kmemleak_found_leaks;
-/*
- * Early object allocation/freeing logging. Kmemleak is initialized after the
- * kernel allocator. However, both the kernel allocator and kmemleak may
- * allocate memory blocks which need to be tracked. Kmemleak defines an
- * arbitrary buffer to hold the allocation/freeing information before it is
- * fully initialized.
- */
-
-/* kmemleak operation type for early logging */
-enum {
- KMEMLEAK_ALLOC,
- KMEMLEAK_ALLOC_PERCPU,
- KMEMLEAK_FREE,
- KMEMLEAK_FREE_PART,
- KMEMLEAK_FREE_PERCPU,
- KMEMLEAK_NOT_LEAK,
- KMEMLEAK_IGNORE,
- KMEMLEAK_SCAN_AREA,
- KMEMLEAK_NO_SCAN,
- KMEMLEAK_SET_EXCESS_REF
-};
-
-/*
- * Structure holding the information passed to kmemleak callbacks during the
- * early logging.
- */
-struct early_log {
- int op_type; /* kmemleak operation type */
- int min_count; /* minimum reference count */
- const void *ptr; /* allocated/freed memory block */
- union {
- size_t size; /* memory block size */
- unsigned long excess_ref; /* surplus reference passing */
- };
- unsigned long trace[MAX_TRACE]; /* stack trace */
- unsigned int trace_len; /* stack trace length */
-};
-
-/* early logging buffer and current position */
-static struct early_log
- early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata;
-static int crt_early_log __initdata;
+static bool kmemleak_verbose;
+module_param_named(verbose, kmemleak_verbose, bool, 0600);
static void kmemleak_disable(void);
@@ -299,6 +252,25 @@
kmemleak_disable(); \
} while (0)
+/*
+ * Print to @seq when one is supplied, otherwise emit the same message
+ * through pr_warn().
+ */
+#define warn_or_seq_printf(seq, fmt, ...) do {		\
+	if (!(seq))					\
+		pr_warn(fmt, ##__VA_ARGS__);		\
+	else						\
+		seq_printf(seq, fmt, ##__VA_ARGS__);	\
+} while (0)
+
+/*
+ * Hex-dump @buf into @seq, or, when @seq is NULL, to the kernel log at
+ * KERN_WARNING level with pr_fmt(HEX_PREFIX) as the row prefix.
+ */
+static void warn_or_seq_hex_dump(struct seq_file *seq, int prefix_type,
+				 int rowsize, int groupsize, const void *buf,
+				 size_t len, bool ascii)
+{
+	if (!seq) {
+		print_hex_dump(KERN_WARNING, pr_fmt(HEX_PREFIX), prefix_type,
+			       rowsize, groupsize, buf, len, ascii);
+		return;
+	}
+
+	seq_hex_dump(seq, HEX_PREFIX, prefix_type, rowsize, groupsize,
+		     buf, len, ascii);
+}
+
/*
* Printing of the objects hex dump to the seq file. The number of lines to be
* printed is limited to HEX_MAX_LINES to prevent seq file spamming. The
@@ -314,10 +286,10 @@
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
- seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
- seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE,
- HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
+ warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
+ HEX_GROUP_SIZE, ptr, len, HEX_ASCII);
kasan_enable_current();
}
@@ -365,17 +337,17 @@
int i;
unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies);
- seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
+ warn_or_seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n",
object->pointer, object->size);
- seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
+ warn_or_seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n",
object->comm, object->pid, object->jiffies,
msecs_age / 1000, msecs_age % 1000);
hex_dump_object(seq, object);
- seq_printf(seq, " backtrace:\n");
+ warn_or_seq_printf(seq, " backtrace:\n");
for (i = 0; i < object->trace_len; i++) {
void *ptr = (void *)object->trace[i];
- seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
+ warn_or_seq_printf(seq, " [<%p>] %pS\n", ptr, ptr);
}
}
@@ -386,11 +358,6 @@
*/
static void dump_object_info(struct kmemleak_object *object)
{
- struct stack_trace trace;
-
- trace.nr_entries = object->trace_len;
- trace.entries = object->trace;
-
pr_notice("Object 0x%08lx (size %zu):\n",
object->pointer, object->size);
pr_notice(" comm \"%s\", pid %d, jiffies %lu\n",
@@ -400,7 +367,7 @@
pr_notice(" flags = 0x%x\n", object->flags);
pr_notice(" checksum = %u\n", object->checksum);
pr_notice(" backtrace:\n");
- print_stack_trace(&trace, 4);
+ stack_trace_print(object->trace, object->trace_len, 4);
}
/*
@@ -444,6 +411,54 @@
}
/*
+ * Memory pool allocation and freeing. kmemleak_lock must not be held.
+ */
+static struct kmemleak_object *mem_pool_alloc(gfp_t gfp)
+{
+	struct kmemleak_object *obj;
+	unsigned long irq_flags;
+
+	/* Preferred source: the slab cache, when object_cache is set. */
+	if (object_cache) {
+		obj = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+		if (obj)
+			return obj;
+	}
+
+	/*
+	 * Fallback: the static pool.  Reuse an entry from the free list if
+	 * there is one, otherwise hand out the next unused slot of mem_pool[].
+	 */
+	write_lock_irqsave(&kmemleak_lock, irq_flags);
+	obj = list_first_entry_or_null(&mem_pool_free_list,
+				       typeof(*obj), object_list);
+	if (obj) {
+		list_del(&obj->object_list);
+	} else if (mem_pool_free_count) {
+		mem_pool_free_count--;
+		obj = &mem_pool[mem_pool_free_count];
+	} else {
+		pr_warn_once("Memory pool empty, consider increasing CONFIG_DEBUG_KMEMLEAK_MEM_POOL_SIZE\n");
+	}
+	write_unlock_irqrestore(&kmemleak_lock, irq_flags);
+
+	/* NULL here means both the slab cache and the pool came up empty */
+	return obj;
+}
+
+/*
+ * Give @object back to wherever it came from: the static memory pool's free
+ * list if its address lies inside mem_pool[], the slab cache otherwise.
+ */
+static void mem_pool_free(struct kmemleak_object *object)
+{
+	unsigned long irq_flags;
+	bool from_pool = object >= mem_pool &&
+			 object < mem_pool + ARRAY_SIZE(mem_pool);
+
+	if (from_pool) {
+		/* pool entries are recycled through mem_pool_free_list */
+		write_lock_irqsave(&kmemleak_lock, irq_flags);
+		list_add(&object->object_list, &mem_pool_free_list);
+		write_unlock_irqrestore(&kmemleak_lock, irq_flags);
+		return;
+	}
+
+	kmem_cache_free(object_cache, object);
+}
+
+/*
* RCU callback to free a kmemleak_object.
*/
static void free_object_rcu(struct rcu_head *rcu)
@@ -461,7 +476,7 @@
hlist_del(&area->node);
kmem_cache_free(scan_area_cache, area);
}
- kmem_cache_free(object_cache, object);
+ mem_pool_free(object);
}
/*
@@ -479,7 +494,15 @@
/* should only get here after delete_object was called */
WARN_ON(object->flags & OBJECT_ALLOCATED);
- call_rcu(&object->rcu, free_object_rcu);
+ /*
+ * It may be too early for the RCU callbacks, however, there is no
+ * concurrent object_list traversal when !object_cache and all objects
+ * came from the memory pool. Free the object directly.
+ */
+ if (object_cache)
+ call_rcu(&object->rcu, free_object_rcu);
+ else
+ free_object_rcu(&object->rcu);
}
/*
@@ -504,6 +527,16 @@
}
/*
+ * Remove an object from the object_tree_root and object_list. Must be called
+ * with the kmemleak_lock held _if_ kmemleak is still enabled.
+ */
+static void __remove_object(struct kmemleak_object *object)
+{
+ rb_erase(&object->rb_node, &object_tree_root); /* unlink from the object search tree */
+ list_del_rcu(&object->object_list); /* unlink from object_list using the RCU list primitive */
+}
+
+/*
* Look up an object in the object search tree and remove it from both
* object_tree_root and object_list. The returned object's use_count should be
* at least 1, as initially set by create_object().
@@ -515,10 +548,8 @@
write_lock_irqsave(&kmemleak_lock, flags);
object = lookup_object(ptr, alias);
- if (object) {
- rb_erase(&object->rb_node, &object_tree_root);
- list_del_rcu(&object->object_list);
- }
+ if (object)
+ __remove_object(object);
write_unlock_irqrestore(&kmemleak_lock, flags);
return object;
@@ -529,15 +560,7 @@
*/
static int __save_stack_trace(unsigned long *trace)
{
- struct stack_trace stack_trace;
-
- stack_trace.max_entries = MAX_TRACE;
- stack_trace.nr_entries = 0;
- stack_trace.entries = trace;
- stack_trace.skip = 2;
- save_stack_trace(&stack_trace);
-
- return stack_trace.nr_entries;
+ return stack_trace_save(trace, MAX_TRACE, 2);
}
/*
@@ -550,8 +573,9 @@
unsigned long flags;
struct kmemleak_object *object, *parent;
struct rb_node **link, *rb_parent;
+ unsigned long untagged_ptr;
- object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp));
+ object = mem_pool_alloc(gfp);
if (!object) {
pr_warn("Cannot allocate a kmemleak_object structure\n");
kmemleak_disable();
@@ -576,7 +600,7 @@
if (in_irq()) {
object->pid = 0;
strncpy(object->comm, "hardirq", sizeof(object->comm));
- } else if (in_softirq()) {
+ } else if (in_serving_softirq()) {
object->pid = 0;
strncpy(object->comm, "softirq", sizeof(object->comm));
} else {
@@ -595,8 +619,9 @@
write_lock_irqsave(&kmemleak_lock, flags);
- min_addr = min(min_addr, ptr);
- max_addr = max(max_addr, ptr + size);
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ min_addr = min(min_addr, untagged_ptr);
+ max_addr = max(max_addr, untagged_ptr + size);
link = &object_tree_root.rb_node;
rb_parent = NULL;
while (*link) {
@@ -689,9 +714,7 @@
/*
* Create one or two objects that may result from the memory block
* split. Note that partial freeing is only done by free_bootmem() and
- * this happens before kmemleak_init() is called. The path below is
- * only executed during early log recording in kmemleak_init(), so
- * GFP_KERNEL is enough.
+ * this happens before kmemleak_init() is called.
*/
start = object->pointer;
end = object->pointer + object->size;
@@ -763,7 +786,7 @@
{
unsigned long flags;
struct kmemleak_object *object;
- struct kmemleak_scan_area *area;
+ struct kmemleak_scan_area *area = NULL;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -772,13 +795,16 @@
return;
}
- area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
- if (!area) {
- pr_warn("Cannot allocate a scan area\n");
- goto out;
- }
+ if (scan_area_cache)
+ area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
spin_lock_irqsave(&object->lock, flags);
+ if (!area) {
+ pr_warn_once("Cannot allocate a scan area, scanning the full object\n");
+ /* mark the object for full scan to avoid false positives */
+ object->flags |= OBJECT_FULL_SCAN;
+ goto out_unlock;
+ }
if (size == SIZE_MAX) {
size = object->pointer + object->size - ptr;
} else if (ptr + size > object->pointer + object->size) {
@@ -795,7 +821,6 @@
hlist_add_head(&area->node, &object->area_list);
out_unlock:
spin_unlock_irqrestore(&object->lock, flags);
-out:
put_object(object);
}
@@ -845,86 +870,6 @@
put_object(object);
}
-/*
- * Log an early kmemleak_* call to the early_log buffer. These calls will be
- * processed later once kmemleak is fully initialized.
- */
-static void __init log_early(int op_type, const void *ptr, size_t size,
- int min_count)
-{
- unsigned long flags;
- struct early_log *log;
-
- if (kmemleak_error) {
- /* kmemleak stopped recording, just count the requests */
- crt_early_log++;
- return;
- }
-
- if (crt_early_log >= ARRAY_SIZE(early_log)) {
- crt_early_log++;
- kmemleak_disable();
- return;
- }
-
- /*
- * There is no need for locking since the kernel is still in UP mode
- * at this stage. Disabling the IRQs is enough.
- */
- local_irq_save(flags);
- log = &early_log[crt_early_log];
- log->op_type = op_type;
- log->ptr = ptr;
- log->size = size;
- log->min_count = min_count;
- log->trace_len = __save_stack_trace(log->trace);
- crt_early_log++;
- local_irq_restore(flags);
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc(struct early_log *log)
-{
- struct kmemleak_object *object;
- unsigned long flags;
- int i;
-
- if (!kmemleak_enabled || !log->ptr || IS_ERR(log->ptr))
- return;
-
- /*
- * RCU locking needed to ensure object is not freed via put_object().
- */
- rcu_read_lock();
- object = create_object((unsigned long)log->ptr, log->size,
- log->min_count, GFP_ATOMIC);
- if (!object)
- goto out;
- spin_lock_irqsave(&object->lock, flags);
- for (i = 0; i < log->trace_len; i++)
- object->trace[i] = log->trace[i];
- object->trace_len = log->trace_len;
- spin_unlock_irqrestore(&object->lock, flags);
-out:
- rcu_read_unlock();
-}
-
-/*
- * Log an early allocated block and populate the stack trace.
- */
-static void early_alloc_percpu(struct early_log *log)
-{
- unsigned int cpu;
- const void __percpu *ptr = log->ptr;
-
- for_each_possible_cpu(cpu) {
- log->ptr = per_cpu_ptr(ptr, cpu);
- early_alloc(log);
- }
-}
-
/**
* kmemleak_alloc - register a newly allocated object
* @ptr: pointer to beginning of the object
@@ -946,8 +891,6 @@
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
create_object((unsigned long)ptr, size, min_count, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC, ptr, size, min_count);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc);
@@ -975,8 +918,6 @@
for_each_possible_cpu(cpu)
create_object((unsigned long)per_cpu_ptr(ptr, cpu),
size, 0, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1001,11 +942,6 @@
create_object((unsigned long)area->addr, size, 2, gfp);
object_set_excess_ref((unsigned long)area,
(unsigned long)area->addr);
- } else if (kmemleak_early_log) {
- log_early(KMEMLEAK_ALLOC, area->addr, size, 2);
- /* reusing early_log.size for storing area->addr */
- log_early(KMEMLEAK_SET_EXCESS_REF,
- area, (unsigned long)area->addr, 0);
}
}
EXPORT_SYMBOL_GPL(kmemleak_vmalloc);
@@ -1023,8 +959,6 @@
if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
delete_object_full((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free);
@@ -1043,8 +977,6 @@
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
delete_object_part((unsigned long)ptr, size);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PART, ptr, size, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_part);
@@ -1065,8 +997,6 @@
for_each_possible_cpu(cpu)
delete_object_full((unsigned long)per_cpu_ptr(ptr,
cpu));
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_FREE_PERCPU, ptr, 0, 0);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
@@ -1117,8 +1047,6 @@
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_gray_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_not_leak);
@@ -1137,8 +1065,6 @@
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
make_black_object((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_IGNORE, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_ignore);
@@ -1159,8 +1085,6 @@
if (kmemleak_enabled && ptr && size && !IS_ERR(ptr))
add_scan_area((unsigned long)ptr, size, gfp);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0);
}
EXPORT_SYMBOL(kmemleak_scan_area);
@@ -1179,8 +1103,6 @@
if (kmemleak_enabled && ptr && !IS_ERR(ptr))
object_no_scan((unsigned long)ptr);
- else if (kmemleak_early_log)
- log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0);
}
EXPORT_SYMBOL(kmemleak_no_scan);
@@ -1309,6 +1231,7 @@
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
+ unsigned long untagged_ptr;
read_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
@@ -1323,7 +1246,8 @@
pointer = *ptr;
kasan_enable_current();
- if (pointer < min_addr || pointer >= max_addr)
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+ if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
continue;
/*
@@ -1373,6 +1297,7 @@
/*
* Scan a large memory block in MAX_SCAN_SIZE chunks to reduce the latency.
*/
+#ifdef CONFIG_SMP
static void scan_large_block(void *start, void *end)
{
void *next;
@@ -1384,6 +1309,7 @@
cond_resched();
}
}
+#endif
/*
* Scan a memory block corresponding to a kmemleak_object. A condition is
@@ -1404,7 +1330,8 @@
if (!(object->flags & OBJECT_ALLOCATED))
/* already freed object */
goto out;
- if (hlist_empty(&object->area_list)) {
+ if (hlist_empty(&object->area_list) ||
+ object->flags & OBJECT_FULL_SCAN) {
void *start = (void *)object->pointer;
void *end = (void *)(object->pointer + object->size);
void *next;
@@ -1501,11 +1428,6 @@
}
rcu_read_unlock();
- /* data/bss scanning */
- scan_large_block(_sdata, _edata);
- scan_large_block(__bss_start, __bss_stop);
- scan_large_block(__start_ro_after_init, __end_ro_after_init);
-
#ifdef CONFIG_SMP
/* per-cpu sections scanning */
for_each_possible_cpu(i)
@@ -1523,11 +1445,14 @@
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
- if (!pfn_valid(pfn))
+ if (!page)
continue;
- page = pfn_to_page(pfn);
+
+ /* only scan pages belonging to this node */
+ if (page_to_nid(page) != i)
+ continue;
/* only scan if page is in use */
if (page_count(page) == 0)
continue;
@@ -1598,6 +1523,10 @@
if (unreferenced_object(object) &&
!(object->flags & OBJECT_REPORTED)) {
object->flags |= OBJECT_REPORTED;
+
+ if (kmemleak_verbose)
+ print_unreferenced(NULL, object);
+
new_leaks++;
}
spin_unlock_irqrestore(&object->lock, flags);
@@ -1619,7 +1548,7 @@
*/
static int kmemleak_scan_thread(void *arg)
{
- static int first_run = 1;
+ static int first_run = IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN);
pr_info("Automatic memory scanning thread started\n");
set_user_nice(current, 10);
@@ -1860,7 +1789,7 @@
}
if (!kmemleak_enabled) {
- ret = -EBUSY;
+ ret = -EPERM;
goto out;
}
@@ -1913,12 +1842,16 @@
static void __kmemleak_do_cleanup(void)
{
- struct kmemleak_object *object;
+ struct kmemleak_object *object, *tmp;
- rcu_read_lock();
- list_for_each_entry_rcu(object, &object_list, object_list)
- delete_object_full(object->pointer);
- rcu_read_unlock();
+ /*
+ * Kmemleak has already been disabled, no need for RCU list traversal
+ * or kmemleak_lock held.
+ */
+ list_for_each_entry_safe(object, tmp, &object_list, object_list) {
+ __remove_object(object);
+ __delete_object(object);
+ }
}
/*
@@ -1987,105 +1920,37 @@
}
early_param("kmemleak", kmemleak_boot_config);
-static void __init print_log_trace(struct early_log *log)
-{
- struct stack_trace trace;
-
- trace.nr_entries = log->trace_len;
- trace.entries = log->trace;
-
- pr_notice("Early log backtrace:\n");
- print_stack_trace(&trace, 2);
-}
-
/*
* Kmemleak initialization.
*/
void __init kmemleak_init(void)
{
- int i;
- unsigned long flags;
-
#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF
if (!kmemleak_skip_disable) {
- kmemleak_early_log = 0;
kmemleak_disable();
return;
}
#endif
+ if (kmemleak_error)
+ return;
+
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
object_cache = KMEM_CACHE(kmemleak_object, SLAB_NOLEAKTRACE);
scan_area_cache = KMEM_CACHE(kmemleak_scan_area, SLAB_NOLEAKTRACE);
- if (crt_early_log > ARRAY_SIZE(early_log))
- pr_warn("Early log buffer exceeded (%d), please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n",
- crt_early_log);
-
- /* the kernel is still in UP mode, so disabling the IRQs is enough */
- local_irq_save(flags);
- kmemleak_early_log = 0;
- if (kmemleak_error) {
- local_irq_restore(flags);
- return;
- } else {
- kmemleak_enabled = 1;
- kmemleak_free_enabled = 1;
- }
- local_irq_restore(flags);
-
- /*
- * This is the point where tracking allocations is safe. Automatic
- * scanning is started during the late initcall. Add the early logged
- * callbacks to the kmemleak infrastructure.
- */
- for (i = 0; i < crt_early_log; i++) {
- struct early_log *log = &early_log[i];
-
- switch (log->op_type) {
- case KMEMLEAK_ALLOC:
- early_alloc(log);
- break;
- case KMEMLEAK_ALLOC_PERCPU:
- early_alloc_percpu(log);
- break;
- case KMEMLEAK_FREE:
- kmemleak_free(log->ptr);
- break;
- case KMEMLEAK_FREE_PART:
- kmemleak_free_part(log->ptr, log->size);
- break;
- case KMEMLEAK_FREE_PERCPU:
- kmemleak_free_percpu(log->ptr);
- break;
- case KMEMLEAK_NOT_LEAK:
- kmemleak_not_leak(log->ptr);
- break;
- case KMEMLEAK_IGNORE:
- kmemleak_ignore(log->ptr);
- break;
- case KMEMLEAK_SCAN_AREA:
- kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL);
- break;
- case KMEMLEAK_NO_SCAN:
- kmemleak_no_scan(log->ptr);
- break;
- case KMEMLEAK_SET_EXCESS_REF:
- object_set_excess_ref((unsigned long)log->ptr,
- log->excess_ref);
- break;
- default:
- kmemleak_warn("Unknown early log operation: %d\n",
- log->op_type);
- }
-
- if (kmemleak_warning) {
- print_log_trace(log);
- kmemleak_warning = 0;
- }
- }
+ /* register the data/bss sections */
+ create_object((unsigned long)_sdata, _edata - _sdata,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
+ KMEMLEAK_GREY, GFP_ATOMIC);
+ /* only register .data..ro_after_init if not within .data */
+ if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
+ create_object((unsigned long)__start_ro_after_init,
+ __end_ro_after_init - __start_ro_after_init,
+ KMEMLEAK_GREY, GFP_ATOMIC);
}
/*
@@ -2093,14 +1958,9 @@
*/
static int __init kmemleak_late_init(void)
{
- struct dentry *dentry;
-
kmemleak_initialized = 1;
- dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL,
- &kmemleak_fops);
- if (!dentry)
- pr_warn("Failed to create the debugfs kmemleak file\n");
+ debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops);
if (kmemleak_error) {
/*
@@ -2113,11 +1973,14 @@
return -ENOMEM;
}
- mutex_lock(&scan_mutex);
- start_scan_thread();
- mutex_unlock(&scan_mutex);
+ if (IS_ENABLED(CONFIG_DEBUG_KMEMLEAK_AUTO_SCAN)) {
+ mutex_lock(&scan_mutex);
+ start_scan_thread();
+ mutex_unlock(&scan_mutex);
+ }
- pr_info("Kernel memory leak detector initialized\n");
+ pr_info("Kernel memory leak detector initialized (mem pool available: %d)\n",
+ mem_pool_free_count);
return 0;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 5b0894b..7905934 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Memory merging support.
*
@@ -10,8 +11,6 @@
* Andrea Arcangeli
* Chris Wright
* Hugh Dickins
- *
- * This work is licensed under the terms of the GNU GPL, version 2.
*/
#include <linux/errno.h>
@@ -25,7 +24,7 @@
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/spinlock.h>
-#include <linux/jhash.h>
+#include <linux/xxhash.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/wait.h>
@@ -296,6 +295,7 @@
static void wait_while_offlining(void);
static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait);
+static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait);
static DEFINE_MUTEX(ksm_thread_mutex);
static DEFINE_SPINLOCK(ksm_mmlist_lock);
@@ -597,7 +597,7 @@
chain->chain_prune_time = jiffies;
chain->rmap_hlist_len = STABLE_NODE_CHAIN;
#if defined (CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
- chain->nid = -1; /* debug */
+ chain->nid = NUMA_NO_NODE; /* debug */
#endif
ksm_stable_node_chains++;
@@ -666,6 +666,12 @@
free_stable_node(stable_node);
}
+enum get_ksm_page_flags {
+ GET_KSM_PAGE_NOLOCK, /* get_ksm_page() does not lock the page */
+ GET_KSM_PAGE_LOCK, /* get_ksm_page() calls lock_page() */
+ GET_KSM_PAGE_TRYLOCK /* trylock_page(); ERR_PTR(-EBUSY) if it fails */
+};
+
/*
* get_ksm_page: checks if the page indicated by the stable node
* is still its ksm page, despite having held no reference to it.
@@ -685,7 +691,8 @@
* a page to put something that might look like our key in page->mapping.
* is on its way to being freed; but it is an anomaly to bear in mind.
*/
-static struct page *get_ksm_page(struct stable_node *stable_node, bool lock_it)
+static struct page *get_ksm_page(struct stable_node *stable_node,
+ enum get_ksm_page_flags flags)
{
struct page *page;
void *expected_mapping;
@@ -705,8 +712,9 @@
* case this node is no longer referenced, and should be freed;
* however, it might mean that the page is under page_ref_freeze().
* The __remove_mapping() case is easy, again the node is now stale;
- * but if page is swapcache in migrate_page_move_mapping(), it might
- * still be our page, in which case it's essential to keep the node.
+ * the same is in reuse_ksm_page() case; but if page is swapcache
+ * in migrate_page_move_mapping(), it might still be our page,
+ * in which case it's essential to keep the node.
*/
while (!get_page_unless_zero(page)) {
/*
@@ -727,8 +735,15 @@
goto stale;
}
- if (lock_it) {
+ if (flags == GET_KSM_PAGE_TRYLOCK) {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return ERR_PTR(-EBUSY);
+ }
+ } else if (flags == GET_KSM_PAGE_LOCK)
lock_page(page);
+
+ if (flags != GET_KSM_PAGE_NOLOCK) {
if (READ_ONCE(page->mapping) != expected_mapping) {
unlock_page(page);
put_page(page);
@@ -762,7 +777,7 @@
struct page *page;
stable_node = rmap_item->head;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page)
goto out;
@@ -862,7 +877,7 @@
struct page *page;
int err;
- page = get_ksm_page(stable_node, true);
+ page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK);
if (!page) {
/*
* get_ksm_page did remove_node_from_stable_tree itself.
@@ -870,13 +885,13 @@
return 0;
}
- if (WARN_ON_ONCE(page_mapped(page))) {
- /*
- * This should not happen: but if it does, just refuse to let
- * merge_across_nodes be switched - there is no need to panic.
- */
- err = -EBUSY;
- } else {
+ /*
+ * Page could be still mapped if this races with __mmput() running in
+ * between ksm_exit() and exit_mmap(). Just refuse to let
+ * merge_across_nodes/max_page_sharing be switched.
+ */
+ err = -EBUSY;
+ if (!page_mapped(page)) {
/*
* The stable node did not yet appear stale to get_ksm_page(),
* since that allows for an unmapped ksm page to be recognized
@@ -1009,29 +1024,11 @@
{
u32 checksum;
void *addr = kmap_atomic(page);
- checksum = jhash2(addr, PAGE_SIZE / 4, 17);
+ checksum = xxhash(addr, PAGE_SIZE, 0);
kunmap_atomic(addr);
return checksum;
}
-static int memcmp_pages(struct page *page1, struct page *page2)
-{
- char *addr1, *addr2;
- int ret;
-
- addr1 = kmap_atomic(page1);
- addr2 = kmap_atomic(page2);
- ret = memcmp(addr1, addr2, PAGE_SIZE);
- kunmap_atomic(addr2);
- kunmap_atomic(addr1);
- return ret;
-}
-
-static inline int pages_identical(struct page *page1, struct page *page2)
-{
- return !memcmp_pages(page1, page2);
-}
-
static int write_protect_page(struct vm_area_struct *vma, struct page *page,
pte_t *orig_pte)
{
@@ -1042,8 +1039,7 @@
};
int swapped;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
pvmw.address = page_address_in_vma(page, vma);
if (pvmw.address == -EFAULT)
@@ -1051,9 +1047,10 @@
BUG_ON(PageTransCompound(page));
- mmun_start = pvmw.address;
- mmun_end = pvmw.address + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ pvmw.address,
+ pvmw.address + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
if (!page_vma_mapped_walk(&pvmw))
goto out_mn;
@@ -1105,7 +1102,7 @@
out_unlock:
page_vma_mapped_walk_done(&pvmw);
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -1129,8 +1126,7 @@
spinlock_t *ptl;
unsigned long addr;
int err = -EFAULT;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
addr = page_address_in_vma(page, vma);
if (addr == -EFAULT)
@@ -1140,9 +1136,9 @@
if (!pmd)
goto out;
- mmun_start = addr;
- mmun_end = addr + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
+ addr + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
ptep = pte_offset_map_lock(mm, pmd, addr, &ptl);
if (!pte_same(*ptep, orig_pte)) {
@@ -1188,7 +1184,7 @@
pte_unmap_unlock(ptep, ptl);
err = 0;
out_mn:
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out:
return err;
}
@@ -1387,7 +1383,7 @@
* stable_node parameter itself will be freed from
* under us if it returns NULL.
*/
- _tree_page = get_ksm_page(dup, false);
+ _tree_page = get_ksm_page(dup, GET_KSM_PAGE_NOLOCK);
if (!_tree_page)
continue;
nr += 1;
@@ -1510,7 +1506,7 @@
if (!is_stable_node_chain(stable_node)) {
if (is_page_sharing_candidate(stable_node)) {
*_stable_node_dup = stable_node;
- return get_ksm_page(stable_node, false);
+ return get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK);
}
/*
* _stable_node_dup set to NULL means the stable_node
@@ -1615,7 +1611,8 @@
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -1675,7 +1672,12 @@
* It would be more elegant to return stable_node
* than kpage, but that involves more changes.
*/
- tree_page = get_ksm_page(stable_node_dup, true);
+ tree_page = get_ksm_page(stable_node_dup,
+ GET_KSM_PAGE_TRYLOCK);
+
+ if (PTR_ERR(tree_page) == -EBUSY)
+ return ERR_PTR(-EBUSY);
+
if (unlikely(!tree_page))
/*
* The tree may have been rebalanced,
@@ -1844,7 +1846,8 @@
* wrprotected at all times. Any will work
* fine to continue the walk.
*/
- tree_page = get_ksm_page(stable_node_any, false);
+ tree_page = get_ksm_page(stable_node_any,
+ GET_KSM_PAGE_NOLOCK);
}
VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
if (!tree_page) {
@@ -2070,6 +2073,9 @@
remove_rmap_item_from_tree(rmap_item);
if (kpage) {
+ if (PTR_ERR(kpage) == -EBUSY)
+ return;
+
err = try_to_merge_with_ksm_page(rmap_item, page, kpage);
if (!err) {
/*
@@ -2244,7 +2250,8 @@
list_for_each_entry_safe(stable_node, next,
&migrate_nodes, list) {
- page = get_ksm_page(stable_node, false);
+ page = get_ksm_page(stable_node,
+ GET_KSM_PAGE_NOLOCK);
if (page)
put_page(page);
cond_resched();
@@ -2391,6 +2398,8 @@
static int ksm_scan_thread(void *nothing)
{
+ unsigned int sleep_ms;
+
set_freezable();
set_user_nice(current, 5);
@@ -2404,8 +2413,10 @@
try_to_freeze();
if (ksmd_should_run()) {
- schedule_timeout_interruptible(
- msecs_to_jiffies(ksm_thread_sleep_millisecs));
+ sleep_ms = READ_ONCE(ksm_thread_sleep_millisecs);
+ wait_event_interruptible_timeout(ksm_iter_wait,
+ sleep_ms != READ_ONCE(ksm_thread_sleep_millisecs),
+ msecs_to_jiffies(sleep_ms));
} else {
wait_event_freezable(ksm_thread_wait,
ksmd_should_run() || kthread_should_stop());
@@ -2640,6 +2651,31 @@
goto again;
}
+bool reuse_ksm_page(struct page *page,
+ struct vm_area_struct *vma,
+ unsigned long address)
+{
+#ifdef CONFIG_DEBUG_VM /* debug-only sanity checks on the caller's page */
+ if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
+ WARN_ON(!page_mapped(page)) ||
+ WARN_ON(!PageLocked(page))) {
+ dump_page(page, "reuse_ksm_page");
+ return false;
+ }
+#endif /* CONFIG_DEBUG_VM */
+
+ if (PageSwapCache(page) || !page_stable_node(page))
+ return false; /* swapcache page, or page has no stable node */
+ /* Freeze the refcount to prohibit a parallel get_ksm_page() */
+ if (!page_ref_freeze(page, 1))
+ return false; /* could not freeze: leave the page untouched */
+
+ page_move_anon_rmap(page, vma); /* rebind the anon rmap to @vma */
+ page->index = linear_page_index(vma, address);
+ page_ref_unfreeze(page, 1); /* undo the freeze taken above */
+
+ return true;
+}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
@@ -2824,6 +2860,7 @@
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
+ wake_up_interruptible(&ksm_iter_wait);
return count;
}
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5b30625..0f1f6b0 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved.
* Authors: David Chinner and Glauber Costa
@@ -11,6 +12,7 @@
#include <linux/slab.h>
#include <linux/mutex.h>
#include <linux/memcontrol.h>
+#include "slab.h"
#ifdef CONFIG_MEMCG_KMEM
static LIST_HEAD(list_lrus);
@@ -37,11 +39,7 @@
static inline bool list_lru_memcg_aware(struct list_lru *lru)
{
- /*
- * This needs node 0 to be always present, even
- * in the systems supporting sparse numa ids.
- */
- return !!lru->node[0].memcg_lrus;
+ return lru->memcg_aware;
}
static inline struct list_lru_one *
@@ -66,7 +64,7 @@
if (!memcg_kmem_enabled())
return NULL;
page = virt_to_head_page(ptr);
- return page->mem_cgroup;
+ return memcg_from_slab_page(page);
}
static inline struct list_lru_one *
@@ -357,7 +355,7 @@
}
return 0;
fail:
- __memcg_destroy_list_lru_node(memcg_lrus, begin, i - 1);
+ __memcg_destroy_list_lru_node(memcg_lrus, begin, i);
return -ENOMEM;
}
@@ -451,6 +449,8 @@
{
int i;
+ lru->memcg_aware = memcg_aware;
+
if (!memcg_aware)
return 0;
@@ -601,7 +601,6 @@
struct lock_class_key *key, struct shrinker *shrinker)
{
int i;
- size_t size = sizeof(*lru->node) * nr_node_ids;
int err = -ENOMEM;
#ifdef CONFIG_MEMCG_KMEM
@@ -612,7 +611,7 @@
#endif
memcg_get_cache_ids();
- lru->node = kzalloc(size, GFP_KERNEL);
+ lru->node = kcalloc(nr_node_ids, sizeof(*lru->node), GFP_KERNEL);
if (!lru->node)
goto out;
diff --git a/mm/maccess.c b/mm/maccess.c
index ec00be5..d065736 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Access kernel memory without faulting.
*/
@@ -5,8 +6,20 @@
#include <linux/mm.h>
#include <linux/uaccess.h>
+static __always_inline long
+probe_read_common(void *dst, const void __user *src, size_t size)
+{
+ long ret; /* raw result of __copy_from_user_inatomic() */
+
+ pagefault_disable(); /* the copy below runs with page faults disabled */
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0; /* any non-zero copy result becomes -EFAULT */
+}
+
/**
- * probe_kernel_read(): safely attempt to read from a location
+ * probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
@@ -29,17 +42,41 @@
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_from_user_inatomic(dst,
- (__force const void __user *)src, size);
- pagefault_enable();
+ ret = probe_read_common(dst, (__force const void __user *)src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_read);
/**
+ * probe_user_read(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_read(void *dst, const void __user *src, size_t size)
+ __attribute__((alias("__probe_user_read"))); /* weak alias of the function below */
+
+long __probe_user_read(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT; /* returned unchanged if access_ok() rejects the range */
+ mm_segment_t old_fs = get_fs(); /* restored before returning */
+
+ set_fs(USER_DS);
+ if (access_ok(src, size))
+ ret = probe_read_common(dst, src, size); /* 0 or -EFAULT */
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_read);
+
+/**
* probe_kernel_write(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
@@ -66,6 +103,7 @@
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
* @dst: Destination address, in kernel space. This buffer must be at
@@ -105,3 +143,76 @@
return ret ? -EFAULT : src - unsafe_addr;
}
+
+/**
+ * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * address.
+ * @dst: Destination address, in kernel space. This buffer must be at
+ * least @count bytes long.
+ * @unsafe_addr: Unsafe user address.
+ * @count: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Copies a NUL-terminated string from unsafe user address to kernel buffer.
+ *
+ * On success, returns the length of the string INCLUDING the trailing NUL
+ * (an empty string therefore returns 1). Returns 0 only if @count <= 0.
+ * If access fails, returns -EFAULT (some data may have been copied
+ * and the trailing NUL added).
+ *
+ * If @count is smaller than the length of the string, copies @count-1 bytes,
+ * sets the last byte of @dst buffer to NUL and returns @count.
+ */
+long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+ long count)
+{
+ mm_segment_t old_fs = get_fs(); /* restored after the copy */
+ long ret;
+
+ if (unlikely(count <= 0))
+ return 0;
+
+ set_fs(USER_DS);
+ pagefault_disable(); /* the copy runs with page faults disabled */
+ ret = strncpy_from_user(dst, unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ if (ret >= count) { /* no NUL within @count bytes: truncate */
+ ret = count;
+ dst[ret - 1] = '\0';
+ } else if (ret >= 0) { /* count the NUL, also for an empty string */
+ ret++;
+ }
+
+ return ret;
+}
+
+/**
+ * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * @unsafe_addr: The string to measure.
+ * @count: Maximum count (including NUL)
+ *
+ * Get the size of a NUL-terminated string in user space without pagefault.
+ *
+ * Returns the size of the string INCLUDING the terminating NUL.
+ *
+ * If the string is too long, returns a number larger than @count. User
+ * has to check the return value against "> count".
+ * On exception (or invalid count), returns 0.
+ *
+ * Unlike strnlen_user, this can be used from IRQ handler etc. because
+ * it disables pagefaults.
+ */
+long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+{
+ mm_segment_t old_fs = get_fs(); /* restored after the measurement */
+ long ret; /* same width as the return type, so the result is not narrowed */
+
+ set_fs(USER_DS);
+ pagefault_disable(); /* measure with page faults disabled */
+ ret = strnlen_user(unsafe_addr, count);
+ pagefault_enable();
+ set_fs(old_fs);
+
+ return ret;
+}
diff --git a/mm/madvise.c b/mm/madvise.c
index 71d21df..94c343b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -11,15 +11,18 @@
#include <linux/syscalls.h>
#include <linux/mempolicy.h>
#include <linux/page-isolation.h>
+#include <linux/page_idle.h>
#include <linux/userfaultfd_k.h>
#include <linux/hugetlb.h>
#include <linux/falloc.h>
+#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/ksm.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
+#include <linux/pagewalk.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
@@ -29,6 +32,11 @@
#include "internal.h"
+struct madvise_walk_private { /* NOTE(review): presumably the page-walk private data; confirm */
+ struct mmu_gather *tlb; /* TLB gather state; users are outside this hunk */
+ bool pageout; /* presumably MADV_PAGEOUT vs MADV_COLD; confirm at users */
+};
+
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
* take mmap_sem for writing. Others, which simply traverse vmas, need
@@ -40,6 +48,8 @@
case MADV_REMOVE:
case MADV_WILLNEED:
case MADV_DONTNEED:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
case MADV_FREE:
return 0;
default:
@@ -105,28 +115,14 @@
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
error = ksm_madvise(vma, start, end, behavior, &new_flags);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
case MADV_HUGEPAGE:
case MADV_NOHUGEPAGE:
error = hugepage_madvise(vma, &new_flags, behavior);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
break;
}
@@ -152,15 +148,8 @@
goto out;
}
error = __split_vma(mm, vma, start, 1);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
if (end != vma->vm_end) {
@@ -169,15 +158,8 @@
goto out;
}
error = __split_vma(mm, vma, end, 0);
- if (error) {
- /*
- * madvise() returns EAGAIN if kernel resources, such as
- * slab, are temporarily unavailable.
- */
- if (error == -ENOMEM)
- error = -EAGAIN;
- goto out;
- }
+ if (error)
+ goto out_convert_errno;
}
success:
@@ -185,6 +167,14 @@
* vm_flags is protected by the mmap_sem held in write mode.
*/
vma->vm_flags = new_flags;
+
+out_convert_errno:
+ /*
+ * madvise() returns EAGAIN if kernel resources, such as
+ * slab, are temporarily unavailable.
+ */
+ if (error == -ENOMEM)
+ error = -EAGAIN;
out:
return error;
}
@@ -225,19 +215,9 @@
return 0;
}
-static void force_swapin_readahead(struct vm_area_struct *vma,
- unsigned long start, unsigned long end)
-{
- struct mm_walk walk = {
- .mm = vma->vm_mm,
- .pmd_entry = swapin_walk_pmd_entry,
- .private = vma,
- };
-
- walk_page_range(start, end, &walk);
-
- lru_add_drain(); /* Push any new pages onto the LRU now */
-}
+static const struct mm_walk_ops swapin_walk_ops = {
+ .pmd_entry = swapin_walk_pmd_entry,
+};
static void force_shm_swapin_readahead(struct vm_area_struct *vma,
unsigned long start, unsigned long end,
@@ -251,7 +231,7 @@
index = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = find_get_entry(mapping, index);
- if (!radix_tree_exceptional_entry(page)) {
+ if (!xa_is_value(page)) {
if (page)
put_page(page);
continue;
@@ -275,11 +255,13 @@
unsigned long start, unsigned long end)
{
struct file *file = vma->vm_file;
+ loff_t offset;
*prev = vma;
#ifdef CONFIG_SWAP
if (!file) {
- force_swapin_readahead(vma, start, end);
+ walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
+ lru_add_drain(); /* Push any new pages onto the LRU now */
return 0;
}
@@ -298,12 +280,276 @@
return 0;
}
- start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
- if (end > vma->vm_end)
- end = vma->vm_end;
- end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
+ /*
+ * Filesystem's fadvise may need to take various locks. We need to
+ * explicitly grab a reference because the vma (and hence the
+ * vma's reference to the file) can go away as soon as we drop
+ * mmap_sem, so compute the offset from the vma before unlocking.
+ */
+ *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ get_file(file);
+ offset = (loff_t)(start - vma->vm_start)
+ + ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ up_read(&current->mm->mmap_sem);
+ vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
+ fput(file);
+ down_read(&current->mm->mmap_sem);
+ return 0;
+}
- force_page_cache_readahead(file->f_mapping, file, start, end - start);
+static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct madvise_walk_private *private = walk->private;
+ struct mmu_gather *tlb = private->tlb;
+ bool pageout = private->pageout;
+ struct mm_struct *mm = tlb->mm;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *orig_pte, *pte, ptent;
+ spinlock_t *ptl;
+ struct page *page = NULL;
+ LIST_HEAD(page_list);
+
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (pmd_trans_huge(*pmd)) {
+ pmd_t orig_pmd;
+ unsigned long next = pmd_addr_end(addr, end);
+
+ tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+ ptl = pmd_trans_huge_lock(pmd, vma);
+ if (!ptl)
+ return 0;
+
+ orig_pmd = *pmd;
+ if (is_huge_zero_pmd(orig_pmd))
+ goto huge_unlock;
+
+ if (unlikely(!pmd_present(orig_pmd))) {
+ VM_BUG_ON(thp_migration_supported() &&
+ !is_pmd_migration_entry(orig_pmd));
+ goto huge_unlock;
+ }
+
+ page = pmd_page(orig_pmd);
+ if (next - addr != HPAGE_PMD_SIZE) {
+ int err;
+
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
+ get_page(page);
+ spin_unlock(ptl);
+ lock_page(page);
+ err = split_huge_page(page);
+ unlock_page(page);
+ put_page(page);
+ if (!err)
+ goto regular_page;
+ return 0;
+ }
+
+ if (pmd_young(orig_pmd)) {
+ pmdp_invalidate(vma, addr, pmd);
+ orig_pmd = pmd_mkold(orig_pmd);
+
+ set_pmd_at(mm, addr, pmd, orig_pmd);
+ tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
+ }
+
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+huge_unlock:
+ spin_unlock(ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ return 0;
+ }
+
+ if (pmd_trans_unstable(pmd))
+ return 0;
+regular_page:
+#endif
+ tlb_change_page_size(tlb, PAGE_SIZE);
+ orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
+ flush_tlb_batched_pending(mm);
+ arch_enter_lazy_mmu_mode();
+ for (; addr < end; pte++, addr += PAGE_SIZE) {
+ ptent = *pte;
+
+ if (pte_none(ptent))
+ continue;
+
+ if (!pte_present(ptent))
+ continue;
+
+ page = vm_normal_page(vma, addr, ptent);
+ if (!page)
+ continue;
+
+ /*
+ * Creating a THP page is expensive so split it only if we
+ * are sure it's worth it. Split it if we are the only owner.
+ */
+ if (PageTransCompound(page)) {
+ if (page_mapcount(page) != 1)
+ break;
+ get_page(page);
+ if (!trylock_page(page)) {
+ put_page(page);
+ break;
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+ if (split_huge_page(page)) {
+ unlock_page(page);
+ put_page(page);
+ pte_offset_map_lock(mm, pmd, addr, &ptl);
+ break;
+ }
+ unlock_page(page);
+ put_page(page);
+ pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+ pte--;
+ addr -= PAGE_SIZE;
+ continue;
+ }
+
+ VM_BUG_ON_PAGE(PageTransCompound(page), page);
+
+ if (pte_young(ptent)) {
+ ptent = ptep_get_and_clear_full(mm, addr, pte,
+ tlb->fullmm);
+ ptent = pte_mkold(ptent);
+ set_pte_at(mm, addr, pte, ptent);
+ tlb_remove_tlb_entry(tlb, pte, addr);
+ }
+
+ /*
+ * We are deactivating a page for accelerating reclaiming.
+ * VM couldn't reclaim the page unless we clear PG_young.
+ * As a side effect, it confuses idle-page tracking
+ * because it will miss recently referenced history.
+ */
+ ClearPageReferenced(page);
+ test_and_clear_page_young(page);
+ if (pageout) {
+ if (!isolate_lru_page(page)) {
+ if (PageUnevictable(page))
+ putback_lru_page(page);
+ else
+ list_add(&page->lru, &page_list);
+ }
+ } else
+ deactivate_page(page);
+ }
+
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(orig_pte, ptl);
+ if (pageout)
+ reclaim_pages(&page_list);
+ cond_resched();
+
+ return 0;
+}
+
+static const struct mm_walk_ops cold_walk_ops = {
+ .pmd_entry = madvise_cold_or_pageout_pte_range,
+};
+
+static void madvise_cold_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = false,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static long madvise_cold(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
+ return 0;
+}
+
+static void madvise_pageout_page_range(struct mmu_gather *tlb,
+ struct vm_area_struct *vma,
+ unsigned long addr, unsigned long end)
+{
+ struct madvise_walk_private walk_private = {
+ .pageout = true,
+ .tlb = tlb,
+ };
+
+ tlb_start_vma(tlb, vma);
+ walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
+ tlb_end_vma(tlb, vma);
+}
+
+static inline bool can_do_pageout(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * paging out pagecache only for non-anonymous mappings that correspond
+ * to the files the calling process could (if tried) open for writing;
+ * otherwise we'd be including shared non-exclusive mappings, which
+ * opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static long madvise_pageout(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start_addr, unsigned long end_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ *prev = vma;
+ if (!can_madv_lru_vma(vma))
+ return -EINVAL;
+
+ if (!can_do_pageout(vma))
+ return 0;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm, start_addr, end_addr);
+ madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
+ tlb_finish_mmu(&tlb, start_addr, end_addr);
+
return 0;
}
@@ -328,7 +574,7 @@
if (pmd_trans_unstable(pmd))
return 0;
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
flush_tlb_batched_pending(mm);
arch_enter_lazy_mmu_mode();
@@ -354,7 +600,7 @@
continue;
}
- page = _vm_normal_page(vma, addr, ptent, true);
+ page = vm_normal_page(vma, addr, ptent);
if (!page)
continue;
@@ -440,47 +686,41 @@
return 0;
}
-static void madvise_free_page_range(struct mmu_gather *tlb,
- struct vm_area_struct *vma,
- unsigned long addr, unsigned long end)
-{
- struct mm_walk free_walk = {
- .pmd_entry = madvise_free_pte_range,
- .mm = vma->vm_mm,
- .private = tlb,
- };
-
- tlb_start_vma(tlb, vma);
- walk_page_range(addr, end, &free_walk);
- tlb_end_vma(tlb, vma);
-}
+static const struct mm_walk_ops madvise_free_walk_ops = {
+ .pmd_entry = madvise_free_pte_range,
+};
static int madvise_free_single_vma(struct vm_area_struct *vma,
unsigned long start_addr, unsigned long end_addr)
{
- unsigned long start, end;
struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
/* MADV_FREE works for only anon vma at the moment */
if (!vma_is_anonymous(vma))
return -EINVAL;
- start = max(vma->vm_start, start_addr);
- if (start >= vma->vm_end)
+ range.start = max(vma->vm_start, start_addr);
+ if (range.start >= vma->vm_end)
return -EINVAL;
- end = min(vma->vm_end, end_addr);
- if (end <= vma->vm_start)
+ range.end = min(vma->vm_end, end_addr);
+ if (range.end <= vma->vm_start)
return -EINVAL;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ range.start, range.end);
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- madvise_free_page_range(&tlb, vma, start, end);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_invalidate_range_start(&range);
+ tlb_start_vma(&tlb, vma);
+ walk_page_range(vma->vm_mm, range.start, range.end,
+ &madvise_free_walk_ops, &tlb);
+ tlb_end_vma(&tlb, vma);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
return 0;
}
@@ -517,7 +757,7 @@
int behavior)
{
*prev = vma;
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
@@ -539,7 +779,7 @@
*/
return -ENOMEM;
}
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
return -EINVAL;
if (end > vma->vm_end) {
/*
@@ -693,6 +933,10 @@
return madvise_remove(vma, prev, start, end);
case MADV_WILLNEED:
return madvise_willneed(vma, prev, start, end);
+ case MADV_COLD:
+ return madvise_cold(vma, prev, start, end);
+ case MADV_PAGEOUT:
+ return madvise_pageout(vma, prev, start, end);
case MADV_FREE:
case MADV_DONTNEED:
return madvise_dontneed_free(vma, prev, start, end, behavior);
@@ -714,6 +958,8 @@
case MADV_WILLNEED:
case MADV_DONTNEED:
case MADV_FREE:
+ case MADV_COLD:
+ case MADV_PAGEOUT:
#ifdef CONFIG_KSM
case MADV_MERGEABLE:
case MADV_UNMERGEABLE:
@@ -808,6 +1054,8 @@
size_t len;
struct blk_plug plug;
+ start = untagged_addr(start);
+
if (!madvise_behavior_valid(behavior))
return error;
diff --git a/mm/memblock.c b/mm/memblock.c
index 2379444..c4b16ca 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Procedures for maintaining information about logical memory blocks.
*
* Peter Bergner, IBM Corp. June 2001.
* Copyright (C) 2001 Peter Bergner.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/kernel.h>
@@ -20,13 +16,19 @@
#include <linux/kmemleak.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <asm/sections.h>
#include <linux/io.h>
#include "internal.h"
+#define INIT_MEMBLOCK_REGIONS 128
+#define INIT_PHYSMEM_REGIONS 4
+
+#ifndef INIT_MEMBLOCK_RESERVED_REGIONS
+# define INIT_MEMBLOCK_RESERVED_REGIONS INIT_MEMBLOCK_REGIONS
+#endif
+
/**
* DOC: memblock overview
*
@@ -70,20 +72,41 @@
* :c:func:`memblock_set_node`. The :c:func:`memblock_add_node`
* performs such an assignment directly.
*
- * Once memblock is setup the memory can be allocated using either
- * memblock or bootmem APIs.
+ * Once memblock is setup the memory can be allocated using one of the
+ * API variants:
+ *
+ * * :c:func:`memblock_phys_alloc*` - these functions return the
+ * **physical** address of the allocated memory
+ * * :c:func:`memblock_alloc*` - these functions return the **virtual**
+ * address of the allocated memory.
+ *
+ * Note, that both API variants use implicit assumptions about allowed
+ * memory ranges and the fallback methods. Consult the documentation
+ * of :c:func:`memblock_alloc_internal` and
+ * :c:func:`memblock_alloc_range_nid` functions for more elaborate
+ * description.
*
* As the system boot progresses, the architecture specific
* :c:func:`mem_init` function frees all the memory to the buddy page
* allocator.
*
- * If an architecure enables %CONFIG_ARCH_DISCARD_MEMBLOCK, the
+ * Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
* memblock data structures will be discarded after the system
* initialization compltes.
*/
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+struct pglist_data __refdata contig_page_data;
+EXPORT_SYMBOL(contig_page_data);
+#endif
+
+unsigned long max_low_pfn;
+unsigned long min_low_pfn;
+unsigned long max_pfn;
+unsigned long long max_possible_pfn;
+
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
-static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
#endif
@@ -96,7 +119,7 @@
.reserved.regions = memblock_reserved_init_regions,
.reserved.cnt = 1, /* empty dummy entry */
- .reserved.max = INIT_MEMBLOCK_REGIONS,
+ .reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
@@ -116,7 +139,7 @@
static int memblock_memory_in_slab __initdata_memblock = 0;
static int memblock_reserved_in_slab __initdata_memblock = 0;
-enum memblock_flags __init_memblock choose_memblock_flags(void)
+static enum memblock_flags __init_memblock choose_memblock_flags(void)
{
return system_has_some_mirror ? MEMBLOCK_MIRROR : MEMBLOCK_NONE;
}
@@ -245,7 +268,7 @@
* Return:
* Found address on success, 0 on failure.
*/
-phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
+static phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t size,
phys_addr_t align, phys_addr_t start,
phys_addr_t end, int nid,
enum memblock_flags flags)
@@ -253,7 +276,8 @@
phys_addr_t kernel_end, ret;
/* pump up @end */
- if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
+ end == MEMBLOCK_ALLOC_KASAN)
end = memblock.current_limit;
/* avoid allocating the first page */
@@ -347,7 +371,7 @@
}
}
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+#ifndef CONFIG_ARCH_KEEP_MEMBLOCK
/**
* memblock_discard - discard memory and reserved arrays if they were allocated
*/
@@ -418,17 +442,7 @@
else
in_slab = &memblock_reserved_in_slab;
- /* Try to find some space for it.
- *
- * WARNING: We assume that either slab_is_available() and we use it or
- * we use MEMBLOCK for allocations. That means that this is unsafe to
- * use when bootmem is currently active (unless bootmem itself is
- * implemented on top of MEMBLOCK which isn't the case yet)
- *
- * This should however not be an issue for now, as we currently only
- * call into MEMBLOCK while it's still active, or much later when slab
- * is active for memory hotplug operations
- */
+ /* Try to find some space for it */
if (use_slab) {
new_array = kmalloc(new_size, GFP_KERNEL);
addr = new_array ? __pa(new_array) : 0;
@@ -684,7 +698,7 @@
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_add: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_add: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.memory, base, size, MAX_NUMNODES, 0);
@@ -791,12 +805,19 @@
return memblock_remove_range(&memblock.memory, base, size);
}
-
+/**
+ * memblock_free - free boot memory block
+ * @base: phys starting address of the boot memory block
+ * @size: size of the boot memory block in bytes
+ *
+ * Free boot memory block previously allocated by memblock_alloc_xx() API.
+ * The freed memory will not be released to the buddy allocator.
+ */
int __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
phys_addr_t end = base + size - 1;
- memblock_dbg(" memblock_free: [%pa-%pa] %pF\n",
+ memblock_dbg(" memblock_free: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
@@ -807,7 +828,7 @@
{
phys_addr_t end = base + size - 1;
- memblock_dbg("memblock_reserve: [%pa-%pa] %pF\n",
+ memblock_dbg("memblock_reserve: [%pa-%pa] %pS\n",
&base, &end, (void *)_RET_IP_);
return memblock_add_range(&memblock.reserved, base, size, MAX_NUMNODES, 0);
@@ -834,11 +855,14 @@
if (ret)
return ret;
- for (i = start_rgn; i < end_rgn; i++)
+ for (i = start_rgn; i < end_rgn; i++) {
+ struct memblock_region *r = &type->regions[i];
+
if (set)
- memblock_set_region_flags(&type->regions[i], flag);
+ r->flags |= flag;
else
- memblock_clear_region_flags(&type->regions[i], flag);
+ r->flags &= ~flag;
+ }
memblock_merge_regions(type);
return 0;
@@ -938,8 +962,31 @@
*idx = ULLONG_MAX;
}
+static bool should_skip_region(struct memblock_region *m, int nid, int flags)
+{
+ int m_nid = memblock_get_region_node(m);
+
+ /* only memory regions are associated with nodes, check it */
+ if (nid != NUMA_NO_NODE && nid != m_nid)
+ return true;
+
+ /* skip hotpluggable memory regions if needed */
+ if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
+ return true;
+
+ /* if we want mirror memory skip non-mirror memory regions */
+ if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
+ return true;
+
+ /* skip nomap memory unless we were asked for it explicitly */
+ if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ return true;
+
+ return false;
+}
+
/**
- * __next__mem_range - next function for for_each_free_mem_range() etc.
+ * __next_mem_range - next function for for_each_free_mem_range() etc.
* @idx: pointer to u64 loop variable
* @nid: node selector, %NUMA_NO_NODE for all nodes
* @flags: pick from blocks based on memory attributes
@@ -985,20 +1032,7 @@
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- /* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
- continue;
-
- /* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
- continue;
-
- /* if we want mirror memory skip non-mirror memory regions */
- if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
- continue;
-
- /* skip nomap memory unless we were asked for it explicitly */
- if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ if (should_skip_region(m, nid, flags))
continue;
if (!type_b) {
@@ -1102,20 +1136,7 @@
phys_addr_t m_end = m->base + m->size;
int m_nid = memblock_get_region_node(m);
- /* only memory regions are associated with nodes, check it */
- if (nid != NUMA_NO_NODE && nid != m_nid)
- continue;
-
- /* skip hotpluggable memory regions if needed */
- if (movable_node_is_enabled() && memblock_is_hotpluggable(m))
- continue;
-
- /* if we want mirror memory skip non-mirror memory regions */
- if ((flags & MEMBLOCK_MIRROR) && !memblock_is_mirror(m))
- continue;
-
- /* skip nomap memory unless we were asked for it explicitly */
- if (!(flags & MEMBLOCK_NOMAP) && memblock_is_nomap(m))
+ if (should_skip_region(m, nid, flags))
continue;
if (!type_b) {
@@ -1170,7 +1191,7 @@
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
- * Common iterator interface used to define for_each_mem_range().
+ * Common iterator interface used to define for_each_mem_pfn_range().
*/
void __init_memblock __next_mem_pfn_range(int *idx, int nid,
unsigned long *out_start_pfn,
@@ -1230,166 +1251,125 @@
return 0;
}
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
-static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t start,
- phys_addr_t end, int nid,
- enum memblock_flags flags)
-{
- phys_addr_t found;
-
- if (!align)
- align = SMP_CACHE_BYTES;
-
- found = memblock_find_in_range_node(size, align, start, end, nid,
- flags);
- if (found && !memblock_reserve(found, size)) {
- /*
- * The min_count is set to 0 so that memblock allocations are
- * never reported as leaks.
- */
- kmemleak_alloc_phys(found, size, 0, 0);
- return found;
- }
- return 0;
-}
-
-phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align,
- phys_addr_t start, phys_addr_t end,
- enum memblock_flags flags)
-{
- return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
- flags);
-}
-
-phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size,
- phys_addr_t align, phys_addr_t max_addr,
- int nid, enum memblock_flags flags)
-{
- return memblock_alloc_range_nid(size, align, 0, max_addr, nid, flags);
-}
-
-phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
-{
- enum memblock_flags flags = choose_memblock_flags();
- phys_addr_t ret;
-
-again:
- ret = memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE,
- nid, flags);
-
- if (!ret && (flags & MEMBLOCK_MIRROR)) {
- flags &= ~MEMBLOCK_MIRROR;
- goto again;
- }
- return ret;
-}
-
-phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
- return memblock_alloc_base_nid(size, align, max_addr, NUMA_NO_NODE,
- MEMBLOCK_NONE);
-}
-
-phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
-{
- phys_addr_t alloc;
-
- alloc = __memblock_alloc_base(size, align, max_addr);
-
- if (alloc == 0)
- panic("ERROR: Failed to allocate %pa bytes below %pa.\n",
- &size, &max_addr);
-
- return alloc;
-}
-
-phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
-{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
-}
-
-phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
-{
- phys_addr_t res = memblock_alloc_nid(size, align, nid);
-
- if (res)
- return res;
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
-}
-
-#if defined(CONFIG_NO_BOOTMEM)
+#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/**
- * memblock_virt_alloc_internal - allocate boot memory block
+ * __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
+ *
+ * @idx: pointer to u64 loop variable
+ * @zone: zone in which all of the memory blocks reside
+ * @out_spfn: ptr to ulong for start pfn of the range, can be %NULL
+ * @out_epfn: ptr to ulong for end pfn of the range, can be %NULL
+ *
+ * This function is meant to be a zone/pfn specific wrapper for the
+ * for_each_mem_range type iterators. Specifically they are used in the
+ * deferred memory init routines and as such we were duplicating much of
+ * this logic throughout the code. So instead of having it in multiple
+ * locations it seemed like it would make more sense to centralize this to
+ * one new iterator that does everything they need.
+ */
+void __init_memblock
+__next_mem_pfn_range_in_zone(u64 *idx, struct zone *zone,
+ unsigned long *out_spfn, unsigned long *out_epfn)
+{
+ int zone_nid = zone_to_nid(zone);
+ phys_addr_t spa, epa;
+ int nid;
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+
+ while (*idx != U64_MAX) {
+ unsigned long epfn = PFN_DOWN(epa);
+ unsigned long spfn = PFN_UP(spa);
+
+ /*
+ * Verify the end is at least past the start of the zone and
+ * that we have at least one PFN to initialize.
+ */
+ if (zone->zone_start_pfn < epfn && spfn < epfn) {
+ /* if we went too far just stop searching */
+ if (zone_end_pfn(zone) <= spfn) {
+ *idx = U64_MAX;
+ break;
+ }
+
+ if (out_spfn)
+ *out_spfn = max(zone->zone_start_pfn, spfn);
+ if (out_epfn)
+ *out_epfn = min(zone_end_pfn(zone), epfn);
+
+ return;
+ }
+
+ __next_mem_range(idx, zone_nid, MEMBLOCK_NONE,
+ &memblock.memory, &memblock.reserved,
+ &spa, &epa, &nid);
+ }
+
+ /* signal end of iteration */
+ if (out_spfn)
+ *out_spfn = ULONG_MAX;
+ if (out_epfn)
+ *out_epfn = 0;
+}
+
+#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
+
+/**
+ * memblock_alloc_range_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region to allocate (phys address)
- * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @start: the lower bound of the memory region to allocate (phys address)
+ * @end: the upper bound of the memory region to allocate (phys address)
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
- * The @min_addr limit is dropped if it can not be satisfied and the allocation
- * will fall back to memory below @min_addr. Also, allocation may fall back
- * to any node in the system if the specified node can not
- * hold the requested memory.
- *
* The allocation is performed from memory region limited by
- * memblock.current_limit if @max_addr == %BOOTMEM_ALLOC_ACCESSIBLE.
+ * memblock.current_limit if @end == %MEMBLOCK_ALLOC_ACCESSIBLE.
*
- * The memory block is aligned on %SMP_CACHE_BYTES if @align == 0.
+ * If the specified node can not hold the requested memory the
+ * allocation falls back to any node in the system
*
- * The phys address of allocated boot memory block is converted to virtual and
- * allocated memory is reset to 0.
+ * For systems with memory mirroring, the allocation is attempted first
+ * from the regions with mirroring enabled and then retried from any
+ * memory region.
*
- * In addition, function sets the min_count to 0 using kmemleak_alloc for
+ * In addition, function sets the min_count to 0 using kmemleak_alloc_phys for
* allocated boot memory block, so that it is never reported as leaks.
*
* Return:
- * Virtual address of allocated memory block on success, NULL on failure.
+ * Physical address of allocated memory block on success, %0 on failure.
*/
-static void * __init memblock_virt_alloc_internal(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
+static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start,
+ phys_addr_t end, int nid)
{
- phys_addr_t alloc;
- void *ptr;
enum memblock_flags flags = choose_memblock_flags();
+ phys_addr_t found;
if (WARN_ONCE(nid == MAX_NUMNODES, "Usage of MAX_NUMNODES is deprecated. Use NUMA_NO_NODE instead\n"))
nid = NUMA_NO_NODE;
- /*
- * Detect any accidental use of these APIs after slab is ready, as at
- * this moment memblock may be deinitialized already and its
- * internal data may be destroyed (after execution of free_all_bootmem)
- */
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, nid);
-
- if (!align)
+ if (!align) {
+ /* Can't use WARNs this early in boot on powerpc */
+ dump_stack();
align = SMP_CACHE_BYTES;
+ }
- if (max_addr > memblock.current_limit)
- max_addr = memblock.current_limit;
again:
- alloc = memblock_find_in_range_node(size, align, min_addr, max_addr,
- nid, flags);
- if (alloc && !memblock_reserve(alloc, size))
+ found = memblock_find_in_range_node(size, align, start, end, nid,
+ flags);
+ if (found && !memblock_reserve(found, size))
goto done;
if (nid != NUMA_NO_NODE) {
- alloc = memblock_find_in_range_node(size, align, min_addr,
- max_addr, NUMA_NO_NODE,
+ found = memblock_find_in_range_node(size, align, start,
+ end, NUMA_NO_NODE,
flags);
- if (alloc && !memblock_reserve(alloc, size))
+ if (found && !memblock_reserve(found, size))
goto done;
}
- if (min_addr) {
- min_addr = 0;
- goto again;
- }
-
if (flags & MEMBLOCK_MIRROR) {
flags &= ~MEMBLOCK_MIRROR;
pr_warn("Could not allocate %pap bytes of mirrored memory\n",
@@ -1397,30 +1377,119 @@
goto again;
}
- return NULL;
+ return 0;
+
done:
- ptr = phys_to_virt(alloc);
+ /* Skip kmemleak for kasan_init() due to high volume. */
+ if (end != MEMBLOCK_ALLOC_KASAN)
+ /*
+ * The min_count is set to 0 so that memblock allocated
+ * blocks are never reported as leaks. This is because many
+ * of these blocks are only referred via the physical
+ * address which is not looked up by kmemleak.
+ */
+ kmemleak_alloc_phys(found, size, 0, 0);
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks. This is because many of these blocks
- * are only referred via the physical address which is not
- * looked up by kmemleak.
- */
- kmemleak_alloc(ptr, size, 0, 0);
-
- return ptr;
+ return found;
}
/**
- * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing
+ * memblock_phys_alloc_range - allocate a memory block inside specified range
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @start: the lower bound of the memory region to allocate (physical address)
+ * @end: the upper bound of the memory region to allocate (physical address)
+ *
+ * Allocate @size bytes between @start and @end.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
+phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
+ phys_addr_t align,
+ phys_addr_t start,
+ phys_addr_t end)
+{
+ return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE);
+}
+
+/**
+ * memblock_phys_alloc_try_nid - allocate a memory block from specified NUMA node
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Allocates memory block from the specified NUMA node. If the node
+ * has no available memory, attempts to allocate from any node in the
+ * system.
+ *
+ * Return: physical address of the allocated memory block on success,
+ * %0 on failure.
+ */
+phys_addr_t __init memblock_phys_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
+{
+ return memblock_alloc_range_nid(size, align, 0,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+}
+
+/**
+ * memblock_alloc_internal - allocate boot memory block
+ * @size: size of memory block to be allocated in bytes
+ * @align: alignment of the region and block's size
+ * @min_addr: the lower bound of the memory region to allocate (phys address)
+ * @max_addr: the upper bound of the memory region to allocate (phys address)
+ * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
+ *
+ * Allocates memory block using memblock_alloc_range_nid() and
+ * converts the returned physical address to virtual.
+ *
+ * The @min_addr limit is dropped if it can not be satisfied and the allocation
+ * will fall back to memory below @min_addr. Other constraints, such
+ * as node and mirrored memory will be handled again in
+ * memblock_alloc_range_nid().
+ *
+ * Return:
+ * Virtual address of allocated memory block on success, NULL on failure.
+ */
+static void * __init memblock_alloc_internal(
+ phys_addr_t size, phys_addr_t align,
+ phys_addr_t min_addr, phys_addr_t max_addr,
+ int nid)
+{
+ phys_addr_t alloc;
+
+ /*
+ * Detect any accidental use of these APIs after slab is ready, as at
+ * this moment memblock may be deinitialized already and its
+ * internal data may be destroyed (after execution of memblock_free_all)
+ */
+ if (WARN_ON_ONCE(slab_is_available()))
+ return kzalloc_node(size, GFP_NOWAIT, nid);
+
+ if (max_addr > memblock.current_limit)
+ max_addr = memblock.current_limit;
+
+ alloc = memblock_alloc_range_nid(size, align, min_addr, max_addr, nid);
+
+ /* retry allocation without lower limit */
+ if (!alloc && min_addr)
+ alloc = memblock_alloc_range_nid(size, align, 0, max_addr, nid);
+
+ if (!alloc)
+ return NULL;
+
+ return phys_to_virt(alloc);
+}
+
+/**
+ * memblock_alloc_try_nid_raw - allocate boot memory block without zeroing
* memory and without panicking
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @min_addr: the lower bound of the memory region from where the allocation
* is preferred (phys address)
* @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
@@ -1431,34 +1500,33 @@
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
-void * __init memblock_virt_alloc_try_nid_raw(
+void * __init memblock_alloc_try_nid_raw(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_virt_alloc_internal(size, align,
+ ptr = memblock_alloc_internal(size, align,
min_addr, max_addr, nid);
-#ifdef CONFIG_DEBUG_VM
if (ptr && size > 0)
- memset(ptr, PAGE_POISON_PATTERN, size);
-#endif
+ page_init_poison(ptr, size);
+
return ptr;
}
/**
- * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block
+ * memblock_alloc_try_nid - allocate boot memory block
* @size: size of memory block to be allocated in bytes
* @align: alignment of the region and block's size
* @min_addr: the lower bound of the memory region from where the allocation
* is preferred (phys address)
* @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
+ * is preferred (phys address), or %MEMBLOCK_ALLOC_ACCESSIBLE to
* allocate only from memory limited by memblock.current_limit value
* @nid: nid of the free area to find, %NUMA_NO_NODE for any node
*
@@ -1468,106 +1536,47 @@
* Return:
* Virtual address of allocated memory block on success, NULL on failure.
*/
-void * __init memblock_virt_alloc_try_nid_nopanic(
- phys_addr_t size, phys_addr_t align,
- phys_addr_t min_addr, phys_addr_t max_addr,
- int nid)
-{
- void *ptr;
-
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
- __func__, (u64)size, (u64)align, nid, &min_addr,
- &max_addr, (void *)_RET_IP_);
-
- ptr = memblock_virt_alloc_internal(size, align,
- min_addr, max_addr, nid);
- if (ptr)
- memset(ptr, 0, size);
- return ptr;
-}
-
-/**
- * memblock_virt_alloc_try_nid - allocate boot memory block with panicking
- * @size: size of memory block to be allocated in bytes
- * @align: alignment of the region and block's size
- * @min_addr: the lower bound of the memory region from where the allocation
- * is preferred (phys address)
- * @max_addr: the upper bound of the memory region from where the allocation
- * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to
- * allocate only from memory limited by memblock.current_limit value
- * @nid: nid of the free area to find, %NUMA_NO_NODE for any node
- *
- * Public panicking version of memblock_virt_alloc_try_nid_nopanic()
- * which provides debug information (including caller info), if enabled,
- * and panics if the request can not be satisfied.
- *
- * Return:
- * Virtual address of allocated memory block on success, NULL on failure.
- */
-void * __init memblock_virt_alloc_try_nid(
+void * __init memblock_alloc_try_nid(
phys_addr_t size, phys_addr_t align,
phys_addr_t min_addr, phys_addr_t max_addr,
int nid)
{
void *ptr;
- memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pF\n",
+ memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa %pS\n",
__func__, (u64)size, (u64)align, nid, &min_addr,
&max_addr, (void *)_RET_IP_);
- ptr = memblock_virt_alloc_internal(size, align,
+ ptr = memblock_alloc_internal(size, align,
min_addr, max_addr, nid);
- if (ptr) {
+ if (ptr)
memset(ptr, 0, size);
- return ptr;
- }
- panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=%pa max_addr=%pa\n",
- __func__, (u64)size, (u64)align, nid, &min_addr, &max_addr);
- return NULL;
-}
-#endif
-
-/**
- * __memblock_free_early - free boot memory block
- * @base: phys starting address of the boot memory block
- * @size: size of the boot memory block in bytes
- *
- * Free boot memory block previously allocated by memblock_virt_alloc_xx() API.
- * The freeing memory will not be released to the buddy allocator.
- */
-void __init __memblock_free_early(phys_addr_t base, phys_addr_t size)
-{
- phys_addr_t end = base + size - 1;
-
- memblock_dbg("%s: [%pa-%pa] %pF\n",
- __func__, &base, &end, (void *)_RET_IP_);
- kmemleak_free_part_phys(base, size);
- memblock_remove_range(&memblock.reserved, base, size);
+ return ptr;
}
/**
- * __memblock_free_late - free bootmem block pages directly to buddy allocator
+ * __memblock_free_late - free pages directly to buddy allocator
* @base: phys starting address of the boot memory block
* @size: size of the boot memory block in bytes
*
- * This is only useful when the bootmem allocator has already been torn
+ * This is only useful when the memblock allocator has already been torn
* down, but we are still initializing the system. Pages are released directly
- * to the buddy allocator, no bootmem metadata is updated because it is gone.
+ * to the buddy allocator.
*/
void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
{
phys_addr_t cursor, end;
end = base + size - 1;
- memblock_dbg("%s: [%pa-%pa] %pF\n",
+ memblock_dbg("%s: [%pa-%pa] %pS\n",
__func__, &base, &end, (void *)_RET_IP_);
kmemleak_free_part_phys(base, size);
cursor = PFN_UP(base);
end = PFN_DOWN(base + size);
for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
- totalram_pages++;
+ memblock_free_pages(pfn_to_page(cursor), cursor, 0);
+ totalram_pages_inc();
}
}
@@ -1718,7 +1727,7 @@
return -1;
}
-bool __init memblock_is_reserved(phys_addr_t addr)
+bool __init_memblock memblock_is_reserved(phys_addr_t addr)
{
return memblock_search(&memblock.reserved, addr) != -1;
}
@@ -1880,7 +1889,101 @@
}
early_param("memblock", early_memblock);
-#if defined(CONFIG_DEBUG_FS) && !defined(CONFIG_ARCH_DISCARD_MEMBLOCK)
+/*
+ * Hand the pfn range [start, end) to memblock_free_pages() in chunks of
+ * (1UL << order) pages.
+ */
+static void __init __free_pages_memory(unsigned long start, unsigned long end)
+{
+ int order;
+
+ while (start < end) {
+ /* start from the largest order allowed by MAX_ORDER and __ffs(start) */
+ order = min(MAX_ORDER - 1UL, __ffs(start));
+
+ /* shrink the chunk until it no longer runs past end */
+ while (start + (1UL << order) > end)
+ order--;
+
+ memblock_free_pages(pfn_to_page(start), start, order);
+
+ start += (1UL << order);
+ }
+}
+
+/*
+ * Free the whole pages that lie inside [start, end) and below max_low_pfn.
+ * Returns the number of pages passed on to __free_pages_memory().
+ */
+static unsigned long __init __free_memory_core(phys_addr_t start,
+						phys_addr_t end)
+{
+	unsigned long first_pfn = PFN_UP(start);
+	unsigned long last_pfn = PFN_DOWN(end);
+
+	last_pfn = min_t(unsigned long, last_pfn, max_low_pfn);
+
+	/* nothing is left once the range is rounded inwards and clamped */
+	if (first_pfn >= last_pfn)
+		return 0;
+
+	__free_pages_memory(first_pfn, last_pfn);
+	return last_pfn - first_pfn;
+}
+
+/*
+ * Pass every reserved region to reserve_bootmem_region() and every free
+ * range to __free_memory_core(). Returns the sum of the page counts
+ * reported by __free_memory_core().
+ */
+static unsigned long __init free_low_memory_core_early(void)
+{
+ unsigned long count = 0;
+ phys_addr_t start, end;
+ u64 i;
+
+ memblock_clear_hotplug(0, -1);
+
+ for_each_reserved_mem_region(i, &start, &end)
+ reserve_bootmem_region(start, end);
+
+ /*
+ * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
+ * because in some case like Node0 doesn't have RAM installed
+ * low ram will be on Node1
+ */
+ for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
+ NULL)
+ count += __free_memory_core(start, end);
+
+ return count;
+}
+
+static int reset_managed_pages_done __initdata;
+
+/* Zero the managed_pages counter of every zone slot of @pgdat. */
+void reset_node_managed_pages(pg_data_t *pgdat)
+{
+	int zid;
+
+	for (zid = 0; zid < MAX_NR_ZONES; zid++)
+		atomic_long_set(&pgdat->node_zones[zid].managed_pages, 0);
+}
+
+/*
+ * Reset the managed_pages counters of all online nodes. Only the first
+ * call does any work; reset_managed_pages_done turns later calls into
+ * no-ops.
+ */
+void __init reset_all_zones_managed_pages(void)
+{
+	struct pglist_data *node;
+
+	if (!reset_managed_pages_done) {
+		for_each_online_pgdat(node)
+			reset_node_managed_pages(node);
+
+		reset_managed_pages_done = 1;
+	}
+}
+
+/**
+ * memblock_free_all - release free pages to the buddy allocator
+ *
+ * Resets the zones' managed page counters, frees the low memory ranges and
+ * adds the freed page count to the total RAM page counter.
+ *
+ * Return: the number of pages actually released.
+ */
+unsigned long __init memblock_free_all(void)
+{
+	unsigned long freed;
+
+	reset_all_zones_managed_pages();
+
+	freed = free_low_memory_core_early();
+	totalram_pages_add(freed);
+
+	return freed;
+}
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_ARCH_KEEP_MEMBLOCK)
static int memblock_debug_show(struct seq_file *m, void *private)
{
@@ -1903,8 +2006,7 @@
static int __init memblock_init_debugfs(void)
{
struct dentry *root = debugfs_create_dir("memblock", NULL);
- if (!root)
- return -ENXIO;
+
debugfs_create_file("memory", 0444, root,
&memblock.memory, &memblock_debug_fops);
debugfs_create_file("reserved", 0444, root,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index e79cb59..46ad252 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/* memcontrol.c - Memory Controller
*
* Copyright IBM Corporation, 2007
@@ -19,26 +20,17 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/sched/mm.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
+#include <linux/vm_event_item.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
@@ -65,6 +57,8 @@
#include <linux/lockdep.h>
#include <linux/file.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
+#include <linux/seq_buf.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -94,6 +88,10 @@
#define do_swap_account 0
#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
+#endif
+
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
@@ -248,6 +246,12 @@
iter != NULL; \
iter = mem_cgroup_iter(NULL, iter, NULL))
+/*
+ * True when the current task is an OOM victim, has a fatal signal
+ * pending, or is exiting (PF_EXITING).
+ */
+static inline bool should_force_charge(void)
+{
+	if (tsk_is_oom_victim(current))
+		return true;
+	if (fatal_signal_pending(current))
+		return true;
+
+	return (current->flags & PF_EXITING) != 0;
+}
+
/* Some nice accessors for the vmpressure. */
struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
{
@@ -314,6 +318,7 @@
EXPORT_SYMBOL(memcg_kmem_enabled_key);
struct workqueue_struct *memcg_kmem_cache_wq;
+#endif
static int memcg_shrinker_map_size;
static DEFINE_MUTEX(memcg_shrinker_map_mutex);
@@ -437,14 +442,6 @@
}
}
-#else /* CONFIG_MEMCG_KMEM */
-static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
-{
- return 0;
-}
-static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
-#endif /* CONFIG_MEMCG_KMEM */
-
/**
* mem_cgroup_css_from_page - css of the memcg associated with a page
* @page: page of interest
@@ -487,7 +484,10 @@
unsigned long ino = 0;
rcu_read_lock();
- memcg = READ_ONCE(page->mem_cgroup);
+ if (PageSlab(page) && !PageTail(page))
+ memcg = memcg_from_slab_page(page);
+ else
+ memcg = READ_ONCE(page->mem_cgroup);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -680,10 +680,153 @@
return mz;
}
-static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
- int event)
+/**
+ * __mod_memcg_state - update cgroup memory statistics
+ * @memcg: the memory cgroup
+ * @idx: the stat item - can be enum memcg_stat_item or enum node_stat_item
+ * @val: delta to add to the counter, can be negative
+ */
+void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- return atomic_long_read(&memcg->events[event]);
+ long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* pending per-cpu delta for this item, with @val folded in */
+ x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->stat[idx], x);
+ /* flush the batch into this memcg and every ancestor */
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[idx]);
+ x = 0;
+ }
+ /* store what is still pending (0 right after a flush) */
+ __this_cpu_write(memcg->vmstats_percpu->stat[idx], x);
+}
+
+/*
+ * Return the per-node info of the parent of @pn's memcg for node @nid,
+ * or NULL when that memcg has no parent.
+ */
+static struct mem_cgroup_per_node *
+parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
+{
+	struct mem_cgroup *up = parent_mem_cgroup(pn->memcg);
+
+	return up ? mem_cgroup_nodeinfo(up, nid) : NULL;
+}
+
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
+ struct mem_cgroup_per_node *pn;
+ struct mem_cgroup *memcg;
+ long x;
+
+ /* Update node */
+ __mod_node_page_state(pgdat, idx, val);
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* the lruvec is embedded in its mem_cgroup_per_node */
+ pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
+ memcg = pn->memcg;
+
+ /* Update memcg */
+ __mod_memcg_state(memcg, idx, val);
+
+ /* Update lruvec */
+ __this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+
+ /* batch per cpu; once past the threshold flush to pn and its ancestors */
+ x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
+ if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup_per_node *pi;
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
+ atomic_long_add(x, &pi->lruvec_stat[idx]);
+ x = 0;
+ }
+ /* store what is still pending (0 right after a flush) */
+ __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
+}
+
+/*
+ * Account @val for stat item @idx against the slab page backing @p: on the
+ * page's lruvec when the page belongs to a non-root memcg, otherwise on
+ * the node only.
+ */
+void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+{
+	struct page *head = virt_to_head_page(p);
+	pg_data_t *node = page_pgdat(head);
+	struct mem_cgroup *owner;
+
+	rcu_read_lock();
+	owner = memcg_from_slab_page(head);
+	if (owner && owner != root_mem_cgroup) {
+		__mod_lruvec_state(mem_cgroup_lruvec(node, owner), idx, val);
+	} else {
+		/* Untracked pages have no memcg, no lruvec. Update only the node */
+		__mod_node_page_state(node, idx, val);
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * __count_memcg_events - account VM events in a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event item
+ * @count: the number of events that occurred
+ */
+void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
+ unsigned long count)
+{
+ unsigned long x;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ /* pending per-cpu event count for this item, with @count folded in */
+ x = count + __this_cpu_read(memcg->vmstats_percpu->events[idx]);
+ if (unlikely(x > MEMCG_CHARGE_BATCH)) {
+ struct mem_cgroup *mi;
+
+ /*
+ * Batch local counters to keep them in sync with
+ * the hierarchical ones.
+ */
+ __this_cpu_add(memcg->vmstats_local->events[idx], x);
+ /* flush the batch into this memcg and every ancestor */
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[idx]);
+ x = 0;
+ }
+ /* store what is still pending (0 right after a flush) */
+ __this_cpu_write(memcg->vmstats_percpu->events[idx], x);
+}
+
+/* Read @memcg's atomic vmevents[] counter for @event. */
+static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+{
+ return atomic_long_read(&memcg->vmevents[event]);
+}
+
+/*
+ * Sum @memcg's per-cpu vmstats_local counter for @event over all possible
+ * CPUs.
+ */
+static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
+{
+ long x = 0;
+ int cpu;
+
+ for_each_possible_cpu(cpu)
+ x += per_cpu(memcg->vmstats_local->events[event], cpu);
+ return x;
}
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
@@ -715,35 +858,7 @@
nr_pages = -nr_pages; /* for event */
}
- __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
-}
-
-unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
-{
- struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
- unsigned long nr = 0;
- enum lru_list lru;
-
- VM_BUG_ON((unsigned)nid >= nr_node_ids);
-
- for_each_lru(lru) {
- if (!(BIT(lru) & lru_mask))
- continue;
- nr += mem_cgroup_get_lru_size(lruvec, lru);
- }
- return nr;
-}
-
-static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
-{
- unsigned long nr = 0;
- int nid;
-
- for_each_node_state(nid, N_MEMORY)
- nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
- return nr;
+ __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
}
static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
@@ -751,8 +866,8 @@
{
unsigned long val, next;
- val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
- next = __this_cpu_read(memcg->stat_cpu->targets[target]);
+ val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
/* from time_after() in jiffies.h */
if ((long)(next - val) < 0) {
switch (target) {
@@ -768,7 +883,7 @@
default:
break;
}
- __this_cpu_write(memcg->stat_cpu->targets[target], next);
+ __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
return true;
}
return false;
@@ -845,7 +960,7 @@
if (unlikely(!memcg))
memcg = root_mem_cgroup;
}
- } while (!css_tryget_online(&memcg->css));
+ } while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
}
@@ -1031,26 +1146,45 @@
css_put(&prev->css);
}
-static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+static void __invalidate_reclaim_iterators(struct mem_cgroup *from,
+ struct mem_cgroup *dead_memcg)
{
- struct mem_cgroup *memcg = dead_memcg;
struct mem_cgroup_reclaim_iter *iter;
struct mem_cgroup_per_node *mz;
int nid;
int i;
- for (; memcg; memcg = parent_mem_cgroup(memcg)) {
- for_each_node(nid) {
- mz = mem_cgroup_nodeinfo(memcg, nid);
- for (i = 0; i <= DEF_PRIORITY; i++) {
- iter = &mz->iter[i];
- cmpxchg(&iter->position,
- dead_memcg, NULL);
- }
+ for_each_node(nid) {
+ mz = mem_cgroup_nodeinfo(from, nid);
+ for (i = 0; i <= DEF_PRIORITY; i++) {
+ iter = &mz->iter[i];
+ cmpxchg(&iter->position,
+ dead_memcg, NULL);
}
}
}
+/*
+ * Run __invalidate_reclaim_iterators() for @dead_memcg on @dead_memcg
+ * itself, on each of its ancestors, and on root_mem_cgroup if the ancestor
+ * walk did not end there.
+ */
+static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
+{
+ struct mem_cgroup *memcg = dead_memcg;
+ struct mem_cgroup *last;
+
+ /* remember the topmost memcg the parent walk reached */
+ do {
+ __invalidate_reclaim_iterators(memcg, dead_memcg);
+ last = memcg;
+ } while ((memcg = parent_mem_cgroup(memcg)));
+
+ /*
+ * When cgroup1 non-hierarchy mode is used,
+ * parent_mem_cgroup() does not walk all the way up to the
+ * cgroup root (root_mem_cgroup). So we have to handle
+ * dead_memcg from cgroup root separately.
+ */
+ if (last != root_mem_cgroup)
+ __invalidate_reclaim_iterators(root_mem_cgroup,
+ dead_memcg);
+}
+
/**
* mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
* @memcg: hierarchy root
@@ -1076,7 +1210,7 @@
struct css_task_iter it;
struct task_struct *task;
- css_task_iter_start(&iter->css, 0, &it);
+ css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it);
while (!ret && (task = css_task_iter_next(&it)))
ret = fn(task, arg);
css_task_iter_end(&it);
@@ -1168,32 +1302,6 @@
*lru_size += nr_pages;
}
-bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
-{
- struct mem_cgroup *task_memcg;
- struct task_struct *p;
- bool ret;
-
- p = find_lock_task_mm(task);
- if (p) {
- task_memcg = get_mem_cgroup_from_mm(p->mm);
- task_unlock(p);
- } else {
- /*
- * All threads may have already detached their mm's, but the oom
- * killer still needs to detect if they have already been oom
- * killed to prevent needlessly killing additional tasks.
- */
- rcu_read_lock();
- task_memcg = mem_cgroup_from_task(task);
- css_get(&task_memcg->css);
- rcu_read_unlock();
- }
- ret = mem_cgroup_is_descendant(task_memcg, memcg);
- css_put(&task_memcg->css);
- return ret;
-}
-
/**
* mem_cgroup_margin - calculate chargeable space of a memory cgroup
* @memcg: the memory cgroup
@@ -1269,85 +1377,174 @@
return false;
}
-static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
- NR_SHMEM,
- NR_FILE_MAPPED,
- NR_FILE_DIRTY,
- NR_WRITEBACK,
- MEMCG_SWAP,
-};
+static char *memory_stat_format(struct mem_cgroup *memcg)
+{
+ struct seq_buf s;
+ int i;
-static const char *const memcg1_stat_names[] = {
- "cache",
- "rss",
- "rss_huge",
- "shmem",
- "mapped_file",
- "dirty",
- "writeback",
- "swap",
-};
+ seq_buf_init(&s, kmalloc(PAGE_SIZE, GFP_KERNEL), PAGE_SIZE);
+ if (!s.buffer)
+ return NULL;
+
+ /*
+ * Provide statistics on the state of the memory subsystem as
+ * well as cumulative event counters that show past behavior.
+ *
+ * This list is ordered following a combination of these gradients:
+ * 1) generic big picture -> specifics and details
+ * 2) reflecting userspace activity -> reflecting kernel heuristics
+ *
+ * Current memory state:
+ */
+
+ seq_buf_printf(&s, "anon %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "kernel_stack %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ 1024);
+ seq_buf_printf(&s, "slab %llu\n",
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "sock %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_SOCK) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "shmem %llu\n",
+ (u64)memcg_page_state(memcg, NR_SHMEM) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_mapped %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_MAPPED) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_dirty %llu\n",
+ (u64)memcg_page_state(memcg, NR_FILE_DIRTY) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "file_writeback %llu\n",
+ (u64)memcg_page_state(memcg, NR_WRITEBACK) *
+ PAGE_SIZE);
+
+ /*
+ * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
+ * with the NR_ANON_THP vm counter, but right now it's a pain in the
+ * arse because it requires migrating the work out of rmap to a place
+ * where the page->mem_cgroup is set up and stable.
+ */
+ seq_buf_printf(&s, "anon_thp %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
+ PAGE_SIZE);
+
+ for (i = 0; i < NR_LRU_LISTS; i++)
+ seq_buf_printf(&s, "%s %llu\n", mem_cgroup_lru_names[i],
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
+
+ seq_buf_printf(&s, "slab_reclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
+ PAGE_SIZE);
+ seq_buf_printf(&s, "slab_unreclaimable %llu\n",
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
+ PAGE_SIZE);
+
+ /* Accumulated memory events */
+
+ seq_buf_printf(&s, "pgfault %lu\n", memcg_events(memcg, PGFAULT));
+ seq_buf_printf(&s, "pgmajfault %lu\n", memcg_events(memcg, PGMAJFAULT));
+
+ seq_buf_printf(&s, "workingset_refault %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT));
+ seq_buf_printf(&s, "workingset_activate %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
+ memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
+
+ seq_buf_printf(&s, "pgrefill %lu\n", memcg_events(memcg, PGREFILL));
+ seq_buf_printf(&s, "pgscan %lu\n",
+ memcg_events(memcg, PGSCAN_KSWAPD) +
+ memcg_events(memcg, PGSCAN_DIRECT));
+ seq_buf_printf(&s, "pgsteal %lu\n",
+ memcg_events(memcg, PGSTEAL_KSWAPD) +
+ memcg_events(memcg, PGSTEAL_DIRECT));
+ seq_buf_printf(&s, "pgactivate %lu\n", memcg_events(memcg, PGACTIVATE));
+ seq_buf_printf(&s, "pgdeactivate %lu\n", memcg_events(memcg, PGDEACTIVATE));
+ seq_buf_printf(&s, "pglazyfree %lu\n", memcg_events(memcg, PGLAZYFREE));
+ seq_buf_printf(&s, "pglazyfreed %lu\n", memcg_events(memcg, PGLAZYFREED));
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ seq_buf_printf(&s, "thp_fault_alloc %lu\n",
+ memcg_events(memcg, THP_FAULT_ALLOC));
+ seq_buf_printf(&s, "thp_collapse_alloc %lu\n",
+ memcg_events(memcg, THP_COLLAPSE_ALLOC));
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+ /* The above should easily fit into one page */
+ WARN_ON_ONCE(seq_buf_has_overflowed(&s));
+
+ return s.buffer;
+}
#define K(x) ((x) << (PAGE_SHIFT-10))
/**
- * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
+ * mem_cgroup_print_oom_context: Print OOM information relevant to
+ * memory controller.
* @memcg: The memory cgroup that went over limit
* @p: Task that is going to be killed
*
* NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
* enabled
*/
-void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
+void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p)
{
- struct mem_cgroup *iter;
- unsigned int i;
-
rcu_read_lock();
+ if (memcg) {
+ pr_cont(",oom_memcg=");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ } else
+ pr_cont(",global_oom");
if (p) {
- pr_info("Task in ");
+ pr_cont(",task_memcg=");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_cont(" killed as a result of limit of ");
- } else {
- pr_info("Memory limit reached of cgroup ");
}
-
- pr_cont_cgroup_path(memcg->css.cgroup);
- pr_cont("\n");
-
rcu_read_unlock();
+}
+
+/**
+ * mem_cgroup_print_oom_meminfo: Print OOM memory information relevant to
+ * memory controller.
+ * @memcg: The memory cgroup that went over limit
+ */
+void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg)
+{
+ char *buf;
pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
K((u64)page_counter_read(&memcg->memory)),
K((u64)memcg->memory.max), memcg->memory.failcnt);
- pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->memsw)),
- K((u64)memcg->memsw.max), memcg->memsw.failcnt);
- pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
- K((u64)page_counter_read(&memcg->kmem)),
- K((u64)memcg->kmem.max), memcg->kmem.failcnt);
-
- for_each_mem_cgroup_tree(iter, memcg) {
- pr_info("Memory cgroup stats for ");
- pr_cont_cgroup_path(iter->css.cgroup);
- pr_cont(":");
-
- for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
- if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
- continue;
- pr_cont(" %s:%luKB", memcg1_stat_names[i],
- K(memcg_page_state(iter, memcg1_stats[i])));
- }
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
- K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
-
- pr_cont("\n");
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ pr_info("swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->swap)),
+ K((u64)memcg->swap.max), memcg->swap.failcnt);
+ else {
+ pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->memsw)),
+ K((u64)memcg->memsw.max), memcg->memsw.failcnt);
+ pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
+ K((u64)page_counter_read(&memcg->kmem)),
+ K((u64)memcg->kmem.max), memcg->kmem.failcnt);
}
+
+ pr_info("Memory cgroup stats for ");
+ pr_cont_cgroup_path(memcg->css.cgroup);
+ pr_cont(":");
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return;
+ pr_info("%s", buf);
+ kfree(buf);
}
/*
@@ -1370,6 +1567,11 @@
return max;
}
+/* Return the current reading of @memcg's memory page counter. */
+unsigned long mem_cgroup_size(struct mem_cgroup *memcg)
+{
+ return page_counter_read(&memcg->memory);
+}
+
static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
int order)
{
@@ -1382,8 +1584,13 @@
};
bool ret;
- mutex_lock(&oom_lock);
- ret = out_of_memory(&oc);
+ if (mutex_lock_killable(&oom_lock))
+ return true;
+ /*
+ * A few threads which were not waiting at mutex_lock_killable() can
+ * fail to bail out. Therefore, check again after holding oom_lock.
+ */
+ ret = should_force_charge() || out_of_memory(&oc);
mutex_unlock(&oom_lock);
return ret;
}
@@ -1403,11 +1610,15 @@
static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
int nid, bool noswap)
{
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+
+ if (lruvec_page_state(lruvec, NR_INACTIVE_FILE) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE))
return true;
if (noswap || !total_swap_pages)
return false;
- if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
+ if (lruvec_page_state(lruvec, NR_INACTIVE_ANON) ||
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON))
return true;
return false;
@@ -1666,9 +1877,14 @@
static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
{
+ enum oom_status ret;
+ bool locked;
+
if (order > PAGE_ALLOC_COSTLY_ORDER)
return OOM_SKIPPED;
+ memcg_memory_event(memcg, MEMCG_OOM);
+
/*
* We are in the middle of the charge context here, so we
* don't want to block when potentially sitting on a callstack
@@ -1698,10 +1914,23 @@
return OOM_ASYNC;
}
- if (mem_cgroup_out_of_memory(memcg, mask, order))
- return OOM_SUCCESS;
+ mem_cgroup_mark_under_oom(memcg);
- return OOM_FAILED;
+ locked = mem_cgroup_oom_trylock(memcg);
+
+ if (locked)
+ mem_cgroup_oom_notify(memcg);
+
+ mem_cgroup_unmark_under_oom(memcg);
+ if (mem_cgroup_out_of_memory(memcg, mask, order))
+ ret = OOM_SUCCESS;
+ else
+ ret = OOM_FAILED;
+
+ if (locked)
+ mem_cgroup_oom_unlock(memcg);
+
+ return ret;
}
/**
@@ -2040,21 +2269,22 @@
for_each_online_cpu(cpu) {
struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
struct mem_cgroup *memcg;
+ bool flush = false;
+ rcu_read_lock();
memcg = stock->cached;
- if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
- continue;
- if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
- css_put(&memcg->css);
- continue;
- }
- if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
+ if (memcg && stock->nr_pages &&
+ mem_cgroup_is_descendant(memcg, root_memcg))
+ flush = true;
+ rcu_read_unlock();
+
+ if (flush &&
+ !test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
if (cpu == curcpu)
drain_local_stock(&stock->work);
else
schedule_work_on(cpu, &stock->work);
}
- css_put(&memcg->css);
}
put_cpu();
mutex_unlock(&percpu_charge_mutex);
@@ -2063,7 +2293,7 @@
static int memcg_hotplug_cpu_dead(unsigned int cpu)
{
struct memcg_stock_pcp *stock;
- struct mem_cgroup *memcg;
+ struct mem_cgroup *memcg, *mi;
stock = &per_cpu(memcg_stock, cpu);
drain_stock(stock);
@@ -2075,9 +2305,10 @@
int nid;
long x;
- x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->stat[i], 0);
if (x)
- atomic_long_add(x, &memcg->stat[i]);
+ /* credit the drained delta to this memcg and to each ancestor */
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmstats[i]);
if (i >= NR_VM_NODE_STAT_ITEMS)
continue;
@@ -2088,16 +2319,19 @@
pn = mem_cgroup_nodeinfo(memcg, nid);
x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
if (x)
- atomic_long_add(x, &pn->lruvec_stat[i]);
+ do {
+ atomic_long_add(x, &pn->lruvec_stat[i]);
+ } while ((pn = parent_nodeinfo(pn, nid)));
}
}
for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
long x;
- x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
+ x = this_cpu_xchg(memcg->vmstats_percpu->events[i], 0);
if (x)
- atomic_long_add(x, &memcg->events[i]);
+ /* credit the drained events to this memcg and to each ancestor */
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ atomic_long_add(x, &mi->vmevents[i]);
}
}
@@ -2125,11 +2359,67 @@
}
/*
+ * Clamp the maximum sleep time per allocation batch to 2 seconds. This is
+ * enough to still cause a significant slowdown in most cases, while still
+ * allowing diagnostics and tracing to proceed without becoming stuck.
+ */
+#define MEMCG_MAX_HIGH_DELAY_JIFFIES (2UL*HZ)
+
+/*
+ * When calculating the delay, we use these on either side of the exponentiation
+ * to maintain precision and scale to a reasonable number of jiffies (see the
+ * table below).
+ *
+ * - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
+ * overage ratio to a delay.
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
+ * proposed penalty in order to reduce to a reasonable number of jiffies, and
+ * to produce a reasonable delay curve.
+ *
+ * MEMCG_DELAY_SCALING_SHIFT just happens to be a number that produces a
+ * reasonable delay curve compared to precision-adjusted overage, not
+ * penalising heavily at first, but still making sure that growth beyond the
+ * limit penalises misbehaving cgroups by slowing them down exponentially. For
+ * example, with a high of 100 megabytes:
+ *
+ * +-------+------------------------+
+ * | usage | time to allocate in ms |
+ * +-------+------------------------+
+ * | 100M | 0 |
+ * | 101M | 6 |
+ * | 102M | 25 |
+ * | 103M | 57 |
+ * | 104M | 102 |
+ * | 105M | 159 |
+ * | 106M | 230 |
+ * | 107M | 313 |
+ * | 108M | 409 |
+ * | 109M | 518 |
+ * | 110M | 639 |
+ * | 111M | 774 |
+ * | 112M | 921 |
+ * | 113M | 1081 |
+ * | 114M | 1254 |
+ * | 115M | 1439 |
+ * | 116M | 1638 |
+ * | 117M | 1849 |
+ * | 118M | 2000 |
+ * | 119M | 2000 |
+ * | 120M | 2000 |
+ * +-------+------------------------+
+ */
+ #define MEMCG_DELAY_PRECISION_SHIFT 20
+ #define MEMCG_DELAY_SCALING_SHIFT 14
+
+/*
* Scheduled by try_charge() to be executed from the userland return path
* and reclaims memory over the high limit.
*/
void mem_cgroup_handle_over_high(void)
{
+ unsigned long usage, high, clamped_high;
+ unsigned long pflags;
+ unsigned long penalty_jiffies, overage;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
struct mem_cgroup *memcg;
@@ -2138,8 +2428,75 @@
memcg = get_mem_cgroup_from_mm(current->mm);
reclaim_high(memcg, nr_pages, GFP_KERNEL);
- css_put(&memcg->css);
current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ *
+ * We use overage compared to memory.high to calculate the number of
+ * jiffies to sleep (penalty_jiffies). Ideally this value should be
+ * fairly lenient on small overages, and increasingly harsh when the
+ * memcg in question makes it clear that it has no intention of stopping
+ * its crazy behaviour, so we exponentially increase the delay based on
+ * overage amount.
+ */
+
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ goto out;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if it was a
+ * threshold of 1 page
+ */
+ clamped_high = max(high, 1UL);
+
+ overage = div64_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+ clamped_high);
+
+ penalty_jiffies = ((u64)overage * overage * HZ)
+ >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+
+ /*
+ * Factor in the task's own contribution to the overage, such that four
+ * N-sized allocations are throttled approximately the same as one
+ * 4N-sized allocation.
+ *
+ * MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
+ * larger the current charge batch is than that.
+ */
+ penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+
+ /*
+ * Don't sleep if the amount of jiffies this memcg owes us is so low
+ * that it's not even worth doing, in an attempt to be nice to those who
+ * go only a small amount over their memory.high value and maybe haven't
+ * been aggressively reclaimed enough yet.
+ */
+ if (penalty_jiffies <= HZ / 100)
+ goto out;
+
+ /*
+ * If we exit early, we're guaranteed to die (since
+ * schedule_timeout_killable sets TASK_KILLABLE). This means we don't
+ * need to account for any ill-begotten jiffies to pay them off later.
+ */
+ psi_memstall_enter(&pflags);
+ schedule_timeout_killable(penalty_jiffies);
+ psi_memstall_leave(&pflags);
+
+out:
+ css_put(&memcg->css);
}
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
@@ -2152,7 +2509,6 @@
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- bool oomed = false;
enum oom_status oom_status;
if (mem_cgroup_is_root(memcg))
@@ -2179,14 +2535,21 @@
}
/*
+ * Memcg doesn't have a dedicated reserve for atomic
+ * allocations. But like the global atomic pool, we need to
+ * put the burden of reclaim on regular allocation requests
+ * and let these go through as privileged allocations.
+ */
+ if (gfp_mask & __GFP_ATOMIC)
+ goto force;
+
+ /*
* Unlike in global OOM situations, memcg is not in a physical
* memory shortage. Allow dying and OOM-killed tasks to
* bypass the last charges so that they can exit quickly and
* free their memory.
*/
- if (unlikely(tsk_is_oom_victim(current) ||
- fatal_signal_pending(current) ||
- current->flags & PF_EXITING))
+ if (unlikely(should_force_charge()))
goto force;
/*
@@ -2241,7 +2604,7 @@
if (nr_retries--)
goto retry;
- if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
+ if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto nomem;
if (gfp_mask & __GFP_NOFAIL)
@@ -2250,8 +2613,6 @@
if (fatal_signal_pending(current))
goto force;
- memcg_memory_event(mem_over_limit, MEMCG_OOM);
-
/*
* keep retrying as long as the memcg oom killer is able to make
* a forward progress or bypass the charge if the oom killer
@@ -2262,7 +2623,6 @@
switch (oom_status) {
case OOM_SUCCESS:
nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- oomed = true;
goto retry;
case OOM_FAILED:
goto force;
@@ -2329,13 +2689,13 @@
static void lock_page_lru(struct page *page, int *isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
if (PageLRU(page)) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_lru(page));
*isolated = 1;
@@ -2345,17 +2705,17 @@
static void unlock_page_lru(struct page *page, int isolated)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
if (isolated) {
struct lruvec *lruvec;
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(PageLRU(page), page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
static void commit_charge(struct page *page, struct mem_cgroup *memcg,
@@ -2460,17 +2820,18 @@
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
struct kmem_cache *cachep)
{
struct memcg_kmem_cache_create_work *cw;
+ if (!css_tryget_online(&memcg->css))
+ return;
+
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
if (!cw)
return;
- css_get(&memcg->css);
-
cw->memcg = memcg;
cw->cachep = cachep;
INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
@@ -2478,25 +2839,6 @@
queue_work(memcg_kmem_cache_wq, &cw->work);
}
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- /*
- * We need to stop accounting when we kmalloc, because if the
- * corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_kmem_cache_create will recurse.
- *
- * However, it is better to enclose the whole function. Depending on
- * the debugging options enabled, INIT_WORK(), for instance, can
- * trigger an allocation. This too, will make us recurse. Because at
- * this point we can't allow ourselves back into memcg_kmem_get_cache,
- * the safest choice is to do it like this, wrapping the whole function.
- */
- current->memcg_kmem_skip_account = 1;
- __memcg_schedule_kmem_cache_create(memcg, cachep);
- current->memcg_kmem_skip_account = 0;
-}
-
static inline bool memcg_kmem_bypass(void)
{
if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
@@ -2524,6 +2866,7 @@
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ struct memcg_cache_array *arr;
int kmemcg_id;
VM_BUG_ON(!is_root_cache(cachep));
@@ -2531,17 +2874,28 @@
if (memcg_kmem_bypass())
return cachep;
- if (current->memcg_kmem_skip_account)
- return cachep;
+ rcu_read_lock();
- memcg = get_mem_cgroup_from_current();
+ if (unlikely(current->active_memcg))
+ memcg = current->active_memcg;
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ if (!memcg || memcg == root_mem_cgroup)
+ goto out_unlock;
+
kmemcg_id = READ_ONCE(memcg->kmemcg_id);
if (kmemcg_id < 0)
- goto out;
+ goto out_unlock;
- memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
- if (likely(memcg_cachep))
- return memcg_cachep;
+ arr = rcu_dereference(cachep->memcg_params.memcg_caches);
+
+ /*
+ * Make sure we will access the up-to-date value. The code updating
+ * memcg_caches issues a write barrier to match the data dependency
+ * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
+ */
+ memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
/*
* If we are in a safe context (can wait, and not in interrupt
@@ -2554,10 +2908,20 @@
* memcg_create_kmem_cache, this means no further allocation
* could happen with the slab_mutex held. So it's better to
* defer everything.
+ *
+ * If the memcg is dying or memcg_cache is about to be released,
+ * don't bother creating new kmem_caches. Because memcg_cachep
+ * is ZEROed as the first step of kmem offlining, we don't need
+ * percpu_ref_tryget_live() here. css_tryget_online() check in
+ * memcg_schedule_kmem_cache_create() will prevent us from
+ * creation of a new kmem_cache.
*/
- memcg_schedule_kmem_cache_create(memcg, cachep);
-out:
- css_put(&memcg->css);
+ if (unlikely(!memcg_cachep))
+ memcg_schedule_kmem_cache_create(memcg, cachep);
+ else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
+ cachep = memcg_cachep;
+out_unlock:
+ rcu_read_unlock();
return cachep;
}
@@ -2568,11 +2932,11 @@
void memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params.memcg->css);
+ percpu_ref_put(&cachep->memcg_params.refcnt);
}
/**
- * memcg_kmem_charge_memcg: charge a kmem page
+ * __memcg_kmem_charge_memcg: charge a kmem page
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
@@ -2580,7 +2944,7 @@
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
+int __memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
struct mem_cgroup *memcg)
{
unsigned int nr_pages = 1 << order;
@@ -2593,24 +2957,31 @@
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
!page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
+
+ /*
+ * Enforce __GFP_NOFAIL allocation because callers are not
+ * prepared to see failures and likely do not have any failure
+ * handling code.
+ */
+ if (gfp & __GFP_NOFAIL) {
+ page_counter_charge(&memcg->kmem, nr_pages);
+ return 0;
+ }
cancel_charge(memcg, nr_pages);
return -ENOMEM;
}
-
- page->mem_cgroup = memcg;
-
return 0;
}
/**
- * memcg_kmem_charge: charge a kmem page to the current memory cgroup
+ * __memcg_kmem_charge: charge a kmem page to the current memory cgroup
* @page: page to charge
* @gfp: reclaim mode
* @order: allocation order
*
* Returns 0 on success, an error code on failure.
*/
-int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
+int __memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
{
struct mem_cgroup *memcg;
int ret = 0;
@@ -2620,19 +2991,37 @@
memcg = get_mem_cgroup_from_current();
if (!mem_cgroup_is_root(memcg)) {
- ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
- if (!ret)
+ ret = __memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (!ret) {
+ page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ }
}
css_put(&memcg->css);
return ret;
}
+
/**
- * memcg_kmem_uncharge: uncharge a kmem page
+ * __memcg_kmem_uncharge_memcg: uncharge a kmem page
+ * @memcg: memcg to uncharge
+ * @nr_pages: number of pages to uncharge
+ */
+void __memcg_kmem_uncharge_memcg(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
+{
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ page_counter_uncharge(&memcg->kmem, nr_pages);
+
+ page_counter_uncharge(&memcg->memory, nr_pages);
+ if (do_memsw_account())
+ page_counter_uncharge(&memcg->memsw, nr_pages);
+}
+/**
+ * __memcg_kmem_uncharge: uncharge a kmem page
* @page: page to uncharge
* @order: allocation order
*/
-void memcg_kmem_uncharge(struct page *page, int order)
+void __memcg_kmem_uncharge(struct page *page, int order)
{
struct mem_cgroup *memcg = page->mem_cgroup;
unsigned int nr_pages = 1 << order;
@@ -2641,14 +3030,7 @@
return;
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
-
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
- page_counter_uncharge(&memcg->kmem, nr_pages);
-
- page_counter_uncharge(&memcg->memory, nr_pages);
- if (do_memsw_account())
- page_counter_uncharge(&memcg->memsw, nr_pages);
-
+ __memcg_kmem_uncharge_memcg(memcg, nr_pages);
page->mem_cgroup = NULL;
/* slab pages do not have PageKmemcg flag set */
@@ -2663,7 +3045,7 @@
/*
* Because tail pages are not marked as "used", set it. We're under
- * zone_lru_lock and migration entries setup in all page mappings.
+ * pgdat->lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
@@ -2953,50 +3335,15 @@
return retval;
}
-struct accumulated_stats {
- unsigned long stat[MEMCG_NR_STAT];
- unsigned long events[NR_VM_EVENT_ITEMS];
- unsigned long lru_pages[NR_LRU_LISTS];
- const unsigned int *stats_array;
- const unsigned int *events_array;
- int stats_size;
- int events_size;
-};
-
-static void accumulate_memcg_tree(struct mem_cgroup *memcg,
- struct accumulated_stats *acc)
-{
- struct mem_cgroup *mi;
- int i;
-
- for_each_mem_cgroup_tree(mi, memcg) {
- for (i = 0; i < acc->stats_size; i++)
- acc->stat[i] += memcg_page_state(mi,
- acc->stats_array ? acc->stats_array[i] : i);
-
- for (i = 0; i < acc->events_size; i++)
- acc->events[i] += memcg_sum_events(mi,
- acc->events_array ? acc->events_array[i] : i);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- acc->lru_pages[i] +=
- mem_cgroup_nr_lru_pages(mi, BIT(i));
- }
-}
-
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
{
- unsigned long val = 0;
+ unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- struct mem_cgroup *iter;
-
- for_each_mem_cgroup_tree(iter, memcg) {
- val += memcg_page_state(iter, MEMCG_CACHE);
- val += memcg_page_state(iter, MEMCG_RSS);
- if (swap)
- val += memcg_page_state(iter, MEMCG_SWAP);
- }
+ val = memcg_page_state(memcg, MEMCG_CACHE) +
+ memcg_page_state(memcg, MEMCG_RSS);
+ if (swap)
+ val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
if (!swap)
val = page_counter_read(&memcg->memory);
@@ -3057,6 +3404,72 @@
}
}
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+{
+ unsigned long stat[MEMCG_NR_STAT];
+ struct mem_cgroup *mi;
+ int node, cpu, i;
+ int min_idx, max_idx;
+
+ if (slab_only) {
+ min_idx = NR_SLAB_RECLAIMABLE;
+ max_idx = NR_SLAB_UNRECLAIMABLE;
+ } else {
+ min_idx = 0;
+ max_idx = MEMCG_NR_STAT;
+ }
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &mi->vmstats[i]);
+
+ if (!slab_only)
+ max_idx = NR_VM_NODE_STAT_ITEMS;
+
+ for_each_node(node) {
+ struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
+ struct mem_cgroup_per_node *pi;
+
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = min_idx; i < max_idx; i++)
+ stat[i] += per_cpu(
+ pn->lruvec_stat_cpu->count[i], cpu);
+
+ for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
+ for (i = min_idx; i < max_idx; i++)
+ atomic_long_add(stat[i], &pi->lruvec_stat[i]);
+ }
+}
+
+static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
+{
+ unsigned long events[NR_VM_EVENT_ITEMS];
+ struct mem_cgroup *mi;
+ int cpu, i;
+
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] = 0;
+
+ for_each_online_cpu(cpu)
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ events[i] += per_cpu(memcg->vmstats_percpu->events[i],
+ cpu);
+
+ for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
+ for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
+ atomic_long_add(events[i], &mi->vmevents[i]);
+}
+
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
@@ -3102,16 +3515,23 @@
*/
memcg->kmem_state = KMEM_ALLOCATED;
- memcg_deactivate_kmem_caches(memcg);
-
- kmemcg_id = memcg->kmemcg_id;
- BUG_ON(kmemcg_id < 0);
-
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
/*
+ * Deactivate and reparent kmem_caches. Then flush percpu
+ * slab statistics to have precise values at the parent and
+ * all ancestor levels. It's required to keep slab stats
+ * accurate after the reparenting of kmem_caches.
+ */
+ memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_flush_percpu_vmstats(memcg, true);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ /*
* Change kmemcg_id of this cgroup and all its descendants to the
* parent's id, and then move all entries from this cgroup's list_lrus
* to ones of the parent. After we have finished, all list_lrus
@@ -3141,9 +3561,8 @@
memcg_offline_kmem(memcg);
if (memcg->kmem_state == KMEM_ALLOCATED) {
- memcg_destroy_kmem_caches(memcg);
+ WARN_ON(!list_empty(&memcg->kmem_caches));
static_branch_dec(&memcg_kmem_enabled_key);
- WARN_ON(page_counter_read(&memcg->kmem));
}
}
#else
@@ -3235,6 +3654,9 @@
ret = mem_cgroup_resize_max(memcg, nr_pages, true);
break;
case _KMEM:
+ pr_warn_once("kmem.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_kmem_max(memcg, nr_pages);
break;
case _TCP:
@@ -3320,6 +3742,42 @@
#endif
#ifdef CONFIG_NUMA
+
+#define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE))
+#define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON))
+#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
+
+static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
+ int nid, unsigned int lru_mask)
+{
+ struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ VM_BUG_ON((unsigned)nid >= nr_node_ids);
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
+static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
+ unsigned int lru_mask)
+{
+ unsigned long nr = 0;
+ enum lru_list lru;
+
+ for_each_lru(lru) {
+ if (!(BIT(lru) & lru_mask))
+ continue;
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ }
+ return nr;
+}
+
static int memcg_numa_stat_show(struct seq_file *m, void *v)
{
struct numa_stat {
@@ -3336,7 +3794,7 @@
const struct numa_stat *stat;
int nid;
unsigned long nr;
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
@@ -3370,6 +3828,28 @@
}
#endif /* CONFIG_NUMA */
+static const unsigned int memcg1_stats[] = {
+ MEMCG_CACHE,
+ MEMCG_RSS,
+ MEMCG_RSS_HUGE,
+ NR_SHMEM,
+ NR_FILE_MAPPED,
+ NR_FILE_DIRTY,
+ NR_WRITEBACK,
+ MEMCG_SWAP,
+};
+
+static const char *const memcg1_stat_names[] = {
+ "cache",
+ "rss",
+ "rss_huge",
+ "shmem",
+ "mapped_file",
+ "dirty",
+ "writeback",
+ "swap",
+};
+
/* Universal VM events cgroup1 shows, original sort order */
static const unsigned int memcg1_events[] = {
PGPGIN,
@@ -3387,11 +3867,10 @@
static int memcg_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
unsigned long memory, memsw;
struct mem_cgroup *mi;
unsigned int i;
- struct accumulated_stats acc;
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3400,17 +3879,18 @@
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state(memcg, memcg1_stats[i]) *
+ memcg_page_state_local(memcg, memcg1_stats[i]) *
PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "%s %lu\n", memcg1_event_names[i],
- memcg_sum_events(memcg, memcg1_events[i]));
+ memcg_events_local(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
- mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
+ memcg_page_state_local(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
/* Hierarchical information */
memory = memsw = PAGE_COUNTER_MAX;
@@ -3424,27 +3904,22 @@
seq_printf(m, "hierarchical_memsw_limit %llu\n",
(u64)memsw * PAGE_SIZE);
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = ARRAY_SIZE(memcg1_stats);
- acc.stats_array = memcg1_stats;
- acc.events_size = ARRAY_SIZE(memcg1_events);
- acc.events_array = memcg1_events;
- accumulate_memcg_tree(memcg, &acc);
-
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
- (u64)acc.stat[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, memcg1_stats[i]) *
+ PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
- (u64)acc.events[i]);
+ (u64)memcg_events(memcg, memcg1_events[i]));
for (i = 0; i < NR_LRU_LISTS; i++)
seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_LRU_BASE + i) *
+ PAGE_SIZE);
#ifdef CONFIG_DEBUG_VM
{
@@ -3625,8 +4100,7 @@
size = thresholds->primary ? thresholds->primary->size + 1 : 1;
/* Allocate memory for new array of thresholds */
- new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
- GFP_KERNEL);
+ new = kmalloc(struct_size(new, entries, size), GFP_KERNEL);
if (!new) {
ret = -ENOMEM;
goto unlock;
@@ -3820,7 +4294,7 @@
static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(sf);
seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
@@ -3847,6 +4321,8 @@
#ifdef CONFIG_CGROUP_WRITEBACK
+#include <trace/events/writeback.h>
+
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
return wb_domain_init(&memcg->cgwb_domain, gfp);
@@ -3872,6 +4348,22 @@
return &memcg->cgwb_domain;
}
+/*
+ * idx can be of type enum memcg_stat_item or node_stat_item.
+ * Keep in sync with memcg_exact_page().
+ */
+static unsigned long memcg_exact_page_state(struct mem_cgroup *memcg, int idx)
+{
+ long x = atomic_long_read(&memcg->vmstats[idx]);
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ x += per_cpu_ptr(memcg->vmstats_percpu, cpu)->stat[idx];
+ if (x < 0)
+ x = 0;
+ return x;
+}
+
/**
* mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
* @wb: bdi_writeback in question
@@ -3897,12 +4389,12 @@
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
struct mem_cgroup *parent;
- *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+ *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
/* this should eventually include NR_UNSTABLE_NFS */
- *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
- *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
- (1 << LRU_ACTIVE_FILE));
+ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
+ *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
+ memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
*pheadroom = PAGE_COUNTER_MAX;
while ((parent = parent_mem_cgroup(memcg))) {
@@ -3914,6 +4406,130 @@
}
}
+/*
+ * Foreign dirty flushing
+ *
+ * There's an inherent mismatch between memcg and writeback. The former
+ * tracks ownership per-page while the latter per-inode. This was a
+ * deliberate design decision because honoring per-page ownership in the
+ * writeback path is complicated, may lead to higher CPU and IO overheads
+ * and deemed unnecessary given that write-sharing an inode across
+ * different cgroups isn't a common use-case.
+ *
+ * Combined with inode majority-writer ownership switching, this works well
+ * enough in most cases but there are some pathological cases. For
+ * example, let's say there are two cgroups A and B which keep writing to
+ * different but confined parts of the same inode. B owns the inode and
+ * A's memory is limited far below B's. A's dirty ratio can rise enough to
+ * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
+ * triggering background writeback. A will be slowed down without a way to
+ * make writeback of the dirty pages happen.
+ *
+ * Conditions like the above can lead to a cgroup getting repeatedly and
+ * severely throttled after making some progress after each
+ * dirty_expire_interval while the underlying IO device is almost
+ * completely idle.
+ *
+ * Solving this problem completely requires matching the ownership tracking
+ * granularities between memcg and writeback in either direction. However,
+ * the more egregious behaviors can be avoided by simply remembering the
+ * most recent foreign dirtying events and initiating remote flushes on
+ * them when local writeback isn't enough to keep the memory clean enough.
+ *
+ * The following two functions implement such mechanism. When a foreign
+ * page - a page whose memcg and writeback ownerships don't match - is
+ * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
+ * bdi_writeback on the page owning memcg. When balance_dirty_pages()
+ * decides that the memcg needs to sleep due to high dirty ratio, it calls
+ * mem_cgroup_flush_foreign() which queues writeback on the recorded
+ * foreign bdi_writebacks which haven't expired. Both the numbers of
+ * recorded bdi_writebacks and concurrent in-flight foreign writebacks are
+ * limited to MEMCG_CGWB_FRN_CNT.
+ *
+ * The mechanism only remembers IDs and doesn't hold any object references.
+ * As being wrong occasionally doesn't matter, updates and accesses to the
+ * records are lockless and racy.
+ */
+void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
+ struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = page->mem_cgroup;
+ struct memcg_cgwb_frn *frn;
+ u64 now = get_jiffies_64();
+ u64 oldest_at = now;
+ int oldest = -1;
+ int i;
+
+ trace_track_foreign_dirty(page, wb);
+
+ /*
+ * Pick the slot to use. If there is already a slot for @wb, keep
+ * using it. If not replace the oldest one which isn't being
+ * written out.
+ */
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ frn = &memcg->cgwb_frn[i];
+ if (frn->bdi_id == wb->bdi->id &&
+ frn->memcg_id == wb->memcg_css->id)
+ break;
+ if (time_before64(frn->at, oldest_at) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ oldest = i;
+ oldest_at = frn->at;
+ }
+ }
+
+ if (i < MEMCG_CGWB_FRN_CNT) {
+ /*
+ * Re-using an existing one. Update timestamp lazily to
+ * avoid making the cacheline hot. We want them to be
+ * reasonably up-to-date and significantly shorter than
+ * dirty_expire_interval as that's what expires the record.
+ * Use the shorter of 1s and dirty_expire_interval / 8.
+ */
+ unsigned long update_intv =
+ min_t(unsigned long, HZ,
+ msecs_to_jiffies(dirty_expire_interval * 10) / 8);
+
+ if (time_before64(frn->at, now - update_intv))
+ frn->at = now;
+ } else if (oldest >= 0) {
+ /* replace the oldest free one */
+ frn = &memcg->cgwb_frn[oldest];
+ frn->bdi_id = wb->bdi->id;
+ frn->memcg_id = wb->memcg_css->id;
+ frn->at = now;
+ }
+}
+
+/* issue foreign writeback flushes for recorded foreign dirtying events */
+void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+ unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
+ u64 now = jiffies_64;
+ int i;
+
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
+ struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
+
+ /*
+ * If the record is older than dirty_expire_interval,
+ * writeback on it has already started. No need to kick it
+ * off again. Also, don't start a new one if there's
+ * already one in flight.
+ */
+ if (time_after64(frn->at, now - intv) &&
+ atomic_read(&frn->done.cnt) == 1) {
+ frn->at = 0;
+ trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id);
+ cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
+ WB_REASON_FOREIGN_FLUSH,
+ &frn->done);
+ }
+ }
+}
+
#else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
@@ -4321,14 +4937,12 @@
static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
- atomic_add(n, &memcg->id.ref);
+ refcount_add(n, &memcg->id.ref);
}
static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
{
- VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
- if (atomic_sub_and_test(n, &memcg->id.ref)) {
+ if (refcount_sub_and_test(n, &memcg->id.ref)) {
mem_cgroup_id_remove(memcg);
/* Memcg ID pins CSS */
@@ -4336,11 +4950,6 @@
}
}
-static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
-{
- mem_cgroup_id_get_many(memcg, 1);
-}
-
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
mem_cgroup_id_put_many(memcg, 1);
@@ -4376,8 +4985,15 @@
if (!pn)
return 1;
+ pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+ if (!pn->lruvec_stat_local) {
+ kfree(pn);
+ return 1;
+ }
+
pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
if (!pn->lruvec_stat_cpu) {
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
return 1;
}
@@ -4399,6 +5015,7 @@
return;
free_percpu(pn->lruvec_stat_cpu);
+ free_percpu(pn->lruvec_stat_local);
kfree(pn);
}
@@ -4408,21 +5025,29 @@
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
- free_percpu(memcg->stat_cpu);
+ free_percpu(memcg->vmstats_percpu);
+ free_percpu(memcg->vmstats_local);
kfree(memcg);
}
static void mem_cgroup_free(struct mem_cgroup *memcg)
{
memcg_wb_domain_exit(memcg);
+ /*
+ * Flush percpu vmstats and vmevents to guarantee the value correctness
+ * on parent's and all ancestor levels.
+ */
+ memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
{
struct mem_cgroup *memcg;
- size_t size;
+ unsigned int size;
int node;
+ int __maybe_unused i;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
@@ -4437,8 +5062,12 @@
if (memcg->id.id < 0)
goto fail;
- memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
- if (!memcg->stat_cpu)
+ memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+ if (!memcg->vmstats_local)
+ goto fail;
+
+ memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ if (!memcg->vmstats_percpu)
goto fail;
for_each_node(node)
@@ -4462,6 +5091,14 @@
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ memcg->cgwb_frn[i].done =
+ __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
+#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&memcg->deferred_split_queue.split_queue_lock);
+ INIT_LIST_HEAD(&memcg->deferred_split_queue.split_queue);
+ memcg->deferred_split_queue.split_queue_len = 0;
#endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg;
@@ -4512,6 +5149,9 @@
/* The following stuff does not apply to the root */
if (!parent) {
+#ifdef CONFIG_MEMCG_KMEM
+ INIT_LIST_HEAD(&memcg->kmem_caches);
+#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -4545,7 +5185,7 @@
}
/* Online state pins memcg ID, memcg ID pins CSS */
- atomic_set(&memcg->id.ref, 1);
+ refcount_set(&memcg->id.ref, 1);
css_get(css);
return 0;
}
@@ -4573,6 +5213,8 @@
memcg_offline_kmem(memcg);
wb_memcg_offline(memcg);
+ drain_all_stock(memcg);
+
mem_cgroup_id_put(memcg);
}
@@ -4586,7 +5228,12 @@
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ int __maybe_unused i;
+#ifdef CONFIG_CGROUP_WRITEBACK
+ for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
+ wb_wait_for_completion(&memcg->cgwb_frn[i].done);
+#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key);
@@ -4669,7 +5316,7 @@
static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
- struct page *page = _vm_normal_page(vma, addr, ptent, true);
+ struct page *page = vm_normal_page(vma, addr, ptent);
if (!page || !page_mapped(page))
return NULL;
@@ -4750,7 +5397,7 @@
/* shmem/tmpfs may report page out on swap: account for that too. */
if (shmem_mapping(mapping)) {
page = find_get_entry(mapping, pgoff);
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
if (do_memsw_account())
*entry = swp;
@@ -4782,6 +5429,8 @@
struct mem_cgroup *from,
struct mem_cgroup *to)
{
+ struct lruvec *from_vec, *to_vec;
+ struct pglist_data *pgdat;
unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
@@ -4805,11 +5454,15 @@
anon = PageAnon(page);
+ pgdat = page_pgdat(page);
+ from_vec = mem_cgroup_lruvec(pgdat, from);
+ to_vec = mem_cgroup_lruvec(pgdat, to);
+
spin_lock_irqsave(&from->move_lock, flags);
if (!anon && page_mapped(page)) {
- __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
- __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
/*
@@ -4821,16 +5474,24 @@
struct address_space *mapping = page_mapping(page);
if (mapping_cap_account_dirty(mapping)) {
- __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
- __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
}
}
if (PageWriteback(page)) {
- __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
- __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
+ __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && !list_empty(page_deferred_list(page))) {
+ spin_lock(&from->deferred_split_queue.split_queue_lock);
+ list_del_init(page_deferred_list(page));
+ from->deferred_split_queue.split_queue_len--;
+ spin_unlock(&from->deferred_split_queue.split_queue_lock);
+ }
+#endif
/*
* It is safe to change page->mem_cgroup here because the page
* is referenced, charged, and isolated - we can't race with
@@ -4839,6 +5500,17 @@
/* caller should have done css_get */
page->mem_cgroup = to;
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (compound && list_empty(page_deferred_list(page))) {
+ spin_lock(&to->deferred_split_queue.split_queue_lock);
+ list_add_tail(page_deferred_list(page),
+ &to->deferred_split_queue.split_queue);
+ to->deferred_split_queue.split_queue_len++;
+ spin_unlock(&to->deferred_split_queue.split_queue_lock);
+ }
+#endif
+
spin_unlock_irqrestore(&from->move_lock, flags);
ret = 0;
@@ -4870,8 +5542,8 @@
* 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
* target for charge migration. if @target is not NULL, the entry is stored
* in target->ent.
- * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PUBLIC
- * or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
+ * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
+ * (so ZONE_DEVICE page and thus not on the lru).
* For now we such page is charge like a regular page would be as for all
* intent and purposes it is just special memory taking the place of a
* regular page.
@@ -4905,8 +5577,7 @@
*/
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
- if (is_device_private_page(page) ||
- is_device_public_page(page))
+ if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
if (target)
target->page = page;
@@ -4977,8 +5648,8 @@
if (ptl) {
/*
* Note their can not be MC_TARGET_DEVICE for now as we do not
- * support transparent huge page with MEMORY_DEVICE_PUBLIC or
- * MEMORY_DEVICE_PRIVATE but this might change.
+ * support transparent huge page with MEMORY_DEVICE_PRIVATE but
+ * this might change.
*/
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
@@ -4998,17 +5669,16 @@
return 0;
}
+static const struct mm_walk_ops precharge_walk_ops = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+};
+
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- };
down_read(&mm->mmap_sem);
- walk_page_range(0, mm->highest_vm_end,
- &mem_cgroup_count_precharge_walk);
+ walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -5277,13 +5947,12 @@
return ret;
}
+static const struct mm_walk_ops charge_walk_ops = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+};
+
static void mem_cgroup_move_charge(void)
{
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mc.mm,
- };
-
lru_add_drain_all();
/*
* Signal lock_page_memcg() to take the memcg's move_lock
@@ -5309,7 +5978,8 @@
* When we have consumed all precharges and failed in doing
* additional charge, the page walk just aborts.
*/
- walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
+ walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
+ NULL);
up_read(&mc.mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
@@ -5353,6 +6023,16 @@
root_mem_cgroup->use_hierarchy = false;
}
+static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
+{
+ if (value == PAGE_COUNTER_MAX)
+ seq_puts(m, "max\n");
+ else
+ seq_printf(m, "%llu\n", (u64)value * PAGE_SIZE);
+
+ return 0;
+}
+
static u64 memory_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
@@ -5363,15 +6043,8 @@
static int memory_min_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long min = READ_ONCE(memcg->memory.min);
-
- if (min == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.min));
}
static ssize_t memory_min_write(struct kernfs_open_file *of,
@@ -5393,15 +6066,8 @@
static int memory_low_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long low = READ_ONCE(memcg->memory.low);
-
- if (low == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.low));
}
static ssize_t memory_low_write(struct kernfs_open_file *of,
@@ -5423,15 +6089,7 @@
static int memory_high_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long high = READ_ONCE(memcg->high);
-
- if (high == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
@@ -5460,15 +6118,8 @@
static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->memory.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.max));
}
static ssize_t memory_max_write(struct kernfs_open_file *of,
@@ -5520,104 +6171,48 @@
return nbytes;
}
+static void __memory_events_show(struct seq_file *m, atomic_long_t *events)
+{
+ seq_printf(m, "low %lu\n", atomic_long_read(&events[MEMCG_LOW]));
+ seq_printf(m, "high %lu\n", atomic_long_read(&events[MEMCG_HIGH]));
+ seq_printf(m, "max %lu\n", atomic_long_read(&events[MEMCG_MAX]));
+ seq_printf(m, "oom %lu\n", atomic_long_read(&events[MEMCG_OOM]));
+ seq_printf(m, "oom_kill %lu\n",
+ atomic_long_read(&events[MEMCG_OOM_KILL]));
+}
+
static int memory_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
- seq_printf(m, "low %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
- seq_printf(m, "high %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
- seq_printf(m, "max %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
- seq_printf(m, "oom %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
- seq_printf(m, "oom_kill %lu\n",
- atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
+ __memory_events_show(m, memcg->memory_events);
+ return 0;
+}
+static int memory_events_local_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+
+ __memory_events_show(m, memcg->memory_events_local);
return 0;
}
static int memory_stat_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- struct accumulated_stats acc;
- int i;
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ char *buf;
- /*
- * Provide statistics on the state of the memory subsystem as
- * well as cumulative event counters that show past behavior.
- *
- * This list is ordered following a combination of these gradients:
- * 1) generic big picture -> specifics and details
- * 2) reflecting userspace activity -> reflecting kernel heuristics
- *
- * Current memory state:
- */
-
- memset(&acc, 0, sizeof(acc));
- acc.stats_size = MEMCG_NR_STAT;
- acc.events_size = NR_VM_EVENT_ITEMS;
- accumulate_memcg_tree(memcg, &acc);
-
- seq_printf(m, "anon %llu\n",
- (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
- seq_printf(m, "file %llu\n",
- (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
- seq_printf(m, "kernel_stack %llu\n",
- (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
- seq_printf(m, "slab %llu\n",
- (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
- acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
- seq_printf(m, "sock %llu\n",
- (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
-
- seq_printf(m, "shmem %llu\n",
- (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
- seq_printf(m, "file_mapped %llu\n",
- (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
- seq_printf(m, "file_dirty %llu\n",
- (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
- seq_printf(m, "file_writeback %llu\n",
- (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
-
- for (i = 0; i < NR_LRU_LISTS; i++)
- seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
- (u64)acc.lru_pages[i] * PAGE_SIZE);
-
- seq_printf(m, "slab_reclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
- seq_printf(m, "slab_unreclaimable %llu\n",
- (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
-
- /* Accumulated memory events */
-
- seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
- seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
-
- seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
- seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
- acc.events[PGSCAN_DIRECT]);
- seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
- acc.events[PGSTEAL_DIRECT]);
- seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
- seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
- seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
- seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
-
- seq_printf(m, "workingset_refault %lu\n",
- acc.stat[WORKINGSET_REFAULT]);
- seq_printf(m, "workingset_activate %lu\n",
- acc.stat[WORKINGSET_ACTIVATE]);
- seq_printf(m, "workingset_nodereclaim %lu\n",
- acc.stat[WORKINGSET_NODERECLAIM]);
-
+ buf = memory_stat_format(memcg);
+ if (!buf)
+ return -ENOMEM;
+ seq_puts(m, buf);
+ kfree(buf);
return 0;
}
static int memory_oom_group_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "%d\n", memcg->oom_group);
@@ -5683,6 +6278,12 @@
.seq_show = memory_events_show,
},
{
+ .name = "events.local",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .file_offset = offsetof(struct mem_cgroup, events_local_file),
+ .seq_show = memory_events_local_show,
+ },
+ {
.name = "stat",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
@@ -5746,7 +6347,7 @@
*
* | memory.current, if memory.current < memory.low
* low_usage = |
- | 0, otherwise.
+ * | 0, otherwise.
*
*
* Such definition of the effective memory.low provides the expected
@@ -6045,7 +6646,7 @@
__mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
__mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
@@ -6080,7 +6681,7 @@
unsigned int nr_pages = 1;
if (PageTransHuge(page)) {
- nr_pages <<= compound_order(page);
+ nr_pages = compound_nr(page);
ug->nr_huge += nr_pages;
}
if (PageAnon(page))
@@ -6092,7 +6693,7 @@
}
ug->pgpgout++;
} else {
- ug->nr_kmem += 1 << compound_order(page);
+ ug->nr_kmem += compound_nr(page);
__ClearPageKmemcg(page);
}
@@ -6377,7 +6978,7 @@
#ifdef CONFIG_MEMCG_SWAP
static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
{
- while (!atomic_inc_not_zero(&memcg->id.ref)) {
+ while (!refcount_inc_not_zero(&memcg->id.ref)) {
/*
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
@@ -6600,15 +7201,8 @@
static int swap_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- unsigned long max = READ_ONCE(memcg->swap.max);
-
- if (max == PAGE_COUNTER_MAX)
- seq_puts(m, "max\n");
- else
- seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
-
- return 0;
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.max));
}
static ssize_t swap_max_write(struct kernfs_open_file *of,
@@ -6630,7 +7224,7 @@
static int swap_events_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
diff --git a/mm/memfd.c b/mm/memfd.c
index 2bb5e25..2647c89 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -21,44 +21,37 @@
#include <uapi/linux/memfd.h>
/*
- * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
+ * We need a tag: a new tag would expand every xa_node by 8 bytes,
* so reuse a tag which we firmly believe is never set or cleared on tmpfs
* or hugetlbfs because they are memory only filesystems.
*/
#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE
#define LAST_SCAN 4 /* about 150ms max */
-static void memfd_tag_pins(struct address_space *mapping)
+static void memfd_tag_pins(struct xa_state *xas)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- pgoff_t start;
struct page *page;
+ unsigned int tagged = 0;
lru_add_drain();
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- page = radix_tree_deref_slot(slot);
- if (!page || radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
- } else if (page_count(page) - page_mapcount(page) > 1) {
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_set(&mapping->i_pages, iter.index,
- MEMFD_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
- }
+ xas_lock_irq(xas);
+ xas_for_each(xas, page, ULONG_MAX) {
+ if (xa_is_value(page))
+ continue;
+ page = find_subpage(page, xas->xa_index);
+ if (page_count(page) - page_mapcount(page) > 1)
+ xas_set_mark(xas, MEMFD_TAG_PINNED);
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
+
+ xas_pause(xas);
+ xas_unlock_irq(xas);
+ cond_resched();
+ xas_lock_irq(xas);
}
- rcu_read_unlock();
+ xas_unlock_irq(xas);
}
/*
@@ -72,17 +65,17 @@
*/
static int memfd_wait_for_pins(struct address_space *mapping)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- pgoff_t start;
+ XA_STATE(xas, &mapping->i_pages, 0);
struct page *page;
int error, scan;
- memfd_tag_pins(mapping);
+ memfd_tag_pins(&xas);
error = 0;
for (scan = 0; scan <= LAST_SCAN; scan++) {
- if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED))
+ unsigned int tagged = 0;
+
+ if (!xas_marked(&xas, MEMFD_TAG_PINNED))
break;
if (!scan)
@@ -90,45 +83,35 @@
else if (schedule_timeout_killable((HZ << scan) / 200))
scan = LAST_SCAN;
- start = 0;
- rcu_read_lock();
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter,
- start, MEMFD_TAG_PINNED) {
-
- page = radix_tree_deref_slot(slot);
- if (radix_tree_exception(page)) {
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
- continue;
- }
-
- page = NULL;
- }
-
- if (page &&
- page_count(page) - page_mapcount(page) != 1) {
- if (scan < LAST_SCAN)
- goto continue_resched;
-
+ xas_set(&xas, 0);
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, ULONG_MAX, MEMFD_TAG_PINNED) {
+ bool clear = true;
+ if (xa_is_value(page))
+ continue;
+ page = find_subpage(page, xas.xa_index);
+ if (page_count(page) - page_mapcount(page) != 1) {
/*
* On the last scan, we clean up all those tags
* we inserted; but make a note that we still
* found pages pinned.
*/
- error = -EBUSY;
+ if (scan == LAST_SCAN)
+ error = -EBUSY;
+ else
+ clear = false;
}
+ if (clear)
+ xas_clear_mark(&xas, MEMFD_TAG_PINNED);
+ if (++tagged % XA_CHECK_SCHED)
+ continue;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_tag_clear(&mapping->i_pages,
- iter.index, MEMFD_TAG_PINNED);
- xa_unlock_irq(&mapping->i_pages);
-continue_resched:
- if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
- }
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
+ cond_resched();
+ xas_lock_irq(&xas);
}
- rcu_read_unlock();
+ xas_unlock_irq(&xas);
}
return error;
@@ -150,7 +133,8 @@
#define F_ALL_SEALS (F_SEAL_SEAL | \
F_SEAL_SHRINK | \
F_SEAL_GROW | \
- F_SEAL_WRITE)
+ F_SEAL_WRITE | \
+ F_SEAL_FUTURE_WRITE)
static int memfd_add_seals(struct file *file, unsigned int seals)
{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 0cd3de3..3151c87 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1,11 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2008, 2009 Intel Corporation
* Authors: Andi Kleen, Fengguang Wu
*
- * This software may be redistributed and/or modified under the terms of
- * the GNU General Public License ("GPL") version 2 only as published by the
- * Free Software Foundation.
- *
* High level machine check handler. Handles pages reported by the
* hardware as being corrupted usually due to a multi-bit ECC memory or cache
* failure.
@@ -202,7 +199,6 @@
struct task_struct *tsk;
unsigned long addr;
short size_shift;
- char addr_valid;
};
/*
@@ -216,12 +212,12 @@
short addr_lsb = tk->size_shift;
int ret;
- pr_err("Memory failure: %#lx: Killing %s:%d due to hardware memory corruption\n",
+ pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
pfn, t->comm, t->pid);
if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
- addr_lsb, current);
+ addr_lsb);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -327,22 +323,27 @@
}
}
tk->addr = page_address_in_vma(p, vma);
- tk->addr_valid = 1;
if (is_zone_device_page(p))
tk->size_shift = dev_pagemap_mapping_shift(p, vma);
else
tk->size_shift = compound_order(compound_head(p)) + PAGE_SHIFT;
/*
- * In theory we don't have to kill when the page was
- * munmaped. But it could be also a mremap. Since that's
- * likely very rare kill anyways just out of paranoia, but use
- * a SIGKILL because the error is not contained anymore.
+ * Send SIGKILL if "tk->addr == -EFAULT". Also, as
+ * "tk->size_shift" is always non-zero for !is_zone_device_page(),
+ * so "tk->size_shift == 0" effectively checks no mapping on
+ * ZONE_DEVICE. Indeed, when a devdax page is mmapped N times
+ * to a process' address space, it's possible not all N VMAs
+ * contain mappings for the page, but at least one VMA does.
+ * Only deliver SIGBUS with payload derived from the VMA that
+ * has a mapping for the page.
*/
- if (tk->addr == -EFAULT || tk->size_shift == 0) {
+ if (tk->addr == -EFAULT) {
pr_info("Memory failure: Unable to find user space address %lx in %s\n",
page_to_pfn(p), tsk->comm);
- tk->addr_valid = 0;
+ } else if (tk->size_shift == 0) {
+ kfree(tk);
+ return;
}
get_task_struct(tsk);
tk->tsk = tsk;
@@ -369,10 +370,11 @@
* make sure the process doesn't catch the
* signal and then access the memory. Just kill it.
*/
- if (fail || tk->addr_valid == 0) {
+ if (fail || tk->addr == -EFAULT) {
pr_err("Memory failure: %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n",
pfn, tk->tsk->comm, tk->tsk->pid);
- force_sig(SIGKILL, tk->tsk);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV,
+ tk->tsk, PIDTYPE_PID);
}
/*
@@ -1161,6 +1163,7 @@
LIST_HEAD(tokill);
int rc = -EBUSY;
loff_t start;
+ dax_entry_t cookie;
/*
* Prevent the inode from being freed while we are interrogating
@@ -1169,7 +1172,8 @@
* also prevents changes to the mapping of this pfn until
* poison signaling is complete.
*/
- if (!dax_lock_mapping_entry(page))
+ cookie = dax_lock_page(page);
+ if (!cookie)
goto out;
if (hwpoison_filter(page)) {
@@ -1177,16 +1181,12 @@
goto unlock;
}
- switch (pgmap->type) {
- case MEMORY_DEVICE_PRIVATE:
- case MEMORY_DEVICE_PUBLIC:
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
/*
* TODO: Handle HMM pages which may need coordination
* with device-side memory.
*/
goto unlock;
- default:
- break;
}
/*
@@ -1220,7 +1220,7 @@
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
unlock:
- dax_unlock_mapping_entry(page);
+ dax_unlock_page(page, cookie);
out:
/* drop pgmap ref acquired in caller */
put_dev_pagemap(pgmap);
@@ -1257,17 +1257,19 @@
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
- if (!pfn_valid(pfn)) {
+ p = pfn_to_online_page(pfn);
+ if (!p) {
+ if (pfn_valid(pfn)) {
+ pgmap = get_dev_pagemap(pfn, NULL);
+ if (pgmap)
+ return memory_failure_dev_pagemap(pfn, flags,
+ pgmap);
+ }
pr_err("Memory failure: %#lx: memory outside kernel control\n",
pfn);
return -ENXIO;
}
- pgmap = get_dev_pagemap(pfn, NULL);
- if (pgmap)
- return memory_failure_dev_pagemap(pfn, flags, pgmap);
-
- p = pfn_to_page(pfn);
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
@@ -1730,6 +1732,8 @@
if (!ret) {
if (set_hwpoison_free_buddy_page(page))
num_poisoned_pages_inc();
+ else
+ ret = -EBUSY;
}
}
return ret;
@@ -1822,19 +1826,17 @@
struct page *hpage = compound_head(page);
if (!PageHuge(page) && PageTransHuge(hpage)) {
- lock_page(hpage);
- if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
- unlock_page(hpage);
- if (!PageAnon(hpage))
+ lock_page(page);
+ if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+ unlock_page(page);
+ if (!PageAnon(page))
pr_info("soft offline: %#lx: non anonymous thp\n", page_to_pfn(page));
else
pr_info("soft offline: %#lx: thp split failed\n", page_to_pfn(page));
- put_hwpoison_page(hpage);
+ put_hwpoison_page(page);
return -EBUSY;
}
- unlock_page(hpage);
- get_hwpoison_page(page);
- put_hwpoison_page(hpage);
+ unlock_page(page);
}
/*
@@ -1856,11 +1858,8 @@
static int soft_offline_free_page(struct page *page)
{
- int rc = 0;
- struct page *head = compound_head(page);
+ int rc = dissolve_free_huge_page(page);
- if (PageHuge(head))
- rc = dissolve_free_huge_page(page);
if (!rc) {
if (set_hwpoison_free_buddy_page(page))
num_poisoned_pages_inc();
diff --git a/mm/memory.c b/mm/memory.c
index 5c5df53..b1ca51a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory.c
*
@@ -69,6 +70,7 @@
#include <linux/userfaultfd_k.h>
#include <linux/dax.h>
#include <linux/oom.h>
+#include <linux/numa.h>
#include <asm/io.h>
#include <asm/mmu_context.h>
@@ -186,253 +188,6 @@
#endif /* SPLIT_RSS_COUNTING */
-#ifdef HAVE_GENERIC_MMU_GATHER
-
-static bool tlb_next_batch(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
- batch = tlb->active;
- if (batch->next) {
- tlb->active = batch->next;
- return true;
- }
-
- if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
- return false;
-
- batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
- if (!batch)
- return false;
-
- tlb->batch_count++;
- batch->next = NULL;
- batch->nr = 0;
- batch->max = MAX_GATHER_BATCH;
-
- tlb->active->next = batch;
- tlb->active = batch;
-
- return true;
-}
-
-void arch_tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- tlb->mm = mm;
-
- /* Is it from 0 to ~0? */
- tlb->fullmm = !(start | (end+1));
- tlb->need_flush_all = 0;
- tlb->local.next = NULL;
- tlb->local.nr = 0;
- tlb->local.max = ARRAY_SIZE(tlb->__pages);
- tlb->active = &tlb->local;
- tlb->batch_count = 0;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb->batch = NULL;
-#endif
- tlb->page_size = 0;
-
- __tlb_reset_range(tlb);
-}
-
-static void tlb_flush_mmu_free(struct mmu_gather *tlb)
-{
- struct mmu_gather_batch *batch;
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
- tlb_table_flush(tlb);
-#endif
- for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
- free_pages_and_swap_cache(batch->pages, batch->nr);
- batch->nr = 0;
- }
- tlb->active = &tlb->local;
-}
-
-void tlb_flush_mmu(struct mmu_gather *tlb)
-{
- tlb_flush_mmu_tlbonly(tlb);
- tlb_flush_mmu_free(tlb);
-}
-
-/* tlb_finish_mmu
- * Called at the end of the shootdown operation to free up any resources
- * that were required.
- */
-void arch_tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end, bool force)
-{
- struct mmu_gather_batch *batch, *next;
-
- if (force)
- __tlb_adjust_range(tlb, start, end - start);
-
- tlb_flush_mmu(tlb);
-
- /* keep the page table cache within bounds */
- check_pgt_cache();
-
- for (batch = tlb->local.next; batch; batch = next) {
- next = batch->next;
- free_pages((unsigned long)batch, 0);
- }
- tlb->local.next = NULL;
-}
-
-/* __tlb_remove_page
- * Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
- * handling the additional races in SMP caused by other CPUs caching valid
- * mappings in their TLBs. Returns the number of free page slots left.
- * When out of page slots we must call tlb_flush_mmu().
- *returns true if the caller should flush.
- */
-bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
-{
- struct mmu_gather_batch *batch;
-
- VM_BUG_ON(!tlb->end);
- VM_WARN_ON(tlb->page_size != page_size);
-
- batch = tlb->active;
- /*
- * Add the page and check if we are full. If so
- * force a flush.
- */
- batch->pages[batch->nr++] = page;
- if (batch->nr == batch->max) {
- if (!tlb_next_batch(tlb))
- return true;
- batch = tlb->active;
- }
- VM_BUG_ON_PAGE(batch->nr > batch->max, page);
-
- return false;
-}
-
-#endif /* HAVE_GENERIC_MMU_GATHER */
-
-#ifdef CONFIG_HAVE_RCU_TABLE_FREE
-
-/*
- * See the comment near struct mmu_table_batch.
- */
-
-/*
- * If we want tlb_remove_table() to imply TLB invalidates.
- */
-static inline void tlb_table_invalidate(struct mmu_gather *tlb)
-{
-#ifdef CONFIG_HAVE_RCU_TABLE_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
-}
-
-static void tlb_remove_table_smp_sync(void *arg)
-{
- /* Simply deliver the interrupt */
-}
-
-static void tlb_remove_table_one(void *table)
-{
- /*
- * This isn't an RCU grace period and hence the page-tables cannot be
- * assumed to be actually RCU-freed.
- *
- * It is however sufficient for software page-table walkers that rely on
- * IRQ disabling. See the comment near struct mmu_table_batch.
- */
- smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
- __tlb_remove_table(table);
-}
-
-static void tlb_remove_table_rcu(struct rcu_head *head)
-{
- struct mmu_table_batch *batch;
- int i;
-
- batch = container_of(head, struct mmu_table_batch, rcu);
-
- for (i = 0; i < batch->nr; i++)
- __tlb_remove_table(batch->tables[i]);
-
- free_page((unsigned long)batch);
-}
-
-void tlb_table_flush(struct mmu_gather *tlb)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- if (*batch) {
- tlb_table_invalidate(tlb);
- call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
- *batch = NULL;
- }
-}
-
-void tlb_remove_table(struct mmu_gather *tlb, void *table)
-{
- struct mmu_table_batch **batch = &tlb->batch;
-
- if (*batch == NULL) {
- *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
- if (*batch == NULL) {
- tlb_table_invalidate(tlb);
- tlb_remove_table_one(table);
- return;
- }
- (*batch)->nr = 0;
- }
-
- (*batch)->tables[(*batch)->nr++] = table;
- if ((*batch)->nr == MAX_TABLE_BATCH)
- tlb_table_flush(tlb);
-}
-
-#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
-
-/**
- * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
- * @tlb: the mmu_gather structure to initialize
- * @mm: the mm_struct of the target address space
- * @start: start of the region that will be removed from the page-table
- * @end: end of the region that will be removed from the page-table
- *
- * Called to initialize an (on-stack) mmu_gather structure for page-table
- * tear-down from @mm. The @start and @end are set to 0 and -1
- * respectively when @mm is without users and we're going to destroy
- * the full address space (exit/execve).
- */
-void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
- unsigned long start, unsigned long end)
-{
- arch_tlb_gather_mmu(tlb, mm, start, end);
- inc_tlb_flush_pending(tlb->mm);
-}
-
-void tlb_finish_mmu(struct mmu_gather *tlb,
- unsigned long start, unsigned long end)
-{
- /*
- * If there are parallel threads are doing PTE changes on same range
- * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
- * flush by batching, a thread has stable TLB entry can fail to flush
- * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
- * forcefully if we detect parallel PTE batching threads.
- */
- bool force = mm_tlb_flush_nested(tlb->mm);
-
- arch_tlb_finish_mmu(tlb, start, end, force);
- dec_tlb_flush_pending(tlb->mm);
-}
-
/*
* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
@@ -602,7 +357,7 @@
* We add page table cache pages with PAGE_SIZE,
* (see pte_free_tlb()), flush the tlb if we need
*/
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
pgd = pgd_offset(tlb->mm, addr);
do {
next = pgd_addr_end(addr, end);
@@ -647,10 +402,10 @@
}
}
-int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
spinlock_t *ptl;
- pgtable_t new = pte_alloc_one(mm, address);
+ pgtable_t new = pte_alloc_one(mm);
if (!new)
return -ENOMEM;
@@ -681,9 +436,9 @@
return 0;
}
-int __pte_alloc_kernel(pmd_t *pmd, unsigned long address)
+int __pte_alloc_kernel(pmd_t *pmd)
{
- pte_t *new = pte_alloc_one_kernel(&init_mm, address);
+ pte_t *new = pte_alloc_one_kernel(&init_mm);
if (!new)
return -ENOMEM;
@@ -763,9 +518,9 @@
(long long)pte_val(pte), (long long)pmd_val(*pmd));
if (page)
dump_page(page, "bad pte");
- pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+ pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%pf mmap:%pf readpage:%pf\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
@@ -816,8 +571,8 @@
* PFNMAP mappings in order to support COWable mappings.
*
*/
-struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
- pte_t pte, bool with_public_device)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
+ pte_t pte)
{
unsigned long pfn = pte_pfn(pte);
@@ -830,29 +585,6 @@
return NULL;
if (is_zero_pfn(pfn))
return NULL;
-
- /*
- * Device public pages are special pages (they are ZONE_DEVICE
- * pages but different from persistent memory). They behave
- * allmost like normal pages. The difference is that they are
- * not on the lru and thus should never be involve with any-
- * thing that involve lru manipulation (mlock, numa balancing,
- * ...).
- *
- * This is why we still want to return NULL for such page from
- * vm_normal_page() so that we do not have to special case all
- * call site of vm_normal_page().
- */
- if (likely(pfn <= highest_memmap_pfn)) {
- struct page *page = pfn_to_page(pfn);
-
- if (is_device_public_page(page)) {
- if (with_public_device)
- return page;
- return NULL;
- }
- }
-
if (pte_devmap(pte))
return NULL;
@@ -1042,17 +774,6 @@
rss[mm_counter(page)]++;
} else if (pte_devmap(pte)) {
page = pte_page(pte);
-
- /*
- * Cache coherent device memory behave like regular page and
- * not like persistent memory page. For more informations see
- * MEMORY_DEVICE_CACHE_COHERENT in memory_hotplug.h
- */
- if (is_device_public_page(page)) {
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
- }
}
out_set_pte:
@@ -1220,8 +941,7 @@
unsigned long next;
unsigned long addr = vma->vm_start;
unsigned long end = vma->vm_end;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
bool is_cow;
int ret;
@@ -1255,11 +975,12 @@
* is_cow_mapping() returns true.
*/
is_cow = is_cow_mapping(vma->vm_flags);
- mmun_start = addr;
- mmun_end = end;
- if (is_cow)
- mmu_notifier_invalidate_range_start(src_mm, mmun_start,
- mmun_end);
+
+ if (is_cow) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, src_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
ret = 0;
dst_pgd = pgd_offset(dst_mm, addr);
@@ -1276,7 +997,7 @@
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
if (is_cow)
- mmu_notifier_invalidate_range_end(src_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -1293,7 +1014,7 @@
pte_t *pte;
swp_entry_t entry;
- tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
+ tlb_change_page_size(tlb, PAGE_SIZE);
again:
init_rss_vec(rss);
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
@@ -1305,10 +1026,13 @@
if (pte_none(ptent))
continue;
+ if (need_resched())
+ break;
+
if (pte_present(ptent)) {
struct page *page;
- page = _vm_normal_page(vma, addr, ptent, true);
+ page = vm_normal_page(vma, addr, ptent);
if (unlikely(details) && page) {
/*
* unmap_shared_mapping_pages() wants to
@@ -1372,7 +1096,6 @@
if (unlikely(details))
continue;
- entry = pte_to_swp_entry(ptent);
if (!non_swap_entry(entry))
rss[MM_SWAPENTS]--;
else if (is_migration_entry(entry)) {
@@ -1402,9 +1125,12 @@
*/
if (force_flush) {
force_flush = 0;
- tlb_flush_mmu_free(tlb);
- if (addr != end)
- goto again;
+ tlb_flush_mmu(tlb);
+ }
+
+ if (addr != end) {
+ cond_resched();
+ goto again;
}
return addr;
@@ -1579,12 +1305,14 @@
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
- mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ start_addr, end_addr);
+ mmu_notifier_invalidate_range_start(&range);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next)
unmap_single_vma(tlb, vma, start_addr, end_addr, NULL);
- mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
+ mmu_notifier_invalidate_range_end(&range);
}
/**
@@ -1598,18 +1326,19 @@
void zap_page_range(struct vm_area_struct *vma, unsigned long start,
unsigned long size)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = start + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, start, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, start, end);
- for ( ; vma && vma->vm_start < end; vma = vma->vm_next)
- unmap_single_vma(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ start, start + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next)
+ unmap_single_vma(&tlb, vma, start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, start, range.end);
}
/**
@@ -1624,17 +1353,18 @@
static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
- struct mm_struct *mm = vma->vm_mm;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- unsigned long end = address + size;
lru_add_drain();
- tlb_gather_mmu(&tlb, mm, address, end);
- update_hiwater_rss(mm);
- mmu_notifier_invalidate_range_start(mm, address, end);
- unmap_single_vma(&tlb, vma, address, end, details);
- mmu_notifier_invalidate_range_end(mm, address, end);
- tlb_finish_mmu(&tlb, address, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address, address + size);
+ tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end);
+ update_hiwater_rss(vma->vm_mm);
+ mmu_notifier_invalidate_range_start(&range);
+ unmap_single_vma(&tlb, vma, address, range.end, details);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, address, range.end);
}
/**
@@ -1698,7 +1428,7 @@
spinlock_t *ptl;
retval = -EINVAL;
- if (PageAnon(page))
+ if (PageAnon(page) || PageSlab(page) || page_has_type(page))
goto out;
retval = -ENOMEM;
flush_dcache_page(page);
@@ -1716,8 +1446,6 @@
set_pte_at(mm, addr, pte, mk_pte(page, prot));
retval = 0;
- pte_unmap_unlock(pte, ptl);
- return retval;
out_unlock:
pte_unmap_unlock(pte, ptl);
out:
@@ -1750,6 +1478,8 @@
* under mm->mmap_sem write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
struct page *page)
@@ -1767,19 +1497,97 @@
}
EXPORT_SYMBOL(vm_insert_page);
-static int insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+/*
+ * __vm_map_pages - maps range of kernel pages into user vma
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ * @offset: user's requested vm_pgoff
+ *
+ * This allows drivers to map range of kernel pages into a user vma.
+ *
+ * Return: 0 on success and error code otherwise.
+ */
+static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num, unsigned long offset)
+{
+ unsigned long count = vma_pages(vma);
+ unsigned long uaddr = vma->vm_start;
+ int ret, i;
+
+ /* Fail if the user requested offset is beyond the end of the object */
+ if (offset >= num)
+ return -ENXIO;
+
+ /* Fail if the user requested size exceeds available object size */
+ if (count > num - offset)
+ return -ENXIO;
+
+ for (i = 0; i < count; i++) {
+ ret = vm_insert_page(vma, uaddr, pages[offset + i]);
+ if (ret < 0)
+ return ret;
+ uaddr += PAGE_SIZE;
+ }
+
+ return 0;
+}
+
+/**
+ * vm_map_pages - maps range of kernel pages starts with non zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Maps an object consisting of @num pages, catering for the user's
+ * requested vm_pgoff
+ *
+ * If we fail to insert any page into the vma, the function will return
+ * immediately leaving any previously inserted pages present. Callers
+ * from the mmap handler may immediately return the error as their caller
+ * will destroy the vma, removing any successfully inserted pages. Other
+ * callers should make their own arrangements for calling unmap_region().
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, vma->vm_pgoff);
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+/**
+ * vm_map_pages_zero - map range of kernel pages starts with zero offset
+ * @vma: user vma to map to
+ * @pages: pointer to array of source kernel pages
+ * @num: number of pages in page array
+ *
+ * Similar to vm_map_pages(), except that it explicitly sets the offset
+ * to 0. This function is intended for the drivers that did not consider
+ * vm_pgoff.
+ *
+ * Context: Process context. Called by mmap handlers.
+ * Return: 0 on success and error code otherwise.
+ */
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return __vm_map_pages(vma, pages, num, 0);
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
+static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr,
pfn_t pfn, pgprot_t prot, bool mkwrite)
{
struct mm_struct *mm = vma->vm_mm;
- int retval;
pte_t *pte, entry;
spinlock_t *ptl;
- retval = -ENOMEM;
pte = get_locked_pte(mm, addr, &ptl);
if (!pte)
- goto out;
- retval = -EBUSY;
+ return VM_FAULT_OOM;
if (!pte_none(*pte)) {
if (mkwrite) {
/*
@@ -1787,14 +1595,21 @@
* in may not match the PFN we have mapped if the
* mapped PFN is a writeable COW page. In the mkwrite
* case we are creating a writable PTE for a shared
- * mapping and we expect the PFNs to match.
+ * mapping and we expect the PFNs to match. If they
+ * don't match, we are likely racing with block
+ * allocation and mapping invalidation so just skip the
+ * update.
*/
- if (WARN_ON_ONCE(pte_pfn(*pte) != pfn_t_to_pfn(pfn)))
+ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) {
+ WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte)));
goto out_unlock;
- entry = *pte;
- goto out_mkwrite;
- } else
- goto out_unlock;
+ }
+ entry = pte_mkyoung(*pte);
+ entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+ if (ptep_set_access_flags(vma, addr, pte, entry, 1))
+ update_mmu_cache(vma, addr, pte);
+ }
+ goto out_unlock;
}
/* Ok, finally just insert the thing.. */
@@ -1803,7 +1618,6 @@
else
entry = pte_mkspecial(pfn_t_pte(pfn, prot));
-out_mkwrite:
if (mkwrite) {
entry = pte_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1812,56 +1626,32 @@
set_pte_at(mm, addr, pte, entry);
update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */
- retval = 0;
out_unlock:
pte_unmap_unlock(pte, ptl);
-out:
- return retval;
+ return VM_FAULT_NOPAGE;
}
/**
- * vm_insert_pfn - insert single pfn into user vma
- * @vma: user vma to map to
- * @addr: target user address of this page
- * @pfn: source kernel pfn
- *
- * Similar to vm_insert_page, this allows drivers to insert individual pages
- * they've allocated into a user vma. Same comments apply.
- *
- * This function should only be called from a vm_ops->fault handler, and
- * in that case the handler should return NULL.
- *
- * vma cannot be a COW mapping.
- *
- * As this is called only for pages that do not currently exist, we
- * do not need to flush old virtual caches or the TLB.
- */
-int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
- unsigned long pfn)
-{
- return vm_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
-}
-EXPORT_SYMBOL(vm_insert_pfn);
-
-/**
- * vm_insert_pfn_prot - insert single pfn into user vma with specified pgprot
+ * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot
* @vma: user vma to map to
* @addr: target user address of this page
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vm_insert_pfn, except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers to
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
- * cow mappings. In general, using multiple vmas is preferable;
- * vm_insert_pfn_prot should only be used if using multiple VMAs is
+ * COW mappings. In general, using multiple vmas is preferable;
+ * vmf_insert_pfn_prot should only be used if using multiple VMAs is
* impractical.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
*/
-int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
+vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, pgprot_t pgprot)
{
- int ret;
/*
* Technically, architectures with pte_special can avoid all these
* restrictions (same for remap_pfn_range). However we would like
@@ -1875,19 +1665,44 @@
BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
if (!pfn_modify_allowed(pfn, pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV));
- ret = insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
+ return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot,
false);
-
- return ret;
}
-EXPORT_SYMBOL(vm_insert_pfn_prot);
+EXPORT_SYMBOL(vmf_insert_pfn_prot);
+
+/**
+ * vmf_insert_pfn - insert single pfn into user vma
+ * @vma: user vma to map to
+ * @addr: target user address of this page
+ * @pfn: source kernel pfn
+ *
+ * Similar to vm_insert_page, this allows drivers to insert individual pages
+ * they've allocated into a user vma. Same comments apply.
+ *
+ * This function should only be called from a vm_ops->fault handler, and
+ * in that case the handler should return the result of this function.
+ *
+ * vma cannot be a COW mapping.
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * Context: Process context. May allocate using %GFP_KERNEL.
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
+ unsigned long pfn)
+{
+ return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vmf_insert_pfn);
static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn)
{
@@ -1903,20 +1718,21 @@
return false;
}
-static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn, bool mkwrite)
+static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
+ unsigned long addr, pfn_t pfn, bool mkwrite)
{
pgprot_t pgprot = vma->vm_page_prot;
+ int err;
BUG_ON(!vm_mixed_ok(vma, pfn));
if (addr < vma->vm_start || addr >= vma->vm_end)
- return -EFAULT;
+ return VM_FAULT_SIGBUS;
track_pfn_insert(vma, &pgprot, pfn);
if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot))
- return -EACCES;
+ return VM_FAULT_SIGBUS;
/*
* If we don't have pte special, then we have to use the pfn_valid()
@@ -1935,36 +1751,35 @@
* result in pfn_t_has_page() == false.
*/
page = pfn_to_page(pfn_t_to_pfn(pfn));
- return insert_page(vma, addr, page, pgprot);
+ err = insert_page(vma, addr, page, pgprot);
+ } else {
+ return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
}
- return insert_pfn(vma, addr, pfn, pgprot, mkwrite);
+
+ if (err == -ENOMEM)
+ return VM_FAULT_OOM;
+ if (err < 0 && err != -EBUSY)
+ return VM_FAULT_SIGBUS;
+
+ return VM_FAULT_NOPAGE;
}
-int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
- pfn_t pfn)
+vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
+ pfn_t pfn)
{
return __vm_insert_mixed(vma, addr, pfn, false);
-
}
-EXPORT_SYMBOL(vm_insert_mixed);
+EXPORT_SYMBOL(vmf_insert_mixed);
/*
* If the insertion of PTE failed because someone else already added a
* different entry in the mean time, we treat that as success as we assume
* the same entry was actually inserted.
*/
-
vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma,
unsigned long addr, pfn_t pfn)
{
- int err;
-
- err = __vm_insert_mixed(vma, addr, pfn, true);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- if (err < 0 && err != -EBUSY)
- return VM_FAULT_SIGBUS;
- return VM_FAULT_NOPAGE;
+ return __vm_insert_mixed(vma, addr, pfn, true);
}
EXPORT_SYMBOL(vmf_insert_mixed_mkwrite);
@@ -2074,7 +1889,9 @@
* @size: size of map area
* @prot: page protection flags for this mapping
*
- * Note: this is only safe if the mm semaphore is held when called.
+ * Note: this is only safe if the mm semaphore is held when called.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
@@ -2147,6 +1964,8 @@
*
* NOTE! Some drivers might want to tweak vma->vm_page_prot first to get
* whatever write-combining details or similar.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len)
{
@@ -2188,7 +2007,6 @@
{
pte_t *pte;
int err;
- pgtable_t token;
spinlock_t *uninitialized_var(ptl);
pte = (mm == &init_mm) ?
@@ -2201,10 +2019,8 @@
arch_enter_lazy_mmu_mode();
- token = pmd_pgtable(*pmd);
-
do {
- err = fn(pte++, token, addr, data);
+ err = fn(pte++, addr, data);
if (err)
break;
} while (addr += PAGE_SIZE, addr != end);
@@ -2385,6 +2201,10 @@
vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
+ if (vmf->vma->vm_file &&
+ IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host))
+ return VM_FAULT_SIGBUS;
+
ret = vmf->vma->vm_ops->page_mkwrite(vmf);
/* Restore original flags so that caller is not surprised */
vmf->flags = old_flags;
@@ -2491,9 +2311,8 @@
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- const unsigned long mmun_start = vmf->address & PAGE_MASK;
- const unsigned long mmun_end = mmun_start + PAGE_SIZE;
struct mem_cgroup *memcg;
+ struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
@@ -2516,7 +2335,10 @@
__SetPageUptodate(new_page);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm,
+ vmf->address & PAGE_MASK,
+ (vmf->address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(&range);
/*
* Re-check the pte - we dropped the lock
@@ -2593,7 +2415,7 @@
* No need to double call mmu_notifier->invalidate_range() callback as
* the above ptep_clear_flush_notify() did already call it.
*/
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_only_end(&range);
if (old_page) {
/*
* Don't let another task, with possibly unlocked vma,
@@ -2624,12 +2446,13 @@
*
* This function handles all that is needed to finish a write page fault in a
* shared mapping due to PTE being read-only once the mapped page is prepared.
- * It handles locking of PTE and modifying it. The function returns
- * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE
- * lock.
+ * It handles locking of PTE and modifying it.
*
* The function expects the page to be locked or other protection against
* concurrent faults / writeback (such as DAX radix tree locks).
+ *
+ * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before
+ * we acquired PTE lock.
*/
vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
{
@@ -2747,8 +2570,11 @@
* Take out anonymous pages first, anonymous shared vmas are
* not dirty accountable.
*/
- if (PageAnon(vmf->page) && !PageKsm(vmf->page)) {
+ if (PageAnon(vmf->page)) {
int total_map_swapcount;
+ if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
+ page_count(vmf->page) != 1))
+ goto copy;
if (!trylock_page(vmf->page)) {
get_page(vmf->page);
pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2763,6 +2589,15 @@
}
put_page(vmf->page);
}
+ if (PageKsm(vmf->page)) {
+ bool reused = reuse_ksm_page(vmf->page, vmf->vma,
+ vmf->address);
+ unlock_page(vmf->page);
+ if (!reused)
+ goto copy;
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
+ }
if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
if (total_map_swapcount == 1) {
/*
@@ -2783,7 +2618,7 @@
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
}
-
+copy:
/*
* Ok, we need to copy. Oh, well..
*/
@@ -2917,13 +2752,8 @@
migration_entry_wait(vma->vm_mm, vmf->pmd,
vmf->address);
} else if (is_device_private_entry(entry)) {
- /*
- * For un-addressable device memory we call the pgmap
- * fault handler callback. The callback must migrate
- * the page back to some CPU accessible page.
- */
- ret = device_private_entry_fault(vma, vmf->address, entry,
- vmf->flags, vmf->pmd);
+ vmf->page = device_private_entry_to_page(entry);
+ ret = vmf->page->pgmap->ops->migrate_to_ram(vmf);
} else if (is_hwpoison_entry(entry)) {
ret = VM_FAULT_HWPOISON;
} else {
@@ -2942,7 +2772,7 @@
struct swap_info_struct *si = swp_swap_info(entry);
if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(si, entry) == 1) {
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
@@ -3139,7 +2969,7 @@
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))
+ if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
/* See the comment in pte_alloc_one_map() */
@@ -3237,6 +3067,28 @@
struct vm_area_struct *vma = vmf->vma;
vm_fault_t ret;
+ /*
+ * Preallocate pte before we take page_lock because this might lead to
+ * deadlocks for memcg reclaim which waits for pages under writeback:
+ * lock_page(A)
+ * SetPageWriteback(A)
+ * unlock_page(A)
+ * lock_page(B)
+ * lock_page(B)
+ * pte_alloc_pne
+ * shrink_page_list
+ * wait_on_page_writeback(A)
+ * SetPageWriteback(B)
+ * unlock_page(B)
+ * # flush A, B to clear the writeback
+ */
+ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) {
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
+ if (!vmf->prealloc_pte)
+ return VM_FAULT_OOM;
+ smp_wmb(); /* See comment in __pte_alloc() */
+ }
+
ret = vma->vm_ops->fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY |
VM_FAULT_DONE_COW)))
@@ -3286,7 +3138,7 @@
pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte);
spin_unlock(vmf->ptl);
vmf->prealloc_pte = NULL;
- } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) {
+ } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) {
return VM_FAULT_OOM;
}
map_pte:
@@ -3319,19 +3171,6 @@
}
#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-
-#define HPAGE_CACHE_INDEX_MASK (HPAGE_PMD_NR - 1)
-static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
- unsigned long haddr)
-{
- if (((vma->vm_start >> PAGE_SHIFT) & HPAGE_CACHE_INDEX_MASK) !=
- (vma->vm_pgoff & HPAGE_CACHE_INDEX_MASK))
- return false;
- if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end)
- return false;
- return true;
-}
-
static void deposit_prealloc_pte(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -3365,7 +3204,7 @@
* related to pte entry. Use the preallocated table for that.
*/
if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) {
- vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vma->vm_mm);
if (!vmf->prealloc_pte)
return VM_FAULT_OOM;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3422,6 +3261,8 @@
*
* Target users are page handler itself and implementations of
* vm_ops->map_pages.
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
struct page *page)
@@ -3482,11 +3323,12 @@
* This function handles all that is needed to finish a page fault once the
* page to fault in is prepared. It handles locking of PTEs, inserts PTE for
* given page, adds reverse page mapping, handles memcg charges and LRU
- * addition. The function returns 0 on success, VM_FAULT_ code in case of
- * error.
+ * addition.
*
* The function expects the page to be locked and on success it consumes a
* reference of a page being mapped (for the PTE which maps it).
+ *
+ * Return: %0 on success, %VM_FAULT_ code in case of error.
*/
vm_fault_t finish_fault(struct vm_fault *vmf)
{
@@ -3542,12 +3384,8 @@
static int __init fault_around_debugfs(void)
{
- void *ret;
-
- ret = debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
- &fault_around_bytes_fops);
- if (!ret)
- pr_warn("Failed to create fault_around_bytes in debugfs");
+ debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL,
+ &fault_around_bytes_fops);
return 0;
}
late_initcall(fault_around_debugfs);
@@ -3603,8 +3441,7 @@
start_pgoff + nr_pages - 1);
if (pmd_none(*vmf->pmd)) {
- vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm,
- vmf->address);
+ vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm);
if (!vmf->prealloc_pte)
goto out;
smp_wmb(); /* See comment in __pte_alloc() */
@@ -3739,10 +3576,13 @@
* but allow concurrent faults).
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
+ * If mmap_sem is released, vma may become invalid (for example
+ * by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *vm_mm = vma->vm_mm;
vm_fault_t ret;
/*
@@ -3783,7 +3623,7 @@
/* preallocated pagetable is unused: free it */
if (vmf->prealloc_pte) {
- pte_free(vma->vm_mm, vmf->prealloc_pte);
+ pte_free(vm_mm, vmf->prealloc_pte);
vmf->prealloc_pte = NULL;
}
return ret;
@@ -3808,11 +3648,11 @@
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL;
- int page_nid = -1;
+ int page_nid = NUMA_NO_NODE;
int last_cpupid;
int target_nid;
bool migrated = false;
- pte_t pte;
+ pte_t pte, old_pte;
bool was_writable = pte_savedwrite(vmf->orig_pte);
int flags = 0;
@@ -3832,12 +3672,12 @@
* Make it present again, Depending on how arch implementes non
* accessible ptes, some can allow access by kernel mode.
*/
- pte = ptep_modify_prot_start(vma->vm_mm, vmf->address, vmf->pte);
- pte = pte_modify(pte, vma->vm_page_prot);
+ old_pte = ptep_modify_prot_start(vma, vmf->address, vmf->pte);
+ pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
pte = pte_mkwrite(pte);
- ptep_modify_prot_commit(vma->vm_mm, vmf->address, vmf->pte, pte);
+ ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
page = vm_normal_page(vma, vmf->address, pte);
@@ -3875,7 +3715,7 @@
target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid,
&flags);
pte_unmap_unlock(vmf->pte, vmf->ptl);
- if (target_nid == -1) {
+ if (target_nid == NUMA_NO_NODE) {
put_page(page);
goto out;
}
@@ -3889,7 +3729,7 @@
flags |= TNF_MIGRATE_FAIL;
out:
- if (page_nid != -1)
+ if (page_nid != NUMA_NO_NODE)
task_numa_fault(last_cpupid, page_nid, 1, flags);
return 0;
}
@@ -4074,7 +3914,7 @@
vmf.pud = pud_alloc(mm, p4d, address);
if (!vmf.pud)
return VM_FAULT_OOM;
- if (pud_none(*vmf.pud) && transparent_hugepage_enabled(vma)) {
+ if (pud_none(*vmf.pud) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pud(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4100,7 +3940,7 @@
vmf.pmd = pmd_alloc(mm, vmf.pud, address);
if (!vmf.pmd)
return VM_FAULT_OOM;
- if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ if (pmd_none(*vmf.pmd) && __transparent_hugepage_enabled(vma)) {
ret = create_huge_pmd(&vmf);
if (!(ret & VM_FAULT_FALLBACK))
return ret;
@@ -4274,7 +4114,7 @@
#endif /* __PAGETABLE_PMD_FOLDED */
static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
+ struct mmu_notifier_range *range,
pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
@@ -4302,10 +4142,11 @@
if (!pmdpp)
goto out;
- if (start && end) {
- *start = address & PMD_MASK;
- *end = *start + PMD_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0,
+ NULL, mm, address & PMD_MASK,
+ (address & PMD_MASK) + PMD_SIZE);
+ mmu_notifier_invalidate_range_start(range);
}
*ptlp = pmd_lock(mm, pmd);
if (pmd_huge(*pmd)) {
@@ -4313,17 +4154,18 @@
return 0;
}
spin_unlock(*ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
}
if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
goto out;
- if (start && end) {
- *start = address & PAGE_MASK;
- *end = *start + PAGE_SIZE;
- mmu_notifier_invalidate_range_start(mm, *start, *end);
+ if (range) {
+ mmu_notifier_range_init(range, MMU_NOTIFY_CLEAR, 0, NULL, mm,
+ address & PAGE_MASK,
+ (address & PAGE_MASK) + PAGE_SIZE);
+ mmu_notifier_invalidate_range_start(range);
}
ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
if (!pte_present(*ptep))
@@ -4332,8 +4174,8 @@
return 0;
unlock:
pte_unmap_unlock(ptep, *ptlp);
- if (start && end)
- mmu_notifier_invalidate_range_end(mm, *start, *end);
+ if (range)
+ mmu_notifier_invalidate_range_end(range);
out:
return -EINVAL;
}
@@ -4345,20 +4187,20 @@
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL, NULL,
+ !(res = __follow_pte_pmd(mm, address, NULL,
ptepp, NULL, ptlp)));
return res;
}
int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- unsigned long *start, unsigned long *end,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+ struct mmu_notifier_range *range,
+ pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
{
int res;
/* (void) is needed to make gcc happy */
(void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, start, end,
+ !(res = __follow_pte_pmd(mm, address, range,
ptepp, pmdpp, ptlp)));
return res;
}
@@ -4372,7 +4214,7 @@
*
* Only IO mappings and raw PFN mappings are allowed.
*
- * Returns zero and the pfn at @pfn on success, -ve otherwise.
+ * Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
unsigned long *pfn)
@@ -4459,7 +4301,9 @@
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
- down_read(&mm->mmap_sem);
+ if (down_read_killable(&mm->mmap_sem))
+ return 0;
+
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
@@ -4522,6 +4366,8 @@
* @gup_flags: flags modifying lookup behaviour
*
* The caller must hold a reference on @mm.
+ *
+ * Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f3f9197..f307bd8 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/memory_hotplug.c
*
@@ -33,12 +34,13 @@
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
#include <linux/memblock.h>
-#include <linux/bootmem.h>
#include <linux/compaction.h>
+#include <linux/rmap.h>
#include <asm/tlbflush.h>
#include "internal.h"
+#include "shuffle.h"
/*
* online_page_callback contains pointer to current page onlining function.
@@ -47,7 +49,7 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page);
+static void generic_online_page(struct page *page, unsigned int order);
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -96,27 +98,29 @@
cpus_read_unlock();
}
+u64 max_mem_size = U64_MAX;
+
/* add this memory to iomem resource */
static struct resource *register_memory_resource(u64 start, u64 size)
{
- struct resource *res, *conflict;
- res = kzalloc(sizeof(struct resource), GFP_KERNEL);
- if (!res)
- return ERR_PTR(-ENOMEM);
+ struct resource *res;
+ unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+ char *resource_name = "System RAM";
- res->name = "System RAM";
- res->start = start;
- res->end = start + size - 1;
- res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- conflict = request_resource_conflict(&iomem_resource, res);
- if (conflict) {
- if (conflict->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) {
- pr_debug("Device unaddressable memory block "
- "memory hotplug at %#010llx !\n",
- (unsigned long long)start);
- }
- pr_debug("System RAM resource %pR cannot be added\n", res);
- kfree(res);
+ if (start + size > max_mem_size)
+ return ERR_PTR(-E2BIG);
+
+ /*
+ * Request ownership of the new memory range. This might be
+ * a child of an existing resource that was present but
+ * not marked as busy.
+ */
+ res = __request_region(&iomem_resource, start, size,
+ resource_name, flags);
+
+ if (!res) {
+ pr_debug("Unable to reserve System RAM region: %016llx->%016llx\n",
+ start, start + size);
return ERR_PTR(-EEXIST);
}
return res;
@@ -128,7 +132,6 @@
return;
release_resource(res);
kfree(res);
- return;
}
#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
@@ -162,9 +165,10 @@
#ifndef CONFIG_SPARSEMEM_VMEMMAP
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -184,10 +188,10 @@
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, SECTION_INFO);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -196,9 +200,10 @@
#else /* CONFIG_SPARSEMEM_VMEMMAP */
static void register_page_bootmem_info_section(unsigned long start_pfn)
{
- unsigned long *usemap, mapsize, section_nr, i;
+ unsigned long mapsize, section_nr, i;
struct mem_section *ms;
struct page *page, *memmap;
+ struct mem_section_usage *usage;
section_nr = pfn_to_section_nr(start_pfn);
ms = __nr_to_section(section_nr);
@@ -207,10 +212,10 @@
register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION);
- usemap = ms->pageblock_flags;
- page = virt_to_page(usemap);
+ usage = ms->usage;
+ page = virt_to_page(usage);
- mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
+ mapsize = PAGE_ALIGN(mem_section_usage_size()) >> PAGE_SHIFT;
for (i = 0; i < mapsize; i++, page++)
get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
@@ -246,22 +251,31 @@
}
#endif /* CONFIG_HAVE_BOOTMEM_INFO_NODE */
-static int __meminit __add_section(int nid, unsigned long phys_start_pfn,
- struct vmem_altmap *altmap, bool want_memblock)
+static int check_pfn_span(unsigned long pfn, unsigned long nr_pages,
+ const char *reason)
{
- int ret;
+ /*
+ * Disallow all operations smaller than a sub-section and only
+ * allow operations smaller than a section for
+ * SPARSEMEM_VMEMMAP. Note that check_hotplug_memory_range()
+ * enforces a larger memory_block_size_bytes() granularity for
+ * memory that will be marked online, so this check should only
+ * fire for direct arch_{add,remove}_memory() users outside of
+ * add_memory_resource().
+ */
+ unsigned long min_align;
- if (pfn_valid(phys_start_pfn))
- return -EEXIST;
-
- ret = sparse_add_one_section(NODE_DATA(nid), phys_start_pfn, altmap);
- if (ret < 0)
- return ret;
-
- if (!want_memblock)
- return 0;
-
- return hotplug_memory_register(nid, __pfn_to_section(phys_start_pfn));
+ if (IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ min_align = PAGES_PER_SUBSECTION;
+ else
+ min_align = PAGES_PER_SECTION;
+ if (!IS_ALIGNED(pfn, min_align)
+ || !IS_ALIGNED(nr_pages, min_align)) {
+ WARN(1, "Misaligned __%s_pages start: %#lx end: %#lx\n",
+ reason, pfn, pfn + nr_pages - 1);
+ return -EINVAL;
+ }
+ return 0;
}
/*
@@ -270,62 +284,54 @@
* call this function after deciding the zone to which to
* add the new pages.
*/
-int __ref __add_pages(int nid, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap,
- bool want_memblock)
+int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+ struct mhp_restrictions *restrictions)
{
- unsigned long i;
- int err = 0;
- int start_sec, end_sec;
-
- /* during initialize mem_map, align hot-added range to section */
- start_sec = pfn_to_section_nr(phys_start_pfn);
- end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
+ int err;
+ unsigned long nr, start_sec, end_sec;
+ struct vmem_altmap *altmap = restrictions->altmap;
if (altmap) {
/*
* Validate altmap is within bounds of the total request
*/
- if (altmap->base_pfn != phys_start_pfn
+ if (altmap->base_pfn != pfn
|| vmem_altmap_offset(altmap) > nr_pages) {
pr_warn_once("memory add fail, invalid altmap\n");
- err = -EINVAL;
- goto out;
+ return -EINVAL;
}
altmap->alloc = 0;
}
- for (i = start_sec; i <= end_sec; i++) {
- err = __add_section(nid, section_nr_to_pfn(i), altmap,
- want_memblock);
+ err = check_pfn_span(pfn, nr_pages, "add");
+ if (err)
+ return err;
- /*
- * EEXIST is finally dealt with by ioresource collision
- * check. see add_memory() => register_memory_resource()
- * Warning will be printed if there is collision.
- */
- if (err && (err != -EEXIST))
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ err = sparse_add_section(nid, pfn, pfns, altmap);
+ if (err)
break;
- err = 0;
+ pfn += pfns;
+ nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
-out:
return err;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
-
- for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(start_pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; start_pfn < end_pfn; start_pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_to_online_page(start_pfn)))
continue;
if (unlikely(pfn_to_nid(start_pfn) != nid))
@@ -345,15 +351,12 @@
unsigned long start_pfn,
unsigned long end_pfn)
{
- struct mem_section *ms;
unsigned long pfn;
/* pfn is the end pfn of a memory section. */
pfn = end_pfn - 1;
- for (; pfn >= start_pfn; pfn -= PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn >= start_pfn; pfn -= PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (unlikely(pfn_to_nid(pfn) != nid))
@@ -375,7 +378,6 @@
unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
unsigned long zone_end_pfn = z;
unsigned long pfn;
- struct mem_section *ms;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
@@ -412,17 +414,15 @@
* it check the zone has only hole or not.
*/
pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
+ for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
+ if (unlikely(!pfn_to_online_page(pfn)))
continue;
if (page_zone(pfn_to_page(pfn)) != zone)
continue;
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
+ /* Skip range to be removed */
+ if (pfn >= start_pfn && pfn < end_pfn)
continue;
/* If we find valid section, we have nothing to do */
@@ -436,110 +436,74 @@
zone_span_writeunlock(zone);
}
-static void shrink_pgdat_span(struct pglist_data *pgdat,
- unsigned long start_pfn, unsigned long end_pfn)
+static void update_pgdat_span(struct pglist_data *pgdat)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
- unsigned long pfn;
- struct mem_section *ms;
- int nid = pgdat->node_id;
+ unsigned long node_start_pfn = 0, node_end_pfn = 0;
+ struct zone *zone;
- if (pgdat_start_pfn == start_pfn) {
- /*
- * If the section is smallest section in the pgdat, it need
- * shrink pgdat->node_start_pfn and pgdat->node_spanned_pages.
- * In this case, we find second smallest valid mem_section
- * for shrinking zone.
- */
- pfn = find_smallest_section_pfn(nid, NULL, end_pfn,
- pgdat_end_pfn);
- if (pfn) {
- pgdat->node_start_pfn = pfn;
- pgdat->node_spanned_pages = pgdat_end_pfn - pfn;
+ for (zone = pgdat->node_zones;
+ zone < pgdat->node_zones + MAX_NR_ZONES; zone++) {
+ unsigned long zone_end_pfn = zone->zone_start_pfn +
+ zone->spanned_pages;
+
+ /* No need to lock the zones, they can't change. */
+ if (!zone->spanned_pages)
+ continue;
+ if (!node_end_pfn) {
+ node_start_pfn = zone->zone_start_pfn;
+ node_end_pfn = zone_end_pfn;
+ continue;
}
- } else if (pgdat_end_pfn == end_pfn) {
- /*
- * If the section is biggest section in the pgdat, it need
- * shrink pgdat->node_spanned_pages.
- * In this case, we find second biggest valid mem_section for
- * shrinking zone.
- */
- pfn = find_biggest_section_pfn(nid, NULL, pgdat_start_pfn,
- start_pfn);
- if (pfn)
- pgdat->node_spanned_pages = pfn - pgdat_start_pfn + 1;
+
+ if (zone_end_pfn > node_end_pfn)
+ node_end_pfn = zone_end_pfn;
+ if (zone->zone_start_pfn < node_start_pfn)
+ node_start_pfn = zone->zone_start_pfn;
}
- /*
- * If the section is not biggest or smallest mem_section in the pgdat,
- * it only creates a hole in the pgdat. So in this case, we need not
- * change the pgdat.
- * But perhaps, the pgdat has only hole data. Thus it check the pgdat
- * has only hole or not.
- */
- pfn = pgdat_start_pfn;
- for (; pfn < pgdat_end_pfn; pfn += PAGES_PER_SECTION) {
- ms = __pfn_to_section(pfn);
-
- if (unlikely(!valid_section(ms)))
- continue;
-
- if (pfn_to_nid(pfn) != nid)
- continue;
-
- /* If the section is current section, it continues the loop */
- if (start_pfn == pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- return;
- }
-
- /* The pgdat has no valid section */
- pgdat->node_start_pfn = 0;
- pgdat->node_spanned_pages = 0;
+ pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn)
+static void __remove_zone(struct zone *zone, unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
- int nr_pages = PAGES_PER_SECTION;
unsigned long flags;
+#ifdef CONFIG_ZONE_DEVICE
+ /*
+ * Zone shrinking code cannot properly deal with ZONE_DEVICE. So
+ * we will not try to shrink the zones - which is okay as
+ * set_zone_contiguous() cannot deal with ZONE_DEVICE either way.
+ */
+ if (zone_idx(zone) == ZONE_DEVICE)
+ return;
+#endif
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
- shrink_pgdat_span(pgdat, start_pfn, start_pfn + nr_pages);
+ update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
}
-static int __remove_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
+static void __remove_section(struct zone *zone, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
- unsigned long start_pfn;
- int scn_nr;
- int ret = -EINVAL;
+ struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
- if (!valid_section(ms))
- return ret;
+ if (WARN_ON_ONCE(!valid_section(ms)))
+ return;
- ret = unregister_memory_section(ms);
- if (ret)
- return ret;
-
- scn_nr = __section_nr(ms);
- start_pfn = section_nr_to_pfn((unsigned long)scn_nr);
- __remove_zone(zone, start_pfn);
-
- sparse_remove_one_section(zone, ms, map_offset, altmap);
- return 0;
+ __remove_zone(zone, pfn, nr_pages);
+ sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
* __remove_pages() - remove sections of pages from a zone
* @zone: zone from which pages need to be removed
- * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
+ * @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
*
@@ -548,58 +512,35 @@
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(struct zone *zone, unsigned long pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
- unsigned long i;
unsigned long map_offset = 0;
- int sections_to_remove, ret = 0;
+ unsigned long nr, start_sec, end_sec;
- /* In the ZONE_DEVICE case device driver owns the memory region */
- if (is_dev_zone(zone)) {
- if (altmap)
- map_offset = vmem_altmap_offset(altmap);
- } else {
- resource_size_t start, size;
-
- start = phys_start_pfn << PAGE_SHIFT;
- size = nr_pages * PAGE_SIZE;
-
- ret = release_mem_region_adjustable(&iomem_resource, start,
- size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
- }
+ map_offset = vmem_altmap_offset(altmap);
clear_zone_contiguous(zone);
- /*
- * We can only remove entire sections
- */
- BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
- BUG_ON(nr_pages % PAGES_PER_SECTION);
+ if (check_pfn_span(pfn, nr_pages, "remove"))
+ return;
- sections_to_remove = nr_pages / PAGES_PER_SECTION;
- for (i = 0; i < sections_to_remove; i++) {
- unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
+ start_sec = pfn_to_section_nr(pfn);
+ end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ unsigned long pfns;
cond_resched();
- ret = __remove_section(zone, __pfn_to_section(pfn), map_offset,
- altmap);
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ __remove_section(zone, pfn, pfns, map_offset, altmap);
+ pfn += pfns;
+ nr_pages -= pfns;
map_offset = 0;
- if (ret)
- break;
}
set_zone_contiguous(zone);
-
- return ret;
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
int set_online_page_callback(online_page_callback_t callback)
{
@@ -656,30 +597,41 @@
}
EXPORT_SYMBOL_GPL(__online_page_free);
-static void generic_online_page(struct page *page)
+static void generic_online_page(struct page *page, unsigned int order)
{
- __online_page_set_limits(page);
- __online_page_increment_counters(page);
- __online_page_free(page);
+ kernel_map_pages(page, 1 << order, 1);
+ __free_pages_core(page, order);
+ totalram_pages_add(1UL << order);
+#ifdef CONFIG_HIGHMEM
+ if (PageHighMem(page))
+ totalhigh_pages_add(1UL << order);
+#endif
}
static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
void *arg)
{
- unsigned long i;
- unsigned long onlined_pages = *(unsigned long *)arg;
- struct page *page;
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn;
+ int order;
- if (PageReserved(pfn_to_page(start_pfn)))
- for (i = 0; i < nr_pages; i++) {
- page = pfn_to_page(start_pfn + i);
- (*online_page_callback)(page);
- onlined_pages++;
- }
+ /*
+ * Online the pages. The callback might decide to keep some pages
+ * PG_reserved (to add them to the buddy later), but we still account
+ * them as being online/belonging to this zone ("present").
+ */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
+ order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
+ /* __free_pages_core() wants pfns to be aligned to the order */
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
+ order = 0;
+ (*online_page_callback)(pfn_to_page(pfn), order);
+ }
- online_mem_sections(start_pfn, start_pfn + nr_pages);
+ /* mark all involved sections as online */
+ online_mem_sections(start_pfn, end_pfn);
- *(unsigned long *)arg = onlined_pages;
+ *(unsigned long *)arg += nr_pages;
return 0;
}
@@ -688,62 +640,19 @@
struct zone *zone, struct memory_notify *arg)
{
int nid = zone_to_nid(zone);
- enum zone_type zone_last = ZONE_NORMAL;
- /*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
- */
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
- /*
- * if the memory to be online is in a zone of 0...zone_last, and
- * the zones of 0...zone_last don't have memory before online, we will
- * need to set the node to node_states[N_NORMAL_MEMORY] after
- * the memory is online.
- */
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
- arg->status_change_nid_normal = nid;
- else
- arg->status_change_nid_normal = -1;
-
-#ifdef CONFIG_HIGHMEM
- /*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
- */
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
- arg->status_change_nid_high = nid;
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
-#endif
-
- /*
- * if the node don't have memory befor online, we will need to
- * set the node to node_states[N_MEMORY] after the memory
- * is online.
- */
if (!node_state(nid, N_MEMORY))
arg->status_change_nid = nid;
- else
- arg->status_change_nid = -1;
+ if (zone_idx(zone) <= ZONE_NORMAL && !node_state(nid, N_NORMAL_MEMORY))
+ arg->status_change_nid_normal = nid;
+#ifdef CONFIG_HIGHMEM
+ if (zone_idx(zone) <= ZONE_HIGHMEM && !node_state(nid, N_HIGH_MEMORY))
+ arg->status_change_nid_high = nid;
+#endif
}
static void node_states_set_node(int node, struct memory_notify *arg)
@@ -754,7 +663,8 @@
if (arg->status_change_nid_high >= 0)
node_set_state(node, N_HIGH_MEMORY);
- node_set_state(node, N_MEMORY);
+ if (arg->status_change_nid >= 0)
+ node_set_state(node, N_MEMORY);
}
static void __meminit resize_zone_range(struct zone *zone, unsigned long start_pfn,
@@ -777,8 +687,13 @@
pgdat->node_start_pfn = start_pfn;
pgdat->node_spanned_pages = max(start_pfn + nr_pages, old_end_pfn) - pgdat->node_start_pfn;
-}
+}
+/*
+ * Associate the pfn range with the given zone, initializing the memmaps
+ * and resizing the pgdat/zone data to span the added pages. After this
+ * call, all affected pages are PG_reserved.
+ */
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages, struct vmem_altmap *altmap)
{
@@ -786,14 +701,13 @@
int nid = pgdat->node_id;
unsigned long flags;
- if (zone_is_empty(zone))
- init_currently_empty_zone(zone, start_pfn, nr_pages);
-
clear_zone_contiguous(zone);
/* TODO Huh pgdat is irqsave while zone is not. It used to be like that before */
pgdat_resize_lock(pgdat, &flags);
zone_span_writelock(zone);
+ if (zone_is_empty(zone))
+ init_currently_empty_zone(zone, start_pfn, nr_pages);
resize_zone_range(zone, start_pfn, nr_pages);
zone_span_writeunlock(zone);
resize_pgdat_range(pgdat, start_pfn, nr_pages);
@@ -868,21 +782,6 @@
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-/*
- * Associates the given pfn range with the given node and the zone appropriate
- * for the given online type.
- */
-static struct zone * __meminit move_pfn_range(int online_type, int nid,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- struct zone *zone;
-
- zone = zone_for_pfn_range(online_type, nid, start_pfn, nr_pages);
- move_pfn_range_to_zone(zone, start_pfn, nr_pages, NULL);
- return zone;
-}
-
-/* Must be protected by mem_hotplug_begin() or a device_lock */
int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
{
unsigned long flags;
@@ -894,15 +793,19 @@
struct memory_notify arg;
struct memory_block *mem;
+ mem_hotplug_begin();
+
/*
* We can't use pfn_to_nid() because nid might be stored in struct page
* which is not yet initialized. Instead, we find nid from memory block.
*/
mem = find_memory_block(__pfn_to_section(pfn));
nid = mem->nid;
+ put_device(&mem->dev);
/* associate pfn range with the zone */
- zone = move_pfn_range(online_type, nid, pfn, nr_pages);
+ zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -926,6 +829,7 @@
ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
online_pages_range);
if (ret) {
+ /* not a single memory resource was applicable */
if (need_zonelists_rebuild)
zone_pcp_reset(zone);
goto failed_addition;
@@ -937,27 +841,25 @@
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
- if (onlined_pages) {
- node_states_set_node(nid, &arg);
- if (need_zonelists_rebuild)
- build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
- }
+ shuffle_zone(zone);
+
+ node_states_set_node(nid, &arg);
+ if (need_zonelists_rebuild)
+ build_all_zonelists(NULL);
+ else
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
- if (onlined_pages) {
- kswapd_run(nid);
- kcompactd_run(nid);
- }
+ kswapd_run(nid);
+ kcompactd_run(nid);
vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
- if (onlined_pages)
- memory_notify(MEM_ONLINE, &arg);
+ memory_notify(MEM_ONLINE, &arg);
+ mem_hotplug_done();
return 0;
failed_addition:
@@ -965,6 +867,7 @@
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ mem_hotplug_done();
return ret;
}
#endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
@@ -991,8 +894,11 @@
if (!pgdat)
return NULL;
+ pgdat->per_cpu_nodestats =
+ alloc_percpu(struct per_cpu_nodestat);
arch_refresh_nodedata(nid, pgdat);
} else {
+ int cpu;
/*
* Reset the nr_zones, order and classzone_idx before reuse.
* Note that kswapd will init kswapd_classzone_idx properly
@@ -1001,6 +907,12 @@
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = 0;
+ for_each_online_cpu(cpu) {
+ struct per_cpu_nodestat *p;
+
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
+ memset(p, 0, sizeof(*p));
+ }
}
/* we can use NODE_DATA(nid) from here */
@@ -1010,7 +922,6 @@
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
- pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat);
/*
* The node we allocated has no zone fallback lists. For avoiding
@@ -1036,7 +947,6 @@
arch_refresh_nodedata(nid, NULL);
free_percpu(pgdat->per_cpu_nodestats);
arch_free_nodedata(pgdat);
- return;
}
@@ -1091,16 +1001,11 @@
static int check_hotplug_memory_range(u64 start, u64 size)
{
- unsigned long block_sz = memory_block_size_bytes();
- u64 block_nr_pages = block_sz >> PAGE_SHIFT;
- u64 nr_pages = size >> PAGE_SHIFT;
- u64 start_pfn = PFN_DOWN(start);
-
/* memory range must be block size aligned */
- if (!nr_pages || !IS_ALIGNED(start_pfn, block_nr_pages) ||
- !IS_ALIGNED(nr_pages, block_nr_pages)) {
+ if (!size || !IS_ALIGNED(start, memory_block_size_bytes()) ||
+ !IS_ALIGNED(size, memory_block_size_bytes())) {
pr_err("Block size [%#lx] unaligned hotplug range: start %#llx, size %#llx",
- block_sz, start, size);
+ memory_block_size_bytes(), start, size);
return -EINVAL;
}
@@ -1112,9 +1017,15 @@
return device_online(&mem->dev);
}
-/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-int __ref add_memory_resource(int nid, struct resource *res, bool online)
+/*
+ * NOTE: The caller must call lock_device_hotplug() to serialize hotplug
+ * and online/offline operations (triggered e.g. by sysfs).
+ *
+ * we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
+ */
+int __ref add_memory_resource(int nid, struct resource *res)
{
+ struct mhp_restrictions restrictions = {};
u64 start, size;
bool new_node = false;
int ret;
@@ -1142,10 +1053,17 @@
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, NULL, true);
+ ret = arch_add_memory(nid, start, size, &restrictions);
if (ret < 0)
goto error;
+ /* create memory block devices after memory was added */
+ ret = create_memory_block_devices(start, size);
+ if (ret) {
+ arch_remove_memory(nid, start, size, NULL);
+ goto error;
+ }
+
if (new_node) {
/* If sysfs file of new node can't be created, cpu on the node
* can't be hot-added. There is no rollback way now.
@@ -1164,26 +1082,25 @@
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ /* device_online() will take the lock when calling online_pages() */
+ mem_hotplug_done();
+
/* online pages if requested */
- if (online)
- walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1),
- NULL, online_memory_block);
+ if (memhp_auto_online)
+ walk_memory_blocks(start, size, NULL, online_memory_block);
- goto out;
-
+ return ret;
error:
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
memblock_remove(start, size);
-
-out:
mem_hotplug_done();
return ret;
}
-EXPORT_SYMBOL_GPL(add_memory_resource);
-int __ref add_memory(int nid, u64 start, u64 size)
+/* requires device_hotplug_lock, see add_memory_resource() */
+int __ref __add_memory(int nid, u64 start, u64 size)
{
struct resource *res;
int ret;
@@ -1192,11 +1109,22 @@
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res, memhp_auto_online);
+ ret = add_memory_resource(nid, res);
if (ret < 0)
release_memory_resource(res);
return ret;
}
+
+int add_memory(int nid, u64 start, u64 size)
+{
+ int rc;
+
+ lock_device_hotplug();
+ rc = __add_memory(nid, start, size);
+ unlock_device_hotplug();
+
+ return rc;
+}
EXPORT_SYMBOL_GPL(add_memory);
#ifdef CONFIG_MEMORY_HOTREMOVE
@@ -1212,11 +1140,13 @@
return PageBuddy(page) && page_order(page) >= pageblock_order;
}
-/* Return the start of the next active pageblock after a given page */
-static struct page *next_active_pageblock(struct page *page)
+/* Return the pfn of the start of the next active pageblock after a given pfn */
+static unsigned long next_active_pageblock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
+
/* Ensure the starting page is pageblock-aligned */
- BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
+ BUG_ON(pfn & (pageblock_nr_pages - 1));
/* If the entire pageblock is free, move to the end of free page */
if (pageblock_free(page)) {
@@ -1224,16 +1154,16 @@
/* be careful. we don't have locks, page_order can be changed.*/
order = page_order(page);
if ((order < MAX_ORDER) && (order >= pageblock_order))
- return page + (1 << order);
+ return pfn + (1 << order);
}
- return page + pageblock_nr_pages;
+ return pfn + pageblock_nr_pages;
}
-static bool is_pageblock_removable_nolock(struct page *page)
+static bool is_pageblock_removable_nolock(unsigned long pfn)
{
+ struct page *page = pfn_to_page(pfn);
struct zone *zone;
- unsigned long pfn;
/*
* We have to be careful here because we are iterating over memory
@@ -1250,18 +1180,20 @@
if (!zone_spans_pfn(zone, pfn))
return false;
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true);
+ return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
}
/* Checks if this range of memory is likely to be hot-removable. */
bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
{
- struct page *page = pfn_to_page(start_pfn);
- struct page *end_page = page + nr_pages;
+ unsigned long end_pfn, pfn;
+
+ end_pfn = min(start_pfn + nr_pages,
+ zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
/* Check the starting page of each pageblock within the range */
- for (; page < end_page; page = next_active_pageblock(page)) {
- if (!is_pageblock_removable_nolock(page))
+ for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
+ if (!is_pageblock_removable_nolock(pfn))
return false;
cond_resched();
}
@@ -1297,6 +1229,9 @@
i++;
if (i == MAX_ORDER_NR_PAGES || pfn + i >= end_pfn)
continue;
+ /* Check if we got outside of the zone */
+ if (zone && !zone_spans_pfn(zone, pfn + i))
+ return 0;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
return 0;
@@ -1325,23 +1260,26 @@
static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
{
unsigned long pfn;
- struct page *page;
+
for (pfn = start; pfn < end; pfn++) {
- if (pfn_valid(pfn)) {
- page = pfn_to_page(pfn);
- if (PageLRU(page))
- return pfn;
- if (__PageMovable(page))
- return pfn;
- if (PageHuge(page)) {
- if (hugepage_migration_supported(page_hstate(page)) &&
- page_huge_active(page))
- return pfn;
- else
- pfn = round_up(pfn + 1,
- 1 << compound_order(page)) - 1;
- }
- }
+ struct page *page, *head;
+ unsigned long skip;
+
+ if (!pfn_valid(pfn))
+ continue;
+ page = pfn_to_page(pfn);
+ if (PageLRU(page))
+ return pfn;
+ if (__PageMovable(page))
+ return pfn;
+
+ if (!PageHuge(page))
+ continue;
+ head = compound_head(page);
+ if (page_huge_active(head))
+ return pfn;
+ skip = compound_nr(head) - (page - head);
+ pfn += skip - 1;
}
return 0;
}
@@ -1363,36 +1301,43 @@
return new_page_nodemask(page, nid, &nmask);
}
-#define NR_OFFLINE_AT_ONCE_PAGES (256)
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
struct page *page;
- int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
- int not_managed = 0;
int ret = 0;
LIST_HEAD(source);
- for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
if (PageHuge(page)) {
struct page *head = compound_head(page);
- pfn = page_to_pfn(head) + (1<<compound_order(head)) - 1;
- if (compound_order(head) > PFN_SECTION_SHIFT) {
- ret = -EBUSY;
- break;
- }
- if (isolate_huge_page(page, &source))
- move_pages -= 1 << compound_order(head);
+ pfn = page_to_pfn(head) + compound_nr(head) - 1;
+ isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
pfn = page_to_pfn(compound_head(page))
+ hpage_nr_pages(page) - 1;
+ /*
+ * HWPoison pages have elevated reference counts so the migration would
+ * fail on them. It also doesn't make any sense to migrate them in the
+ * first place. Still try to unmap such a page in case it is still mapped
+ * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
+ * the unmap as the catch all safety net).
+ */
+ if (PageHWPoison(page)) {
+ if (WARN_ON(PageLRU(page)))
+ isolate_lru_page(page);
+ if (page_mapped(page))
+ try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ continue;
+ }
+
if (!get_page_unless_zero(page))
continue;
/*
@@ -1404,41 +1349,31 @@
else
ret = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
if (!ret) { /* Success */
- put_page(page);
list_add_tail(&page->lru, &source);
- move_pages--;
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
} else {
-#ifdef CONFIG_DEBUG_VM
- pr_alert("failed to isolate pfn %lx\n", pfn);
+ pr_warn("failed to isolate pfn %lx\n", pfn);
dump_page(page, "isolation failed");
-#endif
- put_page(page);
- /* Because we don't have big zone->lock. we should
- check this again here. */
- if (page_count(page)) {
- not_managed++;
- ret = -EBUSY;
- break;
- }
}
+ put_page(page);
}
if (!list_empty(&source)) {
- if (not_managed) {
- putback_movable_pages(&source);
- goto out;
- }
-
/* Allocate a new page from the nearest neighbor node */
ret = migrate_pages(&source, new_node_page, NULL, 0,
MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
- if (ret)
+ if (ret) {
+ list_for_each_entry(page, &source, lru) {
+ pr_warn("migrating pfn %lx failed ret:%d ",
+ page_to_pfn(page), ret);
+ dump_page(page, "migration failure");
+ }
putback_movable_pages(&source);
+ }
}
-out:
+
return ret;
}
@@ -1449,15 +1384,10 @@
offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
void *data)
{
- __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
+ unsigned long *offlined_pages = (unsigned long *)data;
-static void
-offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
-{
- walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
- offline_isolated_pages_cb);
+ *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
+ return 0;
}
/*
@@ -1467,26 +1397,7 @@
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
void *data)
{
- int ret;
- long offlined = *(long *)data;
- ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
- offlined = nr_pages;
- if (!ret)
- *(long *)data += offlined;
- return ret;
-}
-
-static long
-check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
-{
- long offlined = 0;
- int ret;
-
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
- check_pages_isolated_cb);
- if (ret < 0)
- offlined = (long)ret;
- return offlined;
+ return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
}
static int __init cmdline_parse_movable_node(char *p)
@@ -1506,75 +1417,53 @@
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long present_pages = 0;
- enum zone_type zt, zone_last = ZONE_NORMAL;
+ enum zone_type zt;
+
+ arg->status_change_nid = NUMA_NO_NODE;
+ arg->status_change_nid_normal = NUMA_NO_NODE;
+ arg->status_change_nid_high = NUMA_NO_NODE;
/*
- * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_NORMAL,
- * set zone_last to ZONE_NORMAL.
- *
- * If we don't have HIGHMEM nor movable node,
- * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
- * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+ * Check whether node_states[N_NORMAL_MEMORY] will be changed.
+ * If the memory to be offline is within the range
+ * [0..ZONE_NORMAL], and it is the last present memory there,
+ * the zones in that range will become empty after the offlining,
+ * thus we can determine that we need to clear the node from
+ * node_states[N_NORMAL_MEMORY].
*/
- if (N_MEMORY == N_NORMAL_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- /*
- * check whether node_states[N_NORMAL_MEMORY] will be changed.
- * If the memory to be offline is in a zone of 0...zone_last,
- * and it is the last present memory, 0...zone_last will
- * become empty after offline , thus we can determind we will
- * need to clear the node from node_states[N_NORMAL_MEMORY].
- */
- for (zt = 0; zt <= zone_last; zt++)
+ for (zt = 0; zt <= ZONE_NORMAL; zt++)
present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ if (zone_idx(zone) <= ZONE_NORMAL && nr_pages >= present_pages)
arg->status_change_nid_normal = zone_to_nid(zone);
- else
- arg->status_change_nid_normal = -1;
#ifdef CONFIG_HIGHMEM
/*
- * If we have movable node, node_states[N_HIGH_MEMORY]
- * contains nodes which have zones of 0...ZONE_HIGHMEM,
- * set zone_last to ZONE_HIGHMEM.
- *
- * If we don't have movable node, node_states[N_NORMAL_MEMORY]
- * contains nodes which have zones of 0...ZONE_MOVABLE,
- * set zone_last to ZONE_MOVABLE.
+ * node_states[N_HIGH_MEMORY] contains nodes which
+ * have normal memory or high memory.
+ * Here we add the present_pages belonging to ZONE_HIGHMEM.
+ * If the zone is within the range of [0..ZONE_HIGHMEM], and
+ * we determine that the zones in that range become empty,
+ * we need to clear the node for N_HIGH_MEMORY.
*/
- zone_last = ZONE_HIGHMEM;
- if (N_MEMORY == N_HIGH_MEMORY)
- zone_last = ZONE_MOVABLE;
-
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
- if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+ present_pages += pgdat->node_zones[ZONE_HIGHMEM].present_pages;
+ if (zone_idx(zone) <= ZONE_HIGHMEM && nr_pages >= present_pages)
arg->status_change_nid_high = zone_to_nid(zone);
- else
- arg->status_change_nid_high = -1;
-#else
- arg->status_change_nid_high = arg->status_change_nid_normal;
#endif
/*
- * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+ * We have accounted the pages from [0..ZONE_NORMAL], and
+ * in case of CONFIG_HIGHMEM the pages from ZONE_HIGHMEM
+ * as well.
+ * Here we count the possible pages from ZONE_MOVABLE.
+ * If after having accounted all the pages, we see that the nr_pages
+ * to be offlined is greater than or equal to the accounted pages,
+ * we know that the node will become empty, and so, we can clear
+ * it for N_MEMORY as well.
*/
- zone_last = ZONE_MOVABLE;
+ present_pages += pgdat->node_zones[ZONE_MOVABLE].present_pages;
- /*
- * check whether node_states[N_HIGH_MEMORY] will be changed
- * If we try to offline the last present @nr_pages from the node,
- * we can determind we will need to clear the node from
- * node_states[N_HIGH_MEMORY].
- */
- for (; zt <= zone_last; zt++)
- present_pages += pgdat->node_zones[zt].present_pages;
if (nr_pages >= present_pages)
arg->status_change_nid = zone_to_nid(zone);
- else
- arg->status_change_nid = -1;
}
static void node_states_clear_node(int node, struct memory_notify *arg)
@@ -1582,12 +1471,10 @@
if (arg->status_change_nid_normal >= 0)
node_clear_state(node, N_NORMAL_MEMORY);
- if ((N_MEMORY != N_NORMAL_MEMORY) &&
- (arg->status_change_nid_high >= 0))
+ if (arg->status_change_nid_high >= 0)
node_clear_state(node, N_HIGH_MEMORY);
- if ((N_MEMORY != N_HIGH_MEMORY) &&
- (arg->status_change_nid >= 0))
+ if (arg->status_change_nid >= 0)
node_clear_state(node, N_MEMORY);
}
@@ -1595,22 +1482,24 @@
unsigned long end_pfn)
{
unsigned long pfn, nr_pages;
- long offlined_pages;
- int ret, node;
+ unsigned long offlined_pages = 0;
+ int ret, node, nr_isolate_pageblock;
unsigned long flags;
unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
+ char *reason;
- /* at least, alignment against pageblock is necessary */
- if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
- return -EINVAL;
- if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
- return -EINVAL;
+ mem_hotplug_begin();
+
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start, &valid_end))
- return -EINVAL;
+ if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
+ &valid_end)) {
+ ret = -EINVAL;
+ reason = "multizone range";
+ goto failed_removal;
+ }
zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
@@ -1618,9 +1507,13 @@
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
- MIGRATE_MOVABLE, true);
- if (ret)
- return ret;
+ MIGRATE_MOVABLE,
+ SKIP_HWPOISON | REPORT_FAILURE);
+ if (ret < 0) {
+ reason = "failure to isolate range";
+ goto failed_removal;
+ }
+ nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1628,43 +1521,61 @@
ret = memory_notify(MEM_GOING_OFFLINE, &arg);
ret = notifier_to_errno(ret);
- if (ret)
- goto failed_removal;
-
- pfn = start_pfn;
-repeat:
- /* start memory hot removal */
- ret = -EINTR;
- if (signal_pending(current))
- goto failed_removal;
-
- cond_resched();
- lru_add_drain_all();
- drain_all_pages(zone);
-
- pfn = scan_movable_pages(start_pfn, end_pfn);
- if (pfn) { /* We have movable pages */
- ret = do_migrate_range(pfn, end_pfn);
- goto repeat;
+ if (ret) {
+ reason = "notifier failure";
+ goto failed_removal_isolated;
}
- /*
- * dissolve free hugepages in the memory block before doing offlining
- * actually in order to make hugetlbfs's object counting consistent.
- */
- ret = dissolve_free_huge_pages(start_pfn, end_pfn);
- if (ret)
- goto failed_removal;
- /* check again */
- offlined_pages = check_pages_isolated(start_pfn, end_pfn);
- if (offlined_pages < 0)
- goto repeat;
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ do {
+ for (pfn = start_pfn; pfn;) {
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ reason = "signal backoff";
+ goto failed_removal_isolated;
+ }
+
+ cond_resched();
+ lru_add_drain_all();
+
+ pfn = scan_movable_pages(pfn, end_pfn);
+ if (pfn) {
+ /*
+ * TODO: fatal migration failures should bail
+ * out
+ */
+ do_migrate_range(pfn, end_pfn);
+ }
+ }
+
+ /*
+ * Dissolve free hugepages in the memory block before doing
+ * offlining actually in order to make hugetlbfs's object
+ * counting consistent.
+ */
+ ret = dissolve_free_huge_pages(start_pfn, end_pfn);
+ if (ret) {
+ reason = "failure to dissolve huge pages";
+ goto failed_removal_isolated;
+ }
+ /* check again */
+ ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ NULL, check_pages_isolated_cb);
+ } while (ret);
+
/* Ok, all of our target is isolated.
We cannot do rollback at this point. */
- offline_isolated_pages(start_pfn, end_pfn);
- /* reset pagetype flags and makes migrate type to be MOVABLE */
- undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ walk_system_ram_range(start_pfn, end_pfn - start_pfn,
+ &offlined_pages, offline_isolated_pages_cb);
+ pr_info("Offlined Pages %ld\n", offlined_pages);
+ /*
+ * Onlining will reset pagetype flags and make the migrate type
+ * MOVABLE, so we just need to decrease the zone's counter of
+ * isolated pageblocks here.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
zone->present_pages -= offlined_pages;
@@ -1691,75 +1602,27 @@
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ mem_hotplug_done();
return 0;
-failed_removal:
- pr_debug("memory offlining [mem %#010llx-%#010llx] failed\n",
- (unsigned long long) start_pfn << PAGE_SHIFT,
- ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
- memory_notify(MEM_CANCEL_OFFLINE, &arg);
- /* pushback to free area */
+failed_removal_isolated:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+ memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal:
+ pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
+ (unsigned long long) start_pfn << PAGE_SHIFT,
+ ((unsigned long long) end_pfn << PAGE_SHIFT) - 1,
+ reason);
+ /* pushback to free area */
+ mem_hotplug_done();
return ret;
}
-/* Must be protected by mem_hotplug_begin() or a device_lock */
int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
{
return __offline_pages(start_pfn, start_pfn + nr_pages);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
-/**
- * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn)
- * @start_pfn: start pfn of the memory range
- * @end_pfn: end pfn of the memory range
- * @arg: argument passed to func
- * @func: callback for each memory section walked
- *
- * This function walks through all present mem sections in range
- * [start_pfn, end_pfn) and call func on each mem section.
- *
- * Returns the return value of func.
- */
-int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
- void *arg, int (*func)(struct memory_block *, void *))
-{
- struct memory_block *mem = NULL;
- struct mem_section *section;
- unsigned long pfn, section_nr;
- int ret;
-
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- section_nr = pfn_to_section_nr(pfn);
- if (!present_section_nr(section_nr))
- continue;
-
- section = __nr_to_section(section_nr);
- /* same memblock? */
- if (mem)
- if ((section_nr >= mem->start_section_nr) &&
- (section_nr <= mem->end_section_nr))
- continue;
-
- mem = find_memory_block_hinted(section, mem);
- if (!mem)
- continue;
-
- ret = func(mem, arg);
- if (ret) {
- kobject_put(&mem->dev.kobj);
- return ret;
- }
- }
-
- if (mem)
- kobject_put(&mem->dev.kobj);
-
- return 0;
-}
-
-#ifdef CONFIG_MEMORY_HOTREMOVE
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1768,12 +1631,13 @@
phys_addr_t beginpa, endpa;
beginpa = PFN_PHYS(section_nr_to_pfn(mem->start_section_nr));
- endpa = PFN_PHYS(section_nr_to_pfn(mem->end_section_nr + 1))-1;
+ endpa = beginpa + memory_block_size_bytes() - 1;
pr_warn("removing memory fails, because memory [%pa-%pa] is onlined\n",
&beginpa, &endpa);
- }
- return ret;
+ return -EBUSY;
+ }
+ return 0;
}
static int check_cpu_on_node(pg_data_t *pgdat)
@@ -1792,32 +1656,16 @@
return 0;
}
-static void unmap_cpu_on_node(pg_data_t *pgdat)
+static int check_no_memblock_for_node_cb(struct memory_block *mem, void *arg)
{
-#ifdef CONFIG_ACPI_NUMA
- int cpu;
-
- for_each_possible_cpu(cpu)
- if (cpu_to_node(cpu) == pgdat->node_id)
- numa_clear_node(cpu);
-#endif
-}
-
-static int check_and_unmap_cpu_on_node(pg_data_t *pgdat)
-{
- int ret;
-
- ret = check_cpu_on_node(pgdat);
- if (ret)
- return ret;
+ int nid = *(int *)arg;
/*
- * the node will be offlined when we come here, so we can clear
- * the cpu_to_node() now.
+ * If a memory block belongs to multiple nodes, the stored nid is not
+ * reliable. However, such blocks are always online (e.g., cannot get
+ * offlined) and, therefore, are still spanned by the node.
*/
-
- unmap_cpu_on_node(pgdat);
- return 0;
+ return mem->nid == nid ? -EEXIST : 0;
}
/**
@@ -1832,27 +1680,26 @@
void try_offline_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long start_pfn = pgdat->node_start_pfn;
- unsigned long end_pfn = start_pfn + pgdat->node_spanned_pages;
- unsigned long pfn;
+ int rc;
- for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
- unsigned long section_nr = pfn_to_section_nr(pfn);
-
- if (!present_section_nr(section_nr))
- continue;
-
- if (pfn_to_nid(pfn) != nid)
- continue;
-
- /*
- * some memory sections of this node are not removed, and we
- * can't offline node now.
- */
+ /*
+ * If the node still spans pages (especially ZONE_DEVICE), don't
+ * offline it. A node spans memory after move_pfn_range_to_zone(),
+ * e.g., after the memory block was onlined.
+ */
+ if (pgdat->node_spanned_pages)
return;
- }
- if (check_and_unmap_cpu_on_node(pgdat))
+ /*
+ * Especially offline memory blocks might not be spanned by the
+ * node. They will get spanned by the node once they get onlined.
+ * However, they link to the node in sysfs and can get onlined later.
+ */
+ rc = for_each_memory_block(&nid, check_no_memblock_for_node_cb);
+ if (rc)
+ return;
+
+ if (check_cpu_on_node(pgdat))
return;
/*
@@ -1864,6 +1711,61 @@
}
EXPORT_SYMBOL(try_offline_node);
+static void __release_memory_resource(resource_size_t start,
+ resource_size_t size)
+{
+ int ret;
+
+ /*
+ * When removing memory in the same granularity as it was added,
+ * this function never fails. It might only fail if resources
+ * have to be adjusted or split. We'll ignore the error, as
+ * removing of memory cannot fail.
+ */
+ ret = release_mem_region_adjustable(&iomem_resource, start, size);
+ if (ret) {
+ resource_size_t endres = start + size - 1;
+
+ pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
+ &start, &endres, ret);
+ }
+}
+
+static int __ref try_remove_memory(int nid, u64 start, u64 size)
+{
+ int rc = 0;
+
+ BUG_ON(check_hotplug_memory_range(start, size));
+
+ mem_hotplug_begin();
+
+ /*
+ * All memory blocks must be offlined before removing memory. Check
+ * whether all memory blocks in question are offline and return error
+ * if this is not the case.
+ */
+ rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
+ if (rc)
+ goto done;
+
+ /* remove memmap entry */
+ firmware_map_remove(start, start + size, "System RAM");
+ memblock_free(start, size);
+ memblock_remove(start, size);
+
+ /* remove memory block devices before removing memory */
+ remove_memory_block_devices(start, size);
+
+ arch_remove_memory(nid, start, size, NULL);
+ __release_memory_resource(start, size);
+
+ try_offline_node(nid);
+
+done:
+ mem_hotplug_done();
+ return rc;
+}
+
/**
* remove_memory
* @nid: the node ID
@@ -1874,34 +1776,30 @@
* and online/offline operations before this call, as required by
* try_offline_node().
*/
-void __ref remove_memory(int nid, u64 start, u64 size)
+void __remove_memory(int nid, u64 start, u64 size)
{
- int ret;
-
- BUG_ON(check_hotplug_memory_range(start, size));
-
- mem_hotplug_begin();
/*
- * All memory blocks must be offlined before removing memory. Check
- * whether all memory blocks in question are offline and trigger a BUG()
- * if this is not the case.
+ * trigger BUG() if some memory is not offlined prior to calling this
+ * function
*/
- ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- check_memblock_offlined_cb);
- if (ret)
+ if (try_remove_memory(nid, start, size))
BUG();
+}
- /* remove memmap entry */
- firmware_map_remove(start, start + size, "System RAM");
- memblock_free(start, size);
- memblock_remove(start, size);
+/*
+ * Remove memory if every memory block is offline, otherwise return -EBUSY if
+ * some memory is not offline
+ */
+int remove_memory(int nid, u64 start, u64 size)
+{
+ int rc;
- arch_remove_memory(start, size, NULL);
+ lock_device_hotplug();
+ rc = try_remove_memory(nid, start, size);
+ unlock_device_hotplug();
- try_offline_node(nid);
-
- mem_hotplug_done();
+ return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 149b6f4..e08c941 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1,9 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Simple NUMA memory policy for the Linux kernel.
*
* Copyright 2003,2004 Andi Kleen, SuSE Labs.
* (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
- * Subject to the GNU Public License, version 2.
*
* NUMA policy allows the user to give hints in which node(s) memory should
* be allocated.
@@ -68,7 +68,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mempolicy.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/hugetlb.h>
#include <linux/kernel.h>
@@ -306,7 +306,7 @@
else {
nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed,
*nodes);
- pol->w.cpuset_mems_allowed = tmp;
+ pol->w.cpuset_mems_allowed = *nodes;
}
if (nodes_empty(tmp))
@@ -350,7 +350,7 @@
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
@@ -403,7 +403,7 @@
},
};
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
struct queue_pages {
@@ -428,6 +428,16 @@
return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT);
}
+/*
+ * queue_pages_pmd() has four possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is an unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 2 - THP was split.
+ * -EIO - the pmd is a migration entry, or only MPOL_MF_STRICT was specified
+ * and an existing page was already on a node that does not follow the
+ * policy.
+ */
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
{
@@ -437,25 +447,29 @@
unsigned long flags;
if (unlikely(is_pmd_migration_entry(*pmd))) {
- ret = 1;
+ ret = -EIO;
goto unlock;
}
page = pmd_page(*pmd);
if (is_huge_zero_page(page)) {
spin_unlock(ptl);
__split_huge_pmd(walk->vma, pmd, addr, false, NULL);
+ ret = 2;
goto out;
}
- if (!queue_pages_required(page, qp)) {
- ret = 1;
+ if (!queue_pages_required(page, qp))
goto unlock;
- }
- ret = 1;
flags = qp->flags;
/* go to thp migration */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, qp->pagelist, flags);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ if (!vma_migratable(walk->vma) ||
+ migrate_page_add(page, qp->pagelist, flags)) {
+ ret = 1;
+ goto unlock;
+ }
+ } else
+ ret = -EIO;
unlock:
spin_unlock(ptl);
out:
@@ -465,6 +479,13 @@
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
+ *
+ * queue_pages_pte_range() has three possible return values:
+ * 0 - pages are placed on the right node or queued successfully.
+ * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * -EIO - only MPOL_MF_STRICT was specified and an existing page was already
+ * on a node that does not follow the policy.
*/
static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -474,15 +495,17 @@
struct queue_pages *qp = walk->private;
unsigned long flags = qp->flags;
int ret;
+ bool has_unmovable = false;
pte_t *pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
if (ptl) {
ret = queue_pages_pmd(pmd, ptl, addr, end, walk);
- if (ret)
- return 0;
+ if (ret != 2)
+ return ret;
}
+ /* THP was split, fall through to pte walk */
if (pmd_trans_unstable(pmd))
return 0;
@@ -502,11 +525,30 @@
continue;
if (!queue_pages_required(page, qp))
continue;
- migrate_page_add(page, qp->pagelist, flags);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+ /* MPOL_MF_STRICT must be specified if we get here */
+ if (!vma_migratable(vma)) {
+ has_unmovable = true;
+ break;
+ }
+
+ /*
+ * Do not abort immediately since there may be
+ * temporary off LRU pages in the range. We still
+ * need to migrate the other LRU pages.
+ */
+ if (migrate_page_add(page, qp->pagelist, flags))
+ has_unmovable = true;
+ } else
+ break;
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
- return 0;
+
+ if (has_unmovable)
+ return 1;
+
+ return addr != end ? -EIO : 0;
}
static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
@@ -576,7 +618,12 @@
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (!vma_migratable(vma))
+ /*
+ * Need to check MPOL_MF_STRICT to return -EIO if possible
+ * regardless of vma_migratable
+ */
+ if (!vma_migratable(vma) &&
+ !(flags & MPOL_MF_STRICT))
return 1;
if (endvma > end)
@@ -603,17 +650,31 @@
}
/* queue pages from current vma */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ if (flags & MPOL_MF_VALID)
return 0;
return 1;
}
+static const struct mm_walk_ops queue_pages_walk_ops = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+};
+
/*
* Walk through page tables and collect pages to be migrated.
*
* If pages found in a given range are on a set of nodes (determined by
* @nodes and @flags,) it's isolated and queued to the pagelist which is
- * passed via @private.)
+ * passed via @private.
+ *
+ * queue_pages_range() has three possible return values:
+ * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were
+ * specified.
+ * 0 - queue pages successfully or no misplaced page.
+ * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or
+ * memory range specified by nodemask and maxnode points outside
+ * your accessible address space (-EFAULT)
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
@@ -626,15 +687,8 @@
.nmask = nodes,
.prev = NULL,
};
- struct mm_walk queue_pages_walk = {
- .hugetlb_entry = queue_pages_hugetlb,
- .pmd_entry = queue_pages_pte_range,
- .test_walk = queue_pages_test_walk,
- .mm = mm,
- .private = &qp,
- };
- return walk_page_range(start, end, &queue_pages_walk);
+ return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
}
/*
@@ -797,16 +851,19 @@
}
}
-static int lookup_node(unsigned long addr)
+static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
struct page *p;
int err;
- err = get_user_pages(addr & PAGE_MASK, 1, 0, &p, NULL);
+ int locked = 1;
+ err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
if (err >= 0) {
err = page_to_nid(p);
put_page(p);
}
+ if (locked)
+ up_read(&mm->mmap_sem);
return err;
}
@@ -817,7 +874,7 @@
int err;
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct mempolicy *pol = current->mempolicy;
+ struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL;
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
@@ -857,7 +914,16 @@
if (flags & MPOL_F_NODE) {
if (flags & MPOL_F_ADDR) {
- err = lookup_node(addr);
+ /*
+ * Take a refcount on the mpol, lookup_node()
+ * will drop the mmap_sem, so after calling
+ * lookup_node() only "pol" remains valid, "vma"
+ * is stale.
+ */
+ pol_refcount = pol;
+ vma = NULL;
+ mpol_get(pol);
+ err = lookup_node(mm, addr);
if (err < 0)
goto out;
*policy = err;
@@ -892,7 +958,9 @@
out:
mpol_cond_put(pol);
if (vma)
- up_read(¤t->mm->mmap_sem);
+ up_read(&mm->mmap_sem);
+ if (pol_refcount)
+ mpol_put(pol_refcount);
return err;
}
@@ -900,7 +968,7 @@
/*
* page migration, thp tail pages can be passed.
*/
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
struct page *head = compound_head(page);
@@ -913,8 +981,19 @@
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_cache(head),
hpage_nr_pages(head));
+ } else if (flags & MPOL_MF_STRICT) {
+ /*
+ * Non-movable page may reach here. And, there may be
+ * temporary off LRU pages or non-LRU movable pages.
+ * Treat them as unmovable pages since they can't be
+ * isolated, so they can't be moved at the moment. It
+ * should return -EIO for this case too.
+ */
+ return -EIO;
}
}
+
+ return 0;
}
/* page allocation callback for NUMA node migration */
@@ -1117,9 +1196,10 @@
}
#else
-static void migrate_page_add(struct page *page, struct list_head *pagelist,
+static int migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
+ return -EIO;
}
int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
@@ -1142,6 +1222,7 @@
struct mempolicy *new;
unsigned long end;
int err;
+ int ret;
LIST_HEAD(pagelist);
if (flags & ~(unsigned long)MPOL_MF_VALID)
@@ -1203,10 +1284,15 @@
if (err)
goto mpol_out;
- err = queue_pages_range(mm, start, end, nmask,
+ ret = queue_pages_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
- if (!err)
- err = mbind_range(mm, start, end, new);
+
+ if (ret < 0) {
+ err = ret;
+ goto up_out;
+ }
+
+ err = mbind_range(mm, start, end, new);
if (!err) {
int nr_failed = 0;
@@ -1219,13 +1305,16 @@
putback_movable_pages(&pagelist);
}
- if (nr_failed && (flags & MPOL_MF_STRICT))
+ if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT)))
err = -EIO;
- } else
- putback_movable_pages(&pagelist);
+ } else {
+up_out:
+ if (!list_empty(&pagelist))
+ putback_movable_pages(&pagelist);
+ }
up_write(&mm->mmap_sem);
- mpol_out:
+mpol_out:
mpol_put(new);
return err;
}
@@ -1300,7 +1389,7 @@
nodemask_t *nodes)
{
unsigned long copy = ALIGN(maxnode-1, 64) / 8;
- const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
+ unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long);
if (copy > nbytes) {
if (copy > PAGE_SIZE)
@@ -1320,6 +1409,7 @@
int err;
unsigned short mode_flags;
+ start = untagged_addr(start);
mode_flags = mode & MPOL_MODE_FLAGS;
mode &= ~MPOL_MODE_FLAGS;
if (mode >= MPOL_MAX)
@@ -1427,10 +1517,6 @@
if (nodes_empty(*new))
goto out_put;
- nodes_and(*new, *new, node_states[N_MEMORY]);
- if (nodes_empty(*new))
- goto out_put;
-
err = security_task_movememory(task);
if (err)
goto out_put;
@@ -1477,7 +1563,9 @@
int uninitialized_var(pval);
nodemask_t nodes;
- if (nmask != NULL && maxnode < MAX_NUMNODES)
+ addr = untagged_addr(addr);
+
+ if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
err = do_get_mempolicy(&pval, &nodes, addr, flags);
@@ -1513,7 +1601,7 @@
unsigned long nr_bits, alloc_size;
DECLARE_BITMAP(bm, MAX_NUMNODES);
- nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
+ nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids);
alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
if (nmask)
@@ -2039,43 +2127,25 @@
* If the policy is interleave, or does not allow the current
* node in its nodemask, we allocate the standard way.
*/
- if (pol->mode == MPOL_PREFERRED &&
- !(pol->flags & MPOL_F_LOCAL))
+ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL))
hpage_node = pol->v.preferred_node;
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_THISNODE, order);
+
/*
- * We cannot invoke reclaim if __GFP_THISNODE
- * is set. Invoking reclaim with
- * __GFP_THISNODE set, would cause THP
- * allocations to trigger heavy swapping
- * despite there may be tons of free memory
- * (including potentially plenty of THP
- * already available in the buddy) on all the
- * other NUMA nodes.
- *
- * At most we could invoke compaction when
- * __GFP_THISNODE is set (but we would need to
- * refrain from invoking reclaim even if
- * compaction returned COMPACT_SKIPPED because
- * there wasn't not enough memory to succeed
- * compaction). For now just avoid
- * __GFP_THISNODE instead of limiting the
- * allocation path to a strict and single
- * compaction invocation.
- *
- * Supposedly if direct reclaim was enabled by
- * the caller, the app prefers THP regardless
- * of the node it comes from so this would be
- * more desiderable behavior than only
- * providing THP originated from the local
- * node in such case.
+ * If hugepage allocations are configured to always
+ * synchronous compact or the vma has been madvised
+ * to prefer hugepage backing, retry allowing remote
+ * memory as well.
*/
- if (!(gfp & __GFP_DIRECT_RECLAIM))
- gfp |= __GFP_THISNODE;
- page = __alloc_pages_node(hpage_node, gfp, order);
+ if (!page && (gfp & __GFP_DIRECT_RECLAIM))
+ page = __alloc_pages_node(hpage_node,
+ gfp | __GFP_NORETRY, order);
+
goto out;
}
}
@@ -2087,6 +2157,7 @@
out:
return page;
}
+EXPORT_SYMBOL(alloc_pages_vma);
/**
* alloc_pages_current - Allocate pages.
@@ -2319,7 +2390,7 @@
unsigned long pgoff;
int thiscpu = raw_smp_processor_id();
int thisnid = cpu_to_node(thiscpu);
- int polnid = -1;
+ int polnid = NUMA_NO_NODE;
int ret = -1;
pol = get_vma_policy(vma, addr);
@@ -2725,12 +2796,11 @@
int mpol_parse_str(char *str, struct mempolicy **mpol)
{
struct mempolicy *new = NULL;
- unsigned short mode;
unsigned short mode_flags;
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int err = 1;
+ int err = 1, mode;
if (nodelist) {
/* NUL-terminate mode or flags string */
@@ -2745,12 +2815,8 @@
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (mode = 0; mode < MPOL_MAX; mode++) {
- if (!strcmp(str, policy_modes[mode])) {
- break;
- }
- }
- if (mode >= MPOL_MAX)
+ mode = match_string(policy_modes, MPOL_MAX, str);
+ if (mode < 0)
goto out;
switch (mode) {
diff --git a/mm/mempool.c b/mm/mempool.c
index 0ef8cc8..85efab3 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -222,6 +222,8 @@
*
* Like mempool_create(), but initializes the pool in (i.e. embedded in another
* structure).
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -245,6 +247,8 @@
* functions. This function might sleep. Both the alloc_fn() and the free_fn()
* functions might sleep - as long as the mempool_alloc() function is not called
* from IRQ contexts.
+ *
+ * Return: pointer to the created memory pool object or %NULL on error.
*/
mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data)
@@ -289,6 +293,8 @@
* Note, the caller must guarantee that no mempool_destroy is called
* while this function is running. mempool_alloc() & mempool_free()
* might be called (eg. from IRQ contexts) while this function executes.
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
int mempool_resize(mempool_t *pool, int new_min_nr)
{
@@ -363,6 +369,8 @@
* *never* fails when called from process contexts. (it might
* fail if called from an IRQ context.)
* Note: using __GFP_ZERO is not supported.
+ *
+ * Return: pointer to the allocated element or %NULL on error.
*/
void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask)
{
diff --git a/mm/memremap.c b/mm/memremap.c
new file mode 100644
index 0000000..03ccbdf
--- /dev/null
+++ b/mm/memremap.c
@@ -0,0 +1,457 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/* Copyright(c) 2015 Intel Corporation. All rights reserved. */
+#include <linux/device.h>
+#include <linux/io.h>
+#include <linux/kasan.h>
+#include <linux/memory_hotplug.h>
+#include <linux/mm.h>
+#include <linux/pfn_t.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/types.h>
+#include <linux/wait_bit.h>
+#include <linux/xarray.h>
+
+static DEFINE_XARRAY(pgmap_array);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+DEFINE_STATIC_KEY_FALSE(devmap_managed_key);
+EXPORT_SYMBOL(devmap_managed_key);
+static atomic_t devmap_managed_enable;
+
+static void devmap_managed_enable_put(void)
+{
+ if (atomic_dec_and_test(&devmap_managed_enable))
+ static_branch_disable(&devmap_managed_key);
+}
+
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+ if (!pgmap->ops || !pgmap->ops->page_free) {
+ WARN(1, "Missing page_free method\n");
+ return -EINVAL;
+ }
+
+ if (atomic_inc_return(&devmap_managed_enable) == 1)
+ static_branch_enable(&devmap_managed_key);
+ return 0;
+}
+#else
+static int devmap_managed_enable_get(struct dev_pagemap *pgmap)
+{
+ return -EINVAL;
+}
+static void devmap_managed_enable_put(void)
+{
+}
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
+
+static void pgmap_array_delete(struct resource *res)
+{
+ xa_store_range(&pgmap_array, PHYS_PFN(res->start), PHYS_PFN(res->end),
+ NULL, GFP_KERNEL);
+ synchronize_rcu();
+}
+
+static unsigned long pfn_first(struct dev_pagemap *pgmap)
+{
+ return PHYS_PFN(pgmap->res.start) +
+ vmem_altmap_offset(pgmap_altmap(pgmap));
+}
+
+static unsigned long pfn_end(struct dev_pagemap *pgmap)
+{
+ const struct resource *res = &pgmap->res;
+
+ return (res->start + resource_size(res)) >> PAGE_SHIFT;
+}
+
+static unsigned long pfn_next(unsigned long pfn)
+{
+ if (pfn % 1024 == 0)
+ cond_resched();
+ return pfn + 1;
+}
+
+#define for_each_device_pfn(pfn, map) \
+ for (pfn = pfn_first(map); pfn < pfn_end(map); pfn = pfn_next(pfn))
+
+static void dev_pagemap_kill(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->kill)
+ pgmap->ops->kill(pgmap);
+ else
+ percpu_ref_kill(pgmap->ref);
+}
+
+static void dev_pagemap_cleanup(struct dev_pagemap *pgmap)
+{
+ if (pgmap->ops && pgmap->ops->cleanup) {
+ pgmap->ops->cleanup(pgmap);
+ } else {
+ wait_for_completion(&pgmap->done);
+ percpu_ref_exit(pgmap->ref);
+ }
+ /*
+ * Undo the pgmap ref assignment for the internal case as the
+ * caller may re-enable the same pgmap.
+ */
+ if (pgmap->ref == &pgmap->internal_ref)
+ pgmap->ref = NULL;
+}
+
+void memunmap_pages(struct dev_pagemap *pgmap)
+{
+ struct resource *res = &pgmap->res;
+ struct page *first_page;
+ unsigned long pfn;
+ int nid;
+
+ dev_pagemap_kill(pgmap);
+ for_each_device_pfn(pfn, pgmap)
+ put_page(pfn_to_page(pfn));
+ dev_pagemap_cleanup(pgmap);
+
+ /* make sure to access a memmap that was actually initialized */
+ first_page = pfn_to_page(pfn_first(pgmap));
+
+ /* pages are dead and unused, undo the arch mapping */
+ nid = page_to_nid(first_page);
+
+ mem_hotplug_begin();
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), NULL);
+ } else {
+ arch_remove_memory(nid, res->start, resource_size(res),
+ pgmap_altmap(pgmap));
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ }
+ mem_hotplug_done();
+
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
+ pgmap_array_delete(res);
+ WARN_ONCE(pgmap->altmap.alloc, "failed to free all reserved pages\n");
+ devmap_managed_enable_put();
+}
+EXPORT_SYMBOL_GPL(memunmap_pages);
+
+static void devm_memremap_pages_release(void *data)
+{
+ memunmap_pages(data);
+}
+
+static void dev_pagemap_percpu_release(struct percpu_ref *ref)
+{
+ struct dev_pagemap *pgmap =
+ container_of(ref, struct dev_pagemap, internal_ref);
+
+ complete(&pgmap->done);
+}
+
+/*
+ * Not device managed version of devm_memremap_pages, undone by
+ * memunmap_pages(). Please use devm_memremap_pages if you have a struct
+ * device available.
+ */
+void *memremap_pages(struct dev_pagemap *pgmap, int nid)
+{
+ struct resource *res = &pgmap->res;
+ struct dev_pagemap *conflict_pgmap;
+ struct mhp_restrictions restrictions = {
+ /*
+ * We do not want any optional features only our own memmap
+ */
+ .altmap = pgmap_altmap(pgmap),
+ };
+ pgprot_t pgprot = PAGE_KERNEL;
+ int error, is_ram;
+ bool need_devmap_managed = true;
+
+ switch (pgmap->type) {
+ case MEMORY_DEVICE_PRIVATE:
+ if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
+ WARN(1, "Device private memory not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ if (!pgmap->ops || !pgmap->ops->migrate_to_ram) {
+ WARN(1, "Missing migrate_to_ram method\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_FS_DAX:
+ if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
+ IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
+ WARN(1, "File system DAX not supported\n");
+ return ERR_PTR(-EINVAL);
+ }
+ break;
+ case MEMORY_DEVICE_DEVDAX:
+ case MEMORY_DEVICE_PCI_P2PDMA:
+ need_devmap_managed = false;
+ break;
+ default:
+ WARN(1, "Invalid pgmap type %d\n", pgmap->type);
+ break;
+ }
+
+ if (!pgmap->ref) {
+ if (pgmap->ops && (pgmap->ops->kill || pgmap->ops->cleanup))
+ return ERR_PTR(-EINVAL);
+
+ init_completion(&pgmap->done);
+ error = percpu_ref_init(&pgmap->internal_ref,
+ dev_pagemap_percpu_release, 0, GFP_KERNEL);
+ if (error)
+ return ERR_PTR(error);
+ pgmap->ref = &pgmap->internal_ref;
+ } else {
+ if (!pgmap->ops || !pgmap->ops->kill || !pgmap->ops->cleanup) {
+ WARN(1, "Missing reference count teardown definition\n");
+ return ERR_PTR(-EINVAL);
+ }
+ }
+
+ if (need_devmap_managed) {
+ error = devmap_managed_enable_get(pgmap);
+ if (error)
+ return ERR_PTR(error);
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->start), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ error = -ENOMEM;
+ goto err_array;
+ }
+
+ conflict_pgmap = get_dev_pagemap(PHYS_PFN(res->end), NULL);
+ if (conflict_pgmap) {
+ WARN(1, "Conflicting mapping in same section\n");
+ put_dev_pagemap(conflict_pgmap);
+ error = -ENOMEM;
+ goto err_array;
+ }
+
+ is_ram = region_intersects(res->start, resource_size(res),
+ IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
+
+ if (is_ram != REGION_DISJOINT) {
+ WARN_ONCE(1, "%s attempted on %s region %pr\n", __func__,
+ is_ram == REGION_MIXED ? "mixed" : "ram", res);
+ error = -ENXIO;
+ goto err_array;
+ }
+
+ error = xa_err(xa_store_range(&pgmap_array, PHYS_PFN(res->start),
+ PHYS_PFN(res->end), pgmap, GFP_KERNEL));
+ if (error)
+ goto err_array;
+
+ if (nid < 0)
+ nid = numa_mem_id();
+
+ error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(res->start), 0,
+ resource_size(res));
+ if (error)
+ goto err_pfn_remap;
+
+ mem_hotplug_begin();
+
+ /*
+ * For device private memory we call add_pages() as we only need to
+ * allocate and initialize struct page for the device memory. More-
+ * over the device memory is un-accessible thus we do not want to
+ * create a linear mapping for the memory like arch_add_memory()
+ * would do.
+ *
+ * For all other device memory types, which are accessible by
+ * the CPU, we do want the linear mapping and thus use
+ * arch_add_memory().
+ */
+ if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
+ error = add_pages(nid, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), &restrictions);
+ } else {
+ error = kasan_add_zero_shadow(__va(res->start), resource_size(res));
+ if (error) {
+ mem_hotplug_done();
+ goto err_kasan;
+ }
+
+ error = arch_add_memory(nid, res->start, resource_size(res),
+ &restrictions);
+ }
+
+ if (!error) {
+ struct zone *zone;
+
+ zone = &NODE_DATA(nid)->node_zones[ZONE_DEVICE];
+ move_pfn_range_to_zone(zone, PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), restrictions.altmap);
+ }
+
+ mem_hotplug_done();
+ if (error)
+ goto err_add_memory;
+
+ /*
+ * Initialization of the pages has been deferred until now in order
+ * to allow us to do the work while not holding the hotplug lock.
+ */
+ memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE],
+ PHYS_PFN(res->start),
+ PHYS_PFN(resource_size(res)), pgmap);
+ percpu_ref_get_many(pgmap->ref, pfn_end(pgmap) - pfn_first(pgmap));
+ return __va(res->start);
+
+ err_add_memory:
+ kasan_remove_zero_shadow(__va(res->start), resource_size(res));
+ err_kasan:
+ untrack_pfn(NULL, PHYS_PFN(res->start), resource_size(res));
+ err_pfn_remap:
+ pgmap_array_delete(res);
+ err_array:
+ dev_pagemap_kill(pgmap);
+ dev_pagemap_cleanup(pgmap);
+ devmap_managed_enable_put();
+ return ERR_PTR(error);
+}
+EXPORT_SYMBOL_GPL(memremap_pages);
+
+/**
+ * devm_memremap_pages - remap and provide memmap backing for the given resource
+ * @dev: hosting device for @res
+ * @pgmap: pointer to a struct dev_pagemap
+ *
+ * Notes:
+ * 1/ At a minimum the res and type members of @pgmap must be initialized
+ * by the caller before passing it to this function
+ *
+ * 2/ The altmap field may optionally be initialized, in which case
+ * PGMAP_ALTMAP_VALID must be set in pgmap->flags.
+ *
+ * 3/ The ref field may optionally be provided, in which case pgmap->ref
+ * must be 'live' on entry and will be killed and reaped at
+ * devm_memremap_pages_release() time, or if this routine fails.
+ *
+ * 4/ res is expected to be a host memory range that could feasibly be
+ * treated as a "System RAM" range, i.e. not a device mmio range, but
+ * this is not enforced.
+ */
+void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ int error;
+ void *ret;
+
+ ret = memremap_pages(pgmap, dev_to_node(dev));
+ if (IS_ERR(ret))
+ return ret;
+
+ error = devm_add_action_or_reset(dev, devm_memremap_pages_release,
+ pgmap);
+ if (error)
+ return ERR_PTR(error);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(devm_memremap_pages);
+
+void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap)
+{
+ devm_release_action(dev, devm_memremap_pages_release, pgmap);
+}
+EXPORT_SYMBOL_GPL(devm_memunmap_pages);
+
+unsigned long vmem_altmap_offset(struct vmem_altmap *altmap)
+{
+ /* number of pfns from base where pfn_to_page() is valid */
+ if (altmap)
+ return altmap->reserve + altmap->free;
+ return 0;
+}
+
+void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns)
+{
+ altmap->alloc -= nr_pfns;
+}
+
+/**
+ * get_dev_pagemap() - take a new live reference on the dev_pagemap for @pfn
+ * @pfn: page frame number to lookup page_map
+ * @pgmap: optional known pgmap that already has a reference
+ *
+ * If @pgmap is non-NULL and covers @pfn it will be returned as-is. If @pgmap
+ * is non-NULL but does not cover @pfn the reference to it will be released.
+ */
+struct dev_pagemap *get_dev_pagemap(unsigned long pfn,
+ struct dev_pagemap *pgmap)
+{
+ resource_size_t phys = PFN_PHYS(pfn);
+
+ /*
+ * In the cached case we're already holding a live reference.
+ */
+ if (pgmap) {
+ if (phys >= pgmap->res.start && phys <= pgmap->res.end)
+ return pgmap;
+ put_dev_pagemap(pgmap);
+ }
+
+ /* fall back to slow path lookup */
+ rcu_read_lock();
+ pgmap = xa_load(&pgmap_array, PHYS_PFN(phys));
+ if (pgmap && !percpu_ref_tryget_live(pgmap->ref))
+ pgmap = NULL;
+ rcu_read_unlock();
+
+ return pgmap;
+}
+EXPORT_SYMBOL_GPL(get_dev_pagemap);
+
+#ifdef CONFIG_DEV_PAGEMAP_OPS
+void __put_devmap_managed_page(struct page *page)
+{
+ int count = page_ref_dec_return(page);
+
+ /*
+ * If refcount is 1 then page is freed and refcount is stable as nobody
+ * holds a reference on the page.
+ */
+ if (count == 1) {
+ /* Clear Active bit in case of parallel mark_page_accessed */
+ __ClearPageActive(page);
+ __ClearPageWaiters(page);
+
+ mem_cgroup_uncharge(page);
+
+ /*
+ * When a device_private page is freed, the page->mapping field
+ * may still contain a (stale) mapping value. For example, the
+ * lower bits of page->mapping may still identify the page as
+ * an anonymous page. Ultimately, this entire field is just
+ * stale and wrong, and it will cause errors if not cleared.
+ * One example is:
+ *
+ * migrate_vma_pages()
+ * migrate_vma_insert_page()
+ * page_add_new_anon_rmap()
+ * __page_set_anon_rmap()
+ * ...checks page->mapping, via PageAnon(page) call,
+ * and incorrectly concludes that the page is an
+ * anonymous page. Therefore, it incorrectly,
+ * silently fails to set up the new anon rmap.
+ *
+ * For other types of ZONE_DEVICE pages, migration is either
+ * handled differently or not done at all, so there is no need
+ * to clear page->mapping.
+ */
+ if (is_device_private_page(page))
+ page->mapping = NULL;
+
+ page->pgmap->ops->page_free(page);
+ } else if (!count)
+ __put_page(page);
+}
+EXPORT_SYMBOL(__put_devmap_managed_page);
+#endif /* CONFIG_DEV_PAGEMAP_OPS */
diff --git a/mm/migrate.c b/mm/migrate.c
index 84381b5..4fe45d1 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -38,6 +38,7 @@
#include <linux/hugetlb.h>
#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
+#include <linux/pagewalk.h>
#include <linux/pfn_t.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
@@ -100,7 +101,7 @@
/*
* Check PageMovable before holding a PG_lock because page's owner
* assumes anybody doesn't touch PG_lock of newly allocated page
- * so unconditionally grapping the lock ruins page's owner side.
+ * so unconditionally grabbing the lock ruins page's owner side.
*/
if (unlikely(!__PageMovable(page)))
goto out_putpage;
@@ -246,12 +247,8 @@
if (is_device_private_page(new)) {
entry = make_device_private_entry(new, pte_write(pte));
pte = swp_entry_to_pte(entry);
- } else if (is_device_public_page(new)) {
- pte = pte_mkdevmap(pte);
- flush_dcache_page(new);
}
- } else
- flush_dcache_page(new);
+ }
#ifdef CONFIG_HUGETLB_PAGE
if (PageHuge(new)) {
@@ -326,17 +323,14 @@
page = migration_entry_to_page(entry);
/*
- * Once radix-tree replacement of page migration started, page_count
- * *must* be zero. And, we don't want to call wait_on_page_locked()
- * against a page without get_page().
- * So, we use get_page_unless_zero(), here. Even failed, page fault
- * will occur again.
+ * Once page cache replacement of page migration started, page_count
+ * is zero; but we must not call put_and_wait_on_page_locked() without
+ * a ref. Use get_page_unless_zero(), and just fault again if it fails.
*/
if (!get_page_unless_zero(page))
goto out;
pte_unmap_unlock(ptep, ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
return;
out:
pte_unmap_unlock(ptep, ptl);
@@ -370,63 +364,27 @@
if (!get_page_unless_zero(page))
goto unlock;
spin_unlock(ptl);
- wait_on_page_locked(page);
- put_page(page);
+ put_and_wait_on_page_locked(page);
return;
unlock:
spin_unlock(ptl);
}
#endif
-#ifdef CONFIG_BLOCK
-/* Returns true if all buffers are successfully locked */
-static bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
+static int expected_page_refs(struct address_space *mapping, struct page *page)
{
- struct buffer_head *bh = head;
+ int expected_count = 1;
- /* Simple case, sync compaction */
- if (mode != MIGRATE_ASYNC) {
- do {
- get_bh(bh);
- lock_buffer(bh);
- bh = bh->b_this_page;
+ /*
+ * Device public or private pages have an extra refcount as they are
+ * ZONE_DEVICE pages.
+ */
+ expected_count += is_device_private_page(page);
+ if (mapping)
+ expected_count += hpage_nr_pages(page) + page_has_private(page);
- } while (bh != head);
-
- return true;
- }
-
- /* async case, we cannot block on lock_buffer so use trylock_buffer */
- do {
- get_bh(bh);
- if (!trylock_buffer(bh)) {
- /*
- * We failed to lock the buffer and cannot stall in
- * async migration. Release the taken locks
- */
- struct buffer_head *failed_bh = bh;
- put_bh(failed_bh);
- bh = head;
- while (bh != failed_bh) {
- unlock_buffer(bh);
- put_bh(bh);
- bh = bh->b_this_page;
- }
- return false;
- }
-
- bh = bh->b_this_page;
- } while (bh != head);
- return true;
+ return expected_count;
}
-#else
-static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
- enum migrate_mode mode)
-{
- return true;
-}
-#endif /* CONFIG_BLOCK */
/*
* Replace the page in the mapping.
@@ -437,21 +395,12 @@
* 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
*/
int migrate_page_move_mapping(struct address_space *mapping,
- struct page *newpage, struct page *page,
- struct buffer_head *head, enum migrate_mode mode,
- int extra_count)
+ struct page *newpage, struct page *page, int extra_count)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct zone *oldzone, *newzone;
int dirty;
- int expected_count = 1 + extra_count;
- void **pslot;
-
- /*
- * Device public or private pages have an extra refcount as they are
- * ZONE_DEVICE pages.
- */
- expected_count += is_device_private_page(page);
- expected_count += is_device_public_page(page);
+ int expected_count = expected_page_refs(mapping, page) + extra_count;
if (!mapping) {
/* Anonymous page without mapping */
@@ -470,35 +419,14 @@
oldzone = page_zone(page);
newzone = page_zone(newpage);
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- page_index(page));
-
- expected_count += hpage_nr_pages(page) + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot,
- &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
- return -EAGAIN;
- }
-
- /*
- * In the async migration case of moving a page with buffers, lock the
- * buffers using trylock before the mapping is moved. If the mapping
- * was moved, we later failed to lock the buffers and could not move
- * the mapping back due to an elevated page count, we would have to
- * block waiting on other references to be dropped.
- */
- if (mode == MIGRATE_ASYNC && head &&
- !buffer_migrate_lock_buffers(head, mode)) {
- page_ref_unfreeze(page, expected_count);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -526,16 +454,13 @@
SetPageDirty(newpage);
}
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
if (PageTransHuge(page)) {
int i;
- int index = page_index(page);
for (i = 1; i < HPAGE_PMD_NR; i++) {
- pslot = radix_tree_lookup_slot(&mapping->i_pages,
- index + i);
- radix_tree_replace_slot(&mapping->i_pages, pslot,
- newpage + i);
+ xas_next(&xas);
+ xas_store(&xas, newpage);
}
}
@@ -546,7 +471,7 @@
*/
page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
- xa_unlock(&mapping->i_pages);
+ xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
/*
@@ -586,22 +511,18 @@
int migrate_huge_page_move_mapping(struct address_space *mapping,
struct page *newpage, struct page *page)
{
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
int expected_count;
- void **pslot;
- xa_lock_irq(&mapping->i_pages);
-
- pslot = radix_tree_lookup_slot(&mapping->i_pages, page_index(page));
-
+ xas_lock_irq(&xas);
expected_count = 2 + page_has_private(page);
- if (page_count(page) != expected_count ||
- radix_tree_deref_slot_protected(pslot, &mapping->i_pages.xa_lock) != page) {
- xa_unlock_irq(&mapping->i_pages);
+ if (page_count(page) != expected_count || xas_load(&xas) != page) {
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
if (!page_ref_freeze(page, expected_count)) {
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return -EAGAIN;
}
@@ -610,11 +531,11 @@
get_page(newpage);
- radix_tree_replace_slot(&mapping->i_pages, pslot, newpage);
+ xas_store(&xas, newpage);
page_ref_unfreeze(page, expected_count - 1);
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
return MIGRATEPAGE_SUCCESS;
}
@@ -685,6 +606,8 @@
SetPageActive(newpage);
} else if (TestClearPageUnevictable(page))
SetPageUnevictable(newpage);
+ if (PageWorkingset(page))
+ SetPageWorkingset(newpage);
if (PageChecked(page))
SetPageChecked(newpage);
if (PageMappedToDisk(page))
@@ -758,7 +681,7 @@
BUG_ON(PageWriteback(page)); /* Writeback must be complete */
- rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode, 0);
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
@@ -772,34 +695,94 @@
EXPORT_SYMBOL(migrate_page);
#ifdef CONFIG_BLOCK
-/*
- * Migration function for pages with buffers. This function can only be used
- * if the underlying filesystem guarantees that no other references to "page"
- * exist.
- */
-int buffer_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page, enum migrate_mode mode)
+/* Returns true if all buffers are successfully locked */
+static bool buffer_migrate_lock_buffers(struct buffer_head *head,
+ enum migrate_mode mode)
+{
+ struct buffer_head *bh = head;
+
+ /* Simple case, sync compaction */
+ if (mode != MIGRATE_ASYNC) {
+ do {
+ lock_buffer(bh);
+ bh = bh->b_this_page;
+
+ } while (bh != head);
+
+ return true;
+ }
+
+ /* async case, we cannot block on lock_buffer so use trylock_buffer */
+ do {
+ if (!trylock_buffer(bh)) {
+ /*
+ * We failed to lock the buffer and cannot stall in
+ * async migration. Release the taken locks
+ */
+ struct buffer_head *failed_bh = bh;
+ bh = head;
+ while (bh != failed_bh) {
+ unlock_buffer(bh);
+ bh = bh->b_this_page;
+ }
+ return false;
+ }
+
+ bh = bh->b_this_page;
+ } while (bh != head);
+ return true;
+}
+
+static int __buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode,
+ bool check_refs)
{
struct buffer_head *bh, *head;
int rc;
+ int expected_count;
if (!page_has_buffers(page))
return migrate_page(mapping, newpage, page, mode);
+ /* Check whether page does not have extra refs before we do more work */
+ expected_count = expected_page_refs(mapping, page);
+ if (page_count(page) != expected_count)
+ return -EAGAIN;
+
head = page_buffers(page);
+ if (!buffer_migrate_lock_buffers(head, mode))
+ return -EAGAIN;
- rc = migrate_page_move_mapping(mapping, newpage, page, head, mode, 0);
+ if (check_refs) {
+ bool busy;
+ bool invalidated = false;
+recheck_buffers:
+ busy = false;
+ spin_lock(&mapping->private_lock);
+ bh = head;
+ do {
+ if (atomic_read(&bh->b_count)) {
+ busy = true;
+ break;
+ }
+ bh = bh->b_this_page;
+ } while (bh != head);
+ if (busy) {
+ if (invalidated) {
+ rc = -EAGAIN;
+ goto unlock_buffers;
+ }
+ spin_unlock(&mapping->private_lock);
+ invalidate_bh_lrus();
+ invalidated = true;
+ goto recheck_buffers;
+ }
+ }
+
+ rc = migrate_page_move_mapping(mapping, newpage, page, 0);
if (rc != MIGRATEPAGE_SUCCESS)
- return rc;
-
- /*
- * In the async case, migrate_page_move_mapping locked the buffers
- * with an IRQ-safe spinlock held. In the sync case, the buffers
- * need to be locked now
- */
- if (mode != MIGRATE_ASYNC)
- BUG_ON(!buffer_migrate_lock_buffers(head, mode));
+ goto unlock_buffers;
ClearPagePrivate(page);
set_page_private(newpage, page_private(page));
@@ -821,17 +804,43 @@
else
migrate_page_states(newpage, page);
+ rc = MIGRATEPAGE_SUCCESS;
+unlock_buffers:
+ if (check_refs)
+ spin_unlock(&mapping->private_lock);
bh = head;
do {
unlock_buffer(bh);
- put_bh(bh);
bh = bh->b_this_page;
} while (bh != head);
- return MIGRATEPAGE_SUCCESS;
+ return rc;
+}
+
+/*
+ * Migration function for pages with buffers. This function can only be used
+ * if the underlying filesystem guarantees that no other references to "page"
+ * exist. For example attached buffer heads are accessed only under page lock.
+ */
+int buffer_migrate_page(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, false);
}
EXPORT_SYMBOL(buffer_migrate_page);
+
+/*
+ * Same as above except that this variant is more careful and checks that there
+ * are also no buffer head references. This function is the right one for
+ * mappings where buffer heads are directly looked up and referenced (such as
+ * block device mappings).
+ */
+int buffer_migrate_page_norefs(struct address_space *mapping,
+ struct page *newpage, struct page *page, enum migrate_mode mode)
+{
+ return __buffer_migrate_page(mapping, newpage, page, mode, true);
+}
#endif
/*
@@ -899,7 +908,7 @@
*/
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL))
- return -EAGAIN;
+ return mode == MIGRATE_SYNC ? -EAGAIN : -EBUSY;
return migrate_page(mapping, newpage, page, mode);
}
@@ -983,6 +992,10 @@
*/
if (!PageMappingFlags(page))
page->mapping = NULL;
+
+ if (likely(!is_zone_device_page(newpage)))
+ flush_dcache_page(newpage);
+
}
out:
return rc;
@@ -1118,10 +1131,13 @@
* If migration is successful, decrease refcount of the newpage
* which will not free the page because new page owner increased
* refcounter. As well, if it is LRU page, add the page to LRU
- * list in here.
+ * list in here. Use the old state of the isolated source page to
+ * determine if we migrated a LRU page. newpage was already unlocked
+ * and possibly modified by its owner - don't rely on the page
+ * state.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- if (unlikely(__PageMovable(newpage)))
+ if (unlikely(!is_lru))
put_page(newpage);
else
putback_lru_page(newpage);
@@ -1272,7 +1288,7 @@
struct anon_vma *anon_vma = NULL;
/*
- * Movability of hugepages depends on architectures and hugepage size.
+ * Migratability of hugepages depends on architectures and their size.
* This check is necessary because some callers of hugepage migration
* like soft offline and memory hotremove don't walk through page
* tables or check whether the hugepage is pmd-based or not before
@@ -1300,6 +1316,16 @@
lock_page(hpage);
}
+ /*
+ * Check for pages which are in the process of being freed. Without
+ * page_mapping() set, hugetlbfs specific move page routine will not
+ * be called and we could leak usage counts for subpools.
+ */
+ if (page_private(hpage) && !page_mapping(hpage)) {
+ rc = -EBUSY;
+ goto out_unlock;
+ }
+
if (PageAnon(hpage))
anon_vma = page_get_anon_vma(hpage);
@@ -1330,6 +1356,7 @@
put_new_page = NULL;
}
+out_unlock:
unlock_page(hpage);
out:
if (rc != -EAGAIN)
@@ -1585,7 +1612,7 @@
goto out_flush;
if (get_user(node, nodes + i))
goto out_flush;
- addr = (unsigned long)p;
+ addr = (unsigned long)untagged_addr(p);
err = -ENODEV;
if (node < 0 || node >= MAX_NUMNODES)
@@ -1865,7 +1892,7 @@
VM_BUG_ON_PAGE(compound_order(page) && !PageTransHuge(page), page);
/* Avoid migrating to a node that is nearly full */
- if (!migrate_balanced_pgdat(pgdat, 1UL << compound_order(page)))
+ if (!migrate_balanced_pgdat(pgdat, compound_nr(page)))
return 0;
if (isolate_lru_page(page))
@@ -1973,8 +2000,7 @@
int isolated = 0;
struct page *new_page = NULL;
int page_lru = page_is_file_cache(page);
- unsigned long mmun_start = address & HPAGE_PMD_MASK;
- unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
+ unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
@@ -1997,15 +2023,15 @@
/* anon mapping, we can simply copy page->mapping to the new page: */
new_page->mapping = page->mapping;
new_page->index = page->index;
+ /* flush the cache before copying using the kernel virtual address */
+ flush_cache_range(vma, start, start + HPAGE_PMD_SIZE);
migrate_page_copy(new_page, page);
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
ptl = pmd_lock(mm, pmd);
if (unlikely(!pmd_same(*pmd, entry) || !page_ref_freeze(page, 2))) {
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -2029,16 +2055,26 @@
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
/*
- * Clear the old entry under pagetable lock and establish the new PTE.
- * Any parallel GUP will either observe the old page blocking on the
- * page lock, block on the page table lock or observe the new page.
- * The SetPageUptodate on the new page and page_add_new_anon_rmap
- * guarantee the copy is visible before the pagetable update.
+ * Overwrite the old entry under pagetable lock and establish
+ * the new PTE. Any parallel GUP will either observe the old
+ * page blocking on the page lock, block on the page table
+ * lock or observe the new page. The SetPageUptodate on the
+ * new page and page_add_new_anon_rmap guarantee the copy is
+ * visible before the pagetable update.
*/
- flush_cache_range(vma, mmun_start, mmun_end);
- page_add_anon_rmap(new_page, vma, mmun_start, true);
- pmdp_huge_clear_flush_notify(vma, mmun_start, pmd);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ page_add_anon_rmap(new_page, vma, start, true);
+ /*
+ * At this point the pmd is numa/protnone (i.e. non present) and the TLB
+ * has already been flushed globally. So no TLB can be currently
+ * caching this non present pmd mapping. There's no need to clear the
+ * pmd before doing set_pmd_at(), nor to flush the TLB after
+ * set_pmd_at(). Clearing the pmd here would introduce a race
+ * condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
+ * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
+ * pmd.
+ */
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
page_ref_unfreeze(page, 2);
@@ -2047,11 +2083,6 @@
set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED);
spin_unlock(ptl);
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end);
/* Take an "isolate" reference and put new page on the LRU. */
get_page(new_page);
@@ -2075,7 +2106,7 @@
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
entry = pmd_modify(entry, vma->vm_page_prot);
- set_pmd_at(mm, mmun_start, pmd, entry);
+ set_pmd_at(mm, start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
spin_unlock(ptl);
@@ -2089,17 +2120,7 @@
#endif /* CONFIG_NUMA */
-#if defined(CONFIG_MIGRATE_VMA_HELPER)
-struct migrate_vma {
- struct vm_area_struct *vma;
- unsigned long *dst;
- unsigned long *src;
- unsigned long cpages;
- unsigned long npages;
- unsigned long start;
- unsigned long end;
-};
-
+#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
struct mm_walk *walk)
@@ -2197,17 +2218,15 @@
pte_t pte;
pte = *ptep;
- pfn = pte_pfn(pte);
if (pte_none(pte)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
if (!pte_present(pte)) {
- mpfn = pfn = 0;
+ mpfn = 0;
/*
* Only care about unaddressable device page special
@@ -2219,28 +2238,27 @@
goto next;
page = device_private_entry_to_page(entry);
- mpfn = migrate_pfn(page_to_pfn(page))|
- MIGRATE_PFN_DEVICE | MIGRATE_PFN_MIGRATE;
+ mpfn = migrate_pfn(page_to_pfn(page)) |
+ MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
migrate->cpages++;
- pfn = 0;
goto next;
}
- page = _vm_normal_page(migrate->vma, addr, pte, true);
+ page = vm_normal_page(migrate->vma, addr, pte);
mpfn = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE;
mpfn |= pte_write(pte) ? MIGRATE_PFN_WRITE : 0;
}
/* FIXME support THP */
if (!page || !page->mapping || PageTransCompound(page)) {
- mpfn = pfn = 0;
+ mpfn = 0;
goto next;
}
- pfn = page_to_pfn(page);
/*
* By getting a reference on the page we pin it and that blocks
@@ -2299,6 +2317,11 @@
return 0;
}
+static const struct mm_walk_ops migrate_vma_walk_ops = {
+ .pmd_entry = migrate_vma_collect_pmd,
+ .pte_hole = migrate_vma_collect_hole,
+};
+
/*
* migrate_vma_collect() - collect pages over a range of virtual addresses
* @migrate: migrate struct containing all migration information
@@ -2309,25 +2332,16 @@
*/
static void migrate_vma_collect(struct migrate_vma *migrate)
{
- struct mm_walk mm_walk;
+ struct mmu_notifier_range range;
- mm_walk.pmd_entry = migrate_vma_collect_pmd;
- mm_walk.pte_entry = NULL;
- mm_walk.pte_hole = migrate_vma_collect_hole;
- mm_walk.hugetlb_entry = NULL;
- mm_walk.test_walk = NULL;
- mm_walk.vma = migrate->vma;
- mm_walk.mm = migrate->vma->vm_mm;
- mm_walk.private = migrate;
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
+ migrate->vma->vm_mm, migrate->start, migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
- mmu_notifier_invalidate_range_start(mm_walk.mm,
- migrate->start,
- migrate->end);
- walk_page_range(migrate->start, migrate->end, &mm_walk);
- mmu_notifier_invalidate_range_end(mm_walk.mm,
- migrate->start,
- migrate->end);
+ walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
+ &migrate_vma_walk_ops, migrate);
+ mmu_notifier_invalidate_range_end(&range);
migrate->end = migrate->start + (migrate->npages << PAGE_SHIFT);
}
@@ -2371,16 +2385,7 @@
* FIXME proper solution is to rework migration_entry_wait() so
* it does not need to take a reference on page.
*/
- if (is_device_private_page(page))
- return true;
-
- /*
- * Only allow device public page to be migrated and account for
- * the extra reference count imply by ZONE_DEVICE pages.
- */
- if (!is_device_public_page(page))
- return false;
- extra++;
+ return is_device_private_page(page);
}
/* For file back page */
@@ -2559,6 +2564,110 @@
}
}
+/**
+ * migrate_vma_setup() - prepare to migrate a range of memory
+ * @args: contains the vma, start, and pfns arrays for the migration
+ *
+ * Returns: negative errno on failures, 0 when 0 or more pages were migrated
+ * without an error.
+ *
+ * Prepare to migrate a range of virtual memory addresses by collecting all
+ * the pages backing each virtual address in the range, saving them inside the
+ * src array. Then lock those pages and unmap them. Once the pages are locked
+ * and unmapped, check whether each page is pinned or not. Pages that aren't
+ * pinned have the MIGRATE_PFN_MIGRATE flag set (by this function) in the
+ * corresponding src array entry. Then restore any pages that are pinned, by
+ * remapping and unlocking those pages.
+ *
+ * The caller should then allocate destination memory and copy source memory to
+ * it for all those entries (ie with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE
+ * flag set). Once these are allocated and copied, the caller must update each
+ * corresponding entry in the dst array with the pfn value of the destination
+ * page and with the MIGRATE_PFN_VALID and MIGRATE_PFN_LOCKED flags set
+ * (destination pages must have their struct pages locked, via lock_page()).
+ *
+ * Note that the caller does not have to migrate all the pages that are marked
+ * with MIGRATE_PFN_MIGRATE flag in src array unless this is a migration from
+ * device memory to system memory. If the caller cannot migrate a device page
+ * back to system memory, then it must return VM_FAULT_SIGBUS, which has severe
+ * consequences for the userspace process, so it must be avoided if at all
+ * possible.
+ *
+ * For empty entries inside the CPU page table (pte_none() or pmd_none() is
+ * true) we do set the MIGRATE_PFN_MIGRATE flag inside the corresponding source
+ * array entry, thus allowing the caller to allocate device memory for those
+ * unbacked virtual addresses. For this the caller simply has to allocate
+ * device memory and properly set the destination entry like for regular
+ * migration. Note that this can still fail, and thus the device driver must
+ * check if the migration was successful for those entries after calling
+ * migrate_vma_pages(), just like for regular migration.
+ *
+ * After that, the caller must call migrate_vma_pages() to go over each entry
+ * in the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
+ * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
+ * then migrate_vma_pages() tries to migrate struct page information from the
+ * source struct page to the destination struct page. If it fails to migrate
+ * the struct page information, then it clears the MIGRATE_PFN_MIGRATE flag in
+ * the src array.
+ *
+ * At this point all successfully migrated pages have an entry in the src
+ * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
+ * array entry with MIGRATE_PFN_VALID flag set.
+ *
+ * Once migrate_vma_pages() returns the caller may inspect which pages were
+ * successfully migrated, and which were not. Successfully migrated pages will
+ * have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
+ *
+ * It is safe to update device page table after migrate_vma_pages() because
+ * both destination and source page are still locked, and the mmap_sem is held
+ * in read mode (hence no one can unmap the range being migrated).
+ *
+ * Once the caller is done cleaning up things and updating its page table (if it
+ * chose to do so, this is not an obligation) it finally calls
+ * migrate_vma_finalize() to update the CPU page table to point to new pages
+ * for successfully migrated pages or otherwise restore the CPU page table to
+ * point to the original source pages.
+ */
+int migrate_vma_setup(struct migrate_vma *args)
+{
+ long nr_pages = (args->end - args->start) >> PAGE_SHIFT;
+
+ args->start &= PAGE_MASK;
+ args->end &= PAGE_MASK;
+ if (!args->vma || is_vm_hugetlb_page(args->vma) ||
+ (args->vma->vm_flags & VM_SPECIAL) || vma_is_dax(args->vma))
+ return -EINVAL;
+ if (nr_pages <= 0)
+ return -EINVAL;
+ if (args->start < args->vma->vm_start ||
+ args->start >= args->vma->vm_end)
+ return -EINVAL;
+ if (args->end <= args->vma->vm_start || args->end > args->vma->vm_end)
+ return -EINVAL;
+ if (!args->src || !args->dst)
+ return -EINVAL;
+
+ memset(args->src, 0, sizeof(*args->src) * nr_pages);
+ args->cpages = 0;
+ args->npages = 0;
+
+ migrate_vma_collect(args);
+
+ if (args->cpages)
+ migrate_vma_prepare(args);
+ if (args->cpages)
+ migrate_vma_unmap(args);
+
+ /*
+ * At this point pages are locked and unmapped, and thus they have
+ * stable content and can safely be copied to destination memory that
+ * is allocated by the drivers.
+ */
+ return 0;
+
+}
+EXPORT_SYMBOL(migrate_vma_setup);
+
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2605,7 +2714,7 @@
*
* Here we only have down_read(mmap_sem).
*/
- if (pte_alloc(mm, pmdp, addr))
+ if (pte_alloc(mm, pmdp))
goto abort;
/* See the comment in pte_alloc_one_map() */
@@ -2630,11 +2739,6 @@
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
- } else if (is_device_public_page(page)) {
- entry = pte_mkold(mk_pte(page, READ_ONCE(vma->vm_page_prot)));
- if (vma->vm_flags & VM_WRITE)
- entry = pte_mkwrite(pte_mkdirty(entry));
- entry = pte_mkdevmap(entry);
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
@@ -2695,7 +2799,7 @@
*src &= ~MIGRATE_PFN_MIGRATE;
}
-/*
+/**
* migrate_vma_pages() - migrate meta-data from src page to dst page
* @migrate: migrate struct containing all migration information
*
@@ -2703,13 +2807,12 @@
* struct page. This effectively finishes the migration from source page to the
* destination page.
*/
-static void migrate_vma_pages(struct migrate_vma *migrate)
+void migrate_vma_pages(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
- struct vm_area_struct *vma = migrate->vma;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long addr, i, mmu_start;
+ struct mmu_notifier_range range;
+ unsigned long addr, i;
bool notified = false;
for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) {
@@ -2728,11 +2831,14 @@
continue;
}
if (!notified) {
- mmu_start = addr;
notified = true;
- mmu_notifier_invalidate_range_start(mm,
- mmu_start,
- migrate->end);
+
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_CLEAR, 0,
+ NULL,
+ migrate->vma->vm_mm,
+ addr, migrate->end);
+ mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
&migrate->src[i],
@@ -2752,7 +2858,7 @@
migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
continue;
}
- } else if (!is_device_public_page(newpage)) {
+ } else {
/*
* Other types of ZONE_DEVICE page are not
* supported.
@@ -2773,11 +2879,11 @@
* did already call it.
*/
if (notified)
- mmu_notifier_invalidate_range_only_end(mm, mmu_start,
- migrate->end);
+ mmu_notifier_invalidate_range_only_end(&range);
}
+EXPORT_SYMBOL(migrate_vma_pages);
-/*
+/**
* migrate_vma_finalize() - restore CPU page table entry
* @migrate: migrate struct containing all migration information
*
@@ -2788,7 +2894,7 @@
* This also unlocks the pages and puts them back on the lru, or drops the extra
* refcount, for device pages.
*/
-static void migrate_vma_finalize(struct migrate_vma *migrate)
+void migrate_vma_finalize(struct migrate_vma *migrate)
{
const unsigned long npages = migrate->npages;
unsigned long i;
@@ -2831,124 +2937,5 @@
}
}
}
-
-/*
- * migrate_vma() - migrate a range of memory inside vma
- *
- * @ops: migration callback for allocating destination memory and copying
- * @vma: virtual memory area containing the range to be migrated
- * @start: start address of the range to migrate (inclusive)
- * @end: end address of the range to migrate (exclusive)
- * @src: array of hmm_pfn_t containing source pfns
- * @dst: array of hmm_pfn_t containing destination pfns
- * @private: pointer passed back to each of the callback
- * Returns: 0 on success, error code otherwise
- *
- * This function tries to migrate a range of memory virtual address range, using
- * callbacks to allocate and copy memory from source to destination. First it
- * collects all the pages backing each virtual address in the range, saving this
- * inside the src array. Then it locks those pages and unmaps them. Once the pages
- * are locked and unmapped, it checks whether each page is pinned or not. Pages
- * that aren't pinned have the MIGRATE_PFN_MIGRATE flag set (by this function)
- * in the corresponding src array entry. It then restores any pages that are
- * pinned, by remapping and unlocking those pages.
- *
- * At this point it calls the alloc_and_copy() callback. For documentation on
- * what is expected from that callback, see struct migrate_vma_ops comments in
- * include/linux/migrate.h
- *
- * After the alloc_and_copy() callback, this function goes over each entry in
- * the src array that has the MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag
- * set. If the corresponding entry in dst array has MIGRATE_PFN_VALID flag set,
- * then the function tries to migrate struct page information from the source
- * struct page to the destination struct page. If it fails to migrate the struct
- * page information, then it clears the MIGRATE_PFN_MIGRATE flag in the src
- * array.
- *
- * At this point all successfully migrated pages have an entry in the src
- * array with MIGRATE_PFN_VALID and MIGRATE_PFN_MIGRATE flag set and the dst
- * array entry with MIGRATE_PFN_VALID flag set.
- *
- * It then calls the finalize_and_map() callback. See comments for "struct
- * migrate_vma_ops", in include/linux/migrate.h for details about
- * finalize_and_map() behavior.
- *
- * After the finalize_and_map() callback, for successfully migrated pages, this
- * function updates the CPU page table to point to new pages, otherwise it
- * restores the CPU page table to point to the original source pages.
- *
- * Function returns 0 after the above steps, even if no pages were migrated
- * (The function only returns an error if any of the arguments are invalid.)
- *
- * Both src and dst array must be big enough for (end - start) >> PAGE_SHIFT
- * unsigned long entries.
- */
-int migrate_vma(const struct migrate_vma_ops *ops,
- struct vm_area_struct *vma,
- unsigned long start,
- unsigned long end,
- unsigned long *src,
- unsigned long *dst,
- void *private)
-{
- struct migrate_vma migrate;
-
- /* Sanity check the arguments */
- start &= PAGE_MASK;
- end &= PAGE_MASK;
- if (!vma || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL) ||
- vma_is_dax(vma))
- return -EINVAL;
- if (start < vma->vm_start || start >= vma->vm_end)
- return -EINVAL;
- if (end <= vma->vm_start || end > vma->vm_end)
- return -EINVAL;
- if (!ops || !src || !dst || start >= end)
- return -EINVAL;
-
- memset(src, 0, sizeof(*src) * ((end - start) >> PAGE_SHIFT));
- migrate.src = src;
- migrate.dst = dst;
- migrate.start = start;
- migrate.npages = 0;
- migrate.cpages = 0;
- migrate.end = end;
- migrate.vma = vma;
-
- /* Collect, and try to unmap source pages */
- migrate_vma_collect(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Lock and isolate page */
- migrate_vma_prepare(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /* Unmap pages */
- migrate_vma_unmap(&migrate);
- if (!migrate.cpages)
- return 0;
-
- /*
- * At this point pages are locked and unmapped, and thus they have
- * stable content and can safely be copied to destination memory that
- * is allocated by the callback.
- *
- * Note that migration can fail in migrate_vma_struct_page() for each
- * individual page.
- */
- ops->alloc_and_copy(vma, src, dst, start, end, private);
-
- /* This does the real migration of struct page */
- migrate_vma_pages(&migrate);
-
- ops->finalize_and_map(vma, src, dst, start, end, private);
-
- /* Unlock and remap pages */
- migrate_vma_finalize(&migrate);
-
- return 0;
-}
-EXPORT_SYMBOL(migrate_vma);
-#endif /* defined(MIGRATE_VMA_HELPER) */
+EXPORT_SYMBOL(migrate_vma_finalize);
+#endif /* CONFIG_DEVICE_PRIVATE */
diff --git a/mm/mincore.c b/mm/mincore.c
index fc37afe..49b6fa2 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -10,7 +10,7 @@
*/
#include <linux/pagemap.h>
#include <linux/gfp.h>
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/mman.h>
#include <linux/syscalls.h>
#include <linux/swap.h>
@@ -66,10 +66,18 @@
* shmem/tmpfs may return swap: account for swapcache
* page too.
*/
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- page = find_get_page(swap_address_space(swp),
- swp_offset(swp));
+ struct swap_info_struct *si;
+
+ /* Prevent the swap device from being swapped off under us */
+ si = get_swap_device(swp);
+ if (si) {
+ page = find_get_page(swap_address_space(swp),
+ swp_offset(swp));
+ put_swap_device(si);
+ } else
+ page = NULL;
}
} else
page = find_get_page(mapping, pgoff);
@@ -169,6 +177,28 @@
return 0;
}
+static inline bool can_do_mincore(struct vm_area_struct *vma)
+{
+ if (vma_is_anonymous(vma))
+ return true;
+ if (!vma->vm_file)
+ return false;
+ /*
+ * Reveal pagecache information only for non-anonymous mappings that
+ * correspond to the files the calling process could (if tried) open
+ * for writing; otherwise we'd be including shared non-exclusive
+ * mappings, which opens a side channel.
+ */
+ return inode_owner_or_capable(file_inode(vma->vm_file)) ||
+ inode_permission(file_inode(vma->vm_file), MAY_WRITE) == 0;
+}
+
+static const struct mm_walk_ops mincore_walk_ops = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+};
+
/*
* Do a chunk of "sys_mincore()". We've already checked
* all the arguments, we hold the mmap semaphore: we should
@@ -179,19 +209,17 @@
struct vm_area_struct *vma;
unsigned long end;
int err;
- struct mm_walk mincore_walk = {
- .pmd_entry = mincore_pte_range,
- .pte_hole = mincore_unmapped_range,
- .hugetlb_entry = mincore_hugetlb,
- .private = vec,
- };
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
- mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
- err = walk_page_range(addr, end, &mincore_walk);
+ if (!can_do_mincore(vma)) {
+ unsigned long pages = DIV_ROUND_UP(end - addr, PAGE_SIZE);
+ memset(vec, 1, pages);
+ return pages;
+ }
+ err = walk_page_range(vma->vm_mm, addr, end, &mincore_walk_ops, vec);
if (err < 0)
return err;
return (end - addr) >> PAGE_SHIFT;
@@ -228,19 +256,21 @@
unsigned long pages;
unsigned char *tmp;
+ start = untagged_addr(start);
+
/* Check the start address: needs to be page-aligned.. */
if (start & ~PAGE_MASK)
return -EINVAL;
/* ..and we need to be passed a valid user-space range */
- if (!access_ok(VERIFY_READ, (void __user *) start, len))
+ if (!access_ok((void __user *) start, len))
return -ENOMEM;
/* This also avoids any overflows on PAGE_ALIGN */
pages = len >> PAGE_SHIFT;
pages += (offset_in_page(len)) != 0;
- if (!access_ok(VERIFY_WRITE, vec, pages))
+ if (!access_ok(vec, pages))
return -EFAULT;
tmp = (void *) __get_free_page(GFP_USER);
diff --git a/mm/mlock.c b/mm/mlock.c
index 41cc47e..a72c1ee 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -182,7 +182,7 @@
unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
@@ -194,7 +194,7 @@
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
if (!TestClearPageMlocked(page)) {
/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
@@ -203,17 +203,17 @@
}
nr_pages = hpage_nr_pages(page);
- __mod_zone_page_state(zone, NR_MLOCK, -nr_pages);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);
unlock_out:
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
out:
return nr_pages - 1;
@@ -298,7 +298,7 @@
pagevec_init(&pvec_putback);
/* Phase 1: page isolation */
- spin_lock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -325,7 +325,7 @@
pvec->pages[i] = NULL;
}
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&zone->zone_pgdat->lru_lock);
/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
@@ -636,11 +636,11 @@
* is also counted.
* Return value: previously mlocked page counts
*/
-static int count_mm_mlocked_page_nr(struct mm_struct *mm,
+static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm,
unsigned long start, size_t len)
{
struct vm_area_struct *vma;
- int count = 0;
+ unsigned long count = 0;
if (mm == NULL)
mm = current->mm;
@@ -674,6 +674,8 @@
unsigned long lock_limit;
int error = -ENOMEM;
+ start = untagged_addr(start);
+
if (!can_do_mlock())
return -EPERM;
@@ -735,6 +737,8 @@
{
int ret;
+ start = untagged_addr(start);
+
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
@@ -797,7 +801,8 @@
unsigned long lock_limit;
int ret;
- if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)))
+ if (!flags || (flags & ~(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT)) ||
+ flags == MCL_ONFAULT)
return -EINVAL;
if (!can_do_mlock())
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 6838a53..5c91838 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm_init.c - Memory initialisation verification and debugging
*
@@ -146,7 +147,7 @@
s32 batch = max_t(s32, nr*2, 32);
/* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff);
+ memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
diff --git a/mm/mmap.c b/mm/mmap.c
index f7cd9cb..a7d8c84 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/mmap.c
*
@@ -45,6 +46,7 @@
#include <linux/moduleparam.h>
#include <linux/pkeys.h>
#include <linux/oom.h>
+#include <linux/sched/mm.h>
#include <linux/uaccess.h>
#include <asm/cacheflush.h>
@@ -191,16 +193,21 @@
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long retval;
- unsigned long newbrk, oldbrk;
+ unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
struct vm_area_struct *next;
unsigned long min_brk;
bool populate;
+ bool downgraded = false;
LIST_HEAD(uf);
+ brk = untagged_addr(brk);
+
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
+ origbrk = mm->brk;
+
#ifdef CONFIG_COMPAT_BRK
/*
* CONFIG_COMPAT_BRK can still be overridden by setting
@@ -229,14 +236,32 @@
newbrk = PAGE_ALIGN(brk);
oldbrk = PAGE_ALIGN(mm->brk);
- if (oldbrk == newbrk)
- goto set_brk;
+ if (oldbrk == newbrk) {
+ mm->brk = brk;
+ goto success;
+ }
- /* Always allow shrinking brk. */
+ /*
+ * Always allow shrinking brk.
+ * __do_munmap() may downgrade mmap_sem to read.
+ */
if (brk <= mm->brk) {
- if (!do_munmap(mm, newbrk, oldbrk-newbrk, &uf))
- goto set_brk;
- goto out;
+ int ret;
+
+ /*
+ * mm->brk must be protected by write mmap_sem so update it
+ * before downgrading mmap_sem. When __do_munmap() fails,
+ * mm->brk will be restored from origbrk.
+ */
+ mm->brk = brk;
+ ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
+ if (ret < 0) {
+ mm->brk = origbrk;
+ goto out;
+ } else if (ret == 1) {
+ downgraded = true;
+ }
+ goto success;
}
/* Check against existing mmap mappings. */
@@ -247,25 +272,28 @@
/* Ok, looks good - let it rip. */
if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
goto out;
-
-set_brk:
mm->brk = brk;
+
+success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
- up_write(&mm->mmap_sem);
+ if (downgraded)
+ up_read(&mm->mmap_sem);
+ else
+ up_write(&mm->mmap_sem);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
return brk;
out:
- retval = mm->brk;
+ retval = origbrk;
up_write(&mm->mmap_sem);
return retval;
}
-static long vma_compute_subtree_gap(struct vm_area_struct *vma)
+static inline unsigned long vma_compute_gap(struct vm_area_struct *vma)
{
- unsigned long max, prev_end, subtree_gap;
+ unsigned long gap, prev_end;
/*
* Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
@@ -273,14 +301,21 @@
* an unmapped area; whereas when expanding we only require one.
* That's a little inconsistent, but keeps the code here simpler.
*/
- max = vm_start_gap(vma);
+ gap = vm_start_gap(vma);
if (vma->vm_prev) {
prev_end = vm_end_gap(vma->vm_prev);
- if (max > prev_end)
- max -= prev_end;
+ if (gap > prev_end)
+ gap -= prev_end;
else
- max = 0;
+ gap = 0;
}
+ return gap;
+}
+
+#ifdef CONFIG_DEBUG_VM_RB
+static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma)
+{
+ unsigned long max = vma_compute_gap(vma), subtree_gap;
if (vma->vm_rb.rb_left) {
subtree_gap = rb_entry(vma->vm_rb.rb_left,
struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -296,7 +331,6 @@
return max;
}
-#ifdef CONFIG_DEBUG_VM_RB
static int browse_rb(struct mm_struct *mm)
{
struct rb_root *root = &mm->mm_rb;
@@ -402,8 +436,9 @@
#define validate_mm(mm) do { } while (0)
#endif
-RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
+RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks,
+ struct vm_area_struct, vm_rb,
+ unsigned long, rb_subtree_gap, vma_compute_gap)
/*
* Update augmented rbtree rb_subtree_gap values after vma->vm_start or
@@ -413,8 +448,8 @@
static void vma_gap_update(struct vm_area_struct *vma)
{
/*
- * As it turns out, RB_DECLARE_CALLBACKS() already created a callback
- * function that does exacltly what we want.
+ * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created
+ * a callback function that does exactly what we want.
*/
vma_gap_callbacks_propagate(&vma->vm_rb, NULL);
}
@@ -988,7 +1023,7 @@
* VM_SOFTDIRTY should not prevent from VMA merging, if we
* match the flags but dirty bit -- the caller should mark
* merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressue on the memory system forcing
+ * comparison, we increase pressure on the memory system forcing
* the kernel to generate new VMAs when old one could be
* extended instead.
*/
@@ -1091,7 +1126,7 @@
* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
* might become case 1 below case 2 below case 3 below
*
- * It is important for case 8 that the the vma NNNN overlapping the
+ * It is important for case 8 that the vma NNNN overlapping the
* region AAAA is never going to extended over XXXX. Instead XXXX must
* be extended in region AAAA and NNNN must be removed. This way in
* all cases where vma_merge succeeds, the moment vma_adjust drops the
@@ -1332,6 +1367,9 @@
if (S_ISBLK(inode->i_mode))
return MAX_LFS_FILESIZE;
+ if (S_ISSOCK(inode->i_mode))
+ return MAX_LFS_FILESIZE;
+
/* Special "we do even unsigned file positions" case */
if (file->f_mode & FMODE_UNSIGNED_OFFSET)
return 0;
@@ -1457,8 +1495,12 @@
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
- if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
- return -EACCES;
+ if (prot & PROT_WRITE) {
+ if (!(file->f_mode & FMODE_WRITE))
+ return -EACCES;
+ if (IS_SWAPFILE(file->f_mapping->host))
+ return -ETXTBSY;
+ }
/*
* Make sure we don't allow writing to an append-only
@@ -1547,6 +1589,8 @@
struct file *file = NULL;
unsigned long retval;
+ addr = untagged_addr(addr);
+
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
@@ -1621,7 +1665,7 @@
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
/*
- * Some shared mappigns will want the pages marked read-only
+ * Some shared mappings will want the pages marked read-only
* to track write events. If so, we'll downgrade vm_page_prot
* to the private version (using protection_map[] without the
* VM_SHARED bit).
@@ -2042,6 +2086,15 @@
return gap_end;
}
+
+#ifndef arch_get_mmap_end
+#define arch_get_mmap_end(addr) (TASK_SIZE)
+#endif
+
+#ifndef arch_get_mmap_base
+#define arch_get_mmap_base(addr, base) (base)
+#endif
+
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -2061,8 +2114,9 @@
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2071,7 +2125,7 @@
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2080,7 +2134,7 @@
info.flags = 0;
info.length = len;
info.low_limit = mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
info.align_mask = 0;
return vm_unmapped_area(&info);
}
@@ -2092,17 +2146,17 @@
*/
#ifndef HAVE_ARCH_UNMAPPED_AREA_TOPDOWN
unsigned long
-arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
- const unsigned long len, const unsigned long pgoff,
- const unsigned long flags)
+arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
- unsigned long addr = addr0;
struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > mmap_end - mmap_min_addr)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -2112,7 +2166,7 @@
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma_prev(mm, addr, &prev);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (mmap_end - len >= addr && addr >= mmap_min_addr &&
(!vma || addr + len <= vm_start_gap(vma)) &&
(!prev || addr >= vm_end_gap(prev)))
return addr;
@@ -2121,7 +2175,7 @@
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -2135,7 +2189,7 @@
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
+ info.high_limit = mmap_end;
addr = vm_unmapped_area(&info);
}
@@ -2234,12 +2288,9 @@
if (vma) {
*pprev = vma->vm_prev;
} else {
- struct rb_node *rb_node = mm->mm_rb.rb_node;
- *pprev = NULL;
- while (rb_node) {
- *pprev = rb_entry(rb_node, struct vm_area_struct, vm_rb);
- rb_node = rb_node->rb_right;
- }
+ struct rb_node *rb_node = rb_last(&mm->mm_rb);
+
+ *pprev = rb_node ? rb_entry(rb_node, struct vm_area_struct, vm_rb) : NULL;
}
return vma;
}
@@ -2391,12 +2442,11 @@
{
struct mm_struct *mm = vma->vm_mm;
struct vm_area_struct *prev;
- int error;
+ int error = 0;
address &= PAGE_MASK;
- error = security_mmap_addr(address);
- if (error)
- return error;
+ if (address < mmap_min_addr)
+ return -EPERM;
/* Enforce stack_guard_gap */
prev = vma->vm_prev;
@@ -2492,7 +2542,8 @@
vma = find_vma_prev(mm, addr, &prev);
if (vma && (vma->vm_start <= addr))
return vma;
- if (!prev || expand_stack(prev, addr))
+ /* don't alter vm_end if the coredump is running */
+ if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr))
return NULL;
if (prev->vm_flags & VM_LOCKED)
populate_vma_page_range(prev, addr, prev->vm_end, NULL);
@@ -2518,6 +2569,9 @@
return vma;
if (!(vma->vm_flags & VM_GROWSDOWN))
return NULL;
+ /* don't alter vm_start if the coredump is running */
+ if (!mmget_still_valid(mm))
+ return NULL;
start = vma->vm_start;
if (expand_stack(vma, addr))
return NULL;
@@ -2687,8 +2741,8 @@
* work. This now handles partial unmappings.
* Jeremy Fitzhardinge <jeremy@goop.org>
*/
-int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
- struct list_head *uf)
+int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf, bool downgrade)
{
unsigned long end;
struct vm_area_struct *vma, *prev, *last;
@@ -2697,9 +2751,17 @@
return -EINVAL;
len = PAGE_ALIGN(len);
+ end = start + len;
if (len == 0)
return -EINVAL;
+ /*
+ * arch_unmap() might do unmaps itself. It must be called
+ * and finish any rbtree manipulation before this code
+ * runs and also starts to manipulate the rbtree.
+ */
+ arch_unmap(mm, start, end);
+
/* Find the first overlapping VMA */
vma = find_vma(mm, start);
if (!vma)
@@ -2708,7 +2770,6 @@
/* we have start < vma->vm_end */
/* if it doesn't overlap, we have nothing.. */
- end = start + len;
if (vma->vm_start >= end)
return 0;
@@ -2770,25 +2831,32 @@
mm->locked_vm -= vma_pages(tmp);
munlock_vma_pages_all(tmp);
}
+
tmp = tmp->vm_next;
}
}
- /*
- * Remove the vma's, and unmap the actual pages
- */
+ /* Detach vmas from rbtree */
detach_vmas_to_be_unmapped(mm, vma, prev, end);
- unmap_region(mm, vma, prev, start, end);
- arch_unmap(mm, vma, start, end);
+ if (downgrade)
+ downgrade_write(&mm->mmap_sem);
+
+ unmap_region(mm, vma, prev, start, end);
/* Fix up all other VM information */
remove_vma_list(mm, vma);
- return 0;
+ return downgrade ? 1 : 0;
}
-int vm_munmap(unsigned long start, size_t len)
+int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
+ struct list_head *uf)
+{
+ return __do_munmap(mm, start, len, uf, false);
+}
+
+static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
{
int ret;
struct mm_struct *mm = current->mm;
@@ -2797,17 +2865,33 @@
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
- ret = do_munmap(mm, start, len, &uf);
- up_write(&mm->mmap_sem);
+ ret = __do_munmap(mm, start, len, &uf, downgrade);
+ /*
+ * Returning 1 indicates mmap_sem is downgraded.
+ * But 1 is not a legal return value of vm_munmap() and munmap(), so
+ * reset it to 0 before returning.
+ */
+ if (ret == 1) {
+ up_read(&mm->mmap_sem);
+ ret = 0;
+ } else
+ up_write(&mm->mmap_sem);
+
userfaultfd_unmap_complete(mm, &uf);
return ret;
}
+
+int vm_munmap(unsigned long start, size_t len)
+{
+ return __vm_munmap(start, len, false);
+}
EXPORT_SYMBOL(vm_munmap);
SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
{
+ addr = untagged_addr(addr);
profile_munmap(addr);
- return vm_munmap(addr, len);
+ return __vm_munmap(addr, len, true);
}
@@ -2910,16 +2994,6 @@
return ret;
}
-static inline void verify_mm_writelocked(struct mm_struct *mm)
-{
-#ifdef CONFIG_DEBUG_VM
- if (unlikely(down_read_trylock(&mm->mmap_sem))) {
- WARN_ON(1);
- up_read(&mm->mmap_sem);
- }
-#endif
-}
-
/*
* this is really a simplified "do_mmap". it only handles
* anonymous maps. eventually we may be able to do some
@@ -2947,12 +3021,6 @@
return error;
/*
- * mm->mmap_sem is required to protect against another thread
- * changing the mappings in case we sleep.
- */
- verify_mm_writelocked(mm);
-
- /*
* Clear old maps. this also does some error checking for us
*/
while (find_vma_links(mm, addr, addr + len, &prev, &rb_link,
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
new file mode 100644
index 0000000..7d70e5c
--- /dev/null
+++ b/mm/mmu_gather.c
@@ -0,0 +1,278 @@
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/kernel.h>
+#include <linux/mmdebug.h>
+#include <linux/mm_types.h>
+#include <linux/pagemap.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/swap.h>
+
+#include <asm/pgalloc.h>
+#include <asm/tlb.h>
+
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+
+static bool tlb_next_batch(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ batch = tlb->active;
+ if (batch->next) {
+ tlb->active = batch->next;
+ return true;
+ }
+
+ if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
+ return false;
+
+ batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
+ if (!batch)
+ return false;
+
+ tlb->batch_count++;
+ batch->next = NULL;
+ batch->nr = 0;
+ batch->max = MAX_GATHER_BATCH;
+
+ tlb->active->next = batch;
+ tlb->active = batch;
+
+ return true;
+}
+
+static void tlb_batch_pages_flush(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch;
+
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
+ free_pages_and_swap_cache(batch->pages, batch->nr);
+ batch->nr = 0;
+ }
+ tlb->active = &tlb->local;
+}
+
+static void tlb_batch_list_free(struct mmu_gather *tlb)
+{
+ struct mmu_gather_batch *batch, *next;
+
+ for (batch = tlb->local.next; batch; batch = next) {
+ next = batch->next;
+ free_pages((unsigned long)batch, 0);
+ }
+ tlb->local.next = NULL;
+}
+
+bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
+{
+ struct mmu_gather_batch *batch;
+
+ VM_BUG_ON(!tlb->end);
+
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ VM_WARN_ON(tlb->page_size != page_size);
+#endif
+
+ batch = tlb->active;
+ /*
+ * Add the page and check if we are full. If so
+ * force a flush.
+ */
+ batch->pages[batch->nr++] = page;
+ if (batch->nr == batch->max) {
+ if (!tlb_next_batch(tlb))
+ return true;
+ batch = tlb->active;
+ }
+ VM_BUG_ON_PAGE(batch->nr > batch->max, page);
+
+ return false;
+}
+
+#endif /* !CONFIG_HAVE_MMU_GATHER_NO_GATHER */
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+
+/*
+ * See the comment near struct mmu_table_batch.
+ */
+
+/*
+ * If we want tlb_remove_table() to imply TLB invalidates.
+ */
+static inline void tlb_table_invalidate(struct mmu_gather *tlb)
+{
+#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then we still
+ * need to RCU-sched wait while freeing the pages because software
+ * walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+#endif
+}
+
+static void tlb_remove_table_smp_sync(void *arg)
+{
+ /* Simply deliver the interrupt */
+}
+
+static void tlb_remove_table_one(void *table)
+{
+ /*
+ * This isn't an RCU grace period and hence the page-tables cannot be
+ * assumed to be actually RCU-freed.
+ *
+ * It is however sufficient for software page-table walkers that rely on
+ * IRQ disabling. See the comment near struct mmu_table_batch.
+ */
+ smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+ __tlb_remove_table(table);
+}
+
+static void tlb_remove_table_rcu(struct rcu_head *head)
+{
+ struct mmu_table_batch *batch;
+ int i;
+
+ batch = container_of(head, struct mmu_table_batch, rcu);
+
+ for (i = 0; i < batch->nr; i++)
+ __tlb_remove_table(batch->tables[i]);
+
+ free_page((unsigned long)batch);
+}
+
+static void tlb_table_flush(struct mmu_gather *tlb)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch) {
+ tlb_table_invalidate(tlb);
+ call_rcu(&(*batch)->rcu, tlb_remove_table_rcu);
+ *batch = NULL;
+ }
+}
+
+void tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ struct mmu_table_batch **batch = &tlb->batch;
+
+ if (*batch == NULL) {
+ *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
+ if (*batch == NULL) {
+ tlb_table_invalidate(tlb);
+ tlb_remove_table_one(table);
+ return;
+ }
+ (*batch)->nr = 0;
+ }
+
+ (*batch)->tables[(*batch)->nr++] = table;
+ if ((*batch)->nr == MAX_TABLE_BATCH)
+ tlb_table_flush(tlb);
+}
+
+#endif /* CONFIG_HAVE_RCU_TABLE_FREE */
+
+static void tlb_flush_mmu_free(struct mmu_gather *tlb)
+{
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb_table_flush(tlb);
+#endif
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_pages_flush(tlb);
+#endif
+}
+
+void tlb_flush_mmu(struct mmu_gather *tlb)
+{
+ tlb_flush_mmu_tlbonly(tlb);
+ tlb_flush_mmu_free(tlb);
+}
+
+/**
+ * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
+ * @tlb: the mmu_gather structure to initialize
+ * @mm: the mm_struct of the target address space
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called to initialize an (on-stack) mmu_gather structure for page-table
+ * tear-down from @mm. The @start and @end are set to 0 and -1
+ * respectively when @mm is without users and we're going to destroy
+ * the full address space (exit/execve).
+ */
+void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
+ unsigned long start, unsigned long end)
+{
+ tlb->mm = mm;
+
+ /* Is it from 0 to ~0? */
+ tlb->fullmm = !(start | (end+1));
+
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb->need_flush_all = 0;
+ tlb->local.next = NULL;
+ tlb->local.nr = 0;
+ tlb->local.max = ARRAY_SIZE(tlb->__pages);
+ tlb->active = &tlb->local;
+ tlb->batch_count = 0;
+#endif
+
+#ifdef CONFIG_HAVE_RCU_TABLE_FREE
+ tlb->batch = NULL;
+#endif
+#ifdef CONFIG_HAVE_MMU_GATHER_PAGE_SIZE
+ tlb->page_size = 0;
+#endif
+
+ __tlb_reset_range(tlb);
+ inc_tlb_flush_pending(tlb->mm);
+}
+
+/**
+ * tlb_finish_mmu - finish an mmu_gather structure
+ * @tlb: the mmu_gather structure to finish
+ * @start: start of the region that will be removed from the page-table
+ * @end: end of the region that will be removed from the page-table
+ *
+ * Called at the end of the shootdown operation to free up any resources that
+ * were required.
+ */
+void tlb_finish_mmu(struct mmu_gather *tlb,
+ unsigned long start, unsigned long end)
+{
+ /*
+ * If there are parallel threads doing PTE changes on the same range
+ * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
+ * flush by batching, one thread may end up seeing inconsistent PTEs
+ * and result in having stale TLB entries. So flush TLB forcefully
+ * if we detect parallel PTE batching threads.
+ *
+ * However, some syscalls, e.g. munmap(), may free page tables, which
+ * requires force-flushing everything in the given range. Otherwise this
+ * may result in stale TLB entries on some architectures, e.g. aarch64,
+ * that can specify which TLB level to flush.
+ */
+ if (mm_tlb_flush_nested(tlb->mm)) {
+ /*
+ * The aarch64 yields better performance with fullmm by
+ * avoiding multiple CPUs spamming TLBI messages at the
+ * same time.
+ *
+ * On x86 non-fullmm doesn't yield significant difference
+ * against fullmm.
+ */
+ tlb->fullmm = 1;
+ __tlb_reset_range(tlb);
+ tlb->freed_tables = 1;
+ }
+
+ tlb_flush_mmu(tlb);
+
+#ifndef CONFIG_HAVE_MMU_GATHER_NO_GATHER
+ tlb_batch_list_free(tlb);
+#endif
+ dec_tlb_flush_pending(tlb->mm);
+}
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 82bb1a9..9a889e4 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1,12 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/mmu_notifier.c
*
* Copyright (C) 2008 Qumranet, Inc.
* Copyright (C) 2008 SGI
* Christoph Lameter <cl@linux.com>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
#include <linux/rculist.h>
@@ -23,24 +21,11 @@
/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);
-/*
- * This function allows mmu_notifier::release callback to delay a call to
- * a function that will free appropriate resources. The function must be
- * quick and must not block.
- */
-void mmu_notifier_call_srcu(struct rcu_head *rcu,
- void (*func)(struct rcu_head *rcu))
-{
- call_srcu(&srcu, rcu, func);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_call_srcu);
-
-void mmu_notifier_synchronize(void)
-{
- /* Wait for any running method to finish. */
- srcu_barrier(&srcu);
-}
-EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+#ifdef CONFIG_LOCKDEP
+struct lockdep_map __mmu_notifier_invalidate_range_start_map = {
+ .name = "mmu_notifier_invalidate_range_start"
+};
+#endif
/*
* This function can't run concurrently against mmu_notifier_register
@@ -174,22 +159,28 @@
srcu_read_unlock(&srcu, id);
}
-int __mmu_notifier_invalidate_range_start(struct mm_struct *mm,
- unsigned long start, unsigned long end,
- bool blockable)
+int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
struct mmu_notifier *mn;
int ret = 0;
int id;
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
if (mn->ops->invalidate_range_start) {
- int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable);
+ int _ret;
+
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
+ _ret = mn->ops->invalidate_range_start(mn, range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
if (_ret) {
pr_info("%pS callback failed with %d in %sblockable context.\n",
- mn->ops->invalidate_range_start, _ret,
- !blockable ? "non-" : "");
+ mn->ops->invalidate_range_start, _ret,
+ !mmu_notifier_range_blockable(range) ? "non-" : "");
+ WARN_ON(mmu_notifier_range_blockable(range) ||
+ _ret != -EAGAIN);
ret = _ret;
}
}
@@ -198,18 +189,16 @@
return ret;
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start);
-void __mmu_notifier_invalidate_range_end(struct mm_struct *mm,
- unsigned long start,
- unsigned long end,
+void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range,
bool only_end)
{
struct mmu_notifier *mn;
int id;
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+ hlist_for_each_entry_rcu(mn, &range->mm->mmu_notifier_mm->list, hlist) {
/*
* Call invalidate_range here too to avoid the need for the
* subsystem of having to register an invalidate_range_end
@@ -224,13 +213,20 @@
* already happen under page table lock.
*/
if (!only_end && mn->ops->invalidate_range)
- mn->ops->invalidate_range(mn, mm, start, end);
- if (mn->ops->invalidate_range_end)
- mn->ops->invalidate_range_end(mn, mm, start, end);
+ mn->ops->invalidate_range(mn, range->mm,
+ range->start,
+ range->end);
+ if (mn->ops->invalidate_range_end) {
+ if (!mmu_notifier_range_blockable(range))
+ non_block_start();
+ mn->ops->invalidate_range_end(mn, range);
+ if (!mmu_notifier_range_blockable(range))
+ non_block_end();
+ }
}
srcu_read_unlock(&srcu, id);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_end);
void __mmu_notifier_invalidate_range(struct mm_struct *mm,
unsigned long start, unsigned long end)
@@ -245,66 +241,49 @@
}
srcu_read_unlock(&srcu, id);
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range);
/*
- * Must be called while holding mm->mmap_sem for either read or write.
- * The result is guaranteed to be valid until mm->mmap_sem is dropped.
+ * Same as mmu_notifier_register but here the caller must hold the
+ * mmap_sem in write mode.
*/
-bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm)
+int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
- struct mmu_notifier *mn;
- int id;
- bool ret = false;
-
- WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem));
-
- if (!mm_has_notifiers(mm))
- return ret;
-
- id = srcu_read_lock(&srcu);
- hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
- if (!mn->ops->invalidate_range &&
- !mn->ops->invalidate_range_start &&
- !mn->ops->invalidate_range_end)
- continue;
-
- if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) {
- ret = true;
- break;
- }
- }
- srcu_read_unlock(&srcu, id);
- return ret;
-}
-
-static int do_mmu_notifier_register(struct mmu_notifier *mn,
- struct mm_struct *mm,
- int take_mmap_sem)
-{
- struct mmu_notifier_mm *mmu_notifier_mm;
+ struct mmu_notifier_mm *mmu_notifier_mm = NULL;
int ret;
+ lockdep_assert_held_write(&mm->mmap_sem);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
- ret = -ENOMEM;
- mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
- if (unlikely(!mmu_notifier_mm))
- goto out;
+ if (IS_ENABLED(CONFIG_LOCKDEP)) {
+ fs_reclaim_acquire(GFP_KERNEL);
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+ fs_reclaim_release(GFP_KERNEL);
+ }
- if (take_mmap_sem)
- down_write(&mm->mmap_sem);
+ mn->mm = mm;
+ mn->users = 1;
+
+ if (!mm->mmu_notifier_mm) {
+ /*
+ * kmalloc cannot be called under mm_take_all_locks(), but we
+ * know that mm->mmu_notifier_mm can't change while we hold
+ * the write side of the mmap_sem.
+ */
+ mmu_notifier_mm =
+ kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL);
+ if (!mmu_notifier_mm)
+ return -ENOMEM;
+
+ INIT_HLIST_HEAD(&mmu_notifier_mm->list);
+ spin_lock_init(&mmu_notifier_mm->lock);
+ }
+
ret = mm_take_all_locks(mm);
if (unlikely(ret))
goto out_clean;
- if (!mm_has_notifiers(mm)) {
- INIT_HLIST_HEAD(&mmu_notifier_mm->list);
- spin_lock_init(&mmu_notifier_mm->lock);
-
- mm->mmu_notifier_mm = mmu_notifier_mm;
- mmu_notifier_mm = NULL;
- }
+ /* Pairs with the mmdrop in mmu_notifier_unregister_* */
mmgrab(mm);
/*
@@ -315,48 +294,118 @@
* We can't race against any other mmu notifier method either
* thanks to mm_take_all_locks().
*/
+ if (mmu_notifier_mm)
+ mm->mmu_notifier_mm = mmu_notifier_mm;
+
spin_lock(&mm->mmu_notifier_mm->lock);
- hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list);
+ hlist_add_head_rcu(&mn->hlist, &mm->mmu_notifier_mm->list);
spin_unlock(&mm->mmu_notifier_mm->lock);
mm_drop_all_locks(mm);
-out_clean:
- if (take_mmap_sem)
- up_write(&mm->mmap_sem);
- kfree(mmu_notifier_mm);
-out:
BUG_ON(atomic_read(&mm->mm_users) <= 0);
+ return 0;
+
+out_clean:
+ kfree(mmu_notifier_mm);
return ret;
}
+EXPORT_SYMBOL_GPL(__mmu_notifier_register);
-/*
+/**
+ * mmu_notifier_register - Register a notifier on a mm
+ * @mn: The notifier to attach
+ * @mm: The mm to attach the notifier to
+ *
* Must not hold mmap_sem nor any other VM related lock when calling
* this registration function. Must also ensure mm_users can't go down
* to zero while this runs to avoid races with mmu_notifier_release,
* so mm has to be current->mm or the mm should be pinned safely such
* as with get_task_mm(). If the mm is not current->mm, the mm_users
* pin should be released by calling mmput after mmu_notifier_register
- * returns. mmu_notifier_unregister must be always called to
- * unregister the notifier. mm_count is automatically pinned to allow
- * mmu_notifier_unregister to safely run at any time later, before or
- * after exit_mmap. ->release will always be called before exit_mmap
- * frees the pages.
+ * returns.
+ *
+ * mmu_notifier_unregister() or mmu_notifier_put() must be always called to
+ * unregister the notifier.
+ *
+ * While the caller has a mmu_notifier get the mn->mm pointer will remain
+ * valid, and can be converted to an active mm pointer via mmget_not_zero().
*/
int mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
{
- return do_mmu_notifier_register(mn, mm, 1);
+ int ret;
+
+ down_write(&mm->mmap_sem);
+ ret = __mmu_notifier_register(mn, mm);
+ up_write(&mm->mmap_sem);
+ return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
-/*
- * Same as mmu_notifier_register but here the caller must hold the
- * mmap_sem in write mode.
- */
-int __mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm)
+static struct mmu_notifier *
+find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
{
- return do_mmu_notifier_register(mn, mm, 0);
+ struct mmu_notifier *mn;
+
+ spin_lock(&mm->mmu_notifier_mm->lock);
+ hlist_for_each_entry_rcu (mn, &mm->mmu_notifier_mm->list, hlist) {
+ if (mn->ops != ops)
+ continue;
+
+ if (likely(mn->users != UINT_MAX))
+ mn->users++;
+ else
+ mn = ERR_PTR(-EOVERFLOW);
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ return mn;
+ }
+ spin_unlock(&mm->mmu_notifier_mm->lock);
+ return NULL;
}
-EXPORT_SYMBOL_GPL(__mmu_notifier_register);
+
+/**
+ * mmu_notifier_get_locked - Return the single struct mmu_notifier for
+ * the mm & ops
+ * @ops: The operations struct being subscribed with
+ * @mm: The mm to attach notifiers to
+ *
+ * This function either allocates a new mmu_notifier via
+ * ops->alloc_notifier(), or returns an already existing notifier on the
+ * list. The value of the ops pointer is used to determine when two notifiers
+ * are the same.
+ *
+ * Each call to mmu_notifier_get() must be paired with a call to
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
+ *
+ * While the caller has a mmu_notifier get the mm pointer will remain valid,
+ * and can be converted to an active mm pointer via mmget_not_zero().
+ */
+struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
+ struct mm_struct *mm)
+{
+ struct mmu_notifier *mn;
+ int ret;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ if (mm->mmu_notifier_mm) {
+ mn = find_get_mmu_notifier(mm, ops);
+ if (mn)
+ return mn;
+ }
+
+ mn = ops->alloc_notifier(mm);
+ if (IS_ERR(mn))
+ return mn;
+ mn->ops = ops;
+ ret = __mmu_notifier_register(mn, mm);
+ if (ret)
+ goto out_free;
+ return mn;
+out_free:
+ mn->ops->free_notifier(mn);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_get_locked);
/* this is called after the last mmu_notifier_unregister() returned */
void __mmu_notifier_mm_destroy(struct mm_struct *mm)
@@ -417,21 +466,81 @@
}
EXPORT_SYMBOL_GPL(mmu_notifier_unregister);
-/*
- * Same as mmu_notifier_unregister but no callback and no srcu synchronization.
- */
-void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
- struct mm_struct *mm)
+static void mmu_notifier_free_rcu(struct rcu_head *rcu)
{
+ struct mmu_notifier *mn = container_of(rcu, struct mmu_notifier, rcu);
+ struct mm_struct *mm = mn->mm;
+
+ mn->ops->free_notifier(mn);
+ /* Pairs with the get in __mmu_notifier_register() */
+ mmdrop(mm);
+}
+
+/**
+ * mmu_notifier_put - Release the reference on the notifier
+ * @mn: The notifier to act on
+ *
+ * This function must be paired with each mmu_notifier_get(), it releases the
+ * reference obtained by the get. If this is the last reference then the
+ * process to free the notifier will be run asynchronously.
+ *
+ * Unlike mmu_notifier_unregister() the get/put flow only calls ops->release
+ * when the mm_struct is destroyed. Instead free_notifier is always called to
+ * release any resources held by the user.
+ *
+ * As ops->release is not guaranteed to be called, the user must ensure that
+ * all sptes are dropped, and no new sptes can be established before
+ * mmu_notifier_put() is called.
+ *
+ * This function can be called from the ops->release callback, however the
+ * caller must still ensure it is called pairwise with mmu_notifier_get().
+ *
+ * Modules calling this function must call mmu_notifier_synchronize() in
+ * their __exit functions to ensure the async work is completed.
+ */
+void mmu_notifier_put(struct mmu_notifier *mn)
+{
+ struct mm_struct *mm = mn->mm;
+
spin_lock(&mm->mmu_notifier_mm->lock);
- /*
- * Can not use list_del_rcu() since __mmu_notifier_release
- * can delete it before we hold the lock.
- */
+ if (WARN_ON(!mn->users) || --mn->users)
+ goto out_unlock;
hlist_del_init_rcu(&mn->hlist);
spin_unlock(&mm->mmu_notifier_mm->lock);
- BUG_ON(atomic_read(&mm->mm_count) <= 0);
- mmdrop(mm);
+ call_srcu(&srcu, &mn->rcu, mmu_notifier_free_rcu);
+ return;
+
+out_unlock:
+ spin_unlock(&mm->mmu_notifier_mm->lock);
}
-EXPORT_SYMBOL_GPL(mmu_notifier_unregister_no_release);
+EXPORT_SYMBOL_GPL(mmu_notifier_put);
+
+/**
+ * mmu_notifier_synchronize - Ensure all mmu_notifiers are freed
+ *
+ * This function ensures that all outstanding async SRCU work from
+ * mmu_notifier_put() is completed. After it returns any mmu_notifier_ops
+ * associated with an unused mmu_notifier will no longer be called.
+ *
+ * Before using the caller must ensure that all of its mmu_notifiers have been
+ * fully released via mmu_notifier_put().
+ *
+ * Modules using the mmu_notifier_put() API should call this in their __exit
+ * function to avoid module unloading races.
+ */
+void mmu_notifier_synchronize(void)
+{
+ synchronize_srcu(&srcu);
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_synchronize);
+
+bool
+mmu_notifier_range_update_to_read_only(const struct mmu_notifier_range *range)
+{
+ if (!range->vma || range->event != MMU_NOTIFY_PROTECTION_VMA)
+ return false;
+ /* Return true if the vma still has the read flag set. */
+ return range->vma->vm_flags & VM_READ;
+}
+EXPORT_SYMBOL_GPL(mmu_notifier_range_update_to_read_only);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 6d33162..7967825 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -9,7 +9,7 @@
* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
*/
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/hugetlb.h>
#include <linux/shm.h>
#include <linux/mman.h>
@@ -39,7 +39,6 @@
unsigned long addr, unsigned long end, pgprot_t newprot,
int dirty_accountable, int prot_numa)
{
- struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
@@ -110,8 +109,8 @@
continue;
}
- ptent = ptep_modify_prot_start(mm, addr, pte);
- ptent = pte_modify(ptent, newprot);
+ oldpte = ptep_modify_prot_start(vma, addr, pte);
+ ptent = pte_modify(oldpte, newprot);
if (preserve_write)
ptent = pte_mk_savedwrite(ptent);
@@ -121,7 +120,7 @@
!(vma->vm_flags & VM_SOFTDIRTY))) {
ptent = pte_mkwrite(ptent);
}
- ptep_modify_prot_commit(mm, addr, pte, ptent);
+ ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
} else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
@@ -136,7 +135,7 @@
newpte = swp_entry_to_pte(entry);
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -150,7 +149,7 @@
*/
make_device_private_entry_read(&entry);
newpte = swp_entry_to_pte(entry);
- set_pte_at(mm, addr, pte, newpte);
+ set_pte_at(vma->vm_mm, addr, pte, newpte);
pages++;
}
@@ -167,11 +166,12 @@
pgprot_t newprot, int dirty_accountable, int prot_numa)
{
pmd_t *pmd;
- struct mm_struct *mm = vma->vm_mm;
unsigned long next;
unsigned long pages = 0;
unsigned long nr_huge_updates = 0;
- unsigned long mni_start = 0;
+ struct mmu_notifier_range range;
+
+ range.start = 0;
pmd = pmd_offset(pud, addr);
do {
@@ -183,9 +183,11 @@
goto next;
/* invoke the mmu notifier if the pmd is populated */
- if (!mni_start) {
- mni_start = addr;
- mmu_notifier_invalidate_range_start(mm, mni_start, end);
+ if (!range.start) {
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma, vma->vm_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
}
if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
@@ -214,8 +216,8 @@
cond_resched();
} while (pmd++, addr = next, addr != end);
- if (mni_start)
- mmu_notifier_invalidate_range_end(mm, mni_start, end);
+ if (range.start)
+ mmu_notifier_invalidate_range_end(&range);
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
@@ -327,20 +329,11 @@
return 0;
}
-static int prot_none_walk(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, unsigned long newflags)
-{
- pgprot_t new_pgprot = vm_get_page_prot(newflags);
- struct mm_walk prot_none_walk = {
- .pte_entry = prot_none_pte_entry,
- .hugetlb_entry = prot_none_hugetlb_entry,
- .test_walk = prot_none_test,
- .mm = current->mm,
- .private = &new_pgprot,
- };
-
- return walk_page_range(start, end, &prot_none_walk);
-}
+static const struct mm_walk_ops prot_none_walk_ops = {
+ .pte_entry = prot_none_pte_entry,
+ .hugetlb_entry = prot_none_hugetlb_entry,
+ .test_walk = prot_none_test,
+};
int
mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
@@ -367,7 +360,10 @@
if (arch_has_pfn_modify_check() &&
(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) &&
(newflags & (VM_READ|VM_WRITE|VM_EXEC)) == 0) {
- error = prot_none_walk(vma, start, end, newflags);
+ pgprot_t new_pgprot = vm_get_page_prot(newflags);
+
+ error = walk_page_range(current->mm, start, end,
+ &prot_none_walk_ops, &new_pgprot);
if (error)
return error;
}
@@ -463,6 +459,8 @@
const bool rier = (current->personality & READ_IMPLIES_EXEC) &&
(prot & PROT_READ);
+ start = untagged_addr(start);
+
prot &= ~(PROT_GROWSDOWN|PROT_GROWSUP);
if (grows == (PROT_GROWSDOWN|PROT_GROWSUP)) /* can't be both */
return -EINVAL;
diff --git a/mm/mremap.c b/mm/mremap.c
index a9617e7..1fc8a29 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -191,22 +191,67 @@
drop_rmap_locks(vma);
}
+#ifdef CONFIG_HAVE_MOVE_PMD
+static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pmd_t pmd;
+
+ if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
+ || old_end - old_addr < PMD_SIZE)
+ return false;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_sem prevents deadlock.
+ */
+ old_ptl = pmd_lock(vma->vm_mm, old_pmd);
+ new_ptl = pmd_lockptr(mm, new_pmd);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pmd */
+ pmd = *old_pmd;
+ pmd_clear(old_pmd);
+
+ VM_BUG_ON(!pmd_none(*new_pmd));
+
+ /* Set the new pmd */
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#endif
+
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks)
{
unsigned long extent, next, old_end;
+ struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
- mmun_start = old_addr;
- mmun_end = old_end;
- mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
+ old_addr, old_end);
+ mmu_notifier_invalidate_range_start(&range);
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
@@ -237,8 +282,26 @@
split_huge_pmd(vma, old_pmd, old_addr);
if (pmd_trans_unstable(old_pmd))
continue;
+ } else if (extent == PMD_SIZE) {
+#ifdef CONFIG_HAVE_MOVE_PMD
+ /*
+ * If the extent is PMD-sized, try to speed the move by
+ * moving at the PMD level if possible.
+ */
+ bool moved;
+
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+ moved = move_normal_pmd(vma, old_addr, new_addr,
+ old_end, old_pmd, new_pmd);
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+ if (moved)
+ continue;
+#endif
}
- if (pte_alloc(new_vma->vm_mm, new_pmd, new_addr))
+
+ if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
@@ -247,7 +310,7 @@
new_pmd, new_addr, need_rmap_locks);
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return len + old_addr - old_end; /* how much done */
}
@@ -454,6 +517,23 @@
if (addr + old_len > new_addr && new_addr + new_len > addr)
goto out;
+ /*
+ * move_vma() needs us to stay 4 maps below the threshold, otherwise
+ * it will bail out at the very beginning.
+ * That is a problem if we have already unmapped the regions here
+ * (new_addr, and old_addr), because userspace will not know the
+ * state of the vmas after it gets -ENOMEM.
+ * So, to avoid such a scenario we can pre-compute whether the whole
+ * operation has a high chance of succeeding map-wise.
+ * The worst case is when both vmas (new_addr and old_addr) get
+ * split in 3 before unmapping them.
+ * That means 2 more maps (1 for each) to the ones we already hold.
+ * Check whether current map count plus 2 still leads us to 4 maps below
+ * the threshold, otherwise return -ENOMEM here to be more safe.
+ */
+ if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
+ return -ENOMEM;
+
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
if (ret)
goto out;
@@ -521,10 +601,14 @@
unsigned long ret = -EINVAL;
unsigned long charged = 0;
bool locked = false;
+ bool downgraded = false;
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
+ addr = untagged_addr(addr);
+ new_addr = untagged_addr(new_addr);
+
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
@@ -557,12 +641,20 @@
/*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
- * do_munmap does all the needed commit accounting
+ * __do_munmap does all the needed commit accounting, and
+ * downgrades mmap_sem to read if so directed.
*/
if (old_len >= new_len) {
- ret = do_munmap(mm, addr+new_len, old_len - new_len, &uf_unmap);
- if (ret && old_len != new_len)
+ int retval;
+
+ retval = __do_munmap(mm, addr+new_len, old_len - new_len,
+ &uf_unmap, true);
+ if (retval < 0 && old_len != new_len) {
+ ret = retval;
goto out;
+ /* Returning 1 indicates mmap_sem is downgraded to read. */
+ } else if (retval == 1)
+ downgraded = true;
ret = addr;
goto out;
}
@@ -627,7 +719,10 @@
vm_unacct_memory(charged);
locked = 0;
}
- up_write(¤t->mm->mmap_sem);
+ if (downgraded)
+ up_read(¤t->mm->mmap_sem);
+ else
+ up_write(¤t->mm->mmap_sem);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
diff --git a/mm/msync.c b/mm/msync.c
index ef30a42..c3bd3e7 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -37,6 +37,8 @@
int unmapped_error = 0;
int error = -EINVAL;
+ start = untagged_addr(start);
+
if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
goto out;
if (offset_in_page(start))
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
deleted file mode 100644
index 439af3b..0000000
--- a/mm/nobootmem.c
+++ /dev/null
@@ -1,445 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * bootmem - A boot-time physical memory allocator and configurator
- *
- * Copyright (C) 1999 Ingo Molnar
- * 1999 Kanoj Sarcar, SGI
- * 2008 Johannes Weiner
- *
- * Access to this subsystem has to be serialized externally (which is true
- * for the boot process anyway).
- */
-#include <linux/init.h>
-#include <linux/pfn.h>
-#include <linux/slab.h>
-#include <linux/export.h>
-#include <linux/kmemleak.h>
-#include <linux/range.h>
-#include <linux/memblock.h>
-#include <linux/bootmem.h>
-
-#include <asm/bug.h>
-#include <asm/io.h>
-
-#include "internal.h"
-
-#ifndef CONFIG_HAVE_MEMBLOCK
-#error CONFIG_HAVE_MEMBLOCK not defined
-#endif
-
-#ifndef CONFIG_NEED_MULTIPLE_NODES
-struct pglist_data __refdata contig_page_data;
-EXPORT_SYMBOL(contig_page_data);
-#endif
-
-unsigned long max_low_pfn;
-unsigned long min_low_pfn;
-unsigned long max_pfn;
-unsigned long long max_possible_pfn;
-
-static void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
- u64 goal, u64 limit)
-{
- void *ptr;
- u64 addr;
- enum memblock_flags flags = choose_memblock_flags();
-
- if (limit > memblock.current_limit)
- limit = memblock.current_limit;
-
-again:
- addr = memblock_find_in_range_node(size, align, goal, limit, nid,
- flags);
- if (!addr && (flags & MEMBLOCK_MIRROR)) {
- flags &= ~MEMBLOCK_MIRROR;
- pr_warn("Could not allocate %pap bytes of mirrored memory\n",
- &size);
- goto again;
- }
- if (!addr)
- return NULL;
-
- if (memblock_reserve(addr, size))
- return NULL;
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
-}
-
-/**
- * free_bootmem_late - free bootmem pages directly to page allocator
- * @addr: starting address of the range
- * @size: size of the range in bytes
- *
- * This is only useful when the bootmem allocator has already been torn
- * down, but we are still initializing the system. Pages are given directly
- * to the page allocator, no bootmem metadata is updated because it is gone.
- */
-void __init free_bootmem_late(unsigned long addr, unsigned long size)
-{
- unsigned long cursor, end;
-
- kmemleak_free_part_phys(addr, size);
-
- cursor = PFN_UP(addr);
- end = PFN_DOWN(addr + size);
-
- for (; cursor < end; cursor++) {
- __free_pages_bootmem(pfn_to_page(cursor), cursor, 0);
- totalram_pages++;
- }
-}
-
-static void __init __free_pages_memory(unsigned long start, unsigned long end)
-{
- int order;
-
- while (start < end) {
- order = min(MAX_ORDER - 1UL, __ffs(start));
-
- while (start + (1UL << order) > end)
- order--;
-
- __free_pages_bootmem(pfn_to_page(start), start, order);
-
- start += (1UL << order);
- }
-}
-
-static unsigned long __init __free_memory_core(phys_addr_t start,
- phys_addr_t end)
-{
- unsigned long start_pfn = PFN_UP(start);
- unsigned long end_pfn = min_t(unsigned long,
- PFN_DOWN(end), max_low_pfn);
-
- if (start_pfn >= end_pfn)
- return 0;
-
- __free_pages_memory(start_pfn, end_pfn);
-
- return end_pfn - start_pfn;
-}
-
-static unsigned long __init free_low_memory_core_early(void)
-{
- unsigned long count = 0;
- phys_addr_t start, end;
- u64 i;
-
- memblock_clear_hotplug(0, -1);
-
- for_each_reserved_mem_region(i, &start, &end)
- reserve_bootmem_region(start, end);
-
- /*
- * We need to use NUMA_NO_NODE instead of NODE_DATA(0)->node_id
- * because in some case like Node0 doesn't have RAM installed
- * low ram will be on Node1
- */
- for_each_free_mem_range(i, NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end,
- NULL)
- count += __free_memory_core(start, end);
-
- return count;
-}
-
-static int reset_managed_pages_done __initdata;
-
-void reset_node_managed_pages(pg_data_t *pgdat)
-{
- struct zone *z;
-
- for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
- z->managed_pages = 0;
-}
-
-void __init reset_all_zones_managed_pages(void)
-{
- struct pglist_data *pgdat;
-
- if (reset_managed_pages_done)
- return;
-
- for_each_online_pgdat(pgdat)
- reset_node_managed_pages(pgdat);
-
- reset_managed_pages_done = 1;
-}
-
-/**
- * free_all_bootmem - release free pages to the buddy allocator
- *
- * Return: the number of pages actually released.
- */
-unsigned long __init free_all_bootmem(void)
-{
- unsigned long pages;
-
- reset_all_zones_managed_pages();
-
- pages = free_low_memory_core_early();
- totalram_pages += pages;
-
- return pages;
-}
-
-/**
- * free_bootmem_node - mark a page range as usable
- * @pgdat: node the range resides on
- * @physaddr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must reside completely on the specified node.
- */
-void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
- unsigned long size)
-{
- memblock_free(physaddr, size);
-}
-
-/**
- * free_bootmem - mark a page range as usable
- * @addr: starting physical address of the range
- * @size: size of the range in bytes
- *
- * Partial pages will be considered reserved and left as they are.
- *
- * The range must be contiguous but may span node boundaries.
- */
-void __init free_bootmem(unsigned long addr, unsigned long size)
-{
- memblock_free(addr, size);
-}
-
-static void * __init ___alloc_bootmem_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal,
- unsigned long limit)
-{
- void *ptr;
-
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc(size, GFP_NOWAIT);
-
-restart:
-
- ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align, goal, limit);
-
- if (ptr)
- return ptr;
-
- if (goal != 0) {
- goal = 0;
- goto restart;
- }
-
- return NULL;
-}
-
-/**
- * __alloc_bootmem_nopanic - allocate boot memory without panicking
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * Return: address of the allocated region or %NULL on failure.
- */
-void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- unsigned long limit = -1UL;
-
- return ___alloc_bootmem_nopanic(size, align, goal, limit);
-}
-
-static void * __init ___alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal, unsigned long limit)
-{
- void *mem = ___alloc_bootmem_nopanic(size, align, goal, limit);
-
- if (mem)
- return mem;
- /*
- * Whoops, we cannot satisfy the allocation request.
- */
- pr_alert("bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of memory");
- return NULL;
-}
-
-/**
- * __alloc_bootmem - allocate boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- *
- * Return: address of the allocated region.
- */
-void * __init __alloc_bootmem(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- unsigned long limit = -1UL;
-
- return ___alloc_bootmem(size, align, goal, limit);
-}
-
-void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat,
- unsigned long size,
- unsigned long align,
- unsigned long goal,
- unsigned long limit)
-{
- void *ptr;
-
-again:
- ptr = __alloc_memory_core_early(pgdat->node_id, size, align,
- goal, limit);
- if (ptr)
- return ptr;
-
- ptr = __alloc_memory_core_early(NUMA_NO_NODE, size, align,
- goal, limit);
- if (ptr)
- return ptr;
-
- if (goal) {
- goal = 0;
- goto again;
- }
-
- return NULL;
-}
-
-void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- return ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, 0);
-}
-
-static void * __init ___alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal,
- unsigned long limit)
-{
- void *ptr;
-
- ptr = ___alloc_bootmem_node_nopanic(pgdat, size, align, goal, limit);
- if (ptr)
- return ptr;
-
- pr_alert("bootmem alloc of %lu bytes failed!\n", size);
- panic("Out of memory");
- return NULL;
-}
-
-/**
- * __alloc_bootmem_node - allocate boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- *
- * Return: address of the allocated region.
- */
-void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- return ___alloc_bootmem_node(pgdat, size, align, goal, 0);
-}
-
-void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- return __alloc_bootmem_node(pgdat, size, align, goal);
-}
-
-
-/**
- * __alloc_bootmem_low - allocate low boot memory
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may happen on any node in the system.
- *
- * The function panics if the request can not be satisfied.
- *
- * Return: address of the allocated region.
- */
-void * __init __alloc_bootmem_low(unsigned long size, unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem(size, align, goal, ARCH_LOW_ADDRESS_LIMIT);
-}
-
-void * __init __alloc_bootmem_low_nopanic(unsigned long size,
- unsigned long align,
- unsigned long goal)
-{
- return ___alloc_bootmem_nopanic(size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
-}
-
-/**
- * __alloc_bootmem_low_node - allocate low boot memory from a specific node
- * @pgdat: node to allocate from
- * @size: size of the request in bytes
- * @align: alignment of the region
- * @goal: preferred starting address of the region
- *
- * The goal is dropped if it can not be satisfied and the allocation will
- * fall back to memory below @goal.
- *
- * Allocation may fall back to any node in the system if the specified node
- * can not hold the requested memory.
- *
- * The function panics if the request can not be satisfied.
- *
- * Return: address of the allocated region.
- */
-void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size,
- unsigned long align, unsigned long goal)
-{
- if (WARN_ON_ONCE(slab_is_available()))
- return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
-
- return ___alloc_bootmem_node(pgdat, size, align, goal,
- ARCH_LOW_ADDRESS_LIMIT);
-}
diff --git a/mm/nommu.c b/mm/nommu.c
index e4aac33..99b7ec3 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/nommu.c
*
@@ -107,97 +108,9 @@
* The ksize() function is only guaranteed to work for pointers
* returned by kmalloc(). So handle arbitrary pointers here.
*/
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long start, unsigned long nr_pages,
- unsigned int foll_flags, struct page **pages,
- struct vm_area_struct **vmas, int *nonblocking)
-{
- struct vm_area_struct *vma;
- unsigned long vm_flags;
- int i;
-
- /* calculate required read or write permissions.
- * If FOLL_FORCE is set, we only require the "MAY" flags.
- */
- vm_flags = (foll_flags & FOLL_WRITE) ?
- (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD);
- vm_flags &= (foll_flags & FOLL_FORCE) ?
- (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE);
-
- for (i = 0; i < nr_pages; i++) {
- vma = find_vma(mm, start);
- if (!vma)
- goto finish_or_fault;
-
- /* protect what we can, including chardevs */
- if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) ||
- !(vm_flags & vma->vm_flags))
- goto finish_or_fault;
-
- if (pages) {
- pages[i] = virt_to_page(start);
- if (pages[i])
- get_page(pages[i]);
- }
- if (vmas)
- vmas[i] = vma;
- start = (start + PAGE_SIZE) & PAGE_MASK;
- }
-
- return i;
-
-finish_or_fault:
- return i ? : -EFAULT;
-}
-
-/*
- * get a list of pages in an address range belonging to the specified process
- * and indicate the VMA that covers each page
- * - this is potentially dodgy as we may end incrementing the page count of a
- * slab page or a secondary page from a compound page
- * - don't permit access to VMAs that don't support it, such as I/O mappings
- */
-long get_user_pages(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- struct vm_area_struct **vmas)
-{
- return __get_user_pages(current, current->mm, start, nr_pages,
- gup_flags, pages, vmas, NULL);
-}
-EXPORT_SYMBOL(get_user_pages);
-
-long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
- unsigned int gup_flags, struct page **pages,
- int *locked)
-{
- return get_user_pages(start, nr_pages, gup_flags, pages, NULL);
-}
-EXPORT_SYMBOL(get_user_pages_locked);
-
-static long __get_user_pages_unlocked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
- unsigned long nr_pages, struct page **pages,
- unsigned int gup_flags)
-{
- long ret;
- down_read(&mm->mmap_sem);
- ret = __get_user_pages(tsk, mm, start, nr_pages, gup_flags, pages,
- NULL, NULL);
- up_read(&mm->mmap_sem);
- return ret;
-}
-
-long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
- struct page **pages, unsigned int gup_flags)
-{
- return __get_user_pages_unlocked(current, current->mm, start, nr_pages,
- pages, gup_flags);
-}
-EXPORT_SYMBOL(get_user_pages_unlocked);
-
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -473,6 +386,20 @@
}
EXPORT_SYMBOL(vm_insert_page);
+int vm_map_pages(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages);
+
+int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages,
+ unsigned long num)
+{
+ return -EINVAL;
+}
+EXPORT_SYMBOL(vm_map_pages_zero);
+
/*
* sys_brk() for the most part doesn't need the global kernel
* lock, except when an application is doing something nasty
@@ -1334,7 +1261,9 @@
add_nommu_region(region);
/* clear anonymous mappings that don't ask for uninitialized data */
- if (!vma->vm_file && !(flags & MAP_UNINITIALIZED))
+ if (!vma->vm_file &&
+ (!IS_ENABLED(CONFIG_MMAP_ALLOW_UNINITIALIZED) ||
+ !(flags & MAP_UNINITIALIZED)))
memset((void *)region->vm_start, 0,
region->vm_end - region->vm_start);
@@ -1709,11 +1638,9 @@
return ret;
}
-struct page *follow_page_mask(struct vm_area_struct *vma,
- unsigned long address, unsigned int flags,
- unsigned int *page_mask)
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
+ unsigned int foll_flags)
{
- *page_mask = 0;
return NULL;
}
@@ -1779,7 +1706,8 @@
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
- down_read(&mm->mmap_sem);
+ if (down_read_killable(&mm->mmap_sem))
+ return 0;
/* the access must start within one of the target process's mappings */
vma = find_vma(mm, addr);
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index f10aa53..71e3ace 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/oom_kill.c
*
@@ -63,21 +64,33 @@
*/
DEFINE_MUTEX(oom_lock);
+static inline bool is_memcg_oom(struct oom_control *oc)
+{
+ return oc->memcg != NULL;
+}
+
#ifdef CONFIG_NUMA
/**
- * has_intersects_mems_allowed() - check task eligiblity for kill
+ * oom_cpuset_eligible() - check task eligibility for kill
* @start: task struct of which task to consider
- * @mask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Task eligibility is determined by whether or not a candidate task, @tsk,
* shares the same mempolicy nodes as current if it is bound by such a policy
* and whether or not it has the same set of allowed cpuset nodes.
+ *
+ * This function assumes oom-killer context and that 'current' has
+ * triggered the oom-killer.
*/
-static bool has_intersects_mems_allowed(struct task_struct *start,
- const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *start,
+ struct oom_control *oc)
{
struct task_struct *tsk;
bool ret = false;
+ const nodemask_t *mask = oc->nodemask;
+
+ if (is_memcg_oom(oc))
+ return true;
rcu_read_lock();
for_each_thread(start, tsk) {
@@ -104,8 +117,7 @@
return ret;
}
#else
-static bool has_intersects_mems_allowed(struct task_struct *tsk,
- const nodemask_t *mask)
+static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
{
return true;
}
@@ -145,28 +157,13 @@
return oc->order == -1;
}
-static inline bool is_memcg_oom(struct oom_control *oc)
-{
- return oc->memcg != NULL;
-}
-
/* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p,
- struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p)
{
if (is_global_init(p))
return true;
if (p->flags & PF_KTHREAD)
return true;
-
- /* When mem_cgroup_out_of_memory() and p is not member of the group */
- if (memcg && !task_in_mem_cgroup(p, memcg))
- return true;
-
- /* p may not have freeable memory in nodemask */
- if (!has_intersects_mems_allowed(p, nodemask))
- return true;
-
return false;
}
@@ -193,20 +190,17 @@
* oom_badness - heuristic function to determine which candidate task to kill
* @p: task struct of which task we should calculate
* @totalpages: total present RAM allowed for page allocation
- * @memcg: task's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
* The heuristic for determining which task to kill is made to be as simple and
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, unsigned long totalpages)
+unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
- if (oom_unkillable_task(p, memcg, nodemask))
+ if (oom_unkillable_task(p))
return 0;
p = find_lock_task_mm(p);
@@ -245,11 +239,11 @@
return points > 0 ? points : 1;
}
-enum oom_constraint {
- CONSTRAINT_NONE,
- CONSTRAINT_CPUSET,
- CONSTRAINT_MEMORY_POLICY,
- CONSTRAINT_MEMCG,
+static const char * const oom_constraint_text[] = {
+ [CONSTRAINT_NONE] = "CONSTRAINT_NONE",
+ [CONSTRAINT_CPUSET] = "CONSTRAINT_CPUSET",
+ [CONSTRAINT_MEMORY_POLICY] = "CONSTRAINT_MEMORY_POLICY",
+ [CONSTRAINT_MEMCG] = "CONSTRAINT_MEMCG",
};
/*
@@ -269,7 +263,7 @@
}
/* Default to all available memory */
- oc->totalpages = totalram_pages + total_swap_pages;
+ oc->totalpages = totalram_pages() + total_swap_pages;
if (!IS_ENABLED(CONFIG_NUMA))
return CONSTRAINT_NONE;
@@ -293,7 +287,7 @@
!nodes_subset(node_states[N_MEMORY], *oc->nodemask)) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, *oc->nodemask)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_MEMORY_POLICY;
}
@@ -306,7 +300,7 @@
if (cpuset_limited) {
oc->totalpages = total_swap_pages;
for_each_node_mask(nid, cpuset_current_mems_allowed)
- oc->totalpages += node_spanned_pages(nid);
+ oc->totalpages += node_present_pages(nid);
return CONSTRAINT_CPUSET;
}
return CONSTRAINT_NONE;
@@ -317,7 +311,11 @@
struct oom_control *oc = arg;
unsigned long points;
- if (oom_unkillable_task(task, NULL, oc->nodemask))
+ if (oom_unkillable_task(task))
+ goto next;
+
+ /* task may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(task, oc))
goto next;
/*
@@ -341,13 +339,10 @@
goto select;
}
- points = oom_badness(task, NULL, oc->nodemask, oc->totalpages);
+ points = oom_badness(task, oc->totalpages);
if (!points || points < oc->chosen_points)
goto next;
- /* Prefer thread group leaders for display purposes */
- if (points == oc->chosen_points && thread_group_leader(oc->chosen))
- goto next;
select:
if (oc->chosen)
put_task_struct(oc->chosen);
@@ -380,14 +375,44 @@
break;
rcu_read_unlock();
}
+}
- oc->chosen_points = oc->chosen_points * 1000 / oc->totalpages;
+static int dump_task(struct task_struct *p, void *arg)
+{
+ struct oom_control *oc = arg;
+ struct task_struct *task;
+
+ if (oom_unkillable_task(p))
+ return 0;
+
+ /* p may not have freeable memory in nodemask */
+ if (!is_memcg_oom(oc) && !oom_cpuset_eligible(p, oc))
+ return 0;
+
+ task = find_lock_task_mm(p);
+ if (!task) {
+ /*
+ * This is a kthread or all of p's threads have already
+ * detached their mm's. There's no need to report
+ * them; they can't be oom killed anyway.
+ */
+ return 0;
+ }
+
+ pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
+ task->pid, from_kuid(&init_user_ns, task_uid(task)),
+ task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
+ mm_pgtables_bytes(task->mm),
+ get_mm_counter(task->mm, MM_SWAPENTS),
+ task->signal->oom_score_adj, task->comm);
+ task_unlock(task);
+
+ return 0;
}
/**
* dump_tasks - dump current memory state of all system tasks
- * @memcg: current's memory controller, if constrained
- * @nodemask: nodemask passed to page allocator for mempolicy ooms
+ * @oc: pointer to struct oom_control
*
* Dumps the current memory state of all eligible tasks. Tasks not in the same
* memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
@@ -395,59 +420,55 @@
* State information includes task's pid, uid, tgid, vm size, rss,
* pgtables_bytes, swapents, oom_score_adj value, and name.
*/
-static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
+static void dump_tasks(struct oom_control *oc)
{
- struct task_struct *p;
- struct task_struct *task;
-
pr_info("Tasks state (memory values in pages):\n");
pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n");
- rcu_read_lock();
- for_each_process(p) {
- if (oom_unkillable_task(p, memcg, nodemask))
- continue;
- task = find_lock_task_mm(p);
- if (!task) {
- /*
- * This is a kthread or all of p's threads have already
- * detached their mm's. There's no need to report
- * them; they can't be oom killed anyway.
- */
- continue;
- }
+ if (is_memcg_oom(oc))
+ mem_cgroup_scan_tasks(oc->memcg, dump_task, oc);
+ else {
+ struct task_struct *p;
- pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n",
- task->pid, from_kuid(&init_user_ns, task_uid(task)),
- task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- mm_pgtables_bytes(task->mm),
- get_mm_counter(task->mm, MM_SWAPENTS),
- task->signal->oom_score_adj, task->comm);
- task_unlock(task);
+ rcu_read_lock();
+ for_each_process(p)
+ dump_task(p, oc);
+ rcu_read_unlock();
}
- rcu_read_unlock();
+}
+
+static void dump_oom_summary(struct oom_control *oc, struct task_struct *victim)
+{
+ /* one line summary of the oom killer context. */
+ pr_info("oom-kill:constraint=%s,nodemask=%*pbl",
+ oom_constraint_text[oc->constraint],
+ nodemask_pr_args(oc->nodemask));
+ cpuset_print_current_mems_allowed();
+ mem_cgroup_print_oom_context(oc->memcg, victim);
+ pr_cont(",task=%s,pid=%d,uid=%d\n", victim->comm, victim->pid,
+ from_kuid(&init_user_ns, task_uid(victim)));
}
static void dump_header(struct oom_control *oc, struct task_struct *p)
{
- pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n",
- current->comm, oc->gfp_mask, &oc->gfp_mask,
- nodemask_pr_args(oc->nodemask), oc->order,
+ pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), order=%d, oom_score_adj=%hd\n",
+ current->comm, oc->gfp_mask, &oc->gfp_mask, oc->order,
current->signal->oom_score_adj);
if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order)
pr_warn("COMPACTION is disabled!!!\n");
- cpuset_print_current_mems_allowed();
dump_stack();
if (is_memcg_oom(oc))
- mem_cgroup_print_oom_info(oc->memcg, p);
+ mem_cgroup_print_oom_meminfo(oc->memcg);
else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
if (is_dump_unreclaim_slabs())
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
- dump_tasks(oc->memcg, oc->nodemask);
+ dump_tasks(oc);
+ if (p)
+ dump_oom_summary(oc, p);
}
/*
@@ -502,7 +523,7 @@
set_bit(MMF_UNSTABLE, &mm->flags);
for (vma = mm->mmap ; vma; vma = vma->vm_next) {
- if (!can_madv_dontneed_vma(vma))
+ if (!can_madv_lru_vma(vma))
continue;
/*
@@ -516,19 +537,21 @@
* count elevated without a good reason.
*/
if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) {
- const unsigned long start = vma->vm_start;
- const unsigned long end = vma->vm_end;
+ struct mmu_notifier_range range;
struct mmu_gather tlb;
- tlb_gather_mmu(&tlb, mm, start, end);
- if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) {
- tlb_finish_mmu(&tlb, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0,
+ vma, mm, vma->vm_start,
+ vma->vm_end);
+ tlb_gather_mmu(&tlb, mm, range.start, range.end);
+ if (mmu_notifier_invalidate_range_start_nonblock(&range)) {
+ tlb_finish_mmu(&tlb, range.start, range.end);
ret = false;
continue;
}
- unmap_page_range(&tlb, vma, start, end, NULL);
- mmu_notifier_invalidate_range_end(mm, start, end);
- tlb_finish_mmu(&tlb, start, end);
+ unmap_page_range(&tlb, vma, range.start, range.end, NULL);
+ mmu_notifier_invalidate_range_end(&range);
+ tlb_finish_mmu(&tlb, range.start, range.end);
}
}
@@ -634,8 +657,8 @@
static void wake_oom_reaper(struct task_struct *tsk)
{
- /* tsk is already queued? */
- if (tsk == oom_reaper_list || tsk->oom_reaper_list)
+ /* mm is already queued? */
+ if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
@@ -830,7 +853,7 @@
return ret;
}
-static void __oom_kill_process(struct task_struct *victim)
+static void __oom_kill_process(struct task_struct *victim, const char *message)
{
struct task_struct *p;
struct mm_struct *mm;
@@ -859,13 +882,15 @@
* in order to prevent the OOM victim from depleting the memory
* reserves from the user space under its control.
*/
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, victim, PIDTYPE_TGID);
mark_oom_victim(victim);
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
- task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)),
- K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+ pr_err("%s: Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB, UID:%u pgtables:%lukB oom_score_adj:%hd\n",
+ message, task_pid_nr(victim), victim->comm, K(mm->total_vm),
+ K(get_mm_counter(mm, MM_ANONPAGES)),
+ K(get_mm_counter(mm, MM_FILEPAGES)),
+ K(get_mm_counter(mm, MM_SHMEMPAGES)),
+ from_kuid(&init_user_ns, task_uid(victim)),
+ mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
task_unlock(victim);
/*
@@ -897,7 +922,7 @@
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, PIDTYPE_TGID);
+ do_send_sig_info(SIGKILL, SEND_SIG_PRIV, p, PIDTYPE_TGID);
}
rcu_read_unlock();
@@ -913,24 +938,20 @@
* Kill provided task unless it's secured by setting
* oom_score_adj to OOM_SCORE_ADJ_MIN.
*/
-static int oom_kill_memcg_member(struct task_struct *task, void *unused)
+static int oom_kill_memcg_member(struct task_struct *task, void *message)
{
- if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
+ if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN &&
+ !is_global_init(task)) {
get_task_struct(task);
- __oom_kill_process(task);
+ __oom_kill_process(task, message);
}
return 0;
}
static void oom_kill_process(struct oom_control *oc, const char *message)
{
- struct task_struct *p = oc->chosen;
- unsigned int points = oc->chosen_points;
- struct task_struct *victim = p;
- struct task_struct *child;
- struct task_struct *t;
+ struct task_struct *victim = oc->chosen;
struct mem_cgroup *oom_group;
- unsigned int victim_points = 0;
static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -939,49 +960,18 @@
* its children or threads, just give it access to memory reserves
* so it can die quickly
*/
- task_lock(p);
- if (task_will_free_mem(p)) {
- mark_oom_victim(p);
- wake_oom_reaper(p);
- task_unlock(p);
- put_task_struct(p);
+ task_lock(victim);
+ if (task_will_free_mem(victim)) {
+ mark_oom_victim(victim);
+ wake_oom_reaper(victim);
+ task_unlock(victim);
+ put_task_struct(victim);
return;
}
- task_unlock(p);
+ task_unlock(victim);
if (__ratelimit(&oom_rs))
- dump_header(oc, p);
-
- pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
-
- /*
- * If any of p's children has a different mm and is eligible for kill,
- * the one with the highest oom_badness() score is sacrificed for its
- * parent. This attempts to lose the minimal amount of work done while
- * still freeing memory.
- */
- read_lock(&tasklist_lock);
- for_each_thread(p, t) {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (process_shares_mm(child, p->mm))
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child,
- oc->memcg, oc->nodemask, oc->totalpages);
- if (child_points > victim_points) {
- put_task_struct(victim);
- victim = child;
- victim_points = child_points;
- get_task_struct(victim);
- }
- }
- }
- read_unlock(&tasklist_lock);
+ dump_header(oc, victim);
/*
* Do we need to kill the entire memory cgroup?
@@ -990,14 +980,15 @@
*/
oom_group = mem_cgroup_get_oom_group(victim, oc->memcg);
- __oom_kill_process(victim);
+ __oom_kill_process(victim, message);
/*
* If necessary, kill all tasks in the selected memory cgroup.
*/
if (oom_group) {
mem_cgroup_print_oom_group(oom_group);
- mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL);
+ mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member,
+ (void*)message);
mem_cgroup_put(oom_group);
}
}
@@ -1005,8 +996,7 @@
/*
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
-static void check_panic_on_oom(struct oom_control *oc,
- enum oom_constraint constraint)
+static void check_panic_on_oom(struct oom_control *oc)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -1016,7 +1006,7 @@
* does not panic for cpuset, mempolicy, or memcg allocation
* failures.
*/
- if (constraint != CONSTRAINT_NONE)
+ if (oc->constraint != CONSTRAINT_NONE)
return;
}
/* Do not panic for oom kills triggered by sysrq */
@@ -1053,7 +1043,6 @@
bool out_of_memory(struct oom_control *oc)
{
unsigned long freed = 0;
- enum oom_constraint constraint = CONSTRAINT_NONE;
if (oom_killer_disabled)
return false;
@@ -1080,22 +1069,24 @@
* The OOM killer does not compensate for IO-less reclaim.
* pagefault_out_of_memory lost its gfp context so we have to
* make sure exclude 0 mask - all other users should have at least
- * ___GFP_DIRECT_RECLAIM to get here.
+ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to
+ * invoke the OOM killer even if it is a GFP_NOFS allocation.
*/
- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS))
+ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc))
return true;
/*
* Check if there were limitations on the allocation (only relevant for
* NUMA and memcg) that may require different handling.
*/
- constraint = constrained_alloc(oc);
- if (constraint != CONSTRAINT_MEMORY_POLICY)
+ oc->constraint = constrained_alloc(oc);
+ if (oc->constraint != CONSTRAINT_MEMORY_POLICY)
oc->nodemask = NULL;
- check_panic_on_oom(oc, constraint);
+ check_panic_on_oom(oc);
if (!is_memcg_oom(oc) && sysctl_oom_kill_allocating_task &&
- current->mm && !oom_unkillable_task(current, NULL, oc->nodemask) &&
+ current->mm && !oom_unkillable_task(current) &&
+ oom_cpuset_eligible(current, oc) &&
current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
get_task_struct(current);
oc->chosen = current;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 84ae9bf..50055d2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/page-writeback.c
*
@@ -270,7 +271,7 @@
* node_dirtyable_memory - number of dirtyable pages in a node
* @pgdat: the node
*
- * Returns the node's number of pages potentially available for dirty
+ * Return: the node's number of pages potentially available for dirty
* page cache. This is the base value for the per-node dirty limits.
*/
static unsigned long node_dirtyable_memory(struct pglist_data *pgdat)
@@ -355,7 +356,7 @@
/**
* global_dirtyable_memory - number of globally dirtyable pages
*
- * Returns the global number of pages potentially available for dirty
+ * Return: the global number of pages potentially available for dirty
* page cache. This is the base value for the global dirty limits.
*/
static unsigned long global_dirtyable_memory(void)
@@ -470,7 +471,7 @@
* node_dirty_limit - maximum number of dirty pages allowed in a node
* @pgdat: the node
*
- * Returns the maximum number of dirty pages allowed in a node, based
+ * Return: the maximum number of dirty pages allowed in a node, based
* on the node's dirtyable memory.
*/
static unsigned long node_dirty_limit(struct pglist_data *pgdat)
@@ -495,7 +496,7 @@
* node_dirty_ok - tells whether a node is within its dirty limits
* @pgdat: the node to check
*
- * Returns %true when the dirty pages in @pgdat are within the node's
+ * Return: %true when the dirty pages in @pgdat are within the node's
* dirty limit, %false if the limit is exceeded.
*/
bool node_dirty_ok(struct pglist_data *pgdat)
@@ -743,9 +744,6 @@
* __wb_calc_thresh - @wb's share of dirty throttling threshold
* @dtc: dirty_throttle_context of interest
*
- * Returns @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
- *
* Note that balance_dirty_pages() will only seriously take it as a hard limit
* when sleeping max_pause per page is not enough to keep the dirty pages under
* control. For example, when the device is completely stalled due to some error
@@ -759,6 +757,9 @@
*
* The wb's share of dirty limit will be adapting to its throughput and
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ *
+ * Return: @wb's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1666,6 +1667,8 @@
if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb);
+ mem_cgroup_flush_foreign(wb);
+
/*
* Calculate global domain's pos_ratio and select the
* global dtc by default.
@@ -1918,7 +1921,9 @@
* @wb: bdi_writeback of interest
*
* Determines whether background writeback should keep writing @wb or it's
- * clean enough. Returns %true if writeback should continue.
+ * clean enough.
+ *
+ * Return: %true if writeback should continue.
*/
bool wb_over_bg_thresh(struct bdi_writeback *wb)
{
@@ -2097,34 +2102,25 @@
* dirty pages in the file (thus it is important for this function to be quick
* so that it can tag pages faster than a dirtying process can create them).
*/
-/*
- * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce the i_pages lock
- * latency.
- */
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
-#define WRITEBACK_TAG_BATCH 4096
- unsigned long tagged = 0;
- struct radix_tree_iter iter;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
+ unsigned int tagged = 0;
+ void *page;
- xa_lock_irq(&mapping->i_pages);
- radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, start,
- PAGECACHE_TAG_DIRTY) {
- if (iter.index > end)
- break;
- radix_tree_iter_tag_set(&mapping->i_pages, &iter,
- PAGECACHE_TAG_TOWRITE);
- tagged++;
- if ((tagged % WRITEBACK_TAG_BATCH) != 0)
+ xas_lock_irq(&xas);
+ xas_for_each_marked(&xas, page, end, PAGECACHE_TAG_DIRTY) {
+ xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ if (++tagged % XA_CHECK_SCHED)
continue;
- slot = radix_tree_iter_resume(slot, &iter);
- xa_unlock_irq(&mapping->i_pages);
+
+ xas_pause(&xas);
+ xas_unlock_irq(&xas);
cond_resched();
- xa_lock_irq(&mapping->i_pages);
+ xas_lock_irq(&xas);
}
- xa_unlock_irq(&mapping->i_pages);
+ xas_unlock_irq(&xas);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
@@ -2149,6 +2145,15 @@
* not miss some pages (e.g., because some other process has cleared TOWRITE
* tag we set). The rule we follow is that TOWRITE tag can be cleared only
* by the process clearing the DIRTY tag (and submitting the page for IO).
+ *
+ * To avoid deadlocks between range_cyclic writeback and callers that hold
+ * pages in PageWriteback to aggregate IO until write_cache_pages() returns,
+ * we do not loop back to the start of the file. Doing so causes a page
+ * lock/page writeback access order inversion - we should only ever lock
+ * multiple pages in ascending page->index order, and looping back to the start
+ * of the file violates that rule and causes deadlocks.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
@@ -2156,37 +2161,31 @@
{
int ret = 0;
int done = 0;
+ int error;
struct pagevec pvec;
int nr_pages;
pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
- int cycled;
int range_whole = 0;
- int tag;
+ xa_mark_t tag;
pagevec_init(&pvec);
if (wbc->range_cyclic) {
writeback_index = mapping->writeback_index; /* prev offset */
index = writeback_index;
- if (index == 0)
- cycled = 1;
- else
- cycled = 0;
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
end = wbc->range_end >> PAGE_SHIFT;
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
range_whole = 1;
- cycled = 1; /* ignore range_cyclic tests */
}
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
else
tag = PAGECACHE_TAG_DIRTY;
-retry:
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
tag_pages_for_writeback(mapping, index, end);
done_index = index;
@@ -2236,25 +2235,31 @@
goto continue_unlock;
trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
- ret = (*writepage)(page, wbc, data);
- if (unlikely(ret)) {
- if (ret == AOP_WRITEPAGE_ACTIVATE) {
+ error = (*writepage)(page, wbc, data);
+ if (unlikely(error)) {
+ /*
+ * Handle errors according to the type of
+ * writeback. There's no need to continue for
+ * background writeback. Just push done_index
+ * past this page so media errors won't choke
+ * writeout for the entire file. For integrity
+ * writeback, we must process the entire dirty
+ * set regardless of errors because the fs may
+ * still have state to clear for each page. In
+ * that case we continue processing and return
+ * the first error.
+ */
+ if (error == AOP_WRITEPAGE_ACTIVATE) {
unlock_page(page);
- ret = 0;
- } else {
- /*
- * done_index is set past this page,
- * so media errors will not choke
- * background writeout for the entire
- * file. This has consequences for
- * range_cyclic semantics (ie. it may
- * not be suitable for data integrity
- * writeout).
- */
+ error = 0;
+ } else if (wbc->sync_mode != WB_SYNC_ALL) {
+ ret = error;
done_index = page->index + 1;
done = 1;
break;
}
+ if (!ret)
+ ret = error;
}
/*
@@ -2272,17 +2277,14 @@
pagevec_release(&pvec);
cond_resched();
}
- if (!cycled && !done) {
- /*
- * range_cyclic:
- * We hit the last page and there is more work to be done: wrap
- * back to the start of the file
- */
- cycled = 1;
- index = 0;
- end = writeback_index - 1;
- goto retry;
- }
+
+ /*
+ * If we hit the last page and there is more work to be done, wrap
+ * the index back to the start of the file for the next time we
+ * are called.
+ */
+ if (wbc->range_cyclic && !done)
+ done_index = 0;
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
mapping->writeback_index = done_index;
@@ -2310,6 +2312,8 @@
*
* This is a library function, which implements the writepages()
* address_space_operation.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int generic_writepages(struct address_space *mapping,
struct writeback_control *wbc)
@@ -2356,6 +2360,8 @@
*
* Note that the mapping's AS_EIO/AS_ENOSPC flags will be cleared when this
* function returns.
+ *
+ * Return: %0 on success, negative error code otherwise
*/
int write_one_page(struct page *page)
{
@@ -2423,9 +2429,10 @@
task_io_account_write(PAGE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
+
+ mem_cgroup_track_foreign_dirty(page, wb);
}
}
-EXPORT_SYMBOL(account_page_dirtied);
/*
* Helper function for deaccounting dirty page without writeback.
@@ -2445,7 +2452,7 @@
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
- * its radix tree.
+ * the xarray.
*
* This is also used when a single buffer is being dirtied: we want to set the
* page dirty in that case, but not all the buffers. This is a "bottom-up"
@@ -2471,7 +2478,7 @@
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
+ __xa_set_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_DIRTY);
xa_unlock_irqrestore(&mapping->i_pages, flags);
unlock_page_memcg(page);
@@ -2634,13 +2641,13 @@
* Returns true if the page was previously dirty.
*
* This is for preparing to put the page under writeout. We leave the page
- * tagged as dirty in the radix tree so that a concurrent write-for-sync
+ * tagged as dirty in the xarray so that a concurrent write-for-sync
* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
* implementation will run either set_page_writeback() or set_page_dirty(),
- * at which stage we bring the page's dirty flag and radix-tree dirty tag
+ * at which stage we bring the page's dirty flag and xarray dirty tag
* back into sync.
*
- * This incoherency between the page's dirty flag and radix-tree tag is
+ * This incoherency between the page's dirty flag and xarray tag is
* unfortunate, but it only exists while the page is locked.
*/
int clear_page_dirty_for_io(struct page *page)
@@ -2721,7 +2728,7 @@
xa_lock_irqsave(&mapping->i_pages, flags);
ret = TestClearPageWriteback(page);
if (ret) {
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
+ __xa_clear_mark(&mapping->i_pages, page_index(page),
PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi)) {
struct bdi_writeback *wb = inode_to_wb(inode);
@@ -2761,11 +2768,13 @@
lock_page_memcg(page);
if (mapping && mapping_use_writeback_tags(mapping)) {
+ XA_STATE(xas, &mapping->i_pages, page_index(page));
struct inode *inode = mapping->host;
struct backing_dev_info *bdi = inode_to_bdi(inode);
unsigned long flags;
- xa_lock_irqsave(&mapping->i_pages, flags);
+ xas_lock_irqsave(&xas, flags);
+ xas_load(&xas);
ret = TestSetPageWriteback(page);
if (!ret) {
bool on_wblist;
@@ -2773,8 +2782,7 @@
on_wblist = mapping_tagged(mapping,
PAGECACHE_TAG_WRITEBACK);
- radix_tree_tag_set(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_WRITEBACK);
+ xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
if (bdi_cap_account_writeback(bdi))
inc_wb_stat(inode_to_wb(inode), WB_WRITEBACK);
@@ -2787,12 +2795,10 @@
sb_mark_inode_writeback(mapping->host);
}
if (!PageDirty(page))
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_DIRTY);
+ xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
if (!keep_write)
- radix_tree_tag_clear(&mapping->i_pages, page_index(page),
- PAGECACHE_TAG_TOWRITE);
- xa_unlock_irqrestore(&mapping->i_pages, flags);
+ xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
+ xas_unlock_irqrestore(&xas, flags);
} else {
ret = TestSetPageWriteback(page);
}
@@ -2807,14 +2813,16 @@
EXPORT_SYMBOL(__test_set_page_writeback);
/*
- * Return true if any of the pages in the mapping are marked with the
- * passed tag.
+ * Wait for a page to complete writeback
*/
-int mapping_tagged(struct address_space *mapping, int tag)
+void wait_on_page_writeback(struct page *page)
{
- return radix_tree_tagged(&mapping->i_pages, tag);
+ if (PageWriteback(page)) {
+ trace_wait_on_page_writeback(page, page_mapping(page));
+ wait_on_page_bit(page, PG_writeback);
+ }
}
-EXPORT_SYMBOL(mapping_tagged);
+EXPORT_SYMBOL_GPL(wait_on_page_writeback);
/**
* wait_for_stable_page() - wait for writeback to finish, if necessary.
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 93e73cc..f391c0c 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/page_alloc.c
*
@@ -16,11 +17,11 @@
#include <linux/stddef.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
-#include <linux/bootmem.h>
#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
@@ -43,12 +44,12 @@
#include <linux/mempolicy.h>
#include <linux/memremap.h>
#include <linux/stop_machine.h>
+#include <linux/random.h>
#include <linux/sort.h>
#include <linux/pfn.h>
#include <linux/backing-dev.h>
#include <linux/fault-inject.h>
#include <linux/page-isolation.h>
-#include <linux/page_ext.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
#include <linux/compaction.h>
@@ -66,11 +67,13 @@
#include <linux/ftrace.h>
#include <linux/lockdep.h>
#include <linux/nmi.h>
+#include <linux/psi.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include "internal.h"
+#include "shuffle.h"
/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */
static DEFINE_MUTEX(pcp_batch_high_lock);
@@ -96,8 +99,12 @@
#endif
/* work_structs for global per-cpu drains */
+struct pcpu_drain {
+ struct zone *zone;
+ struct work_struct work;
+};
DEFINE_MUTEX(pcpu_drain_mutex);
-DEFINE_PER_CPU(struct work_struct, pcpu_drain);
+DEFINE_PER_CPU(struct pcpu_drain, pcpu_drain);
#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY
volatile unsigned long latent_entropy __latent_entropy;
@@ -121,15 +128,62 @@
};
EXPORT_SYMBOL(node_states);
-/* Protect totalram_pages and zone->managed_pages */
-static DEFINE_SPINLOCK(managed_page_count_lock);
-
-unsigned long totalram_pages __read_mostly;
+atomic_long_t _totalram_pages __read_mostly;
+EXPORT_SYMBOL(_totalram_pages);
unsigned long totalreserve_pages __read_mostly;
unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
+#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_alloc);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_alloc);
+#endif
+EXPORT_SYMBOL(init_on_alloc);
+
+#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
+DEFINE_STATIC_KEY_TRUE(init_on_free);
+#else
+DEFINE_STATIC_KEY_FALSE(init_on_free);
+#endif
+EXPORT_SYMBOL(init_on_free);
+
+/* "init_on_alloc=<bool>" boot option: enable or disable the static key. */
+static int __init early_init_on_alloc(char *buf)
+{
+	bool bool_result;
+
+	/* On a parse error bool_result is unset: bail out, keep the default. */
+	if (!buf || kstrtobool(buf, &bool_result))
+		return -EINVAL;
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
+	if (bool_result)
+		static_branch_enable(&init_on_alloc);
+	else
+		static_branch_disable(&init_on_alloc);
+	return 0;
+}
+early_param("init_on_alloc", early_init_on_alloc);
+
+/* "init_on_free=<bool>" boot option: enable or disable the static key. */
+static int __init early_init_on_free(char *buf)
+{
+	bool bool_result;
+
+	/* On a parse error bool_result is unset: bail out, keep the default. */
+	if (!buf || kstrtobool(buf, &bool_result))
+		return -EINVAL;
+	if (bool_result && page_poisoning_enabled())
+		pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
+	if (bool_result)
+		static_branch_enable(&init_on_free);
+	else
+		static_branch_disable(&init_on_free);
+	return 0;
+}
+early_param("init_on_free", early_init_on_free);
/*
* A cached value of the page's pageblock's migratetype, used when the page is
@@ -218,8 +272,6 @@
[ZONE_MOVABLE] = 0,
};
-EXPORT_SYMBOL(totalram_pages);
-
static char * const zone_names[MAX_NR_ZONES] = {
#ifdef CONFIG_ZONE_DMA
"DMA",
@@ -237,7 +289,7 @@
#endif
};
-char * const migratetype_names[MIGRATE_TYPES] = {
+const char * const migratetype_names[MIGRATE_TYPES] = {
"Unmovable",
"Movable",
"Reclaimable",
@@ -263,20 +315,34 @@
int min_free_kbytes = 1024;
int user_min_free_kbytes = -1;
+#ifdef CONFIG_DISCONTIGMEM
+/*
+ * DiscontigMem defines memory ranges as separate pg_data_t even if the ranges
+ * are not on separate NUMA nodes. Functionally this works but with
+ * watermark_boost_factor, it can reclaim prematurely as the ranges can be
+ * quite small. By default, do not boost watermarks on discontigmem as in
+ * many cases very high-order allocations like THP are likely to be
+ * unsupported and the premature reclaim offsets the advantage of long-term
+ * fragmentation avoidance.
+ */
+int watermark_boost_factor __read_mostly;
+#else
+int watermark_boost_factor __read_mostly = 15000;
+#endif
int watermark_scale_factor = 10;
-static unsigned long nr_kernel_pages __meminitdata;
-static unsigned long nr_all_pages __meminitdata;
-static unsigned long dma_reserve __meminitdata;
+static unsigned long nr_kernel_pages __initdata;
+static unsigned long nr_all_pages __initdata;
+static unsigned long dma_reserve __initdata;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __meminitdata;
-static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __meminitdata;
+static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
+static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
static unsigned long required_kernelcore_percent __initdata;
static unsigned long required_movablecore __initdata;
static unsigned long required_movablecore_percent __initdata;
-static unsigned long zone_movable_pfn[MAX_NUMNODES] __meminitdata;
+static unsigned long zone_movable_pfn[MAX_NUMNODES] __initdata;
static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
@@ -285,8 +351,8 @@
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
-int nr_node_ids __read_mostly = MAX_NUMNODES;
-int nr_online_nodes __read_mostly = 1;
+unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
+unsigned int nr_online_nodes __read_mostly = 1;
EXPORT_SYMBOL(nr_node_ids);
EXPORT_SYMBOL(nr_online_nodes);
#endif
@@ -294,6 +360,32 @@
int page_group_by_mobility_disabled __read_mostly;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
+/*
+ * During boot we initialize deferred pages on-demand, as needed, but once
+ * page_alloc_init_late() has finished, the deferred pages are all initialized,
+ * and we can permanently disable that path.
+ */
+static DEFINE_STATIC_KEY_TRUE(deferred_pages);
+
+/*
+ * Call kasan_free_pages() only after deferred memory initialization
+ * has completed. Poisoning pages during deferred memory init will greatly
+ * lengthen the process and cause problems in large memory systems as the
+ * deferred pages initialization is done with interrupts disabled.
+ *
+ * Assuming that there will be no reference to those newly initialized
+ * pages before they are ever allocated, this should have no effect on
+ * KASAN memory tracking as the poison will be properly inserted at page
+ * allocation time. The only corner case is when pages are allocated by
+ * on-demand allocation and then freed again before the deferred pages
+ * initialization is done, but this is not likely to happen.
+ */
+static inline void kasan_free_nondeferred_pages(struct page *page, int order)
+{
+ if (!static_branch_unlikely(&deferred_pages))
+ kasan_free_pages(page, order);
+}
+
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
@@ -306,36 +398,50 @@
}
/*
- * Returns false when the remaining initialisation should be deferred until
+ * Returns true when the remaining initialisation should be deferred until
* later in the boot cycle when it can be parallelised.
*/
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static bool __meminit
+defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- /* Always populate low zones for address-constrained allocations */
- if (zone_end < pgdat_end_pfn(pgdat))
- return true;
- (*nr_initialised)++;
- if ((*nr_initialised > pgdat->static_init_pgcnt) &&
- (pfn & (PAGES_PER_SECTION - 1)) == 0) {
- pgdat->first_deferred_pfn = pfn;
- return false;
+ static unsigned long prev_end_pfn, nr_initialised;
+
+ /*
+ * prev_end_pfn is a static that holds the end pfn of the previous zone.
+ * No locking is needed: this is called very early in boot, before smp_init.
+ */
+ if (prev_end_pfn != end_pfn) {
+ prev_end_pfn = end_pfn;
+ nr_initialised = 0;
}
- return true;
+ /* Always populate low zones for address-constrained allocations */
+ if (end_pfn < pgdat_end_pfn(NODE_DATA(nid)))
+ return false;
+
+ /*
+ * We start only with one section of pages, more pages are added as
+ * needed until the rest of deferred pages are initialized.
+ */
+ nr_initialised++;
+ if ((nr_initialised > PAGES_PER_SECTION) &&
+ (pfn & (PAGES_PER_SECTION - 1)) == 0) {
+ NODE_DATA(nid)->first_deferred_pfn = pfn;
+ return true;
+ }
+ return false;
}
#else
+#define kasan_free_nondeferred_pages(p, o) kasan_free_pages(p, o)
+
static inline bool early_page_uninitialised(unsigned long pfn)
{
return false;
}
-static inline bool update_defer_init(pg_data_t *pgdat,
- unsigned long pfn, unsigned long zone_end,
- unsigned long *nr_initialised)
+static inline bool defer_init(int nid, unsigned long pfn, unsigned long end_pfn)
{
- return true;
+ return false;
}
#endif
@@ -344,7 +450,7 @@
unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
- return __pfn_to_section(pfn)->pageblock_flags;
+ return section_to_usemap(__pfn_to_section(pfn));
#else
return page_zone(page)->pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
@@ -419,6 +525,7 @@
unsigned long old_word, word;
BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4);
+ BUILD_BUG_ON(MIGRATE_TYPES > (1 << PB_migratetype_bits));
bitmap = get_pageblock_bitmap(page, pfn);
bitidx = pfn_to_bitidx(page, pfn);
@@ -563,6 +670,7 @@
void free_compound_page(struct page *page)
{
+ mem_cgroup_uncharge(page);
__free_pages_ok(page, compound_order(page));
}
@@ -585,31 +693,30 @@
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-bool _debug_pagealloc_enabled __read_mostly
- = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
+DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
+#else
+DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
+#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);
-bool _debug_guardpage_enabled __read_mostly;
+
+DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
- if (!buf)
+ bool enable = false;
+
+ if (kstrtobool(buf, &enable))
return -EINVAL;
- return kstrtobool(buf, &_debug_pagealloc_enabled);
+
+ if (enable)
+ static_branch_enable(&_debug_pagealloc_enabled);
+
+ return 0;
}
early_param("debug_pagealloc", early_debug_pagealloc);
-static bool need_debug_guardpage(void)
-{
- /* If we don't use debug_pagealloc, we don't need guard page */
- if (!debug_pagealloc_enabled())
- return false;
-
- if (!debug_guardpage_minorder())
- return false;
-
- return true;
-}
-
static void init_debug_guardpage(void)
{
if (!debug_pagealloc_enabled())
@@ -618,14 +725,9 @@
if (!debug_guardpage_minorder())
return;
- _debug_guardpage_enabled = true;
+ static_branch_enable(&_debug_guardpage_enabled);
}
-struct page_ext_operations debug_guardpage_ops = {
- .need = need_debug_guardpage,
- .init = init_debug_guardpage,
-};
-
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
@@ -643,20 +745,13 @@
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
- struct page_ext *page_ext;
-
if (!debug_guardpage_enabled())
return false;
if (order >= debug_guardpage_minorder())
return false;
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return false;
-
- __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
-
+ __SetPageGuard(page);
INIT_LIST_HEAD(&page->lru);
set_page_private(page, order);
/* Guard pages are not available for any usage */
@@ -668,23 +763,16 @@
static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype)
{
- struct page_ext *page_ext;
-
if (!debug_guardpage_enabled())
return;
- page_ext = lookup_page_ext(page);
- if (unlikely(!page_ext))
- return;
-
- __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
+ __ClearPageGuard(page);
set_page_private(page, 0);
if (!is_migrate_isolate(migratetype))
__mod_zone_freepage_state(zone, (1 << order), migratetype);
}
#else
-struct page_ext_operations debug_guardpage_ops;
static inline bool set_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) { return false; }
static inline void clear_page_guard(struct zone *zone, struct page *page,
@@ -697,12 +785,6 @@
__SetPageBuddy(page);
}
-static inline void rmv_page_order(struct page *page)
-{
- __ClearPageBuddy(page);
- set_page_private(page, 0);
-}
-
/*
* This function checks whether a page is free && is the buddy
* we can coalesce a page and its buddy if
@@ -744,6 +826,57 @@
return 0;
}
+#ifdef CONFIG_COMPACTION
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ struct capture_control *capc = current->capture_control;
+
+ return capc &&
+ !(current->flags & PF_KTHREAD) &&
+ !capc->page &&
+ capc->cc->zone == zone &&
+ capc->cc->direct_compaction ? capc : NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ if (!capc || order != capc->cc->order)
+ return false;
+
+ /* Do not accidentally pollute CMA or isolated regions */
+ if (is_migrate_cma(migratetype) ||
+ is_migrate_isolate(migratetype))
+ return false;
+
+ /*
+ * Do not let lower order allocations pollute a movable pageblock.
+ * This might let an unmovable request use a reclaimable pageblock
+ * and vice-versa but no more than normal fallback logic which can
+ * have trouble finding a high-order free page.
+ */
+ if (order < pageblock_order && migratetype == MIGRATE_MOVABLE)
+ return false;
+
+ capc->page = page;
+ return true;
+}
+
+#else
+static inline struct capture_control *task_capc(struct zone *zone)
+{
+ return NULL;
+}
+
+static inline bool
+compaction_capture(struct capture_control *capc, struct page *page,
+ int order, int migratetype)
+{
+ return false;
+}
+#endif /* CONFIG_COMPACTION */
+
/*
* Freeing function for a buddy system allocator.
*
@@ -777,6 +910,7 @@
unsigned long uninitialized_var(buddy_pfn);
struct page *buddy;
unsigned int max_order;
+ struct capture_control *capc = task_capc(zone);
max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
@@ -792,6 +926,11 @@
continue_merging:
while (order < max_order - 1) {
+ if (compaction_capture(capc, page, order, migratetype)) {
+ __mod_zone_freepage_state(zone, -(1 << order),
+ migratetype);
+ return;
+ }
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -803,13 +942,10 @@
* Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page,
* merge with it and move up one order.
*/
- if (page_is_guard(buddy)) {
+ if (page_is_guard(buddy))
clear_page_guard(zone, buddy, order, migratetype);
- } else {
- list_del(&buddy->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(buddy);
- }
+ else
+ del_page_from_free_area(buddy, &zone->free_area[order]);
combined_pfn = buddy_pfn & pfn;
page = page + (combined_pfn - pfn);
pfn = combined_pfn;
@@ -851,7 +987,8 @@
* so it's less likely to be used soon and more likely to be merged
* as a higher order page
*/
- if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)) {
+ if ((order < MAX_ORDER-2) && pfn_valid_within(buddy_pfn)
+ && !is_shuffle_order(order)) {
struct page *higher_page, *higher_buddy;
combined_pfn = buddy_pfn & pfn;
higher_page = page + (combined_pfn - pfn);
@@ -859,15 +996,18 @@
higher_buddy = higher_page + (buddy_pfn - combined_pfn);
if (pfn_valid_within(buddy_pfn) &&
page_is_buddy(higher_page, higher_buddy, order + 1)) {
- list_add_tail(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
- goto out;
+ add_to_free_area_tail(page, &zone->free_area[order],
+ migratetype);
+ return;
}
}
- list_add(&page->lru, &zone->free_area[order].free_list[migratetype]);
-out:
- zone->free_area[order].nr_free++;
+ if (is_shuffle_order(order))
+ add_to_free_area_random(page, &zone->free_area[order],
+ migratetype);
+ else
+ add_to_free_area(page, &zone->free_area[order], migratetype);
+
}
/*
@@ -977,6 +1117,14 @@
return ret;
}
+static void kernel_init_free_pages(struct page *page, int numpages)
+{
+ int i;
+
+ for (i = 0; i < numpages; i++)
+ clear_highpage(page + i);
+}
+
static __always_inline bool free_pages_prepare(struct page *page,
unsigned int order, bool check_free)
{
@@ -1011,7 +1159,7 @@
if (PageMappingFlags(page))
page->mapping = NULL;
if (memcg_kmem_enabled() && PageKmemcg(page))
- memcg_kmem_uncharge(page, order);
+ __memcg_kmem_uncharge(page, order);
if (check_free)
bad += free_pages_check(page);
if (bad)
@@ -1027,28 +1175,56 @@
debug_check_no_obj_freed(page_address(page),
PAGE_SIZE << order);
}
- arch_free_page(page, order);
+ if (want_init_on_free())
+ kernel_init_free_pages(page, 1 << order);
+
kernel_poison_pages(page, 1 << order, 0);
- kernel_map_pages(page, 1 << order, 0);
- kasan_free_pages(page, order);
+ /*
+ * arch_free_page() can make the page's contents inaccessible. s390
+ * does this. So nothing which can access the page's contents should
+ * happen after this.
+ */
+ arch_free_page(page, order);
+
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 0);
+
+ kasan_free_nondeferred_pages(page, order);
return true;
}
#ifdef CONFIG_DEBUG_VM
-static inline bool free_pcp_prepare(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked immediately when being freed
+ * to pcp lists. With debug_pagealloc also enabled, they are also rechecked when
+ * moved from pcp lists to free lists.
+ */
+static bool free_pcp_prepare(struct page *page)
{
return free_pages_prepare(page, 0, true);
}
-static inline bool bulkfree_pcp_prepare(struct page *page)
+static bool bulkfree_pcp_prepare(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return free_pages_check(page);
+ else
+ return false;
}
#else
+/*
+ * With DEBUG_VM disabled, order-0 pages being freed are checked only when
+ * moving from pcp lists to free list in order to reduce overhead. With
+ * debug_pagealloc enabled, they are checked also immediately when being freed
+ * to the pcp lists.
+ */
static bool free_pcp_prepare(struct page *page)
{
- return free_pages_prepare(page, 0, false);
+ if (debug_pagealloc_enabled())
+ return free_pages_prepare(page, 0, true);
+ else
+ return free_pages_prepare(page, 0, false);
}
static bool bulkfree_pcp_prepare(struct page *page)
@@ -1176,6 +1352,7 @@
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
+ page_kasan_tag_reset(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
@@ -1231,7 +1408,12 @@
/* Avoid false-positive PageTail() */
INIT_LIST_HEAD(&page->lru);
- SetPageReserved(page);
+ /*
+ * no need for atomic set_bit because the struct
+ * page is not visible yet so nobody should
+ * access it yet.
+ */
+ __SetPageReserved(page);
}
}
}
@@ -1252,7 +1434,7 @@
local_irq_restore(flags);
}
-static void __init __free_pages_boot_core(struct page *page, unsigned int order)
+void __free_pages_core(struct page *page, unsigned int order)
{
unsigned int nr_pages = 1 << order;
struct page *p = page;
@@ -1267,7 +1449,7 @@
__ClearPageReserved(p);
set_page_count(p, 0);
- page_zone(page)->managed_pages += nr_pages;
+ atomic_long_add(nr_pages, &page_zone(page)->managed_pages);
set_page_refcounted(page);
__free_pages(page, order);
}
@@ -1293,45 +1475,31 @@
#endif
#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
+/* Only safe to use early in boot when initialisation is single-threaded */
+static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
int nid;
- nid = __early_pfn_to_nid(pfn, state);
+ nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
if (nid >= 0 && nid != node)
return false;
return true;
}
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return meminit_pfn_in_nid(pfn, node, &early_pfnnid_cache);
-}
-
#else
-
static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
{
return true;
}
-static inline bool __meminit __maybe_unused
-meminit_pfn_in_nid(unsigned long pfn, int node,
- struct mminit_pfnnid_cache *state)
-{
- return true;
-}
#endif
-void __init __free_pages_bootmem(struct page *page, unsigned long pfn,
+void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
{
if (early_page_uninitialised(pfn))
return;
- return __free_pages_boot_core(page, order);
+ __free_pages_core(page, order);
}
/*
@@ -1421,14 +1589,14 @@
if (nr_pages == pageblock_nr_pages &&
(pfn & (pageblock_nr_pages - 1)) == 0) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, pageblock_order);
+ __free_pages_core(page, pageblock_order);
return;
}
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if ((pfn & (pageblock_nr_pages - 1)) == 0)
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
- __free_pages_boot_core(page, 0);
+ __free_pages_core(page, 0);
}
}
@@ -1451,21 +1619,13 @@
*
* Then, we check if a current large page is valid by only checking the validity
* of the head pfn.
- *
- * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave
- * within a node: a pfn is between start and end of a node, but does not belong
- * to this memory node.
*/
-static inline bool __init
-deferred_pfn_valid(int nid, unsigned long pfn,
- struct mminit_pfnnid_cache *nid_init_state)
+static inline bool __init deferred_pfn_valid(unsigned long pfn)
{
if (!pfn_valid_within(pfn))
return false;
if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn))
return false;
- if (!meminit_pfn_in_nid(pfn, nid, nid_init_state))
- return false;
return true;
}
@@ -1473,15 +1633,14 @@
* Free pages to buddy allocator. Try to free aligned pages in
* pageblock_nr_pages sizes.
*/
-static void __init deferred_free_pages(int nid, int zid, unsigned long pfn,
+static void __init deferred_free_pages(unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
unsigned long nr_free = 0;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 0;
} else if (!(pfn & nr_pgmask)) {
@@ -1501,17 +1660,18 @@
* by performing it only once every pageblock_nr_pages.
* Return number of pages initialized.
*/
-static unsigned long __init deferred_init_pages(int nid, int zid,
+static unsigned long __init deferred_init_pages(struct zone *zone,
unsigned long pfn,
unsigned long end_pfn)
{
- struct mminit_pfnnid_cache nid_init_state = { };
unsigned long nr_pgmask = pageblock_nr_pages - 1;
+ int nid = zone_to_nid(zone);
unsigned long nr_pages = 0;
+ int zid = zone_idx(zone);
struct page *page = NULL;
for (; pfn < end_pfn; pfn++) {
- if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) {
+ if (!deferred_pfn_valid(pfn)) {
page = NULL;
continue;
} else if (!page || !(pfn & nr_pgmask)) {
@@ -1526,18 +1686,100 @@
return (nr_pages);
}
+/*
+ * This function is meant to pre-load the iterator for the zone init.
+ * Specifically it walks through the ranges until we are caught up to the
+ * first_init_pfn value and exits there. If we never encounter the value we
+ * return false indicating there are no valid ranges left.
+ */
+static bool __init
+deferred_init_mem_pfn_range_in_zone(u64 *i, struct zone *zone,
+ unsigned long *spfn, unsigned long *epfn,
+ unsigned long first_init_pfn)
+{
+ u64 j;
+
+ /*
+ * Start out by walking through the ranges in this zone that have
+ * already been initialized. We don't need to do anything with them
+ * so we just need to flush them out of the system.
+ */
+ for_each_free_mem_pfn_range_in_zone(j, zone, spfn, epfn) {
+ if (*epfn <= first_init_pfn)
+ continue;
+ if (*spfn < first_init_pfn)
+ *spfn = first_init_pfn;
+ *i = j;
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Initialize and free pages. We do it in two loops: first we initialize
+ * struct page, then free to buddy allocator, because while we are
+ * freeing pages we can access pages that are ahead (computing buddy
+ * page in __free_one_page()).
+ *
+ * In order to try and keep some memory in the cache we have the loop
+ * broken along max page order boundaries. This way we will not cause
+ * any issues with the buddy page computation.
+ */
+static unsigned long __init
+deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
+ unsigned long *end_pfn)
+{
+ unsigned long mo_pfn = ALIGN(*start_pfn + 1, MAX_ORDER_NR_PAGES);
+ unsigned long spfn = *start_pfn, epfn = *end_pfn;
+ unsigned long nr_pages = 0;
+ u64 j = *i;
+
+ /* First we loop through and initialize the page values */
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, start_pfn, end_pfn) {
+ unsigned long t;
+
+ if (mo_pfn <= *start_pfn)
+ break;
+
+ t = min(mo_pfn, *end_pfn);
+ nr_pages += deferred_init_pages(zone, *start_pfn, t);
+
+ if (mo_pfn < *end_pfn) {
+ *start_pfn = mo_pfn;
+ break;
+ }
+ }
+
+ /* Reset values and now loop through freeing pages as needed */
+ swap(j, *i);
+
+ for_each_free_mem_pfn_range_in_zone_from(j, zone, &spfn, &epfn) {
+ unsigned long t;
+
+ if (mo_pfn <= spfn)
+ break;
+
+ t = min(mo_pfn, epfn);
+ deferred_free_pages(spfn, t);
+
+ if (mo_pfn <= epfn)
+ break;
+ }
+
+ return nr_pages;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
- int nid = pgdat->node_id;
- unsigned long start = jiffies;
- unsigned long nr_pages = 0;
- unsigned long spfn, epfn, first_init_pfn, flags;
- phys_addr_t spa, epa;
- int zid;
- struct zone *zone;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
+ unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long first_init_pfn, flags;
+ unsigned long start = jiffies;
+ struct zone *zone;
+ int zid;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1563,44 +1805,33 @@
if (first_init_pfn < zone_end_pfn(zone))
break;
}
- first_init_pfn = max(zone->zone_start_pfn, first_init_pfn);
+
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_init_pfn))
+ goto zone_empty;
/*
- * Initialize and free pages. We do it in two loops: first we initialize
- * struct page, than free to buddy allocator, because while we are
- * freeing pages we can access pages that are ahead (computing buddy
- * page in __free_one_page()).
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
*/
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- nr_pages += deferred_init_pages(nid, zid, spfn, epfn);
- }
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
- }
+ while (spfn < epfn)
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+zone_empty:
pgdat_resize_unlock(pgdat, &flags);
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
- jiffies_to_msecs(jiffies - start));
+ pr_info("node %d initialised, %lu pages in %ums\n",
+ pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
}
/*
- * During boot we initialize deferred pages on-demand, as needed, but once
- * page_alloc_init_late() has finished, the deferred pages are all initialized,
- * and we can permanently disable that path.
- */
-static DEFINE_STATIC_KEY_TRUE(deferred_pages);
-
-/*
* If this zone has deferred pages, try to grow it by initializing enough
* deferred pages to satisfy the allocation specified by order, rounded up to
* the nearest PAGES_PER_SECTION boundary. So we're adding memory in increments
@@ -1618,14 +1849,11 @@
static noinline bool __init
deferred_grow_zone(struct zone *zone, unsigned int order)
{
- int zid = zone_idx(zone);
- int nid = zone_to_nid(zone);
- pg_data_t *pgdat = NODE_DATA(nid);
unsigned long nr_pages_needed = ALIGN(1 << order, PAGES_PER_SECTION);
- unsigned long nr_pages = 0;
- unsigned long first_init_pfn, spfn, epfn, t, flags;
+ pg_data_t *pgdat = zone->zone_pgdat;
unsigned long first_deferred_pfn = pgdat->first_deferred_pfn;
- phys_addr_t spa, epa;
+ unsigned long spfn, epfn, flags;
+ unsigned long nr_pages = 0;
u64 i;
/* Only the last zone may have deferred pages */
@@ -1654,38 +1882,36 @@
return true;
}
- first_init_pfn = max(zone->zone_start_pfn, first_deferred_pfn);
-
- if (first_init_pfn >= pgdat_end_pfn(pgdat)) {
+ /* If the zone is empty somebody else may have cleared out the zone */
+ if (!deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ first_deferred_pfn)) {
+ pgdat->first_deferred_pfn = ULONG_MAX;
pgdat_resize_unlock(pgdat, &flags);
- return false;
+ /* Retry only once. */
+ return first_deferred_pfn != ULONG_MAX;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa));
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so
+ * that we can avoid introducing any issues with the buddy
+ * allocator.
+ */
+ while (spfn < epfn) {
+ /* update our first deferred PFN for this section */
+ first_deferred_pfn = spfn;
- while (spfn < epfn && nr_pages < nr_pages_needed) {
- t = ALIGN(spfn + PAGES_PER_SECTION, PAGES_PER_SECTION);
- first_deferred_pfn = min(t, epfn);
- nr_pages += deferred_init_pages(nid, zid, spfn,
- first_deferred_pfn);
- spfn = first_deferred_pfn;
- }
+ nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ /* We should only stop along section boundaries */
+ if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
+ continue;
+
+ /* If our quota has been met we can stop here */
if (nr_pages >= nr_pages_needed)
break;
}
- for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) {
- spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa));
- epfn = min_t(unsigned long, first_deferred_pfn, PFN_DOWN(epa));
- deferred_free_pages(nid, zid, spfn, epfn);
-
- if (first_deferred_pfn == epfn)
- break;
- }
- pgdat->first_deferred_pfn = first_deferred_pfn;
+ pgdat->first_deferred_pfn = spfn;
pgdat_resize_unlock(pgdat, &flags);
return nr_pages > 0;
@@ -1708,9 +1934,9 @@
void __init page_alloc_init_late(void)
{
struct zone *zone;
+ int nid;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
- int nid;
/* There will be num_node_state(N_MEMORY) threads */
atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
@@ -1722,6 +1948,14 @@
wait_for_completion(&pgdat_init_all_done_comp);
/*
+ * The number of managed pages has changed due to the initialisation
+ * so the pcpu batch and high limits need to be updated or the limits
+ * will be artificially small.
+ */
+ for_each_populated_zone(zone)
+ zone_pcp_update(zone);
+
+ /*
* We initialized the rest of the deferred pages. Permanently disable
* on-demand struct page initialization.
*/
@@ -1730,13 +1964,19 @@
/* Reinit limits that are based on free pages after the kernel is up */
files_maxfiles_init();
#endif
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+
/* Discard memblock private memory */
memblock_discard();
-#endif
+
+ for_each_node_state(nid, N_MEMORY)
+ shuffle_free_memory(NODE_DATA(nid));
for_each_populated_zone(zone)
set_zone_contiguous(zone);
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ init_debug_guardpage();
+#endif
}
#ifdef CONFIG_CMA
@@ -1805,8 +2045,7 @@
if (set_page_guard(zone, &page[size], high, migratetype))
continue;
- list_add(&page[size].lru, &area->free_list[migratetype]);
- area->nr_free++;
+ add_to_free_area(&page[size], area, migratetype);
set_page_order(&page[size], high);
}
}
@@ -1821,7 +2060,7 @@
if (unlikely(page->mapping != NULL))
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _count";
+ bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
bad_reason = "HWPoisoned (hardware-corrupted)";
bad_flags = __PG_HWPOISON;
@@ -1855,28 +2094,44 @@
static inline bool free_pages_prezeroed(void)
{
- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled();
+ return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
+ page_poisoning_enabled()) || want_init_on_free();
}
#ifdef CONFIG_DEBUG_VM
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM enabled, order-0 pages are checked for expected state when
+ * being allocated from pcp lists. With debug_pagealloc also enabled, they are
+ * also checked when pcp lists are refilled from the free lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return check_new_page(page);
+ else
+ return false;
}
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
{
return check_new_page(page);
}
#else
-static bool check_pcp_refill(struct page *page)
+/*
+ * With DEBUG_VM disabled, free order-0 pages are checked for expected state
+ * when pcp lists are being refilled from the free lists. With debug_pagealloc
+ * enabled, they are also checked when being allocated from the pcp lists.
+ */
+static inline bool check_pcp_refill(struct page *page)
{
return check_new_page(page);
}
-static bool check_new_pcp(struct page *page)
+static inline bool check_new_pcp(struct page *page)
{
- return false;
+ if (debug_pagealloc_enabled())
+ return check_new_page(page);
+ else
+ return false;
}
#endif /* CONFIG_DEBUG_VM */
@@ -1900,22 +2155,20 @@
set_page_refcounted(page);
arch_alloc_page(page, order);
- kernel_map_pages(page, 1 << order, 1);
- kernel_poison_pages(page, 1 << order, 1);
+ if (debug_pagealloc_enabled())
+ kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
+ kernel_poison_pages(page, 1 << order, 1);
set_page_owner(page, order, gfp_flags);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
unsigned int alloc_flags)
{
- int i;
-
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO))
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
+ if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
+ kernel_init_free_pages(page, 1 << order);
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -1947,13 +2200,10 @@
/* Find a page of the appropriate size in the preferred list */
for (current_order = order; current_order < MAX_ORDER; ++current_order) {
area = &(zone->free_area[current_order]);
- page = list_first_entry_or_null(&area->free_list[migratetype],
- struct page, lru);
+ page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- list_del(&page->lru);
- rmv_page_order(page);
- area->nr_free--;
+ del_page_from_free_area(page, area);
expand(zone, page, order, current_order, area, migratetype);
set_pcppage_migratetype(page, migratetype);
return page;
@@ -1969,8 +2219,8 @@
*/
static int fallbacks[MIGRATE_TYPES][4] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
- [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
+ [MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
#ifdef CONFIG_CMA
[MIGRATE_CMA] = { MIGRATE_TYPES }, /* Never used */
#endif
@@ -2003,31 +2253,12 @@
unsigned int order;
int pages_moved = 0;
-#ifndef CONFIG_HOLES_IN_ZONE
- /*
- * page_zone is not safe to call in this context when
- * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant
- * anyway as we check zone boundaries in move_freepages_block().
- * Remove at a later date when no bug reports exist related to
- * grouping pages by mobility
- */
- VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) &&
- pfn_valid(page_to_pfn(end_page)) &&
- page_zone(start_page) != page_zone(end_page));
-#endif
-
- if (num_movable)
- *num_movable = 0;
-
for (page = start_page; page <= end_page;) {
if (!pfn_valid_within(page_to_pfn(page))) {
page++;
continue;
}
- /* Make sure we are not inadvertently changing nodes */
- VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
-
if (!PageBuddy(page)) {
/*
* We assume that pages that could be isolated for
@@ -2042,9 +2273,12 @@
continue;
}
+ /* Make sure we are not inadvertently changing nodes */
+ VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page);
+ VM_BUG_ON_PAGE(page_zone(page) != zone, page);
+
order = page_order(page);
- list_move(&page->lru,
- &zone->free_area[order].free_list[migratetype]);
+ move_to_free_area(page, &zone->free_area[order], migratetype);
page += 1 << order;
pages_moved += 1 << order;
}
@@ -2058,6 +2292,9 @@
unsigned long start_pfn, end_pfn;
struct page *start_page, *end_page;
+ if (num_movable)
+ *num_movable = 0;
+
start_pfn = page_to_pfn(page);
start_pfn = start_pfn & ~(pageblock_nr_pages-1);
start_page = pfn_to_page(start_pfn);
@@ -2118,6 +2355,33 @@
return false;
}
+static inline void boost_watermark(struct zone *zone)
+{
+ unsigned long max_boost;
+
+ if (!watermark_boost_factor)
+ return;
+
+ max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
+ watermark_boost_factor, 10000);
+
+ /*
+ * high watermark may be uninitialised if fragmentation occurs
+ * very early in boot so do not boost. We do not fall
+ * through and boost by pageblock_nr_pages as failing
+ * allocations that early means that reclaim is not going
+ * to help and it may even be impossible to reclaim the
+ * boosted watermark resulting in a hang.
+ */
+ if (!max_boost)
+ return;
+
+ max_boost = max(pageblock_nr_pages, max_boost);
+
+ zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
+ max_boost);
+}
+
/*
* This function implements actual steal behaviour. If order is large enough,
* we can steal whole pageblock. If not, we first move freepages in this
@@ -2127,7 +2391,7 @@
* itself, so pages freed in the future will be put on the correct free list.
*/
static void steal_suitable_fallback(struct zone *zone, struct page *page,
- int start_type, bool whole_block)
+ unsigned int alloc_flags, int start_type, bool whole_block)
{
unsigned int current_order = page_order(page);
struct free_area *area;
@@ -2149,6 +2413,15 @@
goto single_page;
}
+ /*
+ * Boost watermarks to increase reclaim pressure to reduce the
+ * likelihood of future fallbacks. Wake kswapd now as the node
+ * may be balanced overall and kswapd will not wake naturally.
+ */
+ boost_watermark(zone);
+ if (alloc_flags & ALLOC_KSWAPD)
+ set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+
/* We are not allowed to try stealing from the whole block */
if (!whole_block)
goto single_page;
@@ -2193,7 +2466,7 @@
single_page:
area = &zone->free_area[current_order];
- list_move(&page->lru, &area->free_list[start_type]);
+ move_to_free_area(page, area, start_type);
}
/*
@@ -2217,7 +2490,7 @@
if (fallback_mt == MIGRATE_TYPES)
break;
- if (list_empty(&area->free_list[fallback_mt]))
+ if (free_area_empty(area, fallback_mt))
continue;
if (can_steal_fallback(order, migratetype))
@@ -2247,7 +2520,7 @@
* Limit the number reserved to 1 pageblock or roughly 1% of a zone.
* Check is race-prone but harmless.
*/
- max_managed = (zone->managed_pages / 100) + pageblock_nr_pages;
+ max_managed = (zone_managed_pages(zone) / 100) + pageblock_nr_pages;
if (zone->nr_reserved_highatomic >= max_managed)
return;
@@ -2304,9 +2577,7 @@
for (order = 0; order < MAX_ORDER; order++) {
struct free_area *area = &(zone->free_area[order]);
- page = list_first_entry_or_null(
- &area->free_list[MIGRATE_HIGHATOMIC],
- struct page, lru);
+ page = get_page_from_free_area(area, MIGRATE_HIGHATOMIC);
if (!page)
continue;
@@ -2364,20 +2635,30 @@
* condition simpler.
*/
static __always_inline bool
-__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
+__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
+ unsigned int alloc_flags)
{
struct free_area *area;
int current_order;
+ int min_order = order;
struct page *page;
int fallback_mt;
bool can_steal;
/*
+ * Do not steal pages from freelists belonging to other pageblocks
+ * i.e. orders < pageblock_order. If there are no local zones free,
+ * the zonelists will be reiterated without ALLOC_NOFRAGMENT.
+ */
+ if (alloc_flags & ALLOC_NOFRAGMENT)
+ min_order = pageblock_order;
+
+ /*
* Find the largest available free page in the other list. This roughly
* approximates finding the pageblock with the most free pages, which
* would be too costly to do exactly.
*/
- for (current_order = MAX_ORDER - 1; current_order >= order;
+ for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {
area = &(zone->free_area[current_order]);
fallback_mt = find_suitable_fallback(area, current_order,
@@ -2419,10 +2700,10 @@
VM_BUG_ON(current_order == MAX_ORDER);
do_steal:
- page = list_first_entry(&area->free_list[fallback_mt],
- struct page, lru);
+ page = get_page_from_free_area(area, fallback_mt);
- steal_suitable_fallback(zone, page, start_migratetype, can_steal);
+ steal_suitable_fallback(zone, page, alloc_flags, start_migratetype,
+ can_steal);
trace_mm_page_alloc_extfrag(page, order, current_order,
start_migratetype, fallback_mt);
@@ -2436,7 +2717,8 @@
* Call me with the zone->lock already held.
*/
static __always_inline struct page *
-__rmqueue(struct zone *zone, unsigned int order, int migratetype)
+__rmqueue(struct zone *zone, unsigned int order, int migratetype,
+ unsigned int alloc_flags)
{
struct page *page;
@@ -2446,7 +2728,8 @@
if (migratetype == MIGRATE_MOVABLE)
page = __rmqueue_cma_fallback(zone, order);
- if (!page && __rmqueue_fallback(zone, order, migratetype))
+ if (!page && __rmqueue_fallback(zone, order, migratetype,
+ alloc_flags))
goto retry;
}
@@ -2461,13 +2744,14 @@
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype)
+ int migratetype, unsigned int alloc_flags)
{
int i, alloced = 0;
spin_lock(&zone->lock);
for (i = 0; i < count; ++i) {
- struct page *page = __rmqueue(zone, order, migratetype);
+ struct page *page = __rmqueue(zone, order, migratetype,
+ alloc_flags);
if (unlikely(page == NULL))
break;
@@ -2581,6 +2865,10 @@
static void drain_local_pages_wq(struct work_struct *work)
{
+ struct pcpu_drain *drain;
+
+ drain = container_of(work, struct pcpu_drain, work);
+
/*
* drain_all_pages doesn't use proper cpu hotplug protection so
* we can race with cpu offline when the WQ can move this from
@@ -2589,7 +2877,7 @@
* a different one.
*/
preempt_disable();
- drain_local_pages(NULL);
+ drain_local_pages(drain->zone);
preempt_enable();
}
@@ -2660,12 +2948,14 @@
}
for_each_cpu(cpu, &cpus_with_pcps) {
- struct work_struct *work = per_cpu_ptr(&pcpu_drain, cpu);
- INIT_WORK(work, drain_local_pages_wq);
- queue_work_on(cpu, mm_percpu_wq, work);
+ struct pcpu_drain *drain = per_cpu_ptr(&pcpu_drain, cpu);
+
+ drain->zone = zone;
+ INIT_WORK(&drain->work, drain_local_pages_wq);
+ queue_work_on(cpu, mm_percpu_wq, &drain->work);
}
for_each_cpu(cpu, &cpus_with_pcps)
- flush_work(per_cpu_ptr(&pcpu_drain, cpu));
+ flush_work(&per_cpu_ptr(&pcpu_drain, cpu)->work);
mutex_unlock(&pcpu_drain_mutex);
}
@@ -2847,6 +3137,7 @@
int __isolate_free_page(struct page *page, unsigned int order)
{
+ struct free_area *area = &page_zone(page)->free_area[order];
unsigned long watermark;
struct zone *zone;
int mt;
@@ -2863,7 +3154,7 @@
* watermark, because we already know our high-order page
* exists.
*/
- watermark = min_wmark_pages(zone) + (1UL << order);
+ watermark = zone->_watermark[WMARK_MIN] + (1UL << order);
if (!zone_watermark_ok(zone, 0, watermark, 0, ALLOC_CMA))
return 0;
@@ -2871,9 +3162,8 @@
}
/* Remove page from free list */
- list_del(&page->lru);
- zone->free_area[order].nr_free--;
- rmv_page_order(page);
+
+ del_page_from_free_area(page, area);
/*
* Set the pageblock if the isolated page is at least half of a
@@ -2923,6 +3213,7 @@
/* Remove page from the per-cpu list, caller must protect the list */
static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
+ unsigned int alloc_flags,
struct per_cpu_pages *pcp,
struct list_head *list)
{
@@ -2932,7 +3223,7 @@
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
pcp->batch, list,
- migratetype);
+ migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
}
@@ -2947,8 +3238,8 @@
/* Lock and remove page from the per-cpu list */
static struct page *rmqueue_pcplist(struct zone *preferred_zone,
- struct zone *zone, unsigned int order,
- gfp_t gfp_flags, int migratetype)
+ struct zone *zone, gfp_t gfp_flags,
+ int migratetype, unsigned int alloc_flags)
{
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -2958,9 +3249,9 @@
local_irq_save(flags);
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list = &pcp->lists[migratetype];
- page = __rmqueue_pcplist(zone, migratetype, pcp, list);
+ page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list);
if (page) {
- __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order);
+ __count_zid_vm_events(PGALLOC, page_zonenum(page), 1);
zone_statistics(preferred_zone, zone);
}
local_irq_restore(flags);
@@ -2980,8 +3271,8 @@
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, order,
- gfp_flags, migratetype);
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ migratetype, alloc_flags);
goto out;
}
@@ -3000,7 +3291,7 @@
trace_mm_page_alloc_zone_locked(page, order, migratetype);
}
if (!page)
- page = __rmqueue(zone, order, migratetype);
+ page = __rmqueue(zone, order, migratetype, alloc_flags);
} while (page && check_new_pages(page, order));
spin_unlock(&zone->lock);
if (!page)
@@ -3013,6 +3304,12 @@
local_irq_restore(flags);
out:
+ /* Separate test+clear to avoid unnecessary atomics */
+ if (test_bit(ZONE_BOOSTED_WATERMARK, &zone->flags)) {
+ clear_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
+ wakeup_kswapd(zone, 0, 0, zone_idx(zone));
+ }
+
VM_BUG_ON_PAGE(page && bad_range(zone, page), page);
return page;
@@ -3042,7 +3339,7 @@
}
__setup("fail_page_alloc=", setup_fail_page_alloc);
-static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
if (order < fail_page_alloc.min_order)
return false;
@@ -3066,24 +3363,14 @@
dir = fault_create_debugfs_attr("fail_page_alloc", NULL,
&fail_page_alloc.attr);
- if (IS_ERR(dir))
- return PTR_ERR(dir);
- if (!debugfs_create_bool("ignore-gfp-wait", mode, dir,
- &fail_page_alloc.ignore_gfp_reclaim))
- goto fail;
- if (!debugfs_create_bool("ignore-gfp-highmem", mode, dir,
- &fail_page_alloc.ignore_gfp_highmem))
- goto fail;
- if (!debugfs_create_u32("min-order", mode, dir,
- &fail_page_alloc.min_order))
- goto fail;
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_reclaim);
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+ debugfs_create_u32("min-order", mode, dir, &fail_page_alloc.min_order);
return 0;
-fail:
- debugfs_remove_recursive(dir);
-
- return -ENOMEM;
}
late_initcall(fail_page_alloc_debugfs);
@@ -3092,13 +3379,19 @@
#else /* CONFIG_FAIL_PAGE_ALLOC */
-static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+static inline bool __should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return false;
}
#endif /* CONFIG_FAIL_PAGE_ALLOC */
+static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return __should_fail_alloc_page(gfp_mask, order);
+}
+ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
@@ -3167,13 +3460,13 @@
continue;
for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) {
- if (!list_empty(&area->free_list[mt]))
+ if (!free_area_empty(area, mt))
return true;
}
#ifdef CONFIG_CMA
if ((alloc_flags & ALLOC_CMA) &&
- !list_empty(&area->free_list[MIGRATE_CMA])) {
+ !free_area_empty(area, MIGRATE_CMA)) {
return true;
}
#endif
@@ -3233,7 +3526,7 @@
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
{
return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <=
- RECLAIM_DISTANCE;
+ node_reclaim_distance;
}
#else /* CONFIG_NUMA */
static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
@@ -3243,6 +3536,43 @@
#endif /* CONFIG_NUMA */
/*
+ * The restriction on ZONE_DMA32 as being a suitable zone to use to avoid
+ * fragmentation is subtle. If the preferred zone was HIGHMEM then
+ * premature use of a lower zone may cause lowmem pressure problems that
+ * are worse than fragmentation. If the next zone is ZONE_DMA then it is
+ * probably too small. It only makes sense to spread allocations to avoid
+ * fragmentation between the Normal and DMA32 zones.
+ */
+static inline unsigned int
+alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
+{
+ unsigned int alloc_flags = 0;
+
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
+#ifdef CONFIG_ZONE_DMA32
+ if (!zone)
+ return alloc_flags;
+
+ if (zone_idx(zone) != ZONE_NORMAL)
+ return alloc_flags;
+
+ /*
+ * If ZONE_DMA32 exists, assume it is the one after ZONE_NORMAL and
+ * the pointer is within zone->zone_pgdat->node_zones[]. Also assume
+ * on UMA that if Normal is populated then so is DMA32.
+ */
+ BUILD_BUG_ON(ZONE_NORMAL - ZONE_DMA32 != 1);
+ if (nr_online_nodes > 1 && !populated_zone(--zone))
+ return alloc_flags;
+
+ alloc_flags |= ALLOC_NOFRAGMENT;
+#endif /* CONFIG_ZONE_DMA32 */
+ return alloc_flags;
+}
+
+/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
@@ -3250,14 +3580,18 @@
get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
const struct alloc_context *ac)
{
- struct zoneref *z = ac->preferred_zoneref;
+ struct zoneref *z;
struct zone *zone;
struct pglist_data *last_pgdat_dirty_limit = NULL;
+ bool no_fallback;
+retry:
/*
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
+ no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
+ z = ac->preferred_zoneref;
for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
ac->nodemask) {
struct page *page;
@@ -3296,7 +3630,23 @@
}
}
- mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
+ if (no_fallback && nr_online_nodes > 1 &&
+ zone != ac->preferred_zoneref->zone) {
+ int local_nid;
+
+ /*
+ * If moving to a remote node, retry but allow
+ * fragmenting fallbacks. Locality is more important
+ * than fragmentation avoidance.
+ */
+ local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ if (zone_to_nid(zone) != local_nid) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+ }
+
+ mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
ac_classzone_idx(ac), alloc_flags)) {
int ret;
@@ -3363,30 +3713,21 @@
}
}
+ /*
+ * It's possible on a UMA machine to get through all zones that are
+ * fragmented. If avoiding fragmentation, reset and try again.
+ */
+ if (no_fallback) {
+ alloc_flags &= ~ALLOC_NOFRAGMENT;
+ goto retry;
+ }
+
return NULL;
}
-/*
- * Large machines with many possible nodes should not always dump per-node
- * meminfo in irq context.
- */
-static inline bool should_suppress_show_mem(void)
-{
- bool ret = false;
-
-#if NODES_SHIFT > 8
- ret = in_interrupt();
-#endif
- return ret;
-}
-
static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask)
{
unsigned int filter = SHOW_MEM_FILTER_NODES;
- static DEFINE_RATELIMIT_STATE(show_mem_rs, HZ, 1);
-
- if (should_suppress_show_mem() || !__ratelimit(&show_mem_rs))
- return;
/*
* This documents exceptions given to allocations in certain
@@ -3407,8 +3748,7 @@
{
struct va_format vaf;
va_list args;
- static DEFINE_RATELIMIT_STATE(nopage_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
+ static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
return;
@@ -3416,13 +3756,13 @@
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
- pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n",
+ pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl",
current->comm, &vaf, gfp_mask, &gfp_mask,
nodemask_pr_args(nodemask));
va_end(args);
cpuset_print_current_mems_allowed();
-
+ pr_cont("\n");
dump_stack();
warn_alloc_show_mem(gfp_mask, nodemask);
}
@@ -3548,19 +3888,21 @@
unsigned int alloc_flags, const struct alloc_context *ac,
enum compact_priority prio, enum compact_result *compact_result)
{
- struct page *page;
+ struct page *page = NULL;
+ unsigned long pflags;
unsigned int noreclaim_flag;
if (!order)
return NULL;
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
- *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
- prio);
- memalloc_noreclaim_restore(noreclaim_flag);
- if (*compact_result <= COMPACT_INACTIVE)
- return NULL;
+ *compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ prio, &page);
+
+ memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
/*
* At least in one zone compaction wasn't deferred or skipped, so let's
@@ -3568,7 +3910,13 @@
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+ /* Prep a captured page if available */
+ if (page)
+ prep_new_page(page, order, gfp_mask, alloc_flags);
+
+ /* Try to get a page from the freelist if available */
+ if (!page)
+ page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -3617,17 +3965,25 @@
goto check_priority;
/*
- * make sure the compaction wasn't deferred or didn't bail out early
- * due to locks contention before we declare that we should give up.
- * But do not retry if the given zonelist is not suitable for
- * compaction.
+ * compaction was skipped because there are not enough order-0 pages
+ * to work with, so we retry only if it looks like reclaim can help.
*/
- if (compaction_withdrawn(compact_result)) {
+ if (compaction_needs_reclaim(compact_result)) {
ret = compaction_zonelist_suitable(ac, order, alloc_flags);
goto out;
}
/*
+ * Make sure the compaction wasn't deferred or didn't bail out early
+ * due to lock contention before we declare that we should give up.
+ * But the next retry should use a higher priority if allowed, so
+ * we don't just keep bailing out endlessly.
+ */
+ if (compaction_withdrawn(compact_result)) {
+ goto check_priority;
+ }
+
+ /*
* !costly requests are much more important than __GFP_RETRY_MAYFAIL
* costly ones because they are de facto nofail and invoke OOM
* killer to move on while costly can fail and users are ready
@@ -3753,25 +4109,24 @@
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
const struct alloc_context *ac)
{
- struct reclaim_state reclaim_state;
int progress;
unsigned int noreclaim_flag;
+ unsigned long pflags;
cond_resched();
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
+ psi_memstall_enter(&pflags);
fs_reclaim_acquire(gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- current->reclaim_state = &reclaim_state;
progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
ac->nodemask);
- current->reclaim_state = NULL;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(gfp_mask);
+ psi_memstall_leave(&pflags);
cond_resched();
@@ -3856,6 +4211,9 @@
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
+ if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ alloc_flags |= ALLOC_KSWAPD;
+
#ifdef CONFIG_CMA
if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
@@ -3922,6 +4280,7 @@
{
struct zone *zone;
struct zoneref *z;
+ bool ret = false;
/*
* Costly allocations might have made a progress but this doesn't mean
@@ -3985,25 +4344,24 @@
}
}
- /*
- * Memory allocation/reclaim might be called from a WQ
- * context and the current implementation of the WQ
- * concurrency control doesn't recognize that
- * a particular WQ is congested if the worker thread is
- * looping without ever sleeping. Therefore we have to
- * do a short sleep here rather than calling
- * cond_resched().
- */
- if (current->flags & PF_WQ_WORKER)
- schedule_timeout_uninterruptible(1);
- else
- cond_resched();
-
- return true;
+ ret = true;
+ goto out;
}
}
- return false;
+out:
+ /*
+ * Memory allocation/reclaim might be called from a WQ context and the
+ * current implementation of the WQ concurrency control doesn't
+ * recognize that a particular WQ is congested if the worker thread is
+ * looping without ever sleeping. Therefore we have to do a short sleep
+ * here rather than calling cond_resched().
+ */
+ if (current->flags & PF_WQ_WORKER)
+ schedule_timeout_uninterruptible(1);
+ else
+ cond_resched();
+ return ret;
}
static inline bool
@@ -4087,7 +4445,7 @@
if (!ac->preferred_zoneref->zone)
goto nopage;
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
/*
@@ -4118,6 +4476,30 @@
if (page)
goto got_pg;
+ if (order >= pageblock_order && (gfp_mask & __GFP_IO) &&
+ !(gfp_mask & __GFP_RETRY_MAYFAIL)) {
+ /*
+ * If allocating entire pageblock(s) and compaction
+ * failed because all zones are below low watermarks
+ * or is prohibited because it recently failed at this
+ * order, fail immediately unless the allocator has
+ * requested compaction and reclaim retry.
+ *
+ * Reclaim is
+ * - potentially very expensive because zones are far
+ * below their low watermarks or this is part of very
+ * bursty high order allocations,
+ * - not guaranteed to help because isolate_freepages()
+ * may not iterate over freed pages as part of its
+ * linear scan, and
+ * - unlikely to make entire pageblocks free on its
+ * own.
+ */
+ if (compact_result == COMPACT_SKIPPED ||
+ compact_result == COMPACT_DEFERRED)
+ goto nopage;
+ }
+
/*
* Checks for costly allocations with __GFP_NORETRY, which
* includes THP page fault allocations
@@ -4145,7 +4527,7 @@
retry:
/* Ensure kswapd doesn't accidentally go to sleep as long as we loop */
- if (gfp_mask & __GFP_KSWAPD_RECLAIM)
+ if (alloc_flags & ALLOC_KSWAPD)
wake_all_kswapds(order, gfp_mask, ac);
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
@@ -4364,6 +4746,12 @@
finalise_ac(gfp_mask, &ac);
+ /*
+ * Forbid the first pass from falling back to types that fragment
+ * memory until all local zones are considered.
+ */
+ alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp_mask);
+
/* First allocation attempt */
page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (likely(page))
@@ -4389,7 +4777,7 @@
out:
if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
- unlikely(memcg_kmem_charge(page, gfp_mask, order) != 0)) {
+ unlikely(__memcg_kmem_charge(page, gfp_mask, order) != 0)) {
__free_pages(page, order);
page = NULL;
}
@@ -4422,16 +4810,19 @@
}
EXPORT_SYMBOL(get_zeroed_page);
-void __free_pages(struct page *page, unsigned int order)
+static inline void free_the_page(struct page *page, unsigned int order)
{
- if (put_page_testzero(page)) {
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (order == 0) /* Via pcp? */
+ free_unref_page(page);
+ else
+ __free_pages_ok(page, order);
}
+void __free_pages(struct page *page, unsigned int order)
+{
+ if (put_page_testzero(page))
+ free_the_page(page, order);
+}
EXPORT_SYMBOL(__free_pages);
void free_pages(unsigned long addr, unsigned int order)
@@ -4480,14 +4871,8 @@
{
VM_BUG_ON_PAGE(page_ref_count(page) == 0, page);
- if (page_ref_sub_and_test(page, count)) {
- unsigned int order = compound_order(page);
-
- if (order == 0)
- free_unref_page(page);
- else
- __free_pages_ok(page, order);
- }
+ if (page_ref_sub_and_test(page, count))
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(__page_frag_cache_drain);
@@ -4511,11 +4896,11 @@
/* Even if we own the page, we do not use atomic_set().
* This would break get_page_unless_zero() users.
*/
- page_ref_add(page, size - 1);
+ page_ref_add(page, PAGE_FRAG_CACHE_MAX_SIZE);
/* reset page count bias and offset to start of new frag */
nc->pfmemalloc = page_is_pfmemalloc(page);
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
nc->offset = size;
}
@@ -4531,10 +4916,10 @@
size = nc->size;
#endif
/* OK, page count is 0, we can safely set it */
- set_page_count(page, size);
+ set_page_count(page, PAGE_FRAG_CACHE_MAX_SIZE + 1);
/* reset page count bias and offset to start of new frag */
- nc->pagecnt_bias = size;
+ nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
}
@@ -4553,7 +4938,7 @@
struct page *page = virt_to_head_page(addr);
if (unlikely(put_page_testzero(page)))
- __free_pages_ok(page, compound_order(page));
+ free_the_page(page, compound_order(page));
}
EXPORT_SYMBOL(page_frag_free);
@@ -4576,7 +4961,7 @@
/**
* alloc_pages_exact - allocate an exact number physically-contiguous pages.
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* This function is similar to alloc_pages(), except that it allocates the
* minimum number of pages to satisfy the request. alloc_pages() can only
@@ -4585,12 +4970,17 @@
* This function is also limited by MAX_ORDER.
*
* Memory allocated by this function must be released by free_pages_exact().
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void *alloc_pages_exact(size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
unsigned long addr;
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
addr = __get_free_pages(gfp_mask, order);
return make_alloc_exact(addr, order, size);
}
@@ -4601,15 +4991,22 @@
* pages on a node.
* @nid: the preferred node ID where memory should be allocated
* @size: the number of bytes to allocate
- * @gfp_mask: GFP flags for the allocation
+ * @gfp_mask: GFP flags for the allocation, must not contain __GFP_COMP
*
* Like alloc_pages_exact(), but try to allocate on node nid first before falling
* back.
+ *
+ * Return: pointer to the allocated area or %NULL in case of error.
*/
void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask)
{
unsigned int order = get_order(size);
- struct page *p = alloc_pages_node(nid, gfp_mask, order);
+ struct page *p;
+
+ if (WARN_ON_ONCE(gfp_mask & __GFP_COMP))
+ gfp_mask &= ~__GFP_COMP;
+
+ p = alloc_pages_node(nid, gfp_mask, order);
if (!p)
return NULL;
return make_alloc_exact((unsigned long)page_address(p), order, size);
@@ -4638,11 +5035,13 @@
* nr_free_zone_pages - count number of pages beyond high watermark
* @offset: The zone index of the highest zone
*
- * nr_free_zone_pages() counts the number of counts pages which are beyond the
+ * nr_free_zone_pages() counts the number of pages which are beyond the
* high watermark within all zones at or below a given zone index. For each
* zone, the number of pages is calculated as:
*
* nr_free_zone_pages = managed_pages - high_pages
+ *
+ * Return: number of pages beyond high watermark.
*/
static unsigned long nr_free_zone_pages(int offset)
{
@@ -4655,7 +5054,7 @@
struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL);
for_each_zone_zonelist(zone, z, zonelist, offset) {
- unsigned long size = zone->managed_pages;
+ unsigned long size = zone_managed_pages(zone);
unsigned long high = high_wmark_pages(zone);
if (size > high)
sum += size - high;
@@ -4669,6 +5068,9 @@
*
* nr_free_buffer_pages() counts the number of pages which are beyond the high
* watermark within ZONE_DMA and ZONE_NORMAL.
+ *
+ * Return: number of pages beyond high watermark within ZONE_DMA and
+ * ZONE_NORMAL.
*/
unsigned long nr_free_buffer_pages(void)
{
@@ -4681,6 +5083,8 @@
*
* nr_free_pagecache_pages() counts the number of pages which are beyond the
* high watermark within all zones.
+ *
+ * Return: number of pages beyond high watermark within all zones.
*/
unsigned long nr_free_pagecache_pages(void)
{
@@ -4699,6 +5103,7 @@
unsigned long pagecache;
unsigned long wmark_low = 0;
unsigned long pages[NR_LRU_LISTS];
+ unsigned long reclaimable;
struct zone *zone;
int lru;
@@ -4706,7 +5111,7 @@
pages[lru] = global_node_page_state(NR_LRU_BASE + lru);
for_each_zone(zone)
- wmark_low += zone->watermark[WMARK_LOW];
+ wmark_low += low_wmark_pages(zone);
/*
* Estimate the amount of memory available for userspace allocations,
@@ -4724,19 +5129,13 @@
available += pagecache;
/*
- * Part of the reclaimable slab consists of items that are in use,
- * and cannot be freed. Cap this estimate at the low watermark.
+ * Part of the reclaimable slab and other kernel memory consists of
+ * items that are in use, and cannot be freed. Cap this estimate at the
+ * low watermark.
*/
- available += global_node_page_state(NR_SLAB_RECLAIMABLE) -
- min(global_node_page_state(NR_SLAB_RECLAIMABLE) / 2,
- wmark_low);
-
- /*
- * Part of the kernel memory, which can be released under memory
- * pressure.
- */
- available += global_node_page_state(NR_INDIRECTLY_RECLAIMABLE_BYTES) >>
- PAGE_SHIFT;
+ reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
available = 0;
@@ -4746,11 +5145,11 @@
void si_meminfo(struct sysinfo *val)
{
- val->totalram = totalram_pages;
+ val->totalram = totalram_pages();
val->sharedram = global_node_page_state(NR_SHMEM);
val->freeram = global_zone_page_state(NR_FREE_PAGES);
val->bufferram = nr_blockdev_pages();
- val->totalhigh = totalhigh_pages;
+ val->totalhigh = totalhigh_pages();
val->freehigh = nr_free_highpages();
val->mem_unit = PAGE_SIZE;
}
@@ -4767,7 +5166,7 @@
pg_data_t *pgdat = NODE_DATA(nid);
for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++)
- managed_pages += pgdat->node_zones[zone_type].managed_pages;
+ managed_pages += zone_managed_pages(&pgdat->node_zones[zone_type]);
val->totalram = managed_pages;
val->sharedram = node_page_state(pgdat, NR_SHMEM);
val->freeram = sum_zone_node_page_state(nid, NR_FREE_PAGES);
@@ -4776,7 +5175,7 @@
struct zone *zone = &pgdat->node_zones[zone_type];
if (is_highmem(zone)) {
- managed_highpages += zone->managed_pages;
+ managed_highpages += zone_managed_pages(zone);
free_highpages += zone_page_state(zone, NR_FREE_PAGES);
}
}
@@ -4983,7 +5382,7 @@
K(zone_page_state(zone, NR_ZONE_UNEVICTABLE)),
K(zone_page_state(zone, NR_ZONE_WRITE_PENDING)),
K(zone->present_pages),
- K(zone->managed_pages),
+ K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
@@ -5017,7 +5416,7 @@
types[order] = 0;
for (type = 0; type < MIGRATE_TYPES; type++) {
- if (!list_empty(&area->free_list[type]))
+ if (!free_area_empty(area, type))
types[order] |= 1 << type;
}
}
@@ -5132,7 +5531,8 @@
* from each node to each node in the system), and should also prefer nodes
* with no CPUs, since presumably they'll have very little allocation pressure
* on them otherwise.
- * It returns -1 if no node is found.
+ *
+ * Return: node id of the found node or %NUMA_NO_NODE if no node is found.
*/
static int find_next_best_node(int node, nodemask_t *used_node_mask)
{
@@ -5438,7 +5838,7 @@
else
page_group_by_mobility_disabled = 0;
- pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
+ pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n",
nr_online_nodes,
page_group_by_mobility_disabled ? "off" : "on",
vm_total_pages);
@@ -5447,76 +5847,150 @@
#endif
}
+/* If zone is ZONE_MOVABLE but memory is mirrored, it is an overlapped init */
+static bool __meminit
+overlap_memmap_init(unsigned long zone, unsigned long *pfn)
+{
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ static struct memblock_region *r;
+
+ if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
+ if (!r || *pfn >= memblock_region_memory_end_pfn(r)) {
+ for_each_memblock(memory, r) {
+ if (*pfn < memblock_region_memory_end_pfn(r))
+ break;
+ }
+ }
+ if (*pfn >= memblock_region_memory_base_pfn(r) &&
+ memblock_is_mirror(r)) {
+ *pfn = memblock_region_memory_end_pfn(r);
+ return true;
+ }
+ }
+#endif
+ return false;
+}
+
/*
* Initially all pages are reserved - free ones are freed
- * up by free_all_bootmem() once the early boot process is
+ * up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context,
struct vmem_altmap *altmap)
{
- unsigned long end_pfn = start_pfn + size;
- pg_data_t *pgdat = NODE_DATA(nid);
- unsigned long pfn;
- unsigned long nr_initialised = 0;
+ unsigned long pfn, end_pfn = start_pfn + size;
struct page *page;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- struct memblock_region *r = NULL, *tmp;
-#endif
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
+#ifdef CONFIG_ZONE_DEVICE
/*
* Honor reservation requested by the driver for this ZONE_DEVICE
- * memory
+ * memory. We limit the total number of pages to initialize to just
+ * those that might contain the memory mapping. We will defer the
+ * ZONE_DEVICE page initialization until after we have released
+ * the hotplug lock.
*/
- if (altmap && start_pfn == altmap->base_pfn)
- start_pfn += altmap->reserve;
+ if (zone == ZONE_DEVICE) {
+ if (!altmap)
+ return;
+
+ if (start_pfn == altmap->base_pfn)
+ start_pfn += altmap->reserve;
+ end_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ }
+#endif
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context != MEMMAP_EARLY)
- goto not_early;
-
- if (!early_pfn_valid(pfn))
- continue;
- if (!early_pfn_in_nid(pfn, nid))
- continue;
- if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised))
- break;
-
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- /*
- * Check given memblock attribute by firmware which can affect
- * kernel memory layout. If zone==ZONE_MOVABLE but memory is
- * mirrored, it's an overlapped memmap init. skip it.
- */
- if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
- if (!r || pfn >= memblock_region_memory_end_pfn(r)) {
- for_each_memblock(memory, tmp)
- if (pfn < memblock_region_memory_end_pfn(tmp))
- break;
- r = tmp;
- }
- if (pfn >= memblock_region_memory_base_pfn(r) &&
- memblock_is_mirror(r)) {
- /* already initialized as NORMAL */
- pfn = memblock_region_memory_end_pfn(r);
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
continue;
- }
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ if (overlap_memmap_init(zone, &pfn))
+ continue;
+ if (defer_init(nid, pfn, end_pfn))
+ break;
}
-#endif
-not_early:
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
if (context == MEMMAP_HOTPLUG)
- SetPageReserved(page);
+ __SetPageReserved(page);
+
+ /*
+ * Mark the block movable so that blocks are reserved for
+ * movable at startup. This will force kernel allocations
+ * to reserve their blocks rather than leaking throughout
+ * the address space during boot when many long-lived
+ * kernel allocations are made.
+ *
+ * The bitmap is created for the zone's valid pfn range, but the
+ * memmap can be created for invalid pages (for alignment), so
+ * check here to avoid calling set_pageblock_migratetype() on a
+ * pfn outside the zone.
+ */
+ if (!(pfn & (pageblock_nr_pages - 1))) {
+ set_pageblock_migratetype(page, MIGRATE_MOVABLE);
+ cond_resched();
+ }
+ }
+}
+
+#ifdef CONFIG_ZONE_DEVICE
+void __ref memmap_init_zone_device(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long size,
+ struct dev_pagemap *pgmap)
+{
+ unsigned long pfn, end_pfn = start_pfn + size;
+ struct pglist_data *pgdat = zone->zone_pgdat;
+ struct vmem_altmap *altmap = pgmap_altmap(pgmap);
+ unsigned long zone_idx = zone_idx(zone);
+ unsigned long start = jiffies;
+ int nid = pgdat->node_id;
+
+ if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE))
+ return;
+
+ /*
+ * The call to memmap_init_zone should have already taken care
+ * of the pages reserved for the memmap, so we can just jump to
+ * the end of that region and start processing the device pages.
+ */
+ if (altmap) {
+ start_pfn = altmap->base_pfn + vmem_altmap_offset(altmap);
+ size = end_pfn - start_pfn;
+ }
+
+ for (pfn = start_pfn; pfn < end_pfn; pfn++) {
+ struct page *page = pfn_to_page(pfn);
+
+ __init_single_page(page, pfn, zone_idx, nid);
+
+ /*
+ * Mark page reserved as it will need to wait for onlining
+ * phase for it to be fully associated with a zone.
+ *
+ * We can use the non-atomic __set_bit operation for setting
+ * the flag as we are still initializing the pages.
+ */
+ __SetPageReserved(page);
+
+ /*
+ * ZONE_DEVICE pages union ->lru with a ->pgmap back pointer
+ * and zone_device_data. It is a bug if a ZONE_DEVICE page is
+ * ever freed or placed on a driver-private list.
+ */
+ page->pgmap = pgmap;
+ page->zone_device_data = NULL;
/*
* Mark the block movable so that blocks are reserved for
@@ -5531,27 +6005,19 @@
* pfn out of zone.
*
* Please note that MEMMAP_HOTPLUG path doesn't clear memmap
- * because this is done early in sparse_add_one_section
+ * because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
cond_resched();
}
}
-#ifdef CONFIG_SPARSEMEM
- /*
- * If the zone does not span the rest of the section then
- * we should at least initialize those pages. Otherwise we
- * could blow up on a poisoned page in some paths which depend
- * on full sections being initialized (e.g. memory hotplug).
- */
- while (end_pfn % PAGES_PER_SECTION) {
- __init_single_page(pfn_to_page(end_pfn), end_pfn, zone, nid);
- end_pfn++;
- }
-#endif
+
+ pr_info("%s initialised %lu pages in %ums\n", __func__,
+ size, jiffies_to_msecs(jiffies - start));
}
+#endif
static void __meminit zone_init_free_lists(struct zone *zone)
{
unsigned int order, t;
@@ -5561,10 +6027,11 @@
}
}
-#ifndef __HAVE_ARCH_MEMMAP_INIT
-#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY, NULL)
-#endif
+void __meminit __weak memmap_init(unsigned long size, int nid,
+ unsigned long zone, unsigned long start_pfn)
+{
+ memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+}
static int zone_batchsize(struct zone *zone)
{
@@ -5575,7 +6042,7 @@
* The per-cpu-pages pools are set to around 1000th of the
* size of the zone.
*/
- batch = zone->managed_pages / 1024;
+ batch = zone_managed_pages(zone) / 1024;
/* But no more than a meg. */
if (batch * PAGE_SIZE > 1024 * 1024)
batch = (1024 * 1024) / PAGE_SIZE;
@@ -5656,7 +6123,6 @@
memset(p, 0, sizeof(*p));
pcp = &p->pcp;
- pcp->count = 0;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
@@ -5686,7 +6152,7 @@
{
if (percpu_pagelist_fraction)
pageset_set_high(pcp,
- (zone->managed_pages /
+ (zone_managed_pages(zone) /
percpu_pagelist_fraction));
else
pageset_set_batch(pcp, zone_batchsize(zone));
@@ -5778,7 +6244,7 @@
return state->last_nid;
nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != -1) {
+ if (nid != NUMA_NO_NODE) {
state->last_start = start_pfn;
state->last_end = end_pfn;
state->last_nid = nid;
@@ -5840,7 +6306,7 @@
* with no available memory, a warning is printed and the start and end
* PFNs will be 0.
*/
-void __meminit get_pfn_range_for_nid(unsigned int nid,
+void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
{
unsigned long this_start_pfn, this_end_pfn;
@@ -5889,7 +6355,7 @@
* highest usable zone for ZONE_MOVABLE. This preserves the assumption that
* zones within a node are in order of monotonic increases memory addresses
*/
-static void __meminit adjust_zone_range_for_zone_movable(int nid,
+static void __init adjust_zone_range_for_zone_movable(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -5920,7 +6386,7 @@
* Return the number of pages a zone spans in a node, including holes
* present_pages = zone_spanned_pages_in_node() - zone_absent_pages_in_node()
*/
-static unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -5928,13 +6394,15 @@
unsigned long *zone_end_pfn,
unsigned long *ignored)
{
+ unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
+ unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
/* When hotadd a new node from cpu_up(), the node should be empty */
if (!node_start_pfn && !node_end_pfn)
return 0;
/* Get the start and end of the zone */
- *zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
- *zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
+ *zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
+ *zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
adjust_zone_range_for_zone_movable(nid, zone_type,
node_start_pfn, node_end_pfn,
zone_start_pfn, zone_end_pfn);
@@ -5955,7 +6423,7 @@
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
* then all holes in the requested range will be accounted for.
*/
-unsigned long __meminit __absent_pages_in_range(int nid,
+unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
unsigned long range_end_pfn)
{
@@ -5976,7 +6444,7 @@
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range.
+ * Return: the number of page frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
@@ -5985,7 +6453,7 @@
}
/* Return the number of page frames in holes in a zone on a node */
-static unsigned long __meminit zone_absent_pages_in_node(int nid,
+static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6037,7 +6505,7 @@
}
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
+static inline unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6056,7 +6524,7 @@
return zones_size[zone_type];
}
-static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
+static inline unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
@@ -6070,7 +6538,7 @@
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
+static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zones_size,
@@ -6138,10 +6606,14 @@
{
unsigned long usemapsize = usemap_size(zone_start_pfn, zonesize);
zone->pageblock_flags = NULL;
- if (usemapsize)
+ if (usemapsize) {
zone->pageblock_flags =
- memblock_virt_alloc_node_nopanic(usemapsize,
- pgdat->node_id);
+ memblock_alloc_node(usemapsize, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!zone->pageblock_flags)
+ panic("Failed to allocate %ld bytes for zone %s pageblock flags on node %d\n",
+ usemapsize, zone->name, pgdat->node_id);
+ }
}
#else
static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone,
@@ -6208,9 +6680,11 @@
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pgdat_init_split_queue(struct pglist_data *pgdat)
{
- spin_lock_init(&pgdat->split_queue_lock);
- INIT_LIST_HEAD(&pgdat->split_queue);
- pgdat->split_queue_len = 0;
+ struct deferred_split *ds_queue = &pgdat->deferred_split_queue;
+
+ spin_lock_init(&ds_queue->split_queue_lock);
+ INIT_LIST_HEAD(&ds_queue->split_queue);
+ ds_queue->split_queue_len = 0;
}
#else
static void pgdat_init_split_queue(struct pglist_data *pgdat) {}
@@ -6243,7 +6717,7 @@
static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
unsigned long remaining_pages)
{
- zone->managed_pages = remaining_pages;
+ atomic_long_set(&zone->managed_pages, remaining_pages);
zone_set_nid(zone, nid);
zone->name = zone_names[idx];
zone->zone_pgdat = NODE_DATA(nid);
@@ -6370,7 +6844,11 @@
end = pgdat_end_pfn(pgdat);
end = ALIGN(end, MAX_ORDER_NR_PAGES);
size = (end - start) * sizeof(struct page);
- map = memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+ map = memblock_alloc_node(size, SMP_CACHE_BYTES,
+ pgdat->node_id);
+ if (!map)
+ panic("Failed to allocate %ld bytes for node %d memory map\n",
+ size, pgdat->node_id);
pgdat->node_mem_map = map + offset;
}
pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n",
@@ -6396,12 +6874,6 @@
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
{
- /*
- * We start only with one section of pages, more pages are added as
- * needed until the rest of deferred pages are initialized.
- */
- pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION,
- pgdat->node_spanned_pages);
pgdat->first_deferred_pfn = ULONG_MAX;
}
#else
@@ -6439,48 +6911,67 @@
free_area_init_core(pgdat);
}
-#if defined(CONFIG_HAVE_MEMBLOCK) && !defined(CONFIG_FLAT_NODE_MEM_MAP)
+#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
+/*
+ * Zero all valid struct pages in range [spfn, epfn), return number of struct
+ * pages zeroed
+ */
+static u64 zero_pfn_range(unsigned long spfn, unsigned long epfn)
+{
+ unsigned long pfn;
+ u64 pgcnt = 0;
+
+ for (pfn = spfn; pfn < epfn; pfn++) {
+ if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
+ pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
+ + pageblock_nr_pages - 1;
+ continue;
+ }
+ mm_zero_struct_page(pfn_to_page(pfn));
+ pgcnt++;
+ }
+
+ return pgcnt;
+}
+
/*
* Only struct pages that are backed by physical memory are zeroed and
* initialized by going through __init_single_page(). But, there are some
* struct pages which are reserved in memblock allocator and their fields
* may be accessed (for example page_to_pfn() on some configuration accesses
* flags). We must explicitly zero those struct pages.
+ *
+ * This function also addresses a similar issue where struct pages are left
+ * uninitialized because the physical address range is not covered by
+ * memblock.memory or memblock.reserved. That could happen when memblock
+ * layout is manually configured via memmap=.
*/
void __init zero_resv_unavail(void)
{
phys_addr_t start, end;
- unsigned long pfn;
u64 i, pgcnt;
+ phys_addr_t next = 0;
/*
- * Loop through ranges that are reserved, but do not have reported
- * physical memory backing.
+ * Loop through unavailable ranges not covered by memblock.memory.
*/
pgcnt = 0;
- for_each_resv_unavail_range(i, &start, &end) {
- for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) {
- if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) {
- pfn = ALIGN_DOWN(pfn, pageblock_nr_pages)
- + pageblock_nr_pages - 1;
- continue;
- }
- mm_zero_struct_page(pfn_to_page(pfn));
- pgcnt++;
- }
+ for_each_mem_range(i, &memblock.memory, NULL,
+ NUMA_NO_NODE, MEMBLOCK_NONE, &start, &end, NULL) {
+ if (next < start)
+ pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
+ next = end;
}
+ pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
/*
* Struct pages that do not have backing memory. This could be because
* firmware is using some of this memory, or for some other reasons.
- * Once memblock is changed so such behaviour is not allowed: i.e.
- * list of "reserved" memory must be a subset of list of "memory", then
- * this code can be removed.
*/
if (pgcnt)
- pr_info("Reserved but unavailable: %lld pages", pgcnt);
+ pr_info("Zeroed struct page in unavailable ranges: %lld pages", pgcnt);
}
-#endif /* CONFIG_HAVE_MEMBLOCK && !CONFIG_FLAT_NODE_MEM_MAP */
+#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -6513,14 +7004,14 @@
* model has fine enough granularity to avoid incorrect mapping for the
* populated node map.
*
- * Returns the determined alignment in pfn's. 0 if there is no alignment
+ * Return: the determined alignment in pfn's. 0 if there is no alignment
* requirement (single node).
*/
unsigned long __init node_map_pfn_alignment(void)
{
unsigned long accl_mask = 0, last_end = 0;
unsigned long start, end, mask;
- int last_nid = -1;
+ int last_nid = NUMA_NO_NODE;
int i, nid;
for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, &nid) {
@@ -6568,7 +7059,7 @@
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
- * It returns the minimum PFN based on information provided via
+ * Return: the minimum PFN based on information provided via
* memblock_set_node().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
@@ -6815,15 +7306,12 @@
{
enum zone_type zone_type;
- if (N_MEMORY == N_NORMAL_MEMORY)
- return;
-
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
if (populated_zone(zone)) {
- node_set_state(nid, N_HIGH_MEMORY);
- if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
- zone_type <= ZONE_NORMAL)
+ if (IS_ENABLED(CONFIG_HIGHMEM))
+ node_set_state(nid, N_HIGH_MEMORY);
+ if (zone_type <= ZONE_NORMAL)
node_set_state(nid, N_NORMAL_MEMORY);
break;
}
@@ -6896,12 +7384,18 @@
(u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
- /* Print out the early node map */
+ /*
+ * Print out the early node map, and initialize the
+ * subsection-map relative to active online memory ranges to
+ * enable future "sub-section" extensions of the memory map.
+ */
pr_info("Early memory node ranges\n");
- for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid) {
pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
((u64)end_pfn << PAGE_SHIFT) - 1);
+ subsection_map_init(start_pfn, end_pfn - start_pfn);
+ }
/* Initialise every node */
mminit_verify_pageflags_layout();
@@ -6979,18 +7473,16 @@
void adjust_managed_page_count(struct page *page, long count)
{
- spin_lock(&managed_page_count_lock);
- page_zone(page)->managed_pages += count;
- totalram_pages += count;
+ atomic_long_add(count, &page_zone(page)->managed_pages);
+ totalram_pages_add(count);
#ifdef CONFIG_HIGHMEM
if (PageHighMem(page))
- totalhigh_pages += count;
+ totalhigh_pages_add(count);
#endif
- spin_unlock(&managed_page_count_lock);
}
EXPORT_SYMBOL(adjust_managed_page_count);
-unsigned long free_reserved_area(void *start, void *end, int poison, char *s)
+unsigned long free_reserved_area(void *start, void *end, int poison, const char *s)
{
void *pos;
unsigned long pages = 0;
@@ -7021,15 +7513,14 @@
return pages;
}
-EXPORT_SYMBOL(free_reserved_area);
#ifdef CONFIG_HIGHMEM
void free_highmem_page(struct page *page)
{
__free_reserved_page(page);
- totalram_pages++;
- page_zone(page)->managed_pages++;
- totalhigh_pages++;
+ totalram_pages_inc();
+ atomic_long_inc(&page_zone(page)->managed_pages);
+ totalhigh_pages_inc();
}
#endif
@@ -7078,10 +7569,10 @@
physpages << (PAGE_SHIFT - 10),
codesize >> 10, datasize >> 10, rosize >> 10,
(init_data_size + init_code_size) >> 10, bss_size >> 10,
- (physpages - totalram_pages - totalcma_pages) << (PAGE_SHIFT - 10),
+ (physpages - totalram_pages() - totalcma_pages) << (PAGE_SHIFT - 10),
totalcma_pages << (PAGE_SHIFT - 10),
#ifdef CONFIG_HIGHMEM
- totalhigh_pages << (PAGE_SHIFT - 10),
+ totalhigh_pages() << (PAGE_SHIFT - 10),
#endif
str ? ", " : "", str ? str : "");
}
@@ -7134,10 +7625,28 @@
return 0;
}
+#ifdef CONFIG_NUMA
+int hashdist = HASHDIST_DEFAULT;
+
+static int __init set_hashdist(char *str)
+{
+ if (!str)
+ return 0;
+ hashdist = simple_strtoul(str, &str, 0);
+ return 1;
+}
+__setup("hashdist=", set_hashdist);
+#endif
+
void __init page_alloc_init(void)
{
int ret;
+#ifdef CONFIG_NUMA
+ if (num_node_state(N_MEMORY) == 1)
+ hashdist = 0;
+#endif
+
ret = cpuhp_setup_state_nocalls(CPUHP_PAGE_ALLOC_DEAD,
"mm/page_alloc:dead", NULL,
page_alloc_cpu_dead);
@@ -7161,6 +7670,7 @@
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
long max = 0;
+ unsigned long managed_pages = zone_managed_pages(zone);
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
@@ -7171,8 +7681,8 @@
/* we treat the high watermark as reserved pages. */
max += high_wmark_pages(zone);
- if (max > zone->managed_pages)
- max = zone->managed_pages;
+ if (max > managed_pages)
+ max = managed_pages;
pgdat->totalreserve_pages += max;
@@ -7196,7 +7706,7 @@
for_each_online_pgdat(pgdat) {
for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone->managed_pages;
+ unsigned long managed_pages = zone_managed_pages(zone);
zone->lowmem_reserve[j] = 0;
@@ -7214,7 +7724,7 @@
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
}
- managed_pages += lower_zone->managed_pages;
+ managed_pages += zone_managed_pages(lower_zone);
}
}
}
@@ -7233,14 +7743,14 @@
/* Calculate total number of !ZONE_HIGHMEM pages */
for_each_zone(zone) {
if (!is_highmem(zone))
- lowmem_pages += zone->managed_pages;
+ lowmem_pages += zone_managed_pages(zone);
}
for_each_zone(zone) {
u64 tmp;
spin_lock_irqsave(&zone->lock, flags);
- tmp = (u64)pages_min * zone->managed_pages;
+ tmp = (u64)pages_min * zone_managed_pages(zone);
do_div(tmp, lowmem_pages);
if (is_highmem(zone)) {
/*
@@ -7249,20 +7759,20 @@
* value here.
*
* The WMARK_HIGH-WMARK_LOW and (WMARK_LOW-WMARK_MIN)
- * deltas control asynch page reclaim, and so should
+ * deltas control async page reclaim, and so should
* not be capped for highmem.
*/
unsigned long min_pages;
- min_pages = zone->managed_pages / 1024;
+ min_pages = zone_managed_pages(zone) / 1024;
min_pages = clamp(min_pages, SWAP_CLUSTER_MAX, 128UL);
- zone->watermark[WMARK_MIN] = min_pages;
+ zone->_watermark[WMARK_MIN] = min_pages;
} else {
/*
* If it's a lowmem zone, reserve a number of pages
* proportionate to the zone's size.
*/
- zone->watermark[WMARK_MIN] = tmp;
+ zone->_watermark[WMARK_MIN] = tmp;
}
/*
@@ -7271,11 +7781,12 @@
* ensure a minimum size on small systems.
*/
tmp = max_t(u64, tmp >> 2,
- mult_frac(zone->managed_pages,
+ mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
- zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
- zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
+ zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
+ zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7376,6 +7887,18 @@
return 0;
}
+int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *length, loff_t *ppos)
+{
+ int rc;
+
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
+ if (rc)
+ return rc;
+
+ return 0;
+}
+
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
@@ -7401,8 +7924,8 @@
pgdat->min_unmapped_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_unmapped_pages += (zone->managed_pages *
- sysctl_min_unmapped_ratio) / 100;
+ zone->zone_pgdat->min_unmapped_pages += (zone_managed_pages(zone) *
+ sysctl_min_unmapped_ratio) / 100;
}
@@ -7429,8 +7952,8 @@
pgdat->min_slab_pages = 0;
for_each_zone(zone)
- zone->zone_pgdat->min_slab_pages += (zone->managed_pages *
- sysctl_min_slab_ratio) / 100;
+ zone->zone_pgdat->min_slab_pages += (zone_managed_pages(zone) *
+ sysctl_min_slab_ratio) / 100;
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
@@ -7508,19 +8031,6 @@
return ret;
}
-#ifdef CONFIG_NUMA
-int hashdist = HASHDIST_DEFAULT;
-
-static int __init set_hashdist(char *str)
-{
- if (!str)
- return 0;
- hashdist = simple_strtoul(str, &str, 0);
- return 1;
-}
-__setup("hashdist=", set_hashdist);
-#endif
-
#ifndef __HAVE_ARCH_RESERVED_KERNEL_PAGES
/*
* Returns the number of pages that arch has reserved but
@@ -7567,6 +8077,7 @@
unsigned long log2qty, size;
void *table = NULL;
gfp_t gfp_flags;
+ bool virt;
/* allow the kernel cmdline to have a say */
if (!numentries) {
@@ -7623,32 +8134,34 @@
gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC;
do {
+ virt = false;
size = bucketsize << log2qty;
if (flags & HASH_EARLY) {
if (flags & HASH_ZERO)
- table = memblock_virt_alloc_nopanic(size, 0);
+ table = memblock_alloc(size, SMP_CACHE_BYTES);
else
- table = memblock_virt_alloc_raw(size, 0);
- } else if (hashdist) {
+ table = memblock_alloc_raw(size,
+ SMP_CACHE_BYTES);
+ } else if (get_order(size) >= MAX_ORDER || hashdist) {
table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ virt = true;
} else {
/*
* If bucketsize is not a power-of-two, we may free
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
*/
- if (get_order(size) < MAX_ORDER) {
- table = alloc_pages_exact(size, gfp_flags);
- kmemleak_alloc(table, size, 1, gfp_flags);
- }
+ table = alloc_pages_exact(size, gfp_flags);
+ kmemleak_alloc(table, size, 1, gfp_flags);
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- pr_info("%s hash table entries: %ld (order: %d, %lu bytes)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size);
+ pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
+ tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ virt ? "vmalloc" : "linear");
if (_hash_shift)
*_hash_shift = log2qty;
@@ -7668,10 +8181,12 @@
* race condition. So you can't expect this function should be exact.
*/
bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
- int migratetype,
- bool skip_hwpoisoned_pages)
+ int migratetype, int flags)
{
- unsigned long pfn, iter, found;
+ unsigned long found;
+ unsigned long iter = 0;
+ unsigned long pfn = page_to_pfn(page);
+ const char *reason = "unmovable page";
/*
* TODO we could make this much more efficient by not checking every
@@ -7681,17 +8196,20 @@
* can still lead to having bootmem allocations in zone_movable.
*/
- /*
- * CMA allocations (alloc_contig_range) really need to mark isolate
- * CMA pageblocks even when they are not movable in fact so consider
- * them movable here.
- */
- if (is_migrate_cma(migratetype) &&
- is_migrate_cma(get_pageblock_migratetype(page)))
- return false;
+ if (is_migrate_cma_page(page)) {
+ /*
+ * CMA allocations (alloc_contig_range) really need to mark
+ * isolate CMA pageblocks even when they are not movable in fact
+ * so consider them movable here.
+ */
+ if (is_migrate_cma(migratetype))
+ return false;
- pfn = page_to_pfn(page);
- for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) {
+ reason = "CMA page";
+ goto unmovable;
+ }
+
+ for (found = 0; iter < pageblock_nr_pages; iter++) {
unsigned long check = pfn + iter;
if (!pfn_valid_within(check))
@@ -7712,7 +8230,7 @@
/*
* Hugepages are not in LRU lists, but they're movable.
- * We need not scan over tail pages bacause we don't
+ * We need not scan over tail pages because we don't
* handle each tail page individually in migration.
*/
if (PageHuge(page)) {
@@ -7722,7 +8240,7 @@
if (!hugepage_migration_supported(page_hstate(head)))
goto unmovable;
- skip_pages = (1 << compound_order(head)) - (page - head);
+ skip_pages = compound_nr(head) - (page - head);
iter += skip_pages - 1;
continue;
}
@@ -7743,7 +8261,7 @@
* The HWPoisoned page may be not in buddy system, and
* page_count() is not 0.
*/
- if (skip_hwpoisoned_pages && PageHWPoison(page))
+ if ((flags & SKIP_HWPOISON) && PageHWPoison(page))
continue;
if (__PageMovable(page))
@@ -7770,11 +8288,12 @@
return false;
unmovable:
WARN_ON_ONCE(zone_idx(zone) == ZONE_MOVABLE);
+ if (flags & REPORT_FAILURE)
+ dump_page(pfn_to_page(pfn + iter), reason);
return true;
}
-#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
-
+#ifdef CONFIG_CONTIG_ALLOC
static unsigned long pfn_max_align_down(unsigned long pfn)
{
return pfn & ~(max_t(unsigned long, MAX_ORDER_NR_PAGES,
@@ -7849,7 +8368,7 @@
* pageblocks in the range. Once isolated, the pageblocks should not
* be modified by others.
*
- * Returns zero on success or negative error code. On success all
+ * Return: zero on success or negative error code. On success all
* pages which PFN is in [start, end) are allocated for the caller and
* need to be freed with free_contig_range().
*/
@@ -7896,9 +8415,8 @@
*/
ret = start_isolate_page_range(pfn_max_align_down(start),
- pfn_max_align_up(end), migratetype,
- false);
- if (ret)
+ pfn_max_align_up(end), migratetype, 0);
+ if (ret < 0)
return ret;
/*
@@ -7934,7 +8452,6 @@
*/
lru_add_drain_all();
- drain_all_pages(cc.zone);
order = 0;
outer_start = start;
@@ -7985,8 +8502,9 @@
pfn_max_align_up(end), migratetype);
return ret;
}
+#endif /* CONFIG_CONTIG_ALLOC */
-void free_contig_range(unsigned long pfn, unsigned nr_pages)
+void free_contig_range(unsigned long pfn, unsigned int nr_pages)
{
unsigned int count = 0;
@@ -7998,9 +8516,7 @@
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
* page high values need to be recalulated.
@@ -8014,7 +8530,6 @@
per_cpu_ptr(zone->pageset, cpu));
mutex_unlock(&pcp_batch_high_lock);
}
-#endif
void zone_pcp_reset(struct zone *zone)
{
@@ -8040,7 +8555,7 @@
* All pages in the range must be in a single zone and isolated
* before calling this.
*/
-void
+unsigned long
__offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
{
struct page *page;
@@ -8048,12 +8563,15 @@
unsigned int order, i;
unsigned long pfn;
unsigned long flags;
+ unsigned long offlined_pages = 0;
+
/* find the first valid pfn */
for (pfn = start_pfn; pfn < end_pfn; pfn++)
if (pfn_valid(pfn))
break;
if (pfn == end_pfn)
- return;
+ return offlined_pages;
+
offline_mem_sections(pfn, end_pfn);
zone = page_zone(pfn_to_page(pfn));
spin_lock_irqsave(&zone->lock, flags);
@@ -8071,24 +8589,26 @@
if (unlikely(!PageBuddy(page) && PageHWPoison(page))) {
pfn++;
SetPageReserved(page);
+ offlined_pages++;
continue;
}
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
order = page_order(page);
+ offlined_pages += 1 << order;
#ifdef CONFIG_DEBUG_VM
pr_info("remove from free list %lx %d %lx\n",
pfn, 1 << order, end_pfn);
#endif
- list_del(&page->lru);
- rmv_page_order(page);
- zone->free_area[order].nr_free--;
+ del_page_from_free_area(page, &zone->free_area[order]);
for (i = 0; i < (1 << order); i++)
SetPageReserved((page+i));
pfn += (1 << order);
}
spin_unlock_irqrestore(&zone->lock, flags);
+
+ return offlined_pages;
}
#endif
diff --git a/mm/page_ext.c b/mm/page_ext.c
index a9826da..4ade843 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -1,7 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/page_ext.h>
#include <linux/memory.h>
#include <linux/vmalloc.h>
@@ -59,9 +59,6 @@
*/
static struct page_ext_operations *page_ext_ops[] = {
-#ifdef CONFIG_DEBUG_PAGEALLOC
- &debug_guardpage_ops,
-#endif
#ifdef CONFIG_PAGE_OWNER
&page_owner_ops,
#endif
@@ -70,8 +67,9 @@
#endif
};
+unsigned long page_ext_size = sizeof(struct page_ext);
+
static unsigned long total_usage;
-static unsigned long extra_mem;
static bool __init invoke_need_callbacks(void)
{
@@ -81,9 +79,8 @@
for (i = 0; i < entries; i++) {
if (page_ext_ops[i]->need && page_ext_ops[i]->need()) {
- page_ext_ops[i]->offset = sizeof(struct page_ext) +
- extra_mem;
- extra_mem += page_ext_ops[i]->size;
+ page_ext_ops[i]->offset = page_ext_size;
+ page_ext_size += page_ext_ops[i]->size;
need = true;
}
}
@@ -102,14 +99,9 @@
}
}
-static unsigned long get_entry_size(void)
-{
- return sizeof(struct page_ext) + extra_mem;
-}
-
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
- return base + get_entry_size() * index;
+ return base + page_ext_size * index;
}
#if !defined(CONFIG_SPARSEMEM)
@@ -159,11 +151,11 @@
!IS_ALIGNED(node_end_pfn(nid), MAX_ORDER_NR_PAGES))
nr_pages += MAX_ORDER_NR_PAGES;
- table_size = get_entry_size() * nr_pages;
+ table_size = page_ext_size * nr_pages;
- base = memblock_virt_alloc_try_nid_nopanic(
+ base = memblock_alloc_try_nid(
table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
if (!base)
return -ENOMEM;
NODE_DATA(nid)->node_page_ext = base;
@@ -237,7 +229,7 @@
if (section->page_ext)
return 0;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
base = alloc_page_ext(table_size, nid);
/*
@@ -257,7 +249,7 @@
* we need to apply a mask.
*/
pfn &= PAGE_SECTION_MASK;
- section->page_ext = (void *)base - get_entry_size() * pfn;
+ section->page_ext = (void *)base - page_ext_size * pfn;
total_usage += table_size;
return 0;
}
@@ -270,9 +262,10 @@
struct page *page = virt_to_page(addr);
size_t table_size;
- table_size = get_entry_size() * PAGES_PER_SECTION;
+ table_size = page_ext_size * PAGES_PER_SECTION;
BUG_ON(PageReserved(page));
+ kmemleak_free(addr);
free_pages_exact(addr, table_size);
}
}
@@ -300,7 +293,7 @@
start = SECTION_ALIGN_DOWN(start_pfn);
end = SECTION_ALIGN_UP(start_pfn + nr_pages);
- if (nid == -1) {
+ if (nid == NUMA_NO_NODE) {
/*
* In this case, "nid" already exists and contains valid memory.
* "start_pfn" passed to us is a pfn which is an arg for
@@ -398,10 +391,8 @@
* We know some arch can have a nodes layout such as
* -------------pfn-------------->
* N0 | N1 | N2 | N0 | N1 | N2|....
- *
- * Take into account DEFERRED_STRUCT_PAGE_INIT.
*/
- if (early_pfn_to_nid(pfn) != nid)
+ if (pfn_to_nid(pfn) != nid)
continue;
if (init_section_page_ext(pfn, nid))
goto oom;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 6302bc6..2955124 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -1,6 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/init.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
@@ -31,7 +31,7 @@
static struct page *page_idle_get_page(unsigned long pfn)
{
struct page *page;
- struct zone *zone;
+ pg_data_t *pgdat;
if (!pfn_valid(pfn))
return NULL;
@@ -41,13 +41,13 @@
!get_page_unless_zero(page))
return NULL;
- zone = page_zone(page);
- spin_lock_irq(zone_lru_lock(zone));
+ pgdat = page_pgdat(page);
+ spin_lock_irq(&pgdat->lru_lock);
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
return page;
}
@@ -136,7 +136,7 @@
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
@@ -181,7 +181,7 @@
end_pfn = pfn + count * BITS_PER_BYTE;
if (end_pfn > max_pfn)
- end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+ end_pfn = max_pfn;
for (; pfn < end_pfn; pfn++) {
bit = pfn % BITMAP_CHUNK_BITS;
diff --git a/mm/page_io.c b/mm/page_io.c
index aafd19e..60a66a5 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -29,10 +29,9 @@
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
{
- int i, nr = hpage_nr_pages(page);
struct bio *bio;
- bio = bio_alloc(gfp_flags, nr);
+ bio = bio_alloc(gfp_flags, 1);
if (bio) {
struct block_device *bdev;
@@ -41,9 +40,7 @@
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
- for (i = 0; i < nr; i++)
- bio_add_page(bio, page + i, PAGE_SIZE, 0);
- VM_BUG_ON(bio->bi_iter.bi_size != PAGE_SIZE * nr);
+ bio_add_page(bio, page, PAGE_SIZE * hpage_nr_pages(page), 0);
}
return bio;
}
@@ -76,6 +73,7 @@
{
struct swap_info_struct *sis;
struct gendisk *disk;
+ swp_entry_t entry;
/*
* There is no guarantee that the page is in swap cache - the software
@@ -107,11 +105,10 @@
* we again wish to reclaim it.
*/
disk = sis->bdev->bd_disk;
- if (disk->fops->swap_slot_free_notify) {
- swp_entry_t entry;
+ entry.val = page_private(page);
+ if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
unsigned long offset;
- entry.val = page_private(page);
offset = swp_offset(entry);
SetPageDirty(page);
@@ -140,8 +137,10 @@
unlock_page(page);
WRITE_ONCE(bio->bi_private, NULL);
bio_put(bio);
- wake_up_process(waiter);
- put_task_struct(waiter);
+ if (waiter) {
+ blk_wake_io_task(waiter);
+ put_task_struct(waiter);
+ }
}
int generic_swapfile_activate(struct swap_info_struct *sis,
@@ -164,7 +163,7 @@
blocks_per_page = PAGE_SIZE >> blkbits;
/*
- * Map all the blocks into the extent list. This code doesn't try
+ * Map all the blocks into the extent tree. This code doesn't try
* to be very smart.
*/
probe_block = 0;
@@ -283,7 +282,7 @@
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -294,7 +293,7 @@
};
struct iov_iter from;
- iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
+ iov_iter_bvec(&from, WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
@@ -339,7 +338,7 @@
goto out;
}
bio->bi_opf = REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc);
- bio_associate_blkcg_from_page(bio, page);
+ bio_associate_blkg_from_page(bio, page);
count_swpout_vm_event(page);
set_page_writeback(page);
unlock_page(page);
@@ -365,7 +364,7 @@
goto out;
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -398,9 +397,12 @@
* Keep this task valid during swap readpage because the oom killer may
* attempt to access it in the page fault retry time check.
*/
- get_task_struct(current);
- bio->bi_private = current;
bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ if (synchronous) {
+ bio->bi_opf |= REQ_HIPRI;
+ get_task_struct(current);
+ bio->bi_private = current;
+ }
count_vm_event(PSWPIN);
bio_get(bio);
qc = submit_bio(bio);
@@ -409,8 +411,8 @@
if (!READ_ONCE(bio->bi_private))
break;
- if (!blk_poll(disk->queue, qc))
- break;
+ if (!blk_poll(disk->queue, qc, true))
+ io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
@@ -423,7 +425,7 @@
{
struct swap_info_struct *sis = page_swap_info(page);
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_FS) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 43e0856..89c19c0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -15,8 +15,7 @@
#define CREATE_TRACE_POINTS
#include <trace/events/page_isolation.h>
-static int set_migratetype_isolate(struct page *page, int migratetype,
- bool skip_hwpoisoned_pages)
+static int set_migratetype_isolate(struct page *page, int migratetype, int isol_flags)
{
struct zone *zone;
unsigned long flags, pfn;
@@ -61,7 +60,7 @@
* We just check MOVABLE pages.
*/
if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype,
- skip_hwpoisoned_pages))
+ isol_flags))
ret = 0;
/*
@@ -152,8 +151,6 @@
for (i = 0; i < nr_pages; i++) {
struct page *page;
- if (!pfn_valid_within(pfn + i))
- continue;
page = pfn_to_online_page(pfn + i);
if (!page)
continue;
@@ -162,34 +159,44 @@
return NULL;
}
-/*
- * start_isolate_page_range() -- make page-allocation-type of range of pages
- * to be MIGRATE_ISOLATE.
- * @start_pfn: The lower PFN of the range to be isolated.
- * @end_pfn: The upper PFN of the range to be isolated.
- * @migratetype: migrate type to set in error recovery.
+/**
+ * start_isolate_page_range() - make page-allocation-type of range of pages to
+ * be MIGRATE_ISOLATE.
+ * @start_pfn: The lower PFN of the range to be isolated.
+ * @end_pfn: The upper PFN of the range to be isolated.
+ * start_pfn/end_pfn must be aligned to pageblock_order.
+ * @migratetype: Migrate type to set in error recovery.
+ * @flags: The following flags are allowed (they can be combined in
+ * a bit mask)
+ * SKIP_HWPOISON - ignore hwpoison pages
+ * REPORT_FAILURE - report details about the failure to
+ * isolate the range
*
* Making page-allocation-type to be MIGRATE_ISOLATE means free pages in
* the range will never be allocated. Any free pages and pages freed in the
- * future will not be allocated again.
- *
- * start_pfn/end_pfn must be aligned to pageblock_order.
- * Return 0 on success and -EBUSY if any part of range cannot be isolated.
+ * future will not be allocated again. If specified range includes migrate types
+ * other than MOVABLE or CMA, this will fail with -EBUSY. For isolating all
+ * pages in the range finally, the caller has to free all pages in the range.
+ * test_page_isolated() can be used to test it.
*
* There is no high level synchronization mechanism that prevents two threads
- * from trying to isolate overlapping ranges. If this happens, one thread
+ * from trying to isolate overlapping ranges. If this happens, one thread
* will notice pageblocks in the overlapping range already set to isolate.
* This happens in set_migratetype_isolate, and set_migratetype_isolate
- * returns an error. We then clean up by restoring the migration type on
- * pageblocks we may have modified and return -EBUSY to caller. This
+ * returns an error. We then clean up by restoring the migration type on
+ * pageblocks we may have modified and return -EBUSY to caller. This
* prevents two threads from simultaneously working on overlapping ranges.
+ *
+ * Return: the number of isolated pageblocks on success and -EBUSY if any part
+ * of range cannot be isolated.
*/
int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
- unsigned migratetype, bool skip_hwpoisoned_pages)
+ unsigned migratetype, int flags)
{
unsigned long pfn;
unsigned long undo_pfn;
struct page *page;
+ int nr_isolate_pageblock = 0;
BUG_ON(!IS_ALIGNED(start_pfn, pageblock_nr_pages));
BUG_ON(!IS_ALIGNED(end_pfn, pageblock_nr_pages));
@@ -198,13 +205,15 @@
pfn < end_pfn;
pfn += pageblock_nr_pages) {
page = __first_valid_page(pfn, pageblock_nr_pages);
- if (page &&
- set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) {
- undo_pfn = pfn;
- goto undo;
+ if (page) {
+ if (set_migratetype_isolate(page, migratetype, flags)) {
+ undo_pfn = pfn;
+ goto undo;
+ }
+ nr_isolate_pageblock++;
}
}
- return 0;
+ return nr_isolate_pageblock;
undo:
for (pfn = start_pfn;
pfn < undo_pfn;
@@ -221,7 +230,7 @@
/*
* Make isolated pages available again.
*/
-int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
+void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn,
unsigned migratetype)
{
unsigned long pfn;
@@ -238,7 +247,6 @@
continue;
unset_migratetype_isolate(page, migratetype);
}
- return 0;
}
/*
* Test all pages in the range is free(means isolated) or not.
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d80adfe..18ecde9 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -3,7 +3,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/stacktrace.h>
#include <linux/page_owner.h>
#include <linux/jump_label.h>
@@ -24,9 +24,10 @@
short last_migrate_reason;
gfp_t gfp_mask;
depot_stack_handle_t handle;
+ depot_stack_handle_t free_handle;
};
-static bool page_owner_disabled = true;
+static bool page_owner_enabled = false;
DEFINE_STATIC_KEY_FALSE(page_owner_inited);
static depot_stack_handle_t dummy_handle;
@@ -41,7 +42,7 @@
return -EINVAL;
if (strcmp(buf, "on") == 0)
- page_owner_disabled = false;
+ page_owner_enabled = true;
return 0;
}
@@ -49,24 +50,16 @@
static bool need_page_owner(void)
{
- if (page_owner_disabled)
- return false;
-
- return true;
+ return page_owner_enabled;
}
static __always_inline depot_stack_handle_t create_dummy_stack(void)
{
unsigned long entries[4];
- struct stack_trace dummy;
+ unsigned int nr_entries;
- dummy.nr_entries = 0;
- dummy.max_entries = ARRAY_SIZE(entries);
- dummy.entries = &entries[0];
- dummy.skip = 0;
-
- save_stack_trace(&dummy);
- return depot_save_stack(&dummy, GFP_KERNEL);
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0);
+ return stack_depot_save(entries, nr_entries, GFP_KERNEL);
}
static noinline void register_dummy_stack(void)
@@ -86,7 +79,7 @@
static void init_page_owner(void)
{
- if (page_owner_disabled)
+ if (!page_owner_enabled)
return;
register_dummy_stack();
@@ -107,80 +100,83 @@
return (void *)page_ext + page_owner_ops.offset;
}
-void __reset_page_owner(struct page *page, unsigned int order)
+static inline bool check_recursive_alloc(unsigned long *entries,
+ unsigned int nr_entries,
+ unsigned long ip)
{
- int i;
- struct page_ext *page_ext;
+ unsigned int i;
- for (i = 0; i < (1 << order); i++) {
- page_ext = lookup_page_ext(page + i);
- if (unlikely(!page_ext))
- continue;
- __clear_bit(PAGE_EXT_OWNER, &page_ext->flags);
- }
-}
-
-static inline bool check_recursive_alloc(struct stack_trace *trace,
- unsigned long ip)
-{
- int i;
-
- if (!trace->nr_entries)
- return false;
-
- for (i = 0; i < trace->nr_entries; i++) {
- if (trace->entries[i] == ip)
+ for (i = 0; i < nr_entries; i++) {
+ if (entries[i] == ip)
return true;
}
-
return false;
}
static noinline depot_stack_handle_t save_stack(gfp_t flags)
{
unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 2
- };
depot_stack_handle_t handle;
+ unsigned int nr_entries;
- save_stack_trace(&trace);
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries-1] == ULONG_MAX)
- trace.nr_entries--;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 2);
/*
- * We need to check recursion here because our request to stackdepot
- * could trigger memory allocation to save new entry. New memory
- * allocation would reach here and call depot_save_stack() again
- * if we don't catch it. There is still not enough memory in stackdepot
- * so it would try to allocate memory again and loop forever.
+ * We need to check recursion here because our request to
+ * stackdepot could trigger memory allocation to save new
+ * entry. New memory allocation would reach here and call
+ * stack_depot_save() again if we don't catch it. There is
+ * still not enough memory in stackdepot so it would try to
+ * allocate memory again and loop forever.
*/
- if (check_recursive_alloc(&trace, _RET_IP_))
+ if (check_recursive_alloc(entries, nr_entries, _RET_IP_))
return dummy_handle;
- handle = depot_save_stack(&trace, flags);
+ handle = stack_depot_save(entries, nr_entries, flags);
if (!handle)
handle = failure_handle;
return handle;
}
-static inline void __set_page_owner_handle(struct page_ext *page_ext,
- depot_stack_handle_t handle, unsigned int order, gfp_t gfp_mask)
+void __reset_page_owner(struct page *page, unsigned int order)
{
+ int i;
+ struct page_ext *page_ext;
+ depot_stack_handle_t handle = 0;
struct page_owner *page_owner;
- page_owner = get_page_owner(page_ext);
- page_owner->handle = handle;
- page_owner->order = order;
- page_owner->gfp_mask = gfp_mask;
- page_owner->last_migrate_reason = -1;
+ handle = save_stack(GFP_NOWAIT | __GFP_NOWARN);
- __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ page_ext = lookup_page_ext(page);
+ if (unlikely(!page_ext))
+ return;
+ for (i = 0; i < (1 << order); i++) {
+ __clear_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+ page_owner = get_page_owner(page_ext);
+ page_owner->free_handle = handle;
+ page_ext = page_ext_next(page_ext);
+ }
+}
+
+static inline void __set_page_owner_handle(struct page *page,
+ struct page_ext *page_ext, depot_stack_handle_t handle,
+ unsigned int order, gfp_t gfp_mask)
+{
+ struct page_owner *page_owner;
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->handle = handle;
+ page_owner->order = order;
+ page_owner->gfp_mask = gfp_mask;
+ page_owner->last_migrate_reason = -1;
+ __set_bit(PAGE_EXT_OWNER, &page_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
+
+ page_ext = page_ext_next(page_ext);
+ }
}
noinline void __set_page_owner(struct page *page, unsigned int order,
@@ -193,7 +189,7 @@
return;
handle = save_stack(gfp_mask);
- __set_page_owner_handle(page_ext, handle, order, gfp_mask);
+ __set_page_owner_handle(page, page_ext, handle, order, gfp_mask);
}
void __set_page_owner_migrate_reason(struct page *page, int reason)
@@ -217,10 +213,11 @@
if (unlikely(!page_ext))
return;
- page_owner = get_page_owner(page_ext);
- page_owner->order = 0;
- for (i = 1; i < (1 << order); i++)
- __copy_page_owner(page, page + i);
+ for (i = 0; i < (1 << order); i++) {
+ page_owner = get_page_owner(page_ext);
+ page_owner->order = 0;
+ page_ext = page_ext_next(page_ext);
+ }
}
void __copy_page_owner(struct page *oldpage, struct page *newpage)
@@ -250,6 +247,7 @@
* the new page, which will be freed.
*/
__set_bit(PAGE_EXT_OWNER, &new_ext->flags);
+ __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags);
}
void pagetypeinfo_showmixedcount_print(struct seq_file *m,
@@ -273,7 +271,8 @@
* not matter as the mixed block count will still be correct
*/
for (; pfn < end_pfn; ) {
- if (!pfn_valid(pfn)) {
+ page = pfn_to_online_page(pfn);
+ if (!page) {
pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES);
continue;
}
@@ -281,13 +280,13 @@
block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages);
block_end_pfn = min(block_end_pfn, end_pfn);
- page = pfn_to_page(pfn);
pageblock_mt = get_pageblock_migratetype(page);
for (; pfn < block_end_pfn; pfn++) {
if (!pfn_valid_within(pfn))
continue;
+ /* The pageblock is online, no need to recheck. */
page = pfn_to_page(pfn);
if (page_zone(page) != zone)
@@ -309,7 +308,7 @@
if (unlikely(!page_ext))
continue;
- if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
continue;
page_owner = get_page_owner(page_ext);
@@ -340,17 +339,12 @@
struct page *page, struct page_owner *page_owner,
depot_stack_handle_t handle)
{
- int ret;
- int pageblock_mt, page_mt;
+ int ret, pageblock_mt, page_mt;
+ unsigned long *entries;
+ unsigned int nr_entries;
char *kbuf;
- unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 0
- };
+ count = min_t(size_t, count, PAGE_SIZE);
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
return -ENOMEM;
@@ -377,8 +371,8 @@
if (ret >= count)
goto err;
- depot_fetch_stack(handle, &trace);
- ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
+ nr_entries = stack_depot_fetch(handle, &entries);
+ ret += stack_trace_snprint(kbuf + ret, count - ret, entries, nr_entries, 0);
if (ret >= count)
goto err;
@@ -409,14 +403,9 @@
{
struct page_ext *page_ext = lookup_page_ext(page);
struct page_owner *page_owner;
- unsigned long entries[PAGE_OWNER_STACK_DEPTH];
- struct stack_trace trace = {
- .nr_entries = 0,
- .entries = entries,
- .max_entries = PAGE_OWNER_STACK_DEPTH,
- .skip = 0
- };
depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
gfp_t gfp_mask;
int mt;
@@ -430,20 +419,34 @@
mt = gfpflags_to_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
- pr_alert("page_owner info is not active (free page?)\n");
+ pr_alert("page_owner info is not present (never set?)\n");
return;
}
+ if (test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ pr_alert("page_owner tracks the page as allocated\n");
+ else
+ pr_alert("page_owner tracks the page as freed\n");
+
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+
handle = READ_ONCE(page_owner->handle);
if (!handle) {
- pr_alert("page_owner info is not active (free page?)\n");
- return;
+ pr_alert("page_owner allocation stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
}
- depot_fetch_stack(handle, &trace);
- pr_alert("page allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
- print_stack_trace(&trace, 0);
+ handle = READ_ONCE(page_owner->free_handle);
+ if (!handle) {
+ pr_alert("page_owner free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ pr_alert("page last free stack trace:\n");
+ stack_trace_print(entries, nr_entries, 0);
+ }
if (page_owner->last_migrate_reason != -1)
pr_alert("page has been migrated, last migrate reason: %s\n",
@@ -506,9 +509,23 @@
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags))
continue;
+ /*
+ * Although we do have the info about past allocation of free
+ * pages, it's not relevant for current memory usage.
+ */
+ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags))
+ continue;
+
page_owner = get_page_owner(page_ext);
/*
+ * Don't print "tail" pages of high-order allocations as that
+ * would inflate the stats.
+ */
+ if (!IS_ALIGNED(pfn, 1 << page_owner->order))
+ continue;
+
+ /*
* Access to page_ext->handle isn't synchronous so we should
* be careful to access it.
*/
@@ -587,7 +604,8 @@
continue;
/* Found early allocated page */
- __set_page_owner_handle(page_ext, early_handle, 0, 0);
+ __set_page_owner_handle(page, page_ext, early_handle,
+ 0, 0);
count++;
}
cond_resched();
@@ -624,16 +642,14 @@
static int __init pageowner_init(void)
{
- struct dentry *dentry;
-
if (!static_branch_unlikely(&page_owner_inited)) {
pr_info("page_owner is disabled\n");
return 0;
}
- dentry = debugfs_create_file("page_owner", 0400, NULL,
- NULL, &proc_page_owner_operations);
+ debugfs_create_file("page_owner", 0400, NULL, NULL,
+ &proc_page_owner_operations);
- return PTR_ERR_OR_ZERO(dentry);
+ return 0;
}
late_initcall(pageowner_init)
diff --git a/mm/page_poison.c b/mm/page_poison.c
index aa2b3d3..34b9181 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -6,6 +6,7 @@
#include <linux/page_ext.h>
#include <linux/poison.h>
#include <linux/ratelimit.h>
+#include <linux/kasan.h>
static bool want_page_poisoning __read_mostly;
@@ -17,11 +18,16 @@
}
early_param("page_poison", early_page_poison_param);
+/**
+ * page_poisoning_enabled - check if page poisoning is enabled
+ *
+ * Return true if page poisoning is enabled, or false if not.
+ */
bool page_poisoning_enabled(void)
{
/*
* Assumes that debug_pagealloc_enabled is set before
- * free_all_bootmem.
+ * memblock_free_all.
* Page poisoning is debug page alloc for some arches. If
* either of those options are enabled, enable poisoning.
*/
@@ -29,12 +35,16 @@
(!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
debug_pagealloc_enabled()));
}
+EXPORT_SYMBOL_GPL(page_poisoning_enabled);
static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
+ /* KASAN still thinks the page is in-use, so skip it. */
+ kasan_disable_current();
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kasan_enable_current();
kunmap_atomic(addr);
}
@@ -91,7 +101,7 @@
/*
* Page poisoning when enabled poisons each and every page
* that is freed to buddy. Thus no extra check is done to
- * see if a page was posioned.
+ * see if a page was poisoned.
*/
check_poison_mem(addr, PAGE_SIZE);
kunmap_atomic(addr);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 11df03e..eff4b45 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -153,8 +153,7 @@
if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
- pvmw->pte = huge_pte_offset(mm, pvmw->address,
- PAGE_SIZE << compound_order(page));
+ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
return false;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index c3084ff..d48c2a9 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/mm.h>
+#include <linux/pagewalk.h>
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
@@ -9,10 +9,11 @@
{
pte_t *pte;
int err = 0;
+ const struct mm_walk_ops *ops = walk->ops;
pte = pte_offset_map(pmd, addr);
for (;;) {
- err = walk->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
+ err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
addr += PAGE_SIZE;
@@ -30,6 +31,7 @@
{
pmd_t *pmd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pmd = pmd_offset(pud, addr);
@@ -37,8 +39,8 @@
again:
next = pmd_addr_end(addr, end);
if (pmd_none(*pmd) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
@@ -47,8 +49,8 @@
* This implies that each ->pmd_entry() handler
* needs to know about pmd_trans_huge() pmds
*/
- if (walk->pmd_entry)
- err = walk->pmd_entry(pmd, addr, next, walk);
+ if (ops->pmd_entry)
+ err = ops->pmd_entry(pmd, addr, next, walk);
if (err)
break;
@@ -56,7 +58,7 @@
* Check this here so we only break down trans_huge
* pages when we _need_ to
*/
- if (!walk->pte_entry)
+ if (!ops->pte_entry)
continue;
split_huge_pmd(walk->vma, pmd, addr);
@@ -75,6 +77,7 @@
{
pud_t *pud;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pud = pud_offset(p4d, addr);
@@ -82,18 +85,18 @@
again:
next = pud_addr_end(addr, end);
if (pud_none(*pud) || !walk->vma) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pud_entry) {
+ if (ops->pud_entry) {
spinlock_t *ptl = pud_trans_huge_lock(pud, walk->vma);
if (ptl) {
- err = walk->pud_entry(pud, addr, next, walk);
+ err = ops->pud_entry(pud, addr, next, walk);
spin_unlock(ptl);
if (err)
break;
@@ -105,7 +108,7 @@
if (pud_none(*pud))
goto again;
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -119,19 +122,20 @@
{
p4d_t *p4d;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_pud_range(p4d, addr, next, walk);
if (err)
break;
@@ -145,19 +149,20 @@
{
pgd_t *pgd;
unsigned long next;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
pgd = pgd_offset(walk->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
continue;
}
- if (walk->pmd_entry || walk->pte_entry)
+ if (ops->pmd_entry || ops->pte_entry)
err = walk_p4d_range(pgd, addr, next, walk);
if (err)
break;
@@ -183,6 +188,7 @@
unsigned long hmask = huge_page_mask(h);
unsigned long sz = huge_page_size(h);
pte_t *pte;
+ const struct mm_walk_ops *ops = walk->ops;
int err = 0;
do {
@@ -190,9 +196,9 @@
pte = huge_pte_offset(walk->mm, addr & hmask, sz);
if (pte)
- err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
- else if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
+ err = ops->hugetlb_entry(pte, hmask, addr, next, walk);
+ else if (ops->pte_hole)
+ err = ops->pte_hole(addr, next, walk);
if (err)
break;
@@ -220,9 +226,10 @@
struct mm_walk *walk)
{
struct vm_area_struct *vma = walk->vma;
+ const struct mm_walk_ops *ops = walk->ops;
- if (walk->test_walk)
- return walk->test_walk(start, end, walk);
+ if (ops->test_walk)
+ return ops->test_walk(start, end, walk);
/*
* vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
@@ -234,8 +241,8 @@
*/
if (vma->vm_flags & VM_PFNMAP) {
int err = 1;
- if (walk->pte_hole)
- err = walk->pte_hole(start, end, walk);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, end, walk);
return err ? err : 1;
}
return 0;
@@ -248,7 +255,7 @@
struct vm_area_struct *vma = walk->vma;
if (vma && is_vm_hugetlb_page(vma)) {
- if (walk->hugetlb_entry)
+ if (walk->ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
@@ -258,11 +265,13 @@
/**
* walk_page_range - walk page table with caller specific callbacks
- * @start: start address of the virtual address range
- * @end: end address of the virtual address range
- * @walk: mm_walk structure defining the callbacks and the target address space
+ * @mm: mm_struct representing the target process of page table walk
+ * @start: start address of the virtual address range
+ * @end: end address of the virtual address range
+ * @ops: operation to call during the walk
+ * @private: private data for callbacks' usage
*
- * Recursively walk the page table tree of the process represented by @walk->mm
+ * Recursively walk the page table tree of the process represented by @mm
* within the virtual address range [@start, @end). During walking, we can do
* some caller-specific works for each entry, by setting up pmd_entry(),
* pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these
@@ -278,47 +287,52 @@
*
* Before starting to walk page table, some callers want to check whether
* they really want to walk over the current vma, typically by checking
- * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * its vm_flags. walk_page_test() and @ops->test_walk() are used for this
* purpose.
*
* struct mm_walk keeps current values of some common data like vma and pmd,
* which are useful for the access from callbacks. If you want to pass some
- * caller-specific data to callbacks, @walk->private should be helpful.
+ * caller-specific data to callbacks, @private should be helpful.
*
* Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold
- * @walk->mm->mmap_sem, because these function traverse vma list and/or
- * access to vma's data.
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ * because these functions traverse the vma list and/or access vma's data.
*/
-int walk_page_range(unsigned long start, unsigned long end,
- struct mm_walk *walk)
+int walk_page_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, const struct mm_walk_ops *ops,
+ void *private)
{
int err = 0;
unsigned long next;
struct vm_area_struct *vma;
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = mm,
+ .private = private,
+ };
if (start >= end)
return -EINVAL;
- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;
- VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
+ lockdep_assert_held(&walk.mm->mmap_sem);
- vma = find_vma(walk->mm, start);
+ vma = find_vma(walk.mm, start);
do {
if (!vma) { /* after the last vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = end;
} else if (start < vma->vm_start) { /* outside vma */
- walk->vma = NULL;
+ walk.vma = NULL;
next = min(end, vma->vm_start);
} else { /* inside vma */
- walk->vma = vma;
+ walk.vma = vma;
next = min(end, vma->vm_end);
vma = vma->vm_next;
- err = walk_page_test(start, next, walk);
+ err = walk_page_test(start, next, &walk);
if (err > 0) {
/*
* positive return values are purely for
@@ -331,28 +345,34 @@
if (err < 0)
break;
}
- if (walk->vma || walk->pte_hole)
- err = __walk_page_range(start, next, walk);
+ if (walk.vma || walk.ops->pte_hole)
+ err = __walk_page_range(start, next, &walk);
if (err)
break;
} while (start = next, start < end);
return err;
}
-int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
+ void *private)
{
+ struct mm_walk walk = {
+ .ops = ops,
+ .mm = vma->vm_mm,
+ .vma = vma,
+ .private = private,
+ };
int err;
- if (!walk->mm)
+ if (!walk.mm)
return -EINVAL;
- VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
- VM_BUG_ON(!vma);
- walk->vma = vma;
- err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ lockdep_assert_held(&walk.mm->mmap_sem);
+
+ err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
return 0;
if (err < 0)
return err;
- return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+ return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
}
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index b1739dc..0468ba5 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -9,8 +9,17 @@
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
+ *
+ * The scan hint is the largest known contiguous area before the contig hint.
+ * It is not necessarily the actual largest contig hint though. There is an
+ * invariant that the scan_hint_start > contig_hint_start iff
+ * scan_hint == contig_hint. This is necessary because when scanning forward,
+ * we don't know if a new contig hint would be better than the current one.
*/
struct pcpu_block_md {
+ int scan_hint; /* scan hint for block */
+ int scan_hint_start; /* block relative starting
+ position of the scan hint */
int contig_hint; /* contig hint for block */
int contig_hint_start; /* block relative starting
position of the contig hint */
@@ -19,6 +28,7 @@
int right_free; /* size of free space along
the right side of the block */
int first_free; /* block position of first free */
+ int nr_bits; /* total bits responsible for */
};
struct pcpu_chunk {
@@ -29,9 +39,7 @@
struct list_head list; /* linked to pcpu_slot lists */
int free_bytes; /* free bytes in the chunk */
- int contig_bits; /* max contiguous size hint */
- int contig_bits_start; /* contig_bits starting
- offset */
+ struct pcpu_block_md chunk_md;
void *base_addr; /* base address of this chunk */
unsigned long *alloc_map; /* allocation map */
@@ -39,7 +47,6 @@
struct pcpu_block_md *md_blocks; /* metadata blocks */
void *data; /* chunk data */
- int first_bit; /* no free below this */
bool immutable; /* no [de]population allowed */
int start_offset; /* the overlap with the previous
region to have a page aligned
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 38de70a..20d2b69 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-km.c - kernel memory based chunk allocation
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
- *
* Chunks are allocated as a contiguous kernel memory using gfp
* allocation. This is to be used on nommu architectures.
*
@@ -50,6 +49,7 @@
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
+ unsigned long flags;
int i;
chunk = pcpu_alloc_chunk(gfp);
@@ -66,11 +66,11 @@
pcpu_set_page_chunk(nth_page(pages, i), chunk);
chunk->data = pages;
- chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
+ chunk->base_addr = page_address(pages);
- spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, 0, nr_pages, false);
- spin_unlock_irq(&pcpu_lock);
+ spin_lock_irqsave(&pcpu_lock, flags);
+ pcpu_chunk_populated(chunk, 0, nr_pages);
+ spin_unlock_irqrestore(&pcpu_lock, flags);
pcpu_stats_chunk_alloc();
trace_percpu_create_chunk(chunk->base_addr);
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index b5fdd43..a5a8b22 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-debug.c
*
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennisz@fb.com>
*
- * This file is released under the GPLv2.
- *
* Prints statistics about the percpu allocator and backing chunks.
*/
#include <linux/debugfs.h>
@@ -53,6 +52,7 @@
static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
int *buffer)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int i, last_alloc, as_len, start, end;
int *alloc_sizes, *p;
/* statistics */
@@ -121,9 +121,9 @@
P("nr_alloc", chunk->nr_alloc);
P("max_alloc_size", chunk->max_alloc_size);
P("empty_pop_pages", chunk->nr_empty_pop_pages);
- P("first_bit", chunk->first_bit);
+ P("first_bit", chunk_md->first_free);
P("free_bytes", chunk->free_bytes);
- P("contig_bytes", chunk->contig_bits * PCPU_MIN_ALLOC_SIZE);
+ P("contig_bytes", chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
P("sum_frag", sum_frag);
P("max_frag", max_frag);
P("cur_min_alloc", cur_min_alloc);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index d8078de..a2b395a 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -1,11 +1,10 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu-vm.c - vmalloc area based chunk allocation
*
* Copyright (C) 2010 SUSE Linux Products GmbH
* Copyright (C) 2010 Tejun Heo <tj@kernel.org>
*
- * This file is released under the GPLv2.
- *
* Chunks are mapped into vmalloc areas and populated page by page.
* This is the default chunk allocator.
*/
diff --git a/mm/percpu.c b/mm/percpu.c
index 4b90682..7e06a1e 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/percpu.c - percpu memory allocator
*
@@ -7,8 +8,6 @@
* Copyright (C) 2017 Facebook Inc.
* Copyright (C) 2017 Dennis Zhou <dennisszhou@gmail.com>
*
- * This file is released under the GPLv2 license.
- *
* The percpu allocator handles both static and dynamic areas. Percpu
* areas are allocated in chunks which are divided into units. There is
* a 1-to-1 mapping for units to possible cpus. These units are grouped
@@ -65,7 +64,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/bitmap.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/err.h>
#include <linux/lcm.h>
#include <linux/list.h>
@@ -94,6 +93,8 @@
/* the slots are sorted by free bytes left, 1-31 bytes share the same slot */
#define PCPU_SLOT_BASE_SHIFT 5
+/* chunks in slots below this are subject to being sidelined on failed alloc */
+#define PCPU_SLOT_FAIL_THRESHOLD 3
#define PCPU_EMPTY_POP_PAGES_LOW 2
#define PCPU_EMPTY_POP_PAGES_HIGH 4
@@ -231,10 +232,13 @@
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
- if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk->contig_bits == 0)
+ const struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+
+ if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE ||
+ chunk_md->contig_hint == 0)
return 0;
- return pcpu_size_to_slot(chunk->free_bytes);
+ return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE);
}
/* set the pointer to a chunk in a page struct */
@@ -318,6 +322,34 @@
return index * PCPU_BITMAP_BLOCK_BITS + off;
}
+/*
+ * pcpu_next_hint - determine which hint to use
+ * @block: block of interest
+ * @alloc_bits: size of allocation
+ *
+ * This determines if we should scan based on the scan_hint or first_free.
+ * In general, we want to scan from first_free to fulfill allocations by
+ * first fit. However, if we know a scan_hint at position scan_hint_start
+ * cannot fulfill an allocation, we can begin scanning from there knowing
+ * the contig_hint will be our fallback.
+ */
+static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits)
+{
+ /*
+ * The three conditions below determine if we can skip past the
+ * scan_hint. First, does the scan hint exist. Second, is the
+ * contig_hint after the scan_hint (possibly not true iff
+ * contig_hint == scan_hint). Third, is the allocation request
+ * larger than the scan_hint.
+ */
+ if (block->scan_hint &&
+ block->contig_hint_start > block->scan_hint_start &&
+ alloc_bits > block->scan_hint)
+ return block->scan_hint_start + block->scan_hint;
+
+ return block->first_free;
+}
+
/**
* pcpu_next_md_free_region - finds the next hint free area
* @chunk: chunk of interest
@@ -413,9 +445,11 @@
if (block->contig_hint &&
block->contig_hint_start >= block_off &&
block->contig_hint >= *bits + alloc_bits) {
+ int start = pcpu_next_hint(block, alloc_bits);
+
*bits += alloc_bits + block->contig_hint_start -
- block->first_free;
- *bit_off = pcpu_block_off_to_off(i, block->first_free);
+ start;
+ *bit_off = pcpu_block_off_to_off(i, start);
return;
}
/* reset to satisfy the second predicate above */
@@ -488,6 +522,22 @@
kvfree(ptr);
}
+static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
+ bool move_front)
+{
+ if (chunk != pcpu_reserved_chunk) {
+ if (move_front)
+ list_move(&chunk->list, &pcpu_slot[slot]);
+ else
+ list_move_tail(&chunk->list, &pcpu_slot[slot]);
+ }
+}
+
+static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot)
+{
+ __pcpu_chunk_move(chunk, slot, true);
+}
+
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
@@ -505,110 +555,39 @@
{
int nslot = pcpu_chunk_slot(chunk);
- if (chunk != pcpu_reserved_chunk && oslot != nslot) {
- if (oslot < nslot)
- list_move(&chunk->list, &pcpu_slot[nslot]);
- else
- list_move_tail(&chunk->list, &pcpu_slot[nslot]);
- }
+ if (oslot != nslot)
+ __pcpu_chunk_move(chunk, nslot, oslot < nslot);
}
-/**
- * pcpu_cnt_pop_pages- counts populated backing pages in range
+/*
+ * pcpu_update_empty_pages - update empty page counters
* @chunk: chunk of interest
- * @bit_off: start offset
- * @bits: size of area to check
+ * @nr: nr of empty pages
*
- * Calculates the number of populated pages in the region
- * [page_start, page_end). This keeps track of how many empty populated
- * pages are available and decide if async work should be scheduled.
- *
- * RETURNS:
- * The nr of populated pages.
+ * This is used to keep track of the empty pages now based on the premise
+ * a md_block covers a page. The hint update functions recognize if a block
+ * is made full or broken to calculate deltas for keeping track of free pages.
*/
-static inline int pcpu_cnt_pop_pages(struct pcpu_chunk *chunk, int bit_off,
- int bits)
+static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr)
{
- int page_start = PFN_UP(bit_off * PCPU_MIN_ALLOC_SIZE);
- int page_end = PFN_DOWN((bit_off + bits) * PCPU_MIN_ALLOC_SIZE);
-
- if (page_start >= page_end)
- return 0;
-
- /*
- * bitmap_weight counts the number of bits set in a bitmap up to
- * the specified number of bits. This is counting the populated
- * pages up to page_end and then subtracting the populated pages
- * up to page_start to count the populated pages in
- * [page_start, page_end).
- */
- return bitmap_weight(chunk->populated, page_end) -
- bitmap_weight(chunk->populated, page_start);
-}
-
-/**
- * pcpu_chunk_update - updates the chunk metadata given a free area
- * @chunk: chunk of interest
- * @bit_off: chunk offset
- * @bits: size of free area
- *
- * This updates the chunk's contig hint and starting offset given a free area.
- * Choose the best starting offset if the contig hint is equal.
- */
-static void pcpu_chunk_update(struct pcpu_chunk *chunk, int bit_off, int bits)
-{
- if (bits > chunk->contig_bits) {
- chunk->contig_bits_start = bit_off;
- chunk->contig_bits = bits;
- } else if (bits == chunk->contig_bits && chunk->contig_bits_start &&
- (!bit_off ||
- __ffs(bit_off) > __ffs(chunk->contig_bits_start))) {
- /* use the start with the best alignment */
- chunk->contig_bits_start = bit_off;
- }
-}
-
-/**
- * pcpu_chunk_refresh_hint - updates metadata about a chunk
- * @chunk: chunk of interest
- *
- * Iterates over the metadata blocks to find the largest contig area.
- * It also counts the populated pages and uses the delta to update the
- * global count.
- *
- * Updates:
- * chunk->contig_bits
- * chunk->contig_bits_start
- * nr_empty_pop_pages (chunk and global)
- */
-static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk)
-{
- int bit_off, bits, nr_empty_pop_pages;
-
- /* clear metadata */
- chunk->contig_bits = 0;
-
- bit_off = chunk->first_bit;
- bits = nr_empty_pop_pages = 0;
- pcpu_for_each_md_free_region(chunk, bit_off, bits) {
- pcpu_chunk_update(chunk, bit_off, bits);
-
- nr_empty_pop_pages += pcpu_cnt_pop_pages(chunk, bit_off, bits);
- }
-
- /*
- * Keep track of nr_empty_pop_pages.
- *
- * The chunk maintains the previous number of free pages it held,
- * so the delta is used to update the global counter. The reserved
- * chunk is not part of the free page count as they are populated
- * at init and are special to serving reserved allocations.
- */
+ chunk->nr_empty_pop_pages += nr;
if (chunk != pcpu_reserved_chunk)
- pcpu_nr_empty_pop_pages +=
- (nr_empty_pop_pages - chunk->nr_empty_pop_pages);
+ pcpu_nr_empty_pop_pages += nr;
+}
- chunk->nr_empty_pop_pages = nr_empty_pop_pages;
+/*
+ * pcpu_region_overlap - determines if two regions overlap
+ * @a: start of first region, inclusive
+ * @b: end of first region, exclusive
+ * @x: start of second region, inclusive
+ * @y: end of second region, exclusive
+ *
+ * This is used to determine if the hint region [a, b) overlaps with the
+ * allocated region [x, y).
+ */
+static inline bool pcpu_region_overlap(int a, int b, int x, int y)
+{
+ return (a < y) && (x < b);
}
/**
@@ -629,16 +608,132 @@
if (start == 0)
block->left_free = contig;
- if (end == PCPU_BITMAP_BLOCK_BITS)
+ if (end == block->nr_bits)
block->right_free = contig;
if (contig > block->contig_hint) {
+ /* promote the old contig_hint to be the new scan_hint */
+ if (start > block->contig_hint_start) {
+ if (block->contig_hint > block->scan_hint) {
+ block->scan_hint_start =
+ block->contig_hint_start;
+ block->scan_hint = block->contig_hint;
+ } else if (start < block->scan_hint_start) {
+ /*
+ * The old contig_hint == scan_hint. But, the
+ * new contig is larger so hold the invariant
+ * scan_hint_start < contig_hint_start.
+ */
+ block->scan_hint = 0;
+ }
+ } else {
+ block->scan_hint = 0;
+ }
block->contig_hint_start = start;
block->contig_hint = contig;
- } else if (block->contig_hint_start && contig == block->contig_hint &&
- (!start || __ffs(start) > __ffs(block->contig_hint_start))) {
- /* use the start with the best alignment */
- block->contig_hint_start = start;
+ } else if (contig == block->contig_hint) {
+ if (block->contig_hint_start &&
+ (!start ||
+ __ffs(start) > __ffs(block->contig_hint_start))) {
+ /* start has a better alignment so use it */
+ block->contig_hint_start = start;
+ if (start < block->scan_hint_start &&
+ block->contig_hint > block->scan_hint)
+ block->scan_hint = 0;
+ } else if (start > block->scan_hint_start ||
+ block->contig_hint > block->scan_hint) {
+ /*
+ * Knowing contig == contig_hint, update the scan_hint
+ * if it is farther than or larger than the current
+ * scan_hint.
+ */
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ } else {
+ /*
+ * The region is smaller than the contig_hint. So only update
+ * the scan_hint if it is larger than or equal and farther than
+ * the current scan_hint.
+ */
+ if ((start < block->contig_hint_start &&
+ (contig > block->scan_hint ||
+ (contig == block->scan_hint &&
+ start > block->scan_hint_start)))) {
+ block->scan_hint_start = start;
+ block->scan_hint = contig;
+ }
+ }
+}
+
+/*
+ * pcpu_block_update_scan - update a block given a free area from a scan
+ * @chunk: chunk of interest
+ * @bit_off: chunk offset of the free area
+ * @bits: size of free area, in bits
+ *
+ * Finding the final allocation spot first goes through pcpu_find_block_fit()
+ * to find a block that can hold the allocation and then pcpu_alloc_area()
+ * where a scan is used. When allocations require specific alignments,
+ * we can inadvertently create holes which will not be seen in the alloc
+ * or free paths.
+ *
+ * This takes a given free area hole and updates a block as it may change the
+ * scan_hint. We need to scan backwards to ensure we don't miss free bits
+ * from alignment.
+ */
+static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off,
+ int bits)
+{
+ int s_off = pcpu_off_to_block_off(bit_off);
+ int e_off = s_off + bits;
+ int s_index, l_bit;
+ struct pcpu_block_md *block;
+
+ if (e_off > PCPU_BITMAP_BLOCK_BITS)
+ return; /* area runs past the end of its block: leave the hints alone */
+
+ s_index = pcpu_off_to_block_index(bit_off);
+ block = chunk->md_blocks + s_index;
+
+ /* scan backwards in case of alignment skipping free bits */
+ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off);
+ s_off = (s_off == l_bit) ? 0 : l_bit + 1; /* no set bit below: run starts at 0 */
+
+ pcpu_block_update(block, s_off, e_off);
+}
+
+/**
+ * pcpu_chunk_refresh_hint - updates metadata about a chunk
+ * @chunk: chunk of interest
+ * @full_scan: if true, always rescan from first_free (never promote scan_hint)
+ *
+ * Iterates over the metadata blocks to find the largest contig area.
+ * A full scan can be avoided on the allocation path as this is triggered
+ * if we broke the contig_hint. In doing so, the scan_hint will be before
+ * the contig_hint or after if the scan_hint == contig_hint. This cannot
+ * be prevented on freeing as we want to find the largest area possibly
+ * spanning blocks.
+ */
+static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan)
+{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int bit_off, bits;
+
+ /* promote scan_hint to contig_hint */
+ if (!full_scan && chunk_md->scan_hint) {
+ bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; /* resume just past the promoted area */
+ chunk_md->contig_hint_start = chunk_md->scan_hint_start;
+ chunk_md->contig_hint = chunk_md->scan_hint;
+ chunk_md->scan_hint = 0;
+ } else {
+ bit_off = chunk_md->first_free; /* rescan from the first free bit */
+ chunk_md->contig_hint = 0; /* rebuilt by the loop below */
+ }
+
+ bits = 0;
+ pcpu_for_each_md_free_region(chunk, bit_off, bits) {
+ pcpu_block_update(chunk_md, bit_off, bit_off + bits);
}
}
@@ -654,14 +749,23 @@
{
struct pcpu_block_md *block = chunk->md_blocks + index;
unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index);
- int rs, re; /* region start, region end */
+ int rs, re, start; /* region start, region end */
- /* clear hints */
- block->contig_hint = 0;
- block->left_free = block->right_free = 0;
+ /* promote scan_hint to contig_hint */
+ if (block->scan_hint) {
+ start = block->scan_hint_start + block->scan_hint;
+ block->contig_hint_start = block->scan_hint_start;
+ block->contig_hint = block->scan_hint;
+ block->scan_hint = 0;
+ } else {
+ start = block->first_free;
+ block->contig_hint = 0;
+ }
+
+ block->right_free = 0;
/* iterate over free areas and update the contig hints */
- pcpu_for_each_unpop_region(alloc_map, rs, re, block->first_free,
+ pcpu_for_each_unpop_region(alloc_map, rs, re, start,
PCPU_BITMAP_BLOCK_BITS) {
pcpu_block_update(block, rs, re);
}
@@ -680,6 +784,8 @@
static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -704,15 +810,29 @@
* If the allocation breaks the contig_hint, a scan is required to
* restore this hint.
*/
+ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
if (s_off == s_block->first_free)
s_block->first_free = find_next_zero_bit(
pcpu_index_alloc_map(chunk, s_index),
PCPU_BITMAP_BLOCK_BITS,
s_off + bits);
- if (s_off >= s_block->contig_hint_start &&
- s_off < s_block->contig_hint_start + s_block->contig_hint) {
+ if (pcpu_region_overlap(s_block->scan_hint_start,
+ s_block->scan_hint_start + s_block->scan_hint,
+ s_off,
+ s_off + bits))
+ s_block->scan_hint = 0;
+
+ if (pcpu_region_overlap(s_block->contig_hint_start,
+ s_block->contig_hint_start +
+ s_block->contig_hint,
+ s_off,
+ s_off + bits)) {
/* block contig hint is broken - scan to fix it */
+ if (!s_off)
+ s_block->left_free = 0;
pcpu_block_refresh_hint(chunk, s_index);
} else {
/* update left and right contig manually */
@@ -728,6 +848,9 @@
* Update e_block.
*/
if (s_index != e_index) {
+ if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
+
/*
* When the allocation is across blocks, the end is along
* the left part of the e_block.
@@ -740,11 +863,14 @@
/* reset the block */
e_block++;
} else {
+ if (e_off > e_block->scan_hint_start)
+ e_block->scan_hint = 0;
+
+ e_block->left_free = 0;
if (e_off > e_block->contig_hint_start) {
/* contig hint is broken - scan to fix it */
pcpu_block_refresh_hint(chunk, e_index);
} else {
- e_block->left_free = 0;
e_block->right_free =
min_t(int, e_block->right_free,
PCPU_BITMAP_BLOCK_BITS - e_off);
@@ -752,21 +878,36 @@
}
/* update in-between md_blocks */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
+ block->scan_hint = 0;
block->contig_hint = 0;
block->left_free = 0;
block->right_free = 0;
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, -nr_empty_pages);
+
+ if (pcpu_region_overlap(chunk_md->scan_hint_start,
+ chunk_md->scan_hint_start +
+ chunk_md->scan_hint,
+ bit_off,
+ bit_off + bits))
+ chunk_md->scan_hint = 0;
+
/*
* The only time a full chunk scan is required is if the chunk
* contig hint is broken. Otherwise, it means a smaller space
* was used and therefore the chunk contig hint is still correct.
*/
- if (bit_off >= chunk->contig_bits_start &&
- bit_off < chunk->contig_bits_start + chunk->contig_bits)
- pcpu_chunk_refresh_hint(chunk);
+ if (pcpu_region_overlap(chunk_md->contig_hint_start,
+ chunk_md->contig_hint_start +
+ chunk_md->contig_hint,
+ bit_off,
+ bit_off + bits))
+ pcpu_chunk_refresh_hint(chunk, false);
}
/**
@@ -782,13 +923,15 @@
*
* A chunk update is triggered if a page becomes free, a block becomes free,
* or the free spans across blocks. This tradeoff is to minimize iterating
- * over the block metadata to update chunk->contig_bits. chunk->contig_bits
- * may be off by up to a page, but it will never be more than the available
- * space. If the contig hint is contained in one block, it will be accurate.
+ * over the block metadata to update chunk_md->contig_hint.
+ * chunk_md->contig_hint may be off by up to a page, but it will never be more
+ * than the available space. If the contig hint is contained in one block, it
+ * will be accurate.
*/
static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off,
int bits)
{
+ int nr_empty_pages = 0;
struct pcpu_block_md *s_block, *e_block, *block;
int s_index, e_index; /* block indexes of the freed allocation */
int s_off, e_off; /* block offsets of the freed allocation */
@@ -842,16 +985,22 @@
/* update s_block */
e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS;
+ if (!start && e_off == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(s_block, start, e_off);
/* freeing in the same block */
if (s_index != e_index) {
/* update e_block */
+ if (end == PCPU_BITMAP_BLOCK_BITS)
+ nr_empty_pages++;
pcpu_block_update(e_block, 0, end);
/* reset md_blocks in the middle */
+ nr_empty_pages += (e_index - s_index - 1);
for (block = s_block + 1; block < e_block; block++) {
block->first_free = 0;
+ block->scan_hint = 0;
block->contig_hint_start = 0;
block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
block->left_free = PCPU_BITMAP_BLOCK_BITS;
@@ -859,19 +1008,21 @@
}
}
+ if (nr_empty_pages)
+ pcpu_update_empty_pages(chunk, nr_empty_pages);
+
/*
- * Refresh chunk metadata when the free makes a page free, a block
- * free, or spans across blocks. The contig hint may be off by up to
- * a page, but if the hint is contained in a block, it will be accurate
- * with the else condition below.
+ * Refresh chunk metadata when the free makes a block free or spans
+ * across blocks. The contig_hint may be off by up to a page, but if
+ * the contig_hint is contained in a block, it will be accurate with
+ * the else condition below.
*/
- if ((ALIGN_DOWN(end, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS)) >
- ALIGN(start, min(PCPU_BITS_PER_PAGE, PCPU_BITMAP_BLOCK_BITS))) ||
- s_index != e_index)
- pcpu_chunk_refresh_hint(chunk);
+ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index)
+ pcpu_chunk_refresh_hint(chunk, true);
else
- pcpu_chunk_update(chunk, pcpu_block_off_to_off(s_index, start),
- s_block->contig_hint);
+ pcpu_block_update(&chunk->chunk_md,
+ pcpu_block_off_to_off(s_index, start),
+ end);
}
/**
@@ -926,6 +1077,7 @@
static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, bool pop_only)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, next_off;
/*
@@ -934,12 +1086,12 @@
* cannot fit in the global hint, there is memory pressure and creating
* a new chunk would happen soon.
*/
- bit_off = ALIGN(chunk->contig_bits_start, align) -
- chunk->contig_bits_start;
- if (bit_off + alloc_bits > chunk->contig_bits)
+ bit_off = ALIGN(chunk_md->contig_hint_start, align) -
+ chunk_md->contig_hint_start;
+ if (bit_off + alloc_bits > chunk_md->contig_hint)
return -1;
- bit_off = chunk->first_bit;
+ bit_off = pcpu_next_hint(chunk_md, alloc_bits);
bits = 0;
pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) {
if (!pop_only || pcpu_is_populated(chunk, bit_off, bits,
@@ -956,6 +1108,62 @@
return bit_off;
}
+/*
+ * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off()
+ * @map: the address to base the search on
+ * @size: the bitmap size in bits
+ * @start: the bitnumber to start searching at
+ * @nr: the number of zeroed bits we're looking for
+ * @align_mask: alignment mask for zero area
+ * @largest_off: in/out, offset of the largest area skipped so far
+ * @largest_bits: in/out, size of the largest area skipped so far
+ *
+ * The @align_mask should be one less than a power of 2.
+ *
+ * This is a modified version of bitmap_find_next_zero_area_off() to remember
+ * the largest area that was skipped. This is imperfect, but in general is
+ * good enough. The largest remembered region is the largest failed region
+ * seen. This does not include anything we possibly skipped due to alignment.
+ * pcpu_block_update_scan() does scan backwards to try and recover what was
+ * lost to alignment. While this can cause scanning to miss earlier possible
+ * free areas, smaller allocations will eventually fill those holes.
+ */
+static unsigned long pcpu_find_zero_area(unsigned long *map,
+ unsigned long size,
+ unsigned long start,
+ unsigned long nr,
+ unsigned long align_mask,
+ unsigned long *largest_off,
+ unsigned long *largest_bits)
+{
+ unsigned long index, end, i, area_off, area_bits;
+again:
+ index = find_next_zero_bit(map, size, start);
+
+ /* Align allocation */
+ index = __ALIGN_MASK(index, align_mask);
+ area_off = index;
+
+ end = index + nr;
+ if (end > size)
+ return end; /* no fit: the returned value exceeds @size */
+ i = find_next_bit(map, end, index);
+ if (i < end) {
+ area_bits = i - area_off; /* clear bits between the aligned start and set bit i */
+ /* remember largest unused area with best alignment */
+ if (area_bits > *largest_bits ||
+ (area_bits == *largest_bits && *largest_off &&
+ (!area_off || __ffs(area_off) > __ffs(*largest_off)))) {
+ *largest_off = area_off;
+ *largest_bits = area_bits;
+ }
+
+ start = i + 1; /* bit i is set, so resume the search after it */
+ goto again;
+ }
+ return index; /* aligned start of @nr clear bits */
+}
+
/**
* pcpu_alloc_area - allocates an area from a pcpu_chunk
* @chunk: chunk of interest
@@ -978,7 +1186,9 @@
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
size_t align, int start)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
size_t align_mask = (align) ? (align - 1) : 0;
+ unsigned long area_off = 0, area_bits = 0;
int bit_off, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -988,12 +1198,16 @@
/*
* Search to find a fit.
*/
- end = start + alloc_bits + PCPU_BITMAP_BLOCK_BITS;
- bit_off = bitmap_find_next_zero_area(chunk->alloc_map, end, start,
- alloc_bits, align_mask);
+ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS,
+ pcpu_chunk_map_bits(chunk));
+ bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits,
+ align_mask, &area_off, &area_bits);
if (bit_off >= end)
return -1;
+ if (area_bits)
+ pcpu_block_update_scan(chunk, area_off, area_bits);
+
/* update alloc map */
bitmap_set(chunk->alloc_map, bit_off, alloc_bits);
@@ -1005,8 +1219,8 @@
chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- if (bit_off == chunk->first_bit)
- chunk->first_bit = find_next_zero_bit(
+ if (bit_off == chunk_md->first_free)
+ chunk_md->first_free = find_next_zero_bit(
chunk->alloc_map,
pcpu_chunk_map_bits(chunk),
bit_off + alloc_bits);
@@ -1028,6 +1242,7 @@
*/
static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
+ struct pcpu_block_md *chunk_md = &chunk->chunk_md;
int bit_off, bits, end, oslot;
lockdep_assert_held(&pcpu_lock);
@@ -1047,24 +1262,34 @@
chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
/* update first free bit */
- chunk->first_bit = min(chunk->first_bit, bit_off);
+ chunk_md->first_free = min(chunk_md->first_free, bit_off);
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
}
+static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
+{ /* set @block up as @nr_bits bits that are all free */
+ block->scan_hint = 0; /* 0 is tested elsewhere as "no scan hint" */
+ block->contig_hint = nr_bits; /* largest free run is the whole block */
+ block->left_free = nr_bits;
+ block->right_free = nr_bits;
+ block->first_free = 0;
+ block->nr_bits = nr_bits; /* NOTE(review): *_hint_start not set here; presumably zeroed at allocation - confirm */
+}
+
static void pcpu_init_md_blocks(struct pcpu_chunk *chunk)
{
struct pcpu_block_md *md_block;
+ /* init the chunk-wide block, sized to all of the chunk's map bits */
+ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk));
+
for (md_block = chunk->md_blocks;
md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk);
- md_block++) {
- md_block->contig_hint = PCPU_BITMAP_BLOCK_BITS;
- md_block->left_free = PCPU_BITMAP_BLOCK_BITS;
- md_block->right_free = PCPU_BITMAP_BLOCK_BITS;
- }
+ md_block++)
+ pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); /* each per-block entry starts fully free */
}
/**
@@ -1086,6 +1311,7 @@
struct pcpu_chunk *chunk;
unsigned long aligned_addr, lcm_align;
int start_offset, offset_bits, region_size, region_bits;
+ size_t alloc_size;
/* region calculations */
aligned_addr = tmp_addr & PAGE_MASK;
@@ -1101,9 +1327,12 @@
region_size = ALIGN(start_offset + map_size, lcm_align);
/* allocate chunk */
- chunk = memblock_virt_alloc(sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT),
- 0);
+ alloc_size = sizeof(struct pcpu_chunk) +
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT);
+ chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
INIT_LIST_HEAD(&chunk->list);
@@ -1114,23 +1343,33 @@
chunk->nr_pages = region_size >> PAGE_SHIFT;
region_bits = pcpu_chunk_map_bits(chunk);
- chunk->alloc_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits) *
- sizeof(chunk->alloc_map[0]), 0);
- chunk->bound_map = memblock_virt_alloc(BITS_TO_LONGS(region_bits + 1) *
- sizeof(chunk->bound_map[0]), 0);
- chunk->md_blocks = memblock_virt_alloc(pcpu_chunk_nr_blocks(chunk) *
- sizeof(chunk->md_blocks[0]), 0);
+ alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]);
+ chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->alloc_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size =
+ BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]);
+ chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->bound_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]);
+ chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!chunk->md_blocks)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
chunk->immutable = true;
bitmap_fill(chunk->populated, chunk->nr_pages);
chunk->nr_populated = chunk->nr_pages;
- chunk->nr_empty_pop_pages =
- pcpu_cnt_pop_pages(chunk, start_offset / PCPU_MIN_ALLOC_SIZE,
- map_size / PCPU_MIN_ALLOC_SIZE);
+ chunk->nr_empty_pop_pages = chunk->nr_pages;
- chunk->contig_bits = map_size / PCPU_MIN_ALLOC_SIZE;
chunk->free_bytes = map_size;
if (chunk->start_offset) {
@@ -1140,7 +1379,7 @@
set_bit(0, chunk->bound_map);
set_bit(offset_bits, chunk->bound_map);
- chunk->first_bit = offset_bits;
+ chunk->chunk_md.first_free = offset_bits;
pcpu_block_update_hint_alloc(chunk, 0, offset_bits);
}
@@ -1193,7 +1432,6 @@
pcpu_init_md_blocks(chunk);
/* init metadata */
- chunk->contig_bits = region_bits;
chunk->free_bytes = chunk->nr_pages * PAGE_SIZE;
return chunk;
@@ -1223,7 +1461,6 @@
* @chunk: pcpu_chunk which got populated
* @page_start: the start page
* @page_end: the end page
- * @for_alloc: if this is to populate for allocation
*
* Pages in [@page_start,@page_end) have been populated to @chunk. Update
* the bookkeeping information accordingly. Must be called after each
@@ -1233,7 +1470,7 @@
* is to serve an allocation in that area.
*/
static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start,
- int page_end, bool for_alloc)
+ int page_end)
{
int nr = page_end - page_start;
@@ -1243,10 +1480,7 @@
chunk->nr_populated += nr;
pcpu_nr_populated += nr;
- if (!for_alloc) {
- chunk->nr_empty_pop_pages += nr;
- pcpu_nr_empty_pop_pages += nr;
- }
+ pcpu_update_empty_pages(chunk, nr);
}
/**
@@ -1268,9 +1502,9 @@
bitmap_clear(chunk->populated, page_start, nr);
chunk->nr_populated -= nr;
- chunk->nr_empty_pop_pages -= nr;
- pcpu_nr_empty_pop_pages -= nr;
pcpu_nr_populated -= nr;
+
+ pcpu_update_empty_pages(chunk, -nr);
}
/*
@@ -1357,7 +1591,7 @@
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
bool do_warn = !(gfp & __GFP_NOWARN);
static int warn_limit = 10;
- struct pcpu_chunk *chunk;
+ struct pcpu_chunk *chunk, *next;
const char *err;
int slot, off, cpu, ret;
unsigned long flags;
@@ -1419,11 +1653,14 @@
restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+ list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) {
off = pcpu_find_block_fit(chunk, bits, bit_align,
is_atomic);
- if (off < 0)
+ if (off < 0) {
+ if (slot < PCPU_SLOT_FAIL_THRESHOLD)
+ pcpu_chunk_move(chunk, 0);
continue;
+ }
off = pcpu_alloc_area(chunk, bits, bit_align, off);
if (off >= 0)
@@ -1482,7 +1719,7 @@
err = "failed to populate";
goto fail_unlock;
}
- pcpu_chunk_populated(chunk, rs, re, true);
+ pcpu_chunk_populated(chunk, rs, re);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
@@ -1681,7 +1918,7 @@
if (!ret) {
nr_to_pop -= nr;
spin_lock_irq(&pcpu_lock);
- pcpu_chunk_populated(chunk, rs, rs + nr, false);
+ pcpu_chunk_populated(chunk, rs, rs + nr);
spin_unlock_irq(&pcpu_lock);
} else {
nr_to_pop = 0;
@@ -1721,6 +1958,7 @@
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
+ bool need_balance = false;
if (!ptr)
return;
@@ -1742,7 +1980,7 @@
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
- pcpu_schedule_balance_work();
+ need_balance = true;
break;
}
}
@@ -1750,6 +1988,9 @@
trace_percpu_free_percpu(chunk->base_addr, off, ptr);
spin_unlock_irqrestore(&pcpu_lock, flags);
+
+ if (need_balance)
+ pcpu_schedule_balance_work();
}
EXPORT_SYMBOL_GPL(free_percpu);
@@ -1884,11 +2125,11 @@
void *ptr;
int unit;
- base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
+ base_size = ALIGN(struct_size(ai, groups, nr_groups),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
- ptr = memblock_virt_alloc_nopanic(PFN_ALIGN(ai_size), PAGE_SIZE);
+ ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE);
if (!ptr)
return NULL;
ai = ptr;
@@ -1979,7 +2220,7 @@
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
- * perpcu area. This function is to be called from arch percpu area
+ * percpu area. This function is to be called from arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
@@ -2026,12 +2267,9 @@
* share the same vm, but use offset regions in the area allocation map.
* The chunk serving the dynamic region is circulated in the chunk slots
* and available for dynamic allocation like any other chunk.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
*/
-int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
- void *base_addr)
+void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
+ void *base_addr)
{
size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
size_t static_size, dyn_size;
@@ -2044,6 +2282,7 @@
int group, unit, i;
int map_size;
unsigned long tmp_addr;
+ size_t alloc_size;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2075,12 +2314,29 @@
PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0);
/* process group information and build config tables accordingly */
- group_offsets = memblock_virt_alloc(ai->nr_groups *
- sizeof(group_offsets[0]), 0);
- group_sizes = memblock_virt_alloc(ai->nr_groups *
- sizeof(group_sizes[0]), 0);
- unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
- unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
+ alloc_size = ai->nr_groups * sizeof(group_offsets[0]);
+ group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_offsets)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = ai->nr_groups * sizeof(group_sizes[0]);
+ group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!group_sizes)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_map[0]);
+ unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_map)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
+
+ alloc_size = nr_cpu_ids * sizeof(unit_off[0]);
+ unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
+ if (!unit_off)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ alloc_size);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
@@ -2144,8 +2400,11 @@
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = memblock_virt_alloc(
- pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
+ pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
+ SMP_CACHE_BYTES);
+ if (!pcpu_slot)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
@@ -2195,7 +2454,6 @@
/* we're done */
pcpu_base_addr = base_addr;
- return 0;
}
#ifdef CONFIG_SMP
@@ -2382,7 +2640,7 @@
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
- for (group = 0, unit = 0; group_cnt[group]; group++) {
+ for (group = 0, unit = 0; group < nr_groups; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
@@ -2448,7 +2706,7 @@
struct pcpu_alloc_info *ai;
size_t size_sum, areas_size;
unsigned long max_distance;
- int group, i, highest_group, rc;
+ int group, i, highest_group, rc = 0;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size,
cpu_distance_fn);
@@ -2458,7 +2716,7 @@
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *));
- areas = memblock_virt_alloc_nopanic(areas_size, 0);
+ areas = memblock_alloc(areas_size, SMP_CACHE_BYTES);
if (!areas) {
rc = -ENOMEM;
goto out_free;
@@ -2529,11 +2787,11 @@
ai->groups[group].base_offset = areas[group] - base;
}
- pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
- PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
+ pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n",
+ PFN_DOWN(size_sum), ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
- rc = pcpu_setup_first_chunk(ai, base);
+ pcpu_setup_first_chunk(ai, base);
goto out_free;
out_free_areas:
@@ -2577,7 +2835,7 @@
int unit_pages;
size_t pages_size;
struct page **pages;
- int unit, i, j, rc;
+ int unit, i, j, rc = 0;
int upa;
int nr_g0_units;
@@ -2589,7 +2847,7 @@
BUG_ON(ai->nr_groups != 1);
upa = ai->alloc_size/ai->unit_size;
nr_g0_units = roundup(num_possible_cpus(), upa);
- if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) {
+ if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) {
pcpu_free_alloc_info(ai);
return -EINVAL;
}
@@ -2599,7 +2857,10 @@
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
- pages = memblock_virt_alloc(pages_size, 0);
+ pages = memblock_alloc(pages_size, SMP_CACHE_BYTES);
+ if (!pages)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ pages_size);
/* allocate pages */
j = 0;
@@ -2651,11 +2912,11 @@
}
/* we're ready, commit */
- pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n",
- unit_pages, psize_str, vm.addr, ai->static_size,
+ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n",
+ unit_pages, psize_str, ai->static_size,
ai->reserved_size, ai->dyn_size);
- rc = pcpu_setup_first_chunk(ai, vm.addr);
+ pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
@@ -2688,8 +2949,7 @@
static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size,
size_t align)
{
- return memblock_virt_alloc_from_nopanic(
- size, align, __pa(MAX_DMA_ADDRESS));
+ return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); /* @cpu is unused; 3rd arg presumably the lowest acceptable physical address - confirm */
}
static void __init pcpu_dfl_fc_free(void *ptr, size_t size)
@@ -2737,9 +2997,7 @@
void *fc;
ai = pcpu_alloc_alloc_info(1, 1);
- fc = memblock_virt_alloc_from_nopanic(unit_size,
- PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS));
+ fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
if (!ai || !fc)
panic("Failed to allocate memory for percpu areas.");
/* kmemleak tracks the percpu allocations separately */
@@ -2752,8 +3010,7 @@
ai->groups[0].nr_units = 1;
ai->groups[0].cpu_map[0] = 0;
- if (pcpu_setup_first_chunk(ai, fc) < 0)
- panic("Failed to initialize percpu areas.");
+ pcpu_setup_first_chunk(ai, fc);
pcpu_free_alloc_info(ai);
}
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cf2af04..532c292 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -8,6 +8,7 @@
*/
#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
#include <asm/tlb.h>
#include <asm-generic/pgtable.h>
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index a447092..357aa7b 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -1,12 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* linux/mm/process_vm_access.c
*
* Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
*/
#include <linux/mm.h>
diff --git a/mm/quicklist.c b/mm/quicklist.c
deleted file mode 100644
index 5e98ac7..0000000
--- a/mm/quicklist.c
+++ /dev/null
@@ -1,103 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Quicklist support.
- *
- * Quicklists are light weight lists of pages that have a defined state
- * on alloc and free. Pages must be in the quicklist specific defined state
- * (zero by default) when the page is freed. It seems that the initial idea
- * for such lists first came from Dave Miller and then various other people
- * improved on it.
- *
- * Copyright (C) 2007 SGI,
- * Christoph Lameter <cl@linux.com>
- * Generalized, added support for multiple lists and
- * constructors / destructors.
- */
-#include <linux/kernel.h>
-
-#include <linux/gfp.h>
-#include <linux/mm.h>
-#include <linux/mmzone.h>
-#include <linux/quicklist.h>
-
-DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist);
-
-#define FRACTION_OF_NODE_MEM 16
-
-static unsigned long max_pages(unsigned long min_pages)
-{
- unsigned long node_free_pages, max;
- int node = numa_node_id();
- struct zone *zones = NODE_DATA(node)->node_zones;
- int num_cpus_on_node;
-
- node_free_pages =
-#ifdef CONFIG_ZONE_DMA
- zone_page_state(&zones[ZONE_DMA], NR_FREE_PAGES) +
-#endif
-#ifdef CONFIG_ZONE_DMA32
- zone_page_state(&zones[ZONE_DMA32], NR_FREE_PAGES) +
-#endif
- zone_page_state(&zones[ZONE_NORMAL], NR_FREE_PAGES);
-
- max = node_free_pages / FRACTION_OF_NODE_MEM;
-
- num_cpus_on_node = cpumask_weight(cpumask_of_node(node));
- max /= num_cpus_on_node;
-
- return max(max, min_pages);
-}
-
-static long min_pages_to_free(struct quicklist *q,
- unsigned long min_pages, long max_free)
-{
- long pages_to_free;
-
- pages_to_free = q->nr_pages - max_pages(min_pages);
-
- return min(pages_to_free, max_free);
-}
-
-/*
- * Trim down the number of pages in the quicklist
- */
-void quicklist_trim(int nr, void (*dtor)(void *),
- unsigned long min_pages, unsigned long max_free)
-{
- long pages_to_free;
- struct quicklist *q;
-
- q = &get_cpu_var(quicklist)[nr];
- if (q->nr_pages > min_pages) {
- pages_to_free = min_pages_to_free(q, min_pages, max_free);
-
- while (pages_to_free > 0) {
- /*
- * We pass a gfp_t of 0 to quicklist_alloc here
- * because we will never call into the page allocator.
- */
- void *p = quicklist_alloc(nr, 0, NULL);
-
- if (dtor)
- dtor(p);
- free_page((unsigned long)p);
- pages_to_free--;
- }
- }
- put_cpu_var(quicklist);
-}
-
-unsigned long quicklist_total_size(void)
-{
- unsigned long count = 0;
- int cpu;
- struct quicklist *ql, *q;
-
- for_each_online_cpu(cpu) {
- ql = per_cpu(quicklist, cpu);
- for (q = ql; q < ql + CONFIG_NR_QUICK; q++)
- count += q->nr_pages;
- }
- return count;
-}
-
diff --git a/mm/readahead.c b/mm/readahead.c
index 4e63014..2fe72cd 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/readahead.c - address_space-level file readahead.
*
@@ -81,6 +82,8 @@
* @data: private data for the callback routine.
*
* Hides the details of the LRU cache etc from the filesystems.
+ *
+ * Returns: %0 on success, error returned by @filler otherwise
*/
int read_cache_pages(struct address_space *mapping, struct list_head *pages,
int (*filler)(void *, struct page *), void *data)
@@ -176,10 +179,8 @@
if (page_offset > end_index)
break;
- rcu_read_lock();
- page = radix_tree_lookup(&mapping->i_pages, page_offset);
- rcu_read_unlock();
- if (page && !radix_tree_exceptional_entry(page)) {
+ page = xa_load(&mapping->i_pages, page_offset);
+ if (page && !xa_is_value(page)) {
/*
* Page already present? Kick off the current batch of
* contiguous pages before continuing with the next
@@ -272,17 +273,15 @@
* return it as the new window size.
*/
static unsigned long get_next_ra_size(struct file_ra_state *ra,
- unsigned long max)
+ unsigned long max)
{
unsigned long cur = ra->size;
- unsigned long newsize;
if (cur < max / 16)
- newsize = 4 * cur;
- else
- newsize = 2 * cur;
-
- return min(newsize, max);
+ return 4 * cur;
+ if (cur <= max / 2)
+ return 2 * cur;
+ return max;
}
/*
@@ -336,7 +335,7 @@
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_hole(mapping, offset - 1, max);
+ head = page_cache_prev_miss(mapping, offset - 1, max);
rcu_read_unlock();
return offset - 1 - head;
@@ -425,7 +424,7 @@
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_hole(mapping, offset + 1, max_pages);
+ start = page_cache_next_miss(mapping, offset + 1, max_pages);
rcu_read_unlock();
if (!start || start - offset > max_pages)
diff --git a/mm/rmap.c b/mm/rmap.c
index 85b7f94..0c7b2a9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -27,7 +27,7 @@
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
- * zone_lru_lock (in mark_page_accessed, isolate_lru_page)
+ * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
@@ -61,6 +61,7 @@
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
+#include <linux/huge_mm.h>
#include <linux/backing-dev.h>
#include <linux/page_idle.h>
#include <linux/memremap.h>
@@ -850,7 +851,7 @@
};
*vm_flags = 0;
- if (!page_mapped(page))
+ if (!pra.mapcount)
return 0;
if (!page_rmapping(page))
@@ -889,21 +890,22 @@
.address = address,
.flags = PVMW_SYNC,
};
- unsigned long start = address, end;
+ struct mmu_notifier_range range;
int *cleaned = arg;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
* the page can not be free from this function.
*/
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
+ 0, vma, vma->vm_mm, address,
+ min(vma->vm_end, address + page_size(page)));
+ mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
- unsigned long cstart;
int ret = 0;
- cstart = address = pvmw.address;
+ address = pvmw.address;
if (pvmw.pte) {
pte_t entry;
pte_t *pte = pvmw.pte;
@@ -926,11 +928,10 @@
continue;
flush_cache_page(vma, address, page_to_pfn(page));
- entry = pmdp_huge_clear_flush(vma, address, pmd);
+ entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
set_pmd_at(vma->vm_mm, address, pmd, entry);
- cstart &= PMD_MASK;
ret = 1;
#else
/* unexpected pmd-mapped page? */
@@ -949,7 +950,7 @@
(*cleaned)++;
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
return true;
}
@@ -1017,7 +1018,7 @@
/**
* __page_set_anon_rmap - set up new anonymous rmap
- * @page: Page to add to rmap
+ * @page: Page or Hugepage to add to rmap
* @vma: VM area to add page to.
* @address: User virtual address of the mapping
* @exclusive: the page is exclusively owned by the current process
@@ -1189,8 +1190,10 @@
}
if (!atomic_inc_and_test(compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __inc_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __inc_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (PageTransCompound(page) && page_mapping(page)) {
VM_WARN_ON_ONCE(!PageLocked(page));
@@ -1229,8 +1232,10 @@
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
goto out;
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ if (PageSwapBacked(page))
+ __dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
+ else
+ __dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
goto out;
@@ -1345,7 +1350,7 @@
pte_t pteval;
struct page *subpage;
bool ret = true;
- unsigned long start = address, end;
+ struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)arg;
/* munlock has nothing to gain from examining un-locked vmas */
@@ -1369,15 +1374,18 @@
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
- end = min(vma->vm_end, start + (PAGE_SIZE << compound_order(page)));
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
+ address,
+ min(vma->vm_end, address + page_size(page)));
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
* accordingly.
*/
- adjust_range_if_pmd_sharing_possible(vma, &start, &end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start,
+ &range.end);
}
- mmu_notifier_invalidate_range_start(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
@@ -1428,9 +1436,10 @@
* we must flush them all. start/end were
* already adjusted above to cover this range.
*/
- flush_cache_range(vma, start, end);
- flush_tlb_range(vma, start, end);
- mmu_notifier_invalidate_range(mm, start, end);
+ flush_cache_range(vma, range.start, range.end);
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
/*
* The ref count of the PMD page was dropped
@@ -1467,7 +1476,15 @@
/*
* No need to invalidate here it will synchronize on
* against the special swap migration pte.
+ *
+ * The assignment to subpage above was computed from a
+ * swap PTE which results in an invalid pointer.
+ * Since only PAGE_SIZE pages can currently be
+ * migrated, just set it to page. This will need to be
+ * changed when hugepage migrations to device private
+ * memory are supported.
*/
+ subpage = page;
goto discard;
}
@@ -1508,8 +1525,7 @@
if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) {
pteval = swp_entry_to_pte(make_hwpoison_entry(subpage));
if (PageHuge(page)) {
- int nr = 1 << compound_order(page);
- hugetlb_count_sub(nr, mm);
+ hugetlb_count_sub(compound_nr(page), mm);
set_huge_swap_pte_at(mm, address,
pvmw.pte, pteval,
vma_mmu_pagesize(vma));
@@ -1650,7 +1666,7 @@
put_page(page);
}
- mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -1910,27 +1926,10 @@
#ifdef CONFIG_HUGETLB_PAGE
/*
- * The following three functions are for anonymous (private mapped) hugepages.
+ * The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
*/
-static void __hugepage_set_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int exclusive)
-{
- struct anon_vma *anon_vma = vma->anon_vma;
-
- BUG_ON(!anon_vma);
-
- if (PageAnon(page))
- return;
- if (!exclusive)
- anon_vma = anon_vma->root;
-
- anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
- page->index = linear_page_index(vma, address);
-}
-
void hugepage_add_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
@@ -1942,7 +1941,7 @@
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
if (first)
- __hugepage_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address, 0);
}
void hugepage_add_new_anon_rmap(struct page *page,
@@ -1950,6 +1949,6 @@
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
- __hugepage_set_anon_rmap(page, vma, address, 1);
+ __page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index d908c87..5e313fa 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -1,13 +1,9 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* rodata_test.c: functional test for mark_rodata_ro function
*
* (C) Copyright 2008 Intel Corporation
* Author: Arjan van de Ven <arjan@linux.intel.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; version 2
- * of the License.
*/
#define pr_fmt(fmt) "rodata_test: " fmt
diff --git a/mm/shmem.c b/mm/shmem.c
index b6cf0e8..220be9f 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -36,6 +36,8 @@
#include <linux/uio.h>
#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
+#include <linux/frontswap.h>
+#include <linux/fs_parser.h>
#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
@@ -106,21 +108,41 @@
pgoff_t nr_unswapped; /* how often writepage refused to swap out */
};
+struct shmem_options {
+ unsigned long long blocks;
+ unsigned long long inodes;
+ struct mempolicy *mpol;
+ kuid_t uid;
+ kgid_t gid;
+ umode_t mode;
+ int huge;
+ int seen;
+#define SHMEM_SEEN_BLOCKS 1
+#define SHMEM_SEEN_INODES 2
+#define SHMEM_SEEN_HUGE 4
+};
+
#ifdef CONFIG_TMPFS
static unsigned long shmem_default_max_blocks(void)
{
- return totalram_pages / 2;
+ return totalram_pages() / 2;
}
static unsigned long shmem_default_max_inodes(void)
{
- return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+ unsigned long nr_pages = totalram_pages();
+
+ return min(nr_pages - totalhigh_pages(), nr_pages / 2);
}
#endif
static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
static int shmem_replace_page(struct page **pagep, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index);
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type);
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
@@ -326,24 +348,20 @@
}
/*
- * Replace item expected in radix tree by a new item, while holding tree lock.
+ * Replace item expected in xarray by a new item, while holding xa_lock.
*/
-static int shmem_radix_tree_replace(struct address_space *mapping,
+static int shmem_replace_entry(struct address_space *mapping,
pgoff_t index, void *expected, void *replacement)
{
- struct radix_tree_node *node;
- void __rcu **pslot;
+ XA_STATE(xas, &mapping->i_pages, index);
void *item;
VM_BUG_ON(!expected);
VM_BUG_ON(!replacement);
- item = __radix_tree_lookup(&mapping->i_pages, index, &node, &pslot);
- if (!item)
- return -ENOENT;
+ item = xas_load(&xas);
if (item != expected)
return -ENOENT;
- __radix_tree_replace(&mapping->i_pages, node, pslot,
- replacement, NULL);
+ xas_store(&xas, replacement);
return 0;
}
@@ -357,12 +375,7 @@
static bool shmem_confirm_swap(struct address_space *mapping,
pgoff_t index, swp_entry_t swap)
{
- void *item;
-
- rcu_read_lock();
- item = radix_tree_lookup(&mapping->i_pages, index);
- rcu_read_unlock();
- return item == swp_to_radix_entry(swap);
+ return xa_load(&mapping->i_pages, index) == swp_to_radix_entry(swap);
}
/*
@@ -402,7 +415,7 @@
static int shmem_huge __read_mostly;
-#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
+#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
{
if (!strcmp(str, "never"))
@@ -419,7 +432,9 @@
return SHMEM_HUGE_FORCE;
return -EINVAL;
}
+#endif
+#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS)
static const char *shmem_format_huge(int huge)
{
switch (huge) {
@@ -590,9 +605,11 @@
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected)
+ pgoff_t index, void *expected, gfp_t gfp)
{
- int error, nr = hpage_nr_pages(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
+ unsigned long i = 0;
+ unsigned long nr = compound_nr(page);
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -604,47 +621,39 @@
page->mapping = mapping;
page->index = index;
- xa_lock_irq(&mapping->i_pages);
- if (PageTransHuge(page)) {
- void __rcu **results;
- pgoff_t idx;
- int i;
-
- error = 0;
- if (radix_tree_gang_lookup_slot(&mapping->i_pages,
- &results, &idx, index, 1) &&
- idx < index + HPAGE_PMD_NR) {
- error = -EEXIST;
+ do {
+ void *entry;
+ xas_lock_irq(&xas);
+ entry = xas_find_conflict(&xas);
+ if (entry != expected)
+ xas_set_err(&xas, -EEXIST);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+next:
+ xas_store(&xas, page);
+ if (++i < nr) {
+ xas_next(&xas);
+ goto next;
}
-
- if (!error) {
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- error = radix_tree_insert(&mapping->i_pages,
- index + i, page + i);
- VM_BUG_ON(error);
- }
+ if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
- }
- } else if (!expected) {
- error = radix_tree_insert(&mapping->i_pages, index, page);
- } else {
- error = shmem_radix_tree_replace(mapping, index, expected,
- page);
- }
-
- if (!error) {
- mapping->nrpages += nr;
- if (PageTransHuge(page))
__inc_node_page_state(page, NR_SHMEM_THPS);
+ }
+ mapping->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
__mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
- xa_unlock_irq(&mapping->i_pages);
- } else {
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
+
+ if (xas_error(&xas)) {
page->mapping = NULL;
- xa_unlock_irq(&mapping->i_pages);
page_ref_sub(page, nr);
+ return xas_error(&xas);
}
- return error;
+
+ return 0;
}
/*
@@ -658,7 +667,7 @@
VM_BUG_ON_PAGE(PageCompound(page), page);
xa_lock_irq(&mapping->i_pages);
- error = shmem_radix_tree_replace(mapping, page->index, page, radswap);
+ error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
__dec_node_page_state(page, NR_FILE_PAGES);
@@ -669,16 +678,14 @@
}
/*
- * Remove swap entry from radix tree, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache.
*/
static int shmem_free_swap(struct address_space *mapping,
pgoff_t index, void *radswap)
{
void *old;
- xa_lock_irq(&mapping->i_pages);
- old = radix_tree_delete_item(&mapping->i_pages, index, radswap);
- xa_unlock_irq(&mapping->i_pages);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
if (old != radswap)
return -ENOENT;
free_swap_and_cache(radix_to_swp_entry(radswap));
@@ -695,29 +702,19 @@
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
+ XA_STATE(xas, &mapping->i_pages, start);
struct page *page;
unsigned long swapped = 0;
rcu_read_lock();
-
- radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) {
- if (iter.index >= end)
- break;
-
- page = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(page)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, end - 1) {
+ if (xas_retry(&xas, page))
continue;
- }
-
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
swapped++;
if (need_resched()) {
- slot = radix_tree_iter_resume(slot, &iter);
+ xas_pause(&xas);
cond_resched_rcu();
}
}
@@ -785,14 +782,14 @@
break;
index = indices[pvec.nr - 1] + 1;
pagevec_remove_exceptionals(&pvec);
- check_move_unevictable_pages(pvec.pages, pvec.nr);
+ check_move_unevictable_pages(&pvec);
pagevec_release(&pvec);
cond_resched();
}
}
/*
- * Remove range of pages and swap entries from radix tree, and free them.
+ * Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
@@ -828,7 +825,7 @@
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
@@ -925,7 +922,7 @@
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (unfalloc)
continue;
if (shmem_free_swap(mapping, index, page)) {
@@ -1101,9 +1098,14 @@
}
spin_unlock(&sbinfo->shrinklist_lock);
}
- if (!list_empty(&info->swaplist)) {
+ while (!list_empty(&info->swaplist)) {
+ /* Wait while shmem_unuse() is scanning this inode... */
+ wait_var_event(&info->stop_eviction,
+ !atomic_read(&info->stop_eviction));
mutex_lock(&shmem_swaplist_mutex);
- list_del_init(&info->swaplist);
+ /* ...but beware of the race if we peeked too early */
+ if (!atomic_read(&info->stop_eviction))
+ list_del_init(&info->swaplist);
mutex_unlock(&shmem_swaplist_mutex);
}
}
@@ -1114,166 +1116,174 @@
clear_inode(inode);
}
-static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
+extern struct swap_info_struct *swap_info[];
+
+static int shmem_find_swap_entries(struct address_space *mapping,
+ pgoff_t start, unsigned int nr_entries,
+ struct page **entries, pgoff_t *indices,
+ unsigned int type, bool frontswap)
{
- struct radix_tree_iter iter;
- void __rcu **slot;
- unsigned long found = -1;
- unsigned int checked = 0;
+ XA_STATE(xas, &mapping->i_pages, start);
+ struct page *page;
+ swp_entry_t entry;
+ unsigned int ret = 0;
+
+ if (!nr_entries)
+ return 0;
rcu_read_lock();
- radix_tree_for_each_slot(slot, root, &iter, 0) {
- void *entry = radix_tree_deref_slot(slot);
-
- if (radix_tree_deref_retry(entry)) {
- slot = radix_tree_iter_retry(&iter);
+ xas_for_each(&xas, page, ULONG_MAX) {
+ if (xas_retry(&xas, page))
continue;
+
+ if (!xa_is_value(page))
+ continue;
+
+ entry = radix_to_swp_entry(page);
+ if (swp_type(entry) != type)
+ continue;
+ if (frontswap &&
+ !frontswap_test(swap_info[type], swp_offset(entry)))
+ continue;
+
+ indices[ret] = xas.xa_index;
+ entries[ret] = page;
+
+ if (need_resched()) {
+ xas_pause(&xas);
+ cond_resched_rcu();
}
- if (entry == item) {
- found = iter.index;
+ if (++ret == nr_entries)
break;
- }
- checked++;
- if ((checked % 4096) != 0)
- continue;
- slot = radix_tree_iter_resume(slot, &iter);
- cond_resched_rcu();
}
-
rcu_read_unlock();
- return found;
+
+ return ret;
+}
+
+/*
+ * Move the swapped pages for an inode to page cache. Returns the count
+ * of pages swapped in, or the error in case of failure.
+ */
+static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
+ pgoff_t *indices)
+{
+ int i = 0;
+ int ret = 0;
+ int error = 0;
+ struct address_space *mapping = inode->i_mapping;
+
+ for (i = 0; i < pvec.nr; i++) {
+ struct page *page = pvec.pages[i];
+
+ if (!xa_is_value(page))
+ continue;
+ error = shmem_swapin_page(inode, indices[i],
+ &page, SGP_CACHE,
+ mapping_gfp_mask(mapping),
+ NULL, NULL);
+ if (error == 0) {
+ unlock_page(page);
+ put_page(page);
+ ret++;
+ }
+ if (error == -ENOMEM)
+ break;
+ error = 0;
+ }
+ return error ? error : ret;
}
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
-static int shmem_unuse_inode(struct shmem_inode_info *info,
- swp_entry_t swap, struct page **pagep)
+static int shmem_unuse_inode(struct inode *inode, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
- struct address_space *mapping = info->vfs_inode.i_mapping;
- void *radswap;
- pgoff_t index;
- gfp_t gfp;
- int error = 0;
+ struct address_space *mapping = inode->i_mapping;
+ pgoff_t start = 0;
+ struct pagevec pvec;
+ pgoff_t indices[PAGEVEC_SIZE];
+ bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
+ int ret = 0;
- radswap = swp_to_radix_entry(swap);
- index = find_swap_entry(&mapping->i_pages, radswap);
- if (index == -1)
- return -EAGAIN; /* tell shmem_unuse we found nothing */
+ pagevec_init(&pvec);
+ do {
+ unsigned int nr_entries = PAGEVEC_SIZE;
- /*
- * Move _head_ to start search for next from here.
- * But be careful: shmem_evict_inode checks list_empty without taking
- * mutex, and there's an instant in list_move_tail when info->swaplist
- * would appear empty, if it were the only one on shmem_swaplist.
- */
- if (shmem_swaplist.next != &info->swaplist)
- list_move_tail(&shmem_swaplist, &info->swaplist);
+ if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
+ nr_entries = *fs_pages_to_unuse;
- gfp = mapping_gfp_mask(mapping);
- if (shmem_should_replace_page(*pagep, gfp)) {
- mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_replace_page(pagep, gfp, info, index);
- mutex_lock(&shmem_swaplist_mutex);
- /*
- * We needed to drop mutex to make that restrictive page
- * allocation, but the inode might have been freed while we
- * dropped it: although a racing shmem_evict_inode() cannot
- * complete without emptying the radix_tree, our page lock
- * on this swapcache page is not enough to prevent that -
- * free_swap_and_cache() of our swap entry will only
- * trylock_page(), removing swap from radix_tree whatever.
- *
- * We must not proceed to shmem_add_to_page_cache() if the
- * inode has been freed, but of course we cannot rely on
- * inode or mapping or info to check that. However, we can
- * safely check if our swap entry is still in use (and here
- * it can't have got reused for another page): if it's still
- * in use, then the inode cannot have been freed yet, and we
- * can safely proceed (if it's no longer in use, that tells
- * nothing about the inode, but we don't need to unuse swap).
- */
- if (!page_swapcount(*pagep))
- error = -ENOENT;
- }
-
- /*
- * We rely on shmem_swaplist_mutex, not only to protect the swaplist,
- * but also to hold up shmem_evict_inode(): so inode cannot be freed
- * beneath us (pagelock doesn't help until the page is in pagecache).
- */
- if (!error)
- error = shmem_add_to_page_cache(*pagep, mapping, index,
- radswap);
- if (error != -ENOMEM) {
- /*
- * Truncation and eviction use free_swap_and_cache(), which
- * only does trylock page: if we raced, best clean up here.
- */
- delete_from_swap_cache(*pagep);
- set_page_dirty(*pagep);
- if (!error) {
- spin_lock_irq(&info->lock);
- info->swapped--;
- spin_unlock_irq(&info->lock);
- swap_free(swap);
+ pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
+ pvec.pages, indices,
+ type, frontswap);
+ if (pvec.nr == 0) {
+ ret = 0;
+ break;
}
- }
- return error;
+
+ ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ if (ret < 0)
+ break;
+
+ if (frontswap_partial) {
+ *fs_pages_to_unuse -= ret;
+ if (*fs_pages_to_unuse == 0) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ break;
+ }
+ }
+
+ start = indices[pvec.nr - 1];
+ } while (true);
+
+ return ret;
}
/*
- * Search through swapped inodes to find and replace swap by page.
+ * Read all the shared memory data that resides in the swap
+ * device 'type' back into memory, so the swap device can be
+ * unused.
*/
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- struct list_head *this, *next;
- struct shmem_inode_info *info;
- struct mem_cgroup *memcg;
+ struct shmem_inode_info *info, *next;
int error = 0;
- /*
- * There's a faint possibility that swap page was replaced before
- * caller locked it: caller will come back later with the right page.
- */
- if (unlikely(!PageSwapCache(page) || page_private(page) != swap.val))
- goto out;
-
- /*
- * Charge page using GFP_KERNEL while we can wait, before taking
- * the shmem_swaplist_mutex which might hold up shmem_writepage().
- * Charged back to the user (not to caller) when swap account is used.
- */
- error = mem_cgroup_try_charge_delay(page, current->mm, GFP_KERNEL,
- &memcg, false);
- if (error)
- goto out;
- /* No radix_tree_preload: swap entry keeps a place for page in tree */
- error = -EAGAIN;
+ if (list_empty(&shmem_swaplist))
+ return 0;
mutex_lock(&shmem_swaplist_mutex);
- list_for_each_safe(this, next, &shmem_swaplist) {
- info = list_entry(this, struct shmem_inode_info, swaplist);
- if (info->swapped)
- error = shmem_unuse_inode(info, swap, &page);
- else
+ list_for_each_entry_safe(info, next, &shmem_swaplist, swaplist) {
+ if (!info->swapped) {
list_del_init(&info->swaplist);
+ continue;
+ }
+ /*
+ * Drop the swaplist mutex while searching the inode for swap;
+ * but before doing so, make sure shmem_evict_inode() will not
+ * remove placeholder inode from swaplist, nor let it be freed
+ * (igrab() would protect from unlink, but not from unmount).
+ */
+ atomic_inc(&info->stop_eviction);
+ mutex_unlock(&shmem_swaplist_mutex);
+
+ error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
+ fs_pages_to_unuse);
cond_resched();
- if (error != -EAGAIN)
+
+ mutex_lock(&shmem_swaplist_mutex);
+ next = list_next_entry(info, swaplist);
+ if (!info->swapped)
+ list_del_init(&info->swaplist);
+ if (atomic_dec_and_test(&info->stop_eviction))
+ wake_up_var(&info->stop_eviction);
+ if (error)
break;
- /* found nothing in this: move on to search the next */
}
mutex_unlock(&shmem_swaplist_mutex);
- if (error) {
- if (error != -ENOMEM)
- error = 0;
- mem_cgroup_cancel_charge(page, memcg, false);
- } else
- mem_cgroup_commit_charge(page, memcg, true, false);
-out:
- unlock_page(page);
- put_page(page);
return error;
}
@@ -1357,7 +1367,7 @@
*/
mutex_lock(&shmem_swaplist_mutex);
if (list_empty(&info->swaplist))
- list_add_tail(&info->swaplist, &shmem_swaplist);
+ list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
spin_lock_irq(&info->lock);
@@ -1457,23 +1467,17 @@
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct inode *inode = &info->vfs_inode;
- struct address_space *mapping = inode->i_mapping;
- pgoff_t idx, hindex;
- void __rcu **results;
+ struct address_space *mapping = info->vfs_inode.i_mapping;
+ pgoff_t hindex;
struct page *page;
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
return NULL;
hindex = round_down(index, HPAGE_PMD_NR);
- rcu_read_lock();
- if (radix_tree_gang_lookup_slot(&mapping->i_pages, &results, &idx,
- hindex, 1) && idx < hindex + HPAGE_PMD_NR) {
- rcu_read_unlock();
+ if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
+ XA_PRESENT))
return NULL;
- }
- rcu_read_unlock();
shmem_pseudo_vma_init(&pvma, info, hindex);
page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
@@ -1584,8 +1588,7 @@
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_radix_tree_replace(swap_mapping, swap_index, oldpage,
- newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
__inc_node_page_state(newpage, NR_FILE_PAGES);
__dec_node_page_state(oldpage, NR_FILE_PAGES);
@@ -1615,13 +1618,123 @@
}
/*
+ * Swap in the page pointed to by *pagep.
+ * Caller has to make sure that *pagep contains a valid swapped page.
+ * Returns 0 and the page in pagep on success. On failure, returns the
+ * error code and NULL in *pagep.
+ */
+static int shmem_swapin_page(struct inode *inode, pgoff_t index,
+ struct page **pagep, enum sgp_type sgp,
+ gfp_t gfp, struct vm_area_struct *vma,
+ vm_fault_t *fault_type)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+ struct mem_cgroup *memcg;
+ struct page *page;
+ swp_entry_t swap;
+ int error;
+
+ VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
+ swap = radix_to_swp_entry(*pagep);
+ *pagep = NULL;
+
+ /* Look it up and read it in.. */
+ page = lookup_swap_cache(swap, NULL, 0);
+ if (!page) {
+ /* Or update major stats only when swapin succeeds?? */
+ if (fault_type) {
+ *fault_type |= VM_FAULT_MAJOR;
+ count_vm_event(PGMAJFAULT);
+ count_memcg_event_mm(charge_mm, PGMAJFAULT);
+ }
+ /* Here we actually start the io */
+ page = shmem_swapin(swap, gfp, info, index);
+ if (!page) {
+ error = -ENOMEM;
+ goto failed;
+ }
+ }
+
+ /* We have to do this with page locked to prevent races */
+ lock_page(page);
+ if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ !shmem_confirm_swap(mapping, index, swap)) {
+ error = -EEXIST;
+ goto unlock;
+ }
+ if (!PageUptodate(page)) {
+ error = -EIO;
+ goto failed;
+ }
+ wait_on_page_writeback(page);
+
+ if (shmem_should_replace_page(page, gfp)) {
+ error = shmem_replace_page(&page, gfp, info, index);
+ if (error)
+ goto failed;
+ }
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ false);
+ if (!error) {
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp);
+ /*
+ * We already confirmed swap under page lock, and make
+ * no memory allocation here, so usually no possibility
+ * of error; but free_swap_and_cache() only trylocks a
+ * page, so it is just possible that the entry has been
+ * truncated or holepunched since swap was confirmed.
+ * shmem_undo_range() will have done some of the
+ * unaccounting, now delete_from_swap_cache() will do
+ * the rest.
+ */
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg, false);
+ delete_from_swap_cache(page);
+ }
+ }
+ if (error)
+ goto failed;
+
+ mem_cgroup_commit_charge(page, memcg, true, false);
+
+ spin_lock_irq(&info->lock);
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+
+ if (sgp == SGP_WRITE)
+ mark_page_accessed(page);
+
+ delete_from_swap_cache(page);
+ set_page_dirty(page);
+ swap_free(swap);
+
+ *pagep = page;
+ return 0;
+failed:
+ if (!shmem_confirm_swap(mapping, index, swap))
+ error = -EEXIST;
+unlock:
+ if (page) {
+ unlock_page(page);
+ put_page(page);
+ }
+
+ return error;
+}
+
+/*
* shmem_getpage_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * fault_mm and fault_type are only supplied by shmem_fault:
+ * vmf and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
@@ -1635,7 +1748,6 @@
struct mm_struct *charge_mm;
struct mem_cgroup *memcg;
struct page *page;
- swp_entry_t swap;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
int error;
@@ -1647,17 +1759,23 @@
if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
sgp = SGP_CACHE;
repeat:
- swap.val = 0;
- page = find_lock_entry(mapping, index);
- if (radix_tree_exceptional_entry(page)) {
- swap = radix_to_swp_entry(page);
- page = NULL;
- }
-
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
- error = -EINVAL;
- goto unlock;
+ return -EINVAL;
+ }
+
+ sbinfo = SHMEM_SB(inode->i_sb);
+ charge_mm = vma ? vma->vm_mm : current->mm;
+
+ page = find_lock_entry(mapping, index);
+ if (xa_is_value(page)) {
+ error = shmem_swapin_page(inode, index, &page,
+ sgp, gfp, vma, fault_type);
+ if (error == -EEXIST)
+ goto repeat;
+
+ *pagep = page;
+ return error;
}
if (page && sgp == SGP_WRITE)
@@ -1671,7 +1789,7 @@
put_page(page);
page = NULL;
}
- if (page || (sgp == SGP_READ && !swap.val)) {
+ if (page || sgp == SGP_READ) {
*pagep = page;
return 0;
}
@@ -1680,220 +1798,138 @@
* Fast cache lookup did not find it:
* bring it back from swap or allocate.
*/
- sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
- if (swap.val) {
- /* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
- /* Or update major stats only when swapin succeeds?? */
- if (fault_type) {
- *fault_type |= VM_FAULT_MAJOR;
- count_vm_event(PGMAJFAULT);
- count_memcg_event_mm(charge_mm, PGMAJFAULT);
- }
- /* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
- error = -ENOMEM;
- goto failed;
- }
- }
+ if (vma && userfaultfd_missing(vma)) {
+ *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
+ return 0;
+ }
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
- !shmem_confirm_swap(mapping, index, swap)) {
- error = -EEXIST; /* try again */
- goto unlock;
- }
- if (!PageUptodate(page)) {
- error = -EIO;
- goto failed;
- }
- wait_on_page_writeback(page);
-
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
- if (error)
- goto failed;
- }
-
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap));
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- * Reset swap.val? No, leave it so "failed" goes back to
- * "repeat": reading a hole and writing should succeed.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
- if (error)
- goto failed;
-
- mem_cgroup_commit_charge(page, memcg, true, false);
-
- spin_lock_irq(&info->lock);
- info->swapped--;
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
-
- if (sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- delete_from_swap_cache(page);
- set_page_dirty(page);
- swap_free(swap);
-
- } else {
- if (vma && userfaultfd_missing(vma)) {
- *fault_type = handle_userfault(vmf, VM_UFFD_MISSING);
- return 0;
- }
-
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
+ /* shmem_symlink() */
+ if (mapping->a_ops != &shmem_aops)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ goto alloc_nohuge;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ goto alloc_huge;
+ switch (sbinfo->huge) {
+ loff_t i_size;
+ pgoff_t off;
+ case SHMEM_HUGE_NEVER:
+ goto alloc_nohuge;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ off = round_up(index, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >= HPAGE_PMD_SIZE &&
+ i_size >> PAGE_SHIFT >= off)
goto alloc_huge;
- switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
- /* fallthrough */
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
+ /* fallthrough */
+ case SHMEM_HUGE_ADVISE:
+ if (sgp_huge == SGP_HUGE)
+ goto alloc_huge;
+ /* TODO: implement fadvise() hints */
+ goto alloc_nohuge;
+ }
alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
-alloc_nohuge: page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
- }
- if (IS_ERR(page)) {
- int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
- if (error != -ENOSPC)
- goto failed;
- /*
- * Try to reclaim some spece by splitting a huge page
- * beyond i_size on the filesystem.
- */
- while (retry--) {
- int ret;
- ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
- if (ret == SHRINK_STOP)
- break;
- if (ret)
- goto alloc_nohuge;
- }
- goto failed;
- }
+ page = shmem_alloc_and_acct_page(gfp, inode, index, true);
+ if (IS_ERR(page)) {
+alloc_nohuge:
+ page = shmem_alloc_and_acct_page(gfp, inode,
+ index, false);
+ }
+ if (IS_ERR(page)) {
+ int retry = 5;
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
-
- if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
-
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error)
- goto unacct;
- error = radix_tree_maybe_preload_order(gfp & GFP_RECLAIM_MASK,
- compound_order(page));
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL);
- radix_tree_preload_end();
- }
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
- goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
-
- spin_lock_irq(&info->lock);
- info->alloced += 1 << compound_order(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
- shmem_recalc_inode(inode);
- spin_unlock_irq(&info->lock);
- alloced = true;
-
- if (PageTransHuge(page) &&
- DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
- /*
- * Part of the huge page is beyond i_size: subject
- * to shrink under memory pressure.
- */
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
-
+ error = PTR_ERR(page);
+ page = NULL;
+ if (error != -ENOSPC)
+ goto unlock;
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Try to reclaim some space by splitting a huge page
+ * beyond i_size on the filesystem.
*/
- if (sgp == SGP_FALLOC)
- sgp = SGP_WRITE;
+ while (retry--) {
+ int ret;
+
+ ret = shmem_unused_huge_shrink(sbinfo, NULL, 1);
+ if (ret == SHRINK_STOP)
+ break;
+ if (ret)
+ goto alloc_nohuge;
+ }
+ goto unlock;
+ }
+
+ if (PageTransHuge(page))
+ hindex = round_down(index, HPAGE_PMD_NR);
+ else
+ hindex = index;
+
+ if (sgp == SGP_WRITE)
+ __SetPageReferenced(page);
+
+ error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
+ PageTransHuge(page));
+ if (error)
+ goto unacct;
+ error = shmem_add_to_page_cache(page, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK);
+ if (error) {
+ mem_cgroup_cancel_charge(page, memcg,
+ PageTransHuge(page));
+ goto unacct;
+ }
+ mem_cgroup_commit_charge(page, memcg, false,
+ PageTransHuge(page));
+ lru_cache_add_anon(page);
+
+ spin_lock_irq(&info->lock);
+ info->alloced += compound_nr(page);
+ inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ alloced = true;
+
+ if (PageTransHuge(page) &&
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
+ hindex + HPAGE_PMD_NR - 1) {
+ /*
+ * Part of the huge page is beyond i_size: subject
+ * to shrink under memory pressure.
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ /*
+ * _careful to defend against unlocked access to
+ * ->shrink_list in shmem_unused_huge_shrink()
+ */
+ if (list_empty_careful(&info->shrinklist)) {
+ list_add_tail(&info->shrinklist,
+ &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ }
+ spin_unlock(&sbinfo->shrinklist_lock);
+ }
+
+ /*
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ */
+ if (sgp == SGP_FALLOC)
+ sgp = SGP_WRITE;
clear:
- /*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
- * it now, lest undo on failure cancel our earlier guarantee.
- */
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
- int i;
+ /*
+ * Let SGP_WRITE caller clear ends if write does not fill page;
+ * but SGP_FALLOC on a page fallocated earlier must initialize
+ * it now, lest undo on failure cancel our earlier guarantee.
+ */
+ if (sgp != SGP_WRITE && !PageUptodate(page)) {
+ struct page *head = compound_head(page);
+ int i;
- for (i = 0; i < (1 << compound_order(head)); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- SetPageUptodate(head);
+ for (i = 0; i < compound_nr(head); i++) {
+ clear_highpage(head + i);
+ flush_dcache_page(head + i);
}
+ SetPageUptodate(head);
}
/* Perhaps the file has been truncated since we checked */
@@ -1916,16 +1952,13 @@
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, 1 << compound_order(page));
+ shmem_inode_unacct_blocks(inode, compound_nr(page));
if (PageTransHuge(page)) {
unlock_page(page);
put_page(page);
goto alloc_nohuge;
}
-failed:
- if (swap.val && !shmem_confirm_swap(mapping, index, swap))
- error = -EEXIST;
unlock:
if (page) {
unlock_page(page);
@@ -1937,7 +1970,7 @@
spin_unlock_irq(&info->lock);
goto repeat;
}
- if (error == -EEXIST) /* from above or from radix_tree_insert */
+ if (error == -EEXIST)
goto repeat;
return error;
}
@@ -2169,6 +2202,24 @@
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
+ struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+
+ if (info->seals & F_SEAL_FUTURE_WRITE) {
+ /*
+ * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
+ * "future write" seal is active.
+ */
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
+ return -EPERM;
+
+ /*
+ * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
+ * read-only mapping, take care to not allow mprotect to revert
+ * protections.
+ */
+ vma->vm_flags &= ~(VM_MAYWRITE);
+ }
+
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
@@ -2199,6 +2250,7 @@
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
+ atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
INIT_LIST_HEAD(&info->shrinklist);
@@ -2312,11 +2364,8 @@
if (ret)
goto out_release;
- ret = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
- if (!ret) {
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL);
- radix_tree_preload_end();
- }
+ ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK);
if (ret)
goto out_release_uncharge;
@@ -2422,8 +2471,9 @@
pgoff_t index = pos >> PAGE_SHIFT;
/* i_mutex is held by caller */
- if (unlikely(info->seals & (F_SEAL_WRITE | F_SEAL_GROW))) {
- if (info->seals & F_SEAL_WRITE)
+ if (unlikely(info->seals & (F_SEAL_GROW |
+ F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
return -EPERM;
if ((info->seals & F_SEAL_GROW) && pos + len > inode->i_size)
return -EPERM;
@@ -2579,7 +2629,7 @@
}
/*
- * llseek SEEK_DATA or SEEK_HOLE through the radix_tree.
+ * llseek SEEK_DATA or SEEK_HOLE through the page cache.
*/
static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
pgoff_t index, pgoff_t end, int whence)
@@ -2609,7 +2659,7 @@
index = indices[i];
}
page = pvec.pages[i];
- if (page && !radix_tree_exceptional_entry(page)) {
+ if (page && !xa_is_value(page)) {
if (!PageUptodate(page))
page = NULL;
}
@@ -2686,7 +2736,7 @@
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
/* protected by i_mutex */
- if (info->seals & F_SEAL_WRITE) {
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
}
@@ -2895,16 +2945,20 @@
static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
struct inode *inode = d_inode(old_dentry);
- int ret;
+ int ret = 0;
/*
* No ordinary (disk based) filesystem counts links as inodes;
* but each new link needs a new dentry, pinning lowmem, and
* tmpfs dentries cannot be pruned until they are unlinked.
+ * But if an O_TMPFILE file is linked into the tmpfs, the
+ * first link must skip that, to get the accounting right.
*/
- ret = shmem_reserve_inode(inode->i_sb);
- if (ret)
- goto out;
+ if (inode->i_nlink) {
+ ret = shmem_reserve_inode(inode->i_sb);
+ if (ret)
+ goto out;
+ }
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
@@ -3310,16 +3364,132 @@
.fh_to_dentry = shmem_fh_to_dentry,
};
-static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
- bool remount)
+enum shmem_param {
+ Opt_gid,
+ Opt_huge,
+ Opt_mode,
+ Opt_mpol,
+ Opt_nr_blocks,
+ Opt_nr_inodes,
+ Opt_size,
+ Opt_uid,
+};
+
+static const struct fs_parameter_spec shmem_param_specs[] = {
+ fsparam_u32 ("gid", Opt_gid),
+ fsparam_enum ("huge", Opt_huge),
+ fsparam_u32oct("mode", Opt_mode),
+ fsparam_string("mpol", Opt_mpol),
+ fsparam_string("nr_blocks", Opt_nr_blocks),
+ fsparam_string("nr_inodes", Opt_nr_inodes),
+ fsparam_string("size", Opt_size),
+ fsparam_u32 ("uid", Opt_uid),
+ {}
+};
+
+static const struct fs_parameter_enum shmem_param_enums[] = {
+ { Opt_huge, "never", SHMEM_HUGE_NEVER },
+ { Opt_huge, "always", SHMEM_HUGE_ALWAYS },
+ { Opt_huge, "within_size", SHMEM_HUGE_WITHIN_SIZE },
+ { Opt_huge, "advise", SHMEM_HUGE_ADVISE },
+ {}
+};
+
+const struct fs_parameter_description shmem_fs_parameters = {
+ .name = "tmpfs",
+ .specs = shmem_param_specs,
+ .enums = shmem_param_enums,
+};
+
+static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
{
- char *this_char, *value, *rest;
- struct mempolicy *mpol = NULL;
- uid_t uid;
- gid_t gid;
+ struct shmem_options *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ unsigned long long size;
+ char *rest;
+ int opt;
+
+ opt = fs_parse(fc, &shmem_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_size:
+ size = memparse(param->string, &rest);
+ if (*rest == '%') {
+ size <<= PAGE_SHIFT;
+ size *= totalram_pages();
+ do_div(size, 100);
+ rest++;
+ }
+ if (*rest)
+ goto bad_value;
+ ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE);
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_blocks:
+ ctx->blocks = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_BLOCKS;
+ break;
+ case Opt_nr_inodes:
+ ctx->inodes = memparse(param->string, &rest);
+ if (*rest)
+ goto bad_value;
+ ctx->seen |= SHMEM_SEEN_INODES;
+ break;
+ case Opt_mode:
+ ctx->mode = result.uint_32 & 07777;
+ break;
+ case Opt_uid:
+ ctx->uid = make_kuid(current_user_ns(), result.uint_32);
+ if (!uid_valid(ctx->uid))
+ goto bad_value;
+ break;
+ case Opt_gid:
+ ctx->gid = make_kgid(current_user_ns(), result.uint_32);
+ if (!gid_valid(ctx->gid))
+ goto bad_value;
+ break;
+ case Opt_huge:
+ ctx->huge = result.uint_32;
+ if (ctx->huge != SHMEM_HUGE_NEVER &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ has_transparent_hugepage()))
+ goto unsupported_parameter;
+ ctx->seen |= SHMEM_SEEN_HUGE;
+ break;
+ case Opt_mpol:
+ if (IS_ENABLED(CONFIG_NUMA)) {
+ mpol_put(ctx->mpol);
+ ctx->mpol = NULL;
+ if (mpol_parse_str(param->string, &ctx->mpol))
+ goto bad_value;
+ break;
+ }
+ goto unsupported_parameter;
+ }
+ return 0;
+
+unsupported_parameter:
+ return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key);
+bad_value:
+ return invalf(fc, "tmpfs: Bad value for '%s'", param->key);
+}
+
+static int shmem_parse_options(struct fs_context *fc, void *data)
+{
+ char *options = data;
+
+ if (options) {
+ int err = security_sb_eat_lsm_opts(options, &fc->security);
+ if (err)
+ return err;
+ }
while (options != NULL) {
- this_char = options;
+ char *this_char = options;
for (;;) {
/*
* NUL-terminate this option: unfortunately,
@@ -3335,139 +3505,83 @@
break;
}
}
- if (!*this_char)
- continue;
- if ((value = strchr(this_char,'=')) != NULL) {
- *value++ = 0;
- } else {
- pr_err("tmpfs: No value for mount option '%s'\n",
- this_char);
- goto error;
- }
+ if (*this_char) {
+ char *value = strchr(this_char,'=');
+ size_t len = 0;
+ int err;
- if (!strcmp(this_char,"size")) {
- unsigned long long size;
- size = memparse(value,&rest);
- if (*rest == '%') {
- size <<= PAGE_SHIFT;
- size *= totalram_pages;
- do_div(size, 100);
- rest++;
+ if (value) {
+ *value++ = '\0';
+ len = strlen(value);
}
- if (*rest)
- goto bad_val;
- sbinfo->max_blocks =
- DIV_ROUND_UP(size, PAGE_SIZE);
- } else if (!strcmp(this_char,"nr_blocks")) {
- sbinfo->max_blocks = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"nr_inodes")) {
- sbinfo->max_inodes = memparse(value, &rest);
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"mode")) {
- if (remount)
- continue;
- sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777;
- if (*rest)
- goto bad_val;
- } else if (!strcmp(this_char,"uid")) {
- if (remount)
- continue;
- uid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->uid = make_kuid(current_user_ns(), uid);
- if (!uid_valid(sbinfo->uid))
- goto bad_val;
- } else if (!strcmp(this_char,"gid")) {
- if (remount)
- continue;
- gid = simple_strtoul(value, &rest, 0);
- if (*rest)
- goto bad_val;
- sbinfo->gid = make_kgid(current_user_ns(), gid);
- if (!gid_valid(sbinfo->gid))
- goto bad_val;
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
- } else if (!strcmp(this_char, "huge")) {
- int huge;
- huge = shmem_parse_huge(value);
- if (huge < 0)
- goto bad_val;
- if (!has_transparent_hugepage() &&
- huge != SHMEM_HUGE_NEVER)
- goto bad_val;
- sbinfo->huge = huge;
-#endif
-#ifdef CONFIG_NUMA
- } else if (!strcmp(this_char,"mpol")) {
- mpol_put(mpol);
- mpol = NULL;
- if (mpol_parse_str(value, &mpol))
- goto bad_val;
-#endif
- } else {
- pr_err("tmpfs: Bad mount option %s\n", this_char);
- goto error;
+ err = vfs_parse_fs_string(fc, this_char, value, len);
+ if (err < 0)
+ return err;
}
}
- sbinfo->mpol = mpol;
return 0;
-
-bad_val:
- pr_err("tmpfs: Bad value '%s' for mount option '%s'\n",
- value, this_char);
-error:
- mpol_put(mpol);
- return 1;
-
}
-static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
+/*
+ * Reconfigure a shmem filesystem.
+ *
+ * Note that we disallow change from limited->unlimited blocks/inodes while any
+ * are in use; but we must separately disallow unlimited->limited, because in
+ * that case we have no record of how much is already in use.
+ */
+static int shmem_reconfigure(struct fs_context *fc)
{
- struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- struct shmem_sb_info config = *sbinfo;
+ struct shmem_options *ctx = fc->fs_private;
+ struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
- int error = -EINVAL;
-
- config.mpol = NULL;
- if (shmem_parse_options(data, &config, true))
- return error;
+ const char *err;
spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
- if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0)
- goto out;
- if (config.max_inodes < inodes)
- goto out;
- /*
- * Those tests disallow limited->unlimited while any are in use;
- * but we must separately disallow unlimited->limited, because
- * in that case we have no record of how much is already in use.
- */
- if (config.max_blocks && !sbinfo->max_blocks)
- goto out;
- if (config.max_inodes && !sbinfo->max_inodes)
- goto out;
+ if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
+ if (!sbinfo->max_blocks) {
+ err = "Cannot retroactively limit size";
+ goto out;
+ }
+ if (percpu_counter_compare(&sbinfo->used_blocks,
+ ctx->blocks) > 0) {
+ err = "Too small a size for current use";
+ goto out;
+ }
+ }
+ if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) {
+ if (!sbinfo->max_inodes) {
+ err = "Cannot retroactively limit inodes";
+ goto out;
+ }
+ if (ctx->inodes < inodes) {
+ err = "Too few inodes for current use";
+ goto out;
+ }
+ }
- error = 0;
- sbinfo->huge = config.huge;
- sbinfo->max_blocks = config.max_blocks;
- sbinfo->max_inodes = config.max_inodes;
- sbinfo->free_inodes = config.max_inodes - inodes;
+ if (ctx->seen & SHMEM_SEEN_HUGE)
+ sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_BLOCKS)
+ sbinfo->max_blocks = ctx->blocks;
+ if (ctx->seen & SHMEM_SEEN_INODES) {
+ sbinfo->max_inodes = ctx->inodes;
+ sbinfo->free_inodes = ctx->inodes - inodes;
+ }
/*
* Preserve previous mempolicy unless mpol remount option was specified.
*/
- if (config.mpol) {
+ if (ctx->mpol) {
mpol_put(sbinfo->mpol);
- sbinfo->mpol = config.mpol; /* transfers initial ref */
+ sbinfo->mpol = ctx->mpol; /* transfers initial ref */
+ ctx->mpol = NULL;
}
+ spin_unlock(&sbinfo->stat_lock);
+ return 0;
out:
spin_unlock(&sbinfo->stat_lock);
- return error;
+ return invalf(fc, "tmpfs: %s", err);
}
static int shmem_show_options(struct seq_file *seq, struct dentry *root)
@@ -3508,8 +3622,9 @@
sb->s_fs_info = NULL;
}
-int shmem_fill_super(struct super_block *sb, void *data, int silent)
+static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
{
+ struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
int err = -ENOMEM;
@@ -3520,9 +3635,6 @@
if (!sbinfo)
return -ENOMEM;
- sbinfo->mode = 0777 | S_ISVTX;
- sbinfo->uid = current_fsuid();
- sbinfo->gid = current_fsgid();
sb->s_fs_info = sbinfo;
#ifdef CONFIG_TMPFS
@@ -3532,12 +3644,10 @@
* but the internal instance is left unlimited.
*/
if (!(sb->s_flags & SB_KERNMOUNT)) {
- sbinfo->max_blocks = shmem_default_max_blocks();
- sbinfo->max_inodes = shmem_default_max_inodes();
- if (shmem_parse_options(data, sbinfo, false)) {
- err = -EINVAL;
- goto failed;
- }
+ if (!(ctx->seen & SHMEM_SEEN_BLOCKS))
+ ctx->blocks = shmem_default_max_blocks();
+ if (!(ctx->seen & SHMEM_SEEN_INODES))
+ ctx->inodes = shmem_default_max_inodes();
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3546,11 +3656,18 @@
#else
sb->s_flags |= SB_NOUSER;
#endif
+ sbinfo->max_blocks = ctx->blocks;
+ sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ sbinfo->uid = ctx->uid;
+ sbinfo->gid = ctx->gid;
+ sbinfo->mode = ctx->mode;
+ sbinfo->huge = ctx->huge;
+ sbinfo->mpol = ctx->mpol;
+ ctx->mpol = NULL;
spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
- sbinfo->free_inodes = sbinfo->max_inodes;
spin_lock_init(&sbinfo->shrinklist_lock);
INIT_LIST_HEAD(&sbinfo->shrinklist);
@@ -3583,6 +3700,31 @@
return err;
}
+static int shmem_get_tree(struct fs_context *fc)
+{
+ return get_tree_nodev(fc, shmem_fill_super);
+}
+
+static void shmem_free_fc(struct fs_context *fc)
+{
+ struct shmem_options *ctx = fc->fs_private;
+
+ if (ctx) {
+ mpol_put(ctx->mpol);
+ kfree(ctx);
+ }
+}
+
+static const struct fs_context_operations shmem_fs_context_ops = {
+ .free = shmem_free_fc,
+ .get_tree = shmem_get_tree,
+#ifdef CONFIG_TMPFS
+ .parse_monolithic = shmem_parse_options,
+ .parse_param = shmem_parse_one,
+ .reconfigure = shmem_reconfigure,
+#endif
+};
+
static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
@@ -3594,9 +3736,8 @@
return &info->vfs_inode;
}
-static void shmem_destroy_callback(struct rcu_head *head)
+static void shmem_free_in_core_inode(struct inode *inode)
{
- struct inode *inode = container_of(head, struct inode, i_rcu);
if (S_ISLNK(inode->i_mode))
kfree(inode->i_link);
kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
@@ -3606,7 +3747,6 @@
{
if (S_ISREG(inode->i_mode))
mpol_free_shared_policy(&SHMEM_I(inode)->policy);
- call_rcu(&inode->i_rcu, shmem_destroy_callback);
}
static void shmem_init_inode(void *foo)
@@ -3697,10 +3837,10 @@
static const struct super_operations shmem_ops = {
.alloc_inode = shmem_alloc_inode,
+ .free_inode = shmem_free_in_core_inode,
.destroy_inode = shmem_destroy_inode,
#ifdef CONFIG_TMPFS
.statfs = shmem_statfs,
- .remount_fs = shmem_remount_fs,
.show_options = shmem_show_options,
#endif
.evict_inode = shmem_evict_inode,
@@ -3721,16 +3861,30 @@
#endif
};
-static struct dentry *shmem_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+int shmem_init_fs_context(struct fs_context *fc)
{
- return mount_nodev(fs_type, flags, data, shmem_fill_super);
+ struct shmem_options *ctx;
+
+ ctx = kzalloc(sizeof(struct shmem_options), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->mode = 0777 | S_ISVTX;
+ ctx->uid = current_fsuid();
+ ctx->gid = current_fsgid();
+
+ fc->fs_private = ctx;
+ fc->ops = &shmem_fs_context_ops;
+ return 0;
}
static struct file_system_type shmem_fs_type = {
.owner = THIS_MODULE,
.name = "tmpfs",
- .mount = shmem_mount,
+ .init_fs_context = shmem_init_fs_context,
+#ifdef CONFIG_TMPFS
+ .parameters = &shmem_fs_parameters,
+#endif
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -3739,10 +3893,6 @@
{
int error;
- /* If rootfs called this, don't re-init */
- if (shmem_inode_cachep)
- return 0;
-
shmem_init_inodecache();
error = register_filesystem(&shmem_fs_type);
@@ -3836,6 +3986,9 @@
loff_t i_size;
pgoff_t off;
+ if ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ return false;
if (shmem_huge == SHMEM_HUGE_FORCE)
return true;
if (shmem_huge == SHMEM_HUGE_DENY)
@@ -3875,7 +4028,8 @@
static struct file_system_type shmem_fs_type = {
.name = "tmpfs",
- .mount = ramfs_mount,
+ .init_fs_context = ramfs_init_fs_context,
+ .parameters = &ramfs_fs_parameters,
.kill_sb = kill_litter_super,
.fs_flags = FS_USERNS_MOUNT,
};
@@ -3890,7 +4044,8 @@
return 0;
}
-int shmem_unuse(swp_entry_t swap, struct page *page)
+int shmem_unuse(unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
return 0;
}
diff --git a/mm/shuffle.c b/mm/shuffle.c
new file mode 100644
index 0000000..b3fe97f
--- /dev/null
+++ b/mm/shuffle.c
@@ -0,0 +1,207 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+
+#include <linux/mm.h>
+#include <linux/init.h>
+#include <linux/mmzone.h>
+#include <linux/random.h>
+#include <linux/moduleparam.h>
+#include "internal.h"
+#include "shuffle.h"
+
+DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+static unsigned long shuffle_state __ro_after_init;
+
+/*
+ * Depending on the architecture, module parameter parsing may run
+ * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
+ * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
+ * attempts to turn on the implementation, but aborts if it finds
+ * SHUFFLE_FORCE_DISABLE already set.
+ */
+__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+ if (ctl == SHUFFLE_FORCE_DISABLE)
+ set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
+
+ if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
+ if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_disable(&page_alloc_shuffle_key);
+ } else if (ctl == SHUFFLE_ENABLE
+ && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
+ static_branch_enable(&page_alloc_shuffle_key);
+}
+
+static bool shuffle_param;
+static int shuffle_show(char *buffer, const struct kernel_param *kp)
+{
+ return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
+ ? 'Y' : 'N');
+}
+
+static __meminit int shuffle_store(const char *val,
+ const struct kernel_param *kp)
+{
+ int rc = param_set_bool(val, kp);
+
+ if (rc < 0)
+ return rc;
+ if (shuffle_param)
+ page_alloc_shuffle(SHUFFLE_ENABLE);
+ else
+ page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ return 0;
+}
+module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
+
+/*
+ * For two pages to be swapped in the shuffle, they must be free (on a
+ * 'free_area' lru), have the same order, and have the same migratetype.
+ */
+static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+{
+ struct page *page;
+
+ /*
+ * Given we're dealing with randomly selected pfns in a zone we
+ * need to ask questions like...
+ */
+
+ /* ...is the pfn even in the memmap? */
+ if (!pfn_valid_within(pfn))
+ return NULL;
+
+ /* ...is the pfn in a present section or a hole? */
+ if (!pfn_present(pfn))
+ return NULL;
+
+ /* ...is the page free and currently on a free_area list? */
+ page = pfn_to_page(pfn);
+ if (!PageBuddy(page))
+ return NULL;
+
+ /*
+ * ...is the page on the same list as the page we will
+ * shuffle it with?
+ */
+ if (page_order(page) != order)
+ return NULL;
+
+ return page;
+}
+
+/*
+ * Fisher-Yates shuffle the freelist which prescribes iterating through an
+ * array, pfns in this case, and randomly swapping each entry with another in
+ * the span, end_pfn - start_pfn.
+ *
+ * To keep the implementation simple it does not attempt to correct for sources
+ * of bias in the distribution, like modulo bias or pseudo-random number
+ * generator bias. I.e. the expectation is that this shuffling raises the bar
+ * for attacks that exploit the predictability of page allocations, but need not
+ * be a perfect shuffle.
+ */
+#define SHUFFLE_RETRY 10
+void __meminit __shuffle_zone(struct zone *z)
+{
+ unsigned long i, flags;
+ unsigned long start_pfn = z->zone_start_pfn;
+ unsigned long end_pfn = zone_end_pfn(z);
+ const int order = SHUFFLE_ORDER;
+ const int order_pages = 1 << order;
+
+ spin_lock_irqsave(&z->lock, flags);
+ start_pfn = ALIGN(start_pfn, order_pages);
+ for (i = start_pfn; i < end_pfn; i += order_pages) {
+ unsigned long j;
+ int migratetype, retry;
+ struct page *page_i, *page_j;
+
+ /*
+ * We expect page_i, in the sub-range of a zone being added
+ * (@start_pfn to @end_pfn), to more likely be valid compared to
+ * page_j randomly selected in the span @zone_start_pfn to
+ * @spanned_pages.
+ */
+ page_i = shuffle_valid_page(i, order);
+ if (!page_i)
+ continue;
+
+ for (retry = 0; retry < SHUFFLE_RETRY; retry++) {
+ /*
+ * Pick a random order aligned page in the zone span as
+ * a swap target. If the selected pfn is a hole, retry
+ * up to SHUFFLE_RETRY attempts to find a random valid pfn
+ * in the zone.
+ */
+ j = z->zone_start_pfn +
+ ALIGN_DOWN(get_random_long() % z->spanned_pages,
+ order_pages);
+ page_j = shuffle_valid_page(j, order);
+ if (page_j && page_j != page_i)
+ break;
+ }
+ if (retry >= SHUFFLE_RETRY) {
+ pr_debug("%s: failed to swap %#lx\n", __func__, i);
+ continue;
+ }
+
+ /*
+ * Each migratetype corresponds to its own list, make sure the
+ * types match otherwise we're moving pages to lists where they
+ * do not belong.
+ */
+ migratetype = get_pageblock_migratetype(page_i);
+ if (get_pageblock_migratetype(page_j) != migratetype) {
+ pr_debug("%s: migratetype mismatch %#lx\n", __func__, i);
+ continue;
+ }
+
+ list_swap(&page_i->lru, &page_j->lru);
+
+ pr_debug("%s: swap: %#lx -> %#lx\n", __func__, i, j);
+
+ /* take it easy on the zone lock */
+ if ((i % (100 * order_pages)) == 0) {
+ spin_unlock_irqrestore(&z->lock, flags);
+ cond_resched();
+ spin_lock_irqsave(&z->lock, flags);
+ }
+ }
+ spin_unlock_irqrestore(&z->lock, flags);
+}
+
+/**
+ * __shuffle_free_memory - reduce the predictability of the page allocator
+ * @pgdat: node page data
+ */
+void __meminit __shuffle_free_memory(pg_data_t *pgdat)
+{
+ struct zone *z;
+
+ for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++)
+ shuffle_zone(z);
+}
+
+void add_to_free_area_random(struct page *page, struct free_area *area,
+ int migratetype)
+{
+ static u64 rand;
+ static u8 rand_bits;
+
+ /*
+ * The lack of locking is deliberate. If 2 threads race to
+ * update the rand state it just adds to the entropy.
+ */
+ if (rand_bits == 0) {
+ rand_bits = 64;
+ rand = get_random_u64();
+ }
+
+ if (rand & 1)
+ add_to_free_area(page, area, migratetype);
+ else
+ add_to_free_area_tail(page, area, migratetype);
+ rand_bits--;
+ rand >>= 1;
+}
diff --git a/mm/shuffle.h b/mm/shuffle.h
new file mode 100644
index 0000000..777a257
--- /dev/null
+++ b/mm/shuffle.h
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-2.0
+// Copyright(c) 2018 Intel Corporation. All rights reserved.
+#ifndef _MM_SHUFFLE_H
+#define _MM_SHUFFLE_H
+#include <linux/jump_label.h>
+
+/*
+ * SHUFFLE_ENABLE is passed to page_alloc_shuffle() from the command line
+ * enabling path, or by platform-firmware enabling that indicates the
+ * presence of a direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is
+ * passed from the command line path and overrides any previous or future
+ * SHUFFLE_ENABLE.
+ */
+enum mm_shuffle_ctl {
+ SHUFFLE_ENABLE,
+ SHUFFLE_FORCE_DISABLE,
+};
+
+#define SHUFFLE_ORDER (MAX_ORDER-1)
+
+#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
+DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
+extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
+extern void __shuffle_free_memory(pg_data_t *pgdat);
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_free_memory(pgdat);
+}
+
+extern void __shuffle_zone(struct zone *z);
+static inline void shuffle_zone(struct zone *z)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return;
+ __shuffle_zone(z);
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ if (!static_branch_unlikely(&page_alloc_shuffle_key))
+ return false;
+ return order >= SHUFFLE_ORDER;
+}
+#else
+static inline void shuffle_free_memory(pg_data_t *pgdat)
+{
+}
+
+static inline void shuffle_zone(struct zone *z)
+{
+}
+
+static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
+{
+}
+
+static inline bool is_shuffle_order(int order)
+{
+ return false;
+}
+#endif
+#endif /* _MM_SHUFFLE_H */
diff --git a/mm/slab.c b/mm/slab.c
index d73c7a4..66e5d80 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -362,29 +362,6 @@
#endif
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline bool is_store_user_clean(struct kmem_cache *cachep)
-{
- return atomic_read(&cachep->store_user_clean) == 1;
-}
-
-static inline void set_store_user_clean(struct kmem_cache *cachep)
-{
- atomic_set(&cachep->store_user_clean, 1);
-}
-
-static inline void set_store_user_dirty(struct kmem_cache *cachep)
-{
- if (is_store_user_clean(cachep))
- atomic_set(&cachep->store_user_clean, 0);
-}
-
-#else
-static inline void set_store_user_dirty(struct kmem_cache *cachep) {}
-
-#endif
-
/*
* Do not go above this order unless 0 objects fit into the slab or
* overridden on the command line.
@@ -394,31 +371,12 @@
static int slab_max_order = SLAB_MAX_ORDER_LO;
static bool slab_max_order_set __initdata;
-static inline struct kmem_cache *virt_to_cache(const void *obj)
-{
- struct page *page = virt_to_head_page(obj);
- return page->slab_cache;
-}
-
static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
unsigned int idx)
{
return page->s_mem + cache->size * idx;
}
-/*
- * We want to avoid an expensive divide : (offset / cache->size)
- * Using the fact that size is a constant for a particular cache,
- * we can replace (offset / cache->size) by
- * reciprocal_divide(offset, cache->reciprocal_buffer_size)
- */
-static inline unsigned int obj_to_index(const struct kmem_cache *cache,
- const struct page *page, void *obj)
-{
- u32 offset = (obj - page->s_mem);
- return reciprocal_divide(offset, cache->reciprocal_buffer_size);
-}
-
#define BOOT_CPUCACHE_ENTRIES 1
/* internal cache of cache description objs */
static struct kmem_cache kmem_cache_boot = {
@@ -563,14 +521,6 @@
static void init_arraycache(struct array_cache *ac, int limit, int batch)
{
- /*
- * The array_cache structures contain pointers to free object.
- * However, when such objects are allocated or transferred to another
- * cache the pointers are not cleared and they could be counted as
- * valid references during a kmemleak scan. Therefore, kmemleak must
- * not scan such objects.
- */
- kmemleak_no_scan(ac);
if (ac) {
ac->avail = 0;
ac->limit = limit;
@@ -586,6 +536,14 @@
struct array_cache *ac = NULL;
ac = kmalloc_node(memsize, gfp, node);
+ /*
+ * The array_cache structures contain pointers to free objects.
+ * However, when such objects are allocated or transferred to another
+ * cache the pointers are not cleared and they could be counted as
+ * valid references during a kmemleak scan. Therefore, kmemleak must
+ * not scan such objects.
+ */
+ kmemleak_no_scan(ac);
init_arraycache(ac, entries, batchcount);
return ac;
}
@@ -679,20 +637,22 @@
struct alien_cache *alc = NULL;
alc = kmalloc_node(memsize, gfp, node);
- init_arraycache(&alc->ac, entries, batch);
- spin_lock_init(&alc->lock);
+ if (alc) {
+ kmemleak_no_scan(alc);
+ init_arraycache(&alc->ac, entries, batch);
+ spin_lock_init(&alc->lock);
+ }
return alc;
}
static struct alien_cache **alloc_alien_cache(int node, int limit, gfp_t gfp)
{
struct alien_cache **alc_ptr;
- size_t memsize = sizeof(void *) * nr_node_ids;
int i;
if (limit > 1)
limit = 12;
- alc_ptr = kzalloc_node(memsize, gfp, node);
+ alc_ptr = kcalloc_node(nr_node_ids, sizeof(void *), gfp, node);
if (!alc_ptr)
return NULL;
@@ -962,10 +922,10 @@
* To protect lockless access to n->shared during irq disabled context.
* If n->shared isn't NULL in irq disabled context, accessing to it is
* guaranteed to be valid until irq is re-enabled, because it will be
- * freed after synchronize_sched().
+ * freed after synchronize_rcu().
*/
if (old_shared && force_change)
- synchronize_sched();
+ synchronize_rcu();
fail:
kfree(old_shared);
@@ -1001,10 +961,8 @@
/* cpu is dead; no one can alloc from it. */
nc = per_cpu_ptr(cachep->cpu_cache, cpu);
- if (nc) {
- free_block(cachep, nc->entry, nc->avail, node, &list);
- nc->avail = 0;
- }
+ free_block(cachep, nc->entry, nc->avail, node, &list);
+ nc->avail = 0;
if (!cpumask_empty(mask)) {
spin_unlock_irq(&n->list_lock);
@@ -1248,7 +1206,7 @@
* page orders on machines with more than 32MB of memory if
* not overridden on the command line.
*/
- if (!slab_max_order_set && totalram_pages > (32 << 20) >> PAGE_SHIFT)
+ if (!slab_max_order_set && totalram_pages() > (32 << 20) >> PAGE_SHIFT)
slab_max_order = SLAB_MAX_ORDER_HI;
/* Bootstrap is tricky, because several objects are allocated
@@ -1281,14 +1239,14 @@
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
- memcg_link_cache(kmem_cache);
+ memcg_link_cache(kmem_cache, NULL);
slab_state = PARTIAL;
/*
* Initialize the caches that provide memory for the kmem_cache_node
* structures first. Without this, further allocations will bug.
*/
- kmalloc_caches[INDEX_NODE] = create_kmalloc_cache(
+ kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE] = create_kmalloc_cache(
kmalloc_info[INDEX_NODE].name,
kmalloc_size(INDEX_NODE), ARCH_KMALLOC_FLAGS,
0, kmalloc_size(INDEX_NODE));
@@ -1304,7 +1262,7 @@
for_each_online_node(nid) {
init_list(kmem_cache, &init_kmem_cache_node[CACHE_CACHE + nid], nid);
- init_list(kmalloc_caches[INDEX_NODE],
+ init_list(kmalloc_caches[KMALLOC_NORMAL][INDEX_NODE],
&init_kmem_cache_node[SIZE_NODE + nid], nid);
}
}
@@ -1402,7 +1360,6 @@
int nodeid)
{
struct page *page;
- int nr_pages;
flags |= cachep->allocflags;
@@ -1412,17 +1369,11 @@
return NULL;
}
- if (memcg_charge_slab(page, flags, cachep->gfporder, cachep)) {
+ if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
__free_pages(page, cachep->gfporder);
return NULL;
}
- nr_pages = (1 << cachep->gfporder);
- if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, nr_pages);
- else
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, nr_pages);
-
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1437,12 +1388,6 @@
static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
{
int order = cachep->gfporder;
- unsigned long nr_freed = (1 << order);
-
- if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
- mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed);
- else
- mod_lruvec_page_state(page, NR_SLAB_UNRECLAIMABLE, -nr_freed);
BUG_ON(!PageSlab(page));
__ClearPageSlabPfmemalloc(page);
@@ -1451,8 +1396,8 @@
page->mapping = NULL;
if (current->reclaim_state)
- current->reclaim_state->reclaimed_slab += nr_freed;
- memcg_uncharge_slab(page, order, cachep);
+ current->reclaim_state->reclaimed_slab += 1 << order;
+ uncharge_slab_page(page, order, cachep);
__free_pages(page, order);
}
@@ -1478,53 +1423,17 @@
}
#ifdef CONFIG_DEBUG_PAGEALLOC
-static void store_stackinfo(struct kmem_cache *cachep, unsigned long *addr,
- unsigned long caller)
-{
- int size = cachep->object_size;
-
- addr = (unsigned long *)&((char *)addr)[obj_offset(cachep)];
-
- if (size < 5 * sizeof(unsigned long))
- return;
-
- *addr++ = 0x12345678;
- *addr++ = caller;
- *addr++ = smp_processor_id();
- size -= 3 * sizeof(unsigned long);
- {
- unsigned long *sptr = &caller;
- unsigned long svalue;
-
- while (!kstack_end(sptr)) {
- svalue = *sptr++;
- if (kernel_text_address(svalue)) {
- *addr++ = svalue;
- size -= sizeof(unsigned long);
- if (size <= sizeof(unsigned long))
- break;
- }
- }
-
- }
- *addr++ = 0x87654321;
-}
-
-static void slab_kernel_map(struct kmem_cache *cachep, void *objp,
- int map, unsigned long caller)
+static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
{
if (!is_debug_pagealloc_cache(cachep))
return;
- if (caller)
- store_stackinfo(cachep, objp, caller);
-
kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
}
#else
static inline void slab_kernel_map(struct kmem_cache *cachep, void *objp,
- int map, unsigned long caller) {}
+ int map) {}
#endif
@@ -1672,7 +1581,7 @@
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
- slab_kernel_map(cachep, objp, 1, 0);
+ slab_kernel_map(cachep, objp, 1);
}
if (cachep->flags & SLAB_RED_ZONE) {
if (*dbg_redzone1(cachep, objp) != RED_INACTIVE)
@@ -1721,8 +1630,8 @@
{
struct page *page, *n;
- list_for_each_entry_safe(page, n, list, lru) {
- list_del(&page->lru);
+ list_for_each_entry_safe(page, n, list, slab_list) {
+ list_del(&page->slab_list);
slab_destroy(cachep, page);
}
}
@@ -1738,6 +1647,8 @@
* This could be made much more intelligent. For now, try to avoid using
* high order pages for slabs. When the gfp() functions are more friendly
* towards high-order requests, this should be changed.
+ *
+ * Return: number of left-over bytes in a slab
*/
static size_t calculate_slab_order(struct kmem_cache *cachep,
size_t size, slab_flags_t flags)
@@ -1900,6 +1811,14 @@
cachep->num = 0;
+ /*
+ * If slab auto-initialization on free is enabled, store the freelist
+ * off-slab, so that its contents don't end up in one of the allocated
+ * objects.
+ */
+ if (unlikely(slab_want_init_on_free(cachep)))
+ return false;
+
if (cachep->ctor || flags & SLAB_TYPESAFE_BY_RCU)
return false;
@@ -1986,6 +1905,8 @@
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
+ *
+ * Return: int status, not a pointer (%0 on success, -errno on failure)
*/
int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags)
{
@@ -2122,6 +2043,8 @@
cachep->allocflags = __GFP_COMP;
if (flags & SLAB_CACHE_DMA)
cachep->allocflags |= GFP_DMA;
+ if (flags & SLAB_CACHE_DMA32)
+ cachep->allocflags |= GFP_DMA32;
if (flags & SLAB_RECLAIM_ACCOUNT)
cachep->allocflags |= __GFP_RECLAIMABLE;
cachep->size = size;
@@ -2272,8 +2195,8 @@
goto out;
}
- page = list_entry(p, struct page, lru);
- list_del(&page->lru);
+ page = list_entry(p, struct page, slab_list);
+ list_del(&page->slab_list);
n->free_slabs--;
n->total_slabs--;
/*
@@ -2324,6 +2247,10 @@
{
__kmem_cache_shrink(cachep);
}
+
+void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+}
#endif
int __kmem_cache_shutdown(struct kmem_cache *cachep)
@@ -2438,7 +2365,7 @@
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
- slab_kernel_map(cachep, objp, 0, 0);
+ slab_kernel_map(cachep, objp, 0);
}
}
#endif
@@ -2574,7 +2501,7 @@
for (i = 0; i < cachep->num; i++) {
objp = index_to_obj(cachep, page, i);
- kasan_init_slab_obj(cachep, objp);
+ objp = kasan_init_slab_obj(cachep, objp);
/* constructor could break poison info */
if (DEBUG == 0 && cachep->ctor) {
@@ -2595,11 +2522,6 @@
objp = index_to_obj(cachep, page, get_free_obj(page, page->active));
page->active++;
-#if DEBUG
- if (cachep->flags & SLAB_STORE_USER)
- set_store_user_dirty(cachep);
-#endif
-
return objp;
}
@@ -2692,6 +2614,13 @@
offset *= cachep->colour_off;
+ /*
+ * Call kasan_poison_slab() before calling alloc_slabmgmt(), so
+ * page_address() in the latter returns a non-tagged pointer,
+ * as it should be for slab pages.
+ */
+ kasan_poison_slab(page);
+
/* Get slab management. */
freelist = alloc_slabmgmt(cachep, page, offset,
local_flags & ~GFP_CONSTRAINT_MASK, page_node);
@@ -2700,7 +2629,6 @@
slab_map_pages(cachep, page, freelist);
- kasan_poison_slab(page);
cache_init_objs(cachep, page);
if (gfpflags_allow_blocking(local_flags))
@@ -2726,13 +2654,13 @@
if (!page)
return;
- INIT_LIST_HEAD(&page->lru);
+ INIT_LIST_HEAD(&page->slab_list);
n = get_node(cachep, page_to_nid(page));
spin_lock(&n->list_lock);
n->total_slabs++;
if (!page->active) {
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
fixup_slab_list(cachep, n, page, &list);
@@ -2799,10 +2727,8 @@
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
*dbg_redzone2(cachep, objp) = RED_INACTIVE;
}
- if (cachep->flags & SLAB_STORE_USER) {
- set_store_user_dirty(cachep);
+ if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- }
objnr = obj_to_index(cachep, page, objp);
@@ -2811,7 +2737,7 @@
if (cachep->flags & SLAB_POISON) {
poison_obj(cachep, objp, POISON_FREE);
- slab_kernel_map(cachep, objp, 0, caller);
+ slab_kernel_map(cachep, objp, 0);
}
return objp;
}
@@ -2841,9 +2767,9 @@
void **list)
{
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (page->active == cachep->num) {
- list_add(&page->lru, &n->slabs_full);
+ list_add(&page->slab_list, &n->slabs_full);
if (OBJFREELIST_SLAB(cachep)) {
#if DEBUG
/* Poisoning will be done without holding the lock */
@@ -2857,7 +2783,7 @@
page->freelist = NULL;
}
} else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&page->slab_list, &n->slabs_partial);
}
/* Try to find non-pfmemalloc slab if needed */
@@ -2880,20 +2806,20 @@
}
/* Move pfmemalloc slab to the end of list to speed up next search */
- list_del(&page->lru);
+ list_del(&page->slab_list);
if (!page->active) {
- list_add_tail(&page->lru, &n->slabs_free);
+ list_add_tail(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
- list_for_each_entry(page, &n->slabs_partial, lru) {
+ list_for_each_entry(page, &n->slabs_partial, slab_list) {
if (!PageSlabPfmemalloc(page))
return page;
}
n->free_touched = 1;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
if (!PageSlabPfmemalloc(page)) {
n->free_slabs--;
return page;
@@ -2908,11 +2834,12 @@
struct page *page;
assert_spin_locked(&n->list_lock);
- page = list_first_entry_or_null(&n->slabs_partial, struct page, lru);
+ page = list_first_entry_or_null(&n->slabs_partial, struct page,
+ slab_list);
if (!page) {
n->free_touched = 1;
page = list_first_entry_or_null(&n->slabs_free, struct page,
- lru);
+ slab_list);
if (page)
n->free_slabs--;
}
@@ -3075,7 +3002,7 @@
return objp;
if (cachep->flags & SLAB_POISON) {
check_poison_obj(cachep, objp);
- slab_kernel_map(cachep, objp, 1, 0);
+ slab_kernel_map(cachep, objp, 1);
poison_obj(cachep, objp, POISON_INUSE);
}
if (cachep->flags & SLAB_STORE_USER)
@@ -3329,7 +3256,7 @@
local_irq_restore(save_flags);
ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, caller);
- if (unlikely(flags & __GFP_ZERO) && ptr)
+ if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
slab_post_alloc_hook(cachep, flags, 1, &ptr);
@@ -3386,7 +3313,7 @@
objp = cache_alloc_debugcheck_after(cachep, flags, objp, caller);
prefetchw(objp);
- if (unlikely(flags & __GFP_ZERO) && objp)
+ if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
slab_post_alloc_hook(cachep, flags, 1, &objp);
@@ -3413,29 +3340,29 @@
objp = objpp[i];
page = virt_to_head_page(objp);
- list_del(&page->lru);
+ list_del(&page->slab_list);
check_spinlock_acquired_node(cachep, node);
slab_put_obj(cachep, page, objp);
STATS_DEC_ACTIVE(cachep);
/* fixup slab chains */
if (page->active == 0) {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&page->slab_list, &n->slabs_free);
n->free_slabs++;
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&page->slab_list, &n->slabs_partial);
}
}
while (n->free_objects > n->free_limit && !list_empty(&n->slabs_free)) {
n->free_objects -= cachep->num;
- page = list_last_entry(&n->slabs_free, struct page, lru);
- list_move(&page->lru, list);
+ page = list_last_entry(&n->slabs_free, struct page, slab_list);
+ list_move(&page->slab_list, list);
n->free_slabs--;
n->total_slabs--;
}
@@ -3473,7 +3400,7 @@
int i = 0;
struct page *page;
- list_for_each_entry(page, &n->slabs_free, lru) {
+ list_for_each_entry(page, &n->slabs_free, slab_list) {
BUG_ON(page->active);
i++;
@@ -3507,6 +3434,8 @@
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
+ if (unlikely(slab_want_init_on_free(cachep)))
+ memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
@@ -3546,12 +3475,13 @@
*
* Allocate an object from this cache. The flags are only relevant
* if the cache has no available objects.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
{
void *ret = slab_alloc(cachep, flags, _RET_IP_);
- kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc(_RET_IP_, ret,
cachep->object_size, cachep->size, flags);
@@ -3593,7 +3523,7 @@
cache_alloc_debugcheck_after_bulk(s, flags, size, p, _RET_IP_);
/* Clear memory outside IRQ disabled section */
- if (unlikely(flags & __GFP_ZERO))
+ if (unlikely(slab_want_init_on_alloc(flags, s)))
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);
@@ -3617,7 +3547,7 @@
ret = slab_alloc(cachep, flags, _RET_IP_);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(_RET_IP_, ret,
size, cachep->size, flags);
return ret;
@@ -3636,12 +3566,13 @@
* node, which can improve the performance for cpu bound structures.
*
* Fallback to other node is possible if __GFP_THISNODE is not set.
+ *
+ * Return: pointer to the new object or %NULL in case of error
*/
void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- kasan_slab_alloc(cachep, ret, flags);
trace_kmem_cache_alloc_node(_RET_IP_, ret,
cachep->object_size, cachep->size,
flags, nodeid);
@@ -3660,7 +3591,7 @@
ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc_node(_RET_IP_, ret,
size, cachep->size,
flags, nodeid);
@@ -3681,7 +3612,7 @@
if (unlikely(ZERO_OR_NULL_PTR(cachep)))
return cachep;
ret = kmem_cache_alloc_node_trace(cachep, flags, node, size);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
return ret;
}
@@ -3705,6 +3636,8 @@
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
* @caller: function caller for debug tracking of the caller
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
unsigned long caller)
@@ -3719,7 +3652,7 @@
return cachep;
ret = slab_alloc(cachep, flags, caller);
- kasan_kmalloc(cachep, ret, size, flags);
+ ret = kasan_kmalloc(cachep, ret, size, flags);
trace_kmalloc(caller, ret,
size, cachep->size, flags);
@@ -3777,6 +3710,8 @@
s = virt_to_cache(objp);
else
s = cache_from_obj(orig_s, objp);
+ if (!s)
+ continue;
debug_check_no_locks_freed(objp, s->object_size);
if (!(s->flags & SLAB_DEBUG_OBJECTS))
@@ -3811,6 +3746,10 @@
local_irq_save(flags);
kfree_debugcheck(objp);
c = virt_to_cache(objp);
+ if (!c) {
+ local_irq_restore(flags);
+ return;
+ }
debug_check_no_locks_freed(objp, c->object_size);
debug_check_no_obj_freed(objp, c->object_size);
@@ -4170,6 +4109,8 @@
* @buffer: user buffer
* @count: data length
* @ppos: unused
+ *
+ * Return: %0 on success, negative error code otherwise.
*/
ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
@@ -4214,195 +4155,6 @@
return res;
}
-#ifdef CONFIG_DEBUG_SLAB_LEAK
-
-static inline int add_caller(unsigned long *n, unsigned long v)
-{
- unsigned long *p;
- int l;
- if (!v)
- return 1;
- l = n[1];
- p = n + 2;
- while (l) {
- int i = l/2;
- unsigned long *q = p + 2 * i;
- if (*q == v) {
- q[1]++;
- return 1;
- }
- if (*q > v) {
- l = i;
- } else {
- p = q + 2;
- l -= i + 1;
- }
- }
- if (++n[1] == n[0])
- return 0;
- memmove(p + 2, p, n[1] * 2 * sizeof(unsigned long) - ((void *)p - (void *)n));
- p[0] = v;
- p[1] = 1;
- return 1;
-}
-
-static void handle_slab(unsigned long *n, struct kmem_cache *c,
- struct page *page)
-{
- void *p;
- int i, j;
- unsigned long v;
-
- if (n[0] == n[1])
- return;
- for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- bool active = true;
-
- for (j = page->active; j < c->num; j++) {
- if (get_free_obj(page, j) == i) {
- active = false;
- break;
- }
- }
-
- if (!active)
- continue;
-
- /*
- * probe_kernel_read() is used for DEBUG_PAGEALLOC. page table
- * mapping is established when actual object allocation and
- * we could mistakenly access the unmapped object in the cpu
- * cache.
- */
- if (probe_kernel_read(&v, dbg_userword(c, p), sizeof(v)))
- continue;
-
- if (!add_caller(n, v))
- return;
- }
-}
-
-static void show_symbol(struct seq_file *m, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
- unsigned long offset, size;
- char modname[MODULE_NAME_LEN], name[KSYM_NAME_LEN];
-
- if (lookup_symbol_attrs(address, &size, &offset, modname, name) == 0) {
- seq_printf(m, "%s+%#lx/%#lx", name, offset, size);
- if (modname[0])
- seq_printf(m, " [%s]", modname);
- return;
- }
-#endif
- seq_printf(m, "%px", (void *)address);
-}
-
-static int leaks_show(struct seq_file *m, void *p)
-{
- struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
- struct page *page;
- struct kmem_cache_node *n;
- const char *name;
- unsigned long *x = m->private;
- int node;
- int i;
-
- if (!(cachep->flags & SLAB_STORE_USER))
- return 0;
- if (!(cachep->flags & SLAB_RED_ZONE))
- return 0;
-
- /*
- * Set store_user_clean and start to grab stored user information
- * for all objects on this cache. If some alloc/free requests comes
- * during the processing, information would be wrong so restart
- * whole processing.
- */
- do {
- set_store_user_clean(cachep);
- drain_cpu_caches(cachep);
-
- x[1] = 0;
-
- for_each_kmem_cache_node(cachep, node, n) {
-
- check_irq_on();
- spin_lock_irq(&n->list_lock);
-
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
- spin_unlock_irq(&n->list_lock);
- }
- } while (!is_store_user_clean(cachep));
-
- name = cachep->name;
- if (x[0] == x[1]) {
- /* Increase the buffer size */
- mutex_unlock(&slab_mutex);
- m->private = kcalloc(x[0] * 4, sizeof(unsigned long),
- GFP_KERNEL);
- if (!m->private) {
- /* Too bad, we are really out */
- m->private = x;
- mutex_lock(&slab_mutex);
- return -ENOMEM;
- }
- *(unsigned long *)m->private = x[0] * 2;
- kfree(x);
- mutex_lock(&slab_mutex);
- /* Now make sure this entry will be retried */
- m->count = m->size;
- return 0;
- }
- for (i = 0; i < x[1]; i++) {
- seq_printf(m, "%s: %lu ", name, x[2*i+3]);
- show_symbol(m, x[2*i+2]);
- seq_putc(m, '\n');
- }
-
- return 0;
-}
-
-static const struct seq_operations slabstats_op = {
- .start = slab_start,
- .next = slab_next,
- .stop = slab_stop,
- .show = leaks_show,
-};
-
-static int slabstats_open(struct inode *inode, struct file *file)
-{
- unsigned long *n;
-
- n = __seq_open_private(file, &slabstats_op, PAGE_SIZE);
- if (!n)
- return -ENOMEM;
-
- *n = PAGE_SIZE / (2 * sizeof(unsigned long));
-
- return 0;
-}
-
-static const struct file_operations proc_slabstats_operations = {
- .open = slabstats_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release_private,
-};
-#endif
-
-static int __init slab_proc_init(void)
-{
-#ifdef CONFIG_DEBUG_SLAB_LEAK
- proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
-#endif
- return 0;
-}
-module_init(slab_proc_init);
-
#ifdef CONFIG_HARDENED_USERCOPY
/*
* Rejects incorrectly sized objects and objects that are to be copied
@@ -4419,6 +4171,8 @@
unsigned int objnr;
unsigned long offset;
+ ptr = kasan_reset_tag(ptr);
+
/* Find and validate object. */
cachep = page->slab_cache;
objnr = obj_to_index(cachep, page, (void *)ptr);
@@ -4451,31 +4205,26 @@
#endif /* CONFIG_HARDENED_USERCOPY */
/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
+ * __ksize -- Uninstrumented ksize.
+ * @objp: pointer to the object
*
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
+ * Unlike ksize(), __ksize() is uninstrumented, and does not provide the same
+ * safety checks as ksize() with KASAN instrumentation enabled.
+ *
+ * Return: size of the actual memory used by @objp in bytes
*/
-size_t ksize(const void *objp)
+size_t __ksize(const void *objp)
{
+ struct kmem_cache *c;
size_t size;
BUG_ON(!objp);
if (unlikely(objp == ZERO_SIZE_PTR))
return 0;
- size = virt_to_cache(objp)->object_size;
- /* We assume that ksize callers could use the whole allocated area,
- * so we need to unpoison this area.
- */
- kasan_unpoison_shadow(objp, size);
+ c = virt_to_cache(objp);
+ size = c ? c->object_size : 0;
return size;
}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
diff --git a/mm/slab.h b/mm/slab.h
index 58c6c1c..b2b0169 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,6 +30,69 @@
struct list_head list; /* List of all slab caches on the system */
};
+#else /* !CONFIG_SLOB */
+
+struct memcg_cache_array {
+ struct rcu_head rcu;
+ struct kmem_cache *entries[0];
+};
+
+/*
+ * This is the main placeholder for memcg-related information in kmem caches.
+ * Both the root cache and the child caches will have it. For the root cache,
+ * this will hold a dynamically allocated array large enough to hold
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
+ *
+ * Root and child caches hold different metadata.
+ *
+ * @root_cache: Common to root and child caches. NULL for root, pointer to
+ * the root cache for children.
+ *
+ * The following fields are specific to root caches.
+ *
+ * @memcg_caches: kmemcg ID indexed table of child caches. This table is
+ * used to index child caches during allocation and cleared
+ * early during shutdown.
+ *
+ * @__root_caches_node: List node for slab_root_caches list.
+ *
+ * @children: List of all child caches. While the child caches are also
+ * reachable through @memcg_caches, a child cache remains on
+ * this list until it is actually destroyed.
+ *
+ * The following fields are specific to child caches.
+ *
+ * @memcg: Pointer to the memcg this cache belongs to.
+ *
+ * @children_node: List node for @root_cache->children list.
+ *
+ * @kmem_caches_node: List node for @memcg->kmem_caches list.
+ */
+struct memcg_cache_params {
+ struct kmem_cache *root_cache;
+ union {
+ struct {
+ struct memcg_cache_array __rcu *memcg_caches;
+ struct list_head __root_caches_node;
+ struct list_head children;
+ bool dying;
+ };
+ struct {
+ struct mem_cgroup *memcg;
+ struct list_head children_node;
+ struct list_head kmem_caches_node;
+ struct percpu_ref refcnt;
+
+ void (*work_fn)(struct kmem_cache *);
+ union {
+ struct rcu_head rcu_head;
+ struct work_struct work;
+ };
+ };
+ };
+};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -127,7 +190,8 @@
/* Legal flag mask for kmem_cache_create(), for various configurations */
-#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | SLAB_PANIC | \
+#define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \
+ SLAB_CACHE_DMA32 | SLAB_PANIC | \
SLAB_TYPESAFE_BY_RCU | SLAB_DEBUG_OBJECTS )
#if defined(CONFIG_DEBUG_SLAB)
@@ -171,7 +235,9 @@
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
void __kmemcg_cache_deactivate(struct kmem_cache *s);
+void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
+void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
@@ -203,6 +269,12 @@
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
+static inline int cache_vmstat_idx(struct kmem_cache *s)
+{
+ return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
+ NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+}
+
#ifdef CONFIG_MEMCG_KMEM
/* List of all root caches. */
@@ -240,31 +312,6 @@
return s->name;
}
-/*
- * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away by either
- * taking a css reference to the owner cgroup, or holding the slab_mutex.
- */
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_array *arr;
-
- rcu_read_lock();
- arr = rcu_dereference(s->memcg_params.memcg_caches);
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match this (see
- * memcg_create_kmem_cache()).
- */
- cachep = READ_ONCE(arr->entries[idx]);
- rcu_read_unlock();
-
- return cachep;
-}
-
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
@@ -272,29 +319,94 @@
return s->memcg_params.root_cache;
}
+/*
+ * Expects a pointer to a slab page. Please note that a PageSlab() check
+ * isn't sufficient, as it also returns true for tail compound slab pages,
+ * which do not have the slab_cache pointer set.
+ * So this function assumes that the page can pass the PageSlab() &&
+ * !PageTail() check.
+ *
+ * The kmem_cache can be reparented asynchronously. The caller must ensure
+ * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
+ */
+static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+{
+ struct kmem_cache *s;
+
+ s = READ_ONCE(page->slab_cache);
+ if (s && !is_root_cache(s))
+ return READ_ONCE(s->memcg_params.memcg);
+
+ return NULL;
+}
+
+/*
+ * Charge the slab page belonging to the non-root kmem_cache.
+ * Can be called for non-root kmem_caches only.
+ */
static __always_inline int memcg_charge_slab(struct page *page,
gfp_t gfp, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+ int ret;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(s->memcg_params.memcg);
+ while (memcg && !css_tryget_online(&memcg->css))
+ memcg = parent_mem_cgroup(memcg);
+ rcu_read_unlock();
+
+ if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ (1 << order));
+ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
return 0;
- if (is_root_cache(s))
- return 0;
- return memcg_kmem_charge_memcg(page, gfp, order, s->memcg_params.memcg);
+ }
+
+ ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
+ if (ret)
+ goto out;
+
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), 1 << order);
+
+ /* transfer try_charge() page references to kmem_cache */
+ percpu_ref_get_many(&s->memcg_params.refcnt, 1 << order);
+ css_put_many(&memcg->css, 1 << order);
+out:
+ css_put(&memcg->css);
+ return ret;
}
+/*
+ * Uncharge a slab page belonging to a non-root kmem_cache.
+ * Can be called for non-root kmem_caches only.
+ */
static __always_inline void memcg_uncharge_slab(struct page *page, int order,
struct kmem_cache *s)
{
- if (!memcg_kmem_enabled())
- return;
- memcg_kmem_uncharge(page, order);
+ struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
+
+ rcu_read_lock();
+ memcg = READ_ONCE(s->memcg_params.memcg);
+ if (likely(!mem_cgroup_is_root(memcg))) {
+ lruvec = mem_cgroup_lruvec(page_pgdat(page), memcg);
+ mod_lruvec_state(lruvec, cache_vmstat_idx(s), -(1 << order));
+ memcg_kmem_uncharge_memcg(page, order, memcg);
+ } else {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(1 << order));
+ }
+ rcu_read_unlock();
+
+ percpu_ref_put_many(&s->memcg_params.refcnt, 1 << order);
}
extern void slab_init_memcg_params(struct kmem_cache *);
-extern void memcg_link_cache(struct kmem_cache *s);
-extern void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
- void (*deact_fn)(struct kmem_cache *));
+extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
#else /* CONFIG_MEMCG_KMEM */
@@ -313,7 +425,7 @@
static inline bool slab_equal_or_root(struct kmem_cache *s,
struct kmem_cache *p)
{
- return true;
+ return s == p;
}
static inline const char *cache_name(struct kmem_cache *s)
@@ -321,17 +433,16 @@
return s->name;
}
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
-{
- return NULL;
-}
-
static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
return s;
}
+static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+{
+ return NULL;
+}
+
static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
struct kmem_cache *s)
{
@@ -347,16 +458,52 @@
{
}
-static inline void memcg_link_cache(struct kmem_cache *s)
+static inline void memcg_link_cache(struct kmem_cache *s,
+ struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */
+static inline struct kmem_cache *virt_to_cache(const void *obj)
+{
+ struct page *page;
+
+ page = virt_to_head_page(obj);
+ if (WARN_ONCE(!PageSlab(page), "%s: Object is not a Slab page!\n",
+ __func__))
+ return NULL;
+ return page->slab_cache;
+}
+
+static __always_inline int charge_slab_page(struct page *page,
+ gfp_t gfp, int order,
+ struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ 1 << order);
+ return 0;
+ }
+
+ return memcg_charge_slab(page, gfp, order, s);
+}
+
+static __always_inline void uncharge_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
+{
+ if (is_root_cache(s)) {
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(1 << order));
+ return;
+ }
+
+ memcg_uncharge_slab(page, order, s);
+}
+
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
- struct page *page;
/*
* When kmemcg is not being used, both assignments should return the
@@ -366,18 +513,15 @@
* will also be a constant.
*/
if (!memcg_kmem_enabled() &&
+ !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
!unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
return s;
- page = virt_to_head_page(x);
- cachep = page->slab_cache;
- if (slab_equal_or_root(cachep, s))
- return cachep;
-
- pr_err("%s: Wrong slab cache. %s but object is from %s\n",
- __func__, s->name, cachep->name);
- WARN_ON_ONCE(1);
- return s;
+ cachep = virt_to_cache(x);
+ WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
+ "%s: Wrong slab cache. %s but object is from %s\n",
+ __func__, s->name, cachep->name);
+ return cachep;
}
static inline size_t slab_ksize(const struct kmem_cache *s)
@@ -437,11 +581,10 @@
flags &= gfp_allowed_mask;
for (i = 0; i < size; i++) {
- void *object = p[i];
-
- kmemleak_alloc_recursive(object, s->object_size, 1,
+ p[i] = kasan_slab_alloc(s, p[i], flags);
+ /* As p[i] might get tagged, call kmemleak hook after KASAN. */
+ kmemleak_alloc_recursive(p[i], s->object_size, 1,
s->flags, flags);
- kasan_slab_alloc(s, object, flags);
}
if (memcg_kmem_enabled())
@@ -528,4 +671,24 @@
static inline void cache_random_seq_destroy(struct kmem_cache *cachep) { }
#endif /* CONFIG_SLAB_FREELIST_RANDOM */
+static inline bool slab_want_init_on_alloc(gfp_t flags, struct kmem_cache *c)
+{
+ if (static_branch_unlikely(&init_on_alloc)) {
+ if (c->ctor)
+ return false;
+ if (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))
+ return flags & __GFP_ZERO;
+ return true;
+ }
+ return flags & __GFP_ZERO;
+}
+
+static inline bool slab_want_init_on_free(struct kmem_cache *c)
+{
+ if (static_branch_unlikely(&init_on_free))
+ return !(c->ctor ||
+ (c->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)));
+ return false;
+}
+
#endif /* MM_SLAB_H */
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3a7ac4f..f9fb27b 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -17,6 +17,7 @@
#include <linux/uaccess.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -53,7 +54,7 @@
SLAB_FAILSLAB | SLAB_KASAN)
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
- SLAB_ACCOUNT)
+ SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
/*
* Merge control. If this is set then no merging of slab caches will occur.
@@ -130,6 +131,9 @@
#ifdef CONFIG_MEMCG_KMEM
LIST_HEAD(slab_root_caches);
+static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
+
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
void slab_init_memcg_params(struct kmem_cache *s)
{
@@ -140,13 +144,18 @@
}
static int init_memcg_params(struct kmem_cache *s,
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+ struct kmem_cache *root_cache)
{
struct memcg_cache_array *arr;
if (root_cache) {
+ int ret = percpu_ref_init(&s->memcg_params.refcnt,
+ kmemcg_cache_shutdown,
+ 0, GFP_KERNEL);
+ if (ret)
+ return ret;
+
s->memcg_params.root_cache = root_cache;
- s->memcg_params.memcg = memcg;
INIT_LIST_HEAD(&s->memcg_params.children_node);
INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
return 0;
@@ -169,8 +178,13 @@
static void destroy_memcg_params(struct kmem_cache *s)
{
- if (is_root_cache(s))
+ if (is_root_cache(s)) {
kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+ } else {
+ mem_cgroup_put(s->memcg_params.memcg);
+ WRITE_ONCE(s->memcg_params.memcg, NULL);
+ percpu_ref_exit(&s->memcg_params.refcnt);
+ }
}
static void free_memcg_params(struct rcu_head *rcu)
@@ -221,11 +235,13 @@
return ret;
}
-void memcg_link_cache(struct kmem_cache *s)
+void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
{
if (is_root_cache(s)) {
list_add(&s->root_caches_node, &slab_root_caches);
} else {
+ css_get(&memcg->css);
+ s->memcg_params.memcg = memcg;
list_add(&s->memcg_params.children_node,
&s->memcg_params.root_cache->memcg_params.children);
list_add(&s->memcg_params.kmem_caches_node,
@@ -244,7 +260,7 @@
}
#else
static inline int init_memcg_params(struct kmem_cache *s,
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+ struct kmem_cache *root_cache)
{
return 0;
}
@@ -384,7 +400,7 @@
s->useroffset = useroffset;
s->usersize = usersize;
- err = init_memcg_params(s, memcg, root_cache);
+ err = init_memcg_params(s, root_cache);
if (err)
goto out_free_cache;
@@ -394,7 +410,7 @@
s->refcount = 1;
list_add(&s->list, &slab_caches);
- memcg_link_cache(s);
+ memcg_link_cache(s, memcg);
out:
if (err)
return ERR_PTR(err);
@@ -406,8 +422,9 @@
goto out;
}
-/*
- * kmem_cache_create_usercopy - Create a cache.
+/**
+ * kmem_cache_create_usercopy - Create a cache with a region suitable
+ * for copying to userspace
* @name: A string which is used in /proc/slabinfo to identify this cache.
* @size: The size of objects to be created in this cache.
* @align: The required alignment for the objects.
@@ -416,7 +433,6 @@
* @usersize: Usercopy region size
* @ctor: A constructor for the objects.
*
- * Returns a ptr to the cache on success, NULL on failure.
* Cannot be called within a interrupt, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
@@ -425,12 +441,14 @@
* %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
* to catch references to uninitialised memory.
*
- * %SLAB_RED_ZONE - Insert `Red' zones around the allocated memory to check
+ * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
* for buffer overruns.
*
* %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
* cacheline. This can be beneficial if you're counting cycles as closely
* as davem.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
*/
struct kmem_cache *
kmem_cache_create_usercopy(const char *name,
@@ -514,6 +532,31 @@
}
EXPORT_SYMBOL(kmem_cache_create_usercopy);
+/**
+ * kmem_cache_create - Create a cache.
+ * @name: A string which is used in /proc/slabinfo to identify this cache.
+ * @size: The size of objects to be created in this cache.
+ * @align: The required alignment for the objects.
+ * @flags: SLAB flags
+ * @ctor: A constructor for the objects.
+ *
+ * Cannot be called within an interrupt, but can be interrupted.
+ * The @ctor is run when new pages are allocated by the cache.
+ *
+ * The flags are
+ *
+ * %SLAB_POISON - Poison the slab with a known test pattern (a5a5a5a5)
+ * to catch references to uninitialised memory.
+ *
+ * %SLAB_RED_ZONE - Insert `Red` zones around the allocated memory to check
+ * for buffer overruns.
+ *
+ * %SLAB_HWCACHE_ALIGN - Align the objects in this cache to a hardware
+ * cacheline. This can be beneficial if you're counting cycles as closely
+ * as davem.
+ *
+ * Return: a pointer to the cache on success, NULL on failure.
+ */
struct kmem_cache *
kmem_cache_create(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
@@ -613,7 +656,7 @@
* The memory cgroup could have been offlined while the cache
* creation work was pending.
*/
- if (memcg->kmem_state != KMEM_ONLINE || root_cache->memcg_params.dying)
+ if (memcg->kmem_state != KMEM_ONLINE)
goto out_unlock;
idx = memcg_cache_id(memcg);
@@ -650,7 +693,7 @@
}
/*
- * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * Since readers won't lock (see memcg_kmem_get_cache()), we need a
* barrier here to ensure nobody will see the kmem_cache partially
* initialized.
*/
@@ -664,74 +707,95 @@
put_online_cpus();
}
-static void kmemcg_deactivate_workfn(struct work_struct *work)
+static void kmemcg_workfn(struct work_struct *work)
{
struct kmem_cache *s = container_of(work, struct kmem_cache,
- memcg_params.deact_work);
+ memcg_params.work);
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
-
- s->memcg_params.deact_fn(s);
-
+ s->memcg_params.work_fn(s);
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
-
- /* done, put the ref from slab_deactivate_memcg_cache_rcu_sched() */
- css_put(&s->memcg_params.memcg->css);
}
-static void kmemcg_deactivate_rcufn(struct rcu_head *head)
+static void kmemcg_rcufn(struct rcu_head *head)
{
struct kmem_cache *s = container_of(head, struct kmem_cache,
- memcg_params.deact_rcu_head);
+ memcg_params.rcu_head);
/*
- * We need to grab blocking locks. Bounce to ->deact_work. The
+ * We need to grab blocking locks. Bounce to ->work. The
* work item shares the space with the RCU head and can't be
* initialized eariler.
*/
- INIT_WORK(&s->memcg_params.deact_work, kmemcg_deactivate_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.deact_work);
+ INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+ queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
}
-/**
- * slab_deactivate_memcg_cache_rcu_sched - schedule deactivation after a
- * sched RCU grace period
- * @s: target kmem_cache
- * @deact_fn: deactivation function to call
- *
- * Schedule @deact_fn to be invoked with online cpus, mems and slab_mutex
- * held after a sched RCU grace period. The slab is guaranteed to stay
- * alive until @deact_fn is finished. This is to be used from
- * __kmemcg_cache_deactivate().
- */
-void slab_deactivate_memcg_cache_rcu_sched(struct kmem_cache *s,
- void (*deact_fn)(struct kmem_cache *))
+static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
{
- if (WARN_ON_ONCE(is_root_cache(s)) ||
- WARN_ON_ONCE(s->memcg_params.deact_fn))
- return;
-
- if (s->memcg_params.root_cache->memcg_params.dying)
- return;
-
- /* pin memcg so that @s doesn't get destroyed in the middle */
- css_get(&s->memcg_params.memcg->css);
-
- s->memcg_params.deact_fn = deact_fn;
- call_rcu_sched(&s->memcg_params.deact_rcu_head, kmemcg_deactivate_rcufn);
+ WARN_ON(shutdown_cache(s));
}
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
+{
+ struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
+ memcg_params.refcnt);
+ unsigned long flags;
+
+ spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
+ if (s->memcg_params.root_cache->memcg_params.dying)
+ goto unlock;
+
+ s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
+ INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
+ queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
+
+unlock:
+ spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
+}
+
+static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
+{
+ __kmemcg_cache_deactivate_after_rcu(s);
+ percpu_ref_kill(&s->memcg_params.refcnt);
+}
+
+static void kmemcg_cache_deactivate(struct kmem_cache *s)
+{
+ if (WARN_ON_ONCE(is_root_cache(s)))
+ return;
+
+ __kmemcg_cache_deactivate(s);
+ s->flags |= SLAB_DEACTIVATED;
+
+ /*
+ * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
+ * flag and make sure that no new kmem_cache deactivation tasks
+ * are queued (see flush_memcg_workqueue() ).
+ */
+ spin_lock_irq(&memcg_kmem_wq_lock);
+ if (s->memcg_params.root_cache->memcg_params.dying)
+ goto unlock;
+
+ s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
+ call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
+unlock:
+ spin_unlock_irq(&memcg_kmem_wq_lock);
+}
+
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
{
int idx;
struct memcg_cache_array *arr;
struct kmem_cache *s, *c;
+ unsigned int nr_reparented;
idx = memcg_cache_id(memcg);
@@ -746,30 +810,20 @@
if (!c)
continue;
- __kmemcg_cache_deactivate(c);
+ kmemcg_cache_deactivate(c);
arr->entries[idx] = NULL;
}
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
-{
- struct kmem_cache *s, *s2;
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- list_for_each_entry_safe(s, s2, &memcg->kmem_caches,
- memcg_params.kmem_caches_node) {
- /*
- * The cgroup is about to be freed and therefore has no charges
- * left. Hence, all its caches must be empty by now.
- */
- BUG_ON(shutdown_cache(s));
+ nr_reparented = 0;
+ list_for_each_entry(s, &memcg->kmem_caches,
+ memcg_params.kmem_caches_node) {
+ WRITE_ONCE(s->memcg_params.memcg, parent);
+ css_put(&memcg->css);
+ nr_reparented++;
+ }
+ if (nr_reparented) {
+ list_splice_init(&memcg->kmem_caches,
+ &parent->kmem_caches);
+ css_get_many(&parent->css, nr_reparented);
}
mutex_unlock(&slab_mutex);
@@ -834,16 +888,15 @@
static void flush_memcg_workqueue(struct kmem_cache *s)
{
- mutex_lock(&slab_mutex);
+ spin_lock_irq(&memcg_kmem_wq_lock);
s->memcg_params.dying = true;
- mutex_unlock(&slab_mutex);
+ spin_unlock_irq(&memcg_kmem_wq_lock);
/*
- * SLUB deactivates the kmem_caches through call_rcu_sched. Make
+ * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
* sure all registered rcu callbacks have been invoked.
*/
- if (IS_ENABLED(CONFIG_SLUB))
- rcu_barrier_sched();
+ rcu_barrier();
/*
* SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
@@ -912,6 +965,8 @@
*
* Releases as many slabs as possible for a cache.
* To help debugging, a zero exit status indicates all slabs were released.
+ *
+ * Return: %0 if all slabs were released, non-zero otherwise
*/
int kmem_cache_shrink(struct kmem_cache *cachep)
{
@@ -927,6 +982,43 @@
}
EXPORT_SYMBOL(kmem_cache_shrink);
+/**
+ * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
+ * @s: The cache pointer; non-root caches are passed to kmem_cache_shrink()
+ */
+void kmem_cache_shrink_all(struct kmem_cache *s)
+{
+ struct kmem_cache *c;
+
+ if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
+ kmem_cache_shrink(s);
+ return;
+ }
+
+ get_online_cpus();
+ get_online_mems();
+ kasan_cache_shrink(s);
+ __kmem_cache_shrink(s);
+
+ /*
+ * We have to take the slab_mutex to protect from the memcg list
+ * modification.
+ */
+ mutex_lock(&slab_mutex);
+ for_each_memcg_cache(c, s) {
+ /*
+ * Deactivated child caches need no shrinking; test c, not root s.
+ */
+ if (c->flags & SLAB_DEACTIVATED)
+ continue;
+ kasan_cache_shrink(c);
+ __kmem_cache_shrink(c);
+ }
+ mutex_unlock(&slab_mutex);
+ put_online_mems();
+ put_online_cpus();
+}
+
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -939,10 +1031,19 @@
unsigned int useroffset, unsigned int usersize)
{
int err;
+ unsigned int align = ARCH_KMALLOC_MINALIGN;
s->name = name;
s->size = s->object_size = size;
- s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ /*
+ * For power of two sizes, guarantee natural alignment for kmalloc
+ * caches, regardless of SL*B debugging options.
+ */
+ if (is_power_of_2(size))
+ align = max(align, size);
+ s->align = calculate_alignment(flags, align, size);
+
s->useroffset = useroffset;
s->usersize = usersize;
@@ -968,19 +1069,16 @@
create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s);
+ memcg_link_cache(s, NULL);
s->refcount = 1;
return s;
}
-struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
+struct kmem_cache *
+kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1] __ro_after_init =
+{ /* initialization for https://bugs.llvm.org/show_bug.cgi?id=42570 */ };
EXPORT_SYMBOL(kmalloc_caches);
-#ifdef CONFIG_ZONE_DMA
-struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1] __ro_after_init;
-EXPORT_SYMBOL(kmalloc_dma_caches);
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -1033,19 +1131,12 @@
index = size_index[size_index_elem(size)];
} else {
- if (unlikely(size > KMALLOC_MAX_CACHE_SIZE)) {
- WARN_ON(1);
+ if (WARN_ON_ONCE(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
- }
index = fls(size - 1);
}
-#ifdef CONFIG_ZONE_DMA
- if (unlikely((flags & GFP_DMA)))
- return kmalloc_dma_caches[index];
-
-#endif
- return kmalloc_caches[index];
+ return kmalloc_caches[kmalloc_type(flags)][index];
}
/*
@@ -1059,15 +1150,15 @@
{"kmalloc-16", 16}, {"kmalloc-32", 32},
{"kmalloc-64", 64}, {"kmalloc-128", 128},
{"kmalloc-256", 256}, {"kmalloc-512", 512},
- {"kmalloc-1024", 1024}, {"kmalloc-2048", 2048},
- {"kmalloc-4096", 4096}, {"kmalloc-8192", 8192},
- {"kmalloc-16384", 16384}, {"kmalloc-32768", 32768},
- {"kmalloc-65536", 65536}, {"kmalloc-131072", 131072},
- {"kmalloc-262144", 262144}, {"kmalloc-524288", 524288},
- {"kmalloc-1048576", 1048576}, {"kmalloc-2097152", 2097152},
- {"kmalloc-4194304", 4194304}, {"kmalloc-8388608", 8388608},
- {"kmalloc-16777216", 16777216}, {"kmalloc-33554432", 33554432},
- {"kmalloc-67108864", 67108864}
+ {"kmalloc-1k", 1024}, {"kmalloc-2k", 2048},
+ {"kmalloc-4k", 4096}, {"kmalloc-8k", 8192},
+ {"kmalloc-16k", 16384}, {"kmalloc-32k", 32768},
+ {"kmalloc-64k", 65536}, {"kmalloc-128k", 131072},
+ {"kmalloc-256k", 262144}, {"kmalloc-512k", 524288},
+ {"kmalloc-1M", 1048576}, {"kmalloc-2M", 2097152},
+ {"kmalloc-4M", 4194304}, {"kmalloc-8M", 8388608},
+ {"kmalloc-16M", 16777216}, {"kmalloc-32M", 33554432},
+ {"kmalloc-64M", 67108864}
};
/*
@@ -1117,9 +1208,36 @@
}
}
-static void __init new_kmalloc_cache(int idx, slab_flags_t flags)
+static const char *
+kmalloc_cache_name(const char *prefix, unsigned int size)
{
- kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name,
+
+ static const char units[3] = "\0kM";
+ int idx = 0;
+
+ while (size >= 1024 && (size % 1024 == 0)) {
+ size /= 1024;
+ idx++;
+ }
+
+ return kasprintf(GFP_NOWAIT, "%s-%u%c", prefix, size, units[idx]);
+}
+
+static void __init
+new_kmalloc_cache(int idx, int type, slab_flags_t flags)
+{
+ const char *name;
+
+ if (type == KMALLOC_RECLAIM) {
+ flags |= SLAB_RECLAIM_ACCOUNT;
+ name = kmalloc_cache_name("kmalloc-rcl",
+ kmalloc_info[idx].size);
+ BUG_ON(!name);
+ } else {
+ name = kmalloc_info[idx].name;
+ }
+
+ kmalloc_caches[type][idx] = create_kmalloc_cache(name,
kmalloc_info[idx].size, flags, 0,
kmalloc_info[idx].size);
}
@@ -1131,21 +1249,25 @@
*/
void __init create_kmalloc_caches(slab_flags_t flags)
{
- int i;
+ int i, type;
- for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
- if (!kmalloc_caches[i])
- new_kmalloc_cache(i, flags);
+ for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
+ if (!kmalloc_caches[type][i])
+ new_kmalloc_cache(i, type, flags);
- /*
- * Caches that are not of the two-to-the-power-of size.
- * These have to be created immediately after the
- * earlier power of two caches
- */
- if (KMALLOC_MIN_SIZE <= 32 && !kmalloc_caches[1] && i == 6)
- new_kmalloc_cache(1, flags);
- if (KMALLOC_MIN_SIZE <= 64 && !kmalloc_caches[2] && i == 7)
- new_kmalloc_cache(2, flags);
+ /*
+ * Caches that are not of the two-to-the-power-of size.
+ * These have to be created immediately after the
+ * earlier power of two caches
+ */
+ if (KMALLOC_MIN_SIZE <= 32 && i == 6 &&
+ !kmalloc_caches[type][1])
+ new_kmalloc_cache(1, type, flags);
+ if (KMALLOC_MIN_SIZE <= 64 && i == 7 &&
+ !kmalloc_caches[type][2])
+ new_kmalloc_cache(2, type, flags);
+ }
}
/* Kmalloc array is now usable */
@@ -1153,16 +1275,15 @@
#ifdef CONFIG_ZONE_DMA
for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[i];
+ struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
if (s) {
unsigned int size = kmalloc_size(i);
- char *n = kasprintf(GFP_NOWAIT,
- "dma-kmalloc-%u", size);
+ const char *n = kmalloc_cache_name("dma-kmalloc", size);
BUG_ON(!n);
- kmalloc_dma_caches[i] = create_kmalloc_cache(n,
- size, SLAB_CACHE_DMA | flags, 0, 0);
+ kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
+ n, size, SLAB_CACHE_DMA | flags, 0, 0);
}
}
#endif
@@ -1176,14 +1297,19 @@
*/
void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
{
- void *ret;
+ void *ret = NULL;
struct page *page;
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
- ret = page ? page_address(page) : NULL;
+ if (likely(page)) {
+ ret = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
+ ret = kasan_kmalloc_large(ret, size, flags);
+ /* As ret might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ret, size, 1, flags);
- kasan_kmalloc_large(ret, size, flags);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -1378,7 +1504,7 @@
#if defined(CONFIG_MEMCG)
void *memcg_slab_start(struct seq_file *m, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
mutex_lock(&slab_mutex);
return seq_list_start(&memcg->kmem_caches, *pos);
@@ -1386,7 +1512,7 @@
void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
return seq_list_next(p, &memcg->kmem_caches, pos);
}
@@ -1400,7 +1526,7 @@
{
struct kmem_cache *s = list_entry(p, struct kmem_cache,
memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
if (p == memcg->kmem_caches.next)
print_slabinfo_header(m);
@@ -1449,6 +1575,64 @@
return 0;
}
module_init(slab_proc_init);
+
+#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
+/*
+ * Display information about kmem caches that have child memcg caches.
+ */
+static int memcg_slabinfo_show(struct seq_file *m, void *unused)
+{
+ struct kmem_cache *s, *c;
+ struct slabinfo sinfo;
+
+ mutex_lock(&slab_mutex);
+ seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
+ seq_puts(m, " <active_slabs> <num_slabs>\n");
+ list_for_each_entry(s, &slab_root_caches, root_caches_node) {
+ /*
+ * Skip kmem caches that don't have any memcg children.
+ */
+ if (list_empty(&s->memcg_params.children))
+ continue;
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(s, &sinfo);
+ seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
+ cache_name(s), sinfo.active_objs, sinfo.num_objs,
+ sinfo.active_slabs, sinfo.num_slabs);
+
+ for_each_memcg_cache(c, s) {
+ struct cgroup_subsys_state *css;
+ char *status = "";
+
+ css = &c->memcg_params.memcg->css;
+ if (!(css->flags & CSS_ONLINE))
+ status = ":dead";
+ else if (c->flags & SLAB_DEACTIVATED)
+ status = ":deact";
+
+ memset(&sinfo, 0, sizeof(sinfo));
+ get_slabinfo(c, &sinfo);
+ seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
+ cache_name(c), css->id, status,
+ sinfo.active_objs, sinfo.num_objs,
+ sinfo.active_slabs, sinfo.num_slabs);
+ }
+ }
+ mutex_unlock(&slab_mutex);
+ return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
+
+static int __init memcg_slabinfo_init(void)
+{
+ debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
+ NULL, NULL, &memcg_slabinfo_fops);
+ return 0;
+}
+
+late_initcall(memcg_slabinfo_init);
+#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
@@ -1461,7 +1645,7 @@
ks = ksize(p);
if (ks >= new_size) {
- kasan_krealloc((void *)p, new_size, flags);
+ p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
@@ -1481,6 +1665,8 @@
* This function is like krealloc() except it never frees the originally
* allocated buffer. Use this if you don't want to free the buffer immediately
* like, for example, with RCU.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *__krealloc(const void *p, size_t new_size, gfp_t flags)
{
@@ -1502,6 +1688,8 @@
* lesser of the new and old sizes. If @p is %NULL, krealloc()
* behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
* %NULL pointer, the object pointed to is freed.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
*/
void *krealloc(const void *p, size_t new_size, gfp_t flags)
{
@@ -1513,7 +1701,7 @@
}
ret = __do_krealloc(p, new_size, flags);
- if (ret && p != ret)
+ if (ret && kasan_reset_tag(p) != kasan_reset_tag(ret))
kfree(p);
return ret;
@@ -1544,6 +1732,52 @@
}
EXPORT_SYMBOL(kzfree);
+/**
+ * ksize - get the actual amount of memory allocated for a given object
+ * @objp: Pointer to the object
+ *
+ * kmalloc may internally round up allocations and return more memory
+ * than requested. ksize() can be used to determine the actual amount of
+ * memory allocated. The caller may use this additional memory, even though
+ * a smaller amount of memory was initially specified with the kmalloc call.
+ * The caller must guarantee that objp points to a valid object previously
+ * allocated with either kmalloc() or kmem_cache_alloc(). The object
+ * must not be freed during the duration of the call.
+ *
+ * Return: size of the actual memory used by @objp in bytes
+ */
+size_t ksize(const void *objp)
+{
+ size_t size;
+
+ if (WARN_ON_ONCE(!objp))
+ return 0;
+ /*
+ * We need to check that the pointed to object is valid, and only then
+ * unpoison the shadow memory below. We use __kasan_check_read(), to
+ * generate a more useful report at the time ksize() is called (rather
+ * than later where behaviour is undefined due to potential
+ * use-after-free or double-free).
+ *
+ * If the pointed to memory is invalid we return 0, to avoid users of
+ * ksize() writing to and potentially corrupting the memory region.
+ *
+ * We want to perform the check before __ksize(), to avoid potentially
+ * crashing in __ksize() due to accessing invalid metadata.
+ */
+ if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+ return 0;
+
+ size = __ksize(objp);
+ /*
+ * We assume that ksize callers could use whole allocated area,
+ * so we need to unpoison this area.
+ */
+ kasan_unpoison_shadow(objp, size);
+ return size;
+}
+EXPORT_SYMBOL(ksize);
+
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/slob.c b/mm/slob.c
index 307c2c9..fa53e9f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -112,13 +112,13 @@
static void set_slob_page_free(struct page *sp, struct list_head *list)
{
- list_add(&sp->lru, list);
+ list_add(&sp->slab_list, list);
__SetPageSlobFree(sp);
}
static inline void clear_slob_page_free(struct page *sp)
{
- list_del(&sp->lru);
+ list_del(&sp->slab_list);
__ClearPageSlobFree(sp);
}
@@ -190,7 +190,7 @@
static void *slob_new_pages(gfp_t gfp, int order, int node)
{
- void *page;
+ struct page *page;
#ifdef CONFIG_NUMA
if (node != NUMA_NO_NODE)
@@ -202,29 +202,59 @@
if (!page)
return NULL;
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
return page_address(page);
}
static void slob_free_pages(void *b, int order)
{
+ struct page *sp = virt_to_page(b);
+
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- free_pages((unsigned long)b, order);
+
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
}
/*
- * Allocate a slob block within a given slob_page sp.
+ * slob_page_alloc() - Allocate a slob block within a given slob_page sp.
+ * @sp: Page to look in.
+ * @size: Size of the allocation.
+ * @align: Allocation alignment.
+ * @align_offset: Offset in the allocated block that will be aligned.
+ * @page_removed_from_list: Return parameter.
+ *
+ * Tries to find a chunk of memory at least @size bytes big within @page.
+ *
+ * Return: Pointer to memory if allocated, %NULL otherwise. If the
+ * allocation fills up @page then the page is removed from the
+ * freelist, in this case @page_removed_from_list will be set to
+ * true (set to false otherwise).
*/
-static void *slob_page_alloc(struct page *sp, size_t size, int align)
+static void *slob_page_alloc(struct page *sp, size_t size, int align,
+ int align_offset, bool *page_removed_from_list)
{
slob_t *prev, *cur, *aligned = NULL;
int delta = 0, units = SLOB_UNITS(size);
+ *page_removed_from_list = false;
for (prev = NULL, cur = sp->freelist; ; prev = cur, cur = slob_next(cur)) {
slobidx_t avail = slob_units(cur);
+ /*
+ * 'aligned' will hold the address of the slob block so that the
+ * address 'aligned'+'align_offset' is aligned according to the
+ * 'align' parameter. This is for kmalloc() which prepends the
+ * allocated block with its size, so that the block itself is
+ * aligned when needed.
+ */
if (align) {
- aligned = (slob_t *)ALIGN((unsigned long)cur, align);
+ aligned = (slob_t *)
+ (ALIGN((unsigned long)cur + align_offset, align)
+ - align_offset);
delta = aligned - cur;
}
if (avail >= units + delta) { /* room enough? */
@@ -254,8 +284,10 @@
}
sp->units -= units;
- if (!sp->units)
+ if (!sp->units) {
clear_slob_page_free(sp);
+ *page_removed_from_list = true;
+ }
return cur;
}
if (slob_last(cur))
@@ -266,13 +298,14 @@
/*
* slob_alloc: entry point into the slob allocator.
*/
-static void *slob_alloc(size_t size, gfp_t gfp, int align, int node)
+static void *slob_alloc(size_t size, gfp_t gfp, int align, int node,
+ int align_offset)
{
struct page *sp;
- struct list_head *prev;
struct list_head *slob_list;
slob_t *b = NULL;
unsigned long flags;
+ bool _unused;
if (size < SLOB_BREAK1)
slob_list = &free_slob_small;
@@ -283,7 +316,8 @@
spin_lock_irqsave(&slob_lock, flags);
/* Iterate through each partially free page, try to find room */
- list_for_each_entry(sp, slob_list, lru) {
+ list_for_each_entry(sp, slob_list, slab_list) {
+ bool page_removed_from_list = false;
#ifdef CONFIG_NUMA
/*
* If there's a node specification, search for a partial
@@ -296,18 +330,25 @@
if (sp->units < SLOB_UNITS(size))
continue;
- /* Attempt to alloc */
- prev = sp->lru.prev;
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, align_offset, &page_removed_from_list);
if (!b)
continue;
- /* Improve fragment distribution and reduce our average
- * search time by starting our next search here. (see
- * Knuth vol 1, sec 2.5, pg 449) */
- if (prev != slob_list->prev &&
- slob_list->next != prev->next)
- list_move_tail(slob_list, prev->next);
+ /*
+ * If slob_page_alloc() removed sp from the list then we
+ * cannot call list functions on sp. If so allocation
+ * did not fragment the page anyway so optimisation is
+ * unnecessary.
+ */
+ if (!page_removed_from_list) {
+ /*
+ * Improve fragment distribution and reduce our average
+ * search time by starting our next search here. (see
+ * Knuth vol 1, sec 2.5, pg 449)
+ */
+ if (!list_is_first(&sp->slab_list, slob_list))
+ list_rotate_to_front(&sp->slab_list, slob_list);
+ }
break;
}
spin_unlock_irqrestore(&slob_lock, flags);
@@ -323,10 +364,10 @@
spin_lock_irqsave(&slob_lock, flags);
sp->units = SLOB_UNITS(PAGE_SIZE);
sp->freelist = b;
- INIT_LIST_HEAD(&sp->lru);
+ INIT_LIST_HEAD(&sp->slab_list);
set_slob(b, SLOB_UNITS(PAGE_SIZE), b + SLOB_UNITS(PAGE_SIZE));
set_slob_page_free(sp, slob_list);
- b = slob_page_alloc(sp, size, align);
+ b = slob_page_alloc(sp, size, align, align_offset, &_unused);
BUG_ON(!b);
spin_unlock_irqrestore(&slob_lock, flags);
}
@@ -428,7 +469,7 @@
__do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
{
unsigned int *m;
- int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
+ int minalign = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
void *ret;
gfp &= gfp_allowed_mask;
@@ -436,19 +477,28 @@
fs_reclaim_acquire(gfp);
fs_reclaim_release(gfp);
- if (size < PAGE_SIZE - align) {
+ if (size < PAGE_SIZE - minalign) {
+ int align = minalign;
+
+ /*
+ * For power of two sizes, guarantee natural alignment for
+ * kmalloc()'d objects.
+ */
+ if (is_power_of_2(size))
+ align = max(minalign, (int) size);
+
if (!size)
return ZERO_SIZE_PTR;
- m = slob_alloc(size + align, gfp, align, node);
+ m = slob_alloc(size + minalign, gfp, align, node, minalign);
if (!m)
return NULL;
*m = size;
- ret = (void *)m + align;
+ ret = (void *)m + minalign;
trace_kmalloc_node(caller, ret,
- size, size + align, gfp, node);
+ size, size + minalign, gfp, node);
} else {
unsigned int order = get_order(size);
@@ -498,13 +548,18 @@
int align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
unsigned int *m = (unsigned int *)(block - align);
slob_free(m, *m + align);
- } else
- __free_pages(sp, compound_order(sp));
+ } else {
+ unsigned int order = compound_order(sp);
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(sp, order);
+
+ }
}
EXPORT_SYMBOL(kfree);
/* can't use ksize for kmem_cache_alloc memory, only kmalloc */
-size_t ksize(const void *block)
+size_t __ksize(const void *block)
{
struct page *sp;
int align;
@@ -516,13 +571,13 @@
sp = virt_to_page(block);
if (unlikely(!PageSlab(sp)))
- return PAGE_SIZE << compound_order(sp);
+ return page_size(sp);
align = max_t(size_t, ARCH_KMALLOC_MINALIGN, ARCH_SLAB_MINALIGN);
m = (unsigned int *)(block - align);
return SLOB_UNITS(*m) * SLOB_UNIT;
}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags)
{
@@ -544,7 +599,7 @@
fs_reclaim_release(flags);
if (c->size < PAGE_SIZE) {
- b = slob_alloc(c->size, flags, c->align, node);
+ b = slob_alloc(c->size, flags, c->align, node, 0);
trace_kmem_cache_alloc_node(_RET_IP_, b, c->object_size,
SLOB_UNITS(c->size) * SLOB_UNIT,
flags, node);
diff --git a/mm/slub.c b/mm/slub.c
index 8da34a8..e72e802 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -58,10 +58,11 @@
* D. page->frozen -> frozen state
*
* If a slab is frozen then it is exempt from list management. It is not
- * on any list. The processor that froze the slab is the one who can
- * perform list operations on the page. Other processors may put objects
- * onto the freelist but the processor that froze the slab is the only
- * one that can retrieve the objects from the page's freelist.
+ * on any list except per cpu partial list. The processor that froze the
+ * slab is the one who can perform list operations on the page. Other
+ * processors may put objects onto the freelist but the processor that
+ * froze the slab is the only one that can retrieve the objects from the
+ * page's freelist.
*
* The list_lock protects the partial and full list on each node and
* the partial slab counter. If taken then no new slabs may be added or
@@ -249,7 +250,18 @@
unsigned long ptr_addr)
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
- return (void *)((unsigned long)ptr ^ s->random ^ ptr_addr);
+ /*
+ * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * Normally, this doesn't cause any issues, as both set_freepointer()
+ * and get_freepointer() are called with a pointer with the same tag.
+ * However, there are some issues with CONFIG_SLUB_DEBUG code. For
+ * example, when __free_slab() iterates over objects in a cache, it
+ * passes untagged pointers to check_object(). check_object() in turn
+ * calls get_freepointer() with an untagged pointer, which causes the
+ * freepointer to be restored incorrectly.
+ */
+ return (void *)((unsigned long)ptr ^ s->random ^
+ (unsigned long)kasan_reset_tag((void *)ptr_addr));
#else
return ptr;
#endif
@@ -303,15 +315,10 @@
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-#define for_each_object_idx(__p, __idx, __s, __addr, __objects) \
- for (__p = fixup_red_left(__s, __addr), __idx = 1; \
- __idx <= __objects; \
- __p += (__s)->size, __idx++)
-
/* Determine object index from a given position */
static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
{
- return (p - addr) / s->size;
+ return (kasan_reset_tag(p) - addr) / s->size;
}
static inline unsigned int order_objects(unsigned int order, unsigned int size)
@@ -507,6 +514,7 @@
return 1;
base = page_address(page);
+ object = kasan_reset_tag(object);
object = restore_red_left(s, object);
if (object < base || object >= base + page->objects * s->size ||
(object - base) % s->size) {
@@ -545,31 +553,22 @@
if (addr) {
#ifdef CONFIG_STACKTRACE
- struct stack_trace trace;
- int i;
+ unsigned int nr_entries;
- trace.nr_entries = 0;
- trace.max_entries = TRACK_ADDRS_COUNT;
- trace.entries = p->addrs;
- trace.skip = 3;
metadata_access_enable();
- save_stack_trace(&trace);
+ nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
- /* See rant in lockdep.c */
- if (trace.nr_entries != 0 &&
- trace.entries[trace.nr_entries - 1] == ULONG_MAX)
- trace.nr_entries--;
-
- for (i = trace.nr_entries; i < TRACK_ADDRS_COUNT; i++)
- p->addrs[i] = 0;
+ if (nr_entries < TRACK_ADDRS_COUNT)
+ p->addrs[nr_entries] = 0;
#endif
p->addr = addr;
p->cpu = smp_processor_id();
p->pid = current->pid;
p->when = jiffies;
- } else
+ } else {
memset(p, 0, sizeof(struct track));
+ }
}
static void init_tracking(struct kmem_cache *s, void *object)
@@ -830,7 +829,7 @@
return 1;
start = page_address(page);
- length = PAGE_SIZE << compound_order(page);
+ length = page_size(page);
end = start + length;
remainder = length % s->size;
if (!remainder)
@@ -1016,7 +1015,7 @@
return;
lockdep_assert_held(&n->list_lock);
- list_add(&page->lru, &n->full);
+ list_add(&page->slab_list, &n->full);
}
static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
@@ -1025,7 +1024,7 @@
return;
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
}
/* Tracking of the number of slabs for debugging purposes */
@@ -1075,9 +1074,19 @@
init_tracking(s, object);
}
+static
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
+{
+ if (!(s->flags & SLAB_POISON))
+ return;
+
+ metadata_access_enable();
+ memset(addr, POISON_INUSE, page_size(page));
+ metadata_access_disable();
+}
+
static inline int alloc_consistency_checks(struct kmem_cache *s,
- struct page *page,
- void *object, unsigned long addr)
+ struct page *page, void *object)
{
if (!check_slab(s, page))
return 0;
@@ -1098,7 +1107,7 @@
void *object, unsigned long addr)
{
if (s->flags & SLAB_CONSISTENCY_CHECKS) {
- if (!alloc_consistency_checks(s, page, object, addr))
+ if (!alloc_consistency_checks(s, page, object))
goto bad;
}
@@ -1271,27 +1280,69 @@
if (*str == ',')
slub_debug_slabs = str + 1;
out:
+ if ((static_branch_unlikely(&init_on_alloc) ||
+ static_branch_unlikely(&init_on_free)) &&
+ (slub_debug & SLAB_POISON))
+ pr_info("mem auto-init: SLAB_POISON will take precedence over init_on_alloc/init_on_free\n");
return 1;
}
__setup("slub_debug", setup_slub_debug);
+/*
+ * kmem_cache_flags - apply debugging options to the cache
+ * @object_size: the size of an object without meta data
+ * @flags: flags to set
+ * @name: name of the cache
+ * @ctor: constructor function
+ *
+ * Debug option(s) are applied to @flags. In addition to the debug
+ * option(s), if a slab name (or multiple) is specified i.e.
+ * slub_debug=<Debug-Options>,<slab name1>,<slab name2> ...
+ * then only the selected slabs will receive the debug option(s).
+ */
slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t flags, const char *name,
void (*ctor)(void *))
{
- /*
- * Enable debugging if selected on the kernel commandline.
- */
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
- flags |= slub_debug;
+ char *iter;
+ size_t len;
+
+ /* If slub_debug = 0, it folds into the if conditional. */
+ if (!slub_debug_slabs)
+ return flags | slub_debug;
+
+ len = strlen(name);
+ iter = slub_debug_slabs;
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
+
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= slub_debug;
+ break;
+ }
+
+ if (!*end)
+ break;
+ iter = end + 1;
+ }
return flags;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
struct page *page, void *object) {}
+static inline
+void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr) {}
static inline int alloc_debug_processing(struct kmem_cache *s,
struct page *page, void *object, unsigned long addr) { return 0; }
@@ -1334,10 +1385,12 @@
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
+static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
+ ptr = kasan_kmalloc_large(ptr, size, flags);
+ /* As ptr might get tagged, call kmemleak hook after KASAN. */
kmemleak_alloc(ptr, size, 1, flags);
- kasan_kmalloc_large(ptr, size, flags);
+ return ptr;
}
static __always_inline void kfree_hook(void *x)
@@ -1374,18 +1427,11 @@
static inline bool slab_free_freelist_hook(struct kmem_cache *s,
void **head, void **tail)
{
-/*
- * Compiler cannot detect this function can be removed if slab_free_hook()
- * evaluates to nothing. Thus, catch all relevant config debug options here.
- */
-#if defined(CONFIG_LOCKDEP) || \
- defined(CONFIG_DEBUG_KMEMLEAK) || \
- defined(CONFIG_DEBUG_OBJECTS_FREE) || \
- defined(CONFIG_KASAN)
void *object;
void *next = *head;
void *old_tail = *tail ? *tail : *head;
+ int rsize;
/* Head and tail of the reconstructed freelist */
*head = NULL;
@@ -1394,6 +1440,19 @@
do {
object = next;
next = get_freepointer(s, object);
+
+ if (slab_want_init_on_free(s)) {
+ /*
+ * Clear the object and the metadata, but don't touch
+ * the redzone.
+ */
+ memset(object, 0, s->object_size);
+ rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
+ : 0;
+ memset((char *)object + s->inuse, 0,
+ s->size - s->inuse - rsize);
+
+ }
/* If object's reuse doesn't have to be delayed */
if (!slab_free_hook(s, object)) {
/* Move object to the new freelist */
@@ -1408,21 +1467,19 @@
*tail = NULL;
return *head != NULL;
-#else
- return true;
-#endif
}
-static void setup_object(struct kmem_cache *s, struct page *page,
+static void *setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
setup_object_debug(s, page, object);
- kasan_init_slab_obj(s, object);
+ object = kasan_init_slab_obj(s, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
s->ctor(object);
kasan_poison_object_data(s, object);
}
+ return object;
}
/*
@@ -1439,7 +1496,7 @@
else
page = __alloc_pages_node(node, flags, order);
- if (page && memcg_charge_slab(page, flags, order, s)) {
+ if (page && charge_slab_page(page, flags, order, s)) {
__free_pages(page, order);
page = NULL;
}
@@ -1530,16 +1587,16 @@
/* First entry is used as the base of the freelist */
cur = next_freelist_entry(s, page, &pos, start, page_limit,
freelist_count);
+ cur = setup_object(s, page, cur);
page->freelist = cur;
for (idx = 1; idx < page->objects; idx++) {
- setup_object(s, page, cur);
next = next_freelist_entry(s, page, &pos, start, page_limit,
freelist_count);
+ next = setup_object(s, page, next);
set_freepointer(s, cur, next);
cur = next;
}
- setup_object(s, page, cur);
set_freepointer(s, cur, NULL);
return true;
@@ -1561,8 +1618,8 @@
struct page *page;
struct kmem_cache_order_objects oo = s->oo;
gfp_t alloc_gfp;
- void *start, *p;
- int idx, order;
+ void *start, *p, *next;
+ int idx;
bool shuffle;
flags &= gfp_allowed_mask;
@@ -1596,30 +1653,30 @@
page->objects = oo_objects(oo);
- order = compound_order(page);
page->slab_cache = s;
__SetPageSlab(page);
if (page_is_pfmemalloc(page))
SetPageSlabPfmemalloc(page);
+ kasan_poison_slab(page);
+
start = page_address(page);
- if (unlikely(s->flags & SLAB_POISON))
- memset(start, POISON_INUSE, PAGE_SIZE << order);
-
- kasan_poison_slab(page);
+ setup_page_debug(s, page, start);
shuffle = shuffle_freelist(s, page);
if (!shuffle) {
- for_each_object_idx(p, idx, s, start, page->objects) {
- setup_object(s, page, p);
- if (likely(idx < page->objects))
- set_freepointer(s, p, p + s->size);
- else
- set_freepointer(s, p, NULL);
+ start = fixup_red_left(s, start);
+ start = setup_object(s, page, start);
+ page->freelist = start;
+ for (idx = 0, p = start; idx < page->objects - 1; idx++) {
+ next = p + s->size;
+ next = setup_object(s, page, next);
+ set_freepointer(s, p, next);
+ p = next;
}
- page->freelist = fixup_red_left(s, start);
+ set_freepointer(s, p, NULL);
}
page->inuse = page->objects;
@@ -1631,11 +1688,6 @@
if (!page)
return NULL;
- mod_lruvec_page_state(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- 1 << oo_order(oo));
-
inc_slabs_node(s, page_to_nid(page), page->objects);
return page;
@@ -1669,18 +1721,13 @@
check_object(s, page, p, SLUB_RED_INACTIVE);
}
- mod_lruvec_page_state(page,
- (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE,
- -pages);
-
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- memcg_uncharge_slab(page, order, s);
+ uncharge_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -1713,9 +1760,9 @@
{
n->nr_partial++;
if (tail == DEACTIVATE_TO_TAIL)
- list_add_tail(&page->lru, &n->partial);
+ list_add_tail(&page->slab_list, &n->partial);
else
- list_add(&page->lru, &n->partial);
+ list_add(&page->slab_list, &n->partial);
}
static inline void add_partial(struct kmem_cache_node *n,
@@ -1729,7 +1776,7 @@
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- list_del(&page->lru);
+ list_del(&page->slab_list);
n->nr_partial--;
}
@@ -1803,7 +1850,7 @@
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ list_for_each_entry_safe(page, page2, &n->partial, slab_list) {
void *t;
if (!pfmemalloc_match(page, flags))
@@ -1891,7 +1938,7 @@
}
}
} while (read_mems_allowed_retry(cpuset_mems_cookie));
-#endif
+#endif /* CONFIG_NUMA */
return NULL;
}
@@ -1936,6 +1983,7 @@
return tid + TID_STEP;
}
+#ifdef SLUB_DEBUG_CMPXCHG
static inline unsigned int tid_to_cpu(unsigned long tid)
{
return tid % TID_STEP;
@@ -1945,6 +1993,7 @@
{
return tid / TID_STEP;
}
+#endif
static inline unsigned int init_tid(int cpu)
{
@@ -2069,7 +2118,7 @@
if (!lock) {
lock = 1;
/*
- * Taking the spinlock removes the possiblity
+ * Taking the spinlock removes the possibility
* that acquire_slab() will see a slab page that
* is frozen
*/
@@ -2089,26 +2138,15 @@
}
if (l != m) {
-
if (l == M_PARTIAL)
-
remove_partial(n, page);
-
else if (l == M_FULL)
-
remove_full(s, n, page);
- if (m == M_PARTIAL) {
-
+ if (m == M_PARTIAL)
add_partial(n, page, tail);
- stat(s, tail);
-
- } else if (m == M_FULL) {
-
- stat(s, DEACTIVATE_FULL);
+ else if (m == M_FULL)
add_full(s, n, page);
-
- }
}
l = m;
@@ -2121,7 +2159,11 @@
if (lock)
spin_unlock(&n->list_lock);
- if (m == M_FREE) {
+ if (m == M_PARTIAL)
+ stat(s, tail);
+ else if (m == M_FULL)
+ stat(s, DEACTIVATE_FULL);
+ else if (m == M_FREE) {
stat(s, DEACTIVATE_EMPTY);
discard_slab(s, page);
stat(s, FREE_SLAB);
@@ -2196,12 +2238,12 @@
discard_slab(s, page);
stat(s, FREE_SLAB);
}
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
/*
- * Put a page that was just frozen (in __slab_free) into a partial page
- * slot if available.
+ * Put a page that was just frozen (in __slab_free|get_partial_node) into a
+ * partial page slot if available.
*
* If we did not find a slot then simply move all the partials to the
* per node partial list.
@@ -2255,7 +2297,7 @@
local_irq_restore(flags);
}
preempt_enable();
-#endif
+#endif /* CONFIG_SLUB_CPU_PARTIAL */
}
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2275,12 +2317,10 @@
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c)) {
- if (c->page)
- flush_slab(s, c);
+ if (c->page)
+ flush_slab(s, c);
- unfreeze_partials(s, c);
- }
+ unfreeze_partials(s, c);
}
static void flush_cpu_slab(void *d)
@@ -2329,7 +2369,7 @@
static inline int node_match(struct page *page, int node)
{
#ifdef CONFIG_NUMA
- if (!page || (node != NUMA_NO_NODE && page_to_nid(page) != node))
+ if (node != NUMA_NO_NODE && page_to_nid(page) != node)
return 0;
#endif
return 1;
@@ -2356,7 +2396,7 @@
struct page *page;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
x += get_count(page);
spin_unlock_irqrestore(&n->list_lock, flags);
return x;
@@ -2430,8 +2470,7 @@
stat(s, ALLOC_SLAB);
c->page = page;
*pc = c;
- } else
- freelist = NULL;
+ }
return freelist;
}
@@ -2612,6 +2651,17 @@
}
/*
+ * If the object has been wiped upon free, make sure it's fully initialized by
+ * zeroing out freelist pointer.
+ */
+static __always_inline void maybe_wipe_obj_freeptr(struct kmem_cache *s,
+ void *obj)
+{
+ if (unlikely(slab_want_init_on_free(s)) && obj)
+ memset((void *)((char *)obj + s->offset), 0, sizeof(void *));
+}
+
+/*
* Inlined fastpath so that allocation functions (kmalloc, kmem_cache_alloc)
* have the fastpath folded into their functions. So no function call
* overhead for requests that can be satisfied on the fastpath.
@@ -2700,7 +2750,9 @@
stat(s, ALLOC_FASTPATH);
}
- if (unlikely(gfpflags & __GFP_ZERO) && object)
+ maybe_wipe_obj_freeptr(s, object);
+
+ if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
slab_post_alloc_hook(s, gfpflags, 1, &object);
@@ -2730,7 +2782,7 @@
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
- kasan_kmalloc(s, ret, size, gfpflags);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2758,12 +2810,12 @@
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
- kasan_kmalloc(s, ret, size, gfpflags);
+ ret = kasan_kmalloc(s, ret, size, gfpflags);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
-#endif
+#endif /* CONFIG_NUMA */
/*
* Slow path handling. This may still be called frequently since objects
@@ -2862,8 +2914,7 @@
* then add it.
*/
if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) {
- if (kmem_cache_debug(s))
- remove_full(s, n, page);
+ remove_full(s, n, page);
add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
@@ -2954,7 +3005,7 @@
do_slab_free(s, page, head, tail, cnt, addr);
}
-#ifdef CONFIG_KASAN
+#ifdef CONFIG_KASAN_GENERIC
void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr)
{
do_slab_free(cache, virt_to_head_page(x), x, NULL, 1, addr);
@@ -3113,16 +3164,19 @@
goto error;
c = this_cpu_ptr(s->cpu_slab);
+ maybe_wipe_obj_freeptr(s, p[i]);
+
continue; /* goto for-loop */
}
c->freelist = get_freepointer(s, object);
p[i] = object;
+ maybe_wipe_obj_freeptr(s, p[i]);
}
c->tid = next_tid(c->tid);
local_irq_enable();
/* Clear memory outside IRQ disabled fastpath loop */
- if (unlikely(flags & __GFP_ZERO)) {
+ if (unlikely(slab_want_init_on_alloc(flags, s))) {
int j;
for (j = 0; j < i; j++)
@@ -3326,16 +3380,16 @@
n = page->freelist;
BUG_ON(!n);
- page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse = 1;
- page->frozen = 0;
- kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
- kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
+ n = kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node),
GFP_KERNEL);
+ page->freelist = get_freepointer(kmem_cache_node, n);
+ page->inuse = 1;
+ page->frozen = 0;
+ kmem_cache_node->node[node] = n;
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3539,6 +3593,9 @@
if (s->flags & SLAB_CACHE_DMA)
s->allocflags |= GFP_DMA;
+ if (s->flags & SLAB_CACHE_DMA32)
+ s->allocflags |= GFP_DMA32;
+
if (s->flags & SLAB_RECLAIM_ACCOUNT)
s->allocflags |= __GFP_RECLAIMABLE;
@@ -3608,10 +3665,6 @@
free_kmem_cache_nodes(s);
error:
- if (flags & SLAB_PANIC)
- panic("Cannot create slab %s size=%u realsize=%u order=%u offset=%u flags=%lx\n",
- s->name, s->size, s->size,
- oo_order(s->oo), s->offset, (unsigned long)flags);
return -EINVAL;
}
@@ -3621,9 +3674,7 @@
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- unsigned long *map = kcalloc(BITS_TO_LONGS(page->objects),
- sizeof(long),
- GFP_ATOMIC);
+ unsigned long *map = bitmap_zalloc(page->objects, GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, text, s->name);
@@ -3638,7 +3689,7 @@
}
}
slab_unlock(page);
- kfree(map);
+ bitmap_free(map);
#endif
}
@@ -3654,10 +3705,10 @@
BUG_ON(irqs_disabled());
spin_lock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &n->partial, lru) {
+ list_for_each_entry_safe(page, h, &n->partial, slab_list) {
if (!page->inuse) {
remove_partial(n, page);
- list_add(&page->lru, &discard);
+ list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
"Objects remaining in %s on __kmem_cache_shutdown()");
@@ -3665,7 +3716,7 @@
}
spin_unlock_irq(&n->list_lock);
- list_for_each_entry_safe(page, h, &discard, lru)
+ list_for_each_entry_safe(page, h, &discard, slab_list)
discard_slab(s, page);
}
@@ -3748,7 +3799,7 @@
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
- kasan_kmalloc(s, ret, size, flags);
+ ret = kasan_kmalloc(s, ret, size, flags);
return ret;
}
@@ -3759,14 +3810,17 @@
{
struct page *page;
void *ptr = NULL;
+ unsigned int order = get_order(size);
flags |= __GFP_COMP;
- page = alloc_pages_node(node, flags, get_order(size));
- if (page)
+ page = alloc_pages_node(node, flags, order);
+ if (page) {
ptr = page_address(page);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ 1 << order);
+ }
- kmalloc_large_node_hook(ptr, size, flags);
- return ptr;
+ return kmalloc_large_node_hook(ptr, size, flags);
}
void *__kmalloc_node(size_t size, gfp_t flags, int node)
@@ -3793,12 +3847,12 @@
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
- kasan_kmalloc(s, ret, size, flags);
+ ret = kasan_kmalloc(s, ret, size, flags);
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
-#endif
+#endif /* CONFIG_NUMA */
#ifdef CONFIG_HARDENED_USERCOPY
/*
@@ -3816,6 +3870,8 @@
unsigned int offset;
size_t object_size;
+ ptr = kasan_reset_tag(ptr);
+
/* Find object and usable object size. */
s = page->slab_cache;
@@ -3858,7 +3914,7 @@
}
#endif /* CONFIG_HARDENED_USERCOPY */
-static size_t __ksize(const void *object)
+size_t __ksize(const void *object)
{
struct page *page;
@@ -3869,22 +3925,12 @@
if (unlikely(!PageSlab(page))) {
WARN_ON(!PageCompound(page));
- return PAGE_SIZE << compound_order(page);
+ return page_size(page);
}
return slab_ksize(page->slab_cache);
}
-
-size_t ksize(const void *object)
-{
- size_t size = __ksize(object);
- /* We assume that ksize callers could use whole allocated area,
- * so we need to unpoison this area.
- */
- kasan_unpoison_shadow(object, size);
- return size;
-}
-EXPORT_SYMBOL(ksize);
+EXPORT_SYMBOL(__ksize);
void kfree(const void *x)
{
@@ -3898,9 +3944,13 @@
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
+ unsigned int order = compound_order(page);
+
BUG_ON(!PageCompound(page));
kfree_hook(object);
- __free_pages(page, compound_order(page));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
+ -(1 << order));
+ __free_pages(page, order);
return;
}
slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
@@ -3944,7 +3994,7 @@
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
- list_for_each_entry_safe(page, t, &n->partial, lru) {
+ list_for_each_entry_safe(page, t, &n->partial, slab_list) {
int free = page->objects - page->inuse;
/* Do not reread page->inuse */
@@ -3954,10 +4004,10 @@
BUG_ON(free <= 0);
if (free == page->objects) {
- list_move(&page->lru, &discard);
+ list_move(&page->slab_list, &discard);
n->nr_partial--;
} else if (free <= SHRINK_PROMOTE_MAX)
- list_move(&page->lru, promote + free - 1);
+ list_move(&page->slab_list, promote + free - 1);
}
/*
@@ -3970,7 +4020,7 @@
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, &discard, lru)
+ list_for_each_entry_safe(page, t, &discard, slab_list)
discard_slab(s, page);
if (slabs_node(s, node))
@@ -3981,7 +4031,7 @@
}
#ifdef CONFIG_MEMCG
-static void kmemcg_cache_deact_after_rcu(struct kmem_cache *s)
+void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
{
/*
* Called with all the locks held after a sched RCU grace period.
@@ -4007,14 +4057,8 @@
*/
slub_set_cpu_partial(s, 0);
s->min_partial = 0;
-
- /*
- * s->cpu_partial is checked locklessly (see put_cpu_partial), so
- * we have to make sure the change is visible before shrinking.
- */
- slab_deactivate_memcg_cache_rcu_sched(s, kmemcg_cache_deact_after_rcu);
}
-#endif
+#endif /* CONFIG_MEMCG */
static int slab_mem_going_offline_callback(void *arg)
{
@@ -4162,17 +4206,17 @@
for_each_kmem_cache_node(s, node, n) {
struct page *p;
- list_for_each_entry(p, &n->partial, lru)
+ list_for_each_entry(p, &n->partial, slab_list)
p->slab_cache = s;
#ifdef CONFIG_SLUB_DEBUG
- list_for_each_entry(p, &n->full, lru)
+ list_for_each_entry(p, &n->full, slab_list)
p->slab_cache = s;
#endif
}
slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s);
+ memcg_link_cache(s, NULL);
return s;
}
@@ -4213,7 +4257,7 @@
cpuhp_setup_state_nocalls(CPUHP_SLUB_DEAD, "slub:dead", NULL,
slub_cpu_dead);
- pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%d\n",
+ pr_info("SLUB: HWalign=%d, Order=%u-%u, MinObjects=%u, CPUs=%u, Nodes=%u\n",
cache_line_size(),
slub_min_order, slub_max_order, slub_min_objects,
nr_cpu_ids, nr_node_ids);
@@ -4383,7 +4427,7 @@
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru) {
+ list_for_each_entry(page, &n->partial, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4394,7 +4438,7 @@
if (!(s->flags & SLAB_STORE_USER))
goto out;
- list_for_each_entry(page, &n->full, lru) {
+ list_for_each_entry(page, &n->full, slab_list) {
validate_slab_slab(s, page, map);
count++;
}
@@ -4411,10 +4455,8 @@
{
int node;
unsigned long count = 0;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map)
return -ENOMEM;
@@ -4422,7 +4464,7 @@
flush_all(s);
for_each_kmem_cache_node(s, node, n)
count += validate_slab_node(s, n, map);
- kfree(map);
+ bitmap_free(map);
return count;
}
/*
@@ -4573,14 +4615,12 @@
unsigned long i;
struct loc_track t = { 0, 0, NULL };
int node;
- unsigned long *map = kmalloc_array(BITS_TO_LONGS(oo_objects(s->max)),
- sizeof(unsigned long),
- GFP_KERNEL);
struct kmem_cache_node *n;
+ unsigned long *map = bitmap_alloc(oo_objects(s->max), GFP_KERNEL);
if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
- kfree(map);
+ bitmap_free(map);
return sprintf(buf, "Out of memory\n");
}
/* Push back cpu slabs */
@@ -4594,9 +4634,9 @@
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->partial, lru)
+ list_for_each_entry(page, &n->partial, slab_list)
process_slab(&t, s, page, alloc, map);
- list_for_each_entry(page, &n->full, lru)
+ list_for_each_entry(page, &n->full, slab_list)
process_slab(&t, s, page, alloc, map);
spin_unlock_irqrestore(&n->list_lock, flags);
}
@@ -4646,17 +4686,18 @@
}
free_loc_track(&t);
- kfree(map);
+ bitmap_free(map);
if (!t.count)
len += sprintf(buf, "No data\n");
return len;
}
-#endif
+#endif /* CONFIG_SLUB_DEBUG */
#ifdef SLUB_RESILIENCY_TEST
static void __init resiliency_test(void)
{
u8 *p;
+ int type = KMALLOC_NORMAL;
BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10);
@@ -4669,7 +4710,7 @@
pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n",
p + 16);
- validate_slab_cache(kmalloc_caches[4]);
+ validate_slab_cache(kmalloc_caches[type][4]);
/* Hmmm... The next two are dangerous */
p = kzalloc(32, GFP_KERNEL);
@@ -4678,39 +4719,39 @@
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[5]);
+ validate_slab_cache(kmalloc_caches[type][5]);
p = kzalloc(64, GFP_KERNEL);
p += 64 + (get_cycles() & 0xff) * sizeof(void *);
*p = 0x56;
pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
p);
pr_err("If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches[6]);
+ validate_slab_cache(kmalloc_caches[type][6]);
pr_err("\nB. Corruption after free\n");
p = kzalloc(128, GFP_KERNEL);
kfree(p);
*p = 0x78;
pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[7]);
+ validate_slab_cache(kmalloc_caches[type][7]);
p = kzalloc(256, GFP_KERNEL);
kfree(p);
p[50] = 0x9a;
pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[8]);
+ validate_slab_cache(kmalloc_caches[type][8]);
p = kzalloc(512, GFP_KERNEL);
kfree(p);
p[512] = 0xab;
pr_err("\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches[9]);
+ validate_slab_cache(kmalloc_caches[type][9]);
}
#else
#ifdef CONFIG_SYSFS
static void resiliency_test(void) {};
#endif
-#endif
+#endif /* SLUB_RESILIENCY_TEST */
#ifdef CONFIG_SYSFS
enum slab_stat_type {
@@ -4794,7 +4835,17 @@
}
}
- get_online_mems();
+ /*
+ * It is impossible to take "mem_hotplug_lock" here with "kernfs_mutex"
+ * already held which will conflict with an existing lock order:
+ *
+ * mem_hotplug_lock->slab_mutex->kernfs_mutex
+ *
+ * We don't really need mem_hotplug_lock (to hold off
+ * slab_mem_going_offline_callback) here because slab's memory hot
+ * unplug code doesn't destroy the kmem_cache->node[] data.
+ */
+
#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
struct kmem_cache_node *n;
@@ -4835,7 +4886,6 @@
x += sprintf(buf + x, " N%d=%lu",
node, nodes[node]);
#endif
- put_online_mems();
kfree(nodes);
return x + sprintf(buf + x, "\n");
}
@@ -5256,7 +5306,7 @@
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink(s);
+ kmem_cache_shrink_all(s);
else
return -EINVAL;
return length;
@@ -5367,7 +5417,7 @@
STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node);
STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain);
-#endif
+#endif /* CONFIG_SLUB_STATS */
static struct attribute *slab_attrs[] = {
&slab_size_attr.attr,
@@ -5568,7 +5618,7 @@
if (buffer)
free_page((unsigned long)buffer);
-#endif
+#endif /* CONFIG_MEMCG */
}
static void kmem_cache_release(struct kobject *k)
@@ -5633,6 +5683,8 @@
*/
if (s->flags & SLAB_CACHE_DMA)
*p++ = 'd';
+ if (s->flags & SLAB_CACHE_DMA32)
+ *p++ = 'D';
if (s->flags & SLAB_RECLAIM_ACCOUNT)
*p++ = 'a';
if (s->flags & SLAB_CONSISTENCY_CHECKS)
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 8301293..200aef6 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -20,7 +20,7 @@
*/
#include <linux/mm.h>
#include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/memremap.h>
#include <linux/highmem.h>
#include <linux/slab.h>
@@ -42,8 +42,8 @@
unsigned long align,
unsigned long goal)
{
- return memblock_virt_alloc_try_nid_raw(size, align, goal,
- BOOTMEM_ALLOC_ACCESSIBLE, node);
+ return memblock_alloc_try_nid_raw(size, align, goal,
+ MEMBLOCK_ALLOC_ACCESSIBLE, node);
}
void * __meminit vmemmap_alloc_block(unsigned long size, int node)
@@ -245,19 +245,26 @@
return 0;
}
-struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page * __meminit __populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long start;
unsigned long end;
- struct page *map;
- map = pfn_to_page(pnum * PAGES_PER_SECTION);
- start = (unsigned long)map;
- end = (unsigned long)(map + PAGES_PER_SECTION);
+ /*
+ * The minimum granularity of memmap extensions is
+ * PAGES_PER_SUBSECTION as allocations are tracked in the
+ * 'subsection_map' bitmap of the section.
+ */
+ end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
+ pfn &= PAGE_SUBSECTION_MASK;
+ nr_pages = end - pfn;
+
+ start = (unsigned long) pfn_to_page(pfn);
+ end = start + nr_pages * sizeof(struct page);
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
- return map;
+ return pfn_to_page(pfn);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 10b07ee..f6891c1 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -5,12 +5,14 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/mmzone.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/spinlock.h>
#include <linux/vmalloc.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include "internal.h"
#include <asm/dma.h>
@@ -65,10 +67,15 @@
unsigned long array_size = SECTIONS_PER_ROOT *
sizeof(struct mem_section);
- if (slab_is_available())
+ if (slab_is_available()) {
section = kzalloc_node(array_size, GFP_KERNEL, nid);
- else
- section = memblock_virt_alloc_node(array_size, nid);
+ } else {
+ section = memblock_alloc_node(array_size, SMP_CACHE_BYTES,
+ nid);
+ if (!section)
+ panic("%s: Failed to allocate %lu bytes nid=%d\n",
+ __func__, array_size, nid);
+ }
return section;
}
@@ -78,8 +85,15 @@
unsigned long root = SECTION_NR_TO_ROOT(section_nr);
struct mem_section *section;
+ /*
+ * An existing section is possible in the sub-section hotplug
+ * case. First hot-add instantiates, follow-on hot-add reuses
+ * the existing section.
+ *
+ * The mem_hotplug_lock resolves the apparent race below.
+ */
if (mem_section[root])
- return -EEXIST;
+ return 0;
section = sparse_index_alloc(nid);
if (!section)
@@ -97,7 +111,7 @@
#endif
#ifdef CONFIG_SPARSEMEM_EXTREME
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
unsigned long root_nr;
struct mem_section *root = NULL;
@@ -116,9 +130,9 @@
return (root_nr * SECTIONS_PER_ROOT) + (ms - root);
}
#else
-int __section_nr(struct mem_section* ms)
+unsigned long __section_nr(struct mem_section *ms)
{
- return (int)(ms - mem_section[0]);
+ return (unsigned long)(ms - mem_section[0]);
}
#endif
@@ -173,10 +187,10 @@
* Keeping track of this gives us an easy way to break out of
* those loops early.
*/
-int __highest_present_section_nr;
+unsigned long __highest_present_section_nr;
static void section_mark_present(struct mem_section *ms)
{
- int section_nr = __section_nr(ms);
+ unsigned long section_nr = __section_nr(ms);
if (section_nr > __highest_present_section_nr)
__highest_present_section_nr = section_nr;
@@ -184,7 +198,7 @@
ms->section_mem_map |= SECTION_MARKED_PRESENT;
}
-static inline int next_present_section_nr(int section_nr)
+static inline unsigned long next_present_section_nr(unsigned long section_nr)
{
do {
section_nr++;
@@ -196,7 +210,7 @@
}
#define for_each_present_section_nr(start, section_nr) \
for (section_nr = next_present_section_nr(start-1); \
- ((section_nr >= 0) && \
+ ((section_nr != -1) && \
(section_nr <= __highest_present_section_nr)); \
section_nr = next_present_section_nr(section_nr))
@@ -205,6 +219,41 @@
return next_present_section_nr(-1);
}
+static void subsection_mask_set(unsigned long *map, unsigned long pfn,
+ unsigned long nr_pages)
+{
+ int idx = subsection_map_index(pfn);
+ int end = subsection_map_index(pfn + nr_pages - 1);
+
+ bitmap_set(map, idx, end - idx + 1);
+}
+
+void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
+{
+ int end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
+ unsigned long nr, start_sec = pfn_to_section_nr(pfn);
+
+ if (!nr_pages)
+ return;
+
+ for (nr = start_sec; nr <= end_sec; nr++) {
+ struct mem_section *ms;
+ unsigned long pfns;
+
+ pfns = min(nr_pages, PAGES_PER_SECTION
+ - (pfn & ~PAGE_SECTION_MASK));
+ ms = __nr_to_section(nr);
+ subsection_mask_set(ms->usage->subsection_map, pfn, pfns);
+
+ pr_debug("%s: sec: %lu pfns: %lu set(%d, %d)\n", __func__, nr,
+ pfns, subsection_map_index(pfn),
+ subsection_map_index(pfn + pfns - 1));
+
+ pfn += pfns;
+ nr_pages -= pfns;
+ }
+}
+
/* Record a memory area against a node. */
void __init memory_present(int nid, unsigned long start, unsigned long end)
{
@@ -216,7 +265,10 @@
size = sizeof(struct mem_section*) * NR_SECTION_ROOTS;
align = 1 << (INTERNODE_CACHE_SHIFT);
- mem_section = memblock_virt_alloc(size, align);
+ mem_section = memblock_alloc(size, align);
+ if (!mem_section)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
+ __func__, size, align);
}
#endif
@@ -239,6 +291,22 @@
}
/*
+ * Mark all memblocks as present using memory_present(). This is a
+ * convenience function that is useful for a number of arches
+ * to mark all of the system's memory as present during initialization.
+ */
+void __init memblocks_present(void)
+{
+ struct memblock_region *reg;
+
+ for_each_memblock(memory, reg) {
+ memory_present(memblock_get_region_node(reg),
+ memblock_region_memory_base_pfn(reg),
+ memblock_region_memory_end_pfn(reg));
+ }
+}
+
+/*
* Subtle, we encode the real pfn into the mem_map such that
* the identity pfn - section_mem_map will return the actual
* physical page frame number.
@@ -264,33 +332,31 @@
static void __meminit sparse_init_one_section(struct mem_section *ms,
unsigned long pnum, struct page *mem_map,
- unsigned long *pageblock_bitmap)
+ struct mem_section_usage *usage, unsigned long flags)
{
ms->section_mem_map &= ~SECTION_MAP_MASK;
- ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum) |
- SECTION_HAS_MEM_MAP;
- ms->pageblock_flags = pageblock_bitmap;
+ ms->section_mem_map |= sparse_encode_mem_map(mem_map, pnum)
+ | SECTION_HAS_MEM_MAP | flags;
+ ms->usage = usage;
}
-unsigned long usemap_size(void)
+static unsigned long usemap_size(void)
{
return BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS) * sizeof(unsigned long);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
-static unsigned long *__kmalloc_section_usemap(void)
+size_t mem_section_usage_size(void)
{
- return kmalloc(usemap_size(), GFP_KERNEL);
+ return sizeof(struct mem_section_usage) + usemap_size();
}
-#endif /* CONFIG_MEMORY_HOTPLUG */
#ifdef CONFIG_MEMORY_HOTREMOVE
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
+ struct mem_section_usage *usage;
unsigned long goal, limit;
- unsigned long *p;
int nid;
/*
* A page may contain usemaps for other sections preventing the
@@ -306,17 +372,16 @@
limit = goal + (1UL << PA_SECTION_SHIFT);
nid = early_pfn_to_nid(goal >> PAGE_SHIFT);
again:
- p = memblock_virt_alloc_try_nid_nopanic(size,
- SMP_CACHE_BYTES, goal, limit,
- nid);
- if (!p && limit) {
+ usage = memblock_alloc_try_nid(size, SMP_CACHE_BYTES, goal, limit, nid);
+ if (!usage && limit) {
limit = 0;
goto again;
}
- return p;
+ return usage;
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
unsigned long usemap_snr, pgdat_snr;
static unsigned long old_usemap_snr;
@@ -330,7 +395,7 @@
old_pgdat_snr = NR_MEM_SECTIONS;
}
- usemap_snr = pfn_to_section_nr(__pa(usemap) >> PAGE_SHIFT);
+ usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT);
pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT);
if (usemap_snr == pgdat_snr)
return;
@@ -358,14 +423,15 @@
usemap_snr, pgdat_snr, nid);
}
#else
-static unsigned long * __init
+static struct mem_section_usage * __init
sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat,
unsigned long size)
{
- return memblock_virt_alloc_node_nopanic(size, pgdat->node_id);
+ return memblock_alloc_node(size, SMP_CACHE_BYTES, pgdat->node_id);
}
-static void __init check_usemap_section_nr(int nid, unsigned long *usemap)
+static void __init check_usemap_section_nr(int nid,
+ struct mem_section_usage *usage)
{
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
@@ -382,18 +448,23 @@
return PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION);
}
-struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+struct page __init *__populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
unsigned long size = section_map_size();
struct page *map = sparse_buffer_alloc(size);
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
if (map)
return map;
- map = memblock_virt_alloc_try_nid(size,
- PAGE_SIZE, __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ map = memblock_alloc_try_nid(size,
+ PAGE_SIZE, addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
+ if (!map)
+ panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa\n",
+ __func__, size, PAGE_SIZE, nid, &addr);
+
return map;
}
#endif /* !CONFIG_SPARSEMEM_VMEMMAP */
@@ -401,13 +472,20 @@
static void *sparsemap_buf __meminitdata;
static void *sparsemap_buf_end __meminitdata;
+static inline void __meminit sparse_buffer_free(unsigned long size)
+{
+ WARN_ON(!sparsemap_buf || size == 0);
+ memblock_free_early(__pa(sparsemap_buf), size);
+}
+
static void __init sparse_buffer_init(unsigned long size, int nid)
{
+ phys_addr_t addr = __pa(MAX_DMA_ADDRESS);
WARN_ON(sparsemap_buf); /* forgot to call sparse_buffer_fini()? */
sparsemap_buf =
- memblock_virt_alloc_try_nid_raw(size, PAGE_SIZE,
- __pa(MAX_DMA_ADDRESS),
- BOOTMEM_ALLOC_ACCESSIBLE, nid);
+ memblock_alloc_try_nid_raw(size, PAGE_SIZE,
+ addr,
+ MEMBLOCK_ALLOC_ACCESSIBLE, nid);
sparsemap_buf_end = sparsemap_buf + size;
}
@@ -416,7 +494,7 @@
unsigned long size = sparsemap_buf_end - sparsemap_buf;
if (sparsemap_buf && size > 0)
- memblock_free_early(__pa(sparsemap_buf), size);
+ sparse_buffer_free(size);
sparsemap_buf = NULL;
}
@@ -425,11 +503,15 @@
void *ptr = NULL;
if (sparsemap_buf) {
- ptr = PTR_ALIGN(sparsemap_buf, size);
+ ptr = (void *) roundup((unsigned long)sparsemap_buf, size);
if (ptr + size > sparsemap_buf_end)
ptr = NULL;
- else
+ else {
+ /* Free redundant aligned space */
+ if ((unsigned long)(ptr - sparsemap_buf) > 0)
+ sparse_buffer_free((unsigned long)(ptr - sparsemap_buf));
sparsemap_buf = ptr + size;
+ }
}
return ptr;
}
@@ -446,32 +528,35 @@
unsigned long pnum_end,
unsigned long map_count)
{
- unsigned long pnum, usemap_longs, *usemap;
+ struct mem_section_usage *usage;
+ unsigned long pnum;
struct page *map;
- usemap_longs = BITS_TO_LONGS(SECTION_BLOCKFLAGS_BITS);
- usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
- usemap_size() *
- map_count);
- if (!usemap) {
+ usage = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nid),
+ mem_section_usage_size() * map_count);
+ if (!usage) {
pr_err("%s: node[%d] usemap allocation failed", __func__, nid);
goto failed;
}
sparse_buffer_init(map_count * section_map_size(), nid);
for_each_present_section_nr(pnum_begin, pnum) {
+ unsigned long pfn = section_nr_to_pfn(pnum);
+
if (pnum >= pnum_end)
break;
- map = sparse_mem_map_populate(pnum, nid, NULL);
+ map = __populate_section_memmap(pfn, PAGES_PER_SECTION,
+ nid, NULL);
if (!map) {
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
goto failed;
}
- check_usemap_section_nr(nid, usemap);
- sparse_init_one_section(__nr_to_section(pnum), pnum, map, usemap);
- usemap += usemap_longs;
+ check_usemap_section_nr(nid, usage);
+ sparse_init_one_section(__nr_to_section(pnum), pnum, map, usage,
+ SECTION_IS_EARLY);
+ usage = (void *) usage + mem_section_usage_size();
}
sparse_buffer_fini();
return;
@@ -539,7 +624,7 @@
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-/* Mark all memory sections within the pfn range as online */
+/* Mark all memory sections within the pfn range as offline */
void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
@@ -562,21 +647,20 @@
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
- struct vmem_altmap *altmap)
+static struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- /* This will make the necessary allocations eventually. */
- return sparse_mem_map_populate(pnum, nid, altmap);
+ return __populate_section_memmap(pfn, nr_pages, nid, altmap);
}
-static void __kfree_section_memmap(struct page *memmap,
+
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
vmemmap_free(start, end, altmap);
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long start = (unsigned long)memmap;
@@ -584,9 +668,9 @@
vmemmap_free(start, end, NULL);
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(void)
+struct page *populate_section_memmap(unsigned long pfn,
+ unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
struct page *page, *ret;
unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
@@ -607,15 +691,11 @@
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
- return __kmalloc_section_memmap();
-}
+ struct page *memmap = pfn_to_page(pfn);
-static void __kfree_section_memmap(struct page *memmap,
- struct vmem_altmap *altmap)
-{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
@@ -623,7 +703,6 @@
get_order(sizeof(struct page) * PAGES_PER_SECTION));
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_map_bootmem(struct page *memmap)
{
unsigned long maps_section_nr, removing_section_nr, i;
@@ -653,81 +732,182 @@
put_page_bootmem(page);
}
}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_SPARSEMEM_VMEMMAP */
-/*
- * returns the number of sections whose mem_maps were properly
- * set. If this is <=0, then that means that the passed-in
- * map was not consumed and must be freed.
+static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ DECLARE_BITMAP(tmp, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ bool section_is_early = early_section(ms);
+ struct page *memmap = NULL;
+ unsigned long *subsection_map = ms->usage
+ ? &ms->usage->subsection_map[0] : NULL;
+
+ subsection_mask_set(map, pfn, nr_pages);
+ if (subsection_map)
+ bitmap_and(tmp, map, subsection_map, SUBSECTIONS_PER_SECTION);
+
+ if (WARN(!subsection_map || !bitmap_equal(tmp, map, SUBSECTIONS_PER_SECTION),
+ "section already deactivated (%#lx + %ld)\n",
+ pfn, nr_pages))
+ return;
+
+ /*
+ * There are 3 cases to handle across two configurations
+ * (SPARSEMEM_VMEMMAP={y,n}):
+ *
+ * 1/ deactivation of a partial hot-added section (only possible
+ * in the SPARSEMEM_VMEMMAP=y case).
+ * a/ section was present at memory init
+ * b/ section was hot-added post memory init
+ * 2/ deactivation of a complete hot-added section
+ * 3/ deactivation of a complete section from memory init
+ *
+ * For 1/, when subsection_map is not emptied we will not be
+ * freeing the usage map, but still need to free the vmemmap
+ * range.
+ *
+ * For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
+ */
+ bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
+ if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ unsigned long section_nr = pfn_to_section_nr(pfn);
+
+ if (!section_is_early) {
+ kfree(ms->usage);
+ ms->usage = NULL;
+ }
+ memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
+ ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ }
+
+ if (section_is_early && memmap)
+ free_map_bootmem(memmap);
+ else
+ depopulate_section_memmap(pfn, nr_pages, altmap);
+}
+
+static struct page * __meminit section_activate(int nid, unsigned long pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
+{
+ DECLARE_BITMAP(map, SUBSECTIONS_PER_SECTION) = { 0 };
+ struct mem_section *ms = __pfn_to_section(pfn);
+ struct mem_section_usage *usage = NULL;
+ unsigned long *subsection_map;
+ struct page *memmap;
+ int rc = 0;
+
+ subsection_mask_set(map, pfn, nr_pages);
+
+ if (!ms->usage) {
+ usage = kzalloc(mem_section_usage_size(), GFP_KERNEL);
+ if (!usage)
+ return ERR_PTR(-ENOMEM);
+ ms->usage = usage;
+ }
+ subsection_map = &ms->usage->subsection_map[0];
+
+ if (bitmap_empty(map, SUBSECTIONS_PER_SECTION))
+ rc = -EINVAL;
+ else if (bitmap_intersects(map, subsection_map, SUBSECTIONS_PER_SECTION))
+ rc = -EEXIST;
+ else
+ bitmap_or(subsection_map, map, subsection_map,
+ SUBSECTIONS_PER_SECTION);
+
+ if (rc) {
+ if (usage)
+ ms->usage = NULL;
+ kfree(usage);
+ return ERR_PTR(rc);
+ }
+
+ /*
+ * The early init code does not consider partially populated
+ * initial sections, it simply assumes that memory will never be
+ * referenced. If we hot-add memory into such a section then we
+ * do not need to populate the memmap and can simply reuse what
+ * is already there.
+ */
+ if (nr_pages < PAGES_PER_SECTION && early_section(ms))
+ return pfn_to_page(pfn);
+
+ memmap = populate_section_memmap(pfn, nr_pages, nid, altmap);
+ if (!memmap) {
+ section_deactivate(pfn, nr_pages, altmap);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return memmap;
+}
+
+/**
+ * sparse_add_section - add a memory section, or populate an existing one
+ * @nid: The node to add section on
+ * @start_pfn: start pfn of the memory range
+ * @nr_pages: number of pfns to add in the section
+ * @altmap: device page map
+ *
+ * This is only intended for hotplug.
+ *
+ * Return:
+ * * 0 - On success.
+ * * -EEXIST - Section is already present.
+ * * -ENOMEM - Out of memory.
*/
-int __meminit sparse_add_one_section(struct pglist_data *pgdat,
- unsigned long start_pfn, struct vmem_altmap *altmap)
+int __meminit sparse_add_section(int nid, unsigned long start_pfn,
+ unsigned long nr_pages, struct vmem_altmap *altmap)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct mem_section *ms;
struct page *memmap;
- unsigned long *usemap;
- unsigned long flags;
int ret;
- /*
- * no locking for this, because it does its own
- * plus, it does a kmalloc
- */
- ret = sparse_index_init(section_nr, pgdat->node_id);
- if (ret < 0 && ret != -EEXIST)
+ ret = sparse_index_init(section_nr, nid);
+ if (ret < 0)
return ret;
- ret = 0;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, altmap);
- if (!memmap)
- return -ENOMEM;
- usemap = __kmalloc_section_usemap();
- if (!usemap) {
- __kfree_section_memmap(memmap, altmap);
- return -ENOMEM;
- }
- pgdat_resize_lock(pgdat, &flags);
+ memmap = section_activate(nid, start_pfn, nr_pages, altmap);
+ if (IS_ERR(memmap))
+ return PTR_ERR(memmap);
- ms = __pfn_to_section(start_pfn);
- if (ms->section_mem_map & SECTION_MARKED_PRESENT) {
- ret = -EEXIST;
- goto out;
- }
-
-#ifdef CONFIG_DEBUG_VM
/*
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- memset(memmap, PAGE_POISON_PATTERN, sizeof(struct page) * PAGES_PER_SECTION);
-#endif
+ page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ ms = __nr_to_section(section_nr);
+ set_section_nid(section_nr, nid);
section_mark_present(ms);
- sparse_init_one_section(ms, section_nr, memmap, usemap);
-out:
- pgdat_resize_unlock(pgdat, &flags);
- if (ret < 0) {
- kfree(usemap);
- __kfree_section_memmap(memmap, altmap);
- }
- return ret;
+ /* Align memmap to section boundary in the subsection case */
+ if (section_nr_to_pfn(section_nr) != start_pfn)
+ memmap = pfn_to_kaddr(section_nr_to_pfn(section_nr));
+ sparse_init_one_section(ms, section_nr, memmap, ms->usage, 0);
+
+ return 0;
}
-#ifdef CONFIG_MEMORY_HOTREMOVE
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
int i;
- if (!memmap)
+ /*
+ * A further optimization is to have per section refcounted
+ * num_poisoned_pages. But that would need more space per memmap, so
+ * for now just do a quick global check to speed up this routine in the
+ * absence of bad pages.
+ */
+ if (atomic_long_read(&num_poisoned_pages) == 0)
return;
for (i = 0; i < nr_pages; i++) {
if (PageHWPoison(&memmap[i])) {
- atomic_long_sub(1, &num_poisoned_pages);
+ num_poisoned_pages_dec();
ClearPageHWPoison(&memmap[i]);
}
}
@@ -738,54 +918,12 @@
}
#endif
-static void free_section_usemap(struct page *memmap, unsigned long *usemap,
+void sparse_remove_section(struct mem_section *ms, unsigned long pfn,
+ unsigned long nr_pages, unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct page *usemap_page;
-
- if (!usemap)
- return;
-
- usemap_page = virt_to_page(usemap);
- /*
- * Check to see if allocation came from hot-plug-add
- */
- if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
- kfree(usemap);
- if (memmap)
- __kfree_section_memmap(memmap, altmap);
- return;
- }
-
- /*
- * The usemap came from bootmem. This is packed with other usemaps
- * on the section which has pgdat at boot time. Just keep it as is now.
- */
-
- if (memmap)
- free_map_bootmem(memmap);
+ clear_hwpoisoned_pages(pfn_to_page(pfn) + map_offset,
+ nr_pages - map_offset);
+ section_deactivate(pfn, nr_pages, altmap);
}
-
-void sparse_remove_one_section(struct zone *zone, struct mem_section *ms,
- unsigned long map_offset, struct vmem_altmap *altmap)
-{
- struct page *memmap = NULL;
- unsigned long *usemap = NULL, flags;
- struct pglist_data *pgdat = zone->zone_pgdat;
-
- pgdat_resize_lock(pgdat, &flags);
- if (ms->section_mem_map) {
- usemap = ms->pageblock_flags;
- memmap = sparse_decode_mem_map(ms->section_mem_map,
- __section_nr(ms));
- ms->section_mem_map = 0;
- ms->pageblock_flags = NULL;
- }
- pgdat_resize_unlock(pgdat, &flags);
-
- clear_hwpoisoned_pages(memmap + map_offset,
- PAGES_PER_SECTION - map_offset);
- free_section_usemap(memmap, usemap, altmap);
-}
-#endif /* CONFIG_MEMORY_HOTREMOVE */
#endif /* CONFIG_MEMORY_HOTPLUG */
diff --git a/mm/swap.c b/mm/swap.c
index 26fc9b5..38c3fa4 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/swap.c
*
@@ -7,7 +8,7 @@
/*
* This file contains the default values for the operation of the
* Linux VM subsystem. Fine-tuning documentation can be found in
- * Documentation/sysctl/vm.txt.
+ * Documentation/admin-guide/sysctl/vm.rst.
* Started 18.12.91
* Swap aging added 23.2.95, Stephen Tweedie.
* Buffermem limits added 12.3.98, Rik van Riel.
@@ -29,7 +30,6 @@
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/backing-dev.h>
-#include <linux/memremap.h>
#include <linux/memcontrol.h>
#include <linux/gfp.h>
#include <linux/uio.h>
@@ -47,6 +47,7 @@
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
+static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
@@ -59,24 +60,24 @@
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(zone_lru_lock(zone), flags);
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ spin_lock_irqsave(&pgdat->lru_lock, flags);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(zone_lru_lock(zone), flags);
+ spin_unlock_irqrestore(&pgdat->lru_lock, flags);
}
__ClearPageWaiters(page);
- mem_cgroup_uncharge(page);
}
static void __put_single_page(struct page *page)
{
__page_cache_release(page);
+ mem_cgroup_uncharge(page);
free_unref_page(page);
}
@@ -127,7 +128,7 @@
while (!list_empty(pages)) {
struct page *victim;
- victim = list_entry(pages->prev, struct page, lru);
+ victim = lru_to_page(pages);
list_del(&victim->lru);
put_page(victim);
}
@@ -321,19 +322,14 @@
{
}
-static bool need_activate_page_drain(int cpu)
-{
- return false;
-}
-
void activate_page(struct page *page)
{
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
page = compound_head(page);
- spin_lock_irq(zone_lru_lock(zone));
- __activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_lock_irq(&pgdat->lru_lock);
+ __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
+ spin_unlock_irq(&pgdat->lru_lock);
}
#endif
@@ -520,7 +516,6 @@
del_page_from_lru_list(page, lruvec, lru + active);
ClearPageActive(page);
ClearPageReferenced(page);
- add_page_to_lru_list(page, lruvec, lru);
if (PageWriteback(page) || PageDirty(page)) {
/*
@@ -528,13 +523,14 @@
* It can make readahead confusing. But race window
* is _really_ small and it's non-critical problem.
*/
+ add_page_to_lru_list(page, lruvec, lru);
SetPageReclaim(page);
} else {
/*
* The page's writeback ends up during pagevec
* We moves tha page into tail of inactive.
*/
- list_move_tail(&page->lru, &lruvec->lists[lru]);
+ add_page_to_lru_list_tail(page, lruvec, lru);
__count_vm_event(PGROTATED);
}
@@ -543,6 +539,22 @@
update_page_reclaim_stat(lruvec, file, 0);
}
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
+ void *arg)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ int file = page_is_file_cache(page);
+ int lru = page_lru_base_type(page);
+
+ del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
+ ClearPageActive(page);
+ ClearPageReferenced(page);
+ add_page_to_lru_list(page, lruvec, lru);
+
+ __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
+ update_page_reclaim_stat(lruvec, file, 0);
+ }
+}
static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
void *arg)
@@ -595,6 +607,10 @@
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ if (pagevec_count(pvec))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+
pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -628,6 +644,26 @@
}
}
+/*
+ * deactivate_page - deactivate a page
+ * @page: page to deactivate
+ *
+ * deactivate_page() moves @page to the inactive list if @page was on the active
+ * list and was not an unevictable page. This is done to accelerate the reclaim
+ * of @page.
+ */
+void deactivate_page(struct page *page)
+{
+ if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+
+ get_page(page);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ put_cpu_var(lru_deactivate_pvecs);
+ }
+}
+
/**
* mark_page_lazyfree - make an anon page lazyfree
* @page: page to deactivate
@@ -654,13 +690,15 @@
put_cpu();
}
+#ifdef CONFIG_SMP
+
+static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
+
static void lru_add_drain_per_cpu(struct work_struct *dummy)
{
lru_add_drain();
}
-static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work);
-
/*
* Doesn't need any cpu hotplug locking because we do rely on per-cpu
* kworkers being shut down before our page_alloc_cpu_dead callback is
@@ -690,6 +728,7 @@
if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
+ pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
@@ -703,6 +742,12 @@
mutex_unlock(&lock);
}
+#else
+void lru_add_drain_all(void)
+{
+ lru_add_drain();
+}
+#endif
/**
* release_pages - batched put_page()
@@ -737,15 +782,20 @@
if (is_huge_zero_page(page))
continue;
- /* Device public page can not be huge page */
- if (is_device_public_page(page)) {
+ if (is_zone_device_page(page)) {
if (locked_pgdat) {
spin_unlock_irqrestore(&locked_pgdat->lru_lock,
flags);
locked_pgdat = NULL;
}
- put_devmap_managed_page(page);
- continue;
+ /*
+ * ZONE_DEVICE pages that return 'false' from
+ * put_devmap_managed_page() do not require special
+ * processing, and instead, expect a call to
+ * put_page_testzero().
+ */
+ if (put_devmap_managed_page(page))
+ continue;
}
page = compound_head(page);
@@ -824,8 +874,7 @@
VM_BUG_ON_PAGE(!PageHead(page), page);
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
- VM_BUG_ON(NR_CPUS != 1 &&
- !spin_is_locked(&lruvec_pgdat(lruvec)->lru_lock));
+ lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
if (!list)
SetPageLRU(page_tail);
@@ -837,17 +886,15 @@
get_page(page_tail);
list_add_tail(&page_tail->lru, list);
} else {
- struct list_head *list_head;
/*
* Head page has not yet been counted, as an hpage,
* so we must account for each subpage individually.
*
- * Use the standard add function to put page_tail on the list,
- * but then correct its position so they all end up in order.
+ * Put page_tail on the list at the correct position
+ * so they all end up in order.
*/
- add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail));
- list_head = page_tail->lru.prev;
- list_move_tail(&page_tail->lru, list_head);
+ add_page_to_lru_list_tail(page_tail, lruvec,
+ page_lru(page_tail));
}
if (!PageUnevictable(page))
@@ -866,7 +913,7 @@
SetPageLRU(page);
/*
* Page becomes evictable in two ways:
- * 1) Within LRU lock [munlock_vma_pages() and __munlock_pagevec()].
+ * 1) Within LRU lock [munlock_vma_page() and __munlock_pagevec()].
* 2) Before acquiring LRU lock to put the page to correct LRU and then
* a) do PageLRU check with lock [check_move_unevictable_pages]
* b) do PageLRU check before lock [clear_page_mlock]
@@ -965,7 +1012,7 @@
for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- if (!radix_tree_exceptional_entry(page))
+ if (!xa_is_value(page))
pvec->pages[j++] = page;
}
pvec->nr = j;
@@ -1002,7 +1049,7 @@
unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag)
+ xa_mark_t tag)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
PAGEVEC_SIZE, pvec->pages);
@@ -1012,7 +1059,7 @@
unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
struct address_space *mapping, pgoff_t *index, pgoff_t end,
- int tag, unsigned max_pages)
+ xa_mark_t tag, unsigned max_pages)
{
pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
@@ -1024,7 +1071,7 @@
*/
void __init swap_setup(void)
{
- unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT);
+ unsigned long megs = totalram_pages() >> (20 - PAGE_SHIFT);
/* Use a smaller cluster for small-memory machines */
if (megs < 16)
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ecee9c6..8e7ce9a 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -73,23 +73,24 @@
unsigned int i, j, nr;
unsigned long ret = 0;
struct address_space *spaces;
+ struct swap_info_struct *si;
- rcu_read_lock();
for (i = 0; i < MAX_SWAPFILES; i++) {
- /*
- * The corresponding entries in nr_swapper_spaces and
- * swapper_spaces will be reused only after at least
- * one grace period. So it is impossible for them
- * belongs to different usage.
- */
- nr = nr_swapper_spaces[i];
- spaces = rcu_dereference(swapper_spaces[i]);
- if (!nr || !spaces)
+ swp_entry_t entry = swp_entry(i, 1);
+
+ /* Keep get_swap_device() from warning about a bad swap entry */
+ if (!swp_swap_info(entry))
continue;
+ /* Prevent swapoff from freeing swapper_spaces */
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ nr = nr_swapper_spaces[i];
+ spaces = swapper_spaces[i];
for (j = 0; j < nr; j++)
ret += spaces[j].nrpages;
+ put_swap_device(si);
}
- rcu_read_unlock();
return ret;
}
@@ -107,14 +108,15 @@
}
/*
- * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
+ * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
{
- int error, i, nr = hpage_nr_pages(page);
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
+ XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
+ unsigned long i, nr = compound_nr(page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -123,73 +125,52 @@
page_ref_add(page, nr);
SetPageSwapCache(page);
- address_space = swap_address_space(entry);
- xa_lock_irq(&address_space->i_pages);
- for (i = 0; i < nr; i++) {
- set_page_private(page + i, entry.val + i);
- error = radix_tree_insert(&address_space->i_pages,
- idx + i, page + i);
- if (unlikely(error))
- break;
- }
- if (likely(!error)) {
+ do {
+ xas_lock_irq(&xas);
+ xas_create_range(&xas);
+ if (xas_error(&xas))
+ goto unlock;
+ for (i = 0; i < nr; i++) {
+ VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ set_page_private(page + i, entry.val + i);
+ xas_store(&xas, page);
+ xas_next(&xas);
+ }
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
- } else {
- /*
- * Only the context which have set SWAP_HAS_CACHE flag
- * would call add_to_swap_cache().
- * So add_to_swap_cache() doesn't returns -EEXIST.
- */
- VM_BUG_ON(error == -EEXIST);
- set_page_private(page + i, 0UL);
- while (i--) {
- radix_tree_delete(&address_space->i_pages, idx + i);
- set_page_private(page + i, 0UL);
- }
- ClearPageSwapCache(page);
- page_ref_sub(page, nr);
- }
- xa_unlock_irq(&address_space->i_pages);
+unlock:
+ xas_unlock_irq(&xas);
+ } while (xas_nomem(&xas, gfp));
- return error;
-}
+ if (!xas_error(&xas))
+ return 0;
-
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
-{
- int error;
-
- error = radix_tree_maybe_preload_order(gfp_mask, compound_order(page));
- if (!error) {
- error = __add_to_swap_cache(page, entry);
- radix_tree_preload_end();
- }
- return error;
+ ClearPageSwapCache(page);
+ page_ref_sub(page, nr);
+ return xas_error(&xas);
}
/*
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
{
- struct address_space *address_space;
+ struct address_space *address_space = swap_address_space(entry);
int i, nr = hpage_nr_pages(page);
- swp_entry_t entry;
- pgoff_t idx;
+ pgoff_t idx = swp_offset(entry);
+ XA_STATE(xas, &address_space->i_pages, idx);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
VM_BUG_ON_PAGE(PageWriteback(page), page);
- entry.val = page_private(page);
- address_space = swap_address_space(entry);
- idx = swp_offset(entry);
for (i = 0; i < nr; i++) {
- radix_tree_delete(&address_space->i_pages, idx + i);
+ void *entry = xas_store(&xas, NULL);
+ VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
+ xas_next(&xas);
}
ClearPageSwapCache(page);
address_space->nrpages -= nr;
@@ -217,7 +198,7 @@
return 0;
/*
- * Radix-tree node allocations from PF_MEMALLOC contexts could
+ * XArray node allocations from PF_MEMALLOC contexts could
* completely exhaust the page allocator. __GFP_NOMEMALLOC
* stops emergency reserves from being allocated.
*
@@ -229,7 +210,6 @@
*/
err = add_to_swap_cache(page, entry,
__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
- /* -ENOMEM radix-tree allocation failure */
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -263,14 +243,11 @@
*/
void delete_from_swap_cache(struct page *page)
{
- swp_entry_t entry;
- struct address_space *address_space;
+ swp_entry_t entry = { .val = page_private(page) };
+ struct address_space *address_space = swap_address_space(entry);
- entry.val = page_private(page);
-
- address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, entry);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
@@ -334,8 +311,13 @@
unsigned long addr)
{
struct page *page;
+ struct swap_info_struct *si;
+ si = get_swap_device(entry);
+ if (!si)
+ return NULL;
page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ put_swap_device(si);
INC_CACHE_INFO(find_total);
if (page) {
@@ -378,8 +360,8 @@
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page, *new_page = NULL;
- struct address_space *swapper_space = swap_address_space(entry);
+ struct page *found_page = NULL, *new_page = NULL;
+ struct swap_info_struct *si;
int err;
*new_page_allocated = false;
@@ -389,7 +371,12 @@
* called after lookup_swap_cache() failed, re-calling
* that would confuse statistics.
*/
- found_page = find_get_page(swapper_space, swp_offset(entry));
+ si = get_swap_device(entry);
+ if (!si)
+ break;
+ found_page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
+ put_swap_device(si);
if (found_page)
break;
@@ -414,18 +401,10 @@
}
/*
- * call radix_tree_preload() while we can wait.
- */
- err = radix_tree_maybe_preload(gfp_mask & GFP_KERNEL);
- if (err)
- break;
-
- /*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
if (err == -EEXIST) {
- radix_tree_preload_end();
/*
* We might race against get_swap_page() and stumble
* across a SWAP_HAS_CACHE swap_map entry whose page
@@ -433,26 +412,20 @@
*/
cond_resched();
continue;
- }
- if (err) { /* swp entry is obsolete ? */
- radix_tree_preload_end();
+ } else if (err) /* swp entry is obsolete ? */
break;
- }
- /* May fail (-ENOMEM) if radix-tree node allocation failed. */
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
- err = __add_to_swap_cache(new_page, entry);
+ err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
if (likely(!err)) {
- radix_tree_preload_end();
- /*
- * Initiate read into locked page and return.
- */
+ /* Initiate read into locked page */
+ SetPageWorkingset(new_page);
lru_cache_add_anon(new_page);
*new_page_allocated = true;
return new_page;
}
- radix_tree_preload_end();
__ClearPageLocked(new_page);
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -561,7 +534,7 @@
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold down_read on the vma->vm_mm if vmf->vma is not NULL.
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@@ -581,6 +554,13 @@
if (!mask)
goto skip;
+ /* Test swap type to make sure the dereference is safe */
+ if (likely(si->flags & (SWP_BLKDEV | SWP_FS))) {
+ struct inode *inode = si->swap_file->f_mapping->host;
+ if (inode_read_congested(inode))
+ goto skip;
+ }
+
do_poll = false;
/* Read a page_cluster sized and aligned cluster around offset. */
start_offset = offset & ~mask;
@@ -625,27 +605,23 @@
return -ENOMEM;
for (i = 0; i < nr; i++) {
space = spaces + i;
- INIT_RADIX_TREE(&space->i_pages, GFP_ATOMIC|__GFP_NOWARN);
+ xa_init_flags(&space->i_pages, XA_FLAGS_LOCK_IRQ);
atomic_set(&space->i_mmap_writable, 0);
space->a_ops = &swap_aops;
/* swap cache doesn't use writeback related tags */
mapping_set_no_writeback_tags(space);
}
nr_swapper_spaces[type] = nr;
- rcu_assign_pointer(swapper_spaces[type], spaces);
+ swapper_spaces[type] = spaces;
return 0;
}
void exit_swap_address_space(unsigned int type)
{
- struct address_space *spaces;
-
- spaces = swapper_spaces[type];
+ kvfree(swapper_spaces[type]);
nr_swapper_spaces[type] = 0;
- rcu_assign_pointer(swapper_spaces[type], NULL);
- synchronize_rcu();
- kvfree(spaces);
+ swapper_spaces[type] = NULL;
}
static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
@@ -729,6 +705,20 @@
pte_unmap(orig_pte);
}
+/**
+ * swap_vma_readahead - swap in pages in hope we need them soon
+ * @fentry: swap entry of this memory
+ * @gfp_mask: memory allocation flags
+ * @vmf: fault information
+ *
+ * Returns the struct page for entry and addr, after queueing swapin.
+ *
+ * Primitive swap readahead code. We simply read in a few pages whose
+ * virtual addresses are around the fault address in the same vma.
+ *
+ * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ *
+ */
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 8810a6d..dab4352 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/swapfile.c
*
@@ -98,37 +99,71 @@
atomic_t nr_rotate_swap = ATOMIC_INIT(0);
+static struct swap_info_struct *swap_type_to_swap_info(int type)
+{
+ if (type >= READ_ONCE(nr_swapfiles))
+ return NULL;
+
+ smp_rmb(); /* Pairs with smp_wmb in alloc_swap_info. */
+ return READ_ONCE(swap_info[type]);
+}
+
static inline unsigned char swap_count(unsigned char ent)
{
return ent & ~SWAP_HAS_CACHE; /* may include COUNT_CONTINUED flag */
}
+/* Reclaim the swap entry anyway if possible */
+#define TTRS_ANYWAY 0x1
+/*
+ * Reclaim the swap entry if there are no more mappings of the
+ * corresponding page
+ */
+#define TTRS_UNMAPPED 0x2
+/* Reclaim the swap entry if swap is getting full */
+#define TTRS_FULL 0x4
+
/* returns 1 if swap entry is freed */
-static int
-__try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset)
+static int __try_to_reclaim_swap(struct swap_info_struct *si,
+ unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
struct page *page;
int ret = 0;
- page = find_get_page(swap_address_space(entry), swp_offset(entry));
+ page = find_get_page(swap_address_space(entry), offset);
if (!page)
return 0;
/*
- * This function is called from scan_swap_map() and it's called
- * by vmscan.c at reclaiming pages. So, we hold a lock on a page, here.
- * We have to use trylock for avoiding deadlock. This is a special
+ * When this function is called from scan_swap_map_slots(), which is
+ * called by vmscan.c while reclaiming pages, we hold a lock on a page
+ * here. We have to use trylock for avoiding deadlock. This is a special
* case and you should use try_to_free_swap() with explicit lock_page()
* in usual operations.
*/
if (trylock_page(page)) {
- ret = try_to_free_swap(page);
+ if ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !page_mapped(page)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(page)))
+ ret = try_to_free_swap(page);
unlock_page(page);
}
put_page(page);
return ret;
}
+static inline struct swap_extent *first_se(struct swap_info_struct *sis)
+{
+ struct rb_node *rb = rb_first(&sis->swap_extent_root);
+ return rb_entry(rb, struct swap_extent, rb_node);
+}
+
+static inline struct swap_extent *next_se(struct swap_extent *se)
+{
+ struct rb_node *rb = rb_next(&se->rb_node);
+ return rb ? rb_entry(rb, struct swap_extent, rb_node) : NULL;
+}
+
/*
* swapon tell device that all the old swap contents can be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -141,7 +176,7 @@
int err = 0;
/* Do not discard the swap header page! */
- se = &si->first_swap_extent;
+ se = first_se(si);
start_block = (se->start_block + 1) << (PAGE_SHIFT - 9);
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
@@ -152,7 +187,7 @@
cond_resched();
}
- list_for_each_entry(se, &si->first_swap_extent.list, list) {
+ for (se = next_se(se); se; se = next_se(se)) {
start_block = se->start_block << (PAGE_SHIFT - 9);
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
@@ -166,6 +201,26 @@
return err; /* That will often be -EOPNOTSUPP */
}
+static struct swap_extent *
+offset_to_swap_extent(struct swap_info_struct *sis, unsigned long offset)
+{
+ struct swap_extent *se;
+ struct rb_node *rb;
+
+ rb = sis->swap_extent_root.rb_node;
+ while (rb) {
+ se = rb_entry(rb, struct swap_extent, rb_node);
+ if (offset < se->start_page)
+ rb = rb->rb_left;
+ else if (offset >= se->start_page + se->nr_pages)
+ rb = rb->rb_right;
+ else
+ return se;
+ }
+ /* It *must* be present */
+ BUG();
+}
+
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -173,32 +228,25 @@
static void discard_swap_cluster(struct swap_info_struct *si,
pgoff_t start_page, pgoff_t nr_pages)
{
- struct swap_extent *se = si->curr_swap_extent;
- int found_extent = 0;
+ struct swap_extent *se = offset_to_swap_extent(si, start_page);
while (nr_pages) {
- if (se->start_page <= start_page &&
- start_page < se->start_page + se->nr_pages) {
- pgoff_t offset = start_page - se->start_page;
- sector_t start_block = se->start_block + offset;
- sector_t nr_blocks = se->nr_pages - offset;
+ pgoff_t offset = start_page - se->start_page;
+ sector_t start_block = se->start_block + offset;
+ sector_t nr_blocks = se->nr_pages - offset;
- if (nr_blocks > nr_pages)
- nr_blocks = nr_pages;
- start_page += nr_blocks;
- nr_pages -= nr_blocks;
+ if (nr_blocks > nr_pages)
+ nr_blocks = nr_pages;
+ start_page += nr_blocks;
+ nr_pages -= nr_blocks;
- if (!found_extent++)
- si->curr_swap_extent = se;
+ start_block <<= PAGE_SHIFT - 9;
+ nr_blocks <<= PAGE_SHIFT - 9;
+ if (blkdev_issue_discard(si->bdev, start_block,
+ nr_blocks, GFP_NOIO, 0))
+ break;
- start_block <<= PAGE_SHIFT - 9;
- nr_blocks <<= PAGE_SHIFT - 9;
- if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
- break;
- }
-
- se = list_next_entry(se, list);
+ se = next_se(se);
}
}
@@ -780,7 +828,7 @@
int swap_was_freed;
unlock_cluster(ci);
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed)
@@ -919,6 +967,7 @@
struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
+ memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
cluster_set_count_flag(ci, 0, 0);
free_cluster(si, idx);
unlock_cluster(ci);
@@ -989,7 +1038,7 @@
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FILE))
+ if (!(si->flags & SWP_FS))
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -1030,12 +1079,14 @@
/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
- struct swap_info_struct *si;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
pgoff_t offset;
- si = swap_info[type];
+ if (!si)
+ goto fail;
+
spin_lock(&si->lock);
- if (si && (si->flags & SWP_WRITEOK)) {
+ if (si->flags & SWP_WRITEOK) {
atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
@@ -1046,20 +1097,20 @@
atomic_long_inc(&nr_swap_pages);
}
spin_unlock(&si->lock);
+fail:
return (swp_entry_t) {0};
}
static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
- unsigned long offset, type;
+ unsigned long offset;
if (!entry.val)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
+ p = swp_swap_info(entry);
+ if (!p)
goto bad_nofile;
- p = swap_info[type];
if (!(p->flags & SWP_USED))
goto bad_device;
offset = swp_offset(entry);
@@ -1160,6 +1211,69 @@
return usage;
}
+/*
+ * Check whether swap entry is valid in the swap device. If so,
+ * return pointer to swap_info_struct, and keep the swap entry valid
+ * via preventing the swap device from being swapoff, until
+ * put_swap_device() is called. Otherwise return NULL.
+ *
+ * The entirety of the RCU read critical section must come before the
+ * return from or after the call to synchronize_rcu() in
+ * enable_swap_info() or swapoff(). So if "si->flags & SWP_VALID" is
+ * true, the si->map, si->cluster_info, etc. must be valid in the
+ * critical section.
+ *
+ * Notice that swapoff or swapoff+swapon can still happen before the
+ * rcu_read_lock() in get_swap_device() or after the rcu_read_unlock()
+ * in put_swap_device() if there isn't any other way to prevent
+ * swapoff, such as page lock, page table lock, etc. The caller must
+ * be prepared for that. For example, the following situation is
+ * possible.
+ *
+ * CPU1 CPU2
+ * do_swap_page()
+ * ... swapoff+swapon
+ * __read_swap_cache_async()
+ * swapcache_prepare()
+ * __swap_duplicate()
+ * // check swap_map
+ * // verify PTE not changed
+ *
+ * In __swap_duplicate(), the swap_map needs to be checked before
+ * changing partly because the specified swap entry may be for another
+ * swap device which has been swapoff. And in do_swap_page(), after
+ * the page is read from the swap device, the PTE is verified not
+ * changed with the page table locked to check whether the swap device
+ * has been swapoff or swapoff+swapon.
+ */
+struct swap_info_struct *get_swap_device(swp_entry_t entry)
+{
+ struct swap_info_struct *si;
+ unsigned long offset;
+
+ if (!entry.val)
+ goto out;
+ si = swp_swap_info(entry);
+ if (!si)
+ goto bad_nofile;
+
+ rcu_read_lock();
+ if (!(si->flags & SWP_VALID))
+ goto unlock_out;
+ offset = swp_offset(entry);
+ if (offset >= si->max)
+ goto unlock_out;
+
+ return si;
+bad_nofile:
+ pr_err("%s: %s%08lx\n", __func__, Bad_file, entry.val);
+out:
+ return NULL;
+unlock_out:
+ rcu_read_unlock();
+ return NULL;
+}
+
static unsigned char __swap_entry_free(struct swap_info_struct *p,
swp_entry_t entry, unsigned char usage)
{
@@ -1169,6 +1283,8 @@
ci = lock_cluster_or_swap_info(p, offset);
usage = __swap_entry_free_locked(p, offset, usage);
unlock_cluster_or_swap_info(p, ci);
+ if (!usage)
+ free_swap_slot(entry);
return usage;
}
@@ -1199,10 +1315,8 @@
struct swap_info_struct *p;
p = _swap_info_get(entry);
- if (p) {
- if (!__swap_entry_free(p, entry, 1))
- free_swap_slot(entry);
- }
+ if (p)
+ __swap_entry_free(p, entry, 1);
}
/*
@@ -1237,9 +1351,6 @@
if (free_entries == SWAPFILE_CLUSTER) {
unlock_cluster_or_swap_info(si, ci);
spin_lock(&si->lock);
- ci = lock_cluster(si, offset);
- memset(map, 0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
swap_free_cluster(si, idx);
spin_unlock(&si->lock);
@@ -1334,11 +1445,18 @@
return count;
}
-int __swap_count(struct swap_info_struct *si, swp_entry_t entry)
+int __swap_count(swp_entry_t entry)
{
+ struct swap_info_struct *si;
pgoff_t offset = swp_offset(entry);
+ int count = 0;
- return swap_count(si->swap_map[offset]);
+ si = get_swap_device(entry);
+ if (si) {
+ count = swap_count(si->swap_map[offset]);
+ put_swap_device(si);
+ }
+ return count;
}
static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
@@ -1363,9 +1481,11 @@
int count = 0;
struct swap_info_struct *si;
- si = __swap_info_get(entry);
- if (si)
+ si = get_swap_device(entry);
+ if (si) {
count = swap_swapcount(si, entry);
+ put_swap_device(si);
+ }
return count;
}
@@ -1612,7 +1732,6 @@
int free_swap_and_cache(swp_entry_t entry)
{
struct swap_info_struct *p;
- struct page *page = NULL;
unsigned char count;
if (non_swap_entry(entry))
@@ -1622,30 +1741,9 @@
if (p) {
count = __swap_entry_free(p, entry, 1);
if (count == SWAP_HAS_CACHE &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
- if (page && !trylock_page(page)) {
- put_page(page);
- page = NULL;
- }
- } else if (!count)
- free_swap_slot(entry);
- }
- if (page) {
- /*
- * Not mapped elsewhere, or swap space full? Free it!
- * Also recheck PageSwapCache now page is locked (above).
- */
- if (PageSwapCache(page) && !PageWriteback(page) &&
- (!page_mapped(page) || mem_cgroup_swap_full(page)) &&
- !swap_page_trans_huge_swapped(p, entry)) {
- page = compound_head(page);
- delete_from_swap_cache(page);
- SetPageDirty(page);
- }
- unlock_page(page);
- put_page(page);
+ !swap_page_trans_huge_swapped(p, entry))
+ __try_to_reclaim_swap(p, swp_offset(entry),
+ TTRS_UNMAPPED | TTRS_FULL);
}
return p != NULL;
}
@@ -1682,7 +1780,7 @@
return type;
}
if (bdev == sis->bdev) {
- struct swap_extent *se = &sis->first_swap_extent;
+ struct swap_extent *se = first_se(sis);
if (se->start_block == offset) {
if (bdev_p)
@@ -1708,10 +1806,9 @@
sector_t swapdev_block(int type, pgoff_t offset)
{
struct block_device *bdev;
+ struct swap_info_struct *si = swap_type_to_swap_info(type);
- if ((unsigned int)type >= nr_swapfiles)
- return 0;
- if (!(swap_info[type]->flags & SWP_WRITEOK))
+ if (!si || !(si->flags & SWP_WRITEOK))
return 0;
return map_swap_entry(swp_entry(type, offset), &bdev);
}
@@ -1810,44 +1907,77 @@
}
static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned long addr, unsigned long end,
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
- pte_t swp_pte = swp_entry_to_pte(entry);
+ struct page *page;
+ swp_entry_t entry;
pte_t *pte;
+ struct swap_info_struct *si;
+ unsigned long offset;
int ret = 0;
+ volatile unsigned char *swap_map;
- /*
- * We don't actually need pte lock while scanning for swp_pte: since
- * we hold page lock and mmap_sem, swp_pte cannot be inserted into the
- * page table while we're scanning; though it could get zapped, and on
- * some architectures (e.g. x86_32 with PAE) we might catch a glimpse
- * of unmatched parts which look like swp_pte, so unuse_pte must
- * recheck under pte lock. Scanning without pte lock lets it be
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
- */
+ si = swap_info[type];
pte = pte_offset_map(pmd, addr);
do {
- /*
- * swapoff spends a _lot_ of time in this loop!
- * Test inline before going to call unuse_pte.
- */
- if (unlikely(pte_same_as_swp(*pte, swp_pte))) {
- pte_unmap(pte);
- ret = unuse_pte(vma, pmd, addr, entry, page);
- if (ret)
- goto out;
- pte = pte_offset_map(pmd, addr);
+ struct vm_fault vmf;
+
+ if (!is_swap_pte(*pte))
+ continue;
+
+ entry = pte_to_swp_entry(*pte);
+ if (swp_type(entry) != type)
+ continue;
+
+ offset = swp_offset(entry);
+ if (frontswap && !frontswap_test(si, offset))
+ continue;
+
+ pte_unmap(pte);
+ swap_map = &si->swap_map[offset];
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ if (!page) {
+ if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
+ goto try_next;
+ return -ENOMEM;
}
+
+ lock_page(page);
+ wait_on_page_writeback(page);
+ ret = unuse_pte(vma, pmd, addr, entry, page);
+ if (ret < 0) {
+ unlock_page(page);
+ put_page(page);
+ goto out;
+ }
+
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ if (*fs_pages_to_unuse && !--(*fs_pages_to_unuse)) {
+ ret = FRONTSWAP_PAGES_UNUSED;
+ goto out;
+ }
+try_next:
+ pte = pte_offset_map(pmd, addr);
} while (pte++, addr += PAGE_SIZE, addr != end);
pte_unmap(pte - 1);
+
+ ret = 0;
out:
return ret;
}
static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pmd_t *pmd;
unsigned long next;
@@ -1859,7 +1989,8 @@
next = pmd_addr_end(addr, end);
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
continue;
- ret = unuse_pte_range(vma, pmd, addr, next, entry, page);
+ ret = unuse_pte_range(vma, pmd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pmd++, addr = next, addr != end);
@@ -1868,7 +1999,8 @@
static inline int unuse_pud_range(struct vm_area_struct *vma, p4d_t *p4d,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
pud_t *pud;
unsigned long next;
@@ -1879,7 +2011,8 @@
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- ret = unuse_pmd_range(vma, pud, addr, next, entry, page);
+ ret = unuse_pmd_range(vma, pud, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pud++, addr = next, addr != end);
@@ -1888,7 +2021,8 @@
static inline int unuse_p4d_range(struct vm_area_struct *vma, pgd_t *pgd,
unsigned long addr, unsigned long end,
- swp_entry_t entry, struct page *page)
+ unsigned int type, bool frontswap,
+ unsigned long *fs_pages_to_unuse)
{
p4d_t *p4d;
unsigned long next;
@@ -1899,78 +2033,66 @@
next = p4d_addr_end(addr, end);
if (p4d_none_or_clear_bad(p4d))
continue;
- ret = unuse_pud_range(vma, p4d, addr, next, entry, page);
+ ret = unuse_pud_range(vma, p4d, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (p4d++, addr = next, addr != end);
return 0;
}
-static int unuse_vma(struct vm_area_struct *vma,
- swp_entry_t entry, struct page *page)
+static int unuse_vma(struct vm_area_struct *vma, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
pgd_t *pgd;
unsigned long addr, end, next;
int ret;
- if (page_anon_vma(page)) {
- addr = page_address_in_vma(page, vma);
- if (addr == -EFAULT)
- return 0;
- else
- end = addr + PAGE_SIZE;
- } else {
- addr = vma->vm_start;
- end = vma->vm_end;
- }
+ addr = vma->vm_start;
+ end = vma->vm_end;
pgd = pgd_offset(vma->vm_mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- ret = unuse_p4d_range(vma, pgd, addr, next, entry, page);
+ ret = unuse_p4d_range(vma, pgd, addr, next, type,
+ frontswap, fs_pages_to_unuse);
if (ret)
return ret;
} while (pgd++, addr = next, addr != end);
return 0;
}
-static int unuse_mm(struct mm_struct *mm,
- swp_entry_t entry, struct page *page)
+static int unuse_mm(struct mm_struct *mm, unsigned int type,
+ bool frontswap, unsigned long *fs_pages_to_unuse)
{
struct vm_area_struct *vma;
int ret = 0;
- if (!down_read_trylock(&mm->mmap_sem)) {
- /*
- * Activate page so shrink_inactive_list is unlikely to unmap
- * its ptes while lock is dropped, so swapoff can make progress.
- */
- activate_page(page);
- unlock_page(page);
- down_read(&mm->mmap_sem);
- lock_page(page);
- }
+ down_read(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
- if (vma->anon_vma && (ret = unuse_vma(vma, entry, page)))
- break;
+ if (vma->anon_vma) {
+ ret = unuse_vma(vma, type, frontswap,
+ fs_pages_to_unuse);
+ if (ret)
+ break;
+ }
cond_resched();
}
up_read(&mm->mmap_sem);
- return (ret < 0)? ret: 0;
+ return ret;
}
/*
* Scan swap_map (or frontswap_map if frontswap parameter is true)
- * from current position to next entry still in use.
- * Recycle to start on reaching the end, returning 0 when empty.
+ * from current position to next entry still in use. Return 0
+ * if there are no inuse entries after prev till end of the map.
*/
static unsigned int find_next_to_unuse(struct swap_info_struct *si,
unsigned int prev, bool frontswap)
{
- unsigned int max = si->max;
- unsigned int i = prev;
+ unsigned int i;
unsigned char count;
/*
@@ -1979,20 +2101,7 @@
* hits are okay, and sys_swapoff() has already prevented new
* allocations from this area (while holding swap_lock).
*/
- for (;;) {
- if (++i >= max) {
- if (!prev) {
- i = 0;
- break;
- }
- /*
- * No entries in use at top of swap_map,
- * loop back to start and recheck there.
- */
- max = prev + 1;
- prev = 0;
- i = 1;
- }
+ for (i = prev + 1; i < si->max; i++) {
count = READ_ONCE(si->swap_map[i]);
if (count && swap_count(count) != SWAP_MAP_BAD)
if (!frontswap || frontswap_test(si, i))
@@ -2000,239 +2109,123 @@
if ((i % LATENCY_LIMIT) == 0)
cond_resched();
}
+
+ if (i == si->max)
+ i = 0;
+
return i;
}
/*
- * We completely avoid races by reading each swap page in advance,
- * and then search for the process using it. All the necessary
- * page table adjustments can then be made atomically.
- *
- * if the boolean frontswap is true, only unuse pages_to_unuse pages;
+ * If the boolean frontswap is true, only unuse pages_to_unuse pages;
* pages_to_unuse==0 means all pages; ignored if frontswap is false
*/
int try_to_unuse(unsigned int type, bool frontswap,
unsigned long pages_to_unuse)
{
+ struct mm_struct *prev_mm;
+ struct mm_struct *mm;
+ struct list_head *p;
+ int retval = 0;
struct swap_info_struct *si = swap_info[type];
- struct mm_struct *start_mm;
- volatile unsigned char *swap_map; /* swap_map is accessed without
- * locking. Mark it as volatile
- * to prevent compiler doing
- * something odd.
- */
- unsigned char swcount;
struct page *page;
swp_entry_t entry;
- unsigned int i = 0;
- int retval = 0;
+ unsigned int i;
- /*
- * When searching mms for an entry, a good strategy is to
- * start at the first mm we freed the previous entry from
- * (though actually we don't notice whether we or coincidence
- * freed the entry). Initialize this start_mm with a hold.
- *
- * A simpler strategy would be to start at the last mm we
- * freed the previous entry from; but that would take less
- * advantage of mmlist ordering, which clusters forked mms
- * together, child after parent. If we race with dup_mmap(), we
- * prefer to resolve parent before child, lest we miss entries
- * duplicated after we scanned child: using last mm would invert
- * that.
- */
- start_mm = &init_mm;
- mmget(&init_mm);
+ if (!si->inuse_pages)
+ return 0;
- /*
- * Keep on scanning until all entries have gone. Usually,
- * one pass through swap_map is enough, but not necessarily:
- * there are races when an instance of an entry might be missed.
- */
- while ((i = find_next_to_unuse(si, i, frontswap)) != 0) {
- if (signal_pending(current)) {
- retval = -EINTR;
- break;
- }
+ if (!frontswap)
+ pages_to_unuse = 0;
- /*
- * Get a page for the entry, using the existing swap
- * cache page if there is one. Otherwise, get a clean
- * page and read the swap into it.
- */
- swap_map = &si->swap_map[i];
- entry = swp_entry(type, i);
- page = read_swap_cache_async(entry,
- GFP_HIGHUSER_MOVABLE, NULL, 0, false);
- if (!page) {
- /*
- * Either swap_duplicate() failed because entry
- * has been freed independently, and will not be
- * reused since sys_swapoff() already disabled
- * allocation from here, or alloc_page() failed.
- */
- swcount = *swap_map;
- /*
- * We don't hold lock here, so the swap entry could be
- * SWAP_MAP_BAD (when the cluster is discarding).
- * Instead of fail out, We can just skip the swap
- * entry because swapoff will wait for discarding
- * finish anyway.
- */
- if (!swcount || swcount == SWAP_MAP_BAD)
- continue;
- retval = -ENOMEM;
- break;
- }
+retry:
+ retval = shmem_unuse(type, frontswap, &pages_to_unuse);
+ if (retval)
+ goto out;
- /*
- * Don't hold on to start_mm if it looks like exiting.
- */
- if (atomic_read(&start_mm->mm_users) == 1) {
- mmput(start_mm);
- start_mm = &init_mm;
- mmget(&init_mm);
- }
+ prev_mm = &init_mm;
+ mmget(prev_mm);
- /*
- * Wait for and lock page. When do_swap_page races with
- * try_to_unuse, do_swap_page can handle the fault much
- * faster than try_to_unuse can locate the entry. This
- * apparently redundant "wait_on_page_locked" lets try_to_unuse
- * defer to do_swap_page in such a case - in some tests,
- * do_swap_page and try_to_unuse repeatedly compete.
- */
- wait_on_page_locked(page);
- wait_on_page_writeback(page);
- lock_page(page);
- wait_on_page_writeback(page);
+ spin_lock(&mmlist_lock);
+ p = &init_mm.mmlist;
+ while (si->inuse_pages &&
+ !signal_pending(current) &&
+ (p = p->next) != &init_mm.mmlist) {
- /*
- * Remove all references to entry.
- */
- swcount = *swap_map;
- if (swap_count(swcount) == SWAP_MAP_SHMEM) {
- retval = shmem_unuse(entry, page);
- /* page has already been unlocked and released */
- if (retval < 0)
- break;
+ mm = list_entry(p, struct mm_struct, mmlist);
+ if (!mmget_not_zero(mm))
continue;
- }
- if (swap_count(swcount) && start_mm != &init_mm)
- retval = unuse_mm(start_mm, entry, page);
+ spin_unlock(&mmlist_lock);
+ mmput(prev_mm);
+ prev_mm = mm;
+ retval = unuse_mm(mm, type, frontswap, &pages_to_unuse);
- if (swap_count(*swap_map)) {
- int set_start_mm = (*swap_map >= swcount);
- struct list_head *p = &start_mm->mmlist;
- struct mm_struct *new_start_mm = start_mm;
- struct mm_struct *prev_mm = start_mm;
- struct mm_struct *mm;
-
- mmget(new_start_mm);
- mmget(prev_mm);
- spin_lock(&mmlist_lock);
- while (swap_count(*swap_map) && !retval &&
- (p = p->next) != &start_mm->mmlist) {
- mm = list_entry(p, struct mm_struct, mmlist);
- if (!mmget_not_zero(mm))
- continue;
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- prev_mm = mm;
-
- cond_resched();
-
- swcount = *swap_map;
- if (!swap_count(swcount)) /* any usage ? */
- ;
- else if (mm == &init_mm)
- set_start_mm = 1;
- else
- retval = unuse_mm(mm, entry, page);
-
- if (set_start_mm && *swap_map < swcount) {
- mmput(new_start_mm);
- mmget(mm);
- new_start_mm = mm;
- set_start_mm = 0;
- }
- spin_lock(&mmlist_lock);
- }
- spin_unlock(&mmlist_lock);
- mmput(prev_mm);
- mmput(start_mm);
- start_mm = new_start_mm;
- }
if (retval) {
- unlock_page(page);
- put_page(page);
- break;
+ mmput(prev_mm);
+ goto out;
}
/*
- * If a reference remains (rare), we would like to leave
- * the page in the swap cache; but try_to_unmap could
- * then re-duplicate the entry once we drop page lock,
- * so we might loop indefinitely; also, that page could
- * not be swapped out to other storage meanwhile. So:
- * delete from cache even if there's another reference,
- * after ensuring that the data has been saved to disk -
- * since if the reference remains (rarer), it will be
- * read from disk into another page. Splitting into two
- * pages would be incorrect if swap supported "shared
- * private" pages, but they are handled by tmpfs files.
- *
- * Given how unuse_vma() targets one particular offset
- * in an anon_vma, once the anon_vma has been determined,
- * this splitting happens to be just what is needed to
- * handle where KSM pages have been swapped out: re-reading
- * is unnecessarily slow, but we can fix that later on.
- */
- if (swap_count(*swap_map) &&
- PageDirty(page) && PageSwapCache(page)) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_NONE,
- };
-
- swap_writepage(compound_head(page), &wbc);
- lock_page(page);
- wait_on_page_writeback(page);
- }
-
- /*
- * It is conceivable that a racing task removed this page from
- * swap cache just before we acquired the page lock at the top,
- * or while we dropped it in unuse_mm(). The page might even
- * be back in swap cache on another swap area: that we must not
- * delete, since it may not have been written out to swap yet.
- */
- if (PageSwapCache(page) &&
- likely(page_private(page) == entry.val) &&
- !page_swapped(page))
- delete_from_swap_cache(compound_head(page));
-
- /*
- * So we could skip searching mms once swap count went
- * to 1, we did not mark any present ptes as dirty: must
- * mark page dirty so shrink_page_list will preserve it.
- */
- SetPageDirty(page);
- unlock_page(page);
- put_page(page);
-
- /*
* Make sure that we aren't completely killing
* interactive performance.
*/
cond_resched();
- if (frontswap && pages_to_unuse > 0) {
- if (!--pages_to_unuse)
- break;
- }
+ spin_lock(&mmlist_lock);
+ }
+ spin_unlock(&mmlist_lock);
+
+ mmput(prev_mm);
+
+ i = 0;
+ while (si->inuse_pages &&
+ !signal_pending(current) &&
+ (i = find_next_to_unuse(si, i, frontswap)) != 0) {
+
+ entry = swp_entry(type, i);
+ page = find_get_page(swap_address_space(entry), i);
+ if (!page)
+ continue;
+
+ /*
+ * It is conceivable that a racing task removed this page from
+ * swap cache just before we acquired the page lock. The page
+ * might even be back in swap cache on another swap area. But
+ * that is okay, try_to_free_swap() only removes stale pages.
+ */
+ lock_page(page);
+ wait_on_page_writeback(page);
+ try_to_free_swap(page);
+ unlock_page(page);
+ put_page(page);
+
+ /*
+ * For frontswap, we just need to unuse pages_to_unuse, if
+ * it was specified. Need not check frontswap again here as
+ * we already zeroed out pages_to_unuse if not frontswap.
+ */
+ if (pages_to_unuse && --pages_to_unuse == 0)
+ goto out;
}
- mmput(start_mm);
- return retval;
+ /*
+ * Let's check again to see if there are still swap entries in the map.
+ * If so, we need to retry the unuse logic.
+ * Under global memory pressure, swap entries can be reinserted back
+ * into process space after the mmlist loop above passes over them.
+ *
+ * Limit the number of retries? No: when mmget_not_zero() above fails,
+ * that mm is likely to be freeing swap from exit_mmap(), which proceeds
+ * at its own independent pace; and even shmem_writepage() could have
+ * been preempted after get_swap_page(), temporarily hiding that swap.
+ * It's easy and robust (though cpu-intensive) just to keep retrying.
+ */
+ if (si->inuse_pages) {
+ if (!signal_pending(current))
+ goto retry;
+ retval = -EINTR;
+ }
+out:
+ return (retval == FRONTSWAP_PAGES_UNUSED) ? 0 : retval;
}
/*
@@ -2264,26 +2257,15 @@
static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev)
{
struct swap_info_struct *sis;
- struct swap_extent *start_se;
struct swap_extent *se;
pgoff_t offset;
- sis = swap_info[swp_type(entry)];
+ sis = swp_swap_info(entry);
*bdev = sis->bdev;
offset = swp_offset(entry);
- start_se = sis->curr_swap_extent;
- se = start_se;
-
- for ( ; ; ) {
- if (se->start_page <= offset &&
- offset < (se->start_page + se->nr_pages)) {
- return se->start_block + (offset - se->start_page);
- }
- se = list_next_entry(se, list);
- sis->curr_swap_extent = se;
- BUG_ON(se == start_se); /* It *must* be present */
- }
+ se = offset_to_swap_extent(sis, offset);
+ return se->start_block + (offset - se->start_page);
}
/*
@@ -2301,27 +2283,27 @@
*/
static void destroy_swap_extents(struct swap_info_struct *sis)
{
- while (!list_empty(&sis->first_swap_extent.list)) {
- struct swap_extent *se;
+ while (!RB_EMPTY_ROOT(&sis->swap_extent_root)) {
+ struct rb_node *rb = sis->swap_extent_root.rb_node;
+ struct swap_extent *se = rb_entry(rb, struct swap_extent, rb_node);
- se = list_first_entry(&sis->first_swap_extent.list,
- struct swap_extent, list);
- list_del(&se->list);
+ rb_erase(rb, &sis->swap_extent_root);
kfree(se);
}
- if (sis->flags & SWP_FILE) {
+ if (sis->flags & SWP_ACTIVATED) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
- sis->flags &= ~SWP_FILE;
- mapping->a_ops->swap_deactivate(swap_file);
+ sis->flags &= ~SWP_ACTIVATED;
+ if (mapping->a_ops->swap_deactivate)
+ mapping->a_ops->swap_deactivate(swap_file);
}
}
/*
* Add a block range (and the corresponding page range) into this swapdev's
- * extent list. The extent list is kept sorted in page order.
+ * extent tree.
*
* This function rather assumes that it is called in ascending page order.
*/
@@ -2329,20 +2311,21 @@
add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
unsigned long nr_pages, sector_t start_block)
{
+ struct rb_node **link = &sis->swap_extent_root.rb_node, *parent = NULL;
struct swap_extent *se;
struct swap_extent *new_se;
- struct list_head *lh;
- if (start_page == 0) {
- se = &sis->first_swap_extent;
- sis->curr_swap_extent = se;
- se->start_page = 0;
- se->nr_pages = nr_pages;
- se->start_block = start_block;
- return 1;
- } else {
- lh = sis->first_swap_extent.list.prev; /* Highest extent */
- se = list_entry(lh, struct swap_extent, list);
+ /*
+ * Place the new node at the rightmost position, since this
+ * function is called in ascending page order.
+ */
+ while (*link) {
+ parent = *link;
+ link = &parent->rb_right;
+ }
+
+ if (parent) {
+ se = rb_entry(parent, struct swap_extent, rb_node);
BUG_ON(se->start_page + se->nr_pages != start_page);
if (se->start_block + se->nr_pages == start_block) {
/* Merge it */
@@ -2351,9 +2334,7 @@
}
}
- /*
- * No merge. Insert a new extent, preserving ordering.
- */
+ /* No merge, insert a new extent. */
new_se = kmalloc(sizeof(*se), GFP_KERNEL);
if (new_se == NULL)
return -ENOMEM;
@@ -2361,9 +2342,11 @@
new_se->nr_pages = nr_pages;
new_se->start_block = start_block;
- list_add_tail(&new_se->list, &sis->first_swap_extent.list);
+ rb_link_node(&new_se->rb_node, parent, link);
+ rb_insert_color(&new_se->rb_node, &sis->swap_extent_root);
return 1;
}
+EXPORT_SYMBOL_GPL(add_swap_extent);
/*
* A `swap extent' is a simple thing which maps a contiguous range of pages
@@ -2385,9 +2368,8 @@
* requirements, they are simply tossed out - we will never use those blocks
* for swapping.
*
- * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This
- * prevents root from shooting her foot off by ftruncating an in-use swapfile,
- * which will scribble on the fs.
+ * For all swap devices we set S_SWAPFILE across the life of the swapon. This
+ * prevents users from writing to the swap device, which will corrupt memory.
*
* The amount of disk space which a single swap extent represents varies.
* Typically it is in the 1-4 megabyte range. So we can have hundreds of
@@ -2411,8 +2393,10 @@
if (mapping->a_ops->swap_activate) {
ret = mapping->a_ops->swap_activate(sis, swap_file, span);
+ if (ret >= 0)
+ sis->flags |= SWP_ACTIVATED;
if (!ret) {
- sis->flags |= SWP_FILE;
+ sis->flags |= SWP_FS;
ret = add_swap_extent(sis, 0, sis->max, 0);
*span = sis->pages;
}
@@ -2434,9 +2418,9 @@
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void _enable_swap_info(struct swap_info_struct *p, int prio,
- unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+static void setup_swap_info(struct swap_info_struct *p, int prio,
+ unsigned char *swap_map,
+ struct swap_cluster_info *cluster_info)
{
int i;
@@ -2461,7 +2445,11 @@
}
p->swap_map = swap_map;
p->cluster_info = cluster_info;
- p->flags |= SWP_WRITEOK;
+}
+
+static void _enable_swap_info(struct swap_info_struct *p)
+{
+ p->flags |= SWP_WRITEOK | SWP_VALID;
atomic_long_add(p->pages, &nr_swap_pages);
total_swap_pages += p->pages;
@@ -2488,7 +2476,17 @@
frontswap_init(p->type, frontswap_map);
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, prio, swap_map, cluster_info);
+ setup_swap_info(p, prio, swap_map, cluster_info);
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * Guarantee swap_map, cluster_info, etc. fields are valid
+ * between get/put_swap_device() if SWP_VALID bit is set
+ */
+ synchronize_rcu();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2497,7 +2495,8 @@
{
spin_lock(&swap_lock);
spin_lock(&p->lock);
- _enable_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
+ _enable_swap_info(p);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
}
@@ -2600,6 +2599,17 @@
reenable_swap_slots_cache_unlock();
+ spin_lock(&swap_lock);
+ spin_lock(&p->lock);
+ p->flags &= ~SWP_VALID; /* mark swap device as invalid */
+ spin_unlock(&p->lock);
+ spin_unlock(&swap_lock);
+ /*
+ * wait for swap operations protected by get/put_swap_device()
+ * to complete
+ */
+ synchronize_rcu();
+
flush_work(&p->discard_work);
destroy_swap_extents(p);
@@ -2650,13 +2660,14 @@
inode = mapping->host;
if (S_ISBLK(inode->i_mode)) {
struct block_device *bdev = I_BDEV(inode);
+
set_blocksize(bdev, old_block_size);
blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
- } else {
- inode_lock(inode);
- inode->i_flags &= ~S_SWAPFILE;
- inode_unlock(inode);
}
+
+ inode_lock(inode);
+ inode->i_flags &= ~S_SWAPFILE;
+ inode_unlock(inode);
filp_close(swap_file, NULL);
/*
@@ -2706,9 +2717,7 @@
if (!l)
return SEQ_START_TOKEN;
- for (type = 0; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (type = 0; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
if (!--l)
@@ -2728,9 +2737,7 @@
else
type = si->type + 1;
- for (; type < nr_swapfiles; type++) {
- smp_rmb(); /* read nr_swapfiles before swap_info[type] */
- si = swap_info[type];
+ for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
++*pos;
@@ -2820,7 +2827,7 @@
unsigned int type;
int i;
- p = kvzalloc(sizeof(*p), GFP_KERNEL);
+ p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -2836,14 +2843,14 @@
}
if (type >= nr_swapfiles) {
p->type = type;
- swap_info[type] = p;
+ WRITE_ONCE(swap_info[type], p);
/*
* Write swap_info[type] before nr_swapfiles, in case a
* racing procfs swap_start() or swap_next() is reading them.
* (We never shrink nr_swapfiles, we never free this entry.)
*/
smp_wmb();
- nr_swapfiles++;
+ WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
kvfree(p);
p = swap_info[type];
@@ -2852,7 +2859,7 @@
* would be relying on p->type to remain valid.
*/
}
- INIT_LIST_HEAD(&p->first_swap_extent.list);
+ p->swap_extent_root = RB_ROOT;
plist_node_init(&p->list, 0);
for_each_node(i)
plist_node_init(&p->avail_lists[i], 0);
@@ -2883,11 +2890,11 @@
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
p->bdev = inode->i_sb->s_bdev;
- inode_lock(inode);
- if (IS_SWAPFILE(inode))
- return -EBUSY;
- } else
- return -EINVAL;
+ }
+
+ inode_lock(inode);
+ if (IS_SWAPFILE(inode))
+ return -EBUSY;
return 0;
}
@@ -3268,6 +3275,17 @@
if (error)
goto bad_swap;
+ /*
+ * Flush any pending IO and dirty mappings before we start using this
+ * swap device.
+ */
+ inode->i_flags |= S_SWAPFILE;
+ error = inode_drain_writes(inode);
+ if (error) {
+ inode->i_flags &= ~S_SWAPFILE;
+ goto bad_swap;
+ }
+
mutex_lock(&swapon_mutex);
prio = -1;
if (swap_flags & SWAP_FLAG_PREFER)
@@ -3288,8 +3306,6 @@
atomic_inc(&proc_poll_event);
wake_up_interruptible(&proc_poll_wait);
- if (S_ISREG(inode->i_mode))
- inode->i_flags |= S_SWAPFILE;
error = 0;
goto out;
bad_swap:
@@ -3311,7 +3327,7 @@
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file) {
- if (inode && S_ISREG(inode->i_mode)) {
+ if (inode) {
inode_unlock(inode);
inode = NULL;
}
@@ -3324,7 +3340,7 @@
}
if (name)
putname(name);
- if (inode && S_ISREG(inode->i_mode))
+ if (inode)
inode_unlock(inode);
if (!error)
enable_swap_slots_cache();
@@ -3363,22 +3379,16 @@
{
struct swap_info_struct *p;
struct swap_cluster_info *ci;
- unsigned long offset, type;
+ unsigned long offset;
unsigned char count;
unsigned char has_cache;
int err = -EINVAL;
- if (non_swap_entry(entry))
+ p = get_swap_device(entry);
+ if (!p)
goto out;
- type = swp_type(entry);
- if (type >= nr_swapfiles)
- goto bad_file;
- p = swap_info[type];
offset = swp_offset(entry);
- if (unlikely(offset >= p->max))
- goto out;
-
ci = lock_cluster_or_swap_info(p, offset);
count = p->swap_map[offset];
@@ -3424,11 +3434,9 @@
unlock_out:
unlock_cluster_or_swap_info(p, ci);
out:
+ if (p)
+ put_swap_device(p);
return err;
-
-bad_file:
- pr_err("swap_dup: %s%08lx\n", Bad_file, entry.val);
- goto out;
}
/*
@@ -3471,7 +3479,7 @@
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
- return swap_info[swp_type(entry)];
+ return swap_type_to_swap_info(swp_type(entry));
}
struct swap_info_struct *page_swap_info(struct page *page)
@@ -3520,6 +3528,7 @@
struct page *list_page;
pgoff_t offset;
unsigned char count;
+ int ret = 0;
/*
* When debugging, it's easier to use __GFP_ZERO here; but it's better
@@ -3527,15 +3536,15 @@
*/
page = alloc_page(gfp_mask | __GFP_HIGHMEM);
- si = swap_info_get(entry);
+ si = get_swap_device(entry);
if (!si) {
/*
* An acceptable race has occurred since the failing
- * __swap_duplicate(): the swap entry has been freed,
- * perhaps even the whole swap_map cleared for swapoff.
+ * __swap_duplicate(): the swap device may have been swapped off
*/
goto outer;
}
+ spin_lock(&si->lock);
offset = swp_offset(entry);
@@ -3553,9 +3562,8 @@
}
if (!page) {
- unlock_cluster(ci);
- spin_unlock(&si->lock);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out;
}
/*
@@ -3607,10 +3615,11 @@
out:
unlock_cluster(ci);
spin_unlock(&si->lock);
+ put_swap_device(si);
outer:
if (page)
__free_page(page);
- return 0;
+ return ret;
}
/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 71b65aa..dd9ebc1 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/truncate.c - code for taking down pages from address_spaces
*
@@ -33,15 +34,12 @@
static inline void __clear_shadow_entry(struct address_space *mapping,
pgoff_t index, void *entry)
{
- struct radix_tree_node *node;
- void **slot;
+ XA_STATE(xas, &mapping->i_pages, index);
- if (!__radix_tree_lookup(&mapping->i_pages, index, &node, &slot))
+ xas_set_update(&xas, workingset_update_node);
+ if (xas_load(&xas) != entry)
return;
- if (*slot != entry)
- return;
- __radix_tree_replace(&mapping->i_pages, node, slot, NULL,
- workingset_update_node);
+ xas_store(&xas, NULL);
mapping->nrexceptional--;
}
@@ -70,7 +68,7 @@
return;
for (j = 0; j < pagevec_count(pvec); j++)
- if (radix_tree_exceptional_entry(pvec->pages[j]))
+ if (xa_is_value(pvec->pages[j]))
break;
if (j == pagevec_count(pvec))
@@ -85,7 +83,7 @@
struct page *page = pvec->pages[i];
pgoff_t index = indices[i];
- if (!radix_tree_exceptional_entry(page)) {
+ if (!xa_is_value(page)) {
pvec->pages[j++] = page;
continue;
}
@@ -347,7 +345,7 @@
if (index >= end)
break;
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
if (!trylock_page(page))
@@ -442,7 +440,7 @@
break;
}
- if (radix_tree_exceptional_entry(page))
+ if (xa_is_value(page))
continue;
lock_page(page);
@@ -542,6 +540,8 @@
* invalidate_mapping_pages() will not block on IO activity. It will not
* invalidate pages which are dirty, locked, under writeback or mapped into
* pagetables.
+ *
+ * Return: the number of pages that were invalidated
*/
unsigned long invalidate_mapping_pages(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -565,7 +565,7 @@
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
invalidate_exceptional_entry(mapping, index,
page);
continue;
@@ -592,6 +592,16 @@
unlock_page(page);
continue;
}
+
+ /* Take a pin outside pagevec */
+ get_page(page);
+
+ /*
+ * Drop extra pins before trying to invalidate
+ * the huge page.
+ */
+ pagevec_remove_exceptionals(&pvec);
+ pagevec_release(&pvec);
}
ret = invalidate_inode_page(page);
@@ -602,6 +612,8 @@
*/
if (!ret)
deactivate_file_page(page);
+ if (PageTransHuge(page))
+ put_page(page);
count += ret;
}
pagevec_remove_exceptionals(&pvec);
@@ -667,7 +679,7 @@
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2_range(struct address_space *mapping,
pgoff_t start, pgoff_t end)
@@ -696,7 +708,7 @@
if (index > end)
break;
- if (radix_tree_exceptional_entry(page)) {
+ if (xa_is_value(page)) {
if (!invalidate_exceptional_entry2(mapping,
index, page))
ret = -EBUSY;
@@ -742,10 +754,10 @@
index++;
}
/*
- * For DAX we invalidate page tables after invalidating radix tree. We
+ * For DAX we invalidate page tables after invalidating page cache. We
* could invalidate page tables while invalidating each entry however
* that would be expensive. And doing range unmapping before doesn't
- * work as we have no cheap way to find whether radix tree entry didn't
+ * work as we have no cheap way to find whether page cache entry didn't
* get remapped later.
*/
if (dax_mapping(mapping)) {
@@ -764,7 +776,7 @@
* Any pages which are found to be mapped into pagetables are unmapped prior to
* invalidation.
*
- * Returns -EBUSY if any pages could not be invalidated.
+ * Return: -EBUSY if any pages could not be invalidated.
*/
int invalidate_inode_pages2(struct address_space *mapping)
{
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 852eb4e..660717a 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* This implements the various checks for CONFIG_HARDENED_USERCOPY*,
* which are designed to protect kernel memory from needless exposure
@@ -6,15 +7,11 @@
*
* Copyright (C) 2001-2016 PaX Team, Bradley Spengler, Open Source
* Security Inc.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
@@ -151,7 +148,7 @@
bool to_user)
{
/* Reject if object wraps past end of memory. */
- if (ptr + n < ptr)
+ if (ptr + (n - 1) < ptr)
usercopy_abort("wrapped address", NULL, to_user, 0, ptr + n);
/* Reject if NULL or ZERO-allocation. */
@@ -231,7 +228,12 @@
if (!virt_addr_valid(ptr))
return;
- page = virt_to_head_page(ptr);
+ /*
+ * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
+ * highmem page or fallback to virt_to_page(). The following
+ * is effectively a highmem-aware virt_to_head_page().
+ */
+ page = compound_head(kmap_to_page((void *)ptr));
if (PageSlab(page)) {
/* Check slab allocator for flags and size. */
@@ -247,7 +249,8 @@
/*
* Validates that the given object is:
* - not bogus address
- * - known-safe heap or stack object
+ * - fully contained by stack (or stack frame, when available)
+ * - fully within SLAB object (or object whitelist area, when available)
* - not in kernel text
*/
void __check_object_size(const void *ptr, unsigned long n, bool to_user)
@@ -262,9 +265,6 @@
/* Check for invalid addresses. */
check_bogus_address((const unsigned long)ptr, n, to_user);
- /* Check for bad heap object. */
- check_heap_object(ptr, n, to_user);
-
/* Check for bad stack object. */
switch (check_stack_object(ptr, n)) {
case NOT_STACK:
@@ -282,6 +282,9 @@
usercopy_abort("process stack", NULL, to_user, 0, n);
}
+ /* Check for bad heap object. */
+ check_heap_object(ptr, n, to_user);
+
/* Check for object in kernel to avoid text exposure. */
check_kernel_text_object((const unsigned long)ptr, n, to_user);
}
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 458acda..c7ae74c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1,10 +1,8 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* mm/userfaultfd.c
*
* Copyright (C) 2015 Red Hat, Inc.
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
*/
#include <linux/mm.h>
@@ -271,8 +269,7 @@
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, dst_mm, dst_vma, mapping,
- idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
@@ -543,7 +540,7 @@
break;
}
if (unlikely(pmd_none(dst_pmdval)) &&
- unlikely(__pte_alloc(dst_mm, dst_pmd, dst_addr))) {
+ unlikely(__pte_alloc(dst_mm, dst_pmd))) {
err = -ENOMEM;
break;
}
diff --git a/mm/util.c b/mm/util.c
index 9e3ebd2..3ad6db9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -6,6 +7,7 @@
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/sched/signal.h>
#include <linux/sched/task_stack.h>
#include <linux/security.h>
#include <linux/swap.h>
@@ -14,18 +16,18 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
#include <linux/userfaultfd_k.h>
+#include <linux/elf.h>
+#include <linux/elf-randomize.h>
+#include <linux/personality.h>
+#include <linux/random.h>
+#include <linux/processor.h>
+#include <linux/sizes.h>
+#include <linux/compat.h>
-#include <asm/sections.h>
#include <linux/uaccess.h>
#include "internal.h"
-static inline int is_kernel_rodata(unsigned long addr)
-{
- return addr >= (unsigned long)__start_rodata &&
- addr < (unsigned long)__end_rodata;
-}
-
/**
* kfree_const - conditionally free memory
* @x: pointer to the memory
@@ -43,6 +45,8 @@
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrdup(const char *s, gfp_t gfp)
{
@@ -65,9 +69,10 @@
* @s: the string to duplicate
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
- * Function returns source string if it is in .rodata section otherwise it
- * fallbacks to kstrdup.
- * Strings allocated by kstrdup_const should be freed by kfree_const.
+ * Note: Strings allocated by kstrdup_const should be freed by kfree_const.
+ *
+ * Return: source string if it is in .rodata section otherwise
+ * fallback to kstrdup.
*/
const char *kstrdup_const(const char *s, gfp_t gfp)
{
@@ -85,6 +90,8 @@
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
*
* Note: Use kmemdup_nul() instead if the size is known exactly.
+ *
+ * Return: newly allocated copy of @s or %NULL in case of error
*/
char *kstrndup(const char *s, size_t max, gfp_t gfp)
{
@@ -110,6 +117,8 @@
* @src: memory region to duplicate
* @len: memory region length
* @gfp: GFP mask to use
+ *
+ * Return: newly allocated copy of @src or %NULL in case of error
*/
void *kmemdup(const void *src, size_t len, gfp_t gfp)
{
@@ -127,6 +136,9 @@
* @s: The data to stringify
* @len: The size of the data
* @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Return: newly allocated copy of @s with NUL-termination or %NULL in
+ * case of error
*/
char *kmemdup_nul(const char *s, size_t len, gfp_t gfp)
{
@@ -150,14 +162,14 @@
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result is physically
+ * Return: an ERR_PTR() on failure. Result is physically
* contiguous, to be freed by kfree().
*/
void *memdup_user(const void __user *src, size_t len)
{
void *p;
- p = kmalloc_track_caller(len, GFP_USER);
+ p = kmalloc_track_caller(len, GFP_USER | __GFP_NOWARN);
if (!p)
return ERR_PTR(-ENOMEM);
@@ -176,7 +188,7 @@
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure. Result may be not
+ * Return: an ERR_PTR() on failure. Result may not be
* physically contiguous. Use kvfree() to free.
*/
void *vmemdup_user(const void __user *src, size_t len)
@@ -200,6 +212,8 @@
* strndup_user - duplicate an existing string from user space
* @s: The string to duplicate
* @n: Maximum number of bytes to copy, including the trailing NUL.
+ *
+ * Return: newly allocated copy of @s or an ERR_PTR() in case of error
*/
char *strndup_user(const char __user *s, long n)
{
@@ -231,7 +245,7 @@
* @src: source address in user space
* @len: number of bytes to copy
*
- * Returns an ERR_PTR() on failure.
+ * Return: an ERR_PTR() on failure.
*/
void *memdup_user_nul(const void __user *src, size_t len)
{
@@ -286,7 +300,105 @@
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
+#ifndef STACK_RND_MASK
+#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
+#endif
+
+unsigned long randomize_stack_top(unsigned long stack_top)
+{
+ unsigned long random_variable = 0;
+
+ if (current->flags & PF_RANDOMIZE) {
+ random_variable = get_random_long();
+ random_variable &= STACK_RND_MASK;
+ random_variable <<= PAGE_SHIFT;
+ }
+#ifdef CONFIG_STACK_GROWSUP
+ return PAGE_ALIGN(stack_top) + random_variable;
+#else
+ return PAGE_ALIGN(stack_top) - random_variable;
+#endif
+}
+
+#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+ /* Is the current task 32bit ? */
+ if (!IS_ENABLED(CONFIG_64BIT) || is_compat_task())
+ return randomize_page(mm->brk, SZ_32M);
+
+ return randomize_page(mm->brk, SZ_1G);
+}
+
+unsigned long arch_mmap_rnd(void)
+{
+ unsigned long rnd;
+
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ if (is_compat_task())
+ rnd = get_random_long() & ((1UL << mmap_rnd_compat_bits) - 1);
+ else
+#endif /* CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS */
+ rnd = get_random_long() & ((1UL << mmap_rnd_bits) - 1);
+
+ return rnd << PAGE_SHIFT;
+}
+
+static int mmap_is_legacy(struct rlimit *rlim_stack)
+{
+ if (current->personality & ADDR_COMPAT_LAYOUT)
+ return 1;
+
+ if (rlim_stack->rlim_cur == RLIM_INFINITY)
+ return 1;
+
+ return sysctl_legacy_va_layout;
+}
+
+/*
+ * Leave enough space between the mmap area and the stack to honour ulimit in
+ * the face of randomisation.
+ */
+#define MIN_GAP (SZ_128M)
+#define MAX_GAP (STACK_TOP / 6 * 5)
+
+static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
+{
+ unsigned long gap = rlim_stack->rlim_cur;
+ unsigned long pad = stack_guard_gap;
+
+ /* Account for stack randomization if necessary */
+ if (current->flags & PF_RANDOMIZE)
+ pad += (STACK_RND_MASK << PAGE_SHIFT);
+
+ /* Values close to RLIM_INFINITY can overflow. */
+ if (gap + pad > gap)
+ gap += pad;
+
+ if (gap < MIN_GAP)
+ gap = MIN_GAP;
+ else if (gap > MAX_GAP)
+ gap = MAX_GAP;
+
+ return PAGE_ALIGN(STACK_TOP - gap - rnd);
+}
+
+void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
+{
+ unsigned long random_factor = 0UL;
+
+ if (current->flags & PF_RANDOMIZE)
+ random_factor = arch_mmap_rnd();
+
+ if (mmap_is_legacy(rlim_stack)) {
+ mm->mmap_base = TASK_UNMAPPED_BASE + random_factor;
+ mm->get_unmapped_area = arch_get_unmapped_area;
+ } else {
+ mm->mmap_base = mmap_base(random_factor, rlim_stack);
+ mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+ }
+}
+#elif defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
{
mm->mmap_base = TASK_UNMAPPED_BASE;
@@ -294,52 +406,79 @@
}
#endif
-/*
- * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
- * back to the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- * If the architecture does not support this function, simply return with no
- * pages pinned.
+/**
+ * __account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
+ * @task: task used to check RLIMIT_MEMLOCK
+ * @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
+ *
+ * Assumes @task and @mm are valid (i.e. at least one reference on each), and
+ * that mmap_sem is held as writer.
+ *
+ * Return:
+ * * 0 on success
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int __weak __get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
+ struct task_struct *task, bool bypass_rlim)
{
- return 0;
+ unsigned long locked_vm, limit;
+ int ret = 0;
+
+ lockdep_assert_held_write(&mm->mmap_sem);
+
+ locked_vm = mm->locked_vm;
+ if (inc) {
+ if (!bypass_rlim) {
+ limit = task_rlimit(task, RLIMIT_MEMLOCK) >> PAGE_SHIFT;
+ if (locked_vm + pages > limit)
+ ret = -ENOMEM;
+ }
+ if (!ret)
+ mm->locked_vm = locked_vm + pages;
+ } else {
+ WARN_ON_ONCE(pages > locked_vm);
+ mm->locked_vm = locked_vm - pages;
+ }
+
+ pr_debug("%s: [%d] caller %ps %c%lu %lu/%lu%s\n", __func__, task->pid,
+ (void *)_RET_IP_, (inc) ? '+' : '-', pages << PAGE_SHIFT,
+ locked_vm << PAGE_SHIFT, task_rlimit(task, RLIMIT_MEMLOCK),
+ ret ? " - exceeded" : "");
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+EXPORT_SYMBOL_GPL(__account_locked_vm);
/**
- * get_user_pages_fast() - pin user pages in memory
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @write: whether pages will be written to
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long.
+ * account_locked_vm - account locked pages to an mm's locked_vm
+ * @mm: mm to account against, may be NULL
+ * @pages: number of pages to account
+ * @inc: %true if @pages should be considered positive, %false if not
*
- * Returns number of pages pinned. This may be fewer than the number
- * requested. If nr_pages is 0 or negative, returns 0. If no pages
- * were pinned, returns -errno.
+ * Assumes a non-NULL @mm is valid (i.e. at least one reference on it).
*
- * get_user_pages_fast provides equivalent functionality to get_user_pages,
- * operating on current and current->mm, with force=0 and vma=NULL. However
- * unlike get_user_pages, it must be called without mmap_sem held.
- *
- * get_user_pages_fast may take mmap_sem and page table locks, so no
- * assumptions can be made about lack of locking. get_user_pages_fast is to be
- * implemented in a way that is advantageous (vs get_user_pages()) when the
- * user memory area is already faulted in and present in ptes. However if the
- * pages have to be faulted in, it may turn out to be slightly slower so
- * callers need to carefully consider what to use. On many architectures,
- * get_user_pages_fast simply falls back to get_user_pages.
+ * Return:
+ * * 0 on success, or if mm is NULL
+ * * -ENOMEM if RLIMIT_MEMLOCK would be exceeded.
*/
-int __weak get_user_pages_fast(unsigned long start,
- int nr_pages, int write, struct page **pages)
+int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
{
- return get_user_pages_unlocked(start, nr_pages, pages,
- write ? FOLL_WRITE : 0);
+ int ret;
+
+ if (pages == 0 || !mm)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+ ret = __account_locked_vm(mm, pages, inc, current,
+ capable(CAP_IPC_LOCK));
+ up_write(&mm->mmap_sem);
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(get_user_pages_fast);
+EXPORT_SYMBOL_GPL(account_locked_vm);
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
@@ -393,6 +532,8 @@
*
* Please note that any use of gfp flags outside of GFP_KERNEL is careful to not
* fall back to vmalloc.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of failure
*/
void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
@@ -442,7 +583,7 @@
* It is slightly more efficient to use kfree() or vfree() if you are certain
* that you know which one to use.
*
- * Context: Any context except NMI.
+ * Context: Either preemptible task context or non-NMI interrupt.
*/
void kvfree(const void *addr)
{
@@ -485,7 +626,7 @@
return true;
if (PageHuge(page))
return false;
- for (i = 0; i < hpage_nr_pages(page); i++) {
+ for (i = 0; i < compound_nr(page); i++) {
if (atomic_read(&page[i]._mapcount) >= 0)
return true;
}
@@ -600,7 +741,7 @@
if (sysctl_overcommit_kbytes)
allowed = sysctl_overcommit_kbytes >> (PAGE_SHIFT - 10);
else
- allowed = ((totalram_pages - hugetlb_total_pages())
+ allowed = ((totalram_pages() - hugetlb_total_pages())
* sysctl_overcommit_ratio / 100);
allowed += total_swap_pages;
@@ -645,7 +786,7 @@
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- long free, allowed, reserve;
+ long allowed;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -660,52 +801,9 @@
return 0;
if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) {
- free = global_zone_page_state(NR_FREE_PAGES);
- free += global_node_page_state(NR_FILE_PAGES);
-
- /*
- * shmem pages shouldn't be counted as free in this
- * case, they can't be purged, only swapped out, and
- * that won't affect the overall amount of available
- * memory in the system.
- */
- free -= global_node_page_state(NR_SHMEM);
-
- free += get_nr_swap_pages();
-
- /*
- * Any slabs which are created with the
- * SLAB_RECLAIM_ACCOUNT flag claim to have contents
- * which are reclaimable, under pressure. The dentry
- * cache and most inode caches should fall into this
- */
- free += global_node_page_state(NR_SLAB_RECLAIMABLE);
-
- /*
- * Part of the kernel memory, which can be released
- * under memory pressure.
- */
- free += global_node_page_state(
- NR_INDIRECTLY_RECLAIMABLE_BYTES) >> PAGE_SHIFT;
-
- /*
- * Leave reserved pages. The pages are not for anonymous pages.
- */
- if (free <= totalreserve_pages)
+ if (pages > totalram_pages() + total_swap_pages)
goto error;
- else
- free -= totalreserve_pages;
-
- /*
- * Reserve some for root
- */
- if (!cap_sys_admin)
- free -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
-
- if (free > pages)
- return 0;
-
- goto error;
+ return 0;
}
allowed = vm_commit_limit();
@@ -719,7 +817,8 @@
* Don't let a single process grow so big a user can't recover
*/
if (mm) {
- reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+ long reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
+
allowed -= min_t(long, mm->total_vm / 32, reserve);
}
@@ -737,7 +836,8 @@
* @buffer: the buffer to copy to.
* @buflen: the length of the buffer. Larger cmdline values are truncated
* to this length.
- * Returns the size of the cmdline field copied. Note that the copy does
+ *
+ * Return: the size of the cmdline field copied. Note that the copy does
* not guarantee an ending NULL byte.
*/
int get_cmdline(struct task_struct *task, char *buffer, int buflen)
@@ -751,12 +851,12 @@
if (!mm->arg_end)
goto out_mm; /* Shh! No looking before we're done */
- down_read(&mm->mmap_sem);
+ spin_lock(&mm->arg_lock);
arg_start = mm->arg_start;
arg_end = mm->arg_end;
env_start = mm->env_start;
env_end = mm->env_end;
- up_read(&mm->mmap_sem);
+ spin_unlock(&mm->arg_lock);
len = arg_end - arg_start;
@@ -788,3 +888,16 @@
out:
return res;
}
+
+int memcmp_pages(struct page *page1, struct page *page2)
+{
+ char *addr1, *addr2;
+ int ret;
+
+ addr1 = kmap_atomic(page1);
+ addr2 = kmap_atomic(page2);
+ ret = memcmp(addr1, addr2, PAGE_SIZE);
+ kunmap_atomic(addr2);
+ kunmap_atomic(addr1);
+ return ret;
+}
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a728fc4..a3c70e2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/vmalloc.c
*
@@ -18,6 +19,7 @@
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
+#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
@@ -31,6 +33,7 @@
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
+#include <linux/rbtree_augmented.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -323,22 +326,89 @@
/*** Global kva allocator ***/
-#define VM_LAZY_FREE 0x02
-#define VM_VM_AREA 0x04
+#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
+#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0
+
static DEFINE_SPINLOCK(vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
+static bool vmap_initialized __read_mostly;
-/* The vmap cache globals are protected by vmap_area_lock */
-static struct rb_node *free_vmap_cache;
-static unsigned long cached_hole_size;
-static unsigned long cached_vstart;
-static unsigned long cached_align;
+/*
+ * This kmem_cache is used for vmap_area objects. Instead of
+ * allocating from slab we reuse an object from this cache to
+ * make things faster. Especially in "no edge" splitting of
+ * free block.
+ */
+static struct kmem_cache *vmap_area_cachep;
-static unsigned long vmap_area_pcpu_hole;
+/*
+ * This linked list is used in pair with free_vmap_area_root.
+ * It gives O(1) access to prev/next to perform fast coalescing.
+ */
+static LIST_HEAD(free_vmap_area_list);
+
+/*
+ * This augmented red-black tree represents the free vmap space.
+ * All vmap_area objects in this tree are sorted by va->va_start
+ * address. It is used for allocation and merging when a vmap
+ * object is released.
+ *
+ * Each vmap_area node contains a maximum available free block
+ * of its sub-tree, right or left. Therefore it is possible to
+ * find a lowest match of free area.
+ */
+static struct rb_root free_vmap_area_root = RB_ROOT;
+
+/*
+ * Preload a CPU with one object for "no edge" split case. The
+ * aim is to get rid of allocations from the atomic context, thus
+ * to use more permissive allocation masks.
+ */
+static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);
+
+static __always_inline unsigned long
+va_size(struct vmap_area *va)
+{
+ return (va->va_end - va->va_start);
+}
+
+static __always_inline unsigned long
+get_subtree_max_size(struct rb_node *node)
+{
+ struct vmap_area *va;
+
+ va = rb_entry_safe(node, struct vmap_area, rb_node);
+ return va ? va->subtree_max_size : 0;
+}
+
+/*
+ * Gets called when removing the node and on rotation.
+ */
+static __always_inline unsigned long
+compute_subtree_max_size(struct vmap_area *va)
+{
+ return max3(va_size(va),
+ get_subtree_max_size(va->rb_node.rb_left),
+ get_subtree_max_size(va->rb_node.rb_right));
+}
+
+RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
+ struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)
+
+static void purge_vmap_area_lazy(void);
+static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+static unsigned long lazy_max_pages(void);
+
+static atomic_long_t nr_vmalloc_pages;
+
+unsigned long vmalloc_nr_pages(void)
+{
+ return atomic_long_read(&nr_vmalloc_pages);
+}
static struct vmap_area *__find_vmap_area(unsigned long addr)
{
@@ -359,41 +429,618 @@
return NULL;
}
-static void __insert_vmap_area(struct vmap_area *va)
+/*
+ * This function returns the addresses of the parent node
+ * and its left or right link for further processing.
+ */
+static __always_inline struct rb_node **
+find_va_links(struct vmap_area *va,
+ struct rb_root *root, struct rb_node *from,
+ struct rb_node **parent)
{
- struct rb_node **p = &vmap_area_root.rb_node;
- struct rb_node *parent = NULL;
- struct rb_node *tmp;
+ struct vmap_area *tmp_va;
+ struct rb_node **link;
- while (*p) {
- struct vmap_area *tmp_va;
-
- parent = *p;
- tmp_va = rb_entry(parent, struct vmap_area, rb_node);
- if (va->va_start < tmp_va->va_end)
- p = &(*p)->rb_left;
- else if (va->va_end > tmp_va->va_start)
- p = &(*p)->rb_right;
- else
- BUG();
+ if (root) {
+ link = &root->rb_node;
+ if (unlikely(!*link)) {
+ *parent = NULL;
+ return link;
+ }
+ } else {
+ link = &from;
}
- rb_link_node(&va->rb_node, parent, p);
- rb_insert_color(&va->rb_node, &vmap_area_root);
+ /*
+ * Go to the bottom of the tree. When we hit the last point
+ * we end up with parent rb_node and correct direction, I name
+ * it link, where the new va->rb_node will be attached to.
+ */
+ do {
+ tmp_va = rb_entry(*link, struct vmap_area, rb_node);
- /* address-sort this list */
- tmp = rb_prev(&va->rb_node);
- if (tmp) {
- struct vmap_area *prev;
- prev = rb_entry(tmp, struct vmap_area, rb_node);
- list_add_rcu(&va->list, &prev->list);
- } else
- list_add_rcu(&va->list, &vmap_area_list);
+ /*
+ * During the traversal we also do some sanity checks.
+ * Trigger the BUG() if there are side (left/right)
+ * or full overlaps.
+ */
+ if (va->va_start < tmp_va->va_end &&
+ va->va_end <= tmp_va->va_start)
+ link = &(*link)->rb_left;
+ else if (va->va_end > tmp_va->va_start &&
+ va->va_start >= tmp_va->va_end)
+ link = &(*link)->rb_right;
+ else
+ BUG();
+ } while (*link);
+
+ *parent = &tmp_va->rb_node;
+ return link;
}
-static void purge_vmap_area_lazy(void);
+static __always_inline struct list_head *
+get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
+{
+ struct list_head *list;
-static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
+ if (unlikely(!parent))
+ /*
+ * The red-black tree where we try to find VA neighbors
+ * before merging or inserting is empty, i.e. it means
+ * there is no free vmap space. Normally it does not
+ * happen but we handle this case anyway.
+ */
+ return NULL;
+
+ list = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ return (&parent->rb_right == link ? list->next : list);
+}
+
+static __always_inline void
+link_va(struct vmap_area *va, struct rb_root *root,
+ struct rb_node *parent, struct rb_node **link, struct list_head *head)
+{
+ /*
+ * VA is still not in the list, but we can
+ * identify its future previous list_head node.
+ */
+ if (likely(parent)) {
+ head = &rb_entry(parent, struct vmap_area, rb_node)->list;
+ if (&parent->rb_right != link)
+ head = head->prev;
+ }
+
+ /* Insert to the rb-tree */
+ rb_link_node(&va->rb_node, parent, link);
+ if (root == &free_vmap_area_root) {
+ /*
+ * Some explanation here. Just perform simple insertion
+ * to the tree. We do not set va->subtree_max_size to
+ * its current size before calling rb_insert_augmented().
+ * It is because we populate the tree from the bottom
+ * to parent levels when the node _is_ in the tree.
+ *
+ * Therefore we set subtree_max_size to zero after insertion,
+ * to let augment_tree_propagate_from() put everything in
+ * the correct order later on.
+ */
+ rb_insert_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ va->subtree_max_size = 0;
+ } else {
+ rb_insert_color(&va->rb_node, root);
+ }
+
+ /* Address-sort this list */
+ list_add(&va->list, head);
+}
+
+static __always_inline void
+unlink_va(struct vmap_area *va, struct rb_root *root)
+{
+ if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
+ return;
+
+ if (root == &free_vmap_area_root)
+ rb_erase_augmented(&va->rb_node,
+ root, &free_vmap_area_rb_augment_cb);
+ else
+ rb_erase(&va->rb_node, root);
+
+ list_del(&va->list);
+ RB_CLEAR_NODE(&va->rb_node);
+}
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+static void
+augment_tree_propagate_check(struct rb_node *n)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long size;
+ bool found = false;
+
+ if (n == NULL)
+ return;
+
+ va = rb_entry(n, struct vmap_area, rb_node);
+ size = va->subtree_max_size;
+ node = n;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) == size) {
+ node = node->rb_left;
+ } else {
+ if (va_size(va) == size) {
+ found = true;
+ break;
+ }
+
+ node = node->rb_right;
+ }
+ }
+
+ if (!found) {
+ va = rb_entry(n, struct vmap_area, rb_node);
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
+ }
+
+ augment_tree_propagate_check(n->rb_left);
+ augment_tree_propagate_check(n->rb_right);
+}
+#endif
+
+/*
+ * This function populates subtree_max_size from bottom to upper
+ * levels starting from VA point. The propagation must be done
+ * when VA size is modified by changing its va_start/va_end, or
+ * when a VA is newly inserted into the tree.
+ *
+ * It means that augment_tree_propagate_from() must be called:
+ * - After VA has been inserted to the tree(free path);
+ * - After VA has been shrunk(allocation path);
+ * - After VA has been increased(merging path).
+ *
+ * Please note that, it does not mean that upper parent nodes
+ * and their subtree_max_size are recalculated all the time up
+ * to the root node.
+ *
+ * 4--8
+ * /\
+ * / \
+ * / \
+ * 2--2 8--8
+ *
+ * For example if we modify the node 4, shrinking it to 2, then
+ * no modification is required. If we shrink the node 2 to 1
+ * its subtree_max_size is updated only, and set to 1. If we shrink
+ * the node 8 to 6, then its subtree_max_size is set to 6 and parent
+ * node becomes 4--6.
+ */
+static __always_inline void
+augment_tree_propagate_from(struct vmap_area *va)
+{
+ struct rb_node *node = &va->rb_node;
+ unsigned long new_va_sub_max_size;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ new_va_sub_max_size = compute_subtree_max_size(va);
+
+ /*
+ * If the newly calculated maximum available size of the
+ * subtree is equal to the current one, then it means that
+ * the tree is propagated correctly. So we have to stop at
+ * this point to save cycles.
+ */
+ if (va->subtree_max_size == new_va_sub_max_size)
+ break;
+
+ va->subtree_max_size = new_va_sub_max_size;
+ node = rb_parent(&va->rb_node);
+ }
+
+#if DEBUG_AUGMENT_PROPAGATE_CHECK
+ augment_tree_propagate_check(free_vmap_area_root.rb_node);
+#endif
+}
+
+static void
+insert_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ link = find_va_links(va, root, NULL, &parent);
+ link_va(va, root, parent, link, head);
+}
+
+static void
+insert_vmap_area_augment(struct vmap_area *va,
+ struct rb_node *from, struct rb_root *root,
+ struct list_head *head)
+{
+ struct rb_node **link;
+ struct rb_node *parent;
+
+ if (from)
+ link = find_va_links(va, NULL, from, &parent);
+ else
+ link = find_va_links(va, root, NULL, &parent);
+
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+}
+
+/*
+ * Merge de-allocated chunk of VA memory with previous
+ * and next free blocks. If coalesce is not done a new
+ * free area is inserted. If VA has been merged, it is
+ * freed.
+ */
+static __always_inline void
+merge_or_add_vmap_area(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ struct vmap_area *sibling;
+ struct list_head *next;
+ struct rb_node **link;
+ struct rb_node *parent;
+ bool merged = false;
+
+ /*
+ * Find a place in the tree where VA potentially will be
+ * inserted, unless it is merged with its sibling/siblings.
+ */
+ link = find_va_links(va, root, NULL, &parent);
+
+ /*
+ * Get next node of VA to check if merging can be done.
+ */
+ next = get_va_next_sibling(parent, link);
+ if (unlikely(next == NULL))
+ goto insert;
+
+ /*
+ * start end
+ * | |
+ * |<------VA------>|<-----Next----->|
+ * | |
+ * start end
+ */
+ if (next != head) {
+ sibling = list_entry(next, struct vmap_area, list);
+ if (sibling->va_start == va->va_end) {
+ sibling->va_start = va->va_start;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+
+ /* Point to the new merged area. */
+ va = sibling;
+ merged = true;
+ }
+ }
+
+ /*
+ * start end
+ * | |
+ * |<-----Prev----->|<------VA------>|
+ * | |
+ * start end
+ */
+ if (next->prev != head) {
+ sibling = list_entry(next->prev, struct vmap_area, list);
+ if (sibling->va_end == va->va_start) {
+ sibling->va_end = va->va_end;
+
+ /* Check and update the tree if needed. */
+ augment_tree_propagate_from(sibling);
+
+ if (merged)
+ unlink_va(va, root);
+
+ /* Free vmap_area object. */
+ kmem_cache_free(vmap_area_cachep, va);
+ return;
+ }
+ }
+
+insert:
+ if (!merged) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
+}
+
+static __always_inline bool
+is_within_this_va(struct vmap_area *va, unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ unsigned long nva_start_addr;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Can be overflowed due to big size or alignment. */
+ if (nva_start_addr + size < nva_start_addr ||
+ nva_start_addr < vstart)
+ return false;
+
+ return (nva_start_addr + size <= va->va_end);
+}
+
+/*
+ * Find the first free block (lowest start address) in the tree
+ * that will satisfy the request described by the passed
+ * parameters.
+ */
+static __always_inline struct vmap_area *
+find_vmap_lowest_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+ struct rb_node *node;
+ unsigned long length;
+
+ /* Start from the root. */
+ node = free_vmap_area_root.rb_node;
+
+ /* Adjust the search size for alignment overhead. */
+ length = size + align - 1;
+
+ while (node) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+
+ if (get_subtree_max_size(node->rb_left) >= length &&
+ vstart < va->va_start) {
+ node = node->rb_left;
+ } else {
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ /*
+ * Does not make sense to go deeper towards the right
+ * sub-tree if it does not have a free block that is
+ * equal to or bigger than the requested search length.
+ */
+ if (get_subtree_max_size(node->rb_right) >= length) {
+ node = node->rb_right;
+ continue;
+ }
+
+ /*
+ * OK. We roll back and find the first right sub-tree,
+ * that will satisfy the search criteria. It can happen
+ * only once due to "vstart" restriction.
+ */
+ while ((node = rb_parent(node))) {
+ va = rb_entry(node, struct vmap_area, rb_node);
+ if (is_within_this_va(va, size, align, vstart))
+ return va;
+
+ if (get_subtree_max_size(node->rb_right) >= length &&
+ vstart <= va->va_start) {
+ node = node->rb_right;
+ break;
+ }
+ }
+ }
+ }
+
+ return NULL;
+}
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+#include <linux/random.h>
+
+static struct vmap_area *
+find_vmap_lowest_linear_match(unsigned long size,
+ unsigned long align, unsigned long vstart)
+{
+ struct vmap_area *va;
+
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ if (!is_within_this_va(va, size, align, vstart))
+ continue;
+
+ return va;
+ }
+
+ return NULL;
+}
+
+static void
+find_vmap_lowest_match_check(unsigned long size)
+{
+ struct vmap_area *va_1, *va_2;
+ unsigned long vstart;
+ unsigned int rnd;
+
+ get_random_bytes(&rnd, sizeof(rnd));
+ vstart = VMALLOC_START + rnd;
+
+ va_1 = find_vmap_lowest_match(size, 1, vstart);
+ va_2 = find_vmap_lowest_linear_match(size, 1, vstart);
+
+ if (va_1 != va_2)
+ pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
+ va_1, va_2, vstart);
+}
+#endif
+
+enum fit_type {
+ NOTHING_FIT = 0,
+ FL_FIT_TYPE = 1, /* full fit */
+ LE_FIT_TYPE = 2, /* left edge fit */
+ RE_FIT_TYPE = 3, /* right edge fit */
+ NE_FIT_TYPE = 4 /* no edge fit */
+};
+
+static __always_inline enum fit_type
+classify_va_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size)
+{
+ enum fit_type type;
+
+ /* Check if it is within VA. */
+ if (nva_start_addr < va->va_start ||
+ nva_start_addr + size > va->va_end)
+ return NOTHING_FIT;
+
+ /* Now classify. */
+ if (va->va_start == nva_start_addr) {
+ if (va->va_end == nva_start_addr + size)
+ type = FL_FIT_TYPE;
+ else
+ type = LE_FIT_TYPE;
+ } else if (va->va_end == nva_start_addr + size) {
+ type = RE_FIT_TYPE;
+ } else {
+ type = NE_FIT_TYPE;
+ }
+
+ return type;
+}
+
+static __always_inline int
+adjust_va_to_fit_type(struct vmap_area *va,
+ unsigned long nva_start_addr, unsigned long size,
+ enum fit_type type)
+{
+ struct vmap_area *lva = NULL;
+
+ if (type == FL_FIT_TYPE) {
+ /*
+ * No need to split VA, it fully fits.
+ *
+ * | |
+ * V NVA V
+ * |---------------|
+ */
+ unlink_va(va, &free_vmap_area_root);
+ kmem_cache_free(vmap_area_cachep, va);
+ } else if (type == LE_FIT_TYPE) {
+ /*
+ * Split left edge of fit VA.
+ *
+ * | |
+ * V NVA V R
+ * |-------|-------|
+ */
+ va->va_start += size;
+ } else if (type == RE_FIT_TYPE) {
+ /*
+ * Split right edge of fit VA.
+ *
+ * | |
+ * L V NVA V
+ * |-------|-------|
+ */
+ va->va_end = nva_start_addr;
+ } else if (type == NE_FIT_TYPE) {
+ /*
+ * Split no edge of fit VA.
+ *
+ * | |
+ * L V NVA V R
+ * |---|-------|---|
+ */
+ lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
+ if (unlikely(!lva)) {
+ /*
+ * For percpu allocator we do not do any pre-allocation
+ * and leave it as it is. The reason is it most likely
+ * never ends up with NE_FIT_TYPE splitting. In case of
+ * percpu allocations offsets and sizes are aligned to
+ * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
+ * are its main fitting cases.
+ *
+ * There are a few exceptions though, as an example it is
+ * a first allocation (early boot up) when we have "one"
+ * big free space that has to be split.
+ */
+ lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!lva)
+ return -1;
+ }
+
+ /*
+ * Build the remainder.
+ */
+ lva->va_start = va->va_start;
+ lva->va_end = nva_start_addr;
+
+ /*
+ * Shrink this VA to remaining size.
+ */
+ va->va_start = nva_start_addr + size;
+ } else {
+ return -1;
+ }
+
+ if (type != FL_FIT_TYPE) {
+ augment_tree_propagate_from(va);
+
+ if (lva) /* type == NE_FIT_TYPE */
+ insert_vmap_area_augment(lva, &va->rb_node,
+ &free_vmap_area_root, &free_vmap_area_list);
+ }
+
+ return 0;
+}
+
+/*
+ * Returns the start address of the newly allocated area on success.
+ * Otherwise vend is returned to indicate failure.
+ */
+static __always_inline unsigned long
+__alloc_vmap_area(unsigned long size, unsigned long align,
+ unsigned long vstart, unsigned long vend)
+{
+ unsigned long nva_start_addr;
+ struct vmap_area *va;
+ enum fit_type type;
+ int ret;
+
+ va = find_vmap_lowest_match(size, align, vstart);
+ if (unlikely(!va))
+ return vend;
+
+ if (va->va_start > vstart)
+ nva_start_addr = ALIGN(va->va_start, align);
+ else
+ nva_start_addr = ALIGN(vstart, align);
+
+ /* Check the "vend" restriction. */
+ if (nva_start_addr + size > vend)
+ return vend;
+
+ /* Classify what we have found. */
+ type = classify_va_fit_type(va, nva_start_addr, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ return vend;
+
+ /* Update the free vmap_area. */
+ ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
+ if (ret)
+ return vend;
+
+#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
+ find_vmap_lowest_match_check(size);
+#endif
+
+ return nva_start_addr;
+}
/*
* Allocate a region of KVA of the specified size and alignment, within the
@@ -404,19 +1051,20 @@
unsigned long vstart, unsigned long vend,
int node, gfp_t gfp_mask)
{
- struct vmap_area *va;
- struct rb_node *n;
+ struct vmap_area *va, *pva;
unsigned long addr;
int purged = 0;
- struct vmap_area *first;
BUG_ON(!size);
BUG_ON(offset_in_page(size));
BUG_ON(!is_power_of_2(align));
+ if (unlikely(!vmap_initialized))
+ return ERR_PTR(-EBUSY);
+
might_sleep();
- va = kmalloc_node(sizeof(struct vmap_area),
+ va = kmem_cache_alloc_node(vmap_area_cachep,
gfp_mask & GFP_RECLAIM_MASK, node);
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
@@ -428,84 +1076,46 @@
kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
retry:
- spin_lock(&vmap_area_lock);
/*
- * Invalidate cache if we have more permissive parameters.
- * cached_hole_size notes the largest hole noticed _below_
- * the vmap_area cached in free_vmap_cache: if size fits
- * into that hole, we want to scan from vstart to reuse
- * the hole instead of allocating above free_vmap_cache.
- * Note that __free_vmap_area may update free_vmap_cache
- * without updating cached_hole_size or cached_align.
+ * Preload this CPU with one extra vmap_area object to ensure
+ * that we have it available when fit type of free area is
+ * NE_FIT_TYPE.
+ *
+ * The preload is done in non-atomic context, thus it allows us
+ * to use more permissive allocation masks to be more stable under
+ * low memory condition and high memory pressure.
+ *
+ * Even if it fails we do not really care about that. Just proceed
+ * as it is. "overflow" path will refill the cache we allocate from.
*/
- if (!free_vmap_cache ||
- size < cached_hole_size ||
- vstart < cached_vstart ||
- align < cached_align) {
-nocache:
- cached_hole_size = 0;
- free_vmap_cache = NULL;
- }
- /* record if we encounter less permissive parameters */
- cached_vstart = vstart;
- cached_align = align;
+ preempt_disable();
+ if (!__this_cpu_read(ne_fit_preload_node)) {
+ preempt_enable();
+ pva = kmem_cache_alloc_node(vmap_area_cachep, GFP_KERNEL, node);
+ preempt_disable();
- /* find starting point for our search */
- if (free_vmap_cache) {
- first = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- addr = ALIGN(first->va_end, align);
- if (addr < vstart)
- goto nocache;
- if (addr + size < addr)
- goto overflow;
-
- } else {
- addr = ALIGN(vstart, align);
- if (addr + size < addr)
- goto overflow;
-
- n = vmap_area_root.rb_node;
- first = NULL;
-
- while (n) {
- struct vmap_area *tmp;
- tmp = rb_entry(n, struct vmap_area, rb_node);
- if (tmp->va_end >= addr) {
- first = tmp;
- if (tmp->va_start <= addr)
- break;
- n = n->rb_left;
- } else
- n = n->rb_right;
+ if (__this_cpu_cmpxchg(ne_fit_preload_node, NULL, pva)) {
+ if (pva)
+ kmem_cache_free(vmap_area_cachep, pva);
}
-
- if (!first)
- goto found;
}
- /* from the starting point, walk areas until a suitable hole is found */
- while (addr + size > first->va_start && addr + size <= vend) {
- if (addr + cached_hole_size < first->va_start)
- cached_hole_size = first->va_start - addr;
- addr = ALIGN(first->va_end, align);
- if (addr + size < addr)
- goto overflow;
+ spin_lock(&vmap_area_lock);
+ preempt_enable();
- if (list_is_last(&first->list, &vmap_area_list))
- goto found;
-
- first = list_next_entry(first, list);
- }
-
-found:
- if (addr + size > vend)
+ /*
+ * If an allocation fails, the "vend" address is
+ * returned. Therefore trigger the overflow path.
+ */
+ addr = __alloc_vmap_area(size, align, vstart, vend);
+ if (unlikely(addr == vend))
goto overflow;
va->va_start = addr;
va->va_end = addr + size;
- va->flags = 0;
- __insert_vmap_area(va);
- free_vmap_cache = &va->rb_node;
+ va->vm = NULL;
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
+
spin_unlock(&vmap_area_lock);
BUG_ON(!IS_ALIGNED(va->va_start, align));
@@ -534,7 +1144,8 @@
if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
size);
- kfree(va);
+
+ kmem_cache_free(vmap_area_cachep, va);
return ERR_PTR(-EBUSY);
}
@@ -552,37 +1163,16 @@
static void __free_vmap_area(struct vmap_area *va)
{
- BUG_ON(RB_EMPTY_NODE(&va->rb_node));
-
- if (free_vmap_cache) {
- if (va->va_end < cached_vstart) {
- free_vmap_cache = NULL;
- } else {
- struct vmap_area *cache;
- cache = rb_entry(free_vmap_cache, struct vmap_area, rb_node);
- if (va->va_start <= cache->va_start) {
- free_vmap_cache = rb_prev(&va->rb_node);
- /*
- * We don't try to update cached_hole_size or
- * cached_align, but it won't go very wrong.
- */
- }
- }
- }
- rb_erase(&va->rb_node, &vmap_area_root);
- RB_CLEAR_NODE(&va->rb_node);
- list_del_rcu(&va->list);
+ /*
+ * Remove from the busy tree/list.
+ */
+ unlink_va(va, &vmap_area_root);
/*
- * Track the highest possible candidate for pcpu area
- * allocation. Areas outside of vmalloc area can be returned
- * here too, consider only end addresses which fall inside
- * vmalloc area proper.
+ * Merge VA with its neighbors, otherwise just add it.
*/
- if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END)
- vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end);
-
- kfree_rcu(va, rcu_head);
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
}
/*
@@ -628,7 +1218,7 @@
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}
-static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
+static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);
/*
* Serialize vmap purging. There is no actual criticial section protected
@@ -646,7 +1236,7 @@
*/
void set_iounmap_nonlazy(void)
{
- atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+ atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}
/*
@@ -654,34 +1244,53 @@
*/
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
+ unsigned long resched_threshold;
struct llist_node *valist;
struct vmap_area *va;
struct vmap_area *n_va;
- bool do_free = false;
lockdep_assert_held(&vmap_purge_lock);
valist = llist_del_all(&vmap_purge_list);
+ if (unlikely(valist == NULL))
+ return false;
+
+ /*
+ * First make sure the mappings are removed from all page-tables
+ * before they are freed.
+ */
+ vmalloc_sync_all();
+
+ /*
+ * TODO: to calculate a flush range without looping.
+ * The list can be up to lazy_max_pages() elements.
+ */
llist_for_each_entry(va, valist, purge_list) {
if (va->va_start < start)
start = va->va_start;
if (va->va_end > end)
end = va->va_end;
- do_free = true;
}
- if (!do_free)
- return false;
-
flush_tlb_kernel_range(start, end);
+ resched_threshold = lazy_max_pages() << 1;
spin_lock(&vmap_area_lock);
llist_for_each_entry_safe(va, n_va, valist, purge_list) {
- int nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
- __free_vmap_area(va);
- atomic_sub(nr, &vmap_lazy_nr);
- cond_resched_lock(&vmap_area_lock);
+ /*
+ * Finally insert or merge lazily-freed area. It is
+ * detached and there is no need to "unlink" it from
+ * anything.
+ */
+ merge_or_add_vmap_area(va,
+ &free_vmap_area_root, &free_vmap_area_list);
+
+ atomic_long_sub(nr, &vmap_lazy_nr);
+
+ if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
+ cond_resched_lock(&vmap_area_lock);
}
spin_unlock(&vmap_area_lock);
return true;
@@ -717,10 +1326,14 @@
*/
static void free_vmap_area_noflush(struct vmap_area *va)
{
- int nr_lazy;
+ unsigned long nr_lazy;
- nr_lazy = atomic_add_return((va->va_end - va->va_start) >> PAGE_SHIFT,
- &vmap_lazy_nr);
+ spin_lock(&vmap_area_lock);
+ unlink_va(va, &vmap_area_root);
+ spin_unlock(&vmap_area_lock);
+
+ nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
+ PAGE_SHIFT, &vmap_lazy_nr);
/* After this point, we may free va at any time */
llist_add(&va->purge_list, &vmap_purge_list);
@@ -783,8 +1396,6 @@
#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE)
-static bool vmap_initialized __read_mostly = false;
-
struct vmap_block_queue {
spinlock_t lock;
struct list_head free;
@@ -840,7 +1451,7 @@
* @order: how many 2^order pages should be occupied in newly allocated block
* @gfp_mask: flags for the page level allocator
*
- * Returns: virtual address in a newly allocated block or ERR_PTR(-errno)
+ * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
*/
static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
{
@@ -1055,24 +1666,9 @@
spin_unlock(&vb->lock);
}
-/**
- * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
- *
- * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
- * to amortize TLB flushing overheads. What this means is that any page you
- * have now, may, in a former life, have been mapped into kernel virtual
- * address by the vmap layer and so there might be some CPUs with TLB entries
- * still referencing that page (additional to the regular 1:1 kernel mapping).
- *
- * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
- * be sure that none of the pages we have control over will have any aliases
- * from the vmap layer.
- */
-void vm_unmap_aliases(void)
+static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
{
- unsigned long start = ULONG_MAX, end = 0;
int cpu;
- int flush = 0;
if (unlikely(!vmap_initialized))
return;
@@ -1109,6 +1705,27 @@
flush_tlb_kernel_range(start, end);
mutex_unlock(&vmap_purge_lock);
}
+
+/**
+ * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
+ *
+ * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
+ * to amortize TLB flushing overheads. What this means is that any page you
+ * have now, may, in a former life, have been mapped into kernel virtual
+ * address by the vmap layer and so there might be some CPUs with TLB entries
+ * still referencing that page (additional to the regular 1:1 kernel mapping).
+ *
+ * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
+ * be sure that none of the pages we have control over will have any aliases
+ * from the vmap layer.
+ */
+void vm_unmap_aliases(void)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush = 0;
+
+ _vm_unmap_aliases(start, end, flush);
+}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/**
@@ -1187,6 +1804,7 @@
EXPORT_SYMBOL(vm_map_ram);
static struct vm_struct *vmlist __initdata;
+
/**
* vm_area_add_early - add vmap area early during boot
* @vm: vm_struct to add
@@ -1238,12 +1856,58 @@
vm_area_add_early(vm);
}
+static void vmap_init_free_space(void)
+{
+ unsigned long vmap_start = 1;
+ const unsigned long vmap_end = ULONG_MAX;
+ struct vmap_area *busy, *free;
+
+ /*
+ * B F B B B F
+ * -|-----|.....|-----|-----|-----|.....|-
+ * | The KVA space |
+ * |<--------------------------------->|
+ */
+ list_for_each_entry(busy, &vmap_area_list, list) {
+ if (busy->va_start - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = busy->va_start;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+
+ vmap_start = busy->va_end;
+ }
+
+ if (vmap_end - vmap_start > 0) {
+ free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (!WARN_ON_ONCE(!free)) {
+ free->va_start = vmap_start;
+ free->va_end = vmap_end;
+
+ insert_vmap_area_augment(free, NULL,
+ &free_vmap_area_root,
+ &free_vmap_area_list);
+ }
+ }
+}
+
void __init vmalloc_init(void)
{
struct vmap_area *va;
struct vm_struct *tmp;
int i;
+ /*
+ * Create the cache for vmap_area objects.
+ */
+ vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
+
for_each_possible_cpu(i) {
struct vmap_block_queue *vbq;
struct vfree_deferred *p;
@@ -1258,16 +1922,20 @@
/* Import existing vmlist entries. */
for (tmp = vmlist; tmp; tmp = tmp->next) {
- va = kzalloc(sizeof(struct vmap_area), GFP_NOWAIT);
- va->flags = VM_VM_AREA;
+ va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
+ if (WARN_ON_ONCE(!va))
+ continue;
+
va->va_start = (unsigned long)tmp->addr;
va->va_end = va->va_start + tmp->size;
va->vm = tmp;
- __insert_vmap_area(va);
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- vmap_area_pcpu_hole = VMALLOC_END;
-
+ /*
+ * Now we can initialize a free vmap space.
+ */
+ vmap_init_free_space();
vmap_initialized = true;
}
@@ -1355,7 +2023,6 @@
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->vm = vm;
- va->flags |= VM_VM_AREA;
spin_unlock(&vmap_area_lock);
}
@@ -1421,13 +2088,15 @@
}
/**
- * get_vm_area - reserve a contiguous kernel virtual area
- * @size: size of the area
- * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
+ * get_vm_area - reserve a contiguous kernel virtual area
+ * @size: size of the area
+ * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
*
- * Search an area of @size in the kernel virtual mapping area,
- * and reserved it for out purposes. Returns the area descriptor
- * on success or %NULL on failure.
+ * Search an area of @size in the kernel virtual mapping area,
+ * and reserve it for our purposes. Returns the area descriptor
+ * on success or %NULL on failure.
+ *
+ * Return: the area descriptor on success or %NULL on failure.
*/
struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
{
@@ -1444,31 +2113,35 @@
}
/**
- * find_vm_area - find a continuous kernel virtual area
- * @addr: base address
+ * find_vm_area - find a continuous kernel virtual area
+ * @addr: base address
*
- * Search for the kernel VM area starting at @addr, and return it.
- * It is up to the caller to do all required locking to keep the returned
- * pointer valid.
+ * Search for the kernel VM area starting at @addr, and return it.
+ * It is up to the caller to do all required locking to keep the returned
+ * pointer valid.
+ *
+ * Return: pointer to the found area or %NULL on failure
*/
struct vm_struct *find_vm_area(const void *addr)
{
struct vmap_area *va;
va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA)
- return va->vm;
+ if (!va)
+ return NULL;
- return NULL;
+ return va->vm;
}
/**
- * remove_vm_area - find and remove a continuous kernel virtual area
- * @addr: base address
+ * remove_vm_area - find and remove a continuous kernel virtual area
+ * @addr: base address
*
- * Search for the kernel VM area starting at @addr, and remove it.
- * This function returns the found VM area, but using it is NOT safe
- * on SMP machines, except for its size or flags.
+ * Search for the kernel VM area starting at @addr, and remove it.
+ * This function returns the found VM area, but using it is NOT safe
+ * on SMP machines, except for its size or flags.
+ *
+ * Return: pointer to the found area or %NULL on failure
*/
struct vm_struct *remove_vm_area(const void *addr)
{
@@ -1476,14 +2149,12 @@
might_sleep();
- va = find_vmap_area((unsigned long)addr);
- if (va && va->flags & VM_VM_AREA) {
+ spin_lock(&vmap_area_lock);
+ va = __find_vmap_area((unsigned long)addr);
+ if (va && va->vm) {
struct vm_struct *vm = va->vm;
- spin_lock(&vmap_area_lock);
va->vm = NULL;
- va->flags &= ~VM_VM_AREA;
- va->flags |= VM_LAZY_FREE;
spin_unlock(&vmap_area_lock);
kasan_free_shadow(vm);
@@ -1491,9 +2162,68 @@
return vm;
}
+
+ spin_unlock(&vmap_area_lock);
return NULL;
}
+static inline void set_area_direct_map(const struct vm_struct *area,
+ int (*set_direct_map)(struct page *page))
+{
+ int i;
+
+ for (i = 0; i < area->nr_pages; i++)
+ if (page_address(area->pages[i]))
+ set_direct_map(area->pages[i]);
+}
+
+/* Handle removing and resetting vm mappings related to the vm_struct. */
+static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
+{
+ unsigned long start = ULONG_MAX, end = 0;
+ int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
+ int flush_dmap = 0;
+ int i;
+
+ remove_vm_area(area->addr);
+
+ /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
+ if (!flush_reset)
+ return;
+
+ /*
+ * If not deallocating pages, just do the flush of the VM area and
+ * return.
+ */
+ if (!deallocate_pages) {
+ vm_unmap_aliases();
+ return;
+ }
+
+ /*
+ * If execution gets here, flush the vm mapping and reset the direct
+ * map. Find the start and end range of the direct mappings to make sure
+ * the vm_unmap_aliases() flush includes the direct map.
+ */
+ for (i = 0; i < area->nr_pages; i++) {
+ unsigned long addr = (unsigned long)page_address(area->pages[i]);
+ if (addr) {
+ start = min(addr, start);
+ end = max(addr + PAGE_SIZE, end);
+ flush_dmap = 1;
+ }
+ }
+
+ /*
+ * Set direct map to something invalid so that it won't be cached if
+ * there are any accesses after the TLB flush, then flush the TLB and
+ * reset the direct map permissions to the default.
+ */
+ set_area_direct_map(area, set_direct_map_invalid_noflush);
+ _vm_unmap_aliases(start, end, flush_dmap);
+ set_area_direct_map(area, set_direct_map_default_noflush);
+}
+
static void __vunmap(const void *addr, int deallocate_pages)
{
struct vm_struct *area;
@@ -1505,7 +2235,7 @@
addr))
return;
- area = find_vmap_area((unsigned long)addr)->vm;
+ area = find_vm_area(addr);
if (unlikely(!area)) {
WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
addr);
@@ -1515,7 +2245,8 @@
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
- remove_vm_area(addr);
+ vm_remove_mappings(area, deallocate_pages);
+
if (deallocate_pages) {
int i;
@@ -1525,6 +2256,7 @@
BUG_ON(!page);
__free_pages(page, 0);
}
+ atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
kvfree(area->pages);
}
@@ -1548,11 +2280,11 @@
}
/**
- * vfree_atomic - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree_atomic - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * This one is just like vfree() but can be called in any atomic context
- * except NMIs.
+ * This one is just like vfree() but can be called in any atomic context
+ * except NMIs.
*/
void vfree_atomic(const void *addr)
{
@@ -1565,19 +2297,29 @@
__vfree_deferred(addr);
}
+static void __vfree(const void *addr)
+{
+ if (unlikely(in_interrupt()))
+ __vfree_deferred(addr);
+ else
+ __vunmap(addr, 1);
+}
+
/**
- * vfree - release memory allocated by vmalloc()
- * @addr: memory base address
+ * vfree - release memory allocated by vmalloc()
+ * @addr: memory base address
*
- * Free the virtually continuous memory area starting at @addr, as
- * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
- * NULL, no operation is performed.
+ * Free the virtually continuous memory area starting at @addr, as
+ * obtained from vmalloc(), vmalloc_32() or __vmalloc(). If @addr is
+ * NULL, no operation is performed.
*
- * Must not be called in NMI context (strictly speaking, only if we don't
- * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
- * conventions for vfree() arch-depenedent would be a really bad idea)
+ * Must not be called in NMI context (strictly speaking, only if we don't
+ * have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
+ * conventions for vfree() arch-dependent would be a really bad idea)
*
- * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
+ * May sleep if called *not* from interrupt context.
+ *
+ * NOTE: assumes that the object at @addr has a size >= sizeof(llist_node)
*/
void vfree(const void *addr)
{
@@ -1585,23 +2327,23 @@
kmemleak_free(addr);
+ might_sleep_if(!in_interrupt());
+
if (!addr)
return;
- if (unlikely(in_interrupt()))
- __vfree_deferred(addr);
- else
- __vunmap(addr, 1);
+
+ __vfree(addr);
}
EXPORT_SYMBOL(vfree);
/**
- * vunmap - release virtual mapping obtained by vmap()
- * @addr: memory base address
+ * vunmap - release virtual mapping obtained by vmap()
+ * @addr: memory base address
*
- * Free the virtually contiguous memory area starting at @addr,
- * which was created from the page array passed to vmap().
+ * Free the virtually contiguous memory area starting at @addr,
+ * which was created from the page array passed to vmap().
*
- * Must not be called in interrupt context.
+ * Must not be called in interrupt context.
*/
void vunmap(const void *addr)
{
@@ -1613,24 +2355,26 @@
EXPORT_SYMBOL(vunmap);
/**
- * vmap - map an array of pages into virtually contiguous space
- * @pages: array of page pointers
- * @count: number of pages to map
- * @flags: vm_area->flags
- * @prot: page protection for the mapping
+ * vmap - map an array of pages into virtually contiguous space
+ * @pages: array of page pointers
+ * @count: number of pages to map
+ * @flags: vm_area->flags
+ * @prot: page protection for the mapping
*
- * Maps @count pages from @pages into contiguous kernel virtual
- * space.
+ * Maps @count pages from @pages into contiguous kernel virtual
+ * space.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *vmap(struct page **pages, unsigned int count,
- unsigned long flags, pgprot_t prot)
+ unsigned long flags, pgprot_t prot)
{
struct vm_struct *area;
unsigned long size; /* In bytes */
might_sleep();
- if (count > totalram_pages)
+ if (count > totalram_pages())
return NULL;
size = (unsigned long)count << PAGE_SHIFT;
@@ -1664,7 +2408,6 @@
nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
array_size = (nr_pages * sizeof(struct page *));
- area->nr_pages = nr_pages;
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
@@ -1672,13 +2415,16 @@
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
- area->pages = pages;
- if (!area->pages) {
+
+ if (!pages) {
remove_vm_area(area->addr);
kfree(area);
return NULL;
}
+ area->pages = pages;
+ area->nr_pages = nr_pages;
+
for (i = 0; i < area->nr_pages; i++) {
struct page *page;
@@ -1690,12 +2436,14 @@
if (unlikely(!page)) {
/* Successfully allocated i pages, free them in __vunmap() */
area->nr_pages = i;
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
goto fail;
}
area->pages[i] = page;
if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
cond_resched();
}
+ atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
if (map_vm_area(area, prot, pages))
goto fail;
@@ -1705,25 +2453,27 @@
warn_alloc(gfp_mask, NULL,
"vmalloc: allocation failure, allocated %ld of %ld bytes",
(area->nr_pages*PAGE_SIZE), area->size);
- vfree(area->addr);
+ __vfree(area->addr);
return NULL;
}
/**
- * __vmalloc_node_range - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @start: vm area range start
- * @end: vm area range end
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
+ * __vmalloc_node_range - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @start: vm area range start
+ * @end: vm area range end
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
+ *
+ * Return: the address of the area or %NULL on failure
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
@@ -1735,7 +2485,7 @@
unsigned long real_size = size;
size = PAGE_ALIGN(size);
- if (!size || (size >> PAGE_SHIFT) > totalram_pages)
+ if (!size || (size >> PAGE_SHIFT) > totalram_pages())
goto fail;
area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
@@ -1764,25 +2514,35 @@
return NULL;
}
+/*
+ * This is only for performance analysis and stress testing of vmalloc.
+ * It is required by the vmalloc test module, therefore do not use it
+ * for anything else.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node_range);
+#endif
+
/**
- * __vmalloc_node - allocate virtually contiguous memory
- * @size: allocation size
- * @align: desired alignment
- * @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
- * @node: node to use for allocation or NUMA_NO_NODE
- * @caller: caller's return address
+ * __vmalloc_node - allocate virtually contiguous memory
+ * @size: allocation size
+ * @align: desired alignment
+ * @gfp_mask: flags for the page level allocator
+ * @prot: protection mask for the allocated pages
+ * @node: node to use for allocation or NUMA_NO_NODE
+ * @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level
+ * allocator with @gfp_mask flags. Map them into contiguous
+ * kernel virtual space, using a pagetable protection of @prot.
*
- * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
- * and __GFP_NOFAIL are not supported
+ * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
+ * and __GFP_NOFAIL are not supported
*
- * Any use of gfp flags outside of GFP_KERNEL should be consulted
- * with mm people.
+ * Any use of gfp flags outside of GFP_KERNEL should be consulted
+ * with mm people.
*
+ * Return: pointer to the allocated memory or %NULL on error
*/
static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
@@ -1814,13 +2574,16 @@
}
/**
- * vmalloc - allocate virtually contiguous memory
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * vmalloc - allocate virtually contiguous memory
+ * @size: allocation size
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc(unsigned long size)
{
@@ -1830,14 +2593,17 @@
EXPORT_SYMBOL(vmalloc);
/**
- * vzalloc - allocate virtually contiguous memory with zero fill
- * @size: allocation size
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
- * The memory allocated is set to zero.
+ * vzalloc - allocate virtually contiguous memory with zero fill
+ * @size: allocation size
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
+ * The memory allocated is set to zero.
+ *
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc(unsigned long size)
{
@@ -1852,34 +2618,30 @@
*
* The resulting memory area is zeroed so it can be mapped to userspace
* without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, SHMLBA,
- GFP_KERNEL | __GFP_ZERO,
- PAGE_KERNEL, NUMA_NO_NODE,
- __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_user);
/**
- * vmalloc_node - allocate memory on a specific node
- * @size: allocation size
- * @node: numa node
+ * vmalloc_node - allocate memory on a specific node
+ * @size: allocation size
+ * @node: numa node
*
- * Allocate enough pages to cover @size from the page level
- * allocator and map them into contiguous kernel virtual space.
+ * Allocate enough pages to cover @size from the page level
+ * allocator and map them into contiguous kernel virtual space.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_node(unsigned long size, int node)
{
@@ -1899,6 +2661,8 @@
*
* For tight control over page level allocator and protection flags
* use __vmalloc_node() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
@@ -1908,21 +2672,23 @@
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
+ * vmalloc_exec - allocate virtually contiguous, executable memory
+ * @size: allocation size
*
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
+ * Kernel-internal function to allocate enough pages to cover @size
+ * from the page level allocator and map them into contiguous and
+ * executable kernel virtual space.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
+ * For tight control over page level allocator and protection flags
+ * use __vmalloc() instead.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
-
void *vmalloc_exec(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL_EXEC,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
+ GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
+ NUMA_NO_NODE, __builtin_return_address(0));
}
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
@@ -1938,11 +2704,13 @@
#endif
/**
- * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
- * @size: allocation size
+ * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
+ * @size: allocation size
*
- * Allocate enough 32bit PA addressable pages to cover @size from the
- * page level allocator and map them into contiguous kernel virtual space.
+ * Allocate enough 32bit PA addressable pages to cover @size from the
+ * page level allocator and map them into contiguous kernel virtual space.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32(unsigned long size)
{
@@ -1953,23 +2721,19 @@
/**
* vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
- * @size: allocation size
+ * @size: allocation size
*
* The resulting memory area is 32bit addressable and zeroed so it can be
* mapped to userspace without leaking data.
+ *
+ * Return: pointer to the allocated memory or %NULL on error
*/
void *vmalloc_32_user(unsigned long size)
{
- struct vm_struct *area;
- void *ret;
-
- ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
- if (ret) {
- area = find_vm_area(ret);
- area->flags |= VM_USERMAP;
- }
- return ret;
+ return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
+ GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
+ VM_USERMAP, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32_user);
@@ -2055,31 +2819,29 @@
}
/**
- * vread() - read vmalloc area in a safe way.
- * @buf: buffer for reading data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vread() - read vmalloc area in a safe way.
+ * @buf: buffer for reading data
+ * @addr: vm address.
+ * @count: number of bytes to be read.
*
- * Returns # of bytes which addr and buf should be increased.
- * (same number to @count). Returns 0 if [addr...addr+count) doesn't
- * includes any intersect with alive vmalloc area.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from that area to a given buffer. If the given memory range
+ * of [addr...addr+count) includes some valid address, data is copied to
+ * proper area of @buf. If there are memory holes, they'll be zero-filled.
+ * IOREMAP area is treated as memory hole and no copy is done.
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from that area to a given buffer. If the given memory range
- * of [addr...addr+count) includes some valid address, data is copied to
- * proper area of @buf. If there are memory holes, they'll be zero-filled.
- * IOREMAP area is treated as memory hole and no copy is done.
+ * If [addr...addr+count) doesn't intersect with any live
+ * vm_struct area, returns 0. @buf should be a kernel buffer.
*
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
+ * Note: In usual ops, vread() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, as /dev/kmem.
*
- * Note: In usual ops, vread() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
- *
+ * Return: number of bytes for which addr and buf should be increased
+ * (same number as @count) or %0 if [addr...addr+count) doesn't
+ * include any intersection with valid vmalloc area
*/
-
long vread(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2097,7 +2859,7 @@
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -2136,31 +2898,29 @@
}
/**
- * vwrite() - write vmalloc area in a safe way.
- * @buf: buffer for source data
- * @addr: vm address.
- * @count: number of bytes to be read.
+ * vwrite() - write vmalloc area in a safe way.
+ * @buf: buffer for source data
+ * @addr: vm address.
+ * @count: number of bytes to be written.
*
- * Returns # of bytes which addr and buf should be incresed.
- * (same number to @count).
- * If [addr...addr+count) doesn't includes any intersect with valid
- * vmalloc area, returns 0.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * copy data from a buffer to the given addr. If specified range of
+ * [addr...addr+count) includes some valid address, data is copied from
+ * proper area of @buf. If there are memory holes, no copy to hole.
+ * IOREMAP area is treated as memory hole and no copy is done.
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * copy data from a buffer to the given addr. If specified range of
- * [addr...addr+count) includes some valid address, data is copied from
- * proper area of @buf. If there are memory holes, no copy to hole.
- * IOREMAP area is treated as memory hole and no copy is done.
+ * If [addr...addr+count) doesn't intersect with any live
+ * vm_struct area, returns 0. @buf should be a kernel buffer.
*
- * If [addr...addr+count) doesn't includes any intersects with alive
- * vm_struct area, returns 0. @buf should be kernel's buffer.
+ * Note: In usual ops, vwrite() is never necessary because the caller
+ * should know vmalloc() area is valid and can use memcpy().
+ * This is for routines which have to access vmalloc area without
+ * any information, as /dev/kmem.
*
- * Note: In usual ops, vwrite() is never necessary because the caller
- * should know vmalloc() area is valid and can use memcpy().
- * This is for routines which have to access vmalloc area without
- * any informaion, as /dev/kmem.
+ * Return: number of bytes for which addr and buf should be
+ * increased (same number as @count) or %0 if [addr...addr+count)
+ * doesn't include any intersection with valid vmalloc area
*/
-
long vwrite(char *buf, char *addr, unsigned long count)
{
struct vmap_area *va;
@@ -2179,7 +2939,7 @@
if (!count)
break;
- if (!(va->flags & VM_VM_AREA))
+ if (!va->vm)
continue;
vm = va->vm;
@@ -2212,20 +2972,20 @@
}
/**
- * remap_vmalloc_range_partial - map vmalloc pages to userspace
- * @vma: vma to cover
- * @uaddr: target user address to start at
- * @kaddr: virtual address of vmalloc kernel memory
- * @size: size of map area
+ * remap_vmalloc_range_partial - map vmalloc pages to userspace
+ * @vma: vma to cover
+ * @uaddr: target user address to start at
+ * @kaddr: virtual address of vmalloc kernel memory
+ * @size: size of map area
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that @kaddr is a valid vmalloc'ed area,
- * and that it is big enough to cover the range starting at
- * @uaddr in @vma. Will return failure if that criteria isn't
- * met.
+ * This function checks that @kaddr is a valid vmalloc'ed area,
+ * and that it is big enough to cover the range starting at
+ * @uaddr in @vma. Will return failure if that criteria isn't
+ * met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
void *kaddr, unsigned long size)
@@ -2241,10 +3001,10 @@
if (!area)
return -EINVAL;
- if (!(area->flags & VM_USERMAP))
+ if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
- if (kaddr + size > area->addr + area->size)
+ if (kaddr + size > area->addr + get_vm_area_size(area))
return -EINVAL;
do {
@@ -2267,18 +3027,18 @@
EXPORT_SYMBOL(remap_vmalloc_range_partial);
/**
- * remap_vmalloc_range - map vmalloc pages to userspace
- * @vma: vma to cover (map full range of vma)
- * @addr: vmalloc memory
- * @pgoff: number of pages into addr before first page to map
+ * remap_vmalloc_range - map vmalloc pages to userspace
+ * @vma: vma to cover (map full range of vma)
+ * @addr: vmalloc memory
+ * @pgoff: number of pages into addr before first page to map
*
- * Returns: 0 for success, -Exxx on failure
+ * Returns: 0 for success, -Exxx on failure
*
- * This function checks that addr is a valid vmalloc'ed area, and
- * that it is big enough to cover the vma. Will return failure if
- * that criteria isn't met.
+ * This function checks that addr is a valid vmalloc'ed area, and
+ * that it is big enough to cover the vma. Will return failure if
+ * that criteria isn't met.
*
- * Similar to remap_pfn_range() (see mm/memory.c)
+ * Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
unsigned long pgoff)
@@ -2292,13 +3052,16 @@
/*
* Implement a stub for vmalloc_sync_all() if the architecture chose not to
* have one.
+ *
+ * The purpose of this function is to make sure the vmalloc area
+ * mappings are identical in all page-tables in the system.
*/
void __weak vmalloc_sync_all(void)
{
}
-static int f(pte_t *pte, pgtable_t table, unsigned long addr, void *data)
+static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;
@@ -2310,18 +3073,18 @@
}
/**
- * alloc_vm_area - allocate a range of kernel address space
- * @size: size of the area
- * @ptes: returns the PTEs for the address space
+ * alloc_vm_area - allocate a range of kernel address space
+ * @size: size of the area
+ * @ptes: returns the PTEs for the address space
*
- * Returns: NULL on failure, vm_struct on success
+ * Returns: NULL on failure, vm_struct on success
*
- * This function reserves a range of kernel address space, and
- * allocates pagetables to map that range. No actual mappings
- * are created.
+ * This function reserves a range of kernel address space, and
+ * allocates pagetables to map that range. No actual mappings
+ * are created.
*
- * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
- * allocated for the VM area are returned.
+ * If @ptes is non-NULL, pointers to the PTEs (in init_mm)
+ * allocated for the VM area are returned.
*/
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
@@ -2362,81 +3125,64 @@
}
/**
- * pvm_find_next_prev - find the next and prev vmap_area surrounding @end
- * @end: target address
- * @pnext: out arg for the next vmap_area
- * @pprev: out arg for the previous vmap_area
+ * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
+ * @addr: target address
*
- * Returns: %true if either or both of next and prev are found,
- * %false if no vmap_area exists
- *
- * Find vmap_areas end addresses of which enclose @end. ie. if not
- * NULL, *pnext->va_end > @end and *pprev->va_end <= @end.
+ * Returns: vmap_area if it is found. If there is no such area
+ * the first highest(reverse order) vmap_area is returned
+ * i.e. va->va_start < addr && va->va_end < addr or NULL
+ * if there are no areas before @addr.
*/
-static bool pvm_find_next_prev(unsigned long end,
- struct vmap_area **pnext,
- struct vmap_area **pprev)
+static struct vmap_area *
+pvm_find_va_enclose_addr(unsigned long addr)
{
- struct rb_node *n = vmap_area_root.rb_node;
- struct vmap_area *va = NULL;
+ struct vmap_area *va, *tmp;
+ struct rb_node *n;
+
+ n = free_vmap_area_root.rb_node;
+ va = NULL;
while (n) {
- va = rb_entry(n, struct vmap_area, rb_node);
- if (end < va->va_end)
- n = n->rb_left;
- else if (end > va->va_end)
+ tmp = rb_entry(n, struct vmap_area, rb_node);
+ if (tmp->va_start <= addr) {
+ va = tmp;
+ if (tmp->va_end >= addr)
+ break;
+
n = n->rb_right;
- else
- break;
+ } else {
+ n = n->rb_left;
+ }
}
- if (!va)
- return false;
-
- if (va->va_end > end) {
- *pnext = va;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
- } else {
- *pprev = va;
- *pnext = node_to_va(rb_next(&(*pprev)->rb_node));
- }
- return true;
+ return va;
}
/**
- * pvm_determine_end - find the highest aligned address between two vmap_areas
- * @pnext: in/out arg for the next vmap_area
- * @pprev: in/out arg for the previous vmap_area
- * @align: alignment
+ * pvm_determine_end_from_reverse - find the highest aligned address
+ * of free block below VMALLOC_END
+ * @va:
+ * in - the VA we start the search(reverse order);
+ * out - the VA with the highest aligned end address.
*
- * Returns: determined end address
- *
- * Find the highest aligned address between *@pnext and *@pprev below
- * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned
- * down address is between the end addresses of the two vmap_areas.
- *
- * Please note that the address returned by this function may fall
- * inside *@pnext vmap_area. The caller is responsible for checking
- * that.
+ * Returns: determined end address within vmap_area
*/
-static unsigned long pvm_determine_end(struct vmap_area **pnext,
- struct vmap_area **pprev,
- unsigned long align)
+static unsigned long
+pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
{
- const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
+ unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
unsigned long addr;
- if (*pnext)
- addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end);
- else
- addr = vmalloc_end;
-
- while (*pprev && (*pprev)->va_end > addr) {
- *pnext = *pprev;
- *pprev = node_to_va(rb_prev(&(*pnext)->rb_node));
+ if (likely(*va)) {
+ list_for_each_entry_from_reverse((*va),
+ &free_vmap_area_list, list) {
+ addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
+ if ((*va)->va_start < addr)
+ return addr;
+ }
}
- return addr;
+ return 0;
}
/**
@@ -2456,12 +3202,12 @@
* to gigabytes. To avoid interacting with regular vmallocs, these
* areas are allocated from top.
*
- * Despite its complicated look, this allocator is rather simple. It
- * does everything top-down and scans areas from the end looking for
- * matching slot. While scanning, if any of the areas overlaps with
- * existing vmap_area, the base address is pulled down to fit the
- * area. Scanning is repeated till all the areas fit and then all
- * necessary data structures are inserted and the result is returned.
+ * Despite its complicated look, this allocator is rather simple. It
+ * does everything top-down and scans free blocks from the end looking
+ * for matching base. While scanning, if any of the areas do not fit, the
+ * base address is pulled down to fit the area. Scanning is repeated till
+ * all the areas fit and then all necessary data structures are inserted
+ * and the result is returned.
*/
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
const size_t *sizes, int nr_vms,
@@ -2469,11 +3215,12 @@
{
const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
- struct vmap_area **vas, *prev, *next;
+ struct vmap_area **vas, *va;
struct vm_struct **vms;
int area, area2, last_area, term_area;
- unsigned long base, start, end, last_end;
+ unsigned long base, start, size, end, last_end;
bool purged = false;
+ enum fit_type type;
/* verify parameters and allocate data structures */
BUG_ON(offset_in_page(align) || !is_power_of_2(align));
@@ -2509,7 +3256,7 @@
goto err_free2;
for (area = 0; area < nr_vms; area++) {
- vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL);
+ vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
if (!vas[area] || !vms[area])
goto err_free;
@@ -2522,49 +3269,39 @@
start = offsets[area];
end = start + sizes[area];
- if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) {
- base = vmalloc_end - last_end;
- goto found;
- }
- base = pvm_determine_end(&next, &prev, align) - end;
+ va = pvm_find_va_enclose_addr(vmalloc_end);
+ base = pvm_determine_end_from_reverse(&va, align) - end;
while (true) {
- BUG_ON(next && next->va_end <= base + end);
- BUG_ON(prev && prev->va_end > base + end);
-
/*
* base might have underflowed, add last_end before
* comparing.
*/
- if (base + last_end < vmalloc_start + last_end) {
- spin_unlock(&vmap_area_lock);
- if (!purged) {
- purge_vmap_area_lazy();
- purged = true;
- goto retry;
- }
- goto err_free;
- }
+ if (base + last_end < vmalloc_start + last_end)
+ goto overflow;
/*
- * If next overlaps, move base downwards so that it's
- * right below next and then recheck.
+ * Fitting base has not been found.
*/
- if (next && next->va_start < base + end) {
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (va == NULL)
+ goto overflow;
+
+ /*
+ * If required width exceeds current VA block, move
+ * base downwards and then recheck.
+ */
+ if (base + end > va->va_end) {
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
/*
- * If prev overlaps, shift down next and prev and move
- * base so that it's right below new next and then
- * recheck.
+ * If this VA does not fit, move base downwards and recheck.
*/
- if (prev && prev->va_end > base + start) {
- next = prev;
- prev = node_to_va(rb_prev(&next->rb_node));
- base = pvm_determine_end(&next, &prev, align) - end;
+ if (base + start < va->va_start) {
+ va = node_to_va(rb_prev(&va->rb_node));
+ base = pvm_determine_end_from_reverse(&va, align) - end;
term_area = area;
continue;
}
@@ -2576,22 +3313,41 @@
area = (area + nr_vms - 1) % nr_vms;
if (area == term_area)
break;
+
start = offsets[area];
end = start + sizes[area];
- pvm_find_next_prev(base + end, &next, &prev);
+ va = pvm_find_va_enclose_addr(base + end);
}
-found:
+
/* we've found a fitting base, insert all va's */
for (area = 0; area < nr_vms; area++) {
- struct vmap_area *va = vas[area];
+ int ret;
- va->va_start = base + offsets[area];
- va->va_end = va->va_start + sizes[area];
- __insert_vmap_area(va);
+ start = base + offsets[area];
+ size = sizes[area];
+
+ va = pvm_find_va_enclose_addr(start);
+ if (WARN_ON_ONCE(va == NULL))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ type = classify_va_fit_type(va, start, size);
+ if (WARN_ON_ONCE(type == NOTHING_FIT))
+ /* It is a BUG(), but trigger recovery instead. */
+ goto recovery;
+
+ ret = adjust_va_to_fit_type(va, start, size, type);
+ if (unlikely(ret))
+ goto recovery;
+
+ /* Allocated area. */
+ va = vas[area];
+ va->va_start = start;
+ va->va_end = start + size;
+
+ insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
}
- vmap_area_pcpu_hole = base + offsets[last_area];
-
spin_unlock(&vmap_area_lock);
/* insert all vm's */
@@ -2602,9 +3358,38 @@
kfree(vas);
return vms;
+recovery:
+ /* Remove previously inserted areas. */
+ while (area--) {
+ __free_vmap_area(vas[area]);
+ vas[area] = NULL;
+ }
+
+overflow:
+ spin_unlock(&vmap_area_lock);
+ if (!purged) {
+ purge_vmap_area_lazy();
+ purged = true;
+
+ /* Before "retry", check if we recover. */
+ for (area = 0; area < nr_vms; area++) {
+ if (vas[area])
+ continue;
+
+ vas[area] = kmem_cache_zalloc(
+ vmap_area_cachep, GFP_KERNEL);
+ if (!vas[area])
+ goto err_free;
+ }
+
+ goto retry;
+ }
+
err_free:
for (area = 0; area < nr_vms; area++) {
- kfree(vas[area]);
+ if (vas[area])
+ kmem_cache_free(vmap_area_cachep, vas[area]);
+
kfree(vms[area]);
}
err_free2:
@@ -2673,6 +3458,22 @@
}
}
+static void show_purge_info(struct seq_file *m)
+{
+ struct llist_node *head;
+ struct vmap_area *va;
+
+ head = READ_ONCE(vmap_purge_list.first);
+ if (head == NULL)
+ return;
+
+ llist_for_each_entry(va, head, purge_list) {
+ seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ }
+}
+
static int s_show(struct seq_file *m, void *p)
{
struct vmap_area *va;
@@ -2681,14 +3482,13 @@
va = list_entry(p, struct vmap_area, list);
/*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
+ * s_show can race with remove_vm_area; !vm means the vmap area
+ * is being torn down or is a vm_map_ram allocation.
*/
- if (!(va->flags & VM_VM_AREA)) {
- seq_printf(m, "0x%pK-0x%pK %7ld %s\n",
+ if (!va->vm) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start,
- va->flags & VM_LAZY_FREE ? "unpurged vm_area" : "vm_map_ram");
+ va->va_end - va->va_start);
return 0;
}
@@ -2719,11 +3519,24 @@
if (v->flags & VM_USERMAP)
seq_puts(m, " user");
+ if (v->flags & VM_DMA_COHERENT)
+ seq_puts(m, " dma-coherent");
+
if (is_vmalloc_addr(v->pages))
seq_puts(m, " vpages");
show_numa_info(m, v);
seq_putc(m, '\n');
+
+ /*
+ * As a final step, dump "unpurged" areas. Note,
+ * that entire "/proc/vmallocinfo" output will not
+ * be address sorted, because the purge list is not
+ * sorted.
+ */
+ if (list_is_last(&va->list, &vmap_area_list))
+ show_purge_info(m);
+
return 0;
}
@@ -2747,4 +3560,3 @@
module_init(proc_vmalloc_init);
#endif
-
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 4854584..4bac22f 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Linux VM pressure
*
@@ -6,10 +7,6 @@
*
* Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
* Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms of the GNU General Public License version 2 as published
- * by the Free Software Foundation.
*/
#include <linux/cgroup.h>
@@ -358,6 +355,9 @@
* "hierarchy" or "local").
*
* To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
*/
int vmpressure_register_event(struct mem_cgroup *memcg,
struct eventfd_ctx *eventfd, const char *args)
@@ -365,7 +365,7 @@
struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
struct vmpressure_event *ev;
enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
- enum vmpressure_levels level = -1;
+ enum vmpressure_levels level;
char *spec, *spec_orig;
char *token;
int ret = 0;
@@ -378,20 +378,18 @@
/* Find required level */
token = strsep(&spec, ",");
- level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
- if (level < 0) {
- ret = level;
+ ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+ if (ret < 0)
goto out;
- }
+ level = ret;
/* Find optional mode */
token = strsep(&spec, ",");
if (token) {
- mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
- if (mode < 0) {
- ret = mode;
+ ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+ if (ret < 0)
goto out;
- }
+ mode = ret;
}
ev = kzalloc(sizeof(*ev), GFP_KERNEL);
@@ -407,6 +405,7 @@
mutex_lock(&vmpr->events_lock);
list_add(&ev->node, &vmpr->events);
mutex_unlock(&vmpr->events_lock);
+ ret = 0;
out:
kfree(spec_orig);
return ret;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 961401c..ee4eecc 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -46,9 +46,11 @@
#include <linux/delayacct.h>
#include <linux/sysctl.h>
#include <linux/oom.h>
+#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/printk.h>
#include <linux/dax.h>
+#include <linux/psi.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -126,6 +128,9 @@
unsigned int file_taken;
unsigned int taken;
} nr;
+
+ /* for recording the reclaimed slab by now */
+ struct reclaim_state reclaim_state;
};
#ifdef ARCH_HAS_PREFETCH
@@ -166,11 +171,22 @@
*/
unsigned long vm_total_pages;
+static void set_task_reclaim_state(struct task_struct *task,
+ struct reclaim_state *rs)
+{
+ /* Check for an overwrite */
+ WARN_ON_ONCE(rs && task->reclaim_state);
+
+ /* Check for the nulling of an already-nulled member */
+ WARN_ON_ONCE(!rs && !task->reclaim_state);
+
+ task->reclaim_state = rs;
+}
+
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
-#ifdef CONFIG_MEMCG_KMEM
-
+#ifdef CONFIG_MEMCG
/*
* We allow subsystems to populate their shrinker-related
* LRU lists before register_shrinker_prepared() is called
@@ -222,18 +238,7 @@
idr_remove(&shrinker_idr, id);
up_write(&shrinker_rwsem);
}
-#else /* CONFIG_MEMCG_KMEM */
-static int prealloc_memcg_shrinker(struct shrinker *shrinker)
-{
- return 0;
-}
-static void unregister_memcg_shrinker(struct shrinker *shrinker)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
-#ifdef CONFIG_MEMCG
static bool global_reclaim(struct scan_control *sc)
{
return !sc->target_mem_cgroup;
@@ -288,6 +293,15 @@
}
#else
+static int prealloc_memcg_shrinker(struct shrinker *shrinker)
+{
+ return 0;
+}
+
+static void unregister_memcg_shrinker(struct shrinker *shrinker)
+{
+}
+
static bool global_reclaim(struct scan_control *sc)
{
return true;
@@ -337,12 +351,13 @@
*/
unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx)
{
- unsigned long lru_size;
+ unsigned long lru_size = 0;
int zid;
- if (!mem_cgroup_disabled())
- lru_size = mem_cgroup_get_lru_size(lruvec, lru);
- else
+ if (!mem_cgroup_disabled()) {
+ for (zid = 0; zid < MAX_NR_ZONES; zid++)
+ lru_size += mem_cgroup_get_zone_lru_size(lruvec, lru, zid);
+ } else
lru_size = node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru);
for (zid = zone_idx + 1; zid < MAX_NR_ZONES; zid++) {
@@ -369,7 +384,7 @@
*/
int prealloc_shrinker(struct shrinker *shrinker)
{
- size_t size = sizeof(*shrinker->nr_deferred);
+ unsigned int size = sizeof(*shrinker->nr_deferred);
if (shrinker->flags & SHRINKER_NUMA_AWARE)
size *= nr_node_ids;
@@ -473,23 +488,22 @@
nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
total_scan = nr;
- delta = freeable >> priority;
- delta *= 4;
- do_div(delta, shrinker->seeks);
-
- /*
- * Make sure we apply some minimal pressure on default priority
- * even on small cgroups. Stale objects are not only consuming memory
- * by themselves, but can also hold a reference to a dying cgroup,
- * preventing it from being reclaimed. A dying cgroup with all
- * corresponding structures like per-cpu stats and kmem caches
- * can be really big, so it may lead to a significant waste of memory.
- */
- delta = max_t(unsigned long long, delta, min(freeable, batch_size));
+ if (shrinker->seeks) {
+ delta = freeable >> priority;
+ delta *= 4;
+ do_div(delta, shrinker->seeks);
+ } else {
+ /*
+ * These objects don't require any IO to create. Trim
+ * them aggressively under memory pressure to keep
+ * them from causing refetches in the IO caches.
+ */
+ delta = freeable / 2;
+ }
total_scan += delta;
if (total_scan < 0) {
- pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n",
+ pr_err("shrink_slab: %pS negative objects to delete nr=%ld\n",
shrinker->scan_objects, total_scan);
total_scan = freeable;
next_deferred = nr;
@@ -575,7 +589,7 @@
return freed;
}
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
@@ -583,7 +597,7 @@
unsigned long ret, freed = 0;
int i;
- if (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))
+ if (!mem_cgroup_online(memcg))
return 0;
if (!down_read_trylock(&shrinker_rwsem))
@@ -609,6 +623,11 @@
continue;
}
+ /* Call non-slab shrinkers even though kmem is disabled */
+ if (!memcg_kmem_enabled() &&
+ !(shrinker->flags & SHRINKER_NONSLAB))
+ continue;
+
ret = do_shrink_slab(&sc, shrinker, priority);
if (ret == SHRINK_EMPTY) {
clear_bit(i, map->map);
@@ -645,13 +664,13 @@
up_read(&shrinker_rwsem);
return freed;
}
-#else /* CONFIG_MEMCG_KMEM */
+#else /* CONFIG_MEMCG */
static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid,
struct mem_cgroup *memcg, int priority)
{
return 0;
}
-#endif /* CONFIG_MEMCG_KMEM */
+#endif /* CONFIG_MEMCG */
/**
* shrink_slab - shrink slab caches
@@ -680,7 +699,14 @@
unsigned long ret, freed = 0;
struct shrinker *shrinker;
- if (!mem_cgroup_is_root(memcg))
+ /*
+ * The root memcg might be allocated even though memcg is disabled
+ * via "cgroup_disable=memory" boot parameter. This could make
+ * mem_cgroup_is_root() return false, then just run memcg slab
+ * shrink, but skip global shrink. This may result in premature
+ * oom.
+ */
+ if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem))
@@ -741,12 +767,12 @@
{
/*
* A freeable page cache page is referenced only by the caller
- * that isolated the page, the page cache radix tree and
- * optional buffer heads at page->private.
+ * that isolated the page, the page cache and optional buffer
+ * heads at page->private.
*/
- int radix_pins = PageTransHuge(page) && PageSwapCache(page) ?
+ int page_cache_pins = PageTransHuge(page) && PageSwapCache(page) ?
HPAGE_PMD_NR : 1;
- return page_count(page) - page_has_private(page) == 1 + radix_pins;
+ return page_count(page) - page_has_private(page) == 1 + page_cache_pins;
}
static int may_write_to_inode(struct inode *inode, struct scan_control *sc)
@@ -907,10 +933,7 @@
* Note that if SetPageDirty is always performed via set_page_dirty,
* and thus under the i_pages lock, then this ordering is not required.
*/
- if (unlikely(PageTransHuge(page)) && PageSwapCache(page))
- refcount = 1 + HPAGE_PMD_NR;
- else
- refcount = 2;
+ refcount = 1 + compound_nr(page);
if (!page_ref_freeze(page, refcount))
goto cannot_free;
/* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */
@@ -922,7 +945,7 @@
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page);
+ __delete_from_swap_cache(page, swap);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
@@ -948,7 +971,7 @@
*/
if (reclaimed && page_is_file_cache(page) &&
!mapping_exiting(mapping) && !dax_mapping(mapping))
- shadow = workingset_eviction(mapping, page);
+ shadow = workingset_eviction(page);
__delete_from_page_cache(page, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
@@ -1098,28 +1121,23 @@
struct scan_control *sc,
enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
- bool force_reclaim)
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- int pgactivate = 0;
- unsigned nr_unqueued_dirty = 0;
- unsigned nr_dirty = 0;
- unsigned nr_congested = 0;
unsigned nr_reclaimed = 0;
- unsigned nr_writeback = 0;
- unsigned nr_immediate = 0;
- unsigned nr_ref_keep = 0;
- unsigned nr_unmap_fail = 0;
+ unsigned pgactivate = 0;
+ memset(stat, 0, sizeof(*stat));
cond_resched();
while (!list_empty(page_list)) {
struct address_space *mapping;
struct page *page;
int may_enter_fs;
- enum page_references references = PAGEREF_RECLAIM_CLEAN;
+ enum page_references references = PAGEREF_RECLAIM;
bool dirty, writeback;
+ unsigned int nr_pages;
cond_resched();
@@ -1131,7 +1149,10 @@
VM_BUG_ON_PAGE(PageActive(page), page);
- sc->nr_scanned++;
+ nr_pages = compound_nr(page);
+
+ /* Account the number of base pages even for a THP */
+ sc->nr_scanned += nr_pages;
if (unlikely(!page_evictable(page)))
goto activate_locked;
@@ -1139,11 +1160,6 @@
if (!sc->may_unmap && page_mapped(page))
goto keep_locked;
- /* Double the slab pressure for mapped and swapcache pages */
- if ((page_mapped(page) || PageSwapCache(page)) &&
- !(PageAnon(page) && !PageSwapBacked(page)))
- sc->nr_scanned++;
-
may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
@@ -1155,10 +1171,10 @@
*/
page_check_dirty_writeback(page, &dirty, &writeback);
if (dirty || writeback)
- nr_dirty++;
+ stat->nr_dirty++;
if (dirty && !writeback)
- nr_unqueued_dirty++;
+ stat->nr_unqueued_dirty++;
/*
* Treat this page as congested if the underlying BDI is or if
@@ -1170,7 +1186,7 @@
if (((dirty || writeback) && mapping &&
inode_write_congested(mapping->host)) ||
(writeback && PageReclaim(page)))
- nr_congested++;
+ stat->nr_congested++;
/*
* If a page at the tail of the LRU is under writeback, there
@@ -1219,7 +1235,7 @@
if (current_is_kswapd() &&
PageReclaim(page) &&
test_bit(PGDAT_WRITEBACK, &pgdat->flags)) {
- nr_immediate++;
+ stat->nr_immediate++;
goto activate_locked;
/* Case 2 above */
@@ -1237,7 +1253,7 @@
* and it's also appropriate in global reclaim.
*/
SetPageReclaim(page);
- nr_writeback++;
+ stat->nr_writeback++;
goto activate_locked;
/* Case 3 above */
@@ -1250,14 +1266,14 @@
}
}
- if (!force_reclaim)
+ if (!ignore_references)
references = page_check_references(page, sc);
switch (references) {
case PAGEREF_ACTIVATE:
goto activate_locked;
case PAGEREF_KEEP:
- nr_ref_keep++;
+ stat->nr_ref_keep += nr_pages;
goto keep_locked;
case PAGEREF_RECLAIM:
case PAGEREF_RECLAIM_CLEAN:
@@ -1289,7 +1305,7 @@
}
if (!add_to_swap(page)) {
if (!PageTransHuge(page))
- goto activate_locked;
+ goto activate_locked_split;
/* Fallback to swap normal pages */
if (split_huge_page_to_list(page,
page_list))
@@ -1298,7 +1314,7 @@
count_vm_event(THP_SWPOUT_FALLBACK);
#endif
if (!add_to_swap(page))
- goto activate_locked;
+ goto activate_locked_split;
}
may_enter_fs = 1;
@@ -1313,6 +1329,18 @@
}
/*
+ * THP may get split above, need to subtract tail pages and update
+ * nr_pages to avoid accounting tail pages twice.
+ *
+ * The tail pages that are added into swap cache successfully
+ * reach here.
+ */
+ if ((nr_pages > 1) && !PageTransHuge(page)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
+
+ /*
* The page is mapped into the page tables of one or more
* processes. Try to unmap it here.
*/
@@ -1322,7 +1350,7 @@
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
if (!try_to_unmap(page, flags)) {
- nr_unmap_fail++;
+ stat->nr_unmap_fail += nr_pages;
goto activate_locked;
}
}
@@ -1446,28 +1474,34 @@
count_memcg_page_event(page, PGLAZYFREED);
} else if (!mapping || !__remove_mapping(mapping, page, true))
goto keep_locked;
- /*
- * At this point, we have no other references and there is
- * no way to pick any more up (removed from LRU, removed
- * from pagecache). Can use non-atomic bitops now (and
- * we obviously don't have to worry about waking up a process
- * waiting on the page lock, because there are no references.
- */
- __ClearPageLocked(page);
+
+ unlock_page(page);
free_it:
- nr_reclaimed++;
+ /*
+ * THP may get swapped out as a whole, need to account
+ * all base pages.
+ */
+ nr_reclaimed += nr_pages;
/*
* Is there need to periodically free_page_list? It would
* appear not as the counts should be low
*/
- if (unlikely(PageTransHuge(page))) {
- mem_cgroup_uncharge(page);
+ if (unlikely(PageTransHuge(page)))
(*get_compound_page_dtor(page))(page);
- } else
+ else
list_add(&page->lru, &free_pages);
continue;
+activate_locked_split:
+ /*
+ * The tail pages that failed to be added into swap cache
+ * reach here. Fixup nr_scanned and nr_pages.
+ */
+ if (nr_pages > 1) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
activate_locked:
/* Not a candidate for swapping, so reclaim swap space. */
if (PageSwapCache(page) && (mem_cgroup_swap_full(page) ||
@@ -1475,8 +1509,9 @@
try_to_free_swap(page);
VM_BUG_ON_PAGE(PageActive(page), page);
if (!PageMlocked(page)) {
+ int type = page_is_file_cache(page);
SetPageActive(page);
- pgactivate++;
+ stat->nr_activate[type] += nr_pages;
count_memcg_page_event(page, PGACTIVATE);
}
keep_locked:
@@ -1486,6 +1521,8 @@
VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
}
+ pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
+
mem_cgroup_uncharge_list(&free_pages);
try_to_unmap_flush();
free_unref_page_list(&free_pages);
@@ -1493,16 +1530,6 @@
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
- if (stat) {
- stat->nr_dirty = nr_dirty;
- stat->nr_congested = nr_congested;
- stat->nr_unqueued_dirty = nr_unqueued_dirty;
- stat->nr_writeback = nr_writeback;
- stat->nr_immediate = nr_immediate;
- stat->nr_activate = pgactivate;
- stat->nr_ref_keep = nr_ref_keep;
- stat->nr_unmap_fail = nr_unmap_fail;
- }
return nr_reclaimed;
}
@@ -1514,20 +1541,21 @@
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
+ struct reclaim_stat dummy_stat;
unsigned long ret;
struct page *page, *next;
LIST_HEAD(clean_pages);
list_for_each_entry_safe(page, next, page_list, lru) {
if (page_is_file_cache(page) && !PageDirty(page) &&
- !__PageMovable(page)) {
+ !__PageMovable(page) && !PageUnevictable(page)) {
ClearPageActive(page);
list_move(&page->lru, &clean_pages);
}
}
ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, NULL, true);
+ TTU_IGNORE_ACCESS, &dummy_stat, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
return ret;
@@ -1632,8 +1660,8 @@
}
-/*
- * zone_lru_lock is heavily contended. Some of the functions that
+/**
+ * pgdat->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
@@ -1655,7 +1683,7 @@
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct lruvec *lruvec, struct list_head *dst,
unsigned long *nr_scanned, struct scan_control *sc,
- isolate_mode_t mode, enum lru_list lru)
+ enum lru_list lru)
{
struct list_head *src = &lruvec->lists[lru];
unsigned long nr_taken = 0;
@@ -1664,11 +1692,11 @@
unsigned long skipped = 0;
unsigned long scan, total_scan, nr_pages;
LIST_HEAD(pages_skipped);
+ isolate_mode_t mode = (sc->may_unmap ? 0 : ISOLATE_UNMAPPED);
+ total_scan = 0;
scan = 0;
- for (total_scan = 0;
- scan < nr_to_scan && nr_taken < nr_to_scan && !list_empty(src);
- total_scan++) {
+ while (scan < nr_to_scan && !list_empty(src)) {
struct page *page;
page = lru_to_page(src);
@@ -1676,9 +1704,12 @@
VM_BUG_ON_PAGE(!PageLRU(page), page);
+ nr_pages = compound_nr(page);
+ total_scan += nr_pages;
+
if (page_zonenum(page) > sc->reclaim_idx) {
list_move(&page->lru, &pages_skipped);
- nr_skipped[page_zonenum(page)]++;
+ nr_skipped[page_zonenum(page)] += nr_pages;
continue;
}
@@ -1687,11 +1718,14 @@
* return with no isolated pages if the LRU mostly contains
* ineligible pages. This causes the VM to not reclaim any
* pages, triggering a premature OOM.
+ *
+ * Account all tail pages of THP. This would not cause
+ * premature OOM since __isolate_lru_page() returns -EBUSY
+ * only when the page is being freed somewhere else.
*/
- scan++;
+ scan += nr_pages;
switch (__isolate_lru_page(page, mode)) {
case 0:
- nr_pages = hpage_nr_pages(page);
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
@@ -1767,11 +1801,11 @@
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
- struct zone *zone = page_zone(page);
+ pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
- spin_lock_irq(zone_lru_lock(zone));
- lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
+ spin_lock_irq(&pgdat->lru_lock);
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
if (PageLRU(page)) {
int lru = page_lru(page);
get_page(page);
@@ -1779,7 +1813,7 @@
del_page_from_lru_list(page, lruvec, lru);
ret = 0;
}
- spin_unlock_irq(zone_lru_lock(zone));
+ spin_unlock_irq(&pgdat->lru_lock);
}
return ret;
}
@@ -1821,40 +1855,54 @@
return isolated > inactive;
}
-static noinline_for_stack void
-putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
+/*
+ * This moves pages from @list to corresponding LRU list.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes, from rmap.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold pgdat->lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()) so we
+ * should drop pgdat->lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ *
+ * Returns the number of pages moved to the given lruvec.
+ */
+
+static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
+ struct list_head *list)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
+ int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
+ struct page *page;
+ enum lru_list lru;
- /*
- * Put back any unfreeable pages.
- */
- while (!list_empty(page_list)) {
- struct page *page = lru_to_page(page_list);
- int lru;
-
+ while (!list_empty(list)) {
+ page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
- list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
+ list_del(&page->lru);
spin_unlock_irq(&pgdat->lru_lock);
putback_lru_page(page);
spin_lock_irq(&pgdat->lru_lock);
continue;
}
-
lruvec = mem_cgroup_page_lruvec(page, pgdat);
SetPageLRU(page);
lru = page_lru(page);
- add_page_to_lru_list(page, lruvec, lru);
- if (is_active_lru(lru)) {
- int file = is_file_lru(lru);
- int numpages = hpage_nr_pages(page);
- reclaim_stat->recent_rotated[file] += numpages;
- }
+ nr_pages = hpage_nr_pages(page);
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_move(&page->lru, &lruvec->lists[lru]);
+
if (put_page_testzero(page)) {
__ClearPageLRU(page);
__ClearPageActive(page);
@@ -1862,18 +1910,21 @@
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
(*get_compound_page_dtor(page))(page);
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
+ } else {
+ nr_moved += nr_pages;
}
}
/*
* To save our caller's stack, now use input list for pages to free.
*/
- list_splice(&pages_to_free, page_list);
+ list_splice(&pages_to_free, list);
+
+ return nr_moved;
}
/*
@@ -1901,9 +1952,9 @@
unsigned long nr_scanned;
unsigned long nr_reclaimed = 0;
unsigned long nr_taken;
- struct reclaim_stat stat = {};
- isolate_mode_t isolate_mode = 0;
+ struct reclaim_stat stat;
int file = is_file_lru(lru);
+ enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
@@ -1923,28 +1974,18 @@
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_KSWAPD, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_KSWAPD,
- nr_scanned);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSCAN_DIRECT, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGSCAN_DIRECT,
- nr_scanned);
- }
+ item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1955,19 +1996,14 @@
spin_lock_irq(&pgdat->lru_lock);
- if (current_is_kswapd()) {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_KSWAPD, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_KSWAPD,
- nr_reclaimed);
- } else {
- if (global_reclaim(sc))
- __count_vm_events(PGSTEAL_DIRECT, nr_reclaimed);
- count_memcg_events(lruvec_memcg(lruvec), PGSTEAL_DIRECT,
- nr_reclaimed);
- }
+ item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ if (global_reclaim(sc))
+ __count_vm_events(item, nr_reclaimed);
+ __count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
+ reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
+ reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
- putback_inactive_pages(lruvec, &page_list);
+ move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
@@ -2004,73 +2040,6 @@
return nr_reclaimed;
}
-/*
- * This moves pages from the active list to the inactive list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
- *
- * Returns the number of pages moved to the given lru.
- */
-
-static unsigned move_active_pages_to_lru(struct lruvec *lruvec,
- struct list_head *list,
- struct list_head *pages_to_free,
- enum lru_list lru)
-{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct page *page;
- int nr_pages;
- int nr_moved = 0;
-
- while (!list_empty(list)) {
- page = lru_to_page(list);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
-
- nr_pages = hpage_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
- __ClearPageLRU(page);
- __ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
-
- if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge(page);
- (*get_compound_page_dtor(page))(page);
- spin_lock_irq(&pgdat->lru_lock);
- } else
- list_add(&page->lru, pages_to_free);
- } else {
- nr_moved += nr_pages;
- }
- }
-
- if (!is_active_lru(lru)) {
- __count_vm_events(PGDEACTIVATE, nr_moved);
- count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_moved);
- }
-
- return nr_moved;
-}
-
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2086,25 +2055,21 @@
struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
- isolate_mode_t isolate_mode = 0;
int file = is_file_lru(lru);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
lru_add_drain();
- if (!sc->may_unmap)
- isolate_mode |= ISOLATE_UNMAPPED;
-
spin_lock_irq(&pgdat->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
- &nr_scanned, sc, isolate_mode, lru);
+ &nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
reclaim_stat->recent_scanned[file] += nr_taken;
__count_vm_events(PGREFILL, nr_scanned);
- count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
+ __count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2145,6 +2110,7 @@
}
ClearPageActive(page); /* we are de-activating */
+ SetPageWorkingset(page);
list_add(&page->lru, &l_inactive);
}
@@ -2160,17 +2126,79 @@
*/
reclaim_stat->recent_rotated[file] += nr_rotated;
- nr_activate = move_active_pages_to_lru(lruvec, &l_active, &l_hold, lru);
- nr_deactivate = move_active_pages_to_lru(lruvec, &l_inactive, &l_hold, lru - LRU_ACTIVE);
+ nr_activate = move_pages_to_lru(lruvec, &l_active);
+ nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
+ /* Keep all free pages in l_active list */
+ list_splice(&l_inactive, &l_active);
+
+ __count_vm_events(PGDEACTIVATE, nr_deactivate);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
+
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&pgdat->lru_lock);
- mem_cgroup_uncharge_list(&l_hold);
- free_unref_page_list(&l_hold);
+ mem_cgroup_uncharge_list(&l_active);
+ free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
nr_deactivate, nr_rotated, sc->priority, file);
}
+unsigned long reclaim_pages(struct list_head *page_list)
+{
+ int nid = -1;
+ unsigned long nr_reclaimed = 0;
+ LIST_HEAD(node_page_list);
+ struct reclaim_stat dummy_stat;
+ struct page *page;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .priority = DEF_PRIORITY,
+ .may_writepage = 1,
+ .may_unmap = 1,
+ .may_swap = 1,
+ };
+
+ while (!list_empty(page_list)) {
+ page = lru_to_page(page_list);
+ if (nid == -1) {
+ nid = page_to_nid(page);
+ INIT_LIST_HEAD(&node_page_list);
+ }
+
+ if (nid == page_to_nid(page)) {
+ ClearPageActive(page);
+ list_move(&page->lru, &node_page_list);
+ continue;
+ }
+
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+
+ nid = -1;
+ }
+
+ if (!list_empty(&node_page_list)) {
+ nr_reclaimed += shrink_page_list(&node_page_list,
+ NODE_DATA(nid),
+ &sc, 0,
+ &dummy_stat, false);
+ while (!list_empty(&node_page_list)) {
+ page = lru_to_page(&node_page_list);
+ list_del(&page->lru);
+ putback_lru_page(page);
+ }
+ }
+
+ return nr_reclaimed;
+}
+
/*
* The inactive anon list should be small enough that the VM never has
* to do too much work.
@@ -2200,8 +2228,7 @@
* 10TB 320 32GB
*/
static bool inactive_list_is_low(struct lruvec *lruvec, bool file,
- struct mem_cgroup *memcg,
- struct scan_control *sc, bool actual_reclaim)
+ struct scan_control *sc, bool trace)
{
enum lru_list active_lru = file * LRU_FILE + LRU_ACTIVE;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -2221,17 +2248,13 @@
inactive = lruvec_lru_size(lruvec, inactive_lru, sc->reclaim_idx);
active = lruvec_lru_size(lruvec, active_lru, sc->reclaim_idx);
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
/*
* When refaults are being observed, it means a new workingset
* is being established. Disable active list protection to get
* rid of the stale workingset quickly.
*/
- if (file && actual_reclaim && lruvec->refaults != refaults) {
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
+ if (file && lruvec->refaults != refaults) {
inactive_ratio = 0;
} else {
gb = (inactive + active) >> (30 - PAGE_SHIFT);
@@ -2241,7 +2264,7 @@
inactive_ratio = 1;
}
- if (actual_reclaim)
+ if (trace)
trace_mm_vmscan_inactive_list_is_low(pgdat->node_id, sc->reclaim_idx,
lruvec_lru_size(lruvec, inactive_lru, MAX_NR_ZONES), inactive,
lruvec_lru_size(lruvec, active_lru, MAX_NR_ZONES), active,
@@ -2251,12 +2274,10 @@
}
static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
- struct lruvec *lruvec, struct mem_cgroup *memcg,
- struct scan_control *sc)
+ struct lruvec *lruvec, struct scan_control *sc)
{
if (is_active_lru(lru)) {
- if (inactive_list_is_low(lruvec, is_file_lru(lru),
- memcg, sc, true))
+ if (inactive_list_is_low(lruvec, is_file_lru(lru), sc, true))
shrink_active_list(nr_to_scan, lruvec, sc, lru);
return 0;
}
@@ -2356,7 +2377,7 @@
* anonymous pages on the LRU in eligible zones.
* Otherwise, the small LRU gets thrashed.
*/
- if (!inactive_list_is_low(lruvec, false, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, false, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, sc->reclaim_idx)
>> sc->priority) {
scan_balance = SCAN_ANON;
@@ -2374,7 +2395,7 @@
* lruvec even if it has plenty of old anonymous pages unless the
* system is under heavy pressure.
*/
- if (!inactive_list_is_low(lruvec, true, memcg, sc, false) &&
+ if (!inactive_list_is_low(lruvec, true, sc, false) &&
lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, sc->reclaim_idx) >> sc->priority) {
scan_balance = SCAN_FILE;
goto out;
@@ -2436,17 +2457,70 @@
*lru_pages = 0;
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
- unsigned long size;
+ unsigned long lruvec_size;
unsigned long scan;
+ unsigned long protection;
- size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- scan = size >> sc->priority;
+ lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
+ protection = mem_cgroup_protection(memcg,
+ sc->memcg_low_reclaim);
+
+ if (protection) {
+ /*
+ * Scale a cgroup's reclaim pressure by proportioning
+ * its current usage to its memory.low or memory.min
+ * setting.
+ *
+ * This is important, as otherwise scanning aggression
+ * becomes extremely binary -- from nothing as we
+ * approach the memory protection threshold, to totally
+ * nominal as we exceed it. This results in requiring
+ * setting extremely liberal protection thresholds. It
+ * also means we simply get no protection at all if we
+ * set it too low, which is not ideal.
+ *
+ * If there is any protection in place, we reduce scan
+ * pressure by how much of the total memory used is
+ * within protection thresholds.
+ *
+ * There is one special case: in the first reclaim pass,
+ * we skip over all groups that are within their low
+ * protection. If that fails to reclaim enough pages to
+ * satisfy the reclaim goal, we come back and override
+ * the best-effort low protection. However, we still
+ * ideally want to honor how well-behaved groups are in
+ * that case instead of simply punishing them all
+ * equally. As such, we reclaim them based on how much
+ * memory they are using, reducing the scan pressure
+ * again by how much of the total memory used is under
+ * hard protection.
+ */
+ unsigned long cgroup_size = mem_cgroup_size(memcg);
+
+ /* Avoid TOCTOU with earlier protection check */
+ cgroup_size = max(cgroup_size, protection);
+
+ scan = lruvec_size - lruvec_size * protection /
+ cgroup_size;
+
+ /*
+ * Minimally target SWAP_CLUSTER_MAX pages to keep
+ * reclaim moving forwards, avoiding decrementing
+ * sc->priority further than desirable.
+ */
+ scan = max(scan, SWAP_CLUSTER_MAX);
+ } else {
+ scan = lruvec_size;
+ }
+
+ scan >>= sc->priority;
+
/*
* If the cgroup's already been deleted, make sure to
* scrape out the remaining cache.
*/
if (!scan && !mem_cgroup_online(memcg))
- scan = min(size, SWAP_CLUSTER_MAX);
+ scan = min(lruvec_size, SWAP_CLUSTER_MAX);
switch (scan_balance) {
case SCAN_EQUAL:
@@ -2466,7 +2540,7 @@
case SCAN_ANON:
/* Scan one type exclusively */
if ((scan_balance == SCAN_FILE) != file) {
- size = 0;
+ lruvec_size = 0;
scan = 0;
}
break;
@@ -2475,7 +2549,7 @@
BUG();
}
- *lru_pages += size;
+ *lru_pages += lruvec_size;
nr[lru] = scan;
}
}
@@ -2527,7 +2601,7 @@
nr[lru] -= nr_to_scan;
nr_reclaimed += shrink_list(lru, nr_to_scan,
- lruvec, memcg, sc);
+ lruvec, sc);
}
}
@@ -2594,7 +2668,7 @@
* Even if we did not try to evict anon pages at all, we want to
* rebalance the anon lru active/inactive ratio.
*/
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
}
@@ -2619,7 +2693,6 @@
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
unsigned long nr_reclaimed,
- unsigned long nr_scanned,
struct scan_control *sc)
{
unsigned long pages_for_compaction;
@@ -2630,40 +2703,18 @@
if (!in_reclaim_compaction(sc))
return false;
- /* Consider stopping depending on scan and reclaim activity */
- if (sc->gfp_mask & __GFP_RETRY_MAYFAIL) {
- /*
- * For __GFP_RETRY_MAYFAIL allocations, stop reclaiming if the
- * full LRU list has been scanned and we are still failing
- * to reclaim pages. This full LRU scan is potentially
- * expensive but a __GFP_RETRY_MAYFAIL caller really wants to succeed
- */
- if (!nr_reclaimed && !nr_scanned)
- return false;
- } else {
- /*
- * For non-__GFP_RETRY_MAYFAIL allocations which can presumably
- * fail without consequence, stop if we failed to reclaim
- * any pages from the last SWAP_CLUSTER_MAX number of
- * pages that were scanned. This will return to the
- * caller faster at the risk reclaim/compaction and
- * the resulting allocation attempt fails
- */
- if (!nr_reclaimed)
- return false;
- }
-
/*
- * If we have not reclaimed enough pages for compaction and the
- * inactive lists are large enough, continue reclaiming
+ * Stop if we failed to reclaim any pages from the last SWAP_CLUSTER_MAX
+ * number of pages that were scanned. This will return to the caller
+ * with the risk that reclaim/compaction and the resulting allocation
+ * attempt fail. In the past we have tried harder for __GFP_RETRY_MAYFAIL
+ * allocations through requiring that the full LRU list has been scanned
+ * first, by assuming that zero delta of sc->nr_scanned means full LRU
+ * scan, but that approximation was wrong, and there were corner cases
+ * where always a non-zero amount of pages were scanned.
*/
- pages_for_compaction = compact_gap(sc->order);
- inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
- if (get_nr_swap_pages() > 0)
- inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
- if (sc->nr_reclaimed < pages_for_compaction &&
- inactive_lru_pages > pages_for_compaction)
- return true;
+ if (!nr_reclaimed)
+ return false;
/* If compaction would go ahead or the allocation would succeed, stop */
for (z = 0; z <= sc->reclaim_idx; z++) {
@@ -2680,7 +2731,17 @@
;
}
}
- return true;
+
+ /*
+ * If we have not reclaimed enough pages for compaction and the
+ * inactive lists are large enough, continue reclaiming
+ */
+ pages_for_compaction = compact_gap(sc->order);
+ inactive_lru_pages = node_page_state(pgdat, NR_INACTIVE_FILE);
+ if (get_nr_swap_pages() > 0)
+ inactive_lru_pages += node_page_state(pgdat, NR_INACTIVE_ANON);
+
+ return inactive_lru_pages > pages_for_compaction;
}
static bool pgdat_memcg_congested(pg_data_t *pgdat, struct mem_cgroup *memcg)
@@ -2697,10 +2758,6 @@
do {
struct mem_cgroup *root = sc->target_mem_cgroup;
- struct mem_cgroup_reclaim_cookie reclaim = {
- .pgdat = pgdat,
- .priority = sc->priority,
- };
unsigned long node_lru_pages = 0;
struct mem_cgroup *memcg;
@@ -2709,7 +2766,7 @@
nr_reclaimed = sc->nr_reclaimed;
nr_scanned = sc->nr_scanned;
- memcg = mem_cgroup_iter(root, NULL, &reclaim);
+ memcg = mem_cgroup_iter(root, NULL, NULL);
do {
unsigned long lru_pages;
unsigned long reclaimed;
@@ -2736,6 +2793,13 @@
memcg_memory_event(memcg, MEMCG_LOW);
break;
case MEMCG_PROT_NONE:
+ /*
+ * All protection thresholds breached. We may
+ * still choose to vary the scan pressure
+ * applied based on how much the cgroup in
+ * question has exceeded its protection
+ * thresholds (see get_scan_count).
+ */
break;
}
@@ -2744,30 +2808,15 @@
shrink_node_memcg(pgdat, memcg, sc, &lru_pages);
node_lru_pages += lru_pages;
- shrink_slab(sc->gfp_mask, pgdat->node_id,
- memcg, sc->priority);
+ shrink_slab(sc->gfp_mask, pgdat->node_id, memcg,
+ sc->priority);
/* Record the group's reclaim efficiency */
vmpressure(sc->gfp_mask, memcg, false,
sc->nr_scanned - scanned,
sc->nr_reclaimed - reclaimed);
- /*
- * Direct reclaim and kswapd have to scan all memory
- * cgroups to fulfill the overall scan target for the
- * node.
- *
- * Limit reclaim, on the other hand, only cares about
- * nr_to_reclaim pages to be reclaimed and it will
- * retry with decreasing priority if one round over the
- * whole hierarchy is not sufficient.
- */
- if (!global_reclaim(sc) &&
- sc->nr_reclaimed >= sc->nr_to_reclaim) {
- mem_cgroup_iter_break(root, memcg);
- break;
- }
- } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+ } while ((memcg = mem_cgroup_iter(root, memcg, NULL)));
if (reclaim_state) {
sc->nr_reclaimed += reclaim_state->reclaimed_slab;
@@ -2844,7 +2893,7 @@
wait_iff_congested(BLK_RW_ASYNC, HZ/10);
} while (should_continue_reclaim(pgdat, sc->nr_reclaimed - nr_reclaimed,
- sc->nr_scanned - nr_scanned, sc));
+ sc));
/*
* Kswapd gives up on balancing particular nodes after too
@@ -2992,12 +3041,8 @@
unsigned long refaults;
struct lruvec *lruvec;
- if (memcg)
- refaults = memcg_page_state(memcg, WORKINGSET_ACTIVATE);
- else
- refaults = node_page_state(pgdat, WORKINGSET_ACTIVATE);
-
lruvec = mem_cgroup_lruvec(pgdat, memcg);
+ refaults = lruvec_page_state_local(lruvec, WORKINGSET_ACTIVATE);
lruvec->refaults = refaults;
} while ((memcg = mem_cgroup_iter(root_memcg, memcg, NULL)));
}
@@ -3245,20 +3290,20 @@
if (throttle_direct_reclaim(sc.gfp_mask, zonelist, nodemask))
return 1;
- trace_mm_vmscan_direct_reclaim_begin(order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ trace_mm_vmscan_direct_reclaim_begin(order, sc.gfp_mask);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
#ifdef CONFIG_MEMCG
+/* Only used by soft limit reclaim. Do not reuse for anything else. */
unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat,
@@ -3274,13 +3319,13 @@
};
unsigned long lru_pages;
+ WARN_ON_ONCE(!current->reclaim_state);
+
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
trace_mm_vmscan_memcg_softlimit_reclaim_begin(sc.order,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ sc.gfp_mask);
/*
* NOTE: Although we can get the priority field, using it
@@ -3294,6 +3339,7 @@
trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
*nr_scanned = sc.nr_scanned;
+
return sc.nr_reclaimed;
}
@@ -3304,6 +3350,7 @@
{
struct zonelist *zonelist;
unsigned long nr_reclaimed;
+ unsigned long pflags;
int nid;
unsigned int noreclaim_flag;
struct scan_control sc = {
@@ -3318,6 +3365,7 @@
.may_swap = may_swap,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
/*
* Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
* take care of from where we get pages. So the node where we start the
@@ -3327,16 +3375,18 @@
zonelist = &NODE_DATA(nid)->node_zonelists[ZONELIST_FALLBACK];
- trace_mm_vmscan_memcg_reclaim_begin(0,
- sc.may_writepage,
- sc.gfp_mask,
- sc.reclaim_idx);
+ trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
+ psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
+
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+
memalloc_noreclaim_restore(noreclaim_flag);
+ psi_memstall_leave(&pflags);
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
+ set_task_reclaim_state(current, NULL);
return nr_reclaimed;
}
@@ -3354,7 +3404,7 @@
do {
struct lruvec *lruvec = mem_cgroup_lruvec(pgdat, memcg);
- if (inactive_list_is_low(lruvec, false, memcg, sc, true))
+ if (inactive_list_is_low(lruvec, false, sc, true))
shrink_active_list(SWAP_CLUSTER_MAX, lruvec,
sc, LRU_ACTIVE_ANON);
@@ -3362,6 +3412,30 @@
} while (memcg);
}
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+{
+ int i;
+ struct zone *zone;
+
+ /*
+ * Check for watermark boosts top-down as the higher zones
+ * are more likely to be boosted. Both watermarks and boosts
+ * should not be checked at the same time as reclaim would
+ * start prematurely when there is no boosting and a lower
+ * zone is balanced.
+ */
+ for (i = classzone_idx; i >= 0; i--) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ if (zone->watermark_boost)
+ return true;
+ }
+
+ return false;
+}
+
/*
* Returns true if there is an eligible zone balanced for the request order
* and classzone_idx
@@ -3372,6 +3446,10 @@
unsigned long mark = -1;
struct zone *zone;
+ /*
+ * Check watermarks bottom-up as lower zones are more likely to
+ * meet watermarks.
+ */
for (i = 0; i <= classzone_idx; i++) {
zone = pgdat->node_zones + i;
@@ -3490,7 +3568,7 @@
*
* kswapd scans the zones in the highmem->normal->dma direction. It skips
* zones which have free_pages > high_wmark_pages(zone), but once a zone is
- * found to have free_pages <= high_wmark_pages(zone), any page is that zone
+ * found to have free_pages <= high_wmark_pages(zone), any page in that zone
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
@@ -3499,23 +3577,45 @@
int i;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ unsigned long pflags;
+ unsigned long nr_boost_reclaim;
+ unsigned long zone_boosts[MAX_NR_ZONES] = { 0, };
+ bool boosted;
struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.order = order,
- .priority = DEF_PRIORITY,
- .may_writepage = !laptop_mode,
.may_unmap = 1,
- .may_swap = 1,
};
+ set_task_reclaim_state(current, &sc.reclaim_state);
+ psi_memstall_enter(&pflags);
__fs_reclaim_acquire();
count_vm_event(PAGEOUTRUN);
+ /*
+ * Account for the reclaim boost. Note that the zone boost is left in
+ * place so that parallel allocations that are near the watermark will
+ * stall or direct reclaim until kswapd is finished.
+ */
+ nr_boost_reclaim = 0;
+ for (i = 0; i <= classzone_idx; i++) {
+ zone = pgdat->node_zones + i;
+ if (!managed_zone(zone))
+ continue;
+
+ nr_boost_reclaim += zone->watermark_boost;
+ zone_boosts[i] = zone->watermark_boost;
+ }
+ boosted = nr_boost_reclaim;
+
+restart:
+ sc.priority = DEF_PRIORITY;
do {
unsigned long nr_reclaimed = sc.nr_reclaimed;
bool raise_priority = true;
+ bool balanced;
bool ret;
sc.reclaim_idx = classzone_idx;
@@ -3542,13 +3642,39 @@
}
/*
- * Only reclaim if there are no eligible zones. Note that
- * sc.reclaim_idx is not used as buffer_heads_over_limit may
- * have adjusted it.
+ * If the pgdat is imbalanced then ignore boosting and preserve
+ * the watermarks for a later time and restart. Note that the
+ * zone watermarks will be still reset at the end of balancing
+ * on the grounds that the normal reclaim should be enough to
+ * re-evaluate if boosting is required when kswapd next wakes.
*/
- if (pgdat_balanced(pgdat, sc.order, classzone_idx))
+ balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ if (!balanced && nr_boost_reclaim) {
+ nr_boost_reclaim = 0;
+ goto restart;
+ }
+
+ /*
+ * If boosting is not active then only reclaim if there are no
+ * eligible zones. Note that sc.reclaim_idx is not used as
+ * buffer_heads_over_limit may have adjusted it.
+ */
+ if (!nr_boost_reclaim && balanced)
goto out;
+ /* Limit the priority of boosting to avoid reclaim writeback */
+ if (nr_boost_reclaim && sc.priority == DEF_PRIORITY - 2)
+ raise_priority = false;
+
+ /*
+ * Do not writeback or swap pages for boosted reclaim. The
+ * intent is to relieve pressure not issue sub-optimal IO
+ * from reclaim context. If no pages are reclaimed, the
+ * reclaim will be aborted.
+ */
+ sc.may_writepage = !laptop_mode && !nr_boost_reclaim;
+ sc.may_swap = !nr_boost_reclaim;
+
/*
* Do some background aging of the anon list, to give
* pages a chance to be referenced before reclaiming. All
@@ -3600,6 +3726,16 @@
* progress in reclaiming pages
*/
nr_reclaimed = sc.nr_reclaimed - nr_reclaimed;
+ nr_boost_reclaim -= min(nr_boost_reclaim, nr_reclaimed);
+
+ /*
+ * If reclaim made no progress for a boost, stop reclaim as
+ * IO cannot be queued and it could be an infinite loop in
+ * extreme circumstances.
+ */
+ if (nr_boost_reclaim && !nr_reclaimed)
+ break;
+
if (raise_priority || !nr_reclaimed)
sc.priority--;
} while (sc.priority >= 1);
@@ -3608,8 +3744,33 @@
pgdat->kswapd_failures++;
out:
+ /* If reclaim was boosted, account for the reclaim done in this pass */
+ if (boosted) {
+ unsigned long flags;
+
+ for (i = 0; i <= classzone_idx; i++) {
+ if (!zone_boosts[i])
+ continue;
+
+ /* Increments are under the zone lock */
+ zone = pgdat->node_zones + i;
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->watermark_boost -= min(zone->watermark_boost, zone_boosts[i]);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+
+ /*
+ * As there is now likely space, wake up kcompactd to defragment
+ * pageblocks.
+ */
+ wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ }
+
snapshot_refaults(NULL, pgdat);
__fs_reclaim_release();
+ psi_memstall_leave(&pflags);
+ set_task_reclaim_state(current, NULL);
+
/*
* Return the order kswapd stopped reclaiming at as
* prepare_kswapd_sleep() takes it into account. If another caller
@@ -3620,19 +3781,18 @@
}
/*
- * pgdat->kswapd_classzone_idx is the highest zone index that a recent
- * allocation request woke kswapd for. When kswapd has not woken recently,
- * the value is MAX_NR_ZONES which is not a valid index. This compares a
- * given classzone and returns it or the highest classzone index kswapd
- * was recently woke for.
+ * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
+ * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
+ * a valid index then either kswapd runs for first time or kswapd couldn't sleep
+ * after previous reclaim attempt (node is still unbalanced). In that case
+ * return the zone index of the previous kswapd reclaim cycle.
*/
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type classzone_idx)
+ enum zone_type prev_classzone_idx)
{
if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return classzone_idx;
-
- return max(pgdat->kswapd_classzone_idx, classzone_idx);
+ return prev_classzone_idx;
+ return pgdat->kswapd_classzone_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3734,15 +3894,10 @@
unsigned int classzone_idx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
-
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
- };
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (!cpumask_empty(cpumask))
set_cpus_allowed_ptr(tsk, cpumask);
- current->reclaim_state = &reclaim_state;
/*
* Tell the memory management that we're a "memory allocator",
@@ -3773,7 +3928,7 @@
/* Read the new order and classzone_idx */
alloc_order = reclaim_order = pgdat->kswapd_order;
- classzone_idx = kswapd_classzone_idx(pgdat, 0);
+ classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
pgdat->kswapd_order = 0;
pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
@@ -3804,7 +3959,6 @@
}
tsk->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD);
- current->reclaim_state = NULL;
return 0;
}
@@ -3827,15 +3981,20 @@
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
pgdat = zone->zone_pgdat;
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat,
- classzone_idx);
+
+ if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
+ pgdat->kswapd_classzone_idx = classzone_idx;
+ else
+ pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
+ classzone_idx);
pgdat->kswapd_order = max(pgdat->kswapd_order, order);
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- pgdat_balanced(pgdat, order, classzone_idx)) {
+ (pgdat_balanced(pgdat, order, classzone_idx) &&
+ !pgdat_watermark_boosted(pgdat, classzone_idx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -3864,7 +4023,6 @@
*/
unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
{
- struct reclaim_state reclaim_state;
struct scan_control sc = {
.nr_to_reclaim = nr_to_reclaim,
.gfp_mask = GFP_HIGHUSER_MOVABLE,
@@ -3876,18 +4034,16 @@
.hibernation_mode = 1,
};
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
- struct task_struct *p = current;
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
fs_reclaim_acquire(sc.gfp_mask);
noreclaim_flag = memalloc_noreclaim_save();
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(current, &sc.reclaim_state);
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
- p->reclaim_state = NULL;
+ set_task_reclaim_state(current, NULL);
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
@@ -4052,7 +4208,6 @@
/* Minimum pages needed in order to stay on node */
const unsigned long nr_pages = 1 << order;
struct task_struct *p = current;
- struct reclaim_state reclaim_state;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -4065,6 +4220,9 @@
.reclaim_idx = gfp_zone(gfp_mask),
};
+ trace_mm_vmscan_node_reclaim_begin(pgdat->node_id, order,
+ sc.gfp_mask);
+
cond_resched();
fs_reclaim_acquire(sc.gfp_mask);
/*
@@ -4074,8 +4232,7 @@
*/
noreclaim_flag = memalloc_noreclaim_save();
p->flags |= PF_SWAPWRITE;
- reclaim_state.reclaimed_slab = 0;
- p->reclaim_state = &reclaim_state;
+ set_task_reclaim_state(p, &sc.reclaim_state);
if (node_pagecache_reclaimable(pgdat) > pgdat->min_unmapped_pages) {
/*
@@ -4087,10 +4244,13 @@
} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
}
- p->reclaim_state = NULL;
+ set_task_reclaim_state(p, NULL);
current->flags &= ~PF_SWAPWRITE;
memalloc_noreclaim_restore(noreclaim_flag);
fs_reclaim_release(sc.gfp_mask);
+
+ trace_mm_vmscan_node_reclaim_end(sc.nr_reclaimed);
+
return sc.nr_reclaimed >= nr_pages;
}
@@ -4163,17 +4323,16 @@
return ret;
}
-#ifdef CONFIG_SHMEM
/**
- * check_move_unevictable_pages - check pages for evictability and move to appropriate zone lru list
- * @pages: array of pages to check
- * @nr_pages: number of pages to check
+ * check_move_unevictable_pages - check pages for evictability and move to
+ * appropriate zone lru list
+ * @pvec: pagevec with lru pages to check
*
- * Checks pages for evictability and moves them to the appropriate lru list.
- *
- * This function is only used for SysV IPC SHM_UNLOCK.
+ * Checks pages for evictability, if an evictable page is in the unevictable
+ * lru list, moves it to the appropriate evictable lru list. This function
+ * should be only used for lru pages.
*/
-void check_move_unevictable_pages(struct page **pages, int nr_pages)
+void check_move_unevictable_pages(struct pagevec *pvec)
{
struct lruvec *lruvec;
struct pglist_data *pgdat = NULL;
@@ -4181,8 +4340,8 @@
int pgrescued = 0;
int i;
- for (i = 0; i < nr_pages; i++) {
- struct page *page = pages[i];
+ for (i = 0; i < pvec->nr; i++) {
+ struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
pgscanned++;
@@ -4214,4 +4373,4 @@
spin_unlock_irq(&pgdat->lru_lock);
}
}
-#endif /* CONFIG_SHMEM */
+EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7878da7..a822204 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* linux/mm/vmstat.c
*
@@ -227,7 +228,7 @@
* 125 1024 10 16-32 GB 9
*/
- mem = zone->managed_pages >> (27 - PAGE_SHIFT);
+ mem = zone_managed_pages(zone) >> (27 - PAGE_SHIFT);
threshold = 2 * fls(num_online_cpus()) * (1 + fls(mem));
@@ -1143,8 +1144,10 @@
"nr_slab_unreclaimable",
"nr_isolated_anon",
"nr_isolated_file",
+ "workingset_nodes",
"workingset_refault",
"workingset_activate",
+ "workingset_restore",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1155,13 +1158,15 @@
"nr_shmem",
"nr_shmem_hugepages",
"nr_shmem_pmdmapped",
+ "nr_file_hugepages",
+ "nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
"nr_unstable",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
"nr_written",
- "", /* nr_indirectly_reclaimable */
+ "nr_kernel_misc_reclaimable",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1272,13 +1277,8 @@
#endif
#endif /* CONFIG_MEMORY_BALLOON */
#ifdef CONFIG_DEBUG_TLBFLUSH
-#ifdef CONFIG_SMP
"nr_tlb_remote_flush",
"nr_tlb_remote_flush_received",
-#else
- "", /* nr_tlb_remote_flush */
- "", /* nr_tlb_remote_flush_received */
-#endif /* CONFIG_SMP */
"nr_tlb_local_flush_all",
"nr_tlb_local_flush_one",
#endif /* CONFIG_DEBUG_TLBFLUSH */
@@ -1383,12 +1383,29 @@
unsigned long freecount = 0;
struct free_area *area;
struct list_head *curr;
+ bool overflow = false;
area = &(zone->free_area[order]);
- list_for_each(curr, &area->free_list[mtype])
- freecount++;
- seq_printf(m, "%6lu ", freecount);
+ list_for_each(curr, &area->free_list[mtype]) {
+ /*
+ * Cap the free_list iteration because it might
+ * be really large and we are under a spinlock
+ * so a long time spent here could trigger a
+ * hard lockup detector. Anyway this is a
+ * debugging tool so knowing there is a handful
+ * of pages of this order should be more than
+ * sufficient.
+ */
+ if (++freecount >= 100000) {
+ overflow = true;
+ break;
+ }
+ }
+ seq_printf(m, "%s%6lu ", overflow ? ">" : "", freecount);
+ spin_unlock_irq(&zone->lock);
+ cond_resched();
+ spin_lock_irq(&zone->lock);
}
seq_putc(m, '\n');
}
@@ -1567,7 +1584,7 @@
high_wmark_pages(zone),
zone->spanned_pages,
zone->present_pages,
- zone->managed_pages);
+ zone_managed_pages(zone));
seq_printf(m,
"\n protection: (%ld",
@@ -1663,6 +1680,8 @@
stat_items_size += sizeof(struct vm_event_state);
#endif
+ BUILD_BUG_ON(stat_items_size !=
+ ARRAY_SIZE(vmstat_text) * sizeof(unsigned long));
v = kmalloc(stat_items_size, GFP_KERNEL);
m->private = v;
if (!v)
@@ -1706,10 +1725,6 @@
unsigned long *l = arg;
unsigned long off = l - (unsigned long *)m->private;
- /* Skip hidden vmstat items. */
- if (*vmstat_text[off] == '\0')
- return 0;
-
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
@@ -1827,12 +1842,13 @@
/*
* The fast way of checking if there are any vmstat diffs.
- * This works because the diffs are byte sized items.
*/
- if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS))
+ if (memchr_inv(p->vm_stat_diff, 0, NR_VM_ZONE_STAT_ITEMS *
+ sizeof(p->vm_stat_diff[0])))
return true;
#ifdef CONFIG_NUMA
- if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS))
+ if (memchr_inv(p->vm_numa_stat_diff, 0, NR_VM_NUMA_STAT_ITEMS *
+ sizeof(p->vm_numa_stat_diff[0])))
return true;
#endif
}
@@ -1973,7 +1989,7 @@
#endif
#ifdef CONFIG_PROC_FS
proc_create_seq("buddyinfo", 0444, NULL, &fragmentation_op);
- proc_create_seq("pagetypeinfo", 0444, NULL, &pagetypeinfo_op);
+ proc_create_seq("pagetypeinfo", 0400, NULL, &pagetypeinfo_op);
proc_create_seq("vmstat", 0444, NULL, &vmstat_op);
proc_create_seq("zoneinfo", 0444, NULL, &zoneinfo_op);
#endif
@@ -2120,21 +2136,14 @@
struct dentry *extfrag_debug_root;
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
- if (!extfrag_debug_root)
- return -ENOMEM;
- if (!debugfs_create_file("unusable_index", 0444,
- extfrag_debug_root, NULL, &unusable_file_ops))
- goto fail;
+ debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
+ &unusable_file_ops);
- if (!debugfs_create_file("extfrag_index", 0444,
- extfrag_debug_root, NULL, &extfrag_file_ops))
- goto fail;
+ debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
+ &extfrag_file_ops);
return 0;
-fail:
- debugfs_remove_recursive(extfrag_debug_root);
- return -ENOMEM;
}
module_init(extfrag_debug_init);
diff --git a/mm/workingset.c b/mm/workingset.c
index 4516dd7..c963831 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -121,7 +121,7 @@
* the only thing eating into inactive list space is active pages.
*
*
- * Activating refaulting pages
+ * Refaulting inactive pages
*
* All that is known about the active list is that the pages have been
* accessed more than once in the past. This means that at any given
@@ -134,6 +134,10 @@
* used less frequently than the refaulting page - or even not used at
* all anymore.
*
+ * That means if inactive cache is refaulting with a suitable refault
+ * distance, we assume the cache workingset is transitioning and put
+ * pressure on the current active list.
+ *
* If this is wrong and demotion kicks in, the pages which are truly
* used more frequently will be reactivated while the less frequently
* used once will be evicted from memory.
@@ -141,6 +145,14 @@
* But if this is right, the stale pages will be pushed out of memory
* and the used pages get to stay in cache.
*
+ * Refaulting active pages
+ *
+ * If on the other hand the refaulting pages have recently been
+ * deactivated, it means that the active list is no longer protecting
+ * actively used cache from reclaim. The cache is NOT transitioning to
+ * a different workingset; the existing workingset is thrashing in the
+ * space allocated to the page cache.
+ *
*
* Implementation
*
@@ -148,21 +160,20 @@
* and activations is maintained (node->inactive_age).
*
* On eviction, a snapshot of this counter (along with some bits to
- * identify the node) is stored in the now empty page cache radix tree
+ * identify the node) is stored in the now empty page cache
* slot of the evicted page. This is called a shadow entry.
*
* On cache misses for which there are shadow entries, an eligible
* refault distance will immediately activate the refaulting page.
*/
-#define EVICTION_SHIFT (RADIX_TREE_EXCEPTIONAL_ENTRY + \
- NODES_SHIFT + \
- MEM_CGROUP_ID_SHIFT)
+#define EVICTION_SHIFT ((BITS_PER_LONG - BITS_PER_XA_VALUE) + \
+ 1 + NODES_SHIFT + MEM_CGROUP_ID_SHIFT)
#define EVICTION_MASK (~0UL >> EVICTION_SHIFT)
/*
* Eviction timestamps need to be able to cover the full range of
- * actionable refaults. However, bits are tight in the radix tree
+ * actionable refaults. However, bits are tight in the xarray
* entry, and after storing the identifier for the lruvec there might
* not be enough left to represent every single actionable refault. In
* that case, we have to sacrifice granularity for distance, and group
@@ -170,23 +181,27 @@
*/
static unsigned int bucket_order __read_mostly;
-static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction)
+static void *pack_shadow(int memcgid, pg_data_t *pgdat, unsigned long eviction,
+ bool workingset)
{
eviction >>= bucket_order;
+ eviction &= EVICTION_MASK;
eviction = (eviction << MEM_CGROUP_ID_SHIFT) | memcgid;
eviction = (eviction << NODES_SHIFT) | pgdat->node_id;
- eviction = (eviction << RADIX_TREE_EXCEPTIONAL_SHIFT);
+ eviction = (eviction << 1) | workingset;
- return (void *)(eviction | RADIX_TREE_EXCEPTIONAL_ENTRY);
+ return xa_mk_value(eviction);
}
static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
- unsigned long *evictionp)
+ unsigned long *evictionp, bool *workingsetp)
{
- unsigned long entry = (unsigned long)shadow;
+ unsigned long entry = xa_to_value(shadow);
int memcgid, nid;
+ bool workingset;
- entry >>= RADIX_TREE_EXCEPTIONAL_SHIFT;
+ workingset = entry & 1;
+ entry >>= 1;
nid = entry & ((1UL << NODES_SHIFT) - 1);
entry >>= NODES_SHIFT;
memcgid = entry & ((1UL << MEM_CGROUP_ID_SHIFT) - 1);
@@ -195,20 +210,20 @@
*memcgidp = memcgid;
*pgdat = NODE_DATA(nid);
*evictionp = entry << bucket_order;
+ *workingsetp = workingset;
}
/**
* workingset_eviction - note the eviction of a page from memory
- * @mapping: address space the page was backing
* @page: the page being evicted
*
- * Returns a shadow entry to be stored in @mapping->i_pages in place
+ * Returns a shadow entry to be stored in @page->mapping->i_pages in place
* of the evicted @page so that a later refault can be detected.
*/
-void *workingset_eviction(struct address_space *mapping, struct page *page)
+void *workingset_eviction(struct page *page)
{
- struct mem_cgroup *memcg = page_memcg(page);
struct pglist_data *pgdat = page_pgdat(page);
+ struct mem_cgroup *memcg = page_memcg(page);
int memcgid = mem_cgroup_id(memcg);
unsigned long eviction;
struct lruvec *lruvec;
@@ -220,30 +235,30 @@
lruvec = mem_cgroup_lruvec(pgdat, memcg);
eviction = atomic_long_inc_return(&lruvec->inactive_age);
- return pack_shadow(memcgid, pgdat, eviction);
+ return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
/**
* workingset_refault - evaluate the refault of a previously evicted page
+ * @page: the freshly allocated replacement page
* @shadow: shadow entry of the evicted page
*
* Calculates and evaluates the refault distance of the previously
* evicted page in the context of the node it was allocated in.
- *
- * Returns %true if the page should be activated, %false otherwise.
*/
-bool workingset_refault(void *shadow)
+void workingset_refault(struct page *page, void *shadow)
{
unsigned long refault_distance;
+ struct pglist_data *pgdat;
unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
unsigned long refault;
- struct pglist_data *pgdat;
+ bool workingset;
int memcgid;
- unpack_shadow(shadow, &memcgid, &pgdat, &eviction);
+ unpack_shadow(shadow, &memcgid, &pgdat, &eviction, &workingset);
rcu_read_lock();
/*
@@ -263,41 +278,51 @@
* configurations instead.
*/
memcg = mem_cgroup_from_id(memcgid);
- if (!mem_cgroup_disabled() && !memcg) {
- rcu_read_unlock();
- return false;
- }
+ if (!mem_cgroup_disabled() && !memcg)
+ goto out;
lruvec = mem_cgroup_lruvec(pgdat, memcg);
refault = atomic_long_read(&lruvec->inactive_age);
active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES);
/*
- * The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases.
+ * Calculate the refault distance
*
- * There is a special case: usually, shadow entries have a
- * short lifetime and are either refaulted or reclaimed along
- * with the inode before they get too old. But it is not
- * impossible for the inactive_age to lap a shadow entry in
- * the field, which can then can result in a false small
- * refault distance, leading to a false activation should this
- * old entry actually refault again. However, earlier kernels
- * used to deactivate unconditionally with *every* reclaim
- * invocation for the longest time, so the occasional
- * inappropriate activation leading to pressure on the active
- * list is not a problem.
+ * The unsigned subtraction here gives an accurate distance
+ * across inactive_age overflows in most cases. There is a
+ * special case: usually, shadow entries have a short lifetime
+ * and are either refaulted or reclaimed along with the inode
+ * before they get too old. But it is not impossible for the
+ * inactive_age to lap a shadow entry in the field, which can
+ * then result in a false small refault distance, leading to a
+ * false activation should this old entry actually refault
+ * again. However, earlier kernels used to deactivate
+ * unconditionally with *every* reclaim invocation for the
+ * longest time, so the occasional inappropriate activation
+ * leading to pressure on the active list is not a problem.
*/
refault_distance = (refault - eviction) & EVICTION_MASK;
inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
- if (refault_distance <= active_file) {
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
- rcu_read_unlock();
- return true;
+ /*
+ * Compare the distance to the existing workingset size. We
+ * don't act on pages that couldn't stay resident even if all
+ * the memory was available to the page cache.
+ */
+ if (refault_distance > active_file)
+ goto out;
+
+ SetPageActive(page);
+ atomic_long_inc(&lruvec->inactive_age);
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+
+ /* Page was active prior to eviction */
+ if (workingset) {
+ SetPageWorkingset(page);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
}
+out:
rcu_read_unlock();
- return false;
}
/**
@@ -340,7 +365,7 @@
static struct list_lru shadow_nodes;
-void workingset_update_node(struct radix_tree_node *node)
+void workingset_update_node(struct xa_node *node)
{
/*
* Track non-empty nodes that contain only shadow entries;
@@ -350,12 +375,18 @@
* already where they should be. The list_empty() test is safe
* as node->private_list is protected by the i_pages lock.
*/
- if (node->count && node->count == node->exceptional) {
- if (list_empty(&node->private_list))
+ VM_WARN_ON_ONCE(!irqs_disabled()); /* For __inc_lruvec_page_state */
+
+ if (node->count && node->count == node->nr_values) {
+ if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODES);
+ }
} else {
- if (!list_empty(&node->private_list))
+ if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ }
}
}
@@ -364,12 +395,12 @@
{
unsigned long max_nodes;
unsigned long nodes;
- unsigned long cache;
+ unsigned long pages;
nodes = list_lru_shrink_count(&shadow_nodes, sc);
/*
- * Approximate a reasonable limit for the radix tree nodes
+ * Approximate a reasonable limit for the nodes
* containing shadow entries. We don't need to keep more
* shadow entries than possible pages on the active list,
* since refault distances bigger than that are dismissed.
@@ -384,20 +415,28 @@
* worst-case density of 1/8th. Below that, not all eligible
* refaults can be detected anymore.
*
- * On 64-bit with 7 radix_tree_nodes per page and 64 slots
+ * On 64-bit with 7 xa_nodes per page and 64 slots
* each, this will reclaim shadow entries when they consume
* ~1.8% of available memory:
*
- * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE
+ * PAGE_SIZE / xa_nodes / node_entries * 8 / PAGE_SIZE
*/
+#ifdef CONFIG_MEMCG
if (sc->memcg) {
- cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid,
- LRU_ALL_FILE);
- } else {
- cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) +
- node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE);
- }
- max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3);
+ struct lruvec *lruvec;
+ int i;
+
+ lruvec = mem_cgroup_lruvec(NODE_DATA(sc->nid), sc->memcg);
+ for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
+ pages += lruvec_page_state_local(lruvec,
+ NR_LRU_BASE + i);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
+ pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+ } else
+#endif
+ pages = node_present_pages(sc->nid);
+
+ max_nodes = pages >> (XA_CHUNK_SHIFT - 3);
if (!nodes)
return SHRINK_EMPTY;
@@ -410,11 +449,11 @@
static enum lru_status shadow_lru_isolate(struct list_head *item,
struct list_lru_one *lru,
spinlock_t *lru_lock,
- void *arg)
+ void *arg) __must_hold(lru_lock)
{
+ struct xa_node *node = container_of(item, struct xa_node, private_list);
+ XA_STATE(xas, node->array, 0);
struct address_space *mapping;
- struct radix_tree_node *node;
- unsigned int i;
int ret;
/*
@@ -422,15 +461,14 @@
* the shadow node LRU under the i_pages lock and the
* lru_lock. Because the page cache tree is emptied before
* the inode can be destroyed, holding the lru_lock pins any
- * address_space that has radix tree nodes on the LRU.
+ * address_space that has nodes on the LRU.
*
* We can then safely transition to the i_pages lock to
* pin only the address_space of the particular node we want
* to reclaim, take the node off-LRU, and drop the lru_lock.
*/
- node = container_of(item, struct radix_tree_node, private_list);
- mapping = container_of(node->root, struct address_space, i_pages);
+ mapping = container_of(node->array, struct address_space, i_pages);
/* Coming from the list, invert the lock order */
if (!xa_trylock(&mapping->i_pages)) {
@@ -440,6 +478,8 @@
}
list_lru_isolate(lru, item);
+ __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+
spin_unlock(lru_lock);
/*
@@ -447,29 +487,21 @@
* no pages, so we expect to be able to remove them all and
* delete and free the empty node afterwards.
*/
- if (WARN_ON_ONCE(!node->exceptional))
+ if (WARN_ON_ONCE(!node->nr_values))
goto out_invalid;
- if (WARN_ON_ONCE(node->count != node->exceptional))
+ if (WARN_ON_ONCE(node->count != node->nr_values))
goto out_invalid;
- for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
- if (node->slots[i]) {
- if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i])))
- goto out_invalid;
- if (WARN_ON_ONCE(!node->exceptional))
- goto out_invalid;
- if (WARN_ON_ONCE(!mapping->nrexceptional))
- goto out_invalid;
- node->slots[i] = NULL;
- node->exceptional--;
- node->count--;
- mapping->nrexceptional--;
- }
- }
- if (WARN_ON_ONCE(node->exceptional))
- goto out_invalid;
- inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM);
- __radix_tree_delete_node(&mapping->i_pages, node,
- workingset_lookup_update(mapping));
+ mapping->nrexceptional -= node->nr_values;
+ xas.xa_node = xa_parent_locked(&mapping->i_pages, node);
+ xas.xa_offset = node->offset;
+ xas.xa_shift = node->shift + XA_CHUNK_SHIFT;
+ xas_set_update(&xas, workingset_update_node);
+ /*
+ * We could store a shadow entry here which was the minimum of the
+ * shadow entries we were tracking ...
+ */
+ xas_store(&xas, NULL);
+ __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
@@ -491,7 +523,7 @@
static struct shrinker workingset_shadow_shrinker = {
.count_objects = count_shadow_nodes,
.scan_objects = scan_shadow_nodes,
- .seeks = DEFAULT_SEEKS,
+ .seeks = 0, /* ->count reports only fully expendable nodes */
.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
};
@@ -516,7 +548,7 @@
* double the initial memory by using totalram_pages as-is.
*/
timestamp_bits = BITS_PER_LONG - EVICTION_SHIFT;
- max_order = fls_long(totalram_pages - 1);
+ max_order = fls_long(totalram_pages() - 1);
if (max_order > timestamp_bits)
bucket_order = max_order - timestamp_bits;
pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n",
diff --git a/mm/z3fold.c b/mm/z3fold.c
index aee9b0b..6d3d3f6 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* z3fold.c
*
@@ -24,60 +25,24 @@
#include <linux/atomic.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
+#include <linux/page-flags.h>
+#include <linux/migrate.h>
+#include <linux/node.h>
+#include <linux/compaction.h>
#include <linux/percpu.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/fs.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>
-
-/*****************
- * Structures
-*****************/
-struct z3fold_pool;
-struct z3fold_ops {
- int (*evict)(struct z3fold_pool *pool, unsigned long handle);
-};
-
-enum buddy {
- HEADLESS = 0,
- FIRST,
- MIDDLE,
- LAST,
- BUDDIES_MAX
-};
-
-/*
- * struct z3fold_header - z3fold page metadata occupying first chunks of each
- * z3fold page, except for HEADLESS pages
- * @buddy: links the z3fold page into the relevant list in the
- * pool
- * @page_lock: per-page lock
- * @refcount: reference count for the z3fold page
- * @work: work_struct for page layout optimization
- * @pool: pointer to the pool which this page belongs to
- * @cpu: CPU which this page "belongs" to
- * @first_chunks: the size of the first buddy in chunks, 0 if free
- * @middle_chunks: the size of the middle buddy in chunks, 0 if free
- * @last_chunks: the size of the last buddy in chunks, 0 if free
- * @first_num: the starting number (for the first handle)
- */
-struct z3fold_header {
- struct list_head buddy;
- spinlock_t page_lock;
- struct kref refcount;
- struct work_struct work;
- struct z3fold_pool *pool;
- short cpu;
- unsigned short first_chunks;
- unsigned short middle_chunks;
- unsigned short last_chunks;
- unsigned short start_middle;
- unsigned short first_num:2;
-};
+#include <linux/magic.h>
/*
* NCHUNKS_ORDER determines the internal allocation granularity, effectively
@@ -100,6 +65,66 @@
#define BUDDY_MASK (0x3)
#define BUDDY_SHIFT 2
+#define SLOTS_ALIGN (0x40)
+
+/*****************
+ * Structures
+*****************/
+struct z3fold_pool;
+struct z3fold_ops {
+ int (*evict)(struct z3fold_pool *pool, unsigned long handle);
+};
+
+enum buddy {
+ HEADLESS = 0,
+ FIRST,
+ MIDDLE,
+ LAST,
+ BUDDIES_MAX = LAST
+};
+
+struct z3fold_buddy_slots {
+ /*
+ * we are using BUDDY_MASK in handle_to_buddy etc. so there should
+ * be enough slots to hold all possible variants
+ */
+ unsigned long slot[BUDDY_MASK + 1];
+ unsigned long pool; /* back link + flags */
+};
+#define HANDLE_FLAG_MASK (0x03)
+
+/*
+ * struct z3fold_header - z3fold page metadata occupying first chunks of each
+ * z3fold page, except for HEADLESS pages
+ * @buddy: links the z3fold page into the relevant list in the
+ * pool
+ * @page_lock: per-page lock
+ * @refcount: reference count for the z3fold page
+ * @work: work_struct for page layout optimization
+ * @slots: pointer to the structure holding buddy slots
+ * @pool: pointer to the containing pool
+ * @cpu: CPU which this page "belongs" to
+ * @first_chunks: the size of the first buddy in chunks, 0 if free
+ * @middle_chunks: the size of the middle buddy in chunks, 0 if free
+ * @last_chunks: the size of the last buddy in chunks, 0 if free
+ * @first_num: the starting number (for the first handle)
+ * @mapped_count: the number of objects currently mapped
+ */
+struct z3fold_header {
+ struct list_head buddy;
+ spinlock_t page_lock;
+ struct kref refcount;
+ struct work_struct work;
+ struct z3fold_buddy_slots *slots;
+ struct z3fold_pool *pool;
+ short cpu;
+ unsigned short first_chunks;
+ unsigned short middle_chunks;
+ unsigned short last_chunks;
+ unsigned short start_middle;
+ unsigned short first_num:2;
+ unsigned short mapped_count:2;
+};
/**
* struct z3fold_pool - stores metadata for each z3fold pool
@@ -113,11 +138,13 @@
* added buddy.
* @stale: list of pages marked for freeing
* @pages_nr: number of z3fold pages in the pool.
+ * @c_handle: cache for z3fold_buddy_slots allocation
* @ops: pointer to a structure of user defined operations specified at
* pool creation time.
* @compact_wq: workqueue for page layout background optimization
* @release_wq: workqueue for safe page release
* @work: work_struct for safe page release
+ * @inode: inode for z3fold pseudo filesystem
*
* This structure is allocated at pool creation time and maintains metadata
* pertaining to a particular z3fold pool.
@@ -130,12 +157,14 @@
struct list_head lru;
struct list_head stale;
atomic64_t pages_nr;
+ struct kmem_cache *c_handle;
const struct z3fold_ops *ops;
struct zpool *zpool;
const struct zpool_ops *zpool_ops;
struct workqueue_struct *compact_wq;
struct workqueue_struct *release_wq;
struct work_struct work;
+ struct inode *inode;
};
/*
@@ -164,11 +193,113 @@
static void compact_page_work(struct work_struct *w);
+static inline struct z3fold_buddy_slots *alloc_slots(struct z3fold_pool *pool,
+ gfp_t gfp)
+{
+ struct z3fold_buddy_slots *slots;
+
+ slots = kmem_cache_alloc(pool->c_handle,
+ (gfp & ~(__GFP_HIGHMEM | __GFP_MOVABLE)));
+
+ if (slots) {
+ memset(slots->slot, 0, sizeof(slots->slot));
+ slots->pool = (unsigned long)pool;
+ }
+
+ return slots;
+}
+
+static inline struct z3fold_pool *slots_to_pool(struct z3fold_buddy_slots *s)
+{
+ return (struct z3fold_pool *)(s->pool & ~HANDLE_FLAG_MASK);
+}
+
+static inline struct z3fold_buddy_slots *handle_to_slots(unsigned long handle)
+{
+ return (struct z3fold_buddy_slots *)(handle & ~(SLOTS_ALIGN - 1));
+}
+
+static inline void free_handle(unsigned long handle)
+{
+ struct z3fold_buddy_slots *slots;
+ int i;
+ bool is_free;
+
+ if (handle & (1 << PAGE_HEADLESS))
+ return;
+
+ WARN_ON(*(unsigned long *)handle == 0);
+ *(unsigned long *)handle = 0;
+ slots = handle_to_slots(handle);
+ is_free = true;
+ for (i = 0; i <= BUDDY_MASK; i++) {
+ if (slots->slot[i]) {
+ is_free = false;
+ break;
+ }
+ }
+
+ if (is_free) {
+ struct z3fold_pool *pool = slots_to_pool(slots);
+
+ kmem_cache_free(pool->c_handle, slots);
+ }
+}
+
+static int z3fold_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, Z3FOLD_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type z3fold_fs = {
+ .name = "z3fold",
+ .init_fs_context = z3fold_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static struct vfsmount *z3fold_mnt;
+static int z3fold_mount(void)
+{
+ int ret = 0;
+
+ z3fold_mnt = kern_mount(&z3fold_fs);
+ if (IS_ERR(z3fold_mnt))
+ ret = PTR_ERR(z3fold_mnt);
+
+ return ret;
+}
+
+static void z3fold_unmount(void)
+{
+ kern_unmount(z3fold_mnt);
+}
+
+static const struct address_space_operations z3fold_aops;
+static int z3fold_register_migration(struct z3fold_pool *pool)
+{
+ pool->inode = alloc_anon_inode(z3fold_mnt->mnt_sb);
+ if (IS_ERR(pool->inode)) {
+ pool->inode = NULL;
+ return 1;
+ }
+
+ pool->inode->i_mapping->private_data = pool;
+ pool->inode->i_mapping->a_ops = &z3fold_aops;
+ return 0;
+}
+
+static void z3fold_unregister_migration(struct z3fold_pool *pool)
+{
+ if (pool->inode)
+ iput(pool->inode);
+ }
+
/* Initializes the z3fold header of a newly allocated z3fold page */
-static struct z3fold_header *init_z3fold_page(struct page *page,
- struct z3fold_pool *pool)
+static struct z3fold_header *init_z3fold_page(struct page *page, bool headless,
+ struct z3fold_pool *pool, gfp_t gfp)
{
struct z3fold_header *zhdr = page_address(page);
+ struct z3fold_buddy_slots *slots;
INIT_LIST_HEAD(&page->lru);
clear_bit(PAGE_HEADLESS, &page->private);
@@ -176,6 +307,12 @@
clear_bit(NEEDS_COMPACTING, &page->private);
clear_bit(PAGE_STALE, &page->private);
clear_bit(PAGE_CLAIMED, &page->private);
+ if (headless)
+ return zhdr;
+
+ slots = alloc_slots(pool, gfp);
+ if (!slots)
+ return NULL;
spin_lock_init(&zhdr->page_lock);
kref_init(&zhdr->refcount);
@@ -185,6 +322,7 @@
zhdr->first_num = 0;
zhdr->start_middle = 0;
zhdr->cpu = -1;
+ zhdr->slots = slots;
zhdr->pool = pool;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_WORK(&zhdr->work, compact_page_work);
@@ -192,8 +330,14 @@
}
/* Resets the struct page fields and frees the page */
-static void free_z3fold_page(struct page *page)
+static void free_z3fold_page(struct page *page, bool headless)
{
+ if (!headless) {
+ lock_page(page);
+ __ClearPageMovable(page);
+ unlock_page(page);
+ }
+ ClearPagePrivate(page);
__free_page(page);
}
@@ -215,33 +359,62 @@
spin_unlock(&zhdr->page_lock);
}
+/* Helper function to build the index */
+static inline int __idx(struct z3fold_header *zhdr, enum buddy bud)
+{
+ return (bud + zhdr->first_num) & BUDDY_MASK;
+}
+
/*
* Encodes the handle of a particular buddy within a z3fold page
* Pool lock should be held as this function accesses first_num
*/
+static unsigned long __encode_handle(struct z3fold_header *zhdr,
+ struct z3fold_buddy_slots *slots,
+ enum buddy bud)
+{
+ unsigned long h = (unsigned long)zhdr;
+ int idx = 0;
+
+ /*
+ * For a headless page, its handle is its pointer with the extra
+ * PAGE_HEADLESS bit set
+ */
+ if (bud == HEADLESS)
+ return h | (1 << PAGE_HEADLESS);
+
+ /* otherwise, return pointer to encoded handle */
+ idx = __idx(zhdr, bud);
+ h += idx;
+ if (bud == LAST)
+ h |= (zhdr->last_chunks << BUDDY_SHIFT);
+
+ slots->slot[idx] = h;
+ return (unsigned long)&slots->slot[idx];
+}
+
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
- unsigned long handle;
-
- handle = (unsigned long)zhdr;
- if (bud != HEADLESS) {
- handle |= (bud + zhdr->first_num) & BUDDY_MASK;
- if (bud == LAST)
- handle |= (zhdr->last_chunks << BUDDY_SHIFT);
- }
- return handle;
+ return __encode_handle(zhdr, zhdr->slots, bud);
}
/* Returns the z3fold page where a given handle is stored */
-static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
+static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h)
{
- return (struct z3fold_header *)(handle & PAGE_MASK);
+ unsigned long addr = h;
+
+ if (!(addr & (1 << PAGE_HEADLESS)))
+ addr = *(unsigned long *)h;
+
+ return (struct z3fold_header *)(addr & PAGE_MASK);
}
/* only for LAST bud, returns zero otherwise */
static unsigned short handle_to_chunks(unsigned long handle)
{
- return (handle & ~PAGE_MASK) >> BUDDY_SHIFT;
+ unsigned long addr = *(unsigned long *)handle;
+
+ return (addr & ~PAGE_MASK) >> BUDDY_SHIFT;
}
/*
@@ -251,21 +424,31 @@
*/
static enum buddy handle_to_buddy(unsigned long handle)
{
- struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
- return (handle - zhdr->first_num) & BUDDY_MASK;
+ struct z3fold_header *zhdr;
+ unsigned long addr;
+
+ WARN_ON(handle & (1 << PAGE_HEADLESS));
+ addr = *(unsigned long *)handle;
+ zhdr = (struct z3fold_header *)(addr & PAGE_MASK);
+ return (addr - zhdr->first_num) & BUDDY_MASK;
+}
+
+static inline struct z3fold_pool *zhdr_to_pool(struct z3fold_header *zhdr)
+{
+ return zhdr->pool;
}
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
clear_bit(NEEDS_COMPACTING, &page->private);
spin_lock(&pool->lock);
if (!list_empty(&page->lru))
- list_del(&page->lru);
+ list_del_init(&page->lru);
spin_unlock(&pool->lock);
if (locked)
z3fold_page_unlock(zhdr);
@@ -295,9 +478,10 @@
{
struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
refcount);
- spin_lock(&zhdr->pool->lock);
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
+ spin_lock(&pool->lock);
list_del_init(&zhdr->buddy);
- spin_unlock(&zhdr->pool->lock);
+ spin_unlock(&pool->lock);
WARN_ON(z3fold_page_trylock(zhdr));
__release_z3fold_page(zhdr, true);
@@ -318,7 +502,7 @@
continue;
spin_unlock(&pool->stale_lock);
cancel_work_sync(&zhdr->work);
- free_z3fold_page(page);
+ free_z3fold_page(page, false);
cond_resched();
spin_lock(&pool->stale_lock);
}
@@ -349,6 +533,23 @@
return nfree;
}
+/* Add to the appropriate unbuddied list */
+static inline void add_to_unbuddied(struct z3fold_pool *pool,
+ struct z3fold_header *zhdr)
+{
+ if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
+ zhdr->middle_chunks == 0) {
+ struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
+
+ int freechunks = num_free_chunks(zhdr);
+ spin_lock(&pool->lock);
+ list_add(&zhdr->buddy, &unbuddied[freechunks]);
+ spin_unlock(&pool->lock);
+ zhdr->cpu = smp_processor_id();
+ put_cpu_ptr(pool->unbuddied);
+ }
+}
+
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
unsigned short dst_chunk)
{
@@ -367,6 +568,9 @@
if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
return 0; /* can't move middle chunk, it's used */
+ if (unlikely(PageIsolated(page)))
+ return 0;
+
if (zhdr->middle_chunks == 0)
return 0; /* nothing to compact */
@@ -406,10 +610,8 @@
static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
- struct z3fold_pool *pool = zhdr->pool;
+ struct z3fold_pool *pool = zhdr_to_pool(zhdr);
struct page *page;
- struct list_head *unbuddied;
- int fchunks;
page = virt_to_page(zhdr);
if (locked)
@@ -429,19 +631,15 @@
return;
}
- z3fold_compact_page(zhdr);
- unbuddied = get_cpu_ptr(pool->unbuddied);
- fchunks = num_free_chunks(zhdr);
- if (fchunks < NCHUNKS &&
- (!zhdr->first_chunks || !zhdr->middle_chunks ||
- !zhdr->last_chunks)) {
- /* the page's not completely free and it's unbuddied */
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[fchunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
+ if (unlikely(PageIsolated(page) ||
+ test_bit(PAGE_CLAIMED, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))) {
+ z3fold_page_unlock(zhdr);
+ return;
}
- put_cpu_ptr(pool->unbuddied);
+
+ z3fold_compact_page(zhdr);
+ add_to_unbuddied(pool, zhdr);
z3fold_page_unlock(zhdr);
}
@@ -453,6 +651,103 @@
do_compact_page(zhdr, false);
}
+/* returns _locked_ z3fold page header or NULL */
+static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
+ size_t size, bool can_sleep)
+{
+ struct z3fold_header *zhdr = NULL;
+ struct page *page;
+ struct list_head *unbuddied;
+ int chunks = size_to_chunks(size), i;
+
+lookup:
+ /* First, try to find an unbuddied z3fold page. */
+ unbuddied = get_cpu_ptr(pool->unbuddied);
+ for_each_unbuddied_list(i, chunks) {
+ struct list_head *l = &unbuddied[i];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr)
+ continue;
+
+ /* Re-check under lock. */
+ spin_lock(&pool->lock);
+ l = &unbuddied[i];
+ if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
+ struct z3fold_header, buddy)) ||
+ !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ put_cpu_ptr(pool->unbuddied);
+ if (can_sleep)
+ cond_resched();
+ goto lookup;
+ }
+
+ /*
+ * this page could not be removed from its unbuddied
+ * list while pool lock was held, and then we've taken
+ * page lock so kref_put could not be called before
+ * we got here, so it's safe to just call kref_get()
+ */
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ put_cpu_ptr(pool->unbuddied);
+
+ if (!zhdr) {
+ int cpu;
+
+ /* look for _exact_ match on other cpus' lists */
+ for_each_online_cpu(cpu) {
+ struct list_head *l;
+
+ unbuddied = per_cpu_ptr(pool->unbuddied, cpu);
+ spin_lock(&pool->lock);
+ l = &unbuddied[chunks];
+
+ zhdr = list_first_entry_or_null(READ_ONCE(l),
+ struct z3fold_header, buddy);
+
+ if (!zhdr || !z3fold_page_trylock(zhdr)) {
+ spin_unlock(&pool->lock);
+ zhdr = NULL;
+ continue;
+ }
+ list_del_init(&zhdr->buddy);
+ zhdr->cpu = -1;
+ spin_unlock(&pool->lock);
+
+ page = virt_to_page(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private)) {
+ z3fold_page_unlock(zhdr);
+ zhdr = NULL;
+ if (can_sleep)
+ cond_resched();
+ continue;
+ }
+ kref_get(&zhdr->refcount);
+ break;
+ }
+ }
+
+ return zhdr;
+}
/*
* API Functions
@@ -476,6 +771,11 @@
pool = kzalloc(sizeof(struct z3fold_pool), gfp);
if (!pool)
goto out;
+ pool->c_handle = kmem_cache_create("z3fold_handle",
+ sizeof(struct z3fold_buddy_slots),
+ SLOTS_ALIGN, 0, NULL);
+ if (!pool->c_handle)
+ goto out_c;
spin_lock_init(&pool->lock);
spin_lock_init(&pool->stale_lock);
pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
@@ -497,15 +797,21 @@
pool->release_wq = create_singlethread_workqueue(pool->name);
if (!pool->release_wq)
goto out_wq;
+ if (z3fold_register_migration(pool))
+ goto out_rwq;
INIT_WORK(&pool->work, free_pages_work);
pool->ops = ops;
return pool;
+out_rwq:
+ destroy_workqueue(pool->release_wq);
out_wq:
destroy_workqueue(pool->compact_wq);
out_unbuddied:
free_percpu(pool->unbuddied);
out_pool:
+ kmem_cache_destroy(pool->c_handle);
+out_c:
kfree(pool);
out:
return NULL;
@@ -519,8 +825,20 @@
*/
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
- destroy_workqueue(pool->release_wq);
+ kmem_cache_destroy(pool->c_handle);
+
+ /*
+ * We need to destroy pool->compact_wq before pool->release_wq,
+ * as any pending work on pool->compact_wq will call
+ * queue_work(pool->release_wq, &pool->work).
+ *
+ * There are still outstanding pages until both workqueues are drained,
+ * so we cannot unregister migration until then.
+ */
+
destroy_workqueue(pool->compact_wq);
+ destroy_workqueue(pool->release_wq);
+ z3fold_unregister_migration(pool);
kfree(pool);
}
@@ -546,13 +864,13 @@
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
unsigned long *handle)
{
- int chunks = 0, i, freechunks;
+ int chunks = size_to_chunks(size);
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
enum buddy bud;
bool can_sleep = gfpflags_allow_blocking(gfp);
- if (!size || (gfp & __GFP_HIGHMEM))
+ if (!size)
return -EINVAL;
if (size > PAGE_SIZE)
@@ -561,56 +879,8 @@
if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
bud = HEADLESS;
else {
- struct list_head *unbuddied;
- chunks = size_to_chunks(size);
-
-lookup:
- /* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
- for_each_unbuddied_list(i, chunks) {
- struct list_head *l = &unbuddied[i];
-
- zhdr = list_first_entry_or_null(READ_ONCE(l),
- struct z3fold_header, buddy);
-
- if (!zhdr)
- continue;
-
- /* Re-check under lock. */
- spin_lock(&pool->lock);
- l = &unbuddied[i];
- if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
- struct z3fold_header, buddy)) ||
- !z3fold_page_trylock(zhdr)) {
- spin_unlock(&pool->lock);
- put_cpu_ptr(pool->unbuddied);
- goto lookup;
- }
- list_del_init(&zhdr->buddy);
- zhdr->cpu = -1;
- spin_unlock(&pool->lock);
-
- page = virt_to_page(zhdr);
- if (test_bit(NEEDS_COMPACTING, &page->private)) {
- z3fold_page_unlock(zhdr);
- zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
- if (can_sleep)
- cond_resched();
- goto lookup;
- }
-
- /*
- * this page could not be removed from its unbuddied
- * list while pool lock was held, and then we've taken
- * page lock so kref_put could not be called before
- * we got here, so it's safe to just call kref_get()
- */
- kref_get(&zhdr->refcount);
- break;
- }
- put_cpu_ptr(pool->unbuddied);
-
+retry:
+ zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
if (zhdr->first_chunks == 0) {
if (zhdr->middle_chunks != 0 &&
@@ -630,8 +900,9 @@
z3fold_page_unlock(zhdr);
pr_err("No free chunks in unbuddied\n");
WARN_ON(1);
- goto lookup;
+ goto retry;
}
+ page = virt_to_page(zhdr);
goto found;
}
bud = FIRST;
@@ -662,13 +933,27 @@
if (!page)
return -ENOMEM;
+ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp);
+ if (!zhdr) {
+ __free_page(page);
+ return -ENOMEM;
+ }
atomic64_inc(&pool->pages_nr);
- zhdr = init_z3fold_page(page, pool);
if (bud == HEADLESS) {
set_bit(PAGE_HEADLESS, &page->private);
goto headless;
}
+ if (can_sleep) {
+ lock_page(page);
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ } else {
+ if (trylock_page(page)) {
+ __SetPageMovable(page, pool->inode->i_mapping);
+ unlock_page(page);
+ }
+ }
z3fold_page_lock(zhdr);
found:
@@ -680,19 +965,7 @@
zhdr->middle_chunks = chunks;
zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
}
-
- if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
- zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
- /* Add to unbuddied list */
- freechunks = num_free_chunks(zhdr);
- spin_lock(&pool->lock);
- list_add(&zhdr->buddy, &unbuddied[freechunks]);
- spin_unlock(&pool->lock);
- zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
- }
+ add_to_unbuddied(pool, zhdr);
headless:
spin_lock(&pool->lock);
@@ -725,9 +998,11 @@
struct z3fold_header *zhdr;
struct page *page;
enum buddy bud;
+ bool page_claimed;
zhdr = handle_to_z3fold_header(handle);
page = virt_to_page(zhdr);
+ page_claimed = test_and_set_bit(PAGE_CLAIMED, &page->private);
if (test_bit(PAGE_HEADLESS, &page->private)) {
/* if a headless page is under reclaim, just leave.
@@ -735,11 +1010,11 @@
* has not been set before, we release this page
* immediately so we don't care about its value any more.
*/
- if (!test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (!page_claimed) {
spin_lock(&pool->lock);
list_del(&page->lru);
spin_unlock(&pool->lock);
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
}
return;
@@ -766,16 +1041,20 @@
return;
}
+ free_handle(handle);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
}
- if (test_bit(PAGE_CLAIMED, &page->private)) {
+ if (page_claimed) {
+ /* the page has not been claimed by us */
z3fold_page_unlock(zhdr);
return;
}
- if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (unlikely(PageIsolated(page)) ||
+ test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
@@ -785,10 +1064,12 @@
zhdr->cpu = -1;
kref_get(&zhdr->refcount);
do_compact_page(zhdr, true);
+ clear_bit(PAGE_CLAIMED, &page->private);
return;
}
kref_get(&zhdr->refcount);
queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -834,6 +1115,7 @@
struct z3fold_header *zhdr = NULL;
struct page *page = NULL;
struct list_head *pos;
+ struct z3fold_buddy_slots slots;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
spin_lock(&pool->lock);
@@ -852,14 +1134,22 @@
/* this bit could have been set by free, in which case
* we pass over to the next page in the pool.
*/
- if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ page = NULL;
continue;
+ }
+ if (unlikely(PageIsolated(page))) {
+ clear_bit(PAGE_CLAIMED, &page->private);
+ page = NULL;
+ continue;
+ }
zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
break;
if (!z3fold_page_trylock(zhdr)) {
+ clear_bit(PAGE_CLAIMED, &page->private);
zhdr = NULL;
continue; /* can't evict at this point */
}
@@ -877,26 +1167,30 @@
if (!test_bit(PAGE_HEADLESS, &page->private)) {
/*
- * We need encode the handles before unlocking, since
- * we can race with free that will set
- * (first|last)_chunks to 0
+ * We need to encode the handles before unlocking, and
+ * use our local slots structure because z3fold_free
+ * can zero out zhdr->slots and we can't do much
+ * about that
*/
first_handle = 0;
last_handle = 0;
middle_handle = 0;
if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
*/
z3fold_page_unlock(zhdr);
} else {
- first_handle = encode_handle(zhdr, HEADLESS);
+ first_handle = __encode_handle(zhdr, &slots, HEADLESS);
last_handle = middle_handle = 0;
}
@@ -919,16 +1213,16 @@
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
if (ret == 0) {
- free_z3fold_page(page);
+ free_z3fold_page(page, true);
atomic64_dec(&pool->pages_nr);
return 0;
}
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
} else {
z3fold_page_lock(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
atomic64_dec(&pool->pages_nr);
@@ -943,6 +1237,7 @@
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
}
/* We started off locked to we need to lock the pool back */
@@ -996,6 +1291,8 @@
break;
}
+ if (addr)
+ zhdr->mapped_count++;
z3fold_page_unlock(zhdr);
out:
return addr;
@@ -1022,6 +1319,7 @@
buddy = handle_to_buddy(handle);
if (buddy == MIDDLE)
clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
+ zhdr->mapped_count--;
z3fold_page_unlock(zhdr);
}
@@ -1036,6 +1334,134 @@
return atomic64_read(&pool->pages_nr);
}
+static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(PageIsolated(page), page);
+
+ if (test_bit(PAGE_HEADLESS, &page->private) ||
+ test_bit(PAGE_CLAIMED, &page->private))
+ return false;
+
+ zhdr = page_address(page);
+ z3fold_page_lock(zhdr);
+ if (test_bit(NEEDS_COMPACTING, &page->private) ||
+ test_bit(PAGE_STALE, &page->private))
+ goto out;
+
+ pool = zhdr_to_pool(zhdr);
+
+ if (zhdr->mapped_count == 0) {
+ kref_get(&zhdr->refcount);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ spin_lock(&pool->lock);
+ if (!list_empty(&page->lru))
+ list_del(&page->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+ return true;
+ }
+out:
+ z3fold_page_unlock(zhdr);
+ return false;
+}
+
+static int z3fold_page_migrate(struct address_space *mapping, struct page *newpage,
+ struct page *page, enum migrate_mode mode)
+{
+ struct z3fold_header *zhdr, *new_zhdr;
+ struct z3fold_pool *pool;
+ struct address_space *new_mapping;
+
+ VM_BUG_ON_PAGE(!PageMovable(page), page);
+ VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ if (!z3fold_page_trylock(zhdr)) {
+ return -EAGAIN;
+ }
+ if (zhdr->mapped_count != 0) {
+ z3fold_page_unlock(zhdr);
+ return -EBUSY;
+ }
+ if (work_pending(&zhdr->work)) {
+ z3fold_page_unlock(zhdr);
+ return -EAGAIN;
+ }
+ new_zhdr = page_address(newpage);
+ memcpy(new_zhdr, zhdr, PAGE_SIZE);
+ newpage->private = page->private;
+ page->private = 0;
+ z3fold_page_unlock(zhdr);
+ spin_lock_init(&new_zhdr->page_lock);
+ INIT_WORK(&new_zhdr->work, compact_page_work);
+ /*
+ * z3fold_page_isolate() ensures that new_zhdr->buddy is empty,
+ * so we only have to reinitialize it.
+ */
+ INIT_LIST_HEAD(&new_zhdr->buddy);
+ new_mapping = page_mapping(page);
+ __ClearPageMovable(page);
+ ClearPagePrivate(page);
+
+ get_page(newpage);
+ z3fold_page_lock(new_zhdr);
+ if (new_zhdr->first_chunks)
+ encode_handle(new_zhdr, FIRST);
+ if (new_zhdr->last_chunks)
+ encode_handle(new_zhdr, LAST);
+ if (new_zhdr->middle_chunks)
+ encode_handle(new_zhdr, MIDDLE);
+ set_bit(NEEDS_COMPACTING, &newpage->private);
+ new_zhdr->cpu = smp_processor_id();
+ spin_lock(&pool->lock);
+ list_add(&newpage->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ __SetPageMovable(newpage, new_mapping);
+ z3fold_page_unlock(new_zhdr);
+
+ queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
+
+ page_mapcount_reset(page);
+ put_page(page);
+ return 0;
+}
+
+static void z3fold_page_putback(struct page *page)
+{
+ struct z3fold_header *zhdr;
+ struct z3fold_pool *pool;
+
+ zhdr = page_address(page);
+ pool = zhdr_to_pool(zhdr);
+
+ z3fold_page_lock(zhdr);
+ if (!list_empty(&zhdr->buddy))
+ list_del_init(&zhdr->buddy);
+ INIT_LIST_HEAD(&page->lru);
+ if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
+ atomic64_dec(&pool->pages_nr);
+ return;
+ }
+ spin_lock(&pool->lock);
+ list_add(&page->lru, &pool->lru);
+ spin_unlock(&pool->lock);
+ z3fold_page_unlock(zhdr);
+}
+
+static const struct address_space_operations z3fold_aops = {
+ .isolate_page = z3fold_page_isolate,
+ .migratepage = z3fold_page_migrate,
+ .putback_page = z3fold_page_putback,
+};
+
/*****************
* zpool
****************/
@@ -1133,8 +1559,14 @@
static int __init init_z3fold(void)
{
+ int ret;
+
/* Make sure the z3fold header is not larger than the page size */
BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
+ ret = z3fold_mount();
+ if (ret)
+ return ret;
+
zpool_register_driver(&z3fold_zpool_driver);
return 0;
@@ -1142,6 +1574,7 @@
static void __exit exit_z3fold(void)
{
+ z3fold_unmount();
zpool_unregister_driver(&z3fold_zpool_driver);
}
diff --git a/mm/zbud.c b/mm/zbud.c
index 28458f7..de5dd4d 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* zbud.c
*
diff --git a/mm/zpool.c b/mm/zpool.c
index 01a771e..8636692 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* zpool memory storage api
*
@@ -238,6 +239,22 @@
}
/**
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
+ * @zpool: The zpool to check
+ *
+ * This returns whether the zpool supports allocating movable memory.
+ *
+ * Implementations must guarantee this to be thread-safe.
+ *
+ * Returns: true if the zpool supports allocating movable memory, false if not
+ */
+bool zpool_malloc_support_movable(struct zpool *zpool)
+{
+ return zpool->driver->malloc_support_movable;
+}
+
+/**
* zpool_malloc() - Allocate memory
* @zpool: The zpool to allocate from.
* @size: The amount of memory to allocate.
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 9da6555..2b2b9aa 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -52,7 +52,9 @@
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
#include <linux/migrate.h>
+#include <linux/wait.h>
#include <linux/pagemap.h>
#include <linux/fs.h>
@@ -267,6 +269,10 @@
#ifdef CONFIG_COMPACTION
struct inode *inode;
struct work_struct free_work;
+ /* A wait queue for when migration races with async_free_zspage() */
+ struct wait_queue_head migration_wait;
+ atomic_long_t isolated_pages;
+ bool destroying;
#endif
};
@@ -418,7 +424,7 @@
case ZPOOL_MM_WO:
zs_mm = ZS_MM_WO;
break;
- case ZPOOL_MM_RW: /* fallthru */
+ case ZPOOL_MM_RW: /* fall through */
default:
zs_mm = ZS_MM_RW;
break;
@@ -437,15 +443,16 @@
}
static struct zpool_driver zs_zpool_driver = {
- .type = "zsmalloc",
- .owner = THIS_MODULE,
- .create = zs_zpool_create,
- .destroy = zs_zpool_destroy,
- .malloc = zs_zpool_malloc,
- .free = zs_zpool_free,
- .map = zs_zpool_map,
- .unmap = zs_zpool_unmap,
- .total_size = zs_zpool_total_size,
+ .type = "zsmalloc",
+ .owner = THIS_MODULE,
+ .create = zs_zpool_create,
+ .destroy = zs_zpool_destroy,
+ .malloc_support_movable = true,
+ .malloc = zs_zpool_malloc,
+ .free = zs_zpool_free,
+ .map = zs_zpool_map,
+ .unmap = zs_zpool_unmap,
+ .total_size = zs_zpool_total_size,
};
MODULE_ALIAS("zpool-zsmalloc");
@@ -470,10 +477,6 @@
return zspage->inuse;
}
-static inline void set_zspage_inuse(struct zspage *zspage, int val)
-{
- zspage->inuse = val;
-}
static inline void mod_zspage_inuse(struct zspage *zspage, int val)
{
@@ -575,8 +578,6 @@
}
zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
- if (!zs_stat_root)
- pr_warn("debugfs 'zsmalloc' stat dir creation failed\n");
}
static void __exit zs_stat_exit(void)
@@ -647,29 +648,15 @@
static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
{
- struct dentry *entry;
-
if (!zs_stat_root) {
pr_warn("no root stat dir, not creating <%s> stat dir\n", name);
return;
}
- entry = debugfs_create_dir(name, zs_stat_root);
- if (!entry) {
- pr_warn("debugfs dir <%s> creation failed\n", name);
- return;
- }
- pool->stat_dentry = entry;
+ pool->stat_dentry = debugfs_create_dir(name, zs_stat_root);
- entry = debugfs_create_file("classes", S_IFREG | 0444,
- pool->stat_dentry, pool,
- &zs_stats_size_fops);
- if (!entry) {
- pr_warn("%s: debugfs file entry <%s> creation failed\n",
- name, "classes");
- debugfs_remove_recursive(pool->stat_dentry);
- pool->stat_dentry = NULL;
- }
+ debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
+ &zs_stats_size_fops);
}
static void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -1814,19 +1801,14 @@
} while ((page = get_next_page(page)) != NULL);
}
-static struct dentry *zs_mount(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data)
+static int zs_init_fs_context(struct fs_context *fc)
{
- static const struct dentry_operations ops = {
- .d_dname = simple_dname,
- };
-
- return mount_pseudo(fs_type, "zsmalloc:", NULL, &ops, ZSMALLOC_MAGIC);
+ return init_pseudo(fc, ZSMALLOC_MAGIC) ? 0 : -ENOMEM;
}
static struct file_system_type zsmalloc_fs = {
.name = "zsmalloc",
- .mount = zs_mount,
+ .init_fs_context = zs_init_fs_context,
.kill_sb = kill_anon_super,
};
@@ -1882,6 +1864,31 @@
zspage->isolated--;
}
+static void putback_zspage_deferred(struct zs_pool *pool,
+ struct size_class *class,
+ struct zspage *zspage)
+{
+ enum fullness_group fg;
+
+ fg = putback_zspage(class, zspage);
+ if (fg == ZS_EMPTY)
+ schedule_work(&pool->free_work);
+
+}
+
+static inline void zs_pool_dec_isolated(struct zs_pool *pool)
+{
+ VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0);
+ atomic_long_dec(&pool->isolated_pages);
+ /*
+ * There's no possibility of racing, since wait_for_isolated_drain()
+ * checks the isolated count under &class->lock after enqueuing
+ * on migration_wait.
+ */
+ if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying)
+ wake_up_all(&pool->migration_wait);
+}
+
static void replace_sub_page(struct size_class *class, struct zspage *zspage,
struct page *newpage, struct page *oldpage)
{
@@ -1951,6 +1958,7 @@
*/
if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) {
get_zspage_mapping(zspage, &class_idx, &fullness);
+ atomic_long_inc(&pool->isolated_pages);
remove_zspage(class, zspage, fullness);
}
@@ -2050,8 +2058,16 @@
* Page migration is done so let's putback isolated zspage to
* the list if @page is final isolated subpage in the zspage.
*/
- if (!is_zspage_isolated(zspage))
- putback_zspage(class, zspage);
+ if (!is_zspage_isolated(zspage)) {
+ /*
+ * We cannot race with zs_destroy_pool() here because we wait
+ * for isolation to hit zero before we start destroying.
+ * Also, we ensure that everyone can see pool->destroying before
+ * we start waiting.
+ */
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
+ }
reset_page(page);
put_page(page);
@@ -2097,13 +2113,12 @@
spin_lock(&class->lock);
dec_zspage_isolation(zspage);
if (!is_zspage_isolated(zspage)) {
- fg = putback_zspage(class, zspage);
/*
* Due to page_lock, we cannot free zspage immediately
* so let's defer.
*/
- if (fg == ZS_EMPTY)
- schedule_work(&pool->free_work);
+ putback_zspage_deferred(pool, class, zspage);
+ zs_pool_dec_isolated(pool);
}
spin_unlock(&class->lock);
}
@@ -2127,8 +2142,36 @@
return 0;
}
+static bool pool_isolated_are_drained(struct zs_pool *pool)
+{
+ return atomic_long_read(&pool->isolated_pages) == 0;
+}
+
+/* Function for resolving migration */
+static void wait_for_isolated_drain(struct zs_pool *pool)
+{
+
+ /*
+ * We're in the process of destroying the pool, so there are no
+ * active allocations. zs_page_isolate() fails for completely free
+ * zspages, so we need only wait for the zs_pool's isolated
+ * count to hit zero.
+ */
+ wait_event(pool->migration_wait,
+ pool_isolated_are_drained(pool));
+}
+
static void zs_unregister_migration(struct zs_pool *pool)
{
+ pool->destroying = true;
+ /*
+ * We need a memory barrier here to ensure global visibility of
+ * pool->destroying. Thus pool->isolated_pages will either be 0 in which
+ * case we don't care, or it will be > 0 and pool->destroying will
+ * ensure that we wake up once isolation hits 0.
+ */
+ smp_mb();
+ wait_for_isolated_drain(pool); /* This can block */
flush_work(&pool->free_work);
iput(pool->inode);
}
@@ -2366,6 +2409,10 @@
if (!pool->name)
goto err;
+#ifdef CONFIG_COMPACTION
+ init_waitqueue_head(&pool->migration_wait);
+#endif
+
if (create_cache(pool))
goto err;
diff --git a/mm/zswap.c b/mm/zswap.c
index cd91fd9..46a3223 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
/*
* zswap.c - zswap driver file
*
@@ -8,16 +9,6 @@
* than reading from the swap device, can also improve workload performance.
*
* Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version 2
- * of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
@@ -219,8 +210,8 @@
static bool zswap_is_full(void)
{
- return totalram_pages * zswap_max_pool_percent / 100 <
- DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
+ return totalram_pages() * zswap_max_pool_percent / 100 <
+ DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
static void zswap_update_total_size(void)
@@ -865,7 +856,6 @@
/* extract swpentry from data */
zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
swpentry = zhdr->swpentry; /* here */
- zpool_unmap_handle(pool, handle);
tree = zswap_trees[swp_type(swpentry)];
offset = swp_offset(swpentry);
@@ -875,6 +865,7 @@
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
+ zpool_unmap_handle(pool, handle);
return 0;
}
spin_unlock(&tree->lock);
@@ -895,15 +886,13 @@
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
dlen = PAGE_SIZE;
- src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
- ZPOOL_MM_RO) + sizeof(struct zswap_header);
+ src = (u8 *)zhdr + sizeof(struct zswap_header);
dst = kmap_atomic(page);
tfm = *get_cpu_ptr(entry->pool->tfm);
ret = crypto_comp_decompress(tfm, src, entry->length,
dst, &dlen);
put_cpu_ptr(entry->pool->tfm);
kunmap_atomic(dst);
- zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -949,6 +938,7 @@
spin_unlock(&tree->lock);
end:
+ zpool_unmap_handle(pool, handle);
return ret;
}
@@ -1006,6 +996,7 @@
char *buf;
u8 *src, *dst;
struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) };
+ gfp_t gfp;
/* THP isn't supported */
if (PageTransHuge(page)) {
@@ -1079,9 +1070,10 @@
/* store */
hlen = zpool_evictable(entry->pool->zpool) ? sizeof(zhdr) : 0;
- ret = zpool_malloc(entry->pool->zpool, hlen + dlen,
- __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
- &handle);
+ gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
+ if (zpool_malloc_support_movable(entry->pool->zpool))
+ gfp |= __GFP_HIGHMEM | __GFP_MOVABLE;
+ ret = zpool_malloc(entry->pool->zpool, hlen + dlen, gfp, &handle);
if (ret == -ENOSPC) {
zswap_reject_compress_poor++;
goto put_dstmem;
@@ -1262,8 +1254,6 @@
return -ENODEV;
zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
- if (!zswap_debugfs_root)
- return -ENOMEM;
debugfs_create_u64("pool_limit_hit", 0444,
zswap_debugfs_root, &zswap_pool_limit_hit);