Update Linux to v5.4.148
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.4.148.tar.gz
Change-Id: Ib3d26c5ba9b022e2e03533005c4fed4d7c30b61b
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/Kconfig b/mm/Kconfig
index a5dae9a..fbdc5c7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -576,19 +576,6 @@
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
-config PGTABLE_MAPPING
- bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC
- help
- By default, zsmalloc uses a copy-based object mapping method to
- access allocations that span two pages. However, if a particular
- architecture (ex, ARM) performs VM mapping faster than copying,
- then you should select this. This causes zsmalloc to use page table
- mapping rather than copying for object mapping.
-
- You can check speed with zsmalloc benchmark:
- https://github.com/spartacus06/zsmapbench
-
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
depends on ZSMALLOC
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index c360f6a..3f2480e 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -21,6 +21,7 @@
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
+static const char *bdi_unknown_name = "(unknown)";
/*
* bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU
@@ -937,7 +938,8 @@
if (bdi->dev) /* The driver needs to use separate queues per device */
return 0;
- dev = device_create_vargs(bdi_class, NULL, MKDEV(0, 0), bdi, fmt, args);
+ vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args);
+ dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name);
if (IS_ERR(dev))
return PTR_ERR(dev);
@@ -1042,6 +1044,14 @@
}
EXPORT_SYMBOL(bdi_put);
+const char *bdi_dev_name(struct backing_dev_info *bdi)
+{
+ if (!bdi || !bdi->dev)
+ return bdi_unknown_name;
+ return bdi->dev_name;
+}
+EXPORT_SYMBOL_GPL(bdi_dev_name);
+
static wait_queue_head_t congestion_wqh[2] = {
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]),
__WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1])
diff --git a/mm/cma.c b/mm/cma.c
index 7fe0b83..7de520c 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -93,19 +93,15 @@
mutex_unlock(&cma->lock);
}
-static int __init cma_activate_area(struct cma *cma)
+static void __init cma_activate_area(struct cma *cma)
{
- int bitmap_size = BITS_TO_LONGS(cma_bitmap_maxno(cma)) * sizeof(long);
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
- cma->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
-
- if (!cma->bitmap) {
- cma->count = 0;
- return -ENOMEM;
- }
+ cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
+ if (!cma->bitmap)
+ goto out_error;
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -135,25 +131,22 @@
spin_lock_init(&cma->mem_head_lock);
#endif
- return 0;
+ return;
not_in_zone:
- pr_err("CMA area %s could not be activated\n", cma->name);
- kfree(cma->bitmap);
+ bitmap_free(cma->bitmap);
+out_error:
cma->count = 0;
- return -EINVAL;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
}
static int __init cma_init_reserved_areas(void)
{
int i;
- for (i = 0; i < cma_area_count; i++) {
- int ret = cma_activate_area(&cma_areas[i]);
-
- if (ret)
- return ret;
- }
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
return 0;
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 672d3c7..d686887 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1276,7 +1276,7 @@
{
unsigned int limit = min(1U, freelist_scan_limit(cc) >> 1);
unsigned int nr_scanned = 0;
- unsigned long low_pfn, min_pfn, high_pfn = 0, highest = 0;
+ unsigned long low_pfn, min_pfn, highest = 0;
unsigned long nr_isolated = 0;
unsigned long distance;
struct page *page = NULL;
@@ -1321,6 +1321,7 @@
struct page *freepage;
unsigned long flags;
unsigned int order_scanned = 0;
+ unsigned long high_pfn = 0;
if (!area->nr_free)
continue;
@@ -1629,6 +1630,7 @@
unsigned long pfn = cc->migrate_pfn;
unsigned long high_pfn;
int order;
+ bool found_block = false;
/* Skip hints are relied on to avoid repeats on the fast search */
if (cc->ignore_skip_hint)
@@ -1671,7 +1673,7 @@
high_pfn = pageblock_start_pfn(cc->migrate_pfn + distance);
for (order = cc->order - 1;
- order >= PAGE_ALLOC_COSTLY_ORDER && pfn == cc->migrate_pfn && nr_scanned < limit;
+ order >= PAGE_ALLOC_COSTLY_ORDER && !found_block && nr_scanned < limit;
order--) {
struct free_area *area = &cc->zone->free_area[order];
struct list_head *freelist;
@@ -1686,7 +1688,11 @@
list_for_each_entry(freepage, freelist, lru) {
unsigned long free_pfn;
- nr_scanned++;
+ if (nr_scanned++ >= limit) {
+ move_freelist_tail(freelist, freepage);
+ break;
+ }
+
free_pfn = page_to_pfn(freepage);
if (free_pfn < high_pfn) {
/*
@@ -1695,12 +1701,8 @@
* the list assumes an entry is deleted, not
* reordered.
*/
- if (get_pageblock_skip(freepage)) {
- if (list_is_last(freelist, &freepage->lru))
- break;
-
+ if (get_pageblock_skip(freepage))
continue;
- }
/* Reorder to so a future search skips recent pages */
move_freelist_tail(freelist, freepage);
@@ -1708,15 +1710,10 @@
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
cc->fast_search_fail = 0;
+ found_block = true;
set_pageblock_skip(freepage);
break;
}
-
- if (nr_scanned >= limit) {
- cc->fast_search_fail++;
- move_freelist_tail(freelist, freepage);
- break;
- }
}
spin_unlock_irqrestore(&cc->zone->lock, flags);
}
@@ -1727,9 +1724,10 @@
* If fast scanning failed then use a cached entry for a page block
* that had free pages as the basis for starting a linear scan.
*/
- if (pfn == cc->migrate_pfn)
+ if (!found_block) {
+ cc->fast_search_fail++;
pfn = reinit_migrate_pfn(cc);
-
+ }
return pfn;
}
@@ -2310,16 +2308,26 @@
.page = NULL,
};
- if (capture)
- current->capture_control = &capc;
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *capture = capc.page;
- current->capture_control = NULL;
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
return ret;
}
@@ -2333,6 +2341,7 @@
* @alloc_flags: The allocation flags of the current allocation
* @ac: The context of current allocation
* @prio: Determines how hard direct compaction should try to succeed
+ * @capture: Pointer to free page created by compaction will be stored here
*
* This is the main entry point for direct page compaction.
*/
diff --git a/mm/debug.c b/mm/debug.c
index 0461df1..6a52316 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -47,6 +47,7 @@
struct address_space *mapping;
bool page_poisoned = PagePoisoned(page);
int mapcount;
+ char *type = "";
/*
* If struct page is poisoned don't access Page*() functions as that
@@ -78,9 +79,9 @@
page, page_ref_count(page), mapcount,
page->mapping, page_to_pgoff(page));
if (PageKsm(page))
- pr_warn("ksm flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "ksm ";
else if (PageAnon(page))
- pr_warn("anon flags: %#lx(%pGp)\n", page->flags, &page->flags);
+ type = "anon ";
else if (mapping) {
if (mapping->host && mapping->host->i_dentry.first) {
struct dentry *dentry;
@@ -88,10 +89,11 @@
pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
} else
pr_warn("%ps\n", mapping->a_ops);
- pr_warn("flags: %#lx(%pGp)\n", page->flags, &page->flags);
}
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
+ pr_warn("%sflags: %#lx(%pGp)\n", type, page->flags, &page->flags);
+
hex_only:
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
diff --git a/mm/filemap.c b/mm/filemap.c
index 85b7d08..c10e237 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -847,16 +847,15 @@
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
-static int __add_to_page_cache_locked(struct page *page,
- struct address_space *mapping,
- pgoff_t offset, gfp_t gfp_mask,
- void **shadowp)
+noinline int __add_to_page_cache_locked(struct page *page,
+ struct address_space *mapping,
+ pgoff_t offset, gfp_t gfp_mask,
+ void **shadowp)
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
struct mem_cgroup *memcg;
int error;
- void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
@@ -872,21 +871,41 @@
get_page(page);
page->mapping = mapping;
page->index = offset;
+ gfp_mask &= GFP_RECLAIM_MASK;
do {
+ unsigned int order = xa_get_order(xas.xa, xas.xa_index);
+ void *entry, *old = NULL;
+
+ if (order > thp_order(page))
+ xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index),
+ order, gfp_mask);
xas_lock_irq(&xas);
- old = xas_load(&xas);
- if (old && !xa_is_value(old))
- xas_set_err(&xas, -EEXIST);
+ xas_for_each_conflict(&xas, entry) {
+ old = entry;
+ if (!xa_is_value(entry)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ }
+
+ if (old) {
+ if (shadowp)
+ *shadowp = old;
+ /* entry may have been split before we acquired lock */
+ order = xa_get_order(xas.xa, xas.xa_index);
+ if (order > thp_order(page)) {
+ xas_split(&xas, old, order);
+ xas_reset(&xas);
+ }
+ }
+
xas_store(&xas, page);
if (xas_error(&xas))
goto unlock;
- if (xa_is_value(old)) {
+ if (old)
mapping->nrexceptional--;
- if (shadowp)
- *shadowp = old;
- }
mapping->nrpages++;
/* hugetlb pages do not participate in page cache accounting */
@@ -894,7 +913,7 @@
__inc_node_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
- } while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
+ } while (xas_nomem(&xas, gfp_mask));
if (xas_error(&xas))
goto error;
@@ -2329,27 +2348,6 @@
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
-static struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
- struct file *fpin)
-{
- int flags = vmf->flags;
-
- if (fpin)
- return fpin;
-
- /*
- * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
- * FAULT_FLAG_ALLOW_RETRY is set.
- */
- if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
- FAULT_FLAG_ALLOW_RETRY) {
- fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
- }
- return fpin;
-}
-
/*
* lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
* @vmf - the vm_fault for this fault.
@@ -2459,7 +2457,7 @@
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
- if (vmf->vma->vm_flags & VM_RAND_READ)
+ if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
@@ -2866,6 +2864,14 @@
unlock_page(page);
goto out;
}
+
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures.
+ * Clear page error before actual read, PG_error will be
+ * set again if read page fails.
+ */
+ ClearPageError(page);
goto filler;
out:
diff --git a/mm/gup.c b/mm/gup.c
index 8f236a3..3ef7695 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -161,13 +161,22 @@
}
/*
- * FOLL_FORCE can write to even unwritable pte's, but only
- * after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE or a forced COW break can write even to unwritable pte's,
+ * but only after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
{
- return pte_write(pte) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+ return pte_write(pte) || ((flags & FOLL_COW) && pte_dirty(pte));
+}
+
+/*
+ * A (separate) COW fault might break the page the other way and
+ * get_user_pages() would return the page from what is now the wrong
+ * VM. So we need to force a COW break at GUP time even for reads.
+ */
+static inline bool should_force_cow_break(struct vm_area_struct *vma, unsigned int flags)
+{
+ return is_cow_mapping(vma->vm_flags) && (flags & FOLL_GET);
}
static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -823,12 +832,18 @@
goto out;
}
if (is_vm_hugetlb_page(vma)) {
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
- gup_flags, nonblocking);
+ foll_flags, nonblocking);
continue;
}
}
+
+ if (should_force_cow_break(vma, foll_flags))
+ foll_flags |= FOLL_WRITE;
+
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
@@ -2169,13 +2184,13 @@
return 1;
}
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
- pmdp = pmd_offset(&pud, addr);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);
@@ -2212,13 +2227,13 @@
return 1;
}
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
- pudp = pud_offset(&p4d, addr);
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);
@@ -2233,20 +2248,20 @@
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
- p4dp = p4d_offset(&pgd, addr);
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
p4d_t p4d = READ_ONCE(*p4dp);
@@ -2258,7 +2273,7 @@
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -2286,7 +2301,7 @@
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2316,6 +2331,10 @@
*
* If the architecture does not support this function, simply return with no
* pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write
+ * access can get ambiguous page results. If you call this function without
+ * 'write' set, you'd better be sure that you're ok with that ambiguity.
*/
int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
struct page **pages)
@@ -2343,6 +2362,12 @@
*
* We do not adopt an rcu_read_lock(.) here as we also want to
* block IPIs that come from THPs splitting.
+ *
+ * NOTE! We allow read-only gup_fast() here, but you'd better be
+ * careful about possible COW pages. You'll get _a_ COW page, but
+ * not necessarily the one you intended to get depending on what
+ * COW event happens after this. COW may break the page copy in a
+ * random direction.
*/
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
@@ -2401,7 +2426,8 @@
unsigned long addr, len, end;
int nr = 0, ret = 0;
- if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM)))
+ if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
+ FOLL_FORCE)))
return -EINVAL;
start = untagged_addr(start) & PAGE_MASK;
@@ -2414,10 +2440,17 @@
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
+ /*
+ * The FAST_GUP case requires FOLL_WRITE even for pure reads,
+ * because get_user_pages() may need to cause an early COW in
+ * order to avoid confusing the normal COW routines. So only
+ * targets that are already writable are safe to do by just
+ * looking at the page tables.
+ */
if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
gup_fast_permitted(start, end)) {
local_irq_disable();
- gup_pgd_range(addr, end, gup_flags, pages, &nr);
+ gup_pgd_range(addr, end, gup_flags | FOLL_WRITE, pages, &nr);
local_irq_enable();
ret = nr;
}
diff --git a/mm/gup_benchmark.c b/mm/gup_benchmark.c
index 7dd602d..ad9d5b1 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_benchmark.c
@@ -26,6 +26,7 @@
unsigned long i, nr_pages, addr, next;
int nr;
struct page **pages;
+ int ret = 0;
if (gup->size > ULONG_MAX)
return -EINVAL;
@@ -63,7 +64,9 @@
NULL);
break;
default:
- return -1;
+ kvfree(pages);
+ ret = -EINVAL;
+ goto out;
}
if (nr <= 0)
@@ -85,7 +88,8 @@
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
kvfree(pages);
- return 0;
+out:
+ return ret;
}
static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 13cc937..e50799d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -61,6 +61,7 @@
static atomic_t huge_zero_refcount;
struct page *huge_zero_page __read_mostly;
+unsigned long huge_zero_pfn __read_mostly = ~0UL;
bool transparent_hugepage_enabled(struct vm_area_struct *vma)
{
@@ -97,6 +98,7 @@
__free_pages(zero_page, compound_order(zero_page));
goto retry;
}
+ WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page));
/* We take additional reference here. It will be put back by shrinker */
atomic_set(&huge_zero_refcount, 2);
@@ -146,6 +148,7 @@
if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
struct page *zero_page = xchg(&huge_zero_page, NULL);
BUG_ON(zero_page == NULL);
+ WRITE_ONCE(huge_zero_pfn, ~0UL);
__free_pages(zero_page, compound_order(zero_page));
return HPAGE_PMD_NR;
}
@@ -177,16 +180,13 @@
{
ssize_t ret = count;
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags);
} else
@@ -250,32 +250,27 @@
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- if (!memcmp("always", buf,
- min(sizeof("always")-1, count))) {
+ if (sysfs_streq(buf, "always")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer+madvise", buf,
- min(sizeof("defer+madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "defer+madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("defer", buf,
- min(sizeof("defer")-1, count))) {
+ } else if (sysfs_streq(buf, "defer")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("madvise", buf,
- min(sizeof("madvise")-1, count))) {
+ } else if (sysfs_streq(buf, "madvise")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
set_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags);
- } else if (!memcmp("never", buf,
- min(sizeof("never")-1, count))) {
+ } else if (sysfs_streq(buf, "never")) {
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags);
clear_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags);
@@ -527,13 +522,13 @@
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
-static unsigned long __thp_get_unmapped_area(struct file *filp, unsigned long len,
+static unsigned long __thp_get_unmapped_area(struct file *filp,
+ unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
{
- unsigned long addr;
loff_t off_end = off + len;
loff_t off_align = round_up(off, size);
- unsigned long len_pad;
+ unsigned long len_pad, ret;
if (off_end <= off_align || (off_end - off_align) < size)
return 0;
@@ -542,30 +537,40 @@
if (len_pad < len || (off + len_pad) < off)
return 0;
- addr = current->mm->get_unmapped_area(filp, 0, len_pad,
+ ret = current->mm->get_unmapped_area(filp, addr, len_pad,
off >> PAGE_SHIFT, flags);
- if (IS_ERR_VALUE(addr))
+
+ /*
+ * The failure might be due to length padding. The caller will retry
+ * without the padding.
+ */
+ if (IS_ERR_VALUE(ret))
return 0;
- addr += (off - addr) & (size - 1);
- return addr;
+ /*
+ * Do not try to align to THP boundary if allocation at the address
+ * hint succeeds.
+ */
+ if (ret == addr)
+ return addr;
+
+ ret += (off - ret) & (size - 1);
+ return ret;
}
unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
+ unsigned long ret;
loff_t off = (loff_t)pgoff << PAGE_SHIFT;
- if (addr)
- goto out;
if (!IS_DAX(filp->f_mapping->host) || !IS_ENABLED(CONFIG_FS_DAX_PMD))
goto out;
- addr = __thp_get_unmapped_area(filp, len, off, flags, PMD_SIZE);
- if (addr)
- return addr;
-
- out:
+ ret = __thp_get_unmapped_area(filp, addr, len, off, flags, PMD_SIZE);
+ if (ret)
+ return ret;
+out:
return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
}
EXPORT_SYMBOL_GPL(thp_get_unmapped_area);
@@ -720,7 +725,6 @@
transparent_hugepage_use_zero_page()) {
pgtable_t pgtable;
struct page *zero_page;
- bool set;
vm_fault_t ret;
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable))
@@ -733,25 +737,25 @@
}
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
ret = 0;
- set = false;
if (pmd_none(*vmf->pmd)) {
ret = check_stable_address_space(vma->vm_mm);
if (ret) {
spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
} else if (userfaultfd_missing(vma)) {
spin_unlock(vmf->ptl);
+ pte_free(vma->vm_mm, pgtable);
ret = handle_userfault(vmf, VM_UFFD_MISSING);
VM_BUG_ON(ret & VM_FAULT_FALLBACK);
} else {
set_huge_zero_page(pgtable, vma->vm_mm, vma,
haddr, vmf->pmd, zero_page);
spin_unlock(vmf->ptl);
- set = true;
}
- } else
+ } else {
spin_unlock(vmf->ptl);
- if (!set)
pte_free(vma->vm_mm, pgtable);
+ }
return ret;
}
gfp = alloc_hugepage_direct_gfpmask(vma);
@@ -1452,13 +1456,12 @@
}
/*
- * FOLL_FORCE can write to even unwritable pmd's, but only
- * after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
+ * but only after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) ||
- ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+ return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1720,7 +1723,7 @@
* If other processes are mapping this page, we couldn't discard
* the page unless they all do MADV_FREE so let's skip the page.
*/
- if (page_mapcount(page) != 1)
+ if (total_mapcount(page) != 1)
goto out;
if (!trylock_page(page))
@@ -2155,7 +2158,7 @@
count_vm_event(THP_SPLIT_PMD);
if (!vma_is_anonymous(vma)) {
- _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+ old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd);
/*
* We are going to unmap this huge page. So
* just go ahead and zap it
@@ -2164,16 +2167,25 @@
zap_deposited_table(mm, pmd);
if (vma_is_dax(vma))
return;
- page = pmd_page(_pmd);
- if (!PageDirty(page) && pmd_dirty(_pmd))
- set_page_dirty(page);
- if (!PageReferenced(page) && pmd_young(_pmd))
- SetPageReferenced(page);
- page_remove_rmap(page, true);
- put_page(page);
+ if (unlikely(is_pmd_migration_entry(old_pmd))) {
+ swp_entry_t entry;
+
+ entry = pmd_to_swp_entry(old_pmd);
+ page = migration_entry_to_page(entry);
+ } else {
+ page = pmd_page(old_pmd);
+ if (!PageDirty(page) && pmd_dirty(old_pmd))
+ set_page_dirty(page);
+ if (!PageReferenced(page) && pmd_young(old_pmd))
+ SetPageReferenced(page);
+ page_remove_rmap(page, true);
+ put_page(page);
+ }
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (is_huge_zero_pmd(*pmd)) {
+ }
+
+ if (is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2261,27 +2273,33 @@
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- atomic_inc(&page[i]._mapcount);
+ if (!pmd_migration)
+ atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_inc(&page[i]._mapcount);
- }
-
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __dec_node_page_state(page, NR_ANON_THPS);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
+ if (!pmd_migration) {
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 &&
+ !TestSetPageDoubleMap(page)) {
for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
+ atomic_inc(&page[i]._mapcount);
}
+
+ lock_page_memcg(page);
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+ unlock_page_memcg(page);
}
smp_wmb(); /* make pte visible before pmd */
@@ -2300,6 +2318,8 @@
{
spinlock_t *ptl;
struct mmu_notifier_range range;
+ bool do_unlock_page = false;
+ pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2312,11 +2332,41 @@
* pmd against. Otherwise we can end up replacing wrong page.
*/
VM_BUG_ON(freeze && !page);
- if (page && page != pmd_page(*pmd))
- goto out;
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ if (page != pmd_page(*pmd))
+ goto out;
+ }
+repeat:
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ if (!page) {
+ page = pmd_page(*pmd);
+ /*
+ * An anonymous page must be locked, to ensure that a
+ * concurrent reuse_swap_page() sees stable mapcount;
+ * but reuse_swap_page() is not used on shmem or file,
+ * and page lock must not be taken when zap_pmd_range()
+ * calls __split_huge_pmd() while i_mmap_lock is held.
+ */
+ if (PageAnon(page)) {
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ do_unlock_page = true;
+ }
+ }
if (PageMlocked(page))
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
@@ -2324,6 +2374,8 @@
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
+ if (do_unlock_page)
+ unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2409,16 +2461,16 @@
static void unmap_page(struct page *page)
{
enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
- bool unmap_success;
+ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
VM_BUG_ON_PAGE(!PageHead(page), page);
if (PageAnon(page))
ttu_flags |= TTU_SPLIT_FREEZE;
- unmap_success = try_to_unmap(page, ttu_flags);
- VM_BUG_ON_PAGE(!unmap_success, page);
+ try_to_unmap(page, ttu_flags);
+
+ VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
static void remap_page(struct page *page)
@@ -2537,7 +2589,7 @@
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_ORDER);
+ split_page_owner(head, HPAGE_PMD_NR);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2697,12 +2749,12 @@
struct deferred_split *ds_queue = get_deferred_split_queue(page);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
- int count, mapcount, extra_pins, ret;
+ int extra_pins, ret;
bool mlocked;
unsigned long flags;
pgoff_t end;
- VM_BUG_ON_PAGE(is_huge_zero_page(page), page);
+ VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -2759,7 +2811,6 @@
mlocked = PageMlocked(page);
unmap_page(head);
- VM_BUG_ON_PAGE(compound_mapcount(head), head);
/* Make sure the page is not on per-CPU pagevec as it takes pin */
if (mlocked)
@@ -2782,9 +2833,7 @@
/* Prevent deferred_split_scan() touching ->_refcount */
spin_lock(&ds_queue->split_queue_lock);
- count = page_count(head);
- mapcount = total_mapcount(head);
- if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) {
+ if (page_ref_freeze(head, 1 + extra_pins)) {
if (!list_empty(page_deferred_list(head))) {
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
@@ -2805,16 +2854,9 @@
} else
ret = 0;
} else {
- if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- pr_alert("total_mapcount: %u, page_count(): %u\n",
- mapcount, count);
- if (PageTail(page))
- dump_page(head, NULL);
- dump_page(page, "total_mapcount(head) > 0");
- BUG();
- }
spin_unlock(&ds_queue->split_queue_lock);
-fail: if (mapping)
+fail:
+ if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
remap_page(head);
@@ -3030,8 +3072,7 @@
return;
flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
- pmdval = *pvmw->pmd;
- pmdp_invalidate(vma, address, pvmw->pmd);
+ pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
if (pmd_dirty(pmdval))
set_page_dirty(page);
entry = make_migration_entry(page, pmd_write(pmdval));
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b45a953..95a3274 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -27,6 +27,7 @@
#include <linux/swapops.h>
#include <linux/jhash.h>
#include <linux/numa.h>
+#include <linux/llist.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -70,6 +71,21 @@
static int num_fault_mutexes;
struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+static inline bool PageHugeFreed(struct page *head)
+{
+ return page_private(head + 4) == -1UL;
+}
+
+static inline void SetPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, -1UL);
+}
+
+static inline void ClearPageHugeFreed(struct page *head)
+{
+ set_page_private(head + 4, 0);
+}
+
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -575,13 +591,20 @@
{
struct hugepage_subpool *spool = subpool_inode(inode);
long rsv_adjust;
+ bool reserved = false;
rsv_adjust = hugepage_subpool_get_pages(spool, 1);
- if (rsv_adjust) {
+ if (rsv_adjust > 0) {
struct hstate *h = hstate_inode(inode);
- hugetlb_acct_memory(h, 1);
+ if (!hugetlb_acct_memory(h, 1))
+ reserved = true;
+ } else if (!rsv_adjust) {
+ reserved = true;
}
+
+ if (!reserved)
+ pr_warn("hugetlb: Huge Page Reserved count may go negative.\n");
}
/*
@@ -868,6 +891,7 @@
list_move(&page->lru, &h->hugepage_freelists[nid]);
h->free_huge_pages++;
h->free_huge_pages_node[nid]++;
+ SetPageHugeFreed(page);
}
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
@@ -885,6 +909,7 @@
return NULL;
list_move(&page->lru, &h->hugepage_activelist);
set_page_refcounted(page);
+ ClearPageHugeFreed(page);
h->free_huge_pages--;
h->free_huge_pages_node[nid]--;
return page;
@@ -1174,14 +1199,16 @@
static void update_and_free_page(struct hstate *h, struct page *page)
{
int i;
+ struct page *subpage = page;
if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
h->nr_huge_pages_node[page_to_nid(page)]--;
- for (i = 0; i < pages_per_huge_page(h); i++) {
- page[i].flags &= ~(1 << PG_locked | 1 << PG_error |
+ for (i = 0; i < pages_per_huge_page(h);
+ i++, subpage = mem_map_next(subpage, page, i)) {
+ subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
1 << PG_referenced | 1 << PG_dirty |
1 << PG_active | 1 << PG_private |
1 << PG_writeback);
@@ -1216,12 +1243,11 @@
*/
bool page_huge_active(struct page *page)
{
- VM_BUG_ON_PAGE(!PageHuge(page), page);
- return PageHead(page) && PagePrivate(&page[1]);
+ return PageHeadHuge(page) && PagePrivate(&page[1]);
}
/* never called for tail page */
-static void set_page_huge_active(struct page *page)
+void set_page_huge_active(struct page *page)
{
VM_BUG_ON_PAGE(!PageHeadHuge(page), page);
SetPagePrivate(&page[1]);
@@ -1255,7 +1281,7 @@
page[2].mapping = NULL;
}
-void free_huge_page(struct page *page)
+static void __free_huge_page(struct page *page)
{
/*
* Can't pass hstate in here because it is called from the
@@ -1318,6 +1344,54 @@
spin_unlock(&hugetlb_lock);
}
+/*
+ * As free_huge_page() can be called from a non-task context, we have
+ * to defer the actual freeing in a workqueue to prevent potential
+ * hugetlb_lock deadlock.
+ *
+ * free_hpage_workfn() locklessly retrieves the linked list of pages to
+ * be freed and frees them one-by-one. As the page->mapping pointer is
+ * going to be cleared in __free_huge_page() anyway, it is reused as the
+ * llist_node structure of a lockless linked list of huge pages to be freed.
+ */
+static LLIST_HEAD(hpage_freelist);
+
+static void free_hpage_workfn(struct work_struct *work)
+{
+ struct llist_node *node;
+ struct page *page;
+
+ node = llist_del_all(&hpage_freelist);
+
+ while (node) {
+ page = container_of((struct address_space **)node,
+ struct page, mapping);
+ node = node->next;
+ __free_huge_page(page);
+ }
+}
+static DECLARE_WORK(free_hpage_work, free_hpage_workfn);
+
+void free_huge_page(struct page *page)
+{
+ /*
+ * Defer freeing if in non-task context to avoid hugetlb_lock deadlock.
+ */
+ if (!in_task()) {
+ /*
+ * Only call schedule_work() if hpage_freelist is previously
+ * empty. Otherwise, schedule_work() had been called but the
+ * workfn hasn't retrieved the list yet.
+ */
+ if (llist_add((struct llist_node *)&page->mapping,
+ &hpage_freelist))
+ schedule_work(&free_hpage_work);
+ return;
+ }
+
+ __free_huge_page(page);
+}
+
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
{
INIT_LIST_HEAD(&page->lru);
@@ -1326,6 +1400,7 @@
set_hugetlb_cgroup(page, NULL);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
+ ClearPageHugeFreed(page);
spin_unlock(&hugetlb_lock);
}
@@ -1386,15 +1461,12 @@
return get_compound_page_dtor(page_head) == free_huge_page;
}
-pgoff_t __basepage_index(struct page *page)
+pgoff_t hugetlb_basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
pgoff_t index = page_index(page_head);
unsigned long compound_idx;
- if (!PageHuge(page_head))
- return page_index(page);
-
if (compound_order(page_head) >= MAX_ORDER)
compound_idx = page_to_pfn(page) - page_to_pfn(page_head);
else
@@ -1553,6 +1625,7 @@
{
int rc = -EBUSY;
+retry:
/* Not to disrupt normal path by vainly holding hugetlb_lock */
if (!PageHuge(page))
return 0;
@@ -1569,6 +1642,26 @@
int nid = page_to_nid(head);
if (h->free_huge_pages - h->resv_huge_pages == 0)
goto out;
+
+ /*
+ * We should make sure that the page is already on the free list
+ * when it is dissolved.
+ */
+ if (unlikely(!PageHugeFreed(head))) {
+ spin_unlock(&hugetlb_lock);
+ cond_resched();
+
+ /*
+ * Theoretically, we should return -EBUSY when we
+ * encounter this race. In fact, we have a chance
+ * to successfully dissolve the page if we do a
+ * retry. Because the race window is quite small.
+ * If we seize this opportunity, it is an optimization
+ * for increasing the success rate of dissolving page.
+ */
+ goto retry;
+ }
+
/*
* Move PageHWPoison flag from head page to the raw error page,
* which makes any subpages rather than the error page reusable.
@@ -2725,8 +2818,10 @@
return -ENOMEM;
retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
- if (retval)
+ if (retval) {
kobject_put(hstate_kobjs[hi]);
+ hstate_kobjs[hi] = NULL;
+ }
return retval;
}
@@ -3033,6 +3128,22 @@
}
#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * can duplicate the @table and alter the duplicate of it.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
@@ -3044,9 +3155,8 @@
if (!hugepages_supported())
return -EOPNOTSUPP;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -3090,9 +3200,8 @@
if (write && hstate_is_gigantic(h))
return -EINVAL;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -3915,7 +4024,7 @@
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3973,7 +4082,7 @@
* So we need to block hugepage fault by PG_hwpoison bit check.
*/
if (unlikely(PageHWPoison(page))) {
- ret = VM_FAULT_HWPOISON |
+ ret = VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
goto backout_unlocked;
}
@@ -4043,7 +4152,7 @@
#ifdef CONFIG_SMP
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+ pgoff_t idx)
{
unsigned long key[2];
u32 hash;
@@ -4051,7 +4160,7 @@
key[0] = (unsigned long) mapping;
key[1] = idx;
- hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
+ hash = jhash2((u32 *)&key, sizeof(key)/(sizeof(u32)), 0);
return hash & (num_fault_mutexes - 1);
}
@@ -4061,7 +4170,7 @@
* return 0 and avoid the hashing overhead.
*/
u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx, unsigned long address)
+ pgoff_t idx)
{
return 0;
}
@@ -4105,7 +4214,7 @@
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4226,10 +4335,20 @@
struct page *page;
if (!*pagep) {
- ret = -ENOMEM;
- page = alloc_huge_page(dst_vma, dst_addr, 0);
- if (IS_ERR(page))
+ /* If a page already exists, then it's UFFDIO_COPY for
+ * a non-missing case. Return -EEXIST.
+ */
+ if (vm_shared &&
+ hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) {
+ ret = -EEXIST;
goto out;
+ }
+
+ page = alloc_huge_page(dst_vma, dst_addr, 0);
+ if (IS_ERR(page)) {
+ ret = -ENOMEM;
+ goto out;
+ }
ret = copy_huge_page_from_user(page,
(const void __user *) src_addr,
@@ -4797,25 +4916,23 @@
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr = *start;
+ unsigned long v_start = ALIGN(vma->vm_start, PUD_SIZE),
+ v_end = ALIGN_DOWN(vma->vm_end, PUD_SIZE);
- if (!(vma->vm_flags & VM_MAYSHARE))
+ /*
+ * vma need span at least one aligned PUD size and the start,end range
+ * must at least partialy within it.
+ */
+ if (!(vma->vm_flags & VM_MAYSHARE) || !(v_end > v_start) ||
+ (*end <= v_start) || (*start >= v_end))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ if (*start > v_start)
+ *start = ALIGN_DOWN(*start, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ if (*end < v_end)
+ *end = ALIGN(*end, PUD_SIZE);
}
/*
@@ -4967,8 +5084,8 @@
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
+ pud_t *pud, pud_entry;
+ pmd_t *pmd, pmd_entry;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
@@ -4978,17 +5095,19 @@
return NULL;
pud = pud_offset(p4d, addr);
- if (sz != PUD_SIZE && pud_none(*pud))
+ pud_entry = READ_ONCE(*pud);
+ if (sz != PUD_SIZE && pud_none(pud_entry))
return NULL;
/* hugepage or swap? */
- if (pud_huge(*pud) || !pud_present(*pud))
+ if (pud_huge(pud_entry) || !pud_present(pud_entry))
return (pte_t *)pud;
pmd = pmd_offset(pud, addr);
- if (sz != PMD_SIZE && pmd_none(*pmd))
+ pmd_entry = READ_ONCE(*pmd);
+ if (sz != PMD_SIZE && pmd_none(pmd_entry))
return NULL;
/* hugepage or swap? */
- if (pmd_huge(*pmd) || !pmd_present(*pmd))
+ if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
return (pte_t *)pmd;
return NULL;
@@ -5075,9 +5194,9 @@
{
bool ret = true;
- VM_BUG_ON_PAGE(!PageHead(page), page);
spin_lock(&hugetlb_lock);
- if (!page_huge_active(page) || !get_page_unless_zero(page)) {
+ if (!PageHeadHuge(page) || !page_huge_active(page) ||
+ !get_page_unless_zero(page)) {
ret = false;
goto unlock;
}
diff --git a/mm/internal.h b/mm/internal.h
index 0d5f720..cf38254 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -339,27 +339,73 @@
extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
/*
- * At what user virtual address is page expected in @vma?
+ * At what user virtual address is page expected in vma?
+ * Returns -EFAULT if all of the page is outside the range of vma.
+ * If page is a compound head, the entire compound page is considered.
*/
static inline unsigned long
-__vma_address(struct page *page, struct vm_area_struct *vma)
-{
- pgoff_t pgoff = page_to_pgoff(page);
- return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-}
-
-static inline unsigned long
vma_address(struct page *page, struct vm_area_struct *vma)
{
- unsigned long start, end;
+ pgoff_t pgoff;
+ unsigned long address;
- start = __vma_address(page, vma);
- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page);
+ if (pgoff >= vma->vm_pgoff) {
+ address = vma->vm_start +
+ ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address >= vma->vm_end)
+ address = -EFAULT;
+ } else if (PageHead(page) &&
+ pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) {
+ /* Test above avoids possibility of wrap to 0 on 32-bit */
+ address = vma->vm_start;
+ } else {
+ address = -EFAULT;
+ }
+ return address;
+}
- /* page should be within @vma mapping range */
- VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
+/*
+ * Then at what user virtual address will none of the page be found in vma?
+ * Assumes that vma_address() already returned a good starting address.
+ * If page is a compound head, the entire compound page is considered.
+ */
+static inline unsigned long
+vma_address_end(struct page *page, struct vm_area_struct *vma)
+{
+ pgoff_t pgoff;
+ unsigned long address;
- return max(start, vma->vm_start);
+ VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */
+ pgoff = page_to_pgoff(page) + compound_nr(page);
+ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+ /* Check for address beyond vma (or wrapped through 0?) */
+ if (address < vma->vm_start || address > vma->vm_end)
+ address = vma->vm_end;
+ return address;
+}
+
+static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
+ struct file *fpin)
+{
+ int flags = vmf->flags;
+
+ if (fpin)
+ return fpin;
+
+ /*
+ * FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
+ * anything, so we only pin the file and drop the mmap_sem if only
+ * FAULT_FLAG_ALLOW_RETRY is set.
+ */
+ if ((flags & (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT)) ==
+ FAULT_FLAG_ALLOW_RETRY) {
+ fpin = get_file(vmf->vma->vm_file);
+ up_read(&vmf->vma->vm_mm->mmap_sem);
+ }
+ return fpin;
}
#else /* !CONFIG_MMU */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 08b43de..f36ffc0 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -14,10 +14,10 @@
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
-CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
-CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
obj-$(CONFIG_KASAN) := common.o init.o report.o
obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 616f9dd..76a8003 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -15,7 +15,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ce45c49..ee21e1c 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -388,9 +388,10 @@
if (kasan_pte_table(*pmd)) {
if (IS_ALIGNED(addr, PMD_SIZE) &&
- IS_ALIGNED(next, PMD_SIZE))
+ IS_ALIGNED(next, PMD_SIZE)) {
pmd_clear(pmd);
- continue;
+ continue;
+ }
}
pte = pte_offset_kernel(pmd, addr);
kasan_remove_pte_table(pte, addr, next);
@@ -413,9 +414,10 @@
if (kasan_pmd_table(*pud)) {
if (IS_ALIGNED(addr, PUD_SIZE) &&
- IS_ALIGNED(next, PUD_SIZE))
+ IS_ALIGNED(next, PUD_SIZE)) {
pud_clear(pud);
- continue;
+ continue;
+ }
}
pmd = pmd_offset(pud, addr);
pmd_base = pmd_offset(pud, 0);
@@ -439,9 +441,10 @@
if (kasan_pud_table(*p4d)) {
if (IS_ALIGNED(addr, P4D_SIZE) &&
- IS_ALIGNED(next, P4D_SIZE))
+ IS_ALIGNED(next, P4D_SIZE)) {
p4d_clear(p4d);
- continue;
+ continue;
+ }
}
pud = pud_offset(p4d, addr);
kasan_remove_pud_table(pud, addr, next);
@@ -473,9 +476,10 @@
if (kasan_p4d_table(*pgd)) {
if (IS_ALIGNED(addr, PGDIR_SIZE) &&
- IS_ALIGNED(next, PGDIR_SIZE))
+ IS_ALIGNED(next, PGDIR_SIZE)) {
pgd_clear(pgd);
- continue;
+ continue;
+ }
}
p4d = p4d_offset(pgd, addr);
@@ -499,7 +503,6 @@
ret = kasan_populate_early_shadow(shadow_start, shadow_end);
if (ret)
- kasan_remove_zero_shadow(shadow_start,
- size >> KASAN_SHADOW_SCALE_SHIFT);
+ kasan_remove_zero_shadow(start, size);
return ret;
}
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 0e987c9..caf4efd 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -12,7 +12,6 @@
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#define DISABLE_BRANCH_PROFILING
#include <linux/export.h>
#include <linux/interrupt.h>
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index a8a57be..3c23265 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -54,6 +54,9 @@
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
@@ -401,7 +404,7 @@
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -438,7 +441,7 @@
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -625,17 +628,17 @@
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
}
- if (likely(writable)) {
- if (likely(referenced)) {
- result = SCAN_SUCCEED;
- trace_mm_collapse_huge_page_isolate(page, none_or_zero,
- referenced, writable, result);
- return 1;
- }
- } else {
- result = SCAN_PAGE_RO;
- }
+ if (unlikely(!writable)) {
+ result = SCAN_PAGE_RO;
+ } else if (unlikely(!referenced)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ trace_mm_collapse_huge_page_isolate(page, none_or_zero,
+ referenced, writable, result);
+ return 1;
+ }
out:
release_pte_pages(pte, _pte);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
@@ -832,6 +835,18 @@
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
@@ -876,6 +891,9 @@
return SCAN_ADDRESS_RANGE;
if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
+ /* Anon VMA expected */
+ if (!vma->anon_vma || vma->vm_ops)
+ return SCAN_VMA_CHECK;
return 0;
}
@@ -1016,9 +1034,6 @@
* handled by the anon_vma lock + PG_lock.
*/
down_write(&mm->mmap_sem);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
@@ -1291,7 +1306,7 @@
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = find_vma(mm, haddr);
- struct page *hpage = NULL;
+ struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
spinlock_t *ptl;
@@ -1311,9 +1326,17 @@
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
+ hpage = find_lock_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, haddr));
+ if (!hpage)
+ return;
+
+ if (!PageHead(hpage))
+ goto drop_hpage;
+
pmd = mm_find_pmd(mm, haddr);
if (!pmd)
- return;
+ goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1332,30 +1355,11 @@
page = vm_normal_page(vma, addr, *pte);
- if (!page || !PageCompound(page))
- goto abort;
-
- if (!hpage) {
- hpage = compound_head(page);
- /*
- * The mapping of the THP should not change.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may
- * change the page table, but the new page will
- * not pass PageCompound() check.
- */
- if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
- goto abort;
- }
-
/*
- * Confirm the page maps to the correct subpage.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may change
- * the page table, but the new page will not pass
- * PageCompound() check.
+ * Note that uprobe, debugger, or MAP_PRIVATE may change the
+ * page table, but the new page will not be a subpage of hpage.
*/
- if (WARN_ON(hpage + i != page))
+ if (hpage + i != page)
goto abort;
count++;
}
@@ -1374,21 +1378,26 @@
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
- if (hpage) {
+ if (count) {
page_ref_sub(hpage, count);
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+
+drop_hpage:
+ unlock_page(hpage);
+ put_page(hpage);
return;
abort:
pte_unmap_unlock(start_pte, ptl);
+ goto drop_hpage;
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
@@ -1417,6 +1426,7 @@
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1445,7 +1455,8 @@
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
@@ -1455,17 +1466,19 @@
* mmap_sem while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- up_write(&vma->vm_mm->mmap_sem);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (down_write_trylock(&mm->mmap_sem)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ up_write(&mm->mmap_sem);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
@@ -1594,7 +1607,7 @@
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
- PAGE_SIZE);
+ end - index);
/* drain pagevecs to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
@@ -1655,6 +1668,7 @@
if (page_has_private(page) &&
!try_to_release_page(page, GFP_KERNEL)) {
result = SCAN_PAGE_HAS_PRIVATE;
+ putback_lru_page(page);
goto out_unlock;
}
@@ -2166,8 +2180,6 @@
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
@@ -2194,3 +2206,11 @@
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 2446076..312942d 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1947,7 +1947,7 @@
create_object((unsigned long)__bss_start, __bss_stop - __bss_start,
KMEMLEAK_GREY, GFP_ATOMIC);
/* only register .data..ro_after_init if not within .data */
- if (__start_ro_after_init < _sdata || __end_ro_after_init > _edata)
+ if (&__start_ro_after_init < &_sdata || &__end_ro_after_init > &_edata)
create_object((unsigned long)__start_ro_after_init,
__end_ro_after_init - __start_ro_after_init,
KMEMLEAK_GREY, GFP_ATOMIC);
diff --git a/mm/ksm.c b/mm/ksm.c
index 7905934..0bbae78 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -793,6 +793,7 @@
stable_node->rmap_hlist_len--;
put_anon_vma(rmap_item->anon_vma);
+ rmap_item->head = NULL;
rmap_item->address &= PAGE_MASK;
} else if (rmap_item->address & UNSTABLE_FLAG) {
@@ -2112,8 +2113,16 @@
down_read(&mm->mmap_sem);
vma = find_mergeable_vma(mm, rmap_item->address);
- err = try_to_merge_one_page(vma, page,
- ZERO_PAGE(rmap_item->address));
+ if (vma) {
+ err = try_to_merge_one_page(vma, page,
+ ZERO_PAGE(rmap_item->address));
+ } else {
+ /*
+ * If the vma is out of date, we do not need to
+ * continue.
+ */
+ err = 0;
+ }
up_read(&mm->mmap_sem);
/*
* In case of failure, the page was not really empty, so we
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 0f1f6b0..d12c194 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -544,7 +544,6 @@
struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
- bool set;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
@@ -556,11 +555,12 @@
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
- set = (!dst->nr_items && src->nr_items);
- dst->nr_items += src->nr_items;
- if (set)
+
+ if (src->nr_items) {
+ dst->nr_items += src->nr_items;
memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
- src->nr_items = 0;
+ src->nr_items = 0;
+ }
spin_unlock_irq(&nlru->lock);
}
diff --git a/mm/maccess.c b/mm/maccess.c
index d065736..2d3c3d0 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -18,6 +18,18 @@
return ret ? -EFAULT : 0;
}
+static __always_inline long
+probe_write_common(void __user *dst, const void *src, size_t size)
+{
+ long ret;
+
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+
+ return ret ? -EFAULT : 0;
+}
+
/**
* probe_kernel_read(): safely attempt to read from a kernel-space location
* @dst: pointer to the buffer that shall take the data
@@ -85,6 +97,7 @@
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
+
long __weak probe_kernel_write(void *dst, const void *src, size_t size)
__attribute__((alias("__probe_kernel_write")));
@@ -94,15 +107,39 @@
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- pagefault_disable();
- ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
- pagefault_enable();
+ ret = probe_write_common((__force void __user *)dst, src, size);
set_fs(old_fs);
- return ret ? -EFAULT : 0;
+ return ret;
}
EXPORT_SYMBOL_GPL(probe_kernel_write);
+/**
+ * probe_user_write(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+
+long __weak probe_user_write(void __user *dst, const void *src, size_t size)
+ __attribute__((alias("__probe_user_write")));
+
+long __probe_user_write(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = get_fs();
+
+ set_fs(USER_DS);
+ if (access_ok(dst, size))
+ ret = probe_write_common(dst, src, size);
+ set_fs(old_fs);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(probe_user_write);
/**
* strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
diff --git a/mm/madvise.c b/mm/madvise.c
index 94c343b..1107e99 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -288,9 +288,9 @@
*/
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
get_file(file);
- up_read(¤t->mm->mmap_sem);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ up_read(¤t->mm->mmap_sem);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
down_read(¤t->mm->mmap_sem);
@@ -335,12 +335,14 @@
}
page = pmd_page(orig_pmd);
+
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ goto huge_unlock;
+
if (next - addr != HPAGE_PMD_SIZE) {
int err;
- if (page_mapcount(page) != 1)
- goto huge_unlock;
-
get_page(page);
spin_unlock(ptl);
lock_page(page);
@@ -378,9 +380,9 @@
return 0;
}
+regular_page:
if (pmd_trans_unstable(pmd))
return 0;
-regular_page:
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -426,6 +428,10 @@
continue;
}
+ /* Do not interfere with other mappings of this page */
+ if (page_mapcount(page) != 1)
+ continue;
+
VM_BUG_ON_PAGE(PageTransCompound(page), page);
if (pte_young(ptent)) {
diff --git a/mm/memblock.c b/mm/memblock.c
index c4b16ca..11f6ae3 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -257,14 +257,6 @@
*
* Find @size free area aligned to @align in the specified range and node.
*
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
* Return:
* Found address on success, 0 on failure.
*/
@@ -273,8 +265,6 @@
phys_addr_t end, int nid,
enum memblock_flags flags)
{
- phys_addr_t kernel_end, ret;
-
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE ||
end == MEMBLOCK_ALLOC_KASAN)
@@ -283,40 +273,13 @@
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
- kernel_end = __pa_symbol(_end);
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
-
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid, flags);
- if (ret)
- return ret;
-
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(IS_ENABLED(CONFIG_MEMORY_HOTREMOVE),
- "memblock: bottom-up allocation failed, memory hotremove may be affected\n");
- }
-
- return __memblock_find_range_top_down(start, end, size, align, nid,
- flags);
+ if (memblock_bottom_up())
+ return __memblock_find_range_bottom_up(start, end, size, align,
+ nid, flags);
+ else
+ return __memblock_find_range_top_down(start, end, size, align,
+ nid, flags);
}
/**
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 46ad252..6d7fe35 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -418,8 +418,10 @@
if (mem_cgroup_is_root(memcg))
continue;
ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
- if (ret)
+ if (ret) {
+ mem_cgroup_iter_break(NULL, memcg);
goto unlock;
+ }
}
unlock:
if (!ret)
@@ -774,8 +776,13 @@
rcu_read_lock();
memcg = memcg_from_slab_page(page);
- /* Untracked pages have no memcg, no lruvec. Update only the node */
- if (!memcg || memcg == root_mem_cgroup) {
+ /*
+ * Untracked pages have no memcg, no lruvec. Update only the
+ * node. If we reparent the slab objects to the root memcg,
+ * when we free the slab object, we need to update the per-memcg
+ * vmstats to keep it correct for the root memcg.
+ */
+ if (!memcg) {
__mod_node_page_state(pgdat, idx, val);
} else {
lruvec = mem_cgroup_lruvec(pgdat, memcg);
@@ -784,6 +791,17 @@
rcu_read_unlock();
}
+void mod_memcg_obj_state(void *p, int idx, int val)
+{
+ struct mem_cgroup *memcg;
+
+ rcu_read_lock();
+ memcg = mem_cgroup_from_obj(p);
+ if (memcg)
+ mod_memcg_state(memcg, idx, val);
+ rcu_read_unlock();
+}
+
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
@@ -2412,28 +2430,44 @@
#define MEMCG_DELAY_SCALING_SHIFT 14
/*
- * Scheduled by try_charge() to be executed from the userland return path
- * and reclaims memory over the high limit.
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
*/
-void mem_cgroup_handle_over_high(void)
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages)
{
- unsigned long usage, high, clamped_high;
- unsigned long pflags;
- unsigned long penalty_jiffies, overage;
- unsigned int nr_pages = current->memcg_nr_pages_over_high;
- struct mem_cgroup *memcg;
+ unsigned long penalty_jiffies;
+ u64 max_overage = 0;
- if (likely(!nr_pages))
- return;
+ do {
+ unsigned long usage, high;
+ u64 overage;
- memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
- current->memcg_nr_pages_over_high = 0;
+ usage = page_counter_read(&memcg->memory);
+ high = READ_ONCE(memcg->high);
+
+ if (usage <= high)
+ continue;
+
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
+
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ overage = div64_u64(overage, high);
+
+ if (overage > max_overage)
+ max_overage = overage;
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ if (!max_overage)
+ return 0;
/*
- * memory.high is breached and reclaim is unable to keep up. Throttle
- * allocators proactively to slow down excessive growth.
- *
* We use overage compared to memory.high to calculate the number of
* jiffies to sleep (penalty_jiffies). Ideally this value should be
* fairly lenient on small overages, and increasingly harsh when the
@@ -2441,24 +2475,9 @@
* its crazy behaviour, so we exponentially increase the delay based on
* overage amount.
*/
-
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
-
- if (usage <= high)
- goto out;
-
- /*
- * Prevent division by 0 in overage calculation by acting as if it was a
- * threshold of 1 page
- */
- clamped_high = max(high, 1UL);
-
- overage = div_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
- clamped_high);
-
- penalty_jiffies = ((u64)overage * overage * HZ)
- >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
+ penalty_jiffies = max_overage * max_overage * HZ;
+ penalty_jiffies >>= MEMCG_DELAY_PRECISION_SHIFT;
+ penalty_jiffies >>= MEMCG_DELAY_SCALING_SHIFT;
/*
* Factor in the task's own contribution to the overage, such that four
@@ -2475,7 +2494,32 @@
* application moving forwards and also permit diagnostics, albeit
* extremely slowly.
*/
- penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+}
+
+/*
+ * Scheduled by try_charge() to be executed from the userland return path
+ * and reclaims memory over the high limit.
+ */
+void mem_cgroup_handle_over_high(void)
+{
+ unsigned long penalty_jiffies;
+ unsigned long pflags;
+ unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ struct mem_cgroup *memcg;
+
+ if (likely(!nr_pages))
+ return;
+
+ memcg = get_mem_cgroup_from_mm(current->mm);
+ reclaim_high(memcg, nr_pages, GFP_KERNEL);
+ current->memcg_nr_pages_over_high = 0;
+
+ /*
+ * memory.high is breached and reclaim is unable to keep up. Throttle
+ * allocators proactively to slow down excessive growth.
+ */
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2753,6 +2797,33 @@
}
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Returns a pointer to the memory cgroup to which the kernel object is charged.
+ *
+ * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
+ * cgroup_mutex, etc.
+ */
+struct mem_cgroup *mem_cgroup_from_obj(void *p)
+{
+ struct page *page;
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ page = virt_to_head_page(p);
+
+ /*
+ * Slab pages don't have page->mem_cgroup set because corresponding
+ * kmem caches can be reparented during the lifetime. That's why
+ * memcg_from_slab_page() should be used instead.
+ */
+ if (PageSlab(page))
+ return memcg_from_slab_page(page);
+
+ /* All other pages use page->mem_cgroup */
+ return page->mem_cgroup;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2829,8 +2900,10 @@
return;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw)
+ if (!cw) {
+ css_put(&memcg->css);
return;
+ }
cw->memcg = memcg;
cw->cachep = cachep;
@@ -3404,49 +3477,34 @@
}
}
-static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only)
+static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg)
{
- unsigned long stat[MEMCG_NR_STAT];
+ unsigned long stat[MEMCG_NR_STAT] = {0};
struct mem_cgroup *mi;
int node, cpu, i;
- int min_idx, max_idx;
-
- if (slab_only) {
- min_idx = NR_SLAB_RECLAIMABLE;
- max_idx = NR_SLAB_UNRECLAIMABLE;
- } else {
- min_idx = 0;
- max_idx = MEMCG_NR_STAT;
- }
-
- for (i = min_idx; i < max_idx; i++)
- stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu);
for (mi = memcg; mi; mi = parent_mem_cgroup(mi))
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < MEMCG_NR_STAT; i++)
atomic_long_add(stat[i], &mi->vmstats[i]);
- if (!slab_only)
- max_idx = NR_VM_NODE_STAT_ITEMS;
-
for_each_node(node) {
struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
struct mem_cgroup_per_node *pi;
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] = 0;
for_each_online_cpu(cpu)
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
stat[i] += per_cpu(
pn->lruvec_stat_cpu->count[i], cpu);
for (pi = pn; pi; pi = parent_nodeinfo(pi, node))
- for (i = min_idx; i < max_idx; i++)
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
atomic_long_add(stat[i], &pi->lruvec_stat[i]);
}
}
@@ -3520,13 +3578,9 @@
parent = root_mem_cgroup;
/*
- * Deactivate and reparent kmem_caches. Then flush percpu
- * slab statistics to have precise values at the parent and
- * all ancestor levels. It's required to keep slab stats
- * accurate after the reparenting of kmem_caches.
+ * Deactivate and reparent kmem_caches.
*/
memcg_deactivate_kmem_caches(memcg, parent);
- memcg_flush_percpu_vmstats(memcg, true);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -4168,7 +4222,7 @@
struct mem_cgroup_thresholds *thresholds;
struct mem_cgroup_threshold_ary *new;
unsigned long usage;
- int i, j, size;
+ int i, j, size, entries;
mutex_lock(&memcg->thresholds_lock);
@@ -4188,14 +4242,20 @@
__mem_cgroup_threshold(memcg, type == _MEMSWAP);
/* Calculate new number of threshold */
- size = 0;
+ size = entries = 0;
for (i = 0; i < thresholds->primary->size; i++) {
if (thresholds->primary->entries[i].eventfd != eventfd)
size++;
+ else
+ entries++;
}
new = thresholds->spare;
+ /* If no items related to eventfd have been cleared, nothing to do */
+ if (!entries)
+ goto unlock;
+
/* Set thresholds array to NULL if we don't have thresholds */
if (!size) {
kfree(new);
@@ -5037,7 +5097,7 @@
* Flush percpu vmstats and vmevents to guarantee the value correctness
* on parent's and all ancestor levels.
*/
- memcg_flush_percpu_vmstats(memcg, false);
+ memcg_flush_percpu_vmstats(memcg);
memcg_flush_percpu_vmevents(memcg);
__mem_cgroup_free(memcg);
}
@@ -5048,19 +5108,22 @@
unsigned int size;
int node;
int __maybe_unused i;
+ long error = -ENOMEM;
size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
memcg = kzalloc(size, GFP_KERNEL);
if (!memcg)
- return NULL;
+ return ERR_PTR(error);
memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
1, MEM_CGROUP_ID_MAX,
GFP_KERNEL);
- if (memcg->id.id < 0)
+ if (memcg->id.id < 0) {
+ error = memcg->id.id;
goto fail;
+ }
memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
if (!memcg->vmstats_local)
@@ -5105,7 +5168,7 @@
fail:
mem_cgroup_id_remove(memcg);
__mem_cgroup_free(memcg);
- return NULL;
+ return ERR_PTR(error);
}
static struct cgroup_subsys_state * __ref
@@ -5116,8 +5179,8 @@
long error = -ENOMEM;
memcg = mem_cgroup_alloc();
- if (!memcg)
- return ERR_PTR(error);
+ if (IS_ERR(memcg))
+ return ERR_CAST(memcg);
memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
@@ -5167,7 +5230,7 @@
fail:
mem_cgroup_id_remove(memcg);
mem_cgroup_free(memcg);
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(error);
}
static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
@@ -5340,7 +5403,7 @@
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON))
return NULL;
/*
@@ -5359,6 +5422,9 @@
return page;
}
+ if (non_swap_entry(ent))
+ return NULL;
+
/*
* Because lookup_swap_cache() updates some statistics counter,
* we call find_get_page() with swapper_space directly.
@@ -5431,7 +5497,6 @@
{
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned long flags;
unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret;
bool anon;
@@ -5458,18 +5523,13 @@
from_vec = mem_cgroup_lruvec(pgdat, from);
to_vec = mem_cgroup_lruvec(pgdat, to);
- spin_lock_irqsave(&from->move_lock, flags);
+ lock_page_memcg(page);
if (!anon && page_mapped(page)) {
__mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
__mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
}
- /*
- * move_lock grabbed above and caller set from->moving_account, so
- * mod_memcg_page_state will serialize updates to PageDirty.
- * So mapping should be stable for dirty pages.
- */
if (!anon && PageDirty(page)) {
struct address_space *mapping = page_mapping(page);
@@ -5484,34 +5544,24 @@
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && !list_empty(page_deferred_list(page))) {
- spin_lock(&from->deferred_split_queue.split_queue_lock);
- list_del_init(page_deferred_list(page));
- from->deferred_split_queue.split_queue_len--;
- spin_unlock(&from->deferred_split_queue.split_queue_lock);
- }
-#endif
/*
+ * All state has been migrated, let's switch to the new memcg.
+ *
* It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
*/
+ smp_mb();
- /* caller should have done css_get */
- page->mem_cgroup = to;
+ page->mem_cgroup = to; /* caller should have done css_get */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (compound && list_empty(page_deferred_list(page))) {
- spin_lock(&to->deferred_split_queue.split_queue_lock);
- list_add_tail(page_deferred_list(page),
- &to->deferred_split_queue.split_queue);
- to->deferred_split_queue.split_queue_len++;
- spin_unlock(&to->deferred_split_queue.split_queue_lock);
- }
-#endif
-
- spin_unlock_irqrestore(&from->move_lock, flags);
+ __unlock_page_memcg(from);
ret = 0;
@@ -5730,7 +5780,6 @@
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- mem_cgroup_id_get_many(mc.to, mc.moved_swap);
css_put_many(&mc.to->css, mc.moved_swap);
mc.moved_swap = 0;
@@ -5921,7 +5970,8 @@
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
- /* we fixup refcnts and charges later. */
+ mem_cgroup_id_get_many(mc.to, 1);
+ /* we fixup other refcnts and charges later. */
mc.moved_swap++;
}
break;
@@ -6396,6 +6446,14 @@
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
return MEMCG_PROT_NONE;
@@ -6827,19 +6885,9 @@
if (!mem_cgroup_sockets_enabled)
return;
- /*
- * Socket cloning can throw us here with sk_memcg already
- * filled. It won't however, necessarily happen from
- * process context. So the test for root memcg given
- * the current task's memcg won't help us in this case.
- *
- * Respecting the original socket's memcg is a better
- * decision in this case.
- */
- if (sk->sk_memcg) {
- css_get(&sk->sk_memcg->css);
+ /* Do not associate the sock with unrelated interrupted task's memcg. */
+ if (in_interrupt())
return;
- }
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 3151c87..9030ab0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1215,7 +1215,7 @@
* communicated in siginfo, see kill_proc()
*/
start = (page->index << PAGE_SHIFT) & ~(size - 1);
- unmap_mapping_range(page->mapping, start, start + size, 0);
+ unmap_mapping_range(page->mapping, start, size, 0);
}
kill_procs(&tokill, flags & MF_MUST_KILL, !unmap_success, pfn, flags);
rc = 0;
@@ -1382,7 +1382,12 @@
return 0;
}
- if (!PageTransTail(p) && !PageLRU(p))
+ /*
+ * __munlock_pagevec may clear a writeback page's LRU flag without
+ * page_lock. We need wait writeback completion for this page or it
+ * may trigger vfs BUG while evict inode.
+ */
+ if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p))
goto identify_page_state;
/*
diff --git a/mm/memory.c b/mm/memory.c
index b1ca51a..4bb7c6a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -118,6 +118,18 @@
2;
#endif
+#ifndef arch_faults_on_old_pte
+static inline bool arch_faults_on_old_pte(void)
+{
+ /*
+ * Those arches which don't have hw access flag feature need to
+ * implement their own helper. By default, "true" means pagefault
+ * will be hit on old pte.
+ */
+ return true;
+}
+#endif
+
static int __init disable_randmaps(char *s)
{
randomize_va_space = 0;
@@ -138,7 +150,7 @@
zero_pfn = page_to_pfn(ZERO_PAGE(0));
return 0;
}
-core_initcall(init_zero_pfn);
+early_initcall(init_zero_pfn);
#if defined(SPLIT_RSS_COUNTING)
@@ -1153,7 +1165,18 @@
else if (zap_huge_pmd(tlb, vma, pmd, addr))
goto next;
/* fall through */
+ } else if (details && details->single_page &&
+ PageTransCompound(details->single_page) &&
+ next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) {
+ spinlock_t *ptl = pmd_lock(tlb->mm, pmd);
+ /*
+ * Take and drop THP pmd lock so that we cannot return
+ * prematurely, while zap_huge_pmd() has cleared *pmd,
+ * but not yet decremented compound_mapcount().
+ */
+ spin_unlock(ptl);
}
+
/*
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
@@ -1792,11 +1815,11 @@
unsigned long addr, unsigned long end,
unsigned long pfn, pgprot_t prot)
{
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
spinlock_t *ptl;
int err = 0;
- pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
+ mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
arch_enter_lazy_mmu_mode();
@@ -1810,7 +1833,7 @@
pfn++;
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
return err;
}
@@ -2145,32 +2168,101 @@
return same;
}
-static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma)
+static inline bool cow_user_page(struct page *dst, struct page *src,
+ struct vm_fault *vmf)
{
+ bool ret;
+ void *kaddr;
+ void __user *uaddr;
+ bool locked = false;
+ struct vm_area_struct *vma = vmf->vma;
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long addr = vmf->address;
+
debug_dma_assert_idle(src);
+ if (likely(src)) {
+ copy_user_highpage(dst, src, addr, vma);
+ return true;
+ }
+
/*
* If the source page was a PFN mapping, we don't have
* a "struct page" for it. We do a best-effort copy by
* just copying from the original user address. If that
* fails, we just zero-fill it. Live with it.
*/
- if (unlikely(!src)) {
- void *kaddr = kmap_atomic(dst);
- void __user *uaddr = (void __user *)(va & PAGE_MASK);
+ kaddr = kmap_atomic(dst);
+ uaddr = (void __user *)(addr & PAGE_MASK);
+
+ /*
+ * On architectures with software "accessed" bits, we would
+ * take a double page fault, so mark it accessed here.
+ */
+ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) {
+ pte_t entry;
+
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /*
+ * Other thread has already handled the fault
+ * and we don't need to do anything. If it's
+ * not the case, the fault will be triggered
+ * again on the same address.
+ */
+ ret = false;
+ goto pte_unlock;
+ }
+
+ entry = pte_mkyoung(vmf->orig_pte);
+ if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0))
+ update_mmu_cache(vma, addr, vmf->pte);
+ }
+
+ /*
+ * This really shouldn't fail, because the page is there
+ * in the page tables. But it might just be unreadable,
+ * in which case we just give up and fill the result with
+ * zeroes.
+ */
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ if (locked)
+ goto warn;
+
+ /* Re-validate under PTL if the page is still mapped */
+ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
+ locked = true;
+ if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
+ /* The PTE changed under us. Retry page fault. */
+ ret = false;
+ goto pte_unlock;
+ }
/*
- * This really shouldn't fail, because the page is there
- * in the page tables. But it might just be unreadable,
- * in which case we just give up and fill the result with
- * zeroes.
+ * The same page can be mapped back since last copy attampt.
+ * Try to copy again under PTL.
*/
- if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
+ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
+ /*
+ * Give a warn in case there can be some obscure
+ * use-case
+ */
+warn:
+ WARN_ON_ONCE(1);
clear_page(kaddr);
- kunmap_atomic(kaddr);
- flush_dcache_page(dst);
- } else
- copy_user_highpage(dst, src, va, vma);
+ }
+ }
+
+ ret = true;
+
+pte_unlock:
+ if (locked)
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ kunmap_atomic(kaddr);
+ flush_dcache_page(dst);
+
+ return ret;
}
static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma)
@@ -2227,10 +2319,11 @@
*
* The function expects the page to be locked and unlocks it.
*/
-static void fault_dirty_shared_page(struct vm_area_struct *vma,
- struct page *page)
+static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
{
+ struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping;
+ struct page *page = vmf->page;
bool dirtied;
bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite;
@@ -2245,16 +2338,30 @@
mapping = page_rmapping(page);
unlock_page(page);
- if ((dirtied || page_mkwrite) && mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
-
if (!page_mkwrite)
file_update_time(vma->vm_file);
+
+ /*
+ * Throttle page dirtying rate down to writeback speed.
+ *
+ * mapping may be NULL here because some device drivers do not
+ * set page.mapping but still dirty their pages
+ *
+ * Drop the mmap_sem before waiting on IO, if we can. The file
+ * is pinning the mapping, as per above.
+ */
+ if ((dirtied || page_mkwrite) && mapping) {
+ struct file *fpin;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ balance_dirty_pages_ratelimited(mapping);
+ if (fpin) {
+ fput(fpin);
+ return VM_FAULT_RETRY;
+ }
+ }
+
+ return 0;
}
/*
@@ -2327,7 +2434,19 @@
vmf->address);
if (!new_page)
goto oom;
- cow_user_page(new_page, old_page, vmf->address, vma);
+
+ if (!cow_user_page(new_page, old_page, vmf)) {
+ /*
+ * COW failed, if the fault was solved by other,
+ * it's fine. If not, userspace would re-fault on
+ * the same address and we will handle the fault
+ * from the second attempt.
+ */
+ put_page(new_page);
+ if (old_page)
+ put_page(old_page);
+ return 0;
+ }
}
if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
@@ -2497,6 +2616,7 @@
__releases(vmf->ptl)
{
struct vm_area_struct *vma = vmf->vma;
+ vm_fault_t ret = VM_FAULT_WRITE;
get_page(vmf->page);
@@ -2520,10 +2640,10 @@
wp_page_reuse(vmf);
lock_page(vmf->page);
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
put_page(vmf->page);
- return VM_FAULT_WRITE;
+ return ret;
}
/*
@@ -2661,6 +2781,36 @@
}
/**
+ * unmap_mapping_page() - Unmap single page from processes.
+ * @page: The locked page to be unmapped.
+ *
+ * Unmap this page from any userspace process which still has it mmaped.
+ * Typically, for efficiency, the range of nearby pages has already been
+ * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once
+ * truncation or invalidation holds the lock on a page, it may find that
+ * the page has been remapped again: and then uses unmap_mapping_page()
+ * to unmap it finally.
+ */
+void unmap_mapping_page(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct zap_details details = { };
+
+ VM_BUG_ON(!PageLocked(page));
+ VM_BUG_ON(PageTail(page));
+
+ details.check_mapping = mapping;
+ details.first_index = page->index;
+ details.last_index = page->index + hpage_nr_pages(page) - 1;
+ details.single_page = page;
+
+ i_mmap_lock_write(mapping);
+ if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)))
+ unmap_mapping_range_tree(&mapping->i_mmap, &details);
+ i_mmap_unlock_write(mapping);
+}
+
+/**
* unmap_mapping_pages() - Unmap pages from processes.
* @mapping: The address space containing pages to be unmapped.
* @start: Index of first page to be unmapped.
@@ -3567,7 +3717,7 @@
return ret;
}
- fault_dirty_shared_page(vma, vmf->page);
+ ret |= fault_dirty_shared_page(vmf);
return ret;
}
@@ -4113,9 +4263,9 @@
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp,
+ pmd_t **pmdpp, spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4180,31 +4330,33 @@
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+/**
+ * follow_pte - look up PTE at a user virtual address
+ * @mm: the mm_struct of the target address space
+ * @address: user virtual address
+ * @ptepp: location to store found PTE
+ * @ptlp: location to store the lock for the PTE
+ *
+ * On a successful return, the pointer to the PTE is stored in @ptepp;
+ * the corresponding lock is taken and its location is stored in @ptlp.
+ * The contents of the PTE are only stable until @ptlp is released;
+ * any further use, if any, must be protected against invalidation
+ * with MMU notifiers.
+ *
+ * Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
+ * should be taken for read.
+ *
+ * KVM uses this function. While it is arguably less bad than ``follow_pfn``,
+ * it is not a good general-purpose API.
+ *
+ * Return: zero on success, -ve otherwise.
+ */
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ pte_t **ptepp, spinlock_t **ptlp)
{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL,
- ptepp, NULL, ptlp)));
- return res;
+ return follow_invalidate_pte(mm, address, NULL, ptepp, NULL, ptlp);
}
-
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, range,
- ptepp, pmdpp, ptlp)));
- return res;
-}
-EXPORT_SYMBOL(follow_pte_pmd);
+EXPORT_SYMBOL_GPL(follow_pte);
/**
* follow_pfn - look up PFN at a user virtual address
@@ -4214,6 +4366,9 @@
*
* Only IO mappings and raw PFN mappings are allowed.
*
+ * This function does not allow the caller to read the permissions
+ * of the PTE. Do not use it.
+ *
* Return: zero and the pfn at @pfn on success, -ve otherwise.
*/
int follow_pfn(struct vm_area_struct *vma, unsigned long address,
@@ -4604,17 +4759,19 @@
void *page_kaddr;
unsigned long i, rc = 0;
unsigned long ret_val = pages_per_huge_page * PAGE_SIZE;
+ struct page *subpage = dst_page;
- for (i = 0; i < pages_per_huge_page; i++) {
+ for (i = 0; i < pages_per_huge_page;
+ i++, subpage = mem_map_next(subpage, dst_page, i)) {
if (allow_pagefault)
- page_kaddr = kmap(dst_page + i);
+ page_kaddr = kmap(subpage);
else
- page_kaddr = kmap_atomic(dst_page + i);
+ page_kaddr = kmap_atomic(subpage);
rc = copy_from_user(page_kaddr,
(const void __user *)(src + i * PAGE_SIZE),
PAGE_SIZE);
if (allow_pagefault)
- kunmap(dst_page + i);
+ kunmap(subpage);
else
kunmap_atomic(page_kaddr);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f307bd8..bcc2686 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -465,8 +465,9 @@
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-static void __remove_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages)
+void __ref remove_pfn_range_from_zone(struct zone *zone,
+ unsigned long start_pfn,
+ unsigned long nr_pages)
{
struct pglist_data *pgdat = zone->zone_pgdat;
unsigned long flags;
@@ -481,28 +482,30 @@
return;
#endif
+ clear_zone_contiguous(zone);
+
pgdat_resize_lock(zone->zone_pgdat, &flags);
shrink_zone_span(zone, start_pfn, start_pfn + nr_pages);
update_pgdat_span(pgdat);
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+
+ set_zone_contiguous(zone);
}
-static void __remove_section(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, unsigned long map_offset,
- struct vmem_altmap *altmap)
+static void __remove_section(unsigned long pfn, unsigned long nr_pages,
+ unsigned long map_offset,
+ struct vmem_altmap *altmap)
{
struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
if (WARN_ON_ONCE(!valid_section(ms)))
return;
- __remove_zone(zone, pfn, nr_pages);
sparse_remove_section(ms, pfn, nr_pages, map_offset, altmap);
}
/**
- * __remove_pages() - remove sections of pages from a zone
- * @zone: zone from which pages need to be removed
+ * __remove_pages() - remove sections of pages
* @pfn: starting pageframe (must be aligned to start of a section)
* @nr_pages: number of pages to remove (must be multiple of section size)
* @altmap: alternative device page map or %NULL if default memmap is used
@@ -512,16 +515,14 @@
* sure that pages are marked reserved and zones are adjust properly by
* calling offline_pages().
*/
-void __remove_pages(struct zone *zone, unsigned long pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+void __remove_pages(unsigned long pfn, unsigned long nr_pages,
+ struct vmem_altmap *altmap)
{
unsigned long map_offset = 0;
unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
- clear_zone_contiguous(zone);
-
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
@@ -533,13 +534,11 @@
cond_resched();
pfns = min(nr_pages, PAGES_PER_SECTION
- (pfn & ~PAGE_SECTION_MASK));
- __remove_section(zone, pfn, pfns, map_offset, altmap);
+ __remove_section(pfn, pfns, map_offset, altmap);
pfn += pfns;
nr_pages -= pfns;
map_offset = 0;
}
-
- set_zone_contiguous(zone);
}
int set_online_page_callback(online_page_callback_t callback)
@@ -599,7 +598,13 @@
static void generic_online_page(struct page *page, unsigned int order)
{
- kernel_map_pages(page, 1 << order, 1);
+ /*
+ * Freeing the page with debug_pagealloc enabled will try to unmap it,
+ * so we should map it first. This is better than introducing a special
+ * case in page freeing fast path.
+ */
+ if (debug_pagealloc_enabled_static())
+ kernel_map_pages(page, 1 << order, 1);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
@@ -720,7 +725,7 @@
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMMAP_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap);
set_zone_contiguous(zone);
}
@@ -770,8 +775,8 @@
return movable_node_enabled ? movable_zone : kernel_zone;
}
-struct zone * zone_for_pfn_range(int online_type, int nid, unsigned start_pfn,
- unsigned long nr_pages)
+struct zone *zone_for_pfn_range(int online_type, int nid,
+ unsigned long start_pfn, unsigned long nr_pages)
{
if (online_type == MMOP_ONLINE_KERNEL)
return default_kernel_zone_for_pfn(nid, start_pfn, nr_pages);
@@ -867,6 +872,7 @@
(unsigned long long) pfn << PAGE_SHIFT,
(((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1);
memory_notify(MEM_CANCEL_ONLINE, &arg);
+ remove_pfn_range_from_zone(zone, pfn, nr_pages);
mem_hotplug_done();
return ret;
}
@@ -1076,7 +1082,8 @@
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
BUG_ON(ret);
/* create new memmap entry */
@@ -1560,6 +1567,20 @@
/* check again */
ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
NULL, check_pages_isolated_cb);
+ /*
+ * per-cpu pages are drained in start_isolate_page_range, but if
+ * there are still pages that are not free, make sure that we
+ * drain again, because when we isolated range we might
+ * have raced with another thread that was adding pages to pcp
+ * list.
+ *
+ * Forward progress should be still guaranteed because
+ * pages on the pcp list can only belong to MOVABLE_ZONE
+ * because has_unmovable_pages explicitly checks for
+ * PageBuddy on freed pages on other zones.
+ */
+ if (ret)
+ drain_all_pages(zone);
} while (ret);
/* Ok, all of our target is isolated.
@@ -1602,6 +1623,7 @@
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
+ remove_pfn_range_from_zone(zone, start_pfn, nr_pages);
mem_hotplug_done();
return 0;
@@ -1737,8 +1759,6 @@
BUG_ON(check_hotplug_memory_range(start, size));
- mem_hotplug_begin();
-
/*
* All memory blocks must be offlined before removing memory. Check
* whether all memory blocks in question are offline and return error
@@ -1746,24 +1766,28 @@
*/
rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
if (rc)
- goto done;
+ return rc;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
memblock_free(start, size);
memblock_remove(start, size);
- /* remove memory block devices before removing memory */
+ /*
+ * Memory block device removal under the device_hotplug_lock is
+ * a barrier against racing online attempts.
+ */
remove_memory_block_devices(start, size);
+ mem_hotplug_begin();
+
arch_remove_memory(nid, start, size, NULL);
__release_memory_resource(start, size);
try_offline_node(nid);
-done:
mem_hotplug_done();
- return rc;
+ return 0;
}
/**
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index e08c941..87d1659 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -496,7 +496,7 @@
unsigned long flags = qp->flags;
int ret;
bool has_unmovable = false;
- pte_t *pte;
+ pte_t *pte, *mapped_pte;
spinlock_t *ptl;
ptl = pmd_trans_huge_lock(pmd, vma);
@@ -510,7 +510,7 @@
if (pmd_trans_unstable(pmd))
return 0;
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
@@ -542,7 +542,7 @@
} else
break;
}
- pte_unmap_unlock(pte - 1, ptl);
+ pte_unmap_unlock(mapped_pte, ptl);
cond_resched();
if (has_unmovable)
@@ -2802,6 +2802,9 @@
char *flags = strchr(str, '=');
int err = 1, mode;
+ if (flags)
+ *flags++ = '\0'; /* terminate mode string */
+
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
@@ -2812,9 +2815,6 @@
} else
nodes_clear(nodes);
- if (flags)
- *flags++ = '\0'; /* terminate mode string */
-
mode = match_string(policy_modes, MPOL_MAX, str);
if (mode < 0)
goto out;
@@ -2822,7 +2822,9 @@
switch (mode) {
case MPOL_PREFERRED:
/*
- * Insist on a nodelist of one node only
+ * Insist on a nodelist of one node only, although later
+ * we use first_node(nodes) to grab a single node, so here
+ * nodelist (or nodes) cannot be empty.
*/
if (nodelist) {
char *rest = nodelist;
@@ -2830,6 +2832,8 @@
rest++;
if (*rest)
goto out;
+ if (nodes_empty(nodes))
+ goto out;
}
break;
case MPOL_INTERLEAVE:
diff --git a/mm/memremap.c b/mm/memremap.c
index 03ccbdf..c51c6bd 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -120,7 +120,7 @@
mem_hotplug_begin();
if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
- __remove_pages(page_zone(first_page), PHYS_PFN(res->start),
+ __remove_pages(PHYS_PFN(res->start),
PHYS_PFN(resource_size(res)), NULL);
} else {
arch_remove_memory(nid, res->start, resource_size(res),
diff --git a/mm/migrate.c b/mm/migrate.c
index 4fe45d1..5092ef2 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -321,6 +321,7 @@
goto out;
page = migration_entry_to_page(entry);
+ page = compound_head(page);
/*
* Once page cache replacement of page migration started, page_count
@@ -1516,9 +1517,11 @@
/*
* Resolves the given address to a struct page, isolates it from the LRU and
* puts it to the given pagelist.
- * Returns -errno if the page cannot be found/isolated or 0 when it has been
- * queued or the page doesn't need to be migrated because it is already on
- * the target node
+ * Returns:
+ * errno - if the page cannot be found/isolated
+ * 0 - when it doesn't have to be migrated because it is already on the
+ * target node
+ * 1 - when it has been queued
*/
static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
int node, struct list_head *pagelist, bool migrate_all)
@@ -1557,7 +1560,7 @@
if (PageHuge(page)) {
if (PageHead(page)) {
isolate_huge_page(page, pagelist);
- err = 0;
+ err = 1;
}
} else {
struct page *head;
@@ -1567,7 +1570,7 @@
if (err)
goto out_putpage;
- err = 0;
+ err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_cache(head),
@@ -1629,8 +1632,19 @@
start = i;
} else if (node != current_node) {
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to incude the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
err = store_status(status, start, current_node, i - start);
if (err)
goto out;
@@ -1644,16 +1658,28 @@
*/
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err)
+
+ if (!err) {
+ /* The page is already on the target node */
+ err = store_status(status, i, current_node, 1);
+ if (err)
+ goto out_flush;
continue;
+ } else if (err > 0) {
+ /* The page is successfully queued for migration */
+ continue;
+ }
err = store_status(status, i, err, 1);
if (err)
goto out_flush;
err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err)
+ if (err) {
+ if (err > 0)
+ err += nr_pages - i - 1;
goto out;
+ }
if (i > start) {
err = store_status(status, start, current_node, i - start);
if (err)
@@ -1667,9 +1693,16 @@
/* Make sure we do not overwrite the existing error */
err1 = do_move_pages_to_node(mm, &pagelist, current_node);
+ /*
+ * Don't have to report non-attempted pages here since:
+ * - If the above loop is done gracefully all pages have been
+ * attempted.
+ * - If the above loop is aborted it means a fatal error
+ * happened, should return ret.
+ */
if (!err1)
err1 = store_status(status, start, current_node, i - start);
- if (!err)
+ if (err >= 0)
err = err1;
out:
return err;
@@ -2739,6 +2772,13 @@
swp_entry = make_device_private_entry(page, vma->vm_flags & VM_WRITE);
entry = swp_entry_to_pte(swp_entry);
+ } else {
+ /*
+ * For now we only support migrating to un-addressable
+ * device memory.
+ */
+ pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
+ goto abort;
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
diff --git a/mm/mmap.c b/mm/mmap.c
index a7d8c84..ba78f1f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -90,12 +90,6 @@
* MAP_PRIVATE r: (no) no r: (yes) yes r: (no) yes r: (no) yes
* w: (no) no w: (no) no w: (copy) copy w: (no) no
* x: (no) no x: (no) yes x: (no) yes x: (yes) yes
- *
- * On arm64, PROT_EXEC has the following behaviour for both MAP_SHARED and
- * MAP_PRIVATE:
- * r: (no) no
- * w: (no) no
- * x: (yes) yes
*/
pgprot_t protection_map[16] __ro_after_init = {
__P000, __P001, __P010, __P011, __P100, __P101, __P110, __P111,
@@ -201,8 +195,6 @@
bool downgraded = false;
LIST_HEAD(uf);
- brk = untagged_addr(brk);
-
if (down_write_killable(&mm->mmap_sem))
return -EINTR;
@@ -1589,8 +1581,6 @@
struct file *file = NULL;
unsigned long retval;
- addr = untagged_addr(addr);
-
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
file = fget(fd);
@@ -2136,6 +2126,7 @@
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
info.align_mask = 0;
+ info.align_offset = 0;
return vm_unmapped_area(&info);
}
#endif
@@ -2177,6 +2168,7 @@
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
info.align_mask = 0;
+ info.align_offset = 0;
addr = vm_unmapped_area(&info);
/*
@@ -2632,7 +2624,7 @@
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
-static void
+static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
@@ -2657,6 +2649,17 @@
/* Kill the cache */
vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
}
/*
@@ -2837,7 +2840,8 @@
}
/* Detach vmas from rbtree */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
if (downgrade)
downgrade_write(&mm->mmap_sem);
@@ -3170,6 +3174,7 @@
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ cond_resched();
}
vm_unacct_memory(nr_accounted);
}
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
index 3e612ae..a1da47e 100644
--- a/mm/mmu_context.c
+++ b/mm/mmu_context.c
@@ -25,13 +25,16 @@
struct task_struct *tsk = current;
task_lock(tsk);
+ /* Hold off tlb flush IPIs while switching mm's */
+ local_irq_disable();
active_mm = tsk->active_mm;
if (active_mm != mm) {
mmgrab(mm);
tsk->active_mm = mm;
}
tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
+ switch_mm_irqs_off(active_mm, mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
#ifdef finish_arch_post_lock_switch
finish_arch_post_lock_switch();
@@ -56,9 +59,11 @@
task_lock(tsk);
sync_mm_rss(mm);
+ local_irq_disable();
tsk->mm = NULL;
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
+ local_irq_enable();
task_unlock(tsk);
}
EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index 7d70e5c..7c1b8f6 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -102,14 +102,14 @@
*/
static inline void tlb_table_invalidate(struct mmu_gather *tlb)
{
-#ifndef CONFIG_HAVE_RCU_TABLE_NO_INVALIDATE
- /*
- * Invalidate page-table caches used by hardware walkers. Then we still
- * need to RCU-sched wait while freeing the pages because software
- * walkers can still be in-flight.
- */
- tlb_flush_mmu_tlbonly(tlb);
-#endif
+ if (tlb_needs_table_invalidate()) {
+ /*
+ * Invalidate page-table caches used by hardware walkers. Then
+ * we still need to RCU-sched wait while freeing the pages
+ * because software walkers can still be in-flight.
+ */
+ tlb_flush_mmu_tlbonly(tlb);
+ }
}
static void tlb_remove_table_smp_sync(void *arg)
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7967825..95dee88 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -161,6 +161,31 @@
return pages;
}
+/*
+ * Used when setting automatic NUMA hinting protection where it is
+ * critical that a numa hinting PMD is not confused with a bad PMD.
+ */
+static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+{
+ pmd_t pmdval = pmd_read_atomic(pmd);
+
+ /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ barrier();
+#endif
+
+ if (pmd_none(pmdval))
+ return 1;
+ if (pmd_trans_huge(pmdval))
+ return 0;
+ if (unlikely(pmd_bad(pmdval))) {
+ pmd_clear_bad(pmd);
+ return 1;
+ }
+
+ return 0;
+}
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -178,8 +203,17 @@
unsigned long this_pages;
next = pmd_addr_end(addr, end);
- if (!is_swap_pmd(*pmd) && !pmd_trans_huge(*pmd) && !pmd_devmap(*pmd)
- && pmd_none_or_clear_bad(pmd))
+
+ /*
+ * Automatic NUMA balancing walks the tables with mmap_sem
+ * held for read. It's possible a parallel update to occur
+ * between pmd_trans_huge() and a pmd_none_or_clear_bad()
+ * check leading to a false positive and clearing.
+ * Hence, it's necessary to atomically read the PMD value
+ * for all the checks.
+ */
+ if (!is_swap_pmd(*pmd) && !pmd_devmap(*pmd) &&
+ pmd_none_or_clear_bad_unless_trans_huge(pmd))
goto next;
/* invoke the mmu notifier if the pmd is populated */
diff --git a/mm/mremap.c b/mm/mremap.c
index 1fc8a29..8005d0b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -266,7 +266,7 @@
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
/* See comment in move_ptes() */
@@ -606,8 +606,17 @@
LIST_HEAD(uf_unmap_early);
LIST_HEAD(uf_unmap);
+ /*
+ * There is a deliberate asymmetry here: we strip the pointer tag
+ * from the old address but leave the new address alone. This is
+ * for consistency with mmap(), where we prevent the creation of
+ * aliasing mappings in userspace by leaving the tag bits of the
+ * mapping address intact. A non-zero tag will cause the subsequent
+ * range checks to reject the address as invalid.
+ *
+ * See Documentation/arm64/tagged-address-abi.rst for more information.
+ */
addr = untagged_addr(addr);
- new_addr = untagged_addr(new_addr);
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
return ret;
diff --git a/mm/nommu.c b/mm/nommu.c
index 99b7ec3..3b67bd2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -359,10 +359,14 @@
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
+ * chose not to have one.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
+{
+}
+
+void __weak vmalloc_sync_unmappings(void)
{
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 71e3ace..f1b810d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -63,6 +63,8 @@
* and mark_oom_victim
*/
DEFINE_MUTEX(oom_lock);
+/* Serializes oom_score_adj and oom_score_adj_min updates */
+DEFINE_MUTEX(oom_adj_mutex);
static inline bool is_memcg_oom(struct oom_control *oc)
{
@@ -195,17 +197,17 @@
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p))
- return 0;
+ return LONG_MIN;
p = find_lock_task_mm(p);
if (!p)
- return 0;
+ return LONG_MIN;
/*
* Do not even consider tasks which are explicitly marked oom
@@ -217,7 +219,7 @@
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
- return 0;
+ return LONG_MIN;
}
/*
@@ -232,11 +234,7 @@
adj *= totalpages / 1000;
points += adj;
- /*
- * Never return 0 for an eligible task regardless of the root bonus and
- * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
- */
- return points > 0 ? points : 1;
+ return points;
}
static const char * const oom_constraint_text[] = {
@@ -309,7 +307,7 @@
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
- unsigned long points;
+ long points;
if (oom_unkillable_task(task))
goto next;
@@ -335,12 +333,12 @@
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task)) {
- points = ULONG_MAX;
+ points = LONG_MAX;
goto select;
}
points = oom_badness(task, oc->totalpages);
- if (!points || points < oc->chosen_points)
+ if (points == LONG_MIN || points < oc->chosen_points)
goto next;
select:
@@ -364,6 +362,8 @@
*/
static void select_bad_process(struct oom_control *oc)
{
+ oc->chosen_points = LONG_MIN;
+
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
@@ -890,7 +890,7 @@
K(get_mm_counter(mm, MM_FILEPAGES)),
K(get_mm_counter(mm, MM_SHMEMPAGES)),
from_kuid(&init_user_ns, task_uid(victim)),
- mm_pgtables_bytes(mm), victim->signal->oom_score_adj);
+ mm_pgtables_bytes(mm) >> 10, victim->signal->oom_score_adj);
task_unlock(victim);
/*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 50055d2..2d658b2 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -201,11 +201,11 @@
if (this_bw < tot_bw) {
if (min) {
min *= this_bw;
- do_div(min, tot_bw);
+ min = div64_ul(min, tot_bw);
}
if (max < 100) {
max *= this_bw;
- do_div(max, tot_bw);
+ max = div64_ul(max, tot_bw);
}
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index f391c0c..283ac9d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,7 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -694,34 +695,27 @@
#ifdef CONFIG_DEBUG_PAGEALLOC
unsigned int _debug_guardpage_minorder;
-#ifdef CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT
-DEFINE_STATIC_KEY_TRUE(_debug_pagealloc_enabled);
-#else
+bool _debug_pagealloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_DEBUG_PAGEALLOC_ENABLE_DEFAULT);
+EXPORT_SYMBOL(_debug_pagealloc_enabled_early);
DEFINE_STATIC_KEY_FALSE(_debug_pagealloc_enabled);
-#endif
EXPORT_SYMBOL(_debug_pagealloc_enabled);
DEFINE_STATIC_KEY_FALSE(_debug_guardpage_enabled);
static int __init early_debug_pagealloc(char *buf)
{
- bool enable = false;
-
- if (kstrtobool(buf, &enable))
- return -EINVAL;
-
- if (enable)
- static_branch_enable(&_debug_pagealloc_enabled);
-
- return 0;
+ return kstrtobool(buf, &_debug_pagealloc_enabled_early);
}
early_param("debug_pagealloc", early_debug_pagealloc);
-static void init_debug_guardpage(void)
+void init_debug_pagealloc(void)
{
if (!debug_pagealloc_enabled())
return;
+ static_branch_enable(&_debug_pagealloc_enabled);
+
if (!debug_guardpage_minorder())
return;
@@ -912,7 +906,7 @@
unsigned int max_order;
struct capture_control *capc = task_capc(zone);
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
+ max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -925,7 +919,7 @@
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
- while (order < max_order - 1) {
+ while (order < max_order) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
@@ -951,7 +945,7 @@
pfn = combined_pfn;
order++;
}
- if (max_order < MAX_ORDER) {
+ if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
@@ -972,7 +966,7 @@
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
- max_order++;
+ max_order = order + 1;
goto continue_merging;
}
@@ -1186,7 +1180,7 @@
*/
arch_free_page(page, order);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
kernel_map_pages(page, 1 << order, 0);
kasan_free_nondeferred_pages(page, order);
@@ -1207,7 +1201,7 @@
static bool bulkfree_pcp_prepare(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return free_pages_check(page);
else
return false;
@@ -1221,7 +1215,7 @@
*/
static bool free_pcp_prepare(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return free_pages_prepare(page, 0, true);
else
return free_pages_prepare(page, 0, false);
@@ -1263,6 +1257,11 @@
struct page *page, *tmp;
LIST_HEAD(head);
+ /*
+ * Ensure proper count is passed which otherwise would stuck in the
+ * below while (list_empty(list)) loop.
+ */
+ count = min(pcp->count, count);
while (count) {
struct list_head *list;
@@ -1562,6 +1561,7 @@
if (!__pageblock_pfn_to_page(block_start_pfn,
block_end_pfn, zone))
return;
+ cond_resched();
}
/* We confirm that there is no hole */
@@ -1646,7 +1646,6 @@
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1676,7 +1675,6 @@
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- touch_nmi_watchdog();
} else {
page++;
}
@@ -1799,6 +1797,13 @@
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
@@ -1816,11 +1821,11 @@
* that we can avoid introducing any issues with the buddy
* allocator.
*/
- while (spfn < epfn)
+ while (spfn < epfn) {
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
zone_empty:
- pgdat_resize_unlock(pgdat, &flags);
-
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
@@ -1863,17 +1868,6 @@
pgdat_resize_lock(pgdat, &flags);
/*
- * If deferred pages have been initialized while we were waiting for
- * the lock, return true, as the zone was grown. The caller will retry
- * this zone. We won't return to this function since the caller also
- * has this static branch.
- */
- if (!static_branch_unlikely(&deferred_pages)) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
@@ -1901,6 +1895,7 @@
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
@@ -1973,10 +1968,6 @@
for_each_populated_zone(zone)
set_zone_contiguous(zone);
-
-#ifdef CONFIG_DEBUG_PAGEALLOC
- init_debug_guardpage();
-#endif
}
#ifdef CONFIG_CMA
@@ -2106,7 +2097,7 @@
*/
static inline bool check_pcp_refill(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
@@ -2128,7 +2119,7 @@
}
static inline bool check_new_pcp(struct page *page)
{
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
return check_new_page(page);
else
return false;
@@ -2155,7 +2146,7 @@
set_page_refcounted(page);
arch_alloc_page(page, order);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
kernel_map_pages(page, 1 << order, 1);
kasan_alloc_pages(page, order);
kernel_poison_pages(page, 1 << order, 1);
@@ -2355,12 +2346,20 @@
return false;
}
-static inline void boost_watermark(struct zone *zone)
+static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
- return;
+ return false;
+ /*
+ * Don't bother in zones that are unlikely to produce results.
+ * On small machines, including kdump capture kernels running
+ * in a small area, boosting the watermark can cause an out of
+ * memory situation immediately.
+ */
+ if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
+ return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
@@ -2374,12 +2373,14 @@
* boosted watermark resulting in a hang.
*/
if (!max_boost)
- return;
+ return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
+
+ return true;
}
/*
@@ -2418,8 +2419,7 @@
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
- boost_watermark(zone);
- if (alloc_flags & ALLOC_KSWAPD)
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
@@ -3131,7 +3131,7 @@
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
- split_page_owner(page, order);
+ split_page_owner(page, 1 << order);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3386,7 +3386,7 @@
#endif /* CONFIG_FAIL_PAGE_ALLOC */
-static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
{
return __should_fail_alloc_page(gfp_mask, order);
}
@@ -3485,7 +3485,8 @@
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+ unsigned long mark, int classzone_idx,
+ unsigned int alloc_flags, gfp_t gfp_mask)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
long cma_pages = 0;
@@ -3506,8 +3507,23 @@
if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
return true;
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
- free_pages);
+ if (__zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ free_pages))
+ return true;
+ /*
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * when checking the min watermark. The min watermark is the
+ * point where boosting is ignored so that kswapd is woken up
+ * when below the low watermark.
+ */
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+ mark = z->_watermark[WMARK_MIN];
+ return __zone_watermark_ok(z, order, mark, classzone_idx,
+ alloc_flags, free_pages);
+ }
+
+ return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
@@ -3648,7 +3664,8 @@
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags)) {
+ ac_classzone_idx(ac), alloc_flags,
+ gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -4911,6 +4928,11 @@
if (!page_ref_sub_and_test(page, nc->pagecnt_bias))
goto refill;
+ if (unlikely(nc->pfmemalloc)) {
+ free_the_page(page, compound_order(page));
+ goto refill;
+ }
+
#if (PAGE_SIZE < PAGE_FRAG_CACHE_MAX_SIZE)
/* if size can vary use size else just use PAGE_SIZE */
size = nc->size;
@@ -5877,7 +5899,7 @@
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context,
+ unsigned long start_pfn, enum meminit_context context,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
@@ -5909,7 +5931,7 @@
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
+ if (context == MEMINIT_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
@@ -5922,7 +5944,7 @@
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
+ if (context == MEMINIT_HOTPLUG)
__SetPageReserved(page);
/*
@@ -6004,7 +6026,7 @@
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
@@ -6030,7 +6052,7 @@
void __meminit __weak memmap_init(unsigned long size, int nid,
unsigned long zone, unsigned long start_pfn)
{
- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+ memmap_init_zone(size, nid, zone, start_pfn, MEMINIT_EARLY, NULL);
}
static int zone_batchsize(struct zone *zone)
@@ -6944,7 +6966,8 @@
* This function also addresses a similar issue where struct pages are left
* uninitialized because the physical address range is not covered by
* memblock.memory or memblock.reserved. That could happen when memblock
- * layout is manually configured via memmap=.
+ * layout is manually configured via memmap=, or when the highest physical
+ * address (max_pfn) does not end on a section boundary.
*/
void __init zero_resv_unavail(void)
{
@@ -6962,7 +6985,16 @@
pgcnt += zero_pfn_range(PFN_DOWN(next), PFN_UP(start));
next = end;
}
- pgcnt += zero_pfn_range(PFN_DOWN(next), max_pfn);
+
+ /*
+ * Early sections always have a fully populated memmap for the whole
+ * section - see pfn_valid(). If the last section has holes at the
+ * end and that section is marked "online", the memmap will be
+ * considered initialized. Make sure that memmap has a well defined
+ * state.
+ */
+ pgcnt += zero_pfn_range(PFN_DOWN(next),
+ round_up(max_pfn, PAGES_PER_SECTION));
/*
* Struct pages that do not have backing memory. This could be because
@@ -7862,9 +7894,11 @@
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
-core_initcall(init_per_zone_wmark_min)
+postcore_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
diff --git a/mm/page_counter.c b/mm/page_counter.c
index de31470..147ff99 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -77,7 +77,7 @@
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
@@ -121,7 +121,7 @@
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
* inaccuracy in the failcnt.
@@ -130,7 +130,7 @@
*fail = c;
goto failed;
}
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
diff --git a/mm/page_io.c b/mm/page_io.c
index 60a66a5..bcf27d0 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -260,11 +260,6 @@
return ret;
}
-static sector_t swap_page_sector(struct page *page)
-{
- return (sector_t)__page_file_index(page) << (PAGE_SHIFT - 9);
-}
-
static inline void count_swpout_vm_event(struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 89c19c0..da0f6e1 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -187,6 +187,14 @@
* pageblocks we may have modified and return -EBUSY to caller. This
* prevents two threads from simultaneously working on overlapping ranges.
*
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * In some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs drain_all_pages might be needed
+ * (e.g. __offline_pages will need to call it after check for isolated range for
+ * a next retry).
+ *
* Return: the number of isolated pageblocks on success and -EBUSY if any part
* of range cannot be isolated.
*/
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 18ecde9..83d0894 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -204,7 +204,7 @@
page_owner->last_migrate_reason = reason;
}
-void __split_page_owner(struct page *page, unsigned int order)
+void __split_page_owner(struct page *page, unsigned int nr)
{
int i;
struct page_ext *page_ext = lookup_page_ext(page);
@@ -213,7 +213,7 @@
if (unlikely(!page_ext))
return;
- for (i = 0; i < (1 << order); i++) {
+ for (i = 0; i < nr; i++) {
page_owner = get_page_owner(page_ext);
page_owner->order = 0;
page_ext = page_ext_next(page_ext);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eff4b45..029f559 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -111,6 +111,13 @@
return pfn_in_hpage(pvmw->page, pfn);
}
+static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size)
+{
+ pvmw->address = (pvmw->address + size) & ~(size - 1);
+ if (!pvmw->address)
+ pvmw->address = ULONG_MAX;
+}
+
/**
* page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at
* @pvmw->address
@@ -139,6 +146,7 @@
{
struct mm_struct *mm = pvmw->vma->vm_mm;
struct page *page = pvmw->page;
+ unsigned long end;
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
@@ -148,10 +156,11 @@
if (pvmw->pmd && !pvmw->pte)
return not_found(pvmw);
- if (pvmw->pte)
- goto next_pte;
+ if (unlikely(PageHuge(page))) {
+ /* The only possible mapping was handled on last iteration */
+ if (pvmw->pte)
+ return not_found(pvmw);
- if (unlikely(PageHuge(pvmw->page))) {
/* when pud is not present, pte will be NULL */
pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page));
if (!pvmw->pte)
@@ -163,78 +172,108 @@
return not_found(pvmw);
return true;
}
-restart:
- pgd = pgd_offset(mm, pvmw->address);
- if (!pgd_present(*pgd))
- return false;
- p4d = p4d_offset(pgd, pvmw->address);
- if (!p4d_present(*p4d))
- return false;
- pud = pud_offset(p4d, pvmw->address);
- if (!pud_present(*pud))
- return false;
- pvmw->pmd = pmd_offset(pud, pvmw->address);
- /*
- * Make sure the pmd value isn't cached in a register by the
- * compiler and used as a stale value after we've observed a
- * subsequent update.
- */
- pmde = READ_ONCE(*pvmw->pmd);
- if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
- if (likely(pmd_trans_huge(*pvmw->pmd))) {
- if (pvmw->flags & PVMW_MIGRATION)
- return not_found(pvmw);
- if (pmd_page(*pvmw->pmd) != page)
- return not_found(pvmw);
- return true;
- } else if (!pmd_present(*pvmw->pmd)) {
- if (thp_migration_supported()) {
- if (!(pvmw->flags & PVMW_MIGRATION))
- return not_found(pvmw);
- if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) {
- swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd);
- if (migration_entry_to_page(entry) != page)
- return not_found(pvmw);
- return true;
- }
+ /*
+ * Seek to next pte only makes sense for THP.
+ * But more important than that optimization, is to filter out
+ * any PageKsm page: whose page->index misleads vma_address()
+ * and vma_address_end() to disaster.
+ */
+ end = PageTransCompound(page) ?
+ vma_address_end(page, pvmw->vma) :
+ pvmw->address + PAGE_SIZE;
+ if (pvmw->pte)
+ goto next_pte;
+restart:
+ do {
+ pgd = pgd_offset(mm, pvmw->address);
+ if (!pgd_present(*pgd)) {
+ step_forward(pvmw, PGDIR_SIZE);
+ continue;
+ }
+ p4d = p4d_offset(pgd, pvmw->address);
+ if (!p4d_present(*p4d)) {
+ step_forward(pvmw, P4D_SIZE);
+ continue;
+ }
+ pud = pud_offset(p4d, pvmw->address);
+ if (!pud_present(*pud)) {
+ step_forward(pvmw, PUD_SIZE);
+ continue;
+ }
+
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+ * subsequent update.
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+ pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ pmde = *pvmw->pmd;
+ if (likely(pmd_trans_huge(pmde))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+ if (pmd_page(pmde) != page)
+ return not_found(pvmw);
+ return true;
}
- return not_found(pvmw);
- } else {
+ if (!pmd_present(pmde)) {
+ swp_entry_t entry;
+
+ if (!thp_migration_supported() ||
+ !(pvmw->flags & PVMW_MIGRATION))
+ return not_found(pvmw);
+ entry = pmd_to_swp_entry(pmde);
+ if (!is_migration_entry(entry) ||
+ migration_entry_to_page(entry) != page)
+ return not_found(pvmw);
+ return true;
+ }
/* THP pmd was split under us: handle on pte level */
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
+ } else if (!pmd_present(pmde)) {
+ /*
+ * If PVMW_SYNC, take and drop THP pmd lock so that we
+ * cannot return prematurely, while zap_huge_pmd() has
+ * cleared *pmd but not decremented compound_mapcount().
+ */
+ if ((pvmw->flags & PVMW_SYNC) &&
+ PageTransCompound(page)) {
+ spinlock_t *ptl = pmd_lock(mm, pvmw->pmd);
+
+ spin_unlock(ptl);
+ }
+ step_forward(pvmw, PMD_SIZE);
+ continue;
}
- } else if (!pmd_present(pmde)) {
- return false;
- }
- if (!map_pte(pvmw))
- goto next_pte;
- while (1) {
+ if (!map_pte(pvmw))
+ goto next_pte;
+this_pte:
if (check_pte(pvmw))
return true;
next_pte:
- /* Seek to next pte only makes sense for THP */
- if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page))
- return not_found(pvmw);
do {
pvmw->address += PAGE_SIZE;
- if (pvmw->address >= pvmw->vma->vm_end ||
- pvmw->address >=
- __vma_address(pvmw->page, pvmw->vma) +
- hpage_nr_pages(pvmw->page) * PAGE_SIZE)
+ if (pvmw->address >= end)
return not_found(pvmw);
/* Did we cross page table boundary? */
- if (pvmw->address % PMD_SIZE == 0) {
- pte_unmap(pvmw->pte);
+ if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) {
if (pvmw->ptl) {
spin_unlock(pvmw->ptl);
pvmw->ptl = NULL;
}
+ pte_unmap(pvmw->pte);
+ pvmw->pte = NULL;
goto restart;
- } else {
- pvmw->pte++;
+ }
+ pvmw->pte++;
+ if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) {
+ pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+ spin_lock(pvmw->ptl);
}
} while (pte_none(*pvmw->pte));
@@ -242,7 +281,10 @@
pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
spin_lock(pvmw->ptl);
}
- }
+ goto this_pte;
+ } while (pvmw->address < end);
+
+ return false;
}
/**
@@ -261,14 +303,10 @@
.vma = vma,
.flags = PVMW_SYNC,
};
- unsigned long start, end;
- start = __vma_address(page, vma);
- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
-
- if (unlikely(end < vma->vm_start || start >= vma->vm_end))
+ pvmw.address = vma_address(page, vma);
+ if (pvmw.address == -EFAULT)
return 0;
- pvmw.address = max(start, vma->vm_start);
if (!page_vma_mapped_walk(&pvmw))
return 0;
page_vma_mapped_walk_done(&pvmw);
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index d48c2a9..4eb09e0 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -16,9 +16,9 @@
err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk);
if (err)
break;
- addr += PAGE_SIZE;
- if (addr == end)
+ if (addr >= end - PAGE_SIZE)
break;
+ addr += PAGE_SIZE;
pte++;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 7e06a1e..806bc16 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1328,7 +1328,7 @@
/* allocate chunk */
alloc_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT);
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk)
panic("%s: Failed to allocate %zu bytes\n", __func__,
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 532c292..49e8a4f 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -126,8 +126,8 @@
{
pmd_t pmd;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
- VM_BUG_ON((pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
- !pmd_devmap(*pmdp)) || !pmd_present(*pmdp));
+ VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
+ !pmd_devmap(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
diff --git a/mm/rmap.c b/mm/rmap.c
index 0c7b2a9..45f2106 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -687,7 +687,6 @@
*/
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
- unsigned long address;
if (PageAnon(page)) {
struct anon_vma *page__anon_vma = page_anon_vma(page);
/*
@@ -697,15 +696,13 @@
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping) {
- if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
- return -EFAULT;
- } else
+ } else if (!vma->vm_file) {
return -EFAULT;
- address = __vma_address(page, vma);
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) {
return -EFAULT;
- return address;
+ }
+
+ return vma_address(page, vma);
}
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
@@ -899,7 +896,7 @@
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- min(vma->vm_end, address + page_size(page)));
+ vma_address_end(page, vma));
mmu_notifier_invalidate_range_start(&range);
while (page_vma_mapped_walk(&pvmw)) {
@@ -1353,6 +1350,15 @@
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)arg;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ if (flags & TTU_SYNC)
+ pvmw.flags = PVMW_SYNC;
+
/* munlock has nothing to gain from examining un-locked vmas */
if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED))
return true;
@@ -1374,9 +1380,10 @@
* Note that the page can not be free in this function as call of
* try_to_unmap() must hold a reference on the page.
*/
+ range.end = PageKsm(page) ?
+ address + PAGE_SIZE : vma_address_end(page, vma);
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- address,
- min(vma->vm_end, address + page_size(page)));
+ address, range.end);
if (PageHuge(page)) {
/*
* If sharing is possible, start and end will be adjusted
@@ -1690,9 +1697,9 @@
return is_vma_temporary_stack(vma);
}
-static int page_mapcount_is_zero(struct page *page)
+static int page_not_mapped(struct page *page)
{
- return !total_mapcount(page);
+ return !page_mapped(page);
}
/**
@@ -1710,7 +1717,7 @@
struct rmap_walk_control rwc = {
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
- .done = page_mapcount_is_zero,
+ .done = page_not_mapped,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1731,14 +1738,15 @@
else
rmap_walk(page, &rwc);
- return !page_mapcount(page) ? true : false;
+ /*
+ * When racing against e.g. zap_pte_range() on another cpu,
+ * in between its ptep_get_and_clear_full() and page_remove_rmap(),
+ * try_to_unmap() may return false when it is about to become true,
+ * if page table locking is skipped: use TTU_SYNC to wait for that.
+ */
+ return !page_mapcount(page);
}
-static int page_not_mapped(struct page *page)
-{
- return !page_mapped(page);
-};
-
/**
* try_to_munlock - try to munlock a page
* @page: the page to be munlocked
@@ -1833,6 +1841,7 @@
struct vm_area_struct *vma = avc->vma;
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
@@ -1887,6 +1896,7 @@
pgoff_start, pgoff_end) {
unsigned long address = vma_address(page, vma);
+ VM_BUG_ON_VMA(address == -EFAULT, vma);
cond_resched();
if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
diff --git a/mm/shmem.c b/mm/shmem.c
index 220be9f..b119c44 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2022,16 +2022,14 @@
shmem_falloc->waitq &&
vmf->pgoff >= shmem_falloc->start &&
vmf->pgoff < shmem_falloc->next) {
+ struct file *fpin;
wait_queue_head_t *shmem_falloc_waitq;
DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
ret = VM_FAULT_NOPAGE;
- if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
- !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
- /* It's polite to up mmap_sem if we can */
- up_read(&vma->vm_mm->mmap_sem);
+ fpin = maybe_unlock_mmap_for_io(vmf, NULL);
+ if (fpin)
ret = VM_FAULT_RETRY;
- }
shmem_falloc_waitq = shmem_falloc->waitq;
prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
@@ -2049,6 +2047,9 @@
spin_lock(&inode->i_lock);
finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
spin_unlock(&inode->i_lock);
+
+ if (fpin)
+ fput(fpin);
return ret;
}
spin_unlock(&inode->i_lock);
@@ -2105,9 +2106,10 @@
/*
* Our priority is to support MAP_SHARED mapped hugely;
* and support MAP_PRIVATE mapped hugely too, until it is COWed.
- * But if caller specified an address hint, respect that as before.
+ * But if caller specified an address hint and we allocated area there
+ * successfully, respect that as before.
*/
- if (uaddr)
+ if (uaddr == addr)
return addr;
if (shmem_huge != SHMEM_HUGE_FORCE) {
@@ -2141,7 +2143,7 @@
if (inflated_len < len)
return addr;
- inflated_addr = get_area(NULL, 0, inflated_len, 0, flags);
+ inflated_addr = get_area(NULL, uaddr, inflated_len, 0, flags);
if (IS_ERR_VALUE(inflated_addr))
return addr;
if (inflated_addr & ~PAGE_MASK)
@@ -2181,7 +2183,11 @@
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
- spin_lock_irq(&info->lock);
+ /*
+ * What serializes the accesses to info->flags?
+ * ipc_lock_object() when called from shmctl_do_lock(),
+ * no serialization needed when called from shm_destroy().
+ */
if (lock && !(info->flags & VM_LOCKED)) {
if (!user_shm_lock(inode->i_size, user))
goto out_nomem;
@@ -2196,29 +2202,17 @@
retval = 0;
out_nomem:
- spin_unlock_irq(&info->lock);
return retval;
}
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
-
- /*
- * Since the F_SEAL_FUTURE_WRITE seals allow for a MAP_SHARED
- * read-only mapping, take care to not allow mprotect to revert
- * protections.
- */
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
@@ -2319,8 +2313,18 @@
pgoff_t offset, max_off;
ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
goto out;
+ }
if (!*pagep) {
page = shmem_alloc_page(gfp, info, pgoff);
@@ -2398,11 +2402,11 @@
lru_cache_add_anon(page);
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
+ spin_unlock_irq(&info->lock);
inc_mm_counter(dst_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -2742,7 +2746,7 @@
}
shmem_falloc.waitq = &shmem_falloc_waitq;
- shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+ shmem_falloc.start = (u64)unmap_start >> PAGE_SHIFT;
shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
spin_lock(&inode->i_lock);
inode->i_private = &shmem_falloc;
diff --git a/mm/shuffle.c b/mm/shuffle.c
index b3fe97f..56958ff 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -58,25 +58,25 @@
* For two pages to be swapped in the shuffle, they must be free (on a
* 'free_area' lru), have the same order, and have the same migratetype.
*/
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
/*
* Given we're dealing with randomly selected pfns in a zone we
* need to ask questions like...
*/
- /* ...is the pfn even in the memmap? */
- if (!pfn_valid_within(pfn))
+ /* ... is the page managed by the buddy? */
+ if (!page)
return NULL;
- /* ...is the pfn in a present section or a hole? */
- if (!pfn_present(pfn))
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
return NULL;
/* ...is the page free and currently on a free_area list? */
- page = pfn_to_page(pfn);
if (!PageBuddy(page))
return NULL;
@@ -123,7 +123,7 @@
* page_j randomly selected in the span @zone_start_pfn to
* @spanned_pages.
*/
- page_i = shuffle_valid_page(i, order);
+ page_i = shuffle_valid_page(z, i, order);
if (!page_i)
continue;
@@ -137,7 +137,7 @@
j = z->zone_start_pfn +
ALIGN_DOWN(get_random_long() % z->spanned_pages,
order_pages);
- page_j = shuffle_valid_page(j, order);
+ page_j = shuffle_valid_page(z, j, order);
if (page_j && page_j != page_i)
break;
}
diff --git a/mm/slab.c b/mm/slab.c
index 66e5d80..d1d7624 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1415,7 +1415,7 @@
#if DEBUG
static bool is_debug_pagealloc_cache(struct kmem_cache *cachep)
{
- if (debug_pagealloc_enabled() && OFF_SLAB(cachep) &&
+ if (debug_pagealloc_enabled_static() && OFF_SLAB(cachep) &&
(cachep->size % PAGE_SIZE) == 0)
return true;
@@ -2007,7 +2007,7 @@
* to check size >= 256. It guarantees that all necessary small
* sized slab is initialized in current slab initialization sequence.
*/
- if (debug_pagealloc_enabled() && (flags & SLAB_POISON) &&
+ if (debug_pagealloc_enabled_static() && (flags & SLAB_POISON) &&
size >= 256 && cachep->object_size > cache_line_size()) {
if (size < PAGE_SIZE || size % PAGE_SIZE == 0) {
size_t tmp_size = ALIGN(size, PAGE_SIZE);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9fb27b..8f12824 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -85,8 +85,7 @@
#ifdef CONFIG_DEBUG_VM
static int kmem_cache_sanity_check(const char *name, unsigned int size)
{
- if (!name || in_interrupt() || size < sizeof(void *) ||
- size > KMALLOC_MAX_SIZE) {
+ if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) {
pr_err("kmem_cache_create(%s) integrity check failed\n", name);
return -EINVAL;
}
@@ -886,12 +885,15 @@
return 0;
}
-static void flush_memcg_workqueue(struct kmem_cache *s)
+static void memcg_set_kmem_cache_dying(struct kmem_cache *s)
{
spin_lock_irq(&memcg_kmem_wq_lock);
s->memcg_params.dying = true;
spin_unlock_irq(&memcg_kmem_wq_lock);
+}
+static void flush_memcg_workqueue(struct kmem_cache *s)
+{
/*
* SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
* sure all registered rcu callbacks have been invoked.
@@ -903,17 +905,26 @@
* deactivates the memcg kmem_caches through workqueue. Make sure all
* previous workitems on workqueue are processed.
*/
- flush_workqueue(memcg_kmem_cache_wq);
+ if (likely(memcg_kmem_cache_wq))
+ flush_workqueue(memcg_kmem_cache_wq);
+
+ /*
+ * If we're racing with children kmem_cache deactivation, it might
+ * take another rcu grace period to complete their destruction.
+ * At this moment the corresponding percpu_ref_kill() call should be
+ * done, but it might take another rcu grace period to complete
+ * switching to the atomic mode.
+ * Please, note that we check without grabbing the slab_mutex. It's safe
+ * because at this moment the children list can't grow.
+ */
+ if (!list_empty(&s->memcg_params.children))
+ rcu_barrier();
}
#else
static inline int shutdown_memcg_caches(struct kmem_cache *s)
{
return 0;
}
-
-static inline void flush_memcg_workqueue(struct kmem_cache *s)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
@@ -931,8 +942,6 @@
if (unlikely(!s))
return;
- flush_memcg_workqueue(s);
-
get_online_cpus();
get_online_mems();
@@ -942,6 +951,32 @@
if (s->refcount)
goto out_unlock;
+#ifdef CONFIG_MEMCG_KMEM
+ memcg_set_kmem_cache_dying(s);
+
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ flush_memcg_workqueue(s);
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+
+ /*
+ * Another thread referenced it again
+ */
+ if (READ_ONCE(s->refcount)) {
+ spin_lock_irq(&memcg_kmem_wq_lock);
+ s->memcg_params.dying = false;
+ spin_unlock_irq(&memcg_kmem_wq_lock);
+ goto out_unlock;
+ }
+#endif
+
err = shutdown_memcg_caches(s);
if (!err)
err = shutdown_cache(s);
@@ -1727,7 +1762,7 @@
if (unlikely(ZERO_OR_NULL_PTR(mem)))
return;
ks = ksize(mem);
- memset(mem, 0, ks);
+ memzero_explicit(mem, ks);
kfree(mem);
}
EXPORT_SYMBOL(kzfree);
diff --git a/mm/slub.c b/mm/slub.c
index e72e802..ca7143f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/bit_spinlock.h>
#include <linux/interrupt.h>
+#include <linux/swab.h>
#include <linux/bitops.h>
#include <linux/slab.h>
#include "slab.h"
@@ -261,7 +262,7 @@
* freepointer to be restored incorrectly.
*/
return (void *)((unsigned long)ptr ^ s->random ^
- (unsigned long)kasan_reset_tag((void *)ptr_addr));
+ swab((unsigned long)kasan_reset_tag((void *)ptr_addr)));
#else
return ptr;
#endif
@@ -290,7 +291,7 @@
unsigned long freepointer_addr;
void *p;
- if (!debug_pagealloc_enabled())
+ if (!debug_pagealloc_enabled_static())
return get_freepointer(s, object);
freepointer_addr = (unsigned long)object + s->offset;
@@ -533,15 +534,32 @@
metadata_access_disable();
}
+/*
+ * See comment in calculate_sizes().
+ */
+static inline bool freeptr_outside_object(struct kmem_cache *s)
+{
+ return s->offset >= s->inuse;
+}
+
+/*
+ * Return offset of the end of info block which is inuse + free pointer if
+ * not overlapping with object.
+ */
+static inline unsigned int get_info_end(struct kmem_cache *s)
+{
+ if (freeptr_outside_object(s))
+ return s->inuse + sizeof(void *);
+ else
+ return s->inuse;
+}
+
static struct track *get_track(struct kmem_cache *s, void *object,
enum track_item alloc)
{
struct track *p;
- if (s->offset)
- p = object + s->offset + sizeof(void *);
- else
- p = object + s->inuse;
+ p = object + get_info_end(s);
return p + alloc;
}
@@ -644,6 +662,20 @@
va_end(args);
}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree) && freelist) {
+ object_err(s, page, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
@@ -657,21 +689,18 @@
p, p - addr, get_freepointer(s, p));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
+ print_section(KERN_ERR, "Redzone ", p - s->red_left_pad,
s->red_left_pad);
else if (p > addr + 16)
print_section(KERN_ERR, "Bytes b4 ", p - 16, 16);
- print_section(KERN_ERR, "Object ", p,
+ print_section(KERN_ERR, "Object ", p,
min_t(unsigned int, s->object_size, PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section(KERN_ERR, "Redzone ", p + s->object_size,
+ print_section(KERN_ERR, "Redzone ", p + s->object_size,
s->inuse - s->object_size);
- if (s->offset)
- off = s->offset + sizeof(void *);
- else
- off = s->inuse;
+ off = get_info_end(s);
if (s->flags & SLAB_STORE_USER)
off += 2 * sizeof(struct track);
@@ -680,7 +709,7 @@
if (off != size_from_object(s))
/* Beginning of the filler is the free pointer */
- print_section(KERN_ERR, "Padding ", p + off,
+ print_section(KERN_ERR, "Padding ", p + off,
size_from_object(s) - off);
dump_stack();
@@ -762,7 +791,7 @@
* object address
* Bytes of the object to be managed.
* If the freepointer may overlay the object then the free
- * pointer is the first word of the object.
+ * pointer is at the middle of the object.
*
* Poisoning uses 0x6b (POISON_FREE) and the last byte is
* 0xa5 (POISON_END)
@@ -796,11 +825,7 @@
static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p)
{
- unsigned long off = s->inuse; /* The end of info */
-
- if (s->offset)
- /* Freepointer is placed after the object. */
- off += sizeof(void *);
+ unsigned long off = get_info_end(s); /* The end of info */
if (s->flags & SLAB_STORE_USER)
/* We also have user information there */
@@ -858,11 +883,11 @@
u8 *endobject = object + s->object_size;
if (s->flags & SLAB_RED_ZONE) {
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Left Redzone",
object - s->red_left_pad, val, s->red_left_pad))
return 0;
- if (!check_bytes_and_report(s, page, object, "Redzone",
+ if (!check_bytes_and_report(s, page, object, "Right Redzone",
endobject, val, s->inuse - s->object_size))
return 0;
} else {
@@ -877,7 +902,7 @@
if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->object_size - 1) ||
- !check_bytes_and_report(s, page, p, "Poison",
+ !check_bytes_and_report(s, page, p, "End Poison",
p + s->object_size - 1, POISON_END, 1)))
return 0;
/*
@@ -886,7 +911,7 @@
check_pad_bytes(s, page, p);
}
- if (!s->offset && val == SLUB_RED_ACTIVE)
+ if (!freeptr_outside_object(s) && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -1379,6 +1404,11 @@
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ return false;
+}
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1953,8 +1983,6 @@
if (node == NUMA_NO_NODE)
searchnode = numa_mem_id();
- else if (!node_present_pages(node))
- searchnode = node_to_mem_node(node);
object = get_partial_node(s, get_node(s, searchnode), c, flags);
if (object || node != NUMA_NO_NODE)
@@ -2064,6 +2092,14 @@
void *prior;
unsigned long counters;
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, &freelist, nextfree))
+ break;
+
do {
prior = page->freelist;
counters = page->counters;
@@ -2543,17 +2579,27 @@
struct page *page;
page = c->page;
- if (!page)
+ if (!page) {
+ /*
+ * if the node is not online or has no normal memory, just
+ * ignore the node constraint
+ */
+ if (unlikely(node != NUMA_NO_NODE &&
+ !node_state(node, N_NORMAL_MEMORY)))
+ node = NUMA_NO_NODE;
goto new_slab;
+ }
redo:
if (unlikely(!node_match(page, node))) {
- int searchnode = node;
-
- if (node != NUMA_NO_NODE && !node_present_pages(node))
- searchnode = node_to_mem_node(node);
-
- if (unlikely(!node_match(page, searchnode))) {
+ /*
+ * same as above but node_match() being false already
+ * implies node != NUMA_NO_NODE
+ */
+ if (!node_state(node, N_NORMAL_MEMORY)) {
+ node = NUMA_NO_NODE;
+ goto redo;
+ } else {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, page, c->freelist, c);
goto new_slab;
@@ -2718,7 +2764,7 @@
object = c->freelist;
page = c->page;
- if (unlikely(!object || !node_match(page, node))) {
+ if (unlikely(!object || !page || !node_match(page, node))) {
object = __slab_alloc(s, gfpflags, node, addr, c);
stat(s, ALLOC_SLOWPATH);
} else {
@@ -2977,11 +3023,13 @@
barrier();
if (likely(page == c->page)) {
- set_freepointer(s, tail_obj, c->freelist);
+ void **freelist = READ_ONCE(c->freelist);
+
+ set_freepointer(s, tail_obj, freelist);
if (unlikely(!this_cpu_cmpxchg_double(
s->cpu_slab->freelist, s->cpu_slab->tid,
- c->freelist, tid,
+ freelist, tid,
head, next_tid(tid)))) {
note_cmpxchg_failure("slab_free", s, tid);
@@ -3155,6 +3203,15 @@
if (unlikely(!object)) {
/*
+ * We may have removed an object from c->freelist using
+ * the fastpath in the previous iteration; in that case,
+ * c->tid has not been bumped yet.
+ * Since ___slab_alloc() may reenable interrupts while
+ * allocating memory, we should bump c->tid now.
+ */
+ c->tid = next_tid(c->tid);
+
+ /*
* Invoking slow path likely have side-effect
* of re-populating per CPU c->freelist
*/
@@ -3530,15 +3587,22 @@
*/
s->inuse = size;
- if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
- s->ctor)) {
+ if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) ||
+ ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) ||
+ s->ctor) {
/*
* Relocate free pointer after the object if it is not
* permitted to overwrite the first word of the object on
* kmem_cache_free.
*
* This is the case if we do RCU, have a constructor or
- * destructor or are poisoning the objects.
+ * destructor, are poisoning the objects, or are
+ * redzoning an object smaller than sizeof(void *).
+ *
+ * The assumption that s->offset >= s->inuse means free
+ * pointer is outside of the object is used in the
+ * freeptr_outside_object() function. If that is no
+ * longer true, the function needs to be modified.
*/
s->offset = size;
size += sizeof(void *);
@@ -5602,7 +5666,8 @@
*/
if (buffer)
buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
+ else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf) &&
+ !IS_ENABLED(CONFIG_SLUB_STATS))
buf = mbuf;
else {
buffer = (char *) get_zeroed_page(GFP_KERNEL);
diff --git a/mm/sparse.c b/mm/sparse.c
index f6891c1..191e29c 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -551,6 +551,7 @@
pr_err("%s: node[%d] memory map backing failed. Some memory will not be available.",
__func__, nid);
pnum_begin = pnum;
+ sparse_buffer_fini();
goto failed;
}
check_usemap_section_nr(nid, usage);
@@ -647,7 +648,7 @@
#endif
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static struct page *populate_section_memmap(unsigned long pfn,
+static struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
return __populate_section_memmap(pfn, nr_pages, nid, altmap);
@@ -669,7 +670,7 @@
vmemmap_free(start, end, NULL);
}
#else
-struct page *populate_section_memmap(unsigned long pfn,
+struct page * __meminit populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
struct page *page, *ret;
@@ -742,6 +743,7 @@
struct mem_section *ms = __pfn_to_section(pfn);
bool section_is_early = early_section(ms);
struct page *memmap = NULL;
+ bool empty;
unsigned long *subsection_map = ms->usage
? &ms->usage->subsection_map[0] : NULL;
@@ -772,21 +774,37 @@
* For 2/ and 3/ the SPARSEMEM_VMEMMAP={y,n} cases are unified
*/
bitmap_xor(subsection_map, map, subsection_map, SUBSECTIONS_PER_SECTION);
- if (bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION)) {
+ empty = bitmap_empty(subsection_map, SUBSECTIONS_PER_SECTION);
+ if (empty) {
unsigned long section_nr = pfn_to_section_nr(pfn);
- if (!section_is_early) {
+ /*
+ * When removing an early section, the usage map is kept (as the
+ * usage maps of other sections fall into the same page). It
+ * will be re-used when re-adding the section - which is then no
+ * longer an early section. If the usage map is PageReserved, it
+ * was allocated during boot.
+ */
+ if (!PageReserved(virt_to_page(ms->usage))) {
kfree(ms->usage);
ms->usage = NULL;
}
memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
- ms->section_mem_map = sparse_encode_mem_map(NULL, section_nr);
+ /*
+ * Mark the section invalid so that valid_section()
+ * return false. This prevents code from dereferencing
+ * ms->usage array.
+ */
+ ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
}
if (section_is_early && memmap)
free_map_bootmem(memmap);
else
depopulate_section_memmap(pfn, nr_pages, altmap);
+
+ if (empty)
+ ms->section_mem_map = (unsigned long)NULL;
}
static struct page * __meminit section_activate(int nid, unsigned long pfn,
@@ -877,7 +895,7 @@
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 8e7ce9a..7c434fc 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -23,6 +23,7 @@
#include <linux/huge_mm.h>
#include <asm/pgtable.h>
+#include "internal.h"
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -418,7 +419,8 @@
/* May fail (-ENOMEM) if XArray node allocation failed. */
__SetPageLocked(new_page);
__SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
+ err = add_to_swap_cache(new_page, entry,
+ gfp_mask & GFP_RECLAIM_MASK);
if (likely(!err)) {
/* Initiate read into locked page */
SetPageWorkingset(new_page);
@@ -509,10 +511,11 @@
return 1;
hits = atomic_xchg(&swapin_readahead_hits, 0);
- pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
+ max_pages,
atomic_read(&last_readahead_pages));
if (!hits)
- prev_offset = offset;
+ WRITE_ONCE(prev_offset, offset);
atomic_set(&last_readahead_pages, pages);
return pages;
diff --git a/mm/swapfile.c b/mm/swapfile.c
index dab4352..f696421 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -221,6 +221,19 @@
BUG();
}
+sector_t swap_page_sector(struct page *page)
+{
+ struct swap_info_struct *sis = page_swap_info(page);
+ struct swap_extent *se;
+ sector_t sector;
+ pgoff_t offset;
+
+ offset = __page_file_index(page);
+ se = offset_to_swap_extent(sis, offset);
+ sector = se->start_block + (offset - se->start_page);
+ return sector << (PAGE_SHIFT - 9);
+}
+
/*
* swap allocation tell device that a cluster of swap can now be discarded,
* to allow the swap device to optimize its wear-levelling.
@@ -1038,7 +1051,7 @@
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -2132,7 +2145,7 @@
swp_entry_t entry;
unsigned int i;
- if (!si->inuse_pages)
+ if (!READ_ONCE(si->inuse_pages))
return 0;
if (!frontswap)
@@ -2148,7 +2161,7 @@
spin_lock(&mmlist_lock);
p = &init_mm.mmlist;
- while (si->inuse_pages &&
+ while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
(p = p->next) != &init_mm.mmlist) {
@@ -2177,7 +2190,7 @@
mmput(prev_mm);
i = 0;
- while (si->inuse_pages &&
+ while (READ_ONCE(si->inuse_pages) &&
!signal_pending(current) &&
(i = find_next_to_unuse(si, i, frontswap)) != 0) {
@@ -2219,7 +2232,7 @@
* been preempted after get_swap_page(), temporarily hiding that swap.
* It's easy and robust (though cpu-intensive) just to keep retrying.
*/
- if (si->inuse_pages) {
+ if (READ_ONCE(si->inuse_pages)) {
if (!signal_pending(current))
goto retry;
retval = -EINTR;
@@ -2737,10 +2750,10 @@
else
type = si->type + 1;
+ ++(*pos);
for (; (si = swap_type_to_swap_info(type)); type++) {
if (!(si->flags & SWP_USED) || !si->swap_map)
continue;
- ++*pos;
return si;
}
@@ -2824,6 +2837,7 @@
static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
+ struct swap_info_struct *defer = NULL;
unsigned int type;
int i;
@@ -2852,7 +2866,7 @@
smp_wmb();
WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
- kvfree(p);
+ defer = p;
p = swap_info[type];
/*
* Do not memset this entry: a racing procfs swap_next()
@@ -2865,6 +2879,7 @@
plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
+ kvfree(defer);
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
@@ -2892,10 +2907,6 @@
p->bdev = inode->i_sb->s_bdev;
}
- inode_lock(inode);
- if (IS_SWAPFILE(inode))
- return -EBUSY;
-
return 0;
}
@@ -3150,17 +3161,22 @@
mapping = swap_file->f_mapping;
inode = mapping->host;
- /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */
error = claim_swapfile(p, inode);
if (unlikely(error))
goto bad_swap;
+ inode_lock(inode);
+ if (IS_SWAPFILE(inode)) {
+ error = -EBUSY;
+ goto bad_swap_unlock_inode;
+ }
+
/*
* Read the swap header.
*/
if (!mapping->a_ops->readpage) {
error = -EINVAL;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
page = read_mapping_page(mapping, 0, swap_file);
if (IS_ERR(page)) {
@@ -3172,14 +3188,14 @@
maxpages = read_swap_header(p, swap_header, inode);
if (unlikely(!maxpages)) {
error = -EINVAL;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
/* OK, set up the swap map and apply the bad block list */
swap_map = vzalloc(maxpages);
if (!swap_map) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
if (bdi_cap_stable_pages_required(inode_to_bdi(inode)))
@@ -3204,7 +3220,7 @@
GFP_KERNEL);
if (!cluster_info) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
for (ci = 0; ci < nr_cluster; ci++)
@@ -3213,7 +3229,7 @@
p->percpu_cluster = alloc_percpu(struct percpu_cluster);
if (!p->percpu_cluster) {
error = -ENOMEM;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
for_each_possible_cpu(cpu) {
struct percpu_cluster *cluster;
@@ -3227,13 +3243,13 @@
error = swap_cgroup_swapon(p->type, maxpages);
if (error)
- goto bad_swap;
+ goto bad_swap_unlock_inode;
nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
cluster_info, maxpages, &span);
if (unlikely(nr_extents < 0)) {
error = nr_extents;
- goto bad_swap;
+ goto bad_swap_unlock_inode;
}
/* frontswap enabled? set up bit-per-page map for frontswap */
if (IS_ENABLED(CONFIG_FRONTSWAP))
@@ -3273,7 +3289,7 @@
error = init_swap_address_space(p->type, maxpages);
if (error)
- goto bad_swap;
+ goto bad_swap_unlock_inode;
/*
* Flush any pending IO and dirty mappings before we start using this
@@ -3283,7 +3299,7 @@
error = inode_drain_writes(inode);
if (error) {
inode->i_flags &= ~S_SWAPFILE;
- goto bad_swap;
+ goto free_swap_address_space;
}
mutex_lock(&swapon_mutex);
@@ -3308,6 +3324,10 @@
error = 0;
goto out;
+free_swap_address_space:
+ exit_swap_address_space(p->type);
+bad_swap_unlock_inode:
+ inode_unlock(inode);
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
@@ -3315,6 +3335,7 @@
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
}
+ inode = NULL;
destroy_swap_extents(p);
swap_cgroup_swapoff(p->type);
spin_lock(&swap_lock);
@@ -3326,13 +3347,8 @@
kvfree(frontswap_map);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
- if (swap_file) {
- if (inode) {
- inode_unlock(inode);
- inode = NULL;
- }
+ if (swap_file)
filp_close(swap_file, NULL);
- }
out:
if (page && !IS_ERR(page)) {
kunmap(page);
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1..4d5add7 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -173,13 +173,10 @@
* its lock, b) when a concurrent invalidate_mapping_pages got there first and
* c) when tmpfs swizzles a page between a tmpfs inode and swapper_space.
*/
-static void
-truncate_cleanup_page(struct address_space *mapping, struct page *page)
+static void truncate_cleanup_page(struct page *page)
{
- if (page_mapped(page)) {
- pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
- unmap_mapping_pages(mapping, page->index, nr, false);
- }
+ if (page_mapped(page))
+ unmap_mapping_page(page);
if (page_has_private(page))
do_invalidatepage(page, 0, PAGE_SIZE);
@@ -224,7 +221,7 @@
if (page->mapping != mapping)
return -EIO;
- truncate_cleanup_page(mapping, page);
+ truncate_cleanup_page(page);
delete_from_page_cache(page);
return 0;
}
@@ -362,7 +359,7 @@
pagevec_add(&locked_pvec, page);
}
for (i = 0; i < pagevec_count(&locked_pvec); i++)
- truncate_cleanup_page(mapping, locked_pvec.pages[i]);
+ truncate_cleanup_page(locked_pvec.pages[i]);
delete_from_page_cache_batch(mapping, &locked_pvec);
for (i = 0; i < pagevec_count(&locked_pvec); i++)
unlock_page(locked_pvec.pages[i]);
@@ -715,6 +712,16 @@
continue;
}
+ if (!did_range_unmap && page_mapped(page)) {
+ /*
+ * If page is mapped, before taking its lock,
+ * zap the rest of the file in one hit.
+ */
+ unmap_mapping_pages(mapping, index,
+ (1 + end - index), false);
+ did_range_unmap = 1;
+ }
+
lock_page(page);
WARN_ON(page_to_index(page) != index);
if (page->mapping != mapping) {
@@ -722,23 +729,11 @@
continue;
}
wait_on_page_writeback(page);
- if (page_mapped(page)) {
- if (!did_range_unmap) {
- /*
- * Zap the rest of the file in one hit.
- */
- unmap_mapping_pages(mapping, index,
- (1 + end - index), false);
- did_range_unmap = 1;
- } else {
- /*
- * Just zap this page
- */
- unmap_mapping_pages(mapping, index,
- 1, false);
- }
- }
+
+ if (page_mapped(page))
+ unmap_mapping_page(page);
BUG_ON(page_mapped(page));
+
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index c7ae74c..640ff2b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -269,7 +269,7 @@
*/
idx = linear_page_index(dst_vma, dst_addr);
mapping = dst_vma->vm_file->f_mapping;
- hash = hugetlb_fault_mutex_hash(h, mapping, idx, dst_addr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
err = -ENOMEM;
diff --git a/mm/util.c b/mm/util.c
index 3ad6db9..ab358c6 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -594,6 +594,24 @@
}
EXPORT_SYMBOL(kvfree);
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a3c70e2..5797e1e 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -34,6 +34,7 @@
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
+#include <linux/overflow.h>
#include <linux/uaccess.h>
#include <asm/tlbflush.h>
@@ -84,6 +85,8 @@
if (pmd_none_or_clear_bad(pmd))
continue;
vunmap_pte_range(pmd, addr, next);
+
+ cond_resched();
} while (pmd++, addr = next, addr != end);
}
@@ -1259,7 +1262,7 @@
* First make sure the mappings are removed from all page-tables
* before they are freed.
*/
- vmalloc_sync_all();
+ vmalloc_sync_unmappings();
/*
* TODO: to calculate a flush range without looping.
@@ -1349,7 +1352,7 @@
{
flush_cache_vunmap(va->va_start, va->va_end);
unmap_vmap_area(va);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
free_vmap_area_noflush(va);
@@ -1647,7 +1650,7 @@
vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
- if (debug_pagealloc_enabled())
+ if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range((unsigned long)addr,
(unsigned long)addr + size);
@@ -2976,6 +2979,7 @@
* @vma: vma to cover
* @uaddr: target user address to start at
* @kaddr: virtual address of vmalloc kernel memory
+ * @pgoff: offset from @kaddr to start at
* @size: size of map area
*
* Returns: 0 for success, -Exxx on failure
@@ -2988,9 +2992,15 @@
* Similar to remap_pfn_range() (see mm/memory.c)
*/
int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
- void *kaddr, unsigned long size)
+ void *kaddr, unsigned long pgoff,
+ unsigned long size)
{
struct vm_struct *area;
+ unsigned long off;
+ unsigned long end_index;
+
+ if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
+ return -EINVAL;
size = PAGE_ALIGN(size);
@@ -3004,8 +3014,10 @@
if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
return -EINVAL;
- if (kaddr + size > area->addr + get_vm_area_size(area))
+ if (check_add_overflow(size, off, &end_index) ||
+ end_index > get_vm_area_size(area))
return -EINVAL;
+ kaddr += off;
do {
struct page *page = vmalloc_to_page(kaddr);
@@ -3044,22 +3056,25 @@
unsigned long pgoff)
{
return remap_vmalloc_range_partial(vma, vma->vm_start,
- addr + (pgoff << PAGE_SHIFT),
+ addr, pgoff,
vma->vm_end - vma->vm_start);
}
EXPORT_SYMBOL(remap_vmalloc_range);
/*
- * Implement a stub for vmalloc_sync_all() if the architecture chose not to
- * have one.
+ * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
+ * not to have one.
*
* The purpose of this function is to make sure the vmalloc area
* mappings are identical in all page-tables in the system.
*/
-void __weak vmalloc_sync_all(void)
+void __weak vmalloc_sync_mappings(void)
{
}
+void __weak vmalloc_sync_unmappings(void)
+{
+}
static int f(pte_t *pte, unsigned long addr, void *data)
{
diff --git a/mm/vmscan.c b/mm/vmscan.c
index ee4eecc..de94881 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -89,9 +89,12 @@
unsigned int may_swap:1;
/*
- * Cgroups are not reclaimed below their configured memory.low,
- * unless we threaten to OOM. If any cgroups are skipped due to
- * memory.low and nothing was reclaimed, go back for memory.low.
+ * Cgroup memory below memory.low is protected as long as we
+ * don't threaten to OOM. If any cgroup is reclaimed at
+ * reduced force or passed over entirely due to its memory.low
+ * setting (memcg_low_skipped), and nothing is reclaimed as a
+ * result, then go back for one more cycle that reclaims the protected
+ * memory (memcg_low_reclaim) to avert OOM.
*/
unsigned int memcg_low_reclaim:1;
unsigned int memcg_low_skipped:1;
@@ -422,7 +425,7 @@
{
down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list);
-#ifdef CONFIG_MEMCG_KMEM
+#ifdef CONFIG_MEMCG
if (shrinker->flags & SHRINKER_MEMCG_AWARE)
idr_replace(&shrinker_idr, shrinker, shrinker->id);
#endif
@@ -2458,14 +2461,14 @@
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
unsigned long lruvec_size;
+ unsigned long low, min;
unsigned long scan;
- unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(memcg,
- sc->memcg_low_reclaim);
+ mem_cgroup_protection(sc->target_mem_cgroup, memcg,
+ &min, &low);
- if (protection) {
+ if (min || low) {
/*
* Scale a cgroup's reclaim pressure by proportioning
* its current usage to its memory.low or memory.min
@@ -2496,12 +2499,21 @@
* hard protection.
*/
unsigned long cgroup_size = mem_cgroup_size(memcg);
+ unsigned long protection;
+
+ /* memory.low scaling, make sure we retry before OOM */
+ if (!sc->memcg_low_reclaim && low > min) {
+ protection = low;
+ sc->memcg_low_skipped = 1;
+ } else {
+ protection = min;
+ }
/* Avoid TOCTOU with earlier protection check */
cgroup_size = max(cgroup_size, protection);
scan = lruvec_size - lruvec_size * protection /
- cgroup_size;
+ (cgroup_size + 1);
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
@@ -2530,10 +2542,13 @@
/*
* Scan types proportional to swappiness and
* their relative recent reclaim efficiency.
- * Make sure we don't miss the last page
- * because of a round-off error.
+ * Make sure we don't miss the last page on
+ * the offlined memory cgroups because of a
+ * round-off error.
*/
- scan = DIV64_U64_ROUND_UP(scan * fraction[file],
+ scan = mem_cgroup_online(memcg) ?
+ div64_u64(scan * fraction[file], denominator) :
+ DIV64_U64_ROUND_UP(scan * fraction[file],
denominator);
break;
case SCAN_FILE:
@@ -2772,6 +2787,14 @@
unsigned long reclaimed;
unsigned long scanned;
+ /*
+ * This loop can become CPU-bound when target memcgs
+ * aren't eligible for reclaim - either because they
+ * don't have any reclaimable pages, or because their
+ * memory is explicitly protected. Avoid soft lockups.
+ */
+ cond_resched();
+
switch (mem_cgroup_protected(root, memcg)) {
case MEMCG_PROT_MIN:
/*
@@ -3157,8 +3180,9 @@
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- pgdat->kswapd_classzone_idx = min(pgdat->kswapd_classzone_idx,
- (enum zone_type)ZONE_NORMAL);
+ if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL);
+
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3790,9 +3814,9 @@
static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
enum zone_type prev_classzone_idx)
{
- if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- return prev_classzone_idx;
- return pgdat->kswapd_classzone_idx;
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+
+ return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
@@ -3836,8 +3860,11 @@
* the previous request that slept prematurely.
*/
if (remaining) {
- pgdat->kswapd_classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
- pgdat->kswapd_order = max(pgdat->kswapd_order, reclaim_order);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx,
+ kswapd_classzone_idx(pgdat, classzone_idx));
+
+ if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
+ WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
}
finish_wait(&pgdat->kswapd_wait, &wait);
@@ -3914,12 +3941,12 @@
tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
set_freezable();
- pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
- alloc_order = reclaim_order = pgdat->kswapd_order;
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
kswapd_try_sleep:
@@ -3927,10 +3954,10 @@
classzone_idx);
/* Read the new order and classzone_idx */
- alloc_order = reclaim_order = pgdat->kswapd_order;
+ alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
- pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = MAX_NR_ZONES;
+ WRITE_ONCE(pgdat->kswapd_order, 0);
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3974,20 +4001,23 @@
enum zone_type classzone_idx)
{
pg_data_t *pgdat;
+ enum zone_type curr_idx;
if (!managed_zone(zone))
return;
if (!cpuset_zone_allowed(zone, gfp_flags))
return;
- pgdat = zone->zone_pgdat;
- if (pgdat->kswapd_classzone_idx == MAX_NR_ZONES)
- pgdat->kswapd_classzone_idx = classzone_idx;
- else
- pgdat->kswapd_classzone_idx = max(pgdat->kswapd_classzone_idx,
- classzone_idx);
- pgdat->kswapd_order = max(pgdat->kswapd_order, order);
+ pgdat = zone->zone_pgdat;
+ curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+
+ if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx)
+ WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx);
+
+ if (READ_ONCE(pgdat->kswapd_order) < order)
+ WRITE_ONCE(pgdat->kswapd_order, order);
+
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 6d3d3f6..e971437 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -839,6 +839,7 @@
destroy_workqueue(pool->compact_wq);
destroy_workqueue(pool->release_wq);
z3fold_unregister_migration(pool);
+ free_percpu(pool->unbuddied);
kfree(pool);
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2b2b9aa..443b3b1 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -293,11 +293,7 @@
};
struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
- struct vm_struct *vm; /* vm area for mapping object that span pages */
-#else
char *vm_buf; /* copy buffer for objects that span pages */
-#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
@@ -1113,46 +1109,6 @@
return zspage;
}
-#ifdef CONFIG_PGTABLE_MAPPING
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm)
- return 0;
- area->vm = alloc_vm_area(PAGE_SIZE * 2, NULL);
- if (!area->vm)
- return -ENOMEM;
- return 0;
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- if (area->vm)
- free_vm_area(area->vm);
- area->vm = NULL;
-}
-
-static inline void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
- area->vm_addr = area->vm->addr;
- return area->vm_addr + off;
-}
-
-static inline void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- unsigned long addr = (unsigned long)area->vm_addr;
-
- unmap_kernel_range(addr, PAGE_SIZE * 2);
-}
-
-#else /* CONFIG_PGTABLE_MAPPING */
-
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1233,8 +1189,6 @@
pagefault_enable();
}
-#endif /* CONFIG_PGTABLE_MAPPING */
-
static int zs_cpu_prepare(unsigned int cpu)
{
struct mapping_area *area;
@@ -2069,6 +2023,11 @@
zs_pool_dec_isolated(pool);
}
+ if (page_zone(newpage) != page_zone(page)) {
+ dec_zone_page_state(page, NR_ZSPAGES);
+ inc_zone_page_state(newpage, NR_ZSPAGES);
+ }
+
reset_page(page);
put_page(page);
page = newpage;
@@ -2257,11 +2216,13 @@
return obj_wasted * class->pages_per_zspage;
}
-static void __zs_compact(struct zs_pool *pool, struct size_class *class)
+static unsigned long __zs_compact(struct zs_pool *pool,
+ struct size_class *class)
{
struct zs_compact_control cc;
struct zspage *src_zspage;
struct zspage *dst_zspage = NULL;
+ unsigned long pages_freed = 0;
spin_lock(&class->lock);
while ((src_zspage = isolate_zspage(class, true))) {
@@ -2291,7 +2252,7 @@
putback_zspage(class, dst_zspage);
if (putback_zspage(class, src_zspage) == ZS_EMPTY) {
free_zspage(pool, class, src_zspage);
- pool->stats.pages_compacted += class->pages_per_zspage;
+ pages_freed += class->pages_per_zspage;
}
spin_unlock(&class->lock);
cond_resched();
@@ -2302,12 +2263,15 @@
putback_zspage(class, src_zspage);
spin_unlock(&class->lock);
+
+ return pages_freed;
}
unsigned long zs_compact(struct zs_pool *pool)
{
int i;
struct size_class *class;
+ unsigned long pages_freed = 0;
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
class = pool->size_class[i];
@@ -2315,10 +2279,11 @@
continue;
if (class->index != i)
continue;
- __zs_compact(pool, class);
+ pages_freed += __zs_compact(pool, class);
}
+ atomic_long_add(pages_freed, &pool->stats.pages_compacted);
- return pool->stats.pages_compacted;
+ return pages_freed;
}
EXPORT_SYMBOL_GPL(zs_compact);
@@ -2335,13 +2300,12 @@
struct zs_pool *pool = container_of(shrinker, struct zs_pool,
shrinker);
- pages_freed = pool->stats.pages_compacted;
/*
* Compact classes and calculate compaction delta.
* Can run concurrently with a manually triggered
* (by user) compaction.
*/
- pages_freed = zs_compact(pool) - pages_freed;
+ pages_freed = zs_compact(pool);
return pages_freed ? pages_freed : SHRINK_STOP;
}