Update Linux to v5.10.157
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.157.tar.xz
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
Change-Id: I7b30d9e98d8c465d6b44de8e7433b4a40b3289ba
diff --git a/mm/compaction.c b/mm/compaction.c
index dba4244..8dfbe86 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -1747,6 +1747,8 @@
update_fast_start_pfn(cc, free_pfn);
pfn = pageblock_start_pfn(free_pfn);
+ if (pfn < cc->zone->zone_start_pfn)
+ pfn = cc->zone->zone_start_pfn;
cc->fast_search_fail = 0;
found_block = true;
set_pageblock_skip(freepage);
diff --git a/mm/filemap.c b/mm/filemap.c
index 125b69f..3a983bc 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3303,7 +3303,7 @@
unsigned long offset; /* Offset into pagecache page */
unsigned long bytes; /* Bytes to write to page */
size_t copied; /* Bytes copied from user */
- void *fsdata;
+ void *fsdata = NULL;
offset = (pos & (PAGE_SIZE - 1));
bytes = min_t(unsigned long, PAGE_SIZE - offset,
diff --git a/mm/gup.c b/mm/gup.c
index 6cb7d8a..b47c751 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2128,8 +2128,28 @@
}
#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+/*
+ * Fast-gup relies on pte change detection to avoid concurrent pgtable
+ * operations.
+ *
+ * To pin the page, fast-gup needs to do below in order:
+ * (1) pin the page (by prefetching pte), then (2) check pte not changed.
+ *
+ * For the rest of pgtable operations where pgtable updates can be racy
+ * with fast-gup, we need to do (1) clear pte, then (2) check whether page
+ * is pinned.
+ *
+ * Above will work for all pte-level operations, including THP split.
+ *
+ * For THP collapse, it's a bit more complicated because fast-gup may be
+ * walking a pgtable page that is being freed (pte is still valid but pmd
+ * can be cleared already). To avoid race in such condition, we need to
+ * also check pmd here to make sure pmd doesn't change (corresponds to
+ * pmdp_collapse_flush() in the THP collapse code path).
+ */
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
struct dev_pagemap *pgmap = NULL;
int nr_start = *nr, ret = 0;
@@ -2169,7 +2189,8 @@
if (!head)
goto pte_unmap;
- if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ if (unlikely(pmd_val(pmd) != pmd_val(*pmdp)) ||
+ unlikely(pte_val(pte) != pte_val(*ptep))) {
put_compound_head(head, 1, flags);
goto pte_unmap;
}
@@ -2214,8 +2235,9 @@
* get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
-static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
- unsigned int flags, struct page **pages, int *nr)
+static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
+ unsigned long end, unsigned int flags,
+ struct page **pages, int *nr)
{
return 0;
}
@@ -2522,7 +2544,7 @@
if (!gup_huge_pd(__hugepd(pmd_val(pmd)), addr,
PMD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pte_range(pmd, addr, next, flags, pages, nr))
+ } else if (!gup_pte_range(pmd, pmdp, addr, next, flags, pages, nr))
return 0;
} while (pmdp++, addr = next, addr != end);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 594368f..cb7b0ae 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1691,7 +1691,7 @@
VM_BUG_ON(!is_pmd_migration_entry(orig_pmd));
entry = pmd_to_swp_entry(orig_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
flush_needed = 0;
} else
WARN_ONCE(1, "Non present huge pmd without pmd migration enabled!");
@@ -2110,7 +2110,7 @@
swp_entry_t entry;
entry = pmd_to_swp_entry(old_pmd);
- page = pfn_to_page(swp_offset(entry));
+ page = migration_entry_to_page(entry);
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fce705f..d8c63d7 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2387,11 +2387,11 @@
page = alloc_buddy_huge_page_with_mpol(h, vma, addr);
if (!page)
goto out_uncharge_cgroup;
+ spin_lock(&hugetlb_lock);
if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) {
SetPagePrivate(page);
h->resv_huge_pages--;
}
- spin_lock(&hugetlb_lock);
list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
@@ -4337,6 +4337,7 @@
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
bool new_page = false;
+ u32 hash = hugetlb_fault_mutex_hash(mapping, idx);
/*
* Currently, we are forced to kill the process in the event the
@@ -4346,7 +4347,7 @@
if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) {
pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n",
current->pid);
- return ret;
+ goto out;
}
/*
@@ -4365,7 +4366,6 @@
* Check for page in userfault range
*/
if (userfaultfd_missing(vma)) {
- u32 hash;
struct vm_fault vmf = {
.vma = vma,
.address = haddr,
@@ -4380,17 +4380,14 @@
};
/*
- * hugetlb_fault_mutex and i_mmap_rwsem must be
- * dropped before handling userfault. Reacquire
- * after handling fault to make calling code simpler.
+ * vma_lock and hugetlb_fault_mutex must be dropped
+ * before handling userfault. Also mmap_lock will
+ * be dropped during handling userfault, any vma
+ * operation should be careful from here.
*/
- hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
i_mmap_unlock_read(mapping);
- ret = handle_userfault(&vmf, VM_UFFD_MISSING);
- i_mmap_lock_read(mapping);
- mutex_lock(&hugetlb_fault_mutex_table[hash]);
- goto out;
+ return handle_userfault(&vmf, VM_UFFD_MISSING);
}
page = alloc_huge_page(vma, haddr, 0);
@@ -4497,6 +4494,8 @@
unlock_page(page);
out:
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
return ret;
backout:
@@ -4592,10 +4591,12 @@
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
- if (huge_pte_none(entry)) {
- ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
- goto out_mutex;
- }
+ if (huge_pte_none(entry))
+ /*
+ * hugetlb_no_page will drop vma lock and hugetlb fault
+ * mutex internally, which make us return immediately.
+ */
+ return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, flags);
ret = 0;
@@ -5465,7 +5466,14 @@
pud_clear(pud);
put_page(virt_to_page(ptep));
mm_dec_nr_pmds(mm);
- *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+ /*
+ * This update of passed address optimizes loops sequentially
+ * processing addresses in increments of huge page size (PMD_SIZE
+ * in this case). By clearing the pud, a PUD_SIZE area is unmapped.
+ * Update address to the 'last page' in the cleared area so that
+ * calling loop can move to first page past this area.
+ */
+ *addr |= PUD_SIZE - PMD_SIZE;
return 1;
}
#define want_pmd_share() (1)
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 0e3f849..6221938 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -299,6 +299,13 @@
struct qlist_head *q;
q = this_cpu_ptr(&cpu_quarantine);
+ /*
+ * Ensure the ordering between the writing to q->offline and
+ * per_cpu_remove_cache. Prevent cpu_quarantine from being corrupted
+ * by interrupt.
+ */
+ if (READ_ONCE(q->offline))
+ return;
qlist_move_cache(q, &to_free, cache);
qlist_free_all(&to_free, cache);
}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 969e57d..cf4dceb 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1144,10 +1144,12 @@
pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
/*
- * After this gup_fast can't run anymore. This also removes
- * any huge TLB entry from the CPU so we won't allow
- * huge and small TLB entries for the same virtual address
- * to avoid the risk of CPU bugs in that area.
+ * This removes any huge TLB entry from the CPU so we won't allow
+ * huge and small TLB entries for the same virtual address to
+ * avoid the risk of CPU bugs in that area.
+ *
+ * Parallel fast GUP is fine since fast GUP will back off when
+ * it detects PMD is changed.
*/
_pmd = pmdp_collapse_flush(vma, address, pmd);
spin_unlock(pmd_ptl);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 56fcfcb..4801751 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -787,6 +787,8 @@
unsigned long flags;
struct kmemleak_object *object;
struct kmemleak_scan_area *area = NULL;
+ unsigned long untagged_ptr;
+ unsigned long untagged_objp;
object = find_and_get_object(ptr, 1);
if (!object) {
@@ -795,6 +797,9 @@
return;
}
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
+ untagged_objp = (unsigned long)kasan_reset_tag((void *)object->pointer);
+
if (scan_area_cache)
area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp));
@@ -806,8 +811,8 @@
goto out_unlock;
}
if (size == SIZE_MAX) {
- size = object->pointer + object->size - ptr;
- } else if (ptr + size > object->pointer + object->size) {
+ size = untagged_objp + object->size - untagged_ptr;
+ } else if (untagged_ptr + size > untagged_objp + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
diff --git a/mm/maccess.c b/mm/maccess.c
index 3bd7040..f6ea117 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -83,7 +83,7 @@
return src - unsafe_addr;
Efault:
pagefault_enable();
- dst[-1] = '\0';
+ dst[0] = '\0';
return -EFAULT;
}
#else /* HAVE_GET_KERNEL_NOFAULT */
diff --git a/mm/madvise.c b/mm/madvise.c
index 24abc79..f71fc88 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -434,8 +434,11 @@
continue;
}
- /* Do not interfere with other mappings of this page */
- if (page_mapcount(page) != 1)
+ /*
+ * Do not interfere with other mappings of this page and
+ * non-LRU page.
+ */
+ if (!PageLRU(page) || page_mapcount(page) != 1)
continue;
VM_BUG_ON_PAGE(PageTransCompound(page), page);
@@ -1229,8 +1232,7 @@
iov_iter_advance(&iter, iovec.iov_len);
}
- if (ret == 0)
- ret = total_len - iov_iter_count(&iter);
+ ret = (total_len - iov_iter_count(&iter)) ? : ret;
release_mm:
mmput(mm);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index dbe07fe..92ab008 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7124,7 +7124,7 @@
if (!strcmp(token, "nokmem"))
cgroup_memory_nokmem = true;
}
- return 0;
+ return 1;
}
__setup("cgroup.memory=", cgroup_memory);
diff --git a/mm/memory.c b/mm/memory.c
index 4fe24cd..cbc0a16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -823,6 +823,17 @@
if (likely(!page_maybe_dma_pinned(page)))
return 1;
+ /*
+ * The vma->anon_vma of the child process may be NULL
+ * because the entire vma does not contain anonymous pages.
+ * A BUG will occur when the copy_present_page() passes
+ * a copy of a non-anonymous page of that vma to the
+ * page_add_new_anon_rmap() to set up new anonymous rmap.
+ * Return 1 if the page is not an anonymous page.
+ */
+ if (!PageAnon(page))
+ return 1;
+
new_page = *prealloc;
if (!new_page)
return -EAGAIN;
@@ -1204,6 +1215,17 @@
return ret;
}
+/* Whether we should zap all COWed (private) pages too */
+static inline bool should_zap_cows(struct zap_details *details)
+{
+ /* By default, zap all pages */
+ if (!details)
+ return true;
+
+ /* Or, we zap COWed pages only if the caller wants to */
+ return !details->check_mapping;
+}
+
static unsigned long zap_pte_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
@@ -1295,16 +1317,18 @@
continue;
}
- /* If details->check_mapping, we leave swap entries. */
- if (unlikely(details))
- continue;
-
- if (!non_swap_entry(entry))
+ if (!non_swap_entry(entry)) {
+ /* Genuine swap entry, hence a private anon page */
+ if (!should_zap_cows(details))
+ continue;
rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
+ } else if (is_migration_entry(entry)) {
struct page *page;
page = migration_entry_to_page(entry);
+ if (details && details->check_mapping &&
+ details->check_mapping != page_rmapping(page))
+ continue;
rss[mm_counter(page)]--;
}
if (unlikely(!free_swap_and_cache(entry)))
@@ -3676,11 +3700,20 @@
return ret;
if (unlikely(PageHWPoison(vmf->page))) {
- if (ret & VM_FAULT_LOCKED)
- unlock_page(vmf->page);
- put_page(vmf->page);
+ struct page *page = vmf->page;
+ vm_fault_t poisonret = VM_FAULT_HWPOISON;
+ if (ret & VM_FAULT_LOCKED) {
+ if (page_mapped(page))
+ unmap_mapping_pages(page_mapping(page),
+ page->index, 1, false);
+ /* Retry if a clean page was removed from the cache. */
+ if (invalidate_inode_page(page))
+ poisonret = VM_FAULT_NOPAGE;
+ unlock_page(page);
+ }
+ put_page(page);
vmf->page = NULL;
- return VM_FAULT_HWPOISON;
+ return poisonret;
}
if (unlikely(!(ret & VM_FAULT_LOCKED)))
@@ -4347,6 +4380,19 @@
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
/* No support for anonymous transparent PUD pages yet */
if (vma_is_anonymous(vmf->vma))
+ return VM_FAULT_FALLBACK;
+ if (vmf->vma->vm_ops->huge_fault)
+ return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+ return VM_FAULT_FALLBACK;
+}
+
+static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
+{
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && \
+ defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+ /* No support for anonymous transparent PUD pages yet */
+ if (vma_is_anonymous(vmf->vma))
goto split;
if (vmf->vma->vm_ops->huge_fault) {
vm_fault_t ret = vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
@@ -4357,19 +4403,7 @@
split:
/* COW or write-notify not handled on PUD level: split pud.*/
__split_huge_pud(vmf->vma, vmf->pud, vmf->address);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
- return VM_FAULT_FALLBACK;
-}
-
-static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
-{
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- /* No support for anonymous transparent PUD pages yet */
- if (vma_is_anonymous(vmf->vma))
- return VM_FAULT_FALLBACK;
- if (vmf->vma->vm_ops->huge_fault)
- return vmf->vma->vm_ops->huge_fault(vmf, PE_SIZE_PUD);
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
return VM_FAULT_FALLBACK;
}
@@ -5273,6 +5307,8 @@
if (rc)
break;
+ flush_dcache_page(subpage);
+
cond_resched();
}
return ret_val;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c8b1592..f9f4744 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -374,7 +374,7 @@
*/
static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
- if (!pol)
+ if (!pol || pol->mode == MPOL_LOCAL)
return;
if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
@@ -802,7 +802,6 @@
static int mbind_range(struct mm_struct *mm, unsigned long start,
unsigned long end, struct mempolicy *new_pol)
{
- struct vm_area_struct *next;
struct vm_area_struct *prev;
struct vm_area_struct *vma;
int err = 0;
@@ -817,8 +816,7 @@
if (start > vma->vm_start)
prev = vma;
- for (; vma && vma->vm_start < end; prev = vma, vma = next) {
- next = vma->vm_next;
+ for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) {
vmstart = max(start, vma->vm_start);
vmend = min(end, vma->vm_end);
@@ -832,10 +830,6 @@
new_pol, vma->vm_userfaultfd_ctx);
if (prev) {
vma = prev;
- next = vma->vm_next;
- if (mpol_equal(vma_policy(vma), new_pol))
- continue;
- /* vma_merge() joined vma && vma->next, case 8 */
goto replace;
}
if (vma->vm_start != vmstart) {
@@ -2642,6 +2636,7 @@
mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
if (!mpol_new)
goto err_out;
+ atomic_set(&mpol_new->refcnt, 1);
goto restart;
}
diff --git a/mm/memremap.c b/mm/memremap.c
index 2455bac..299aad0 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -348,6 +348,7 @@
WARN(1, "File system DAX not supported\n");
return ERR_PTR(-EINVAL);
}
+ params.pgprot = pgprot_decrypted(params.pgprot);
break;
case MEMORY_DEVICE_GENERIC:
break;
diff --git a/mm/migrate.c b/mm/migrate.c
index 278e6f3..b716b8f 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1010,9 +1010,12 @@
if (!PageMappingFlags(page))
page->mapping = NULL;
- if (likely(!is_zone_device_page(newpage)))
- flush_dcache_page(newpage);
+ if (likely(!is_zone_device_page(newpage))) {
+ int i, nr = compound_nr(newpage);
+ for (i = 0; i < nr; i++)
+ flush_dcache_page(newpage + i);
+ }
}
out:
return rc;
@@ -2470,13 +2473,14 @@
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = mpfn;
}
- arch_leave_lazy_mmu_mode();
- pte_unmap_unlock(ptep - 1, ptl);
/* Only flush the TLB if we actually modified any entries */
if (unmapped)
flush_tlb_range(walk->vma, start, end);
+ arch_leave_lazy_mmu_mode();
+ pte_unmap_unlock(ptep - 1, ptl);
+
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 5c8b448..33ebda8 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1694,8 +1694,12 @@
pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags)))
return 0;
- /* Do we need to track softdirty? */
- if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY))
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) && !(vm_flags & VM_SOFTDIRTY) &&
+ !is_vm_hugetlb_page(vma))
return 1;
/* Specialty mapping? */
@@ -1852,7 +1856,7 @@
if (!arch_validate_flags(vma->vm_flags)) {
error = -EINVAL;
if (file)
- goto unmap_and_free_vma;
+ goto close_and_free_vma;
else
goto free_vma;
}
@@ -1896,13 +1900,15 @@
return addr;
+close_and_free_vma:
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
unmap_and_free_vma:
vma->vm_file = NULL;
fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
- charged = 0;
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
allow_write_and_free_vma:
@@ -2140,14 +2146,6 @@
return addr;
}
-#ifndef arch_get_mmap_end
-#define arch_get_mmap_end(addr) (TASK_SIZE)
-#endif
-
-#ifndef arch_get_mmap_base
-#define arch_get_mmap_base(addr, base) (base)
-#endif
-
/* Get an address range which is currently unmapped.
* For shmat() with addr=0.
*
@@ -2577,7 +2575,7 @@
if (!*endptr)
stack_guard_gap = val << PAGE_SHIFT;
- return 0;
+ return 1;
}
__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
@@ -2669,11 +2667,28 @@
{
struct vm_area_struct *next = vma_next(mm, prev);
struct mmu_gather tlb;
+ struct vm_area_struct *cur_vma;
lru_add_drain();
tlb_gather_mmu(&tlb, mm, start, end);
update_hiwater_rss(mm);
unmap_vmas(&tlb, vma, start, end);
+
+ /*
+ * Ensure we have no stale TLB entries by the time this mapping is
+ * removed from the rmap.
+ * Note that we don't have to worry about nested flushes here because
+ * we're holding the mm semaphore for removing the mapping - so any
+ * concurrent flush in this region has to be coming through the rmap,
+ * and we synchronize against that using the rmap lock.
+ */
+ for (cur_vma = vma; cur_vma; cur_vma = cur_vma->vm_next) {
+ if ((cur_vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) != 0) {
+ tlb_flush_mmu(&tlb);
+ break;
+ }
+ }
+
free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
next ? next->vm_start : USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb, start, end);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 07f42a7..9165ca6 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -1043,6 +1043,18 @@
}
EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
+static bool
+mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
+ unsigned long seq)
+{
+ bool ret;
+
+ spin_lock(&subscriptions->lock);
+ ret = subscriptions->invalidate_seq != seq;
+ spin_unlock(&subscriptions->lock);
+ return ret;
+}
+
/**
* mmu_interval_notifier_remove - Remove a interval notifier
* @interval_sub: Interval subscription to unregister
@@ -1090,7 +1102,7 @@
lock_map_release(&__mmu_notifier_invalidate_range_start_map);
if (seq)
wait_event(subscriptions->wq,
- READ_ONCE(subscriptions->invalidate_seq) != seq);
+ mmu_interval_seq_released(subscriptions, seq));
/* pairs with mmgrab in mmu_interval_notifier_insert() */
mmdrop(mm);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc..f337831 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,20 +72,6 @@
return z;
}
-#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone)
-{
- if (page_to_pfn(page) != pfn)
- return false;
-
- if (page_zone(page) != zone)
- return false;
-
- return true;
-}
-#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
diff --git a/mm/mremap.c b/mm/mremap.c
index 138abba..3334c40 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -260,6 +260,9 @@
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
+ if (!len)
+ return 0;
+
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
@@ -307,12 +310,10 @@
*/
bool moved;
- if (need_rmap_locks)
- take_rmap_locks(vma);
+ take_rmap_locks(vma);
moved = move_normal_pmd(vma, old_addr, new_addr,
old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
+ drop_rmap_locks(vma);
if (moved)
continue;
#endif
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 419a814..3d7c557 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -633,7 +633,7 @@
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
- /* Drop a reference taken by wake_oom_reaper */
+ /* Drop a reference taken by queue_oom_reaper */
put_task_struct(tsk);
}
@@ -643,12 +643,12 @@
struct task_struct *tsk = NULL;
wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
- spin_lock(&oom_reaper_lock);
+ spin_lock_irq(&oom_reaper_lock);
if (oom_reaper_list != NULL) {
tsk = oom_reaper_list;
oom_reaper_list = tsk->oom_reaper_list;
}
- spin_unlock(&oom_reaper_lock);
+ spin_unlock_irq(&oom_reaper_lock);
if (tsk)
oom_reap_task(tsk);
@@ -657,20 +657,46 @@
return 0;
}
-static void wake_oom_reaper(struct task_struct *tsk)
+static void wake_oom_reaper(struct timer_list *timer)
+{
+ struct task_struct *tsk = container_of(timer, struct task_struct,
+ oom_reaper_timer);
+ struct mm_struct *mm = tsk->signal->oom_mm;
+ unsigned long flags;
+
+ /* The victim managed to terminate on its own - see exit_mmap */
+ if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
+ put_task_struct(tsk);
+ return;
+ }
+
+ spin_lock_irqsave(&oom_reaper_lock, flags);
+ tsk->oom_reaper_list = oom_reaper_list;
+ oom_reaper_list = tsk;
+ spin_unlock_irqrestore(&oom_reaper_lock, flags);
+ trace_wake_reaper(tsk->pid);
+ wake_up(&oom_reaper_wait);
+}
+
+/*
+ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
+ * The timers timeout is arbitrary... the longer it is, the longer the worst
+ * case scenario for the OOM can take. If it is too small, the oom_reaper can
+ * get in the way and release resources needed by the process exit path.
+ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
+ * before the exit path is able to wake the futex waiters.
+ */
+#define OOM_REAPER_DELAY (2*HZ)
+static void queue_oom_reaper(struct task_struct *tsk)
{
/* mm is already queued? */
if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
return;
get_task_struct(tsk);
-
- spin_lock(&oom_reaper_lock);
- tsk->oom_reaper_list = oom_reaper_list;
- oom_reaper_list = tsk;
- spin_unlock(&oom_reaper_lock);
- trace_wake_reaper(tsk->pid);
- wake_up(&oom_reaper_wait);
+ timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
+ tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
+ add_timer(&tsk->oom_reaper_timer);
}
static int __init oom_init(void)
@@ -680,7 +706,7 @@
}
subsys_initcall(oom_init)
#else
-static inline void wake_oom_reaper(struct task_struct *tsk)
+static inline void queue_oom_reaper(struct task_struct *tsk)
{
}
#endif /* CONFIG_MMU */
@@ -931,7 +957,7 @@
rcu_read_unlock();
if (can_oom_reap)
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
mmdrop(mm);
put_task_struct(victim);
@@ -967,7 +993,7 @@
task_lock(victim);
if (task_will_free_mem(victim)) {
mark_oom_victim(victim);
- wake_oom_reaper(victim);
+ queue_oom_reaper(victim);
task_unlock(victim);
put_task_struct(victim);
return;
@@ -1065,7 +1091,7 @@
*/
if (task_will_free_mem(current)) {
mark_oom_victim(current);
- wake_oom_reaper(current);
+ queue_oom_reaper(current);
return true;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index c63656c..a56f2b9 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3679,11 +3679,15 @@
* need to be calculated.
*/
if (!order) {
- long fast_free;
+ long usable_free;
+ long reserved;
- fast_free = free_pages;
- fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
- if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ usable_free = free_pages;
+ reserved = __zone_watermark_unusable_free(z, 0, alloc_flags);
+
+ /* reserved may over estimate high-atomic reserves. */
+ usable_free -= min(usable_free, reserved);
+ if (usable_free > mark + z->lowmem_reserve[highest_zoneidx])
return true;
}
@@ -4318,6 +4322,30 @@
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
+/*
+ * Zonelists may change due to hotplug during allocation. Detect when zonelists
+ * have been rebuilt so allocation retries. Reader side does not lock and
+ * retries the allocation if zonelist changes. Writer side is protected by the
+ * embedded spin_lock.
+ */
+static DEFINE_SEQLOCK(zonelist_update_seq);
+
+static unsigned int zonelist_iter_begin(void)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqbegin(&zonelist_update_seq);
+
+ return 0;
+}
+
+static unsigned int check_retry_zonelist(unsigned int seq)
+{
+ if (IS_ENABLED(CONFIG_MEMORY_HOTREMOVE))
+ return read_seqretry(&zonelist_update_seq, seq);
+
+ return seq;
+}
+
/* Perform direct synchronous page reclaim */
static unsigned long
__perform_reclaim(gfp_t gfp_mask, unsigned int order,
@@ -4625,6 +4653,7 @@
int compaction_retries;
int no_progress_loops;
unsigned int cpuset_mems_cookie;
+ unsigned int zonelist_iter_cookie;
int reserve_flags;
/*
@@ -4635,11 +4664,12 @@
(__GFP_ATOMIC|__GFP_DIRECT_RECLAIM)))
gfp_mask &= ~__GFP_ATOMIC;
-retry_cpuset:
+restart:
compaction_retries = 0;
no_progress_loops = 0;
compact_priority = DEF_COMPACT_PRIORITY;
cpuset_mems_cookie = read_mems_allowed_begin();
+ zonelist_iter_cookie = zonelist_iter_begin();
/*
* The fast path uses conservative alloc_flags to succeed only until
@@ -4798,9 +4828,13 @@
goto retry;
- /* Deal with possible cpuset update races before we start OOM killing */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/* Reclaim has failed us, start killing things */
page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
@@ -4820,9 +4854,13 @@
}
nopage:
- /* Deal with possible cpuset update races before we fail */
- if (check_retry_cpuset(cpuset_mems_cookie, ac))
- goto retry_cpuset;
+ /*
+ * Deal with possible cpuset update races or zonelist updates to avoid
+ * a unnecessary OOM kill.
+ */
+ if (check_retry_cpuset(cpuset_mems_cookie, ac) ||
+ check_retry_zonelist(zonelist_iter_cookie))
+ goto restart;
/*
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
@@ -5125,6 +5163,18 @@
/* reset page count bias and offset to start of new frag */
nc->pagecnt_bias = PAGE_FRAG_CACHE_MAX_SIZE + 1;
offset = size - fragsz;
+ if (unlikely(offset < 0)) {
+ /*
+ * The caller is trying to allocate a fragment
+ * with fragsz > PAGE_SIZE but the cache isn't big
+ * enough to satisfy the request, this may
+ * happen in low memory conditions.
+ * We don't release the cache page because
+ * it could make memory pressure worse
+ * so we simply return NULL here.
+ */
+ return NULL;
+ }
}
nc->pagecnt_bias--;
@@ -5653,7 +5703,7 @@
do {
zone_type--;
zone = pgdat->node_zones + zone_type;
- if (managed_zone(zone)) {
+ if (populated_zone(zone)) {
zoneref_set_zone(zone, &zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
@@ -5920,9 +5970,8 @@
int nid;
int __maybe_unused cpu;
pg_data_t *self = data;
- static DEFINE_SPINLOCK(lock);
- spin_lock(&lock);
+ write_seqlock(&zonelist_update_seq);
#ifdef CONFIG_NUMA
memset(node_load, 0, sizeof(node_load));
@@ -5955,7 +6004,7 @@
#endif
}
- spin_unlock(&lock);
+ write_sequnlock(&zonelist_update_seq);
}
static noinline void __init
@@ -7402,10 +7451,17 @@
out2:
/* Align start of ZONE_MOVABLE on all nids to MAX_ORDER_NR_PAGES */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
+ for (nid = 0; nid < MAX_NUMNODES; nid++) {
+ unsigned long start_pfn, end_pfn;
+
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+ if (zone_movable_pfn[nid] >= end_pfn)
+ zone_movable_pfn[nid] = 0;
+ }
+
out:
/* restore the node_state */
node_states[N_MEMORY] = saved_node_state;
@@ -7671,7 +7727,7 @@
*/
#define adj_init_size(start, end, size, pos, adj) \
do { \
- if (start <= pos && pos < end && size > adj) \
+ if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
size -= adj; \
} while (0)
diff --git a/mm/page_io.c b/mm/page_io.c
index 9647981..f0ada44 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -69,54 +69,6 @@
bio_put(bio);
}
-static void swap_slot_free_notify(struct page *page)
-{
- struct swap_info_struct *sis;
- struct gendisk *disk;
- swp_entry_t entry;
-
- /*
- * There is no guarantee that the page is in swap cache - the software
- * suspend code (at least) uses end_swap_bio_read() against a non-
- * swapcache page. So we must check PG_swapcache before proceeding with
- * this optimization.
- */
- if (unlikely(!PageSwapCache(page)))
- return;
-
- sis = page_swap_info(page);
- if (data_race(!(sis->flags & SWP_BLKDEV)))
- return;
-
- /*
- * The swap subsystem performs lazy swap slot freeing,
- * expecting that the page will be swapped out again.
- * So we can avoid an unnecessary write if the page
- * isn't redirtied.
- * This is good for real swap storage because we can
- * reduce unnecessary I/O and enhance wear-leveling
- * if an SSD is used as the as swap device.
- * But if in-memory swap device (eg zram) is used,
- * this causes a duplicated copy between uncompressed
- * data in VM-owned memory and compressed data in
- * zram-owned memory. So let's free zram-owned memory
- * and make the VM-owned decompressed page *dirty*,
- * so the page should be swapped out somewhere again if
- * we again wish to reclaim it.
- */
- disk = sis->bdev->bd_disk;
- entry.val = page_private(page);
- if (disk->fops->swap_slot_free_notify && __swap_count(entry) == 1) {
- unsigned long offset;
-
- offset = swp_offset(entry);
-
- SetPageDirty(page);
- disk->fops->swap_slot_free_notify(sis->bdev,
- offset);
- }
-}
-
static void end_swap_bio_read(struct bio *bio)
{
struct page *page = bio_first_page_all(bio);
@@ -132,7 +84,6 @@
}
SetPageUptodate(page);
- swap_slot_free_notify(page);
out:
unlock_page(page);
WRITE_ONCE(bio->bi_private, NULL);
@@ -409,11 +360,6 @@
if (sis->flags & SWP_SYNCHRONOUS_IO) {
ret = bdev_read_page(sis->bdev, swap_page_sector(page), page);
if (!ret) {
- if (trylock_page(page)) {
- swap_slot_free_notify(page);
- unlock_page(page);
- }
-
count_vm_event(PSWPIN);
goto out;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index e81640d..371ec21 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -71,7 +71,7 @@
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd) || (!walk->vma && !walk->no_vma)) {
+ if (pmd_none(*pmd)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -129,7 +129,7 @@
do {
again:
next = pud_addr_end(addr, end);
- if (pud_none(*pud) || (!walk->vma && !walk->no_vma)) {
+ if (pud_none(*pud)) {
if (ops->pte_hole)
err = ops->pte_hole(addr, next, depth, walk);
if (err)
@@ -318,19 +318,19 @@
struct vm_area_struct *vma = walk->vma;
const struct mm_walk_ops *ops = walk->ops;
- if (vma && ops->pre_vma) {
+ if (ops->pre_vma) {
err = ops->pre_vma(start, end, walk);
if (err)
return err;
}
- if (vma && is_vm_hugetlb_page(vma)) {
+ if (is_vm_hugetlb_page(vma)) {
if (ops->hugetlb_entry)
err = walk_hugetlb_range(start, end, walk);
} else
err = walk_pgd_range(start, end, walk);
- if (vma && ops->post_vma)
+ if (ops->post_vma)
ops->post_vma(walk);
return err;
@@ -402,9 +402,13 @@
if (!vma) { /* after the last vma */
walk.vma = NULL;
next = end;
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
} else if (start < vma->vm_start) { /* outside vma */
walk.vma = NULL;
next = min(end, vma->vm_start);
+ if (ops->pte_hole)
+ err = ops->pte_hole(start, next, -1, &walk);
} else { /* inside vma */
walk.vma = vma;
next = min(end, vma->vm_end);
@@ -422,9 +426,8 @@
}
if (err < 0)
break;
- }
- if (walk.vma || walk.ops->pte_hole)
err = __walk_page_range(start, next, &walk);
+ }
if (err)
break;
} while (start = next, start < end);
@@ -453,9 +456,9 @@
if (start >= end || !walk.mm)
return -EINVAL;
- mmap_assert_locked(walk.mm);
+ mmap_assert_write_locked(walk.mm);
- return __walk_page_range(start, end, &walk);
+ return walk_pgd_range(start, end, &walk);
}
int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 93f2f63..a917bf5 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -141,13 +141,13 @@
{
const struct ptdump_range *range = st->range;
- mmap_read_lock(mm);
+ mmap_write_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
- mmap_read_unlock(mm);
+ mmap_write_unlock(mm);
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
diff --git a/mm/rmap.c b/mm/rmap.c
index 14f84f7..e6f840b 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -89,7 +89,8 @@
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
- anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->num_children = 0;
+ anon_vma->num_active_vmas = 0;
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
@@ -197,6 +198,7 @@
anon_vma = anon_vma_alloc();
if (unlikely(!anon_vma))
goto out_enomem_free_avc;
+ anon_vma->num_children++; /* self-parent link for new root */
allocated = anon_vma;
}
@@ -206,8 +208,7 @@
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
- /* vma reference or self-parent link for new root */
- anon_vma->degree++;
+ anon_vma->num_active_vmas++;
allocated = NULL;
avc = NULL;
}
@@ -292,19 +293,19 @@
anon_vma_chain_link(dst, avc, anon_vma);
/*
- * Reuse existing anon_vma if its degree lower than two,
- * that means it has no vma and only one anon_vma child.
+ * Reuse existing anon_vma if it has no vma and only one
+ * anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
- * will always reuse it. Root anon_vma is never reused:
+ * Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
if (!dst->anon_vma && src->anon_vma &&
- anon_vma != src->anon_vma && anon_vma->degree < 2)
+ anon_vma->num_children < 2 &&
+ anon_vma->num_active_vmas == 0)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
- dst->anon_vma->degree++;
+ dst->anon_vma->num_active_vmas++;
unlock_anon_vma_root(root);
return 0;
@@ -354,6 +355,7 @@
anon_vma = anon_vma_alloc();
if (!anon_vma)
goto out_error;
+ anon_vma->num_active_vmas++;
avc = anon_vma_chain_alloc(GFP_KERNEL);
if (!avc)
goto out_error_free_anon_vma;
@@ -374,7 +376,7 @@
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
- anon_vma->parent->degree++;
+ anon_vma->parent->num_children++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -406,7 +408,7 @@
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root.rb_root)) {
- anon_vma->parent->degree--;
+ anon_vma->parent->num_children--;
continue;
}
@@ -414,7 +416,7 @@
anon_vma_chain_free(avc);
}
if (vma->anon_vma)
- vma->anon_vma->degree--;
+ vma->anon_vma->num_active_vmas--;
unlock_anon_vma_root(root);
/*
@@ -425,7 +427,8 @@
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
- VM_WARN_ON(anon_vma->degree);
+ VM_WARN_ON(anon_vma->num_children);
+ VM_WARN_ON(anon_vma->num_active_vmas);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -1640,7 +1643,30 @@
/* MADV_FREE page check */
if (!PageSwapBacked(page)) {
- if (!PageDirty(page)) {
+ int ref_count, map_count;
+
+ /*
+ * Synchronize with gup_pte_range():
+ * - clear PTE; barrier; read refcount
+ * - inc refcount; barrier; read PTE
+ */
+ smp_mb();
+
+ ref_count = page_ref_count(page);
+ map_count = page_mapcount(page);
+
+ /*
+ * Order reads for page refcount and dirty flag
+ * (see comments in __remove_mapping()).
+ */
+ smp_rmb();
+
+ /*
+ * The only page refs must be one from isolation
+ * plus the rmap(s) (dropped by discard:).
+ */
+ if (ref_count == 1 + map_count &&
+ !PageDirty(page)) {
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm,
address, address + PAGE_SIZE);
diff --git a/mm/slub.c b/mm/slub.c
index 1384dc9..b0f6375 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2297,6 +2297,7 @@
c->page = NULL;
c->freelist = NULL;
+ c->tid = next_tid(c->tid);
}
/*
@@ -2430,8 +2431,6 @@
{
stat(s, CPUSLAB_FLUSH);
deactivate_slab(s, c->page, c->freelist, c);
-
- c->tid = next_tid(c->tid);
}
/*
@@ -2717,6 +2716,7 @@
if (!freelist) {
c->page = NULL;
+ c->tid = next_tid(c->tid);
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
}
@@ -5559,7 +5559,8 @@
char *name = kmalloc(ID_STR_LENGTH, GFP_KERNEL);
char *p = name;
- BUG_ON(!name);
+ if (!name)
+ return ERR_PTR(-ENOMEM);
*p++ = ':';
/*
@@ -5617,6 +5618,8 @@
* for the symlinks.
*/
name = create_unique_id(s);
+ if (IS_ERR(name))
+ return PTR_ERR(name);
}
s->kobj.kset = kset;
diff --git a/mm/usercopy.c b/mm/usercopy.c
index b3de3c4..540968b 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -294,7 +294,10 @@
static int __init parse_hardened_usercopy(char *str)
{
- return strtobool(str, &enable_checks);
+ if (strtobool(str, &enable_checks))
+ pr_warn("Invalid option string for hardened_usercopy: '%s'\n",
+ str);
+ return 1;
}
__setup("hardened_usercopy=", parse_hardened_usercopy);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 9a3d451..078d95c 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -83,6 +83,8 @@
/* don't free the page */
goto out;
}
+
+ flush_dcache_page(page);
} else {
page = *pagep;
*pagep = NULL;
@@ -595,6 +597,7 @@
err = -EFAULT;
goto out;
}
+ flush_dcache_page(page);
goto retry;
} else
BUG_ON(page);
diff --git a/mm/util.c b/mm/util.c
index 8904727..25bfda7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -331,6 +331,38 @@
#endif
}
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
@@ -629,6 +661,21 @@
}
EXPORT_SYMBOL(kvfree_sensitive);
+void *kvrealloc(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+{
+ void *newp;
+
+ if (oldsize >= newsize)
+ return (void *)p;
+ newp = kvmalloc(newsize, flags);
+ if (!newp)
+ return NULL;
+ memcpy(newp, p, oldsize);
+ kvfree(p);
+ return newp;
+}
+EXPORT_SYMBOL(kvrealloc);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
diff --git a/mm/vmscan.c b/mm/vmscan.c
index f2817e8..51ccd80 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2439,8 +2439,8 @@
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
get_scan_count(lruvec, sc, nr);
@@ -2458,8 +2458,8 @@
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -2479,7 +2479,7 @@
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -2530,8 +2530,6 @@
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 698bc0b..e292e63 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1503,10 +1503,6 @@
if (!page)
continue;
- /* Watch for unexpected holes punched in the memmap */
- if (!memmap_valid_within(pfn, page, zone))
- continue;
-
if (page_zone(page) != zone)
continue;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 73cd507..c18dc8e 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1748,11 +1748,40 @@
*/
static void lock_zspage(struct zspage *zspage)
{
- struct page *page = get_first_page(zspage);
+ struct page *curr_page, *page;
- do {
- lock_page(page);
- } while ((page = get_next_page(page)) != NULL);
+ /*
+ * Pages we haven't locked yet can be migrated off the list while we're
+ * trying to lock them, so we need to be careful and only attempt to
+ * lock each page under migrate_read_lock(). Otherwise, the page we lock
+ * may no longer belong to the zspage. This means that we may wait for
+ * the wrong page to unlock, so we must take a reference to the page
+ * prior to waiting for it to unlock outside migrate_read_lock().
+ */
+ while (1) {
+ migrate_read_lock(zspage);
+ page = get_first_page(zspage);
+ if (trylock_page(page))
+ break;
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ }
+
+ curr_page = page;
+ while ((page = get_next_page(curr_page))) {
+ if (trylock_page(page)) {
+ curr_page = page;
+ } else {
+ get_page(page);
+ migrate_read_unlock(zspage);
+ wait_on_page_locked(page);
+ put_page(page);
+ migrate_read_lock(zspage);
+ }
+ }
+ migrate_read_unlock(zspage);
}
static int zs_init_fs_context(struct fs_context *fc)