Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e50799d..594368f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -63,7 +63,14 @@
struct page *huge_zero_page __read_mostly;
unsigned long huge_zero_pfn __read_mostly = ~0UL;
-bool transparent_hugepage_enabled(struct vm_area_struct *vma)
+static inline bool file_thp_enabled(struct vm_area_struct *vma)
+{
+ return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
+ !inode_is_open_for_write(vma->vm_file->f_inode) &&
+ (vma->vm_flags & VM_EXEC);
+}
+
+bool transparent_hugepage_active(struct vm_area_struct *vma)
{
/* The addr is used to check if the vma size fits */
unsigned long addr = (vma->vm_end & HPAGE_PMD_MASK) - HPAGE_PMD_SIZE;
@@ -74,6 +81,8 @@
return __transparent_hugepage_enabled(vma);
if (vma_is_shmem(vma))
return shmem_huge_enabled(vma);
+ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
+ return file_thp_enabled(vma);
return false;
}
@@ -306,35 +315,14 @@
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_hugepage_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
- __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
&use_zero_page_attr.attr,
&hpage_pmd_size_attr.attr,
-#if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE)
+#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
-#ifdef CONFIG_DEBUG_VM
- &debug_cow_attr.attr,
-#endif
NULL,
};
@@ -396,7 +384,11 @@
struct kobject *hugepage_kobj;
if (!has_transparent_hugepage()) {
- transparent_hugepage_flags = 0;
+ /*
+ * Hardware doesn't support hugepages, hence disable
+ * DAX PMD support.
+ */
+ transparent_hugepage_flags = 1 << TRANSPARENT_HUGEPAGE_NEVER_DAX;
return -EINVAL;
}
@@ -522,6 +514,17 @@
set_compound_page_dtor(page, TRANSHUGE_PAGE_DTOR);
}
+bool is_transparent_hugepage(struct page *page)
+{
+ if (!PageCompound(page))
+ return false;
+
+ page = compound_head(page);
+ return is_huge_zero_page(page) ||
+ page[1].compound_dtor == TRANSHUGE_PAGE_DTOR;
+}
+EXPORT_SYMBOL_GPL(is_transparent_hugepage);
+
static unsigned long __thp_get_unmapped_area(struct file *filp,
unsigned long addr, unsigned long len,
loff_t off, unsigned long flags, unsigned long size)
@@ -579,18 +582,19 @@
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
+ count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
+ cgroup_throttle_swaprate(page, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -621,7 +625,6 @@
vm_fault_t ret2;
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -632,15 +635,14 @@
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- mem_cgroup_commit_charge(page, memcg, false, true);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -649,7 +651,6 @@
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return ret;
@@ -815,11 +816,24 @@
pte_free(mm, pgtable);
}
-vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pmd_prot - insert a pmd size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pmd size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
{
unsigned long addr = vmf->address & PMD_MASK;
struct vm_area_struct *vma = vmf->vma;
- pgprot_t pgprot = vma->vm_page_prot;
pgtable_t pgtable = NULL;
/*
@@ -847,7 +861,7 @@
insert_pfn_pmd(vma, addr, vmf->pmd, pfn, pgprot, write, pgtable);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pmd_prot);
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
static pud_t maybe_pud_mkwrite(pud_t pud, struct vm_area_struct *vma)
@@ -893,11 +907,24 @@
spin_unlock(ptl);
}
-vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write)
+/**
+ * vmf_insert_pfn_pud_prot - insert a pud size pfn
+ * @vmf: Structure describing the fault
+ * @pfn: pfn to insert
+ * @pgprot: page protection to use
+ * @write: whether it's a write fault
+ *
+ * Insert a pud size pfn. See vmf_insert_pfn() for additional info and
+ * also consult the vmf_insert_mixed_prot() documentation when
+ * @pgprot != @vmf->vma->vm_page_prot.
+ *
+ * Return: vm_fault_t value.
+ */
+vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn,
+ pgprot_t pgprot, bool write)
{
unsigned long addr = vmf->address & PUD_MASK;
struct vm_area_struct *vma = vmf->vma;
- pgprot_t pgprot = vma->vm_page_prot;
/*
* If we had pud_special, we could avoid all these restrictions,
@@ -918,7 +945,7 @@
insert_pfn_pud(vma, addr, vmf->pud, pfn, pgprot, write);
return VM_FAULT_NOPAGE;
}
-EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud);
+EXPORT_SYMBOL_GPL(vmf_insert_pfn_pud_prot);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -949,6 +976,11 @@
*/
WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set");
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
if (flags & FOLL_WRITE && !pmd_write(*pmd))
return NULL;
@@ -964,7 +996,7 @@
* device mapped pages can only be returned if the
* caller will manage the page reference count.
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PMD_MASK) >> PAGE_SHIFT;
@@ -972,14 +1004,15 @@
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
- struct vm_area_struct *vma)
+ struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
{
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
@@ -988,7 +1021,7 @@
int ret = -ENOMEM;
/* Skip if can be re-fill on fault */
- if (!vma_is_anonymous(vma))
+ if (!vma_is_anonymous(dst_vma))
return 0;
pgtable = pte_alloc_one(dst_mm);
@@ -1012,11 +1045,15 @@
pmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*src_pmd))
pmd = pmd_swp_mksoft_dirty(pmd);
+ if (pmd_swp_uffd_wp(*src_pmd))
+ pmd = pmd_swp_mkuffd_wp(pmd);
set_pmd_at(src_mm, addr, src_pmd, pmd);
}
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_swp_clear_uffd_wp(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
goto out_unlock;
@@ -1033,28 +1070,44 @@
* a page table.
*/
if (is_huge_zero_pmd(pmd)) {
- struct page *zero_page;
/*
* get_huge_zero_page() will never allocate a new page here,
* since we already have a zero page to copy. It just takes a
* reference.
*/
- zero_page = mm_get_huge_zero_page(dst_mm);
- set_huge_zero_page(pgtable, dst_mm, vma, addr, dst_pmd,
- zero_page);
- ret = 0;
- goto out_unlock;
+ mm_get_huge_zero_page(dst_mm);
+ goto out_zero_page;
}
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+ /*
+ * If this page is a potentially pinned page, split and retry the fault
+ * with smaller page size. Normally this should not happen because the
+ * userspace should use MADV_DONTFORK upon pinned regions. This is a
+ * best effort that the pinned pages won't be replaced by another
+ * random page during the coming copy-on-write.
+ */
+ if (unlikely(is_cow_mapping(src_vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(src_page))) {
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
+ return -EAGAIN;
+ }
+
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
+out_zero_page:
mm_inc_nr_ptes(dst_mm);
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
-
pmdp_set_wrprotect(src_mm, addr, src_pmd);
+ if (!userfaultfd_wp(dst_vma))
+ pmd = pmd_clear_uffd_wp(pmd);
pmd = pmd_mkold(pmd_wrprotect(pmd));
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
@@ -1092,6 +1145,11 @@
if (flags & FOLL_WRITE && !pud_write(*pud))
return NULL;
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
if (pud_present(*pud) && pud_devmap(*pud))
/* pass */;
else
@@ -1103,8 +1161,10 @@
/*
* device mapped pages can only be returned if the
* caller will manage the page reference count.
+ *
+ * At least one of FOLL_GET | FOLL_PIN must be set, so assert that here:
*/
- if (!(flags & FOLL_GET))
+ if (!(flags & (FOLL_GET | FOLL_PIN)))
return ERR_PTR(-EEXIST);
pfn += (addr & ~PUD_MASK) >> PAGE_SHIFT;
@@ -1112,7 +1172,8 @@
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- get_page(page);
+ if (!try_grab_page(page, flags))
+ page = ERR_PTR(-ENOMEM);
return page;
}
@@ -1143,6 +1204,16 @@
/* No huge zero pud yet */
}
+ /* Please refer to comments in copy_huge_pmd() */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(pud_page(pud)))) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pud(vma, src_pud, addr);
+ return -EAGAIN;
+ }
+
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1197,271 +1268,73 @@
spin_unlock(vmf->ptl);
}
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
- pmd_t orig_pmd, struct page *page)
-{
- struct vm_area_struct *vma = vmf->vma;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mem_cgroup *memcg;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
- vm_fault_t ret = 0;
- struct page **pages;
- struct mmu_notifier_range range;
-
- pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
- GFP_KERNEL);
- if (unlikely(!pages)) {
- ret |= VM_FAULT_OOM;
- goto out;
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address, page_to_nid(page));
- if (unlikely(!pages[i] ||
- mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
- GFP_KERNEL, &memcg, false))) {
- if (pages[i])
- put_page(pages[i]);
- while (--i >= 0) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg,
- false);
- put_page(pages[i]);
- }
- kfree(pages);
- ret |= VM_FAULT_OOM;
- goto out;
- }
- set_page_private(pages[i], (unsigned long)memcg);
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SIZE * i, vma);
- __SetPageUptodate(pages[i]);
- cond_resched();
- }
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
-
- /*
- * Leave pmd empty until pte is filled note we must notify here as
- * concurrent CPU thread might write to new page before the call to
- * mmu_notifier_invalidate_range_end() happens which can lead to a
- * device seeing memory write in different order than CPU.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
- pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
- mem_cgroup_commit_charge(pages[i], memcg, false, false);
- lru_cache_add_active_or_unevictable(pages[i], vma);
- vmf->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*vmf->pte));
- set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
- pte_unmap(vmf->pte);
- }
- kfree(pages);
-
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
- page_remove_rmap(page, true);
- spin_unlock(vmf->ptl);
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-
- ret |= VM_FAULT_WRITE;
- put_page(page);
-
-out:
- return ret;
-
-out_free_pages:
- spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(&range);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg, false);
- put_page(pages[i]);
- }
- kfree(pages);
- goto out;
-}
-
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *new_page;
- struct mem_cgroup *memcg;
+ struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mmu_notifier_range range;
- gfp_t huge_gfp; /* for allocation and charge */
- vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
if (is_huge_zero_pmd(orig_pmd))
- goto alloc;
+ goto fallback;
+
spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_unlock;
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- /*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
- */
+
+ /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
unlock_page(page);
put_page(page);
- goto out_unlock;
+ return 0;
}
put_page(page);
}
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry,  1))
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- ret |= VM_FAULT_WRITE;
unlock_page(page);
- goto out_unlock;
- }
- unlock_page(page);
- get_page(page);
- spin_unlock(vmf->ptl);
-alloc:
- if (__transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
- } else
- new_page = NULL;
-
- if (likely(new_page)) {
- prep_transhuge_page(new_page);
- } else {
- if (!page) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- } else {
- ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
- if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- }
- put_page(page);
- }
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
- put_page(new_page);
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- if (page)
- put_page(page);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
- if (!page)
- clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
- else
- copy_user_huge_page(new_page, page, vmf->address,
- vma, HPAGE_PMD_NR);
- __SetPageUptodate(new_page);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- spin_lock(vmf->ptl);
- if (page)
- put_page(page);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(new_page, memcg, true);
- put_page(new_page);
- goto out_mn;
- } else {
- pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- page_add_new_anon_rmap(new_page, vma, haddr, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- if (!page) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- } else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page, true);
- put_page(page);
- }
- ret |= VM_FAULT_WRITE;
+ return VM_FAULT_WRITE;
}
+
+ unlock_page(page);
spin_unlock(vmf->ptl);
-out_mn:
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-out:
- return ret;
-out_unlock:
- spin_unlock(vmf->ptl);
- return ret;
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
}
/*
- * FOLL_FORCE or a forced COW break can write even to unwritable pmd's,
- * but only after we've gone through a COW cycle and they are dirty.
+ * FOLL_FORCE can write to even unwritable pmd's, but only
+ * after we've gone through a COW cycle and they are dirty.
*/
static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
{
- return pmd_write(pmd) || ((flags & FOLL_COW) && pmd_dirty(pmd));
+ return pmd_write(pmd) ||
+ ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
}
struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1487,8 +1360,13 @@
page = pmd_page(*pmd);
VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
+
+ if (!try_grab_page(page, flags))
+ return ERR_PTR(-ENOMEM);
+
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags);
+
if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
/*
* We don't mlock() pte-mapped THPs. This way we can avoid
@@ -1517,7 +1395,6 @@
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
- lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
@@ -1525,8 +1402,6 @@
skip_mlock:
page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page);
- if (flags & FOLL_GET)
- get_page(page);
out:
return page;
@@ -1789,10 +1664,10 @@
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
- if (vma_is_dax(vma)) {
+ if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
zap_deposited_table(tlb->mm, pmd);
spin_unlock(ptl);
@@ -1864,19 +1739,13 @@
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
- if ((old_addr & ~HPAGE_PMD_MASK) ||
- (new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
@@ -1888,7 +1757,7 @@
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
if (old_ptl) {
@@ -1924,13 +1793,16 @@
* - HPAGE_PMD_NR is protections changed and TLB flush necessary
*/
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, pgprot_t newprot, int prot_numa)
+ unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
{
struct mm_struct *mm = vma->vm_mm;
spinlock_t *ptl;
pmd_t entry;
bool preserve_write;
int ret;
+ bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
+ bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
@@ -1954,6 +1826,8 @@
newpmd = swp_entry_to_pmd(entry);
if (pmd_swp_soft_dirty(*pmd))
newpmd = pmd_swp_mksoft_dirty(newpmd);
+ if (pmd_swp_uffd_wp(*pmd))
+ newpmd = pmd_swp_mkuffd_wp(newpmd);
set_pmd_at(mm, addr, pmd, newpmd);
}
goto unlock;
@@ -1972,9 +1846,9 @@
goto unlock;
/*
- * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
- * which is also under down_read(mmap_sem):
+ * which is also under mmap_read_lock(mm):
*
* CPU0: CPU1:
* change_huge_pmd(prot_numa=1)
@@ -1997,6 +1871,17 @@
entry = pmd_modify(entry, newprot);
if (preserve_write)
entry = pmd_mk_savedwrite(entry);
+ if (uffd_wp) {
+ entry = pmd_wrprotect(entry);
+ entry = pmd_mkuffd_wp(entry);
+ } else if (uffd_wp_resolve) {
+ /*
+ * Leave the write bit to be handled by PF interrupt
+ * handler, then things like COW could be properly
+ * handled.
+ */
+ entry = pmd_clear_uffd_wp(entry);
+ }
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
@@ -2056,7 +1941,7 @@
*/
pudp_huge_get_and_clear_full(tlb->mm, addr, pud, tlb->fullmm);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
- if (vma_is_dax(vma)) {
+ if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
/* No zero page support yet */
} else {
@@ -2145,7 +2030,7 @@
struct page *page;
pgtable_t pgtable;
pmd_t old_pmd, _pmd;
- bool young, write, soft_dirty, pmd_migration = false;
+ bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
unsigned long addr;
int i;
@@ -2165,7 +2050,7 @@
*/
if (arch_needs_pgtable_deposit())
zap_deposited_table(mm, pmd);
- if (vma_is_dax(vma))
+ if (vma_is_special_huge(vma))
return;
if (unlikely(is_pmd_migration_entry(old_pmd))) {
swp_entry_t entry;
@@ -2206,8 +2091,8 @@
* free), userland could trigger a small page size TLB miss on the
* small sized TLB while the hugepage TLB entry is still established in
* the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but is also warns that it's
* only safe if the permission and cache attributes of the two entries
* loaded in the two TLB is identical (which should be the case here).
* But it is generally safer to never allow small and huge TLB entries
@@ -2229,6 +2114,7 @@
write = is_write_migration_entry(entry);
young = false;
soft_dirty = pmd_swp_soft_dirty(old_pmd);
+ uffd_wp = pmd_swp_uffd_wp(old_pmd);
} else {
page = pmd_page(old_pmd);
if (pmd_dirty(old_pmd))
@@ -2236,6 +2122,7 @@
write = pmd_write(old_pmd);
young = pmd_young(old_pmd);
soft_dirty = pmd_soft_dirty(old_pmd);
+ uffd_wp = pmd_uffd_wp(old_pmd);
}
VM_BUG_ON_PAGE(!page_count(page), page);
page_ref_add(page, HPAGE_PMD_NR - 1);
@@ -2260,6 +2147,8 @@
entry = swp_entry_to_pte(swp_entry);
if (soft_dirty)
entry = pte_swp_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_swp_mkuffd_wp(entry);
} else {
entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
entry = maybe_mkwrite(entry, vma);
@@ -2269,6 +2158,8 @@
entry = pte_mkold(entry);
if (soft_dirty)
entry = pte_mksoft_dirty(entry);
+ if (uffd_wp)
+ entry = pte_mkuffd_wp(entry);
}
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
@@ -2444,13 +2335,13 @@
/*
* If we're also updating the vma->vm_next->vm_start, if the new
- * vm_next->vm_start isn't page aligned and it could previously
+ * vm_next->vm_start isn't hpage aligned and it could previously
* contain an hugepage: check if we need to split an huge pmd.
*/
if (adjust_next > 0) {
struct vm_area_struct *next = vma->vm_next;
unsigned long nstart = next->vm_start;
- nstart += adjust_next << PAGE_SHIFT;
+ nstart += adjust_next;
if (nstart & ~HPAGE_PMD_MASK &&
(nstart & HPAGE_PMD_MASK) >= next->vm_start &&
(nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end)
@@ -2460,8 +2351,8 @@
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
- TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC;
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC |
+ TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
VM_BUG_ON_PAGE(!PageHead(page), page);
@@ -2473,13 +2364,13 @@
VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
}
-static void remap_page(struct page *page)
+static void remap_page(struct page *page, unsigned int nr)
{
int i;
if (PageTransHuge(page)) {
remove_migration_ptes(page, page, true);
} else {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
remove_migration_ptes(page + i, page + i, true);
}
}
@@ -2508,6 +2399,9 @@
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
+#ifdef CONFIG_64BIT
+ (1L << PG_arch_2) |
+#endif
(1L << PG_dirty)));
/* ->mapping in first tail page is compound_mapcount */
@@ -2554,12 +2448,13 @@
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
+ unsigned int nr = thp_nr_pages(head);
int i;
lruvec = mem_cgroup_page_lruvec(head, pgdat);
/* complete memcg works before add pages to LRU */
- mem_cgroup_split_huge_fixup(head);
+ split_page_memcg(head, nr);
if (PageAnon(head) && PageSwapCache(head)) {
swp_entry_t entry = { .val = page_private(head) };
@@ -2569,7 +2464,7 @@
xa_lock(&swap_cache->i_pages);
}
- for (i = HPAGE_PMD_NR - 1; i >= 1; i--) {
+ for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
if (head[i].index >= end) {
@@ -2589,7 +2484,7 @@
ClearPageCompound(head);
- split_page_owner(head, HPAGE_PMD_NR);
+ split_page_owner(head, nr);
/* See comment in __split_huge_page_tail() */
if (PageAnon(head)) {
@@ -2608,9 +2503,15 @@
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- remap_page(head);
+ remap_page(head, nr);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ if (PageSwapCache(head)) {
+ swp_entry_t entry = { .val = page_private(head) };
+
+ split_swap_cluster(entry);
+ }
+
+ for (i = 0; i < nr; i++) {
struct page *subpage = head + i;
if (subpage == page)
continue;
@@ -2629,7 +2530,7 @@
int total_mapcount(struct page *page)
{
- int i, compound, ret;
+ int i, compound, nr, ret;
VM_BUG_ON_PAGE(PageTail(page), page);
@@ -2637,16 +2538,17 @@
return atomic_read(&page->_mapcount) + 1;
compound = compound_mapcount(page);
+ nr = compound_nr(page);
if (PageHuge(page))
return compound;
ret = compound;
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ for (i = 0; i < nr; i++)
ret += atomic_read(&page[i]._mapcount) + 1;
/* File pages has compound_mapcount included in _mapcount */
if (!PageAnon(page))
- return ret - compound * HPAGE_PMD_NR;
+ return ret - compound * nr;
if (PageDoubleMap(page))
- ret -= HPAGE_PMD_NR;
+ ret -= nr;
return ret;
}
@@ -2691,14 +2593,14 @@
page = compound_head(page);
_total_mapcount = ret = 0;
- for (i = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0; i < thp_nr_pages(page); i++) {
mapcount = atomic_read(&page[i]._mapcount) + 1;
ret = max(ret, mapcount);
_total_mapcount += mapcount;
}
if (PageDoubleMap(page)) {
ret -= 1;
- _total_mapcount -= HPAGE_PMD_NR;
+ _total_mapcount -= thp_nr_pages(page);
}
mapcount = compound_mapcount(page);
ret += mapcount;
@@ -2715,9 +2617,9 @@
/* Additional pins from page cache */
if (PageAnon(page))
- extra_pins = PageSwapCache(page) ? HPAGE_PMD_NR : 0;
+ extra_pins = PageSwapCache(page) ? thp_nr_pages(page) : 0;
else
- extra_pins = HPAGE_PMD_NR;
+ extra_pins = thp_nr_pages(page);
if (pextra_pins)
*pextra_pins = extra_pins;
return total_mapcount(page) == page_count(page) - extra_pins - 1;
@@ -2746,24 +2648,23 @@
{
struct page *head = compound_head(page);
struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
- struct deferred_split *ds_queue = get_deferred_split_queue(page);
+ struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int extra_pins, ret;
- bool mlocked;
unsigned long flags;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(head), head);
+ VM_BUG_ON_PAGE(!PageCompound(head), head);
- if (PageWriteback(page))
+ if (PageWriteback(head))
return -EBUSY;
if (PageAnon(head)) {
/*
- * The caller does not necessarily hold an mmap_sem that would
+ * The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
* is similar to page_lock_anon_vma_read except the write lock
@@ -2809,13 +2710,8 @@
goto out_unlock;
}
- mlocked = PageMlocked(page);
unmap_page(head);
- /* Make sure the page is not on per-CPU pagevec as it takes pin */
- if (mlocked)
- lru_add_drain();
-
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&pgdata->lru_lock, flags);
@@ -2838,28 +2734,23 @@
ds_queue->split_queue_len--;
list_del(page_deferred_list(head));
}
+ spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
- if (PageSwapBacked(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ if (PageSwapBacked(head))
+ __dec_node_page_state(head, NR_SHMEM_THPS);
else
- __dec_node_page_state(page, NR_FILE_THPS);
+ __dec_node_page_state(head, NR_FILE_THPS);
}
- spin_unlock(&ds_queue->split_queue_lock);
__split_huge_page(page, list, end, flags);
- if (PageSwapCache(head)) {
- swp_entry_t entry = { .val = page_private(head) };
-
- ret = split_swap_cluster(entry);
- } else
- ret = 0;
+ ret = 0;
} else {
spin_unlock(&ds_queue->split_queue_lock);
fail:
if (mapping)
xa_unlock(&mapping->i_pages);
spin_unlock_irqrestore(&pgdata->lru_lock, flags);
- remap_page(head);
+ remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
@@ -3045,7 +2936,7 @@
return 0;
}
-DEFINE_SIMPLE_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
+DEFINE_DEBUGFS_ATTRIBUTE(split_huge_pages_fops, NULL, split_huge_pages_set,
"%llu\n");
static int __init split_huge_pages_debugfs(void)
@@ -3103,6 +2994,8 @@
pmde = pmd_mksoft_dirty(pmde);
if (is_write_migration_entry(entry))
pmde = maybe_pmd_mkwrite(pmde, vma);
+ if (pmd_swp_uffd_wp(*pvmw->pmd))
+ pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
flush_cache_range(vma, mmun_start, mmun_start + HPAGE_PMD_SIZE);
if (PageAnon(new))