Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/rmap.c b/mm/rmap.c
index 45f2106..14f84f7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,10 +21,11 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * mm->mmap_sem
- * page->flags PG_locked (lock_page)
+ * mm->mmap_lock
+ * page->flags PG_locked (lock_page) * (see huegtlbfs below)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
@@ -43,6 +44,11 @@
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
* ->tasklist_lock
* pte map lock
+ *
+ * * hugetlbfs PageHuge() pages take locks in this order:
+ * mapping->i_mmap_rwsem
+ * hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
+ * page->flags PG_locked (lock_page)
*/
#include <linux/mm.h>
@@ -171,7 +177,7 @@
* to do any locking for the common case of already having
* an anon_vma.
*
- * This must be called with the mmap_sem held for reading.
+ * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -251,13 +257,19 @@
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
- * If dst->anon_vma is NULL this function tries to find and reuse existing
- * anon_vma which has no vmas and only one child anon_vma. This prevents
- * degradation of anon_vma hierarchy to endless linear chain in case of
- * constantly forking task. On the other hand, an anon_vma with more than one
- * child isn't reused even if there was no alive vma, thus rmap walker has a
- * good chance of avoiding scanning the whole hierarchy when it searches where
- * page is mapped.
+ * anon_vma_clone() is called by __vma_split(), __split_vma(), copy_vma() and
+ * anon_vma_fork(). The first three want an exact copy of src, while the last
+ * one, anon_vma_fork(), may try to reuse an existing anon_vma to prevent
+ * endless growth of anon_vma. Since dst->anon_vma is set to NULL before call,
+ * we can identify this case by checking (!dst->anon_vma && src->anon_vma).
+ *
+ * If (!dst->anon_vma && src->anon_vma) is true, this function tries to find
+ * and reuse existing anon_vma which has no vmas and only one child anon_vma.
+ * This prevents degradation of anon_vma hierarchy to endless linear chain in
+ * case of constantly forking task. On the other hand, an anon_vma with more
+ * than one child isn't reused even if there was no alive vma, thus rmap
+ * walker has a good chance of avoiding scanning the whole hierarchy when it
+ * searches where page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
@@ -287,8 +299,8 @@
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
- if (!dst->anon_vma && anon_vma != src->anon_vma &&
- anon_vma->degree < 2)
+ if (!dst->anon_vma && src->anon_vma &&
+ anon_vma != src->anon_vma && anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
@@ -458,9 +470,10 @@
* chain and verify that the page in question is indeed mapped in it
* [ something equivalent to page_mapped_in_vma() ].
*
- * Since anon_vma's slab is DESTROY_BY_RCU and we know from page_remove_rmap()
- * that the anon_vma pointer from page->mapping is valid if there is a
- * mapcount, we can dereference the anon_vma after observing those.
+ * Since anon_vma's slab is SLAB_TYPESAFE_BY_RCU and we know from
+ * page_remove_rmap() that the anon_vma pointer from page->mapping is valid
+ * if there is a mapcount, we can dereference the anon_vma after observing
+ * those.
*/
struct anon_vma *page_get_anon_vma(struct page *page)
{
@@ -659,7 +672,7 @@
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
- if (mm->tlb_flush_batched) {
+ if (data_race(mm->tlb_flush_batched)) {
flush_tlb_mm(mm);
/*
@@ -917,7 +930,7 @@
set_pte_at(vma->vm_mm, address, pte, entry);
ret = 1;
} else {
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pmd_t *pmd = pvmw.pmd;
pmd_t entry;
@@ -1052,7 +1065,6 @@
static void __page_check_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address)
{
-#ifdef CONFIG_DEBUG_VM
/*
* The page's anon-rmap details (mapping and index) are guaranteed to
* be set up correctly at this point.
@@ -1065,9 +1077,9 @@
* are initially only visible via the pagetables, and the pte is locked
* over the call to page_add_new_anon_rmap.
*/
- BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root);
- BUG_ON(page_to_pgoff(page) != linear_page_index(vma, address));
-#endif
+ VM_BUG_ON_PAGE(page_anon_vma(page)->root != vma->anon_vma->root, page);
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address),
+ page);
}
/**
@@ -1099,6 +1111,11 @@
bool compound = flags & RMAP_COMPOUND;
bool first;
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
if (compound) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1110,7 +1127,7 @@
}
if (first) {
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
@@ -1118,13 +1135,14 @@
* disabled.
*/
if (compound)
- __inc_node_page_state(page, NR_ANON_THPS);
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page)))
- return;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }
/* address might be in next vma when migration races vma_adjust */
if (first)
@@ -1148,7 +1166,7 @@
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
@@ -1156,14 +1174,17 @@
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
/* increment count (starts at -1) */
atomic_set(compound_mapcount_ptr(page), 0);
- __inc_node_page_state(page, NR_ANON_THPS);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1181,7 +1202,7 @@
VM_BUG_ON_PAGE(compound && !PageTransHuge(page), page);
lock_page_memcg(page);
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_inc_and_test(&page[i]._mapcount))
nr++;
}
@@ -1212,30 +1233,29 @@
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
- goto out;
+ return;
}
/* page still mapped by someone else? */
if (compound && PageTransHuge(page)) {
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
+ return;
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
}
/*
@@ -1247,8 +1267,6 @@
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
-out:
- unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1265,28 +1283,34 @@
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
* Subpages can be mapped with PTEs too. Check how many of
- * themi are still mapped.
+ * them are still mapped.
*/
- for (i = 0, nr = 0; i < HPAGE_PMD_NR; i++) {
+ for (i = 0, nr = 0; i < thp_nr_pages(page); i++) {
if (atomic_add_negative(-1, &page[i]._mapcount))
nr++;
}
+
+ /*
+ * Queue the page for deferred split if at least one small
+ * page of the compound page is unmapped, but at least one
+ * small page is still mapped.
+ */
+ if (nr && nr < thp_nr_pages(page))
+ deferred_split_huge_page(page);
} else {
- nr = HPAGE_PMD_NR;
+ nr = thp_nr_pages(page);
}
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
- if (nr) {
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
- deferred_split_huge_page(page);
- }
+ if (nr)
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
@@ -1298,22 +1322,28 @@
*/
void page_remove_rmap(struct page *page, bool compound)
{
- if (!PageAnon(page))
- return page_remove_file_rmap(page, compound);
+ lock_page_memcg(page);
- if (compound)
- return page_remove_anon_compound_rmap(page);
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }
+
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ goto out;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_node_page_state(page, NR_ANON_MAPPED);
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1330,6 +1360,8 @@
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+out:
+ unlock_page_memcg(page);
}
/*
@@ -1348,7 +1380,7 @@
struct page *subpage;
bool ret = true;
struct mmu_notifier_range range;
- enum ttu_flags flags = (enum ttu_flags)arg;
+ enum ttu_flags flags = (enum ttu_flags)(long)arg;
/*
* When racing against e.g. zap_pte_range() on another cpu,
@@ -1416,7 +1448,7 @@
if (!PageTransCompound(page)) {
/*
* Holding pte lock, we do *not* need
- * mmap_sem here
+ * mmap_lock here
*/
mlock_vma_page(page);
}
@@ -1434,8 +1466,14 @@
subpage = page - page_to_pfn(page) + pte_pfn(*pvmw.pte);
address = pvmw.address;
- if (PageHuge(page)) {
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ if (PageHuge(page) && !PageAnon(page)) {
+ /*
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
+ */
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
@@ -1477,8 +1515,15 @@
*/
entry = make_migration_entry(page, 0);
swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1495,15 +1540,6 @@
goto discard;
}
- if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
@@ -1578,6 +1614,8 @@
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/*
* No need to invalidate here it will synchronize on
@@ -1644,6 +1682,8 @@
swp_pte = swp_entry_to_pte(entry);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pteval))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, address, pvmw.pte, swp_pte);
/* Invalidate as we cleared the pte */
mmu_notifier_invalidate_range(mm, address,
@@ -1678,23 +1718,9 @@
return ret;
}
-bool is_vma_temporary_stack(struct vm_area_struct *vma)
-{
- int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
-
- if (!maybe_stack)
- return false;
-
- if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) ==
- VM_STACK_INCOMPLETE_SETUP)
- return true;
-
- return false;
-}
-
static bool invalid_migration_vma(struct vm_area_struct *vma, void *arg)
{
- return is_vma_temporary_stack(vma);
+ return vma_is_temporary_stack(vma);
}
static int page_not_mapped(struct page *page)
@@ -1792,7 +1818,7 @@
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem. Users without mmap_sem are required to
+ * are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
anon_vma = page_anon_vma(page);
@@ -1812,7 +1838,7 @@
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1835,7 +1861,7 @@
return;
pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
@@ -1865,7 +1891,7 @@
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1889,7 +1915,7 @@
return;
pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
if (!locked)
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap,
@@ -1959,6 +1985,9 @@
{
BUG_ON(address < vma->vm_start || address >= vma->vm_end);
atomic_set(compound_mapcount_ptr(page), 0);
+ if (hpage_pincount_available(page))
+ atomic_set(compound_pincount_ptr(page), 0);
+
__page_set_anon_rmap(page, vma, address, 1);
}
#endif /* CONFIG_HUGETLB_PAGE */