Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/migrate.c b/mm/migrate.c
index 5092ef2..278e6f3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -48,6 +48,7 @@
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -192,7 +193,7 @@
put_page(page);
} else {
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
putback_lru_page(page);
}
}
@@ -242,12 +243,16 @@
entry = pte_to_swp_entry(*pvmw.pte);
if (is_write_migration_entry(entry))
pte = maybe_mkwrite(pte, vma);
+ else if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_mkuffd_wp(pte);
- if (unlikely(is_zone_device_page(new))) {
- if (is_device_private_page(new)) {
- entry = make_device_private_entry(new, pte_write(pte));
- pte = swp_entry_to_pte(entry);
- }
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -377,12 +382,12 @@
int expected_count = 1;
/*
- * Device public or private pages have an extra refcount as they are
+ * Device private pages have an extra refcount as they are
* ZONE_DEVICE pages.
*/
expected_count += is_device_private_page(page);
if (mapping)
- expected_count += hpage_nr_pages(page) + page_has_private(page);
+ expected_count += thp_nr_pages(page) + page_has_private(page);
return expected_count;
}
@@ -402,6 +407,7 @@
struct zone *oldzone, *newzone;
int dirty;
int expected_count = expected_page_refs(mapping, page) + extra_count;
+ int nr = thp_nr_pages(page);
if (!mapping) {
/* Anonymous page without mapping */
@@ -437,7 +443,7 @@
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
+ page_ref_add(newpage, nr); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -459,7 +465,7 @@
if (PageTransHuge(page)) {
int i;
- for (i = 1; i < HPAGE_PMD_NR; i++) {
+ for (i = 1; i < nr; i++) {
xas_next(&xas);
xas_store(&xas, newpage);
}
@@ -470,7 +476,7 @@
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
+ page_ref_unfreeze(page, expected_count - nr);
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -486,17 +492,24 @@
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __mod_lruvec_state(old_lruvec, NR_FILE_PAGES, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_PAGES, nr);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+ __mod_lruvec_state(old_lruvec, NR_SHMEM, -nr);
+ __mod_lruvec_state(new_lruvec, NR_SHMEM, nr);
}
- if (dirty && mapping_cap_account_dirty(mapping)) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
- __dec_zone_state(oldzone, NR_ZONE_WRITE_PENDING);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_DIRTY);
- __inc_zone_state(newzone, NR_ZONE_WRITE_PENDING);
+ if (dirty && mapping_can_writeback(mapping)) {
+ __mod_lruvec_state(old_lruvec, NR_FILE_DIRTY, -nr);
+ __mod_zone_page_state(oldzone, NR_ZONE_WRITE_PENDING, -nr);
+ __mod_lruvec_state(new_lruvec, NR_FILE_DIRTY, nr);
+ __mod_zone_page_state(newzone, NR_ZONE_WRITE_PENDING, nr);
}
}
local_irq_enable();
@@ -580,7 +593,7 @@
} else {
/* thp page */
BUG_ON(!PageTransHuge(src));
- nr_pages = hpage_nr_pages(src);
+ nr_pages = thp_nr_pages(src);
}
for (i = 0; i < nr_pages; i++) {
@@ -647,9 +660,18 @@
if (PageWriteback(newpage))
end_page_writeback(newpage);
+ /*
+ * PG_readahead shares the same bit with PG_reclaim. The above
+ * end_page_writeback() may clear PG_readahead mistakenly, so set the
+ * bit after that.
+ */
+ if (PageReadahead(page))
+ SetPageReadahead(newpage);
+
copy_page_owner(page, newpage);
- mem_cgroup_migrate(page, newpage);
+ if (!PageHuge(page))
+ mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);
@@ -785,11 +807,7 @@
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- get_page(newpage);
+ attach_page_private(newpage, detach_page_private(page));
bh = head;
do {
@@ -798,8 +816,6 @@
} while (bh != head);
- SetPagePrivate(newpage);
-
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
@@ -987,7 +1003,7 @@
}
/*
- * Anonymous and movable page->mapping will be cleard by
+ * Anonymous and movable page->mapping will be cleared by
* free_pages_prepare so don't reset it here for keeping
* the type to work PageAnon, for example.
*/
@@ -1020,7 +1036,7 @@
* to the LRU. Later, when the IO completes the pages are
* marked uptodate and unlocked. However, the queueing
* could be merging multiple pages for one bio (e.g.
- * mpage_readpages). If an allocation happens for the
+ * mpage_readahead). If an allocation happens for the
* second or third page, the process can end up locking
* the same page twice and deadlocking. Rather than
* trying to be clever about what pages can be locked,
@@ -1108,8 +1124,7 @@
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
page_was_mapped = 1;
}
@@ -1148,36 +1163,21 @@
}
/*
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
- * around it.
- */
-#if defined(CONFIG_ARM) && \
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
-#define ICE_noinline noinline
-#else
-#define ICE_noinline
-#endif
-
-/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode,
enum migrate_reason reason)
{
int rc = MIGRATEPAGE_SUCCESS;
- struct page *newpage;
+ struct page *newpage = NULL;
if (!thp_migration_supported() && PageTransHuge(page))
return -ENOMEM;
- newpage = get_new_page(page, private);
- if (!newpage)
- return -ENOMEM;
-
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
ClearPageActive(page);
@@ -1188,13 +1188,13 @@
__ClearPageIsolated(page);
unlock_page(page);
}
- if (put_new_page)
- put_new_page(newpage, private);
- else
- put_page(newpage);
goto out;
}
+ newpage = get_new_page(page, private);
+ if (!newpage)
+ return -ENOMEM;
+
rc = __unmap_and_move(page, newpage, force, mode);
if (rc == MIGRATEPAGE_SUCCESS)
set_page_owner_migrate_reason(newpage, reason);
@@ -1204,8 +1204,7 @@
/*
* A page that has been migrated has all references
* removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
+ * migrated will have kept its references and be restored.
*/
list_del(&page->lru);
@@ -1216,7 +1215,7 @@
*/
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_cache(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
}
/*
@@ -1225,16 +1224,11 @@
* we want to retry.
*/
if (rc == MIGRATEPAGE_SUCCESS) {
- put_page(page);
- if (reason == MR_MEMORY_FAILURE) {
+ if (reason != MR_MEMORY_FAILURE)
/*
- * Set PG_HWPoison on just freed page
- * intentionally. Although it's rather weird,
- * it's how HWPoison flag works at the moment.
+ * We release the page in page_handle_poison.
*/
- if (set_hwpoison_free_buddy_page(page))
- num_poisoned_pages_inc();
- }
+ put_page(page);
} else {
if (rc != -EAGAIN) {
if (likely(!__PageMovable(page))) {
@@ -1287,6 +1281,7 @@
int page_was_mapped = 0;
struct page *new_hpage;
struct anon_vma *anon_vma = NULL;
+ struct address_space *mapping = NULL;
/*
* Migratability of hugepages depends on architectures and their size.
@@ -1334,9 +1329,29 @@
goto put_anon;
if (page_mapped(hpage)) {
- try_to_unmap(hpage,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ bool mapping_locked = false;
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
+
+ if (!PageAnon(hpage)) {
+ /*
+ * In shared mappings, try_to_unmap could potentially
+ * call huge_pmd_unshare. Because of this, take
+ * semaphore in write mode here and set TTU_RMAP_LOCKED
+ * to let lower levels know we have taken the lock.
+ */
+ mapping = hugetlb_page_mapping_lock_write(hpage);
+ if (unlikely(!mapping))
+ goto unlock_put_anon;
+
+ mapping_locked = true;
+ ttu |= TTU_RMAP_LOCKED;
+ }
+
+ try_to_unmap(hpage, ttu);
page_was_mapped = 1;
+
+ if (mapping_locked)
+ i_mmap_unlock_write(mapping);
}
if (!page_mapped(hpage))
@@ -1346,6 +1361,7 @@
remove_migration_ptes(hpage,
rc == MIGRATEPAGE_SUCCESS ? new_hpage : hpage, false);
+unlock_put_anon:
unlock_page(new_hpage);
put_anon:
@@ -1402,22 +1418,35 @@
enum migrate_mode mode, int reason)
{
int retry = 1;
+ int thp_retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
int pass = 0;
+ bool is_thp = false;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
+ int rc, nr_subpages;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
- for(pass = 0; pass < 10 && retry; pass++) {
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
+ thp_retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
retry:
+ /*
+ * THP statistics is based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page) && !PageHuge(page);
+ nr_subpages = thp_nr_pages(page);
cond_resched();
if (PageHuge(page))
@@ -1442,21 +1471,35 @@
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page) && !PageHuge(page)) {
+ if (is_thp) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
goto retry;
}
+
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
}
nr_failed++;
goto out;
case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
nr_succeeded++;
break;
default:
@@ -1466,19 +1509,27 @@
* removed from migration page list and not
* retried in the next outer loop.
*/
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
nr_failed++;
break;
}
}
}
- nr_failed += retry;
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
rc = nr_failed;
out:
- if (nr_succeeded)
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- if (nr_failed)
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1486,6 +1537,49 @@
return rc;
}
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
#ifdef CONFIG_NUMA
static int store_status(int __user *status, int start, int value, int nr)
@@ -1503,12 +1597,13 @@
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- if (list_empty(pagelist))
- return 0;
-
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -1531,7 +1626,7 @@
unsigned int follflags;
int err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
err = -EFAULT;
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start || !vma_migratable(vma))
@@ -1573,8 +1668,8 @@
err = 1;
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
}
out_putpage:
/*
@@ -1584,10 +1679,36 @@
*/
put_page(page);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
+static int move_pages_and_store_status(struct mm_struct *mm, int node,
+ struct list_head *pagelist, int __user *status,
+ int start, int i, unsigned long nr_pages)
+{
+ int err;
+
+ if (list_empty(pagelist))
+ return 0;
+
+ err = do_move_pages_to_node(mm, pagelist, node);
+ if (err) {
+ /*
+ * Positive err means the number of failed
+ * pages to migrate. Since we are going to
+ * abort and return the number of non-migrated
+ * pages, so need to incude the rest of the
+ * nr_pages that have not been attempted as
+ * well.
+ */
+ if (err > 0)
+ err += nr_pages - i - 1;
+ return err;
+ }
+ return store_status(status, start, node, i - start);
+}
+
/*
* Migrate an array of page address onto an array of nodes and fill
* the corresponding array of status.
@@ -1631,21 +1752,8 @@
current_node = node;
start = i;
} else if (node != current_node) {
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- /*
- * Positive err means the number of failed
- * pages to migrate. Since we are going to
- * abort and return the number of non-migrated
- * pages, so need to incude the rest of the
- * nr_pages that have not been attempted as
- * well.
- */
- if (err > 0)
- err += nr_pages - i - 1;
- goto out;
- }
- err = store_status(status, start, current_node, i - start);
+ err = move_pages_and_store_status(mm, current_node,
+ &pagelist, status, start, i, nr_pages);
if (err)
goto out;
start = i;
@@ -1659,49 +1767,29 @@
err = add_page_for_migration(mm, addr, current_node,
&pagelist, flags & MPOL_MF_MOVE_ALL);
- if (!err) {
- /* The page is already on the target node */
- err = store_status(status, i, current_node, 1);
- if (err)
- goto out_flush;
- continue;
- } else if (err > 0) {
+ if (err > 0) {
/* The page is successfully queued for migration */
continue;
}
- err = store_status(status, i, err, 1);
+ /*
+ * If the page is already on the target node (!err), store the
+ * node, otherwise, store the err.
+ */
+ err = store_status(status, i, err ? : current_node, 1);
if (err)
goto out_flush;
- err = do_move_pages_to_node(mm, &pagelist, current_node);
- if (err) {
- if (err > 0)
- err += nr_pages - i - 1;
+ err = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
+ if (err)
goto out;
- }
- if (i > start) {
- err = store_status(status, start, current_node, i - start);
- if (err)
- goto out;
- }
current_node = NUMA_NO_NODE;
}
out_flush:
- if (list_empty(&pagelist))
- return err;
-
/* Make sure we do not overwrite the existing error */
- err1 = do_move_pages_to_node(mm, &pagelist, current_node);
- /*
- * Don't have to report non-attempted pages here since:
- * - If the above loop is done gracefully all pages have been
- * attempted.
- * - If the above loop is aborted it means a fatal error
- * happened, should return ret.
- */
- if (!err1)
- err1 = store_status(status, start, current_node, i - start);
+ err1 = move_pages_and_store_status(mm, current_node, &pagelist,
+ status, start, i, nr_pages);
if (err >= 0)
err = err1;
out:
@@ -1716,7 +1804,7 @@
{
unsigned long i;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
@@ -1743,7 +1831,7 @@
status++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
/*
@@ -1780,6 +1868,53 @@
return nr_pages ? -EFAULT : 0;
}
+static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
+{
+ struct task_struct *task;
+ struct mm_struct *mm;
+
+ /*
+ * There is no need to check if current process has the right to modify
+ * the specified process when they are same.
+ */
+ if (!pid) {
+ mmget(current->mm);
+ *mem_nodes = cpuset_mems_allowed(current);
+ return current->mm;
+ }
+
+ /* Find the mm_struct */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (!task) {
+ rcu_read_unlock();
+ return ERR_PTR(-ESRCH);
+ }
+ get_task_struct(task);
+
+ /*
+ * Check if this process has the right to modify the specified
+ * process. Use the regular "ptrace_may_access()" checks.
+ */
+ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
+ rcu_read_unlock();
+ mm = ERR_PTR(-EPERM);
+ goto out;
+ }
+ rcu_read_unlock();
+
+ mm = ERR_PTR(security_task_movememory(task));
+ if (IS_ERR(mm))
+ goto out;
+ *mem_nodes = cpuset_mems_allowed(task);
+ mm = get_task_mm(task);
+out:
+ put_task_struct(task);
+ if (!mm)
+ mm = ERR_PTR(-EINVAL);
+ return mm;
+}
+
/*
* Move a list of pages in the address space of the currently executing
* process.
@@ -1789,7 +1924,6 @@
const int __user *nodes,
int __user *status, int flags)
{
- struct task_struct *task;
struct mm_struct *mm;
int err;
nodemask_t task_nodes;
@@ -1801,36 +1935,9 @@
if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
- /* Find the mm_struct */
- rcu_read_lock();
- task = pid ? find_task_by_vpid(pid) : current;
- if (!task) {
- rcu_read_unlock();
- return -ESRCH;
- }
- get_task_struct(task);
-
- /*
- * Check if this process has the right to modify the specified
- * process. Use the regular "ptrace_may_access()" checks.
- */
- if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
- rcu_read_unlock();
- err = -EPERM;
- goto out;
- }
- rcu_read_unlock();
-
- err = security_task_movememory(task);
- if (err)
- goto out;
-
- task_nodes = cpuset_mems_allowed(task);
- mm = get_task_mm(task);
- put_task_struct(task);
-
- if (!mm)
- return -EINVAL;
+ mm = find_mm_struct(pid, &task_nodes);
+ if (IS_ERR(mm))
+ return PTR_ERR(mm);
if (nodes)
err = do_pages_move(mm, task_nodes, nr_pages, pages,
@@ -1840,10 +1947,6 @@
mmput(mm);
return err;
-
-out:
- put_task_struct(task);
- return err;
}
SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages,
@@ -1896,7 +1999,7 @@
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
nr_migrate_pages,
- 0, 0))
+ ZONE_MOVABLE, 0))
continue;
return true;
}
@@ -1943,9 +2046,9 @@
return 0;
}
- page_lru = page_is_file_cache(page);
+ page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
+ thp_nr_pages(page));
/*
* Isolating the page has taken another reference, so the
@@ -1979,7 +2082,7 @@
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+ if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
(vma->vm_flags & VM_EXEC))
goto out;
@@ -1987,7 +2090,7 @@
* Also do not migrate dirty pages as not all filesystems can move
* dirty pages in MIGRATE_ASYNC mode which is a waste of cycles.
*/
- if (page_is_file_cache(page) && PageDirty(page))
+ if (page_is_file_lru(page) && PageDirty(page))
goto out;
isolated = numamigrate_isolate_page(pgdat, page);
@@ -2002,7 +2105,7 @@
if (!list_empty(&migratepages)) {
list_del(&page->lru);
dec_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
putback_lru_page(page);
}
isolated = 0;
@@ -2032,7 +2135,7 @@
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
struct page *new_page = NULL;
- int page_lru = page_is_file_cache(page);
+ int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
new_page = alloc_pages_node(node,
@@ -2103,7 +2206,7 @@
* pmd before doing set_pmd_at(), nor to flush the TLB after
* set_pmd_at(). Clearing the pmd here would introduce a race
* condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
* MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
* pmd.
*/
@@ -2156,12 +2259,23 @@
#ifdef CONFIG_DEVICE_PRIVATE
static int migrate_vma_collect_hole(unsigned long start,
unsigned long end,
+ __always_unused int depth,
struct mm_walk *walk)
{
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
migrate->npages++;
@@ -2178,7 +2292,7 @@
struct migrate_vma *migrate = walk->private;
unsigned long addr;
- for (addr = start & PAGE_MASK; addr < end; addr += PAGE_SIZE) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->dst[migrate->npages] = 0;
migrate->src[migrate->npages++] = 0;
}
@@ -2200,7 +2314,7 @@
again:
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end, walk);
+ return migrate_vma_collect_hole(start, end, -1, walk);
if (pmd_trans_huge(*pmdp)) {
struct page *page;
@@ -2233,7 +2347,7 @@
return migrate_vma_collect_skip(start, end,
walk);
if (pmd_none(*pmdp))
- return migrate_vma_collect_hole(start, end,
+ return migrate_vma_collect_hole(start, end, -1,
walk);
}
}
@@ -2245,7 +2359,7 @@
arch_enter_lazy_mmu_mode();
for (; addr < end; addr += PAGE_SIZE, ptep++) {
- unsigned long mpfn, pfn;
+ unsigned long mpfn = 0, pfn;
struct page *page;
swp_entry_t entry;
pte_t pte;
@@ -2253,14 +2367,14 @@
pte = *ptep;
if (pte_none(pte)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
goto next;
}
if (!pte_present(pte)) {
- mpfn = 0;
-
/*
* Only care about unaddressable device page special
* page table entry. Other special swap entries are not
@@ -2271,11 +2385,18 @@
goto next;
page = device_private_entry_to_page(entry);
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
+ goto next;
+
mpfn = migrate_pfn(page_to_pfn(page)) |
MIGRATE_PFN_MIGRATE;
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
+ goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
mpfn = MIGRATE_PFN_MIGRATE;
@@ -2320,8 +2441,17 @@
entry = make_migration_entry(page, mpfn &
MIGRATE_PFN_WRITE);
swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, addr, ptep, swp_pte);
/*
@@ -2367,8 +2497,14 @@
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
- migrate->vma->vm_mm, migrate->start, migrate->end);
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
@@ -2552,7 +2688,7 @@
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2599,7 +2735,7 @@
/**
* migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and and pfns arrays for the migration
+ * @args: contains the vma, start, and pfns arrays for the migration
*
* Returns: negative errno on failures, 0 when 0 or more pages were migrated
* without an error.
@@ -2652,7 +2788,7 @@
* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
*
* It is safe to update device page table after migrate_vma_pages() because
- * both destination and source page are still locked, and the mmap_sem is held
+ * both destination and source page are still locked, and the mmap_lock is held
* in read mode (hence no one can unmap the range being migrated).
*
* Once the caller is done cleaning up things and updating its page table (if it
@@ -2701,6 +2837,14 @@
}
EXPORT_SYMBOL(migrate_vma_setup);
+/*
+ * This code closely matches the code in:
+ * __handle_mm_fault()
+ * handle_pte_fault()
+ * do_anonymous_page()
+ * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
+ * private page.
+ */
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
@@ -2709,7 +2853,6 @@
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
- struct mem_cgroup *memcg;
bool flush = false;
spinlock_t *ptl;
pte_t entry;
@@ -2742,10 +2885,10 @@
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(mm, pmdp))
goto abort;
@@ -2756,7 +2899,7 @@
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -2788,36 +2931,29 @@
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (check_stable_address_space(mm))
+ goto unlock_abort;
+
if (pte_present(*ptep)) {
unsigned long pfn = pte_pfn(*ptep);
- if (!is_zero_pfn(pfn)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (!is_zero_pfn(pfn))
+ goto unlock_abort;
flush = true;
- } else if (!pte_none(*ptep)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ } else if (!pte_none(*ptep))
+ goto unlock_abort;
/*
- * Check for usefaultfd but do not deliver the fault. Instead,
+ * Check for userfaultfd but do not deliver the fault. Instead,
* just back off.
*/
- if (userfaultfd_missing(vma)) {
- pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
- goto abort;
- }
+ if (userfaultfd_missing(vma))
+ goto unlock_abort;
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
if (!is_zone_device_page(page))
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
if (flush) {
@@ -2835,6 +2971,8 @@
*src = MIGRATE_PFN_MIGRATE;
return;
+unlock_abort:
+ pte_unmap_unlock(ptep, ptl);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
@@ -2867,9 +3005,8 @@
}
if (!page) {
- if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) {
+ if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE))
continue;
- }
if (!notified) {
notified = true;
@@ -2961,7 +3098,6 @@
remove_migration_ptes(page, newpage, false);
unlock_page(page);
- migrate->cpages--;
if (is_zone_device_page(page))
put_page(page);