Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 87d1659..c8b1592 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -127,6 +127,32 @@
static struct mempolicy preferred_node_policy[MAX_NUMNODES];
+/**
+ * numa_map_to_online_node - Find closest online node
+ * @node: Node id to start the search
+ *
+ * Lookup the next closest node by distance if @nid is not online.
+ */
+int numa_map_to_online_node(int node)
+{
+ int min_dist = INT_MAX, dist, n, min_node;
+
+ if (node == NUMA_NO_NODE || node_online(node))
+ return node;
+
+ min_node = node;
+ for_each_online_node(n) {
+ dist = node_distance(node, n);
+ if (dist < min_dist) {
+ min_dist = dist;
+ min_node = n;
+ }
+ }
+
+ return min_node;
+}
+EXPORT_SYMBOL_GPL(numa_map_to_online_node);
+
struct mempolicy *get_task_policy(struct task_struct *p)
{
struct mempolicy *pol = p->mempolicy;
@@ -198,7 +224,7 @@
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -342,7 +368,7 @@
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
@@ -372,17 +398,17 @@
/*
* Rebind each vma in mm to new nodemask.
*
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -410,7 +436,9 @@
struct list_head *pagelist;
unsigned long flags;
nodemask_t *nmask;
- struct vm_area_struct *prev;
+ unsigned long start;
+ unsigned long end;
+ struct vm_area_struct *first;
};
/*
@@ -440,6 +468,7 @@
*/
static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr,
unsigned long end, struct mm_walk *walk)
+ __releases(ptl)
{
int ret = 0;
struct page *page;
@@ -555,9 +584,10 @@
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ int ret = 0;
#ifdef CONFIG_HUGETLB_PAGE
struct queue_pages *qp = walk->private;
- unsigned long flags = qp->flags;
+ unsigned long flags = (qp->flags & MPOL_MF_VALID);
struct page *page;
spinlock_t *ptl;
pte_t entry;
@@ -569,16 +599,44 @@
page = pte_page(entry);
if (!queue_pages_required(page, qp))
goto unlock;
+
+ if (flags == MPOL_MF_STRICT) {
+ /*
+ * STRICT alone means only detecting misplaced page and no
+ * need to further check other vma.
+ */
+ ret = -EIO;
+ goto unlock;
+ }
+
+ if (!vma_migratable(walk->vma)) {
+ /*
+ * Must be STRICT with MOVE*, otherwise .test_walk() have
+ * stopped walking current vma.
+ * Detecting misplaced page but allow migrating pages which
+ * have been queued.
+ */
+ ret = 1;
+ goto unlock;
+ }
+
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
- (flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, qp->pagelist);
+ (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) {
+ if (!isolate_huge_page(page, qp->pagelist) &&
+ (flags & MPOL_MF_STRICT))
+ /*
+ * Failed to isolate page but allow migrating pages
+ * which have been queued.
+ */
+ ret = 1;
+ }
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
- return 0;
+ return ret;
}
#ifdef CONFIG_NUMA_BALANCING
@@ -596,7 +654,7 @@
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -618,6 +676,22 @@
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
+ /* range check first */
+ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma);
+
+ if (!qp->first) {
+ qp->first = vma;
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ (qp->start < vma->vm_start))
+ /* hole at head side of range */
+ return -EFAULT;
+ }
+ if (!(flags & MPOL_MF_DISCONTIG_OK) &&
+ ((vma->vm_end < qp->end) &&
+ (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start)))
+ /* hole at middle or tail of range */
+ return -EFAULT;
+
/*
* Need check MPOL_MF_STRICT to return -EIO if possible
* regardless of vma_migratable
@@ -628,22 +702,10 @@
if (endvma > end)
endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
- if (qp->prev && qp->prev->vm_end < vma->vm_start)
- return -EFAULT;
- }
-
- qp->prev = vma;
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (!is_vm_hugetlb_page(vma) &&
- (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)) &&
+ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) &&
!(vma->vm_flags & VM_MIXEDMAP))
change_prot_numa(vma, start, endvma);
return 1;
@@ -681,19 +743,28 @@
nodemask_t *nodes, unsigned long flags,
struct list_head *pagelist)
{
+ int err;
struct queue_pages qp = {
.pagelist = pagelist,
.flags = flags,
.nmask = nodes,
- .prev = NULL,
+ .start = start,
+ .end = end,
+ .first = NULL,
};
- return walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+ err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+
+ if (!qp.first)
+ /* whole range in hole */
+ err = -EFAULT;
+
+ return err;
}
/*
* Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
@@ -718,7 +789,7 @@
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
@@ -740,8 +811,7 @@
unsigned long vmend;
vma = find_vma(mm, start);
- if (!vma || vma->vm_start > start)
- return -EFAULT;
+ VM_BUG_ON(!vma);
prev = vma->vm_prev;
if (start > vma->vm_start)
@@ -805,13 +875,12 @@
goto out;
}
- task_lock(current);
ret = mpol_set_nodemask(new, nodes, scratch);
if (ret) {
- task_unlock(current);
mpol_put(new);
goto out;
}
+ task_lock(current);
old = current->mempolicy;
current->mempolicy = new;
if (new && new->mode == MPOL_INTERLEAVE)
@@ -837,7 +906,6 @@
switch (p->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*nodes = p->v.nodes;
break;
@@ -853,17 +921,17 @@
static int lookup_node(struct mm_struct *mm, unsigned long addr)
{
- struct page *p;
+ struct page *p = NULL;
int err;
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err >= 0) {
+ if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -896,10 +964,10 @@
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -916,7 +984,7 @@
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_sem, so after calling
+ * wil drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
@@ -958,7 +1026,7 @@
out:
mpol_cond_put(pol);
if (vma)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
@@ -979,8 +1047,8 @@
if (!isolate_lru_page(head)) {
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
- NR_ISOLATED_ANON + page_is_file_cache(head),
- hpage_nr_pages(head));
+ NR_ISOLATED_ANON + page_is_file_lru(head),
+ thp_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* Non-movable page may reach here. And, there may be
@@ -996,27 +1064,6 @@
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1027,6 +1074,10 @@
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1041,8 +1092,8 @@
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1067,7 +1118,7 @@
if (err)
return err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1148,7 +1199,7 @@
if (err < 0)
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (err < 0)
return err;
return busy;
@@ -1165,7 +1216,7 @@
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
- unsigned long uninitialized_var(address);
+ unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1271,12 +1322,10 @@
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
- down_write(&mm->mmap_sem);
- task_lock(current);
+ mmap_write_lock(mm);
err = mpol_set_nodemask(new, nmask, scratch);
- task_unlock(current);
if (err)
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -1313,7 +1362,7 @@
putback_movable_pages(&pagelist);
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
return err;
@@ -1560,14 +1609,14 @@
unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
- addr = untagged_addr(addr);
-
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+ addr = untagged_addr(addr);
+
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
@@ -1699,6 +1748,34 @@
#endif /* CONFIG_COMPAT */
+bool vma_migratable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & (VM_IO | VM_PFNMAP))
+ return false;
+
+ /*
+ * DAX device mappings require predictable access latency, so avoid
+ * incurring periodic faults.
+ */
+ if (vma_is_dax(vma))
+ return false;
+
+ if (is_vm_hugetlb_page(vma) &&
+ !hugepage_migration_supported(hstate_vma(vma)))
+ return false;
+
+ /*
+ * Migration allocates pages in the highest zone. If we cannot
+ * do so then migration (at least from node to node) is not
+ * possible.
+ */
+ if (vma->vm_file &&
+ gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping))
+ < policy_zone)
+ return false;
+ return true;
+}
+
struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
unsigned long addr)
{
@@ -1793,7 +1870,7 @@
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
@@ -1805,8 +1882,7 @@
}
/* Return the node id preferred by the given mempolicy, or the given id */
-static int policy_node(gfp_t gfp, struct mempolicy *policy,
- int nd)
+static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
{
if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL))
nd = policy->v.preferred_node;
@@ -1994,7 +2070,6 @@
break;
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
*mask = mempolicy->v.nodes;
break;
@@ -2089,7 +2164,7 @@
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
@@ -2133,18 +2208,22 @@
nmask = policy_nodemask(gfp, pol);
if (!nmask || node_isset(hpage_node, *nmask)) {
mpol_cond_put(pol);
+ /*
+ * First, try to allocate THP only on local node, but
+ * don't reclaim unnecessarily, just compact.
+ */
page = __alloc_pages_node(hpage_node,
- gfp | __GFP_THISNODE, order);
+ gfp | __GFP_THISNODE | __GFP_NORETRY, order);
/*
* If hugepage allocations are configured to always
* synchronous compact or the vma has been madvised
* to prefer hugepage backing, retry allowing remote
- * memory as well.
+ * memory with both reclaim and compact as well.
*/
if (!page && (gfp & __GFP_DIRECT_RECLAIM))
- page = __alloc_pages_node(hpage_node,
- gfp | __GFP_NORETRY, order);
+ page = __alloc_pages_nodemask(gfp, order,
+ hpage_node, nmask);
goto out;
}
@@ -2257,7 +2336,6 @@
switch (a->mode) {
case MPOL_BIND:
- /* Fall through */
case MPOL_INTERLEAVE:
return !!nodes_equal(a->v.nodes, b->v.nodes);
case MPOL_PREFERRED: