Update Linux to v5.4.2
Change-Id: Idf6911045d9d382da2cfe01b1edff026404ac8fd
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 309fb8c..b45a953 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1,3 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0-only
/*
* Generic hugetlb support.
* (C) Nadia Yvette Chambers, April 2004
@@ -15,7 +16,7 @@
#include <linux/compiler.h>
#include <linux/cpuset.h>
#include <linux/mutex.h>
-#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
#include <linux/mmdebug.h>
@@ -25,6 +26,7 @@
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/jhash.h>
+#include <linux/numa.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -739,7 +741,15 @@
static inline struct resv_map *inode_resv_map(struct inode *inode)
{
- return inode->i_mapping->private_data;
+ /*
+ * At inode evict time, i_mapping may not point to the original
+ * address space within the inode. This original address space
+ * contains the pointer to the resv_map. So, always use the
+ * address space embedded within the inode.
+ * The VERY common case is inode->i_mapping == &inode->i_data, but
+ * this may not be true for device special inodes.
+ */
+ return (struct resv_map *)(&inode->i_data)->private_data;
}
static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
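The inode_resv_map() change above stops trusting inode->i_mapping (which can be redirected for device special inodes) and reads the address_space embedded in the inode instead. A minimal userspace sketch of that pointer-versus-embedded distinction follows; the structs and field names are simplified stand-ins for struct inode and struct address_space, not the kernel definitions.

/*
 * Sketch of why inode_resv_map() above uses the embedded i_data rather
 * than the i_mapping pointer. Simplified stand-ins, not kernel structs.
 */
#include <stdio.h>

struct address_space {
	void *private_data;			/* would hold the resv_map pointer */
};

struct inode {
	struct address_space *i_mapping;	/* may point elsewhere (device inodes) */
	struct address_space i_data;		/* always embedded in this inode */
};

int main(void)
{
	int resv_map = 42;			/* stand-in for struct resv_map */
	struct address_space bdev_mapping = { .private_data = NULL };
	struct inode inode = {
		.i_mapping = &bdev_mapping,	/* redirected, as for a device inode */
		.i_data = { .private_data = &resv_map },
	};

	/* Old lookup follows the (possibly redirected) pointer ... */
	printf("via i_mapping: %p\n", inode.i_mapping->private_data);
	/* ... new lookup always uses the address space embedded in the inode. */
	printf("via i_data:    %p\n", (&inode.i_data)->private_data);
	return 0;
}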
@@ -887,7 +897,7 @@
struct zonelist *zonelist;
struct zone *zone;
struct zoneref *z;
- int node = -1;
+ int node = NUMA_NO_NODE;
zonelist = node_zonelist(nid, gfp_mask);
@@ -919,7 +929,7 @@
/* Movability of hugepages depends on migration support. */
static inline gfp_t htlb_alloc_mask(struct hstate *h)
{
- if (hugepage_migration_supported(h))
+ if (hugepage_movable_supported(h))
return GFP_HIGHUSER_MOVABLE;
else
return GFP_HIGHUSER;
@@ -1058,6 +1068,7 @@
free_contig_range(page_to_pfn(page), 1 << order);
}
+#ifdef CONFIG_CONTIG_ALLOC
static int __alloc_gigantic_page(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
{
@@ -1073,11 +1084,10 @@
struct page *page;
for (i = start_pfn; i < end_pfn; i++) {
- if (!pfn_valid(i))
+ page = pfn_to_online_page(i);
+ if (!page)
return false;
- page = pfn_to_page(i);
-
if (page_zone(page) != z)
return false;
@@ -1142,11 +1152,20 @@
static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
static void prep_compound_gigantic_page(struct page *page, unsigned int order);
+#else /* !CONFIG_CONTIG_ALLOC */
+static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
+#endif /* CONFIG_CONTIG_ALLOC */
#else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */
-static inline bool gigantic_page_supported(void) { return false; }
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nodemask) { return NULL; }
+ int nid, nodemask_t *nodemask)
+{
+ return NULL;
+}
static inline void free_gigantic_page(struct page *page, unsigned int order) { }
static inline void destroy_compound_gigantic_page(struct page *page,
unsigned int order) { }
@@ -1156,7 +1175,7 @@
{
int i;
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
return;
h->nr_huge_pages--;
@@ -1248,20 +1267,32 @@
(struct hugepage_subpool *)page_private(page);
bool restore_reserve;
- set_page_private(page, 0);
- page->mapping = NULL;
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(page_mapcount(page), page);
+
+ set_page_private(page, 0);
+ page->mapping = NULL;
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
/*
- * A return code of zero implies that the subpool will be under its
- * minimum size if the reservation is not restored after page is free.
- * Therefore, force restore_reserve operation.
+ * If PagePrivate() was set on page, page allocation consumed a
+ * reservation. If the page was associated with a subpool, there
+ * would have been a page reserved in the subpool before allocation
+ * via hugepage_subpool_get_pages(). Since we are 'restoring' the
+ * reservation, do not call hugepage_subpool_put_pages() as this will
+ * remove the reserved page from the subpool.
*/
- if (hugepage_subpool_put_pages(spool, 1) == 0)
- restore_reserve = true;
+ if (!restore_reserve) {
+ /*
+ * A return code of zero implies that the subpool will be
+ * under its minimum size if the reservation is not restored
+ * after page is free. Therefore, force restore_reserve
+ * operation.
+ */
+ if (hugepage_subpool_put_pages(spool, 1) == 0)
+ restore_reserve = true;
+ }
spin_lock(&hugetlb_lock);
clear_page_huge_active(page);
@@ -1373,12 +1404,25 @@
}
static struct page *alloc_buddy_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
int order = huge_page_order(h);
struct page *page;
+ bool alloc_try_hard = true;
- gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN;
+ /*
+ * By default we always try hard to allocate the page with
+ * __GFP_RETRY_MAYFAIL flag. However, if we are allocating pages in
+ * a loop (to adjust global huge page counts) and previous allocation
+ * failed, do not continue to try hard on the same node. Use the
+ * node_alloc_noretry bitmap to manage this state information.
+ */
+ if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
+ alloc_try_hard = false;
+ gfp_mask |= __GFP_COMP|__GFP_NOWARN;
+ if (alloc_try_hard)
+ gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask);
@@ -1387,6 +1431,22 @@
else
__count_vm_event(HTLB_BUDDY_PGALLOC_FAIL);
+ /*
+ * If we did not specify __GFP_RETRY_MAYFAIL, but still got a page, this
+ * indicates an overall state change. Clear bit so that we resume
+ * normal 'try hard' allocations.
+ */
+ if (node_alloc_noretry && page && !alloc_try_hard)
+ node_clear(nid, *node_alloc_noretry);
+
+ /*
+ * If we tried hard to get a page but failed, set bit so that
+ * subsequent attempts will not try as hard until there is an
+ * overall state change.
+ */
+ if (node_alloc_noretry && !page && alloc_try_hard)
+ node_set(nid, *node_alloc_noretry);
+
return page;
}
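The node_alloc_noretry handling above is a small per-node backoff state machine: try hard (__GFP_RETRY_MAYFAIL) by default, stop trying hard on a node after a hard attempt fails there, and resume once an easy attempt succeeds. A hedged userspace sketch of the same idea follows; MAX_NODES, try_alloc_on_node() and the bit helpers are illustrative stand-ins for nodemask_t, node_isset()/node_set()/node_clear() and the page allocator, not kernel APIs.

/* Userspace sketch of the node_alloc_noretry backoff used above. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_NODES 64

static uint64_t node_alloc_noretry;	/* bit set => do not try hard on that node */

static bool node_noretry(int nid)	{ return node_alloc_noretry & (1ULL << nid); }
static void node_set_noretry(int nid)	{ node_alloc_noretry |= 1ULL << nid; }
static void node_clear_noretry(int nid)	{ node_alloc_noretry &= ~(1ULL << nid); }

/* Stand-in allocator: "trying hard" succeeds more often in this toy model. */
static void *try_alloc_on_node(int nid, bool try_hard)
{
	int threshold = try_hard ? 70 : 30;	/* percent success, purely illustrative */
	return (rand() % 100) < threshold ? malloc(4096) : NULL;
}

static void *alloc_on_node(int nid)
{
	bool try_hard = !node_noretry(nid);
	void *page = try_alloc_on_node(nid, try_hard);

	/* Success without trying hard: overall state changed, resume trying hard. */
	if (page && !try_hard)
		node_clear_noretry(nid);
	/* Tried hard and still failed: back off on this node for a while. */
	if (!page && try_hard)
		node_set_noretry(nid);
	return page;
}

int main(void)
{
	for (int i = 0; i < 8; i++) {
		void *p = alloc_on_node(0);

		printf("attempt %d: %s (noretry=%d)\n", i,
		       p ? "ok" : "failed", node_noretry(0));
		free(p);
	}
	return 0;
}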
@@ -1395,7 +1455,8 @@
* should use this function to get new hugetlb pages
*/
static struct page *alloc_fresh_huge_page(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask)
+ gfp_t gfp_mask, int nid, nodemask_t *nmask,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
@@ -1403,7 +1464,7 @@
page = alloc_gigantic_page(h, gfp_mask, nid, nmask);
else
page = alloc_buddy_huge_page(h, gfp_mask,
- nid, nmask);
+ nid, nmask, node_alloc_noretry);
if (!page)
return NULL;
@@ -1418,14 +1479,16 @@
* Allocates a fresh page to the hugetlb allocator pool in the node interleaved
* manner.
*/
-static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed,
+ nodemask_t *node_alloc_noretry)
{
struct page *page;
int nr_nodes, node;
gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) {
- page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed);
+ page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed,
+ node_alloc_noretry);
if (page)
break;
}
@@ -1478,16 +1541,29 @@
/*
* Dissolve a given free hugepage into free buddy pages. This function does
- * nothing for in-use (including surplus) hugepages. Returns -EBUSY if the
- * dissolution fails because a give page is not a free hugepage, or because
- * free hugepages are fully reserved.
+ * nothing for in-use hugepages and non-hugepages.
+ * This function returns values like below:
+ *
+ * -EBUSY: failed to dissolve free hugepages or the hugepage is in-use
+ * (allocated or reserved.)
+ * 0: successfully dissolved free hugepages or the page is not a
+ * hugepage (considered as already dissolved)
*/
int dissolve_free_huge_page(struct page *page)
{
int rc = -EBUSY;
+ /* Avoid disrupting the normal path by needlessly taking hugetlb_lock */
+ if (!PageHuge(page))
+ return 0;
+
spin_lock(&hugetlb_lock);
- if (PageHuge(page) && !page_count(page)) {
+ if (!PageHuge(page)) {
+ rc = 0;
+ goto out;
+ }
+
+ if (!page_count(page)) {
struct page *head = compound_head(page);
struct hstate *h = page_hstate(head);
int nid = page_to_nid(head);
@@ -1532,11 +1608,9 @@
for (pfn = start_pfn; pfn < end_pfn; pfn += 1 << minimum_order) {
page = pfn_to_page(pfn);
- if (PageHuge(page) && !page_count(page)) {
- rc = dissolve_free_huge_page(page);
- if (rc)
- break;
- }
+ rc = dissolve_free_huge_page(page);
+ if (rc)
+ break;
}
return rc;
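dissolve_free_huge_page() above now does a cheap unlocked !PageHuge() check to keep the common path away from hugetlb_lock, then re-checks under the lock because the page can be dissolved concurrently. A minimal sketch of that check/lock/re-check shape is below; struct page, the huge flag and the mutex are stand-ins for illustration, not the kernel's data structures.

/* Sketch of the check / lock / re-check shape used above. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct page {
	bool huge;
	int refcount;
};

static pthread_mutex_t hugetlb_lock = PTHREAD_MUTEX_INITIALIZER;

static int dissolve_free_huge_page(struct page *page)
{
	int rc = -EBUSY;

	/* Cheap unlocked check: do not disturb the lock for non-huge pages. */
	if (!page->huge)
		return 0;

	pthread_mutex_lock(&hugetlb_lock);

	/* Re-check: the page may have been dissolved while we were unlocked. */
	if (!page->huge) {
		rc = 0;
		goto out;
	}
	if (page->refcount == 0) {
		page->huge = false;	/* "dissolve" it */
		rc = 0;
	}
out:
	pthread_mutex_unlock(&hugetlb_lock);
	return rc;
}

int main(void)
{
	struct page free_huge = { .huge = true, .refcount = 0 };
	struct page in_use    = { .huge = true, .refcount = 2 };
	struct page regular   = { .huge = false };

	printf("free huge: %d\n", dissolve_free_huge_page(&free_huge));	/* 0 */
	printf("in use:    %d\n", dissolve_free_huge_page(&in_use));	/* -EBUSY */
	printf("regular:   %d\n", dissolve_free_huge_page(&regular));	/* 0 */
	return 0;
}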
@@ -1558,7 +1632,7 @@
goto out_unlock;
spin_unlock(&hugetlb_lock);
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -1572,8 +1646,9 @@
*/
if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) {
SetPageHugeTemporary(page);
+ spin_unlock(&hugetlb_lock);
put_page(page);
- page = NULL;
+ return NULL;
} else {
h->surplus_huge_pages++;
h->surplus_huge_pages_node[page_to_nid(page)]++;
@@ -1585,15 +1660,15 @@
return page;
}
-static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
- int nid, nodemask_t *nmask)
+struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+ int nid, nodemask_t *nmask)
{
struct page *page;
if (hstate_is_gigantic(h))
return NULL;
- page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask);
+ page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL);
if (!page)
return NULL;
@@ -2100,9 +2175,9 @@
for_each_node_mask_to_alloc(h, nr_nodes, node, &node_states[N_MEMORY]) {
void *addr;
- addr = memblock_virt_alloc_try_nid_raw(
+ addr = memblock_alloc_try_nid_raw(
huge_page_size(h), huge_page_size(h),
- 0, BOOTMEM_ALLOC_ACCESSIBLE, node);
+ 0, MEMBLOCK_ALLOC_ACCESSIBLE, node);
if (addr) {
/*
* Use the beginning of the huge page to store the
@@ -2163,13 +2238,33 @@
static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
{
unsigned long i;
+ nodemask_t *node_alloc_noretry;
+
+ if (!hstate_is_gigantic(h)) {
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * Ignore errors as lower level routines can deal with
+ * node_alloc_noretry == NULL. If this kmalloc fails at boot
+ * time, we are likely in bigger trouble.
+ */
+ node_alloc_noretry = kmalloc(sizeof(*node_alloc_noretry),
+ GFP_KERNEL);
+ } else {
+ /* allocations done at boot time */
+ node_alloc_noretry = NULL;
+ }
+
+ /* bit mask controlling how hard we retry per-node allocations */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
- &node_states[N_MEMORY]))
+ &node_states[N_MEMORY],
+ node_alloc_noretry))
break;
cond_resched();
}
@@ -2181,6 +2276,8 @@
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
+
+ kfree(node_alloc_noretry);
}
static void __init hugetlb_init_hstates(void)
@@ -2275,13 +2372,59 @@
}
#define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages)
-static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count,
- nodemask_t *nodes_allowed)
+static int set_max_huge_pages(struct hstate *h, unsigned long count, int nid,
+ nodemask_t *nodes_allowed)
{
unsigned long min_count, ret;
+ NODEMASK_ALLOC(nodemask_t, node_alloc_noretry, GFP_KERNEL);
- if (hstate_is_gigantic(h) && !gigantic_page_supported())
- return h->max_huge_pages;
+ /*
+ * Bit mask controlling how hard we retry per-node allocations.
+ * If we can not allocate the bit mask, do not attempt to allocate
+ * the requested huge pages.
+ */
+ if (node_alloc_noretry)
+ nodes_clear(*node_alloc_noretry);
+ else
+ return -ENOMEM;
+
+ spin_lock(&hugetlb_lock);
+
+ /*
+ * Check for a node specific request.
+ * Changing node specific huge page count may require a corresponding
+ * change to the global count. In any case, the passed node mask
+ * (nodes_allowed) will restrict alloc/free to the specified node.
+ */
+ if (nid != NUMA_NO_NODE) {
+ unsigned long old_count = count;
+
+ count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
+ /*
+ * User may have specified a large count value which caused the
+ * above calculation to overflow. In this case, they wanted
+ * to allocate as many huge pages as possible. Set count to
+ * largest possible value to align with their intention.
+ */
+ if (count < old_count)
+ count = ULONG_MAX;
+ }
+
+ /*
+ * Runtime allocation of gigantic pages depends on the capability for large
+ * page range allocation.
+ * If the system does not provide this feature, return an error when
+ * the user tries to allocate gigantic pages but let the user free the
+ * boottime allocated gigantic pages.
+ */
+ if (hstate_is_gigantic(h) && !IS_ENABLED(CONFIG_CONTIG_ALLOC)) {
+ if (count > persistent_huge_pages(h)) {
+ spin_unlock(&hugetlb_lock);
+ NODEMASK_FREE(node_alloc_noretry);
+ return -EINVAL;
+ }
+ /* Fall through to decrease pool */
+ }
/*
* Increase the pool size
@@ -2294,7 +2437,6 @@
* pool might be one hugepage larger than it needs to be, but
* within all the constraints specified by the sysctls.
*/
- spin_lock(&hugetlb_lock);
while (h->surplus_huge_pages && count > persistent_huge_pages(h)) {
if (!adjust_pool_surplus(h, nodes_allowed, -1))
break;
@@ -2311,7 +2453,8 @@
/* yield cpu to avoid soft lockup */
cond_resched();
- ret = alloc_pool_huge_page(h, nodes_allowed);
+ ret = alloc_pool_huge_page(h, nodes_allowed,
+ node_alloc_noretry);
spin_lock(&hugetlb_lock);
if (!ret)
goto out;
@@ -2349,9 +2492,12 @@
break;
}
out:
- ret = persistent_huge_pages(h);
+ h->max_huge_pages = persistent_huge_pages(h);
spin_unlock(&hugetlb_lock);
- return ret;
+
+ NODEMASK_FREE(node_alloc_noretry);
+
+ return 0;
}
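The node-specific branch of set_max_huge_pages() above converts a per-node request into a global target with count += h->nr_huge_pages - h->nr_huge_pages_node[nid], clamping to ULONG_MAX if the unsigned addition wrapped. A small standalone sketch of just that arithmetic, with plain variables standing in for the hstate fields:

/* Sketch of the overflow clamp in set_max_huge_pages() above. */
#include <limits.h>
#include <stdio.h>

static unsigned long node_to_global_target(unsigned long count,
					   unsigned long global_pages,
					   unsigned long node_pages)
{
	unsigned long old_count = count;

	/* Requested per-node count plus huge pages on all other nodes. */
	count += global_pages - node_pages;

	/*
	 * A huge request (e.g. ULONG_MAX from sysfs) can wrap the unsigned
	 * addition; treat that as "as many as possible".
	 */
	if (count < old_count)
		count = ULONG_MAX;
	return count;
}

int main(void)
{
	/* Normal case: ask for 10 pages on a node holding 2 of 8 global pages. */
	printf("%lu\n", node_to_global_target(10, 8, 2));		/* 16 */
	/* Wrapping case: an absurdly large request clamps to ULONG_MAX. */
	printf("%lu\n", node_to_global_target(ULONG_MAX - 1, 8, 2));
	return 0;
}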
#define HSTATE_ATTR_RO(_name) \
@@ -2401,41 +2547,32 @@
unsigned long count, size_t len)
{
int err;
- NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY);
+ nodemask_t nodes_allowed, *n_mask;
- if (hstate_is_gigantic(h) && !gigantic_page_supported()) {
- err = -EINVAL;
- goto out;
- }
+ if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+ return -EINVAL;
if (nid == NUMA_NO_NODE) {
/*
* global hstate attribute
*/
if (!(obey_mempolicy &&
- init_nodemask_of_mempolicy(nodes_allowed))) {
- NODEMASK_FREE(nodes_allowed);
- nodes_allowed = &node_states[N_MEMORY];
- }
- } else if (nodes_allowed) {
+ init_nodemask_of_mempolicy(&nodes_allowed)))
+ n_mask = &node_states[N_MEMORY];
+ else
+ n_mask = &nodes_allowed;
+ } else {
/*
- * per node hstate attribute: adjust count to global,
- * but restrict alloc/free to the specified node.
+ * Node specific request. count adjustment happens in
+ * set_max_huge_pages() after acquiring hugetlb_lock.
*/
- count += h->nr_huge_pages - h->nr_huge_pages_node[nid];
- init_nodemask_of_node(nodes_allowed, nid);
- } else
- nodes_allowed = &node_states[N_MEMORY];
+ init_nodemask_of_node(&nodes_allowed, nid);
+ n_mask = &nodes_allowed;
+ }
- h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed);
+ err = set_max_huge_pages(h, count, nid, n_mask);
- if (nodes_allowed != &node_states[N_MEMORY])
- NODEMASK_FREE(nodes_allowed);
-
- return len;
-out:
- NODEMASK_FREE(nodes_allowed);
- return err;
+ return err ? err : len;
}
static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
@@ -3239,16 +3376,17 @@
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
int ret = 0;
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
- mmun_start = vma->vm_start;
- mmun_end = vma->vm_end;
- if (cow)
- mmu_notifier_invalidate_range_start(src, mmun_start, mmun_end);
+ if (cow) {
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, src,
+ vma->vm_start,
+ vma->vm_end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
spinlock_t *src_ptl, *dst_ptl;
@@ -3324,7 +3462,7 @@
}
if (cow)
- mmu_notifier_invalidate_range_end(src, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
return ret;
}
@@ -3341,8 +3479,7 @@
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
- unsigned long mmun_start = start; /* For mmu_notifiers */
- unsigned long mmun_end = end; /* For mmu_notifiers */
+ struct mmu_notifier_range range;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3352,14 +3489,16 @@
* This is a hugetlb vma, all the pte entries should point
* to huge page.
*/
- tlb_remove_check_page_size_change(tlb, sz);
+ tlb_change_page_size(tlb, sz);
tlb_start_vma(tlb, vma);
/*
* If sharing possible, alert mmu notifiers of worst case.
*/
- adjust_range_if_pmd_sharing_possible(vma, &mmun_start, &mmun_end);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, mm, start,
+ end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
+ mmu_notifier_invalidate_range_start(&range);
address = start;
for (; address < end; address += sz) {
ptep = huge_pte_offset(mm, address, sz);
@@ -3427,7 +3566,7 @@
if (ref_page)
break;
}
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
}
@@ -3545,9 +3684,8 @@
struct page *old_page, *new_page;
int outside_reserve = 0;
vm_fault_t ret = 0;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
unsigned long haddr = address & huge_page_mask(h);
+ struct mmu_notifier_range range;
pte = huge_ptep_get(ptep);
old_page = pte_page(pte);
@@ -3624,11 +3762,10 @@
copy_user_huge_page(new_page, old_page, address, vma,
pages_per_huge_page(h));
__SetPageUptodate(new_page);
- set_page_huge_active(new_page);
- mmun_start = haddr;
- mmun_end = mmun_start + huge_page_size(h);
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, haddr,
+ haddr + huge_page_size(h));
+ mmu_notifier_invalidate_range_start(&range);
/*
* Retake the page table lock to check for racing updates
@@ -3641,16 +3778,17 @@
/* Break COW */
huge_ptep_clear_flush(vma, haddr, ptep);
- mmu_notifier_invalidate_range(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range(mm, range.start, range.end);
set_huge_pte_at(mm, haddr, ptep,
make_huge_pte(vma, new_page, 1));
page_remove_rmap(old_page, true);
hugepage_add_new_anon_rmap(new_page, vma, haddr);
+ set_page_huge_active(new_page);
/* Make the old page be freed below */
new_page = old_page;
}
spin_unlock(ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
+ mmu_notifier_invalidate_range_end(&range);
out_release_all:
restore_reserve_on_error(h, vma, haddr, new_page);
put_page(new_page);
@@ -3730,6 +3868,7 @@
pte_t new_pte;
spinlock_t *ptl;
unsigned long haddr = address & huge_page_mask(h);
+ bool new_page = false;
/*
* Currently, we are forced to kill the process in the event the
@@ -3776,8 +3915,7 @@
* handling userfault. Reacquire after handling
* fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping,
- idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
@@ -3786,12 +3924,31 @@
page = alloc_huge_page(vma, haddr, 0);
if (IS_ERR(page)) {
+ /*
+ * Returning error will result in faulting task being
+ * sent SIGBUS. The hugetlb fault mutex prevents two
+ * tasks from racing to fault in the same page which
+ * could result in false "unable to allocate" errors.
+ * Page migration does not take the fault mutex, but
+ * does a clear then write of pte's under page table
+ * lock. Page fault code could race with migration,
+ * notice the clear pte and try to allocate a page
+ * here. Before returning error, get ptl and make
+ * sure there really is no pte entry.
+ */
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (!huge_pte_none(huge_ptep_get(ptep))) {
+ ret = 0;
+ spin_unlock(ptl);
+ goto out;
+ }
+ spin_unlock(ptl);
ret = vmf_error(PTR_ERR(page));
goto out;
}
clear_huge_page(page, address, pages_per_huge_page(h));
__SetPageUptodate(page);
- set_page_huge_active(page);
+ new_page = true;
if (vma->vm_flags & VM_MAYSHARE) {
int err = huge_add_to_page_cache(page, mapping, idx);
@@ -3862,6 +4019,15 @@
}
spin_unlock(ptl);
+
+ /*
+ * Only make newly allocated pages active. Existing pages found
+ * in the pagecache could be !page_huge_active() if they have been
+ * isolated for migration.
+ */
+ if (new_page)
+ set_page_huge_active(page);
+
unlock_page(page);
out:
return ret;
@@ -3876,21 +4042,14 @@
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
unsigned long key[2];
u32 hash;
- if (vma->vm_flags & VM_SHARED) {
- key[0] = (unsigned long) mapping;
- key[1] = idx;
- } else {
- key[0] = (unsigned long) mm;
- key[1] = address >> huge_page_shift(h);
- }
+ key[0] = (unsigned long) mapping;
+ key[1] = idx;
hash = jhash2((u32 *)&key, sizeof(key)/sizeof(u32), 0);
@@ -3901,9 +4060,7 @@
* For uniprocessor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm,
- struct vm_area_struct *vma,
- struct address_space *mapping,
+u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
pgoff_t idx, unsigned long address)
{
return 0;
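With this change the fault mutex hash is derived only from the mapping and the page index, so shared and private mappings of the same file offset serialize on the same mutex. Below is a hedged userspace sketch of picking a mutex slot from such a key; the mixing function is a simple stand-in rather than the kernel's jhash2(), and NUM_FAULT_MUTEXES is illustrative (the kernel sizes the table from the CPU count).

/* Sketch of selecting a fault mutex from (mapping, idx). */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define NUM_FAULT_MUTEXES 8	/* illustrative; the kernel scales this with CPUs */

static pthread_mutex_t fault_mutex_table[NUM_FAULT_MUTEXES];

/* Stand-in mixer over (mapping, idx); upstream uses jhash2() over the same key. */
static uint32_t fault_mutex_hash(const void *mapping, unsigned long idx)
{
	uint64_t key = (uint64_t)(uintptr_t)mapping ^ (idx * 0x9E3779B97F4A7C15ULL);

	return (uint32_t)(key ^ (key >> 32)) % NUM_FAULT_MUTEXES;
}

int main(void)
{
	int dummy_mapping;	/* stands in for a struct address_space */
	uint32_t hash;

	for (int i = 0; i < NUM_FAULT_MUTEXES; i++)
		pthread_mutex_init(&fault_mutex_table[i], NULL);

	hash = fault_mutex_hash(&dummy_mapping, 42);
	pthread_mutex_lock(&fault_mutex_table[hash]);
	/* ...a page fault on (mapping, idx) would be handled here... */
	pthread_mutex_unlock(&fault_mutex_table[hash]);
	printf("faults on this (mapping, idx) serialize on slot %u\n", hash);
	return 0;
}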
@@ -3948,7 +4105,7 @@
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr);
+ hash = hugetlb_fault_mutex_hash(h, mapping, idx, haddr);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4096,7 +4253,6 @@
* the set_pte_at() write.
*/
__SetPageUptodate(page);
- set_page_huge_active(page);
mapping = dst_vma->vm_file->f_mapping;
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
@@ -4164,6 +4320,7 @@
update_mmu_cache(dst_vma, dst_addr, dst_pte);
spin_unlock(ptl);
+ set_page_huge_active(page);
if (vm_shared)
unlock_page(page);
ret = 0;
@@ -4199,7 +4356,7 @@
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
- if (unlikely(fatal_signal_pending(current))) {
+ if (fatal_signal_pending(current)) {
remainder = 0;
break;
}
@@ -4269,7 +4426,8 @@
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking)
+ if (nonblocking &&
+ !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*nonblocking = 0;
*nr_pages = 0;
/*
@@ -4288,6 +4446,19 @@
pfn_offset = (vaddr & ~huge_page_mask(h)) >> PAGE_SHIFT;
page = pte_page(huge_ptep_get(pte));
+
+ /*
+ * Instead of doing 'try_get_page()' below in the same_page
+ * loop, just check the count once here.
+ */
+ if (unlikely(page_count(page) <= 0)) {
+ if (pages) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
+ }
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
@@ -4339,21 +4510,22 @@
pte_t pte;
struct hstate *h = hstate_vma(vma);
unsigned long pages = 0;
- unsigned long f_start = start;
- unsigned long f_end = end;
bool shared_pmd = false;
+ struct mmu_notifier_range range;
/*
* In the case of shared PMDs, the area to flush could be beyond
- * start/end. Set f_start/f_end to cover the maximum possible
+ * start/end. Set range.start/range.end to cover the maximum possible
* range if PMD sharing is possible.
*/
- adjust_range_if_pmd_sharing_possible(vma, &f_start, &f_end);
+ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA,
+ 0, vma, mm, start, end);
+ adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
BUG_ON(address >= end);
- flush_cache_range(vma, f_start, f_end);
+ flush_cache_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range_start(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_start(&range);
i_mmap_lock_write(vma->vm_file->f_mapping);
for (; address < end; address += huge_page_size(h)) {
spinlock_t *ptl;
@@ -4388,10 +4560,12 @@
continue;
}
if (!huge_pte_none(pte)) {
- pte = huge_ptep_get_and_clear(mm, address, ptep);
- pte = pte_mkhuge(huge_pte_modify(pte, newprot));
+ pte_t old_pte;
+
+ old_pte = huge_ptep_modify_prot_start(vma, address, ptep);
+ pte = pte_mkhuge(huge_pte_modify(old_pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
- set_huge_pte_at(mm, address, ptep, pte);
+ huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
pages++;
}
spin_unlock(ptl);
@@ -4404,7 +4578,7 @@
* did unshare a page of pmds, flush the range corresponding to the pud.
*/
if (shared_pmd)
- flush_hugetlb_tlb_range(vma, f_start, f_end);
+ flush_hugetlb_tlb_range(vma, range.start, range.end);
else
flush_hugetlb_tlb_range(vma, start, end);
/*
@@ -4414,7 +4588,7 @@
* See Documentation/vm/mmu_notifier.rst
*/
i_mmap_unlock_write(vma->vm_file->f_mapping);
- mmu_notifier_invalidate_range_end(mm, f_start, f_end);
+ mmu_notifier_invalidate_range_end(&range);
return pages << h->order;
}
@@ -4451,6 +4625,11 @@
* called to make the mapping read-write. Assume !vma is a shm mapping
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
+ /*
+ * resv_map cannot be NULL as hugetlb_reserve_pages is only
+ * called for inodes for which resv_maps were created (see
+ * hugetlbfs_get_inode).
+ */
resv_map = inode_resv_map(inode);
chg = region_chg(resv_map, from, to);
@@ -4542,6 +4721,10 @@
struct hugepage_subpool *spool = subpool_inode(inode);
long gbl_reserve;
+ /*
+ * Since this routine can be called in the evict inode path for all
+ * hugetlbfs inodes, resv_map could be NULL.
+ */
if (resv_map) {
chg = region_del(resv_map, start, end);
/*