Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 95a3274..fce705f 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -19,6 +19,7 @@
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
@@ -28,9 +29,10 @@
#include <linux/jhash.h>
#include <linux/numa.h>
#include <linux/llist.h>
+#include <linux/cma.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
@@ -44,6 +46,12 @@
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+
+#ifdef CONFIG_CMA
+static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
+
/*
* Minimum page order among possible hugepage sizes, set to a proper value
* at boot time.
@@ -55,8 +63,8 @@
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -96,7 +104,7 @@
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, give up any reservations mased on minimum size and
+ * remain, give up any reservations based on minimum size and
* free the subpool */
if (free) {
if (spool->min_hpages != -1)
@@ -141,10 +149,10 @@
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
@@ -235,114 +243,317 @@
return subpool_inode(file_inode(vma->vm_file));
}
-/*
- * Region tracking -- allows tracking of reservations and instantiated pages
- * across the pages in a mapping.
- *
- * The region data structures are embedded into a resv_map and protected
- * by a resv_map's lock. The set of regions within the resv_map represent
- * reservations for huge pages, or huge pages that have already been
- * instantiated within the map. The from and to elements are huge page
- * indicies into the associated mapping. from indicates the starting index
- * of the region. to represents the first index past the end of the region.
- *
- * For example, a file region structure with from == 0 and to == 4 represents
- * four huge pages in a mapping. It is important to note that the to element
- * represents the first element past the end of the region. This is used in
- * arithmetic as 4(to) - 0(from) = 4 huge pages in the region.
- *
- * Interval notation of the form [from, to) will be used to indicate that
- * the endpoint from is inclusive and to is exclusive.
+/* Helper that removes a struct file_region from the resv_map cache and returns
+ * it for use.
*/
-struct file_region {
- struct list_head link;
- long from;
- long to;
-};
-
-/*
- * Add the huge page range represented by [f, t) to the reserve
- * map. In the normal case, existing regions will be expanded
- * to accommodate the specified range. Sufficient regions should
- * exist for expansion due to the previous call to region_chg
- * with the same range. However, it is possible that region_del
- * could have been called after region_chg and modifed the map
- * in such a way that no region exists to be expanded. In this
- * case, pull a region descriptor from the cache associated with
- * the map and use that for the new range.
- *
- * Return the number of new huge pages added to the map. This
- * number is greater than or equal to zero.
- */
-static long region_add(struct resv_map *resv, long f, long t)
+static struct file_region *
+get_file_region_entry_from_cache(struct resv_map *resv, long from, long to)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg, *trg;
- long add = 0;
+ struct file_region *nrg = NULL;
- spin_lock(&resv->lock);
- /* Locate the region we are either in or before. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
+ VM_BUG_ON(resv->region_cache_count <= 0);
- /*
- * If no region exists which can be expanded to include the
- * specified range, the list must have been modified by an
- * interleving call to region_del(). Pull a region descriptor
- * from the cache and use it for this range.
- */
- if (&rg->link == head || t < rg->from) {
- VM_BUG_ON(resv->region_cache_count <= 0);
+ resv->region_cache_count--;
+ nrg = list_first_entry(&resv->region_cache, struct file_region, link);
+ list_del(&nrg->link);
- resv->region_cache_count--;
- nrg = list_first_entry(&resv->region_cache, struct file_region,
- link);
- list_del(&nrg->link);
+ nrg->from = from;
+ nrg->to = to;
- nrg->from = f;
- nrg->to = t;
- list_add(&nrg->link, rg->link.prev);
+ return nrg;
+}
- add += t - f;
- goto out_locked;
+static void copy_hugetlb_cgroup_uncharge_info(struct file_region *nrg,
+ struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ nrg->reservation_counter = rg->reservation_counter;
+ nrg->css = rg->css;
+ if (rg->css)
+ css_get(rg->css);
+#endif
+}
+
+/* Helper that records hugetlb_cgroup uncharge info. */
+static void record_hugetlb_cgroup_uncharge_info(struct hugetlb_cgroup *h_cg,
+ struct hstate *h,
+ struct resv_map *resv,
+ struct file_region *nrg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (h_cg) {
+ nrg->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ nrg->css = &h_cg->css;
+ /*
+ * The caller will hold exactly one h_cg->css reference for the
+ * whole contiguous reservation region. But this area might be
+ * scattered when there are already some file_regions reside in
+ * it. As a result, many file_regions may share only one css
+ * reference. In order to ensure that one file_region must hold
+ * exactly one h_cg->css reference, we should do css_get for
+ * each file_region and leave the reference held by caller
+ * untouched.
+ */
+ css_get(&h_cg->css);
+ if (!resv->pages_per_hpage)
+ resv->pages_per_hpage = pages_per_huge_page(h);
+ /* pages_per_hpage should be the same for all entries in
+ * a resv_map.
+ */
+ VM_BUG_ON(resv->pages_per_hpage != pages_per_huge_page(h));
+ } else {
+ nrg->reservation_counter = NULL;
+ nrg->css = NULL;
+ }
+#endif
+}
+
+static void put_uncharge_info(struct file_region *rg)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (rg->css)
+ css_put(rg->css);
+#endif
+}
+
+static bool has_same_uncharge_info(struct file_region *rg,
+ struct file_region *org)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ return rg && org &&
+ rg->reservation_counter == org->reservation_counter &&
+ rg->css == org->css;
+
+#else
+ return true;
+#endif
+}
+
+static void coalesce_file_region(struct resv_map *resv, struct file_region *rg)
+{
+ struct file_region *nrg = NULL, *prg = NULL;
+
+ prg = list_prev_entry(rg, link);
+ if (&prg->link != &resv->regions && prg->to == rg->from &&
+ has_same_uncharge_info(prg, rg)) {
+ prg->to = rg->to;
+
+ list_del(&rg->link);
+ put_uncharge_info(rg);
+ kfree(rg);
+
+ rg = prg;
}
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
+ nrg = list_next_entry(rg, link);
+ if (&nrg->link != &resv->regions && nrg->from == rg->to &&
+ has_same_uncharge_info(nrg, rg)) {
+ nrg->from = rg->from;
- /* Check for and consume any regions we now overlap with. */
- nrg = rg;
- list_for_each_entry_safe(rg, trg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
+ list_del(&rg->link);
+ put_uncharge_info(rg);
+ kfree(rg);
+ }
+}
+
+/*
+ * Must be called with resv->lock held.
+ *
+ * Calling this with regions_needed != NULL will count the number of pages
+ * to be added but will not modify the linked list. And regions_needed will
+ * indicate the number of file_regions needed in the cache to carry out to add
+ * the regions for this range.
+ */
+static long add_reservation_in_range(struct resv_map *resv, long f, long t,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h, long *regions_needed)
+{
+ long add = 0;
+ struct list_head *head = &resv->regions;
+ long last_accounted_offset = f;
+ struct file_region *rg = NULL, *trg = NULL, *nrg = NULL;
+
+ if (regions_needed)
+ *regions_needed = 0;
+
+ /* In this loop, we essentially handle an entry for the range
+ * [last_accounted_offset, rg->from), at every iteration, with some
+ * bounds checking.
+ */
+ list_for_each_entry_safe(rg, trg, head, link) {
+ /* Skip irrelevant regions that start before our range. */
+ if (rg->from < f) {
+ /* If this region ends after the last accounted offset,
+ * then we need to update last_accounted_offset.
+ */
+ if (rg->to > last_accounted_offset)
+ last_accounted_offset = rg->to;
+ continue;
+ }
+
+ /* When we find a region that starts beyond our range, we've
+ * finished.
+ */
if (rg->from > t)
break;
- /* If this area reaches higher then extend our area to
- * include it completely. If this is not the first area
- * which we intend to reuse, free it. */
- if (rg->to > t)
- t = rg->to;
- if (rg != nrg) {
- /* Decrement return value by the deleted range.
- * Another range will span this area so that by
- * end of routine add will be >= zero
- */
- add -= (rg->to - rg->from);
- list_del(&rg->link);
- kfree(rg);
+ /* Add an entry for last_accounted_offset -> rg->from, and
+ * update last_accounted_offset.
+ */
+ if (rg->from > last_accounted_offset) {
+ add += rg->from - last_accounted_offset;
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, rg->from);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h,
+ resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else
+ *regions_needed += 1;
}
+
+ last_accounted_offset = rg->to;
}
- add += (nrg->from - f); /* Added to beginning of region */
- nrg->from = f;
- add += t - nrg->to; /* Added to end of region */
- nrg->to = t;
+ /* Handle the case where our range extends beyond
+ * last_accounted_offset.
+ */
+ if (last_accounted_offset < t) {
+ add += t - last_accounted_offset;
+ if (!regions_needed) {
+ nrg = get_file_region_entry_from_cache(
+ resv, last_accounted_offset, t);
+ record_hugetlb_cgroup_uncharge_info(h_cg, h, resv, nrg);
+ list_add(&nrg->link, rg->link.prev);
+ coalesce_file_region(resv, nrg);
+ } else
+ *regions_needed += 1;
+ }
-out_locked:
- resv->adds_in_progress--;
+ VM_BUG_ON(add < 0);
+ return add;
+}
+
+/* Must be called with resv->lock acquired. Will drop lock to allocate entries.
+ */
+static int allocate_file_region_entries(struct resv_map *resv,
+ int regions_needed)
+ __must_hold(&resv->lock)
+{
+ struct list_head allocated_regions;
+ int to_allocate = 0, i = 0;
+ struct file_region *trg = NULL, *rg = NULL;
+
+ VM_BUG_ON(regions_needed < 0);
+
+ INIT_LIST_HEAD(&allocated_regions);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * the number of in progress add operations plus regions_needed.
+ *
+ * This is a while loop because when we drop the lock, some other call
+ * to region_add or region_del may have consumed some region_entries,
+ * so we keep looping here until we finally have enough entries for
+ * (adds_in_progress + regions_needed).
+ */
+ while (resv->region_cache_count <
+ (resv->adds_in_progress + regions_needed)) {
+ to_allocate = resv->adds_in_progress + regions_needed -
+ resv->region_cache_count;
+
+ /* At this point, we should have enough entries in the cache
+ * for all the existings adds_in_progress. We should only be
+ * needing to allocate for regions_needed.
+ */
+ VM_BUG_ON(resv->region_cache_count < resv->adds_in_progress);
+
+ spin_unlock(&resv->lock);
+ for (i = 0; i < to_allocate; i++) {
+ trg = kmalloc(sizeof(*trg), GFP_KERNEL);
+ if (!trg)
+ goto out_of_memory;
+ list_add(&trg->link, &allocated_regions);
+ }
+
+ spin_lock(&resv->lock);
+
+ list_splice(&allocated_regions, &resv->region_cache);
+ resv->region_cache_count += to_allocate;
+ }
+
+ return 0;
+
+out_of_memory:
+ list_for_each_entry_safe(rg, trg, &allocated_regions, link) {
+ list_del(&rg->link);
+ kfree(rg);
+ }
+ return -ENOMEM;
+}
+
+/*
+ * Add the huge page range represented by [f, t) to the reserve
+ * map. Regions will be taken from the cache to fill in this range.
+ * Sufficient regions should exist in the cache due to the previous
+ * call to region_chg with the same range, but in some cases the cache will not
+ * have sufficient entries due to races with other code doing region_add or
+ * region_del. The extra needed entries will be allocated.
+ *
+ * regions_needed is the out value provided by a previous call to region_chg.
+ *
+ * Return the number of new huge pages added to the map. This number is greater
+ * than or equal to zero. If file_region entries needed to be allocated for
+ * this operation and we were not able to allocate, it returns -ENOMEM.
+ * region_add of regions of length 1 never allocate file_regions and cannot
+ * fail; region_chg will always allocate at least 1 entry and a region_add for
+ * 1 page will only require at most 1 entry.
+ */
+static long region_add(struct resv_map *resv, long f, long t,
+ long in_regions_needed, struct hstate *h,
+ struct hugetlb_cgroup *h_cg)
+{
+ long add = 0, actual_regions_needed = 0;
+
+ spin_lock(&resv->lock);
+retry:
+
+ /* Count how many regions are actually needed to execute this add. */
+ add_reservation_in_range(resv, f, t, NULL, NULL,
+ &actual_regions_needed);
+
+ /*
+ * Check for sufficient descriptors in the cache to accommodate
+ * this add operation. Note that actual_regions_needed may be greater
+ * than in_regions_needed, as the resv_map may have been modified since
+ * the region_chg call. In this case, we need to make sure that we
+ * allocate extra entries, such that we have enough for all the
+ * existing adds_in_progress, plus the excess needed for this
+ * operation.
+ */
+ if (actual_regions_needed > in_regions_needed &&
+ resv->region_cache_count <
+ resv->adds_in_progress +
+ (actual_regions_needed - in_regions_needed)) {
+ /* region_add operation of range 1 should never need to
+ * allocate file_region entries.
+ */
+ VM_BUG_ON(t - f <= 1);
+
+ if (allocate_file_region_entries(
+ resv, actual_regions_needed - in_regions_needed)) {
+ return -ENOMEM;
+ }
+
+ goto retry;
+ }
+
+ add = add_reservation_in_range(resv, f, t, h_cg, h, NULL);
+
+ resv->adds_in_progress -= in_regions_needed;
+
spin_unlock(&resv->lock);
VM_BUG_ON(add < 0);
return add;
@@ -355,111 +566,38 @@
* call to region_add that will actually modify the reserve
* map to add the specified range [f, t). region_chg does
* not change the number of huge pages represented by the
- * map. However, if the existing regions in the map can not
- * be expanded to represent the new range, a new file_region
- * structure is added to the map as a placeholder. This is
- * so that the subsequent region_add call will have all the
- * regions it needs and will not fail.
+ * map. A number of new file_region structures is added to the cache as a
+ * placeholder, for the subsequent region_add call to use. At least 1
+ * file_region structure is added.
*
- * Upon entry, region_chg will also examine the cache of region descriptors
- * associated with the map. If there are not enough descriptors cached, one
- * will be allocated for the in progress add operation.
+ * out_regions_needed is the number of regions added to the
+ * resv->adds_in_progress. This value needs to be provided to a follow up call
+ * to region_add or region_abort for proper accounting.
*
* Returns the number of huge pages that need to be added to the existing
* reservation map for the range [f, t). This number is greater or equal to
* zero. -ENOMEM is returned if a new file_region structure or cache entry
* is needed and can not be allocated.
*/
-static long region_chg(struct resv_map *resv, long f, long t)
+static long region_chg(struct resv_map *resv, long f, long t,
+ long *out_regions_needed)
{
- struct list_head *head = &resv->regions;
- struct file_region *rg, *nrg = NULL;
long chg = 0;
-retry:
spin_lock(&resv->lock);
-retry_locked:
- resv->adds_in_progress++;
- /*
- * Check for sufficient descriptors in the cache to accommodate
- * the number of in progress add operations.
- */
- if (resv->adds_in_progress > resv->region_cache_count) {
- struct file_region *trg;
+ /* Count how many hugepages in this range are NOT represented. */
+ chg = add_reservation_in_range(resv, f, t, NULL, NULL,
+ out_regions_needed);
- VM_BUG_ON(resv->adds_in_progress - resv->region_cache_count > 1);
- /* Must drop lock to allocate a new descriptor. */
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
+ if (*out_regions_needed == 0)
+ *out_regions_needed = 1;
- trg = kmalloc(sizeof(*trg), GFP_KERNEL);
- if (!trg) {
- kfree(nrg);
- return -ENOMEM;
- }
+ if (allocate_file_region_entries(resv, *out_regions_needed))
+ return -ENOMEM;
- spin_lock(&resv->lock);
- list_add(&trg->link, &resv->region_cache);
- resv->region_cache_count++;
- goto retry_locked;
- }
+ resv->adds_in_progress += *out_regions_needed;
- /* Locate the region we are before or in. */
- list_for_each_entry(rg, head, link)
- if (f <= rg->to)
- break;
-
- /* If we are below the current region then a new region is required.
- * Subtle, allocate a new region at the position but make it zero
- * size such that we can guarantee to record the reservation. */
- if (&rg->link == head || t < rg->from) {
- if (!nrg) {
- resv->adds_in_progress--;
- spin_unlock(&resv->lock);
- nrg = kmalloc(sizeof(*nrg), GFP_KERNEL);
- if (!nrg)
- return -ENOMEM;
-
- nrg->from = f;
- nrg->to = f;
- INIT_LIST_HEAD(&nrg->link);
- goto retry;
- }
-
- list_add(&nrg->link, rg->link.prev);
- chg = t - f;
- goto out_nrg;
- }
-
- /* Round our left edge to the current segment if it encloses us. */
- if (f > rg->from)
- f = rg->from;
- chg = t - f;
-
- /* Check for and consume any regions we now overlap with. */
- list_for_each_entry(rg, rg->link.prev, link) {
- if (&rg->link == head)
- break;
- if (rg->from > t)
- goto out;
-
- /* We overlap with this area, if it extends further than
- * us then we must extend ourselves. Account for its
- * existing reservation. */
- if (rg->to > t) {
- chg += rg->to - t;
- t = rg->to;
- }
- chg -= rg->to - rg->from;
- }
-
-out:
- spin_unlock(&resv->lock);
- /* We already know we raced and no longer need the new region */
- kfree(nrg);
- return chg;
-out_nrg:
spin_unlock(&resv->lock);
return chg;
}
@@ -469,17 +607,20 @@
* of the resv_map keeps track of the operations in progress between
* calls to region_chg and region_add. Operations are sometimes
* aborted after the call to region_chg. In such cases, region_abort
- * is called to decrement the adds_in_progress counter.
+ * is called to decrement the adds_in_progress counter. regions_needed
+ * is the value returned by the region_chg call, it is used to decrement
+ * the adds_in_progress counter.
*
* NOTE: The range arguments [f, t) are not needed or used in this
* routine. They are kept to make reading the calling code easier as
* arguments will match the associated region_chg call.
*/
-static void region_abort(struct resv_map *resv, long f, long t)
+static void region_abort(struct resv_map *resv, long f, long t,
+ long regions_needed)
{
spin_lock(&resv->lock);
VM_BUG_ON(!resv->region_cache_count);
- resv->adds_in_progress--;
+ resv->adds_in_progress -= regions_needed;
spin_unlock(&resv->lock);
}
@@ -543,10 +684,15 @@
}
del += t - f;
+ hugetlb_cgroup_uncharge_file_region(
+ resv, rg, t - f, false);
/* New entry for end of split region */
nrg->from = t;
nrg->to = rg->to;
+
+ copy_hugetlb_cgroup_uncharge_info(nrg, rg);
+
INIT_LIST_HEAD(&nrg->link);
/* Original entry is trimmed */
@@ -559,15 +705,23 @@
if (f <= rg->from && t >= rg->to) { /* Remove entire region */
del += rg->to - rg->from;
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - rg->from, true);
list_del(&rg->link);
kfree(rg);
continue;
}
if (f <= rg->from) { /* Trim beginning of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ t - rg->from, false);
+
del += t - rg->from;
rg->from = t;
} else { /* Trim end of region */
+ hugetlb_cgroup_uncharge_file_region(resv, rg,
+ rg->to - f, false);
+
del += rg->to - f;
rg->to = f;
}
@@ -718,6 +872,25 @@
vma->vm_private_data = (void *)value;
}
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+ struct hugetlb_cgroup *h_cg,
+ struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+ if (!h_cg || !h) {
+ resv_map->reservation_counter = NULL;
+ resv_map->pages_per_hpage = 0;
+ resv_map->css = NULL;
+ } else {
+ resv_map->reservation_counter =
+ &h_cg->rsvd_hugepage[hstate_index(h)];
+ resv_map->pages_per_hpage = pages_per_huge_page(h);
+ resv_map->css = &h_cg->css;
+ }
+#endif
+}
+
struct resv_map *resv_map_alloc(void)
{
struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -734,6 +907,13 @@
INIT_LIST_HEAD(&resv_map->regions);
resv_map->adds_in_progress = 0;
+ /*
+ * Initialize these to 0. On shared mappings, 0's here indicate these
+ * fields don't do cgroup accounting. On private mappings, these will be
+ * re-initialized to the proper values, to indicate that hugetlb cgroup
+ * reservations are to be un-charged from here.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
INIT_LIST_HEAD(&resv_map->region_cache);
list_add(&rg->link, &resv_map->region_cache);
@@ -847,7 +1027,7 @@
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reverves to
+ * fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
@@ -897,22 +1077,24 @@
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
- if (!PageHWPoison(page))
- break;
- /*
- * if 'non-isolated free hugepage' not found on the list,
- * the allocation fails.
- */
- if (&h->hugepage_freelists[nid] == &page->lru)
- return NULL;
- list_move(&page->lru, &h->hugepage_activelist);
- set_page_refcounted(page);
- ClearPageHugeFreed(page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- return page;
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
+
+ if (PageHWPoison(page))
+ continue;
+
+ list_move(&page->lru, &h->hugepage_activelist);
+ set_page_refcounted(page);
+ ClearPageHugeFreed(page);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ return page;
+ }
+
+ return NULL;
}
static struct page *dequeue_huge_page_nodemask(struct hstate *h, gfp_t gfp_mask, int nid,
@@ -951,15 +1133,6 @@
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1079,104 +1252,69 @@
struct page *p = page + 1;
atomic_set(compound_mapcount_ptr(page), 0);
+ atomic_set(compound_pincount_ptr(page), 0);
+
for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
clear_compound_head(p);
set_page_refcounted(p);
}
set_compound_order(page, 0);
+ page[1].compound_nr = 0;
__ClearPageHead(page);
}
static void free_gigantic_page(struct page *page, unsigned int order)
{
+ /*
+ * If the page isn't allocated using the cma allocator,
+ * cma_release() returns false.
+ */
+#ifdef CONFIG_CMA
+ if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+ return;
+#endif
+
free_contig_range(page_to_pfn(page), 1 << order);
}
#ifdef CONFIG_CONTIG_ALLOC
-static int __alloc_gigantic_page(unsigned long start_pfn,
- unsigned long nr_pages, gfp_t gfp_mask)
-{
- unsigned long end_pfn = start_pfn + nr_pages;
- return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE,
- gfp_mask);
-}
-
-static bool pfn_range_valid_gigantic(struct zone *z,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long i, end_pfn = start_pfn + nr_pages;
- struct page *page;
-
- for (i = start_pfn; i < end_pfn; i++) {
- page = pfn_to_online_page(i);
- if (!page)
- return false;
-
- if (page_zone(page) != z)
- return false;
-
- if (PageReserved(page))
- return false;
-
- if (page_count(page) > 0)
- return false;
-
- if (PageHuge(page))
- return false;
- }
-
- return true;
-}
-
-static bool zone_spans_last_pfn(const struct zone *zone,
- unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long last_pfn = start_pfn + nr_pages - 1;
- return zone_spans_pfn(zone, last_pfn);
-}
-
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- unsigned int order = huge_page_order(h);
- unsigned long nr_pages = 1 << order;
- unsigned long ret, pfn, flags;
- struct zonelist *zonelist;
- struct zone *zone;
- struct zoneref *z;
+ unsigned long nr_pages = 1UL << huge_page_order(h);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
- zonelist = node_zonelist(nid, gfp_mask);
- for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) {
- spin_lock_irqsave(&zone->lock, flags);
+#ifdef CONFIG_CMA
+ {
+ struct page *page;
+ int node;
- pfn = ALIGN(zone->zone_start_pfn, nr_pages);
- while (zone_spans_last_pfn(zone, pfn, nr_pages)) {
- if (pfn_range_valid_gigantic(zone, pfn, nr_pages)) {
- /*
- * We release the zone lock here because
- * alloc_contig_range() will also lock the zone
- * at some point. If there's an allocation
- * spinning on this lock, it may win the race
- * and cause alloc_contig_range() to fail...
- */
- spin_unlock_irqrestore(&zone->lock, flags);
- ret = __alloc_gigantic_page(pfn, nr_pages, gfp_mask);
- if (!ret)
- return pfn_to_page(pfn);
- spin_lock_irqsave(&zone->lock, flags);
- }
- pfn += nr_pages;
+ if (hugetlb_cma[nid]) {
+ page = cma_alloc(hugetlb_cma[nid], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
}
- spin_unlock_irqrestore(&zone->lock, flags);
- }
+ if (!(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
- return NULL;
+ page = cma_alloc(hugetlb_cma[node], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+ }
+ }
+#endif
+
+ return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
-static void prep_new_huge_page(struct hstate *h, struct page *page, int nid);
-static void prep_compound_gigantic_page(struct page *page, unsigned int order);
#else /* !CONFIG_CONTIG_ALLOC */
static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
@@ -1214,11 +1352,18 @@
1 << PG_writeback);
}
VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+ VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
set_page_refcounted(page);
if (hstate_is_gigantic(h)) {
+ /*
+ * Temporarily drop the hugetlb_lock, because
+ * we might block in free_gigantic_page().
+ */
+ spin_unlock(&hugetlb_lock);
destroy_compound_gigantic_page(page, huge_page_order(h));
free_gigantic_page(page, huge_page_order(h));
+ spin_lock(&hugetlb_lock);
} else {
__free_pages(page, huge_page_order(h));
}
@@ -1324,6 +1469,8 @@
clear_page_huge_active(page);
hugetlb_cgroup_uncharge_page(hstate_index(h),
pages_per_huge_page(h), page);
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
if (restore_reserve)
h->resv_huge_pages++;
@@ -1396,8 +1543,9 @@
{
INIT_LIST_HEAD(&page->lru);
set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
- spin_lock(&hugetlb_lock);
set_hugetlb_cgroup(page, NULL);
+ set_hugetlb_cgroup_rsvd(page, NULL);
+ spin_lock(&hugetlb_lock);
h->nr_huge_pages++;
h->nr_huge_pages_node[nid]++;
ClearPageHugeFreed(page);
@@ -1419,7 +1567,7 @@
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwse drivers using get_user_pages() to access tail
+ * too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
@@ -1432,6 +1580,7 @@
set_compound_head(p, page);
}
atomic_set(compound_mapcount_ptr(page), -1);
+ atomic_set(compound_pincount_ptr(page), 0);
}
/*
@@ -1458,7 +1607,27 @@
if (!PageHead(page_head))
return 0;
- return get_compound_page_dtor(page_head) == free_huge_page;
+ return page_head[1].compound_dtor == HUGETLB_PAGE_DTOR;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.
+ *
+ * Upon entry, the page is locked which means that page_mapping() is
+ * stable. Due to locking order, we can only trylock_write. If we can
+ * not get the lock, simply return NULL to caller.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+ struct address_space *mapping = page_mapping(hpage);
+
+ if (!mapping)
+ return mapping;
+
+ if (i_mmap_trylock_write(mapping))
+ return mapping;
+
+ return NULL;
}
pgoff_t hugetlb_basepage_index(struct page *page)
@@ -1753,7 +1922,7 @@
return page;
}
-struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1795,31 +1964,9 @@
}
/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
-/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
@@ -1847,7 +1994,7 @@
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
@@ -1858,6 +2005,7 @@
* of size 'delta'.
*/
static int gather_surplus_pages(struct hstate *h, int delta)
+ __must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
@@ -1975,7 +2123,7 @@
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the freed pages across the
+ * free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
@@ -2033,6 +2181,7 @@
struct resv_map *resv;
pgoff_t idx;
long ret;
+ long dummy_out_regions_needed;
resv = vma_resv_map(vma);
if (!resv)
@@ -2041,20 +2190,29 @@
idx = vma_hugecache_offset(h, vma, addr);
switch (mode) {
case VMA_NEEDS_RESV:
- ret = region_chg(resv, idx, idx + 1);
+ ret = region_chg(resv, idx, idx + 1, &dummy_out_regions_needed);
+ /* We assume that vma_reservation_* routines always operate on
+ * 1 page, and that adding to resv map a 1 page entry can only
+ * ever require 1 region.
+ */
+ VM_BUG_ON(dummy_out_regions_needed != 1);
break;
case VMA_COMMIT_RESV:
- ret = region_add(resv, idx, idx + 1);
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
break;
case VMA_END_RESV:
- region_abort(resv, idx, idx + 1);
+ region_abort(resv, idx, idx + 1, 1);
ret = 0;
break;
case VMA_ADD_RESV:
- if (vma->vm_flags & VM_MAYSHARE)
- ret = region_add(resv, idx, idx + 1);
- else {
- region_abort(resv, idx, idx + 1);
+ if (vma->vm_flags & VM_MAYSHARE) {
+ ret = region_add(resv, idx, idx + 1, 1, NULL, NULL);
+ /* region_add calls of range 1 should never fail. */
+ VM_BUG_ON(ret < 0);
+ } else {
+ region_abort(resv, idx, idx + 1, 1);
ret = region_del(resv, idx, idx + 1);
}
break;
@@ -2165,6 +2323,7 @@
long gbl_chg;
int ret, idx;
struct hugetlb_cgroup *h_cg;
+ bool deferred_reserve;
idx = hstate_index(h);
/*
@@ -2202,9 +2361,19 @@
gbl_chg = 1;
}
+ /* If this allocation is not consuming a reservation, charge it now.
+ */
+ deferred_reserve = map_chg || avoid_reserve || !vma_resv_map(vma);
+ if (deferred_reserve) {
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ idx, pages_per_huge_page(h), &h_cg);
+ if (ret)
+ goto out_subpool_put;
+ }
+
ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg);
if (ret)
- goto out_subpool_put;
+ goto out_uncharge_cgroup_reservation;
spin_lock(&hugetlb_lock);
/*
@@ -2223,10 +2392,18 @@
h->resv_huge_pages--;
}
spin_lock(&hugetlb_lock);
- list_move(&page->lru, &h->hugepage_activelist);
+ list_add(&page->lru, &h->hugepage_activelist);
/* Fall through */
}
hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
+ /* If allocation is not consuming a reservation, also store the
+ * hugetlb_cgroup pointer on the page.
+ */
+ if (deferred_reserve) {
+ hugetlb_cgroup_commit_charge_rsvd(idx, pages_per_huge_page(h),
+ h_cg, page);
+ }
+
spin_unlock(&hugetlb_lock);
set_page_private(page, (unsigned long)spool);
@@ -2246,11 +2423,18 @@
rsv_adjust = hugepage_subpool_put_pages(spool, 1);
hugetlb_acct_memory(h, -rsv_adjust);
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_page_rsvd(hstate_index(h),
+ pages_per_huge_page(h), page);
}
return page;
out_uncharge_cgroup:
hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg);
+out_uncharge_cgroup_reservation:
+ if (deferred_reserve)
+ hugetlb_cgroup_uncharge_cgroup_rsvd(idx, pages_per_huge_page(h),
+ h_cg);
out_subpool_put:
if (map_chg || avoid_reserve)
hugepage_subpool_put_pages(spool, 1);
@@ -2292,16 +2476,10 @@
return 1;
}
-static void __init prep_compound_huge_page(struct page *page,
- unsigned int order)
-{
- if (unlikely(order > (MAX_ORDER - 1)))
- prep_compound_gigantic_page(page, order);
- else
- prep_compound_page(page, order);
-}
-
-/* Put bootmem huge pages into the standard lists after mem_map is up */
+/*
+ * Put bootmem huge pages into the standard lists after mem_map is up.
+ * Note: This only applies to gigantic (order > MAX_ORDER) pages.
+ */
static void __init gather_bootmem_prealloc(void)
{
struct huge_bootmem_page *m;
@@ -2310,20 +2488,19 @@
struct page *page = virt_to_page(m);
struct hstate *h = m->hstate;
+ VM_BUG_ON(!hstate_is_gigantic(h));
WARN_ON(page_count(page) != 1);
- prep_compound_huge_page(page, h->order);
+ prep_compound_gigantic_page(page, huge_page_order(h));
WARN_ON(PageReserved(page));
prep_new_huge_page(h, page, page_to_nid(page));
put_page(page); /* free it into the hugepage allocator */
/*
- * If we had gigantic hugepages allocated at boot time, we need
- * to restore the 'stolen' pages to totalram_pages in order to
- * fix confusing memory reports from free(1) and another
- * side-effects, like CommitLimit going negative.
+ * We need to restore the 'stolen' pages to totalram_pages
+ * in order to fix confusing memory reports from free(1) and
+ * other side-effects, like CommitLimit going negative.
*/
- if (hstate_is_gigantic(h))
- adjust_managed_page_count(page, 1 << h->order);
+ adjust_managed_page_count(page, pages_per_huge_page(h));
cond_resched();
}
}
@@ -2353,6 +2530,10 @@
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
+ if (hugetlb_cma_size) {
+ pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
+ goto free;
+ }
if (!alloc_bootmem_huge_page(h))
break;
} else if (!alloc_pool_huge_page(h,
@@ -2369,7 +2550,7 @@
h->max_huge_pages, buf, i);
h->max_huge_pages = i;
}
-
+free:
kfree(node_alloc_noretry);
}
@@ -2839,7 +3020,7 @@
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
@@ -2943,7 +3124,7 @@
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
@@ -2991,25 +3172,44 @@
{
int i;
- if (!hugepages_supported())
+ if (!hugepages_supported()) {
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
+ }
- if (!size_to_hstate(default_hstate_size)) {
- if (default_hstate_size != 0) {
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
- default_hstate_size, HPAGE_SIZE);
+ /*
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
+ * architectures depend on setup being done here.
+ */
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ if (!parsed_default_hugepagesz) {
+ /*
+ * If we did not parse a default huge page size, set
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
+ * number of huge pages for this default size was implicitly
+ * specified, set that here as well.
+ * Note that the implicit setting will overwrite an explicit
+ * setting. A warning will be printed in this case.
+ */
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+ if (default_hstate_max_huge_pages) {
+ if (default_hstate.max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(&default_hstate),
+ 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+ default_hstate.max_huge_pages, buf);
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+ default_hstate_max_huge_pages);
+ }
+ default_hstate.max_huge_pages =
+ default_hstate_max_huge_pages;
}
-
- default_hstate_size = HPAGE_SIZE;
- if (!size_to_hstate(default_hstate_size))
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
- }
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages) {
- if (!default_hstate.max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
}
+ hugetlb_cma_check();
hugetlb_init_hstates();
gather_bootmem_prealloc();
report_hugepages();
@@ -3034,10 +3234,10 @@
}
subsys_initcall(hugetlb_init);
-/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_bad_size(void)
+/* Overwritten by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
- parsed_valid_hugepagesz = false;
+ return size == HPAGE_SIZE;
}
void __init hugetlb_add_hstate(unsigned int order)
@@ -3046,7 +3246,6 @@
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -3067,20 +3266,29 @@
parsed_hstate = h;
}
-static int __init hugetlb_nrpages_setup(char *s)
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagsz or default_hugepagsz
+ * specification. If not, ignore the hugepages value. hugepages can also
+ * be the first huge page command line option in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
- pr_warn("hugepages = %s preceded by "
- "an unsupported hugepagesz, ignoring\n", s);
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
+
/*
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
- * so this hugepages= parameter goes to the "default hstate".
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+ * yet, so this hugepages= parameter goes to the "default hstate".
+ * Otherwise, it goes with the previously parsed hugepagesz or
+ * default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
@@ -3088,8 +3296,8 @@
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
- return 1;
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
+ return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
@@ -3107,22 +3315,118 @@
return 1;
}
-__setup("hugepages=", hugetlb_nrpages_setup);
+__setup("hugepages=", hugepages_setup);
-static int __init hugetlb_default_setup(char *s)
+/*
+ * hugepagesz command line processing
+ * A specific huge page size can only be specified once with hugepagesz.
+ * hugepagesz is followed by hugepages on the command line. The global
+ * variable 'parsed_valid_hugepagesz' is used to determine if prior
+ * hugepagesz argument was valid.
+ */
+static int __init hugepagesz_setup(char *s)
{
- default_hstate_size = memparse(s, &s);
+ unsigned long size;
+ struct hstate *h;
+
+ parsed_valid_hugepagesz = false;
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ h = size_to_hstate(size);
+ if (h) {
+ /*
+ * hstate for this size already exists. This is normally
+ * an error, but is allowed if the existing hstate is the
+ * default hstate. More specifically, it is only allowed if
+ * the number of huge pages for the default hstate was not
+ * previously specified.
+ */
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
+ default_hstate.max_huge_pages) {
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
+ return 0;
+ }
+
+ /*
+ * No need to call hugetlb_add_hstate() as hstate already
+ * exists. But, do set parsed_hstate so that a following
+ * hugepages= parameter will be applied to this hstate.
+ */
+ parsed_hstate = h;
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
return 1;
}
-__setup("default_hugepagesz=", hugetlb_default_setup);
+__setup("hugepagesz=", hugepagesz_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+/*
+ * default_hugepagesz command line input
+ * Only one instance of default_hugepagesz allowed on command line.
+ */
+static int __init default_hugepagesz_setup(char *s)
+{
+ unsigned long size;
+
+ parsed_valid_hugepagesz = false;
+ if (parsed_default_hugepagesz) {
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
+ return 0;
+ }
+
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ parsed_default_hugepagesz = true;
+ default_hstate_idx = hstate_index(size_to_hstate(size));
+
+ /*
+ * The number of default huge pages (for this size) could have been
+ * specified as the first hugetlb parameter: hugepages=X. If so,
+ * then default_hstate_max_huge_pages is set. If the default huge
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * allocated here from bootmem allocator.
+ */
+ if (default_hstate_max_huge_pages) {
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (hstate_is_gigantic(&default_hstate))
+ hugetlb_hstate_alloc_pages(&default_hstate);
+ default_hstate_max_huge_pages = 0;
+ }
+
+ return 1;
+}
+__setup("default_hugepagesz=", default_hugepagesz_setup);
+
+static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
return nr;
}
@@ -3146,7 +3450,7 @@
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
@@ -3168,7 +3472,7 @@
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
@@ -3177,7 +3481,7 @@
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
@@ -3185,8 +3489,7 @@
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
@@ -3246,18 +3549,20 @@
seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024);
}
-int hugetlb_report_node_meminfo(int nid, char *buf)
+int hugetlb_report_node_meminfo(char *buf, int len, int nid)
{
struct hstate *h = &default_hstate;
+
if (!hugepages_supported())
return 0;
- return sprintf(buf,
- "Node %d HugePages_Total: %5u\n"
- "Node %d HugePages_Free: %5u\n"
- "Node %d HugePages_Surp: %5u\n",
- nid, h->nr_huge_pages_node[nid],
- nid, h->free_huge_pages_node[nid],
- nid, h->surplus_huge_pages_node[nid]);
+
+ return sysfs_emit_at(buf, len,
+ "Node %d HugePages_Total: %5u\n"
+ "Node %d HugePages_Free: %5u\n"
+ "Node %d HugePages_Surp: %5u\n",
+ nid, h->nr_huge_pages_node[nid],
+ nid, h->free_huge_pages_node[nid],
+ nid, h->surplus_huge_pages_node[nid]);
}
void hugetlb_show_meminfo(void)
@@ -3316,12 +3621,18 @@
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
+ *
+ * Apart from cpuset, we also have memory policy mechanism that
+ * also determines from which node the kernel will allocate memory
+ * in a NUMA system. So similar to cpuset, we also should consider
+ * the memory policy of the current task. Similar to the description
+ * above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
@@ -3348,8 +3659,10 @@
* after this open call completes. It is therefore safe to take a
* new reference here without additional locking.
*/
- if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
+ if (resv && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+ resv_map_dup_hugetlb_cgroup_uncharge_info(resv);
kref_get(&resv->refs);
+ }
}
static void hugetlb_vm_op_close(struct vm_area_struct *vma)
@@ -3367,9 +3680,7 @@
end = vma_hugecache_offset(h, vma, vma->vm_end);
reserve = (end - start) - region_count(resv, start, end);
-
- kref_put(&resv->refs, resv_map_release);
-
+ hugetlb_cgroup_uncharge_counter(resv, start, end);
if (reserve) {
/*
* Decrement reserve counts. The global reserve count may be
@@ -3378,6 +3689,8 @@
gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
hugetlb_acct_memory(h, -gbl_reserve);
}
+
+ kref_put(&resv->refs, resv_map_release);
}
static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -3457,23 +3770,23 @@
if (huge_pte_none(pte) || pte_present(pte))
return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_migration_entry(swp))
+ if (is_migration_entry(swp))
return true;
else
return false;
}
-static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+static bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
swp_entry_t swp;
if (huge_pte_none(pte) || pte_present(pte))
- return 0;
+ return false;
swp = pte_to_swp_entry(pte);
- if (non_swap_entry(swp) && is_hwpoison_entry(swp))
- return 1;
+ if (is_hwpoison_entry(swp))
+ return true;
else
- return 0;
+ return false;
}
int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
@@ -3485,6 +3798,7 @@
int cow;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
+ struct address_space *mapping = vma->vm_file->f_mapping;
struct mmu_notifier_range range;
int ret = 0;
@@ -3495,6 +3809,14 @@
vma->vm_start,
vma->vm_end);
mmu_notifier_invalidate_range_start(&range);
+ } else {
+ /*
+ * For shared mappings i_mmap_rwsem must be held to call
+ * huge_pte_alloc, otherwise the returned ptep could go
+ * away if part of a shared pmd and another thread calls
+ * huge_pmd_unshare.
+ */
+ i_mmap_lock_read(mapping);
}
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
@@ -3572,6 +3894,8 @@
if (cow)
mmu_notifier_invalidate_range_end(&range);
+ else
+ i_mmap_unlock_read(mapping);
return ret;
}
@@ -3589,6 +3913,7 @@
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
struct mmu_notifier_range range;
+ bool force_flush = false;
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(start & ~huge_page_mask(h));
@@ -3615,12 +3940,10 @@
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
- /*
- * We just unmapped a page of PMDs by clearing a PUD.
- * The caller's TLB flush range should cover this area.
- */
+ tlb_flush_pmd_range(tlb, address & PUD_MASK, PUD_SIZE);
+ force_flush = true;
continue;
}
@@ -3677,6 +4000,22 @@
}
mmu_notifier_invalidate_range_end(&range);
tlb_end_vma(tlb, vma);
+
+ /*
+ * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
+ * could defer the flush until now, since by holding i_mmap_rwsem we
+ * guaranteed that the last refernece would not be dropped. But we must
+ * do the flushing before we return, as otherwise i_mmap_rwsem will be
+ * dropped and the last reference to the shared PMDs page might be
+ * dropped as well.
+ *
+ * In theory we could defer the freeing of the PMD pages as well, but
+ * huge_pmd_unshare() relies on the exact page_count for the PMD page to
+ * detect sharing, so we cannot defer the release of the page either.
+ * Instead, do flush now.
+ */
+ if (force_flush)
+ tlb_flush_mmu_tlbonly(tlb);
}
void __unmap_hugepage_range_final(struct mmu_gather *tlb,
@@ -3839,10 +4178,30 @@
* may get SIGKILLed if it later faults.
*/
if (outside_reserve) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t idx;
+ u32 hash;
+
put_page(old_page);
BUG_ON(huge_pte_none(pte));
+ /*
+ * Drop hugetlb_fault_mutex and i_mmap_rwsem before
+ * unmapping. unmapping needs to hold i_mmap_rwsem
+ * in write mode. Dropping i_mmap_rwsem in read mode
+ * here is OK as COW mappings do not interact with
+ * PMD sharing.
+ *
+ * Reacquire both after unmap operation.
+ */
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
+
unmap_ref_private(mm, vma, old_page, haddr);
- BUG_ON(huge_pte_none(pte));
+
+ i_mmap_lock_read(mapping);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
spin_lock(ptl);
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (likely(ptep &&
@@ -3991,16 +4350,17 @@
}
/*
- * Use page lock to guard against racing truncation
- * before we get page_table_lock.
+ * We can not race with truncation due to holding i_mmap_rwsem.
+ * i_size is modified when holding i_mmap_rwsem, so check here
+ * once for faults beyond end of file.
*/
+ size = i_size_read(mapping->host) >> huge_page_shift(h);
+ if (idx >= size)
+ goto out;
+
retry:
page = find_lock_page(mapping, idx);
if (!page) {
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto out;
-
/*
* Check for page in userfault range
*/
@@ -4020,13 +4380,15 @@
};
/*
- * hugetlb_fault_mutex must be dropped before
- * handling userfault. Reacquire after handling
- * fault to make calling code simpler.
+ * hugetlb_fault_mutex and i_mmap_rwsem must be
+ * dropped before handling userfault. Reacquire
+ * after handling fault to make calling code simpler.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+ i_mmap_lock_read(mapping);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
goto out;
}
@@ -4104,10 +4466,6 @@
}
ptl = huge_pte_lock(h, mm, ptep);
- size = i_size_read(mapping->host) >> huge_page_shift(h);
- if (idx >= size)
- goto backout;
-
ret = 0;
if (!huge_pte_none(huge_ptep_get(ptep)))
goto backout;
@@ -4151,8 +4509,7 @@
}
#ifdef CONFIG_SMP
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
unsigned long key[2];
u32 hash;
@@ -4169,8 +4526,7 @@
* For uniprocesor systems we always use a single mutex, so just
* return 0 and avoid the hashing overhead.
*/
-u32 hugetlb_fault_mutex_hash(struct hstate *h, struct address_space *mapping,
- pgoff_t idx)
+u32 hugetlb_fault_mutex_hash(struct address_space *mapping, pgoff_t idx)
{
return 0;
}
@@ -4193,6 +4549,11 @@
ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
if (ptep) {
+ /*
+ * Since we hold no locks, ptep could be stale. That is
+ * OK as we are only making decisions based on content and
+ * not actually modifying content here.
+ */
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
migration_entry_wait_huge(vma, mm, ptep);
@@ -4200,21 +4561,34 @@
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
+ /*
+ * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+ * until finished with ptep. This serves two purposes:
+ * 1) It prevents huge_pmd_unshare from being called elsewhere
+ * and making the ptep no longer valid.
+ * 2) It synchronizes us with i_size modifications during truncation.
+ *
+ * ptep could have already be assigned via huge_pte_offset. That
+ * is OK, as huge_pte_alloc will return the same value unless
+ * something has changed.
+ */
mapping = vma->vm_file->f_mapping;
- idx = vma_hugecache_offset(h, vma, haddr);
+ i_mmap_lock_read(mapping);
+ ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+ if (!ptep) {
+ i_mmap_unlock_read(mapping);
+ return VM_FAULT_OOM;
+ }
/*
* Serialize hugepage allocation and instantiation, so that we don't
* get spurious allocation failures if two CPUs race to instantiate
* the same page in the page cache.
*/
- hash = hugetlb_fault_mutex_hash(h, mapping, idx);
+ idx = vma_hugecache_offset(h, vma, haddr);
+ hash = hugetlb_fault_mutex_hash(mapping, idx);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
entry = huge_ptep_get(ptep);
@@ -4228,9 +4602,9 @@
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
- * handle it.
+ * an active hugepage in pagecache. This goto expects the 2nd page
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
+ * properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
@@ -4301,6 +4675,7 @@
}
out_mutex:
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ i_mmap_unlock_read(mapping);
/*
* Generally it's safe to hold refcount during waiting page lock. But
* here we just wait to defer the next page fault to avoid busy loop and
@@ -4354,7 +4729,7 @@
(const void __user *) src_addr,
pages_per_huge_page(h), false);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
@@ -4457,7 +4832,7 @@
long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, unsigned long *nr_pages,
- long i, unsigned int flags, int *nonblocking)
+ long i, unsigned int flags, int *locked)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
@@ -4528,14 +4903,17 @@
spin_unlock(ptl);
if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
- if (nonblocking)
- fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+ if (locked)
+ fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+ FAULT_FLAG_KILLABLE;
if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY |
FAULT_FLAG_RETRY_NOWAIT;
if (flags & FOLL_TRIED) {
- VM_WARN_ON_ONCE(fault_flags &
- FAULT_FLAG_ALLOW_RETRY);
+ /*
+ * Note: FAULT_FLAG_ALLOW_RETRY and
+ * FAULT_FLAG_TRIED can co-exist
+ */
fault_flags |= FAULT_FLAG_TRIED;
}
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
@@ -4545,9 +4923,9 @@
break;
}
if (ret & VM_FAULT_RETRY) {
- if (nonblocking &&
+ if (locked &&
!(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
- *nonblocking = 0;
+ *locked = 0;
*nr_pages = 0;
/*
* VM_FAULT_RETRY must not return an
@@ -4567,22 +4945,39 @@
page = pte_page(huge_ptep_get(pte));
/*
- * Instead of doing 'try_get_page()' below in the same_page
- * loop, just check the count once here.
+ * If subpage information not requested, update counters
+ * and skip the same_page loop below.
*/
- if (unlikely(page_count(page) <= 0)) {
- if (pages) {
+ if (!pages && !vmas && !pfn_offset &&
+ (vaddr + huge_page_size(h) < vma->vm_end) &&
+ (remainder >= pages_per_huge_page(h))) {
+ vaddr += huge_page_size(h);
+ remainder -= pages_per_huge_page(h);
+ i += pages_per_huge_page(h);
+ spin_unlock(ptl);
+ continue;
+ }
+
+same_page:
+ if (pages) {
+ pages[i] = mem_map_offset(page, pfn_offset);
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables. If
+ * the huge page is present, then the tail pages must
+ * also be present. The ptl prevents the head page and
+ * tail pages from being rearranged in any way. So this
+ * page must be available at this point, unless the page
+ * refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
spin_unlock(ptl);
remainder = 0;
err = -ENOMEM;
break;
}
}
-same_page:
- if (pages) {
- pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
- }
if (vmas)
vmas[i] = vma;
@@ -4652,7 +5047,7 @@
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -4717,11 +5112,12 @@
struct vm_area_struct *vma,
vm_flags_t vm_flags)
{
- long ret, chg;
+ long ret, chg, add = -1;
struct hstate *h = hstate_inode(inode);
struct hugepage_subpool *spool = subpool_inode(inode);
struct resv_map *resv_map;
- long gbl_reserve;
+ struct hugetlb_cgroup *h_cg = NULL;
+ long gbl_reserve, regions_needed = 0;
/* This should never happen */
if (from > to) {
@@ -4751,9 +5147,10 @@
*/
resv_map = inode_resv_map(inode);
- chg = region_chg(resv_map, from, to);
+ chg = region_chg(resv_map, from, to, ®ions_needed);
} else {
+ /* Private mapping. */
resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
@@ -4769,6 +5166,21 @@
goto out_err;
}
+ ret = hugetlb_cgroup_charge_cgroup_rsvd(
+ hstate_index(h), chg * pages_per_huge_page(h), &h_cg);
+
+ if (ret < 0) {
+ ret = -ENOMEM;
+ goto out_err;
+ }
+
+ if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+ /* For private mappings, the hugetlb_cgroup uncharge info hangs
+ * of the resv_map.
+ */
+ resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+ }
+
/*
* There must be enough pages in the subpool for the mapping. If
* the subpool has a minimum size, there may be some global
@@ -4777,7 +5189,7 @@
gbl_reserve = hugepage_subpool_get_pages(spool, chg);
if (gbl_reserve < 0) {
ret = -ENOSPC;
- goto out_err;
+ goto out_uncharge_cgroup;
}
/*
@@ -4786,9 +5198,7 @@
*/
ret = hugetlb_acct_memory(h, gbl_reserve);
if (ret < 0) {
- /* put back original number of pages, chg */
- (void)hugepage_subpool_put_pages(spool, chg);
- goto out_err;
+ goto out_put_pages;
}
/*
@@ -4803,9 +5213,13 @@
* else has to be done for private mappings here
*/
if (!vma || vma->vm_flags & VM_MAYSHARE) {
- long add = region_add(resv_map, from, to);
+ add = region_add(resv_map, from, to, regions_needed, h, h_cg);
- if (unlikely(chg > add)) {
+ if (unlikely(add < 0)) {
+ hugetlb_acct_memory(h, -gbl_reserve);
+ ret = add;
+ goto out_put_pages;
+ } else if (unlikely(chg > add)) {
/*
* pages in this range were added to the reserve
* map between region_chg and region_add. This
@@ -4815,17 +5229,41 @@
*/
long rsv_adjust;
+ /*
+ * hugetlb_cgroup_uncharge_cgroup_rsvd() will put the
+ * reference to h_cg->css. See comment below for detail.
+ */
+ hugetlb_cgroup_uncharge_cgroup_rsvd(
+ hstate_index(h),
+ (chg - add) * pages_per_huge_page(h), h_cg);
+
rsv_adjust = hugepage_subpool_put_pages(spool,
chg - add);
hugetlb_acct_memory(h, -rsv_adjust);
+ } else if (h_cg) {
+ /*
+ * The file_regions will hold their own reference to
+ * h_cg->css. So we should release the reference held
+ * via hugetlb_cgroup_charge_cgroup_rsvd() when we are
+ * done.
+ */
+ hugetlb_cgroup_put_rsvd_cgroup(h_cg);
}
}
return 0;
+out_put_pages:
+ /* put back original number of pages, chg */
+ (void)hugepage_subpool_put_pages(spool, chg);
+out_uncharge_cgroup:
+ hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
+ chg * pages_per_huge_page(h), h_cg);
out_err:
if (!vma || vma->vm_flags & VM_MAYSHARE)
- /* Don't call region_abort if region_chg failed */
- if (chg >= 0)
- region_abort(resv_map, from, to);
+ /* Only call region_abort if the region_chg succeeded but the
+ * region_add failed or didn't run.
+ */
+ if (chg >= 0 && add < 0)
+ region_abort(resv_map, from, to, regions_needed);
if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&resv_map->refs, resv_map_release);
return ret;
@@ -4939,10 +5377,18 @@
* Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
* and returns the corresponding pte. While this is not necessary for the
* !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode if
+ * sharing is possible. For hugetlbfs, this prevents removal of any page
+ * table entries associated with the address space. This is important as we
+ * are setting up sharing based on existing page table entries (mappings).
+ *
+ * NOTE: This routine is only called from huge_pte_alloc. Some callers of
+ * huge_pte_alloc know that sharing is not possible and do not take
+ * i_mmap_rwsem as a performance optimization. This is handled by the
+ * if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
+ * only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
{
@@ -4959,7 +5405,7 @@
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_lock_write(mapping);
+ i_mmap_assert_locked(mapping);
vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
if (svma == vma)
continue;
@@ -4989,7 +5435,6 @@
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
- i_mmap_unlock_write(mapping);
return pte;
}
@@ -5000,17 +5445,19 @@
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -5028,7 +5475,8 @@
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
return 0;
}
@@ -5074,8 +5522,8 @@
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
- * Return: Pointer to page table or swap entry (PUD or PMD) for
- * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * Return: Pointer to page table entry (PUD or PMD) for
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
@@ -5084,8 +5532,8 @@
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud, pud_entry;
- pmd_t *pmd, pmd_entry;
+ pud_t *pud;
+ pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
@@ -5095,22 +5543,16 @@
return NULL;
pud = pud_offset(p4d, addr);
- pud_entry = READ_ONCE(*pud);
- if (sz != PUD_SIZE && pud_none(pud_entry))
- return NULL;
- /* hugepage or swap? */
- if (pud_huge(pud_entry) || !pud_present(pud_entry))
+ if (sz == PUD_SIZE)
+ /* must be pud huge, non-present or none */
return (pte_t *)pud;
+ if (!pud_present(*pud))
+ return NULL;
+ /* must have a valid entry and size to go further */
pmd = pmd_offset(pud, addr);
- pmd_entry = READ_ONCE(*pmd);
- if (sz != PMD_SIZE && pmd_none(pmd_entry))
- return NULL;
- /* hugepage or swap? */
- if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
- return (pte_t *)pmd;
-
- return NULL;
+ /* must be pmd huge, non-present or none */
+ return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
@@ -5141,6 +5583,12 @@
struct page *page = NULL;
spinlock_t *ptl;
pte_t pte;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -5153,8 +5601,18 @@
pte = huge_ptep_get((pte_t *)pmd);
if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
+ /*
+ * try_grab_page() should always succeed here, because: a) we
+ * hold the pmd (ptl) lock, and b) we've just checked that the
+ * huge pmd (head) page is present in the page tables. The ptl
+ * prevents the head page and tail pages from being rearranged
+ * in any way. So this page must be available at this point,
+ * unless the page refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
@@ -5175,7 +5633,7 @@
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5184,7 +5642,7 @@
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);
@@ -5249,3 +5707,75 @@
spin_unlock(&hugetlb_lock);
}
}
+
+#ifdef CONFIG_CMA
+static bool cma_reserve_called __initdata;
+
+static int __init cmdline_parse_hugetlb_cma(char *p)
+{
+ hugetlb_cma_size = memparse(p, &p);
+ return 0;
+}
+
+early_param("hugetlb_cma", cmdline_parse_hugetlb_cma);
+
+void __init hugetlb_cma_reserve(int order)
+{
+ unsigned long size, reserved, per_node;
+ int nid;
+
+ cma_reserve_called = true;
+
+ if (!hugetlb_cma_size)
+ return;
+
+ if (hugetlb_cma_size < (PAGE_SIZE << order)) {
+ pr_warn("hugetlb_cma: cma area should be at least %lu MiB\n",
+ (PAGE_SIZE << order) / SZ_1M);
+ return;
+ }
+
+ /*
+ * If 3 GB area is requested on a machine with 4 numa nodes,
+ * let's allocate 1 GB on first three nodes and ignore the last one.
+ */
+ per_node = DIV_ROUND_UP(hugetlb_cma_size, nr_online_nodes);
+ pr_info("hugetlb_cma: reserve %lu MiB, up to %lu MiB per node\n",
+ hugetlb_cma_size / SZ_1M, per_node / SZ_1M);
+
+ reserved = 0;
+ for_each_node_state(nid, N_ONLINE) {
+ int res;
+ char name[CMA_MAX_NAME];
+
+ size = min(per_node, hugetlb_cma_size - reserved);
+ size = round_up(size, PAGE_SIZE << order);
+
+ snprintf(name, sizeof(name), "hugetlb%d", nid);
+ res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
+ 0, false, name,
+ &hugetlb_cma[nid], nid);
+ if (res) {
+ pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
+ res, nid);
+ continue;
+ }
+
+ reserved += size;
+ pr_info("hugetlb_cma: reserved %lu MiB on node %d\n",
+ size / SZ_1M, nid);
+
+ if (reserved >= hugetlb_cma_size)
+ break;
+ }
+}
+
+void __init hugetlb_cma_check(void)
+{
+ if (!hugetlb_cma_size || cma_reserve_called)
+ return;
+
+ pr_warn("hugetlb_cma: the option isn't supported by current arch\n");
+}
+
+#endif /* CONFIG_CMA */