Update Linux to v5.10.109
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.109.tar.xz
Change-Id: I19bca9fc6762d4e63bcf3e4cba88bbe560d9c76c
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
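---
This sync pulls in the upstream memory hotplug rework, which changes several
mm/memory_hotplug.c entry points: add_memory(), __add_memory() and
add_memory_resource() gain an mhp_t flags argument, online_pages() takes the
node id explicitly, online_pages()/offline_pages() only accept section-aligned
ranges, and add_memory_driver_managed() and offline_and_remove_memory() are new
exports. As a reference for callers, a minimal sketch of a hypothetical user of
the updated interfaces (assumes CONFIG_MEMORY_HOTPLUG and
CONFIG_MEMORY_HOTREMOVE; the function names and the "System RAM
(example_driver)" resource name are illustrative only, and the flag names are
the ones provided by this kernel's include/linux/memory_hotplug.h):

#include <linux/memory.h>
#include <linux/memory_hotplug.h>

/* Hotplug one memory block worth of RAM, then offline and remove it. */
static int example_hotplug_block(int nid, u64 start)
{
	const u64 size = memory_block_size_bytes();
	int rc;

	/* New mhp_t flags argument; merge adjacent System RAM resources. */
	rc = add_memory(nid, start, size, MEMHP_MERGE_RESOURCE);
	if (rc)
		return rc;

	/* New helper: only works on a single, aligned memory block. */
	return offline_and_remove_memory(nid, start, size);
}

/* Driver-managed memory must be named "System RAM ($DRIVER)". */
static int example_add_driver_managed(int nid, u64 start, u64 size)
{
	return add_memory_driver_managed(nid, start, size,
					 "System RAM (example_driver)",
					 MHP_NONE);
}

Note that MEMHP_MERGE_RESOURCE only marks the resource for merging; whether the
added block is onlined automatically is still governed by
memhp_default_online_type (the memhp_default_state= boot parameter).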
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index bcc2686..6275b1c 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -49,8 +49,6 @@
* and restore_online_page_callback() for generic callback restore.
*/
-static void generic_online_page(struct page *page, unsigned int order);
-
static online_page_callback_t online_page_callback = generic_online_page;
static DEFINE_MUTEX(online_page_callback_lock);
@@ -69,18 +67,17 @@
bool movable_node_enabled = false;
#ifndef CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE
-bool memhp_auto_online;
+int memhp_default_online_type = MMOP_OFFLINE;
#else
-bool memhp_auto_online = true;
+int memhp_default_online_type = MMOP_ONLINE;
#endif
-EXPORT_SYMBOL_GPL(memhp_auto_online);
static int __init setup_memhp_default_state(char *str)
{
- if (!strcmp(str, "online"))
- memhp_auto_online = true;
- else if (!strcmp(str, "offline"))
- memhp_auto_online = false;
+ const int online_type = memhp_online_type_from_str(str);
+
+ if (online_type >= 0)
+ memhp_default_online_type = online_type;
return 1;
}
@@ -101,13 +98,22 @@
u64 max_mem_size = U64_MAX;
/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size,
+ const char *resource_name)
{
struct resource *res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- char *resource_name = "System RAM";
- if (start + size > max_mem_size)
+ if (strcmp(resource_name, "System RAM"))
+ flags |= IORESOURCE_SYSRAM_DRIVER_MANAGED;
+
+ /*
+ * Make sure value parsed from 'mem=' only restricts memory adding
+ * while booting, so that memory hotplug won't be impacted. Please
+ * refer to document of 'mem=' in kernel-parameters.txt for more
+ * details.
+ */
+ if (start + size > max_mem_size && system_state < SYSTEM_RUNNING)
return ERR_PTR(-E2BIG);
/*
@@ -278,6 +284,22 @@
return 0;
}
+static int check_hotplug_memory_addressable(unsigned long pfn,
+ unsigned long nr_pages)
+{
+ const u64 max_addr = PFN_PHYS(pfn + nr_pages) - 1;
+
+ if (max_addr >> MAX_PHYSMEM_BITS) {
+ const u64 max_allowed = (1ull << (MAX_PHYSMEM_BITS + 1)) - 1;
+ WARN(1,
+ "Hotplugged memory exceeds maximum addressable address, range=%#llx-%#llx, maximum=%#llx\n",
+ (u64)PFN_PHYS(pfn), max_addr, max_allowed);
+ return -E2BIG;
+ }
+
+ return 0;
+}
+
/*
* Reasonably generic function for adding memory. It is
* expected that archs that support memory hotplug will
@@ -285,11 +307,19 @@
* add the new pages.
*/
int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
- struct mhp_restrictions *restrictions)
+ struct mhp_params *params)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
int err;
- unsigned long nr, start_sec, end_sec;
- struct vmem_altmap *altmap = restrictions->altmap;
+ struct vmem_altmap *altmap = params->altmap;
+
+ if (WARN_ON_ONCE(!params->pgprot.pgprot))
+ return -EINVAL;
+
+ err = check_hotplug_memory_addressable(pfn, nr_pages);
+ if (err)
+ return err;
if (altmap) {
/*
@@ -307,18 +337,13 @@
if (err)
return err;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- err = sparse_add_section(nid, pfn, pfns, altmap);
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ err = sparse_add_section(nid, pfn, cur_nr_pages, altmap);
if (err)
break;
- pfn += pfns;
- nr_pages -= pfns;
cond_resched();
}
vmemmap_populate_print_last();
@@ -337,7 +362,7 @@
if (unlikely(pfn_to_nid(start_pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(start_pfn)))
+ if (zone != page_zone(pfn_to_page(start_pfn)))
continue;
return start_pfn;
@@ -362,7 +387,7 @@
if (unlikely(pfn_to_nid(pfn) != nid))
continue;
- if (zone && zone != page_zone(pfn_to_page(pfn)))
+ if (zone != page_zone(pfn_to_page(pfn)))
continue;
return pfn;
@@ -374,14 +399,11 @@
static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long zone_start_pfn = zone->zone_start_pfn;
- unsigned long z = zone_end_pfn(zone); /* zone_end_pfn namespace clash */
- unsigned long zone_end_pfn = z;
unsigned long pfn;
int nid = zone_to_nid(zone);
zone_span_writelock(zone);
- if (zone_start_pfn == start_pfn) {
+ if (zone->zone_start_pfn == start_pfn) {
/*
* If the section is smallest section in the zone, it need
* shrink zone->zone_start_pfn and zone->zone_spanned_pages.
@@ -389,50 +411,30 @@
* for shrinking zone.
*/
pfn = find_smallest_section_pfn(nid, zone, end_pfn,
- zone_end_pfn);
+ zone_end_pfn(zone));
if (pfn) {
+ zone->spanned_pages = zone_end_pfn(zone) - pfn;
zone->zone_start_pfn = pfn;
- zone->spanned_pages = zone_end_pfn - pfn;
+ } else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
}
- } else if (zone_end_pfn == end_pfn) {
+ } else if (zone_end_pfn(zone) == end_pfn) {
/*
* If the section is biggest section in the zone, it need
* shrink zone->spanned_pages.
* In this case, we find second biggest valid mem_section for
* shrinking zone.
*/
- pfn = find_biggest_section_pfn(nid, zone, zone_start_pfn,
+ pfn = find_biggest_section_pfn(nid, zone, zone->zone_start_pfn,
start_pfn);
if (pfn)
- zone->spanned_pages = pfn - zone_start_pfn + 1;
+ zone->spanned_pages = pfn - zone->zone_start_pfn + 1;
+ else {
+ zone->zone_start_pfn = 0;
+ zone->spanned_pages = 0;
+ }
}
-
- /*
- * The section is not biggest or smallest mem_section in the zone, it
- * only creates a hole in the zone. So in this case, we need not
- * change the zone. But perhaps, the zone has only hole data. Thus
- * it check the zone has only hole or not.
- */
- pfn = zone_start_pfn;
- for (; pfn < zone_end_pfn; pfn += PAGES_PER_SUBSECTION) {
- if (unlikely(!pfn_to_online_page(pfn)))
- continue;
-
- if (page_zone(pfn_to_page(pfn)) != zone)
- continue;
-
- /* Skip range to be removed */
- if (pfn >= start_pfn && pfn < end_pfn)
- continue;
-
- /* If we find valid section, we have nothing to do */
- zone_span_writeunlock(zone);
- return;
- }
-
- /* The zone has no valid section */
- zone->zone_start_pfn = 0;
- zone->spanned_pages = 0;
zone_span_writeunlock(zone);
}
@@ -469,8 +471,20 @@
unsigned long start_pfn,
unsigned long nr_pages)
{
+ const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long flags;
+ unsigned long pfn, cur_nr_pages, flags;
+
+ /* Poison struct pages because they are now uninitialized again. */
+ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+ cond_resched();
+
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages =
+ min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+ page_init_poison(pfn_to_page(pfn),
+ sizeof(struct page) * cur_nr_pages);
+ }
#ifdef CONFIG_ZONE_DEVICE
/*
@@ -496,7 +510,7 @@
unsigned long map_offset,
struct vmem_altmap *altmap)
{
- struct mem_section *ms = __nr_to_section(pfn_to_section_nr(pfn));
+ struct mem_section *ms = __pfn_to_section(pfn);
if (WARN_ON_ONCE(!valid_section(ms)))
return;
@@ -518,25 +532,21 @@
void __remove_pages(unsigned long pfn, unsigned long nr_pages,
struct vmem_altmap *altmap)
{
+ const unsigned long end_pfn = pfn + nr_pages;
+ unsigned long cur_nr_pages;
unsigned long map_offset = 0;
- unsigned long nr, start_sec, end_sec;
map_offset = vmem_altmap_offset(altmap);
if (check_pfn_span(pfn, nr_pages, "remove"))
return;
- start_sec = pfn_to_section_nr(pfn);
- end_sec = pfn_to_section_nr(pfn + nr_pages - 1);
- for (nr = start_sec; nr <= end_sec; nr++) {
- unsigned long pfns;
-
+ for (; pfn < end_pfn; pfn += cur_nr_pages) {
cond_resched();
- pfns = min(nr_pages, PAGES_PER_SECTION
- - (pfn & ~PAGE_SECTION_MASK));
- __remove_section(pfn, pfns, map_offset, altmap);
- pfn += pfns;
- nr_pages -= pfns;
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages = min(end_pfn - pfn,
+ SECTION_ALIGN_UP(pfn + 1) - pfn);
+ __remove_section(pfn, cur_nr_pages, map_offset, altmap);
map_offset = 0;
}
}
@@ -579,24 +589,7 @@
}
EXPORT_SYMBOL_GPL(restore_online_page_callback);
-void __online_page_set_limits(struct page *page)
-{
-}
-EXPORT_SYMBOL_GPL(__online_page_set_limits);
-
-void __online_page_increment_counters(struct page *page)
-{
- adjust_managed_page_count(page, 1);
-}
-EXPORT_SYMBOL_GPL(__online_page_increment_counters);
-
-void __online_page_free(struct page *page)
-{
- __free_reserved_page(page);
-}
-EXPORT_SYMBOL_GPL(__online_page_free);
-
-static void generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
/*
* Freeing the page with debug_pagealloc enabled will try to unmap it,
@@ -612,32 +605,24 @@
totalhigh_pages_add(1UL << order);
#endif
}
+EXPORT_SYMBOL_GPL(generic_online_page);
-static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
- void *arg)
+static void online_pages_range(unsigned long start_pfn, unsigned long nr_pages)
{
const unsigned long end_pfn = start_pfn + nr_pages;
unsigned long pfn;
- int order;
/*
- * Online the pages. The callback might decide to keep some pages
- * PG_reserved (to add them to the buddy later), but we still account
- * them as being online/belonging to this zone ("present").
+ * Online the pages in MAX_ORDER - 1 aligned chunks. The callback might
+ * decide to not expose all pages to the buddy (e.g., expose them
+ * later). We account all pages as being online and belonging to this
+ * zone ("present").
*/
- for (pfn = start_pfn; pfn < end_pfn; pfn += 1ul << order) {
- order = min(MAX_ORDER - 1, get_order(PFN_PHYS(end_pfn - pfn)));
- /* __free_pages_core() wants pfns to be aligned to the order */
- if (WARN_ON_ONCE(!IS_ALIGNED(pfn, 1ul << order)))
- order = 0;
- (*online_page_callback)(pfn_to_page(pfn), order);
- }
+ for (pfn = start_pfn; pfn < end_pfn; pfn += MAX_ORDER_NR_PAGES)
+ (*online_page_callback)(pfn_to_page(pfn), MAX_ORDER - 1);
/* mark all involved sections as online */
online_mem_sections(start_pfn, end_pfn);
-
- *(unsigned long *)arg += nr_pages;
- return 0;
}
/* check which state of node_states will be changed when online memory */
@@ -698,9 +683,14 @@
* Associate the pfn range with the given zone, initializing the memmaps
* and resizing the pgdat/zone data to span the added pages. After this
* call, all affected pages are PG_reserved.
+ *
+ * All aligned pageblocks are initialized to the specified migratetype
+ * (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
+ * zone stats (e.g., nr_isolate_pageblock) are touched.
*/
void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
- unsigned long nr_pages, struct vmem_altmap *altmap)
+ unsigned long nr_pages,
+ struct vmem_altmap *altmap, int migratetype)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int nid = pgdat->node_id;
@@ -724,8 +714,8 @@
* expects the zone spans the pfn range. All the pages in the range
* are reserved so nobody should be touching them so we should be safe
*/
- memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMINIT_HOTPLUG, altmap);
+ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, 0,
+ MEMINIT_HOTPLUG, altmap, migratetype);
set_zone_contiguous(zone);
}
@@ -787,30 +777,25 @@
return default_zone_for_pfn(nid, start_pfn, nr_pages);
}
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+ int online_type, int nid)
{
unsigned long flags;
- unsigned long onlined_pages = 0;
struct zone *zone;
int need_zonelists_rebuild = 0;
- int nid;
int ret;
struct memory_notify arg;
- struct memory_block *mem;
+
+ /* We can only online full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
mem_hotplug_begin();
- /*
- * We can't use pfn_to_nid() because nid might be stored in struct page
- * which is not yet initialized. Instead, we find nid from memory block.
- */
- mem = find_memory_block(__pfn_to_section(pfn));
- nid = mem->nid;
- put_device(&mem->dev);
-
/* associate pfn range with the zone */
zone = zone_for_pfn_range(online_type, nid, pfn, nr_pages);
- move_pfn_range_to_zone(zone, pfn, nr_pages, NULL);
+ move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_ISOLATE);
arg.start_pfn = pfn;
arg.nr_pages = nr_pages;
@@ -822,6 +807,14 @@
goto failed_addition;
/*
+ * Fixup the number of isolated pageblocks before marking the sections
+ * onlining, such that undo_isolate_page_range() works correctly.
+ */
+ spin_lock_irqsave(&zone->lock, flags);
+ zone->nr_isolate_pageblock += nr_pages / pageblock_nr_pages;
+ spin_unlock_irqrestore(&zone->lock, flags);
+
+ /*
* If this zone is not populated, then it is not in zonelist.
* This means the page allocator ignores this zone.
* So, zonelist must be updated after online.
@@ -831,36 +824,34 @@
setup_zone_pageset(zone);
}
- ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
- online_pages_range);
- if (ret) {
- /* not a single memory resource was applicable */
- if (need_zonelists_rebuild)
- zone_pcp_reset(zone);
- goto failed_addition;
- }
-
- zone->present_pages += onlined_pages;
+ online_pages_range(pfn, nr_pages);
+ zone->present_pages += nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages += onlined_pages;
+ zone->zone_pgdat->node_present_pages += nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
- shuffle_zone(zone);
-
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
+ zone_pcp_update(zone);
+
+ /* Basic onlining is complete, allow allocation of onlined pages. */
+ undo_isolate_page_range(pfn, pfn + nr_pages, MIGRATE_MOVABLE);
+
+ /*
+ * Freshly onlined pages aren't shuffled (e.g., all pages are placed to
+ * the tail of the freelist when undoing isolation). Shuffle the whole
+ * zone to make sure the just onlined pages are properly distributed
+ * across the whole freelist - to create an initial shuffle.
+ */
+ shuffle_zone(zone);
init_per_zone_wmark_min();
kswapd_run(nid);
kcompactd_run(nid);
- vm_total_pages = nr_free_pagecache_pages();
-
writeback_set_ratelimit();
memory_notify(MEM_ONLINE, &arg);
@@ -889,10 +880,9 @@
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
{
struct pglist_data *pgdat;
- unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
if (!pgdat) {
@@ -906,13 +896,13 @@
} else {
int cpu;
/*
- * Reset the nr_zones, order and classzone_idx before reuse.
- * Note that kswapd will init kswapd_classzone_idx properly
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
* when it starts in the near future.
*/
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -922,9 +912,8 @@
}
/* we can use NODE_DATA(nid) from here */
-
pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
+ pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
@@ -959,7 +948,6 @@
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- * @start: start addr of the node
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
*
@@ -968,7 +956,7 @@
* 0 -> the node is already online
* -ENOMEM -> the node could not be allocated
*/
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
{
pg_data_t *pgdat;
int ret = 1;
@@ -976,7 +964,7 @@
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid, start);
+ pgdat = hotadd_new_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -1000,7 +988,7 @@
int ret;
mem_hotplug_begin();
- ret = __try_online_node(nid, 0, true);
+ ret = __try_online_node(nid, true);
mem_hotplug_done();
return ret;
}
@@ -1020,6 +1008,7 @@
static int online_memory_block(struct memory_block *mem, void *arg)
{
+ mem->online_type = memhp_default_online_type;
return device_online(&mem->dev);
}
@@ -1029,9 +1018,9 @@
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res)
+int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
- struct mhp_restrictions restrictions = {};
+ struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
u64 start, size;
bool new_node = false;
int ret;
@@ -1043,23 +1032,23 @@
if (ret)
return ret;
+ if (!node_possible(nid)) {
+ WARN(1, "node %d was absent from the node_possible_map\n", nid);
+ return -EINVAL;
+ }
+
mem_hotplug_begin();
- /*
- * Add new range to memblock so that when hotadd_new_pgdat() is called
- * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
- * this new range and calculate total pages correctly. The range will
- * be removed at hot-remove time.
- */
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_add_node(start, size, nid);
- ret = __try_online_node(nid, start, false);
+ ret = __try_online_node(nid, false);
if (ret < 0)
goto error;
new_node = ret;
/* call arch's memory hotadd */
- ret = arch_add_memory(nid, start, size, &restrictions);
+ ret = arch_add_memory(nid, start, size, &params);
if (ret < 0)
goto error;
@@ -1082,18 +1071,25 @@
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
- MEMINIT_HOTPLUG);
- BUG_ON(ret);
+ link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
/* create new memmap entry */
- firmware_map_add_hotplug(start, start + size, "System RAM");
+ if (!strcmp(res->name, "System RAM"))
+ firmware_map_add_hotplug(start, start + size, "System RAM");
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
+ /*
+ * In case we're allowed to merge the resource, flag it and trigger
+ * merging now that adding succeeded.
+ */
+ if (mhp_flags & MEMHP_MERGE_RESOURCE)
+ merge_system_ram_resource(res);
+
/* online pages if requested */
- if (memhp_auto_online)
+ if (memhp_default_online_type != MMOP_OFFLINE)
walk_memory_blocks(start, size, NULL, online_memory_block);
return ret;
@@ -1101,123 +1097,99 @@
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
- memblock_remove(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_remove(start, size);
mem_hotplug_done();
return ret;
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size)
+int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
- res = register_memory_resource(start, size);
+ res = register_memory_resource(start, size, "System RAM");
if (IS_ERR(res))
return PTR_ERR(res);
- ret = add_memory_resource(nid, res);
+ ret = add_memory_resource(nid, res, mhp_flags);
if (ret < 0)
release_memory_resource(res);
return ret;
}
-int add_memory(int nid, u64 start, u64 size)
+int add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
int rc;
lock_device_hotplug();
- rc = __add_memory(nid, start, size);
+ rc = __add_memory(nid, start, size, mhp_flags);
unlock_device_hotplug();
return rc;
}
EXPORT_SYMBOL_GPL(add_memory);
+/*
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ * used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ * can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_SYSRAM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
+ */
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name, mhp_t mhp_flags)
+{
+ struct resource *res;
+ int rc;
+
+ if (!resource_name ||
+ strstr(resource_name, "System RAM (") != resource_name ||
+ resource_name[strlen(resource_name) - 1] != ')')
+ return -EINVAL;
+
+ lock_device_hotplug();
+
+ res = register_memory_resource(start, size, resource_name);
+ if (IS_ERR(res)) {
+ rc = PTR_ERR(res);
+ goto out_unlock;
+ }
+
+ rc = add_memory_resource(nid, res, mhp_flags);
+ if (rc < 0)
+ release_memory_resource(res);
+
+out_unlock:
+ unlock_device_hotplug();
+ return rc;
+}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+
#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Confirm all pages in a range [start, end) belong to the same zone (skipping
+ * memory holes). When true, return the zone.
*/
-static inline int pageblock_free(struct page *page)
-{
- return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
-
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
-
- /* Ensure the starting page is pageblock-aligned */
- BUG_ON(pfn & (pageblock_nr_pages - 1));
-
- /* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page)) {
- int order;
- /* be careful. we don't have locks, page_order can be changed.*/
- order = page_order(page);
- if ((order < MAX_ORDER) && (order >= pageblock_order))
- return pfn + (1 << order);
- }
-
- return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- struct zone *zone;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, SKIP_HWPOISON);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long end_pfn, pfn;
-
- end_pfn = min(start_pfn + nr_pages,
- zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
- /* Check the starting page of each pageblock within the range */
- for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
- if (!is_pageblock_removable_nolock(pfn))
- return false;
- cond_resched();
- }
-
- /* All pageblocks in the memory block are likely to be hot-removable */
- return true;
-}
-
-/*
- * Confirm all pages in a range [start, end) belong to the same zone.
- * When true, return its valid [start, end).
- */
-int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn,
- unsigned long *valid_start, unsigned long *valid_end)
+struct zone *test_pages_in_a_zone(unsigned long start_pfn,
+ unsigned long end_pfn)
{
unsigned long pfn, sec_end_pfn;
- unsigned long start, end;
struct zone *zone = NULL;
struct page *page;
int i;
@@ -1238,33 +1210,30 @@
continue;
/* Check if we got outside of the zone */
if (zone && !zone_spans_pfn(zone, pfn + i))
- return 0;
+ return NULL;
page = pfn_to_page(pfn + i);
if (zone && page_zone(page) != zone)
- return 0;
- if (!zone)
- start = pfn + i;
+ return NULL;
zone = page_zone(page);
- end = pfn + MAX_ORDER_NR_PAGES;
}
}
- if (zone) {
- *valid_start = start;
- *valid_end = min(end, end_pfn);
- return 1;
- } else {
- return 0;
- }
+ return zone;
}
/*
* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). We scan pfn because it's much
- * easier than scanning over linked list. This function returns the pfn
- * of the first found movable page if it's found, otherwise 0.
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ * 0 in case a movable page is found and movable_pfn was updated.
+ * -ENOENT in case no movable page was found.
+ * -EBUSY in case a definitely unmovable page was found.
*/
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static int scan_movable_pages(unsigned long start, unsigned long end,
+ unsigned long *movable_pfn)
{
unsigned long pfn;
@@ -1276,43 +1245,38 @@
continue;
page = pfn_to_page(pfn);
if (PageLRU(page))
- return pfn;
+ goto found;
if (__PageMovable(page))
- return pfn;
+ goto found;
+
+ /*
+ * PageOffline() pages that are not marked __PageMovable() and
+ * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+ * definitely unmovable. If their reference count would be 0,
+ * they could at least be skipped when offlining memory.
+ */
+ if (PageOffline(page) && page_count(page))
+ return -EBUSY;
if (!PageHuge(page))
continue;
head = compound_head(page);
if (page_huge_active(head))
- return pfn;
+ goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
+ return -ENOENT;
+found:
+ *movable_pfn = pfn;
return 0;
}
-static struct page *new_node_page(struct page *page, unsigned long private)
-{
- int nid = page_to_nid(page);
- nodemask_t nmask = node_states[N_MEMORY];
-
- /*
- * try to allocate from a different node but reuse this node if there
- * are no other online nodes to be used (e.g. we are offlining a part
- * of the only existing node)
- */
- node_clear(nid, nmask);
- if (nodes_empty(nmask))
- node_set(nid, nmask);
-
- return new_page_nodemask(page, nid, &nmask);
-}
-
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
- struct page *page;
+ struct page *page, *head;
int ret = 0;
LIST_HEAD(source);
@@ -1320,15 +1284,14 @@
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
+ head = compound_head(page);
if (PageHuge(page)) {
- struct page *head = compound_head(page);
pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
- pfn = page_to_pfn(compound_head(page))
- + hpage_nr_pages(page) - 1;
+ pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
/*
* HWPoison pages have elevated reference counts so the migration would
@@ -1341,7 +1304,7 @@
if (WARN_ON(PageLRU(page)))
isolate_lru_page(page);
if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_IGNORE_MLOCK);
continue;
}
@@ -1359,7 +1322,7 @@
list_add_tail(&page->lru, &source);
if (!__PageMovable(page))
inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_cache(page));
+ page_is_file_lru(page));
} else {
pr_warn("failed to isolate pfn %lx\n", pfn);
@@ -1368,9 +1331,28 @@
put_page(page);
}
if (!list_empty(&source)) {
- /* Allocate a new page from the nearest neighbor node */
- ret = migrate_pages(&source, new_node_page, NULL, 0,
- MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
+ nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
+
+ /*
+ * We have checked that migration range is on a single zone so
+ * we can use the nid of the first page to all the others.
+ */
+ mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+
+ /*
+ * try to allocate from a different node but reuse this node
+ * if there are no other online nodes to be used (e.g. we are
+ * offlining a part of the only existing node)
+ */
+ node_clear(mtc.nid, nmask);
+ if (nodes_empty(nmask))
+ node_set(mtc.nid, nmask);
+ ret = migrate_pages(&source, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG);
if (ret) {
list_for_each_entry(page, &source, lru) {
pr_warn("migrating pfn %lx failed ret:%d ",
@@ -1384,36 +1366,9 @@
return ret;
}
-/*
- * remove from free_area[] and mark all as Reserved.
- */
-static int
-offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
- void *data)
-{
- unsigned long *offlined_pages = (unsigned long *)data;
-
- *offlined_pages += __offline_isolated_pages(start, start + nr_pages);
- return 0;
-}
-
-/*
- * Check all pages in range, recoreded as memory resource, are isolated.
- */
-static int
-check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
- void *data)
-{
- return test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
-}
-
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
-#else
- pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
@@ -1485,42 +1440,66 @@
node_clear_state(node, N_MEMORY);
}
-static int __ref __offline_pages(unsigned long start_pfn,
- unsigned long end_pfn)
+static int count_system_ram_pages_cb(unsigned long start_pfn,
+ unsigned long nr_pages, void *data)
{
- unsigned long pfn, nr_pages;
- unsigned long offlined_pages = 0;
- int ret, node, nr_isolate_pageblock;
+ unsigned long *nr_system_ram_pages = data;
+
+ *nr_system_ram_pages += nr_pages;
+ return 0;
+}
+
+int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
+{
+ const unsigned long end_pfn = start_pfn + nr_pages;
+ unsigned long pfn, system_ram_pages = 0;
unsigned long flags;
- unsigned long valid_start, valid_end;
struct zone *zone;
struct memory_notify arg;
+ int ret, node;
char *reason;
+ /* We can only offline full sections (e.g., SECTION_IS_ONLINE) */
+ if (WARN_ON_ONCE(!nr_pages ||
+ !IS_ALIGNED(start_pfn | nr_pages, PAGES_PER_SECTION)))
+ return -EINVAL;
+
mem_hotplug_begin();
+ /*
+ * Don't allow to offline memory blocks that contain holes.
+ * Consequently, memory blocks with holes can never get onlined
+ * via the hotplug path - online_pages() - as hotplugged memory has
+ * no holes. This way, we e.g., don't have to worry about marking
+ * memory holes PG_reserved, don't need pfn_valid() checks, and can
+ * avoid using walk_system_ram_range() later.
+ */
+ walk_system_ram_range(start_pfn, nr_pages, &system_ram_pages,
+ count_system_ram_pages_cb);
+ if (system_ram_pages != nr_pages) {
+ ret = -EINVAL;
+ reason = "memory holes";
+ goto failed_removal;
+ }
+
/* This makes hotplug much easier...and readable.
we assume this for now. .*/
- if (!test_pages_in_a_zone(start_pfn, end_pfn, &valid_start,
- &valid_end)) {
+ zone = test_pages_in_a_zone(start_pfn, end_pfn);
+ if (!zone) {
ret = -EINVAL;
reason = "multizone range";
goto failed_removal;
}
-
- zone = page_zone(pfn_to_page(valid_start));
node = zone_to_nid(zone);
- nr_pages = end_pfn - start_pfn;
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
- SKIP_HWPOISON | REPORT_FAILURE);
- if (ret < 0) {
+ MEMORY_OFFLINE | REPORT_FAILURE);
+ if (ret) {
reason = "failure to isolate range";
goto failed_removal;
}
- nr_isolate_pageblock = ret;
arg.start_pfn = start_pfn;
arg.nr_pages = nr_pages;
@@ -1534,7 +1513,8 @@
}
do {
- for (pfn = start_pfn; pfn;) {
+ pfn = start_pfn;
+ do {
if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
@@ -1544,14 +1524,19 @@
cond_resched();
lru_add_drain_all();
- pfn = scan_movable_pages(pfn, end_pfn);
- if (pfn) {
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
+ if (!ret) {
/*
* TODO: fatal migration failures should bail
* out
*/
do_migrate_range(pfn, end_pfn);
}
+ } while (!ret);
+
+ if (ret != -ENOENT) {
+ reason = "unmovable page";
+ goto failed_removal_isolated;
}
/*
@@ -1564,9 +1549,7 @@
reason = "failure to dissolve huge pages";
goto failed_removal_isolated;
}
- /* check again */
- ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- NULL, check_pages_isolated_cb);
+
/*
* per-cpu pages are drained in start_isolate_page_range, but if
* there are still pages that are not free, make sure that we
@@ -1579,30 +1562,30 @@
* because has_unmovable_pages explicitly checks for
* PageBuddy on freed pages on other zones.
*/
+ ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
if (ret)
drain_all_pages(zone);
} while (ret);
- /* Ok, all of our target is isolated.
- We cannot do rollback at this point. */
- walk_system_ram_range(start_pfn, end_pfn - start_pfn,
- &offlined_pages, offline_isolated_pages_cb);
- pr_info("Offlined Pages %ld\n", offlined_pages);
+ /* Mark all sections offline and remove free pages from the buddy. */
+ __offline_isolated_pages(start_pfn, end_pfn);
+ pr_info("Offlined Pages %ld\n", nr_pages);
+
/*
- * Onlining will reset pagetype flags and makes migrate type
- * MOVABLE, so just need to decrease the number of isolated
- * pageblocks zone counter here.
+ * The memory sections are marked offline, and the pageblock flags
+ * effectively stale; nobody should be touching them. Fixup the number
+ * of isolated pageblocks, memory onlining will properly revert this.
*/
spin_lock_irqsave(&zone->lock, flags);
- zone->nr_isolate_pageblock -= nr_isolate_pageblock;
+ zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
/* removal success */
- adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages);
- zone->present_pages -= offlined_pages;
+ adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
+ zone->present_pages -= nr_pages;
pgdat_resize_lock(zone->zone_pgdat, &flags);
- zone->zone_pgdat->node_present_pages -= offlined_pages;
+ zone->zone_pgdat->node_present_pages -= nr_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
init_per_zone_wmark_min();
@@ -1619,7 +1602,6 @@
kcompactd_stop(node);
}
- vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
@@ -1640,11 +1622,6 @@
return ret;
}
-int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
-{
- return __offline_pages(start_pfn, start_pfn + nr_pages);
-}
-
static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1733,26 +1710,6 @@
}
EXPORT_SYMBOL(try_offline_node);
-static void __release_memory_resource(resource_size_t start,
- resource_size_t size)
-{
- int ret;
-
- /*
- * When removing memory in the same granularity as it was added,
- * this function never fails. It might only fail if resources
- * have to be adjusted or split. We'll ignore the error, as
- * removing of memory cannot fail.
- */
- ret = release_mem_region_adjustable(&iomem_resource, start, size);
- if (ret) {
- resource_size_t endres = start + size - 1;
-
- pr_warn("Unable to release resource <%pa-%pa> (%d)\n",
- &start, &endres, ret);
- }
-}
-
static int __ref try_remove_memory(int nid, u64 start, u64 size)
{
int rc = 0;
@@ -1770,8 +1727,6 @@
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
- memblock_free(start, size);
- memblock_remove(start, size);
/*
* Memory block device removal under the device_hotplug_lock is
@@ -1782,7 +1737,13 @@
mem_hotplug_begin();
arch_remove_memory(nid, start, size, NULL);
- __release_memory_resource(start, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ memblock_free(start, size);
+ memblock_remove(start, size);
+ }
+
+ release_mem_region_adjustable(start, size);
try_offline_node(nid);
@@ -1826,4 +1787,41 @@
return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
+
+/*
+ * Try to offline and remove a memory block. Might take a long time to
+ * finish in case memory is still in use. Primarily useful for memory devices
+ * that logically unplugged all memory (so it's no longer in use) and want to
+ * offline + remove the memory block.
+ */
+int offline_and_remove_memory(int nid, u64 start, u64 size)
+{
+ struct memory_block *mem;
+ int rc = -EINVAL;
+
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+ size != memory_block_size_bytes())
+ return rc;
+
+ lock_device_hotplug();
+ mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
+ if (mem)
+ rc = device_offline(&mem->dev);
+ /* Ignore if the device is already offline. */
+ if (rc > 0)
+ rc = 0;
+
+ /*
+ * In case we succeeded to offline the memory block, remove it.
+ * This cannot fail as it cannot get onlined in the meantime.
+ */
+ if (!rc) {
+ rc = try_remove_memory(nid, start, size);
+ WARN_ON_ONCE(rc);
+ }
+ unlock_device_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */