Blame - arch/powerpc/mm/numa.c - hafnium/third_party/linux.git

blob: 094a1076fd1fe3601f445048c669d836de6e4ec1 [file] [log] [blame]

David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1	// SPDX-License-Identifier: GPL-2.0-or-later
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	2	/*
				3	* pSeries NUMA support
				4	*
				5	* Copyright (C) 2002 Anton Blanchard <anton@au.ibm.com>, IBM
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	6	*/
				7	#define pr_fmt(fmt) "numa: " fmt
				8
				9	#include <linux/threads.h>
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	10	#include <linux/memblock.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	11	#include <linux/init.h>
				12	#include <linux/mm.h>
				13	#include <linux/mmzone.h>
				14	#include <linux/export.h>
				15	#include <linux/nodemask.h>
				16	#include <linux/cpu.h>
				17	#include <linux/notifier.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	18	#include <linux/of.h>
				19	#include <linux/pfn.h>
				20	#include <linux/cpuset.h>
				21	#include <linux/node.h>
				22	#include <linux/stop_machine.h>
				23	#include <linux/proc_fs.h>
				24	#include <linux/seq_file.h>
				25	#include <linux/uaccess.h>
				26	#include <linux/slab.h>
				27	#include <asm/cputhreads.h>
				28	#include <asm/sparsemem.h>
				29	#include <asm/prom.h>
				30	#include <asm/smp.h>
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	31	#include <asm/topology.h>
				32	#include <asm/firmware.h>
				33	#include <asm/paca.h>
				34	#include <asm/hvcall.h>
				35	#include <asm/setup.h>
				36	#include <asm/vdso.h>
				37	#include <asm/drmem.h>
				38
				39	static int numa_enabled = 1;
				40
				41	static char *cmdline __initdata;
				42
				43	static int numa_debug;
				44	#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }
				45
				46	int numa_cpu_lookup_table[NR_CPUS];
				47	cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
				48	struct pglist_data *node_data[MAX_NUMNODES];
				49
				50	EXPORT_SYMBOL(numa_cpu_lookup_table);
				51	EXPORT_SYMBOL(node_to_cpumask_map);
				52	EXPORT_SYMBOL(node_data);
				53
				54	static int min_common_depth;
				55	static int n_mem_addr_cells, n_mem_size_cells;
				56	static int form1_affinity;
				57
				58	#define MAX_DISTANCE_REF_POINTS 4
				59	static int distance_ref_points_depth;
				60	static const __be32 *distance_ref_points;
				61	static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];
				62
				63	/*
				64	* Allocate node_to_cpumask_map based on number of available nodes
				65	* Requires node_possible_map to be valid.
				66	*
				67	* Note: cpumask_of_node() is not valid until after this is done.
				68	*/
				69	static void __init setup_node_to_cpumask_map(void)
				70	{
				71	unsigned int node;
				72
				73	/* setup nr_node_ids if not done yet */
				74	if (nr_node_ids == MAX_NUMNODES)
				75	setup_nr_node_ids();
				76
				77	/* allocate the map */
				78	for_each_node(node)
				79	alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);
				80
				81	/* cpumask_of_node() will now work */
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	82	dbg("Node to cpumask map for %u nodes\n", nr_node_ids);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	83	}
				84
				85	static int __init fake_numa_create_new_node(unsigned long end_pfn,
				86	unsigned int *nid)
				87	{
				88	unsigned long long mem;
				89	char *p = cmdline;
				90	static unsigned int fake_nid;
				91	static unsigned long long curr_boundary;
				92
				93	/*
				94	* Modify node id, iff we started creating NUMA nodes
				95	* We want to continue from where we left of the last time
				96	*/
				97	if (fake_nid)
				98	*nid = fake_nid;
				99	/*
				100	* In case there are no more arguments to parse, the
				101	* node_id should be the same as the last fake node id
				102	* (we've handled this above).
				103	*/
				104	if (!p)
				105	return 0;
				106
				107	mem = memparse(p, &p);
				108	if (!mem)
				109	return 0;
				110
				111	if (mem < curr_boundary)
				112	return 0;
				113
				114	curr_boundary = mem;
				115
				116	if ((end_pfn << PAGE_SHIFT) > mem) {
				117	/*
				118	* Skip commas and spaces
				119	*/
				120	while (p == ',' \|\| p == ' ' \|\| *p == '\t')
				121	p++;
				122
				123	cmdline = p;
				124	fake_nid++;
				125	*nid = fake_nid;
				126	dbg("created new fake_node with id %d\n", fake_nid);
				127	return 1;
				128	}
				129	return 0;
				130	}
				131
				132	static void reset_numa_cpu_lookup_table(void)
				133	{
				134	unsigned int cpu;
				135
				136	for_each_possible_cpu(cpu)
				137	numa_cpu_lookup_table[cpu] = -1;
				138	}
				139
				140	static void map_cpu_to_node(int cpu, int node)
				141	{
				142	update_numa_cpu_lookup_table(cpu, node);
				143
				144	dbg("adding cpu %d to node %d\n", cpu, node);
				145
				146	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
				147	cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
				148	}
				149
				150	#if defined(CONFIG_HOTPLUG_CPU) \|\| defined(CONFIG_PPC_SPLPAR)
				151	static void unmap_cpu_from_node(unsigned long cpu)
				152	{
				153	int node = numa_cpu_lookup_table[cpu];
				154
				155	dbg("removing cpu %lu from node %d\n", cpu, node);
				156
				157	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
				158	cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
				159	} else {
				160	printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
				161	cpu, node);
				162	}
				163	}
				164	#endif /* CONFIG_HOTPLUG_CPU \|\| CONFIG_PPC_SPLPAR */
				165
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	166	int cpu_distance(__be32 cpu1_assoc, __be32 cpu2_assoc)
				167	{
				168	int dist = 0;
				169
				170	int i, index;
				171
				172	for (i = 0; i < distance_ref_points_depth; i++) {
				173	index = be32_to_cpu(distance_ref_points[i]);
				174	if (cpu1_assoc[index] == cpu2_assoc[index])
				175	break;
				176	dist++;
				177	}
				178
				179	return dist;
				180	}
				181
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	182	/* must hold reference to node during call */
				183	static const __be32 of_get_associativity(struct device_node dev)
				184	{
				185	return of_get_property(dev, "ibm,associativity", NULL);
				186	}
				187
				188	int __node_distance(int a, int b)
				189	{
				190	int i;
				191	int distance = LOCAL_DISTANCE;
				192
				193	if (!form1_affinity)
				194	return ((a == b) ? LOCAL_DISTANCE : REMOTE_DISTANCE);
				195
				196	for (i = 0; i < distance_ref_points_depth; i++) {
				197	if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
				198	break;
				199
				200	/* Double the distance for each NUMA level */
				201	distance *= 2;
				202	}
				203
				204	return distance;
				205	}
				206	EXPORT_SYMBOL(__node_distance);
				207
				208	static void initialize_distance_lookup_table(int nid,
				209	const __be32 *associativity)
				210	{
				211	int i;
				212
				213	if (!form1_affinity)
				214	return;
				215
				216	for (i = 0; i < distance_ref_points_depth; i++) {
				217	const __be32 *entry;
				218
				219	entry = &associativity[be32_to_cpu(distance_ref_points[i]) - 1];
				220	distance_lookup_table[nid][i] = of_read_number(entry, 1);
				221	}
				222	}
				223
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	224	/*
				225	* Returns nid in the range [0..nr_node_ids], or -1 if no useful NUMA
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	226	* info is found.
				227	*/
				228	static int associativity_to_nid(const __be32 *associativity)
				229	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	230	int nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	231
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	232	if (!numa_enabled)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	233	goto out;
				234
				235	if (of_read_number(associativity, 1) >= min_common_depth)
				236	nid = of_read_number(&associativity[min_common_depth], 1);
				237
				238	/* POWER4 LPAR uses 0xffff as invalid node */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	239	if (nid == 0xffff \|\| nid >= nr_node_ids)
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	240	nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	241
				242	if (nid > 0 &&
				243	of_read_number(associativity, 1) >= distance_ref_points_depth) {
				244	/*
				245	* Skip the length field and send start of associativity array
				246	*/
				247	initialize_distance_lookup_table(nid, associativity + 1);
				248	}
				249
				250	out:
				251	return nid;
				252	}
				253
				254	/* Returns the nid associated with the given device tree node,
				255	* or -1 if not found.
				256	*/
				257	static int of_node_to_nid_single(struct device_node *device)
				258	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	259	int nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	260	const __be32 *tmp;
				261
				262	tmp = of_get_associativity(device);
				263	if (tmp)
				264	nid = associativity_to_nid(tmp);
				265	return nid;
				266	}
				267
				268	/* Walk the device tree upwards, looking for an associativity id */
				269	int of_node_to_nid(struct device_node *device)
				270	{
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	271	int nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	272
				273	of_node_get(device);
				274	while (device) {
				275	nid = of_node_to_nid_single(device);
				276	if (nid != -1)
				277	break;
				278
				279	device = of_get_next_parent(device);
				280	}
				281	of_node_put(device);
				282
				283	return nid;
				284	}
				285	EXPORT_SYMBOL(of_node_to_nid);
				286
				287	static int __init find_min_common_depth(void)
				288	{
				289	int depth;
				290	struct device_node *root;
				291
				292	if (firmware_has_feature(FW_FEATURE_OPAL))
				293	root = of_find_node_by_path("/ibm,opal");
				294	else
				295	root = of_find_node_by_path("/rtas");
				296	if (!root)
				297	root = of_find_node_by_path("/");
				298
				299	/*
				300	* This property is a set of 32-bit integers, each representing
				301	* an index into the ibm,associativity nodes.
				302	*
				303	* With form 0 affinity the first integer is for an SMP configuration
				304	* (should be all 0's) and the second is for a normal NUMA
				305	* configuration. We have only one level of NUMA.
				306	*
				307	* With form 1 affinity the first integer is the most significant
				308	* NUMA boundary and the following are progressively less significant
				309	* boundaries. There can be more than one level of NUMA.
				310	*/
				311	distance_ref_points = of_get_property(root,
				312	"ibm,associativity-reference-points",
				313	&distance_ref_points_depth);
				314
				315	if (!distance_ref_points) {
				316	dbg("NUMA: ibm,associativity-reference-points not found.\n");
				317	goto err;
				318	}
				319
				320	distance_ref_points_depth /= sizeof(int);
				321
				322	if (firmware_has_feature(FW_FEATURE_OPAL) \|\|
				323	firmware_has_feature(FW_FEATURE_TYPE1_AFFINITY)) {
				324	dbg("Using form 1 affinity\n");
				325	form1_affinity = 1;
				326	}
				327
				328	if (form1_affinity) {
				329	depth = of_read_number(distance_ref_points, 1);
				330	} else {
				331	if (distance_ref_points_depth < 2) {
				332	printk(KERN_WARNING "NUMA: "
				333	"short ibm,associativity-reference-points\n");
				334	goto err;
				335	}
				336
				337	depth = of_read_number(&distance_ref_points[1], 1);
				338	}
				339
				340	/*
				341	* Warn and cap if the hardware supports more than
				342	* MAX_DISTANCE_REF_POINTS domains.
				343	*/
				344	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
				345	printk(KERN_WARNING "NUMA: distance array capped at "
				346	"%d entries\n", MAX_DISTANCE_REF_POINTS);
				347	distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
				348	}
				349
				350	of_node_put(root);
				351	return depth;
				352
				353	err:
				354	of_node_put(root);
				355	return -1;
				356	}
				357
				358	static void __init get_n_mem_cells(int n_addr_cells, int n_size_cells)
				359	{
				360	struct device_node *memory = NULL;
				361
				362	memory = of_find_node_by_type(memory, "memory");
				363	if (!memory)
				364	panic("numa.c: No memory nodes found!");
				365
				366	*n_addr_cells = of_n_addr_cells(memory);
				367	*n_size_cells = of_n_size_cells(memory);
				368	of_node_put(memory);
				369	}
				370
				371	static unsigned long read_n_cells(int n, const __be32 **buf)
				372	{
				373	unsigned long result = 0;
				374
				375	while (n--) {
				376	result = (result << 32) \| of_read_number(*buf, 1);
				377	(*buf)++;
				378	}
				379	return result;
				380	}
				381
				382	struct assoc_arrays {
				383	u32 n_arrays;
				384	u32 array_sz;
				385	const __be32 *arrays;
				386	};
				387
				388	/*
				389	* Retrieve and validate the list of associativity arrays for drconf
				390	* memory from the ibm,associativity-lookup-arrays property of the
				391	* device tree..
				392	*
				393	* The layout of the ibm,associativity-lookup-arrays property is a number N
				394	* indicating the number of associativity arrays, followed by a number M
				395	* indicating the size of each associativity array, followed by a list
				396	* of N associativity arrays.
				397	*/
				398	static int of_get_assoc_arrays(struct assoc_arrays *aa)
				399	{
				400	struct device_node *memory;
				401	const __be32 *prop;
				402	u32 len;
				403
				404	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
				405	if (!memory)
				406	return -1;
				407
				408	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
				409	if (!prop \|\| len < 2 * sizeof(unsigned int)) {
				410	of_node_put(memory);
				411	return -1;
				412	}
				413
				414	aa->n_arrays = of_read_number(prop++, 1);
				415	aa->array_sz = of_read_number(prop++, 1);
				416
				417	of_node_put(memory);
				418
				419	/* Now that we know the number of arrays and size of each array,
				420	* revalidate the size of the property read in.
				421	*/
				422	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
				423	return -1;
				424
				425	aa->arrays = prop;
				426	return 0;
				427	}
				428
				429	/*
				430	* This is like of_node_to_nid_single() for memory represented in the
				431	* ibm,dynamic-reconfiguration-memory node.
				432	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	433	int of_drconf_to_nid_single(struct drmem_lmb *lmb)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	434	{
				435	struct assoc_arrays aa = { .arrays = NULL };
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	436	int default_nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	437	int nid = default_nid;
				438	int rc, index;
				439
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	440	if ((min_common_depth < 0) \|\| !numa_enabled)
				441	return default_nid;
				442
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	443	rc = of_get_assoc_arrays(&aa);
				444	if (rc)
				445	return default_nid;
				446
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	447	if (min_common_depth <= aa.array_sz &&
				448	!(lmb->flags & DRCONF_MEM_AI_INVALID) && lmb->aa_index < aa.n_arrays) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	449	index = lmb->aa_index * aa.array_sz + min_common_depth - 1;
				450	nid = of_read_number(&aa.arrays[index], 1);
				451
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	452	if (nid == 0xffff \|\| nid >= nr_node_ids)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	453	nid = default_nid;
				454
				455	if (nid > 0) {
				456	index = lmb->aa_index * aa.array_sz;
				457	initialize_distance_lookup_table(nid,
				458	&aa.arrays[index]);
				459	}
				460	}
				461
				462	return nid;
				463	}
				464
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	465	#ifdef CONFIG_PPC_SPLPAR
				466	static int vphn_get_nid(long lcpu)
				467	{
				468	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
				469	long rc, hwid;
				470
				471	/*
				472	* On a shared lpar, device tree will not have node associativity.
				473	* At this time lppaca, or its __old_status field may not be
				474	* updated. Hence kernel cannot detect if its on a shared lpar. So
				475	* request an explicit associativity irrespective of whether the
				476	* lpar is shared or dedicated. Use the device tree property as a
				477	* fallback. cpu_to_phys_id is only valid between
				478	* smp_setup_cpu_maps() and smp_setup_pacas().
				479	*/
				480	if (firmware_has_feature(FW_FEATURE_VPHN)) {
				481	if (cpu_to_phys_id)
				482	hwid = cpu_to_phys_id[lcpu];
				483	else
				484	hwid = get_hard_smp_processor_id(lcpu);
				485
				486	rc = hcall_vphn(hwid, VPHN_FLAG_VCPU, associativity);
				487	if (rc == H_SUCCESS)
				488	return associativity_to_nid(associativity);
				489	}
				490
				491	return NUMA_NO_NODE;
				492	}
				493	#else
				494	static int vphn_get_nid(long unused)
				495	{
				496	return NUMA_NO_NODE;
				497	}
				498	#endif /* CONFIG_PPC_SPLPAR */
				499
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	500	/*
				501	* Figure out to which domain a cpu belongs and stick it there.
				502	* Return the id of the domain used.
				503	*/
				504	static int numa_setup_cpu(unsigned long lcpu)
				505	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	506	struct device_node *cpu;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	507	int fcpu = cpu_first_thread_sibling(lcpu);
				508	int nid = NUMA_NO_NODE;
				509
				510	if (!cpu_present(lcpu)) {
				511	set_cpu_numa_node(lcpu, first_online_node);
				512	return first_online_node;
				513	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	514
				515	/*
				516	* If a valid cpu-to-node mapping is already available, use it
				517	* directly instead of querying the firmware, since it represents
				518	* the most recent mapping notified to us by the platform (eg: VPHN).
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	519	* Since cpu_to_node binding remains the same for all threads in the
				520	* core. If a valid cpu-to-node mapping is already available, for
				521	* the first thread in the core, use it.
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	522	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	523	nid = numa_cpu_lookup_table[fcpu];
				524	if (nid >= 0) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	525	map_cpu_to_node(lcpu, nid);
				526	return nid;
				527	}
				528
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	529	nid = vphn_get_nid(lcpu);
				530	if (nid != NUMA_NO_NODE)
				531	goto out_present;
				532
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	533	cpu = of_get_cpu_node(lcpu, NULL);
				534
				535	if (!cpu) {
				536	WARN_ON(1);
				537	if (cpu_present(lcpu))
				538	goto out_present;
				539	else
				540	goto out;
				541	}
				542
				543	nid = of_node_to_nid_single(cpu);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	544	of_node_put(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	545
				546	out_present:
				547	if (nid < 0 \|\| !node_possible(nid))
				548	nid = first_online_node;
				549
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	550	/*
				551	* Update for the first thread of the core. All threads of a core
				552	* have to be part of the same node. This not only avoids querying
				553	* for every other thread in the core, but always avoids a case
				554	* where virtual node associativity change causes subsequent threads
				555	* of a core to be associated with different nid. However if first
				556	* thread is already online, expect it to have a valid mapping.
				557	*/
				558	if (fcpu != lcpu) {
				559	WARN_ON(cpu_online(fcpu));
				560	map_cpu_to_node(fcpu, nid);
				561	}
				562
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	563	map_cpu_to_node(lcpu, nid);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	564	out:
				565	return nid;
				566	}
				567
				568	static void verify_cpu_node_mapping(int cpu, int node)
				569	{
				570	int base, sibling, i;
				571
				572	/* Verify that all the threads in the core belong to the same node */
				573	base = cpu_first_thread_sibling(cpu);
				574
				575	for (i = 0; i < threads_per_core; i++) {
				576	sibling = base + i;
				577
				578	if (sibling == cpu \|\| cpu_is_offline(sibling))
				579	continue;
				580
				581	if (cpu_to_node(sibling) != node) {
				582	WARN(1, "CPU thread siblings %d and %d don't belong"
				583	" to the same node!\n", cpu, sibling);
				584	break;
				585	}
				586	}
				587	}
				588
				589	/* Must run before sched domains notifier. */
				590	static int ppc_numa_cpu_prepare(unsigned int cpu)
				591	{
				592	int nid;
				593
				594	nid = numa_setup_cpu(cpu);
				595	verify_cpu_node_mapping(cpu, nid);
				596	return 0;
				597	}
				598
				599	static int ppc_numa_cpu_dead(unsigned int cpu)
				600	{
				601	#ifdef CONFIG_HOTPLUG_CPU
				602	unmap_cpu_from_node(cpu);
				603	#endif
				604	return 0;
				605	}
				606
				607	/*
				608	* Check and possibly modify a memory region to enforce the memory limit.
				609	*
				610	* Returns the size the region should have to enforce the memory limit.
				611	* This will either be the original value of size, a truncated value,
				612	* or zero. If the returned value of size is 0 the region should be
				613	* discarded as it lies wholly above the memory limit.
				614	*/
				615	static unsigned long __init numa_enforce_memory_limit(unsigned long start,
				616	unsigned long size)
				617	{
				618	/*
				619	* We use memblock_end_of_DRAM() in here instead of memory_limit because
				620	* we've already adjusted it for the limit and it takes care of
				621	* having memory holes below the limit. Also, in the case of
				622	* iommu_is_off, memory_limit is not set but is implicitly enforced.
				623	*/
				624
				625	if (start + size <= memblock_end_of_DRAM())
				626	return size;
				627
				628	if (start >= memblock_end_of_DRAM())
				629	return 0;
				630
				631	return memblock_end_of_DRAM() - start;
				632	}
				633
				634	/*
				635	* Reads the counter for a given entry in
				636	* linux,drconf-usable-memory property
				637	*/
				638	static inline int __init read_usm_ranges(const __be32 **usm)
				639	{
				640	/*
				641	* For each lmb in ibm,dynamic-memory a corresponding
				642	* entry in linux,drconf-usable-memory property contains
				643	* a counter followed by that many (base, size) duple.
				644	* read the counter from linux,drconf-usable-memory
				645	*/
				646	return read_n_cells(n_mem_size_cells, usm);
				647	}
				648
				649	/*
				650	* Extract NUMA information from the ibm,dynamic-reconfiguration-memory
				651	* node. This assumes n_mem_{addr,size}_cells have been set.
				652	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	653	static int __init numa_setup_drmem_lmb(struct drmem_lmb *lmb,
				654	const __be32 **usm,
				655	void *data)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	656	{
				657	unsigned int ranges, is_kexec_kdump = 0;
				658	unsigned long base, size, sz;
				659	int nid;
				660
				661	/*
				662	* Skip this block if the reserved bit is set in flags (0x80)
				663	* or if the block is not assigned to this partition (0x8)
				664	*/
				665	if ((lmb->flags & DRCONF_MEM_RESERVED)
				666	\|\| !(lmb->flags & DRCONF_MEM_ASSIGNED))
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	667	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	668
				669	if (*usm)
				670	is_kexec_kdump = 1;
				671
				672	base = lmb->base_addr;
				673	size = drmem_lmb_size();
				674	ranges = 1;
				675
				676	if (is_kexec_kdump) {
				677	ranges = read_usm_ranges(usm);
				678	if (!ranges) /* there are no (base, size) duple */
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	679	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	680	}
				681
				682	do {
				683	if (is_kexec_kdump) {
				684	base = read_n_cells(n_mem_addr_cells, usm);
				685	size = read_n_cells(n_mem_size_cells, usm);
				686	}
				687
				688	nid = of_drconf_to_nid_single(lmb);
				689	fake_numa_create_new_node(((base + size) >> PAGE_SHIFT),
				690	&nid);
				691	node_set_online(nid);
				692	sz = numa_enforce_memory_limit(base, size);
				693	if (sz)
				694	memblock_set_node(base, sz, &memblock.memory, nid);
				695	} while (--ranges);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	696
				697	return 0;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	698	}
				699
				700	static int __init parse_numa_properties(void)
				701	{
				702	struct device_node *memory;
				703	int default_nid = 0;
				704	unsigned long i;
				705
				706	if (numa_enabled == 0) {
				707	printk(KERN_WARNING "NUMA disabled by user\n");
				708	return -1;
				709	}
				710
				711	min_common_depth = find_min_common_depth();
				712
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	713	if (min_common_depth < 0) {
				714	/*
				715	* if we fail to parse min_common_depth from device tree
				716	* mark the numa disabled, boot with numa disabled.
				717	*/
				718	numa_enabled = false;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	719	return min_common_depth;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	720	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	721
				722	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);
				723
				724	/*
				725	* Even though we connect cpus to numa domains later in SMP
				726	* init, we need to know the node ids now. This is because
				727	* each node to be onlined must have NODE_DATA etc backing it.
				728	*/
				729	for_each_present_cpu(i) {
				730	struct device_node *cpu;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	731	int nid = vphn_get_nid(i);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	732
				733	/*
				734	* Don't fall back to default_nid yet -- we will plug
				735	* cpus into nodes once the memory scan has discovered
				736	* the topology.
				737	*/
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	738	if (nid == NUMA_NO_NODE) {
				739	cpu = of_get_cpu_node(i, NULL);
				740	BUG_ON(!cpu);
				741	nid = of_node_to_nid_single(cpu);
				742	of_node_put(cpu);
				743	}
				744
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	745	node_set_online(nid);
				746	}
				747
				748	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
				749
				750	for_each_node_by_type(memory, "memory") {
				751	unsigned long start;
				752	unsigned long size;
				753	int nid;
				754	int ranges;
				755	const __be32 *memcell_buf;
				756	unsigned int len;
				757
				758	memcell_buf = of_get_property(memory,
				759	"linux,usable-memory", &len);
				760	if (!memcell_buf \|\| len <= 0)
				761	memcell_buf = of_get_property(memory, "reg", &len);
				762	if (!memcell_buf \|\| len <= 0)
				763	continue;
				764
				765	/* ranges in cell */
				766	ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
				767	new_range:
				768	/* these are order-sensitive, and modify the buffer pointer */
				769	start = read_n_cells(n_mem_addr_cells, &memcell_buf);
				770	size = read_n_cells(n_mem_size_cells, &memcell_buf);
				771
				772	/*
				773	* Assumption: either all memory nodes or none will
				774	* have associativity properties. If none, then
				775	* everything goes to default_nid.
				776	*/
				777	nid = of_node_to_nid_single(memory);
				778	if (nid < 0)
				779	nid = default_nid;
				780
				781	fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
				782	node_set_online(nid);
				783
				784	size = numa_enforce_memory_limit(start, size);
				785	if (size)
				786	memblock_set_node(start, size, &memblock.memory, nid);
				787
				788	if (--ranges)
				789	goto new_range;
				790	}
				791
				792	/*
				793	* Now do the same thing for each MEMBLOCK listed in the
				794	* ibm,dynamic-memory property in the
				795	* ibm,dynamic-reconfiguration-memory node.
				796	*/
				797	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
				798	if (memory) {
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	799	walk_drmem_lmbs(memory, NULL, numa_setup_drmem_lmb);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	800	of_node_put(memory);
				801	}
				802
				803	return 0;
				804	}
				805
				806	static void __init setup_nonnuma(void)
				807	{
				808	unsigned long top_of_ram = memblock_end_of_DRAM();
				809	unsigned long total_ram = memblock_phys_mem_size();
				810	unsigned long start_pfn, end_pfn;
				811	unsigned int nid = 0;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	812	int i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	813
				814	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
				815	top_of_ram, total_ram);
				816	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
				817	(top_of_ram - total_ram) >> 20);
				818
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	819	for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	820	fake_numa_create_new_node(end_pfn, &nid);
				821	memblock_set_node(PFN_PHYS(start_pfn),
				822	PFN_PHYS(end_pfn - start_pfn),
				823	&memblock.memory, nid);
				824	node_set_online(nid);
				825	}
				826	}
				827
				828	void __init dump_numa_cpu_topology(void)
				829	{
				830	unsigned int node;
				831	unsigned int cpu, count;
				832
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	833	if (!numa_enabled)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	834	return;
				835
				836	for_each_online_node(node) {
				837	pr_info("Node %d CPUs:", node);
				838
				839	count = 0;
				840	/*
				841	* If we used a CPU iterator here we would miss printing
				842	* the holes in the cpumap.
				843	*/
				844	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
				845	if (cpumask_test_cpu(cpu,
				846	node_to_cpumask_map[node])) {
				847	if (count == 0)
				848	pr_cont(" %u", cpu);
				849	++count;
				850	} else {
				851	if (count > 1)
				852	pr_cont("-%u", cpu - 1);
				853	count = 0;
				854	}
				855	}
				856
				857	if (count > 1)
				858	pr_cont("-%u", nr_cpu_ids - 1);
				859	pr_cont("\n");
				860	}
				861	}
				862
				863	/* Initialize NODE_DATA for a node on the local memory */
				864	static void __init setup_node_data(int nid, u64 start_pfn, u64 end_pfn)
				865	{
				866	u64 spanned_pages = end_pfn - start_pfn;
				867	const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
				868	u64 nd_pa;
				869	void *nd;
				870	int tnid;
				871
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	872	nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
				873	if (!nd_pa)
				874	panic("Cannot allocate %zu bytes for node %d data\n",
				875	nd_size, nid);
				876
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	877	nd = __va(nd_pa);
				878
				879	/* report and initialize */
				880	pr_info(" NODE_DATA [mem %#010Lx-%#010Lx]\n",
				881	nd_pa, nd_pa + nd_size - 1);
				882	tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
				883	if (tnid != nid)
				884	pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
				885
				886	node_data[nid] = nd;
				887	memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
				888	NODE_DATA(nid)->node_id = nid;
				889	NODE_DATA(nid)->node_start_pfn = start_pfn;
				890	NODE_DATA(nid)->node_spanned_pages = spanned_pages;
				891	}
				892
				893	static void __init find_possible_nodes(void)
				894	{
				895	struct device_node *rtas;
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	896	const __be32 *domains = NULL;
				897	int prop_length, max_nodes;
				898	u32 i;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	899
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	900	if (!numa_enabled)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	901	return;
				902
				903	rtas = of_find_node_by_path("/rtas");
				904	if (!rtas)
				905	return;
				906
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	907	/*
				908	* ibm,current-associativity-domains is a fairly recent property. If
				909	* it doesn't exist, then fallback on ibm,max-associativity-domains.
				910	* Current denotes what the platform can support compared to max
				911	* which denotes what the Hypervisor can support.
				912	*
				913	* If the LPAR is migratable, new nodes might be activated after a LPM,
				914	* so we should consider the max number in that case.
				915	*/
				916	if (!of_get_property(of_root, "ibm,migratable-partition", NULL))
				917	domains = of_get_property(rtas,
				918	"ibm,current-associativity-domains",
				919	&prop_length);
				920	if (!domains) {
				921	domains = of_get_property(rtas, "ibm,max-associativity-domains",
				922	&prop_length);
				923	if (!domains)
				924	goto out;
				925	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	926
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	927	max_nodes = of_read_number(&domains[min_common_depth], 1);
				928	pr_info("Partition configured for %d NUMA nodes.\n", max_nodes);
				929
				930	for (i = 0; i < max_nodes; i++) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	931	if (!node_possible(i))
				932	node_set(i, node_possible_map);
				933	}
				934
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	935	prop_length /= sizeof(int);
				936	if (prop_length > min_common_depth + 2)
				937	coregroup_enabled = 1;
				938
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	939	out:
				940	of_node_put(rtas);
				941	}
				942
				943	void __init mem_topology_setup(void)
				944	{
				945	int cpu;
				946
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	947	/*
				948	* Linux/mm assumes node 0 to be online at boot. However this is not
				949	* true on PowerPC, where node 0 is similar to any other node, it
				950	* could be cpuless, memoryless node. So force node 0 to be offline
				951	* for now. This will prevent cpuless, memoryless node 0 showing up
				952	* unnecessarily as online. If a node has cpus or memory that need
				953	* to be online, then node will anyway be marked online.
				954	*/
				955	node_set_offline(0);
				956
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	957	if (parse_numa_properties())
				958	setup_nonnuma();
				959
				960	/*
				961	* Modify the set of possible NUMA nodes to reflect information
				962	* available about the set of online nodes, and the set of nodes
				963	* that we expect to make use of for this platform's affinity
				964	* calculations.
				965	*/
				966	nodes_and(node_possible_map, node_possible_map, node_online_map);
				967
				968	find_possible_nodes();
				969
				970	setup_node_to_cpumask_map();
				971
				972	reset_numa_cpu_lookup_table();
				973
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	974	for_each_possible_cpu(cpu) {
				975	/*
				976	* Powerpc with CONFIG_NUMA always used to have a node 0,
				977	* even if it was memoryless or cpuless. For all cpus that
				978	* are possible but not present, cpu_to_node() would point
				979	* to node 0. To remove a cpuless, memoryless dummy node,
				980	* powerpc need to make sure all possible but not present
				981	* cpu_to_node are set to a proper node.
				982	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	983	numa_setup_cpu(cpu);
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	984	}
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	985	}
				986
				987	void __init initmem_init(void)
				988	{
				989	int nid;
				990
				991	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
				992	max_pfn = max_low_pfn;
				993
				994	memblock_dump_all();
				995
				996	for_each_online_node(nid) {
				997	unsigned long start_pfn, end_pfn;
				998
				999	get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
				1000	setup_node_data(nid, start_pfn, end_pfn);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1001	}
				1002
				1003	sparse_init();
				1004
				1005	/*
				1006	* We need the numa_cpu_lookup_table to be accurate for all CPUs,
				1007	* even before we online them, so that we can use cpu_to_{node,mem}
				1008	* early in boot, cf. smp_prepare_cpus().
				1009	* _nocalls() + manual invocation is used because cpuhp is not yet
				1010	* initialized for the boot CPU.
				1011	*/
				1012	cpuhp_setup_state_nocalls(CPUHP_POWER_NUMA_PREPARE, "powerpc/numa:prepare",
				1013	ppc_numa_cpu_prepare, ppc_numa_cpu_dead);
				1014	}
				1015
				1016	static int __init early_numa(char *p)
				1017	{
				1018	if (!p)
				1019	return 0;
				1020
				1021	if (strstr(p, "off"))
				1022	numa_enabled = 0;
				1023
				1024	if (strstr(p, "debug"))
				1025	numa_debug = 1;
				1026
				1027	p = strstr(p, "fake=");
				1028	if (p)
				1029	cmdline = p + strlen("fake=");
				1030
				1031	return 0;
				1032	}
				1033	early_param("numa", early_numa);
				1034
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1035	#ifdef CONFIG_MEMORY_HOTPLUG
				1036	/*
				1037	* Find the node associated with a hot added memory section for
				1038	* memory represented in the device tree by the property
				1039	* ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
				1040	*/
				1041	static int hot_add_drconf_scn_to_nid(unsigned long scn_addr)
				1042	{
				1043	struct drmem_lmb *lmb;
				1044	unsigned long lmb_size;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1045	int nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1046
				1047	lmb_size = drmem_lmb_size();
				1048
				1049	for_each_drmem_lmb(lmb) {
				1050	/* skip this block if it is reserved or not assigned to
				1051	* this partition */
				1052	if ((lmb->flags & DRCONF_MEM_RESERVED)
				1053	\|\| !(lmb->flags & DRCONF_MEM_ASSIGNED))
				1054	continue;
				1055
				1056	if ((scn_addr < lmb->base_addr)
				1057	\|\| (scn_addr >= (lmb->base_addr + lmb_size)))
				1058	continue;
				1059
				1060	nid = of_drconf_to_nid_single(lmb);
				1061	break;
				1062	}
				1063
				1064	return nid;
				1065	}
				1066
				1067	/*
				1068	* Find the node associated with a hot added memory section for memory
				1069	* represented in the device tree as a node (i.e. memory@XXXX) for
				1070	* each memblock.
				1071	*/
				1072	static int hot_add_node_scn_to_nid(unsigned long scn_addr)
				1073	{
				1074	struct device_node *memory;
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1075	int nid = NUMA_NO_NODE;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1076
				1077	for_each_node_by_type(memory, "memory") {
				1078	unsigned long start, size;
				1079	int ranges;
				1080	const __be32 *memcell_buf;
				1081	unsigned int len;
				1082
				1083	memcell_buf = of_get_property(memory, "reg", &len);
				1084	if (!memcell_buf \|\| len <= 0)
				1085	continue;
				1086
				1087	/* ranges in cell */
				1088	ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
				1089
				1090	while (ranges--) {
				1091	start = read_n_cells(n_mem_addr_cells, &memcell_buf);
				1092	size = read_n_cells(n_mem_size_cells, &memcell_buf);
				1093
				1094	if ((scn_addr < start) \|\| (scn_addr >= (start + size)))
				1095	continue;
				1096
				1097	nid = of_node_to_nid_single(memory);
				1098	break;
				1099	}
				1100
				1101	if (nid >= 0)
				1102	break;
				1103	}
				1104
				1105	of_node_put(memory);
				1106
				1107	return nid;
				1108	}
				1109
				1110	/*
				1111	* Find the node associated with a hot added memory section. Section
				1112	* corresponds to a SPARSEMEM section, not an MEMBLOCK. It is assumed that
				1113	* sections are fully contained within a single MEMBLOCK.
				1114	*/
				1115	int hot_add_scn_to_nid(unsigned long scn_addr)
				1116	{
				1117	struct device_node *memory = NULL;
				1118	int nid;
				1119
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1120	if (!numa_enabled)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1121	return first_online_node;
				1122
				1123	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
				1124	if (memory) {
				1125	nid = hot_add_drconf_scn_to_nid(scn_addr);
				1126	of_node_put(memory);
				1127	} else {
				1128	nid = hot_add_node_scn_to_nid(scn_addr);
				1129	}
				1130
				1131	if (nid < 0 \|\| !node_possible(nid))
				1132	nid = first_online_node;
				1133
				1134	return nid;
				1135	}
				1136
				1137	static u64 hot_add_drconf_memory_max(void)
				1138	{
				1139	struct device_node *memory = NULL;
				1140	struct device_node *dn = NULL;
				1141	const __be64 *lrdr = NULL;
				1142
				1143	dn = of_find_node_by_path("/rtas");
				1144	if (dn) {
				1145	lrdr = of_get_property(dn, "ibm,lrdr-capacity", NULL);
				1146	of_node_put(dn);
				1147	if (lrdr)
				1148	return be64_to_cpup(lrdr);
				1149	}
				1150
				1151	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
				1152	if (memory) {
				1153	of_node_put(memory);
				1154	return drmem_lmb_memory_max();
				1155	}
				1156	return 0;
				1157	}
				1158
				1159	/*
				1160	* memory_hotplug_max - return max address of memory that may be added
				1161	*
				1162	* This is currently only used on systems that support drconfig memory
				1163	* hotplug.
				1164	*/
				1165	u64 memory_hotplug_max(void)
				1166	{
				1167	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
				1168	}
				1169	#endif /* CONFIG_MEMORY_HOTPLUG */
				1170
				1171	/* Virtual Processor Home Node (VPHN) support */
				1172	#ifdef CONFIG_PPC_SPLPAR
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1173	static int topology_inited;
				1174
				1175	/*
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1176	* Retrieve the new associativity information for a virtual processor's
				1177	* home node.
				1178	*/
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1179	static long vphn_get_associativity(unsigned long cpu,
				1180	__be32 *associativity)
				1181	{
				1182	long rc;
				1183
David Brazdil	0f672f6	2019-12-10 10:32:29 +0000	[diff] [blame]	1184	rc = hcall_vphn(get_hard_smp_processor_id(cpu),
				1185	VPHN_FLAG_VCPU, associativity);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1186
				1187	switch (rc) {
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1188	case H_SUCCESS:
				1189	dbg("VPHN hcall succeeded. Reset polling...\n");
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1190	goto out;
				1191
				1192	case H_FUNCTION:
				1193	pr_err_ratelimited("VPHN unsupported. Disabling polling...\n");
				1194	break;
				1195	case H_HARDWARE:
				1196	pr_err_ratelimited("hcall_vphn() experienced a hardware fault "
				1197	"preventing VPHN. Disabling polling...\n");
				1198	break;
				1199	case H_PARAMETER:
				1200	pr_err_ratelimited("hcall_vphn() was passed an invalid parameter. "
				1201	"Disabling polling...\n");
				1202	break;
				1203	default:
				1204	pr_err_ratelimited("hcall_vphn() returned %ld. Disabling polling...\n"
				1205	, rc);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1206	break;
				1207	}
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1208	out:
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1209	return rc;
				1210	}
				1211
				1212	int find_and_online_cpu_nid(int cpu)
				1213	{
				1214	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
				1215	int new_nid;
				1216
				1217	/* Use associativity from first thread for all siblings */
				1218	if (vphn_get_associativity(cpu, associativity))
				1219	return cpu_to_node(cpu);
				1220
				1221	new_nid = associativity_to_nid(associativity);
				1222	if (new_nid < 0 \|\| !node_possible(new_nid))
				1223	new_nid = first_online_node;
				1224
				1225	if (NODE_DATA(new_nid) == NULL) {
				1226	#ifdef CONFIG_MEMORY_HOTPLUG
				1227	/*
				1228	* Need to ensure that NODE_DATA is initialized for a node from
				1229	* available memory (see memblock_alloc_try_nid). If unable to
				1230	* init the node, then default to nearest node that has memory
				1231	* installed. Skip onlining a node if the subsystems are not
				1232	* yet initialized.
				1233	*/
				1234	if (!topology_inited \|\| try_online_node(new_nid))
				1235	new_nid = first_online_node;
				1236	#else
				1237	/*
				1238	* Default to using the nearest node that has memory installed.
				1239	* Otherwise, it would be necessary to patch the kernel MM code
				1240	* to deal with more memoryless-node error conditions.
				1241	*/
				1242	new_nid = first_online_node;
				1243	#endif
				1244	}
				1245
				1246	pr_debug("%s:%d cpu %d nid %d\n", __FUNCTION__, __LINE__,
				1247	cpu, new_nid);
				1248	return new_nid;
				1249	}
				1250
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1251	int cpu_to_coregroup_id(int cpu)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1252	{
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1253	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
				1254	int index;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1255
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1256	if (cpu < 0 \|\| cpu > nr_cpu_ids)
				1257	return -1;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1258
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1259	if (!coregroup_enabled)
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1260	goto out;
				1261
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1262	if (!firmware_has_feature(FW_FEATURE_VPHN))
				1263	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1264
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1265	if (vphn_get_associativity(cpu, associativity))
				1266	goto out;
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1267
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1268	index = of_read_number(associativity, 1);
				1269	if (index > min_common_depth + 1)
				1270	return of_read_number(&associativity[index - 1], 1);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1271
				1272	out:
Olivier Deprez	157378f	2022-04-04 15:47:50 +0200	[diff] [blame^]	1273	return cpu_to_core_id(cpu);
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1274	}
				1275
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1276	static int topology_update_init(void)
				1277	{
Andrew Scull	b4b6d4a	2019-01-02 15:54:55 +0000	[diff] [blame]	1278	topology_inited = 1;
				1279	return 0;
				1280	}
				1281	device_initcall(topology_update_init);
				1282	#endif /* CONFIG_PPC_SPLPAR */