// SPDX-License-Identifier: GPL-2.0
/*
 * Basic Node interface support
 */

#include <linux/module.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/memory.h>
#include <linux/vmstat.h>
#include <linux/notifier.h>
#include <linux/node.h>
#include <linux/hugetlb.h>
#include <linux/compaction.h>
#include <linux/cpumask.h>
#include <linux/topology.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/swap.h>
#include <linux/slab.h>

static struct bus_type node_subsys = {
	.name = "node",
	.dev_name = "node",
};


static ssize_t node_read_cpumap(struct device *dev, bool list, char *buf)
{
	ssize_t n;
	cpumask_var_t mask;
	struct node *node_dev = to_node(dev);

	/* 2008/04/07: buf currently PAGE_SIZE, need 9 chars per 32 bits. */
	BUILD_BUG_ON((NR_CPUS/32 * 9) > (PAGE_SIZE-1));

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return 0;

	cpumask_and(mask, cpumask_of_node(node_dev->dev.id), cpu_online_mask);
	n = cpumap_print_to_pagebuf(list, buf, mask);
	free_cpumask_var(mask);

	return n;
}

static inline ssize_t node_read_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return node_read_cpumap(dev, false, buf);
}
static inline ssize_t node_read_cpulist(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return node_read_cpumap(dev, true, buf);
}

static DEVICE_ATTR(cpumap, S_IRUGO, node_read_cpumask, NULL);
static DEVICE_ATTR(cpulist, S_IRUGO, node_read_cpulist, NULL);
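
/*
 * Illustrative reads (values invented for this note, not part of the
 * original source): with CPUs 0-3 of a node online, "cpulist" returns
 * a range list such as "0-3", while "cpumap" returns the same set as a
 * hex mask such as "f"; both come from cpumap_print_to_pagebuf() above.
 */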

/**
 * struct node_access_nodes - Access class device to hold user visible
 *			      relationships to other nodes.
 * @dev:	Device for this memory access class
 * @list_node:	List element in the node's access list
 * @access:	The access class rank
 * @hmem_attrs: Heterogeneous memory performance attributes
 */
struct node_access_nodes {
	struct device		dev;
	struct list_head	list_node;
	unsigned		access;
#ifdef CONFIG_HMEM_REPORTING
	struct node_hmem_attrs	hmem_attrs;
#endif
};
#define to_access_nodes(dev) container_of(dev, struct node_access_nodes, dev)
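
/*
 * Sketch of the resulting sysfs layout (illustrative; the paths follow
 * from the "node" subsys name above and the "access%u" device names
 * assigned in node_init_node_access() below):
 *
 *   /sys/devices/system/node/nodeX/accessY/initiators/
 *   /sys/devices/system/node/nodeX/accessY/targets/
 *
 * where each group holds links to the peer nodes of access class Y.
 */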

static struct attribute *node_init_access_node_attrs[] = {
	NULL,
};

static struct attribute *node_targ_access_node_attrs[] = {
	NULL,
};

static const struct attribute_group initiators = {
	.name = "initiators",
	.attrs = node_init_access_node_attrs,
};

static const struct attribute_group targets = {
	.name = "targets",
	.attrs = node_targ_access_node_attrs,
};

static const struct attribute_group *node_access_node_groups[] = {
	&initiators,
	&targets,
	NULL,
};

static void node_remove_accesses(struct node *node)
{
	struct node_access_nodes *c, *cnext;

	list_for_each_entry_safe(c, cnext, &node->access_list, list_node) {
		list_del(&c->list_node);
		device_unregister(&c->dev);
	}
}

static void node_access_release(struct device *dev)
{
	kfree(to_access_nodes(dev));
}

static struct node_access_nodes *node_init_node_access(struct node *node,
						       unsigned access)
{
	struct node_access_nodes *access_node;
	struct device *dev;

	list_for_each_entry(access_node, &node->access_list, list_node)
		if (access_node->access == access)
			return access_node;

	access_node = kzalloc(sizeof(*access_node), GFP_KERNEL);
	if (!access_node)
		return NULL;

	access_node->access = access;
	dev = &access_node->dev;
	dev->parent = &node->dev;
	dev->release = node_access_release;
	dev->groups = node_access_node_groups;
	if (dev_set_name(dev, "access%u", access))
		goto free;

	if (device_register(dev))
		goto free_name;

	pm_runtime_no_callbacks(dev);
	list_add_tail(&access_node->list_node, &node->access_list);
	return access_node;
free_name:
	kfree_const(dev->kobj.name);
free:
	kfree(access_node);
	return NULL;
}

#ifdef CONFIG_HMEM_REPORTING
#define ACCESS_ATTR(name) \
static ssize_t name##_show(struct device *dev, \
			   struct device_attribute *attr, \
			   char *buf) \
{ \
	return sprintf(buf, "%u\n", to_access_nodes(dev)->hmem_attrs.name); \
} \
static DEVICE_ATTR_RO(name);

ACCESS_ATTR(read_bandwidth)
ACCESS_ATTR(read_latency)
ACCESS_ATTR(write_bandwidth)
ACCESS_ATTR(write_latency)
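
/*
 * For reference, a hand expansion (not in the original source):
 * ACCESS_ATTR(read_bandwidth) defines read_bandwidth_show(), which
 * prints to_access_nodes(dev)->hmem_attrs.read_bandwidth, plus the
 * read-only attribute dev_attr_read_bandwidth listed in access_attrs[]
 * below.
 */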

static struct attribute *access_attrs[] = {
	&dev_attr_read_bandwidth.attr,
	&dev_attr_read_latency.attr,
	&dev_attr_write_bandwidth.attr,
	&dev_attr_write_latency.attr,
	NULL,
};

/**
 * node_set_perf_attrs - Set the performance values for a given access class
 * @nid: Node identifier to be set
 * @hmem_attrs: Heterogeneous memory performance attributes
 * @access: The access class for the given attributes
 */
void node_set_perf_attrs(unsigned int nid, struct node_hmem_attrs *hmem_attrs,
			 unsigned access)
{
	struct node_access_nodes *c;
	struct node *node;
	int i;

	if (WARN_ON_ONCE(!node_online(nid)))
		return;

	node = node_devices[nid];
	c = node_init_node_access(node, access);
	if (!c)
		return;

	c->hmem_attrs = *hmem_attrs;
	for (i = 0; access_attrs[i] != NULL; i++) {
		if (sysfs_add_file_to_group(&c->dev.kobj, access_attrs[i],
					    "initiators")) {
			pr_info("failed to add performance attribute to node %d\n",
				nid);
			break;
		}
	}
}
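
/*
 * Illustrative caller (values invented; a firmware table parser such as
 * the ACPI HMAT driver is the expected user of this interface):
 *
 *	struct node_hmem_attrs attrs = {
 *		.read_bandwidth = 1200,
 *		.write_bandwidth = 1200,
 *		.read_latency = 100,
 *		.write_latency = 100,
 *	};
 *	node_set_perf_attrs(nid, &attrs, 0);
 */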

/**
 * struct node_cache_info - Internal tracking for memory node caches
 * @dev:	Device representing the cache level
 * @node:	List element for tracking in the node
 * @cache_attrs: Attributes for this cache level
 */
struct node_cache_info {
	struct device dev;
	struct list_head node;
	struct node_cache_attrs cache_attrs;
};
#define to_cache_info(device) container_of(device, struct node_cache_info, dev)

#define CACHE_ATTR(name, fmt) \
static ssize_t name##_show(struct device *dev, \
			   struct device_attribute *attr, \
			   char *buf) \
{ \
	return sprintf(buf, fmt "\n", to_cache_info(dev)->cache_attrs.name);\
} \
DEVICE_ATTR_RO(name);

CACHE_ATTR(size, "%llu")
CACHE_ATTR(line_size, "%u")
CACHE_ATTR(indexing, "%u")
CACHE_ATTR(write_policy, "%u")

static struct attribute *cache_attrs[] = {
	&dev_attr_indexing.attr,
	&dev_attr_size.attr,
	&dev_attr_line_size.attr,
	&dev_attr_write_policy.attr,
	NULL,
};
ATTRIBUTE_GROUPS(cache);
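
/*
 * Note: ATTRIBUTE_GROUPS(cache) generates cache_groups[] from the
 * cache_attrs[] array above; node_add_cache() below assigns it to
 * dev->groups so each cache index device exposes these four files.
 */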

static void node_cache_release(struct device *dev)
{
	kfree(dev);
}

static void node_cacheinfo_release(struct device *dev)
{
	struct node_cache_info *info = to_cache_info(dev);
	kfree(info);
}

static void node_init_cache_dev(struct node *node)
{
	struct device *dev;

	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
	if (!dev)
		return;

	device_initialize(dev);
	dev->parent = &node->dev;
	dev->release = node_cache_release;
	if (dev_set_name(dev, "memory_side_cache"))
		goto put_device;

	if (device_add(dev))
		goto put_device;

	pm_runtime_no_callbacks(dev);
	node->cache_dev = dev;
	return;
put_device:
	put_device(dev);
}

/**
 * node_add_cache() - add cache attribute to a memory node
 * @nid: Node identifier that has new cache attributes
 * @cache_attrs: Attributes for the cache being added
 */
void node_add_cache(unsigned int nid, struct node_cache_attrs *cache_attrs)
{
	struct node_cache_info *info;
	struct device *dev;
	struct node *node;

	if (!node_online(nid) || !node_devices[nid])
		return;

	node = node_devices[nid];
	list_for_each_entry(info, &node->cache_attrs, node) {
		if (info->cache_attrs.level == cache_attrs->level) {
			dev_warn(&node->dev,
				"attempt to add duplicate cache level:%d\n",
				cache_attrs->level);
			return;
		}
	}

	if (!node->cache_dev)
		node_init_cache_dev(node);
	if (!node->cache_dev)
		return;

	info = kzalloc(sizeof(*info), GFP_KERNEL);
	if (!info)
		return;

	dev = &info->dev;
	device_initialize(dev);
	dev->parent = node->cache_dev;
	dev->release = node_cacheinfo_release;
	dev->groups = cache_groups;
	if (dev_set_name(dev, "index%d", cache_attrs->level))
		goto put_device;

	info->cache_attrs = *cache_attrs;
	if (device_add(dev)) {
		dev_warn(&node->dev, "failed to add cache level:%d\n",
			 cache_attrs->level);
		goto put_device;
	}
	pm_runtime_no_callbacks(dev);
	list_add_tail(&info->node, &node->cache_attrs);
	return;
put_device:
	put_device(dev);
}
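
/*
 * Resulting layout (illustrative): each cache level N added for node X
 * shows up as /sys/devices/system/node/nodeX/memory_side_cache/indexN/
 * carrying the size, line_size, indexing and write_policy attributes.
 */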

static void node_remove_caches(struct node *node)
{
	struct node_cache_info *info, *next;

	if (!node->cache_dev)
		return;

	list_for_each_entry_safe(info, next, &node->cache_attrs, node) {
		list_del(&info->node);
		device_unregister(&info->dev);
	}
	device_unregister(node->cache_dev);
}

static void node_init_caches(unsigned int nid)
{
	INIT_LIST_HEAD(&node_devices[nid]->cache_attrs);
}
#else
static void node_init_caches(unsigned int nid) { }
static void node_remove_caches(struct node *node) { }
#endif

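/*
 * Convert a count of pages to kilobytes: a page is 2^PAGE_SHIFT bytes
 * and a kB is 2^10 bytes, so shifting left by (PAGE_SHIFT - 10) scales
 * pages to kB (with 4 KiB pages, K(x) == x * 4).
 */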
#define K(x) ((x) << (PAGE_SHIFT - 10))
static ssize_t node_read_meminfo(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	int n;
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	struct sysinfo i;
	unsigned long sreclaimable, sunreclaimable;

	si_meminfo_node(&i, nid);
	sreclaimable = node_page_state(pgdat, NR_SLAB_RECLAIMABLE);
	sunreclaimable = node_page_state(pgdat, NR_SLAB_UNRECLAIMABLE);
	n = sprintf(buf,
		    "Node %d MemTotal:       %8lu kB\n"
		    "Node %d MemFree:        %8lu kB\n"
		    "Node %d MemUsed:        %8lu kB\n"
		    "Node %d Active:         %8lu kB\n"
		    "Node %d Inactive:       %8lu kB\n"
		    "Node %d Active(anon):   %8lu kB\n"
		    "Node %d Inactive(anon): %8lu kB\n"
		    "Node %d Active(file):   %8lu kB\n"
		    "Node %d Inactive(file): %8lu kB\n"
		    "Node %d Unevictable:    %8lu kB\n"
		    "Node %d Mlocked:        %8lu kB\n",
		    nid, K(i.totalram),
		    nid, K(i.freeram),
		    nid, K(i.totalram - i.freeram),
		    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON) +
			   node_page_state(pgdat, NR_ACTIVE_FILE)),
		    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON) +
			   node_page_state(pgdat, NR_INACTIVE_FILE)),
		    nid, K(node_page_state(pgdat, NR_ACTIVE_ANON)),
		    nid, K(node_page_state(pgdat, NR_INACTIVE_ANON)),
		    nid, K(node_page_state(pgdat, NR_ACTIVE_FILE)),
		    nid, K(node_page_state(pgdat, NR_INACTIVE_FILE)),
		    nid, K(node_page_state(pgdat, NR_UNEVICTABLE)),
		    nid, K(sum_zone_node_page_state(nid, NR_MLOCK)));

#ifdef CONFIG_HIGHMEM
	n += sprintf(buf + n,
		     "Node %d HighTotal:      %8lu kB\n"
		     "Node %d HighFree:       %8lu kB\n"
		     "Node %d LowTotal:       %8lu kB\n"
		     "Node %d LowFree:        %8lu kB\n",
		     nid, K(i.totalhigh),
		     nid, K(i.freehigh),
		     nid, K(i.totalram - i.totalhigh),
		     nid, K(i.freeram - i.freehigh));
#endif
	n += sprintf(buf + n,
		     "Node %d Dirty:          %8lu kB\n"
		     "Node %d Writeback:      %8lu kB\n"
		     "Node %d FilePages:      %8lu kB\n"
		     "Node %d Mapped:         %8lu kB\n"
		     "Node %d AnonPages:      %8lu kB\n"
		     "Node %d Shmem:          %8lu kB\n"
		     "Node %d KernelStack:    %8lu kB\n"
		     "Node %d PageTables:     %8lu kB\n"
		     "Node %d NFS_Unstable:   %8lu kB\n"
		     "Node %d Bounce:         %8lu kB\n"
		     "Node %d WritebackTmp:   %8lu kB\n"
		     "Node %d KReclaimable:   %8lu kB\n"
		     "Node %d Slab:           %8lu kB\n"
		     "Node %d SReclaimable:   %8lu kB\n"
		     "Node %d SUnreclaim:     %8lu kB\n"
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		     "Node %d AnonHugePages:  %8lu kB\n"
		     "Node %d ShmemHugePages: %8lu kB\n"
		     "Node %d ShmemPmdMapped: %8lu kB\n"
		     "Node %d FileHugePages: %8lu kB\n"
		     "Node %d FilePmdMapped: %8lu kB\n"
#endif
		     ,
		     nid, K(node_page_state(pgdat, NR_FILE_DIRTY)),
		     nid, K(node_page_state(pgdat, NR_WRITEBACK)),
		     nid, K(node_page_state(pgdat, NR_FILE_PAGES)),
		     nid, K(node_page_state(pgdat, NR_FILE_MAPPED)),
		     nid, K(node_page_state(pgdat, NR_ANON_MAPPED)),
		     nid, K(i.sharedram),
		     nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB),
		     nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)),
		     nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
		     nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)),
		     nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
		     nid, K(sreclaimable +
			    node_page_state(pgdat, NR_KERNEL_MISC_RECLAIMABLE)),
		     nid, K(sreclaimable + sunreclaimable),
		     nid, K(sreclaimable),
		     nid, K(sunreclaimable)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
		     ,
		     nid, K(node_page_state(pgdat, NR_ANON_THPS) *
			    HPAGE_PMD_NR),
		     nid, K(node_page_state(pgdat, NR_SHMEM_THPS) *
			    HPAGE_PMD_NR),
		     nid, K(node_page_state(pgdat, NR_SHMEM_PMDMAPPED) *
			    HPAGE_PMD_NR),
		     nid, K(node_page_state(pgdat, NR_FILE_THPS) *
			    HPAGE_PMD_NR),
		     nid, K(node_page_state(pgdat, NR_FILE_PMDMAPPED) *
			    HPAGE_PMD_NR)
#endif
		    );
	n += hugetlb_report_node_meminfo(nid, buf + n);
	return n;
}

#undef K
static DEVICE_ATTR(meminfo, S_IRUGO, node_read_meminfo, NULL);
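
/*
 * The attribute above reads back at /sys/devices/system/node/nodeX/meminfo
 * (illustrative path, derived from the "node" subsys name), a per-node
 * counterpart of /proc/meminfo.
 */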

static ssize_t node_read_numastat(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return sprintf(buf,
		       "numa_hit %lu\n"
		       "numa_miss %lu\n"
		       "numa_foreign %lu\n"
		       "interleave_hit %lu\n"
		       "local_node %lu\n"
		       "other_node %lu\n",
		       sum_zone_numa_state(dev->id, NUMA_HIT),
		       sum_zone_numa_state(dev->id, NUMA_MISS),
		       sum_zone_numa_state(dev->id, NUMA_FOREIGN),
		       sum_zone_numa_state(dev->id, NUMA_INTERLEAVE_HIT),
		       sum_zone_numa_state(dev->id, NUMA_LOCAL),
		       sum_zone_numa_state(dev->id, NUMA_OTHER));
}
static DEVICE_ATTR(numastat, S_IRUGO, node_read_numastat, NULL);

static ssize_t node_read_vmstat(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	struct pglist_data *pgdat = NODE_DATA(nid);
	int i;
	int n = 0;

	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
		n += sprintf(buf+n, "%s %lu\n", vmstat_text[i],
			     sum_zone_node_page_state(nid, i));

#ifdef CONFIG_NUMA
	for (i = 0; i < NR_VM_NUMA_STAT_ITEMS; i++)
		n += sprintf(buf+n, "%s %lu\n",
			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS],
			     sum_zone_numa_state(nid, i));
#endif

	for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
		n += sprintf(buf+n, "%s %lu\n",
			     vmstat_text[i + NR_VM_ZONE_STAT_ITEMS +
			     NR_VM_NUMA_STAT_ITEMS],
			     node_page_state(pgdat, i));

	return n;
}
static DEVICE_ATTR(vmstat, S_IRUGO, node_read_vmstat, NULL);

static ssize_t node_read_distance(struct device *dev,
			struct device_attribute *attr, char *buf)
{
	int nid = dev->id;
	int len = 0;
	int i;

	/*
	 * buf is currently PAGE_SIZE in length and each node needs 4 chars
	 * at the most (distance + space or newline).
	 */
	BUILD_BUG_ON(MAX_NUMNODES * 4 > PAGE_SIZE);

	for_each_online_node(i)
		len += sprintf(buf + len, "%s%d", i ? " " : "", node_distance(nid, i));

	len += sprintf(buf + len, "\n");
	return len;
}
static DEVICE_ATTR(distance, S_IRUGO, node_read_distance, NULL);

static struct attribute *node_dev_attrs[] = {
	&dev_attr_cpumap.attr,
	&dev_attr_cpulist.attr,
	&dev_attr_meminfo.attr,
	&dev_attr_numastat.attr,
	&dev_attr_distance.attr,
	&dev_attr_vmstat.attr,
	NULL
};
ATTRIBUTE_GROUPS(node_dev);

#ifdef CONFIG_HUGETLBFS
/*
 * hugetlbfs per node attributes registration interface:
 * When/if the hugetlb[fs] subsystem initializes [sometime after this module],
 * it will register its per-node attributes for all online nodes with
 * memory. It will also call register_hugetlbfs_with_node(), below, to
 * register its attribute registration functions with this node driver.
 * Once these hooks have been initialized, the node driver will call into
 * the hugetlb module to [un]register attributes for hot-plugged nodes.
 */
static node_registration_func_t __hugetlb_register_node;
static node_registration_func_t __hugetlb_unregister_node;

static inline bool hugetlb_register_node(struct node *node)
{
	if (__hugetlb_register_node &&
			node_state(node->dev.id, N_MEMORY)) {
		__hugetlb_register_node(node);
		return true;
	}
	return false;
}

static inline void hugetlb_unregister_node(struct node *node)
{
	if (__hugetlb_unregister_node)
		__hugetlb_unregister_node(node);
}

void register_hugetlbfs_with_node(node_registration_func_t doregister,
				  node_registration_func_t unregister)
{
	__hugetlb_register_node   = doregister;
	__hugetlb_unregister_node = unregister;
}
#else
static inline void hugetlb_register_node(struct node *node) {}

static inline void hugetlb_unregister_node(struct node *node) {}
#endif

static void node_device_release(struct device *dev)
{
	struct node *node = to_node(dev);

#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HUGETLBFS)
	/*
	 * We schedule the work only when a memory section is
	 * onlined/offlined on this node. When we get here,
	 * all the memory on this node has been offlined,
	 * so no new work will be enqueued for this node.
	 *
	 * The work uses node->node_work, so we must
	 * flush it before freeing the node's memory.
	 */
	flush_work(&node->node_work);
#endif
	kfree(node);
}

/*
 * register_node - Set up a sysfs device for a node.
 * @num: Node number to use when creating the device.
 *
 * Initialize and register the node device.
 */
static int register_node(struct node *node, int num)
{
	int error;

	node->dev.id = num;
	node->dev.bus = &node_subsys;
	node->dev.release = node_device_release;
	node->dev.groups = node_dev_groups;
	error = device_register(&node->dev);

	if (error)
		put_device(&node->dev);
	else {
		hugetlb_register_node(node);

		compaction_register_node(node);
	}
	return error;
}

/**
 * unregister_node - unregister a node device
 * @node: node going away
 *
 * Unregisters a node device @node. All the devices on the node must be
 * unregistered before calling this function.
 */
void unregister_node(struct node *node)
{
	hugetlb_unregister_node(node);		/* no-op, if memoryless node */
	node_remove_accesses(node);
	node_remove_caches(node);
	device_unregister(&node->dev);
}

struct node *node_devices[MAX_NUMNODES];

/*
 * register cpu under node
 */
int register_cpu_under_node(unsigned int cpu, unsigned int nid)
{
	int ret;
	struct device *obj;

	if (!node_online(nid))
		return 0;

	obj = get_cpu_device(cpu);
	if (!obj)
		return 0;

	ret = sysfs_create_link(&node_devices[nid]->dev.kobj,
				&obj->kobj,
				kobject_name(&obj->kobj));
	if (ret)
		return ret;

	return sysfs_create_link(&obj->kobj,
				 &node_devices[nid]->dev.kobj,
				 kobject_name(&node_devices[nid]->dev.kobj));
}
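
/*
 * The pair of links created above gives a bidirectional cpu<->node
 * association in sysfs (illustrative paths):
 *
 *   /sys/devices/system/node/nodeX/cpuY -> .../cpu/cpuY
 *   /sys/devices/system/cpu/cpuY/nodeX -> .../node/nodeX
 */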

/**
 * register_memory_node_under_compute_node - link memory node to its compute
 *					     node for a given access class.
 * @mem_nid:	Memory node number
 * @cpu_nid:	CPU node number
 * @access:	Access class to register
 *
 * Description:
 *	For use with platforms that may have separate memory and compute nodes.
 *	This function will export node relationships linking which memory
 *	initiator nodes can access memory targets at a given ranked access
 *	class.
 */
int register_memory_node_under_compute_node(unsigned int mem_nid,
					    unsigned int cpu_nid,
					    unsigned access)
{
	struct node *init_node, *targ_node;
	struct node_access_nodes *initiator, *target;
	int ret;

	if (!node_online(cpu_nid) || !node_online(mem_nid))
		return -ENODEV;

	init_node = node_devices[cpu_nid];
	targ_node = node_devices[mem_nid];
	initiator = node_init_node_access(init_node, access);
	target = node_init_node_access(targ_node, access);
	if (!initiator || !target)
		return -ENOMEM;

	ret = sysfs_add_link_to_group(&initiator->dev.kobj, "targets",
				      &targ_node->dev.kobj,
				      dev_name(&targ_node->dev));
	if (ret)
		return ret;

	ret = sysfs_add_link_to_group(&target->dev.kobj, "initiators",
				      &init_node->dev.kobj,
				      dev_name(&init_node->dev));
	if (ret)
		goto err;

	return 0;
 err:
	sysfs_remove_link_from_group(&initiator->dev.kobj, "targets",
				     dev_name(&targ_node->dev));
	return ret;
}
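
/*
 * Example of the links created above (illustrative): for mem_nid Z,
 * cpu_nid X and access class Y, this function yields
 *
 *   /sys/devices/system/node/nodeX/accessY/targets/nodeZ
 *   /sys/devices/system/node/nodeZ/accessY/initiators/nodeX
 */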
727
Andrew Scullb4b6d4a2019-01-02 15:54:55 +0000728int unregister_cpu_under_node(unsigned int cpu, unsigned int nid)
729{
730 struct device *obj;
731
732 if (!node_online(nid))
733 return 0;
734
735 obj = get_cpu_device(cpu);
736 if (!obj)
737 return 0;
738
739 sysfs_remove_link(&node_devices[nid]->dev.kobj,
740 kobject_name(&obj->kobj));
741 sysfs_remove_link(&obj->kobj,
742 kobject_name(&node_devices[nid]->dev.kobj));
743
744 return 0;
745}
746
747#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
748static int __ref get_nid_for_pfn(unsigned long pfn)
749{
750 if (!pfn_valid_within(pfn))
751 return -1;
752#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
753 if (system_state < SYSTEM_RUNNING)
754 return early_pfn_to_nid(pfn);
755#endif
756 return pfn_to_nid(pfn);
757}

static int do_register_memory_block_under_node(int nid,
					       struct memory_block *mem_blk)
{
	int ret;

	/*
	 * If this memory block spans multiple nodes, we only indicate
	 * the last processed node.
	 */
	mem_blk->nid = nid;

	ret = sysfs_create_link_nowarn(&node_devices[nid]->dev.kobj,
				       &mem_blk->dev.kobj,
				       kobject_name(&mem_blk->dev.kobj));
	if (ret)
		return ret;

	return sysfs_create_link_nowarn(&mem_blk->dev.kobj,
				&node_devices[nid]->dev.kobj,
				kobject_name(&node_devices[nid]->dev.kobj));
}

/* register memory section under specified node if it spans that node */
static int register_mem_block_under_node_early(struct memory_block *mem_blk,
					       void *arg)
{
	unsigned long memory_block_pfns = memory_block_size_bytes() / PAGE_SIZE;
	unsigned long start_pfn = section_nr_to_pfn(mem_blk->start_section_nr);
	unsigned long end_pfn = start_pfn + memory_block_pfns - 1;
	int nid = *(int *)arg;
	unsigned long pfn;

	for (pfn = start_pfn; pfn <= end_pfn; pfn++) {
		int page_nid;

		/*
		 * The memory block could have several absent sections from
		 * the start. Skip the pfn range of any absent section.
		 */
		if (!pfn_present(pfn)) {
			pfn = round_down(pfn + PAGES_PER_SECTION,
					 PAGES_PER_SECTION) - 1;
			continue;
		}

		/*
		 * We only need to check whether the page belongs to this nid
		 * in the boot case, because nodes' ranges can be interleaved.
		 */
		page_nid = get_nid_for_pfn(pfn);
		if (page_nid < 0)
			continue;
		if (page_nid != nid)
			continue;

		return do_register_memory_block_under_node(nid, mem_blk);
	}
	/* mem section does not span the specified node */
	return 0;
}

/*
 * During hotplug we know that all pages in the memory block belong to the same
 * node.
 */
static int register_mem_block_under_node_hotplug(struct memory_block *mem_blk,
						 void *arg)
{
	int nid = *(int *)arg;

	return do_register_memory_block_under_node(nid, mem_blk);
}

/*
 * Unregister a memory block device under the node it spans. Memory blocks
 * spanning multiple nodes cannot be offlined and can therefore also never
 * be removed.
 */
void unregister_memory_block_under_nodes(struct memory_block *mem_blk)
{
	if (mem_blk->nid == NUMA_NO_NODE)
		return;

	sysfs_remove_link(&node_devices[mem_blk->nid]->dev.kobj,
			  kobject_name(&mem_blk->dev.kobj));
	sysfs_remove_link(&mem_blk->dev.kobj,
			  kobject_name(&node_devices[mem_blk->nid]->dev.kobj));
}

int link_mem_sections(int nid, unsigned long start_pfn, unsigned long end_pfn,
		      enum meminit_context context)
{
	walk_memory_blocks_func_t func;

	if (context == MEMINIT_HOTPLUG)
		func = register_mem_block_under_node_hotplug;
	else
		func = register_mem_block_under_node_early;

	return walk_memory_blocks(PFN_PHYS(start_pfn),
				  PFN_PHYS(end_pfn - start_pfn), (void *)&nid,
				  func);
}

#ifdef CONFIG_HUGETLBFS
/*
 * Handle per node hstate attribute [un]registration on transitions
 * to/from memoryless state.
 */
static void node_hugetlb_work(struct work_struct *work)
{
	struct node *node = container_of(work, struct node, node_work);

	/*
	 * We only get here when a node transitions to/from memoryless state.
	 * We can detect which transition occurred by examining whether the
	 * node has memory now. hugetlb_register_node() already checks this,
	 * so we try to register the attributes. If that fails, the node has
	 * transitioned to memoryless; try to unregister the attributes.
	 */
	if (!hugetlb_register_node(node))
		hugetlb_unregister_node(node);
}

static void init_node_hugetlb_work(int nid)
{
	INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work);
}

static int node_memory_callback(struct notifier_block *self,
				unsigned long action, void *arg)
{
	struct memory_notify *mnb = arg;
	int nid = mnb->status_change_nid;

	switch (action) {
	case MEM_ONLINE:
	case MEM_OFFLINE:
		/*
		 * offload per node hstate [un]registration to a work thread
		 * when transitioning to/from memoryless state.
		 */
		if (nid != NUMA_NO_NODE)
			schedule_work(&node_devices[nid]->node_work);
		break;

	case MEM_GOING_ONLINE:
	case MEM_GOING_OFFLINE:
	case MEM_CANCEL_ONLINE:
	case MEM_CANCEL_OFFLINE:
	default:
		break;
	}

	return NOTIFY_OK;
}
#endif	/* CONFIG_HUGETLBFS */
#endif	/* CONFIG_MEMORY_HOTPLUG_SPARSE */

#if !defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || \
    !defined(CONFIG_HUGETLBFS)
static inline int node_memory_callback(struct notifier_block *self,
				       unsigned long action, void *arg)
{
	return NOTIFY_OK;
}

static void init_node_hugetlb_work(int nid) { }

#endif

int __register_one_node(int nid)
{
	int error;
	int cpu;

	node_devices[nid] = kzalloc(sizeof(struct node), GFP_KERNEL);
	if (!node_devices[nid])
		return -ENOMEM;

	error = register_node(node_devices[nid], nid);

	/* link cpu under this node */
	for_each_present_cpu(cpu) {
		if (cpu_to_node(cpu) == nid)
			register_cpu_under_node(cpu, nid);
	}

	INIT_LIST_HEAD(&node_devices[nid]->access_list);
	/* initialize work queue for memory hot plug */
	init_node_hugetlb_work(nid);
	node_init_caches(nid);

	return error;
}

void unregister_one_node(int nid)
{
	if (!node_devices[nid])
		return;

	unregister_node(node_devices[nid]);
	node_devices[nid] = NULL;
}

/*
 * node states attributes
 */

static ssize_t print_nodes_state(enum node_states state, char *buf)
{
	int n;

	n = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
		      nodemask_pr_args(&node_states[state]));
	buf[n++] = '\n';
	buf[n] = '\0';
	return n;
}

struct node_attr {
	struct device_attribute attr;
	enum node_states state;
};

static ssize_t show_node_state(struct device *dev,
			       struct device_attribute *attr, char *buf)
{
	struct node_attr *na = container_of(attr, struct node_attr, attr);
	return print_nodes_state(na->state, buf);
}

#define _NODE_ATTR(name, state) \
	{ __ATTR(name, 0444, show_node_state, NULL), state }

static struct node_attr node_state_attr[] = {
	[N_POSSIBLE] = _NODE_ATTR(possible, N_POSSIBLE),
	[N_ONLINE] = _NODE_ATTR(online, N_ONLINE),
	[N_NORMAL_MEMORY] = _NODE_ATTR(has_normal_memory, N_NORMAL_MEMORY),
#ifdef CONFIG_HIGHMEM
	[N_HIGH_MEMORY] = _NODE_ATTR(has_high_memory, N_HIGH_MEMORY),
#endif
	[N_MEMORY] = _NODE_ATTR(has_memory, N_MEMORY),
	[N_CPU] = _NODE_ATTR(has_cpu, N_CPU),
};
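
/*
 * These state attributes are registered on the subsystem root via
 * cpu_root_attr_groups below, so they read back as nodemask files such
 * as /sys/devices/system/node/online or /sys/devices/system/node/has_cpu
 * (illustrative paths).
 */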

static struct attribute *node_state_attrs[] = {
	&node_state_attr[N_POSSIBLE].attr.attr,
	&node_state_attr[N_ONLINE].attr.attr,
	&node_state_attr[N_NORMAL_MEMORY].attr.attr,
#ifdef CONFIG_HIGHMEM
	&node_state_attr[N_HIGH_MEMORY].attr.attr,
#endif
	&node_state_attr[N_MEMORY].attr.attr,
	&node_state_attr[N_CPU].attr.attr,
	NULL
};

static struct attribute_group memory_root_attr_group = {
	.attrs = node_state_attrs,
};

static const struct attribute_group *cpu_root_attr_groups[] = {
	&memory_root_attr_group,
	NULL,
};

#define NODE_CALLBACK_PRI	2	/* lower than SLAB */
static int __init register_node_type(void)
{
	int ret;

	BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES);
	BUILD_BUG_ON(ARRAY_SIZE(node_state_attrs)-1 != NR_NODE_STATES);

	ret = subsys_system_register(&node_subsys, cpu_root_attr_groups);
	if (!ret) {
		static struct notifier_block node_memory_callback_nb = {
			.notifier_call = node_memory_callback,
			.priority = NODE_CALLBACK_PRI,
		};
		register_hotmemory_notifier(&node_memory_callback_nb);
	}

	/*
	 * Note: we're not going to unregister the node class if we fail
	 * to register the node state class attribute files.
	 */
	return ret;
}
postcore_initcall(register_node_type);