/*
 * Debug helper to dump the current kernel pagetables of the system
 * so that we can see what the various memory ranges are set to.
 *
 * (C) Copyright 2008 Intel Corporation
 *
 * Author: Arjan van de Ven <arjan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#include <linux/debugfs.h>
#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/highmem.h>

#include <asm/pgtable.h>

/*
 * The dumper groups pagetable entries of the same type into one, and for
 * that it needs to keep some state when walking, and flush this state
 * when a "break" in the continuity is found.
 */
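/*
 * level/current_prot/effective_prot describe the run of identical entries
 * being accumulated, start_address/current_address delimit it, marker walks
 * the address_markers[] table, and lines counts the rows printed for the
 * current marker (capped by its max_lines).  check_wx/wx_pages drive the
 * W+X audit.
 */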
struct pg_state {
	int level;
	pgprot_t current_prot;
	pgprotval_t effective_prot;
	unsigned long start_address;
	unsigned long current_address;
	const struct addr_marker *marker;
	unsigned long lines;
	bool to_dmesg;
	bool check_wx;
	unsigned long wx_pages;
};

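/*
 * Each marker names the address-space region that starts at start_address;
 * a max_lines of 0 means the region's output is never truncated.
 */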
struct addr_marker {
	unsigned long start_address;
	const char *name;
	unsigned long max_lines;
};

/* Address space marker hints */

#ifdef CONFIG_X86_64

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
#ifdef CONFIG_KASAN
	KASAN_SHADOW_START_NR,
	KASAN_SHADOW_END_NR,
#endif
	CPU_ENTRY_AREA_NR,
#if defined(CONFIG_MODIFY_LDT_SYSCALL) && !defined(CONFIG_X86_5LEVEL)
	LDT_NR,
#endif
#ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
#endif
#ifdef CONFIG_EFI
	EFI_END_NR,
#endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { (1UL << 63),	"Kernel Space" },
	[LOW_KERNEL_NR]		= { 0UL,		"Low Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMEMMAP_START_NR]	= { 0UL,		"Vmemmap" },
#ifdef CONFIG_KASAN
	/*
	 * These fields get initialized with the (dynamic)
	 * KASAN_SHADOW_{START,END} values in pt_dump_init().
	 */
	[KASAN_SHADOW_START_NR]	= { 0UL,		"KASAN shadow" },
	[KASAN_SHADOW_END_NR]	= { 0UL,		"KASAN shadow end" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,		"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { CPU_ENTRY_AREA_BASE, "CPU entry Area" },
#ifdef CONFIG_X86_ESPFIX64
	[ESPFIX_START_NR]	= { ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
#endif
#ifdef CONFIG_EFI
	[EFI_END_NR]		= { EFI_VA_END,		"EFI Runtime Services" },
#endif
	[HIGH_KERNEL_NR]	= { __START_KERNEL_map,	"High Kernel Mapping" },
	[MODULES_VADDR_NR]	= { MODULES_VADDR,	"Modules" },
	[MODULES_END_NR]	= { MODULES_END,	"End Modules" },
	[FIXADDR_START_NR]	= { FIXADDR_START,	"Fixmap Area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#define INIT_PGD	((pgd_t *) &init_top_pgt)

#else /* CONFIG_X86_64 */

enum address_markers_idx {
	USER_SPACE_NR = 0,
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
#ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	LDT_NR,
#endif
	CPU_ENTRY_AREA_NR,
	FIXADDR_START_NR,
	END_OF_SPACE_NR,
};

static struct addr_marker address_markers[] = {
	[USER_SPACE_NR]		= { 0,			"User Space" },
	[KERNEL_SPACE_NR]	= { PAGE_OFFSET,	"Kernel Mapping" },
	[VMALLOC_START_NR]	= { 0UL,		"vmalloc() Area" },
	[VMALLOC_END_NR]	= { 0UL,		"vmalloc() End" },
#ifdef CONFIG_HIGHMEM
	[PKMAP_BASE_NR]		= { 0UL,		"Persistent kmap() Area" },
#endif
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	[LDT_NR]		= { 0UL,		"LDT remap" },
#endif
	[CPU_ENTRY_AREA_NR]	= { 0UL,		"CPU entry area" },
	[FIXADDR_START_NR]	= { 0UL,		"Fixmap area" },
	[END_OF_SPACE_NR]	= { -1,			NULL }
};

#define INIT_PGD	(swapper_pg_dir)

#endif /* !CONFIG_X86_64 */

/* Multipliers for offsets within the PTEs */
#define PTE_LEVEL_MULT (PAGE_SIZE)
#define PMD_LEVEL_MULT (PTRS_PER_PTE * PTE_LEVEL_MULT)
#define PUD_LEVEL_MULT (PTRS_PER_PMD * PMD_LEVEL_MULT)
#define P4D_LEVEL_MULT (PTRS_PER_PUD * PUD_LEVEL_MULT)
#define PGD_LEVEL_MULT (PTRS_PER_P4D * P4D_LEVEL_MULT)
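/*
 * A worked example, assuming x86-64 with 4 KiB pages and 512 entries per
 * table: PTE_LEVEL_MULT = 4 KiB, PMD_LEVEL_MULT = 2 MiB, PUD_LEVEL_MULT =
 * 1 GiB, P4D_LEVEL_MULT = 512 GiB.  Each walker below computes the virtual
 * address of entry i as P + i * <level>_MULT: the base plus the index
 * times the span covered by one entry at that level.
 */

/*
 * Output goes either to dmesg or to a seq_file.  m may be NULL (the W+X
 * checks pass no seq_file), in which case non-dmesg output is dropped,
 * hence the if (m) guard below.
 */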

#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_INFO fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg)						\
		printk(KERN_CONT fmt, ##args);			\
	else							\
		if (m)						\
			seq_printf(m, fmt, ##args);		\
})

/*
 * Print a readable form of a pgprot_t to the seq_file
 */
static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
{
	pgprotval_t pr = pgprot_val(prot);
	static const char * const level_name[] =
		{ "cr3", "pgd", "p4d", "pud", "pmd", "pte" };

	if (!(pr & _PAGE_PRESENT)) {
		/* Not present */
		pt_dump_cont_printf(m, dmsg, "                              ");
	} else {
		if (pr & _PAGE_USER)
			pt_dump_cont_printf(m, dmsg, "USR ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_RW)
			pt_dump_cont_printf(m, dmsg, "RW ");
		else
			pt_dump_cont_printf(m, dmsg, "ro ");
		if (pr & _PAGE_PWT)
			pt_dump_cont_printf(m, dmsg, "PWT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_PCD)
			pt_dump_cont_printf(m, dmsg, "PCD ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");

		/* Bit 7 means PSE on levels above the PTE, but PAT on the PTE (level 5) */
		if (level <= 4 && pr & _PAGE_PSE)
			pt_dump_cont_printf(m, dmsg, "PSE ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if ((level == 5 && pr & _PAGE_PAT) ||
		    ((level == 4 || level == 3) && pr & _PAGE_PAT_LARGE))
			pt_dump_cont_printf(m, dmsg, "PAT ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_GLOBAL)
			pt_dump_cont_printf(m, dmsg, "GLB ");
		else
			pt_dump_cont_printf(m, dmsg, "    ");
		if (pr & _PAGE_NX)
			pt_dump_cont_printf(m, dmsg, "NX ");
		else
			pt_dump_cont_printf(m, dmsg, "x  ");
	}
	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
}

/*
 * On 64 bits, sign-extend the 48-bit (57-bit with 5-level paging)
 * virtual address to 64 bits.
 */
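/*
 * E.g. with 4-level paging (__VIRTUAL_MASK_SHIFT == 47), PGD index 256
 * corresponds to 256 * PGD_LEVEL_MULT == 0x0000800000000000, which is
 * sign-extended here to the canonical 0xffff800000000000.
 */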
static unsigned long normalize_addr(unsigned long u)
{
	int shift;

	if (!IS_ENABLED(CONFIG_X86_64))
		return u;

	shift = 64 - (__VIRTUAL_MASK_SHIFT + 1);
	return (signed long)(u << shift) >> shift;
}

/*
 * This function gets called on a break in a continuous series of
 * page table entries; the next one differs, so we need to print what
 * we have collected so far.
 */
static void note_page(struct seq_file *m, struct pg_state *st,
		      pgprot_t new_prot, pgprotval_t new_eff, int level)
{
	pgprotval_t prot, cur, eff;
	static const char units[] = "BKMGTPE";

	/*
	 * If we have a "break" in the series, we need to flush the state
	 * that we have now. A "break" is a change of permissions, a change
	 * of level, or crossing an address-space marker.
	 */
	prot = pgprot_val(new_prot);
	cur = pgprot_val(st->current_prot);
	eff = st->effective_prot;

	if (!st->level) {
		/* First entry */
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
		st->marker = address_markers;
		st->lines = 0;
		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
				   st->marker->name);
	} else if (prot != cur || new_eff != eff || level != st->level ||
		   st->current_address >= st->marker[1].start_address) {
		const char *unit = units;
		unsigned long delta;
		int width = sizeof(unsigned long) * 2;

		if (st->check_wx && (eff & _PAGE_RW) && !(eff & _PAGE_NX)) {
			WARN_ONCE(1,
				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
				  (void *)st->start_address,
				  (void *)st->start_address);
			st->wx_pages += (st->current_address -
					 st->start_address) / PAGE_SIZE;
		}

		/*
		 * Now print the actual finished series
		 */
		if (!st->marker->max_lines ||
		    st->lines < st->marker->max_lines) {
			pt_dump_seq_printf(m, st->to_dmesg,
					   "0x%0*lx-0x%0*lx   ",
					   width, st->start_address,
					   width, st->current_address);

			delta = st->current_address - st->start_address;
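			/*
			 * Scale delta down while it stays a multiple of
			 * 1024 and a larger unit is still available.
			 */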
			while (!(delta & 1023) && unit[1]) {
				delta >>= 10;
				unit++;
			}
			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
					    delta, *unit);
			printk_prot(m, st->current_prot, st->level,
				    st->to_dmesg);
		}
		st->lines++;

		/*
		 * We print markers for special areas of the address space,
		 * such as the start of vmalloc space, to help interpretation.
		 */
		if (st->current_address >= st->marker[1].start_address) {
			if (st->marker->max_lines &&
			    st->lines > st->marker->max_lines) {
				unsigned long nskip =
					st->lines - st->marker->max_lines;
				pt_dump_seq_printf(m, st->to_dmesg,
						   "... %lu entr%s skipped ... \n",
						   nskip,
						   nskip == 1 ? "y" : "ies");
			}
			st->marker++;
			st->lines = 0;
			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
					   st->marker->name);
		}

		st->start_address = st->current_address;
		st->current_prot = new_prot;
		st->effective_prot = new_eff;
		st->level = level;
	}
}

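/*
 * A mapping is only user-accessible/writable if every level on the walk
 * grants _PAGE_USER/_PAGE_RW, while _PAGE_NX at any single level makes it
 * non-executable: hence AND for USER/RW and OR for NX.
 */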
static inline pgprotval_t effective_prot(pgprotval_t prot1, pgprotval_t prot2)
{
	return (prot1 & prot2 & (_PAGE_USER | _PAGE_RW)) |
	       ((prot1 | prot2) & _PAGE_NX);
}

static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pte_t *pte;
	pgprotval_t prot, eff;

	for (i = 0; i < PTRS_PER_PTE; i++) {
		st->current_address = normalize_addr(P + i * PTE_LEVEL_MULT);
		pte = pte_offset_map(&addr, st->current_address);
		prot = pte_flags(*pte);
		eff = effective_prot(eff_in, prot);
		note_page(m, st, __pgprot(prot), eff, 5);
		pte_unmap(pte);
	}
}

#ifdef CONFIG_KASAN

/*
 * This is an optimization for the KASAN=y case. Since all KASAN page
 * tables eventually point to the kasan_zero_page, we can call note_page()
 * right away without walking through the lower-level page tables. This
 * saves us dozens of seconds (minutes for a 5-level config) when checking
 * for W+X mappings or reading the kernel_page_tables debugfs file.
 */
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	if (__pa(pt) == __pa(kasan_zero_pmd) ||
	    (pgtable_l5_enabled() && __pa(pt) == __pa(kasan_zero_p4d)) ||
	    __pa(pt) == __pa(kasan_zero_pud)) {
		pgprotval_t prot = pte_flags(kasan_zero_pte[0]);

		note_page(m, st, __pgprot(prot), 0, 5);
		return true;
	}
	return false;
}
#else
static inline bool kasan_page_table(struct seq_file *m, struct pg_state *st,
				    void *pt)
{
	return false;
}
#endif

#if PTRS_PER_PMD > 1

static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pmd_t *start, *pmd_start;
	pgprotval_t prot, eff;

	pmd_start = start = (pmd_t *)pud_page_vaddr(addr);
	for (i = 0; i < PTRS_PER_PMD; i++) {
		st->current_address = normalize_addr(P + i * PMD_LEVEL_MULT);
		if (!pmd_none(*start)) {
			prot = pmd_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pmd_large(*start) || !pmd_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 4);
			} else if (!kasan_page_table(m, st, pmd_start)) {
				walk_pte_level(m, st, *start, eff,
					       P + i * PMD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 4);
		start++;
	}
}

#else
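/*
 * With 2-level paging (PTRS_PER_PMD == 1) the PMD is folded away:
 * treat the PUD entry as a PMD and drop straight to the PTE walk.
 */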
#define walk_pmd_level(m,s,a,e,p) walk_pte_level(m,s,__pmd(pud_val(a)),e,p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif

#if PTRS_PER_PUD > 1

static void walk_pud_level(struct seq_file *m, struct pg_state *st, p4d_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	pud_t *start, *pud_start;
	pgprotval_t prot, eff;

	pud_start = start = (pud_t *)p4d_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_PUD; i++) {
		st->current_address = normalize_addr(P + i * PUD_LEVEL_MULT);
		if (!pud_none(*start)) {
			prot = pud_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (pud_large(*start) || !pud_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 3);
			} else if (!kasan_page_table(m, st, pud_start)) {
				walk_pmd_level(m, st, *start, eff,
					       P + i * PUD_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 3);

		start++;
	}
}

#else
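/* The same folding applies when there is no distinct PUD level. */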
#define walk_pud_level(m,s,a,e,p) walk_pmd_level(m,s,__pud(p4d_val(a)),e,p)
#define p4d_large(a) pud_large(__pud(p4d_val(a)))
#define p4d_none(a)  pud_none(__pud(p4d_val(a)))
#endif

static void walk_p4d_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
			   pgprotval_t eff_in, unsigned long P)
{
	int i;
	p4d_t *start, *p4d_start;
	pgprotval_t prot, eff;

	if (PTRS_PER_P4D == 1)
		return walk_pud_level(m, st, __p4d(pgd_val(addr)), eff_in, P);

	p4d_start = start = (p4d_t *)pgd_page_vaddr(addr);

	for (i = 0; i < PTRS_PER_P4D; i++) {
		st->current_address = normalize_addr(P + i * P4D_LEVEL_MULT);
		if (!p4d_none(*start)) {
			prot = p4d_flags(*start);
			eff = effective_prot(eff_in, prot);
			if (p4d_large(*start) || !p4d_present(*start)) {
				note_page(m, st, __pgprot(prot), eff, 2);
			} else if (!kasan_page_table(m, st, p4d_start)) {
				walk_pud_level(m, st, *start, eff,
					       P + i * P4D_LEVEL_MULT);
			}
		} else
			note_page(m, st, __pgprot(0), 0, 2);

		start++;
	}
}

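/*
 * When 5-level paging is disabled at runtime the PGD is effectively the
 * P4D, so the "large"/"none" checks are redirected to the p4d helpers.
 * The self-reference is safe: the preprocessor does not re-expand a macro
 * inside its own definition, so pgd_large()/pgd_none() there resolve to
 * the underlying helpers.
 */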
#define pgd_large(a) (pgtable_l5_enabled() ? pgd_large(a) : p4d_large(__p4d(pgd_val(a))))
#define pgd_none(a)  (pgtable_l5_enabled() ? pgd_none(a) : p4d_none(__p4d(pgd_val(a))))

static inline bool is_hypervisor_range(int idx)
{
#ifdef CONFIG_X86_64
	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor.
	 */
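	/*
	 * With 4-level paging that is 16 PGD slots of 512 GiB each, i.e.
	 * the 8 TiB directly below the direct mapping at __PAGE_OFFSET.
	 */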
	return (idx >= pgd_index(__PAGE_OFFSET) - 16) &&
	       (idx < pgd_index(__PAGE_OFFSET));
#else
	return false;
#endif
}

static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx, bool dmesg)
{
	pgd_t *start = INIT_PGD;
	pgprotval_t prot, eff;
	int i;
	struct pg_state st = {};

	if (pgd) {
		start = pgd;
		st.to_dmesg = dmesg;
	}

	st.check_wx = checkwx;
	if (checkwx)
		st.wx_pages = 0;

	for (i = 0; i < PTRS_PER_PGD; i++) {
		st.current_address = normalize_addr(i * PGD_LEVEL_MULT);
		if (!pgd_none(*start) && !is_hypervisor_range(i)) {
			prot = pgd_flags(*start);
#ifdef CONFIG_X86_PAE
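			/*
			 * The 32-bit PAE top level (the PDPTE) has no
			 * USER/RW bits, so start from the most permissive
			 * value and let the lower levels restrict it.
			 */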
			eff = _PAGE_USER | _PAGE_RW;
#else
			eff = prot;
#endif
			if (pgd_large(*start) || !pgd_present(*start)) {
				note_page(m, &st, __pgprot(prot), eff, 1);
			} else {
				walk_p4d_level(m, &st, *start, eff,
					       i * PGD_LEVEL_MULT);
			}
		} else
			note_page(m, &st, __pgprot(0), 0, 1);

		cond_resched();
		start++;
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0, 0);
	if (!checkwx)
		return;
	if (st.wx_pages)
		pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
			st.wx_pages);
	else
		pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
}

void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false, true);
}

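/*
 * With page-table isolation (PTI) each mm carries a second, user-space
 * copy of the PGD; the debugfs interface can dump either half.
 */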
void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	if (user && static_cpu_has(X86_FEATURE_PTI))
		pgd = kernel_to_user_pgdp(pgd);
#endif
	ptdump_walk_pgd_level_core(m, pgd, false, false);
}
EXPORT_SYMBOL_GPL(ptdump_walk_pgd_level_debugfs);

void ptdump_walk_user_pgd_level_checkwx(void)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	pgd_t *pgd = INIT_PGD;

	if (!(__supported_pte_mask & _PAGE_NX) ||
	    !static_cpu_has(X86_FEATURE_PTI))
		return;

	pr_info("x86/mm: Checking user space page tables\n");
	pgd = kernel_to_user_pgdp(pgd);
	ptdump_walk_pgd_level_core(NULL, pgd, true, false);
#endif
}

void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true, false);
}

static int __init pt_dump_init(void)
{
	/*
	 * Various markers are not compile-time constants, so assign them
	 * here.
	 */
#ifdef CONFIG_X86_64
	address_markers[LOW_KERNEL_NR].start_address = PAGE_OFFSET;
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMEMMAP_START_NR].start_address = VMEMMAP_START;
#ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
#endif
#ifdef CONFIG_KASAN
	address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START;
	address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END;
#endif
#endif
#ifdef CONFIG_X86_32
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
	address_markers[CPU_ENTRY_AREA_NR].start_address = CPU_ENTRY_AREA_BASE;
# ifdef CONFIG_MODIFY_LDT_SYSCALL
	address_markers[LDT_NR].start_address = LDT_BASE_ADDR;
# endif
#endif
	return 0;
}
__initcall(pt_dump_init);