• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Debug helper to dump the current kernel pagetables of the system
3  * so that we can see what the various memory ranges are set to.
4  *
5  * (C) Copyright 2008 Intel Corporation
6  *
7  * Author: Arjan van de Ven <arjan@linux.intel.com>
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License
11  * as published by the Free Software Foundation; version 2
12  * of the License.
13  */
14 
15 #include <linux/debugfs.h>
16 #include <linux/mm.h>
17 #include <linux/module.h>
18 #include <linux/sched.h>
19 #include <linux/seq_file.h>
20 
21 #include <asm/pgtable.h>
22 
23 /*
24  * The dumper groups pagetable entries of the same type into one, and for
25  * that it needs to keep some state when walking, and flush this state
26  * when a "break" in the continuity is found.
27  */
28 struct pg_state {
29 	int level;
30 	pgprot_t current_prot;
31 	unsigned long start_address;
32 	unsigned long current_address;
33 	const struct addr_marker *marker;
34 	unsigned long lines;
35 	bool to_dmesg;
36 	bool check_wx;
37 	unsigned long wx_pages;
38 };
39 
/* A named boundary in the address space, used to annotate the dump. */
struct addr_marker {
	/* first address that belongs to the region */
	unsigned long start_address;
	/* label printed in the "---[ name ]---" header line */
	const char *name;
	/* cap on lines printed for the region; 0 means no cap */
	unsigned long max_lines;
};
45 
/*
 * Indices into address_markers[].
 *
 * Every entry of that table needs a matching enumerator here, in the same
 * order and under the same config guards; otherwise every *_NR constant
 * after the mismatch refers to the wrong marker.
 */
enum address_markers_idx {
	USER_SPACE_NR = 0,
#ifdef CONFIG_X86_64
	KERNEL_SPACE_NR,
	LOW_KERNEL_NR,
	VMALLOC_START_NR,
	VMEMMAP_START_NR,
# ifdef CONFIG_X86_ESPFIX64
	ESPFIX_START_NR,
# endif
# ifdef CONFIG_EFI
	/* matches the "EFI Runtime Services" entry of address_markers[] */
	EFI_END_NR,
# endif
	HIGH_KERNEL_NR,
	MODULES_VADDR_NR,
	MODULES_END_NR,
#else
	KERNEL_SPACE_NR,
	VMALLOC_START_NR,
	VMALLOC_END_NR,
# ifdef CONFIG_HIGHMEM
	PKMAP_BASE_NR,
# endif
	FIXADDR_START_NR,
#endif
};
70 
71 /* Address space markers hints */
72 static struct addr_marker address_markers[] = {
73 	{ 0, "User Space" },
74 #ifdef CONFIG_X86_64
75 	{ 0x8000000000000000UL, "Kernel Space" },
76 	{ PAGE_OFFSET,		"Low Kernel Mapping" },
77 	{ VMALLOC_START,        "vmalloc() Area" },
78 	{ VMEMMAP_START,        "Vmemmap" },
79 # ifdef CONFIG_X86_ESPFIX64
80 	{ ESPFIX_BASE_ADDR,	"ESPfix Area", 16 },
81 # endif
82 # ifdef CONFIG_EFI
83 	{ EFI_VA_END,		"EFI Runtime Services" },
84 # endif
85 	{ __START_KERNEL_map,   "High Kernel Mapping" },
86 	{ MODULES_VADDR,        "Modules" },
87 	{ MODULES_END,          "End Modules" },
88 #else
89 	{ PAGE_OFFSET,          "Kernel Mapping" },
90 	{ 0/* VMALLOC_START */, "vmalloc() Area" },
91 	{ 0/*VMALLOC_END*/,     "vmalloc() End" },
92 # ifdef CONFIG_HIGHMEM
93 	{ 0/*PKMAP_BASE*/,      "Persistent kmap() Area" },
94 # endif
95 	{ 0/*FIXADDR_START*/,   "Fixmap Area" },
96 #endif
97 	{ -1, NULL }		/* End of list */
98 };
99 
/*
 * Amount of virtual address space covered by one entry at each pagetable
 * level; each level is the one below it times that table's entry count.
 */
#define PTE_LEVEL_MULT	(PAGE_SIZE)
#define PMD_LEVEL_MULT	(PTE_LEVEL_MULT * PTRS_PER_PTE)
#define PUD_LEVEL_MULT	(PMD_LEVEL_MULT * PTRS_PER_PMD)
#define PGD_LEVEL_MULT	(PUD_LEVEL_MULT * PTRS_PER_PUD)
105 
/*
 * Print at KERN_INFO to the kernel log when @to_dmesg is set; otherwise
 * print into seq_file @m, and print nothing at all if @m is NULL.
 */
#define pt_dump_seq_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_INFO fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
114 
/*
 * Continue the current output line: KERN_CONT to the kernel log when
 * @to_dmesg is set; otherwise print into seq_file @m, and print nothing
 * at all if @m is NULL.
 */
#define pt_dump_cont_printf(m, to_dmesg, fmt, args...)		\
({								\
	if (to_dmesg) {						\
		printk(KERN_CONT fmt, ##args);			\
	} else if (m) {						\
		seq_printf(m, fmt, ##args);			\
	}							\
})
123 
124 /*
125  * Print a readable form of a pgprot_t to the seq_file
126  */
printk_prot(struct seq_file * m,pgprot_t prot,int level,bool dmsg)127 static void printk_prot(struct seq_file *m, pgprot_t prot, int level, bool dmsg)
128 {
129 	pgprotval_t pr = pgprot_val(prot);
130 	static const char * const level_name[] =
131 		{ "cr3", "pgd", "pud", "pmd", "pte" };
132 
133 	if (!pgprot_val(prot)) {
134 		/* Not present */
135 		pt_dump_cont_printf(m, dmsg, "                              ");
136 	} else {
137 		if (pr & _PAGE_USER)
138 			pt_dump_cont_printf(m, dmsg, "USR ");
139 		else
140 			pt_dump_cont_printf(m, dmsg, "    ");
141 		if (pr & _PAGE_RW)
142 			pt_dump_cont_printf(m, dmsg, "RW ");
143 		else
144 			pt_dump_cont_printf(m, dmsg, "ro ");
145 		if (pr & _PAGE_PWT)
146 			pt_dump_cont_printf(m, dmsg, "PWT ");
147 		else
148 			pt_dump_cont_printf(m, dmsg, "    ");
149 		if (pr & _PAGE_PCD)
150 			pt_dump_cont_printf(m, dmsg, "PCD ");
151 		else
152 			pt_dump_cont_printf(m, dmsg, "    ");
153 
154 		/* Bit 7 has a different meaning on level 3 vs 4 */
155 		if (level <= 3 && pr & _PAGE_PSE)
156 			pt_dump_cont_printf(m, dmsg, "PSE ");
157 		else
158 			pt_dump_cont_printf(m, dmsg, "    ");
159 		if ((level == 4 && pr & _PAGE_PAT) ||
160 		    ((level == 3 || level == 2) && pr & _PAGE_PAT_LARGE))
161 			pt_dump_cont_printf(m, dmsg, "PAT ");
162 		else
163 			pt_dump_cont_printf(m, dmsg, "    ");
164 		if (pr & _PAGE_GLOBAL)
165 			pt_dump_cont_printf(m, dmsg, "GLB ");
166 		else
167 			pt_dump_cont_printf(m, dmsg, "    ");
168 		if (pr & _PAGE_NX)
169 			pt_dump_cont_printf(m, dmsg, "NX ");
170 		else
171 			pt_dump_cont_printf(m, dmsg, "x  ");
172 	}
173 	pt_dump_cont_printf(m, dmsg, "%s\n", level_name[level]);
174 }
175 
/*
 * On 64 bits, turn a 48 bit address into its sign-extended 64 bit form:
 * bit 47 is copied into bits 48-63.  On 32 bits the address is returned
 * as is.
 */
static unsigned long normalize_addr(unsigned long u)
{
#ifdef CONFIG_X86_64
	const unsigned long sign_bit = 1UL << 47;
	const unsigned long upper_bits = ~((sign_bit << 1) - 1);

	if (u & sign_bit)
		return u | upper_bits;

	return u & ~upper_bits;
#else
	return u;
#endif
}
187 
188 /*
189  * This function gets called on a break in a continuous series
190  * of PTE entries; the next one is different so we need to
191  * print what we collected so far.
192  */
note_page(struct seq_file * m,struct pg_state * st,pgprot_t new_prot,int level)193 static void note_page(struct seq_file *m, struct pg_state *st,
194 		      pgprot_t new_prot, int level)
195 {
196 	pgprotval_t prot, cur;
197 	static const char units[] = "BKMGTPE";
198 
199 	/*
200 	 * If we have a "break" in the series, we need to flush the state that
201 	 * we have now. "break" is either changing perms, levels or
202 	 * address space marker.
203 	 */
204 	prot = pgprot_val(new_prot);
205 	cur = pgprot_val(st->current_prot);
206 
207 	if (!st->level) {
208 		/* First entry */
209 		st->current_prot = new_prot;
210 		st->level = level;
211 		st->marker = address_markers;
212 		st->lines = 0;
213 		pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
214 				   st->marker->name);
215 	} else if (prot != cur || level != st->level ||
216 		   st->current_address >= st->marker[1].start_address) {
217 		const char *unit = units;
218 		unsigned long delta;
219 		int width = sizeof(unsigned long) * 2;
220 		pgprotval_t pr = pgprot_val(st->current_prot);
221 
222 		if (st->check_wx && (pr & _PAGE_RW) && !(pr & _PAGE_NX)) {
223 			WARN_ONCE(1,
224 				  "x86/mm: Found insecure W+X mapping at address %p/%pS\n",
225 				  (void *)st->start_address,
226 				  (void *)st->start_address);
227 			st->wx_pages += (st->current_address -
228 					 st->start_address) / PAGE_SIZE;
229 		}
230 
231 		/*
232 		 * Now print the actual finished series
233 		 */
234 		if (!st->marker->max_lines ||
235 		    st->lines < st->marker->max_lines) {
236 			pt_dump_seq_printf(m, st->to_dmesg,
237 					   "0x%0*lx-0x%0*lx   ",
238 					   width, st->start_address,
239 					   width, st->current_address);
240 
241 			delta = st->current_address - st->start_address;
242 			while (!(delta & 1023) && unit[1]) {
243 				delta >>= 10;
244 				unit++;
245 			}
246 			pt_dump_cont_printf(m, st->to_dmesg, "%9lu%c ",
247 					    delta, *unit);
248 			printk_prot(m, st->current_prot, st->level,
249 				    st->to_dmesg);
250 		}
251 		st->lines++;
252 
253 		/*
254 		 * We print markers for special areas of address space,
255 		 * such as the start of vmalloc space etc.
256 		 * This helps in the interpretation.
257 		 */
258 		if (st->current_address >= st->marker[1].start_address) {
259 			if (st->marker->max_lines &&
260 			    st->lines > st->marker->max_lines) {
261 				unsigned long nskip =
262 					st->lines - st->marker->max_lines;
263 				pt_dump_seq_printf(m, st->to_dmesg,
264 						   "... %lu entr%s skipped ... \n",
265 						   nskip,
266 						   nskip == 1 ? "y" : "ies");
267 			}
268 			st->marker++;
269 			st->lines = 0;
270 			pt_dump_seq_printf(m, st->to_dmesg, "---[ %s ]---\n",
271 					   st->marker->name);
272 		}
273 
274 		st->start_address = st->current_address;
275 		st->current_prot = new_prot;
276 		st->level = level;
277 	}
278 }
279 
/*
 * Report every entry of the PTE table that pmd entry @addr points to.
 * @P is the virtual address covered by the first entry of that table.
 */
static void walk_pte_level(struct seq_file *m, struct pg_state *st, pmd_t addr,
							unsigned long P)
{
	pte_t *pte = (pte_t *) pmd_page_vaddr(addr);
	pgprotval_t flags;
	int idx;

	for (idx = 0; idx < PTRS_PER_PTE; idx++, pte++) {
		flags = pte_flags(*pte);
		st->current_address = normalize_addr(P + idx * PTE_LEVEL_MULT);
		note_page(m, st, __pgprot(flags), 4);
	}
}
295 
#if PTRS_PER_PMD > 1

/*
 * Walk the PMD table that pud entry @addr points to.  Empty, large and
 * non-present entries are reported at level 3; any other entry is
 * descended into.  @P is the virtual address covered by the first entry.
 */
static void walk_pmd_level(struct seq_file *m, struct pg_state *st, pud_t addr,
							unsigned long P)
{
	pmd_t *pmd = (pmd_t *) pud_page_vaddr(addr);
	pgprotval_t flags;
	int idx;

	for (idx = 0; idx < PTRS_PER_PMD; idx++, pmd++) {
		st->current_address = normalize_addr(P + idx * PMD_LEVEL_MULT);

		if (pmd_none(*pmd)) {
			note_page(m, st, __pgprot(0), 3);
			continue;
		}

		if (pmd_large(*pmd) || !pmd_present(*pmd)) {
			flags = pmd_flags(*pmd);
			note_page(m, st, __pgprot(flags), 3);
			continue;
		}

		walk_pte_level(m, st, *pmd, P + idx * PMD_LEVEL_MULT);
	}
}

#else
/* Only one PMD per PUD: treat the pud entry as a pmd entry */
#define walk_pmd_level(m,s,a,p) walk_pte_level(m,s,__pmd(pud_val(a)),p)
#define pud_large(a) pmd_large(__pmd(pud_val(a)))
#define pud_none(a)  pmd_none(__pmd(pud_val(a)))
#endif
327 
#if PTRS_PER_PUD > 1

/*
 * Walk the PUD table that pgd entry @addr points to.  Empty, large and
 * non-present entries are reported at level 2; any other entry is
 * descended into.  @P is the virtual address covered by the first entry.
 */
static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr,
							unsigned long P)
{
	pud_t *pud = (pud_t *) pgd_page_vaddr(addr);
	pgprotval_t flags;
	int idx;

	for (idx = 0; idx < PTRS_PER_PUD; idx++, pud++) {
		st->current_address = normalize_addr(P + idx * PUD_LEVEL_MULT);

		if (pud_none(*pud)) {
			note_page(m, st, __pgprot(0), 2);
			continue;
		}

		if (pud_large(*pud) || !pud_present(*pud)) {
			flags = pud_flags(*pud);
			note_page(m, st, __pgprot(flags), 2);
			continue;
		}

		walk_pmd_level(m, st, *pud, P + idx * PUD_LEVEL_MULT);
	}
}

#else
/* Only one PUD per PGD: treat the pgd entry as a pud entry */
#define walk_pud_level(m,s,a,p) walk_pmd_level(m,s,__pud(pgd_val(a)),p)
#define pgd_large(a) pud_large(__pud(pgd_val(a)))
#define pgd_none(a)  pud_none(__pud(pgd_val(a)))
#endif
361 
/*
 * Tell whether top-level pagetable slot @idx belongs to the range reserved
 * for the hypervisor.  Always false on 32 bits.
 */
#ifdef CONFIG_X86_64
static inline bool is_hypervisor_range(int idx)
{
	/* Without paravirtualization nothing is reserved */
	if (!paravirt_enabled())
		return false;

	/*
	 * ffff800000000000 - ffff87ffffffffff is reserved for
	 * the hypervisor: the 16 slots right below __PAGE_OFFSET.
	 */
	if (idx < pgd_index(__PAGE_OFFSET) - 16)
		return false;

	return idx < pgd_index(__PAGE_OFFSET);
}
#else
static inline bool is_hypervisor_range(int idx)
{
	return false;
}
#endif
376 
/*
 * Walk a complete pagetable and feed every entry to note_page().
 *
 * @m:       seq_file to print into; may be NULL, in which case nothing is
 *           written unless the output goes to the kernel log
 * @pgd:     top-level table to walk; when given, output goes to the kernel
 *           log, when NULL the kernel's own top-level table is walked
 * @checkwx: account for writable+executable mappings and log a summary
 */
static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd,
				       bool checkwx)
{
#ifdef CONFIG_X86_64
	pgd_t *entry = (pgd_t *) &init_level4_pgt;
#else
	pgd_t *entry = swapper_pg_dir;
#endif
	/* Zero-initialised, so wx_pages already starts out at 0 */
	struct pg_state st = {};
	pgprotval_t flags;
	int idx;

	if (pgd) {
		entry = pgd;
		st.to_dmesg = true;
	}

	st.check_wx = checkwx;

	for (idx = 0; idx < PTRS_PER_PGD; idx++, entry++) {
		st.current_address = normalize_addr(idx * PGD_LEVEL_MULT);

		if (pgd_none(*entry) || is_hypervisor_range(idx)) {
			note_page(m, &st, __pgprot(0), 1);
		} else if (pgd_large(*entry) || !pgd_present(*entry)) {
			flags = pgd_flags(*entry);
			note_page(m, &st, __pgprot(flags), 1);
		} else {
			walk_pud_level(m, &st, *entry,
				       idx * PGD_LEVEL_MULT);
		}

		cond_resched();
	}

	/* Flush out the last page */
	st.current_address = normalize_addr(PTRS_PER_PGD*PGD_LEVEL_MULT);
	note_page(m, &st, __pgprot(0), 0);

	if (checkwx) {
		if (st.wx_pages)
			pr_info("x86/mm: Checked W+X mappings: FAILED, %lu W+X pages found.\n",
				st.wx_pages);
		else
			pr_info("x86/mm: Checked W+X mappings: passed, no W+X pages found.\n");
	}
}
426 
/*
 * Dump a pagetable without the W+X check: the one rooted at @pgd goes to
 * the kernel log; with a NULL @pgd the kernel's own pagetable is written
 * to seq_file @m.
 */
void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd)
{
	ptdump_walk_pgd_level_core(m, pgd, false);
}
431 
/*
 * Walk the kernel's own pagetable with the W+X check enabled.  No
 * seq_file and no pgd are passed, so only the check's own messages are
 * produced.
 */
void ptdump_walk_pgd_level_checkwx(void)
{
	ptdump_walk_pgd_level_core(NULL, NULL, true);
}
436 
#ifdef CONFIG_X86_PTDUMP
/* seq_file show callback: write the kernel pagetable dump into @m */
static int ptdump_show(struct seq_file *m, void *v)
{
	ptdump_walk_pgd_level(m, NULL);

	return 0;
}

/* open callback: bind ptdump_show() to the file with no private data */
static int ptdump_open(struct inode *inode, struct file *filp)
{
	return single_open(filp, ptdump_show, NULL);
}

/* File operations of the debugfs file that exposes the dump */
static const struct file_operations ptdump_fops = {
	.open		= ptdump_open,
	.release	= single_release,
	.read		= seq_read,
	.llseek		= seq_lseek,
};
#endif
456 
/*
 * Init-time setup: fill in the address markers that cannot be initialised
 * statically and, with CONFIG_X86_PTDUMP, create the "kernel_page_tables"
 * debugfs file.
 *
 * Returns 0 on success, -ENOMEM if the debugfs file was not created.
 */
static int pt_dump_init(void)
{
#ifdef CONFIG_X86_32
	/* Not a compile-time constant on x86-32 */
	address_markers[VMALLOC_START_NR].start_address = VMALLOC_START;
	address_markers[VMALLOC_END_NR].start_address = VMALLOC_END;
# ifdef CONFIG_HIGHMEM
	address_markers[PKMAP_BASE_NR].start_address = PKMAP_BASE;
# endif
	address_markers[FIXADDR_START_NR].start_address = FIXADDR_START;
#endif

#ifdef CONFIG_X86_PTDUMP
	if (!debugfs_create_file("kernel_page_tables", 0600, NULL, NULL,
				 &ptdump_fops))
		return -ENOMEM;
#endif

	return 0;
}
482 
483 __initcall(pt_dump_init);
484 MODULE_LICENSE("GPL");
485 MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
486 MODULE_DESCRIPTION("Kernel debugging helper that dumps pagetables");
487