#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#undef pr_fmt
#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

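/*
 * Per-CPU scratch slot used by the KAISER entry assembly to preserve a
 * register while switching CR3 at entry points where the stack cannot
 * yet be used.  It must live in the user-mapped per-cpu section so it
 * is reachable while still running on the user (shadow) page tables.
 */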
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Walk the kernel page tables by hand to find the physical address
 * backing @vaddr, handling 1GB and 2MB mappings along the way.
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set. Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);

		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
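	/*
	 * Allocate a new page-table page outside the lock, then re-check
	 * under shadow_table_allocation_lock: if another CPU populated
	 * this entry meanwhile, free our page and use theirs.
	 */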
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}

static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
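		/*
		 * If the shadow PTE is already populated, it must match
		 * what we would have installed; warn on any mismatch,
		 * which would mean two conflicting kaiser mappings of the
		 * same virtual address.
		 */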
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
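	/*
	 * Walk only the kernel half of the shadow PGD (the upper 256
	 * entries); the user half is filled in per-process via
	 * kaiser_set_shadow_pgd() as user mappings are created.
	 */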
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

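/*
 * Evaluate the "pti=on/off/auto" and "nopti" boot options.  With "auto"
 * (or no option given), KAISER is disabled on AMD CPUs, which are not
 * vulnerable to Meltdown.  When running as a Xen PV guest it is disabled
 * silently, without the "disabled" message.
 */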
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of crashing before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early. Otherwise
	 * kaiser_pagetable_walk() would find already-initialized PTEs in
	 * the hierarchy and not set the proper permissions, leading to
	 * page-protection-violation faults when, for example, userspace
	 * tries to read the vsyscall page.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

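	/*
	 * Only the DEFINE_PER_CPU_USER_MAPPED portion of each CPU's
	 * per-cpu area is mapped into the shadow tables: it holds the
	 * data the entry/exit code must touch while still on the user
	 * CR3 (e.g. the register backup slot above).
	 */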
	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed on
	 * switches between user and kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
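	/*
	 * The IDT (and the trace/debug copies, when configured) must be
	 * readable with the user CR3 active, since an interrupt or
	 * exception can arrive before the entry code has switched to the
	 * kernel page tables.
	 */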
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
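	/*
	 * Clear the shadow mappings one PGD entry at a time.  The
	 * "_nofree" variant tears down the mappings but leaves the
	 * page-table pages themselves in place, matching the note in
	 * kaiser_init_all_pgds() that they are never freed.
	 */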
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

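/*
 * Called whenever a kernel PGD entry is written (via the set_pgd() /
 * native_set_pgd() hook in this KAISER series): returns the value to
 * store in the kernel PGD and, for userspace entries, also populates
 * the corresponding slot in the shadow PGD.
 */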
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}

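/*
 * Precompute this CPU's user-side CR3 value: the shadow PGD lives at a
 * fixed offset from the kernel PGD, and on PCID-capable hardware the
 * user PCID and the NOFLUSH bit are folded in so the CR3 write on exit
 * to userspace does not flush the TLB unnecessarily.
 */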
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * This per-cpu variable is used by the entry/exit code when
	 * switching the PCID and pgd, and to control TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this CPU will need to flush the USER TLB on return
 * to user.  If the CPU does not have PCID, then the NOFLUSH bit will
 * never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);