#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

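/*
 * Scratch slot for the entry assembly: DEFINE_PER_CPU_USER_MAPPED places
 * it in the user-mapped per-cpu section (mapped into the shadow page
 * tables by kaiser_init()), so it is reachable before the switch to the
 * kernel CR3.
 */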
__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we cannot just use a plain "or"
 * instruction to get their value or'd into CR3. It would take
 * another register. So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);

/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes. No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte(). We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Walk the kernel page tables and return the physical address that
 * @vaddr maps to. Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations. We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set. Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);

		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
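	/*
	 * No pmd page yet: allocate one, then re-check under the lock in
	 * case another CPU installed it first.
	 */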
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
					      new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
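	/*
	 * No pte page yet: allocate one, then re-check under the lock in
	 * case another CPU installed it first.
	 */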
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
					      new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}

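/*
 * Walk the given range of kernel virtual addresses and install matching
 * entries in the shadow page tables, pointing at the same physical pages
 * as the kernel mapping but with the given protection flags.
 */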
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long)__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set. But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
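		/*
		 * Install the shadow PTE, or verify that an existing entry
		 * already matches what we would have installed.
		 */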
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

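/* Same as kaiser_add_user_map(), but takes start/end pointers instead of a size. */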
static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated. This ensures that all processes that get
 * forked have the same entries. This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long)0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

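/*
 * Boot-time helpers: warn, rather than return an error, if an early
 * mapping cannot be added.
 */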
#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

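/*
 * Parse "pti=" and "nopti" on the kernel command line: Kaiser defaults to
 * on, but is silently disabled under Xen PV, and disabled on AMD CPUs
 * unless "pti=on" is given explicitly.
 */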
void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die. But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it. If we BUG_ON() here, we run
 * the risk of crashing before we have usable console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early. Otherwise
	 * kaiser_pagetable_walk() will encounter initialized PTEs in the
	 * hierarchy and not set the proper permissions, leading to
	 * page faults with page-protection violations when trying to read
	 * the vsyscall page, for example.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to and from kernel.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}

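/*
 * Remove the shadow mapping for the given range. The page-table pages
 * backing it are left in place (unmap_pud_range_nofree() does not free
 * them).
 */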
void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}

/*
 * Page table pages are page-aligned. The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

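/*
 * Mirror a userspace pgd entry into the shadow pgd, and return the value
 * to install in the kernel pgd: _PAGE_NX is added (when supported) so the
 * kernel CR3 can never be used to execute userspace code.
 */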
pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd? Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace cannot use it. This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		    is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}

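/*
 * Compute this CPU's user-mode CR3 value: the shadow pgd offset, plus the
 * NOFLUSH bit when PCIDs are in use.
 */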
void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * This value is used by the entry/exit code to switch the pgd
	 * and PCID, and to decide on TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this CPU will need to flush the USER TLB on return to
 * userspace. If the CPU does not have PCID, then the NOFLUSH bit will never
 * have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);