// SPDX-License-Identifier: GPL-2.0-only
/*
 * Based on arch/arm/mm/fault.c
 *
 * Copyright (C) 1995 Linus Torvalds
 * Copyright (C) 1995-2004 Russell King
 * Copyright (C) 2012 ARM Ltd.
 */

#include <linux/acpi.h>
#include <linux/bitfield.h>
#include <linux/extable.h>
#include <linux/kfence.h>
#include <linux/signal.h>
#include <linux/mm.h>
#include <linux/hardirq.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/page-flags.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
#include <linux/highmem.h>
#include <linux/perf_event.h>
#include <linux/preempt.h>
#include <linux/hugetlb.h>
#include <linux/vm_event_item.h>

#include <asm/acpi.h>
#include <asm/bug.h>
#include <asm/cmpxchg.h>
#include <asm/cpufeature.h>
#include <asm/exception.h>
#include <asm/daifflags.h>
#include <asm/debug-monitors.h>
#include <asm/esr.h>
#include <asm/kprobes.h>
#include <asm/mte.h>
#include <asm/processor.h>
#include <asm/sysreg.h>
#include <asm/system_misc.h>
#include <asm/tlbflush.h>
#include <asm/traps.h>
#include <asm/virt.h>

#include <trace/hooks/fault.h>

struct fault_info {
	int (*fn)(unsigned long far, unsigned long esr,
		  struct pt_regs *regs);
	int sig;
	int code;
	const char *name;
};

static const struct fault_info fault_info[];
static struct fault_info debug_fault_info[];

static inline const struct fault_info *esr_to_fault_info(unsigned long esr)
{
	return fault_info + (esr & ESR_ELx_FSC);
}

static inline const struct fault_info *esr_to_debug_fault_info(unsigned long esr)
{
	return debug_fault_info + DBG_ESR_EVT(esr);
}

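/*
 * Print the data-abort specific ISS fields. When ISV is clear, the
 * syndrome fields (SAS, SSE, SRT, SF, AR) are not valid, so only the
 * raw ISS value is shown.
 */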
static void data_abort_decode(unsigned long esr)
{
	pr_alert("Data abort info:\n");

	if (esr & ESR_ELx_ISV) {
		pr_alert(" Access size = %u byte(s)\n",
			 1U << ((esr & ESR_ELx_SAS) >> ESR_ELx_SAS_SHIFT));
		pr_alert(" SSE = %lu, SRT = %lu\n",
			 (esr & ESR_ELx_SSE) >> ESR_ELx_SSE_SHIFT,
			 (esr & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT);
		pr_alert(" SF = %lu, AR = %lu\n",
			 (esr & ESR_ELx_SF) >> ESR_ELx_SF_SHIFT,
			 (esr & ESR_ELx_AR) >> ESR_ELx_AR_SHIFT);
	} else {
		pr_alert(" ISV = 0, ISS = 0x%08lx\n", esr & ESR_ELx_ISS_MASK);
	}

	pr_alert(" CM = %lu, WnR = %lu\n",
		 (esr & ESR_ELx_CM) >> ESR_ELx_CM_SHIFT,
		 (esr & ESR_ELx_WNR) >> ESR_ELx_WNR_SHIFT);
}

static void mem_abort_decode(unsigned long esr)
{
	pr_alert("Mem abort info:\n");

	pr_alert(" ESR = 0x%016lx\n", esr);
	pr_alert(" EC = 0x%02lx: %s, IL = %u bits\n",
		 ESR_ELx_EC(esr), esr_get_class_string(esr),
		 (esr & ESR_ELx_IL) ? 32 : 16);
	pr_alert(" SET = %lu, FnV = %lu\n",
		 (esr & ESR_ELx_SET_MASK) >> ESR_ELx_SET_SHIFT,
		 (esr & ESR_ELx_FnV) >> ESR_ELx_FnV_SHIFT);
	pr_alert(" EA = %lu, S1PTW = %lu\n",
		 (esr & ESR_ELx_EA) >> ESR_ELx_EA_SHIFT,
		 (esr & ESR_ELx_S1PTW) >> ESR_ELx_S1PTW_SHIFT);
	pr_alert(" FSC = 0x%02lx: %s\n", (esr & ESR_ELx_FSC),
		 esr_to_fault_info(esr)->name);

	if (esr_is_data_abort(esr))
		data_abort_decode(esr);
}

static inline unsigned long mm_to_pgd_phys(struct mm_struct *mm)
{
	/* Either init_pg_dir or swapper_pg_dir */
	if (mm == &init_mm)
		return __pa_symbol(mm->pgd);

	return (unsigned long)virt_to_phys(mm->pgd);
}

/*
 * Dump out the page tables associated with 'addr' in the currently active mm.
 */
static void show_pte(unsigned long addr)
{
	struct mm_struct *mm;
	pgd_t *pgdp;
	pgd_t pgd;

	if (is_ttbr0_addr(addr)) {
		/* TTBR0 */
		mm = current->active_mm;
		if (mm == &init_mm) {
			pr_alert("[%016lx] user address but active_mm is swapper\n",
				 addr);
			return;
		}
	} else if (is_ttbr1_addr(addr)) {
		/* TTBR1 */
		mm = &init_mm;
	} else {
		pr_alert("[%016lx] address between user and kernel address ranges\n",
			 addr);
		return;
	}

	pr_alert("%s pgtable: %luk pages, %llu-bit VAs, pgdp=%016lx\n",
		 mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K,
		 vabits_actual, mm_to_pgd_phys(mm));
	pgdp = pgd_offset(mm, addr);
	pgd = READ_ONCE(*pgdp);
	pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd));

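	/*
	 * Walk one level at a time and stop at the first entry that is
	 * missing or malformed; everything printed up to that point is
	 * still valid.
	 */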
	do {
		p4d_t *p4dp, p4d;
		pud_t *pudp, pud;
		pmd_t *pmdp, pmd;
		pte_t *ptep, pte;

		if (pgd_none(pgd) || pgd_bad(pgd))
			break;

		p4dp = p4d_offset(pgdp, addr);
		p4d = READ_ONCE(*p4dp);
		pr_cont(", p4d=%016llx", p4d_val(p4d));
		if (p4d_none(p4d) || p4d_bad(p4d))
			break;

		pudp = pud_offset(p4dp, addr);
		pud = READ_ONCE(*pudp);
		pr_cont(", pud=%016llx", pud_val(pud));
		if (pud_none(pud) || pud_bad(pud))
			break;

		pmdp = pmd_offset(pudp, addr);
		pmd = READ_ONCE(*pmdp);
		pr_cont(", pmd=%016llx", pmd_val(pmd));
		if (pmd_none(pmd) || pmd_bad(pmd))
			break;

		ptep = pte_offset_map(pmdp, addr);
		pte = READ_ONCE(*ptep);
		pr_cont(", pte=%016llx", pte_val(pte));
		pte_unmap(ptep);
	} while(0);

	pr_cont("\n");
}

/*
 * This function sets the access flags (dirty, accessed), as well as write
 * permission, and only to a more permissive setting.
 *
 * It needs to cope with hardware update of the accessed/dirty state by other
 * agents in the system and can safely skip the __sync_icache_dcache() call as,
 * like set_pte_at(), the PTE is never changed from no-exec to exec here.
 *
 * Returns whether or not the PTE actually changed.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	pteval_t old_pteval, pteval;
	pte_t pte = READ_ONCE(*ptep);

	if (pte_same(pte, entry))
		return 0;

	/* only preserve the access flags and write permission */
	pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY;

	/*
	 * Setting the flags must be done atomically to avoid racing with the
	 * hardware update of the access/dirty state. The PTE_RDONLY bit must
	 * be set to the most permissive (lowest value) of *ptep and entry
	 * (calculated as: a & b == ~(~a | ~b)).
	 */
	pte_val(entry) ^= PTE_RDONLY;
	pteval = pte_val(pte);
	do {
		old_pteval = pteval;
		pteval ^= PTE_RDONLY;
		pteval |= pte_val(entry);
		pteval ^= PTE_RDONLY;
		pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval);
	} while (pteval != old_pteval);

	/* Invalidate a stale read-only entry */
	if (dirty)
		flush_tlb_page(vma, address);
	return 1;
}

static bool is_el1_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_CUR;
}

static bool is_el1_data_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_DABT_CUR;
}

static inline bool is_el1_permission_fault(unsigned long addr, unsigned long esr,
					   struct pt_regs *regs)
{
	unsigned long fsc_type = esr & ESR_ELx_FSC_TYPE;

	if (!is_el1_data_abort(esr) && !is_el1_instruction_abort(esr))
		return false;

	if (fsc_type == ESR_ELx_FSC_PERM)
		return true;

	if (is_ttbr0_addr(addr) && system_uses_ttbr0_pan())
		return fsc_type == ESR_ELx_FSC_FAULT &&
			(regs->pstate & PSR_PAN_BIT);

	return false;
}

static bool is_pkvm_stage2_abort(unsigned int esr)
{
	/*
	 * S1PTW should only ever be set in ESR_EL1 if the pkvm hypervisor
	 * injected a stage-2 abort -- see host_inject_abort().
	 */
	return is_pkvm_initialized() && (esr & ESR_ELx_S1PTW);
}

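/*
 * Re-walk the faulting address with an AT S1E1R instruction. If the
 * translation now succeeds, or fails with something other than a
 * translation fault, the original fault is treated as spurious.
 */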
static bool __kprobes is_spurious_el1_translation_fault(unsigned long addr,
							unsigned long esr,
							struct pt_regs *regs)
{
	unsigned long flags;
	u64 par, dfsc;

	if (!is_el1_data_abort(esr) ||
	    (esr & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT)
		return false;

	if (is_pkvm_stage2_abort(esr))
		return false;

	local_irq_save(flags);
	asm volatile("at s1e1r, %0" :: "r" (addr));
	isb();
	par = read_sysreg_par();
	local_irq_restore(flags);

	/*
	 * If we now have a valid translation, treat the translation fault as
	 * spurious.
	 */
	if (!(par & SYS_PAR_EL1_F))
		return true;

	/*
	 * If we got a different type of fault from the AT instruction,
	 * treat the translation fault as spurious.
	 */
	dfsc = FIELD_GET(SYS_PAR_EL1_FST, par);
	return (dfsc & ESR_ELx_FSC_TYPE) != ESR_ELx_FSC_FAULT;
}

static void die_kernel_fault(const char *msg, unsigned long addr,
			     unsigned long esr, struct pt_regs *regs)
{
	bust_spinlocks(1);

	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
		 addr);

	trace_android_rvh_die_kernel_fault(msg, addr, esr, regs);
	mem_abort_decode(esr);

	show_pte(addr);
	die("Oops", regs, esr);
	bust_spinlocks(0);
	make_task_dead(SIGKILL);
}

#ifdef CONFIG_KASAN_HW_TAGS
static void report_tag_fault(unsigned long addr, unsigned long esr,
			     struct pt_regs *regs)
{
	/*
	 * SAS bits aren't set for all faults reported in EL1, so we can't
	 * find out access size.
	 */
	bool is_write = !!(esr & ESR_ELx_WNR);
	kasan_report(addr, 0, is_write, regs->pc);
}
#else
/* Tag faults aren't enabled without CONFIG_KASAN_HW_TAGS. */
static inline void report_tag_fault(unsigned long addr, unsigned long esr,
				    struct pt_regs *regs) { }
#endif

static void do_tag_recovery(unsigned long addr, unsigned long esr,
			    struct pt_regs *regs)
{

	report_tag_fault(addr, esr, regs);

	/*
	 * Disable MTE Tag Checking on the local CPU for the current EL.
	 * It will be done lazily on the other CPUs when they will hit a
	 * tag fault.
	 */
	sysreg_clear_set(sctlr_el1, SCTLR_ELx_TCF_MASK, SCTLR_ELx_TCF_NONE);
	isb();
}

static bool is_el1_mte_sync_tag_check_fault(unsigned long esr)
{
	unsigned long fsc = esr & ESR_ELx_FSC;

	if (!is_el1_data_abort(esr))
		return false;

	if (fsc == ESR_ELx_FSC_MTE)
		return true;

	return false;
}

static bool is_translation_fault(unsigned long esr)
{
	return (esr & ESR_ELx_FSC_TYPE) == ESR_ELx_FSC_FAULT;
}

static void __do_kernel_fault(unsigned long addr, unsigned long esr,
			      struct pt_regs *regs)
{
	const char *msg;

	/*
	 * Are we prepared to handle this kernel fault?
	 * We are almost certainly not prepared to handle instruction faults.
	 */
	if (!is_el1_instruction_abort(esr) && fixup_exception(regs))
		return;

	if (WARN_RATELIMIT(is_spurious_el1_translation_fault(addr, esr, regs),
			   "Ignoring spurious kernel translation fault at virtual address %016lx\n", addr))
		return;

	if (is_el1_mte_sync_tag_check_fault(esr)) {
		do_tag_recovery(addr, esr, regs);

		return;
	}

	if (is_el1_permission_fault(addr, esr, regs)) {
		if (esr & ESR_ELx_WNR)
			msg = "write to read-only memory";
		else if (is_el1_instruction_abort(esr))
			msg = "execute from non-executable memory";
		else
			msg = "read from unreadable memory";
	} else if (addr < PAGE_SIZE) {
		msg = "NULL pointer dereference";
	} else if (is_pkvm_stage2_abort(esr)) {
		msg = "access to hypervisor-protected memory";
	} else {
		if (is_translation_fault(esr) &&
		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
			return;

		msg = "paging request";
	}

	die_kernel_fault(msg, addr, esr, regs);
}

static void set_thread_esr(unsigned long address, unsigned long esr)
{
	current->thread.fault_address = address;

	/*
	 * If the faulting address is in the kernel, we must sanitize the ESR.
	 * From userspace's point of view, kernel-only mappings don't exist
	 * at all, so we report them as level 0 translation faults.
	 * (This is not quite the way that "no mapping there at all" behaves:
	 * an alignment fault not caused by the memory type would take
	 * precedence over translation fault for a real access to empty
	 * space. Unfortunately we can't easily distinguish "alignment fault
	 * not caused by memory type" from "alignment fault caused by memory
	 * type", so we ignore this wrinkle and just return the translation
	 * fault.)
	 */
	if (!is_ttbr0_addr(current->thread.fault_address)) {
		switch (ESR_ELx_EC(esr)) {
		case ESR_ELx_EC_DABT_LOW:
			/*
			 * These bits provide only information about the
			 * faulting instruction, which userspace knows already.
			 * We explicitly clear bits which are architecturally
			 * RES0 in case they are given meanings in future.
			 * We always report the ESR as if the fault was taken
			 * to EL1 and so ISV and the bits in ISS[23:14] are
			 * clear. (In fact it always will be a fault to EL1.)
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL |
				ESR_ELx_CM | ESR_ELx_WNR;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		case ESR_ELx_EC_IABT_LOW:
			/*
			 * Claim a level 0 translation fault.
			 * All other bits are architecturally RES0 for faults
			 * reported with that DFSC value, so we clear them.
			 */
			esr &= ESR_ELx_EC_MASK | ESR_ELx_IL;
			esr |= ESR_ELx_FSC_FAULT;
			break;
		default:
			/*
			 * This should never happen (entry.S only brings us
			 * into this code for insn and data aborts from a lower
			 * exception level). Fail safe by not providing an ESR
			 * context record at all.
			 */
			WARN(1, "ESR 0x%lx is not DABT or IABT from EL0\n", esr);
			esr = 0;
			break;
		}
	}

	current->thread.fault_code = esr;
}

static void do_bad_area(unsigned long far, unsigned long esr,
			struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (user_mode(regs)) {
		const struct fault_info *inf = esr_to_fault_info(esr);

		set_thread_esr(addr, esr);
		arm64_force_sig_fault(inf->sig, inf->code, far, inf->name);
	} else {
		__do_kernel_fault(addr, esr, regs);
	}
}

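/*
 * Fault results private to this file: __do_page_fault() returns
 * VM_FAULT_BADMAP when no suitable VMA covers the address and
 * VM_FAULT_BADACCESS when a VMA exists but its permissions do not
 * allow the access.
 */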
#define VM_FAULT_BADMAP		((__force vm_fault_t)0x010000)
#define VM_FAULT_BADACCESS	((__force vm_fault_t)0x020000)

static vm_fault_t __do_page_fault(struct mm_struct *mm, unsigned long addr,
				  unsigned int mm_flags, unsigned long vm_flags,
				  struct pt_regs *regs)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	if (unlikely(!vma))
		return VM_FAULT_BADMAP;

	/*
	 * Ok, we have a good vm_area for this memory access, so we can handle
	 * it.
	 */
	if (unlikely(vma->vm_start > addr)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			return VM_FAULT_BADMAP;
		if (expand_stack(vma, addr))
			return VM_FAULT_BADMAP;
	}

	/*
	 * Check that the permissions on the VMA allow for the fault which
	 * occurred.
	 */
	if (!(vma->vm_flags & vm_flags))
		return VM_FAULT_BADACCESS;
	return handle_mm_fault(vma, addr, mm_flags, regs);
}

static bool is_el0_instruction_abort(unsigned long esr)
{
	return ESR_ELx_EC(esr) == ESR_ELx_EC_IABT_LOW;
}

/*
 * Note: not valid for EL1 DC IVAC, but we never use that such that it
 * should fault. EL0 cannot issue DC IVAC (undef).
 */
static bool is_write_abort(unsigned long esr)
{
	return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM);
}

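/*
 * Main handler for faults that the mm may be able to resolve:
 * translation, access flag and permission faults on user addresses.
 * Anything that cannot be handled here ends up in __do_kernel_fault()
 * or as a signal to the faulting task.
 */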
static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
				   struct pt_regs *regs)
{
	const struct fault_info *inf;
	struct mm_struct *mm = current->mm;
	vm_fault_t fault;
	unsigned long vm_flags;
	unsigned int mm_flags = FAULT_FLAG_DEFAULT;
	unsigned long addr = untagged_addr(far);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
	struct vm_area_struct *vma;
	struct vm_area_struct pvma;
	unsigned long seq;
#endif

	if (kprobe_page_fault(regs, esr))
		return 0;

	/*
	 * If we're in an interrupt or have no user context, we must not take
	 * the fault.
	 */
	if (faulthandler_disabled() || !mm)
		goto no_context;

	if (user_mode(regs))
		mm_flags |= FAULT_FLAG_USER;

	/*
	 * vm_flags tells us what bits we must have in vma->vm_flags
	 * for the fault to be benign, __do_page_fault() would check
	 * vma->vm_flags & vm_flags and returns an error if the
	 * intersection is empty
	 */
	if (is_el0_instruction_abort(esr)) {
		/* It was exec fault */
		vm_flags = VM_EXEC;
		mm_flags |= FAULT_FLAG_INSTRUCTION;
	} else if (is_write_abort(esr)) {
		/* It was write fault */
		vm_flags = VM_WRITE;
		mm_flags |= FAULT_FLAG_WRITE;
	} else {
		/* It was read fault */
		vm_flags = VM_READ;
		/* Write implies read */
		vm_flags |= VM_WRITE;
		/* If EPAN is absent then exec implies read */
		if (!cpus_have_const_cap(ARM64_HAS_EPAN))
			vm_flags |= VM_EXEC;
	}

	if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) {
		if (is_el1_instruction_abort(esr))
			die_kernel_fault("execution of user memory",
					 addr, esr, regs);

		if (!search_exception_tables(regs->pc))
			die_kernel_fault("access to user memory outside uaccess routines",
					 addr, esr, regs);
	}

	if (is_pkvm_stage2_abort(esr)) {
		if (!user_mode(regs))
			goto no_context;
		arm64_force_sig_fault(SIGSEGV, SEGV_ACCERR, far, "stage-2 fault");
		return 0;
	}

	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, addr);

#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
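	/*
	 * Speculative path: snapshot the VMA under a sequence count
	 * (mmap_seq_read_start()/mmap_seq_read_check()) without taking
	 * mmap_lock, handle the fault on the local copy, and fall back
	 * to the locked path below whenever the mapping may have changed
	 * underneath us.
	 */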
	/*
	 * No need to try speculative faults for kernel or
	 * single threaded user space.
	 */
	if (!(mm_flags & FAULT_FLAG_USER) || atomic_read(&mm->mm_users) == 1)
		goto no_spf;

	count_vm_event(SPF_ATTEMPT);
	seq = mmap_seq_read_start(mm);
	if (seq & 1) {
		count_vm_spf_event(SPF_ABORT_ODD);
		goto spf_abort;
	}
	vma = get_vma(mm, addr);
	if (!vma) {
		count_vm_spf_event(SPF_ABORT_UNMAPPED);
		goto spf_abort;
	}
	if (!vma_can_speculate(vma, mm_flags)) {
		put_vma(vma);
		count_vm_spf_event(SPF_ABORT_NO_SPECULATE);
		goto spf_abort;
	}
	pvma = *vma;
	if (!mmap_seq_read_check(mm, seq, SPF_ABORT_VMA_COPY)) {
		put_vma(vma);
		goto spf_abort;
	}
	if (!(pvma.vm_flags & vm_flags)) {
		put_vma(vma);
		count_vm_spf_event(SPF_ABORT_ACCESS_ERROR);
		goto spf_abort;
	}
	fault = do_handle_mm_fault(&pvma, addr & PAGE_MASK,
				   mm_flags | FAULT_FLAG_SPECULATIVE, seq, regs);
	put_vma(vma);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}
	if (!(fault & VM_FAULT_RETRY))
		goto done;

spf_abort:
	count_vm_event(SPF_ABORT);
no_spf:

#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */

	/*
	 * As per x86, we may deadlock here. However, since the kernel only
	 * validly references user space from well defined areas of the code,
	 * we can bug out early if this is from code which shouldn't.
	 */
	if (!mmap_read_trylock(mm)) {
		if (!user_mode(regs) && !search_exception_tables(regs->pc))
			goto no_context;
retry:
		mmap_read_lock(mm);
	} else {
		/*
		 * The above mmap_read_trylock() might have succeeded in which
		 * case, we'll have missed the might_sleep() from down_read().
		 */
		might_sleep();
#ifdef CONFIG_DEBUG_VM
		if (!user_mode(regs) && !search_exception_tables(regs->pc)) {
			mmap_read_unlock(mm);
			goto no_context;
		}
#endif
	}

	fault = __do_page_fault(mm, addr, mm_flags, vm_flags, regs);

	/* Quick path to respond to signals */
	if (fault_signal_pending(fault, regs)) {
		if (!user_mode(regs))
			goto no_context;
		return 0;
	}

	if (fault & VM_FAULT_RETRY) {
		if (mm_flags & FAULT_FLAG_ALLOW_RETRY) {
			mm_flags |= FAULT_FLAG_TRIED;
			goto retry;
		}
	}
	mmap_read_unlock(mm);
#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
done:
#endif

	/*
	 * Handle the "normal" (no error) case first.
	 */
	if (likely(!(fault & (VM_FAULT_ERROR | VM_FAULT_BADMAP |
			      VM_FAULT_BADACCESS))))
		return 0;

	/*
	 * If we are in kernel mode at this point, we have no context to
	 * handle this fault with.
	 */
	if (!user_mode(regs))
		goto no_context;

	if (fault & VM_FAULT_OOM) {
		/*
		 * We ran out of memory, call the OOM killer, and return to
		 * userspace (which will retry the fault, or kill us if we got
		 * oom-killed).
		 */
		pagefault_out_of_memory();
		return 0;
	}

	inf = esr_to_fault_info(esr);
	set_thread_esr(addr, esr);
	if (fault & VM_FAULT_SIGBUS) {
		/*
		 * We had some memory, but were unable to successfully fix up
		 * this page fault.
		 */
		arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name);
	} else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) {
		unsigned int lsb;

		lsb = PAGE_SHIFT;
		if (fault & VM_FAULT_HWPOISON_LARGE)
			lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));

		arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name);
	} else {
		/*
		 * Something tried to access memory that isn't in our memory
		 * map.
		 */
		arm64_force_sig_fault(SIGSEGV,
				      fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR,
				      far, inf->name);
	}

	return 0;

no_context:
	__do_kernel_fault(addr, esr, regs);
	return 0;
}

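/*
 * Translation faults on TTBR0 (user) addresses go through the full
 * page-fault path; translation faults on any other address cannot be
 * resolved by the mm and are reported as a bad area.
 */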
static int __kprobes do_translation_fault(unsigned long far,
					  unsigned long esr,
					  struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);

	if (is_ttbr0_addr(addr))
		return do_page_fault(far, esr, regs);

	do_bad_area(far, esr, regs);
	return 0;
}

static int do_alignment_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	do_bad_area(far, esr, regs);
	return 0;
}

static int do_bad(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	unsigned long addr = untagged_addr(far);
	int ret = 1;

	trace_android_vh_handle_tlb_conf(addr, esr, &ret);
	return ret;
}

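/*
 * Synchronous External Abort: give APEI a chance to claim the error as
 * a firmware-first notification before delivering a signal or dying.
 */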
static int do_sea(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf;
	unsigned long siaddr;

	inf = esr_to_fault_info(esr);

	if (user_mode(regs) && apei_claim_sea(regs) == 0) {
		/*
		 * APEI claimed this as a firmware-first notification.
		 * Some processing deferred to task_work before ret_to_user().
		 */
		return 0;
	}

	if (esr & ESR_ELx_FnV) {
		siaddr = 0;
	} else {
		/*
		 * The architecture specifies that the tag bits of FAR_EL1 are
		 * UNKNOWN for synchronous external aborts. Mask them out now
		 * so that userspace doesn't see them.
		 */
		siaddr = untagged_addr(far);
	}
	trace_android_rvh_do_sea(siaddr, esr, regs);
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr);

	return 0;
}

static int do_tag_check_fault(unsigned long far, unsigned long esr,
			      struct pt_regs *regs)
{
	/*
	 * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN
	 * for tag check faults. Set them to corresponding bits in the untagged
	 * address.
	 */
	far = (__untagged_addr(far) & ~MTE_TAG_MASK) | (far & MTE_TAG_MASK);
	do_bad_area(far, esr, regs);
	return 0;
}

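/*
 * Indexed by the fault status code (ESR_ELx_FSC, the low six bits of
 * the ESR), so this table must have exactly 64 entries.
 */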
static const struct fault_info fault_info[] = {
	{ do_bad,		SIGKILL, SI_KERNEL,	"ttbr address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 1 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 2 address size fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"level 3 address size fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 0 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault" },
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 8" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 access flag fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 access flag fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 12" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 1 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 2 permission fault" },
	{ do_page_fault,	SIGSEGV, SEGV_ACCERR,	"level 3 permission fault" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous external abort" },
	{ do_tag_check_fault,	SIGSEGV, SEGV_MTESERR,	"synchronous tag check fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 18" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 19" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 (translation table walk)" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 (translation table walk)" },
	{ do_sea,		SIGBUS,  BUS_OBJERR,	"synchronous parity or ECC error" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 25" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 26" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 27" },
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 0 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 1 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 2 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_sea,		SIGKILL, SI_KERNEL,	"level 3 synchronous parity error (translation table walk)" },	// Reserved when RAS is implemented
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 32" },
	{ do_alignment_fault,	SIGBUS,  BUS_ADRALN,	"alignment fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 34" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 35" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 36" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 37" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 38" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 39" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 40" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 41" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 42" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 43" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 44" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 45" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 46" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 47" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"TLB conflict abort" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"Unsupported atomic hardware update fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 50" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 51" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"implementation fault (lockdown abort)" },
	{ do_bad,		SIGBUS,  BUS_OBJERR,	"implementation fault (unsupported exclusive)" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 54" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 55" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 56" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 57" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 58" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 59" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 60" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"section domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"page domain fault" },
	{ do_bad,		SIGKILL, SI_KERNEL,	"unknown 63" },
};

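/*
 * Common entry point for data and instruction aborts: run the handler
 * selected by the fault status code and report anything it could not
 * resolve.
 */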
void do_mem_abort(unsigned long far, unsigned long esr, struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_fault_info(esr);
	unsigned long addr = untagged_addr(far);

	if (!inf->fn(far, esr, regs))
		return;

	if (!user_mode(regs)) {
		pr_alert("Unhandled fault at 0x%016lx\n", addr);
		trace_android_rvh_do_mem_abort(addr, esr, regs);
		mem_abort_decode(esr);
		show_pte(addr);
	}

	/*
	 * At this point we have an unrecognized fault type whose tag bits may
	 * have been defined as UNKNOWN. Therefore we only expose the untagged
	 * address to the signal handler.
	 */
	arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr);
}
NOKPROBE_SYMBOL(do_mem_abort);

void do_sp_pc_abort(unsigned long addr, unsigned long esr, struct pt_regs *regs)
{
	trace_android_rvh_do_sp_pc_abort(addr, esr, regs);

	arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN,
			 addr, esr);
}
NOKPROBE_SYMBOL(do_sp_pc_abort);

int __init early_brk64(unsigned long addr, unsigned long esr,
		       struct pt_regs *regs);

/*
 * __refdata because early_brk64 is __init, but the reference to it is
 * clobbered at arch_initcall time.
 * See traps.c and debug-monitors.c:debug_traps_init().
 */
static struct fault_info __refdata debug_fault_info[] = {
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware breakpoint" },
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware single-step" },
	{ do_bad,	SIGTRAP,	TRAP_HWBKPT,	"hardware watchpoint" },
	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 3" },
	{ do_bad,	SIGTRAP,	TRAP_BRKPT,	"aarch32 BKPT" },
	{ do_bad,	SIGKILL,	SI_KERNEL,	"aarch32 vector catch" },
	{ early_brk64,	SIGTRAP,	TRAP_BRKPT,	"aarch64 BRK" },
	{ do_bad,	SIGKILL,	SI_KERNEL,	"unknown 7" },
};

void __init hook_debug_fault_code(int nr,
				  int (*fn)(unsigned long, unsigned long, struct pt_regs *),
				  int sig, int code, const char *name)
{
	BUG_ON(nr < 0 || nr >= ARRAY_SIZE(debug_fault_info));

	debug_fault_info[nr].fn = fn;
	debug_fault_info[nr].sig = sig;
	debug_fault_info[nr].code = code;
	debug_fault_info[nr].name = name;
}

/*
 * In debug exception context, we explicitly disable preemption despite
 * having interrupts disabled.
 * This serves two purposes: it makes it much less likely that we would
 * accidentally schedule in exception context and it will force a warning
 * if we somehow manage to schedule by accident.
 */
static void debug_exception_enter(struct pt_regs *regs)
{
	preempt_disable();

	/* This code is a bit fragile. Test it. */
	RCU_LOCKDEP_WARN(!rcu_is_watching(), "exception_enter didn't work");
}
NOKPROBE_SYMBOL(debug_exception_enter);

static void debug_exception_exit(struct pt_regs *regs)
{
	preempt_enable_no_resched();
}
NOKPROBE_SYMBOL(debug_exception_exit);

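/*
 * Debug exceptions (breakpoint, single-step, watchpoint, BRK) are
 * dispatched via debug_fault_info[], indexed by DBG_ESR_EVT(esr).
 */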
void do_debug_exception(unsigned long addr_if_watchpoint, unsigned long esr,
			struct pt_regs *regs)
{
	const struct fault_info *inf = esr_to_debug_fault_info(esr);
	unsigned long pc = instruction_pointer(regs);

	debug_exception_enter(regs);

	if (user_mode(regs) && !is_ttbr0_addr(pc))
		arm64_apply_bp_hardening();

	if (inf->fn(addr_if_watchpoint, esr, regs)) {
		arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr);
	}

	debug_exception_exit(regs);
}
NOKPROBE_SYMBOL(do_debug_exception);

/*
 * Used during anonymous page fault handling.
 */
struct page *alloc_zeroed_user_highpage_movable(struct vm_area_struct *vma,
						unsigned long vaddr)
{
	gfp_t flags = GFP_HIGHUSER_MOVABLE | __GFP_ZERO | __GFP_CMA;

	/*
	 * If the page is mapped with PROT_MTE, initialise the tags at the
	 * point of allocation and page zeroing as this is usually faster than
	 * separate DC ZVA and STGM.
	 */
	if (vma->vm_flags & VM_MTE)
		flags |= __GFP_ZEROTAGS;

	return alloc_page_vma(flags, vma, vaddr);
}

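/*
 * Zero both the data and the MTE allocation tags of @page in a single
 * pass, then mark the page as tagged.
 */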
void tag_clear_highpage(struct page *page)
{
	mte_zero_clear_page_tags(page_address(page));
	set_page_mte_tagged(page);
}