1  /*
2   *  Copyright (C) 1995  Linus Torvalds
3   *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
4   *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
5   */
6  #include <linux/magic.h>		/* STACK_END_MAGIC		*/
7  #include <linux/sched.h>		/* test_thread_flag(), ...	*/
8  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
9  #include <linux/module.h>		/* search_exception_table	*/
10  #include <linux/bootmem.h>		/* max_low_pfn			*/
11  #include <linux/kprobes.h>		/* __kprobes, ...		*/
12  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
13  #include <linux/perf_event.h>		/* perf_sw_event		*/
14  #include <linux/hugetlb.h>		/* hstate_index_to_shift	*/
15  #include <linux/prefetch.h>		/* prefetchw			*/
16  
17  #include <asm/traps.h>			/* dotraplinkage, ...		*/
18  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/
19  #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
20  #include <asm/fixmap.h>			/* VSYSCALL_START		*/
21  
22  /*
23   * Page fault error code bits:
24   *
25   *   bit 0 ==	 0: no page found	1: protection fault
26   *   bit 1 ==	 0: read access		1: write access
27   *   bit 2 ==	 0: kernel-mode access	1: user-mode access
28   *   bit 3 ==				1: use of reserved bit detected
29   *   bit 4 ==				1: fault was an instruction fetch
30   */
31  enum x86_pf_error_code {
32  
33  	PF_PROT		=		1 << 0,
34  	PF_WRITE	=		1 << 1,
35  	PF_USER		=		1 << 2,
36  	PF_RSVD		=		1 << 3,
37  	PF_INSTR	=		1 << 4,
38  };
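
/*
 * Worked examples (illustrative): how the bits above combine for a few
 * common faults:
 *
 *   error_code == 0x04 (PF_USER)                   user-mode read of a
 *                                                  not-present page
 *   error_code == 0x06 (PF_USER|PF_WRITE)          user-mode write to a
 *                                                  not-present page
 *   error_code == 0x07 (PF_USER|PF_WRITE|PF_PROT)  user-mode write blocked by
 *                                                  page protections (e.g. COW)
 *   error_code == 0x10 (PF_INSTR)                  kernel instruction fetch
 *                                                  from an NX-protected page
 */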
39  
40  /*
41   * Returns 0 if mmiotrace is disabled, or if the fault is not
42   * handled by mmiotrace:
43   */
44  static inline int __kprobes
45  kmmio_fault(struct pt_regs *regs, unsigned long addr)
46  {
47  	if (unlikely(is_kmmio_active()))
48  		if (kmmio_handler(regs, addr) == 1)
49  			return -1;
50  	return 0;
51  }
52  
53  static inline int __kprobes notify_page_fault(struct pt_regs *regs)
54  {
55  	int ret = 0;
56  
57  	/* kprobe_running() needs smp_processor_id() */
58  	if (kprobes_built_in() && !user_mode_vm(regs)) {
59  		preempt_disable();
60  		if (kprobe_running() && kprobe_fault_handler(regs, 14))
61  			ret = 1;
62  		preempt_enable();
63  	}
64  
65  	return ret;
66  }
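
/*
 * Note (illustrative): the hard-coded 14 passed to kprobe_fault_handler()
 * above is the page-fault vector number, i.e. X86_TRAP_PF.
 */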
67  
68  /*
69   * Prefetch quirks:
70   *
71   * 32-bit mode:
72   *
73   *   Sometimes AMD Athlon/Opteron CPUs report invalid exceptions on prefetch.
74   *   Check that here and ignore it.
75   *
76   * 64-bit mode:
77   *
78   *   Sometimes the CPU reports invalid exceptions on prefetch.
79   *   Check that here and ignore it.
80   *
81   * Opcode checker based on code by Richard Brunner.
82   */
83  static inline int
84  check_prefetch_opcode(struct pt_regs *regs, unsigned char *instr,
85  		      unsigned char opcode, int *prefetch)
86  {
87  	unsigned char instr_hi = opcode & 0xf0;
88  	unsigned char instr_lo = opcode & 0x0f;
89  
90  	switch (instr_hi) {
91  	case 0x20:
92  	case 0x30:
93  		/*
94  		 * Values 0x26,0x2E,0x36,0x3E are valid x86 prefixes.
95  		 * In X86_64 long mode, the CPU will signal invalid
96  		 * opcode if some of these prefixes are present so
97  		 * X86_64 will never get here anyway
98  		 */
99  		return ((instr_lo & 7) == 0x6);
100  #ifdef CONFIG_X86_64
101  	case 0x40:
102  		/*
103  		 * In AMD64 long mode 0x40..0x4F are valid REX prefixes
104  		 * Need to figure out under what instruction mode the
105  		 * instruction was issued. Could check the LDT for lm,
106  		 * but for now it's good enough to assume that long
107  		 * mode only uses well known segments or kernel.
108  		 */
109  		return (!user_mode(regs) || user_64bit_mode(regs));
110  #endif
111  	case 0x60:
112  		/* 0x64 thru 0x67 are valid prefixes in all modes. */
113  		return (instr_lo & 0xC) == 0x4;
114  	case 0xF0:
115  		/* 0xF0, 0xF2, 0xF3 are valid prefixes in all modes. */
116  		return !instr_lo || (instr_lo>>1) == 1;
117  	case 0x00:
118  		/* Prefetch instruction is 0x0F0D or 0x0F18 */
119  		if (probe_kernel_address(instr, opcode))
120  			return 0;
121  
122  		*prefetch = (instr_lo == 0xF) &&
123  			(opcode == 0x0D || opcode == 0x18);
124  		return 0;
125  	default:
126  		return 0;
127  	}
128  }
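
/*
 * Worked example for the masks above (illustrative): the segment override
 * prefixes 0x26, 0x2E, 0x36 and 0x3E all have a low nibble whose bottom
 * three bits equal 6 (0x6 & 7 == 6, 0xE & 7 == 6), which is exactly what
 * "(instr_lo & 7) == 0x6" tests, while e.g. 0x27 (DAA) does not match.
 * Likewise "(instr_lo & 0xC) == 0x4" in the 0x60 case matches only
 * 0x64..0x67, the FS/GS/operand-size/address-size prefixes.
 */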
129  
130  static int
131  is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
132  {
133  	unsigned char *max_instr;
134  	unsigned char *instr;
135  	int prefetch = 0;
136  
137  	/*
138  	 * If it was an exec (instruction fetch) fault on an NX page, then
139  	 * do not ignore the fault:
140  	 */
141  	if (error_code & PF_INSTR)
142  		return 0;
143  
144  	instr = (void *)convert_ip_to_linear(current, regs);
145  	max_instr = instr + 15;
146  
147  	if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
148  		return 0;
149  
150  	while (instr < max_instr) {
151  		unsigned char opcode;
152  
153  		if (probe_kernel_address(instr, opcode))
154  			break;
155  
156  		instr++;
157  
158  		if (!check_prefetch_opcode(regs, instr, opcode, &prefetch))
159  			break;
160  	}
161  	return prefetch;
162  }
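
/*
 * Note (illustrative): the 15-byte window ("max_instr = instr + 15") matches
 * the architectural maximum x86 instruction length, so the loop above scans
 * at most one instruction's worth of prefix bytes before giving up.
 */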
163  
164  static void
165  force_sig_info_fault(int si_signo, int si_code, unsigned long address,
166  		     struct task_struct *tsk, int fault)
167  {
168  	unsigned lsb = 0;
169  	siginfo_t info;
170  
171  	info.si_signo	= si_signo;
172  	info.si_errno	= 0;
173  	info.si_code	= si_code;
174  	info.si_addr	= (void __user *)address;
175  	if (fault & VM_FAULT_HWPOISON_LARGE)
176  		lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
177  	if (fault & VM_FAULT_HWPOISON)
178  		lsb = PAGE_SHIFT;
179  	info.si_addr_lsb = lsb;
180  
181  	force_sig_info(si_signo, &info, tsk);
182  }
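
/*
 * Illustration: for a hardware-poisoned 4K page, si_addr_lsb ends up as
 * PAGE_SHIFT (12); for a poisoned 2MB huge page, hstate_index_to_shift()
 * yields 21.  A userspace SIGBUS handler can use this to learn how much of
 * the address range around si_addr is affected.
 */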
183  
184  DEFINE_SPINLOCK(pgd_lock);
185  LIST_HEAD(pgd_list);
186  
187  #ifdef CONFIG_X86_32
188  static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
189  {
190  	unsigned index = pgd_index(address);
191  	pgd_t *pgd_k;
192  	pud_t *pud, *pud_k;
193  	pmd_t *pmd, *pmd_k;
194  
195  	pgd += index;
196  	pgd_k = init_mm.pgd + index;
197  
198  	if (!pgd_present(*pgd_k))
199  		return NULL;
200  
201  	/*
202  	 * set_pgd(pgd, *pgd_k); here would be useless on PAE
203  	 * and redundant with the set_pmd() on non-PAE. As would
204  	 * set_pud.
205  	 */
206  	pud = pud_offset(pgd, address);
207  	pud_k = pud_offset(pgd_k, address);
208  	if (!pud_present(*pud_k))
209  		return NULL;
210  
211  	pmd = pmd_offset(pud, address);
212  	pmd_k = pmd_offset(pud_k, address);
213  	if (!pmd_present(*pmd_k))
214  		return NULL;
215  
216  	if (!pmd_present(*pmd))
217  		set_pmd(pmd, *pmd_k);
218  	else
219  		BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
220  
221  	return pmd_k;
222  }
223  
224  void vmalloc_sync_all(void)
225  {
226  	unsigned long address;
227  
228  	if (SHARED_KERNEL_PMD)
229  		return;
230  
231  	for (address = VMALLOC_START & PMD_MASK;
232  	     address >= TASK_SIZE && address < FIXADDR_TOP;
233  	     address += PMD_SIZE) {
234  		struct page *page;
235  
236  		spin_lock(&pgd_lock);
237  		list_for_each_entry(page, &pgd_list, lru) {
238  			spinlock_t *pgt_lock;
239  			pmd_t *ret;
240  
241  			/* the pgt_lock only for Xen */
242  			pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
243  
244  			spin_lock(pgt_lock);
245  			ret = vmalloc_sync_one(page_address(page), address);
246  			spin_unlock(pgt_lock);
247  
248  			if (!ret)
249  				break;
250  		}
251  		spin_unlock(&pgd_lock);
252  	}
253  }
254  
255  /*
256   * 32-bit:
257   *
258   *   Handle a fault on the vmalloc or module mapping area
259   */
260  static noinline __kprobes int vmalloc_fault(unsigned long address)
261  {
262  	unsigned long pgd_paddr;
263  	pmd_t *pmd_k;
264  	pte_t *pte_k;
265  
266  	/* Make sure we are in vmalloc area: */
267  	if (!(address >= VMALLOC_START && address < VMALLOC_END))
268  		return -1;
269  
270  	WARN_ON_ONCE(in_nmi());
271  
272  	/*
273  	 * Synchronize this task's top level page-table
274  	 * with the 'reference' page table.
275  	 *
276  	 * Do _not_ use "current" here. We might be inside
277  	 * an interrupt in the middle of a task switch..
278  	 */
279  	pgd_paddr = read_cr3();
280  	pmd_k = vmalloc_sync_one(__va(pgd_paddr), address);
281  	if (!pmd_k)
282  		return -1;
283  
284  	pte_k = pte_offset_kernel(pmd_k, address);
285  	if (!pte_present(*pte_k))
286  		return -1;
287  
288  	return 0;
289  }
290  
291  /*
292   * Did it hit the DOS screen memory VA from vm86 mode?
293   */
294  static inline void
295  check_v8086_mode(struct pt_regs *regs, unsigned long address,
296  		 struct task_struct *tsk)
297  {
298  	unsigned long bit;
299  
300  	if (!v8086_mode(regs))
301  		return;
302  
303  	bit = (address - 0xA0000) >> PAGE_SHIFT;
304  	if (bit < 32)
305  		tsk->thread.screen_bitmap |= 1 << bit;
306  }
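
/*
 * Example (illustrative): a vm86 fault at 0xB8000 (the VGA text buffer)
 * gives bit = (0xB8000 - 0xA0000) >> PAGE_SHIFT = 0x18 = 24, so bit 24 of
 * thread.screen_bitmap gets set; addresses outside 0xA0000..0xBFFFF yield
 * bit >= 32 and are ignored.
 */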
307  
308  static bool low_pfn(unsigned long pfn)
309  {
310  	return pfn < max_low_pfn;
311  }
312  
313  static void dump_pagetable(unsigned long address)
314  {
315  	pgd_t *base = __va(read_cr3());
316  	pgd_t *pgd = &base[pgd_index(address)];
317  	pmd_t *pmd;
318  	pte_t *pte;
319  
320  #ifdef CONFIG_X86_PAE
321  	printk("*pdpt = %016Lx ", pgd_val(*pgd));
322  	if (!low_pfn(pgd_val(*pgd) >> PAGE_SHIFT) || !pgd_present(*pgd))
323  		goto out;
324  #endif
325  	pmd = pmd_offset(pud_offset(pgd, address), address);
326  	printk(KERN_CONT "*pde = %0*Lx ", sizeof(*pmd) * 2, (u64)pmd_val(*pmd));
327  
328  	/*
329  	 * We must not directly access the pte in the highpte
330  	 * case if the page table is located in highmem.
331  	 * And let's rather not kmap-atomic the pte, just in case
332  	 * it's allocated already:
333  	 */
334  	if (!low_pfn(pmd_pfn(*pmd)) || !pmd_present(*pmd) || pmd_large(*pmd))
335  		goto out;
336  
337  	pte = pte_offset_kernel(pmd, address);
338  	printk("*pte = %0*Lx ", sizeof(*pte) * 2, (u64)pte_val(*pte));
339  out:
340  	printk("\n");
341  }
342  
343  #else /* CONFIG_X86_64: */
344  
345  void vmalloc_sync_all(void)
346  {
347  	sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
348  }
349  
350  /*
351   * 64-bit:
352   *
353   *   Handle a fault on the vmalloc area
354   *
355   * This assumes no large pages in there.
356   */
357  static noinline __kprobes int vmalloc_fault(unsigned long address)
358  {
359  	pgd_t *pgd, *pgd_ref;
360  	pud_t *pud, *pud_ref;
361  	pmd_t *pmd, *pmd_ref;
362  	pte_t *pte, *pte_ref;
363  
364  	/* Make sure we are in vmalloc area: */
365  	if (!(address >= VMALLOC_START && address < VMALLOC_END))
366  		return -1;
367  
368  	WARN_ON_ONCE(in_nmi());
369  
370  	/*
371  	 * Copy kernel mappings over when needed. This can also
372  	 * happen due to a race in a page table update. In the latter
373  	 * case just flush:
374  	 */
375  	pgd = pgd_offset(current->active_mm, address);
376  	pgd_ref = pgd_offset_k(address);
377  	if (pgd_none(*pgd_ref))
378  		return -1;
379  
380  	if (pgd_none(*pgd)) {
381  		set_pgd(pgd, *pgd_ref);
382  		arch_flush_lazy_mmu_mode();
383  	} else {
384  		BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
385  	}
386  
387  	/*
388  	 * Below here mismatches are bugs because these lower tables
389  	 * are shared:
390  	 */
391  
392  	pud = pud_offset(pgd, address);
393  	pud_ref = pud_offset(pgd_ref, address);
394  	if (pud_none(*pud_ref))
395  		return -1;
396  
397  	if (pud_none(*pud) || pud_page_vaddr(*pud) != pud_page_vaddr(*pud_ref))
398  		BUG();
399  
400  	pmd = pmd_offset(pud, address);
401  	pmd_ref = pmd_offset(pud_ref, address);
402  	if (pmd_none(*pmd_ref))
403  		return -1;
404  
405  	if (pmd_none(*pmd) || pmd_page(*pmd) != pmd_page(*pmd_ref))
406  		BUG();
407  
408  	pte_ref = pte_offset_kernel(pmd_ref, address);
409  	if (!pte_present(*pte_ref))
410  		return -1;
411  
412  	pte = pte_offset_kernel(pmd, address);
413  
414  	/*
415  	 * Don't use pte_page here, because the mappings can point
416  	 * outside mem_map, and the NUMA hash lookup cannot handle
417  	 * that:
418  	 */
419  	if (!pte_present(*pte) || pte_pfn(*pte) != pte_pfn(*pte_ref))
420  		BUG();
421  
422  	return 0;
423  }
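
/*
 * In other words (illustrative): a task's PGD may lack a vmalloc-area entry
 * that init_mm.pgd already has, because new vmalloc mappings are installed
 * only in the reference page tables and synced lazily; copying the single
 * missing PGD entry is sufficient, since every level below the PGD is
 * shared by all tasks, as the mismatch checks above rely on.
 */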
424  
425  #ifdef CONFIG_CPU_SUP_AMD
426  static const char errata93_warning[] =
427  KERN_ERR
428  "******* Your BIOS seems to not contain a fix for K8 errata #93\n"
429  "******* Working around it, but it may cause SEGVs or burn power.\n"
430  "******* Please consider a BIOS update.\n"
431  "******* Disabling USB legacy in the BIOS may also help.\n";
432  #endif
433  
434  /*
435   * No vm86 mode in 64-bit mode:
436   */
437  static inline void
438  check_v8086_mode(struct pt_regs *regs, unsigned long address,
439  		 struct task_struct *tsk)
440  {
441  }
442  
443  static int bad_address(void *p)
444  {
445  	unsigned long dummy;
446  
447  	return probe_kernel_address((unsigned long *)p, dummy);
448  }
449  
450  static void dump_pagetable(unsigned long address)
451  {
452  	pgd_t *base = __va(read_cr3() & PHYSICAL_PAGE_MASK);
453  	pgd_t *pgd = base + pgd_index(address);
454  	pud_t *pud;
455  	pmd_t *pmd;
456  	pte_t *pte;
457  
458  	if (bad_address(pgd))
459  		goto bad;
460  
461  	printk("PGD %lx ", pgd_val(*pgd));
462  
463  	if (!pgd_present(*pgd))
464  		goto out;
465  
466  	pud = pud_offset(pgd, address);
467  	if (bad_address(pud))
468  		goto bad;
469  
470  	printk("PUD %lx ", pud_val(*pud));
471  	if (!pud_present(*pud) || pud_large(*pud))
472  		goto out;
473  
474  	pmd = pmd_offset(pud, address);
475  	if (bad_address(pmd))
476  		goto bad;
477  
478  	printk("PMD %lx ", pmd_val(*pmd));
479  	if (!pmd_present(*pmd) || pmd_large(*pmd))
480  		goto out;
481  
482  	pte = pte_offset_kernel(pmd, address);
483  	if (bad_address(pte))
484  		goto bad;
485  
486  	printk("PTE %lx", pte_val(*pte));
487  out:
488  	printk("\n");
489  	return;
490  bad:
491  	printk("BAD\n");
492  }
493  
494  #endif /* CONFIG_X86_64 */
495  
496  /*
497   * Workaround for K8 erratum #93 & buggy BIOS.
498   *
499   * BIOS SMM functions are required to use a specific workaround
500   * to avoid corruption of the 64bit RIP register on C stepping K8.
501   *
502   * A lot of BIOSes that didn't get tested properly miss this.
503   *
504   * The OS sees this as a page fault with the upper 32bits of RIP cleared.
505   * Try to work around it here.
506   *
507   * Note we only handle faults in kernel here.
508   * Does nothing on 32-bit.
509   */
510  static int is_errata93(struct pt_regs *regs, unsigned long address)
511  {
512  #if defined(CONFIG_X86_64) && defined(CONFIG_CPU_SUP_AMD)
513  	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD
514  	    || boot_cpu_data.x86 != 0xf)
515  		return 0;
516  
517  	if (address != regs->ip)
518  		return 0;
519  
520  	if ((address >> 32) != 0)
521  		return 0;
522  
523  	address |= 0xffffffffUL << 32;
524  	if ((address >= (u64)_stext && address <= (u64)_etext) ||
525  	    (address >= MODULES_VADDR && address <= MODULES_END)) {
526  		printk_once(errata93_warning);
527  		regs->ip = address;
528  		return 1;
529  	}
530  #endif
531  	return 0;
532  }
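
/*
 * Worked example for is_errata93() (illustrative, using a made-up address):
 * if the buggy SMM path truncates RIP from 0xffffffff81234567 to
 * 0x0000000081234567, the fault address equals regs->ip and has zero upper
 * bits; OR-ing in 0xffffffff00000000 reconstructs a candidate kernel-text
 * address, and if it falls inside _stext.._etext (or the module area) we
 * resume execution there.
 */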
533  
534  /*
535   * Work around K8 erratum #100: K8 in compat mode occasionally jumps
536   * to illegal addresses >4GB.
537   *
538   * We catch this in the page fault handler because these addresses
539   * are not reachable. Just detect this case and return.  Any code
540   * segment in LDT is compatibility mode.
541   */
542  static int is_errata100(struct pt_regs *regs, unsigned long address)
543  {
544  #ifdef CONFIG_X86_64
545  	if ((regs->cs == __USER32_CS || (regs->cs & (1<<2))) && (address >> 32))
546  		return 1;
547  #endif
548  	return 0;
549  }
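
/*
 * Note on the check above (illustrative): bit 2 of a segment selector is the
 * TI (table indicator) bit, so "regs->cs & (1<<2)" is true for any LDT-based
 * code segment, which per the comment is treated as compatibility mode; the
 * explicit __USER32_CS test covers the common GDT 32-bit user segment.
 */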
550  
551  static int is_f00f_bug(struct pt_regs *regs, unsigned long address)
552  {
553  #ifdef CONFIG_X86_F00F_BUG
554  	unsigned long nr;
555  
556  	/*
557  	 * Pentium F0 0F C7 C8 bug workaround:
558  	 */
559  	if (boot_cpu_data.f00f_bug) {
560  		nr = (address - idt_descr.address) >> 3;
561  
562  		if (nr == 6) {
563  			do_invalid_op(regs, 0);
564  			return 1;
565  		}
566  	}
567  #endif
568  	return 0;
569  }
570  
571  static const char nx_warning[] = KERN_CRIT
572  "kernel tried to execute NX-protected page - exploit attempt? (uid: %d)\n";
573  
574  static void
575  show_fault_oops(struct pt_regs *regs, unsigned long error_code,
576  		unsigned long address)
577  {
578  	if (!oops_may_print())
579  		return;
580  
581  	if (error_code & PF_INSTR) {
582  		unsigned int level;
583  
584  		pte_t *pte = lookup_address(address, &level);
585  
586  		if (pte && pte_present(*pte) && !pte_exec(*pte))
587  			printk(nx_warning, current_uid());
588  	}
589  
590  	printk(KERN_ALERT "BUG: unable to handle kernel ");
591  	if (address < PAGE_SIZE)
592  		printk(KERN_CONT "NULL pointer dereference");
593  	else
594  		printk(KERN_CONT "paging request");
595  
596  	printk(KERN_CONT " at %p\n", (void *) address);
597  	printk(KERN_ALERT "IP:");
598  	printk_address(regs->ip, 1);
599  
600  	dump_pagetable(address);
601  }
602  
603  static noinline void
604  pgtable_bad(struct pt_regs *regs, unsigned long error_code,
605  	    unsigned long address)
606  {
607  	struct task_struct *tsk;
608  	unsigned long flags;
609  	int sig;
610  
611  	flags = oops_begin();
612  	tsk = current;
613  	sig = SIGKILL;
614  
615  	printk(KERN_ALERT "%s: Corrupted page table at address %lx\n",
616  	       tsk->comm, address);
617  	dump_pagetable(address);
618  
619  	tsk->thread.cr2		= address;
620  	tsk->thread.trap_nr	= X86_TRAP_PF;
621  	tsk->thread.error_code	= error_code;
622  
623  	if (__die("Bad pagetable", regs, error_code))
624  		sig = 0;
625  
626  	oops_end(flags, regs, sig);
627  }
628  
629  static noinline void
630  no_context(struct pt_regs *regs, unsigned long error_code,
631  	   unsigned long address, int signal, int si_code)
632  {
633  	struct task_struct *tsk = current;
634  	unsigned long *stackend;
635  	unsigned long flags;
636  	int sig;
637  
638  	/* Are we prepared to handle this kernel fault? */
639  	if (fixup_exception(regs)) {
640  		if (current_thread_info()->sig_on_uaccess_error && signal) {
641  			tsk->thread.trap_nr = X86_TRAP_PF;
642  			tsk->thread.error_code = error_code | PF_USER;
643  			tsk->thread.cr2 = address;
644  
645  			/* XXX: hwpoison faults will set the wrong code. */
646  			force_sig_info_fault(signal, si_code, address, tsk, 0);
647  		}
648  		return;
649  	}
650  
651  	/*
652  	 * 32-bit:
653  	 *
654  	 *   Valid to do another page fault here, because if this fault
655  	 *   had been triggered by is_prefetch, fixup_exception would have
656  	 *   handled it.
657  	 *
658  	 * 64-bit:
659  	 *
660  	 *   Hall of shame of CPU/BIOS bugs.
661  	 */
662  	if (is_prefetch(regs, error_code, address))
663  		return;
664  
665  	if (is_errata93(regs, address))
666  		return;
667  
668  	/*
669  	 * Oops. The kernel tried to access some bad page. We'll have to
670  	 * terminate things with extreme prejudice:
671  	 */
672  	flags = oops_begin();
673  
674  	show_fault_oops(regs, error_code, address);
675  
676  	stackend = end_of_stack(tsk);
677  	if (tsk != &init_task && *stackend != STACK_END_MAGIC)
678  		printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
679  
680  	tsk->thread.cr2		= address;
681  	tsk->thread.trap_nr	= X86_TRAP_PF;
682  	tsk->thread.error_code	= error_code;
683  
684  	sig = SIGKILL;
685  	if (__die("Oops", regs, error_code))
686  		sig = 0;
687  
688  	/* Executive summary in case the body of the oops scrolled away */
689  	printk(KERN_DEFAULT "CR2: %016lx\n", address);
690  
691  	oops_end(flags, regs, sig);
692  }
693  
694  /*
695   * Print out info about fatal segfaults, if the show_unhandled_signals
696   * sysctl is set:
697   */
698  static inline void
699  show_signal_msg(struct pt_regs *regs, unsigned long error_code,
700  		unsigned long address, struct task_struct *tsk)
701  {
702  	if (!unhandled_signal(tsk, SIGSEGV))
703  		return;
704  
705  	if (!printk_ratelimit())
706  		return;
707  
708  	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
709  		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
710  		tsk->comm, task_pid_nr(tsk), address,
711  		(void *)regs->ip, (void *)regs->sp, error_code);
712  
713  	print_vma_addr(KERN_CONT " in ", regs->ip);
714  
715  	printk(KERN_CONT "\n");
716  }
717  
718  static void
719  __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
720  		       unsigned long address, int si_code)
721  {
722  	struct task_struct *tsk = current;
723  
724  	/* User mode accesses just cause a SIGSEGV */
725  	if (error_code & PF_USER) {
726  		/*
727  		 * It's possible to have interrupts off here:
728  		 */
729  		local_irq_enable();
730  
731  		/*
732  		 * Valid to do another page fault here because this one came
733  		 * from user space:
734  		 */
735  		if (is_prefetch(regs, error_code, address))
736  			return;
737  
738  		if (is_errata100(regs, address))
739  			return;
740  
741  #ifdef CONFIG_X86_64
742  		/*
743  		 * Instruction fetch faults in the vsyscall page might need
744  		 * emulation.
745  		 */
746  		if (unlikely((error_code & PF_INSTR) &&
747  			     ((address & ~0xfff) == VSYSCALL_START))) {
748  			if (emulate_vsyscall(regs, address))
749  				return;
750  		}
751  #endif
752  		/* Kernel addresses are always protection faults: */
753  		if (address >= TASK_SIZE)
754  			error_code |= PF_PROT;
755  
756  		if (likely(show_unhandled_signals))
757  			show_signal_msg(regs, error_code, address, tsk);
758  
759  		tsk->thread.cr2		= address;
760  		tsk->thread.error_code	= error_code;
761  		tsk->thread.trap_nr	= X86_TRAP_PF;
762  
763  		force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
764  
765  		return;
766  	}
767  
768  	if (is_f00f_bug(regs, address))
769  		return;
770  
771  	no_context(regs, error_code, address, SIGSEGV, si_code);
772  }
773  
774  static noinline void
775  bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
776  		     unsigned long address)
777  {
778  	__bad_area_nosemaphore(regs, error_code, address, SEGV_MAPERR);
779  }
780  
781  static void
782  __bad_area(struct pt_regs *regs, unsigned long error_code,
783  	   unsigned long address, int si_code)
784  {
785  	struct mm_struct *mm = current->mm;
786  
787  	/*
788  	 * Something tried to access memory that isn't in our memory map..
789  	 * Fix it, but check if it's kernel or user first..
790  	 */
791  	up_read(&mm->mmap_sem);
792  
793  	__bad_area_nosemaphore(regs, error_code, address, si_code);
794  }
795  
796  static noinline void
797  bad_area(struct pt_regs *regs, unsigned long error_code, unsigned long address)
798  {
799  	__bad_area(regs, error_code, address, SEGV_MAPERR);
800  }
801  
802  static noinline void
803  bad_area_access_error(struct pt_regs *regs, unsigned long error_code,
804  		      unsigned long address)
805  {
806  	__bad_area(regs, error_code, address, SEGV_ACCERR);
807  }
808  
809  /* TODO: fixup for "mm-invoke-oom-killer-from-page-fault.patch" */
810  static void
811  out_of_memory(struct pt_regs *regs, unsigned long error_code,
812  	      unsigned long address)
813  {
814  	/*
815  	 * We ran out of memory, call the OOM killer, and return to userspace
816  	 * (which will retry the fault, or kill us if we got oom-killed):
817  	 */
818  	up_read(&current->mm->mmap_sem);
819  
820  	pagefault_out_of_memory();
821  }
822  
823  static void
824  do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
825  	  unsigned int fault)
826  {
827  	struct task_struct *tsk = current;
828  	struct mm_struct *mm = tsk->mm;
829  	int code = BUS_ADRERR;
830  
831  	up_read(&mm->mmap_sem);
832  
833  	/* Kernel mode? Handle exceptions or die: */
834  	if (!(error_code & PF_USER)) {
835  		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
836  		return;
837  	}
838  
839  	/* User-space => ok to do another page fault: */
840  	if (is_prefetch(regs, error_code, address))
841  		return;
842  
843  	tsk->thread.cr2		= address;
844  	tsk->thread.error_code	= error_code;
845  	tsk->thread.trap_nr	= X86_TRAP_PF;
846  
847  #ifdef CONFIG_MEMORY_FAILURE
848  	if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
849  		printk(KERN_ERR
850  	"MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
851  			tsk->comm, tsk->pid, address);
852  		code = BUS_MCEERR_AR;
853  	}
854  #endif
855  	force_sig_info_fault(SIGBUS, code, address, tsk, fault);
856  }
857  
858  static noinline int
859  mm_fault_error(struct pt_regs *regs, unsigned long error_code,
860  	       unsigned long address, unsigned int fault)
861  {
862  	/*
863  	 * The page fault was interrupted by SIGKILL. We have no reason to
864  	 * continue handling it.
865  	 */
866  	if (fatal_signal_pending(current)) {
867  		if (!(fault & VM_FAULT_RETRY))
868  			up_read(&current->mm->mmap_sem);
869  		if (!(error_code & PF_USER))
870  			no_context(regs, error_code, address, 0, 0);
871  		return 1;
872  	}
873  	if (!(fault & VM_FAULT_ERROR))
874  		return 0;
875  
876  	if (fault & VM_FAULT_OOM) {
877  		/* Kernel mode? Handle exceptions or die: */
878  		if (!(error_code & PF_USER)) {
879  			up_read(&current->mm->mmap_sem);
880  			no_context(regs, error_code, address,
881  				   SIGSEGV, SEGV_MAPERR);
882  			return 1;
883  		}
884  
885  		out_of_memory(regs, error_code, address);
886  	} else {
887  		if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
888  			     VM_FAULT_HWPOISON_LARGE))
889  			do_sigbus(regs, error_code, address, fault);
890  		else
891  			BUG();
892  	}
893  	return 1;
894  }
895  
896  static int spurious_fault_check(unsigned long error_code, pte_t *pte)
897  {
898  	if ((error_code & PF_WRITE) && !pte_write(*pte))
899  		return 0;
900  
901  	if ((error_code & PF_INSTR) && !pte_exec(*pte))
902  		return 0;
903  
904  	return 1;
905  }
906  
907  /*
908   * Handle a spurious fault caused by a stale TLB entry.
909   *
910   * This allows us to lazily refresh the TLB when increasing the
911   * permissions of a kernel page (RO -> RW or NX -> X).  Doing it
912   * eagerly is very expensive since that implies doing a full
913   * cross-processor TLB flush, even if no stale TLB entries exist
914   * on other processors.
915   *
916   * There are no security implications to leaving a stale TLB when
917   * increasing the permissions on a page.
918   */
919  static noinline __kprobes int
920  spurious_fault(unsigned long error_code, unsigned long address)
921  {
922  	pgd_t *pgd;
923  	pud_t *pud;
924  	pmd_t *pmd;
925  	pte_t *pte;
926  	int ret;
927  
928  	/* Reserved-bit violation or user access to kernel space? */
929  	if (error_code & (PF_USER | PF_RSVD))
930  		return 0;
931  
932  	pgd = init_mm.pgd + pgd_index(address);
933  	if (!pgd_present(*pgd))
934  		return 0;
935  
936  	pud = pud_offset(pgd, address);
937  	if (!pud_present(*pud))
938  		return 0;
939  
940  	if (pud_large(*pud))
941  		return spurious_fault_check(error_code, (pte_t *) pud);
942  
943  	pmd = pmd_offset(pud, address);
944  	if (!pmd_present(*pmd))
945  		return 0;
946  
947  	if (pmd_large(*pmd))
948  		return spurious_fault_check(error_code, (pte_t *) pmd);
949  
950  	/*
951  	 * Note: don't use pte_present() here, since it returns true
952  	 * if the _PAGE_PROTNONE bit is set.  However, this aliases the
953  	 * _PAGE_GLOBAL bit, which for kernel pages gives false positives
954  	 * when CONFIG_DEBUG_PAGEALLOC is used.
955  	 */
956  	pte = pte_offset_kernel(pmd, address);
957  	if (!(pte_flags(*pte) & _PAGE_PRESENT))
958  		return 0;
959  
960  	ret = spurious_fault_check(error_code, pte);
961  	if (!ret)
962  		return 0;
963  
964  	/*
965  	 * Make sure we have permissions in PMD.
966  	 * If not, then there's a bug in the page tables:
967  	 */
968  	ret = spurious_fault_check(error_code, (pte_t *) pmd);
969  	WARN_ONCE(!ret, "PMD has incorrect permission bits\n");
970  
971  	return ret;
972  }
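
/*
 * Typical trigger (illustrative): one CPU upgrades a kernel mapping, e.g.
 * via set_memory_rw(), without flushing other CPUs' TLBs; another CPU that
 * still holds the stale read-only entry then write-faults on that page.
 * The walk above finds that the current page tables do permit the access,
 * the fault is reported as spurious, and the faulting instruction is simply
 * restarted.
 */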
973  
974  int show_unhandled_signals = 1;
975  
976  static inline int
977  access_error(unsigned long error_code, struct vm_area_struct *vma)
978  {
979  	if (error_code & PF_WRITE) {
980  		/* write, present and write, not present: */
981  		if (unlikely(!(vma->vm_flags & VM_WRITE)))
982  			return 1;
983  		return 0;
984  	}
985  
986  	/* read, present: */
987  	if (unlikely(error_code & PF_PROT))
988  		return 1;
989  
990  	/* read, not present: */
991  	if (unlikely(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))))
992  		return 1;
993  
994  	return 0;
995  }
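
/*
 * Examples (illustrative): a PF_WRITE fault against a read-only VMA returns
 * 1 and ends in SIGSEGV/SEGV_ACCERR via bad_area_access_error(); a plain
 * read fault (no PF_WRITE, no PF_PROT) on a PROT_NONE VMA also returns 1;
 * an ordinary read of a not-yet-populated read-write mapping returns 0 and
 * proceeds to handle_mm_fault().
 */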
996  
997  static int fault_in_kernel_space(unsigned long address)
998  {
999  	return address >= TASK_SIZE_MAX;
1000  }
1001  
1002  /*
1003   * This routine handles page faults.  It determines the address,
1004   * and the problem, and then passes it off to one of the appropriate
1005   * routines.
1006   */
1007  dotraplinkage void __kprobes
1008  do_page_fault(struct pt_regs *regs, unsigned long error_code)
1009  {
1010  	struct vm_area_struct *vma;
1011  	struct task_struct *tsk;
1012  	unsigned long address;
1013  	struct mm_struct *mm;
1014  	int fault;
1015  	int write = error_code & PF_WRITE;
1016  	unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
1017  					(write ? FAULT_FLAG_WRITE : 0);
1018  
1019  	tsk = current;
1020  	mm = tsk->mm;
1021  
1022  	/* Get the faulting address: */
1023  	address = read_cr2();
1024  
1025  	/*
1026  	 * Detect and handle instructions that would cause a page fault for
1027  	 * both a tracked kernel page and a userspace page.
1028  	 */
1029  	if (kmemcheck_active(regs))
1030  		kmemcheck_hide(regs);
1031  	prefetchw(&mm->mmap_sem);
1032  
1033  	if (unlikely(kmmio_fault(regs, address)))
1034  		return;
1035  
1036  	/*
1037  	 * We fault-in kernel-space virtual memory on-demand. The
1038  	 * 'reference' page table is init_mm.pgd.
1039  	 *
1040  	 * NOTE! We MUST NOT take any locks for this case. We may
1041  	 * be in an interrupt or a critical region, and should
1042  	 * only copy the information from the master page table,
1043  	 * nothing more.
1044  	 *
1045  	 * This verifies that the fault happens in kernel space
1046  	 * (error_code & PF_USER) == 0, and that the fault was not a
1047  	 * protection or reserved-bit error, (error_code & (PF_PROT | PF_RSVD)) == 0.
1048  	 */
1049  	if (unlikely(fault_in_kernel_space(address))) {
1050  		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
1051  			if (vmalloc_fault(address) >= 0)
1052  				return;
1053  
1054  			if (kmemcheck_fault(regs, address, error_code))
1055  				return;
1056  		}
1057  
1058  		/* Can handle a stale RO->RW TLB: */
1059  		if (spurious_fault(error_code, address))
1060  			return;
1061  
1062  		/* kprobes don't want to hook the spurious faults: */
1063  		if (notify_page_fault(regs))
1064  			return;
1065  		/*
1066  		 * Don't take the mm semaphore here. If we fixup a prefetch
1067  		 * fault we could otherwise deadlock:
1068  		 */
1069  		bad_area_nosemaphore(regs, error_code, address);
1070  
1071  		return;
1072  	}
1073  
1074  	/* kprobes don't want to hook the spurious faults: */
1075  	if (unlikely(notify_page_fault(regs)))
1076  		return;
1077  	/*
1078  	 * It's safe to allow irq's after cr2 has been saved and the
1079  	 * vmalloc fault has been handled.
1080  	 *
1081  	 * User-mode registers count as a user access even for any
1082  	 * potential system fault or CPU buglet:
1083  	 */
1084  	if (user_mode_vm(regs)) {
1085  		local_irq_enable();
1086  		error_code |= PF_USER;
1087  	} else {
1088  		if (regs->flags & X86_EFLAGS_IF)
1089  			local_irq_enable();
1090  	}
1091  
1092  	if (unlikely(error_code & PF_RSVD))
1093  		pgtable_bad(regs, error_code, address);
1094  
1095  	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
1096  
1097  	/*
1098  	 * If we're in an interrupt, have no user context or are running
1099  	 * in an atomic region then we must not take the fault:
1100  	 */
1101  	if (unlikely(in_atomic() || !mm)) {
1102  		bad_area_nosemaphore(regs, error_code, address);
1103  		return;
1104  	}
1105  
1106  	/*
1107  	 * When running in the kernel we expect faults to occur only to
1108  	 * addresses in user space.  All other faults represent errors in
1109  	 * the kernel and should generate an OOPS.  Unfortunately, in the
1110  	 * case of an erroneous fault occurring in a code path which already
1111  	 * holds mmap_sem we will deadlock attempting to validate the fault
1112  	 * against the address space.  Luckily the kernel only validly
1113  	 * references user space from well defined areas of code, which are
1114  	 * listed in the exceptions table.
1115  	 *
1116  	 * As the vast majority of faults will be valid we will only perform
1117  	 * the source reference check when there is a possibility of a
1118  	 * deadlock. Attempt to lock the address space, if we cannot we then
1119  	 * validate the source. If this is invalid we can skip the address
1120  	 * space check, thus avoiding the deadlock:
1121  	 */
1122  	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
1123  		if ((error_code & PF_USER) == 0 &&
1124  		    !search_exception_tables(regs->ip)) {
1125  			bad_area_nosemaphore(regs, error_code, address);
1126  			return;
1127  		}
1128  retry:
1129  		down_read(&mm->mmap_sem);
1130  	} else {
1131  		/*
1132  		 * The above down_read_trylock() might have succeeded in
1133  		 * which case we'll have missed the might_sleep() from
1134  		 * down_read():
1135  		 */
1136  		might_sleep();
1137  	}
1138  
1139  	vma = find_vma(mm, address);
1140  	if (unlikely(!vma)) {
1141  		bad_area(regs, error_code, address);
1142  		return;
1143  	}
1144  	if (likely(vma->vm_start <= address))
1145  		goto good_area;
1146  	if (unlikely(!(vma->vm_flags & VM_GROWSDOWN))) {
1147  		bad_area(regs, error_code, address);
1148  		return;
1149  	}
1150  	if (error_code & PF_USER) {
1151  		/*
1152  		 * Accessing the stack below %sp is always a bug.
1153  		 * The large cushion allows instructions like enter
1154  		 * and pusha to work. ("enter $65535, $31" pushes
1155  		 * 32 pointers and then decrements %sp by 65535.)
1156  		 */
1157  		if (unlikely(address + 65536 + 32 * sizeof(unsigned long) < regs->sp)) {
1158  			bad_area(regs, error_code, address);
1159  			return;
1160  		}
1161  	}
1162  	if (unlikely(expand_stack(vma, address))) {
1163  		bad_area(regs, error_code, address);
1164  		return;
1165  	}
1166  
1167  	/*
1168  	 * Ok, we have a good vm_area for this memory access, so
1169  	 * we can handle it..
1170  	 */
1171  good_area:
1172  	if (unlikely(access_error(error_code, vma))) {
1173  		bad_area_access_error(regs, error_code, address);
1174  		return;
1175  	}
1176  
1177  	/*
1178  	 * If for any reason at all we couldn't handle the fault,
1179  	 * make sure we exit gracefully rather than endlessly redo
1180  	 * the fault:
1181  	 */
1182  	fault = handle_mm_fault(mm, vma, address, flags);
1183  
1184  	if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1185  		if (mm_fault_error(regs, error_code, address, fault))
1186  			return;
1187  	}
1188  
1189  	/*
1190  	 * Major/minor page fault accounting is only done on the
1191  	 * initial attempt. If we go through a retry, it is extremely
1192  	 * likely that the page will be found in page cache at that point.
1193  	 */
1194  	if (flags & FAULT_FLAG_ALLOW_RETRY) {
1195  		if (fault & VM_FAULT_MAJOR) {
1196  			tsk->maj_flt++;
1197  			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
1198  				      regs, address);
1199  		} else {
1200  			tsk->min_flt++;
1201  			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
1202  				      regs, address);
1203  		}
1204  		if (fault & VM_FAULT_RETRY) {
1205  			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1206  			 * of starvation. */
1207  			flags &= ~FAULT_FLAG_ALLOW_RETRY;
1208  			goto retry;
1209  		}
1210  	}
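
	/*
	 * Retry flow (illustrative): the first attempt runs with
	 * FAULT_FLAG_ALLOW_RETRY set, so handle_mm_fault() may drop mmap_sem
	 * (e.g. while waiting for page I/O) and return VM_FAULT_RETRY; we
	 * then clear ALLOW_RETRY and jump back to "retry:", so the second
	 * attempt blocks inside handle_mm_fault() instead of looping.
	 */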
1211  
1212  	check_v8086_mode(regs, address, tsk);
1213  
1214  	up_read(&mm->mmap_sem);
1215  }
1216