// SPDX-License-Identifier: GPL-2.0
/*
 *  S390 version
 *    Copyright IBM Corp. 1999
 *    Author(s): Hartmut Penner (hp@de.ibm.com)
 *               Ulrich Weigand (uweigand@de.ibm.com)
 *
 *  Derived from "arch/i386/mm/fault.c"
 *    Copyright (C) 1995  Linus Torvalds
 */

#include <linux/kernel_stat.h>
#include <linux/perf_event.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/debug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/ptrace.h>
#include <linux/mman.h>
#include <linux/mm.h>
#include <linux/compat.h>
#include <linux/smp.h>
#include <linux/kdebug.h>
#include <linux/init.h>
#include <linux/console.h>
#include <linux/extable.h>
#include <linux/hardirq.h>
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/asm-offsets.h>
#include <asm/diag.h>
#include <asm/pgtable.h>
#include <asm/gmap.h>
#include <asm/irq.h>
#include <asm/mmu_context.h>
#include <asm/facility.h>
#include "../kernel/entry.h"

#define __FAIL_ADDR_MASK -4096L
#define __SUBCODE_MASK 0x0600
#define __PF_RES_FIELD 0x8000000000000000ULL

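/*
 * Architecture-private fault codes, returned by do_exception() and
 * interpreted by do_fault_error().  They live in the high bits,
 * presumably so that they cannot collide with the generic VM_FAULT_*
 * flags set by handle_mm_fault().
 */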
#define VM_FAULT_BADCONTEXT	0x010000
#define VM_FAULT_BADMAP		0x020000
#define VM_FAULT_BADACCESS	0x040000
#define VM_FAULT_SIGNAL		0x080000
#define VM_FAULT_PFAULT		0x100000

enum fault_type {
	KERNEL_FAULT,
	USER_FAULT,
	VDSO_FAULT,
	GMAP_FAULT,
};

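/*
 * Mask of the fetch/store-indication bits in the translation-exception
 * identification (TEID).  fault_init() below sets it only if facility 75
 * (access-exception fetch/store indication) is installed; with the mask
 * in place, do_exception() treats a masked TEID value of 0x400 as a
 * store access and sets FAULT_FLAG_WRITE.
 */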
static unsigned long store_indication __read_mostly;

static int __init fault_init(void)
{
	if (test_facility(75))
		store_indication = 0xc00;
	return 0;
}
early_initcall(fault_init);

static inline int notify_page_fault(struct pt_regs *regs)
{
	int ret = 0;

	/* kprobe_running() needs smp_processor_id() */
	if (kprobes_built_in() && !user_mode(regs)) {
		preempt_disable();
		if (kprobe_running() && kprobe_fault_handler(regs, 14))
			ret = 1;
		preempt_enable();
	}
	return ret;
}


/*
 * Unlock any spinlocks which will prevent us from getting the
 * message out.
 */
void bust_spinlocks(int yes)
{
	if (yes) {
		oops_in_progress = 1;
	} else {
		int loglevel_save = console_loglevel;
		console_unblank();
		oops_in_progress = 0;
		/*
		 * OK, the message is on the console.  Now we call printk()
		 * without oops_in_progress set so that printk will give klogd
		 * a poke.  Hold onto your hats...
		 */
		console_loglevel = 15;
		printk(" ");
		console_loglevel = loglevel_save;
	}
}

/*
 * Find out which address space caused the exception.
 */
static inline enum fault_type get_fault_type(struct pt_regs *regs)
{
	unsigned long trans_exc_code;

	trans_exc_code = regs->int_parm_long & 3;
	if (likely(trans_exc_code == 0)) {
		/* primary space exception */
		if (IS_ENABLED(CONFIG_PGSTE) &&
		    test_pt_regs_flag(regs, PIF_GUEST_FAULT))
			return GMAP_FAULT;
		if (current->thread.mm_segment == USER_DS)
			return USER_FAULT;
		return KERNEL_FAULT;
	}
	if (trans_exc_code == 2) {
		/* secondary space exception */
		if (current->thread.mm_segment & 1) {
			if (current->thread.mm_segment == USER_DS_SACF)
				return USER_FAULT;
			return KERNEL_FAULT;
		}
		return VDSO_FAULT;
	}
	if (trans_exc_code == 1) {
		/* access register mode, not used in the kernel */
		return USER_FAULT;
	}
	/* home space exception -> access via kernel ASCE */
	return KERNEL_FAULT;
}

static int bad_address(void *p)
{
	unsigned long dummy;

	return probe_kernel_address((unsigned long *)p, dummy);
}

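/*
 * Walk the page table for @address starting from @asce and print one
 * entry per translation level (region-first down to the page table),
 * stopping early at an invalid or large entry.
 */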
static void dump_pagetable(unsigned long asce, unsigned long address)
{
	unsigned long *table = __va(asce & _ASCE_ORIGIN);

	pr_alert("AS:%016lx ", asce);
	switch (asce & _ASCE_TYPE_MASK) {
	case _ASCE_TYPE_REGION1:
		table += (address & _REGION1_INDEX) >> _REGION1_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R1:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		/* fallthrough */
	case _ASCE_TYPE_REGION2:
		table += (address & _REGION2_INDEX) >> _REGION2_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R2:%016lx ", *table);
		if (*table & _REGION_ENTRY_INVALID)
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		/* fallthrough */
	case _ASCE_TYPE_REGION3:
		table += (address & _REGION3_INDEX) >> _REGION3_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("R3:%016lx ", *table);
		if (*table & (_REGION_ENTRY_INVALID | _REGION3_ENTRY_LARGE))
			goto out;
		table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
		/* fallthrough */
	case _ASCE_TYPE_SEGMENT:
		table += (address & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
		if (bad_address(table))
			goto bad;
		pr_cont("S:%016lx ", *table);
		if (*table & (_SEGMENT_ENTRY_INVALID | _SEGMENT_ENTRY_LARGE))
			goto out;
		table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
	}
	table += (address & _PAGE_INDEX) >> _PAGE_SHIFT;
	if (bad_address(table))
		goto bad;
	pr_cont("P:%016lx ", *table);
out:
	pr_cont("\n");
	return;
bad:
	pr_cont("BAD\n");
}

static void dump_fault_info(struct pt_regs *regs)
{
	unsigned long asce;

	pr_alert("Failing address: %016lx TEID: %016lx\n",
		 regs->int_parm_long & __FAIL_ADDR_MASK, regs->int_parm_long);
	pr_alert("Fault in ");
	switch (regs->int_parm_long & 3) {
	case 3:
		pr_cont("home space ");
		break;
	case 2:
		pr_cont("secondary space ");
		break;
	case 1:
		pr_cont("access register ");
		break;
	case 0:
		pr_cont("primary space ");
		break;
	}
	pr_cont("mode while using ");
	switch (get_fault_type(regs)) {
	case USER_FAULT:
		asce = S390_lowcore.user_asce;
		pr_cont("user ");
		break;
	case VDSO_FAULT:
		asce = S390_lowcore.vdso_asce;
		pr_cont("vdso ");
		break;
	case GMAP_FAULT:
		asce = ((struct gmap *) S390_lowcore.gmap)->asce;
		pr_cont("gmap ");
		break;
	case KERNEL_FAULT:
		asce = S390_lowcore.kernel_asce;
		pr_cont("kernel ");
		break;
	}
	pr_cont("ASCE.\n");
	dump_pagetable(asce, regs->int_parm_long & __FAIL_ADDR_MASK);
}

int show_unhandled_signals = 1;

void report_user_fault(struct pt_regs *regs, long signr, int is_mm_fault)
{
	if ((task_pid_nr(current) > 1) && !show_unhandled_signals)
		return;
	if (!unhandled_signal(current, signr))
		return;
	if (!printk_ratelimit())
		return;
	printk(KERN_ALERT "User process fault: interruption code %04x ilc:%d ",
	       regs->int_code & 0xffff, regs->int_code >> 17);
	print_vma_addr(KERN_CONT "in ", regs->psw.addr);
	printk(KERN_CONT "\n");
	if (is_mm_fault)
		dump_fault_info(regs);
	show_regs(regs);
}

/*
 * Send SIGSEGV to task.  This is an external routine
 * to keep the stack usage of do_page_fault small.
 */
static noinline void do_sigsegv(struct pt_regs *regs, int si_code)
{
	report_user_fault(regs, SIGSEGV, 1);
	force_sig_fault(SIGSEGV, si_code,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
			current);
}

static noinline void do_no_context(struct pt_regs *regs)
{
	const struct exception_table_entry *fixup;

	/* Are we prepared to handle this kernel fault?  */
	fixup = search_exception_tables(regs->psw.addr);
	if (fixup) {
		regs->psw.addr = extable_fixup(fixup);
		return;
	}

	/*
	 * Oops. The kernel tried to access some bad page. We'll have to
	 * terminate things with extreme prejudice.
	 */
	if (get_fault_type(regs) == KERNEL_FAULT)
		printk(KERN_ALERT "Unable to handle kernel pointer dereference"
		       " in virtual kernel address space\n");
	else
		printk(KERN_ALERT "Unable to handle kernel paging request"
		       " in virtual user address space\n");
	dump_fault_info(regs);
	die(regs, "Oops");
	do_exit(SIGKILL);
}

static noinline void do_low_address(struct pt_regs *regs)
{
	/* Low-address protection hit in kernel mode means
	   NULL pointer write access in kernel mode.  */
	if (regs->psw.mask & PSW_MASK_PSTATE) {
		/* Low-address protection hit in user mode 'cannot happen'. */
		die (regs, "Low-address protection");
		do_exit(SIGKILL);
	}

	do_no_context(regs);
}

static noinline void do_sigbus(struct pt_regs *regs)
{
	/*
	 * Send a sigbus, regardless of whether we were in kernel
	 * or user mode.
	 */
	force_sig_fault(SIGBUS, BUS_ADRERR,
			(void __user *)(regs->int_parm_long & __FAIL_ADDR_MASK),
			current);
}

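/*
 * Check whether the faulting "instruction" is one of the two svc opcodes
 * used by the signal return trampoline: 0x0a77 is "svc 119" (sigreturn),
 * 0x0aad is "svc 173" (rt_sigreturn).  If so, re-issue the corresponding
 * system call instead of failing the fault.  do_fault_error() only takes
 * this path for VM_EXEC faults, i.e. when the trampoline sits on a
 * non-executable page.
 */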
static noinline int signal_return(struct pt_regs *regs)
{
	u16 instruction;
	int rc;

	rc = __get_user(instruction, (u16 __user *) regs->psw.addr);
	if (rc)
		return rc;
	if (instruction == 0x0a77) {
		set_pt_regs_flag(regs, PIF_SYSCALL);
		regs->int_code = 0x00040077;
		return 0;
	} else if (instruction == 0x0aad) {
		set_pt_regs_flag(regs, PIF_SYSCALL);
		regs->int_code = 0x000400ad;
		return 0;
	}
	return -EACCES;
}

static noinline void do_fault_error(struct pt_regs *regs, int access,
					vm_fault_t fault)
{
	int si_code;

	switch (fault) {
	case VM_FAULT_BADACCESS:
		if (access == VM_EXEC && signal_return(regs) == 0)
			break;
	case VM_FAULT_BADMAP:
		/* Bad memory access. Check if it is kernel or user space. */
		if (user_mode(regs)) {
			/* User mode accesses just cause a SIGSEGV */
			si_code = (fault == VM_FAULT_BADMAP) ?
				SEGV_MAPERR : SEGV_ACCERR;
			do_sigsegv(regs, si_code);
			break;
		}
	case VM_FAULT_BADCONTEXT:
	case VM_FAULT_PFAULT:
		do_no_context(regs);
		break;
	case VM_FAULT_SIGNAL:
		if (!user_mode(regs))
			do_no_context(regs);
		break;
	default: /* fault & VM_FAULT_ERROR */
		if (fault & VM_FAULT_OOM) {
			if (!user_mode(regs))
				do_no_context(regs);
			else
				pagefault_out_of_memory();
		} else if (fault & VM_FAULT_SIGSEGV) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigsegv(regs, SEGV_MAPERR);
		} else if (fault & VM_FAULT_SIGBUS) {
			/* Kernel mode? Handle exceptions or die */
			if (!user_mode(regs))
				do_no_context(regs);
			else
				do_sigbus(regs);
		} else
			BUG();
		break;
	}
}

/*
 * This routine handles page faults.  It determines the address,
 * and the problem, and then passes it off to one of the appropriate
 * routines.
 *
 * interruption code (int_code):
 *   04       Protection           ->  Write-Protection  (suppression)
 *   10       Segment translation  ->  Not present       (nullification)
 *   11       Page translation     ->  Not present       (nullification)
 *   3b       Region third trans.  ->  Not present       (nullification)
 */
static inline vm_fault_t do_exception(struct pt_regs *regs, int access)
{
	struct gmap *gmap;
	struct task_struct *tsk;
	struct mm_struct *mm;
	struct vm_area_struct *vma;
	enum fault_type type;
	unsigned long trans_exc_code;
	unsigned long address;
	unsigned int flags;
	vm_fault_t fault;

	tsk = current;
	/*
	 * The instruction that caused the program check has
	 * been nullified. Don't signal single step via SIGTRAP.
	 */
	clear_pt_regs_flag(regs, PIF_PER_TRAP);

	if (notify_page_fault(regs))
		return 0;

	mm = tsk->mm;
	trans_exc_code = regs->int_parm_long;

	/*
	 * Verify that the fault happened in user space, that
	 * we are not in an interrupt and that there is a
	 * user context.
	 */
	fault = VM_FAULT_BADCONTEXT;
	type = get_fault_type(regs);
	switch (type) {
	case KERNEL_FAULT:
		goto out;
	case VDSO_FAULT:
		fault = VM_FAULT_BADMAP;
		goto out;
	case USER_FAULT:
	case GMAP_FAULT:
		if (faulthandler_disabled() || !mm)
			goto out;
		break;
	}

	address = trans_exc_code & __FAIL_ADDR_MASK;
	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
	flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
	if (user_mode(regs))
		flags |= FAULT_FLAG_USER;
	if (access == VM_WRITE || (trans_exc_code & store_indication) == 0x400)
		flags |= FAULT_FLAG_WRITE;
	down_read(&mm->mmap_sem);

	gmap = NULL;
	if (IS_ENABLED(CONFIG_PGSTE) && type == GMAP_FAULT) {
		gmap = (struct gmap *) S390_lowcore.gmap;
		current->thread.gmap_addr = address;
		current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
		current->thread.gmap_int_code = regs->int_code & 0xffff;
		address = __gmap_translate(gmap, address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (gmap->pfault_enabled)
			flags |= FAULT_FLAG_RETRY_NOWAIT;
	}

retry:
	fault = VM_FAULT_BADMAP;
	vma = find_vma(mm, address);
	if (!vma)
		goto out_up;

	if (unlikely(vma->vm_start > address)) {
		if (!(vma->vm_flags & VM_GROWSDOWN))
			goto out_up;
		if (expand_stack(vma, address))
			goto out_up;
	}

	/*
	 * Ok, we have a good vm_area for this memory access, so
	 * we can handle it..
	 */
	fault = VM_FAULT_BADACCESS;
	if (unlikely(!(vma->vm_flags & access)))
		goto out_up;

	if (is_vm_hugetlb_page(vma))
		address &= HPAGE_MASK;
	/*
	 * If for any reason at all we couldn't handle the fault,
	 * make sure we exit gracefully rather than endlessly redo
	 * the fault.
	 */
	fault = handle_mm_fault(vma, address, flags);
	/* No reason to continue if interrupted by SIGKILL. */
	if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) {
		fault = VM_FAULT_SIGNAL;
		if (flags & FAULT_FLAG_RETRY_NOWAIT)
			goto out_up;
		goto out;
	}
	if (unlikely(fault & VM_FAULT_ERROR))
		goto out_up;

	/*
	 * Major/minor page fault accounting is only done on the
	 * initial attempt. If we go through a retry, it is extremely
	 * likely that the page will be found in page cache at that point.
	 */
	if (flags & FAULT_FLAG_ALLOW_RETRY) {
		if (fault & VM_FAULT_MAJOR) {
			tsk->maj_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1,
				      regs, address);
		} else {
			tsk->min_flt++;
			perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1,
				      regs, address);
		}
		if (fault & VM_FAULT_RETRY) {
			if (IS_ENABLED(CONFIG_PGSTE) && gmap &&
			    (flags & FAULT_FLAG_RETRY_NOWAIT)) {
				/* FAULT_FLAG_RETRY_NOWAIT has been set,
				 * mmap_sem has not been released */
				current->thread.gmap_pfault = 1;
				fault = VM_FAULT_PFAULT;
				goto out_up;
			}
			/* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
			 * of starvation. */
			flags &= ~(FAULT_FLAG_ALLOW_RETRY |
				   FAULT_FLAG_RETRY_NOWAIT);
			flags |= FAULT_FLAG_TRIED;
			down_read(&mm->mmap_sem);
			goto retry;
		}
	}
	if (IS_ENABLED(CONFIG_PGSTE) && gmap) {
		address =  __gmap_link(gmap, current->thread.gmap_addr,
				       address);
		if (address == -EFAULT) {
			fault = VM_FAULT_BADMAP;
			goto out_up;
		}
		if (address == -ENOMEM) {
			fault = VM_FAULT_OOM;
			goto out_up;
		}
	}
	fault = 0;
out_up:
	up_read(&mm->mmap_sem);
out:
	return fault;
}

void do_protection_exception(struct pt_regs *regs)
{
	unsigned long trans_exc_code;
	int access;
	vm_fault_t fault;

	trans_exc_code = regs->int_parm_long;
	/*
	 * Protection exceptions are suppressing, decrement psw address.
	 * The exception to this rule are aborted transactions, for these
	 * the PSW already points to the correct location.
	 */
	if (!(regs->int_code & 0x200))
		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
	/*
	 * Check for low-address protection.  This needs to be treated
	 * as a special case because the translation exception code
	 * field is not guaranteed to contain valid data in this case.
	 */
	if (unlikely(!(trans_exc_code & 4))) {
		do_low_address(regs);
		return;
	}
	if (unlikely(MACHINE_HAS_NX && (trans_exc_code & 0x80))) {
		regs->int_parm_long = (trans_exc_code & ~PAGE_MASK) |
					(regs->psw.addr & PAGE_MASK);
		access = VM_EXEC;
		fault = VM_FAULT_BADACCESS;
	} else {
		access = VM_WRITE;
		fault = do_exception(regs, access);
	}
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_protection_exception);

void do_dat_exception(struct pt_regs *regs)
{
	int access;
	vm_fault_t fault;

	access = VM_READ | VM_EXEC | VM_WRITE;
	fault = do_exception(regs, access);
	if (unlikely(fault))
		do_fault_error(regs, access, fault);
}
NOKPROBE_SYMBOL(do_dat_exception);

#ifdef CONFIG_PFAULT
/*
 * 'pfault' pseudo page faults routines.
 */
static int pfault_disable;

static int __init nopfault(char *str)
{
	pfault_disable = 1;
	return 1;
}

__setup("nopfault", nopfault);

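/*
 * Parameter block for DIAGNOSE 0x258, the pfault handshake with the
 * hypervisor.  reffcode selects the function: pfault_init() uses 0 to
 * establish pfault handling, pfault_fini() uses 1 to cancel it.
 * refgaddr apparently designates where the pfault token is taken from;
 * here it is the lowcore LPP field, which is why pfault_interrupt()
 * can recover the pid of the affected task via LPP_PID_MASK.
 */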
struct pfault_refbk {
	u16 refdiagc;
	u16 reffcode;
	u16 refdwlen;
	u16 refversn;
	u64 refgaddr;
	u64 refselmk;
	u64 refcmpmk;
	u64 reserved;
} __attribute__ ((packed, aligned(8)));

int pfault_init(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 0,
		.refdwlen = 5,
		.refversn = 2,
		.refgaddr = __LC_LPP,
		.refselmk = 1ULL << 48,
		.refcmpmk = 1ULL << 48,
		.reserved = __PF_RES_FIELD };
	int rc;

	if (pfault_disable)
		return -1;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%1,%0,0x258\n"
		"0:	j	2f\n"
		"1:	la	%0,8\n"
		"2:\n"
		EX_TABLE(0b,1b)
		: "=d" (rc) : "a" (&refbk), "m" (refbk) : "cc");
	return rc;
}

void pfault_fini(void)
{
	struct pfault_refbk refbk = {
		.refdiagc = 0x258,
		.reffcode = 1,
		.refdwlen = 5,
		.refversn = 2,
	};

	if (pfault_disable)
		return;
	diag_stat_inc(DIAG_STAT_X258);
	asm volatile(
		"	diag	%0,0,0x258\n"
		"0:	nopr	%%r7\n"
		EX_TABLE(0b,0b)
		: : "a" (&refbk), "m" (refbk) : "cc");
}

static DEFINE_SPINLOCK(pfault_lock);
static LIST_HEAD(pfault_list);

#define PF_COMPLETE	0x0080

/*
 * The mechanism of our pfault code: if Linux is running as a guest, runs a
 * user space process, and that process accesses a page that the host has
 * paged out, we get a pfault interrupt.
 *
 * This allows us, within the guest, to schedule a different process. Without
 * this mechanism the host would have to suspend the whole virtual cpu until
 * the page has been paged in.
 *
 * So when we get such an interrupt we set the state of the current task
 * to uninterruptible and also set the need_resched flag. Both happen within
 * interrupt context(!). If we later on want to return to user space we
 * recognize the need_resched flag and then call schedule().  It's not very
 * obvious how this works...
 *
 * Of course we have a lot of additional fun with the completion interrupt
 * (-> the host signals that a page of a process has been paged in and the
 * process can continue to run). This interrupt can arrive on any cpu and,
 * since we have virtual cpus, can actually appear before the interrupt that
 * signals that a page is missing.
 */
static void pfault_interrupt(struct ext_code ext_code,
			     unsigned int param32, unsigned long param64)
{
	struct task_struct *tsk;
	__u16 subcode;
	pid_t pid;

	/*
	 * Get the external interruption subcode & pfault initial/completion
	 * signal bit. VM stores this in the 'cpu address' field associated
	 * with the external interrupt.
	 */
	subcode = ext_code.subcode;
	if ((subcode & 0xff00) != __SUBCODE_MASK)
		return;
	inc_irq_stat(IRQEXT_PFL);
	/* Get the token (= pid of the affected task). */
	pid = param64 & LPP_PID_MASK;
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	if (tsk)
		get_task_struct(tsk);
	rcu_read_unlock();
	if (!tsk)
		return;
	spin_lock(&pfault_lock);
	if (subcode & PF_COMPLETE) {
		/* signal bit is set -> a page has been swapped in by VM */
		if (tsk->thread.pfault_wait == 1) {
			/* Initial interrupt was faster than the completion
			 * interrupt. pfault_wait is valid. Set pfault_wait
			 * back to zero and wake up the process. This can
			 * safely be done because the task is still sleeping
			 * and can't produce new pfaults. */
			tsk->thread.pfault_wait = 0;
			list_del(&tsk->thread.list);
			wake_up_process(tsk);
			put_task_struct(tsk);
		} else {
			/* Completion interrupt was faster than initial
			 * interrupt. Set pfault_wait to -1 so the initial
			 * interrupt doesn't put the task to sleep.
			 * If the task is not running, ignore the completion
			 * interrupt since it must be a leftover of a PFAULT
			 * CANCEL operation which didn't remove all pending
			 * completion interrupts. */
			if (tsk->state == TASK_RUNNING)
				tsk->thread.pfault_wait = -1;
		}
	} else {
		/* signal bit not set -> a real page is missing. */
		if (WARN_ON_ONCE(tsk != current))
			goto out;
		if (tsk->thread.pfault_wait == 1) {
			/* Already on the list with a reference: put to sleep */
			goto block;
		} else if (tsk->thread.pfault_wait == -1) {
			/* Completion interrupt was faster than the initial
			 * interrupt (pfault_wait == -1). Set pfault_wait
			 * back to zero and exit. */
			tsk->thread.pfault_wait = 0;
		} else {
			/* Initial interrupt arrived before completion
			 * interrupt. Let the task sleep.
			 * An extra task reference is needed since a different
			 * cpu may set the task state to TASK_RUNNING again
			 * before the scheduler is reached. */
			get_task_struct(tsk);
			tsk->thread.pfault_wait = 1;
			list_add(&tsk->thread.list, &pfault_list);
block:
			/* Since this must be a userspace fault, there
			 * is no kernel task state to trample. Rely on the
			 * return to userspace schedule() to block. */
			__set_current_state(TASK_UNINTERRUPTIBLE);
			set_tsk_need_resched(tsk);
			set_preempt_need_resched();
		}
	}
out:
	spin_unlock(&pfault_lock);
	put_task_struct(tsk);
}

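/*
 * CPU hot-unplug callback: wake up and drop the reference on every task
 * still parked on pfault_list, so no task is left sleeping
 * uninterruptibly for a completion interrupt that may never arrive.
 */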
static int pfault_cpu_dead(unsigned int cpu)
{
	struct thread_struct *thread, *next;
	struct task_struct *tsk;

	spin_lock_irq(&pfault_lock);
	list_for_each_entry_safe(thread, next, &pfault_list, list) {
		thread->pfault_wait = 0;
		list_del(&thread->list);
		tsk = container_of(thread, struct task_struct, thread);
		wake_up_process(tsk);
		put_task_struct(tsk);
	}
	spin_unlock_irq(&pfault_lock);
	return 0;
}

static int __init pfault_irq_init(void)
{
	int rc;

	rc = register_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
	if (rc)
		goto out_extint;
	rc = pfault_init() == 0 ? 0 : -EOPNOTSUPP;
	if (rc)
		goto out_pfault;
	irq_subclass_register(IRQ_SUBCLASS_SERVICE_SIGNAL);
	cpuhp_setup_state_nocalls(CPUHP_S390_PFAULT_DEAD, "s390/pfault:dead",
				  NULL, pfault_cpu_dead);
	return 0;

out_pfault:
	unregister_external_irq(EXT_IRQ_CP_SERVICE, pfault_interrupt);
out_extint:
	pfault_disable = 1;
	return rc;
}
early_initcall(pfault_irq_init);

#endif /* CONFIG_PFAULT */