1/*
2 *  linux/arch/x86_64/entry.S
3 *
4 *  Copyright (C) 1991, 1992  Linus Torvalds
5 *  Copyright (C) 2000, 2001, 2002  Andi Kleen SuSE Labs
6 *  Copyright (C) 2000  Pavel Machek <pavel@suse.cz>
7 */
8
9/*
10 * entry.S contains the system-call and fault low-level handling routines.
11 *
12 * NOTE: This code handles signal recognition, which happens after every
13 * interrupt and after each system call.
14 *
15 * Normal syscalls and interrupts don't save a full stack frame; this is
16 * only done for syscall tracing, signals or fork/exec et al.
17 *
18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all registers saved.
23 *
24 * Some macro usage:
25 * - CFI macros are used to generate dwarf2 unwind information for better
26 * backtraces. They don't change any code.
27 * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
28 * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
29 * There are unfortunately lots of special cases where some registers are
30 * not touched. The macro is a big mess that should be cleaned up.
31 * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
32 * Gives a full stack frame.
33 * - ENTRY/END - Define functions in the symbol table.
34 * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
35 * frame that is otherwise undefined after a SYSCALL
36 * - TRACE_IRQS_* - Trace hard interrupt state for lock debugging.
37 * - errorentry/paranoidentry/zeroentry - Define exception entry points.
38 */
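/*
 * Illustrative sketch only (not part of the build): the usual shape of an
 * entry point built from the macros described above. The names below are
 * hypothetical; the real entry points in this file also maintain CFI
 * annotations and the top-of-stack fixups documented further down.
 *
 *	ENTRY(example_entry)
 *		SAVE_ARGS			# partial stack frame
 *		SAVE_REST			# extend it to a full frame
 *		movq %rsp,%rdi			# pt_regs pointer as arg1
 *		call example_c_handler		# hypothetical C handler
 *		RESTORE_REST
 *		RESTORE_ARGS
 *		ret
 *	END(example_entry)
 */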
39
40#include <linux/linkage.h>
41#include <asm/segment.h>
42#include <asm/cache.h>
43#include <asm/errno.h>
44#include <asm/dwarf2.h>
45#include <asm/calling.h>
46#include <asm/asm-offsets.h>
47#include <asm/msr.h>
48#include <asm/unistd.h>
49#include <asm/thread_info.h>
50#include <asm/hw_irq.h>
51#include <asm/page.h>
52#include <asm/irqflags.h>
53#include <asm/paravirt.h>
54#include <asm/ftrace.h>
55
56/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
57#include <linux/elf-em.h>
58#define AUDIT_ARCH_X86_64	(EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE)
59#define __AUDIT_ARCH_64BIT 0x80000000
60#define __AUDIT_ARCH_LE	   0x40000000
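/*
 * For reference: assuming EM_X86_64 == 62 (0x3e) from <linux/elf-em.h>,
 * AUDIT_ARCH_X86_64 above works out to 0x3e|0x80000000|0x40000000 ==
 * 0xc000003e.
 */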
61
62	.code64
63#ifdef CONFIG_FUNCTION_TRACER
64#ifdef CONFIG_DYNAMIC_FTRACE
65ENTRY(mcount)
66	retq
67END(mcount)
68
69ENTRY(ftrace_caller)
70	cmpl $0, function_trace_stop
71	jne  ftrace_stub
72
73	MCOUNT_SAVE_FRAME
74
75	movq 0x38(%rsp), %rdi
76	movq 8(%rbp), %rsi
77	subq $MCOUNT_INSN_SIZE, %rdi
78
79.globl ftrace_call
80ftrace_call:
81	call ftrace_stub
82
83	MCOUNT_RESTORE_FRAME
84
85#ifdef CONFIG_FUNCTION_GRAPH_TRACER
86.globl ftrace_graph_call
87ftrace_graph_call:
88	jmp ftrace_stub
89#endif
90
91.globl ftrace_stub
92ftrace_stub:
93	retq
94END(ftrace_caller)
95
96#else /* ! CONFIG_DYNAMIC_FTRACE */
97ENTRY(mcount)
98	cmpl $0, function_trace_stop
99	jne  ftrace_stub
100
101	cmpq $ftrace_stub, ftrace_trace_function
102	jnz trace
103
104#ifdef CONFIG_FUNCTION_GRAPH_TRACER
105	cmpq $ftrace_stub, ftrace_graph_return
106	jnz ftrace_graph_caller
107
108	cmpq $ftrace_graph_entry_stub, ftrace_graph_entry
109	jnz ftrace_graph_caller
110#endif
111
112.globl ftrace_stub
113ftrace_stub:
114	retq
115
116trace:
117	MCOUNT_SAVE_FRAME
118
119	movq 0x38(%rsp), %rdi
120	movq 8(%rbp), %rsi
121	subq $MCOUNT_INSN_SIZE, %rdi
122
123	call   *ftrace_trace_function
124
125	MCOUNT_RESTORE_FRAME
126
127	jmp ftrace_stub
128END(mcount)
129#endif /* CONFIG_DYNAMIC_FTRACE */
130#endif /* CONFIG_FUNCTION_TRACER */
131
132#ifdef CONFIG_FUNCTION_GRAPH_TRACER
133ENTRY(ftrace_graph_caller)
134	cmpl $0, function_trace_stop
135	jne ftrace_stub
136
137	MCOUNT_SAVE_FRAME
138
139	leaq 8(%rbp), %rdi
140	movq 0x38(%rsp), %rsi
141	subq $MCOUNT_INSN_SIZE, %rsi
142
143	call	prepare_ftrace_return
144
145	MCOUNT_RESTORE_FRAME
146
147	retq
148END(ftrace_graph_caller)
149
150
151.globl return_to_handler
152return_to_handler:
153	subq  $80, %rsp
154
155	movq %rax, (%rsp)
156	movq %rcx, 8(%rsp)
157	movq %rdx, 16(%rsp)
158	movq %rsi, 24(%rsp)
159	movq %rdi, 32(%rsp)
160	movq %r8, 40(%rsp)
161	movq %r9, 48(%rsp)
162	movq %r10, 56(%rsp)
163	movq %r11, 64(%rsp)
164
165	call ftrace_return_to_handler
166
167	movq %rax, 72(%rsp)
168	movq 64(%rsp), %r11
169	movq 56(%rsp), %r10
170	movq 48(%rsp), %r9
171	movq 40(%rsp), %r8
172	movq 32(%rsp), %rdi
173	movq 24(%rsp), %rsi
174	movq 16(%rsp), %rdx
175	movq 8(%rsp), %rcx
176	movq (%rsp), %rax
177	addq $72, %rsp
178	retq
179#endif
180
181
182#ifndef CONFIG_PREEMPT
183#define retint_kernel retint_restore_args
184#endif
185
186#ifdef CONFIG_PARAVIRT
187ENTRY(native_usergs_sysret64)
188	swapgs
189	sysretq
190#endif /* CONFIG_PARAVIRT */
191
192
193.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
194#ifdef CONFIG_TRACE_IRQFLAGS
195	bt   $9,EFLAGS-\offset(%rsp)	/* interrupts off? */
196	jnc  1f
197	TRACE_IRQS_ON
1981:
199#endif
200.endm
201
202/*
203 * C code is not supposed to know about undefined top of stack. Every time
204 * a C function with a pt_regs argument is called from the SYSCALL-based
205 * fast path, FIXUP_TOP_OF_STACK is needed.
206 * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
207 * manipulation.
208 */
209
210	/* %rsp:at FRAMEEND */
211	.macro FIXUP_TOP_OF_STACK tmp offset=0
212	movq %gs:pda_oldrsp,\tmp
213	movq \tmp,RSP+\offset(%rsp)
214	movq $__USER_DS,SS+\offset(%rsp)
215	movq $__USER_CS,CS+\offset(%rsp)
216	movq $-1,RCX+\offset(%rsp)
217	movq R11+\offset(%rsp),\tmp  /* get eflags */
218	movq \tmp,EFLAGS+\offset(%rsp)
219	.endm
220
221	.macro RESTORE_TOP_OF_STACK tmp offset=0
222	movq RSP+\offset(%rsp),\tmp
223	movq \tmp,%gs:pda_oldrsp
224	movq EFLAGS+\offset(%rsp),\tmp
225	movq \tmp,R11+\offset(%rsp)
226	.endm
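/*
 * Usage sketch (compare stub_execve and the sysret_signal path below);
 * some_ptregs_function stands in for any pt_regs-taking C function:
 *
 *	SAVE_REST
 *	FIXUP_TOP_OF_STACK %r11		# make RIP/CS/EFLAGS/RSP/SS valid
 *	movq %rsp,%rdi			# pt_regs is now complete enough for C
 *	call some_ptregs_function
 *	RESTORE_TOP_OF_STACK %r11	# propagate possible ptregs changes back
 *	RESTORE_REST
 */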
227
228	.macro FAKE_STACK_FRAME child_rip
229	/* push in order ss, rsp, eflags, cs, rip */
230	xorl %eax, %eax
231	pushq $__KERNEL_DS /* ss */
232	CFI_ADJUST_CFA_OFFSET	8
233	/*CFI_REL_OFFSET	ss,0*/
234	pushq %rax /* rsp */
235	CFI_ADJUST_CFA_OFFSET	8
236	CFI_REL_OFFSET	rsp,0
237	pushq $X86_EFLAGS_IF /* eflags - interrupts on */
238	CFI_ADJUST_CFA_OFFSET	8
239	/*CFI_REL_OFFSET	rflags,0*/
240	pushq $__KERNEL_CS /* cs */
241	CFI_ADJUST_CFA_OFFSET	8
242	/*CFI_REL_OFFSET	cs,0*/
243	pushq \child_rip /* rip */
244	CFI_ADJUST_CFA_OFFSET	8
245	CFI_REL_OFFSET	rip,0
246	pushq	%rax /* orig rax */
247	CFI_ADJUST_CFA_OFFSET	8
248	.endm
249
250	.macro UNFAKE_STACK_FRAME
251	addq $8*6, %rsp
252	CFI_ADJUST_CFA_OFFSET	-(6*8)
253	.endm
254
255/*
256 * empty frame
257 */
258	.macro EMPTY_FRAME start=1 offset=0
259	.if \start
260	CFI_STARTPROC simple
261	CFI_SIGNAL_FRAME
262	CFI_DEF_CFA rsp,8+\offset
263	.else
264	CFI_DEF_CFA_OFFSET 8+\offset
265	.endif
266	.endm
267
268/*
269 * initial frame state for interrupts (and exceptions without error code)
270 */
271	.macro INTR_FRAME start=1 offset=0
272	EMPTY_FRAME \start, SS+8+\offset-RIP
273	/*CFI_REL_OFFSET ss, SS+\offset-RIP*/
274	CFI_REL_OFFSET rsp, RSP+\offset-RIP
275	/*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
276	/*CFI_REL_OFFSET cs, CS+\offset-RIP*/
277	CFI_REL_OFFSET rip, RIP+\offset-RIP
278	.endm
279
280/*
281 * initial frame state for exceptions with error code (and interrupts
282 * with vector already pushed)
283 */
284	.macro XCPT_FRAME start=1 offset=0
285	INTR_FRAME \start, RIP+\offset-ORIG_RAX
286	/*CFI_REL_OFFSET orig_rax, ORIG_RAX-ORIG_RAX*/
287	.endm
288
289/*
290 * frame that enables calling into C.
291 */
292	.macro PARTIAL_FRAME start=1 offset=0
293	XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
294	CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
295	CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
296	CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
297	CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
298	CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
299	CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
300	CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
301	CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
302	CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
303	.endm
304
305/*
306 * frame that enables passing a complete pt_regs to a C function.
307 */
308	.macro DEFAULT_FRAME start=1 offset=0
309	PARTIAL_FRAME \start, R11+\offset-R15
310	CFI_REL_OFFSET rbx, RBX+\offset
311	CFI_REL_OFFSET rbp, RBP+\offset
312	CFI_REL_OFFSET r12, R12+\offset
313	CFI_REL_OFFSET r13, R13+\offset
314	CFI_REL_OFFSET r14, R14+\offset
315	CFI_REL_OFFSET r15, R15+\offset
316	.endm
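/*
 * Rough sketch of the stack frame the macros above annotate (higher
 * addresses on top); the authoritative offsets come from asm-offsets:
 *
 *	SS, RSP, EFLAGS, CS, RIP	hardware frame ("top of stack")
 *	ORIG_RAX			error code / syscall number slot
 *	RDI RSI RDX RCX RAX R8-R11	partial stack frame (SAVE_ARGS)
 *	RBX RBP R12-R15			added by SAVE_REST (full stack frame)
 */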
317
318/* save partial stack frame */
319ENTRY(save_args)
320	XCPT_FRAME
321	cld
322	movq_cfi rdi, RDI+16-ARGOFFSET
323	movq_cfi rsi, RSI+16-ARGOFFSET
324	movq_cfi rdx, RDX+16-ARGOFFSET
325	movq_cfi rcx, RCX+16-ARGOFFSET
326	movq_cfi rax, RAX+16-ARGOFFSET
327	movq_cfi  r8,  R8+16-ARGOFFSET
328	movq_cfi  r9,  R9+16-ARGOFFSET
329	movq_cfi r10, R10+16-ARGOFFSET
330	movq_cfi r11, R11+16-ARGOFFSET
331
332	leaq -ARGOFFSET+16(%rsp),%rdi	/* arg1 for handler */
333	movq_cfi rbp, 8		/* push %rbp */
334	leaq 8(%rsp), %rbp		/* mov %rsp, %rbp */
335	testl $3, CS(%rdi)
336	je 1f
337	SWAPGS
338	/*
339	 * irqcount is used to check if a CPU is already on an interrupt stack
340	 * or not. While this is essentially redundant with preempt_count it is
341	 * a little cheaper to use a separate counter in the PDA (short of
342	 * moving irq_enter into assembly, which would be too much work)
343	 */
3441:	incl %gs:pda_irqcount
345	jne 2f
346	popq_cfi %rax			/* move return address... */
347	mov %gs:pda_irqstackptr,%rsp
348	EMPTY_FRAME 0
349	pushq_cfi %rbp			/* backlink for unwinder */
350	pushq_cfi %rax			/* ... to the new stack */
351	/*
352	 * We entered an interrupt context - irqs are off:
353	 */
3542:	TRACE_IRQS_OFF
355	ret
356	CFI_ENDPROC
357END(save_args)
358
359ENTRY(save_rest)
360	PARTIAL_FRAME 1 REST_SKIP+8
361	movq 5*8+16(%rsp), %r11	/* save return address */
362	movq_cfi rbx, RBX+16
363	movq_cfi rbp, RBP+16
364	movq_cfi r12, R12+16
365	movq_cfi r13, R13+16
366	movq_cfi r14, R14+16
367	movq_cfi r15, R15+16
368	movq %r11, 8(%rsp)	/* return address */
369	FIXUP_TOP_OF_STACK %r11, 16
370	ret
371	CFI_ENDPROC
372END(save_rest)
373
374/* save complete stack frame */
375ENTRY(save_paranoid)
376	XCPT_FRAME 1 RDI+8
377	cld
378	movq_cfi rdi, RDI+8
379	movq_cfi rsi, RSI+8
380	movq_cfi rdx, RDX+8
381	movq_cfi rcx, RCX+8
382	movq_cfi rax, RAX+8
383	movq_cfi r8, R8+8
384	movq_cfi r9, R9+8
385	movq_cfi r10, R10+8
386	movq_cfi r11, R11+8
387	movq_cfi rbx, RBX+8
388	movq_cfi rbp, RBP+8
389	movq_cfi r12, R12+8
390	movq_cfi r13, R13+8
391	movq_cfi r14, R14+8
392	movq_cfi r15, R15+8
393	movl $1,%ebx
394	movl $MSR_GS_BASE,%ecx
395	rdmsr
396	testl %edx,%edx
397	js 1f	/* negative -> in kernel */
398	SWAPGS
399	xorl %ebx,%ebx
4001:	ret
401	CFI_ENDPROC
402END(save_paranoid)
403
404/*
405 * A newly forked process directly context switches into this address.
406 *
407 * rdi: prev task we switched from
408 */
409ENTRY(ret_from_fork)
410	DEFAULT_FRAME
411
412	push kernel_eflags(%rip)
413	CFI_ADJUST_CFA_OFFSET 8
414	popf					# reset kernel eflags
415	CFI_ADJUST_CFA_OFFSET -8
416
417	call schedule_tail			# rdi: 'prev' task parameter
418
419	GET_THREAD_INFO(%rcx)
420
421	CFI_REMEMBER_STATE
422	RESTORE_REST
423
424	testl $3, CS-ARGOFFSET(%rsp)		# from kernel_thread?
425	je   int_ret_from_sys_call
426
427	testl $_TIF_IA32, TI_flags(%rcx)	# 32-bit compat task needs IRET
428	jnz  int_ret_from_sys_call
429
430	RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET
431	jmp ret_from_sys_call			# go to the SYSRET fastpath
432
433	CFI_RESTORE_STATE
434	CFI_ENDPROC
435END(ret_from_fork)
436
437/*
438 * System call entry. Up to 6 arguments in registers are supported.
439 *
440 * SYSCALL does not save anything on the stack and does not change the
441 * stack pointer.
442 */
443
444/*
445 * Register setup:
446 * rax  system call number
447 * rdi  arg0
448 * rcx  return address for syscall/sysret, C arg3
449 * rsi  arg1
450 * rdx  arg2
451 * r10  arg3 	(--> moved to rcx for C)
452 * r8   arg4
453 * r9   arg5
454 * r11  eflags for syscall/sysret, temporary for C
455 * r12-r15,rbp,rbx saved by C code, not touched.
456 *
457 * Interrupts are off on entry.
458 * Only called from user space.
459 *
460 * XXX	if we had a free scratch register we could save the RSP into the stack frame
461 *      and report it properly in ps. Unfortunately we don't have one.
462 *
463 * When the user can change the frame, always force IRET. That is because
464 * it deals with non-canonical addresses better. SYSRET has trouble
465 * with them due to bugs in both AMD and Intel CPUs.
466 */
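/*
 * For reference, a minimal user-space caller following the convention
 * above (illustrative only; buf and len are hypothetical symbols), e.g.
 * write(1, buf, len):
 *
 *	movl	$__NR_write,%eax	# system call number
 *	movl	$1,%edi			# arg0: fd
 *	leaq	buf(%rip),%rsi		# arg1: buffer
 *	movl	$len,%edx		# arg2: count
 *	syscall				# rcx <- return RIP, r11 <- rflags
 */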
467
468ENTRY(system_call)
469	CFI_STARTPROC	simple
470	CFI_SIGNAL_FRAME
471	CFI_DEF_CFA	rsp,PDA_STACKOFFSET
472	CFI_REGISTER	rip,rcx
473	/*CFI_REGISTER	rflags,r11*/
474	SWAPGS_UNSAFE_STACK
475	/*
476	 * A hypervisor implementation might want to use a label
477	 * after the swapgs, so that it can do the swapgs
478	 * for the guest and jump here on syscall.
479	 */
480ENTRY(system_call_after_swapgs)
481
482	movq	%rsp,%gs:pda_oldrsp
483	movq	%gs:pda_kernelstack,%rsp
484	/*
485	 * No need to follow this irqs off/on section - it's straight
486	 * and short:
487	 */
488	ENABLE_INTERRUPTS(CLBR_NONE)
489	SAVE_ARGS 8,1
490	movq  %rax,ORIG_RAX-ARGOFFSET(%rsp)
491	movq  %rcx,RIP-ARGOFFSET(%rsp)
492	CFI_REL_OFFSET rip,RIP-ARGOFFSET
493	GET_THREAD_INFO(%rcx)
494	testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx)
495	jnz tracesys
496system_call_fastpath:
497	cmpq $__NR_syscall_max,%rax
498	ja badsys
499	movq %r10,%rcx
500	call *sys_call_table(,%rax,8)  # XXX:	 rip relative
501	movq %rax,RAX-ARGOFFSET(%rsp)
502/*
503 * Syscall return path ending with SYSRET (fast path)
504 * Has incomplete stack frame and undefined top of stack.
505 */
506ret_from_sys_call:
507	movl $_TIF_ALLWORK_MASK,%edi
508	/* edi:	flagmask */
509sysret_check:
510	LOCKDEP_SYS_EXIT
511	GET_THREAD_INFO(%rcx)
512	DISABLE_INTERRUPTS(CLBR_NONE)
513	TRACE_IRQS_OFF
514	movl TI_flags(%rcx),%edx
515	andl %edi,%edx
516	jnz  sysret_careful
517	CFI_REMEMBER_STATE
518	/*
519	 * sysretq will re-enable interrupts:
520	 */
521	TRACE_IRQS_ON
522	movq RIP-ARGOFFSET(%rsp),%rcx
523	CFI_REGISTER	rip,rcx
524	RESTORE_ARGS 0,-ARG_SKIP,1
525	/*CFI_REGISTER	rflags,r11*/
526	movq	%gs:pda_oldrsp, %rsp
527	USERGS_SYSRET64
528
529	CFI_RESTORE_STATE
530	/* Handle reschedules */
531	/* edx:	work, edi: workmask */
532sysret_careful:
533	bt $TIF_NEED_RESCHED,%edx
534	jnc sysret_signal
535	TRACE_IRQS_ON
536	ENABLE_INTERRUPTS(CLBR_NONE)
537	pushq %rdi
538	CFI_ADJUST_CFA_OFFSET 8
539	call schedule
540	popq  %rdi
541	CFI_ADJUST_CFA_OFFSET -8
542	jmp sysret_check
543
544	/* Handle a signal */
545sysret_signal:
546	TRACE_IRQS_ON
547	ENABLE_INTERRUPTS(CLBR_NONE)
548#ifdef CONFIG_AUDITSYSCALL
549	bt $TIF_SYSCALL_AUDIT,%edx
550	jc sysret_audit
551#endif
552	/* edx:	work flags (arg3) */
553	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
554	xorl %esi,%esi # oldset -> arg2
555	SAVE_REST
556	FIXUP_TOP_OF_STACK %r11
557	call do_notify_resume
558	RESTORE_TOP_OF_STACK %r11
559	RESTORE_REST
560	movl $_TIF_WORK_MASK,%edi
561	/* Use IRET because the user could have changed the frame. This
562	   works because ptregscall_common has called FIXUP_TOP_OF_STACK. */
563	DISABLE_INTERRUPTS(CLBR_NONE)
564	TRACE_IRQS_OFF
565	jmp int_with_check
566
567badsys:
568	movq $-ENOSYS,RAX-ARGOFFSET(%rsp)
569	jmp ret_from_sys_call
570
571#ifdef CONFIG_AUDITSYSCALL
572	/*
573	 * Fast path for syscall audit without full syscall trace.
574	 * We just call audit_syscall_entry() directly, and then
575	 * jump back to the normal fast path.
576	 */
577auditsys:
578	movq %r10,%r9			/* 6th arg: 4th syscall arg */
579	movq %rdx,%r8			/* 5th arg: 3rd syscall arg */
580	movq %rsi,%rcx			/* 4th arg: 2nd syscall arg */
581	movq %rdi,%rdx			/* 3rd arg: 1st syscall arg */
582	movq %rax,%rsi			/* 2nd arg: syscall number */
583	movl $AUDIT_ARCH_X86_64,%edi	/* 1st arg: audit arch */
584	call audit_syscall_entry
585	LOAD_ARGS 0		/* reload call-clobbered registers */
586	jmp system_call_fastpath
587
588	/*
589	 * Return fast path for syscall audit.  Call audit_syscall_exit()
590	 * directly and then jump back to the fast path with TIF_SYSCALL_AUDIT
591	 * masked off.
592	 */
593sysret_audit:
594	movq %rax,%rsi		/* second arg, syscall return value */
595	cmpq $0,%rax		/* is it < 0? */
596	setl %al		/* 1 if so, 0 if not */
597	movzbl %al,%edi		/* zero-extend that into %edi */
598	inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
599	call audit_syscall_exit
600	movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
601	jmp sysret_check
602#endif	/* CONFIG_AUDITSYSCALL */
603
604	/* Do syscall tracing */
605tracesys:
606#ifdef CONFIG_AUDITSYSCALL
607	testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx)
608	jz auditsys
609#endif
610	SAVE_REST
611	movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
612	FIXUP_TOP_OF_STACK %rdi
613	movq %rsp,%rdi
614	call syscall_trace_enter
615	/*
616	 * Reload arg registers from stack in case ptrace changed them.
617	 * We don't reload %rax because syscall_trace_enter() returned
618	 * the value it wants us to use in the table lookup.
619	 */
620	LOAD_ARGS ARGOFFSET, 1
621	RESTORE_REST
622	cmpq $__NR_syscall_max,%rax
623	ja   int_ret_from_sys_call	/* RAX(%rsp) set to -ENOSYS above */
624	movq %r10,%rcx	/* fixup for C */
625	call *sys_call_table(,%rax,8)
626	movq %rax,RAX-ARGOFFSET(%rsp)
627	/* Use IRET because user could have changed frame */
628
629/*
630 * Syscall return path ending with IRET.
631 * Has correct top of stack, but partial stack frame.
632 */
633	.globl int_ret_from_sys_call
634	.globl int_with_check
635int_ret_from_sys_call:
636	DISABLE_INTERRUPTS(CLBR_NONE)
637	TRACE_IRQS_OFF
638	testl $3,CS-ARGOFFSET(%rsp)
639	je retint_restore_args
640	movl $_TIF_ALLWORK_MASK,%edi
641	/* edi:	mask to check */
642int_with_check:
643	LOCKDEP_SYS_EXIT_IRQ
644	GET_THREAD_INFO(%rcx)
645	movl TI_flags(%rcx),%edx
646	andl %edi,%edx
647	jnz   int_careful
648	andl    $~TS_COMPAT,TI_status(%rcx)
649	jmp   retint_swapgs
650
651	/* Either reschedule or signal or syscall exit tracking needed. */
652	/* First do a reschedule test. */
653	/* edx:	work, edi: workmask */
654int_careful:
655	bt $TIF_NEED_RESCHED,%edx
656	jnc  int_very_careful
657	TRACE_IRQS_ON
658	ENABLE_INTERRUPTS(CLBR_NONE)
659	pushq %rdi
660	CFI_ADJUST_CFA_OFFSET 8
661	call schedule
662	popq %rdi
663	CFI_ADJUST_CFA_OFFSET -8
664	DISABLE_INTERRUPTS(CLBR_NONE)
665	TRACE_IRQS_OFF
666	jmp int_with_check
667
668	/* handle signals and tracing -- both require a full stack frame */
669int_very_careful:
670	TRACE_IRQS_ON
671	ENABLE_INTERRUPTS(CLBR_NONE)
672	SAVE_REST
673	/* Check for syscall exit trace */
674	testl $_TIF_WORK_SYSCALL_EXIT,%edx
675	jz int_signal
676	pushq %rdi
677	CFI_ADJUST_CFA_OFFSET 8
678	leaq 8(%rsp),%rdi	# &ptregs -> arg1
679	call syscall_trace_leave
680	popq %rdi
681	CFI_ADJUST_CFA_OFFSET -8
682	andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
683	jmp int_restore_rest
684
685int_signal:
686	testl $_TIF_DO_NOTIFY_MASK,%edx
687	jz 1f
688	movq %rsp,%rdi		# &ptregs -> arg1
689	xorl %esi,%esi		# oldset -> arg2
690	call do_notify_resume
6911:	movl $_TIF_WORK_MASK,%edi
692int_restore_rest:
693	RESTORE_REST
694	DISABLE_INTERRUPTS(CLBR_NONE)
695	TRACE_IRQS_OFF
696	jmp int_with_check
697	CFI_ENDPROC
698END(system_call)
699
700/*
701 * Certain special system calls need to save a full stack frame.
702 */
703	.macro PTREGSCALL label,func,arg
704ENTRY(\label)
705	PARTIAL_FRAME 1 8		/* offset 8: return address */
706	subq $REST_SKIP, %rsp
707	CFI_ADJUST_CFA_OFFSET REST_SKIP
708	call save_rest
709	DEFAULT_FRAME 0 8		/* offset 8: return address */
710	leaq 8(%rsp), \arg	/* pt_regs pointer */
711	call \func
712	jmp ptregscall_common
713	CFI_ENDPROC
714END(\label)
715	.endm
716
717	PTREGSCALL stub_clone, sys_clone, %r8
718	PTREGSCALL stub_fork, sys_fork, %rdi
719	PTREGSCALL stub_vfork, sys_vfork, %rdi
720	PTREGSCALL stub_sigaltstack, sys_sigaltstack, %rdx
721	PTREGSCALL stub_iopl, sys_iopl, %rsi
722
723ENTRY(ptregscall_common)
724	DEFAULT_FRAME 1 8	/* offset 8: return address */
725	RESTORE_TOP_OF_STACK %r11, 8
726	movq_cfi_restore R15+8, r15
727	movq_cfi_restore R14+8, r14
728	movq_cfi_restore R13+8, r13
729	movq_cfi_restore R12+8, r12
730	movq_cfi_restore RBP+8, rbp
731	movq_cfi_restore RBX+8, rbx
732	ret $REST_SKIP		/* pop extended registers */
733	CFI_ENDPROC
734END(ptregscall_common)
735
736ENTRY(stub_execve)
737	CFI_STARTPROC
738	popq %r11
739	CFI_ADJUST_CFA_OFFSET -8
740	CFI_REGISTER rip, r11
741	SAVE_REST
742	FIXUP_TOP_OF_STACK %r11
743	movq %rsp, %rcx
744	call sys_execve
745	RESTORE_TOP_OF_STACK %r11
746	movq %rax,RAX(%rsp)
747	RESTORE_REST
748	jmp int_ret_from_sys_call
749	CFI_ENDPROC
750END(stub_execve)
751
752/*
753 * sigreturn is special because it needs to restore all registers on return.
754 * This cannot be done with SYSRET, so use the IRET return path instead.
755 */
756ENTRY(stub_rt_sigreturn)
757	CFI_STARTPROC
758	addq $8, %rsp
759	CFI_ADJUST_CFA_OFFSET	-8
760	SAVE_REST
761	movq %rsp,%rdi
762	FIXUP_TOP_OF_STACK %r11
763	call sys_rt_sigreturn
764	movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
765	RESTORE_REST
766	jmp int_ret_from_sys_call
767	CFI_ENDPROC
768END(stub_rt_sigreturn)
769
770/*
771 * Build the entry stubs and pointer table with some assembler magic.
772 * We pack 7 stubs into a single 32-byte chunk, which will fit in a
773 * single cache line on all modern x86 implementations.
774 */
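/*
 * For example, assuming NR_VECTORS == 256 and FIRST_EXTERNAL_VECTOR == 0x20
 * (the usual values, though they depend on configuration), this emits
 * 256 - 32 = 224 stubs in (224 + 6) / 7 = 32 chunks of up to 7 stubs,
 * each chunk aligned to 32 bytes by the .balign below.
 */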
775	.section .init.rodata,"a"
776ENTRY(interrupt)
777	.text
778	.p2align 5
779	.p2align CONFIG_X86_L1_CACHE_SHIFT
780ENTRY(irq_entries_start)
781	INTR_FRAME
782vector=FIRST_EXTERNAL_VECTOR
783.rept (NR_VECTORS-FIRST_EXTERNAL_VECTOR+6)/7
784	.balign 32
785  .rept	7
786    .if vector < NR_VECTORS
787      .if vector <> FIRST_EXTERNAL_VECTOR
788	CFI_ADJUST_CFA_OFFSET -8
789      .endif
7901:	pushq $(~vector+0x80)	/* Note: always in signed byte range */
791	CFI_ADJUST_CFA_OFFSET 8
792      .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
793	jmp 2f
794      .endif
795      .previous
796	.quad 1b
797      .text
798vector=vector+1
799    .endif
800  .endr
8012:	jmp common_interrupt
802.endr
803	CFI_ENDPROC
804END(irq_entries_start)
805
806.previous
807END(interrupt)
808.previous
809
810/*
811 * Interrupt entry/exit.
812 *
813 * Interrupt entry points save only callee-clobbered registers in the fast path.
814 *
815 * Entry runs with interrupts off.
816 */
817
818/* 0(%rsp): ~(interrupt number) */
819	.macro interrupt func
820	subq $10*8, %rsp
821	CFI_ADJUST_CFA_OFFSET 10*8
822	call save_args
823	PARTIAL_FRAME 0
824	call \func
825	.endm
826
827	/*
828	 * The interrupt stubs push (~vector+0x80) onto the stack and
829	 * then jump to common_interrupt.
830	 */
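	/*
	 * Worked example: for vector 0x20 the stub pushes ~0x20 + 0x80 = 0x5f,
	 * which fits in a sign-extended byte immediate; the addq $-0x80 in
	 * common_interrupt below turns it back into ~0x20 = -0x21, so handlers
	 * always see ~vector in the [-256,-1] range.
	 */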
831	.p2align CONFIG_X86_L1_CACHE_SHIFT
832common_interrupt:
833	XCPT_FRAME
834	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */
835	interrupt do_IRQ
836	/* 0(%rsp): oldrsp-ARGOFFSET */
837ret_from_intr:
838	DISABLE_INTERRUPTS(CLBR_NONE)
839	TRACE_IRQS_OFF
840	decl %gs:pda_irqcount
841	leaveq
842	CFI_DEF_CFA_REGISTER	rsp
843	CFI_ADJUST_CFA_OFFSET	-8
844exit_intr:
845	GET_THREAD_INFO(%rcx)
846	testl $3,CS-ARGOFFSET(%rsp)
847	je retint_kernel
848
849	/* Interrupt came from user space */
850	/*
851	 * Has a correct top of stack, but a partial stack frame
852	 * %rcx: thread info. Interrupts off.
853	 */
854retint_with_reschedule:
855	movl $_TIF_WORK_MASK,%edi
856retint_check:
857	LOCKDEP_SYS_EXIT_IRQ
858	movl TI_flags(%rcx),%edx
859	andl %edi,%edx
860	CFI_REMEMBER_STATE
861	jnz  retint_careful
862
863retint_swapgs:		/* return to user-space */
864	/*
865	 * The iretq could re-enable interrupts:
866	 */
867	DISABLE_INTERRUPTS(CLBR_ANY)
868	TRACE_IRQS_IRETQ
869	SWAPGS
870	jmp restore_args
871
872retint_restore_args:	/* return to kernel space */
873	DISABLE_INTERRUPTS(CLBR_ANY)
874	/*
875	 * The iretq could re-enable interrupts:
876	 */
877	TRACE_IRQS_IRETQ
878restore_args:
879	RESTORE_ARGS 0,8,0
880
881irq_return:
882	INTERRUPT_RETURN
883
884	.section __ex_table, "a"
885	.quad irq_return, bad_iret
886	.previous
887
888#ifdef CONFIG_PARAVIRT
889ENTRY(native_iret)
890	iretq
891
892	.section __ex_table,"a"
893	.quad native_iret, bad_iret
894	.previous
895#endif
896
897	.section .fixup,"ax"
898bad_iret:
899	/*
900	 * The iret traps when the %cs or %ss being restored is bogus.
901	 * We've lost the original trap vector and error code.
902	 * #GPF is the most likely one to get for an invalid selector.
903	 * So pretend we completed the iret and took the #GPF in user mode.
904	 *
905	 * We are now running with the kernel GS after exception recovery.
906	 * But error_entry expects us to have user GS to match the user %cs,
907	 * so swap back.
908	 */
909	pushq $0
910
911	SWAPGS
912	jmp general_protection
913
914	.previous
915
916	/* edi: workmask, edx: work */
917retint_careful:
918	CFI_RESTORE_STATE
919	bt    $TIF_NEED_RESCHED,%edx
920	jnc   retint_signal
921	TRACE_IRQS_ON
922	ENABLE_INTERRUPTS(CLBR_NONE)
923	pushq %rdi
924	CFI_ADJUST_CFA_OFFSET	8
925	call  schedule
926	popq %rdi
927	CFI_ADJUST_CFA_OFFSET	-8
928	GET_THREAD_INFO(%rcx)
929	DISABLE_INTERRUPTS(CLBR_NONE)
930	TRACE_IRQS_OFF
931	jmp retint_check
932
933retint_signal:
934	testl $_TIF_DO_NOTIFY_MASK,%edx
935	jz    retint_swapgs
936	TRACE_IRQS_ON
937	ENABLE_INTERRUPTS(CLBR_NONE)
938	SAVE_REST
939	movq $-1,ORIG_RAX(%rsp)
940	xorl %esi,%esi		# oldset
941	movq %rsp,%rdi		# &pt_regs
942	call do_notify_resume
943	RESTORE_REST
944	DISABLE_INTERRUPTS(CLBR_NONE)
945	TRACE_IRQS_OFF
946	GET_THREAD_INFO(%rcx)
947	jmp retint_with_reschedule
948
949#ifdef CONFIG_PREEMPT
950	/* Returning to kernel space. Check if we need preemption */
951	/* rcx:	 threadinfo. interrupts off. */
952ENTRY(retint_kernel)
953	cmpl $0,TI_preempt_count(%rcx)
954	jnz  retint_restore_args
955	bt  $TIF_NEED_RESCHED,TI_flags(%rcx)
956	jnc  retint_restore_args
957	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
958	jnc  retint_restore_args
959	call preempt_schedule_irq
960	jmp exit_intr
961#endif
962
963	CFI_ENDPROC
964END(common_interrupt)
965
966/*
967 * APIC interrupts.
968 */
969.macro apicinterrupt num sym do_sym
970ENTRY(\sym)
971	INTR_FRAME
972	pushq $~(\num)
973	CFI_ADJUST_CFA_OFFSET 8
974	interrupt \do_sym
975	jmp ret_from_intr
976	CFI_ENDPROC
977END(\sym)
978.endm
979
980#ifdef CONFIG_SMP
981apicinterrupt IRQ_MOVE_CLEANUP_VECTOR \
982	irq_move_cleanup_interrupt smp_irq_move_cleanup_interrupt
983#endif
984
985apicinterrupt UV_BAU_MESSAGE \
986	uv_bau_message_intr1 uv_bau_message_interrupt
987apicinterrupt LOCAL_TIMER_VECTOR \
988	apic_timer_interrupt smp_apic_timer_interrupt
989
990#ifdef CONFIG_SMP
991apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
992	invalidate_interrupt0 smp_invalidate_interrupt
993apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \
994	invalidate_interrupt1 smp_invalidate_interrupt
995apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \
996	invalidate_interrupt2 smp_invalidate_interrupt
997apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \
998	invalidate_interrupt3 smp_invalidate_interrupt
999apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
1000	invalidate_interrupt4 smp_invalidate_interrupt
1001apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
1002	invalidate_interrupt5 smp_invalidate_interrupt
1003apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
1004	invalidate_interrupt6 smp_invalidate_interrupt
1005apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
1006	invalidate_interrupt7 smp_invalidate_interrupt
1007#endif
1008
1009apicinterrupt THRESHOLD_APIC_VECTOR \
1010	threshold_interrupt mce_threshold_interrupt
1011apicinterrupt THERMAL_APIC_VECTOR \
1012	thermal_interrupt smp_thermal_interrupt
1013
1014#ifdef CONFIG_SMP
1015apicinterrupt CALL_FUNCTION_SINGLE_VECTOR \
1016	call_function_single_interrupt smp_call_function_single_interrupt
1017apicinterrupt CALL_FUNCTION_VECTOR \
1018	call_function_interrupt smp_call_function_interrupt
1019apicinterrupt RESCHEDULE_VECTOR \
1020	reschedule_interrupt smp_reschedule_interrupt
1021#endif
1022
1023apicinterrupt ERROR_APIC_VECTOR \
1024	error_interrupt smp_error_interrupt
1025apicinterrupt SPURIOUS_APIC_VECTOR \
1026	spurious_interrupt smp_spurious_interrupt
1027
1028/*
1029 * Exception entry points.
1030 */
1031.macro zeroentry sym do_sym
1032ENTRY(\sym)
1033	INTR_FRAME
1034	PARAVIRT_ADJUST_EXCEPTION_FRAME
1035	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */
1036	subq $15*8,%rsp
1037	CFI_ADJUST_CFA_OFFSET 15*8
1038	call error_entry
1039	DEFAULT_FRAME 0
1040	movq %rsp,%rdi		/* pt_regs pointer */
1041	xorl %esi,%esi		/* no error code */
1042	call \do_sym
1043	jmp error_exit		/* %ebx: no swapgs flag */
1044	CFI_ENDPROC
1045END(\sym)
1046.endm
1047
1048.macro paranoidzeroentry sym do_sym
1049ENTRY(\sym)
1050	INTR_FRAME
1051	PARAVIRT_ADJUST_EXCEPTION_FRAME
1052	pushq $-1		/* ORIG_RAX: no syscall to restart */
1053	CFI_ADJUST_CFA_OFFSET 8
1054	subq $15*8, %rsp
1055	call save_paranoid
1056	TRACE_IRQS_OFF
1057	movq %rsp,%rdi		/* pt_regs pointer */
1058	xorl %esi,%esi		/* no error code */
1059	call \do_sym
1060	jmp paranoid_exit	/* %ebx: no swapgs flag */
1061	CFI_ENDPROC
1062END(\sym)
1063.endm
1064
1065.macro paranoidzeroentry_ist sym do_sym ist
1066ENTRY(\sym)
1067	INTR_FRAME
1068	PARAVIRT_ADJUST_EXCEPTION_FRAME
1069	pushq $-1		/* ORIG_RAX: no syscall to restart */
1070	CFI_ADJUST_CFA_OFFSET 8
1071	subq $15*8, %rsp
1072	call save_paranoid
1073	TRACE_IRQS_OFF
1074	movq %rsp,%rdi		/* pt_regs pointer */
1075	xorl %esi,%esi		/* no error code */
1076	movq %gs:pda_data_offset, %rbp
1077	subq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1078	call \do_sym
1079	addq $EXCEPTION_STKSZ, per_cpu__init_tss + TSS_ist + (\ist - 1) * 8(%rbp)
1080	jmp paranoid_exit	/* %ebx: no swapgs flag */
1081	CFI_ENDPROC
1082END(\sym)
1083.endm
1084
1085.macro errorentry sym do_sym
1086ENTRY(\sym)
1087	XCPT_FRAME
1088	PARAVIRT_ADJUST_EXCEPTION_FRAME
1089	subq $15*8,%rsp
1090	CFI_ADJUST_CFA_OFFSET 15*8
1091	call error_entry
1092	DEFAULT_FRAME 0
1093	movq %rsp,%rdi			/* pt_regs pointer */
1094	movq ORIG_RAX(%rsp),%rsi	/* get error code */
1095	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
1096	call \do_sym
1097	jmp error_exit			/* %ebx: no swapgs flag */
1098	CFI_ENDPROC
1099END(\sym)
1100.endm
1101
1102	/* error code is on the stack already */
1103.macro paranoiderrorentry sym do_sym
1104ENTRY(\sym)
1105	XCPT_FRAME
1106	PARAVIRT_ADJUST_EXCEPTION_FRAME
1107	subq $15*8,%rsp
1108	CFI_ADJUST_CFA_OFFSET 15*8
1109	call save_paranoid
1110	DEFAULT_FRAME 0
1111	TRACE_IRQS_OFF
1112	movq %rsp,%rdi			/* pt_regs pointer */
1113	movq ORIG_RAX(%rsp),%rsi	/* get error code */
1114	movq $-1,ORIG_RAX(%rsp)		/* no syscall to restart */
1115	call \do_sym
1116	jmp paranoid_exit		/* %ebx: no swapgs flag */
1117	CFI_ENDPROC
1118END(\sym)
1119.endm
1120
1121zeroentry divide_error do_divide_error
1122zeroentry overflow do_overflow
1123zeroentry bounds do_bounds
1124zeroentry invalid_op do_invalid_op
1125zeroentry device_not_available do_device_not_available
1126paranoiderrorentry double_fault do_double_fault
1127zeroentry coprocessor_segment_overrun do_coprocessor_segment_overrun
1128errorentry invalid_TSS do_invalid_TSS
1129errorentry segment_not_present do_segment_not_present
1130zeroentry spurious_interrupt_bug do_spurious_interrupt_bug
1131zeroentry coprocessor_error do_coprocessor_error
1132errorentry alignment_check do_alignment_check
1133zeroentry simd_coprocessor_error do_simd_coprocessor_error
1134
1135	/* Reload gs selector with exception handling */
1136	/* edi:  new selector */
1137ENTRY(native_load_gs_index)
1138	CFI_STARTPROC
1139	pushf
1140	CFI_ADJUST_CFA_OFFSET 8
1141	DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1142	SWAPGS
1143gs_change:
1144	movl %edi,%gs
11452:	mfence		/* workaround */
1146	SWAPGS
1147	popf
1148	CFI_ADJUST_CFA_OFFSET -8
1149	ret
1150	CFI_ENDPROC
1151END(native_load_gs_index)
1152
1153	.section __ex_table,"a"
1154	.align 8
1155	.quad gs_change,bad_gs
1156	.previous
1157	.section .fixup,"ax"
1158	/* running with kernelgs */
1159bad_gs:
1160	SWAPGS			/* switch back to user gs */
1161	xorl %eax,%eax
1162	movl %eax,%gs
1163	jmp  2b
1164	.previous
1165
1166/*
1167 * Create a kernel thread.
1168 *
1169 * C extern interface:
1170 *	extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1171 *
1172 * asm input arguments:
1173 *	rdi: fn, rsi: arg, rdx: flags
1174 */
1175ENTRY(kernel_thread)
1176	CFI_STARTPROC
1177	FAKE_STACK_FRAME $child_rip
1178	SAVE_ALL
1179
1180	# rdi: flags, rsi: usp, rdx: will be &pt_regs
1181	movq %rdx,%rdi
1182	orq  kernel_thread_flags(%rip),%rdi
1183	movq $-1, %rsi
1184	movq %rsp, %rdx
1185
1186	xorl %r8d,%r8d
1187	xorl %r9d,%r9d
1188
1189	# clone now
1190	call do_fork
1191	movq %rax,RAX(%rsp)
1192	xorl %edi,%edi
1193
1194	/*
1195	 * It isn't worth checking for a reschedule here,
1196	 * so internally to the x86_64 port you can rely on kernel_thread()
1197	 * not to reschedule the child before returning; this avoids the need
1198	 * for hacks, for example to fork off the per-CPU idle tasks.
1199	 * [Hopefully no generic code relies on the reschedule -AK]
1200	 */
1201	RESTORE_ALL
1202	UNFAKE_STACK_FRAME
1203	ret
1204	CFI_ENDPROC
1205END(kernel_thread)
1206
1207ENTRY(child_rip)
1208	pushq $0		# fake return address
1209	CFI_STARTPROC
1210	/*
1211	 * Here we are in the child and the registers are set as they were
1212	 * at kernel_thread() invocation in the parent.
1213	 */
1214	movq %rdi, %rax
1215	movq %rsi, %rdi
1216	call *%rax
1217	# exit
1218	mov %eax, %edi
1219	call do_exit
1220	ud2			# padding for call trace
1221	CFI_ENDPROC
1222END(child_rip)
1223
1224/*
1225 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
1226 *
1227 * C extern interface:
1228 *	 extern long execve(char *name, char **argv, char **envp)
1229 *
1230 * asm input arguments:
1231 *	rdi: name, rsi: argv, rdx: envp
1232 *
1233 * We want to fall back into:
1234 *	extern long sys_execve(char *name, char **argv,char **envp, struct pt_regs *regs)
1235 *
1236 * do_sys_execve asm fallback arguments:
1237 *	rdi: name, rsi: argv, rdx: envp, rcx: fake frame on the stack
1238 */
1239ENTRY(kernel_execve)
1240	CFI_STARTPROC
1241	FAKE_STACK_FRAME $0
1242	SAVE_ALL
1243	movq %rsp,%rcx
1244	call sys_execve
1245	movq %rax, RAX(%rsp)
1246	RESTORE_REST
1247	testq %rax,%rax
1248	je int_ret_from_sys_call
1249	RESTORE_ARGS
1250	UNFAKE_STACK_FRAME
1251	ret
1252	CFI_ENDPROC
1253END(kernel_execve)
1254
1255/* Call softirq on interrupt stack. Interrupts are off. */
1256ENTRY(call_softirq)
1257	CFI_STARTPROC
1258	push %rbp
1259	CFI_ADJUST_CFA_OFFSET	8
1260	CFI_REL_OFFSET rbp,0
1261	mov  %rsp,%rbp
1262	CFI_DEF_CFA_REGISTER rbp
1263	incl %gs:pda_irqcount
1264	cmove %gs:pda_irqstackptr,%rsp
1265	push  %rbp			# backlink for old unwinder
1266	call __do_softirq
1267	leaveq
1268	CFI_DEF_CFA_REGISTER	rsp
1269	CFI_ADJUST_CFA_OFFSET   -8
1270	decl %gs:pda_irqcount
1271	ret
1272	CFI_ENDPROC
1273END(call_softirq)
1274
1275#ifdef CONFIG_XEN
1276zeroentry xen_hypervisor_callback xen_do_hypervisor_callback
1277
1278/*
1279 * A note on the "critical region" in our callback handler.
1280 * We want to avoid stacking callback handlers due to events occurring
1281 * during handling of the last event. To do this, we keep events disabled
1282 * until we've done all processing. HOWEVER, we must enable events before
1283 * popping the stack frame (can't be done atomically) and so it would still
1284 * be possible to get enough handler activations to overflow the stack.
1285 * Although unlikely, bugs of that kind are hard to track down, so we'd
1286 * like to avoid the possibility.
1287 * So, on entry to the handler we detect whether we interrupted an
1288 * existing activation in its critical region -- if so, we pop the current
1289 * activation and restart the handler using the previous one.
1290 */
1291ENTRY(xen_do_hypervisor_callback)   # do_hypervisor_callback(struct pt_regs *)
1292	CFI_STARTPROC
1293/*
1294 * Since we don't modify %rdi, xen_evtchn_do_upcall(struct pt_regs *) will
1295 * see the correct pointer to the pt_regs.
1296 */
1297	movq %rdi, %rsp            # we don't return, adjust the stack frame
1298	CFI_ENDPROC
1299	DEFAULT_FRAME
130011:	incl %gs:pda_irqcount
1301	movq %rsp,%rbp
1302	CFI_DEF_CFA_REGISTER rbp
1303	cmovzq %gs:pda_irqstackptr,%rsp
1304	pushq %rbp			# backlink for old unwinder
1305	call xen_evtchn_do_upcall
1306	popq %rsp
1307	CFI_DEF_CFA_REGISTER rsp
1308	decl %gs:pda_irqcount
1309	jmp  error_exit
1310	CFI_ENDPROC
1311END(xen_do_hypervisor_callback)
1312
1313/*
1314 * Hypervisor uses this for application faults while it executes.
1315 * We get here for two reasons:
1316 *  1. Fault while reloading DS, ES, FS or GS
1317 *  2. Fault while executing IRET
1318 * Category 1 we do not need to fix up as Xen has already reloaded all segment
1319 * registers that could be reloaded and zeroed the others.
1320 * Category 2 we fix up by killing the current process. We cannot use the
1321 * normal Linux return path in this case because if we use the IRET hypercall
1322 * to pop the stack frame we end up in an infinite loop of failsafe callbacks.
1323 * We distinguish between categories by comparing each saved segment register
1324 * with its current contents: any discrepancy means we are in category 1.
1325 */
1326ENTRY(xen_failsafe_callback)
1327	INTR_FRAME 1 (6*8)
1328	/*CFI_REL_OFFSET gs,GS*/
1329	/*CFI_REL_OFFSET fs,FS*/
1330	/*CFI_REL_OFFSET es,ES*/
1331	/*CFI_REL_OFFSET ds,DS*/
1332	CFI_REL_OFFSET r11,8
1333	CFI_REL_OFFSET rcx,0
1334	movw %ds,%cx
1335	cmpw %cx,0x10(%rsp)
1336	CFI_REMEMBER_STATE
1337	jne 1f
1338	movw %es,%cx
1339	cmpw %cx,0x18(%rsp)
1340	jne 1f
1341	movw %fs,%cx
1342	cmpw %cx,0x20(%rsp)
1343	jne 1f
1344	movw %gs,%cx
1345	cmpw %cx,0x28(%rsp)
1346	jne 1f
1347	/* All segments match their saved values => Category 2 (Bad IRET). */
1348	movq (%rsp),%rcx
1349	CFI_RESTORE rcx
1350	movq 8(%rsp),%r11
1351	CFI_RESTORE r11
1352	addq $0x30,%rsp
1353	CFI_ADJUST_CFA_OFFSET -0x30
1354	pushq_cfi $0	/* RIP */
1355	pushq_cfi %r11
1356	pushq_cfi %rcx
1357	jmp general_protection
1358	CFI_RESTORE_STATE
13591:	/* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */
1360	movq (%rsp),%rcx
1361	CFI_RESTORE rcx
1362	movq 8(%rsp),%r11
1363	CFI_RESTORE r11
1364	addq $0x30,%rsp
1365	CFI_ADJUST_CFA_OFFSET -0x30
1366	pushq_cfi $0
1367	SAVE_ALL
1368	jmp error_exit
1369	CFI_ENDPROC
1370END(xen_failsafe_callback)
1371
1372#endif /* CONFIG_XEN */
1373
1374/*
1375 * Some functions should be protected against kprobes
1376 */
1377	.pushsection .kprobes.text, "ax"
1378
1379paranoidzeroentry_ist debug do_debug DEBUG_STACK
1380paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
1381paranoiderrorentry stack_segment do_stack_segment
1382errorentry general_protection do_general_protection
1383errorentry page_fault do_page_fault
1384#ifdef CONFIG_X86_MCE
1385paranoidzeroentry machine_check do_machine_check
1386#endif
1387
1388	/*
1389	 * "Paranoid" exit path from exception stack.
1390	 * Paranoid because this is used by NMIs and cannot take
1391	 * any kernel state for granted.
1392	 * We don't do kernel preemption checks here, because only
1393	 * NMI should be common and it does not enable IRQs and
1394	 * cannot get reschedule ticks.
1395	 *
1396	 * "trace" is 0 for the NMI handler only, because irq-tracing
1397	 * is fundamentally NMI-unsafe. (we cannot change the soft and
1398	 * hard flags at once, atomically)
1399	 */
1400
1401	/* ebx:	no swapgs flag */
1402ENTRY(paranoid_exit)
1403	INTR_FRAME
1404	DISABLE_INTERRUPTS(CLBR_NONE)
1405	TRACE_IRQS_OFF
1406	testl %ebx,%ebx				/* swapgs needed? */
1407	jnz paranoid_restore
1408	testl $3,CS(%rsp)
1409	jnz   paranoid_userspace
1410paranoid_swapgs:
1411	TRACE_IRQS_IRETQ 0
1412	SWAPGS_UNSAFE_STACK
1413paranoid_restore:
1414	RESTORE_ALL 8
1415	jmp irq_return
1416paranoid_userspace:
1417	GET_THREAD_INFO(%rcx)
1418	movl TI_flags(%rcx),%ebx
1419	andl $_TIF_WORK_MASK,%ebx
1420	jz paranoid_swapgs
1421	movq %rsp,%rdi			/* &pt_regs */
1422	call sync_regs
1423	movq %rax,%rsp			/* switch stack for scheduling */
1424	testl $_TIF_NEED_RESCHED,%ebx
1425	jnz paranoid_schedule
1426	movl %ebx,%edx			/* arg3: thread flags */
1427	TRACE_IRQS_ON
1428	ENABLE_INTERRUPTS(CLBR_NONE)
1429	xorl %esi,%esi 			/* arg2: oldset */
1430	movq %rsp,%rdi 			/* arg1: &pt_regs */
1431	call do_notify_resume
1432	DISABLE_INTERRUPTS(CLBR_NONE)
1433	TRACE_IRQS_OFF
1434	jmp paranoid_userspace
1435paranoid_schedule:
1436	TRACE_IRQS_ON
1437	ENABLE_INTERRUPTS(CLBR_ANY)
1438	call schedule
1439	DISABLE_INTERRUPTS(CLBR_ANY)
1440	TRACE_IRQS_OFF
1441	jmp paranoid_userspace
1442	CFI_ENDPROC
1443END(paranoid_exit)
1444
1445/*
1446 * Exception entry point. This expects an error code/orig_rax on the stack.
1447 * Returns the "no swapgs" flag in %ebx.
1448 */
1449ENTRY(error_entry)
1450	XCPT_FRAME
1451	CFI_ADJUST_CFA_OFFSET 15*8
1452	/* oldrax contains error code */
1453	cld
1454	movq_cfi rdi, RDI+8
1455	movq_cfi rsi, RSI+8
1456	movq_cfi rdx, RDX+8
1457	movq_cfi rcx, RCX+8
1458	movq_cfi rax, RAX+8
1459	movq_cfi  r8,  R8+8
1460	movq_cfi  r9,  R9+8
1461	movq_cfi r10, R10+8
1462	movq_cfi r11, R11+8
1463	movq_cfi rbx, RBX+8
1464	movq_cfi rbp, RBP+8
1465	movq_cfi r12, R12+8
1466	movq_cfi r13, R13+8
1467	movq_cfi r14, R14+8
1468	movq_cfi r15, R15+8
1469	xorl %ebx,%ebx
1470	testl $3,CS+8(%rsp)
1471	je error_kernelspace
1472error_swapgs:
1473	SWAPGS
1474error_sti:
1475	TRACE_IRQS_OFF
1476	ret
1477	CFI_ENDPROC
1478
1479/*
1480 * There are two places in the kernel that can potentially fault with
1481 * usergs. Handle them here. The exception handlers after iret run with
1482 * kernel gs again, so don't set the user space flag. B stepping K8s
1483 * sometimes report a truncated RIP for IRET exceptions returning to
1484 * compat mode. Check for these here too.
1485 */
1486error_kernelspace:
1487	incl %ebx
1488	leaq irq_return(%rip),%rcx
1489	cmpq %rcx,RIP+8(%rsp)
1490	je error_swapgs
1491	movl %ecx,%ecx	/* zero extend */
1492	cmpq %rcx,RIP+8(%rsp)
1493	je error_swapgs
1494	cmpq $gs_change,RIP+8(%rsp)
1495	je error_swapgs
1496	jmp error_sti
1497END(error_entry)
1498
1499
1500/* ebx:	no swapgs flag (1: don't need swapgs, 0: need it) */
1501ENTRY(error_exit)
1502	DEFAULT_FRAME
1503	movl %ebx,%eax
1504	RESTORE_REST
1505	DISABLE_INTERRUPTS(CLBR_NONE)
1506	TRACE_IRQS_OFF
1507	GET_THREAD_INFO(%rcx)
1508	testl %eax,%eax
1509	jne retint_kernel
1510	LOCKDEP_SYS_EXIT_IRQ
1511	movl TI_flags(%rcx),%edx
1512	movl $_TIF_WORK_MASK,%edi
1513	andl %edi,%edx
1514	jnz retint_careful
1515	jmp retint_swapgs
1516	CFI_ENDPROC
1517END(error_exit)
1518
1519
1520	/* runs on exception stack */
1521ENTRY(nmi)
1522	INTR_FRAME
1523	PARAVIRT_ADJUST_EXCEPTION_FRAME
1524	pushq_cfi $-1
1525	subq $15*8, %rsp
1526	CFI_ADJUST_CFA_OFFSET 15*8
1527	call save_paranoid
1528	DEFAULT_FRAME 0
1529	/* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
1530	movq %rsp,%rdi
1531	movq $-1,%rsi
1532	call do_nmi
1533#ifdef CONFIG_TRACE_IRQFLAGS
1534	/* paranoidexit; without TRACE_IRQS_OFF */
1535	/* ebx:	no swapgs flag */
1536	DISABLE_INTERRUPTS(CLBR_NONE)
1537	testl %ebx,%ebx				/* swapgs needed? */
1538	jnz nmi_restore
1539	testl $3,CS(%rsp)
1540	jnz nmi_userspace
1541nmi_swapgs:
1542	SWAPGS_UNSAFE_STACK
1543nmi_restore:
1544	RESTORE_ALL 8
1545	jmp irq_return
1546nmi_userspace:
1547	GET_THREAD_INFO(%rcx)
1548	movl TI_flags(%rcx),%ebx
1549	andl $_TIF_WORK_MASK,%ebx
1550	jz nmi_swapgs
1551	movq %rsp,%rdi			/* &pt_regs */
1552	call sync_regs
1553	movq %rax,%rsp			/* switch stack for scheduling */
1554	testl $_TIF_NEED_RESCHED,%ebx
1555	jnz nmi_schedule
1556	movl %ebx,%edx			/* arg3: thread flags */
1557	ENABLE_INTERRUPTS(CLBR_NONE)
1558	xorl %esi,%esi 			/* arg2: oldset */
1559	movq %rsp,%rdi 			/* arg1: &pt_regs */
1560	call do_notify_resume
1561	DISABLE_INTERRUPTS(CLBR_NONE)
1562	jmp nmi_userspace
1563nmi_schedule:
1564	ENABLE_INTERRUPTS(CLBR_ANY)
1565	call schedule
1566	DISABLE_INTERRUPTS(CLBR_ANY)
1567	jmp nmi_userspace
1568	CFI_ENDPROC
1569#else
1570	jmp paranoid_exit
1571	CFI_ENDPROC
1572#endif
1573END(nmi)
1574
1575ENTRY(ignore_sysret)
1576	CFI_STARTPROC
1577	mov $-ENOSYS,%eax
1578	sysret
1579	CFI_ENDPROC
1580END(ignore_sysret)
1581
1582/*
1583 * End of kprobes section
1584 */
1585	.popsection
1586