• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Copyright (C) 1995  Linus Torvalds
4  *
5  *  Pentium III FXSR, SSE support
6  *	Gareth Hughes <gareth@valinux.com>, May 2000
7  *
8  *  X86-64 port
9  *	Andi Kleen.
10  *
11  *	CPU hotplug support - ashok.raj@intel.com
12  */
13 
14 /*
15  * This file handles the architecture-dependent parts of process handling.
16  */
17 
18 #include <linux/cpu.h>
19 #include <linux/errno.h>
20 #include <linux/sched.h>
21 #include <linux/sched/task.h>
22 #include <linux/sched/task_stack.h>
23 #include <linux/fs.h>
24 #include <linux/kernel.h>
25 #include <linux/mm.h>
26 #include <linux/elfcore.h>
27 #include <linux/smp.h>
28 #include <linux/slab.h>
29 #include <linux/user.h>
30 #include <linux/interrupt.h>
31 #include <linux/delay.h>
32 #include <linux/export.h>
33 #include <linux/ptrace.h>
34 #include <linux/notifier.h>
35 #include <linux/kprobes.h>
36 #include <linux/kdebug.h>
37 #include <linux/prctl.h>
38 #include <linux/uaccess.h>
39 #include <linux/io.h>
40 #include <linux/ftrace.h>
41 #include <linux/syscalls.h>
42 
43 #include <asm/processor.h>
44 #include <asm/fpu/internal.h>
45 #include <asm/mmu_context.h>
46 #include <asm/prctl.h>
47 #include <asm/desc.h>
48 #include <asm/proto.h>
49 #include <asm/ia32.h>
50 #include <asm/debugreg.h>
51 #include <asm/switch_to.h>
52 #include <asm/xen/hypervisor.h>
53 #include <asm/vdso.h>
54 #include <asm/resctrl.h>
55 #include <asm/unistd.h>
56 #include <asm/fsgsbase.h>
57 #ifdef CONFIG_IA32_EMULATION
58 /* Not included via unistd.h */
59 #include <asm/unistd_32_ia32.h>
60 #endif
61 
62 #include "process.h"
63 
64 /* Prints also some state that isn't saved in the pt_regs */
__show_regs(struct pt_regs * regs,enum show_regs_mode mode,const char * log_lvl)65 void __show_regs(struct pt_regs *regs, enum show_regs_mode mode,
66 		 const char *log_lvl)
67 {
68 	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
69 	unsigned long d0, d1, d2, d3, d6, d7;
70 	unsigned int fsindex, gsindex;
71 	unsigned int ds, es;
72 
73 	show_iret_regs(regs, log_lvl);
74 
75 	if (regs->orig_ax != -1)
76 		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
77 	else
78 		pr_cont("\n");
79 
80 	printk("%sRAX: %016lx RBX: %016lx RCX: %016lx\n",
81 	       log_lvl, regs->ax, regs->bx, regs->cx);
82 	printk("%sRDX: %016lx RSI: %016lx RDI: %016lx\n",
83 	       log_lvl, regs->dx, regs->si, regs->di);
84 	printk("%sRBP: %016lx R08: %016lx R09: %016lx\n",
85 	       log_lvl, regs->bp, regs->r8, regs->r9);
86 	printk("%sR10: %016lx R11: %016lx R12: %016lx\n",
87 	       log_lvl, regs->r10, regs->r11, regs->r12);
88 	printk("%sR13: %016lx R14: %016lx R15: %016lx\n",
89 	       log_lvl, regs->r13, regs->r14, regs->r15);
90 
91 	if (mode == SHOW_REGS_SHORT)
92 		return;
93 
94 	if (mode == SHOW_REGS_USER) {
95 		rdmsrl(MSR_FS_BASE, fs);
96 		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
97 		printk("%sFS:  %016lx GS:  %016lx\n",
98 		       log_lvl, fs, shadowgs);
99 		return;
100 	}
101 
102 	asm("movl %%ds,%0" : "=r" (ds));
103 	asm("movl %%es,%0" : "=r" (es));
104 	asm("movl %%fs,%0" : "=r" (fsindex));
105 	asm("movl %%gs,%0" : "=r" (gsindex));
106 
107 	rdmsrl(MSR_FS_BASE, fs);
108 	rdmsrl(MSR_GS_BASE, gs);
109 	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
110 
111 	cr0 = read_cr0();
112 	cr2 = read_cr2();
113 	cr3 = __read_cr3();
114 	cr4 = __read_cr4();
115 
116 	printk("%sFS:  %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
117 	       log_lvl, fs, fsindex, gs, gsindex, shadowgs);
118 	printk("%sCS:  %04lx DS: %04x ES: %04x CR0: %016lx\n",
119 		log_lvl, regs->cs, ds, es, cr0);
120 	printk("%sCR2: %016lx CR3: %016lx CR4: %016lx\n",
121 		log_lvl, cr2, cr3, cr4);
122 
123 	get_debugreg(d0, 0);
124 	get_debugreg(d1, 1);
125 	get_debugreg(d2, 2);
126 	get_debugreg(d3, 3);
127 	get_debugreg(d6, 6);
128 	get_debugreg(d7, 7);
129 
130 	/* Only print out debug registers if they are in their non-default state. */
131 	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
132 	    (d6 == DR6_RESERVED) && (d7 == 0x400))) {
133 		printk("%sDR0: %016lx DR1: %016lx DR2: %016lx\n",
134 		       log_lvl, d0, d1, d2);
135 		printk("%sDR3: %016lx DR6: %016lx DR7: %016lx\n",
136 		       log_lvl, d3, d6, d7);
137 	}
138 
139 	if (boot_cpu_has(X86_FEATURE_OSPKE))
140 		printk("%sPKRU: %08x\n", log_lvl, read_pkru());
141 }
142 
release_thread(struct task_struct * dead_task)143 void release_thread(struct task_struct *dead_task)
144 {
145 	WARN_ON(dead_task->mm);
146 }
147 
/* Selects which of the two segment registers (FS or GS) a helper acts on. */
enum which_selector {
	FS,
	GS
};
152 
153 /*
154  * Out of line to be protected from kprobes and tracing. If this would be
155  * traced or probed than any access to a per CPU variable happens with
156  * the wrong GS.
157  *
158  * It is not used on Xen paravirt. When paravirt support is needed, it
159  * needs to be renamed with native_ prefix.
160  */
__rdgsbase_inactive(void)161 static noinstr unsigned long __rdgsbase_inactive(void)
162 {
163 	unsigned long gsbase;
164 
165 	lockdep_assert_irqs_disabled();
166 
167 	if (!static_cpu_has(X86_FEATURE_XENPV)) {
168 		native_swapgs();
169 		gsbase = rdgsbase();
170 		native_swapgs();
171 	} else {
172 		instrumentation_begin();
173 		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
174 		instrumentation_end();
175 	}
176 
177 	return gsbase;
178 }
179 
180 /*
181  * Out of line to be protected from kprobes and tracing. If this would be
182  * traced or probed than any access to a per CPU variable happens with
183  * the wrong GS.
184  *
185  * It is not used on Xen paravirt. When paravirt support is needed, it
186  * needs to be renamed with native_ prefix.
187  */
__wrgsbase_inactive(unsigned long gsbase)188 static noinstr void __wrgsbase_inactive(unsigned long gsbase)
189 {
190 	lockdep_assert_irqs_disabled();
191 
192 	if (!static_cpu_has(X86_FEATURE_XENPV)) {
193 		native_swapgs();
194 		wrgsbase(gsbase);
195 		native_swapgs();
196 	} else {
197 		instrumentation_begin();
198 		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
199 		instrumentation_end();
200 	}
201 }
202 
203 /*
204  * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
205  * not available.  The goal is to be reasonably fast on non-FSGSBASE systems.
206  * It's forcibly inlined because it'll generate better code and this function
207  * is hot.
208  */
save_base_legacy(struct task_struct * prev_p,unsigned short selector,enum which_selector which)209 static __always_inline void save_base_legacy(struct task_struct *prev_p,
210 					     unsigned short selector,
211 					     enum which_selector which)
212 {
213 	if (likely(selector == 0)) {
214 		/*
215 		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
216 		 * be the pre-existing saved base or it could be zero.  On AMD
217 		 * (with X86_BUG_NULL_SEG), the segment base could be almost
218 		 * anything.
219 		 *
220 		 * This branch is very hot (it's hit twice on almost every
221 		 * context switch between 64-bit programs), and avoiding
222 		 * the RDMSR helps a lot, so we just assume that whatever
223 		 * value is already saved is correct.  This matches historical
224 		 * Linux behavior, so it won't break existing applications.
225 		 *
226 		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
227 		 * report that the base is zero, it needs to actually be zero:
228 		 * see the corresponding logic in load_seg_legacy.
229 		 */
230 	} else {
231 		/*
232 		 * If the selector is 1, 2, or 3, then the base is zero on
233 		 * !X86_BUG_NULL_SEG CPUs and could be anything on
234 		 * X86_BUG_NULL_SEG CPUs.  In the latter case, Linux
235 		 * has never attempted to preserve the base across context
236 		 * switches.
237 		 *
238 		 * If selector > 3, then it refers to a real segment, and
239 		 * saving the base isn't necessary.
240 		 */
241 		if (which == FS)
242 			prev_p->thread.fsbase = 0;
243 		else
244 			prev_p->thread.gsbase = 0;
245 	}
246 }
247 
save_fsgs(struct task_struct * task)248 static __always_inline void save_fsgs(struct task_struct *task)
249 {
250 	savesegment(fs, task->thread.fsindex);
251 	savesegment(gs, task->thread.gsindex);
252 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
253 		/*
254 		 * If FSGSBASE is enabled, we can't make any useful guesses
255 		 * about the base, and user code expects us to save the current
256 		 * value.  Fortunately, reading the base directly is efficient.
257 		 */
258 		task->thread.fsbase = rdfsbase();
259 		task->thread.gsbase = __rdgsbase_inactive();
260 	} else {
261 		save_base_legacy(task, task->thread.fsindex, FS);
262 		save_base_legacy(task, task->thread.gsindex, GS);
263 	}
264 }
265 
266 /*
267  * While a process is running,current->thread.fsbase and current->thread.gsbase
268  * may not match the corresponding CPU registers (see save_base_legacy()).
269  */
current_save_fsgs(void)270 void current_save_fsgs(void)
271 {
272 	unsigned long flags;
273 
274 	/* Interrupts need to be off for FSGSBASE */
275 	local_irq_save(flags);
276 	save_fsgs(current);
277 	local_irq_restore(flags);
278 }
279 #if IS_ENABLED(CONFIG_KVM)
280 EXPORT_SYMBOL_GPL(current_save_fsgs);
281 #endif
282 
loadseg(enum which_selector which,unsigned short sel)283 static __always_inline void loadseg(enum which_selector which,
284 				    unsigned short sel)
285 {
286 	if (which == FS)
287 		loadsegment(fs, sel);
288 	else
289 		load_gs_index(sel);
290 }
291 
load_seg_legacy(unsigned short prev_index,unsigned long prev_base,unsigned short next_index,unsigned long next_base,enum which_selector which)292 static __always_inline void load_seg_legacy(unsigned short prev_index,
293 					    unsigned long prev_base,
294 					    unsigned short next_index,
295 					    unsigned long next_base,
296 					    enum which_selector which)
297 {
298 	if (likely(next_index <= 3)) {
299 		/*
300 		 * The next task is using 64-bit TLS, is not using this
301 		 * segment at all, or is having fun with arcane CPU features.
302 		 */
303 		if (next_base == 0) {
304 			/*
305 			 * Nasty case: on AMD CPUs, we need to forcibly zero
306 			 * the base.
307 			 */
308 			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
309 				loadseg(which, __USER_DS);
310 				loadseg(which, next_index);
311 			} else {
312 				/*
313 				 * We could try to exhaustively detect cases
314 				 * under which we can skip the segment load,
315 				 * but there's really only one case that matters
316 				 * for performance: if both the previous and
317 				 * next states are fully zeroed, we can skip
318 				 * the load.
319 				 *
320 				 * (This assumes that prev_base == 0 has no
321 				 * false positives.  This is the case on
322 				 * Intel-style CPUs.)
323 				 */
324 				if (likely(prev_index | next_index | prev_base))
325 					loadseg(which, next_index);
326 			}
327 		} else {
328 			if (prev_index != next_index)
329 				loadseg(which, next_index);
330 			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
331 			       next_base);
332 		}
333 	} else {
334 		/*
335 		 * The next task is using a real segment.  Loading the selector
336 		 * is sufficient.
337 		 */
338 		loadseg(which, next_index);
339 	}
340 }
341 
x86_fsgsbase_load(struct thread_struct * prev,struct thread_struct * next)342 static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
343 					      struct thread_struct *next)
344 {
345 	if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
346 		/* Update the FS and GS selectors if they could have changed. */
347 		if (unlikely(prev->fsindex || next->fsindex))
348 			loadseg(FS, next->fsindex);
349 		if (unlikely(prev->gsindex || next->gsindex))
350 			loadseg(GS, next->gsindex);
351 
352 		/* Update the bases. */
353 		wrfsbase(next->fsbase);
354 		__wrgsbase_inactive(next->gsbase);
355 	} else {
356 		load_seg_legacy(prev->fsindex, prev->fsbase,
357 				next->fsindex, next->fsbase, FS);
358 		load_seg_legacy(prev->gsindex, prev->gsbase,
359 				next->gsindex, next->gsbase, GS);
360 	}
361 }
362 
x86_fsgsbase_read_task(struct task_struct * task,unsigned short selector)363 unsigned long x86_fsgsbase_read_task(struct task_struct *task,
364 				     unsigned short selector)
365 {
366 	unsigned short idx = selector >> 3;
367 	unsigned long base;
368 
369 	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
370 		if (unlikely(idx >= GDT_ENTRIES))
371 			return 0;
372 
373 		/*
374 		 * There are no user segments in the GDT with nonzero bases
375 		 * other than the TLS segments.
376 		 */
377 		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
378 			return 0;
379 
380 		idx -= GDT_ENTRY_TLS_MIN;
381 		base = get_desc_base(&task->thread.tls_array[idx]);
382 	} else {
383 #ifdef CONFIG_MODIFY_LDT_SYSCALL
384 		struct ldt_struct *ldt;
385 
386 		/*
387 		 * If performance here mattered, we could protect the LDT
388 		 * with RCU.  This is a slow path, though, so we can just
389 		 * take the mutex.
390 		 */
391 		mutex_lock(&task->mm->context.lock);
392 		ldt = task->mm->context.ldt;
393 		if (unlikely(!ldt || idx >= ldt->nr_entries))
394 			base = 0;
395 		else
396 			base = get_desc_base(ldt->entries + idx);
397 		mutex_unlock(&task->mm->context.lock);
398 #else
399 		base = 0;
400 #endif
401 	}
402 
403 	return base;
404 }
405 
x86_gsbase_read_cpu_inactive(void)406 unsigned long x86_gsbase_read_cpu_inactive(void)
407 {
408 	unsigned long gsbase;
409 
410 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
411 		unsigned long flags;
412 
413 		local_irq_save(flags);
414 		gsbase = __rdgsbase_inactive();
415 		local_irq_restore(flags);
416 	} else {
417 		rdmsrl(MSR_KERNEL_GS_BASE, gsbase);
418 	}
419 
420 	return gsbase;
421 }
422 
x86_gsbase_write_cpu_inactive(unsigned long gsbase)423 void x86_gsbase_write_cpu_inactive(unsigned long gsbase)
424 {
425 	if (boot_cpu_has(X86_FEATURE_FSGSBASE)) {
426 		unsigned long flags;
427 
428 		local_irq_save(flags);
429 		__wrgsbase_inactive(gsbase);
430 		local_irq_restore(flags);
431 	} else {
432 		wrmsrl(MSR_KERNEL_GS_BASE, gsbase);
433 	}
434 }
435 
x86_fsbase_read_task(struct task_struct * task)436 unsigned long x86_fsbase_read_task(struct task_struct *task)
437 {
438 	unsigned long fsbase;
439 
440 	if (task == current)
441 		fsbase = x86_fsbase_read_cpu();
442 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
443 		 (task->thread.fsindex == 0))
444 		fsbase = task->thread.fsbase;
445 	else
446 		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);
447 
448 	return fsbase;
449 }
450 
x86_gsbase_read_task(struct task_struct * task)451 unsigned long x86_gsbase_read_task(struct task_struct *task)
452 {
453 	unsigned long gsbase;
454 
455 	if (task == current)
456 		gsbase = x86_gsbase_read_cpu_inactive();
457 	else if (boot_cpu_has(X86_FEATURE_FSGSBASE) ||
458 		 (task->thread.gsindex == 0))
459 		gsbase = task->thread.gsbase;
460 	else
461 		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);
462 
463 	return gsbase;
464 }
465 
x86_fsbase_write_task(struct task_struct * task,unsigned long fsbase)466 void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
467 {
468 	WARN_ON_ONCE(task == current);
469 
470 	task->thread.fsbase = fsbase;
471 }
472 
x86_gsbase_write_task(struct task_struct * task,unsigned long gsbase)473 void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
474 {
475 	WARN_ON_ONCE(task == current);
476 
477 	task->thread.gsbase = gsbase;
478 }
479 
480 static void
start_thread_common(struct pt_regs * regs,unsigned long new_ip,unsigned long new_sp,unsigned int _cs,unsigned int _ss,unsigned int _ds)481 start_thread_common(struct pt_regs *regs, unsigned long new_ip,
482 		    unsigned long new_sp,
483 		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
484 {
485 	WARN_ON_ONCE(regs != current_pt_regs());
486 
487 	if (static_cpu_has(X86_BUG_NULL_SEG)) {
488 		/* Loading zero below won't clear the base. */
489 		loadsegment(fs, __USER_DS);
490 		load_gs_index(__USER_DS);
491 	}
492 
493 	loadsegment(fs, 0);
494 	loadsegment(es, _ds);
495 	loadsegment(ds, _ds);
496 	load_gs_index(0);
497 
498 	regs->ip		= new_ip;
499 	regs->sp		= new_sp;
500 	regs->cs		= _cs;
501 	regs->ss		= _ss;
502 	regs->flags		= X86_EFLAGS_IF;
503 }
504 
505 void
start_thread(struct pt_regs * regs,unsigned long new_ip,unsigned long new_sp)506 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
507 {
508 	start_thread_common(regs, new_ip, new_sp,
509 			    __USER_CS, __USER_DS, 0);
510 }
511 EXPORT_SYMBOL_GPL(start_thread);
512 
#ifdef CONFIG_COMPAT
/*
 * Compat variant of start_thread(): the code segment is __USER_CS when
 * TIF_X32 is set and __USER32_CS otherwise; SS, DS and ES all use
 * __USER_DS.
 */
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif
522 
523 /*
524  *	switch_to(x,y) should switch tasks from x to y.
525  *
526  * This could still be optimized:
527  * - fold all the options into a flag word and test it with a single test.
528  * - could test fs/gs bitsliced
529  *
530  * Kprobes not supported here. Set the probe on schedule instead.
531  * Function graph tracer not supported too.
532  */
533 __visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct * prev_p,struct task_struct * next_p)534 __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
535 {
536 	struct thread_struct *prev = &prev_p->thread;
537 	struct thread_struct *next = &next_p->thread;
538 	int cpu = smp_processor_id();
539 
540 	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
541 		     this_cpu_read(irq_count) != -1);
542 
543 	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
544 		switch_fpu_prepare(prev_p, cpu);
545 
546 	/* We must save %fs and %gs before load_TLS() because
547 	 * %fs and %gs may be cleared by load_TLS().
548 	 *
549 	 * (e.g. xen_load_tls())
550 	 */
551 	save_fsgs(prev_p);
552 
553 	/*
554 	 * Load TLS before restoring any segments so that segment loads
555 	 * reference the correct GDT entries.
556 	 */
557 	load_TLS(next, cpu);
558 
559 	/*
560 	 * Leave lazy mode, flushing any hypercalls made here.  This
561 	 * must be done after loading TLS entries in the GDT but before
562 	 * loading segments that might reference them.
563 	 */
564 	arch_end_context_switch(next_p);
565 
566 	/* Switch DS and ES.
567 	 *
568 	 * Reading them only returns the selectors, but writing them (if
569 	 * nonzero) loads the full descriptor from the GDT or LDT.  The
570 	 * LDT for next is loaded in switch_mm, and the GDT is loaded
571 	 * above.
572 	 *
573 	 * We therefore need to write new values to the segment
574 	 * registers on every context switch unless both the new and old
575 	 * values are zero.
576 	 *
577 	 * Note that we don't need to do anything for CS and SS, as
578 	 * those are saved and restored as part of pt_regs.
579 	 */
580 	savesegment(es, prev->es);
581 	if (unlikely(next->es | prev->es))
582 		loadsegment(es, next->es);
583 
584 	savesegment(ds, prev->ds);
585 	if (unlikely(next->ds | prev->ds))
586 		loadsegment(ds, next->ds);
587 
588 	x86_fsgsbase_load(prev, next);
589 
590 	/*
591 	 * Switch the PDA and FPU contexts.
592 	 */
593 	this_cpu_write(current_task, next_p);
594 	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));
595 
596 	switch_fpu_finish(next_p);
597 
598 	/* Reload sp0. */
599 	update_task_stack(next_p);
600 
601 	switch_to_extra(prev_p, next_p);
602 
603 	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
604 		/*
605 		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
606 		 * does not update the cached descriptor.  As a result, if we
607 		 * do SYSRET while SS is NULL, we'll end up in user mode with
608 		 * SS apparently equal to __USER_DS but actually unusable.
609 		 *
610 		 * The straightforward workaround would be to fix it up just
611 		 * before SYSRET, but that would slow down the system call
612 		 * fast paths.  Instead, we ensure that SS is never NULL in
613 		 * system call context.  We do this by replacing NULL SS
614 		 * selectors at every context switch.  SYSCALL sets up a valid
615 		 * SS, so the only way to get NULL is to re-enter the kernel
616 		 * from CPL 3 through an interrupt.  Since that can't happen
617 		 * in the same task as a running syscall, we are guaranteed to
618 		 * context switch between every interrupt vector entry and a
619 		 * subsequent SYSRET.
620 		 *
621 		 * We read SS first because SS reads are much faster than
622 		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
623 		 * it previously had a different non-NULL value.
624 		 */
625 		unsigned short ss_sel;
626 		savesegment(ss, ss_sel);
627 		if (ss_sel != __KERNEL_DS)
628 			loadsegment(ss, __KERNEL_DS);
629 	}
630 
631 	/* Load the Intel cache allocation PQR MSR. */
632 	resctrl_sched_in();
633 
634 	return prev_p;
635 }
636 
set_personality_64bit(void)637 void set_personality_64bit(void)
638 {
639 	/* inherit personality from parent */
640 
641 	/* Make sure to be in 64bit mode */
642 	clear_thread_flag(TIF_IA32);
643 	clear_thread_flag(TIF_ADDR32);
644 	clear_thread_flag(TIF_X32);
645 	/* Pretend that this comes from a 64bit execve */
646 	task_pt_regs(current)->orig_ax = __NR_execve;
647 	current_thread_info()->status &= ~TS_COMPAT;
648 
649 	/* Ensure the corresponding mm is not marked. */
650 	if (current->mm)
651 		current->mm->context.ia32_compat = 0;
652 
653 	/* TBD: overwrites user setup. Should have two bits.
654 	   But 64bit processes have always behaved this way,
655 	   so it's not too bad. The main problem is just that
656 	   32bit children are affected again. */
657 	current->personality &= ~READ_IMPLIES_EXEC;
658 }
659 
/*
 * Switch the current task to the x32 personality. Compiles to an empty
 * function unless CONFIG_X86_X32 is set.
 */
static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness so set x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from a x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}
680 
/*
 * Switch the current task to the ia32 personality. Compiles to an empty
 * function unless CONFIG_IA32_EMULATION is set.
 */
static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}
694 
/*
 * Switch the current task to a 32-bit address-space personality.
 * @x32: true selects the x32 personality, false the ia32 one.
 */
void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);
706 
#ifdef CONFIG_CHECKPOINT_RESTORE
/*
 * Map the vDSO @image at @addr via map_vdso_once().
 *
 * Returns the image size on success, or the nonzero value returned by
 * map_vdso_once() on failure.
 */
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif
719 
do_arch_prctl_64(struct task_struct * task,int option,unsigned long arg2)720 long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
721 {
722 	int ret = 0;
723 
724 	switch (option) {
725 	case ARCH_SET_GS: {
726 		if (unlikely(arg2 >= TASK_SIZE_MAX))
727 			return -EPERM;
728 
729 		preempt_disable();
730 		/*
731 		 * ARCH_SET_GS has always overwritten the index
732 		 * and the base. Zero is the most sensible value
733 		 * to put in the index, and is the only value that
734 		 * makes any sense if FSGSBASE is unavailable.
735 		 */
736 		if (task == current) {
737 			loadseg(GS, 0);
738 			x86_gsbase_write_cpu_inactive(arg2);
739 
740 			/*
741 			 * On non-FSGSBASE systems, save_base_legacy() expects
742 			 * that we also fill in thread.gsbase.
743 			 */
744 			task->thread.gsbase = arg2;
745 
746 		} else {
747 			task->thread.gsindex = 0;
748 			x86_gsbase_write_task(task, arg2);
749 		}
750 		preempt_enable();
751 		break;
752 	}
753 	case ARCH_SET_FS: {
754 		/*
755 		 * Not strictly needed for %fs, but do it for symmetry
756 		 * with %gs
757 		 */
758 		if (unlikely(arg2 >= TASK_SIZE_MAX))
759 			return -EPERM;
760 
761 		preempt_disable();
762 		/*
763 		 * Set the selector to 0 for the same reason
764 		 * as %gs above.
765 		 */
766 		if (task == current) {
767 			loadseg(FS, 0);
768 			x86_fsbase_write_cpu(arg2);
769 
770 			/*
771 			 * On non-FSGSBASE systems, save_base_legacy() expects
772 			 * that we also fill in thread.fsbase.
773 			 */
774 			task->thread.fsbase = arg2;
775 		} else {
776 			task->thread.fsindex = 0;
777 			x86_fsbase_write_task(task, arg2);
778 		}
779 		preempt_enable();
780 		break;
781 	}
782 	case ARCH_GET_FS: {
783 		unsigned long base = x86_fsbase_read_task(task);
784 
785 		ret = put_user(base, (unsigned long __user *)arg2);
786 		break;
787 	}
788 	case ARCH_GET_GS: {
789 		unsigned long base = x86_gsbase_read_task(task);
790 
791 		ret = put_user(base, (unsigned long __user *)arg2);
792 		break;
793 	}
794 
795 #ifdef CONFIG_CHECKPOINT_RESTORE
796 # ifdef CONFIG_X86_X32_ABI
797 	case ARCH_MAP_VDSO_X32:
798 		return prctl_map_vdso(&vdso_image_x32, arg2);
799 # endif
800 # if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
801 	case ARCH_MAP_VDSO_32:
802 		return prctl_map_vdso(&vdso_image_32, arg2);
803 # endif
804 	case ARCH_MAP_VDSO_64:
805 		return prctl_map_vdso(&vdso_image_64, arg2);
806 #endif
807 
808 	default:
809 		ret = -EINVAL;
810 		break;
811 	}
812 
813 	return ret;
814 }
815 
/*
 * arch_prctl() system call: try the 64-bit specific options first and
 * fall back to do_arch_prctl_common() when they report -EINVAL.
 */
SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}
826 
#ifdef CONFIG_IA32_EMULATION
/*
 * Compat arch_prctl() system call: only the options handled by
 * do_arch_prctl_common() are available.
 */
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif
833 
KSTK_ESP(struct task_struct * task)834 unsigned long KSTK_ESP(struct task_struct *task)
835 {
836 	return task_pt_regs(task)->sp;
837 }
838