• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  kernel/sched/syscalls.c
4  *
5  *  Core kernel scheduler syscalls related code
6  *
7  *  Copyright (C) 1991-2002  Linus Torvalds
8  *  Copyright (C) 1998-2024  Ingo Molnar, Red Hat
9  */
10 #include <linux/sched.h>
11 #include <linux/cpuset.h>
12 #include <linux/sched/debug.h>
13 
14 #include <uapi/linux/sched/types.h>
15 
16 #include "sched.h"
17 #include "autogroup.h"
18 
19 #include <trace/hooks/sched.h>
20 #undef TRACE_INCLUDE_PATH
21 
__normal_prio(int policy,int rt_prio,int nice)22 static inline int __normal_prio(int policy, int rt_prio, int nice)
23 {
24 	int prio;
25 
26 	if (dl_policy(policy))
27 		prio = MAX_DL_PRIO - 1;
28 	else if (rt_policy(policy))
29 		prio = MAX_RT_PRIO - 1 - rt_prio;
30 	else
31 		prio = NICE_TO_PRIO(nice);
32 
33 	return prio;
34 }
35 
36 /*
37  * Calculate the expected normal priority: i.e. priority
38  * without taking RT-inheritance into account. Might be
39  * boosted by interactivity modifiers. Changes upon fork,
40  * setprio syscalls, and whenever the interactivity
41  * estimator recalculates.
42  */
normal_prio(struct task_struct * p)43 static inline int normal_prio(struct task_struct *p)
44 {
45 	return __normal_prio(p->policy, p->rt_priority, PRIO_TO_NICE(p->static_prio));
46 }
47 
48 /*
49  * Calculate the current priority, i.e. the priority
50  * taken into account by the scheduler. This value might
51  * be boosted by RT tasks, or might be boosted by
52  * interactivity modifiers. Will be RT if the task got
53  * RT-boosted. If not then it returns p->normal_prio.
54  */
effective_prio(struct task_struct * p)55 static int effective_prio(struct task_struct *p)
56 {
57 	p->normal_prio = normal_prio(p);
58 	/*
59 	 * If we are RT tasks or we were boosted to RT priority,
60 	 * keep the priority unchanged. Otherwise, update priority
61 	 * to the normal priority:
62 	 */
63 	if (!rt_or_dl_prio(p->prio))
64 		return p->normal_prio;
65 	return p->prio;
66 }
67 
set_user_nice(struct task_struct * p,long nice)68 void set_user_nice(struct task_struct *p, long nice)
69 {
70 	bool queued, running;
71 	struct rq *rq;
72 	int old_prio;
73 	bool allowed = true;
74 
75 	trace_android_rvh_set_user_nice(p, &nice);
76 	if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
77 		return;
78 	/*
79 	 * We have to be careful, if called from sys_setpriority(),
80 	 * the task might be in the middle of scheduling on another CPU.
81 	 */
82 	CLASS(task_rq_lock, rq_guard)(p);
83 	rq = rq_guard.rq;
84 
85 	update_rq_clock(rq);
86 
87 	trace_android_rvh_set_user_nice_locked(p, &nice, &allowed);
88 	if (!allowed)
89 		return;
90 
91 	/*
92 	 * The RT priorities are set via sched_setscheduler(), but we still
93 	 * allow the 'normal' nice value to be set - but as expected
94 	 * it won't have any effect on scheduling until the task is
95 	 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
96 	 */
97 	if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
98 		p->static_prio = NICE_TO_PRIO(nice);
99 		return;
100 	}
101 
102 	queued = task_on_rq_queued(p);
103 	running = task_current_donor(rq, p);
104 	if (queued)
105 		dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
106 	if (running)
107 		put_prev_task(rq, p);
108 
109 	p->static_prio = NICE_TO_PRIO(nice);
110 	set_load_weight(p, true);
111 	old_prio = p->prio;
112 	p->prio = effective_prio(p);
113 
114 	if (queued)
115 		enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
116 	if (running)
117 		set_next_task(rq, p);
118 
119 	/*
120 	 * If the task increased its priority or is running and
121 	 * lowered its priority, then reschedule its CPU:
122 	 */
123 	p->sched_class->prio_changed(rq, p, old_prio);
124 }
125 EXPORT_SYMBOL(set_user_nice);
126 
127 /*
128  * is_nice_reduction - check if nice value is an actual reduction
129  *
130  * Similar to can_nice() but does not perform a capability check.
131  *
132  * @p: task
133  * @nice: nice value
134  */
is_nice_reduction(const struct task_struct * p,const int nice)135 static bool is_nice_reduction(const struct task_struct *p, const int nice)
136 {
137 	/* Convert nice value [19,-20] to rlimit style value [1,40]: */
138 	int nice_rlim = nice_to_rlimit(nice);
139 
140 	return (nice_rlim <= task_rlimit(p, RLIMIT_NICE));
141 }
142 
143 /*
144  * can_nice - check if a task can reduce its nice value
145  * @p: task
146  * @nice: nice value
147  */
can_nice(const struct task_struct * p,const int nice)148 int can_nice(const struct task_struct *p, const int nice)
149 {
150 	return is_nice_reduction(p, nice) || capable(CAP_SYS_NICE);
151 }
152 
153 #ifdef __ARCH_WANT_SYS_NICE
154 
155 /*
156  * sys_nice - change the priority of the current process.
157  * @increment: priority increment
158  *
159  * sys_setpriority is a more generic, but much slower function that
160  * does similar things.
161  */
SYSCALL_DEFINE1(nice,int,increment)162 SYSCALL_DEFINE1(nice, int, increment)
163 {
164 	long nice, retval;
165 
166 	/*
167 	 * Setpriority might change our priority at the same moment.
168 	 * We don't have to worry. Conceptually one call occurs first
169 	 * and we have a single winner.
170 	 */
171 	increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
172 	nice = task_nice(current) + increment;
173 
174 	nice = clamp_val(nice, MIN_NICE, MAX_NICE);
175 	if (increment < 0 && !can_nice(current, nice))
176 		return -EPERM;
177 
178 	retval = security_task_setnice(current, nice);
179 	if (retval)
180 		return retval;
181 
182 	set_user_nice(current, nice);
183 	return 0;
184 }
185 
186 #endif
187 
188 /**
189  * task_prio - return the priority value of a given task.
190  * @p: the task in question.
191  *
192  * Return: The priority value as seen by users in /proc.
193  *
194  * sched policy         return value   kernel prio    user prio/nice
195  *
196  * normal, batch, idle     [0 ... 39]  [100 ... 139]          0/[-20 ... 19]
197  * fifo, rr             [-2 ... -100]     [98 ... 0]  [1 ... 99]
198  * deadline                     -101             -1           0
199  */
task_prio(const struct task_struct * p)200 int task_prio(const struct task_struct *p)
201 {
202 	return p->prio - MAX_RT_PRIO;
203 }
204 
205 /**
206  * idle_cpu - is a given CPU idle currently?
207  * @cpu: the processor in question.
208  *
209  * Return: 1 if the CPU is currently idle. 0 otherwise.
210  */
idle_cpu(int cpu)211 int idle_cpu(int cpu)
212 {
213 	struct rq *rq = cpu_rq(cpu);
214 
215 	if (rq->curr != rq->idle)
216 		return 0;
217 
218 	if (rq->nr_running)
219 		return 0;
220 
221 #ifdef CONFIG_SMP
222 	if (rq->ttwu_pending)
223 		return 0;
224 #endif
225 
226 	return 1;
227 }
228 
229 /**
230  * available_idle_cpu - is a given CPU idle for enqueuing work.
231  * @cpu: the CPU in question.
232  *
233  * Return: 1 if the CPU is currently idle. 0 otherwise.
234  */
available_idle_cpu(int cpu)235 int available_idle_cpu(int cpu)
236 {
237 	if (!idle_cpu(cpu))
238 		return 0;
239 
240 	if (vcpu_is_preempted(cpu))
241 		return 0;
242 
243 	return 1;
244 }
245 EXPORT_SYMBOL_GPL(available_idle_cpu);
246 
247 /**
248  * idle_task - return the idle task for a given CPU.
249  * @cpu: the processor in question.
250  *
251  * Return: The idle task for the CPU @cpu.
252  */
idle_task(int cpu)253 struct task_struct *idle_task(int cpu)
254 {
255 	return cpu_rq(cpu)->idle;
256 }
257 
258 #ifdef CONFIG_SCHED_CORE
sched_core_idle_cpu(int cpu)259 int sched_core_idle_cpu(int cpu)
260 {
261 	struct rq *rq = cpu_rq(cpu);
262 
263 	if (sched_core_enabled(rq) && rq->curr == rq->idle)
264 		return 1;
265 
266 	return idle_cpu(cpu);
267 }
268 
269 #endif
270 
271 /**
272  * find_process_by_pid - find a process with a matching PID value.
273  * @pid: the pid in question.
274  *
275  * The task of @pid, if found. %NULL otherwise.
276  */
find_process_by_pid(pid_t pid)277 static struct task_struct *find_process_by_pid(pid_t pid)
278 {
279 	return pid ? find_task_by_vpid(pid) : current;
280 }
281 
find_get_task(pid_t pid)282 static struct task_struct *find_get_task(pid_t pid)
283 {
284 	struct task_struct *p;
285 	guard(rcu)();
286 
287 	p = find_process_by_pid(pid);
288 	if (likely(p))
289 		get_task_struct(p);
290 
291 	return p;
292 }
293 
DEFINE_CLASS(find_get_task,struct task_struct *,if (_T)put_task_struct (_T),find_get_task (pid),pid_t pid)294 DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T),
295 	     find_get_task(pid), pid_t pid)
296 
297 /*
298  * sched_setparam() passes in -1 for its policy, to let the functions
299  * it calls know not to change it.
300  */
301 #define SETPARAM_POLICY	-1
302 
303 static void __setscheduler_params(struct task_struct *p,
304 		const struct sched_attr *attr)
305 {
306 	int policy = attr->sched_policy;
307 
308 	if (policy == SETPARAM_POLICY)
309 		policy = p->policy;
310 
311 	p->policy = policy;
312 
313 	if (dl_policy(policy)) {
314 		__setparam_dl(p, attr);
315 	} else if (fair_policy(policy)) {
316 		p->static_prio = NICE_TO_PRIO(attr->sched_nice);
317 		if (attr->sched_runtime) {
318 			p->se.custom_slice = 1;
319 			p->se.slice = clamp_t(u64, attr->sched_runtime,
320 					      NSEC_PER_MSEC/10,   /* HZ=1000 * 10 */
321 					      NSEC_PER_MSEC*100); /* HZ=100  / 10 */
322 		} else {
323 			p->se.custom_slice = 0;
324 			p->se.slice = sysctl_sched_base_slice;
325 		}
326 	}
327 
328 	/* rt-policy tasks do not have a timerslack */
329 	if (rt_or_dl_task_policy(p)) {
330 		p->timer_slack_ns = 0;
331 	} else if (p->timer_slack_ns == 0) {
332 		/* when switching back to non-rt policy, restore timerslack */
333 		p->timer_slack_ns = p->default_timer_slack_ns;
334 	}
335 
336 	/*
337 	 * __sched_setscheduler() ensures attr->sched_priority == 0 when
338 	 * !rt_policy. Always setting this ensures that things like
339 	 * getparam()/getattr() don't report silly values for !rt tasks.
340 	 */
341 	p->rt_priority = attr->sched_priority;
342 	p->normal_prio = normal_prio(p);
343 	set_load_weight(p, true);
344 }
345 
346 /*
347  * Check the target process has a UID that matches the current process's:
348  */
check_same_owner(struct task_struct * p)349 static bool check_same_owner(struct task_struct *p)
350 {
351 	const struct cred *cred = current_cred(), *pcred;
352 	guard(rcu)();
353 
354 	pcred = __task_cred(p);
355 	return (uid_eq(cred->euid, pcred->euid) ||
356 		uid_eq(cred->euid, pcred->uid));
357 }
358 
359 #ifdef CONFIG_UCLAMP_TASK
360 
uclamp_validate(struct task_struct * p,const struct sched_attr * attr)361 static int uclamp_validate(struct task_struct *p,
362 			   const struct sched_attr *attr)
363 {
364 	int util_min = p->uclamp_req[UCLAMP_MIN].value;
365 	int util_max = p->uclamp_req[UCLAMP_MAX].value;
366 	bool done = false;
367 	int ret = 0;
368 
369 	trace_android_vh_uclamp_validate(p, attr, &ret, &done);
370 	if (done)
371 		return ret;
372 
373 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
374 		util_min = attr->sched_util_min;
375 
376 		if (util_min + 1 > SCHED_CAPACITY_SCALE + 1)
377 			return -EINVAL;
378 	}
379 
380 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
381 		util_max = attr->sched_util_max;
382 
383 		if (util_max + 1 > SCHED_CAPACITY_SCALE + 1)
384 			return -EINVAL;
385 	}
386 
387 	if (util_min != -1 && util_max != -1 && util_min > util_max)
388 		return -EINVAL;
389 
390 	/*
391 	 * We have valid uclamp attributes; make sure uclamp is enabled.
392 	 *
393 	 * We need to do that here, because enabling static branches is a
394 	 * blocking operation which obviously cannot be done while holding
395 	 * scheduler locks.
396 	 */
397 	if (!uclamp_is_used())
398 		static_branch_enable(&sched_uclamp_used);
399 
400 	return 0;
401 }
402 
uclamp_reset(const struct sched_attr * attr,enum uclamp_id clamp_id,struct uclamp_se * uc_se)403 static bool uclamp_reset(const struct sched_attr *attr,
404 			 enum uclamp_id clamp_id,
405 			 struct uclamp_se *uc_se)
406 {
407 	/* Reset on sched class change for a non user-defined clamp value. */
408 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)) &&
409 	    !uc_se->user_defined)
410 		return true;
411 
412 	/* Reset on sched_util_{min,max} == -1. */
413 	if (clamp_id == UCLAMP_MIN &&
414 	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
415 	    attr->sched_util_min == -1) {
416 		return true;
417 	}
418 
419 	if (clamp_id == UCLAMP_MAX &&
420 	    attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
421 	    attr->sched_util_max == -1) {
422 		return true;
423 	}
424 
425 	return false;
426 }
427 
__setscheduler_uclamp(struct task_struct * p,const struct sched_attr * attr)428 static void __setscheduler_uclamp(struct task_struct *p,
429 				  const struct sched_attr *attr)
430 {
431 	enum uclamp_id clamp_id;
432 
433 	for_each_clamp_id(clamp_id) {
434 		struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
435 		unsigned int value;
436 
437 		if (!uclamp_reset(attr, clamp_id, uc_se))
438 			continue;
439 
440 		/*
441 		 * RT by default have a 100% boost value that could be modified
442 		 * at runtime.
443 		 */
444 		if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN))
445 			value = sysctl_sched_uclamp_util_min_rt_default;
446 		else
447 			value = uclamp_none(clamp_id);
448 
449 		uclamp_se_set(uc_se, value, false);
450 
451 	}
452 
453 	if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)))
454 		return;
455 
456 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN &&
457 	    attr->sched_util_min != -1) {
458 		uclamp_se_set(&p->uclamp_req[UCLAMP_MIN],
459 			      attr->sched_util_min, true);
460 		trace_android_vh_setscheduler_uclamp(p, UCLAMP_MIN, attr->sched_util_min);
461 	}
462 
463 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX &&
464 	    attr->sched_util_max != -1) {
465 		uclamp_se_set(&p->uclamp_req[UCLAMP_MAX],
466 			      attr->sched_util_max, true);
467 		trace_android_vh_setscheduler_uclamp(p, UCLAMP_MAX, attr->sched_util_max);
468 	}
469 }
470 
471 #else /* !CONFIG_UCLAMP_TASK: */
472 
uclamp_validate(struct task_struct * p,const struct sched_attr * attr)473 static inline int uclamp_validate(struct task_struct *p,
474 				  const struct sched_attr *attr)
475 {
476 	return -EOPNOTSUPP;
477 }
__setscheduler_uclamp(struct task_struct * p,const struct sched_attr * attr)478 static void __setscheduler_uclamp(struct task_struct *p,
479 				  const struct sched_attr *attr) { }
480 #endif
481 
482 /*
483  * Allow unprivileged RT tasks to decrease priority.
484  * Only issue a capable test if needed and only once to avoid an audit
485  * event on permitted non-privileged operations:
486  */
user_check_sched_setscheduler(struct task_struct * p,const struct sched_attr * attr,int policy,int reset_on_fork)487 static int user_check_sched_setscheduler(struct task_struct *p,
488 					 const struct sched_attr *attr,
489 					 int policy, int reset_on_fork)
490 {
491 	if (fair_policy(policy)) {
492 		if (attr->sched_nice < task_nice(p) &&
493 		    !is_nice_reduction(p, attr->sched_nice))
494 			goto req_priv;
495 	}
496 
497 	if (rt_policy(policy)) {
498 		unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
499 
500 		/* Can't set/change the rt policy: */
501 		if (policy != p->policy && !rlim_rtprio)
502 			goto req_priv;
503 
504 		/* Can't increase priority: */
505 		if (attr->sched_priority > p->rt_priority &&
506 		    attr->sched_priority > rlim_rtprio)
507 			goto req_priv;
508 	}
509 
510 	/*
511 	 * Can't set/change SCHED_DEADLINE policy at all for now
512 	 * (safest behavior); in the future we would like to allow
513 	 * unprivileged DL tasks to increase their relative deadline
514 	 * or reduce their runtime (both ways reducing utilization)
515 	 */
516 	if (dl_policy(policy))
517 		goto req_priv;
518 
519 	/*
520 	 * Treat SCHED_IDLE as nice 20. Only allow a switch to
521 	 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
522 	 */
523 	if (task_has_idle_policy(p) && !idle_policy(policy)) {
524 		if (!is_nice_reduction(p, task_nice(p)))
525 			goto req_priv;
526 	}
527 
528 	/* Can't change other user's priorities: */
529 	if (!check_same_owner(p))
530 		goto req_priv;
531 
532 	/* Normal users shall not reset the sched_reset_on_fork flag: */
533 	if (p->sched_reset_on_fork && !reset_on_fork)
534 		goto req_priv;
535 
536 	if (!capable(CAP_SYS_NICE)) {
537 		/* Can't change util-clamps */
538 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
539 			return -EPERM;
540 	}
541 
542 	return 0;
543 
544 req_priv:
545 	if (!capable(CAP_SYS_NICE))
546 		return -EPERM;
547 
548 	return 0;
549 }
550 
__sched_setscheduler(struct task_struct * p,const struct sched_attr * attr,bool user,bool pi)551 int __sched_setscheduler(struct task_struct *p,
552 			 const struct sched_attr *attr,
553 			 bool user, bool pi)
554 {
555 	int oldpolicy = -1, policy = attr->sched_policy;
556 	int retval, oldprio, newprio, queued, running;
557 	const struct sched_class *prev_class, *next_class;
558 	struct balance_callback *head;
559 	struct rq_flags rf;
560 	int reset_on_fork;
561 	int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
562 	struct rq *rq;
563 	bool cpuset_locked = false;
564 
565 	/* The pi code expects interrupts enabled */
566 	BUG_ON(pi && in_interrupt());
567 recheck:
568 	/* Double check policy once rq lock held: */
569 	if (policy < 0) {
570 		reset_on_fork = p->sched_reset_on_fork;
571 		policy = oldpolicy = p->policy;
572 	} else {
573 		reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
574 
575 		if (!valid_policy(policy))
576 			return -EINVAL;
577 	}
578 
579 	if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV))
580 		return -EINVAL;
581 
582 	/*
583 	 * Valid priorities for SCHED_FIFO and SCHED_RR are
584 	 * 1..MAX_RT_PRIO-1, valid priority for SCHED_NORMAL,
585 	 * SCHED_BATCH and SCHED_IDLE is 0.
586 	 */
587 	if (attr->sched_priority > MAX_RT_PRIO-1)
588 		return -EINVAL;
589 	if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
590 	    (rt_policy(policy) != (attr->sched_priority != 0)))
591 		return -EINVAL;
592 
593 	if (user) {
594 		retval = user_check_sched_setscheduler(p, attr, policy, reset_on_fork);
595 		if (retval)
596 			return retval;
597 
598 		if (attr->sched_flags & SCHED_FLAG_SUGOV)
599 			return -EINVAL;
600 
601 		retval = security_task_setscheduler(p);
602 		if (retval)
603 			return retval;
604 	}
605 
606 	/* Update task specific "requested" clamps */
607 	if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
608 		retval = uclamp_validate(p, attr);
609 		if (retval)
610 			return retval;
611 	}
612 
613 	/*
614 	 * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
615 	 * information.
616 	 */
617 	if (dl_policy(policy) || dl_policy(p->policy)) {
618 		cpuset_locked = true;
619 		cpuset_lock();
620 	}
621 
622 	/*
623 	 * Make sure no PI-waiters arrive (or leave) while we are
624 	 * changing the priority of the task:
625 	 *
626 	 * To be able to change p->policy safely, the appropriate
627 	 * runqueue lock must be held.
628 	 */
629 	rq = task_rq_lock(p, &rf);
630 	update_rq_clock(rq);
631 
632 	/*
633 	 * Changing the policy of the stop threads its a very bad idea:
634 	 */
635 	if (p == rq->stop) {
636 		retval = -EINVAL;
637 		goto unlock;
638 	}
639 
640 	retval = scx_check_setscheduler(p, policy);
641 	if (retval)
642 		goto unlock;
643 
644 	/*
645 	 * If not changing anything there's no need to proceed further,
646 	 * but store a possible modification of reset_on_fork.
647 	 */
648 	if (unlikely(policy == p->policy)) {
649 		if (fair_policy(policy) &&
650 		    (attr->sched_nice != task_nice(p) ||
651 		     (attr->sched_runtime != p->se.slice)))
652 			goto change;
653 		if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
654 			goto change;
655 		if (dl_policy(policy) && dl_param_changed(p, attr))
656 			goto change;
657 		if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP)
658 			goto change;
659 
660 		p->sched_reset_on_fork = reset_on_fork;
661 		retval = 0;
662 		goto unlock;
663 	}
664 change:
665 
666 	if (user) {
667 #ifdef CONFIG_RT_GROUP_SCHED
668 		/*
669 		 * Do not allow real-time tasks into groups that have no runtime
670 		 * assigned.
671 		 */
672 		if (rt_bandwidth_enabled() && rt_policy(policy) &&
673 				task_group(p)->rt_bandwidth.rt_runtime == 0 &&
674 				!task_group_is_autogroup(task_group(p))) {
675 			retval = -EPERM;
676 			goto unlock;
677 		}
678 #endif
679 #ifdef CONFIG_SMP
680 		if (dl_bandwidth_enabled() && dl_policy(policy) &&
681 				!(attr->sched_flags & SCHED_FLAG_SUGOV)) {
682 			cpumask_t *span = rq->rd->span;
683 
684 			/*
685 			 * Don't allow tasks with an affinity mask smaller than
686 			 * the entire root_domain to become SCHED_DEADLINE. We
687 			 * will also fail if there's no bandwidth available.
688 			 */
689 			if (!cpumask_subset(span, p->cpus_ptr) ||
690 			    rq->rd->dl_bw.bw == 0) {
691 				retval = -EPERM;
692 				goto unlock;
693 			}
694 		}
695 #endif
696 	}
697 
698 	/* Re-check policy now with rq lock held: */
699 	if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
700 		policy = oldpolicy = -1;
701 		task_rq_unlock(rq, p, &rf);
702 		if (cpuset_locked)
703 			cpuset_unlock();
704 		goto recheck;
705 	}
706 
707 	/*
708 	 * If setscheduling to SCHED_DEADLINE (or changing the parameters
709 	 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
710 	 * is available.
711 	 */
712 	if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
713 		retval = -EBUSY;
714 		goto unlock;
715 	}
716 
717 	p->sched_reset_on_fork = reset_on_fork;
718 	oldprio = p->prio;
719 
720 	newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
721 	if (pi) {
722 		/*
723 		 * Take priority boosted tasks into account. If the new
724 		 * effective priority is unchanged, we just store the new
725 		 * normal parameters and do not touch the scheduler class and
726 		 * the runqueue. This will be done when the task deboost
727 		 * itself.
728 		 */
729 		newprio = rt_effective_prio(p, newprio);
730 		if (newprio == oldprio)
731 			queue_flags &= ~DEQUEUE_MOVE;
732 	}
733 
734 	prev_class = p->sched_class;
735 	next_class = __setscheduler_class(policy, newprio);
736 
737 	if (prev_class != next_class && p->se.sched_delayed)
738 		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
739 
740 	queued = task_on_rq_queued(p);
741 	running = task_current_donor(rq, p);
742 	if (queued)
743 		dequeue_task(rq, p, queue_flags);
744 	if (running)
745 		put_prev_task(rq, p);
746 
747 	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
748 		__setscheduler_params(p, attr);
749 		p->sched_class = next_class;
750 		p->prio = newprio;
751 		trace_android_rvh_setscheduler(p);
752 	}
753 	__setscheduler_uclamp(p, attr);
754 	check_class_changing(rq, p, prev_class);
755 
756 	if (queued) {
757 		/*
758 		 * We enqueue to tail when the priority of a task is
759 		 * increased (user space view).
760 		 */
761 		if (oldprio < p->prio)
762 			queue_flags |= ENQUEUE_HEAD;
763 
764 		enqueue_task(rq, p, queue_flags);
765 	}
766 	if (running)
767 		set_next_task(rq, p);
768 
769 	check_class_changed(rq, p, prev_class, oldprio);
770 
771 	/* Avoid rq from going away on us: */
772 	preempt_disable();
773 	head = splice_balance_callbacks(rq);
774 	task_rq_unlock(rq, p, &rf);
775 
776 	if (pi) {
777 		if (cpuset_locked)
778 			cpuset_unlock();
779 		rt_mutex_adjust_pi(p);
780 	}
781 
782 	/* Run balance callbacks after we've adjusted the PI chain: */
783 	balance_callbacks(rq, head);
784 	preempt_enable();
785 
786 	return 0;
787 
788 unlock:
789 	task_rq_unlock(rq, p, &rf);
790 	if (cpuset_locked)
791 		cpuset_unlock();
792 	return retval;
793 }
794 
_sched_setscheduler(struct task_struct * p,int policy,const struct sched_param * param,bool check)795 static int _sched_setscheduler(struct task_struct *p, int policy,
796 			       const struct sched_param *param, bool check)
797 {
798 	struct sched_attr attr = {
799 		.sched_policy   = policy,
800 		.sched_priority = param->sched_priority,
801 		.sched_nice	= PRIO_TO_NICE(p->static_prio),
802 	};
803 
804 	if (p->se.custom_slice)
805 		attr.sched_runtime = p->se.slice;
806 
807 	/* Fixup the legacy SCHED_RESET_ON_FORK hack. */
808 	if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
809 		attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
810 		policy &= ~SCHED_RESET_ON_FORK;
811 		attr.sched_policy = policy;
812 	}
813 
814 	return __sched_setscheduler(p, &attr, check, true);
815 }
816 /**
817  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
818  * @p: the task in question.
819  * @policy: new policy.
820  * @param: structure containing the new RT priority.
821  *
822  * Use sched_set_fifo(), read its comment.
823  *
824  * Return: 0 on success. An error code otherwise.
825  *
826  * NOTE that the task may be already dead.
827  */
sched_setscheduler(struct task_struct * p,int policy,const struct sched_param * param)828 int sched_setscheduler(struct task_struct *p, int policy,
829 		       const struct sched_param *param)
830 {
831 	return _sched_setscheduler(p, policy, param, true);
832 }
833 EXPORT_SYMBOL_GPL(sched_setscheduler);
834 
sched_setattr(struct task_struct * p,const struct sched_attr * attr)835 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
836 {
837 	return __sched_setscheduler(p, attr, true, true);
838 }
839 EXPORT_SYMBOL_GPL(sched_setattr);
840 
sched_setattr_nocheck(struct task_struct * p,const struct sched_attr * attr)841 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
842 {
843 	return __sched_setscheduler(p, attr, false, true);
844 }
845 EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
846 
847 /**
848  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernel-space.
849  * @p: the task in question.
850  * @policy: new policy.
851  * @param: structure containing the new RT priority.
852  *
853  * Just like sched_setscheduler, only don't bother checking if the
854  * current context has permission.  For example, this is needed in
855  * stop_machine(): we create temporary high priority worker threads,
856  * but our caller might not have that capability.
857  *
858  * Return: 0 on success. An error code otherwise.
859  */
sched_setscheduler_nocheck(struct task_struct * p,int policy,const struct sched_param * param)860 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
861 			       const struct sched_param *param)
862 {
863 	return _sched_setscheduler(p, policy, param, false);
864 }
865 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
866 
867 /*
868  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
869  * incapable of resource management, which is the one thing an OS really should
870  * be doing.
871  *
872  * This is of course the reason it is limited to privileged users only.
873  *
874  * Worse still; it is fundamentally impossible to compose static priority
875  * workloads. You cannot take two correctly working static prio workloads
876  * and smash them together and still expect them to work.
877  *
878  * For this reason 'all' FIFO tasks the kernel creates are basically at:
879  *
880  *   MAX_RT_PRIO / 2
881  *
882  * The administrator _MUST_ configure the system, the kernel simply doesn't
883  * know enough information to make a sensible choice.
884  */
sched_set_fifo(struct task_struct * p)885 void sched_set_fifo(struct task_struct *p)
886 {
887 	struct sched_param sp = { .sched_priority = MAX_RT_PRIO / 2 };
888 	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
889 }
890 EXPORT_SYMBOL_GPL(sched_set_fifo);
891 
892 /*
893  * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
894  */
sched_set_fifo_low(struct task_struct * p)895 void sched_set_fifo_low(struct task_struct *p)
896 {
897 	struct sched_param sp = { .sched_priority = 1 };
898 	WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
899 }
900 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
901 
sched_set_normal(struct task_struct * p,int nice)902 void sched_set_normal(struct task_struct *p, int nice)
903 {
904 	struct sched_attr attr = {
905 		.sched_policy = SCHED_NORMAL,
906 		.sched_nice = nice,
907 	};
908 	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
909 }
910 EXPORT_SYMBOL_GPL(sched_set_normal);
911 
912 static int
do_sched_setscheduler(pid_t pid,int policy,struct sched_param __user * param)913 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
914 {
915 	struct sched_param lparam;
916 
917 	if (!param || pid < 0)
918 		return -EINVAL;
919 	if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
920 		return -EFAULT;
921 
922 	CLASS(find_get_task, p)(pid);
923 	if (!p)
924 		return -ESRCH;
925 
926 	return sched_setscheduler(p, policy, &lparam);
927 }
928 
929 /*
930  * Mimics kernel/events/core.c perf_copy_attr().
931  */
sched_copy_attr(struct sched_attr __user * uattr,struct sched_attr * attr)932 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
933 {
934 	u32 size;
935 	int ret;
936 
937 	/* Zero the full structure, so that a short copy will be nice: */
938 	memset(attr, 0, sizeof(*attr));
939 
940 	ret = get_user(size, &uattr->size);
941 	if (ret)
942 		return ret;
943 
944 	/* ABI compatibility quirk: */
945 	if (!size)
946 		size = SCHED_ATTR_SIZE_VER0;
947 	if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE)
948 		goto err_size;
949 
950 	ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
951 	if (ret) {
952 		if (ret == -E2BIG)
953 			goto err_size;
954 		return ret;
955 	}
956 
957 	if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
958 	    size < SCHED_ATTR_SIZE_VER1)
959 		return -EINVAL;
960 
961 	/*
962 	 * XXX: Do we want to be lenient like existing syscalls; or do we want
963 	 * to be strict and return an error on out-of-bounds values?
964 	 */
965 	attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
966 
967 	return 0;
968 
969 err_size:
970 	put_user(sizeof(*attr), &uattr->size);
971 	return -E2BIG;
972 }
973 
get_params(struct task_struct * p,struct sched_attr * attr)974 static void get_params(struct task_struct *p, struct sched_attr *attr)
975 {
976 	if (task_has_dl_policy(p)) {
977 		__getparam_dl(p, attr);
978 	} else if (task_has_rt_policy(p)) {
979 		attr->sched_priority = p->rt_priority;
980 	} else {
981 		attr->sched_nice = task_nice(p);
982 		attr->sched_runtime = p->se.slice;
983 	}
984 }
985 
986 /**
987  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
988  * @pid: the pid in question.
989  * @policy: new policy.
990  * @param: structure containing the new RT priority.
991  *
992  * Return: 0 on success. An error code otherwise.
993  */
SYSCALL_DEFINE3(sched_setscheduler,pid_t,pid,int,policy,struct sched_param __user *,param)994 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
995 {
996 	if (policy < 0)
997 		return -EINVAL;
998 
999 	return do_sched_setscheduler(pid, policy, param);
1000 }
1001 
1002 /**
1003  * sys_sched_setparam - set/change the RT priority of a thread
1004  * @pid: the pid in question.
1005  * @param: structure containing the new RT priority.
1006  *
1007  * Return: 0 on success. An error code otherwise.
1008  */
SYSCALL_DEFINE2(sched_setparam,pid_t,pid,struct sched_param __user *,param)1009 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
1010 {
1011 	return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
1012 }
1013 
1014 /**
1015  * sys_sched_setattr - same as above, but with extended sched_attr
1016  * @pid: the pid in question.
1017  * @uattr: structure containing the extended parameters.
1018  * @flags: for future extension.
1019  */
SYSCALL_DEFINE3(sched_setattr,pid_t,pid,struct sched_attr __user *,uattr,unsigned int,flags)1020 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
1021 			       unsigned int, flags)
1022 {
1023 	struct sched_attr attr;
1024 	int retval;
1025 
1026 	if (!uattr || pid < 0 || flags)
1027 		return -EINVAL;
1028 
1029 	retval = sched_copy_attr(uattr, &attr);
1030 	if (retval)
1031 		return retval;
1032 
1033 	if ((int)attr.sched_policy < 0)
1034 		return -EINVAL;
1035 	if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY)
1036 		attr.sched_policy = SETPARAM_POLICY;
1037 
1038 	CLASS(find_get_task, p)(pid);
1039 	if (!p)
1040 		return -ESRCH;
1041 
1042 	if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
1043 		get_params(p, &attr);
1044 
1045 	return sched_setattr(p, &attr);
1046 }
1047 
1048 /**
1049  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
1050  * @pid: the pid in question.
1051  *
1052  * Return: On success, the policy of the thread. Otherwise, a negative error
1053  * code.
1054  */
SYSCALL_DEFINE1(sched_getscheduler,pid_t,pid)1055 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
1056 {
1057 	struct task_struct *p;
1058 	int retval;
1059 
1060 	if (pid < 0)
1061 		return -EINVAL;
1062 
1063 	guard(rcu)();
1064 	p = find_process_by_pid(pid);
1065 	if (!p)
1066 		return -ESRCH;
1067 
1068 	retval = security_task_getscheduler(p);
1069 	if (!retval) {
1070 		retval = p->policy;
1071 		if (p->sched_reset_on_fork)
1072 			retval |= SCHED_RESET_ON_FORK;
1073 	}
1074 	return retval;
1075 }
1076 
1077 /**
1078  * sys_sched_getparam - get the RT priority of a thread
1079  * @pid: the pid in question.
1080  * @param: structure containing the RT priority.
1081  *
1082  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
1083  * code.
1084  */
SYSCALL_DEFINE2(sched_getparam,pid_t,pid,struct sched_param __user *,param)1085 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
1086 {
1087 	struct sched_param lp = { .sched_priority = 0 };
1088 	struct task_struct *p;
1089 	int retval;
1090 
1091 	if (!param || pid < 0)
1092 		return -EINVAL;
1093 
1094 	scoped_guard (rcu) {
1095 		p = find_process_by_pid(pid);
1096 		if (!p)
1097 			return -ESRCH;
1098 
1099 		retval = security_task_getscheduler(p);
1100 		if (retval)
1101 			return retval;
1102 
1103 		if (task_has_rt_policy(p))
1104 			lp.sched_priority = p->rt_priority;
1105 	}
1106 
1107 	/*
1108 	 * This one might sleep, we cannot do it with a spinlock held ...
1109 	 */
1110 	return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
1111 }
1112 
1113 /*
1114  * Copy the kernel size attribute structure (which might be larger
1115  * than what user-space knows about) to user-space.
1116  *
1117  * Note that all cases are valid: user-space buffer can be larger or
1118  * smaller than the kernel-space buffer. The usual case is that both
1119  * have the same size.
1120  */
1121 static int
sched_attr_copy_to_user(struct sched_attr __user * uattr,struct sched_attr * kattr,unsigned int usize)1122 sched_attr_copy_to_user(struct sched_attr __user *uattr,
1123 			struct sched_attr *kattr,
1124 			unsigned int usize)
1125 {
1126 	unsigned int ksize = sizeof(*kattr);
1127 
1128 	if (!access_ok(uattr, usize))
1129 		return -EFAULT;
1130 
1131 	/*
1132 	 * sched_getattr() ABI forwards and backwards compatibility:
1133 	 *
1134 	 * If usize == ksize then we just copy everything to user-space and all is good.
1135 	 *
1136 	 * If usize < ksize then we only copy as much as user-space has space for,
1137 	 * this keeps ABI compatibility as well. We skip the rest.
1138 	 *
1139 	 * If usize > ksize then user-space is using a newer version of the ABI,
1140 	 * which part the kernel doesn't know about. Just ignore it - tooling can
1141 	 * detect the kernel's knowledge of attributes from the attr->size value
1142 	 * which is set to ksize in this case.
1143 	 */
1144 	kattr->size = min(usize, ksize);
1145 
1146 	if (copy_to_user(uattr, kattr, kattr->size))
1147 		return -EFAULT;
1148 
1149 	return 0;
1150 }
1151 
1152 /**
1153  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
1154  * @pid: the pid in question.
1155  * @uattr: structure containing the extended parameters.
1156  * @usize: sizeof(attr) for fwd/bwd comp.
1157  * @flags: for future extension.
1158  */
SYSCALL_DEFINE4(sched_getattr,pid_t,pid,struct sched_attr __user *,uattr,unsigned int,usize,unsigned int,flags)1159 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
1160 		unsigned int, usize, unsigned int, flags)
1161 {
1162 	struct sched_attr kattr = { };
1163 	struct task_struct *p;
1164 	int retval;
1165 
1166 	if (!uattr || pid < 0 || usize > PAGE_SIZE ||
1167 	    usize < SCHED_ATTR_SIZE_VER0 || flags)
1168 		return -EINVAL;
1169 
1170 	scoped_guard (rcu) {
1171 		p = find_process_by_pid(pid);
1172 		if (!p)
1173 			return -ESRCH;
1174 
1175 		retval = security_task_getscheduler(p);
1176 		if (retval)
1177 			return retval;
1178 
1179 		kattr.sched_policy = p->policy;
1180 		if (p->sched_reset_on_fork)
1181 			kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
1182 		get_params(p, &kattr);
1183 		kattr.sched_flags &= SCHED_FLAG_ALL;
1184 
1185 #ifdef CONFIG_UCLAMP_TASK
1186 		/*
1187 		 * This could race with another potential updater, but this is fine
1188 		 * because it'll correctly read the old or the new value. We don't need
1189 		 * to guarantee who wins the race as long as it doesn't return garbage.
1190 		 */
1191 		kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
1192 		kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
1193 #endif
1194 	}
1195 
1196 	return sched_attr_copy_to_user(uattr, &kattr, usize);
1197 }
1198 
1199 #ifdef CONFIG_SMP
dl_task_check_affinity(struct task_struct * p,const struct cpumask * mask)1200 int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
1201 {
1202 	/*
1203 	 * If the task isn't a deadline task or admission control is
1204 	 * disabled then we don't care about affinity changes.
1205 	 */
1206 	if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
1207 		return 0;
1208 
1209 	/*
1210 	 * Since bandwidth control happens on root_domain basis,
1211 	 * if admission test is enabled, we only admit -deadline
1212 	 * tasks allowed to run on all the CPUs in the task's
1213 	 * root_domain.
1214 	 */
1215 	guard(rcu)();
1216 	if (!cpumask_subset(task_rq(p)->rd->span, mask))
1217 		return -EBUSY;
1218 
1219 	return 0;
1220 }
1221 #endif /* CONFIG_SMP */
1222 
__sched_setaffinity(struct task_struct * p,struct affinity_context * ctx)1223 int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx)
1224 {
1225 	int retval;
1226 	cpumask_var_t cpus_allowed, new_mask;
1227 
1228 	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
1229 		return -ENOMEM;
1230 
1231 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
1232 		retval = -ENOMEM;
1233 		goto out_free_cpus_allowed;
1234 	}
1235 
1236 	cpuset_cpus_allowed(p, cpus_allowed);
1237 	cpumask_and(new_mask, ctx->new_mask, cpus_allowed);
1238 
1239 	ctx->new_mask = new_mask;
1240 	ctx->flags |= SCA_CHECK;
1241 
1242 	retval = dl_task_check_affinity(p, new_mask);
1243 	if (retval)
1244 		goto out_free_new_mask;
1245 
1246 	retval = __set_cpus_allowed_ptr(p, ctx);
1247 	if (retval)
1248 		goto out_free_new_mask;
1249 
1250 	cpuset_cpus_allowed(p, cpus_allowed);
1251 	if (!cpumask_subset(new_mask, cpus_allowed)) {
1252 		/*
1253 		 * We must have raced with a concurrent cpuset update.
1254 		 * Just reset the cpumask to the cpuset's cpus_allowed.
1255 		 */
1256 		cpumask_copy(new_mask, cpus_allowed);
1257 
1258 		/*
1259 		 * If SCA_USER is set, a 2nd call to __set_cpus_allowed_ptr()
1260 		 * will restore the previous user_cpus_ptr value.
1261 		 *
1262 		 * In the unlikely event a previous user_cpus_ptr exists,
1263 		 * we need to further restrict the mask to what is allowed
1264 		 * by that old user_cpus_ptr.
1265 		 */
1266 		if (unlikely((ctx->flags & SCA_USER) && ctx->user_mask)) {
1267 			bool empty = !cpumask_and(new_mask, new_mask,
1268 						  ctx->user_mask);
1269 
1270 			if (empty)
1271 				cpumask_copy(new_mask, cpus_allowed);
1272 		}
1273 		__set_cpus_allowed_ptr(p, ctx);
1274 		retval = -EINVAL;
1275 	}
1276 
1277 out_free_new_mask:
1278 	free_cpumask_var(new_mask);
1279 out_free_cpus_allowed:
1280 	free_cpumask_var(cpus_allowed);
1281 	return retval;
1282 }
1283 
sched_setaffinity(pid_t pid,const struct cpumask * in_mask)1284 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
1285 {
1286 	struct affinity_context ac;
1287 	struct cpumask *user_mask;
1288 	int retval = 0;
1289 	bool skip = false;
1290 
1291 	CLASS(find_get_task, p)(pid);
1292 	if (!p)
1293 		return -ESRCH;
1294 
1295 	if (p->flags & PF_NO_SETAFFINITY)
1296 		return -EINVAL;
1297 
1298 	if (!check_same_owner(p)) {
1299 		guard(rcu)();
1300 		if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE))
1301 			return -EPERM;
1302 	}
1303 
1304 	trace_android_vh_sched_setaffinity_early(p, in_mask, &skip);
1305 	if (skip)
1306 		return retval;
1307 	retval = security_task_setscheduler(p);
1308 	if (retval)
1309 		return retval;
1310 
1311 	/*
1312 	 * With non-SMP configs, user_cpus_ptr/user_mask isn't used and
1313 	 * alloc_user_cpus_ptr() returns NULL.
1314 	 */
1315 	user_mask = alloc_user_cpus_ptr(NUMA_NO_NODE);
1316 	if (user_mask) {
1317 		cpumask_copy(user_mask, in_mask);
1318 	} else if (IS_ENABLED(CONFIG_SMP)) {
1319 		return -ENOMEM;
1320 	}
1321 
1322 	ac = (struct affinity_context){
1323 		.new_mask  = in_mask,
1324 		.user_mask = user_mask,
1325 		.flags     = SCA_USER,
1326 	};
1327 
1328 	retval = __sched_setaffinity(p, &ac);
1329 	kfree(ac.user_mask);
1330 
1331 	return retval;
1332 }
1333 
get_user_cpu_mask(unsigned long __user * user_mask_ptr,unsigned len,struct cpumask * new_mask)1334 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
1335 			     struct cpumask *new_mask)
1336 {
1337 	if (len < cpumask_size())
1338 		cpumask_clear(new_mask);
1339 	else if (len > cpumask_size())
1340 		len = cpumask_size();
1341 
1342 	return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
1343 }
1344 
1345 /**
1346  * sys_sched_setaffinity - set the CPU affinity of a process
1347  * @pid: pid of the process
1348  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1349  * @user_mask_ptr: user-space pointer to the new CPU mask
1350  *
1351  * Return: 0 on success. An error code otherwise.
1352  */
SYSCALL_DEFINE3(sched_setaffinity,pid_t,pid,unsigned int,len,unsigned long __user *,user_mask_ptr)1353 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
1354 		unsigned long __user *, user_mask_ptr)
1355 {
1356 	cpumask_var_t new_mask;
1357 	int retval;
1358 
1359 	if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
1360 		return -ENOMEM;
1361 
1362 	retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
1363 	if (retval == 0)
1364 		retval = sched_setaffinity(pid, new_mask);
1365 	free_cpumask_var(new_mask);
1366 	return retval;
1367 }
1368 
sched_getaffinity(pid_t pid,struct cpumask * mask)1369 long sched_getaffinity(pid_t pid, struct cpumask *mask)
1370 {
1371 	struct task_struct *p;
1372 	int retval;
1373 
1374 	guard(rcu)();
1375 	p = find_process_by_pid(pid);
1376 	if (!p)
1377 		return -ESRCH;
1378 
1379 	retval = security_task_getscheduler(p);
1380 	if (retval)
1381 		return retval;
1382 
1383 	guard(raw_spinlock_irqsave)(&p->pi_lock);
1384 	cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
1385 
1386 	return 0;
1387 }
1388 
1389 /**
1390  * sys_sched_getaffinity - get the CPU affinity of a process
1391  * @pid: pid of the process
1392  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
1393  * @user_mask_ptr: user-space pointer to hold the current CPU mask
1394  *
1395  * Return: size of CPU mask copied to user_mask_ptr on success. An
1396  * error code otherwise.
1397  */
SYSCALL_DEFINE3(sched_getaffinity,pid_t,pid,unsigned int,len,unsigned long __user *,user_mask_ptr)1398 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
1399 		unsigned long __user *, user_mask_ptr)
1400 {
1401 	int ret;
1402 	cpumask_var_t mask;
1403 
1404 	if ((len * BITS_PER_BYTE) < nr_cpu_ids)
1405 		return -EINVAL;
1406 	if (len & (sizeof(unsigned long)-1))
1407 		return -EINVAL;
1408 
1409 	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
1410 		return -ENOMEM;
1411 
1412 	ret = sched_getaffinity(pid, mask);
1413 	if (ret == 0) {
1414 		unsigned int retlen = min(len, cpumask_size());
1415 
1416 		if (copy_to_user(user_mask_ptr, cpumask_bits(mask), retlen))
1417 			ret = -EFAULT;
1418 		else
1419 			ret = retlen;
1420 	}
1421 	free_cpumask_var(mask);
1422 
1423 	return ret;
1424 }
1425 
do_sched_yield(void)1426 static void do_sched_yield(void)
1427 {
1428 	struct rq_flags rf;
1429 	struct rq *rq;
1430 	long skip = 0;
1431 
1432 	trace_android_rvh_before_do_sched_yield(&skip);
1433 	if (skip)
1434 		return;
1435 
1436 	rq = this_rq_lock_irq(&rf);
1437 
1438 	schedstat_inc(rq->yld_count);
1439 	current->sched_class->yield_task(rq);
1440 
1441 	trace_android_rvh_do_sched_yield(rq);
1442 
1443 	preempt_disable();
1444 	rq_unlock_irq(rq, &rf);
1445 	sched_preempt_enable_no_resched();
1446 
1447 	schedule();
1448 }
1449 
1450 /**
1451  * sys_sched_yield - yield the current processor to other threads.
1452  *
1453  * This function yields the current CPU to other tasks. If there are no
1454  * other threads running on this CPU then this function will return.
1455  *
1456  * Return: 0.
1457  */
SYSCALL_DEFINE0(sched_yield)1458 SYSCALL_DEFINE0(sched_yield)
1459 {
1460 	do_sched_yield();
1461 	return 0;
1462 }
1463 
1464 /**
1465  * yield - yield the current processor to other threads.
1466  *
1467  * Do not ever use this function, there's a 99% chance you're doing it wrong.
1468  *
1469  * The scheduler is at all times free to pick the calling task as the most
1470  * eligible task to run, if removing the yield() call from your code breaks
1471  * it, it's already broken.
1472  *
1473  * Typical broken usage is:
1474  *
1475  * while (!event)
1476  *	yield();
1477  *
1478  * where one assumes that yield() will let 'the other' process run that will
1479  * make event true. If the current task is a SCHED_FIFO task that will never
1480  * happen. Never use yield() as a progress guarantee!!
1481  *
1482  * If you want to use yield() to wait for something, use wait_event().
1483  * If you want to use yield() to be 'nice' for others, use cond_resched().
1484  * If you still want to use yield(), do not!
1485  */
yield(void)1486 void __sched yield(void)
1487 {
1488 	set_current_state(TASK_RUNNING);
1489 	do_sched_yield();
1490 }
1491 EXPORT_SYMBOL(yield);
1492 
1493 /**
1494  * yield_to - yield the current processor to another thread in
1495  * your thread group, or accelerate that thread toward the
1496  * processor it's on.
1497  * @p: target task
1498  * @preempt: whether task preemption is allowed or not
1499  *
1500  * It's the caller's job to ensure that the target task struct
1501  * can't go away on us before we can do any checks.
1502  *
1503  * Return:
1504  *	true (>0) if we indeed boosted the target task.
1505  *	false (0) if we failed to boost the target.
1506  *	-ESRCH if there's no task to yield to.
1507  */
yield_to(struct task_struct * p,bool preempt)1508 int __sched yield_to(struct task_struct *p, bool preempt)
1509 {
1510 	struct task_struct *curr = current;
1511 	struct rq *rq, *p_rq;
1512 	int yielded = 0;
1513 
1514 	scoped_guard (raw_spinlock_irqsave, &p->pi_lock) {
1515 		rq = this_rq();
1516 
1517 again:
1518 		p_rq = task_rq(p);
1519 		/*
1520 		 * If we're the only runnable task on the rq and target rq also
1521 		 * has only one task, there's absolutely no point in yielding.
1522 		 */
1523 		if (rq->nr_running == 1 && p_rq->nr_running == 1)
1524 			return -ESRCH;
1525 
1526 		guard(double_rq_lock)(rq, p_rq);
1527 		if (task_rq(p) != p_rq)
1528 			goto again;
1529 
1530 		if (!curr->sched_class->yield_to_task)
1531 			return 0;
1532 
1533 		if (curr->sched_class != p->sched_class)
1534 			return 0;
1535 
1536 		if (task_on_cpu(p_rq, p) || !task_is_running(p))
1537 			return 0;
1538 
1539 		yielded = curr->sched_class->yield_to_task(rq, p);
1540 		if (yielded) {
1541 			schedstat_inc(rq->yld_count);
1542 			/*
1543 			 * Make p's CPU reschedule; pick_next_entity
1544 			 * takes care of fairness.
1545 			 */
1546 			if (preempt && rq != p_rq)
1547 				resched_curr(p_rq);
1548 		}
1549 	}
1550 
1551 	if (yielded)
1552 		schedule();
1553 
1554 	return yielded;
1555 }
1556 EXPORT_SYMBOL_GPL(yield_to);
1557 
1558 /**
1559  * sys_sched_get_priority_max - return maximum RT priority.
1560  * @policy: scheduling class.
1561  *
1562  * Return: On success, this syscall returns the maximum
1563  * rt_priority that can be used by a given scheduling class.
1564  * On failure, a negative error code is returned.
1565  */
SYSCALL_DEFINE1(sched_get_priority_max,int,policy)1566 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
1567 {
1568 	int ret = -EINVAL;
1569 
1570 	switch (policy) {
1571 	case SCHED_FIFO:
1572 	case SCHED_RR:
1573 		ret = MAX_RT_PRIO-1;
1574 		break;
1575 	case SCHED_DEADLINE:
1576 	case SCHED_NORMAL:
1577 	case SCHED_BATCH:
1578 	case SCHED_IDLE:
1579 	case SCHED_EXT:
1580 		ret = 0;
1581 		break;
1582 	}
1583 	return ret;
1584 }
1585 
1586 /**
1587  * sys_sched_get_priority_min - return minimum RT priority.
1588  * @policy: scheduling class.
1589  *
1590  * Return: On success, this syscall returns the minimum
1591  * rt_priority that can be used by a given scheduling class.
1592  * On failure, a negative error code is returned.
1593  */
SYSCALL_DEFINE1(sched_get_priority_min,int,policy)1594 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
1595 {
1596 	int ret = -EINVAL;
1597 
1598 	switch (policy) {
1599 	case SCHED_FIFO:
1600 	case SCHED_RR:
1601 		ret = 1;
1602 		break;
1603 	case SCHED_DEADLINE:
1604 	case SCHED_NORMAL:
1605 	case SCHED_BATCH:
1606 	case SCHED_IDLE:
1607 	case SCHED_EXT:
1608 		ret = 0;
1609 	}
1610 	return ret;
1611 }
1612 
sched_rr_get_interval(pid_t pid,struct timespec64 * t)1613 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
1614 {
1615 	unsigned int time_slice = 0;
1616 	int retval;
1617 
1618 	if (pid < 0)
1619 		return -EINVAL;
1620 
1621 	scoped_guard (rcu) {
1622 		struct task_struct *p = find_process_by_pid(pid);
1623 		if (!p)
1624 			return -ESRCH;
1625 
1626 		retval = security_task_getscheduler(p);
1627 		if (retval)
1628 			return retval;
1629 
1630 		scoped_guard (task_rq_lock, p) {
1631 			struct rq *rq = scope.rq;
1632 			if (p->sched_class->get_rr_interval)
1633 				time_slice = p->sched_class->get_rr_interval(rq, p);
1634 		}
1635 	}
1636 
1637 	jiffies_to_timespec64(time_slice, t);
1638 	return 0;
1639 }
1640 
1641 /**
1642  * sys_sched_rr_get_interval - return the default time-slice of a process.
1643  * @pid: pid of the process.
1644  * @interval: userspace pointer to the time-slice value.
1645  *
1646  * this syscall writes the default time-slice value of a given process
1647  * into the user-space timespec buffer. A value of '0' means infinity.
1648  *
1649  * Return: On success, 0 and the time-slice is in @interval. Otherwise,
1650  * an error code.
1651  */
SYSCALL_DEFINE2(sched_rr_get_interval,pid_t,pid,struct __kernel_timespec __user *,interval)1652 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
1653 		struct __kernel_timespec __user *, interval)
1654 {
1655 	struct timespec64 t;
1656 	int retval = sched_rr_get_interval(pid, &t);
1657 
1658 	if (retval == 0)
1659 		retval = put_timespec64(&t, interval);
1660 
1661 	return retval;
1662 }
1663 
1664 #ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32,pid_t,pid,struct old_timespec32 __user *,interval)1665 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
1666 		struct old_timespec32 __user *, interval)
1667 {
1668 	struct timespec64 t;
1669 	int retval = sched_rr_get_interval(pid, &t);
1670 
1671 	if (retval == 0)
1672 		retval = put_old_timespec32(&t, interval);
1673 	return retval;
1674 }
1675 #endif
1676