1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * kernel/sched/core.c
4 *
5 * Core kernel scheduler code and related syscalls
6 *
7 * Copyright (C) 1991-2002 Linus Torvalds
8 */
9 #define CREATE_TRACE_POINTS
10 #include <trace/events/sched.h>
11 #undef CREATE_TRACE_POINTS
12
13 #include "sched.h"
14
15 #include <linux/nospec.h>
16
17 #include <linux/kcov.h>
18 #include <linux/scs.h>
19 #include <linux/irq.h>
20 #include <linux/delay.h>
21
22 #include <asm/switch_to.h>
23 #include <asm/tlb.h>
24
25 #include "../workqueue_internal.h"
26 #include "../../io_uring/io-wq.h"
27 #include "../smpboot.h"
28
29 #include "pelt.h"
30 #include "smp.h"
31 #include "walt.h"
32 #include "rtg/rtg.h"
33
34 /*
35 * Export tracepoints that act as a bare tracehook (ie: have no trace event
36 * associated with them) to allow external modules to probe them.
37 */
38 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
39 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
40 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
41 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
42 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
43 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
44 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
45 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
46 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
47 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
48 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
49 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50 #ifdef CONFIG_SCHEDSTATS
51 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55 #endif
56
57 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58
59 #ifdef CONFIG_SCHED_DEBUG
60 /*
61 * Debugging: various feature bits
62 *
63 * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
64 * sysctl_sched_features, defined in sched.h, to allow constant propagation
65 * at compile time and compiler optimization based on the feature defaults.
66 */
67 #define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
68 const_debug unsigned int sysctl_sched_features =
69 #include "features.h"
70 0;
71 #undef SCHED_FEAT
72 #endif
73
74 /*
75 * Number of tasks to iterate in a single balance run.
76 * Limited because this is done with IRQs disabled.
77 */
78 const_debug unsigned int sysctl_sched_nr_migrate = 32;
79
80 /*
81 * period over which we measure -rt task CPU usage in us.
82 * default: 1s
83 */
84 unsigned int sysctl_sched_rt_period = 1000000;
85
86 __read_mostly int scheduler_running;
87
88 /*
89 * part of the period that we allow rt tasks to run in us.
90 * default: 0.95s
91 */
92 int sysctl_sched_rt_runtime = 950000;
93
94 /*
95 * Serialization rules
96 *
97 * Lock order
98 *
99 * p->pi_lock
100 * rq->lock
101 * hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
102 *
103 * rq1->lock
104 * rq2->lock where: rq1 < rq2
105 *
106 * Regular state
107 *
108 * Normal scheduling state is serialized by rq->lock. __schedule() takes the
109 * local CPU's rq->lock; it optionally removes the task from the runqueue and
110 * always looks at the local rq data structures to find the most eligible task
111 * to run next.
112 *
113 * Task enqueue is also under rq->lock, possibly taken from another CPU.
114 * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
115 * the local CPU to avoid bouncing the runqueue state around [ see
116 * ttwu_queue_wakelist() ]
117 *
118 * Task wakeup, specifically wakeups that involve migration, are horribly
119 * complicated to avoid having to take two rq->locks.
120 *
121 * Special state
122 *
123 * System-calls and anything external will use task_rq_lock() which acquires
124 * both p->pi_lock and rq->lock. As a consequence the state they change is
125 * stable while holding either lock
126 *
127 * - sched_setaffinity()/
128 * set_cpus_allowed_ptr(): p->cpus_ptr, p->nr_cpus_allowed
129 * - set_user_nice(): p->se.load, p->*prio
130 * - __sched_setscheduler(): p->sched_class, p->policy, p->*prio,
131 * p->se.load, p->rt_priority,
132 * p->dl.dl_{runtime, deadline, period, flags, bw, density}
133 * - sched_setnuma(): p->numa_preferred_nid
134 * - sched_move_task()/
135 * cpu_cgroup_fork(): p->sched_task_group
136 * - uclamp_update_active() p->uclamp*
137 *
138 * p->state <- TASK_*
139 *
140 * is changed locklessly using set_current_state(), __set_current_state() or
141 * set_special_state(), see their respective comments, or by
142 * try_to_wake_up(). This latter uses p->pi_lock to serialize against
143 * concurrent self.
144 *
145 * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
146 *
147 * is set by activate_task() and cleared by deactivate_task(), under
148 * rq->lock. Non-zero indicates the task is runnable, the special
149 * ON_RQ_MIGRATING state is used for migration without holding both
150 * rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
151 *
152 * p->on_cpu <- { 0, 1 }
153 *
154 * is set by prepare_task() and cleared by finish_task() such that it will be
155 * set before p is scheduled-in and cleared after p is scheduled-out, both
156 * under rq->lock. Non-zero indicates the task is running on its CPU.
157 *
158 * [ The astute reader will observe that it is possible for two tasks on one
159 * CPU to have ->on_cpu = 1 at the same time. ]
160 *
161 * task_cpu(p): is changed by set_task_cpu(), the rules are:
162 *
163 * - Don't call set_task_cpu() on a blocked task:
164 *
165 * We don't care what CPU we're not running on, this simplifies hotplug,
166 * the CPU assignment of blocked tasks isn't required to be valid.
167 *
168 * - for try_to_wake_up(), called under p->pi_lock:
169 *
170 * This allows try_to_wake_up() to only take one rq->lock, see its comment.
171 *
172 * - for migration called under rq->lock:
173 * [ see task_on_rq_migrating() in task_rq_lock() ]
174 *
175 * o move_queued_task()
176 * o detach_task()
177 *
178 * - for migration called under double_rq_lock()
179 *
180 * o __migrate_swap_task()
181 * o push_rt_task() / pull_rt_task()
182 * o push_dl_task() / pull_dl_task()
183 * o dl_task_offline_migration()
184 *
185 */
186
187 /*
188 * __task_rq_lock - lock the rq @p resides on.
189 */
190 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
191 __acquires(rq->lock)
192 {
193 struct rq *rq;
194
195 lockdep_assert_held(&p->pi_lock);
196
197 for (;;) {
198 rq = task_rq(p);
199 raw_spin_lock(&rq->lock);
200 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
201 rq_pin_lock(rq, rf);
202 return rq;
203 }
204 raw_spin_unlock(&rq->lock);
205
206 while (unlikely(task_on_rq_migrating(p))) {
207 cpu_relax();
208 }
209 }
210 }
211
212 /*
213 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
214 */
215 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
216 __acquires(p->pi_lock) __acquires(rq->lock)
217 {
218 struct rq *rq;
219
220 for (;;) {
221 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
222 rq = task_rq(p);
223 raw_spin_lock(&rq->lock);
224 /*
225 * move_queued_task() task_rq_lock()
226 *
227 * ACQUIRE (rq->lock)
228 * [S] ->on_rq = MIGRATING [L] rq = task_rq()
229 * WMB (__set_task_cpu()) ACQUIRE (rq->lock);
230 * [S] ->cpu = new_cpu [L] task_rq()
231 * [L] ->on_rq
232 * RELEASE (rq->lock)
233 *
234 * If we observe the old CPU in task_rq_lock(), the acquire of
235 * the old rq->lock will fully serialize against the stores.
236 *
237 * If we observe the new CPU in task_rq_lock(), the address
238 * dependency headed by '[L] rq = task_rq()' and the acquire
239 * will pair with the WMB to ensure we then also see migrating.
240 */
241 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
242 rq_pin_lock(rq, rf);
243 return rq;
244 }
245 raw_spin_unlock(&rq->lock);
246 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
247
248 while (unlikely(task_on_rq_migrating(p))) {
249 cpu_relax();
250 }
251 }
252 }
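/*
 * Editor's illustrative sketch (not part of the upstream file): a typical
 * caller pairs task_rq_lock() with task_rq_unlock() and may treat the
 * returned rq as pinned for the duration:
 *
 *	struct rq_flags rf;
 *	struct rq *rq;
 *
 *	rq = task_rq_lock(p, &rf);
 *	... inspect or modify p's scheduling state under both locks ...
 *	task_rq_unlock(rq, p, &rf);
 *
 * The retry loop above guarantees that, on return, @p is neither migrating
 * nor queued on a different rq than the one that was locked.
 */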
253
254 /*
255 * RQ-clock updating methods
256 */
257
258 static void update_rq_clock_task(struct rq *rq, s64 delta)
259 {
260 /*
261 * In theory, the compiler should just see 0 here, and optimize out the call
262 * to sched_rt_avg_update. But I don't trust it...
263 */
264 s64 __maybe_unused steal = 0, irq_delta = 0;
265
266 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
267 irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
268 /*
269 * Since irq_time is only updated on {soft,}irq_exit, we might run into
270 * this case when a previous update_rq_clock() happened inside a
271 * {soft,}irq region.
272 *
273 * When this happens, we stop ->clock_task and only update the
274 * prev_irq_time stamp to account for the part that fit, so that a next
275 * update will consume the rest. This ensures ->clock_task is
276 * monotonic.
277 *
278 * It does however cause some slight misattribution of {soft,}irq
279 * time; a more accurate solution would be to update the irq_time using
280 * the current rq->clock timestamp, except that would require using
281 * atomic ops.
282 */
283 if (irq_delta > delta) {
284 irq_delta = delta;
285 }
286
287 rq->prev_irq_time += irq_delta;
288 delta -= irq_delta;
289 #endif
290 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
291 if (static_key_false((&paravirt_steal_rq_enabled))) {
292 steal = paravirt_steal_clock(cpu_of(rq));
293 steal -= rq->prev_steal_time_rq;
294
295 if (unlikely(steal > delta)) {
296 steal = delta;
297 }
298
299 rq->prev_steal_time_rq += steal;
300 delta -= steal;
301 }
302 #endif
303
304 rq->clock_task += delta;
305
306 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
307 if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
308 update_irq_load_avg(rq, irq_delta + steal);
309 }
310 #endif
311 update_rq_clock_pelt(rq, delta);
312 }
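/*
 * Editor's worked example (made-up numbers): if the raw rq clock advanced
 * by delta = 1000us since the last update, of which 200us were spent in
 * hard/soft IRQ context and 100us were stolen by the hypervisor, then:
 *
 *	irq_delta = 200us, steal = 100us
 *	rq->clock_task += 1000us - 200us - 100us = 700us
 *
 * i.e. ->clock_task only accounts time during which the CPU was actually
 * available to run tasks, while ->clock (advanced in update_rq_clock())
 * keeps the full 1000us.
 */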
313
314 void update_rq_clock(struct rq *rq)
315 {
316 s64 delta;
317
318 lockdep_assert_held(&rq->lock);
319
320 if (rq->clock_update_flags & RQCF_ACT_SKIP) {
321 return;
322 }
323
324 #ifdef CONFIG_SCHED_DEBUG
325 if (sched_feat(WARN_DOUBLE_CLOCK)) {
326 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
327 }
328 rq->clock_update_flags |= RQCF_UPDATED;
329 #endif
330
331 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
332 if (delta < 0) {
333 return;
334 }
335 rq->clock += delta;
336 update_rq_clock_task(rq, delta);
337 }
338
339 static inline void rq_csd_init(struct rq *rq, struct __call_single_data *csd,
340 smp_call_func_t func)
341 {
342 csd->flags = 0;
343 csd->func = func;
344 csd->info = rq;
345 }
346
347 #ifdef CONFIG_SCHED_HRTICK
348 /*
349 * Use HR-timers to deliver accurate preemption points.
350 */
351
352 static void hrtick_clear(struct rq *rq)
353 {
354 if (hrtimer_active(&rq->hrtick_timer)) {
355 hrtimer_cancel(&rq->hrtick_timer);
356 }
357 }
358
359 /*
360 * High-resolution timer tick.
361 * Runs from hardirq context with interrupts disabled.
362 */
363 static enum hrtimer_restart hrtick(struct hrtimer *timer)
364 {
365 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
366 struct rq_flags rf;
367
368 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
369
370 rq_lock(rq, &rf);
371 update_rq_clock(rq);
372 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
373 rq_unlock(rq, &rf);
374
375 return HRTIMER_NORESTART;
376 }
377
378 #ifdef CONFIG_SMP
379
380 static void __hrtick_restart(struct rq *rq)
381 {
382 struct hrtimer *timer = &rq->hrtick_timer;
383 ktime_t time = rq->hrtick_time;
384
385 hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
386 }
387
388 /*
389 * called from hardirq (IPI) context
390 */
391 static void __hrtick_start(void *arg)
392 {
393 struct rq *rq = arg;
394 struct rq_flags rf;
395
396 rq_lock(rq, &rf);
397 __hrtick_restart(rq);
398 rq_unlock(rq, &rf);
399 }
400
401 /*
402 * Called to set the hrtick timer state.
403 *
404 * called with rq->lock held and irqs disabled
405 */
406 void hrtick_start(struct rq *rq, u64 delay)
407 {
408 struct hrtimer *timer = &rq->hrtick_timer;
409 s64 delta;
410
411 /*
412 * Don't schedule slices shorter than 10000ns, that just
413 * doesn't make sense and can cause timer DoS.
414 */
415 delta = max_t(s64, delay, 10000LL);
416 rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
417
418 if (rq == this_rq()) {
419 __hrtick_restart(rq);
420 } else {
421 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
422 }
423 }
424
425 #else
426 /*
427 * Called to set the hrtick timer state.
428 *
429 * called with rq->lock held and irqs disabled
430 */
431 void hrtick_start(struct rq *rq, u64 delay)
432 {
433 /*
434 * Don't schedule slices shorter than 10000ns, that just
435 * doesn't make sense. Rely on vruntime for fairness.
436 */
437 delay = max_t(u64, delay, 10000LL);
438 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
439 HRTIMER_MODE_REL_PINNED_HARD);
440 }
441
442 #endif /* CONFIG_SMP */
443
444 static void hrtick_rq_init(struct rq *rq)
445 {
446 #ifdef CONFIG_SMP
447 rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
448 #endif
449 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
450 rq->hrtick_timer.function = hrtick;
451 }
452 #else /* CONFIG_SCHED_HRTICK */
453 static inline void hrtick_clear(struct rq *rq)
454 {
455 }
456
457 static inline void hrtick_rq_init(struct rq *rq)
458 {
459 }
460 #endif /* CONFIG_SCHED_HRTICK */
461
462 /*
463 * cmpxchg based fetch_or, macro so it works for different integer types
464 */
465 #define fetch_or(ptr, mask) \
466 ( { \
467 typeof(ptr) _ptr = (ptr); \
468 typeof(mask) _mask = (mask); \
469 typeof(*_ptr) _old, _val = *_ptr; \
470 \
471 for ( ; ; ) { \
472 _old = cmpxchg(_ptr, _val, _val | _mask); \
473 if (_old == _val) \
474 break; \
475 _val = _old; \
476 } \
477 _old; \
478 })
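/*
 * Editor's usage sketch (assuming a thread_info flags word):
 *
 *	unsigned long old;
 *
 *	old = fetch_or(&ti->flags, _TIF_NEED_RESCHED);
 *	if (old & _TIF_POLLING_NRFLAG)
 *		... remote CPU is polling, no IPI needed ...
 *
 * fetch_or() returns the value *before* the OR, which is exactly what
 * set_nr_and_not_polling() below relies on.
 */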
479
480 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
481 /*
482 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
483 * this avoids any races wrt polling state changes and thereby avoids
484 * spurious IPIs.
485 */
486 static bool set_nr_and_not_polling(struct task_struct *p)
487 {
488 struct thread_info *ti = task_thread_info(p);
489 return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
490 }
491
492 /*
493 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
494 *
495 * If this returns true, then the idle task promises to call
496 * sched_ttwu_pending() and reschedule soon.
497 */
498 static bool set_nr_if_polling(struct task_struct *p)
499 {
500 struct thread_info *ti = task_thread_info(p);
501 typeof(ti->flags) old, val = READ_ONCE(ti->flags);
502
503 for (;;) {
504 if (!(val & _TIF_POLLING_NRFLAG)) {
505 return false;
506 }
507 if (val & _TIF_NEED_RESCHED) {
508 return true;
509 }
510 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
511 if (old == val) {
512 break;
513 }
514 val = old;
515 }
516 return true;
517 }
518
519 #else
520 static bool set_nr_and_not_polling(struct task_struct *p)
521 {
522 set_tsk_need_resched(p);
523 return true;
524 }
525
526 #ifdef CONFIG_SMP
527 static bool set_nr_if_polling(struct task_struct *p)
528 {
529 return false;
530 }
531 #endif
532 #endif
533
534 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
535 {
536 struct wake_q_node *node = &task->wake_q;
537
538 /*
539 * Atomically grab the task; if ->wake_q is already !nil it means
540 * it's already queued (either by us or someone else) and will get the
541 * wakeup due to that.
542 *
543 * In order to ensure that a pending wakeup will observe our pending
544 * state, even in the failed case, an explicit smp_mb() must be used.
545 */
546 smp_mb__before_atomic();
547 if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) {
548 return false;
549 }
550
551 /*
552 * The head is context local, there can be no concurrency.
553 */
554 *head->lastp = node;
555 head->lastp = &node->next;
556 return true;
557 }
558
559 /**
560 * wake_q_add() - queue a wakeup for 'later' waking.
561 * @head: the wake_q_head to add @task to
562 * @task: the task to queue for 'later' wakeup
563 *
564 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
565 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
566 * instantly.
567 *
568 * This function must be used as-if it were wake_up_process(); IOW the task
569 * must be ready to be woken at this location.
570 */
571 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
572 {
573 if (__wake_q_add(head, task)) {
574 get_task_struct(task);
575 }
576 }
577
578 /**
579 * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
580 * @head: the wake_q_head to add @task to
581 * @task: the task to queue for 'later' wakeup
582 *
583 * Queue a task for later wakeup, most likely by the wake_up_q() call in the
584 * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
585 * instantly.
586 *
587 * This function must be used as-if it were wake_up_process(); IOW the task
588 * must be ready to be woken at this location.
589 *
590 * This function is essentially a task-safe equivalent to wake_q_add(). Callers
591 * that already hold a reference to @task can call the 'safe' version and trust
592 * wake_q to do the right thing depending on whether or not the @task is already
593 * queued for wakeup.
594 */
595 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
596 {
597 if (!__wake_q_add(head, task)) {
598 put_task_struct(task);
599 }
600 }
601
602 void wake_up_q(struct wake_q_head *head)
603 {
604 struct wake_q_node *node = head->first;
605
606 while (node != WAKE_Q_TAIL) {
607 struct task_struct *task;
608
609 task = container_of(node, struct task_struct, wake_q);
610 BUG_ON(!task);
611 /* Task can safely be re-inserted now: */
612 node = node->next;
613 task->wake_q.next = NULL;
614
615 /*
616 * wake_up_process() executes a full barrier, which pairs with
617 * the queueing in wake_q_add() so as not to miss wakeups.
618 */
619 wake_up_process(task);
620 put_task_struct(task);
621 }
622 }
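/*
 * Editor's usage sketch: the usual pattern is to collect wakeups under a
 * lock and issue them only after dropping it, e.g. (some_lock, task1 and
 * task2 are placeholders):
 *
 *	DEFINE_WAKE_Q(wake_q);
 *
 *	spin_lock(&some_lock);
 *	wake_q_add(&wake_q, task1);
 *	wake_q_add(&wake_q, task2);
 *	spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * wake_q_add() takes a task reference that wake_up_q() drops, so the tasks
 * cannot disappear between the two calls.
 */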
623
624 /*
625 * resched_curr - mark rq's current task 'to be rescheduled now'.
626 *
627 * On UP this means the setting of the need_resched flag, on SMP it
628 * might also involve a cross-CPU call to trigger the scheduler on
629 * the target CPU.
630 */
631 void resched_curr(struct rq *rq)
632 {
633 struct task_struct *curr = rq->curr;
634 int cpu;
635
636 lockdep_assert_held(&rq->lock);
637
638 if (test_tsk_need_resched(curr)) {
639 return;
640 }
641
642 cpu = cpu_of(rq);
643 if (cpu == smp_processor_id()) {
644 set_tsk_need_resched(curr);
645 set_preempt_need_resched();
646 return;
647 }
648
649 if (set_nr_and_not_polling(curr)) {
650 smp_send_reschedule(cpu);
651 } else {
652 trace_sched_wake_idle_without_ipi(cpu);
653 }
654 }
655
656 void resched_cpu(int cpu)
657 {
658 struct rq *rq = cpu_rq(cpu);
659 unsigned long flags;
660
661 raw_spin_lock_irqsave(&rq->lock, flags);
662 if (cpu_online(cpu) || cpu == smp_processor_id()) {
663 resched_curr(rq);
664 }
665 raw_spin_unlock_irqrestore(&rq->lock, flags);
666 }
667
668 #ifdef CONFIG_SMP
669 #ifdef CONFIG_NO_HZ_COMMON
670 /*
671 * In the semi idle case, use the nearest busy CPU for migrating timers
672 * from an idle CPU. This is good for power-savings.
673 *
674 * We don't do a similar optimization for a completely idle system, as
675 * selecting an idle CPU will add more delays to the timers than intended
676 * (as that CPU's timer base may not be up to date wrt jiffies etc).
677 */
678 int get_nohz_timer_target(void)
679 {
680 int i, cpu = smp_processor_id(), default_cpu = -1;
681 struct sched_domain *sd;
682
683 if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
684 if (!idle_cpu(cpu)) {
685 return cpu;
686 }
687 default_cpu = cpu;
688 }
689
690 rcu_read_lock();
691 for_each_domain(cpu, sd)
692 {
693 for_each_cpu_and(i, sched_domain_span(sd),
694 housekeeping_cpumask(HK_FLAG_TIMER))
695 {
696 if (cpu == i) {
697 continue;
698 }
699
700 if (!idle_cpu(i)) {
701 cpu = i;
702 goto unlock;
703 }
704 }
705 }
706
707 if (default_cpu == -1) {
708 for_each_cpu_and(i, cpu_active_mask,
709 housekeeping_cpumask(HK_FLAG_TIMER))
710 {
711 if (cpu == i) {
712 continue;
713 }
714
715 if (!idle_cpu(i)) {
716 cpu = i;
717 goto unlock;
718 }
719 }
720
721 /* no active, not-idle, housekeeping CPU found. */
722 default_cpu = cpumask_any(cpu_active_mask);
723 if (unlikely(default_cpu >= nr_cpu_ids)) {
724 goto unlock;
725 }
726 }
727
728 cpu = default_cpu;
729 unlock:
730 rcu_read_unlock();
731 return cpu;
732 }
733
734 /*
735 * When add_timer_on() enqueues a timer into the timer wheel of an
736 * idle CPU then this timer might expire before the next timer event
737 * which is scheduled to wake up that CPU. In case of a completely
738 * idle system the next event might even be infinite time into the
739 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
740 * leaves the inner idle loop so the newly added timer is taken into
741 * account when the CPU goes back to idle and evaluates the timer
742 * wheel for the next timer event.
743 */
744 static void wake_up_idle_cpu(int cpu)
745 {
746 struct rq *rq = cpu_rq(cpu);
747
748 if (cpu == smp_processor_id()) {
749 return;
750 }
751
752 if (set_nr_and_not_polling(rq->idle)) {
753 smp_send_reschedule(cpu);
754 } else {
755 trace_sched_wake_idle_without_ipi(cpu);
756 }
757 }
758
759 static bool wake_up_full_nohz_cpu(int cpu)
760 {
761 /*
762 * We just need the target to call irq_exit() and re-evaluate
763 * the next tick. The nohz full kick at least implies that.
764 * If needed we can still optimize that later with an
765 * empty IRQ.
766 */
767 if (cpu_is_offline(cpu)) {
768 return true; /* Don't try to wake offline CPUs. */
769 }
770 if (tick_nohz_full_cpu(cpu)) {
771 if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) {
772 tick_nohz_full_kick_cpu(cpu);
773 }
774 return true;
775 }
776
777 return false;
778 }
779
780 /*
781 * Wake up the specified CPU. If the CPU is going offline, it is the
782 * caller's responsibility to deal with the lost wakeup, for example,
783 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
784 */
785 void wake_up_nohz_cpu(int cpu)
786 {
787 if (!wake_up_full_nohz_cpu(cpu)) {
788 wake_up_idle_cpu(cpu);
789 }
790 }
791
792 static void nohz_csd_func(void *info)
793 {
794 struct rq *rq = info;
795 int cpu = cpu_of(rq);
796 unsigned int flags;
797
798 /*
799 * Release the rq::nohz_csd.
800 */
801 flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
802 WARN_ON(!(flags & NOHZ_KICK_MASK));
803
804 rq->idle_balance = idle_cpu(cpu);
805 if (rq->idle_balance && !need_resched()) {
806 rq->nohz_idle_balance = flags;
807 raise_softirq_irqoff(SCHED_SOFTIRQ);
808 }
809 }
810
811 #endif /* CONFIG_NO_HZ_COMMON */
812
813 #ifdef CONFIG_NO_HZ_FULL
814 bool sched_can_stop_tick(struct rq *rq)
815 {
816 int fifo_nr_running;
817
818 /* Deadline tasks, even if single, need the tick */
819 if (rq->dl.dl_nr_running) {
820 return false;
821 }
822
823 /*
824 * If there is more than one RR task, we need the tick to effect the
825 * actual RR behaviour.
826 */
827 if (rq->rt.rr_nr_running) {
828 if (rq->rt.rr_nr_running == 1) {
829 return true;
830 } else {
831 return false;
832 }
833 }
834
835 /*
836 * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
837 * forced preemption between FIFO tasks.
838 */
839 fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
840 if (fifo_nr_running) {
841 return true;
842 }
843
844 /*
845 * If there are no DL, RR or FIFO tasks, there must only be CFS tasks left;
846 * if there's more than one we need the tick for involuntary
847 * preemption.
848 */
849 if (rq->nr_running > 1) {
850 return false;
851 }
852
853 return true;
854 }
855 #endif /* CONFIG_NO_HZ_FULL */
856 #endif /* CONFIG_SMP */
857
858 #if defined(CONFIG_RT_GROUP_SCHED) || \
859 (defined(CONFIG_FAIR_GROUP_SCHED) && \
860 (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
861 /*
862 * Iterate task_group tree rooted at *from, calling @down when first entering a
863 * node and @up when leaving it for the final time.
864 *
865 * Caller must hold rcu_lock or sufficient equivalent.
866 */
867 int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up,
868 void *data)
869 {
870 struct task_group *parent, *child;
871 int ret;
872
873 parent = from;
874
875 down:
876 ret = (*down)(parent, data);
877 if (ret) {
878 goto out;
879 }
880 list_for_each_entry_rcu(child, &parent->children, siblings)
881 {
882 parent = child;
883 goto down;
884
885 up:
886 continue;
887 }
888 ret = (*up)(parent, data);
889 if (ret || parent == from) {
890 goto out;
891 }
892
893 child = parent;
894 parent = parent->parent;
895 if (parent) {
896 goto up;
897 }
898 out:
899 return ret;
900 }
901
902 int tg_nop(struct task_group *tg, void *data)
903 {
904 return 0;
905 }
906 #endif
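/*
 * Editor's usage sketch: a caller that only needs a top-down pass can use
 * tg_nop() as the @up callback, e.g.:
 *
 *	rcu_read_lock();
 *	walk_tg_tree_from(&root_task_group, my_down_fn, tg_nop, data);
 *	rcu_read_unlock();
 *
 * where my_down_fn() is a hypothetical tg_visitor that returns 0 to keep
 * walking or non-zero to abort the walk.
 */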
907
908 static void set_load_weight(struct task_struct *p)
909 {
910 bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
911 int prio = p->static_prio - MAX_RT_PRIO;
912 struct load_weight *load = &p->se.load;
913
914 /*
915 * SCHED_IDLE tasks get minimal weight:
916 */
917 if (task_has_idle_policy(p)) {
918 load->weight = scale_load(WEIGHT_IDLEPRIO);
919 load->inv_weight = WMULT_IDLEPRIO;
920 return;
921 }
922
923 /*
924 * SCHED_OTHER tasks have to update their load when changing their
925 * weight
926 */
927 if (update_load && p->sched_class == &fair_sched_class) {
928 reweight_task(p, prio);
929 } else {
930 load->weight = scale_load(sched_prio_to_weight[prio]);
931 load->inv_weight = sched_prio_to_wmult[prio];
932 }
933 }
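/*
 * Editor's worked example: sched_prio_to_weight[] is scaled so that nice 0
 * maps to a weight of 1024 and each nice step changes the weight by about
 * 1.25x, e.g.:
 *
 *	nice -1 -> weight 1277
 *	nice  0 -> weight 1024
 *	nice +1 -> weight  820
 *
 * so a nice 0 task receives roughly 25% more CPU than a nice +1 task
 * competing on the same rq. SCHED_IDLE tasks bypass the table entirely via
 * WEIGHT_IDLEPRIO above.
 */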
934
935 #ifdef CONFIG_SCHED_LATENCY_NICE
936 static void set_latency_weight(struct task_struct *p)
937 {
938 p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
939 }
940
941 static void __setscheduler_latency(struct task_struct *p,
942 const struct sched_attr *attr)
943 {
944 if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
945 p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
946 set_latency_weight(p);
947 }
948 }
949
950 static int latency_nice_validate(struct task_struct *p, bool user,
951 const struct sched_attr *attr)
952 {
953 if (attr->sched_latency_nice > MAX_LATENCY_NICE) {
954 return -EINVAL;
955 }
956 if (attr->sched_latency_nice < MIN_LATENCY_NICE) {
957 return -EINVAL;
958 }
959 /* Use the same security checks as NICE */
960 if (user && attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
961 !capable(CAP_SYS_NICE)) {
962 return -EPERM;
963 }
964
965 return 0;
966 }
967 #else
968 static void __setscheduler_latency(struct task_struct *p,
969 const struct sched_attr *attr)
970 {
971 }
972
973 static inline int latency_nice_validate(struct task_struct *p, bool user,
974 const struct sched_attr *attr)
975 {
976 return -EOPNOTSUPP;
977 }
978 #endif
979
980 #ifdef CONFIG_UCLAMP_TASK
981 /*
982 * Serializes updates of utilization clamp values
983 *
984 * The (slow-path) user-space triggers utilization clamp value updates which
985 * can require updates on (fast-path) scheduler's data structures used to
986 * support enqueue/dequeue operations.
987 * While the per-CPU rq lock protects fast-path update operations, user-space
988 * requests are serialized using a mutex to reduce the risk of conflicting
989 * updates or API abuses.
990 */
991 static DEFINE_MUTEX(uclamp_mutex);
992
993 /* Max allowed minimum utilization */
994 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
995
996 /* Max allowed maximum utilization */
997 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
998
999 /*
1000 * By default RT tasks run at the maximum performance point/capacity of the
1001 * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1002 * SCHED_CAPACITY_SCALE.
1003 *
1004 * This knob allows admins to change the default behavior when uclamp is being
1005 * used. In battery powered devices, particularly, running at the maximum
1006 * capacity and frequency will increase energy consumption and shorten the
1007 * battery life.
1008 *
1009 * This knob only affects RT tasks whose uclamp_se->user_defined == false.
1010 *
1011 * This knob will not override the system default sched_util_clamp_min defined
1012 * above.
1013 */
1014 unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1015
1016 /* All clamps are required to be less or equal than these values */
1017 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1018
1019 /*
1020 * This static key is used to reduce the uclamp overhead in the fast path. It
1021 * primarily disables the call to uclamp_rq_{inc, dec}() in
1022 * enqueue/dequeue_task().
1023 *
1024 * This allows users to continue to enable uclamp in their kernel config with
1025 * minimum uclamp overhead in the fast path.
1026 *
1027 * As soon as userspace modifies any of the uclamp knobs, the static key is
1028 * enabled, since we then have actual users that make use of uclamp
1029 * functionality.
1030 *
1031 * The knobs that would enable this static key are:
1032 *
1033 * * A task modifying its uclamp value with sched_setattr().
1034 * * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1035 * * An admin modifying the cgroup cpu.uclamp.{min, max}
1036 */
1037 DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1038
1039 /* Integer rounded range for each bucket */
1040 #define UCLAMP_BUCKET_DELTA \
1041 DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1042
1043 #define cycle_each_clamp_id(clamp_id) \
1044 for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1045
1046 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1047 {
1048 return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA,
1049 UCLAMP_BUCKETS - 1);
1050 }
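/*
 * Editor's worked example: with the default UCLAMP_BUCKETS = 5
 * (CONFIG_UCLAMP_BUCKETS_COUNT), UCLAMP_BUCKET_DELTA is
 * DIV_ROUND_CLOSEST(1024, 5) = 205, so:
 *
 *	clamp_value   0..204  -> bucket_id 0
 *	clamp_value 205..409  -> bucket_id 1
 *	...
 *	clamp_value 820..1024 -> bucket_id 4
 *
 * The min_t() is only a safety net so an out-of-range value can never
 * index past UCLAMP_BUCKETS - 1.
 */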
1051
1052 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1053 {
1054 if (clamp_id == UCLAMP_MIN) {
1055 return 0;
1056 }
1057 return SCHED_CAPACITY_SCALE;
1058 }
1059
1060 static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value,
1061 bool user_defined)
1062 {
1063 uc_se->value = value;
1064 uc_se->bucket_id = uclamp_bucket_id(value);
1065 uc_se->user_defined = user_defined;
1066 }
1067
1068 static inline unsigned int uclamp_idle_value(struct rq *rq,
1069 enum uclamp_id clamp_id,
1070 unsigned int clamp_value)
1071 {
1072 /*
1073 * Avoid blocked utilization pushing up the frequency when we go
1074 * idle (which drops the max-clamp) by retaining the last known
1075 * max-clamp.
1076 */
1077 if (clamp_id == UCLAMP_MAX) {
1078 rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1079 return clamp_value;
1080 }
1081
1082 return uclamp_none(UCLAMP_MIN);
1083 }
1084
1085 static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1086 unsigned int clamp_value)
1087 {
1088 /* Reset max-clamp retention only on idle exit */
1089 if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) {
1090 return;
1091 }
1092
1093 WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1094 }
1095
1096 static inline unsigned int uclamp_rq_max_value(struct rq *rq,
1097 enum uclamp_id clamp_id,
1098 unsigned int clamp_value)
1099 {
1100 struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1101 int bucket_id = UCLAMP_BUCKETS - 1;
1102
1103 /*
1104 * Since both min and max clamps are max aggregated, find the
1105 * topmost bucket that has tasks in it.
1106 */
1107 for (; bucket_id >= 0; bucket_id--) {
1108 if (!bucket[bucket_id].tasks) {
1109 continue;
1110 }
1111 return bucket[bucket_id].value;
1112 }
1113
1114 /* No tasks -- default clamp values */
1115 return uclamp_idle_value(rq, clamp_id, clamp_value);
1116 }
1117
1118 static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1119 {
1120 unsigned int default_util_min;
1121 struct uclamp_se *uc_se;
1122
1123 lockdep_assert_held(&p->pi_lock);
1124
1125 uc_se = &p->uclamp_req[UCLAMP_MIN];
1126
1127 /* Only sync if user didn't override the default */
1128 if (uc_se->user_defined) {
1129 return;
1130 }
1131
1132 default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1133 uclamp_se_set(uc_se, default_util_min, false);
1134 }
1135
1136 static void uclamp_update_util_min_rt_default(struct task_struct *p)
1137 {
1138 struct rq_flags rf;
1139 struct rq *rq;
1140
1141 if (!rt_task(p)) {
1142 return;
1143 }
1144
1145 /* Protect updates to p->uclamp_* */
1146 rq = task_rq_lock(p, &rf);
1147 __uclamp_update_util_min_rt_default(p);
1148 task_rq_unlock(rq, p, &rf);
1149 }
1150
1151 static void uclamp_sync_util_min_rt_default(void)
1152 {
1153 struct task_struct *g, *p;
1154
1155 /*
1156 * copy_process() sysctl_uclamp
1157 * uclamp_min_rt = X;
1158 * write_lock(&tasklist_lock) read_lock(&tasklist_lock)
1159 * // link thread smp_mb__after_spinlock()
1160 * write_unlock(&tasklist_lock) read_unlock(&tasklist_lock);
1161 * sched_post_fork() for_each_process_thread()
1162 * __uclamp_sync_rt() __uclamp_sync_rt()
1163 *
1164 * Ensures that either sched_post_fork() will observe the new
1165 * uclamp_min_rt or for_each_process_thread() will observe the new
1166 * task.
1167 */
1168 read_lock(&tasklist_lock);
1169 smp_mb__after_spinlock();
1170 read_unlock(&tasklist_lock);
1171
1172 rcu_read_lock();
1173 for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p);
1174 rcu_read_unlock();
1175 }
1176
1177 static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p,
1178 enum uclamp_id clamp_id)
1179 {
1180 /* Copy by value as we could modify it */
1181 struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1182 #ifdef CONFIG_UCLAMP_TASK_GROUP
1183 unsigned int tg_min, tg_max, value;
1184
1185 /*
1186 * Tasks in autogroups or root task group will be
1187 * restricted by system defaults.
1188 */
1189 if (task_group_is_autogroup(task_group(p))) {
1190 return uc_req;
1191 }
1192 if (task_group(p) == &root_task_group) {
1193 return uc_req;
1194 }
1195
1196 tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1197 tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1198 value = uc_req.value;
1199 value = clamp(value, tg_min, tg_max);
1200 uclamp_se_set(&uc_req, value, false);
1201 #endif
1202
1203 return uc_req;
1204 }
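/*
 * Editor's worked example (hypothetical numbers): if a task requests
 * uclamp_min = 900 while its task group publishes an effective range of
 * [200, 600], the returned request becomes clamp(900, 200, 600) = 600; a
 * request of 300 passes through unchanged. Autogroup and root-group tasks
 * skip this step and are limited only by the system defaults applied in
 * uclamp_eff_get().
 */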
1205
1206 /*
1207 * The effective clamp bucket index of a task depends on, by increasing
1208 * priority:
1209 * - the task specific clamp value, when explicitly requested from userspace
1210 * - the task group effective clamp value, for tasks neither in the root
1211 * group nor in an autogroup
1212 * - the system default clamp value, defined by the sysadmin
1213 */
1214 static inline struct uclamp_se uclamp_eff_get(struct task_struct *p,
1215 enum uclamp_id clamp_id)
1216 {
1217 struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1218 struct uclamp_se uc_max = uclamp_default[clamp_id];
1219
1220 /* System default restrictions always apply */
1221 if (unlikely(uc_req.value > uc_max.value)) {
1222 return uc_max;
1223 }
1224
1225 return uc_req;
1226 }
1227
1228 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1229 {
1230 struct uclamp_se uc_eff;
1231
1232 /* Task currently refcounted: use back-annotated (effective) value */
1233 if (p->uclamp[clamp_id].active) {
1234 return (unsigned long)p->uclamp[clamp_id].value;
1235 }
1236
1237 uc_eff = uclamp_eff_get(p, clamp_id);
1238
1239 return (unsigned long)uc_eff.value;
1240 }
1241
1242 /*
1243 * When a task is enqueued on a rq, the clamp bucket currently defined by the
1244 * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1245 * updates the rq's clamp value if required.
1246 *
1247 * Tasks can have a task-specific value requested from user-space, track
1248 * within each bucket the maximum value for tasks refcounted in it.
1249 * This "local max aggregation" allows tracking the exact "requested" value
1250 * for each bucket when all its RUNNABLE tasks require the same clamp.
1251 */
1252 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1253 enum uclamp_id clamp_id)
1254 {
1255 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1256 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1257 struct uclamp_bucket *bucket;
1258
1259 lockdep_assert_held(&rq->lock);
1260
1261 /* Update task effective clamp */
1262 p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1263
1264 bucket = &uc_rq->bucket[uc_se->bucket_id];
1265 bucket->tasks++;
1266 uc_se->active = true;
1267
1268 uclamp_idle_reset(rq, clamp_id, uc_se->value);
1269
1270 /*
1271 * Local max aggregation: rq buckets always track the max
1272 * "requested" clamp value of its RUNNABLE tasks.
1273 */
1274 if (bucket->tasks == 1 || uc_se->value > bucket->value) {
1275 bucket->value = uc_se->value;
1276 }
1277
1278 if (uc_se->value > READ_ONCE(uc_rq->value)) {
1279 WRITE_ONCE(uc_rq->value, uc_se->value);
1280 }
1281 }
1282
1283 /*
1284 * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1285 * is released. If this is the last task reference counting the rq's max
1286 * active clamp value, then the rq's clamp value is updated.
1287 *
1288 * Both refcounted tasks and rq's cached clamp values are expected to be
1289 * always valid. If it's detected they are not, as defensive programming,
1290 * enforce the expected state and warn.
1291 */
1292 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1293 enum uclamp_id clamp_id)
1294 {
1295 struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1296 struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1297 struct uclamp_bucket *bucket;
1298 unsigned int bkt_clamp;
1299 unsigned int rq_clamp;
1300
1301 lockdep_assert_held(&rq->lock);
1302
1303 /*
1304 * If sched_uclamp_used was enabled after task @p was enqueued,
1305 * we could end up with unbalanced call to uclamp_rq_dec_id().
1306 *
1307 * In this case the uc_se->active flag should be false since no uclamp
1308 * accounting was performed at enqueue time and we can just return
1309 * here.
1310 *
1311 * Need to be careful of the following enqueue/dequeue ordering
1312 * problem too
1313 *
1314 * enqueue(taskA)
1315 * // sched_uclamp_used gets enabled
1316 * enqueue(taskB)
1317 * dequeue(taskA)
1318 * // Must not decrement bucket->tasks here
1319 * dequeue(taskB)
1320 *
1321 * where we could end up with stale data in uc_se and
1322 * bucket[uc_se->bucket_id].
1323 *
1324 * The following check here eliminates the possibility of such race.
1325 */
1326 if (unlikely(!uc_se->active)) {
1327 return;
1328 }
1329
1330 bucket = &uc_rq->bucket[uc_se->bucket_id];
1331
1332 SCHED_WARN_ON(!bucket->tasks);
1333 if (likely(bucket->tasks)) {
1334 bucket->tasks--;
1335 }
1336
1337 uc_se->active = false;
1338
1339 /*
1340 * Keep "local max aggregation" simple and accept (possibly)
1341 * overboosting some RUNNABLE tasks in the same bucket.
1342 * The rq clamp bucket value is reset to its base value whenever
1343 * there are no more RUNNABLE tasks refcounting it.
1344 */
1345 if (likely(bucket->tasks)) {
1346 return;
1347 }
1348
1349 rq_clamp = READ_ONCE(uc_rq->value);
1350 /*
1351 * Defensive programming: this should never happen. If it happens,
1352 * e.g. due to future modification, warn and fixup the expected value.
1353 */
1354 SCHED_WARN_ON(bucket->value > rq_clamp);
1355 if (bucket->value >= rq_clamp) {
1356 bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1357 WRITE_ONCE(uc_rq->value, bkt_clamp);
1358 }
1359 }
1360
1361 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1362 {
1363 enum uclamp_id clamp_id;
1364
1365 /*
1366 * Avoid any overhead until uclamp is actually used by the userspace.
1367 *
1368 * The condition is constructed such that a NOP is generated when
1369 * sched_uclamp_used is disabled.
1370 */
1371 if (!static_branch_unlikely(&sched_uclamp_used)) {
1372 return;
1373 }
1374
1375 if (unlikely(!p->sched_class->uclamp_enabled)) {
1376 return;
1377 }
1378
1379 cycle_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id);
1380
1381 /* Reset clamp idle holding when there is one RUNNABLE task */
1382 if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) {
1383 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1384 }
1385 }
1386
1387 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1388 {
1389 enum uclamp_id clamp_id;
1390
1391 /*
1392 * Avoid any overhead until uclamp is actually used by the userspace.
1393 *
1394 * The condition is constructed such that a NOP is generated when
1395 * sched_uclamp_used is disabled.
1396 */
1397 if (!static_branch_unlikely(&sched_uclamp_used)) {
1398 return;
1399 }
1400
1401 if (unlikely(!p->sched_class->uclamp_enabled)) {
1402 return;
1403 }
1404
1405 cycle_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id);
1406 }
1407
1408 static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1409 enum uclamp_id clamp_id)
1410 {
1411 if (!p->uclamp[clamp_id].active) {
1412 return;
1413 }
1414
1415 uclamp_rq_dec_id(rq, p, clamp_id);
1416 uclamp_rq_inc_id(rq, p, clamp_id);
1417
1418 /*
1419 * Make sure to clear the idle flag if we've transiently reached 0
1420 * active tasks on rq.
1421 */
1422 if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) {
1423 rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1424 }
1425 }
1426
1427 static inline void uclamp_update_active(struct task_struct *p)
1428 {
1429 enum uclamp_id clamp_id;
1430 struct rq_flags rf;
1431 struct rq *rq;
1432
1433 /*
1434 * Lock the task and the rq where the task is (or was) queued.
1435 *
1436 * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1437 * price to pay to safely serialize util_{min,max} updates with
1438 * enqueues, dequeues and migration operations.
1439 * This is the same locking schema used by __set_cpus_allowed_ptr().
1440 */
1441 rq = task_rq_lock(p, &rf);
1442
1443 /*
1444 * Setting the clamp bucket is serialized by task_rq_lock().
1445 * If the task is not yet RUNNABLE and its task_struct is not
1446 * affecting a valid clamp bucket, the next time it's enqueued,
1447 * it will already see the updated clamp bucket value.
1448 */
1449 cycle_each_clamp_id(clamp_id) uclamp_rq_reinc_id(rq, p, clamp_id);
1450
1451 task_rq_unlock(rq, p, &rf);
1452 }
1453
1454 #ifdef CONFIG_UCLAMP_TASK_GROUP
1455 static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1456 {
1457 struct css_task_iter it;
1458 struct task_struct *p;
1459
1460 css_task_iter_start(css, 0, &it);
1461 while ((p = css_task_iter_next(&it))) {
1462 uclamp_update_active(p);
1463 }
1464 css_task_iter_end(&it);
1465 }
1466
1467 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
1468 static void uclamp_update_root_tg(void)
1469 {
1470 struct task_group *tg = &root_task_group;
1471
1472 uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], sysctl_sched_uclamp_util_min,
1473 false);
1474 uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max,
1475 false);
1476
1477 rcu_read_lock();
1478 cpu_util_update_eff(&root_task_group.css);
1479 rcu_read_unlock();
1480 }
1481 #else
1482 static void uclamp_update_root_tg(void)
1483 {
1484 }
1485 #endif
1486
1487 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1488 void *buffer, size_t *lenp, loff_t *ppos)
1489 {
1490 bool update_root_tg = false;
1491 int old_min, old_max, old_min_rt;
1492 int result;
1493
1494 mutex_lock(&uclamp_mutex);
1495 old_min = sysctl_sched_uclamp_util_min;
1496 old_max = sysctl_sched_uclamp_util_max;
1497 old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1498
1499 result = proc_dointvec(table, write, buffer, lenp, ppos);
1500 if (result) {
1501 goto undo;
1502 }
1503 if (!write) {
1504 goto done;
1505 }
1506
1507 if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1508 sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1509 sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1510 result = -EINVAL;
1511 goto undo;
1512 }
1513
1514 if (old_min != sysctl_sched_uclamp_util_min) {
1515 uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min,
1516 false);
1517 update_root_tg = true;
1518 }
1519 if (old_max != sysctl_sched_uclamp_util_max) {
1520 uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max,
1521 false);
1522 update_root_tg = true;
1523 }
1524
1525 if (update_root_tg) {
1526 static_branch_enable(&sched_uclamp_used);
1527 uclamp_update_root_tg();
1528 }
1529
1530 if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1531 static_branch_enable(&sched_uclamp_used);
1532 uclamp_sync_util_min_rt_default();
1533 }
1534
1535 /*
1536 * We update all RUNNABLE tasks only when task groups are in use.
1537 * Otherwise, keep it simple and do just a lazy update at each next
1538 * task enqueue time.
1539 */
1540
1541 goto done;
1542
1543 undo:
1544 sysctl_sched_uclamp_util_min = old_min;
1545 sysctl_sched_uclamp_util_max = old_max;
1546 sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1547 done:
1548 mutex_unlock(&uclamp_mutex);
1549
1550 return result;
1551 }
1552
1553 static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr)
1554 {
1555 unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1556 unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1557
1558 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1559 lower_bound = attr->sched_util_min;
1560 }
1561 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1562 upper_bound = attr->sched_util_max;
1563 }
1564
1565 if (lower_bound > upper_bound) {
1566 return -EINVAL;
1567 }
1568 if (upper_bound > SCHED_CAPACITY_SCALE) {
1569 return -EINVAL;
1570 }
1571
1572 /*
1573 * We have valid uclamp attributes; make sure uclamp is enabled.
1574 *
1575 * We need to do that here, because enabling static branches is a
1576 * blocking operation which obviously cannot be done while holding
1577 * scheduler locks.
1578 */
1579 static_branch_enable(&sched_uclamp_used);
1580
1581 return 0;
1582 }
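/*
 * Editor's usage sketch: from user space, the per-task clamps validated
 * above are requested through the sched_setattr() system call, e.g.:
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_NORMAL,
 *		.sched_flags	= SCHED_FLAG_UTIL_CLAMP_MIN |
 *				  SCHED_FLAG_UTIL_CLAMP_MAX,
 *		.sched_util_min	= 128,
 *		.sched_util_max	= 512,
 *	};
 *
 *	sched_setattr(pid, &attr, 0);
 *
 * A request with sched_util_min > sched_util_max, or either bound above
 * SCHED_CAPACITY_SCALE (1024), is rejected here with -EINVAL.
 */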
1583
1584 static void __setscheduler_uclamp(struct task_struct *p,
1585 const struct sched_attr *attr)
1586 {
1587 enum uclamp_id clamp_id;
1588
1589 /*
1590 * On scheduling class change, reset to default clamps for tasks
1591 * without a task-specific value.
1592 */
1593 cycle_each_clamp_id(clamp_id) {
1594 struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1595
1596 /* Keep using defined clamps across class changes */
1597 if (uc_se->user_defined) {
1598 continue;
1599 }
1600
1601 /*
1602 * RT tasks by default have a 100% boost value that can be modified
1603 * at runtime.
1604 */
1605 if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) {
1606 __uclamp_update_util_min_rt_default(p);
1607 } else {
1608 uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
1609 }
1610 }
1611
1612 if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) {
1613 return;
1614 }
1615
1616 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1617 uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true);
1618 }
1619
1620 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1621 uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true);
1622 }
1623 }
1624
1625 static void uclamp_fork(struct task_struct *p)
1626 {
1627 enum uclamp_id clamp_id;
1628
1629 /*
1630 * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1631 * as the task is still at its early fork stages.
1632 */
1633 cycle_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false;
1634
1635 if (likely(!p->sched_reset_on_fork)) {
1636 return;
1637 }
1638
1639 cycle_each_clamp_id(clamp_id) {
1640 uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false);
1641 }
1642 }
1643
1644 static void uclamp_post_fork(struct task_struct *p)
1645 {
1646 uclamp_update_util_min_rt_default(p);
1647 }
1648
1649 static void __init init_uclamp_rq(struct rq *rq)
1650 {
1651 enum uclamp_id clamp_id;
1652 struct uclamp_rq *uc_rq = rq->uclamp;
1653
1654 cycle_each_clamp_id(clamp_id) {
1655 uc_rq[clamp_id] = (struct uclamp_rq) {.value = uclamp_none(clamp_id)};
1656 }
1657
1658 rq->uclamp_flags = UCLAMP_FLAG_IDLE;
1659 }
1660
1661 static void __init init_uclamp(void)
1662 {
1663 struct uclamp_se uc_max = {};
1664 enum uclamp_id clamp_id;
1665 int cpu;
1666
1667 for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu));
1668
1669 cycle_each_clamp_id(clamp_id)
1670 {
1671 uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id),
1672 false);
1673 }
1674
1675 /* System defaults allow max clamp values for both indexes */
1676 uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1677 cycle_each_clamp_id(clamp_id)
1678 {
1679 uclamp_default[clamp_id] = uc_max;
1680 #ifdef CONFIG_UCLAMP_TASK_GROUP
1681 root_task_group.uclamp_req[clamp_id] = uc_max;
1682 root_task_group.uclamp[clamp_id] = uc_max;
1683 #endif
1684 }
1685 }
1686
1687 #else /* CONFIG_UCLAMP_TASK */
1688 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1689 {
1690 }
1691 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1692 {
1693 }
1694 static inline int uclamp_validate(struct task_struct *p,
1695 const struct sched_attr *attr)
1696 {
1697 return -EOPNOTSUPP;
1698 }
1699 static void __setscheduler_uclamp(struct task_struct *p,
1700 const struct sched_attr *attr)
1701 {
1702 }
1703 static inline void uclamp_fork(struct task_struct *p)
1704 {
1705 }
1706 static inline void uclamp_post_fork(struct task_struct *p)
1707 {
1708 }
1709 static inline void init_uclamp(void)
1710 {
1711 }
1712 #endif /* CONFIG_UCLAMP_TASK */
1713
1714 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1715 {
1716 if (!(flags & ENQUEUE_NOCLOCK)) {
1717 update_rq_clock(rq);
1718 }
1719
1720 if (!(flags & ENQUEUE_RESTORE)) {
1721 sched_info_queued(rq, p);
1722 psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1723 }
1724
1725 uclamp_rq_inc(rq, p);
1726 p->sched_class->enqueue_task(rq, p, flags);
1727 }
1728
1729 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1730 {
1731 if (!(flags & DEQUEUE_NOCLOCK)) {
1732 update_rq_clock(rq);
1733 }
1734
1735 if (!(flags & DEQUEUE_SAVE)) {
1736 sched_info_dequeued(rq, p);
1737 psi_dequeue(p, flags & DEQUEUE_SLEEP);
1738 }
1739
1740 uclamp_rq_dec(rq, p);
1741 p->sched_class->dequeue_task(rq, p, flags);
1742 }
1743
1744 void activate_task(struct rq *rq, struct task_struct *p, int flags)
1745 {
1746 enqueue_task(rq, p, flags);
1747
1748 p->on_rq = TASK_ON_RQ_QUEUED;
1749 }
1750
1751 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1752 {
1753 p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1754
1755 dequeue_task(rq, p, flags);
1756 }
1757
1758 static inline int __normal_prio(int policy, int rt_prio, int nice)
1759 {
1760 int prio;
1761
1762 if (dl_policy(policy)) {
1763 prio = MAX_DL_PRIO - 1;
1764 } else if (rt_policy(policy)) {
1765 prio = MAX_RT_PRIO - 1 - rt_prio;
1766 } else {
1767 prio = NICE_TO_PRIO(nice);
1768 }
1769
1770 return prio;
1771 }
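/*
 * Editor's worked example: the kernel-internal priority scale runs from
 * -1 (deadline) through 0..99 (RT) to 100..139 (CFS nice levels), so:
 *
 *	SCHED_DEADLINE         -> prio = MAX_DL_PRIO - 1       = -1
 *	SCHED_FIFO, rt_prio 50 -> prio = MAX_RT_PRIO - 1 - 50  = 49
 *	SCHED_NORMAL, nice 0   -> prio = NICE_TO_PRIO(0)       = 120
 *	SCHED_NORMAL, nice +19 -> prio = NICE_TO_PRIO(19)      = 139
 */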
1772
1773 /*
1774 * Calculate the expected normal priority: i.e. priority
1775 * without taking RT-inheritance into account. Might be
1776 * boosted by interactivity modifiers. Changes upon fork,
1777 * setprio syscalls, and whenever the interactivity
1778 * estimator recalculates.
1779 */
1780 static inline int normal_prio(struct task_struct *p)
1781 {
1782 return __normal_prio(p->policy, p->rt_priority,
1783 PRIO_TO_NICE(p->static_prio));
1784 }
1785
1786 /*
1787 * Calculate the current priority, i.e. the priority
1788 * taken into account by the scheduler. This value might
1789 * be boosted by RT tasks, or might be boosted by
1790 * interactivity modifiers. Will be RT if the task got
1791 * RT-boosted. If not then it returns p->normal_prio.
1792 */
1793 static int effective_prio(struct task_struct *p)
1794 {
1795 p->normal_prio = normal_prio(p);
1796 /*
1797 * If we are RT tasks or we were boosted to RT priority,
1798 * keep the priority unchanged. Otherwise, update priority
1799 * to the normal priority:
1800 */
1801 if (!rt_prio(p->prio)) {
1802 return p->normal_prio;
1803 }
1804 return p->prio;
1805 }
1806
1807 /**
1808 * task_curr - is this task currently executing on a CPU?
1809 * @p: the task in question.
1810 *
1811 * Return: 1 if the task is currently executing. 0 otherwise.
1812 */
1813 inline int task_curr(const struct task_struct *p)
1814 {
1815 return cpu_curr(task_cpu(p)) == p;
1816 }
1817
1818 /*
1819 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
1820 * use the balance_callback list if you want balancing.
1821 *
1822 * this means any call to check_class_changed() must be followed by a call to
1823 * balance_callback().
1824 */
1825 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1826 const struct sched_class *prev_class,
1827 int oldprio)
1828 {
1829 if (prev_class != p->sched_class) {
1830 if (prev_class->switched_from) {
1831 prev_class->switched_from(rq, p);
1832 }
1833
1834 p->sched_class->switched_to(rq, p);
1835 } else if (oldprio != p->prio || dl_task(p)) {
1836 p->sched_class->prio_changed(rq, p, oldprio);
1837 }
1838 }
1839
1840 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1841 {
1842 if (p->sched_class == rq->curr->sched_class) {
1843 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1844 } else if (p->sched_class > rq->curr->sched_class) {
1845 resched_curr(rq);
1846 }
1847
1848 /*
1849 * A queue event has occurred, and we're going to schedule. In
1850 * this case, we can save a useless back to back clock update.
1851 */
1852 if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) {
1853 rq_clock_skip_update(rq);
1854 }
1855 }
1856
1857 #ifdef CONFIG_SMP
1858
1859 /*
1860 * Per-CPU kthreads are allowed to run on !active && online CPUs, see
1861 * __set_cpus_allowed_ptr() and select_fallback_rq().
1862 */
1863 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1864 {
1865 if (!cpumask_test_cpu(cpu, p->cpus_ptr)) {
1866 return false;
1867 }
1868
1869 if (is_per_cpu_kthread(p)) {
1870 return cpu_online(cpu);
1871 }
1872
1873 if (!cpu_active(cpu)) {
1874 return false;
1875 }
1876
1877 return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
1878 }
1879
1880 /*
1881 * This is how migration works
1882 *
1883 * 1) we invoke migration_cpu_stop() on the target CPU using
1884 * stop_one_cpu().
1885 * 2) stopper starts to run (implicitly forcing the migrated thread
1886 * off the CPU)
1887 * 3) it checks whether the migrated task is still in the wrong runqueue.
1888 * 4) if it's in the wrong runqueue then the migration thread removes
1889 * it and puts it into the right queue.
1890 * 5) stopper completes and stop_one_cpu() returns and the migration
1891 * is done.
1892 */
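/*
 * Illustrative sketch (not part of this file) of the caller side of the
 * sequence above, as done later in __set_cpus_allowed_ptr(): drop the
 * rq lock, then hand the work to the stopper on the task's current CPU.
 *
 *	struct migration_arg arg = { .task = p, .dest_cpu = dest_cpu };
 *
 *	task_rq_unlock(rq, p, &rf);
 *	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 */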
1893
1894 /*
1895 * move_queued_task - move a queued task to new rq.
1896 *
1897 * Returns (locked) new rq. Old rq's lock is released.
1898 */
1899 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
1900 struct task_struct *p, int new_cpu)
1901 {
1902 lockdep_assert_held(&rq->lock);
1903
1904 deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1905 #ifdef CONFIG_SCHED_WALT
1906 double_lock_balance(rq, cpu_rq(new_cpu));
1907 if (!(rq->clock_update_flags & RQCF_UPDATED)) {
1908 update_rq_clock(rq);
1909 }
1910 #endif
1911 set_task_cpu(p, new_cpu);
1912 #ifdef CONFIG_SCHED_WALT
1913 double_rq_unlock(cpu_rq(new_cpu), rq);
1914 #else
1915 rq_unlock(rq, rf);
1916 #endif
1917
1918 rq = cpu_rq(new_cpu);
1919
1920 rq_lock(rq, rf);
1921 BUG_ON(task_cpu(p) != new_cpu);
1922 activate_task(rq, p, 0);
1923 check_preempt_curr(rq, p, 0);
1924
1925 return rq;
1926 }
1927
1928 struct migration_arg {
1929 struct task_struct *task;
1930 int dest_cpu;
1931 };
1932
1933 /*
1934 * Move (not current) task off this CPU, onto the destination CPU. We're doing
1935 * this because either it can't run here any more (set_cpus_allowed()
1936 * away from this CPU, or CPU going down), or because we're
1937 * attempting to rebalance this task on exec (sched_exec).
1938 *
1939 * So we race with normal scheduler movements, but that's OK, as long
1940 * as the task is no longer on this CPU.
1941 */
1942 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
1943 struct task_struct *p, int dest_cpu)
1944 {
1945 /* Affinity changed (again). */
1946 if (!is_cpu_allowed(p, dest_cpu)) {
1947 return rq;
1948 }
1949
1950 update_rq_clock(rq);
1951 rq = move_queued_task(rq, rf, p, dest_cpu);
1952
1953 return rq;
1954 }
1955
1956 /*
1957 * migration_cpu_stop - this will be executed by a highprio stopper thread
1958 * and performs thread migration by bumping thread off CPU then
1959 * 'pushing' onto another runqueue.
1960 */
1961 static int migration_cpu_stop(void *data)
1962 {
1963 struct migration_arg *arg = data;
1964 struct task_struct *p = arg->task;
1965 struct rq *rq = this_rq();
1966 struct rq_flags rf;
1967
1968 /*
1969 * The original target CPU might have gone down and we might
1970 * be on another CPU but it doesn't matter.
1971 */
1972 local_irq_disable();
1973 /*
1974 * We need to explicitly wake pending tasks before running
1975 * __migrate_task() such that we will not miss enforcing cpus_ptr
1976 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1977 */
1978 flush_smp_call_function_from_idle();
1979
1980 raw_spin_lock(&p->pi_lock);
1981 rq_lock(rq, &rf);
1982 /*
1983 * If task_rq(p) != rq, it cannot be migrated here, because we're
1984 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1985 * we're holding p->pi_lock.
1986 */
1987 if (task_rq(p) == rq) {
1988 if (task_on_rq_queued(p)) {
1989 rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1990 } else {
1991 p->wake_cpu = arg->dest_cpu;
1992 }
1993 }
1994 rq_unlock(rq, &rf);
1995 raw_spin_unlock(&p->pi_lock);
1996
1997 local_irq_enable();
1998 return 0;
1999 }
2000
2001 /*
2002 * sched_class::set_cpus_allowed must do the below, but is not required to
2003 * actually call this function.
2004 */
2005 void set_cpus_allowed_common(struct task_struct *p,
2006 const struct cpumask *new_mask)
2007 {
2008 cpumask_copy(&p->cpus_mask, new_mask);
2009 p->nr_cpus_allowed = cpumask_weight(new_mask);
2010 }
2011
2012 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2013 {
2014 struct rq *rq = task_rq(p);
2015 bool queued, running;
2016
2017 lockdep_assert_held(&p->pi_lock);
2018
2019 queued = task_on_rq_queued(p);
2020 running = task_current(rq, p);
2021
2022 if (queued) {
2023 /*
2024 * Because __kthread_bind() calls this on blocked tasks without
2025 * holding rq->lock.
2026 */
2027 lockdep_assert_held(&rq->lock);
2028 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2029 }
2030 if (running) {
2031 put_prev_task(rq, p);
2032 }
2033
2034 p->sched_class->set_cpus_allowed(p, new_mask);
2035
2036 if (queued) {
2037 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2038 }
2039 if (running) {
2040 set_next_task(rq, p);
2041 }
2042 }
2043
2044 /*
2045 * Change a given task's CPU affinity. Migrate the thread to a
2046 * proper CPU and schedule it away if the CPU it's executing on
2047 * is removed from the allowed bitmask.
2048 *
2049 * NOTE: the caller must have a valid reference to the task, the
2050 * task must not exit() & deallocate itself prematurely. The
2051 * call is not atomic; no spinlocks may be held.
2052 */
2053 static int __set_cpus_allowed_ptr(struct task_struct *p,
2054 const struct cpumask *new_mask, bool check)
2055 {
2056 const struct cpumask *cpu_valid_mask = cpu_active_mask;
2057 unsigned int dest_cpu;
2058 struct rq_flags rf;
2059 struct rq *rq;
2060 int ret = 0;
2061 #ifdef CONFIG_CPU_ISOLATION_OPT
2062 cpumask_t allowed_mask;
2063 #endif
2064
2065 rq = task_rq_lock(p, &rf);
2066 update_rq_clock(rq);
2067
2068 if (p->flags & PF_KTHREAD) {
2069 /*
2070 * Kernel threads are allowed on online && !active CPUs
2071 */
2072 cpu_valid_mask = cpu_online_mask;
2073 }
2074
2075 /*
2076 * Must re-check here, to close a race against __kthread_bind();
2077 * sched_setaffinity() is not guaranteed to observe the flag.
2078 */
2079 if (check && (p->flags & PF_NO_SETAFFINITY)) {
2080 ret = -EINVAL;
2081 goto out;
2082 }
2083
2084 if (cpumask_equal(&p->cpus_mask, new_mask)) {
2085 goto out;
2086 }
2087
2088 #ifdef CONFIG_CPU_ISOLATION_OPT
2089 cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
2090 cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask);
2091
2092 dest_cpu = cpumask_any(&allowed_mask);
2093 if (dest_cpu >= nr_cpu_ids) {
2094 cpumask_and(&allowed_mask, cpu_valid_mask, new_mask);
2095 dest_cpu = cpumask_any(&allowed_mask);
2096 if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
2097 ret = -EINVAL;
2098 goto out;
2099 }
2100 }
2101 #else
2102 /*
2103 * Picking a ~random cpu helps in cases where we are changing affinity
2104 * for groups of tasks (ie. cpuset), so that load balancing is not
2105 * immediately required to distribute the tasks within their new mask.
2106 */
2107 dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2108 if (dest_cpu >= nr_cpu_ids) {
2109 ret = -EINVAL;
2110 goto out;
2111 }
2112 #endif
2113
2114 do_set_cpus_allowed(p, new_mask);
2115
2116 if (p->flags & PF_KTHREAD) {
2117 /*
2118 * For kernel threads that do indeed end up on online &&
2119 * !active we want to ensure they are strict per-CPU threads.
2120 */
2121 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2122 !cpumask_intersects(new_mask, cpu_active_mask) &&
2123 p->nr_cpus_allowed != 1);
2124 }
2125
2126 /* Can the task run on the task's current CPU? If so, we're done */
2127 #ifdef CONFIG_CPU_ISOLATION_OPT
2128 if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) {
2129 goto out;
2130 }
2131 #else
2132 if (cpumask_test_cpu(task_cpu(p), new_mask)) {
2133 goto out;
2134 }
2135 #endif
2136
2137 if (task_running(rq, p) || p->state == TASK_WAKING) {
2138 struct migration_arg arg = {p, dest_cpu};
2139 /* Need help from migration thread: drop lock and wait. */
2140 task_rq_unlock(rq, p, &rf);
2141 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
2142 return 0;
2143 } else if (task_on_rq_queued(p)) {
2144 /*
2145 * OK, since we're going to drop the lock immediately
2146 * afterwards anyway.
2147 */
2148 rq = move_queued_task(rq, &rf, p, dest_cpu);
2149 }
2150 out:
2151 task_rq_unlock(rq, p, &rf);
2152
2153 return ret;
2154 }
2155
2156 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2157 {
2158 return __set_cpus_allowed_ptr(p, new_mask, false);
2159 }
2160 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
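/*
 * Illustrative sketch (not part of this file): a caller pinning a
 * kthread it created to one CPU with the exported helper above; the
 * thread function and CPU id are assumptions for the example.
 *
 *	struct task_struct *tsk = kthread_create(worker_fn, NULL, "worker");
 *
 *	if (!IS_ERR(tsk)) {
 *		set_cpus_allowed_ptr(tsk, cpumask_of(2));
 *		wake_up_process(tsk);
 *	}
 */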
2161
2162 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2163 {
2164 #ifdef CONFIG_SCHED_DEBUG
2165 /*
2166 * We should never call set_task_cpu() on a blocked task,
2167 * ttwu() will sort out the placement.
2168 */
2169 WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2170 !p->on_rq);
2171
2172 /*
2173 * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
2174 * because schedstat_wait_{start,end} rebase migrating task's wait_start
2175 * time relying on p->on_rq.
2176 */
2177 WARN_ON_ONCE(p->state == TASK_RUNNING &&
2178 p->sched_class == &fair_sched_class &&
2179 (p->on_rq && !task_on_rq_migrating(p)));
2180
2181 #ifdef CONFIG_LOCKDEP
2182 /*
2183 * The caller should hold either p->pi_lock or rq->lock, when changing
2184 * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2185 *
2186 * sched_move_task() holds both and thus holding either pins the cgroup,
2187 * see task_group().
2188 *
2189 * Furthermore, all task_rq users should acquire both locks, see
2190 * task_rq_lock().
2191 */
2192 WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2193 lockdep_is_held(&task_rq(p)->lock)));
2194 #endif
2195 /*
2196 * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2197 */
2198 WARN_ON_ONCE(!cpu_online(new_cpu));
2199 #endif
2200
2201 trace_sched_migrate_task(p, new_cpu);
2202
2203 if (task_cpu(p) != new_cpu) {
2204 if (p->sched_class->migrate_task_rq) {
2205 p->sched_class->migrate_task_rq(p, new_cpu);
2206 }
2207 p->se.nr_migrations++;
2208 rseq_migrate(p);
2209 perf_event_task_migrate(p);
2210 fixup_busy_time(p, new_cpu);
2211 }
2212
2213 __set_task_cpu(p, new_cpu);
2214 }
2215
2216 #ifdef CONFIG_NUMA_BALANCING
2217 static void __migrate_swap_task(struct task_struct *p, int cpu)
2218 {
2219 if (task_on_rq_queued(p)) {
2220 struct rq *src_rq, *dst_rq;
2221 struct rq_flags srf, drf;
2222
2223 src_rq = task_rq(p);
2224 dst_rq = cpu_rq(cpu);
2225
2226 rq_pin_lock(src_rq, &srf);
2227 rq_pin_lock(dst_rq, &drf);
2228
2229 deactivate_task(src_rq, p, 0);
2230 set_task_cpu(p, cpu);
2231 activate_task(dst_rq, p, 0);
2232 check_preempt_curr(dst_rq, p, 0);
2233
2234 rq_unpin_lock(dst_rq, &drf);
2235 rq_unpin_lock(src_rq, &srf);
2236 } else {
2237 /*
2238 * Task isn't running anymore; make it appear like we migrated
2239 * it before it went to sleep. This means on wakeup we make the
2240 * previous CPU our target instead of where it really is.
2241 */
2242 p->wake_cpu = cpu;
2243 }
2244 }
2245
2246 struct migration_swap_arg {
2247 struct task_struct *src_task, *dst_task;
2248 int src_cpu, dst_cpu;
2249 };
2250
2251 static int migrate_swap_stop(void *data)
2252 {
2253 struct migration_swap_arg *arg = data;
2254 struct rq *src_rq, *dst_rq;
2255 int ret = -EAGAIN;
2256
2257 if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) {
2258 return -EAGAIN;
2259 }
2260
2261 src_rq = cpu_rq(arg->src_cpu);
2262 dst_rq = cpu_rq(arg->dst_cpu);
2263
2264 double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
2265 double_rq_lock(src_rq, dst_rq);
2266
2267 if (task_cpu(arg->dst_task) != arg->dst_cpu) {
2268 goto unlock;
2269 }
2270
2271 if (task_cpu(arg->src_task) != arg->src_cpu) {
2272 goto unlock;
2273 }
2274
2275 if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) {
2276 goto unlock;
2277 }
2278
2279 if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) {
2280 goto unlock;
2281 }
2282
2283 __migrate_swap_task(arg->src_task, arg->dst_cpu);
2284 __migrate_swap_task(arg->dst_task, arg->src_cpu);
2285
2286 ret = 0;
2287
2288 unlock:
2289 double_rq_unlock(src_rq, dst_rq);
2290 raw_spin_unlock(&arg->dst_task->pi_lock);
2291 raw_spin_unlock(&arg->src_task->pi_lock);
2292
2293 return ret;
2294 }
2295
2296 /*
2297 * Cross migrate two tasks
2298 */
2299 int migrate_swap(struct task_struct *cur, struct task_struct *p, int target_cpu,
2300 int curr_cpu)
2301 {
2302 struct migration_swap_arg arg;
2303 int ret = -EINVAL;
2304
2305 arg = (struct migration_swap_arg) {
2306 .src_task = cur,
2307 .src_cpu = curr_cpu,
2308 .dst_task = p,
2309 .dst_cpu = target_cpu,
2310 };
2311
2312 if (arg.src_cpu == arg.dst_cpu) {
2313 goto out;
2314 }
2315
2316 /*
2317 * These three tests are all lockless; this is OK since all of them
2318 * will be re-checked with proper locks held further down the line.
2319 */
2320 if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) {
2321 goto out;
2322 }
2323
2324 if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) {
2325 goto out;
2326 }
2327
2328 if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) {
2329 goto out;
2330 }
2331
2332 trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2333 ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2334
2335 out:
2336 return ret;
2337 }
2338 #endif /* CONFIG_NUMA_BALANCING */
2339
2340 /*
2341 * wait_task_inactive - wait for a thread to unschedule.
2342 *
2343 * If @match_state is nonzero, it's the @p->state value just checked and
2344 * not expected to change. If it changes, i.e. @p might have woken up,
2345 * then return zero. When we succeed in waiting for @p to be off its CPU,
2346 * we return a positive number (its total switch count). If a second call
2347 * a short while later returns the same number, the caller can be sure that
2348 * @p has remained unscheduled the whole time.
2349 *
2350 * The caller must ensure that the task *will* unschedule sometime soon,
2351 * else this function might spin for a *long* time. This function can't
2352 * be called with interrupts off, or it may introduce deadlock with
2353 * smp_call_function() if an IPI is sent by the same process we are
2354 * waiting to become inactive.
2355 */
2356 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2357 {
2358 int running, queued;
2359 struct rq_flags rf;
2360 unsigned long ncsw;
2361 struct rq *rq;
2362
2363 for (;;) {
2364 /*
2365 * We do the initial early heuristics without holding
2366 * any task-queue locks at all. We'll only try to get
2367 * the runqueue lock when things look like they will
2368 * work out!
2369 */
2370 rq = task_rq(p);
2371
2372 /*
2373 * If the task is actively running on another CPU
2374 * still, just relax and busy-wait without holding
2375 * any locks.
2376 *
2377 * NOTE! Since we don't hold any locks, it's not
2378 * even sure that "rq" stays as the right runqueue!
2379 * But we don't care, since "task_running()" will
2380 * return false if the runqueue has changed and p
2381 * is actually now running somewhere else!
2382 */
2383 while (task_running(rq, p)) {
2384 if (match_state && unlikely(p->state != match_state)) {
2385 return 0;
2386 }
2387 cpu_relax();
2388 }
2389
2390 /*
2391 * Ok, time to look more closely! We need the rq
2392 * lock now, to be *sure*. If we're wrong, we'll
2393 * just go back and repeat.
2394 */
2395 rq = task_rq_lock(p, &rf);
2396 trace_sched_wait_task(p);
2397 running = task_running(rq, p);
2398 queued = task_on_rq_queued(p);
2399 ncsw = 0;
2400 if (!match_state || p->state == match_state) {
2401 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2402 }
2403 task_rq_unlock(rq, p, &rf);
2404
2405 /*
2406 * If it changed from the expected state, bail out now.
2407 */
2408 if (unlikely(!ncsw)) {
2409 break;
2410 }
2411
2412 /*
2413 * Was it really running after all now that we
2414 * checked with the proper locks actually held?
2415 *
2416 * Oops. Go back and try again..
2417 */
2418 if (unlikely(running)) {
2419 cpu_relax();
2420 continue;
2421 }
2422
2423 /*
2424 * It's not enough that it's not actively running,
2425 * it must be off the runqueue _entirely_, and not
2426 * preempted!
2427 *
2428 * So if it was still runnable (but just not actively
2429 * running right now), it's preempted, and we should
2430 * yield - it could be a while.
2431 */
2432 if (unlikely(queued)) {
2433 ktime_t to = NSEC_PER_SEC / HZ;
2434
2435 set_current_state(TASK_UNINTERRUPTIBLE);
2436 schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2437 continue;
2438 }
2439
2440 /*
2441 * Ahh, all good. It wasn't running, and it wasn't
2442 * runnable, which means that it will never become
2443 * running in the future either. We're all done!
2444 */
2445 break;
2446 }
2447
2448 return ncsw;
2449 }
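/*
 * Illustrative sketch (not part of this file) of the double-call pattern
 * described above: two equal non-zero return values mean @p has stayed
 * off the CPU in between.
 *
 *	unsigned long ncsw, ncsw2;
 *
 *	ncsw = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
 *	...
 *	ncsw2 = wait_task_inactive(p, TASK_UNINTERRUPTIBLE);
 *	if (ncsw && ncsw == ncsw2)
 *		;	// @p remained unscheduled the whole time
 */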
2450
2451 /***
2452 * kick_process - kick a running thread to enter/exit the kernel
2453 * @p: the to-be-kicked thread
2454 *
2455 * Cause a process which is running on another CPU to enter
2456 * kernel-mode, without any delay. (to get signals handled.)
2457 *
2458 * NOTE: this function doesn't have to take the runqueue lock,
2459 * because all it wants to ensure is that the remote task enters
2460 * the kernel. If the IPI races and the task has been migrated
2461 * to another CPU then no harm is done and the purpose has been
2462 * achieved as well.
2463 */
2464 void kick_process(struct task_struct *p)
2465 {
2466 int cpu;
2467
2468 preempt_disable();
2469 cpu = task_cpu(p);
2470 if ((cpu != smp_processor_id()) && task_curr(p)) {
2471 smp_send_reschedule(cpu);
2472 }
2473 preempt_enable();
2474 }
2475 EXPORT_SYMBOL_GPL(kick_process);
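/*
 * Illustrative sketch (not part of this file): the classic use is to
 * nudge a remote task after setting a TIF flag so it notices promptly;
 * the flag choice here is only an example.
 *
 *	set_tsk_thread_flag(p, TIF_SIGPENDING);
 *	kick_process(p);
 */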
2476
2477 /*
2478 * ->cpus_ptr is protected by both rq->lock and p->pi_lock
2479 *
2480 * A few notes on cpu_active vs cpu_online:
2481 *
2482 * - cpu_active must be a subset of cpu_online
2483 *
2484 * - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
2485 * see __set_cpus_allowed_ptr(). At this point the newly online
2486 * CPU isn't yet part of the sched domains, and balancing will not
2487 * see it.
2488 *
2489 * - on CPU-down we clear cpu_active() to mask the sched domains and
2490 * avoid the load balancer to place new tasks on the to be removed
2491 * CPU. Existing tasks will remain running there and will be taken
2492 * off.
2493 *
2494 * This means that fallback selection must not select !active CPUs,
2495 * and it can assume that any active CPU must be online. Conversely,
2496 * select_task_rq() below may allow selection of !active CPUs in order
2497 * to satisfy the above rules.
2498 */
2499 #ifdef CONFIG_CPU_ISOLATION_OPT
2500 static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
2501 #else
2502 static int select_fallback_rq(int cpu, struct task_struct *p)
2503 #endif
2504 {
2505 int nid = cpu_to_node(cpu);
2506 const struct cpumask *nodemask = NULL;
2507 enum { cpuset, possible, fail, bug } state = cpuset;
2508 int dest_cpu;
2509 #ifdef CONFIG_CPU_ISOLATION_OPT
2510 int isolated_candidate = -1;
2511 #endif
2512
2513 /*
2514 * If the node that the CPU is on has been offlined, cpu_to_node()
2515 * will return -1. There is no CPU on the node, and we should
2516 * select the CPU on the other node.
2517 */
2518 if (nid != -1) {
2519 nodemask = cpumask_of_node(nid);
2520
2521 /* Look for allowed, online CPU in same node. */
2522 for_each_cpu(dest_cpu, nodemask)
2523 {
2524 if (!cpu_active(dest_cpu)) {
2525 continue;
2526 }
2527 if (cpu_isolated(dest_cpu)) {
2528 continue;
2529 }
2530 if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
2531 return dest_cpu;
2532 }
2533 }
2534 }
2535
2536 for (;;) {
2537 /* Any allowed, online CPU? */
2538 for_each_cpu(dest_cpu, p->cpus_ptr)
2539 {
2540 if (!is_cpu_allowed(p, dest_cpu)) {
2541 continue;
2542 }
2543 #ifdef CONFIG_CPU_ISOLATION_OPT
2544 if (cpu_isolated(dest_cpu)) {
2545 if (allow_iso) {
2546 isolated_candidate = dest_cpu;
2547 }
2548 continue;
2549 }
2550 goto out;
2551 }
2552
2553 if (isolated_candidate != -1) {
2554 dest_cpu = isolated_candidate;
2555 #endif
2556 goto out;
2557 }
2558
2559 /* No more Mr. Nice Guy. */
2560 switch (state) {
2561 case cpuset:
2562 if (IS_ENABLED(CONFIG_CPUSETS)) {
2563 cpuset_cpus_allowed_fallback(p);
2564 state = possible;
2565 break;
2566 }
2567 fallthrough;
2568 case possible:
2569 do_set_cpus_allowed(p, task_cpu_possible_mask(p));
2570 state = fail;
2571 break;
2572 case fail:
2573 #ifdef CONFIG_CPU_ISOLATION_OPT
2574 allow_iso = true;
2575 state = bug;
2576 break;
2577 #else
2578 #endif
2579
2580 case bug:
2581 BUG();
2582 break;
2583 }
2584 }
2585
2586 out:
2587 if (state != cpuset) {
2588 /*
2589 * Don't tell them about moving exiting tasks or
2590 * kernel threads (both mm NULL), since they never
2591 * leave kernel.
2592 */
2593 if (p->mm && printk_ratelimit()) {
2594 printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2595 task_pid_nr(p), p->comm, cpu);
2596 }
2597 }
2598
2599 return dest_cpu;
2600 }
2601
2602 /*
2603 * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
2604 */
2605 static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags,
2606 int wake_flags)
2607 {
2608 #ifdef CONFIG_CPU_ISOLATION_OPT
2609 bool allow_isolated = (p->flags & PF_KTHREAD);
2610 #endif
2611
2612 lockdep_assert_held(&p->pi_lock);
2613
2614 if (p->nr_cpus_allowed > 1) {
2615 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
2616 } else {
2617 cpu = cpumask_any(p->cpus_ptr);
2618 }
2619
2620 /*
2621 * In order not to call set_task_cpu() on a blocking task we need
2622 * to rely on ttwu() to place the task on a valid ->cpus_ptr
2623 * CPU.
2624 *
2625 * Since this is common to all placement strategies, this lives here.
2626 *
2627 * [ this allows ->select_task() to simply return task_cpu(p) and
2628 * not worry about this generic constraint ]
2629 */
2630 #ifdef CONFIG_CPU_ISOLATION_OPT
2631 if (unlikely(!is_cpu_allowed(p, cpu)) ||
2632 (cpu_isolated(cpu) && !allow_isolated)) {
2633 cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
2634 }
2635 #else
2636 if (unlikely(!is_cpu_allowed(p, cpu))) {
2637 cpu = select_fallback_rq(task_cpu(p), p);
2638 }
2639 #endif
2640
2641 return cpu;
2642 }
2643
2644 void sched_set_stop_task(int cpu, struct task_struct *stop)
2645 {
2646 struct sched_param param = {.sched_priority = MAX_RT_PRIO - 1};
2647 struct task_struct *old_stop = cpu_rq(cpu)->stop;
2648
2649 if (stop) {
2650 /*
2651 * Make it appear like a SCHED_FIFO task, it's something
2652 * userspace knows about and won't get confused about.
2653 *
2654 * Also, it will make PI more or less work without too
2655 * much confusion -- but then, stop work should not
2656 * rely on PI working anyway.
2657 */
2658 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2659
2660 stop->sched_class = &stop_sched_class;
2661 }
2662
2663 cpu_rq(cpu)->stop = stop;
2664
2665 if (old_stop) {
2666 /*
2667 * Reset it back to a normal scheduling class so that
2668 * it can die in pieces.
2669 */
2670 old_stop->sched_class = &rt_sched_class;
2671 }
2672 }
2673
2674 #else
2675
2676 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2677 const struct cpumask *new_mask,
2678 bool check)
2679 {
2680 return set_cpus_allowed_ptr(p, new_mask);
2681 }
2682
2683 #endif /* CONFIG_SMP */
2684
2685 static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2686 {
2687 struct rq *rq;
2688
2689 if (!schedstat_enabled()) {
2690 return;
2691 }
2692
2693 rq = this_rq();
2694 #ifdef CONFIG_SMP
2695 if (cpu == rq->cpu) {
2696 __schedstat_inc(rq->ttwu_local);
2697 __schedstat_inc(p->se.statistics.nr_wakeups_local);
2698 } else {
2699 struct sched_domain *sd;
2700
2701 __schedstat_inc(p->se.statistics.nr_wakeups_remote);
2702 rcu_read_lock();
2703 for_each_domain(rq->cpu, sd)
2704 {
2705 if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2706 __schedstat_inc(sd->ttwu_wake_remote);
2707 break;
2708 }
2709 }
2710 rcu_read_unlock();
2711 }
2712
2713 if (wake_flags & WF_MIGRATED) {
2714 __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
2715 }
2716 #endif /* CONFIG_SMP */
2717
2718 __schedstat_inc(rq->ttwu_count);
2719 __schedstat_inc(p->se.statistics.nr_wakeups);
2720
2721 if (wake_flags & WF_SYNC) {
2722 __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2723 }
2724 }
2725
2726 /*
2727 * Mark the task runnable and perform wakeup-preemption.
2728 */
2729 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
2730 struct rq_flags *rf)
2731 {
2732 check_preempt_curr(rq, p, wake_flags);
2733 p->state = TASK_RUNNING;
2734 trace_sched_wakeup(p);
2735
2736 #ifdef CONFIG_SMP
2737 if (p->sched_class->task_woken) {
2738 /*
2739 * Our task @p is fully woken up and running; so it's safe to
2740 * drop the rq->lock, hereafter rq is only used for statistics.
2741 */
2742 rq_unpin_lock(rq, rf);
2743 p->sched_class->task_woken(rq, p);
2744 rq_repin_lock(rq, rf);
2745 }
2746
2747 if (rq->idle_stamp) {
2748 u64 delta = rq_clock(rq) - rq->idle_stamp;
2749 u64 max = 2 * rq->max_idle_balance_cost;
2750
2751 update_avg(&rq->avg_idle, delta);
2752
2753 if (rq->avg_idle > max) {
2754 rq->avg_idle = max;
2755 }
2756
2757 rq->idle_stamp = 0;
2758 }
2759 #endif
2760 }
2761
2762 static void ttwu_do_activate(struct rq *rq, struct task_struct *p,
2763 int wake_flags, struct rq_flags *rf)
2764 {
2765 int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
2766
2767 if (wake_flags & WF_SYNC) {
2768 en_flags |= ENQUEUE_WAKEUP_SYNC;
2769 }
2770
2771 lockdep_assert_held(&rq->lock);
2772
2773 if (p->sched_contributes_to_load) {
2774 rq->nr_uninterruptible--;
2775 }
2776
2777 #ifdef CONFIG_SMP
2778 if (wake_flags & WF_MIGRATED) {
2779 en_flags |= ENQUEUE_MIGRATED;
2780 } else
2781 #endif
2782 if (p->in_iowait) {
2783 delayacct_blkio_end(p);
2784 atomic_dec(&task_rq(p)->nr_iowait);
2785 }
2786
2787 activate_task(rq, p, en_flags);
2788 ttwu_do_wakeup(rq, p, wake_flags, rf);
2789 }
2790
2791 /*
2792 * Consider @p being inside a wait loop:
2793 *
2794 * for (;;) {
2795 * set_current_state(TASK_UNINTERRUPTIBLE);
2796 *
2797 * if (CONDITION)
2798 * break;
2799 *
2800 * schedule();
2801 * }
2802 * __set_current_state(TASK_RUNNING);
2803 *
2804 * between set_current_state() and schedule(). In this case @p is still
2805 * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2806 * an atomic manner.
2807 *
2808 * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2809 * then schedule() must still happen and p->state can be changed to
2810 * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2811 * need to do a full wakeup with enqueue.
2812 *
2813 * Returns: %true when the wakeup is done,
2814 * %false otherwise.
2815 */
2816 static int ttwu_runnable(struct task_struct *p, int wake_flags)
2817 {
2818 struct rq_flags rf;
2819 struct rq *rq;
2820 int ret = 0;
2821
2822 rq = __task_rq_lock(p, &rf);
2823 if (task_on_rq_queued(p)) {
2824 /* check_preempt_curr() may use rq clock */
2825 update_rq_clock(rq);
2826 ttwu_do_wakeup(rq, p, wake_flags, &rf);
2827 ret = 1;
2828 }
2829 __task_rq_unlock(rq, &rf);
2830
2831 return ret;
2832 }
2833
2834 #ifdef CONFIG_SMP
2835 void sched_ttwu_pending(void *arg)
2836 {
2837 struct llist_node *llist = arg;
2838 struct rq *rq = this_rq();
2839 struct task_struct *p, *t;
2840 struct rq_flags rf;
2841
2842 if (!llist) {
2843 return;
2844 }
2845
2846 /*
2847 * rq::ttwu_pending is a racy indication of outstanding wakeups.
2848 * Races such that false-negatives are possible, since they
2849 * are shorter lived than false-positives would be.
2850 */
2851 WRITE_ONCE(rq->ttwu_pending, 0);
2852
2853 rq_lock_irqsave(rq, &rf);
2854 update_rq_clock(rq);
2855
2856 llist_for_each_entry_safe(p, t, llist, wake_entry.llist)
2857 {
2858 if (WARN_ON_ONCE(p->on_cpu)) {
2859 smp_cond_load_acquire(&p->on_cpu, !VAL);
2860 }
2861
2862 if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) {
2863 set_task_cpu(p, cpu_of(rq));
2864 }
2865
2866 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2867 }
2868
2869 rq_unlock_irqrestore(rq, &rf);
2870 }
2871
2872 void send_call_function_single_ipi(int cpu)
2873 {
2874 struct rq *rq = cpu_rq(cpu);
2875
2876 if (!set_nr_if_polling(rq->idle)) {
2877 arch_send_call_function_single_ipi(cpu);
2878 } else {
2879 trace_sched_wake_idle_without_ipi(cpu);
2880 }
2881 }
2882
2883 /*
2884 * Queue a task on the target CPU's wake_list and wake the CPU via IPI if
2885 * necessary. The wakee CPU, on receipt of the IPI, will queue the task
2886 * via sched_ttwu_pending() for activation so the wakee incurs the cost
2887 * of the wakeup instead of the waker.
2888 */
2889 static void __ttwu_queue_wakelist(struct task_struct *p, int cpu,
2890 int wake_flags)
2891 {
2892 struct rq *rq = cpu_rq(cpu);
2893
2894 p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
2895
2896 WRITE_ONCE(rq->ttwu_pending, 1);
2897 __smp_call_single_queue(cpu, &p->wake_entry.llist);
2898 }
2899
2900 void wake_up_if_idle(int cpu)
2901 {
2902 struct rq *rq = cpu_rq(cpu);
2903 struct rq_flags rf;
2904
2905 rcu_read_lock();
2906
2907 if (!is_idle_task(rcu_dereference(rq->curr))) {
2908 goto out;
2909 }
2910
2911 if (set_nr_if_polling(rq->idle)) {
2912 trace_sched_wake_idle_without_ipi(cpu);
2913 } else {
2914 rq_lock_irqsave(rq, &rf);
2915 if (is_idle_task(rq->curr)) {
2916 smp_send_reschedule(cpu);
2917 }
2918 /* Else CPU is not idle, do nothing here: */
2919 rq_unlock_irqrestore(rq, &rf);
2920 }
2921
2922 out:
2923 rcu_read_unlock();
2924 }
2925
2926 bool cpus_share_cache(int this_cpu, int that_cpu)
2927 {
2928 if (this_cpu == that_cpu) {
2929 return true;
2930 }
2931
2932 return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
2933 }
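/*
 * Illustrative sketch (not part of this file): callers typically use
 * this to choose between touching a remote runqueue directly (same LLC,
 * cheap) and deferring the work via IPI; ttwu_queue_cond() below is the
 * in-tree example of exactly that decision. The helpers here are
 * placeholders.
 *
 *	if (cpus_share_cache(smp_processor_id(), cpu))
 *		do_it_locally();
 *	else
 *		queue_it_remotely();
 */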
2934
2935 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2936 {
2937 /*
2938 * If the CPU does not share cache, then queue the task on the
2939 * remote rq's wakelist to avoid accessing remote data.
2940 */
2941 if (!cpus_share_cache(smp_processor_id(), cpu)) {
2942 return true;
2943 }
2944
2945 /*
2946 * If the task is descheduling and the only running task on the
2947 * CPU then use the wakelist to offload the task activation to
2948 * the soon-to-be-idle CPU as the current CPU is likely busy.
2949 * nr_running is checked to avoid unnecessary task stacking.
2950 */
2951 if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) {
2952 return true;
2953 }
2954
2955 return false;
2956 }
2957
2958 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2959 {
2960 if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
2961 if (WARN_ON_ONCE(cpu == smp_processor_id())) {
2962 return false;
2963 }
2964
2965 sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2966 __ttwu_queue_wakelist(p, cpu, wake_flags);
2967 return true;
2968 }
2969
2970 return false;
2971 }
2972
2973 #else /* !CONFIG_SMP */
2974
2975 static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu,
2976 int wake_flags)
2977 {
2978 return false;
2979 }
2980
2981 #endif /* CONFIG_SMP */
2982
2983 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
2984 {
2985 struct rq *rq = cpu_rq(cpu);
2986 struct rq_flags rf;
2987
2988 if (ttwu_queue_wakelist(p, cpu, wake_flags)) {
2989 return;
2990 }
2991
2992 rq_lock(rq, &rf);
2993 update_rq_clock(rq);
2994 ttwu_do_activate(rq, p, wake_flags, &rf);
2995 rq_unlock(rq, &rf);
2996 }
2997
2998 /*
2999 * Notes on Program-Order guarantees on SMP systems.
3000 *
3001 * MIGRATION
3002 *
3003 * The basic program-order guarantee on SMP systems is that when a task [t]
3004 * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3005 * execution on its new CPU [c1].
3006 *
3007 * For migration (of runnable tasks) this is provided by the following means:
3008 *
3009 * A) UNLOCK of the rq(c0)->lock scheduling out task t
3010 * B) migration for t is required to synchronize *both* rq(c0)->lock and
3011 * rq(c1)->lock (if not at the same time, then in that order).
3012 * C) LOCK of the rq(c1)->lock scheduling in task
3013 *
3014 * Release/acquire chaining guarantees that B happens after A and C after B.
3015 * Note: the CPU doing B need not be c0 or c1
3016 *
3017 * Example:
3018 *
3019 * CPU0 CPU1 CPU2
3020 *
3021 * LOCK rq(0)->lock
3022 * sched-out X
3023 * sched-in Y
3024 * UNLOCK rq(0)->lock
3025 *
3026 * LOCK rq(0)->lock // orders against CPU0
3027 * dequeue X
3028 * UNLOCK rq(0)->lock
3029 *
3030 * LOCK rq(1)->lock
3031 * enqueue X
3032 * UNLOCK rq(1)->lock
3033 *
3034 * LOCK rq(1)->lock // orders against CPU2
3035 * sched-out Z
3036 * sched-in X
3037 * UNLOCK rq(1)->lock
3038 *
3039 *
3040 * BLOCKING -- aka. SLEEP + WAKEUP
3041 *
3042 * For blocking we (obviously) need to provide the same guarantee as for
3043 * migration. However the means are completely different as there is no lock
3044 * chain to provide order. Instead we do:
3045 *
3046 * 1) smp_store_release(X->on_cpu, 0) -- finish_task()
3047 * 2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3048 *
3049 * Example:
3050 *
3051 * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule)
3052 *
3053 * LOCK rq(0)->lock LOCK X->pi_lock
3054 * dequeue X
3055 * sched-out X
3056 * smp_store_release(X->on_cpu, 0);
3057 *
3058 * smp_cond_load_acquire(&X->on_cpu, !VAL);
3059 * X->state = WAKING
3060 * set_task_cpu(X,2)
3061 *
3062 * LOCK rq(2)->lock
3063 * enqueue X
3064 * X->state = RUNNING
3065 * UNLOCK rq(2)->lock
3066 *
3067 * LOCK rq(2)->lock // orders against
3068 * CPU1 sched-out Z sched-in X UNLOCK rq(2)->lock
3069 *
3070 * UNLOCK X->pi_lock
3071 * UNLOCK rq(0)->lock
3072 *
3073 *
3074 * However, for wakeups there is a second guarantee we must provide, namely we
3075 * must ensure that CONDITION=1 done by the caller can not be reordered with
3076 * accesses to the task state; see try_to_wake_up() and set_current_state().
3077 */
3078
3079 #ifdef CONFIG_SMP
3080 #ifdef CONFIG_SCHED_WALT
3081 /* utility function to update walt signals at wakeup */
3082 static inline void walt_try_to_wake_up(struct task_struct *p)
3083 {
3084 struct rq *rq = cpu_rq(task_cpu(p));
3085 struct rq_flags rf;
3086 u64 wallclock;
3087
3088 rq_lock_irqsave(rq, &rf);
3089 wallclock = sched_ktime_clock();
3090 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
3091 update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
3092 rq_unlock_irqrestore(rq, &rf);
3093 }
3094 #else
3095 #define walt_try_to_wake_up(a) \
3096 { \
3097 }
3098 #endif
3099 #endif
3100
3101 /**
3102 * try_to_wake_up - wake up a thread
3103 * @p: the thread to be awakened
3104 * @state: the mask of task states that can be woken
3105 * @wake_flags: wake modifier flags (WF_*)
3106 *
3107 * Conceptually does
3108 *
3109 * If (@state & @p->state) @p->state = TASK_RUNNING.
3110 *
3111 * If the task was not queued/runnable, also place it back on a runqueue.
3112 *
3113 * This function is atomic against schedule() which would dequeue the task.
3114 *
3115 * It issues a full memory barrier before accessing @p->state, see the comment
3116 * with set_current_state().
3117 *
3118 * Uses p->pi_lock to serialize against concurrent wake-ups.
3119 *
3120 * Relies on p->pi_lock stabilizing:
3121 * - p->sched_class
3122 * - p->cpus_ptr
3123 * - p->sched_task_group
3124 * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3125 *
3126 * Tries really hard to only take one task_rq(p)->lock for performance.
3127 * Takes rq->lock in:
3128 * - ttwu_runnable() -- old rq, unavoidable, see comment there;
3129 * - ttwu_queue() -- new rq, for enqueue of the task;
3130 * - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3131 *
3132 * As a consequence we race really badly with just about everything. See the
3133 * many memory barriers and their comments for details.
3134 *
3135 * Return: %true if @p->state changes (an actual wakeup was done),
3136 * %false otherwise.
3137 */
3138 static int try_to_wake_up(struct task_struct *p, unsigned int state,
3139 int wake_flags)
3140 {
3141 unsigned long flags;
3142 int cpu, success = 0;
3143
3144 preempt_disable();
3145 if (p == current) {
3146 /*
3147 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3148 * == smp_processor_id()'. Together this means we can special
3149 * case the whole 'p->on_rq && ttwu_runnable()' case below
3150 * without taking any locks.
3151 *
3152 * In particular:
3153 * - we rely on Program-Order guarantees for all the ordering,
3154 * - we're serialized against set_special_state() by virtue of
3155 * it disabling IRQs (this allows not taking ->pi_lock).
3156 */
3157 if (!(p->state & state)) {
3158 goto out;
3159 }
3160
3161 success = 1;
3162 trace_sched_waking(p);
3163 p->state = TASK_RUNNING;
3164 trace_sched_wakeup(p);
3165 goto out;
3166 }
3167
3168 /*
3169 * If we are going to wake up a thread waiting for CONDITION we
3170 * need to ensure that CONDITION=1 done by the caller can not be
3171 * reordered with p->state check below. This pairs with smp_store_mb()
3172 * in set_current_state() that the waiting thread does.
3173 */
3174 raw_spin_lock_irqsave(&p->pi_lock, flags);
3175 smp_mb__after_spinlock();
3176 if (!(p->state & state)) {
3177 goto unlock;
3178 }
3179
3180 #ifdef CONFIG_FREEZER
3181 /*
3182 * If we're going to wake up a thread which may be frozen, then
3183 * we can only do so if we have an active CPU which is capable of
3184 * running it. This may not be the case when resuming from suspend,
3185 * as the secondary CPUs may not yet be back online. See __thaw_task()
3186 * for the actual wakeup.
3187 */
3188 if (unlikely(frozen_or_skipped(p)) &&
3189 !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) {
3190 goto unlock;
3191 }
3192 #endif
3193
3194 trace_sched_waking(p);
3195
3196 /* We're going to change ->state: */
3197 success = 1;
3198
3199 /*
3200 * Ensure we load p->on_rq _after_ p->state, otherwise it would
3201 * be possible to, falsely, observe p->on_rq == 0 and get stuck
3202 * in smp_cond_load_acquire() below.
3203 *
3204 * sched_ttwu_pending() try_to_wake_up()
3205 * STORE p->on_rq = 1 LOAD p->state
3206 * UNLOCK rq->lock
3207 *
3208 * __schedule() (switch to task 'p')
3209 * LOCK rq->lock smp_rmb();
3210 * smp_mb__after_spinlock();
3211 * UNLOCK rq->lock
3212 *
3213 * [task p]
3214 * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq
3215 *
3216 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3217 * __schedule(). See the comment for smp_mb__after_spinlock().
3218 *
3219 * A similar smp_rmb() lives in try_invoke_on_locked_down_task().
3220 */
3221 smp_rmb();
3222 if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) {
3223 goto unlock;
3224 }
3225
3226 #ifdef CONFIG_SMP
3227 /*
3228 * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3229 * possible to, falsely, observe p->on_cpu == 0.
3230 *
3231 * One must be running (->on_cpu == 1) in order to remove oneself
3232 * from the runqueue.
3233 *
3234 * __schedule() (switch to task 'p') try_to_wake_up()
3235 * STORE p->on_cpu = 1 LOAD p->on_rq
3236 * UNLOCK rq->lock
3237 *
3238 * __schedule() (put 'p' to sleep)
3239 * LOCK rq->lock smp_rmb();
3240 * smp_mb__after_spinlock();
3241 * STORE p->on_rq = 0 LOAD p->on_cpu
3242 *
3243 * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3244 * __schedule(). See the comment for smp_mb__after_spinlock().
3245 *
3246 * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3247 * schedule()'s deactivate_task() has 'happened' and p will no longer
3248 * care about its own p->state. See the comment in __schedule().
3249 */
3250 smp_acquire__after_ctrl_dep();
3251
3252 walt_try_to_wake_up(p);
3253
3254 /*
3255 * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3256 * == 0), which means we need to do an enqueue, change p->state to
3257 * TASK_WAKING such that we can unlock p->pi_lock before doing the
3258 * enqueue, such as ttwu_queue_wakelist().
3259 */
3260 p->state = TASK_WAKING;
3261
3262 /*
3263 * If the owning (remote) CPU is still in the middle of schedule() with
3264 * this task as prev, consider queueing p on the remote CPU's wake_list,
3265 * which potentially sends an IPI instead of spinning on p->on_cpu to
3266 * let the waker make forward progress. This is safe because IRQs are
3267 * disabled and the IPI will deliver after on_cpu is cleared.
3268 *
3269 * Ensure we load task_cpu(p) after p->on_cpu:
3270 *
3271 * set_task_cpu(p, cpu);
3272 * STORE p->cpu = @cpu
3273 * __schedule() (switch to task 'p')
3274 * LOCK rq->lock
3275 * smp_mb__after_spin_lock() smp_cond_load_acquire(&p->on_cpu)
3276 * STORE p->on_cpu = 1 LOAD p->cpu
3277 *
3278 * to ensure we observe the correct CPU on which the task is currently
3279 * scheduling.
3280 */
3281 if (smp_load_acquire(&p->on_cpu) &&
3282 ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) {
3283 goto unlock;
3284 }
3285
3286 /*
3287 * If the owning (remote) CPU is still in the middle of schedule() with
3288 * this task as prev, wait until it's done referencing the task.
3289 *
3290 * Pairs with the smp_store_release() in finish_task().
3291 *
3292 * This ensures that tasks getting woken will be fully ordered against
3293 * their previous state and preserve Program Order.
3294 */
3295 smp_cond_load_acquire(&p->on_cpu, !VAL);
3296
3297 cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
3298 if (task_cpu(p) != cpu) {
3299 if (p->in_iowait) {
3300 delayacct_blkio_end(p);
3301 atomic_dec(&task_rq(p)->nr_iowait);
3302 }
3303
3304 wake_flags |= WF_MIGRATED;
3305 psi_ttwu_dequeue(p);
3306 set_task_cpu(p, cpu);
3307 }
3308 #else
3309 cpu = task_cpu(p);
3310 #endif /* CONFIG_SMP */
3311
3312 ttwu_queue(p, cpu, wake_flags);
3313 unlock:
3314 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3315 out:
3316 if (success) {
3317 ttwu_stat(p, task_cpu(p), wake_flags);
3318 }
3319 preempt_enable();
3320
3321 return success;
3322 }
3323
3324 /**
3325 * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3326 * @p: Process for which the function is to be invoked, can be @current.
3327 * @func: Function to invoke.
3328 * @arg: Argument to function.
3329 *
3330 * If the specified task can be quickly locked into a definite state
3331 * (either sleeping or on a given runqueue), arrange to keep it in that
3332 * state while invoking @func(@arg). This function can use ->on_rq and
3333 * task_curr() to work out what the state is, if required. Given that
3334 * @func can be invoked with a runqueue lock held, it had better be quite
3335 * lightweight.
3336 *
3337 * Returns:
3338 * @false if the task slipped out from under the locks.
3339 * @true if the task was locked onto a runqueue or is sleeping.
3340 * However, @func can override this by returning @false.
3341 */
3342 bool try_invoke_on_locked_down_task(struct task_struct *p,
3343 bool (*func)(struct task_struct *t,
3344 void *arg),
3345 void *arg)
3346 {
3347 struct rq_flags rf;
3348 bool ret = false;
3349 struct rq *rq;
3350
3351 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3352 if (p->on_rq) {
3353 rq = __task_rq_lock(p, &rf);
3354 if (task_rq(p) == rq) {
3355 ret = func(p, arg);
3356 }
3357 rq_unlock(rq, &rf);
3358 } else {
3359 switch (p->state) {
3360 case TASK_RUNNING:
3361 case TASK_WAKING:
3362 break;
3363 default:
3364 smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3365 if (!p->on_rq) {
3366 ret = func(p, arg);
3367 }
3368 }
3369 }
3370 raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3371 return ret;
3372 }
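/*
 * Illustrative sketch (not part of this file): @func must match
 * bool (*)(struct task_struct *, void *) and stay lightweight, since it
 * may be invoked under a runqueue lock; the names below are examples
 * only.
 *
 *	static bool snapshot_cpu(struct task_struct *t, void *arg)
 *	{
 *		*(int *)arg = task_cpu(t);
 *		return true;
 *	}
 *
 *	int cpu;
 *
 *	if (try_invoke_on_locked_down_task(p, snapshot_cpu, &cpu))
 *		use(cpu);	// taken while @p was pinned in place
 */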
3373
3374 /**
3375 * wake_up_process - Wake up a specific process
3376 * @p: The process to be woken up.
3377 *
3378 * Attempt to wake up the nominated process and move it to the set of runnable
3379 * processes.
3380 *
3381 * Return: 1 if the process was woken up, 0 if it was already running.
3382 *
3383 * This function executes a full memory barrier before accessing the task state.
3384 */
3385 int wake_up_process(struct task_struct *p)
3386 {
3387 return try_to_wake_up(p, TASK_NORMAL, 0);
3388 }
3389 EXPORT_SYMBOL(wake_up_process);
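/*
 * Illustrative sketch (not part of this file): the canonical pairing
 * with the wait loop documented above ttwu_runnable(). The waker must
 * publish the condition before waking:
 *
 *	CONDITION = 1;
 *	wake_up_process(waiter);
 *
 * The full memory barrier issued by try_to_wake_up() pairs with the
 * smp_store_mb() in the waiter's set_current_state().
 */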
3390
3391 int wake_up_state(struct task_struct *p, unsigned int state)
3392 {
3393 return try_to_wake_up(p, state, 0);
3394 }
3395
3396 /*
3397 * Perform scheduler related setup for a newly forked process p.
3398 * p is forked by current.
3399 *
3400 * __sched_fork() is basic setup used by init_idle() too:
3401 */
3402 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3403 {
3404 p->on_rq = 0;
3405
3406 p->se.on_rq = 0;
3407 p->se.exec_start = 0;
3408 p->se.sum_exec_runtime = 0;
3409 p->se.prev_sum_exec_runtime = 0;
3410 p->se.nr_migrations = 0;
3411 p->se.vruntime = 0;
3412 INIT_LIST_HEAD(&p->se.group_node);
3413
3414 #ifdef CONFIG_FAIR_GROUP_SCHED
3415 p->se.cfs_rq = NULL;
3416 #endif
3417
3418 #ifdef CONFIG_SCHEDSTATS
3419 /* Even if schedstat is disabled, there should not be garbage */
3420 memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3421 #endif
3422
3423 RB_CLEAR_NODE(&p->dl.rb_node);
3424 init_dl_task_timer(&p->dl);
3425 init_dl_inactive_task_timer(&p->dl);
3426 __dl_clear_params(p);
3427
3428 INIT_LIST_HEAD(&p->rt.run_list);
3429 p->rt.timeout = 0;
3430 p->rt.time_slice = sched_rr_timeslice;
3431 p->rt.on_rq = 0;
3432 p->rt.on_list = 0;
3433
3434 #ifdef CONFIG_PREEMPT_NOTIFIERS
3435 INIT_HLIST_HEAD(&p->preempt_notifiers);
3436 #endif
3437
3438 #ifdef CONFIG_COMPACTION
3439 p->capture_control = NULL;
3440 #endif
3441 init_numa_balancing(clone_flags, p);
3442 #ifdef CONFIG_SMP
3443 p->wake_entry.u_flags = CSD_TYPE_TTWU;
3444 #endif
3445 #ifdef CONFIG_SCHED_RTG
3446 p->rtg_depth = 0;
3447 #endif
3448 }
3449
3450 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3451
3452 #ifdef CONFIG_NUMA_BALANCING
3453
3454 void set_numabalancing_state(bool enabled)
3455 {
3456 if (enabled) {
3457 static_branch_enable(&sched_numa_balancing);
3458 } else {
3459 static_branch_disable(&sched_numa_balancing);
3460 }
3461 }
3462
3463 #ifdef CONFIG_PROC_SYSCTL
3464 int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
3465 size_t *lenp, loff_t *ppos)
3466 {
3467 struct ctl_table t;
3468 int err;
3469 int state = static_branch_likely(&sched_numa_balancing);
3470
3471 if (write && !capable(CAP_SYS_ADMIN)) {
3472 return -EPERM;
3473 }
3474
3475 t = *table;
3476 t.data = &state;
3477 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3478 if (err < 0) {
3479 return err;
3480 }
3481 if (write) {
3482 set_numabalancing_state(state);
3483 }
3484 return err;
3485 }
3486 #endif
3487 #endif
3488
3489 #ifdef CONFIG_SCHEDSTATS
3490
3491 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3492 static bool __initdata __sched_schedstats = false;
3493
3494 static void set_schedstats(bool enabled)
3495 {
3496 if (enabled) {
3497 static_branch_enable(&sched_schedstats);
3498 } else {
3499 static_branch_disable(&sched_schedstats);
3500 }
3501 }
3502
3503 void force_schedstat_enabled(void)
3504 {
3505 if (!schedstat_enabled()) {
3506 pr_info("kernel profiling enabled schedstats, disable via "
3507 "kernel.sched_schedstats.\n");
3508 static_branch_enable(&sched_schedstats);
3509 }
3510 }
3511
3512 static int __init setup_schedstats(char *str)
3513 {
3514 int ret = 0;
3515 if (!str) {
3516 goto out;
3517 }
3518
3519 /*
3520 * This code is called before jump labels have been set up, so we can't
3521 * change the static branch directly just yet. Instead set a temporary
3522 * variable so init_schedstats() can do it later.
3523 */
3524 if (!strcmp(str, "enable")) {
3525 __sched_schedstats = true;
3526 ret = 1;
3527 } else if (!strcmp(str, "disable")) {
3528 __sched_schedstats = false;
3529 ret = 1;
3530 }
3531 out:
3532 if (!ret) {
3533 pr_warn("Unable to parse schedstats=\n");
3534 }
3535
3536 return ret;
3537 }
3538 __setup("schedstats=", setup_schedstats);
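/*
 * Usage example for the boot parameter handled above, on the kernel
 * command line:
 *
 *	schedstats=enable
 *
 * The static branch itself is only flipped later, from init_schedstats().
 */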
3539
3540 static void __init init_schedstats(void)
3541 {
3542 set_schedstats(__sched_schedstats);
3543 }
3544
3545 #ifdef CONFIG_PROC_SYSCTL
3546 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3547 size_t *lenp, loff_t *ppos)
3548 {
3549 struct ctl_table t;
3550 int err;
3551 int state = static_branch_likely(&sched_schedstats);
3552
3553 if (write && !capable(CAP_SYS_ADMIN)) {
3554 return -EPERM;
3555 }
3556
3557 t = *table;
3558 t.data = &state;
3559 err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3560 if (err < 0) {
3561 return err;
3562 }
3563 if (write) {
3564 set_schedstats(state);
3565 }
3566 return err;
3567 }
3568 #endif /* CONFIG_PROC_SYSCTL */
3569 #else /* !CONFIG_SCHEDSTATS */
3570 static inline void init_schedstats(void)
3571 {
3572 }
3573 #endif /* CONFIG_SCHEDSTATS */
3574
3575 /*
3576 * fork()/clone()-time setup
3577 */
3578 int sched_fork(unsigned long clone_flags, struct task_struct *p)
3579 {
3580 init_new_task_load(p);
3581 __sched_fork(clone_flags, p);
3582 /*
3583 * We mark the process as NEW here. This guarantees that
3584 * nobody will actually run it, and a signal or other external
3585 * event cannot wake it up and insert it on the runqueue either.
3586 */
3587 p->state = TASK_NEW;
3588
3589 /*
3590 * Make sure we do not leak PI boosting priority to the child.
3591 */
3592 p->prio = current->normal_prio;
3593
3594 #ifdef CONFIG_SCHED_LATENCY_NICE
3595 /* Propagate the parent's latency requirements to the child as well */
3596 p->latency_prio = current->latency_prio;
3597 #endif
3598
3599 uclamp_fork(p);
3600
3601 /*
3602 * Revert to default priority/policy on fork if requested.
3603 */
3604 if (unlikely(p->sched_reset_on_fork)) {
3605 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3606 p->policy = SCHED_NORMAL;
3607 #ifdef CONFIG_SCHED_RTG
3608 if (current->rtg_depth != 0) {
3609 p->static_prio = current->static_prio;
3610 } else {
3611 p->static_prio = NICE_TO_PRIO(0);
3612 }
3613 #else
3614 p->static_prio = NICE_TO_PRIO(0);
3615 #endif
3616 p->rt_priority = 0;
3617 } else if (PRIO_TO_NICE(p->static_prio) < 0) {
3618 p->static_prio = NICE_TO_PRIO(0);
3619 }
3620
3621 p->prio = p->normal_prio = p->static_prio;
3622 set_load_weight(p);
3623
3624 #ifdef CONFIG_SCHED_LATENCY_NICE
3625 p->latency_prio = NICE_TO_LATENCY(0);
3626 set_latency_weight(p);
3627 #endif
3628
3629 /*
3630 * We don't need the reset flag anymore after the fork. It has
3631 * fulfilled its duty:
3632 */
3633 p->sched_reset_on_fork = 0;
3634 }
3635
3636 if (dl_prio(p->prio)) {
3637 return -EAGAIN;
3638 } else if (rt_prio(p->prio)) {
3639 p->sched_class = &rt_sched_class;
3640 } else {
3641 p->sched_class = &fair_sched_class;
3642 }
3643
3644 init_entity_runnable_average(&p->se);
3645
3646 #ifdef CONFIG_SCHED_INFO
3647 if (likely(sched_info_on())) {
3648 memset(&p->sched_info, 0, sizeof(p->sched_info));
3649 }
3650 #endif
3651 #if defined(CONFIG_SMP)
3652 p->on_cpu = 0;
3653 #endif
3654 init_task_preempt_count(p);
3655 #ifdef CONFIG_SMP
3656 plist_node_init(&p->pushable_tasks, MAX_PRIO);
3657 RB_CLEAR_NODE(&p->pushable_dl_tasks);
3658 #endif
3659 return 0;
3660 }
3661
3662 void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3663 {
3664 unsigned long flags;
3665 #ifdef CONFIG_CGROUP_SCHED
3666 struct task_group *tg;
3667 #endif
3668
3669 raw_spin_lock_irqsave(&p->pi_lock, flags);
3670 #ifdef CONFIG_CGROUP_SCHED
3671 tg = container_of(kargs->cset->subsys[cpu_cgrp_id], struct task_group, css);
3672 p->sched_task_group = autogroup_task_group(p, tg);
3673 #endif
3674 rseq_migrate(p);
3675 /*
3676 * We're setting the CPU for the first time, we don't migrate,
3677 * so use __set_task_cpu().
3678 */
3679 __set_task_cpu(p, smp_processor_id());
3680 if (p->sched_class->task_fork) {
3681 p->sched_class->task_fork(p);
3682 }
3683 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3684
3685 uclamp_post_fork(p);
3686 }
3687
3688 unsigned long to_ratio(u64 period, u64 runtime)
3689 {
3690 if (runtime == RUNTIME_INF) {
3691 return BW_UNIT;
3692 }
3693
3694 /*
3695 * Doing this here saves a lot of checks in all
3696 * the calling paths, and returning zero seems
3697 * safe for them anyway.
3698 */
3699 if (period == 0) {
3700 return 0;
3701 }
3702
3703 return div64_u64(runtime << BW_SHIFT, period);
3704 }
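/*
 * Worked example (a sketch assuming BW_SHIFT = 20, BW_UNIT = 1048576):
 * the default -rt bandwidth of runtime = 950000us in a period of
 * 1000000us maps to
 *
 *	(950000 << 20) / 1000000 = 996147
 *
 * i.e. roughly 0.95 * BW_UNIT.
 */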
3705
3706 /*
3707 * wake_up_new_task - wake up a newly created task for the first time.
3708 *
3709 * This function will do some initial scheduler statistics housekeeping
3710 * that must be done for every newly created context, then puts the task
3711 * on the runqueue and wakes it.
3712 */
3713 void wake_up_new_task(struct task_struct *p)
3714 {
3715 struct rq_flags rf;
3716 struct rq *rq;
3717
3718 raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3719 add_new_task_to_grp(p);
3720
3721 p->state = TASK_RUNNING;
3722 #ifdef CONFIG_SMP
3723 /*
3724 * Fork balancing, do it here and not earlier because:
3725 * - cpus_ptr can change in the fork path
3726 * - any previously selected CPU might disappear through hotplug
3727 *
3728 * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
3729 * as we're not fully set-up yet.
3730 */
3731 p->recent_used_cpu = task_cpu(p);
3732 rseq_migrate(p);
3733 __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
3734 #endif
3735 rq = __task_rq_lock(p, &rf);
3736 update_rq_clock(rq);
3737 post_init_entity_util_avg(p);
3738
3739 mark_task_starting(p);
3740
3741 activate_task(rq, p, ENQUEUE_NOCLOCK);
3742 trace_sched_wakeup_new(p);
3743 check_preempt_curr(rq, p, WF_FORK);
3744 #ifdef CONFIG_SMP
3745 if (p->sched_class->task_woken) {
3746 /*
3747 * Nothing relies on rq->lock after this, so it's fine to
3748 * drop it.
3749 */
3750 rq_unpin_lock(rq, &rf);
3751 p->sched_class->task_woken(rq, p);
3752 rq_repin_lock(rq, &rf);
3753 }
3754 #endif
3755 task_rq_unlock(rq, p, &rf);
3756 }
3757
3758 #ifdef CONFIG_PREEMPT_NOTIFIERS
3759
3760 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3761
3762 void preempt_notifier_inc(void)
3763 {
3764 static_branch_inc(&preempt_notifier_key);
3765 }
3766 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3767
3768 void preempt_notifier_dec(void)
3769 {
3770 static_branch_dec(&preempt_notifier_key);
3771 }
3772 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3773
3774 /**
3775 * preempt_notifier_register - tell me when current is being preempted &
3776 * rescheduled
3777 * @notifier: notifier struct to register
3778 */
3779 void preempt_notifier_register(struct preempt_notifier *notifier)
3780 {
3781 if (!static_branch_unlikely(&preempt_notifier_key)) {
3782 WARN(1, "registering preempt_notifier while notifiers disabled\n");
3783 }
3784
3785 hlist_add_head(&notifier->link, &current->preempt_notifiers);
3786 }
3787 EXPORT_SYMBOL_GPL(preempt_notifier_register);
3788
3789 /**
3790 * preempt_notifier_unregister - no longer interested in preemption
3791 * notifications
3792 * @notifier: notifier struct to unregister
3793 *
3794 * This is *not* safe to call from within a preemption notifier.
3795 */
3796 void preempt_notifier_unregister(struct preempt_notifier *notifier)
3797 {
3798 hlist_del(&notifier->link);
3799 }
3800 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
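/*
 * Minimal usage sketch (hypothetical module code, not part of this file;
 * my_notifier/my_ops/my_sched_in/my_sched_out are made-up names). A user
 * such as KVM enables the static key, fills in struct preempt_ops and
 * registers a notifier for the current task:
 *
 *	static void my_sched_in(struct preempt_notifier *pn, int cpu) { }
 *	static void my_sched_out(struct preempt_notifier *pn,
 *				 struct task_struct *next) { }
 *	static struct preempt_ops my_ops = {
 *		.sched_in  = my_sched_in,
 *		.sched_out = my_sched_out,
 *	};
 *	static struct preempt_notifier my_notifier;
 *
 *	preempt_notifier_inc();
 *	preempt_notifier_init(&my_notifier, &my_ops);
 *	preempt_notifier_register(&my_notifier);
 *
 * The callbacks then fire from the switch path below whenever current is
 * scheduled out or back in.
 */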
3801
3802 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3803 {
3804 struct preempt_notifier *notifier;
3805
3806 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3807 notifier->ops->sched_in(notifier, raw_smp_processor_id());
3808 }
3809
3810 static __always_inline void
3811 fire_sched_in_preempt_notifiers(struct task_struct *curr)
3812 {
3813 if (static_branch_unlikely(&preempt_notifier_key)) {
3814 __fire_sched_in_preempt_notifiers(curr);
3815 }
3816 }
3817
3818 static void __fire_sched_out_preempt_notifiers(struct task_struct *curr,
3819 struct task_struct *next)
3820 {
3821 struct preempt_notifier *notifier;
3822
3823 hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3824 notifier->ops->sched_out(notifier, next);
3825 }
3826
3827 static __always_inline void
3828 fire_sched_out_preempt_notifiers(struct task_struct *curr,
3829 struct task_struct *next)
3830 {
3831 if (static_branch_unlikely(&preempt_notifier_key)) {
3832 __fire_sched_out_preempt_notifiers(curr, next);
3833 }
3834 }
3835
3836 #else /* !CONFIG_PREEMPT_NOTIFIERS */
3837
3838 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3839 {
3840 }
3841
3842 static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr,
3843 struct task_struct *next)
3844 {
3845 }
3846
3847 #endif /* CONFIG_PREEMPT_NOTIFIERS */
3848
3849 static inline void prepare_task(struct task_struct *next)
3850 {
3851 #ifdef CONFIG_SMP
3852 /*
3853 * Claim the task as running; we do this before switching to it
3854 * such that any running task will have this set.
3855 *
3856 * See the ttwu() WF_ON_CPU case and its ordering comment.
3857 */
3858 WRITE_ONCE(next->on_cpu, 1);
3859 #endif
3860 }
3861
3862 static inline void finish_task(struct task_struct *prev)
3863 {
3864 #ifdef CONFIG_SMP
3865 /*
3866 * This must be the very last reference to @prev from this CPU. After
3867 * p->on_cpu is cleared, the task can be moved to a different CPU. We
3868 * must ensure this doesn't happen until the switch is completely
3869 * finished.
3870 *
3871 * In particular, the load of prev->state in finish_task_switch() must
3872 * happen before this.
3873 *
3874 * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
3875 */
3876 smp_store_release(&prev->on_cpu, 0);
3877 #endif
3878 }
3879
3880 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next,
3881 struct rq_flags *rf)
3882 {
3883 /*
3884 * The runqueue lock will be released by the next
3885 * task (which is an invalid locking op but in the case
3886 * of the scheduler it's an obvious special-case), so we
3887 * do an early lockdep release here:
3888 */
3889 rq_unpin_lock(rq, rf);
3890 spin_release(&rq->lock.dep_map, _THIS_IP_);
3891 #ifdef CONFIG_DEBUG_SPINLOCK
3892 /* this is a valid case when another task releases the spinlock */
3893 rq->lock.owner = next;
3894 #endif
3895 }
3896
3897 static inline void finish_lock_switch(struct rq *rq)
3898 {
3899 /*
3900 * If we are tracking spinlock dependencies then we have to
3901 * fix up the runqueue lock - which gets 'carried over' from
3902 * prev into current:
3903 */
3904 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
3905 raw_spin_unlock_irq(&rq->lock);
3906 }
3907
3908 /*
3909 * NOP if the arch has not defined these:
3910 */
3911
3912 #ifndef prepare_arch_switch
3913 #define prepare_arch_switch(next) \
3914 do { \
3915 } while (0)
3916 #endif
3917
3918 #ifndef finish_arch_post_lock_switch
3919 #define finish_arch_post_lock_switch() \
3920 do { \
3921 } while (0)
3922 #endif
3923
3924 /**
3925 * prepare_task_switch - prepare to switch tasks
3926 * @rq: the runqueue preparing to switch
3927 * @prev: the current task that is being switched out
3928 * @next: the task we are going to switch to.
3929 *
3930 * This is called with the rq lock held and interrupts off. It must
3931 * be paired with a subsequent finish_task_switch after the context
3932 * switch.
3933 *
3934 * prepare_task_switch sets up locking and calls architecture specific
3935 * hooks.
3936 */
3937 static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev,
3938 struct task_struct *next)
3939 {
3940 kcov_prepare_switch(prev);
3941 sched_info_switch(rq, prev, next);
3942 perf_event_task_sched_out(prev, next);
3943 rseq_preempt(prev);
3944 fire_sched_out_preempt_notifiers(prev, next);
3945 prepare_task(next);
3946 prepare_arch_switch(next);
3947 }
3948
3949 /**
3950 * finish_task_switch - clean up after a task-switch
3951 * @prev: the thread we just switched away from.
3952 *
3953 * finish_task_switch must be called after the context switch, paired
3954 * with a prepare_task_switch call before the context switch.
3955 * finish_task_switch will reconcile locking set up by prepare_task_switch,
3956 * and do any other architecture-specific cleanup actions.
3957 *
3958 * Note that we may have delayed dropping an mm in context_switch(). If
3959 * so, we finish that here outside of the runqueue lock. (Doing it
3960 * with the lock held can cause deadlocks; see schedule() for
3961 * details.)
3962 *
3963 * The context switch has flipped the stack from under us and restored the
3964 * local variables which were saved when this task called schedule() in the
3965 * past. prev == current is still correct but we need to recalculate this_rq
3966 * because prev may have moved to another CPU.
3967 */
3968 static struct rq *finish_task_switch(struct task_struct *prev)
3969 __releases(rq->lock)
3970 {
3971 struct rq *rq = this_rq();
3972 struct mm_struct *mm = rq->prev_mm;
3973 long prev_state;
3974
3975 /*
3976 * The previous task will have left us with a preempt_count of 2
3977 * because it left us after:
3978 *
3979 * schedule()
3980 * preempt_disable(); // 1
3981 * __schedule()
3982 * raw_spin_lock_irq(&rq->lock) // 2
3983 *
3984 * Also, see FORK_PREEMPT_COUNT.
3985 */
3986 if (WARN_ONCE(preempt_count() != 2 * PREEMPT_DISABLE_OFFSET,
3987 "corrupted preempt_count: %s/%d/0x%x\n", current->comm,
3988 current->pid, preempt_count())) {
3989 preempt_count_set(FORK_PREEMPT_COUNT);
3990 }
3991
3992 rq->prev_mm = NULL;
3993
3994 /*
3995 * A task struct has one reference for the use as "current".
3996 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3997 * schedule one last time. The schedule call will never return, and
3998 * the scheduled task must drop that reference.
3999 *
4000 * We must observe prev->state before clearing prev->on_cpu (in
4001 * finish_task), otherwise a concurrent wakeup can get prev
4002 * running on another CPU and we could race with its RUNNING -> DEAD
4003 * transition, resulting in a double drop.
4004 */
4005 prev_state = prev->state;
4006 vtime_task_switch(prev);
4007 perf_event_task_sched_in(prev, current);
4008 finish_task(prev);
4009 finish_lock_switch(rq);
4010 finish_arch_post_lock_switch();
4011 kcov_finish_switch(current);
4012
4013 fire_sched_in_preempt_notifiers(current);
4014 /*
4015 * When switching through a kernel thread, the loop in
4016 * membarrier_{private,global}_expedited() may have observed that
4017 * kernel thread and not issued an IPI. It is therefore possible to
4018 * schedule between user->kernel->user threads without passing through
4019 * switch_mm(). Membarrier requires a barrier after storing to
4020 * rq->curr, before returning to userspace, so provide them here:
4021 *
4022 * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4023 * provided by mmdrop(),
4024 * - a sync_core for SYNC_CORE.
4025 */
4026 if (mm) {
4027 membarrier_mm_sync_core_before_usermode(mm);
4028 mmdrop(mm);
4029 }
4030 if (unlikely(prev_state == TASK_DEAD)) {
4031 if (prev->sched_class->task_dead) {
4032 prev->sched_class->task_dead(prev);
4033 }
4034
4035 /*
4036 * Remove function-return probe instances associated with this
4037 * task and put them back on the free list.
4038 */
4039 kprobe_flush_task(prev);
4040
4041 /* Task is done with its stack. */
4042 put_task_stack(prev);
4043
4044 put_task_struct_rcu_user(prev);
4045 }
4046
4047 tick_nohz_task_switch();
4048 return rq;
4049 }
4050
4051 #ifdef CONFIG_SMP
4052
4053 /* rq->lock is NOT held, but preemption is disabled */
4054 static void __balance_callback(struct rq *rq)
4055 {
4056 struct callback_head *head, *next;
4057 void (*func)(struct rq * rq);
4058 unsigned long flags;
4059
4060 raw_spin_lock_irqsave(&rq->lock, flags);
4061 head = rq->balance_callback;
4062 rq->balance_callback = NULL;
4063 while (head) {
4064 func = (void (*)(struct rq *))head->func;
4065 next = head->next;
4066 head->next = NULL;
4067 head = next;
4068
4069 func(rq);
4070 }
4071 raw_spin_unlock_irqrestore(&rq->lock, flags);
4072 }
4073
4074 static inline void balance_callback(struct rq *rq)
4075 {
4076 if (unlikely(rq->balance_callback)) {
4077 __balance_callback(rq);
4078 }
4079 }
4080
4081 #else
4082
4083 static inline void balance_callback(struct rq *rq)
4084 {
4085 }
4086
4087 #endif
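/*
 * Sketch of how work ends up in rq->balance_callback (assuming the
 * queue_balance_callback() helper from sched.h; the RT and deadline classes
 * use this pattern to defer push/pull work until rq->lock can be dropped
 * safely). my_balance_fn/my_head are illustrative names only:
 *
 *	static void my_balance_fn(struct rq *rq) { ... }
 *	static DEFINE_PER_CPU(struct callback_head, my_head);
 *
 *	queue_balance_callback(rq, &per_cpu(my_head, rq->cpu), my_balance_fn);
 *	...
 *	balance_callback(rq);	// runs my_balance_fn(rq), then clears the list
 */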
4088
4089 /**
4090 * schedule_tail - first thing a freshly forked thread must call.
4091 * @prev: the thread we just switched away from.
4092 */
4093 asmlinkage __visible void schedule_tail(struct task_struct *prev)
4094 __releases(rq->lock)
4095 {
4096 struct rq *rq;
4097
4098 /*
4099 * New tasks start with FORK_PREEMPT_COUNT, see there and
4100 * finish_task_switch() for details.
4101 *
4102 * finish_task_switch() will drop rq->lock and lower preempt_count
4103 * and the preempt_enable() will end up enabling preemption (on
4104 * PREEMPT_COUNT kernels).
4105 */
4106
4107 rq = finish_task_switch(prev);
4108 balance_callback(rq);
4109 preempt_enable();
4110
4111 if (current->set_child_tid) {
4112 put_user(task_pid_vnr(current), current->set_child_tid);
4113 }
4114
4115 calculate_sigpending();
4116 }
4117
4118 /*
4119 * context_switch - switch to the new MM and the new thread's register state.
4120 */
4121 static __always_inline struct rq *context_switch(struct rq *rq,
4122 struct task_struct *prev,
4123 struct task_struct *next,
4124 struct rq_flags *rf)
4125 {
4126 prepare_task_switch(rq, prev, next);
4127
4128 /*
4129 * For paravirt, this is coupled with an exit in switch_to to
4130 * combine the page table reload and the switch backend into
4131 * one hypercall.
4132 */
4133 arch_start_context_switch(prev);
4134
4135 /*
4136 * kernel -> kernel lazy + transfer active
4137 * user -> kernel lazy + mmgrab() active
4138 *
4139 * kernel -> user switch + mmdrop() active
4140 * user -> user switch
4141 */
4142 if (!next->mm) { // to kernel
4143 enter_lazy_tlb(prev->active_mm, next);
4144
4145 next->active_mm = prev->active_mm;
4146 if (prev->mm) { // from user
4147 mmgrab(prev->active_mm);
4148 } else {
4149 prev->active_mm = NULL;
4150 }
4151 } else { // to user
4152 membarrier_switch_mm(rq, prev->active_mm, next->mm);
4153 /*
4154 * sys_membarrier() requires an smp_mb() between setting
4155 * rq->curr / membarrier_switch_mm() and returning to userspace.
4156 *
4157 * The below provides this either through switch_mm(), or in
4158 * case 'prev->active_mm == next->mm' through
4159 * finish_task_switch()'s mmdrop().
4160 */
4161 switch_mm_irqs_off(prev->active_mm, next->mm, next);
4162
4163 if (!prev->mm) { // from kernel
4164 /* will mmdrop() in finish_task_switch(). */
4165 rq->prev_mm = prev->active_mm;
4166 prev->active_mm = NULL;
4167 }
4168 }
4169
4170 rq->clock_update_flags &= ~(RQCF_ACT_SKIP | RQCF_REQ_SKIP);
4171
4172 prepare_lock_switch(rq, next, rf);
4173
4174 /* Here we just switch the register state and the stack. */
4175 switch_to(prev, next, prev);
4176 barrier();
4177
4178 return finish_task_switch(prev);
4179 }
4180
4181 /*
4182 * nr_running and nr_context_switches
4183 *
4184 * externally visible scheduler statistics: current number of runnable
4185 * threads, total number of context switches performed since bootup.
4186 */
4187 unsigned long nr_running(void)
4188 {
4189 unsigned long i, sum = 0;
4190
4191 for_each_online_cpu(i) sum += cpu_rq(i)->nr_running;
4192
4193 return sum;
4194 }
4195
4196 /*
4197 * Check if only the current task is running on the CPU.
4198 *
4199 * Caution: this function does not check that the caller has disabled
4200 * preemption, thus the result might have a time-of-check-to-time-of-use
4201 * race. The caller is responsible to use it correctly, for example:
4202 *
4203 * - from a non-preemptible section (of course)
4204 *
4205 * - from a thread that is bound to a single CPU
4206 *
4207 * - in a loop with very short iterations (e.g. a polling loop)
4208 */
4209 bool single_task_running(void)
4210 {
4211 return raw_rq()->nr_running == 1;
4212 }
4213 EXPORT_SYMBOL(single_task_running);
4214
4215 unsigned long long nr_context_switches(void)
4216 {
4217 int i;
4218 unsigned long long sum = 0;
4219
4220 for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches;
4221
4222 return sum;
4223 }
4224
4225 /*
4226 * Consumers of these two interfaces, like for example the cpuidle menu
4227 * governor, are using nonsensical data: they prefer shallow idle states
4228 * for a CPU that has IO-wait, even though the blocked task might not even
4229 * run on that CPU when it does become runnable.
4230 */
4231
4232 unsigned long nr_iowait_cpu(int cpu)
4233 {
4234 return atomic_read(&cpu_rq(cpu)->nr_iowait);
4235 }
4236
4237 /*
4238 * IO-wait accounting, and how it's mostly bollocks (on SMP).
4239 *
4240 * The idea behind IO-wait accounting is to account the idle time that we could
4241 * have spent running if it were not for IO. That is, if we were to improve the
4242 * storage performance, we'd have a proportional reduction in IO-wait time.
4243 *
4244 * This all works nicely on UP, where, when a task blocks on IO, we account
4245 * idle time as IO-wait, because if the storage were faster, it could've been
4246 * running and we'd not be idle.
4247 *
4248 * This has been extended to SMP, by doing the same for each CPU. This however
4249 * is broken.
4250 *
4251 * Imagine for instance the case where two tasks block on one CPU, only the one
4252 * CPU will have IO-wait accounted, while the other has regular idle. Even
4253 * though, if the storage were faster, both could've run at the same time,
4254 * utilising both CPUs.
4255 *
4256 * This means that, when looking globally, the current IO-wait accounting on
4257 * SMP is a lower bound, due to under-accounting.
4258 *
4259 * Worse, since the numbers are provided per CPU, they are sometimes
4260 * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
4261 * associated with any one particular CPU; it can wake up on a different CPU than it
4262 * blocked on. This means the per CPU IO-wait number is meaningless.
4263 *
4264 * Task CPU affinities can make all that even more 'interesting'.
4265 */
4266
4267 unsigned long nr_iowait(void)
4268 {
4269 unsigned long i, sum = 0;
4270
4271 for_each_possible_cpu(i) sum += nr_iowait_cpu(i);
4272
4273 return sum;
4274 }
4275
4276 #ifdef CONFIG_SMP
4277
4278 /*
4279 * sched_exec - execve() is a valuable balancing opportunity, because at
4280 * this point the task has the smallest effective memory and cache footprint.
4281 */
4282 void sched_exec(void)
4283 {
4284 struct task_struct *p = current;
4285 unsigned long flags;
4286 int dest_cpu;
4287
4288 raw_spin_lock_irqsave(&p->pi_lock, flags);
4289 dest_cpu =
4290 p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
4291 if (dest_cpu == smp_processor_id()) {
4292 goto unlock;
4293 }
4294
4295 if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
4296 struct migration_arg arg = {p, dest_cpu};
4297
4298 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4299 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4300 return;
4301 }
4302 unlock:
4303 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4304 }
4305
4306 #endif
4307
4308 DEFINE_PER_CPU(struct kernel_stat, kstat);
4309 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4310
4311 EXPORT_PER_CPU_SYMBOL(kstat);
4312 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4313
4314 /*
4315 * The function fair_sched_class.update_curr accesses the struct curr
4316 * and its field curr->exec_start; when called from task_sched_runtime(),
4317 * we observe a high rate of cache misses in practice.
4318 * Prefetching this data results in improved performance.
4319 */
4320 static inline void prefetch_curr_exec_start(struct task_struct *p)
4321 {
4322 #ifdef CONFIG_FAIR_GROUP_SCHED
4323 struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4324 #else
4325 struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4326 #endif
4327 prefetch(curr);
4328 prefetch(&curr->exec_start);
4329 }
4330
4331 /*
4332 * Return accounted runtime for the task.
4333 * In case the task is currently running, return the runtime plus current's
4334 * pending runtime that has not been accounted yet.
4335 */
4336 unsigned long long task_sched_runtime(struct task_struct *p)
4337 {
4338 struct rq_flags rf;
4339 struct rq *rq;
4340 u64 ns;
4341
4342 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4343 /*
4344 * 64-bit doesn't need locks to atomically read a 64-bit value.
4345 * So we have an optimization chance when the task's delta_exec is 0.
4346 * Reading ->on_cpu is racy, but this is ok.
4347 *
4348 * If we race with it leaving CPU, we'll take a lock. So we're correct.
4349 * If we race with it entering CPU, unaccounted time is 0. This is
4350 * indistinguishable from the read occurring a few cycles earlier.
4351 * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4352 * been accounted, so we're correct here as well.
4353 */
4354 if (!p->on_cpu || !task_on_rq_queued(p)) {
4355 return p->se.sum_exec_runtime;
4356 }
4357 #endif
4358
4359 rq = task_rq_lock(p, &rf);
4360 /*
4361 * Must be ->curr _and_ ->on_rq. If dequeued, we would
4362 * project cycles that may never be accounted to this
4363 * thread, breaking clock_gettime().
4364 */
4365 if (task_current(rq, p) && task_on_rq_queued(p)) {
4366 prefetch_curr_exec_start(p);
4367 update_rq_clock(rq);
4368 p->sched_class->update_curr(rq);
4369 }
4370 ns = p->se.sum_exec_runtime;
4371 task_rq_unlock(rq, p, &rf);
4372
4373 return ns;
4374 }
4375
4376 /*
4377 * This function gets called by the timer code, with HZ frequency.
4378 * We call it with interrupts disabled.
4379 */
4380 void scheduler_tick(void)
4381 {
4382 int cpu = smp_processor_id();
4383 struct rq *rq = cpu_rq(cpu);
4384 struct task_struct *curr = rq->curr;
4385 struct rq_flags rf;
4386 u64 wallclock;
4387 unsigned long thermal_pressure;
4388
4389 arch_scale_freq_tick();
4390 sched_clock_tick();
4391
4392 rq_lock(rq, &rf);
4393
4394 set_window_start(rq);
4395 wallclock = sched_ktime_clock();
4396 update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
4397 update_rq_clock(rq);
4398 thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4399 update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4400 curr->sched_class->task_tick(rq, curr, 0);
4401 calc_global_load_tick(rq);
4402 psi_task_tick(rq);
4403
4404 rq_unlock(rq, &rf);
4405
4406 #ifdef CONFIG_SCHED_RTG
4407 sched_update_rtg_tick(curr);
4408 #endif
4409 perf_event_task_tick();
4410
4411 #ifdef CONFIG_SMP
4412 rq->idle_balance = idle_cpu(cpu);
4413 trigger_load_balance(rq);
4414
4415 #ifdef CONFIG_SCHED_EAS
4416 if (curr->sched_class->check_for_migration) {
4417 curr->sched_class->check_for_migration(rq, curr);
4418 }
4419 #endif
4420 #endif
4421 }
4422
4423 #ifdef CONFIG_NO_HZ_FULL
4424
4425 struct tick_work {
4426 int cpu;
4427 atomic_t state;
4428 struct delayed_work work;
4429 };
4430 /* Values for ->state, see diagram below. */
4431 #define TICK_SCHED_REMOTE_OFFLINE 0
4432 #define TICK_SCHED_REMOTE_OFFLINING 1
4433 #define TICK_SCHED_REMOTE_RUNNING 2
4434
4435 /*
4436 * State diagram for ->state:
4437 *
4438 *
4439 * TICK_SCHED_REMOTE_OFFLINE
4440 * | ^
4441 * | |
4442 * | | sched_tick_remote()
4443 * | |
4444 * | |
4445 * +--TICK_SCHED_REMOTE_OFFLINING
4446 * | ^
4447 * | |
4448 * sched_tick_start() | | sched_tick_stop()
4449 * | |
4450 * V |
4451 * TICK_SCHED_REMOTE_RUNNING
4452 *
4453 *
4454 * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
4455 * and sched_tick_start() are happy to leave the state in RUNNING.
4456 */
4457
4458 static struct tick_work __percpu *tick_work_cpu;
4459
4460 static void sched_tick_remote(struct work_struct *work)
4461 {
4462 struct delayed_work *dwork = to_delayed_work(work);
4463 struct tick_work *twork = container_of(dwork, struct tick_work, work);
4464 int cpu = twork->cpu;
4465 struct rq *rq = cpu_rq(cpu);
4466 struct task_struct *curr;
4467 struct rq_flags rf;
4468 u64 delta;
4469 int os;
4470
4471 /*
4472 * Handle the tick only if it appears the remote CPU is running in full
4473 * dynticks mode. The check is racy by nature, but missing a tick or
4474 * having one too many is no big deal because the scheduler tick updates
4475 * statistics and checks timeslices in a time-independent way, regardless
4476 * of when exactly it is running.
4477 */
4478 if (!tick_nohz_tick_stopped_cpu(cpu)) {
4479 goto out_requeue;
4480 }
4481
4482 rq_lock_irq(rq, &rf);
4483 curr = rq->curr;
4484 if (cpu_is_offline(cpu)) {
4485 goto out_unlock;
4486 }
4487
4488 update_rq_clock(rq);
4489
4490 if (!is_idle_task(curr)) {
4491 /*
4492 * Make sure the next tick runs within a reasonable
4493 * amount of time.
4494 */
4495 delta = rq_clock_task(rq) - curr->se.exec_start;
4496 WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 0x3);
4497 }
4498 curr->sched_class->task_tick(rq, curr, 0);
4499
4500 calc_load_nohz_remote(rq);
4501 out_unlock:
4502 rq_unlock_irq(rq, &rf);
4503 out_requeue:
4504
4505 /*
4506 * Run the remote tick once per second (1Hz). This arbitrary
4507 * frequency is low enough to avoid overload but high enough
4508 * to keep scheduler internal stats reasonably up to date. But
4509 * first update state to reflect hotplug activity if required.
4510 */
4511 os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4512 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4513 if (os == TICK_SCHED_REMOTE_RUNNING) {
4514 queue_delayed_work(system_unbound_wq, dwork, HZ);
4515 }
4516 }
4517
4518 static void sched_tick_start(int cpu)
4519 {
4520 int os;
4521 struct tick_work *twork;
4522
4523 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) {
4524 return;
4525 }
4526
4527 WARN_ON_ONCE(!tick_work_cpu);
4528
4529 twork = per_cpu_ptr(tick_work_cpu, cpu);
4530 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4531 WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4532 if (os == TICK_SCHED_REMOTE_OFFLINE) {
4533 twork->cpu = cpu;
4534 INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4535 queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4536 }
4537 }
4538
4539 #ifdef CONFIG_HOTPLUG_CPU
4540 static void sched_tick_stop(int cpu)
4541 {
4542 struct tick_work *twork;
4543 int os;
4544
4545 if (housekeeping_cpu(cpu, HK_FLAG_TICK)) {
4546 return;
4547 }
4548
4549 WARN_ON_ONCE(!tick_work_cpu);
4550
4551 twork = per_cpu_ptr(tick_work_cpu, cpu);
4552 /* There cannot be competing actions, but don't rely on stop-machine. */
4553 os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4554 WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4555 /* Don't cancel, as this would mess up the state machine. */
4556 }
4557 #endif /* CONFIG_HOTPLUG_CPU */
4558
4559 int __init sched_tick_offload_init(void)
4560 {
4561 tick_work_cpu = alloc_percpu(struct tick_work);
4562 BUG_ON(!tick_work_cpu);
4563 return 0;
4564 }
4565
4566 #else /* !CONFIG_NO_HZ_FULL */
4567 static inline void sched_tick_start(int cpu)
4568 {
4569 }
4570 static inline void sched_tick_stop(int cpu)
4571 {
4572 }
4573 #endif
4574
4575 #if defined(CONFIG_PREEMPTION) && \
4576 (defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE))
4577 /*
4578 * If the value passed in is equal to the current preempt count
4579 * then we just disabled preemption. Start timing the latency.
4580 */
4581 static inline void preempt_latency_start(int val)
4582 {
4583 if (preempt_count() == val) {
4584 unsigned long ip = get_lock_parent_ip();
4585 #ifdef CONFIG_DEBUG_PREEMPT
4586 current->preempt_disable_ip = ip;
4587 #endif
4588 trace_preempt_off(CALLER_ADDR0, ip);
4589 }
4590 }
4591
4592 void preempt_count_add(int val)
4593 {
4594 #ifdef CONFIG_DEBUG_PREEMPT
4595 /*
4596 * Underflow?
4597 */
4598 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) {
4599 return;
4600 }
4601 #endif
4602 __preempt_count_add(val);
4603 #ifdef CONFIG_DEBUG_PREEMPT
4604 /*
4605 * Spinlock count overflowing soon?
4606 */
4607 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 0xa);
4608 #endif
4609 preempt_latency_start(val);
4610 }
4611 EXPORT_SYMBOL(preempt_count_add);
4612 NOKPROBE_SYMBOL(preempt_count_add);
4613
4614 /*
4615 * If the value passed in is equal to the current preempt count
4616 * then we just enabled preemption. Stop timing the latency.
4617 */
4618 static inline void preempt_latency_stop(int val)
4619 {
4620 if (preempt_count() == val) {
4621 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4622 }
4623 }
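/*
 * Illustration of the start/stop pairing above (counts are examples):
 *
 *	preempt_disable();	// preempt_count(): 0 -> 1, count == val -> start timing
 *	  preempt_disable();	// 1 -> 2, count != val -> nested, ignored
 *	  preempt_enable();	// 2 -> 1, still inside the outer section
 *	preempt_enable();	// preempt_latency_stop(1) sees count == 1 -> stop timing
 *
 * Only the outermost disable/enable pair is traced as a preempt-off section.
 */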
4624
4625 void preempt_count_sub(int val)
4626 {
4627 #ifdef CONFIG_DEBUG_PREEMPT
4628 /*
4629 * Underflow?
4630 */
4631 if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) {
4632 return;
4633 }
4634 /*
4635 * Is the spinlock portion underflowing?
4636 */
4637 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4638 !(preempt_count() & PREEMPT_MASK))) {
4639 return;
4640 }
4641 #endif
4642
4643 preempt_latency_stop(val);
4644 __preempt_count_sub(val);
4645 }
4646 EXPORT_SYMBOL(preempt_count_sub);
4647 NOKPROBE_SYMBOL(preempt_count_sub);
4648
4649 #else
4650 static inline void preempt_latency_start(int val)
4651 {
4652 }
4653 static inline void preempt_latency_stop(int val)
4654 {
4655 }
4656 #endif
4657
4658 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4659 {
4660 #ifdef CONFIG_DEBUG_PREEMPT
4661 return p->preempt_disable_ip;
4662 #else
4663 return 0;
4664 #endif
4665 }
4666
4667 /*
4668 * Print scheduling while atomic bug:
4669 */
4670 static noinline void __schedule_bug(struct task_struct *prev)
4671 {
4672 /* Save this before calling printk(), since that will clobber it */
4673 unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4674
4675 if (oops_in_progress) {
4676 return;
4677 }
4678
4679 printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", prev->comm,
4680 prev->pid, preempt_count());
4681
4682 debug_show_held_locks(prev);
4683 print_modules();
4684 if (irqs_disabled()) {
4685 print_irqtrace_events(prev);
4686 }
4687 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) {
4688 pr_err("Preemption disabled at:");
4689 print_ip_sym(KERN_ERR, preempt_disable_ip);
4690 }
4691 if (panic_on_warn) {
4692 panic("scheduling while atomic\n");
4693 }
4694
4695 dump_stack();
4696 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4697 }
4698
4699 /*
4700 * Various schedule()-time debugging checks and statistics:
4701 */
4702 static inline void schedule_debug(struct task_struct *prev, bool preempt)
4703 {
4704 #ifdef CONFIG_SCHED_STACK_END_CHECK
4705 if (task_stack_end_corrupted(prev)) {
4706 panic("corrupted stack end detected inside scheduler\n");
4707 }
4708
4709 if (task_scs_end_corrupted(prev)) {
4710 panic("corrupted shadow stack detected inside scheduler\n");
4711 }
4712 #endif
4713
4714 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4715 if (!preempt && prev->state && prev->non_block_count) {
4716 printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4717 prev->comm, prev->pid, prev->non_block_count);
4718 dump_stack();
4719 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4720 }
4721 #endif
4722
4723 if (unlikely(in_atomic_preempt_off())) {
4724 __schedule_bug(prev);
4725 preempt_count_set(PREEMPT_DISABLED);
4726 }
4727 rcu_sleep_check();
4728
4729 profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4730
4731 schedstat_inc(this_rq()->sched_count);
4732 }
4733
4734 static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4735 struct rq_flags *rf)
4736 {
4737 #ifdef CONFIG_SMP
4738 const struct sched_class *class;
4739 /*
4740 * We must do the balancing pass before put_prev_task(), such
4741 * that when we release the rq->lock the task is in the same
4742 * state as before we took rq->lock.
4743 *
4744 * We can terminate the balance pass as soon as we know there is
4745 * a runnable task of @class priority or higher.
4746 */
4747 for_class_range(class, prev->sched_class, &idle_sched_class)
4748 {
4749 if (class->balance(rq, prev, rf)) {
4750 break;
4751 }
4752 }
4753 #endif
4754
4755 put_prev_task(rq, prev);
4756 }
4757
4758 /*
4759 * Pick up the highest-prio task:
4760 */
4761 static inline struct task_struct *
4762 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
4763 {
4764 const struct sched_class *class;
4765 struct task_struct *p;
4766
4767 /*
4768 * Optimization: we know that if all tasks are in the fair class we can
4769 * call that function directly, but only if the @prev task wasn't of a
4770 * higher scheduling class, because otherwise those lose the
4771 * opportunity to pull in more work from other CPUs.
4772 */
4773 if (likely(prev->sched_class <= &fair_sched_class &&
4774 rq->nr_running == rq->cfs.h_nr_running)) {
4775 p = pick_next_task_fair(rq, prev, rf);
4776 if (unlikely(p == RETRY_TASK)) {
4777 goto restart;
4778 }
4779
4780 /* Assumes fair_sched_class->next == idle_sched_class */
4781 if (!p) {
4782 put_prev_task(rq, prev);
4783 p = pick_next_task_idle(rq);
4784 }
4785
4786 return p;
4787 }
4788
4789 restart:
4790 put_prev_task_balance(rq, prev, rf);
4791
4792 for_each_class(class)
4793 {
4794 p = class->pick_next_task(rq);
4795 if (p) {
4796 return p;
4797 }
4798 }
4799
4800 /* The idle class should always have a runnable task: */
4801 BUG();
4802 }
4803
4804 /*
4805 * __schedule() is the main scheduler function.
4806 *
4807 * The main means of driving the scheduler and thus entering this function are:
4808 *
4809 * 1. Explicit blocking: mutex, semaphore, waitqueue, etc.
4810 *
4811 * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
4812 * paths. For example, see arch/x86/entry_64.S.
4813 *
4814 * To drive preemption between tasks, the scheduler sets the flag in timer
4815 * interrupt handler scheduler_tick().
4816 *
4817 * 3. Wakeups don't really cause entry into schedule(). They add a
4818 * task to the run-queue and that's it.
4819 *
4820 * Now, if the new task added to the run-queue preempts the current
4821 * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
4822 * called on the nearest possible occasion:
4823 *
4824 * - If the kernel is preemptible (CONFIG_PREEMPTION=y):
4825 *
4826 * - in syscall or exception context, at the next outermost
4827 * preempt_enable(). (this might be as soon as the wake_up()'s
4828 * spin_unlock()!)
4829 *
4830 * - in IRQ context, return from interrupt-handler to
4831 * preemptible context
4832 *
4833 * - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
4834 * then at the next:
4835 *
4836 * - cond_resched() call
4837 * - explicit schedule() call
4838 * - return from syscall or exception to user-space
4839 * - return from interrupt-handler to user-space
4840 *
4841 * WARNING: must be called with preemption disabled!
4842 */
4843 static void __sched notrace __schedule(bool preempt)
4844 {
4845 struct task_struct *prev, *next;
4846 unsigned long *switch_count;
4847 unsigned long prev_state;
4848 struct rq_flags rf;
4849 struct rq *rq;
4850 int cpu;
4851 u64 wallclock;
4852
4853 cpu = smp_processor_id();
4854 rq = cpu_rq(cpu);
4855 prev = rq->curr;
4856
4857 schedule_debug(prev, preempt);
4858
4859 if (sched_feat(HRTICK)) {
4860 hrtick_clear(rq);
4861 }
4862
4863 local_irq_disable();
4864 rcu_note_context_switch(preempt);
4865
4866 /*
4867 * Make sure that signal_pending_state()->signal_pending() below
4868 * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4869 * done by the caller to avoid the race with signal_wake_up():
4870 *
4871 * __set_current_state(@state) signal_wake_up()
4872 * schedule() set_tsk_thread_flag(p, TIF_SIGPENDING)
4873 * wake_up_state(p, state)
4874 * LOCK rq->lock LOCK p->pi_state
4875 * smp_mb__after_spinlock() smp_mb__after_spinlock()
4876 * if (signal_pending_state()) if (p->state & @state)
4877 *
4878 * Also, the membarrier system call requires a full memory barrier
4879 * after coming from user-space, before storing to rq->curr.
4880 */
4881 rq_lock(rq, &rf);
4882 smp_mb__after_spinlock();
4883
4884 /* Promote REQ to ACT */
4885 rq->clock_update_flags <<= 1;
4886 update_rq_clock(rq);
4887
4888 switch_count = &prev->nivcsw;
4889
4890 /*
4891 * We must load prev->state once (task_struct::state is volatile), such
4892 * that:
4893 *
4894 * - we form a control dependency vs deactivate_task() below.
4895 * - ptrace_{,un}freeze_traced() can change ->state underneath us.
4896 */
4897 prev_state = prev->state;
4898 if (!preempt && prev_state) {
4899 if (signal_pending_state(prev_state, prev)) {
4900 prev->state = TASK_RUNNING;
4901 } else {
4902 prev->sched_contributes_to_load =
4903 (prev_state & TASK_UNINTERRUPTIBLE) &&
4904 !(prev_state & TASK_NOLOAD) && !(prev->flags & PF_FROZEN);
4905
4906 if (prev->sched_contributes_to_load) {
4907 rq->nr_uninterruptible++;
4908 }
4909
4910 /*
4911 * __schedule() ttwu()
4912 * prev_state = prev->state; if (p->on_rq && ...)
4913 * if (prev_state) goto out;
4914 * p->on_rq = 0; smp_acquire__after_ctrl_dep();
4915 * p->state = TASK_WAKING
4916 *
4917 * Where __schedule() and ttwu() have matching control dependencies.
4918 *
4919 * After this, schedule() must not care about p->state any more.
4920 */
4921 deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4922
4923 if (prev->in_iowait) {
4924 atomic_inc(&rq->nr_iowait);
4925 delayacct_blkio_start();
4926 }
4927 }
4928 switch_count = &prev->nvcsw;
4929 }
4930
4931 next = pick_next_task(rq, prev, &rf);
4932 clear_tsk_need_resched(prev);
4933 clear_preempt_need_resched();
4934
4935 wallclock = sched_ktime_clock();
4936 if (likely(prev != next)) {
4937 #ifdef CONFIG_SCHED_WALT
4938 if (!prev->on_rq) {
4939 prev->last_sleep_ts = wallclock;
4940 }
4941 #endif
4942 update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
4943 update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
4944 rq->nr_switches++;
4945 /*
4946 * RCU users of rcu_dereference(rq->curr) may not see
4947 * changes to task_struct made by pick_next_task().
4948 */
4949 RCU_INIT_POINTER(rq->curr, next);
4950 /*
4951 * The membarrier system call requires each architecture
4952 * to have a full memory barrier after updating
4953 * rq->curr, before returning to user-space.
4954 *
4955 * Here are the schemes providing that barrier on the
4956 * various architectures:
4957 * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
4958 * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
4959 * - finish_lock_switch() for weakly-ordered
4960 * architectures where spin_unlock is a full barrier,
4961 * - switch_to() for arm64 (weakly-ordered, spin_unlock
4962 * is a RELEASE barrier),
4963 */
4964 ++*switch_count;
4965
4966 psi_sched_switch(prev, next, !task_on_rq_queued(prev));
4967
4968 trace_sched_switch(preempt, prev, next);
4969
4970 /* Also unlocks the rq: */
4971 rq = context_switch(rq, prev, next, &rf);
4972 } else {
4973 update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
4974 rq->clock_update_flags &= ~(RQCF_ACT_SKIP | RQCF_REQ_SKIP);
4975 rq_unlock_irq(rq, &rf);
4976 }
4977
4978 balance_callback(rq);
4979 }
4980
4981 void __noreturn do_task_dead(void)
4982 {
4983 /* Causes final put_task_struct in finish_task_switch(): */
4984 set_special_state(TASK_DEAD);
4985
4986 /* Tell freezer to ignore us: */
4987 current->flags |= PF_NOFREEZE;
4988
4989 __schedule(false);
4990 BUG();
4991
4992 /* Avoid "noreturn function does return" - but don't continue if BUG() is a
4993 * NOP: */
4994 for (;;) {
4995 cpu_relax();
4996 }
4997 }
4998
4999 static inline void sched_submit_work(struct task_struct *tsk)
5000 {
5001 unsigned int task_flags;
5002
5003 if (!tsk->state) {
5004 return;
5005 }
5006
5007 task_flags = tsk->flags;
5008 /*
5009 * If a worker went to sleep, notify and ask workqueue whether
5010 * it wants to wake up a task to maintain concurrency.
5011 * As this function is called inside the schedule() context,
5012 * we disable preemption to avoid it calling schedule() again
5013 * in the possible wakeup of a kworker and because wq_worker_sleeping()
5014 * requires it.
5015 */
5016 if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5017 preempt_disable();
5018 if (task_flags & PF_WQ_WORKER) {
5019 wq_worker_sleeping(tsk);
5020 } else {
5021 io_wq_worker_sleeping(tsk);
5022 }
5023 preempt_enable_no_resched();
5024 }
5025
5026 if (tsk_is_pi_blocked(tsk)) {
5027 return;
5028 }
5029
5030 /*
5031 * If we are going to sleep and we have plugged IO queued,
5032 * make sure to submit it to avoid deadlocks.
5033 */
5034 if (blk_needs_flush_plug(tsk)) {
5035 blk_schedule_flush_plug(tsk);
5036 }
5037 }
5038
5039 static void sched_update_worker(struct task_struct *tsk)
5040 {
5041 if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5042 if (tsk->flags & PF_WQ_WORKER) {
5043 wq_worker_running(tsk);
5044 } else {
5045 io_wq_worker_running(tsk);
5046 }
5047 }
5048 }
5049
5050 asmlinkage __visible void __sched schedule(void)
5051 {
5052 struct task_struct *tsk = current;
5053
5054 sched_submit_work(tsk);
5055 do {
5056 preempt_disable();
5057 __schedule(false);
5058 sched_preempt_enable_no_resched();
5059 } while (need_resched());
5060 sched_update_worker(tsk);
5061 }
5062 EXPORT_SYMBOL(schedule);
5063
5064 /*
5065 * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
5066 * state (have scheduled out non-voluntarily) by making sure that all
5067 * tasks have either left the run queue or have gone into user space.
5068 * As idle tasks do not do either, they must not ever be preempted
5069 * (schedule out non-voluntarily).
5070 *
5071 * schedule_idle() is similar to schedule_preempt_disabled() except that it
5072 * never enables preemption because it does not call sched_submit_work().
5073 */
5074 void __sched schedule_idle(void)
5075 {
5076 /*
5077 * As this skips calling sched_submit_work(), which the idle task does
5078 * regardless because that function is a nop when the task is in a
5079 * TASK_RUNNING state, make sure this isn't used someplace that the
5080 * current task can be in any other state. Note, idle is always in the
5081 * TASK_RUNNING state.
5082 */
5083 WARN_ON_ONCE(current->state);
5084 do {
5085 __schedule(false);
5086 } while (need_resched());
5087 }
5088
5089 #ifdef CONFIG_CONTEXT_TRACKING
5090 asmlinkage __visible void __sched schedule_user(void)
5091 {
5092 /*
5093 * If we come here after a random call to set_need_resched(),
5094 * or we have been woken up remotely but the IPI has not yet arrived,
5095 * we haven't yet exited the RCU idle mode. Do it here manually until
5096 * we find a better solution.
5097 *
5098 * NB: There are buggy callers of this function. Ideally we
5099 * should warn if prev_state != CONTEXT_USER, but that will trigger
5100 * too frequently to make sense yet.
5101 */
5102 enum ctx_state prev_state = exception_enter();
5103 schedule();
5104 exception_exit(prev_state);
5105 }
5106 #endif
5107
5108 /**
5109 * schedule_preempt_disabled - called with preemption disabled
5110 *
5111 * Returns with preemption disabled. Note: preempt_count must be 1
5112 */
5113 void __sched schedule_preempt_disabled(void)
5114 {
5115 sched_preempt_enable_no_resched();
5116 schedule();
5117 preempt_disable();
5118 }
5119
5120 static void __sched notrace preempt_schedule_common(void)
5121 {
5122 do {
5123 /*
5124 * Because the function tracer can trace preempt_count_sub()
5125 * and it also uses preempt_enable/disable_notrace(), if
5126 * NEED_RESCHED is set, the preempt_enable_notrace() called
5127 * by the function tracer will call this function again and
5128 * cause infinite recursion.
5129 *
5130 * Preemption must be disabled here before the function
5131 * tracer can trace. Break up preempt_disable() into two
5132 * calls. One to disable preemption without fear of being
5133 * traced. The other to still record the preemption latency,
5134 * which can also be traced by the function tracer.
5135 */
5136 preempt_disable_notrace();
5137 preempt_latency_start(1);
5138 __schedule(true);
5139 preempt_latency_stop(1);
5140 preempt_enable_no_resched_notrace();
5141
5142 /*
5143 * Check again in case we missed a preemption opportunity
5144 * between schedule and now.
5145 */
5146 } while (need_resched());
5147 }
5148
5149 #ifdef CONFIG_PREEMPTION
5150 /*
5151 * This is the entry point to schedule() from in-kernel preemption
5152 * off of preempt_enable.
5153 */
5154 asmlinkage __visible void __sched notrace preempt_schedule(void)
5155 {
5156 /*
5157 * If there is a non-zero preempt_count or interrupts are disabled,
5158 * we do not want to preempt the current task. Just return.
5159 */
5160 if (likely(!preemptible())) {
5161 return;
5162 }
5163
5164 preempt_schedule_common();
5165 }
5166 NOKPROBE_SYMBOL(preempt_schedule);
5167 EXPORT_SYMBOL(preempt_schedule);
5168
5169 /**
5170 * preempt_schedule_notrace - preempt_schedule called by tracing
5171 *
5172 * The tracing infrastructure uses preempt_enable_notrace to prevent
5173 * recursion and tracing preempt enabling caused by the tracing
5174 * infrastructure itself. But as tracing can happen in areas coming
5175 * from userspace or just about to enter userspace, a preempt enable
5176 * can occur before user_exit() is called. This will cause the scheduler
5177 * to be called when the system is still in usermode.
5178 *
5179 * To prevent this, the preempt_enable_notrace will use this function
5180 * instead of preempt_schedule() to exit user context if needed before
5181 * calling the scheduler.
5182 */
5183 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5184 {
5185 enum ctx_state prev_ctx;
5186
5187 if (likely(!preemptible())) {
5188 return;
5189 }
5190
5191 do {
5192 /*
5193 * Because the function tracer can trace preempt_count_sub()
5194 * and it also uses preempt_enable/disable_notrace(), if
5195 * NEED_RESCHED is set, the preempt_enable_notrace() called
5196 * by the function tracer will call this function again and
5197 * cause infinite recursion.
5198 *
5199 * Preemption must be disabled here before the function
5200 * tracer can trace. Break up preempt_disable() into two
5201 * calls. One to disable preemption without fear of being
5202 * traced. The other to still record the preemption latency,
5203 * which can also be traced by the function tracer.
5204 */
5205 preempt_disable_notrace();
5206 preempt_latency_start(1);
5207 /*
5208 * Needs preempt disabled in case user_exit() is traced
5209 * and the tracer calls preempt_enable_notrace() causing
5210 * an infinite recursion.
5211 */
5212 prev_ctx = exception_enter();
5213 __schedule(true);
5214 exception_exit(prev_ctx);
5215
5216 preempt_latency_stop(1);
5217 preempt_enable_no_resched_notrace();
5218 } while (need_resched());
5219 }
5220 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5221
5222 #endif /* CONFIG_PREEMPTION */
5223
5224 /*
5225 * This is the entry point to schedule() from kernel preemption
5226 * off of irq context.
5227 * Note that this is called and returns with irqs disabled. This will
5228 * protect us against recursive calling from irq.
5229 */
5230 asmlinkage __visible void __sched preempt_schedule_irq(void)
5231 {
5232 enum ctx_state prev_state;
5233
5234 /* Catch callers which need to be fixed */
5235 BUG_ON(preempt_count() || !irqs_disabled());
5236
5237 prev_state = exception_enter();
5238
5239 do {
5240 preempt_disable();
5241 local_irq_enable();
5242 __schedule(true);
5243 local_irq_disable();
5244 sched_preempt_enable_no_resched();
5245 } while (need_resched());
5246
5247 exception_exit(prev_state);
5248 }
5249
5250 int default_wake_function(wait_queue_entry_t *curr, unsigned mode,
5251 int wake_flags, void *key)
5252 {
5253 WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && (wake_flags & ~(WF_SYNC)));
5254 return try_to_wake_up(curr->private, mode, wake_flags);
5255 }
5256 EXPORT_SYMBOL(default_wake_function);
5257
5258 static void __setscheduler_prio(struct task_struct *p, int prio)
5259 {
5260 if (dl_prio(prio)) {
5261 p->sched_class = &dl_sched_class;
5262 } else if (rt_prio(prio)) {
5263 p->sched_class = &rt_sched_class;
5264 } else {
5265 p->sched_class = &fair_sched_class;
5266 }
5267
5268 p->prio = prio;
5269 }
5270
5271 #ifdef CONFIG_RT_MUTEXES
5272
5273 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5274 {
5275 if (pi_task) {
5276 prio = min(prio, pi_task->prio);
5277 }
5278
5279 return prio;
5280 }
5281
5282 static inline int rt_effective_prio(struct task_struct *p, int prio)
5283 {
5284 struct task_struct *pi_task = rt_mutex_get_top_task(p);
5285
5286 return __rt_effective_prio(pi_task, prio);
5287 }
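/*
 * Worked example (numbers only, to illustrate the boosting math): a
 * SCHED_NORMAL lock holder runs at prio 120 (nice 0). If its top waiter is
 * SCHED_FIFO with rt_priority 50, that waiter's prio is
 * MAX_RT_PRIO - 1 - 50 = 49, so min(120, 49) boosts the holder to 49 for as
 * long as the boost lasts. With no boosting waiter (pi_task == NULL) the
 * prio is returned unchanged.
 */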
5288
5289 /*
5290 * rt_mutex_setprio - set the current priority of a task
5291 * @p: task to boost
5292 * @pi_task: donor task
5293 *
5294 * This function changes the 'effective' priority of a task. It does
5295 * not touch ->normal_prio like __setscheduler().
5296 *
5297 * Used by the rt_mutex code to implement priority inheritance
5298 * logic. Call site only calls if the priority of the task changed.
5299 */
5300 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5301 {
5302 int prio, oldprio, queued, running,
5303 queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5304 const struct sched_class *prev_class;
5305 struct rq_flags rf;
5306 struct rq *rq;
5307
5308 /* XXX used to be waiter->prio, not waiter->task->prio */
5309 prio = __rt_effective_prio(pi_task, p->normal_prio);
5310 /*
5311 * If nothing changed; bail early.
5312 */
5313 if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) {
5314 return;
5315 }
5316
5317 rq = __task_rq_lock(p, &rf);
5318 update_rq_clock(rq);
5319 /*
5320 * Set under pi_lock && rq->lock, such that the value can be used under
5321 * either lock.
5322 *
5323 * Note that there is a lot of trickiness in making this pointer cache work
5324 * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
5325 * ensure a task is de-boosted (pi_task is set to NULL) before the
5326 * task is allowed to run again (and can exit). This ensures the pointer
5327 * points to a blocked task -- which guarantees the task is present.
5328 */
5329 p->pi_top_task = pi_task;
5330
5331 /*
5332 * For FIFO/RR we only need to set prio, if that matches we're done.
5333 */
5334 if (prio == p->prio && !dl_prio(prio)) {
5335 goto out_unlock;
5336 }
5337
5338 /*
5339 * Idle task boosting is a no-no in general. There is one
5340 * exception, when PREEMPT_RT and NOHZ is active:
5341 *
5342 * The idle task calls get_next_timer_interrupt() and holds
5343 * the timer wheel base->lock on the CPU and another CPU wants
5344 * to access the timer (probably to cancel it). We can safely
5345 * ignore the boosting request, as the idle CPU runs this code
5346 * with interrupts disabled and will complete the lock
5347 * protected section without being interrupted. So there is no
5348 * real need to boost.
5349 */
5350 if (unlikely(p == rq->idle)) {
5351 WARN_ON(p != rq->curr);
5352 WARN_ON(p->pi_blocked_on);
5353 goto out_unlock;
5354 }
5355
5356 trace_sched_pi_setprio(p, pi_task);
5357 oldprio = p->prio;
5358
5359 if (oldprio == prio) {
5360 queue_flag &= ~DEQUEUE_MOVE;
5361 }
5362
5363 prev_class = p->sched_class;
5364 queued = task_on_rq_queued(p);
5365 running = task_current(rq, p);
5366 if (queued) {
5367 dequeue_task(rq, p, queue_flag);
5368 }
5369 if (running) {
5370 put_prev_task(rq, p);
5371 }
5372
5373 /*
5374 * Boosting conditions are:
5375 * 1. -rt task is running and holds mutex A
5376 * --> -dl task blocks on mutex A
5377 *
5378 * 2. -dl task is running and holds mutex A
5379 * --> -dl task blocks on mutex A and could preempt the
5380 * running task
5381 */
5382 if (dl_prio(prio)) {
5383 if (!dl_prio(p->normal_prio) ||
5384 (pi_task && dl_prio(pi_task->prio) &&
5385 dl_entity_preempt(&pi_task->dl, &p->dl))) {
5386 p->dl.pi_se = pi_task->dl.pi_se;
5387 queue_flag |= ENQUEUE_REPLENISH;
5388 } else {
5389 p->dl.pi_se = &p->dl;
5390 }
5391 } else if (rt_prio(prio)) {
5392 if (dl_prio(oldprio)) {
5393 p->dl.pi_se = &p->dl;
5394 }
5395 if (oldprio < prio) {
5396 queue_flag |= ENQUEUE_HEAD;
5397 }
5398 } else {
5399 if (dl_prio(oldprio)) {
5400 p->dl.pi_se = &p->dl;
5401 }
5402 if (rt_prio(oldprio)) {
5403 p->rt.timeout = 0;
5404 }
5405 }
5406
5407 __setscheduler_prio(p, prio);
5408
5409 if (queued) {
5410 enqueue_task(rq, p, queue_flag);
5411 }
5412 if (running) {
5413 set_next_task(rq, p);
5414 }
5415
5416 check_class_changed(rq, p, prev_class, oldprio);
5417 out_unlock:
5418 /* Avoid rq from going away on us: */
5419 preempt_disable();
5420 __task_rq_unlock(rq, &rf);
5421
5422 balance_callback(rq);
5423 preempt_enable();
5424 }
5425 #else
5426 static inline int rt_effective_prio(struct task_struct *p, int prio)
5427 {
5428 return prio;
5429 }
5430 #endif
5431
5432 void set_user_nice(struct task_struct *p, long nice)
5433 {
5434 bool queued, running;
5435 int old_prio;
5436 struct rq_flags rf;
5437 struct rq *rq;
5438
5439 if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) {
5440 return;
5441 }
5442 /*
5443 * We have to be careful, if called from sys_setpriority(),
5444 * the task might be in the middle of scheduling on another CPU.
5445 */
5446 rq = task_rq_lock(p, &rf);
5447 update_rq_clock(rq);
5448
5449 /*
5450 * The RT priorities are set via sched_setscheduler(), but we still
5451 * allow the 'normal' nice value to be set - but as expected
5452 * it won't have any effect on scheduling as long as the task is
5453 * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
5454 */
5455 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
5456 p->static_prio = NICE_TO_PRIO(nice);
5457 goto out_unlock;
5458 }
5459 queued = task_on_rq_queued(p);
5460 running = task_current(rq, p);
5461 if (queued) {
5462 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
5463 }
5464 if (running) {
5465 put_prev_task(rq, p);
5466 }
5467
5468 p->static_prio = NICE_TO_PRIO(nice);
5469 set_load_weight(p);
5470 old_prio = p->prio;
5471 p->prio = effective_prio(p);
5472
5473 if (queued) {
5474 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5475 }
5476 if (running) {
5477 set_next_task(rq, p);
5478 }
5479
5480 /*
5481 * If the task increased its priority or is running and
5482 * lowered its priority, then reschedule its CPU:
5483 */
5484 p->sched_class->prio_changed(rq, p, old_prio);
5485
5486 out_unlock:
5487 task_rq_unlock(rq, p, &rf);
5488 }
5489 EXPORT_SYMBOL(set_user_nice);
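/*
 * Minimal usage sketch (illustrative; my_worker_fn is a made-up name):
 * kernel threads commonly adjust their own CFS weight right after creation,
 * e.g.
 *
 *	struct task_struct *tsk = kthread_run(my_worker_fn, NULL, "my_worker");
 *
 *	if (!IS_ERR(tsk))
 *		set_user_nice(tsk, 10);		// background-ish weight
 */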
5490
5491 /*
5492 * can_nice - check if a task can reduce its nice value
5493 * @p: task
5494 * @nice: nice value
5495 */
5496 int can_nice(const struct task_struct *p, const int nice)
5497 {
5498 /* Convert nice value [19,-20] to rlimit style value [1,40]: */
5499 int nice_rlim = nice_to_rlimit(nice);
5500
5501 return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE));
5502 }
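/*
 * For reference: nice_to_rlimit() maps nice 19 -> 1, nice 0 -> 20 and
 * nice -20 -> 40 (MAX_NICE - nice + 1), so e.g. lowering nice to -5
 * requires RLIMIT_NICE >= 25 unless the task has CAP_SYS_NICE.
 */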
5503
5504 #ifdef __ARCH_WANT_SYS_NICE
5505
5506 /*
5507 * sys_nice - change the priority of the current process.
5508 * @increment: priority increment
5509 *
5510 * sys_setpriority is a more generic, but much slower function that
5511 * does similar things.
5512 */
5513 SYSCALL_DEFINE1(nice, int, increment)
5514 {
5515 long nice, retval;
5516
5517 /*
5518 * Setpriority might change our priority at the same moment.
5519 * We don't have to worry. Conceptually one call occurs first
5520 * and we have a single winner.
5521 */
5522 increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5523 nice = task_nice(current) + increment;
5524
5525 nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5526 if (increment < 0 && !can_nice(current, nice)) {
5527 return -EPERM;
5528 }
5529
5530 retval = security_task_setnice(current, nice);
5531 if (retval) {
5532 return retval;
5533 }
5534
5535 set_user_nice(current, nice);
5536 return 0;
5537 }
5538
5539 #endif
5540
5541 /**
5542 * task_prio - return the priority value of a given task.
5543 * @p: the task in question.
5544 *
5545 * Return: The priority value as seen by users in /proc.
5546 * Normal tasks map to [0 ... 39] (nice -20 ... +19), RT tasks to
5547 * [-100 ... -1] and SCHED_DEADLINE tasks to -101.
5548 */
5549 int task_prio(const struct task_struct *p)
5550 {
5551 return p->prio - MAX_RT_PRIO;
5552 }
5553
5554 /**
5555 * idle_cpu - is a given CPU idle currently?
5556 * @cpu: the processor in question.
5557 *
5558 * Return: 1 if the CPU is currently idle. 0 otherwise.
5559 */
5560 int idle_cpu(int cpu)
5561 {
5562 struct rq *rq = cpu_rq(cpu);
5563
5564 if (rq->curr != rq->idle) {
5565 return 0;
5566 }
5567
5568 if (rq->nr_running) {
5569 return 0;
5570 }
5571
5572 #ifdef CONFIG_SMP
5573 if (rq->ttwu_pending) {
5574 return 0;
5575 }
5576 #endif
5577
5578 return 1;
5579 }
5580
5581 /**
5582 * available_idle_cpu - is a given CPU idle for enqueuing work.
5583 * @cpu: the CPU in question.
5584 *
5585 * Return: 1 if the CPU is currently idle. 0 otherwise.
5586 */
5587 int available_idle_cpu(int cpu)
5588 {
5589 if (!idle_cpu(cpu)) {
5590 return 0;
5591 }
5592
5593 if (vcpu_is_preempted(cpu)) {
5594 return 0;
5595 }
5596
5597 return 1;
5598 }
5599
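/*
 * Illustrative sketch (not a helper defined here): a placement heuristic
 * could scan a candidate mask for a CPU that is both idle and not
 * preempted, falling back to -1 when none qualifies:
 *
 *	static int pick_available_idle(const struct cpumask *mask)
 *	{
 *		int cpu;
 *
 *		for_each_cpu(cpu, mask) {
 *			if (available_idle_cpu(cpu))
 *				return cpu;
 *		}
 *		return -1;
 *	}
 */
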
5600 /**
5601 * idle_task - return the idle task for a given CPU.
5602 * @cpu: the processor in question.
5603 *
5604 * Return: The idle task for the CPU @cpu.
5605 */
5606 struct task_struct *idle_task(int cpu)
5607 {
5608 return cpu_rq(cpu)->idle;
5609 }
5610
5611 /**
5612 * find_process_by_pid - find a process with a matching PID value.
5613 * @pid: the pid in question.
5614 *
5615 * The task of @pid, if found. %NULL otherwise.
5616 */
5617 static struct task_struct *find_process_by_pid(pid_t pid)
5618 {
5619 return pid ? find_task_by_vpid(pid) : current;
5620 }
5621
5622 /*
5623 * sched_setparam() passes in -1 for its policy, to let the functions
5624 * it calls know not to change it.
5625 */
5626 #define SETPARAM_POLICY (-1)
5627
5628 static void __setscheduler_params(struct task_struct *p,
5629 const struct sched_attr *attr)
5630 {
5631 int policy = attr->sched_policy;
5632
5633 if (policy == SETPARAM_POLICY) {
5634 policy = p->policy;
5635 }
5636
5637 p->policy = policy;
5638
5639 if (dl_policy(policy)) {
5640 __setparam_dl(p, attr);
5641 } else if (fair_policy(policy)) {
5642 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5643 }
5644
5645 /*
5646 * __sched_setscheduler() ensures attr->sched_priority == 0 when
5647 * !rt_policy. Always setting this ensures that things like
5648 * getparam()/getattr() don't report silly values for !rt tasks.
5649 */
5650 p->rt_priority = attr->sched_priority;
5651 p->normal_prio = normal_prio(p);
5652 set_load_weight(p);
5653 }
5654
5655 /*
5656 * Check the target process has a UID that matches the current process's:
5657 */
5658 static bool check_same_owner(struct task_struct *p)
5659 {
5660 const struct cred *cred = current_cred(), *pcred;
5661 bool match;
5662
5663 rcu_read_lock();
5664 pcred = __task_cred(p);
5665 match = (uid_eq(cred->euid, pcred->euid) || uid_eq(cred->euid, pcred->uid));
5666 rcu_read_unlock();
5667 return match;
5668 }
5669
5670 static int __sched_setscheduler(struct task_struct *p,
5671 const struct sched_attr *attr, bool user,
5672 bool pi)
5673 {
5674 int oldpolicy = -1, policy = attr->sched_policy;
5675 int retval, oldprio, newprio, queued, running;
5676 const struct sched_class *prev_class;
5677 struct rq_flags rf;
5678 int reset_on_fork;
5679 int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5680 struct rq *rq;
5681
5682 /* The pi code expects interrupts enabled */
5683 BUG_ON(pi && in_interrupt());
5684 while (1) {
5685 /* Double check policy once rq lock held: */
5686 if (policy < 0) {
5687 reset_on_fork = p->sched_reset_on_fork;
5688 policy = oldpolicy = p->policy;
5689 } else {
5690 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
5691
5692 if (!valid_policy(policy)) {
5693 return -EINVAL;
5694 }
5695 }
5696
5697 if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) {
5698 return -EINVAL;
5699 }
5700
5701 /*
5702 * Valid priorities for SCHED_FIFO and SCHED_RR are
5703 * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5704 * SCHED_BATCH and SCHED_IDLE is 0.
5705 */
5706 if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) ||
5707 (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) {
5708 return -EINVAL;
5709 }
5710 if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
5711 (rt_policy(policy) != (attr->sched_priority != 0))) {
5712 return -EINVAL;
5713 }
5714
5715 /*
5716 * Allow unprivileged RT tasks to decrease priority:
5717 */
5718 if (user && !capable(CAP_SYS_NICE)) {
5719 if (fair_policy(policy)) {
5720 if (attr->sched_nice < task_nice(p) &&
5721 !can_nice(p, attr->sched_nice)) {
5722 return -EPERM;
5723 }
5724 }
5725
5726 if (rt_policy(policy)) {
5727 unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
5728 /* Can't set/change the rt policy: */
5729 if (policy != p->policy && !rlim_rtprio) {
5730 return -EPERM;
5731 }
5732
5733 /* Can't increase priority: */
5734 if (attr->sched_priority > p->rt_priority &&
5735 attr->sched_priority > rlim_rtprio) {
5736 return -EPERM;
5737 }
5738 }
5739
5740 /*
5741 * Can't set/change SCHED_DEADLINE policy at all for now
5742 * (safest behavior); in the future we would like to allow
5743 * unprivileged DL tasks to increase their relative deadline
5744 * or reduce their runtime (both ways reducing utilization)
5745 */
5746 if (dl_policy(policy)) {
5747 return -EPERM;
5748 }
5749
5750 /*
5751 * Treat SCHED_IDLE as nice 20. Only allow a switch to
5752 * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5753 */
5754 if (task_has_idle_policy(p) && !idle_policy(policy)) {
5755 if (!can_nice(p, task_nice(p))) {
5756 return -EPERM;
5757 }
5758 }
5759
5760 /* Can't change other user's priorities: */
5761 if (!check_same_owner(p)) {
5762 return -EPERM;
5763 }
5764
5765 /* Normal users shall not reset the sched_reset_on_fork flag: */
5766 if (p->sched_reset_on_fork && !reset_on_fork) {
5767 return -EPERM;
5768 }
5769 }
5770
5771 if (user) {
5772 if (attr->sched_flags & SCHED_FLAG_SUGOV) {
5773 return -EINVAL;
5774 }
5775
5776 retval = security_task_setscheduler(p);
5777 if (retval) {
5778 return retval;
5779 }
5780 }
5781
5782 /* Update task specific "requested" clamps */
5783 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
5784 retval = uclamp_validate(p, attr);
5785 if (retval) {
5786 return retval;
5787 }
5788 }
5789
5790 if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
5791 retval = latency_nice_validate(p, user, attr);
5792 if (retval) {
5793 return retval;
5794 }
5795 }
5796
5797 if (pi) {
5798 cpuset_read_lock();
5799 }
5800
5801 /*
5802 * Make sure no PI-waiters arrive (or leave) while we are
5803 * changing the priority of the task:
5804 *
5805 * To be able to change p->policy safely, the appropriate
5806 * runqueue lock must be held.
5807 */
5808 rq = task_rq_lock(p, &rf);
5809 update_rq_clock(rq);
5810
5811 /*
5812 * Changing the policy of the stop threads is a very bad idea:
5813 */
5814 if (p == rq->stop) {
5815 retval = -EINVAL;
5816 goto unlock;
5817 }
5818
5819 /*
5820 * If not changing anything there's no need to proceed further,
5821 * but store a possible modification of reset_on_fork.
5822 */
5823 if (unlikely(policy == p->policy)) {
5824 if (fair_policy(policy) && attr->sched_nice != task_nice(p)) {
5825 goto change;
5826 }
5827 if (rt_policy(policy) && attr->sched_priority != p->rt_priority) {
5828 goto change;
5829 }
5830 if (dl_policy(policy) && dl_param_changed(p, attr)) {
5831 goto change;
5832 }
5833 if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
5834 goto change;
5835 }
5836 #ifdef CONFIG_SCHED_LATENCY_NICE
5837 if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
5838 (attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))) {
5839 goto change;
5840 }
5841 #endif
5842
5843 p->sched_reset_on_fork = reset_on_fork;
5844 retval = 0;
5845 goto unlock;
5846 }
5847 change:
5848
5849 if (user) {
5850 #ifdef CONFIG_RT_GROUP_SCHED
5851 /*
5852 * Do not allow realtime tasks into groups that have no runtime
5853 * assigned.
5854 */
5855 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5856 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5857 !task_group_is_autogroup(task_group(p))) {
5858 retval = -EPERM;
5859 goto unlock;
5860 }
5861 #endif
5862 #ifdef CONFIG_SMP
5863 if (dl_bandwidth_enabled() && dl_policy(policy) &&
5864 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
5865 cpumask_t *span = rq->rd->span;
5866
5867 /*
5868 * Don't allow tasks with an affinity mask smaller than
5869 * the entire root_domain to become SCHED_DEADLINE. We
5870 * will also fail if there's no bandwidth available.
5871 */
5872 if (!cpumask_subset(span, p->cpus_ptr) ||
5873 rq->rd->dl_bw.bw == 0) {
5874 retval = -EPERM;
5875 goto unlock;
5876 }
5877 }
5878 #endif
5879 }
5880
5881 /* Re-check policy now with rq lock held: */
5882 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5883 policy = oldpolicy = -1;
5884 task_rq_unlock(rq, p, &rf);
5885 if (pi) {
5886 cpuset_read_unlock();
5887 }
5888 continue;
5889 }
5890 break;
5891 }
5892
5893 /*
5894 * If setscheduling to SCHED_DEADLINE (or changing the parameters
5895 * of a SCHED_DEADLINE task) we need to check if enough bandwidth
5896 * is available.
5897 */
5898 if ((dl_policy(policy) || dl_task(p)) &&
5899 sched_dl_overflow(p, policy, attr)) {
5900 retval = -EBUSY;
5901 goto unlock;
5902 }
5903
5904 p->sched_reset_on_fork = reset_on_fork;
5905 oldprio = p->prio;
5906
5907 newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
5908 if (pi) {
5909 /*
5910 * Take priority boosted tasks into account. If the new
5911 * effective priority is unchanged, we just store the new
5912 * normal parameters and do not touch the scheduler class and
5913 * the runqueue. This will be done when the task deboosts
5914 * itself.
5915 */
5916 newprio = rt_effective_prio(p, newprio);
5917 if (newprio == oldprio) {
5918 queue_flags &= ~DEQUEUE_MOVE;
5919 }
5920 }
5921
5922 queued = task_on_rq_queued(p);
5923 running = task_current(rq, p);
5924 if (queued) {
5925 dequeue_task(rq, p, queue_flags);
5926 }
5927 if (running) {
5928 put_prev_task(rq, p);
5929 }
5930
5931 prev_class = p->sched_class;
5932
5933 if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5934 __setscheduler_params(p, attr);
5935 __setscheduler_prio(p, newprio);
5936 }
5937 __setscheduler_latency(p, attr);
5938 __setscheduler_uclamp(p, attr);
5939
5940 if (queued) {
5941 /*
5942 * We enqueue to tail when the priority of a task is
5943 * increased (user space view).
5944 */
5945 if (oldprio < p->prio) {
5946 queue_flags |= ENQUEUE_HEAD;
5947 }
5948
5949 enqueue_task(rq, p, queue_flags);
5950 }
5951 if (running) {
5952 set_next_task(rq, p);
5953 }
5954
5955 check_class_changed(rq, p, prev_class, oldprio);
5956
5957 /* Prevent the rq from going away on us: */
5958 preempt_disable();
5959 task_rq_unlock(rq, p, &rf);
5960
5961 if (pi) {
5962 cpuset_read_unlock();
5963 rt_mutex_adjust_pi(p);
5964 }
5965
5966 /* Run balance callbacks after we've adjusted the PI chain: */
5967 balance_callback(rq);
5968 preempt_enable();
5969
5970 return 0;
5971
5972 unlock:
5973 task_rq_unlock(rq, p, &rf);
5974 if (pi) {
5975 cpuset_read_unlock();
5976 }
5977 return retval;
5978 }
5979
5980 static int _sched_setscheduler(struct task_struct *p, int policy,
5981 const struct sched_param *param, bool check)
5982 {
5983 struct sched_attr attr = {
5984 .sched_policy = policy,
5985 .sched_priority = param->sched_priority,
5986 .sched_nice = PRIO_TO_NICE(p->static_prio),
5987 };
5988
5989 /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
5990 if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
5991 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5992 policy &= ~SCHED_RESET_ON_FORK;
5993 attr.sched_policy = policy;
5994 }
5995
5996 return __sched_setscheduler(p, &attr, check, true);
5997 }
5998 /**
5999 * sched_setscheduler - change the scheduling policy and/or RT priority of a
6000 * thread.
6001 * @p: the task in question.
6002 * @policy: new policy.
6003 * @param: structure containing the new RT priority.
6004 *
6005 * Use sched_set_fifo(), read its comment.
6006 *
6007 * Return: 0 on success. An error code otherwise.
6008 *
6009 * NOTE that the task may be already dead.
6010 */
6011 int sched_setscheduler(struct task_struct *p, int policy,
6012 const struct sched_param *param)
6013 {
6014 return _sched_setscheduler(p, policy, param, true);
6015 }
6016 EXPORT_SYMBOL_GPL(sched_setscheduler);
6017
6018 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
6019 {
6020 return __sched_setscheduler(p, attr, true, true);
6021 }
6022 EXPORT_SYMBOL_GPL(sched_setattr);
6023
6024 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
6025 {
6026 return __sched_setscheduler(p, attr, false, true);
6027 }
6028 EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
6029
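/*
 * Example (illustrative sketch): an in-kernel caller can hand a task a
 * SCHED_DEADLINE reservation through sched_setattr_nocheck(); "p" stands
 * for the target task, the 10ms/100ms figures are arbitrary and all times
 * are in nanoseconds:
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_DEADLINE,
 *		.sched_runtime	= 10 * NSEC_PER_MSEC,
 *		.sched_deadline	= 100 * NSEC_PER_MSEC,
 *		.sched_period	= 100 * NSEC_PER_MSEC,
 *	};
 *
 *	WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
 */
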
6030 /**
6031 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
6032 * of a thread from kernelspace.
6033 * @p: the task in question.
6034 * @policy: new policy.
6035 * @param: structure containing the new RT priority.
6036 *
6037 * Just like sched_setscheduler, only don't bother checking if the
6038 * current context has permission. For example, this is needed in
6039 * stop_machine(): we create temporary high priority worker threads,
6040 * but our caller might not have that capability.
6041 *
6042 * Return: 0 on success. An error code otherwise.
6043 */
6044 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6045 const struct sched_param *param)
6046 {
6047 return _sched_setscheduler(p, policy, param, false);
6048 }
6049 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
6050
6051 /*
6052 * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6053 * incapable of resource management, which is the one thing an OS really should
6054 * be doing.
6055 *
6056 * This is of course the reason it is limited to privileged users only.
6057 *
6058 * Worse still; it is fundamentally impossible to compose static priority
6059 * workloads. You cannot take two correctly working static prio workloads
6060 * and smash them together and still expect them to work.
6061 *
6062 * For this reason 'all' FIFO tasks the kernel creates are basically at:
6063 *
6064 * MAX_RT_PRIO / 2
6065 *
6066 * The administrator _MUST_ configure the system, the kernel simply doesn't
6067 * know enough information to make a sensible choice.
6068 */
6069 void sched_set_fifo(struct task_struct *p)
6070 {
6071 struct sched_param sp = {.sched_priority = MAX_RT_PRIO / 2};
6072 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6073 }
6074 EXPORT_SYMBOL_GPL(sched_set_fifo);
6075
6076 /*
6077 * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6078 */
6079 void sched_set_fifo_low(struct task_struct *p)
6080 {
6081 struct sched_param sp = {.sched_priority = 1};
6082 WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6083 }
6084 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6085
6086 void sched_set_normal(struct task_struct *p, int nice)
6087 {
6088 struct sched_attr attr = {
6089 .sched_policy = SCHED_NORMAL,
6090 .sched_nice = nice,
6091 };
6092 WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6093 }
6094 EXPORT_SYMBOL_GPL(sched_set_normal);
6095
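/*
 * Example (illustrative sketch): a driver that needs a kthread above
 * SCHED_NORMAL but has no meaningful FIFO priority of its own should use
 * the helpers above instead of inventing one; "worker" stands for a
 * hypothetical task pointer:
 *
 *	sched_set_fifo(worker);
 *	...
 *	sched_set_normal(worker, 0);
 *
 * leaving the actual priority policy to the administrator, as the comment
 * above demands.
 */
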
6096 static int do_sched_setscheduler(pid_t pid, int policy,
6097 struct sched_param __user *param)
6098 {
6099 struct sched_param lparam;
6100 struct task_struct *p;
6101 int retval;
6102
6103 if (!param || pid < 0) {
6104 return -EINVAL;
6105 }
6106 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) {
6107 return -EFAULT;
6108 }
6109
6110 rcu_read_lock();
6111 retval = -ESRCH;
6112 p = find_process_by_pid(pid);
6113 if (likely(p)) {
6114 get_task_struct(p);
6115 }
6116 rcu_read_unlock();
6117
6118 if (likely(p)) {
6119 retval = sched_setscheduler(p, policy, &lparam);
6120 put_task_struct(p);
6121 }
6122
6123 return retval;
6124 }
6125
6126 /*
6127 * Mimics kernel/events/core.c perf_copy_attr().
6128 */
6129 static int sched_copy_attr(struct sched_attr __user *uattr,
6130 struct sched_attr *attr)
6131 {
6132 u32 size;
6133 int ret;
6134
6135 /* Zero the full structure, so that a short copy will be nice: */
6136 memset(attr, 0, sizeof(*attr));
6137
6138 ret = get_user(size, &uattr->size);
6139 if (ret) {
6140 return ret;
6141 }
6142
6143 /* ABI compatibility quirk: */
6144 if (!size) {
6145 size = SCHED_ATTR_SIZE_VER0;
6146 }
6147 if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) {
6148 goto err_size;
6149 }
6150
6151 ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6152 if (ret) {
6153 if (ret == -E2BIG) {
6154 goto err_size;
6155 }
6156 return ret;
6157 }
6158
6159 if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
6160 size < SCHED_ATTR_SIZE_VER1) {
6161 return -EINVAL;
6162 }
6163
6164 #ifdef CONFIG_SCHED_LATENCY_NICE
6165 if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
6166 size < SCHED_ATTR_SIZE_VER2) {
6167 return -EINVAL;
6168 }
6169 #endif
6170 /*
6171 * XXX: Do we want to be lenient like existing syscalls; or do we want
6172 * to be strict and return an error on out-of-bounds values?
6173 */
6174 attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
6175
6176 return 0;
6177
6178 err_size:
6179 put_user(sizeof(*attr), &uattr->size);
6180 return -E2BIG;
6181 }
6182
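/*
 * User-space view of the ABI handled above (illustrative only): the caller
 * fills in struct sched_attr, sets .size to the size it was compiled
 * against and invokes the raw syscall, since C libraries generally do not
 * wrap sched_setattr():
 *
 *	struct sched_attr attr = {
 *		.size		= sizeof(attr),
 *		.sched_policy	= SCHED_FIFO,
 *		.sched_priority	= 10,
 *	};
 *
 *	syscall(SYS_sched_setattr, pid, &attr, 0);
 *
 * A binary built against an older, smaller struct still works because
 * sched_copy_attr() zero-fills the fields it does not receive.
 */
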
6183 /**
6184 * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6185 * @pid: the pid in question.
6186 * @policy: new policy.
6187 * @param: structure containing the new RT priority.
6188 *
6189 * Return: 0 on success. An error code otherwise.
6190 */
6191 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6192 struct sched_param __user *, param)
6193 {
6194 if (policy < 0) {
6195 return -EINVAL;
6196 }
6197
6198 return do_sched_setscheduler(pid, policy, param);
6199 }
6200
6201 /**
6202 * sys_sched_setparam - set/change the RT priority of a thread
6203 * @pid: the pid in question.
6204 * @param: structure containing the new RT priority.
6205 *
6206 * Return: 0 on success. An error code otherwise.
6207 */
6208 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6209 {
6210 return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6211 }
6212
6213 /**
6214 * sys_sched_setattr - same as above, but with extended sched_attr
6215 * @pid: the pid in question.
6216 * @uattr: structure containing the extended parameters.
6217 * @flags: for future extension.
6218 */
6219 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6220 unsigned int, flags)
6221 {
6222 struct sched_attr attr;
6223 struct task_struct *p;
6224 int retval;
6225
6226 if (!uattr || pid < 0 || flags) {
6227 return -EINVAL;
6228 }
6229
6230 retval = sched_copy_attr(uattr, &attr);
6231 if (retval) {
6232 return retval;
6233 }
6234
6235 if ((int)attr.sched_policy < 0) {
6236 return -EINVAL;
6237 }
6238 if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) {
6239 attr.sched_policy = SETPARAM_POLICY;
6240 }
6241
6242 rcu_read_lock();
6243 retval = -ESRCH;
6244 p = find_process_by_pid(pid);
6245 if (likely(p)) {
6246 get_task_struct(p);
6247 }
6248 rcu_read_unlock();
6249
6250 if (likely(p)) {
6251 retval = sched_setattr(p, &attr);
6252 put_task_struct(p);
6253 }
6254
6255 return retval;
6256 }
6257
6258 /**
6259 * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6260 * @pid: the pid in question.
6261 *
6262 * Return: On success, the policy of the thread. Otherwise, a negative error
6263 * code.
6264 */
6265 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6266 {
6267 struct task_struct *p;
6268 int retval;
6269
6270 if (pid < 0) {
6271 return -EINVAL;
6272 }
6273
6274 retval = -ESRCH;
6275 rcu_read_lock();
6276 p = find_process_by_pid(pid);
6277 if (p) {
6278 retval = security_task_getscheduler(p);
6279 if (!retval) {
6280 retval =
6281 p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6282 }
6283 }
6284 rcu_read_unlock();
6285 return retval;
6286 }
6287
6288 /**
6289 * sys_sched_getparam - get the RT priority of a thread
6290 * @pid: the pid in question.
6291 * @param: structure containing the RT priority.
6292 *
6293 * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
6294 * code.
6295 */
6296 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6297 {
6298 struct sched_param lp = {.sched_priority = 0};
6299 struct task_struct *p;
6300 int retval;
6301
6302 if (!param || pid < 0) {
6303 return -EINVAL;
6304 }
6305
6306 rcu_read_lock();
6307 p = find_process_by_pid(pid);
6308 retval = -ESRCH;
6309 if (!p) {
6310 goto out_unlock;
6311 }
6312
6313 retval = security_task_getscheduler(p);
6314 if (retval) {
6315 goto out_unlock;
6316 }
6317
6318 if (task_has_rt_policy(p)) {
6319 lp.sched_priority = p->rt_priority;
6320 }
6321 rcu_read_unlock();
6322
6323 /*
6324 * This one might sleep, we cannot do it with a spinlock held ...
6325 */
6326 retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6327
6328 return retval;
6329
6330 out_unlock:
6331 rcu_read_unlock();
6332 return retval;
6333 }
6334
6335 /*
6336 * Copy the kernel-sized attribute structure (which might be larger
6337 * than what user-space knows about) to user-space.
6338 *
6339 * Note that all cases are valid: user-space buffer can be larger or
6340 * smaller than the kernel-space buffer. The usual case is that both
6341 * have the same size.
6342 */
6343 static int sched_attr_copy_to_user(struct sched_attr __user *uattr,
6344 struct sched_attr *kattr, unsigned int usize)
6345 {
6346 unsigned int ksize = sizeof(*kattr);
6347
6348 if (!access_ok(uattr, usize)) {
6349 return -EFAULT;
6350 }
6351
6352 /*
6353 * sched_getattr() ABI forwards and backwards compatibility:
6354 *
6355 * If usize == ksize then we just copy everything to user-space and all is
6356 * good.
6357 *
6358 * If usize < ksize then we only copy as much as user-space has space for,
6359 * this keeps ABI compatibility as well. We skip the rest.
6360 *
6361 * If usize > ksize then user-space is using a newer version of the ABI,
6362 * parts of which the kernel doesn't know about. Just ignore them - tooling can
6363 * detect the kernel's knowledge of attributes from the attr->size value
6364 * which is set to ksize in this case.
6365 */
6366 kattr->size = min(usize, ksize);
6367
6368 if (copy_to_user(uattr, kattr, kattr->size)) {
6369 return -EFAULT;
6370 }
6371
6372 return 0;
6373 }
6374
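/*
 * Example of the compatibility rules above (illustrative only): an older
 * binary that only knows SCHED_ATTR_SIZE_VER0 passes that size and still
 * gets a coherent answer:
 *
 *	struct sched_attr attr;
 *
 *	syscall(SYS_sched_getattr, pid, &attr, SCHED_ATTR_SIZE_VER0, 0);
 *
 * after which attr.size reports how many bytes the kernel actually filled
 * in, i.e. min(SCHED_ATTR_SIZE_VER0, the kernel's sizeof(struct sched_attr)).
 */
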
6375 /**
6376 * sys_sched_getattr - similar to sched_getparam, but with sched_attr
6377 * @pid: the pid in question.
6378 * @uattr: structure containing the extended parameters.
6379 * @usize: sizeof(attr) for fwd/bwd comp.
6380 * @flags: for future extension.
6381 */
6382 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6383 unsigned int, usize, unsigned int, flags)
6384 {
6385 struct sched_attr kattr = {};
6386 struct task_struct *p;
6387 int retval;
6388
6389 if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6390 usize < SCHED_ATTR_SIZE_VER0 || flags) {
6391 return -EINVAL;
6392 }
6393
6394 rcu_read_lock();
6395 p = find_process_by_pid(pid);
6396 retval = -ESRCH;
6397 if (!p) {
6398 goto out_unlock;
6399 }
6400
6401 retval = security_task_getscheduler(p);
6402 if (retval) {
6403 goto out_unlock;
6404 }
6405
6406 kattr.sched_policy = p->policy;
6407 if (p->sched_reset_on_fork) {
6408 kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6409 }
6410 if (task_has_dl_policy(p)) {
6411 __getparam_dl(p, &kattr);
6412 } else if (task_has_rt_policy(p)) {
6413 kattr.sched_priority = p->rt_priority;
6414 } else {
6415 kattr.sched_nice = task_nice(p);
6416 }
6417
6418 #ifdef CONFIG_SCHED_LATENCY_NICE
6419 kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
6420 #endif
6421
6422 #ifdef CONFIG_UCLAMP_TASK
6423 /*
6424 * This could race with another potential updater, but this is fine
6425 * because it'll correctly read the old or the new value. We don't need
6426 * to guarantee who wins the race as long as it doesn't return garbage.
6427 */
6428 kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6429 kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6430 #endif
6431
6432 rcu_read_unlock();
6433
6434 return sched_attr_copy_to_user(uattr, &kattr, usize);
6435
6436 out_unlock:
6437 rcu_read_unlock();
6438 return retval;
6439 }
6440
6441 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6442 {
6443 cpumask_var_t cpus_allowed, new_mask;
6444 struct task_struct *p;
6445 int retval;
6446 #ifdef CONFIG_CPU_ISOLATION_OPT
6447 int dest_cpu;
6448 cpumask_t allowed_mask;
6449 #endif
6450
6451 rcu_read_lock();
6452
6453 p = find_process_by_pid(pid);
6454 if (!p) {
6455 rcu_read_unlock();
6456 return -ESRCH;
6457 }
6458
6459 /* Prevent p going away */
6460 get_task_struct(p);
6461 rcu_read_unlock();
6462
6463 if (p->flags & PF_NO_SETAFFINITY) {
6464 retval = -EINVAL;
6465 goto out_put_task;
6466 }
6467 if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6468 retval = -ENOMEM;
6469 goto out_put_task;
6470 }
6471 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6472 retval = -ENOMEM;
6473 goto out_free_cpus_allowed;
6474 }
6475 retval = -EPERM;
6476 if (!check_same_owner(p)) {
6477 rcu_read_lock();
6478 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6479 rcu_read_unlock();
6480 goto out_free_new_mask;
6481 }
6482 rcu_read_unlock();
6483 }
6484
6485 retval = security_task_setscheduler(p);
6486 if (retval) {
6487 goto out_free_new_mask;
6488 }
6489
6490 cpuset_cpus_allowed(p, cpus_allowed);
6491 cpumask_and(new_mask, in_mask, cpus_allowed);
6492
6493 /*
6494 * Since bandwidth control happens on root_domain basis,
6495 * if admission test is enabled, we only admit -deadline
6496 * tasks allowed to run on all the CPUs in the task's
6497 * root_domain.
6498 */
6499 #ifdef CONFIG_SMP
6500 if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
6501 rcu_read_lock();
6502 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
6503 retval = -EBUSY;
6504 rcu_read_unlock();
6505 goto out_free_new_mask;
6506 }
6507 rcu_read_unlock();
6508 }
6509 #endif
6510 while (1) {
6511 #ifdef CONFIG_CPU_ISOLATION_OPT
6512 cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
6513 dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
6514 if (dest_cpu < nr_cpu_ids) {
6515 #endif
6516 retval = __set_cpus_allowed_ptr(p, new_mask, true);
6517 if (!retval) {
6518 cpuset_cpus_allowed(p, cpus_allowed);
6519 if (!cpumask_subset(new_mask, cpus_allowed)) {
6520 /*
6521 * We must have raced with a concurrent cpuset
6522 * update. Just reset the cpus_allowed to the
6523 * cpuset's cpus_allowed
6524 */
6525 cpumask_copy(new_mask, cpus_allowed);
6526 continue;
6527 }
6528 }
6529 #ifdef CONFIG_CPU_ISOLATION_OPT
6530 } else {
6531 retval = -EINVAL;
6532 }
6533 #endif
6534 break;
6535 }
6536
6537 out_free_new_mask:
6538 free_cpumask_var(new_mask);
6539 out_free_cpus_allowed:
6540 free_cpumask_var(cpus_allowed);
6541 out_put_task:
6542 put_task_struct(p);
6543 return retval;
6544 }
6545
6546 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6547 struct cpumask *new_mask)
6548 {
6549 if (len < cpumask_size()) {
6550 cpumask_clear(new_mask);
6551 } else if (len > cpumask_size()) {
6552 len = cpumask_size();
6553 }
6554
6555 return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6556 }
6557
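/*
 * User-space side of the syscall below (illustrative only), pinning the
 * calling thread to CPU 2:
 *
 *	cpu_set_t set;
 *
 *	CPU_ZERO(&set);
 *	CPU_SET(2, &set);
 *	sched_setaffinity(0, sizeof(set), &set);
 *
 * Passing a length shorter than the kernel's cpumask simply leaves the
 * upper bits clear, as get_user_cpu_mask() above shows.
 */
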
6558 /**
6559 * sys_sched_setaffinity - set the CPU affinity of a process
6560 * @pid: pid of the process
6561 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6562 * @user_mask_ptr: user-space pointer to the new CPU mask
6563 *
6564 * Return: 0 on success. An error code otherwise.
6565 */
6566 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6567 unsigned long __user *, user_mask_ptr)
6568 {
6569 cpumask_var_t new_mask;
6570 int retval;
6571
6572 if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6573 return -ENOMEM;
6574 }
6575
6576 retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6577 if (retval == 0) {
6578 retval = sched_setaffinity(pid, new_mask);
6579 }
6580 free_cpumask_var(new_mask);
6581 return retval;
6582 }
6583
6584 long sched_getaffinity(pid_t pid, struct cpumask *mask)
6585 {
6586 struct task_struct *p;
6587 unsigned long flags;
6588 int retval;
6589
6590 rcu_read_lock();
6591
6592 retval = -ESRCH;
6593 p = find_process_by_pid(pid);
6594 if (!p) {
6595 goto out_unlock;
6596 }
6597
6598 retval = security_task_getscheduler(p);
6599 if (retval) {
6600 goto out_unlock;
6601 }
6602
6603 raw_spin_lock_irqsave(&p->pi_lock, flags);
6604 cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6605
6606 #ifdef CONFIG_CPU_ISOLATION_OPT
6607 /* Userspace tasks are forbidden to run on
6608 * isolated CPUs, so exclude isolated CPUs from
6609 * the reported affinity mask.
6610 */
6611 if (!(p->flags & PF_KTHREAD)) {
6612 cpumask_andnot(mask, mask, cpu_isolated_mask);
6613 }
6614 #endif
6615
6616 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6617
6618 out_unlock:
6619 rcu_read_unlock();
6620
6621 return retval;
6622 }
6623
6624 /**
6625 * sys_sched_getaffinity - get the CPU affinity of a process
6626 * @pid: pid of the process
6627 * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6628 * @user_mask_ptr: user-space pointer to hold the current CPU mask
6629 *
6630 * Return: size of CPU mask copied to user_mask_ptr on success. An
6631 * error code otherwise.
6632 */
6633 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6634 unsigned long __user *, user_mask_ptr)
6635 {
6636 int ret;
6637 cpumask_var_t mask;
6638
6639 if ((len * BITS_PER_BYTE) < nr_cpu_ids) {
6640 return -EINVAL;
6641 }
6642 if (len & (sizeof(unsigned long) - 1)) {
6643 return -EINVAL;
6644 }
6645
6646 if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
6647 return -ENOMEM;
6648 }
6649
6650 ret = sched_getaffinity(pid, mask);
6651 if (ret == 0) {
6652 unsigned int retlen = min(len, cpumask_size());
6653
6654 if (copy_to_user(user_mask_ptr, mask, retlen)) {
6655 ret = -EFAULT;
6656 } else {
6657 ret = retlen;
6658 }
6659 }
6660 free_cpumask_var(mask);
6661
6662 return ret;
6663 }
6664
6665 /**
6666 * sys_sched_yield - yield the current processor to other threads.
6667 *
6668 * This function yields the current CPU to other tasks. If there are no
6669 * other threads running on this CPU then this function will return.
6670 *
6671 * Return: 0.
6672 */
6673 static void do_sched_yield(void)
6674 {
6675 struct rq_flags rf;
6676 struct rq *rq;
6677
6678 rq = this_rq_lock_irq(&rf);
6679
6680 schedstat_inc(rq->yld_count);
6681 current->sched_class->yield_task(rq);
6682
6683 preempt_disable();
6684 rq_unlock_irq(rq, &rf);
6685 sched_preempt_enable_no_resched();
6686
6687 schedule();
6688 }
6689
6690 SYSCALL_DEFINE0(sched_yield)
6691 {
6692 do_sched_yield();
6693 return 0;
6694 }
6695
6696 #ifndef CONFIG_PREEMPTION
6697 int __sched _cond_resched(void)
6698 {
6699 if (should_resched(0)) {
6700 preempt_schedule_common();
6701 return 1;
6702 }
6703 rcu_all_qs();
6704 return 0;
6705 }
6706 EXPORT_SYMBOL(_cond_resched);
6707 #endif
6708
6709 /*
6710 * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6711 * call schedule, and on return reacquire the lock.
6712 *
6713 * This works OK both with and without CONFIG_PREEMPTION. We do strange
6714 * low-level operations here to prevent schedule() from being called twice (once
6715 * via spin_unlock(), once by hand).
6716 */
6717 int __cond_resched_lock(spinlock_t *lock)
6718 {
6719 int resched = should_resched(PREEMPT_LOCK_OFFSET);
6720 int ret = 0;
6721
6722 lockdep_assert_held(lock);
6723
6724 if (spin_needbreak(lock) || resched) {
6725 spin_unlock(lock);
6726 if (resched) {
6727 preempt_schedule_common();
6728 } else {
6729 cpu_relax();
6730 }
6731 ret = 1;
6732 spin_lock(lock);
6733 }
6734 return ret;
6735 }
6736 EXPORT_SYMBOL(__cond_resched_lock);
6737
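/*
 * Typical usage pattern (illustrative sketch; "lock", "queue" and the
 * helpers are hypothetical): a long scan under a spinlock can yield the
 * CPU and the lock between items, as long as the caller can cope with the
 * lock having been dropped and re-taken:
 *
 *	spin_lock(&lock);
 *	while (more_work(&queue)) {
 *		process_one(&queue);
 *		cond_resched_lock(&lock);
 *	}
 *	spin_unlock(&lock);
 */
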
6738 /**
6739 * yield - yield the current processor to other threads.
6740 *
6741 * Do not ever use this function, there's a 99% chance you're doing it wrong.
6742 *
6743 * The scheduler is at all times free to pick the calling task as the most
6744 * eligible task to run; if removing the yield() call from your code breaks
6745 * it, it's already broken.
6746 *
6747 * Typical broken usage is:
6748 *
6749 * while (!event)
6750 * yield();
6751 *
6752 * where one assumes that yield() will let 'the other' process run that will
6753 * make event true. If the current task is a SCHED_FIFO task that will never
6754 * happen. Never use yield() as a progress guarantee!!
6755 *
6756 * If you want to use yield() to wait for something, use wait_event().
6757 * If you want to use yield() to be 'nice' for others, use cond_resched().
6758 * If you still want to use yield(), do not!
6759 */
6760 void __sched yield(void)
6761 {
6762 set_current_state(TASK_RUNNING);
6763 do_sched_yield();
6764 }
6765 EXPORT_SYMBOL(yield);
6766
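/*
 * The broken pattern from the comment above, rewritten with a waitqueue
 * ("wq" and "event" are hypothetical):
 *
 *	wait_event(wq, event);
 *
 * with the waker doing:
 *
 *	event = true;
 *	wake_up(&wq);
 *
 * which makes forward progress independent of scheduling policy.
 */
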
6767 /**
6768 * yield_to - yield the current processor to another thread in
6769 * your thread group, or accelerate that thread toward the
6770 * processor it's on.
6771 * @p: target task
6772 * @preempt: whether task preemption is allowed or not
6773 *
6774 * It's the caller's job to ensure that the target task struct
6775 * can't go away on us before we can do any checks.
6776 *
6777 * Return:
6778 * true (>0) if we indeed boosted the target task.
6779 * false (0) if we failed to boost the target.
6780 * -ESRCH if there's no task to yield to.
6781 */
6782 int __sched yield_to(struct task_struct *p, bool preempt)
6783 {
6784 struct task_struct *curr = current;
6785 struct rq *rq, *p_rq;
6786 unsigned long flags;
6787 int yielded = 0;
6788
6789 local_irq_save(flags);
6790 rq = this_rq();
6791
6792 again:
6793 p_rq = task_rq(p);
6794 /*
6795 * If we're the only runnable task on the rq and target rq also
6796 * has only one task, there's absolutely no point in yielding.
6797 */
6798 if (rq->nr_running == 1 && p_rq->nr_running == 1) {
6799 yielded = -ESRCH;
6800 goto out_irq;
6801 }
6802
6803 double_rq_lock(rq, p_rq);
6804 if (task_rq(p) != p_rq) {
6805 double_rq_unlock(rq, p_rq);
6806 goto again;
6807 }
6808
6809 if (!curr->sched_class->yield_to_task) {
6810 goto out_unlock;
6811 }
6812
6813 if (curr->sched_class != p->sched_class) {
6814 goto out_unlock;
6815 }
6816
6817 if (task_running(p_rq, p) || p->state) {
6818 goto out_unlock;
6819 }
6820
6821 yielded = curr->sched_class->yield_to_task(rq, p);
6822 if (yielded) {
6823 schedstat_inc(rq->yld_count);
6824 /*
6825 * Make p's CPU reschedule; pick_next_entity takes care of
6826 * fairness.
6827 */
6828 if (preempt && rq != p_rq) {
6829 resched_curr(p_rq);
6830 }
6831 }
6832
6833 out_unlock:
6834 double_rq_unlock(rq, p_rq);
6835 out_irq:
6836 local_irq_restore(flags);
6837
6838 if (yielded > 0) {
6839 schedule();
6840 }
6841
6842 return yielded;
6843 }
6844 EXPORT_SYMBOL_GPL(yield_to);
6845
6846 int io_schedule_prepare(void)
6847 {
6848 int old_iowait = current->in_iowait;
6849
6850 current->in_iowait = 1;
6851 blk_schedule_flush_plug(current);
6852
6853 return old_iowait;
6854 }
6855
6856 void io_schedule_finish(int token)
6857 {
6858 current->in_iowait = token;
6859 }
6860
6861 /*
6862 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6863 * that process accounting knows that this is a task in IO wait state.
6864 */
6865 long __sched io_schedule_timeout(long timeout)
6866 {
6867 int token;
6868 long ret;
6869
6870 token = io_schedule_prepare();
6871 ret = schedule_timeout(timeout);
6872 io_schedule_finish(token);
6873
6874 return ret;
6875 }
6876 EXPORT_SYMBOL(io_schedule_timeout);
6877
6878 void __sched io_schedule(void)
6879 {
6880 int token;
6881
6882 token = io_schedule_prepare();
6883 schedule();
6884 io_schedule_finish(token);
6885 }
6886 EXPORT_SYMBOL(io_schedule);
6887
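/*
 * Illustrative sketch: a caller that blocks on its own primitive but still
 * wants to be accounted as iowait can bracket the sleep with the
 * prepare/finish pair, mirroring io_schedule_timeout() above
 * ("my_io_done" is a hypothetical completion):
 *
 *	int tok = io_schedule_prepare();
 *
 *	wait_for_completion(&my_io_done);
 *	io_schedule_finish(tok);
 */
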
6888 /**
6889 * sys_sched_get_priority_max - return maximum RT priority.
6890 * @policy: scheduling class.
6891 *
6892 * Return: On success, this syscall returns the maximum
6893 * rt_priority that can be used by a given scheduling class.
6894 * On failure, a negative error code is returned.
6895 */
6896 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6897 {
6898 int ret = -EINVAL;
6899
6900 switch (policy) {
6901 case SCHED_FIFO:
6902 case SCHED_RR:
6903 ret = MAX_USER_RT_PRIO - 1;
6904 break;
6905 case SCHED_DEADLINE:
6906 case SCHED_NORMAL:
6907 case SCHED_BATCH:
6908 case SCHED_IDLE:
6909 ret = 0;
6910 break;
6911 }
6912 return ret;
6913 }
6914
6915 /**
6916 * sys_sched_get_priority_min - return minimum RT priority.
6917 * @policy: scheduling class.
6918 *
6919 * Return: On success, this syscall returns the minimum
6920 * rt_priority that can be used by a given scheduling class.
6921 * On failure, a negative error code is returned.
6922 */
6923 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6924 {
6925 int ret = -EINVAL;
6926
6927 switch (policy) {
6928 case SCHED_FIFO:
6929 case SCHED_RR:
6930 ret = 1;
6931 break;
6932 case SCHED_DEADLINE:
6933 case SCHED_NORMAL:
6934 case SCHED_BATCH:
6935 case SCHED_IDLE:
6936 ret = 0;
6937 }
6938 return ret;
6939 }
6940
6941 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
6942 {
6943 struct task_struct *p;
6944 unsigned int time_slice;
6945 struct rq_flags rf;
6946 struct rq *rq;
6947 int retval;
6948
6949 if (pid < 0) {
6950 return -EINVAL;
6951 }
6952
6953 retval = -ESRCH;
6954 rcu_read_lock();
6955 p = find_process_by_pid(pid);
6956 if (!p) {
6957 goto out_unlock;
6958 }
6959
6960 retval = security_task_getscheduler(p);
6961 if (retval) {
6962 goto out_unlock;
6963 }
6964
6965 rq = task_rq_lock(p, &rf);
6966 time_slice = 0;
6967 if (p->sched_class->get_rr_interval) {
6968 time_slice = p->sched_class->get_rr_interval(rq, p);
6969 }
6970 task_rq_unlock(rq, p, &rf);
6971
6972 rcu_read_unlock();
6973 jiffies_to_timespec64(time_slice, t);
6974 return 0;
6975
6976 out_unlock:
6977 rcu_read_unlock();
6978 return retval;
6979 }
6980
6981 /**
6982 * sys_sched_rr_get_interval - return the default timeslice of a process.
6983 * @pid: pid of the process.
6984 * @interval: userspace pointer to the timeslice value.
6985 *
6986 * this syscall writes the default timeslice value of a given process
6987 * into the user-space timespec buffer. A value of '0' means infinity.
6988 *
6989 * Return: On success, 0 and the timeslice is in @interval. Otherwise,
6990 * an error code.
6991 */
6992 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6993 struct __kernel_timespec __user *, interval)
6994 {
6995 struct timespec64 t;
6996 int retval = sched_rr_get_interval(pid, &t);
6997
6998 if (retval == 0) {
6999 retval = put_timespec64(&t, interval);
7000 }
7001
7002 return retval;
7003 }
7004
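/*
 * User-space view (illustrative only): querying the round-robin timeslice
 * of the calling thread:
 *
 *	struct timespec ts;
 *
 *	sched_rr_get_interval(0, &ts);
 *
 * where, per the comment above, a result of zero means the timeslice is
 * effectively unlimited.
 */
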
7005 #ifdef CONFIG_COMPAT_32BIT_TIME
7006 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7007 struct old_timespec32 __user *, interval)
7008 {
7009 struct timespec64 t;
7010 int retval = sched_rr_get_interval(pid, &t);
7011
7012 if (retval == 0) {
7013 retval = put_old_timespec32(&t, interval);
7014 }
7015 return retval;
7016 }
7017 #endif
7018
7019 void sched_show_task(struct task_struct *p)
7020 {
7021 unsigned long free = 0;
7022 int ppid;
7023
7024 if (!try_get_task_stack(p)) {
7025 return;
7026 }
7027
7028 pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7029
7030 if (p->state == TASK_RUNNING) {
7031 pr_cont(" running task ");
7032 }
7033 #ifdef CONFIG_DEBUG_STACK_USAGE
7034 free = stack_not_used(p);
7035 #endif
7036 ppid = 0;
7037 rcu_read_lock();
7038 if (pid_alive(p)) {
7039 ppid = task_pid_nr(rcu_dereference(p->real_parent));
7040 }
7041 rcu_read_unlock();
7042 pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", free,
7043 task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags);
7044
7045 print_worker_info(KERN_INFO, p);
7046 show_stack(p, NULL, KERN_INFO);
7047 put_task_stack(p);
7048 }
7049 EXPORT_SYMBOL_GPL(sched_show_task);
7050
7051 static inline bool state_filter_match(unsigned long state_filter,
7052 struct task_struct *p)
7053 {
7054 /* no filter, everything matches */
7055 if (!state_filter) {
7056 return true;
7057 }
7058
7059 /* filter, but doesn't match */
7060 if (!(p->state & state_filter)) {
7061 return false;
7062 }
7063
7064 /*
7065 * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
7066 * TASK_KILLABLE).
7067 */
7068 if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) {
7069 return false;
7070 }
7071
7072 return true;
7073 }
7074
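/*
 * Example: the sysrq 'w' handler dumps only blocked tasks via
 *
 *	show_state_filter(TASK_UNINTERRUPTIBLE);
 *
 * while show_state_filter(0) dumps every task and, with lock debugging
 * enabled, the held locks as well.
 */
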
7075 void show_state_filter(unsigned long state_filter)
7076 {
7077 struct task_struct *g, *p;
7078
7079 rcu_read_lock();
7080 for_each_process_thread(g, p)
7081 {
7082 /*
7083 * reset the NMI-timeout, listing all tasks on a slow
7084 * console might take a lot of time:
7085 * Also, reset softlockup watchdogs on all CPUs, because
7086 * another CPU might be blocked waiting for us to process
7087 * an IPI.
7088 */
7089 touch_nmi_watchdog();
7090 touch_all_softlockup_watchdogs();
7091 if (state_filter_match(state_filter, p)) {
7092 sched_show_task(p);
7093 }
7094 }
7095
7096 #ifdef CONFIG_SCHED_DEBUG
7097 if (!state_filter) {
7098 sysrq_sched_debug_show();
7099 }
7100 #endif
7101 rcu_read_unlock();
7102 /*
7103 * Only show locks if all tasks are dumped:
7104 */
7105 if (!state_filter) {
7106 debug_show_all_locks();
7107 }
7108 }
7109
7110 /**
7111 * init_idle - set up an idle thread for a given CPU
7112 * @idle: task in question
7113 * @cpu: CPU the idle task belongs to
7114 *
7115 * NOTE: this function does not set the idle thread's NEED_RESCHED
7116 * flag, to make booting more robust.
7117 */
7118 void __init init_idle(struct task_struct *idle, int cpu)
7119 {
7120 struct rq *rq = cpu_rq(cpu);
7121 unsigned long flags;
7122
7123 __sched_fork(0, idle);
7124
7125 raw_spin_lock_irqsave(&idle->pi_lock, flags);
7126 raw_spin_lock(&rq->lock);
7127
7128 idle->state = TASK_RUNNING;
7129 idle->se.exec_start = sched_clock();
7130 idle->flags |= PF_IDLE;
7131
7132
7133 #ifdef CONFIG_SMP
7134 /*
7135 * It's possible that init_idle() gets called multiple times on a task,
7136 * in that case do_set_cpus_allowed() will not do the right thing.
7137 *
7138 * And since this is boot we can forgo the serialization.
7139 */
7140 set_cpus_allowed_common(idle, cpumask_of(cpu));
7141 #endif
7142 /*
7143 * We're having a chicken and egg problem: even though we are
7144 * holding rq->lock, the CPU isn't yet set to this CPU so the
7145 * lockdep check in task_group() will fail.
7146 *
7147 * Similar case to sched_fork(). / Alternatively we could
7148 * use task_rq_lock() here and obtain the other rq->lock.
7149 *
7150 * Silence PROVE_RCU
7151 */
7152 rcu_read_lock();
7153 __set_task_cpu(idle, cpu);
7154 rcu_read_unlock();
7155
7156 rq->idle = idle;
7157 rcu_assign_pointer(rq->curr, idle);
7158 idle->on_rq = TASK_ON_RQ_QUEUED;
7159 #ifdef CONFIG_SMP
7160 idle->on_cpu = 1;
7161 #endif
7162 raw_spin_unlock(&rq->lock);
7163 raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7164
7165 /* Set the preempt count _outside_ the spinlocks! */
7166 init_idle_preempt_count(idle, cpu);
7167
7168 /*
7169 * The idle tasks have their own, simple scheduling class:
7170 */
7171 idle->sched_class = &idle_sched_class;
7172 ftrace_graph_init_idle_task(idle, cpu);
7173 vtime_init_idle(idle, cpu);
7174 #ifdef CONFIG_SMP
7175 sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
7176 #endif
7177 }
7178
7179 #ifdef CONFIG_SMP
7180
7181 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
7182 const struct cpumask *trial)
7183 {
7184 int ret = 1;
7185
7186 if (!cpumask_weight(cur)) {
7187 return ret;
7188 }
7189
7190 ret = dl_cpuset_cpumask_can_shrink(cur, trial);
7191
7192 return ret;
7193 }
7194
7195 int task_can_attach(struct task_struct *p,
7196 const struct cpumask *cs_effective_cpus)
7197 {
7198 int ret = 0;
7199
7200 /*
7201 * Kthreads which disallow setaffinity shouldn't be moved
7202 * to a new cpuset; we don't want to change their CPU
7203 * affinity and isolating such threads by their set of
7204 * allowed nodes is unnecessary. Thus, cpusets are not
7205 * applicable for such threads. This prevents checking for
7206 * success of set_cpus_allowed_ptr() on all attached tasks
7207 * before cpus_mask may be changed.
7208 */
7209 if (p->flags & PF_NO_SETAFFINITY) {
7210 ret = -EINVAL;
7211 goto out;
7212 }
7213
7214 if (dl_task(p) &&
7215 !cpumask_intersects(task_rq(p)->rd->span, cs_effective_cpus)) {
7216 int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
7217 if (unlikely(cpu >= nr_cpu_ids))
7218 return -EINVAL;
7219 ret = dl_cpu_busy(cpu, p);
7220 }
7221
7222 out:
7223 return ret;
7224 }
7225
7226 bool sched_smp_initialized __read_mostly;
7227
7228 #ifdef CONFIG_NUMA_BALANCING
7229 /* Migrate current task p to target_cpu */
7230 int migrate_task_to(struct task_struct *p, int target_cpu)
7231 {
7232 struct migration_arg arg = {p, target_cpu};
7233 int curr_cpu = task_cpu(p);
7234 if (curr_cpu == target_cpu) {
7235 return 0;
7236 }
7237
7238 if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) {
7239 return -EINVAL;
7240 }
7241
7242 /* This is not properly updating schedstats */
7243
7244 trace_sched_move_numa(p, curr_cpu, target_cpu);
7245 return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
7246 }
7247
7248 /*
7249 * Requeue a task on a given node and accurately track the number of NUMA
7250 * tasks on the runqueues
7251 */
7252 void sched_setnuma(struct task_struct *p, int nid)
7253 {
7254 bool queued, running;
7255 struct rq_flags rf;
7256 struct rq *rq;
7257
7258 rq = task_rq_lock(p, &rf);
7259 queued = task_on_rq_queued(p);
7260 running = task_current(rq, p);
7261
7262 if (queued) {
7263 dequeue_task(rq, p, DEQUEUE_SAVE);
7264 }
7265 if (running) {
7266 put_prev_task(rq, p);
7267 }
7268
7269 p->numa_preferred_nid = nid;
7270
7271 if (queued) {
7272 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
7273 }
7274 if (running) {
7275 set_next_task(rq, p);
7276 }
7277 task_rq_unlock(rq, p, &rf);
7278 }
7279 #endif /* CONFIG_NUMA_BALANCING */
7280
7281 #ifdef CONFIG_HOTPLUG_CPU
7282 /*
7283 * Ensure that the idle task is using init_mm right before its CPU goes
7284 * offline.
7285 */
7286 void idle_task_exit(void)
7287 {
7288 struct mm_struct *mm = current->active_mm;
7289
7290 BUG_ON(cpu_online(smp_processor_id()));
7291 BUG_ON(current != this_rq()->idle);
7292
7293 if (mm != &init_mm) {
7294 switch_mm(mm, &init_mm, current);
7295 finish_arch_post_lock_switch();
7296 }
7297
7298 /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
7299 }
7300
7301 /*
7302 * Since this CPU is going 'away' for a while, fold any nr_active delta
7303 * we might have. Assumes we're called after migrate_tasks() so that the
7304 * nr_active count is stable. We need to take the teardown thread which
7305 * is calling this into account, so we hand in adjust = 1 to the load
7306 * calculation.
7307 *
7308 * Also see the comment "Global load-average calculations".
7309 */
7310 static void calc_load_migrate(struct rq *rq)
7311 {
7312 long delta = calc_load_fold_active(rq, 1);
7313 if (delta) {
7314 atomic_long_add(delta, &calc_load_tasks);
7315 }
7316 }
7317
7318 static struct task_struct *__pick_migrate_task(struct rq *rq)
7319 {
7320 const struct sched_class *class;
7321 struct task_struct *next;
7322
7323 for_each_class(class)
7324 {
7325 next = class->pick_next_task(rq);
7326 if (next) {
7327 next->sched_class->put_prev_task(rq, next);
7328 return next;
7329 }
7330 }
7331
7332 /* The idle class should always have a runnable task */
7333 BUG();
7334 }
7335
7336 #ifdef CONFIG_CPU_ISOLATION_OPT
7337 /*
7338 * Remove a task from the runqueue and pretend that it's migrating. This
7339 * should prevent migrations for the detached task and disallow further
7340 * changes to tsk_cpus_allowed.
7341 */
7342 static void detach_one_task_core(struct task_struct *p, struct rq *rq,
7343 struct list_head *tasks)
7344 {
7345 lockdep_assert_held(&rq->lock);
7346
7347 p->on_rq = TASK_ON_RQ_MIGRATING;
7348 deactivate_task(rq, p, 0);
7349 list_add(&p->se.group_node, tasks);
7350 }
7351
7352 static void attach_tasks_core(struct list_head *tasks, struct rq *rq)
7353 {
7354 struct task_struct *p;
7355
7356 lockdep_assert_held(&rq->lock);
7357
7358 while (!list_empty(tasks)) {
7359 p = list_first_entry(tasks, struct task_struct, se.group_node);
7360 list_del_init(&p->se.group_node);
7361
7362 BUG_ON(task_rq(p) != rq);
7363 activate_task(rq, p, 0);
7364 p->on_rq = TASK_ON_RQ_QUEUED;
7365 }
7366 }
7367
7368 #else
7369
7370 static void detach_one_task_core(struct task_struct *p, struct rq *rq,
7371 struct list_head *tasks)
7372 {
7373 }
7374
7375 static void attach_tasks_core(struct list_head *tasks, struct rq *rq)
7376 {
7377 }
7378
7379 #endif /* CONFIG_CPU_ISOLATION_OPT */
7380
7381 /*
7382 * Migrate all tasks (not pinned if pinned argument say so) from the rq,
7383 * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
7384 *
7385 * Called with rq->lock held even though we're in stop_machine() and
7386 * there's no concurrency possible, we hold the required locks anyway
7387 * because of lock validation efforts.
7388 */
7389 void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
7390 bool migrate_pinned_tasks)
7391 {
7392 struct rq *rq = dead_rq;
7393 struct task_struct *next, *stop = rq->stop;
7394 struct rq_flags orf = *rf;
7395 int dest_cpu;
7396 unsigned int num_pinned_kthreads = 1; /* this thread */
7397 LIST_HEAD(tasks);
7398 cpumask_t avail_cpus;
7399
7400 #ifdef CONFIG_CPU_ISOLATION_OPT
7401 cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
7402 #else
7403 cpumask_copy(&avail_cpus, cpu_online_mask);
7404 #endif
7405
7406 /*
7407 * Fudge the rq selection such that the below task selection loop
7408 * doesn't get stuck on the currently eligible stop task.
7409 *
7410 * We're currently inside stop_machine() and the rq is either stuck
7411 * in the stop_machine_cpu_stop() loop, or we're executing this code,
7412 * either way we should never end up calling schedule() until we're
7413 * done here.
7414 */
7415 rq->stop = NULL;
7416
7417 /*
7418 * put_prev_task() and pick_next_task() sched
7419 * class method both need to have an up-to-date
7420 * value of rq->clock[_task]
7421 */
7422 update_rq_clock(rq);
7423
7424 #ifdef CONFIG_SCHED_DEBUG
7425 /* note the clock update in orf */
7426 orf.clock_update_flags |= RQCF_UPDATED;
7427 #endif
7428
7429 for (;;) {
7430 /*
7431 * There's this thread running, bail when that's the only
7432 * remaining thread.
7433 */
7434 if (rq->nr_running == 1) {
7435 break;
7436 }
7437
7438 next = __pick_migrate_task(rq);
7439 if (!migrate_pinned_tasks && (next->flags & PF_KTHREAD) &&
7440 !cpumask_intersects(&avail_cpus, &next->cpus_mask)) {
7441 detach_one_task_core(next, rq, &tasks);
7442 num_pinned_kthreads += 1;
7443 continue;
7444 }
7445
7446 /*
7447 * Rules for changing task_struct::cpus_mask are holding
7448 * both pi_lock and rq->lock, such that holding either
7449 * stabilizes the mask.
7450 *
7451 * Dropping rq->lock is not quite as disastrous as it usually is
7452 * because !cpu_active at this point, which means load-balance
7453 * will not interfere. Also, stop-machine.
7454 */
7455 rq_unlock(rq, rf);
7456 raw_spin_lock(&next->pi_lock);
7457 rq_relock(rq, rf);
7458 if (!(rq->clock_update_flags & RQCF_UPDATED)) {
7459 update_rq_clock(rq);
7460 }
7461
7462 /*
7463 * Since we're inside stop-machine, _nothing_ should have
7464 * changed the task, WARN if weird stuff happened, because in
7465 * that case the above rq->lock drop is a fail too.
7466 * However, during cpu isolation the load balancer might have
7467 * interfered since we don't stop all CPUs. Ignore the warning for
7468 * this case.
7469 */
7470 if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7471 WARN_ON(migrate_pinned_tasks);
7472 raw_spin_unlock(&next->pi_lock);
7473 continue;
7474 }
7475
7476 /* Find suitable destination for @next, with force if needed. */
7477 #ifdef CONFIG_CPU_ISOLATION_OPT
7478 dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
7479 #else
7480 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
7481 #endif
7482 rq = __migrate_task(rq, rf, next, dest_cpu);
7483 if (rq != dead_rq) {
7484 rq_unlock(rq, rf);
7485 rq = dead_rq;
7486 *rf = orf;
7487 rq_relock(rq, rf);
7488 if (!(rq->clock_update_flags & RQCF_UPDATED)) {
7489 update_rq_clock(rq);
7490 }
7491 }
7492 raw_spin_unlock(&next->pi_lock);
7493 }
7494
7495 rq->stop = stop;
7496
7497 if (num_pinned_kthreads > 1) {
7498 attach_tasks_core(&tasks, rq);
7499 }
7500 }
7501
7502 #ifdef CONFIG_SCHED_EAS
7503 static void clear_eas_migration_request(int cpu)
7504 {
7505 struct rq *rq = cpu_rq(cpu);
7506 unsigned long flags;
7507
7508 clear_reserved(cpu);
7509 if (rq->push_task) {
7510 struct task_struct *push_task = NULL;
7511
7512 raw_spin_lock_irqsave(&rq->lock, flags);
7513 if (rq->push_task) {
7514 clear_reserved(rq->push_cpu);
7515 push_task = rq->push_task;
7516 rq->push_task = NULL;
7517 }
7518 rq->active_balance = 0;
7519 raw_spin_unlock_irqrestore(&rq->lock, flags);
7520 if (push_task) {
7521 put_task_struct(push_task);
7522 }
7523 }
7524 }
7525 #else
7526 static inline void clear_eas_migration_request(int cpu)
7527 {
7528 }
7529 #endif
7530
7531 #ifdef CONFIG_CPU_ISOLATION_OPT
7532 int do_isolation_work_cpu_stop(void *data)
7533 {
7534 unsigned int cpu = smp_processor_id();
7535 struct rq *rq = cpu_rq(cpu);
7536 struct rq_flags rf;
7537
7538 watchdog_disable(cpu);
7539
7540 local_irq_disable();
7541
7542 irq_migrate_all_off_this_cpu();
7543
7544 flush_smp_call_function_from_idle();
7545
7546 /* Update our root-domain */
7547 rq_lock(rq, &rf);
7548
7549 /*
7550 * Temporarily mark the rq as offline. This will allow us to
7551 * move tasks off the CPU.
7552 */
7553 if (rq->rd) {
7554 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7555 set_rq_offline(rq);
7556 }
7557
7558 migrate_tasks(rq, &rf, false);
7559
7560 if (rq->rd) {
7561 set_rq_online(rq);
7562 }
7563 rq_unlock(rq, &rf);
7564
7565 clear_eas_migration_request(cpu);
7566 local_irq_enable();
7567 return 0;
7568 }
7569
7570 int do_unisolation_work_cpu_stop(void *data)
7571 {
7572 watchdog_enable(smp_processor_id());
7573 return 0;
7574 }
7575
7576 static void sched_update_group_capacities(int cpu)
7577 {
7578 struct sched_domain *sd;
7579
7580 mutex_lock(&sched_domains_mutex);
7581 rcu_read_lock();
7582
7583 for_each_domain(cpu, sd)
7584 {
7585 int balance_cpu = group_balance_cpu(sd->groups);
7586
7587 init_sched_groups_capacity(cpu, sd);
7588 /*
7589 * This also needs to be called for the group's
7590 * balancing CPU.
7591 */
7592 if (cpu != balance_cpu) {
7593 init_sched_groups_capacity(balance_cpu, sd);
7594 }
7595 }
7596
7597 rcu_read_unlock();
7598 mutex_unlock(&sched_domains_mutex);
7599 }
7600
7601 static unsigned int cpu_isolation_vote[NR_CPUS];
7602
7603 int sched_isolate_count(const cpumask_t *mask, bool include_offline)
7604 {
7605 cpumask_t count_mask = CPU_MASK_NONE;
7606
7607 if (include_offline) {
7608 cpumask_complement(&count_mask, cpu_online_mask);
7609 cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
7610 cpumask_and(&count_mask, &count_mask, mask);
7611 } else {
7612 cpumask_and(&count_mask, mask, cpu_isolated_mask);
7613 }
7614
7615 return cpumask_weight(&count_mask);
7616 }
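/*
 * Example (a sketch, not called anywhere in this file): counting how many
 * CPUs of a hypothetical cluster mask are currently unavailable, either
 * because they are isolated or, optionally, because they are offline:
 *
 *	int unavailable = sched_isolate_count(&cluster_mask, true);
 *
 * With include_offline == false only the isolated-and-online CPUs in
 * @mask are counted.
 */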
7617
7618 /*
7619 * 1) CPU is isolated and the CPU is offlined:
7620 * Unisolate the core.
7621 * 2) CPU is not isolated and the CPU is offlined:
7622 * No action taken.
7623 * 3) CPU is offline and there is a request to isolate:
7624 * Request ignored.
7625 * 4) CPU is offline and isolated:
7626 * Not a possible state.
7627 * 5) CPU is online and there is a request to isolate:
7628 * Normal case: isolate the CPU.
7629 * 6) CPU is not isolated and comes back online:
7630 * Nothing to do.
7631 *
7632 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
7633 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
7634 * The client is also responsible for unisolating when a core goes offline
7635 * (after CPU is marked offline).
7636 */
7637 int sched_isolate_cpu(int cpu)
7638 {
7639 struct rq *rq;
7640 cpumask_t avail_cpus;
7641 int ret_code = 0;
7642 u64 start_time = 0;
7643
7644 if (trace_sched_isolate_enabled()) {
7645 start_time = sched_clock();
7646 }
7647
7648 cpu_maps_update_begin();
7649
7650 cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
7651
7652 if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) ||
7653 !cpu_online(cpu) || cpu >= NR_CPUS) {
7654 ret_code = -EINVAL;
7655 goto out;
7656 }
7657
7658 rq = cpu_rq(cpu);
7659
7660 if (++cpu_isolation_vote[cpu] > 1) {
7661 goto out;
7662 }
7663
7664 /* We cannot isolate ALL cpus in the system */
7665 if (cpumask_weight(&avail_cpus) == 1) {
7666 --cpu_isolation_vote[cpu];
7667 ret_code = -EINVAL;
7668 goto out;
7669 }
7670
7671 /*
7672 * There is a race between the watchdog being enabled by hotplug and
7673 * core isolation disabling the watchdog. When a CPU is hotplugged in
7674 * and the hotplug lock has been released, the watchdog thread might
7675 * not have run yet to enable the watchdog.
7676 * We have to wait for the watchdog to be enabled before proceeding.
7677 */
7678 if (!watchdog_configured(cpu)) {
7679 msleep(20);
7680 if (!watchdog_configured(cpu)) {
7681 --cpu_isolation_vote[cpu];
7682 ret_code = -EBUSY;
7683 goto out;
7684 }
7685 }
7686
7687 set_cpu_isolated(cpu, true);
7688 cpumask_clear_cpu(cpu, &avail_cpus);
7689
7690 /* Migrate timers */
7691 smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
7692 smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
7693
7694 watchdog_disable(cpu);
7695 irq_lock_sparse();
7696 stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
7697 irq_unlock_sparse();
7698
7699 calc_load_migrate(rq);
7700 update_max_interval();
7701 sched_update_group_capacities(cpu);
7702
7703 out:
7704 cpu_maps_update_done();
7705 trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], start_time, 1);
7706 return ret_code;
7707 }
7708
7709 /*
7710 * Note: The client calling sched_isolate_cpu() is responsible for ONLY
7711 * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
7712 * The client is also responsible for unisolating when a core goes offline
7713 * (after CPU is marked offline).
7714 */
7715 int sched_unisolate_cpu_unlocked(int cpu)
7716 {
7717 int ret_code = 0;
7718 u64 start_time = 0;
7719
7720 if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || cpu >= NR_CPUS) {
7721 ret_code = -EINVAL;
7722 goto out;
7723 }
7724
7725 if (trace_sched_isolate_enabled()) {
7726 start_time = sched_clock();
7727 }
7728
7729 if (!cpu_isolation_vote[cpu]) {
7730 ret_code = -EINVAL;
7731 goto out;
7732 }
7733
7734 if (--cpu_isolation_vote[cpu]) {
7735 goto out;
7736 }
7737
7738 set_cpu_isolated(cpu, false);
7739 update_max_interval();
7740 sched_update_group_capacities(cpu);
7741
7742 if (cpu_online(cpu)) {
7743 stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
7744
7745 /* Kick CPU to immediately do load balancing */
7746 if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) {
7747 smp_send_reschedule(cpu);
7748 }
7749 }
7750
7751 out:
7752 trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], start_time, 0);
7753 return ret_code;
7754 }
7755
7756 int sched_unisolate_cpu(int cpu)
7757 {
7758 int ret_code;
7759
7760 cpu_maps_update_begin();
7761 ret_code = sched_unisolate_cpu_unlocked(cpu);
7762 cpu_maps_update_done();
7763 return ret_code;
7764 }
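/*
 * Usage sketch (assumed caller code, not part of this file): isolation is
 * reference counted per CPU via cpu_isolation_vote[], so every successful
 * sched_isolate_cpu() must be paired with exactly one sched_unisolate_cpu()
 * by the same client:
 *
 *	if (!sched_isolate_cpu(cpu)) {
 *		// CPU now carries no timers, watchdog or movable tasks
 *		do_exclusive_work(cpu);		// hypothetical helper
 *		sched_unisolate_cpu(cpu);
 *	}
 */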
7765
7766 #endif /* CONFIG_CPU_ISOLATION_OPT */
7767
7768 #endif /* CONFIG_HOTPLUG_CPU */
7769
7770 void set_rq_online(struct rq *rq)
7771 {
7772 if (!rq->online) {
7773 const struct sched_class *class;
7774
7775 cpumask_set_cpu(rq->cpu, rq->rd->online);
7776 rq->online = 1;
7777
7778 for_each_class(class)
7779 {
7780 if (class->rq_online) {
7781 class->rq_online(rq);
7782 }
7783 }
7784 }
7785 }
7786
7787 void set_rq_offline(struct rq *rq)
7788 {
7789 if (rq->online) {
7790 const struct sched_class *class;
7791
7792 for_each_class(class)
7793 {
7794 if (class->rq_offline) {
7795 class->rq_offline(rq);
7796 }
7797 }
7798
7799 cpumask_clear_cpu(rq->cpu, rq->rd->online);
7800 rq->online = 0;
7801 }
7802 }
7803
7804 /*
7805 * used to mark begin/end of suspend/resume:
7806 */
7807 static int num_cpus_frozen;
7808
7809 /*
7810 * Update cpusets according to cpu_active mask. If cpusets are
7811 * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7812 * around partition_sched_domains().
7813 *
7814 * If we come here as part of a suspend/resume, don't touch cpusets because we
7815 * want to restore it back to its original state upon resume anyway.
7816 */
7817 static void cpuset_cpu_active(void)
7818 {
7819 if (cpuhp_tasks_frozen) {
7820 /*
7821 * num_cpus_frozen tracks how many CPUs are involved in suspend
7822 * resume sequence. As long as this is not the last online
7823 * operation in the resume sequence, just build a single sched
7824 * domain, ignoring cpusets.
7825 */
7826 partition_sched_domains(1, NULL, NULL);
7827 if (--num_cpus_frozen) {
7828 return;
7829 }
7830 /*
7831 * This is the last CPU online operation. So fall through and
7832 * restore the original sched domains by considering the
7833 * cpuset configurations.
7834 */
7835 cpuset_force_rebuild();
7836 }
7837 cpuset_update_active_cpus();
7838 }
7839
7840 static int cpuset_cpu_inactive(unsigned int cpu)
7841 {
7842 if (!cpuhp_tasks_frozen) {
7843 int ret = dl_cpu_busy(cpu, NULL);
7844 if (ret) {
7845 return ret;
7846 }
7847 cpuset_update_active_cpus();
7848 } else {
7849 num_cpus_frozen++;
7850 partition_sched_domains(1, NULL, NULL);
7851 }
7852 return 0;
7853 }
7854
7855 int sched_cpu_activate(unsigned int cpu)
7856 {
7857 struct rq *rq = cpu_rq(cpu);
7858 struct rq_flags rf;
7859
7860 #ifdef CONFIG_SCHED_SMT
7861 /*
7862 * When going up, increment the number of cores with SMT present.
7863 */
7864 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
7865 static_branch_inc_cpuslocked(&sched_smt_present);
7866 }
7867 #endif
7868 set_cpu_active(cpu, true);
7869
7870 if (sched_smp_initialized) {
7871 sched_domains_numa_masks_set(cpu);
7872 cpuset_cpu_active();
7873 }
7874
7875 /*
7876 * Put the rq online, if not already. This happens:
7877 *
7878 * 1) In the early boot process, because we build the real domains
7879 * after all CPUs have been brought up.
7880 *
7881 * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
7882 * domains.
7883 */
7884 rq_lock_irqsave(rq, &rf);
7885 if (rq->rd) {
7886 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7887 set_rq_online(rq);
7888 }
7889 rq_unlock_irqrestore(rq, &rf);
7890
7891 return 0;
7892 }
7893
7894 int sched_cpu_deactivate(unsigned int cpu)
7895 {
7896 int ret;
7897
7898 set_cpu_active(cpu, false);
7899 /*
7900 * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7901 * users of this state to go away such that all new such users will
7902 * observe it.
7903 *
7904 * Do the sync before parking smpboot threads to take care of the RCU boost case.
7905 */
7906 synchronize_rcu();
7907
7908 #ifdef CONFIG_SCHED_SMT
7909 /*
7910 * When going down, decrement the number of cores with SMT present.
7911 */
7912 if (cpumask_weight(cpu_smt_mask(cpu)) == 2) {
7913 static_branch_dec_cpuslocked(&sched_smt_present);
7914 }
7915 #endif
7916
7917 if (!sched_smp_initialized) {
7918 return 0;
7919 }
7920
7921 ret = cpuset_cpu_inactive(cpu);
7922 if (ret) {
7923 set_cpu_active(cpu, true);
7924 return ret;
7925 }
7926 sched_domains_numa_masks_clear(cpu);
7927 return 0;
7928 }
7929
7930 static void sched_rq_cpu_starting(unsigned int cpu)
7931 {
7932 struct rq *rq = cpu_rq(cpu);
7933 unsigned long flags;
7934
7935 raw_spin_lock_irqsave(&rq->lock, flags);
7936 set_window_start(rq);
7937 raw_spin_unlock_irqrestore(&rq->lock, flags);
7938
7939 rq->calc_load_update = calc_load_update;
7940 update_max_interval();
7941 }
7942
7943 int sched_cpu_starting(unsigned int cpu)
7944 {
7945 sched_rq_cpu_starting(cpu);
7946 sched_tick_start(cpu);
7947 clear_eas_migration_request(cpu);
7948 return 0;
7949 }
7950
7951 #ifdef CONFIG_HOTPLUG_CPU
7952 int sched_cpu_dying(unsigned int cpu)
7953 {
7954 struct rq *rq = cpu_rq(cpu);
7955 struct rq_flags rf;
7956
7957 /* Handle pending wakeups and then migrate everything off */
7958 sched_tick_stop(cpu);
7959
7960 rq_lock_irqsave(rq, &rf);
7961
7962 if (rq->rd) {
7963 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7964 set_rq_offline(rq);
7965 }
7966 migrate_tasks(rq, &rf, true);
7967 BUG_ON(rq->nr_running != 1);
7968 rq_unlock_irqrestore(rq, &rf);
7969
7970 clear_eas_migration_request(cpu);
7971
7972 calc_load_migrate(rq);
7973 update_max_interval();
7974 nohz_balance_exit_idle(rq);
7975 hrtick_clear(rq);
7976 return 0;
7977 }
7978 #endif
7979
7980 void __init sched_init_smp(void)
7981 {
7982 sched_init_numa();
7983
7984 /*
7985 * There's no userspace yet to cause hotplug operations; hence all the
7986 * CPU masks are stable and all blatant races in the below code cannot
7987 * happen.
7988 */
7989 mutex_lock(&sched_domains_mutex);
7990 sched_init_domains(cpu_active_mask);
7991 mutex_unlock(&sched_domains_mutex);
7992
7993 update_cluster_topology();
7994
7995 /* Move init over to a non-isolated CPU */
7996 if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) <
7997 0) {
7998 BUG();
7999 }
8000 sched_init_granularity();
8001
8002 init_sched_rt_class();
8003 init_sched_dl_class();
8004
8005 sched_smp_initialized = true;
8006 }
8007
8008 static int __init migration_init(void)
8009 {
8010 sched_cpu_starting(smp_processor_id());
8011 return 0;
8012 }
8013 early_initcall(migration_init);
8014
8015 #else
8016 void __init sched_init_smp(void)
8017 {
8018 sched_init_granularity();
8019 }
8020 #endif /* CONFIG_SMP */
8021
8022 int in_sched_functions(unsigned long addr)
8023 {
8024 return in_lock_functions(addr) ||
8025 (addr >= (unsigned long)__sched_text_start &&
8026 addr < (unsigned long)__sched_text_end);
8027 }
8028
8029 #ifdef CONFIG_CGROUP_SCHED
8030 /*
8031 * Default task group.
8032 * Every task in system belongs to this group at bootup.
8033 */
8034 struct task_group root_task_group;
8035 LIST_HEAD(task_groups);
8036
8037 /* Cacheline aligned slab cache for task_group */
8038 static struct kmem_cache *task_group_cache __read_mostly;
8039 #endif
8040
8041 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
8042 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
8043
8044 void __init sched_init(void)
8045 {
8046 unsigned long ptr = 0;
8047 int i;
8048
8049 /* Make sure the linker didn't screw up */
8050 BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8051 &fair_sched_class + 1 != &rt_sched_class ||
8052 &rt_sched_class + 1 != &dl_sched_class);
8053 #ifdef CONFIG_SMP
8054 BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8055 #endif
8056
8057 wait_bit_init();
8058
8059 init_clusters();
8060
8061 #ifdef CONFIG_FAIR_GROUP_SCHED
8062 ptr += 2 * nr_cpu_ids * sizeof(void **);
8063 #endif
8064 #ifdef CONFIG_RT_GROUP_SCHED
8065 ptr += 2 * nr_cpu_ids * sizeof(void **);
8066 #endif
8067 if (ptr) {
8068 ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
8069
8070 #ifdef CONFIG_FAIR_GROUP_SCHED
8071 root_task_group.se = (struct sched_entity **)ptr;
8072 ptr += nr_cpu_ids * sizeof(void **);
8073
8074 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8075 ptr += nr_cpu_ids * sizeof(void **);
8076
8077 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8078 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8079 #endif /* CONFIG_FAIR_GROUP_SCHED */
8080 #ifdef CONFIG_RT_GROUP_SCHED
8081 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8082 ptr += nr_cpu_ids * sizeof(void **);
8083
8084 root_task_group.rt_rq = (struct rt_rq **)ptr;
8085 ptr += nr_cpu_ids * sizeof(void **);
8086
8087 #endif /* CONFIG_RT_GROUP_SCHED */
8088 }
8089 #ifdef CONFIG_CPUMASK_OFFSTACK
8090 for_each_possible_cpu(i)
8091 {
8092 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
8093 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8094 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
8095 cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8096 }
8097 #endif /* CONFIG_CPUMASK_OFFSTACK */
8098
8099 init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(),
8100 global_rt_runtime());
8101 init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(),
8102 global_rt_runtime());
8103
8104 #ifdef CONFIG_SMP
8105 init_defrootdomain();
8106 #endif
8107
8108 #ifdef CONFIG_RT_GROUP_SCHED
8109 init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(),
8110 global_rt_runtime());
8111 #endif /* CONFIG_RT_GROUP_SCHED */
8112
8113 #ifdef CONFIG_CGROUP_SCHED
8114 task_group_cache = KMEM_CACHE(task_group, 0);
8115
8116 list_add(&root_task_group.list, &task_groups);
8117 INIT_LIST_HEAD(&root_task_group.children);
8118 INIT_LIST_HEAD(&root_task_group.siblings);
8119 autogroup_init(&init_task);
8120 #endif /* CONFIG_CGROUP_SCHED */
8121
8122 for_each_possible_cpu(i)
8123 {
8124 struct rq *rq;
8125
8126 rq = cpu_rq(i);
8127 raw_spin_lock_init(&rq->lock);
8128 rq->nr_running = 0;
8129 rq->calc_load_active = 0;
8130 rq->calc_load_update = jiffies + LOAD_FREQ;
8131 init_cfs_rq(&rq->cfs);
8132 init_rt_rq(&rq->rt);
8133 init_dl_rq(&rq->dl);
8134 #ifdef CONFIG_FAIR_GROUP_SCHED
8135 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8136 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
8137 /*
8138 * How much CPU bandwidth does root_task_group get?
8139 *
8140 * In case of task-groups formed through the cgroup filesystem, it
8141 * gets 100% of the CPU resources in the system. This overall
8142 * system CPU resource is divided among the tasks of
8143 * root_task_group and its child task-groups in a fair manner,
8144 * based on each entity's (task or task-group's) weight
8145 * (se->load.weight).
8146 *
8147 * In other words, if root_task_group has 10 tasks of weight
8148 * 1024 and two child groups A0 and A1 (of weight 1024 each),
8149 * then A0's share of the CPU resource is:
8150 *
8151 * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8152 *
8153 * We achieve this by letting root_task_group's tasks sit
8154 * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8155 */
8156 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8157 #endif /* CONFIG_FAIR_GROUP_SCHED */
8158
8159 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8160 #ifdef CONFIG_RT_GROUP_SCHED
8161 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8162 #endif
8163 #ifdef CONFIG_SMP
8164 rq->sd = NULL;
8165 rq->rd = NULL;
8166 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
8167 rq->balance_callback = NULL;
8168 rq->active_balance = 0;
8169 rq->next_balance = jiffies;
8170 rq->push_cpu = 0;
8171 rq->cpu = i;
8172 rq->online = 0;
8173 rq->idle_stamp = 0;
8174 rq->avg_idle = 2 * sysctl_sched_migration_cost;
8175 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
8176 walt_sched_init_rq(rq);
8177
8178 INIT_LIST_HEAD(&rq->cfs_tasks);
8179
8180 rq_attach_root(rq, &def_root_domain);
8181 #ifdef CONFIG_NO_HZ_COMMON
8182 rq->last_blocked_load_update_tick = jiffies;
8183 atomic_set(&rq->nohz_flags, 0);
8184
8185 rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
8186 #endif
8187 #endif /* CONFIG_SMP */
8188 hrtick_rq_init(rq);
8189 atomic_set(&rq->nr_iowait, 0);
8190 }
8191
8192 BUG_ON(alloc_related_thread_groups());
8193 set_load_weight(&init_task);
8194 /*
8195 * The boot idle thread does lazy MMU switching as well:
8196 */
8197 mmgrab(&init_mm);
8198 enter_lazy_tlb(&init_mm, current);
8199
8200 /*
8201 * Make us the idle thread. Technically, schedule() should not be
8202 * called from this thread, however somewhere below it might be,
8203 * but because we are the idle thread, we just pick up running again
8204 * when this runqueue becomes "idle".
8205 */
8206 init_idle(current, smp_processor_id());
8207 init_new_task_load(current);
8208
8209 calc_load_update = jiffies + LOAD_FREQ;
8210
8211 #ifdef CONFIG_SMP
8212 idle_thread_set_boot_cpu();
8213 #endif
8214 init_sched_fair_class();
8215
8216 init_schedstats();
8217
8218 psi_init();
8219
8220 init_uclamp();
8221
8222 scheduler_running = 1;
8223 }
8224
8225 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
8226 static inline int preempt_count_equals(int preempt_offset)
8227 {
8228 int nested = preempt_count() + rcu_preempt_depth();
8229
8230 return (nested == preempt_offset);
8231 }
8232
8233 void __might_sleep(const char *file, int line, int preempt_offset)
8234 {
8235 /*
8236 * Blocking primitives will set (and therefore destroy) current->state,
8237 * since we will exit with TASK_RUNNING make sure we enter with it,
8238 * otherwise we will destroy state.
8239 */
8240 WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
8241 "do not call blocking ops when !TASK_RUNNING; "
8242 "state=%lx set at [<%p>] %pS\n",
8243 current->state, (void *)current->task_state_change,
8244 (void *)current->task_state_change);
8245
8246 ___might_sleep(file, line, preempt_offset);
8247 }
8248 EXPORT_SYMBOL(__might_sleep);
8249
8250 void ___might_sleep(const char *file, int line, int preempt_offset)
8251 {
8252 /* Ratelimiting timestamp: */
8253 static unsigned long prev_jiffy;
8254
8255 unsigned long preempt_disable_ip;
8256
8257 /* WARN_ON_ONCE() by default, no rate limit required: */
8258 rcu_sleep_check();
8259
8260 if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
8261 !is_idle_task(current) && !current->non_block_count) ||
8262 system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
8263 oops_in_progress) {
8264 return;
8265 }
8266
8267 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) {
8268 return;
8269 }
8270 prev_jiffy = jiffies;
8271
8272 /* Save this before calling printk(), since that will clobber it: */
8273 preempt_disable_ip = get_preempt_disable_ip(current);
8274
8275 printk(KERN_ERR
8276 "BUG: sleeping function called from invalid context at %s:%d\n",
8277 file, line);
8278 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: "
8279 "%d, name: %s\n",
8280 in_atomic(), irqs_disabled(), current->non_block_count, current->pid,
8281 current->comm);
8282
8283 if (task_stack_end_corrupted(current)) {
8284 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
8285 }
8286
8287 debug_show_held_locks(current);
8288 if (irqs_disabled()) {
8289 print_irqtrace_events(current);
8290 }
8291 if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) &&
8292 !preempt_count_equals(preempt_offset)) {
8293 pr_err("Preemption disabled at:");
8294 print_ip_sym(KERN_ERR, preempt_disable_ip);
8295 }
8296 dump_stack();
8297 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8298 }
8299 EXPORT_SYMBOL(___might_sleep);
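/*
 * For illustration only: the classic way to trigger the splat above is to
 * call a sleeping primitive from atomic context, e.g. (hypothetical code):
 *
 *	spin_lock(&some_lock);
 *	msleep(10);		// sleeps -> "BUG: sleeping function called
 *	spin_unlock(&some_lock);//	    from invalid context at ..."
 *
 * preempt_count() is non-zero while the spinlock is held, so
 * preempt_count_equals(0) fails and ___might_sleep() prints the report.
 */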
8300
8301 void __cant_sleep(const char *file, int line, int preempt_offset)
8302 {
8303 static unsigned long prev_jiffy;
8304
8305 if (irqs_disabled()) {
8306 return;
8307 }
8308
8309 if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) {
8310 return;
8311 }
8312
8313 if (preempt_count() > preempt_offset) {
8314 return;
8315 }
8316
8317 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) {
8318 return;
8319 }
8320 prev_jiffy = jiffies;
8321
8322 printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8323 printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8324 in_atomic(), irqs_disabled(), current->pid, current->comm);
8325
8326 debug_show_held_locks(current);
8327 dump_stack();
8328 add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8329 }
8330 EXPORT_SYMBOL_GPL(__cant_sleep);
8331 #endif
8332
8333 #ifdef CONFIG_MAGIC_SYSRQ
8334 void normalize_rt_tasks(void)
8335 {
8336 struct task_struct *g, *p;
8337 struct sched_attr attr = {
8338 .sched_policy = SCHED_NORMAL,
8339 };
8340
8341 read_lock(&tasklist_lock);
8342 for_each_process_thread(g, p)
8343 {
8344 /*
8345 * Only normalize user tasks:
8346 */
8347 if (p->flags & PF_KTHREAD) {
8348 continue;
8349 }
8350
8351 p->se.exec_start = 0;
8352 schedstat_set(p->se.statistics.wait_start, 0);
8353 schedstat_set(p->se.statistics.sleep_start, 0);
8354 schedstat_set(p->se.statistics.block_start, 0);
8355
8356 if (!dl_task(p) && !rt_task(p)) {
8357 /*
8358 * Renice negative nice level userspace
8359 * tasks back to 0:
8360 */
8361 if (task_nice(p) < 0) {
8362 set_user_nice(p, 0);
8363 }
8364 continue;
8365 }
8366
8367 __sched_setscheduler(p, &attr, false, false);
8368 }
8369 read_unlock(&tasklist_lock);
8370 }
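/*
 * This is normally reached via the magic SysRq 'n' key (for example
 * "echo n > /proc/sysrq-trigger"), which demotes all user RT/DL tasks
 * back to SCHED_NORMAL as a last-resort recovery aid.
 */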
8371
8372 #endif /* CONFIG_MAGIC_SYSRQ */
8373
8374 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8375 /*
8376 * These functions are only useful for the IA64 MCA handling, or kdb.
8377 *
8378 * They can only be called when the whole system has been
8379 * stopped - every CPU needs to be quiescent, and no scheduling
8380 * activity can take place. Using them for anything else would
8381 * be a serious bug, and as a result, they aren't even visible
8382 * under any other configuration.
8383 */
8384
8385 /**
8386 * curr_task - return the current task for a given CPU.
8387 * @cpu: the processor in question.
8388 *
8389 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8390 *
8391 * Return: The current task for @cpu.
8392 */
8393 struct task_struct *curr_task(int cpu)
8394 {
8395 return cpu_curr(cpu);
8396 }
8397
8398 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8399
8400 #ifdef CONFIG_IA64
8401 /**
8402 * ia64_set_curr_task - set the current task for a given CPU.
8403 * @cpu: the processor in question.
8404 * @p: the task pointer to set.
8405 *
8406 * Description: This function must only be used when non-maskable interrupts
8407 * are serviced on a separate stack. It allows the architecture to switch the
8408 * notion of the current task on a CPU in a non-blocking manner. This function
8409 * must be called with all CPUs synchronized, and interrupts disabled. The
8410 * caller must save the original value of the current task (see
8411 * curr_task() above) and restore that value before reenabling interrupts and
8412 * re-starting the system.
8413 *
8414 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8415 */
8416 void ia64_set_curr_task(int cpu, struct task_struct *p)
8417 {
8418 cpu_curr(cpu) = p;
8419 }
8420
8421 #endif
8422
8423 #ifdef CONFIG_CGROUP_SCHED
8424 /* task_group_lock serializes the addition/removal of task groups */
8425 static DEFINE_SPINLOCK(task_group_lock);
8426
8427 static inline void alloc_uclamp_sched_group(struct task_group *tg,
8428 struct task_group *parent)
8429 {
8430 #ifdef CONFIG_UCLAMP_TASK_GROUP
8431 enum uclamp_id clamp_id;
8432
8433 cycle_each_clamp_id(clamp_id) {
8434 uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false);
8435 tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
8436 }
8437 #endif
8438 }
8439
8440 static void sched_free_group(struct task_group *tg)
8441 {
8442 free_fair_sched_group(tg);
8443 free_rt_sched_group(tg);
8444 autogroup_free(tg);
8445 kmem_cache_free(task_group_cache, tg);
8446 }
8447
8448 /* allocate runqueue etc for a new task group */
8449 struct task_group *sched_create_group(struct task_group *parent)
8450 {
8451 struct task_group *tg;
8452
8453 tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
8454 if (!tg) {
8455 return ERR_PTR(-ENOMEM);
8456 }
8457
8458 if (!alloc_fair_sched_group(tg, parent)) {
8459 goto err;
8460 }
8461
8462 if (!alloc_rt_sched_group(tg, parent)) {
8463 goto err;
8464 }
8465
8466 alloc_uclamp_sched_group(tg, parent);
8467
8468 return tg;
8469
8470 err:
8471 sched_free_group(tg);
8472 return ERR_PTR(-ENOMEM);
8473 }
8474
8475 void sched_online_group(struct task_group *tg, struct task_group *parent)
8476 {
8477 unsigned long flags;
8478
8479 spin_lock_irqsave(&task_group_lock, flags);
8480 list_add_rcu(&tg->list, &task_groups);
8481
8482 /* Root should already exist: */
8483 WARN_ON(!parent);
8484
8485 tg->parent = parent;
8486 INIT_LIST_HEAD(&tg->children);
8487 list_add_rcu(&tg->siblings, &parent->children);
8488 spin_unlock_irqrestore(&task_group_lock, flags);
8489
8490 online_fair_sched_group(tg);
8491 }
8492
8493 /* rcu callback to free various structures associated with a task group */
8494 static void sched_free_group_rcu(struct rcu_head *rhp)
8495 {
8496 /* Now it should be safe to free those cfs_rqs: */
8497 sched_free_group(container_of(rhp, struct task_group, rcu));
8498 }
8499
8500 void sched_destroy_group(struct task_group *tg)
8501 {
8502 /* Wait for possible concurrent references to cfs_rqs to complete: */
8503 call_rcu(&tg->rcu, sched_free_group_rcu);
8504 }
8505
8506 void sched_offline_group(struct task_group *tg)
8507 {
8508 unsigned long flags;
8509
8510 /* End participation in shares distribution: */
8511 unregister_fair_sched_group(tg);
8512
8513 spin_lock_irqsave(&task_group_lock, flags);
8514 list_del_rcu(&tg->list);
8515 list_del_rcu(&tg->siblings);
8516 spin_unlock_irqrestore(&task_group_lock, flags);
8517 }
8518
8519 static void sched_change_group(struct task_struct *tsk, int type)
8520 {
8521 struct task_group *tg;
8522
8523 /*
8524 * All callers are synchronized by task_rq_lock(); we do not use RCU
8525 * which is pointless here. Thus, we pass "true" to task_css_check()
8526 * to prevent lockdep warnings.
8527 */
8528 tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group,
8529 css);
8530 tg = autogroup_task_group(tsk, tg);
8531 tsk->sched_task_group = tg;
8532
8533 #ifdef CONFIG_FAIR_GROUP_SCHED
8534 if (tsk->sched_class->task_change_group) {
8535 tsk->sched_class->task_change_group(tsk, type);
8536 } else
8537 #endif
8538 set_task_rq(tsk, task_cpu(tsk));
8539 }
8540
8541 /*
8542 * Change task's runqueue when it moves between groups.
8543 *
8544 * The caller of this function should have put the task in its new group by
8545 * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
8546 * its new group.
8547 */
8548 void sched_move_task(struct task_struct *tsk)
8549 {
8550 int queued, running,
8551 queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
8552 struct rq_flags rf;
8553 struct rq *rq;
8554
8555 rq = task_rq_lock(tsk, &rf);
8556 update_rq_clock(rq);
8557
8558 running = task_current(rq, tsk);
8559 queued = task_on_rq_queued(tsk);
8560 if (queued) {
8561 dequeue_task(rq, tsk, queue_flags);
8562 }
8563 if (running) {
8564 put_prev_task(rq, tsk);
8565 }
8566
8567 sched_change_group(tsk, TASK_MOVE_GROUP);
8568
8569 if (queued) {
8570 enqueue_task(rq, tsk, queue_flags);
8571 }
8572 if (running) {
8573 set_next_task(rq, tsk);
8574 /*
8575 * After changing group, the running task may have joined a
8576 * throttled one but it's still the running task. Trigger a
8577 * resched to make sure that task can still run.
8578 */
8579 resched_curr(rq);
8580 }
8581
8582 task_rq_unlock(rq, tsk, &rf);
8583 }
8584
8585 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8586 {
8587 return css ? container_of(css, struct task_group, css) : NULL;
8588 }
8589
8590 static struct cgroup_subsys_state *
8591 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8592 {
8593 struct task_group *parent = css_tg(parent_css);
8594 struct task_group *tg;
8595
8596 if (!parent) {
8597 /* This is early initialization for the top cgroup */
8598 return &root_task_group.css;
8599 }
8600
8601 tg = sched_create_group(parent);
8602 if (IS_ERR(tg)) {
8603 return ERR_PTR(-ENOMEM);
8604 }
8605
8606 #ifdef CONFIG_SCHED_RTG_CGROUP
8607 tg->colocate = false;
8608 tg->colocate_update_disabled = false;
8609 #endif
8610
8611 return &tg->css;
8612 }
8613
8614 /* Expose task group only after completing cgroup initialization */
8615 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8616 {
8617 struct task_group *tg = css_tg(css);
8618 struct task_group *parent = css_tg(css->parent);
8619
8620 if (parent) {
8621 sched_online_group(tg, parent);
8622 }
8623
8624 #ifdef CONFIG_UCLAMP_TASK_GROUP
8625 /* Propagate the effective uclamp value for the new group */
8626 mutex_lock(&uclamp_mutex);
8627 rcu_read_lock();
8628 cpu_util_update_eff(css);
8629 rcu_read_unlock();
8630 mutex_unlock(&uclamp_mutex);
8631 #endif
8632
8633 return 0;
8634 }
8635
8636 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8637 {
8638 struct task_group *tg = css_tg(css);
8639
8640 sched_offline_group(tg);
8641 }
8642
8643 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8644 {
8645 struct task_group *tg = css_tg(css);
8646
8647 /*
8648 * Relies on the RCU grace period between css_released() and this.
8649 */
8650 sched_free_group(tg);
8651 }
8652
8653 /*
8654 * This is called before wake_up_new_task(), therefore we really only
8655 * have to set its group bits, all the other stuff does not apply.
8656 */
8657 static void cpu_cgroup_fork(struct task_struct *task)
8658 {
8659 struct rq_flags rf;
8660 struct rq *rq;
8661
8662 rq = task_rq_lock(task, &rf);
8663
8664 update_rq_clock(rq);
8665 sched_change_group(task, TASK_SET_GROUP);
8666
8667 task_rq_unlock(rq, task, &rf);
8668 }
8669
8670 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8671 {
8672 struct task_struct *task;
8673 struct cgroup_subsys_state *css;
8674 int ret = 0;
8675
8676 cgroup_taskset_for_each(task, css, tset)
8677 {
8678 #ifdef CONFIG_RT_GROUP_SCHED
8679 if (!sched_rt_can_attach(css_tg(css), task)) {
8680 return -EINVAL;
8681 }
8682 #endif
8683 /*
8684 * Serialize against wake_up_new_task() such that if its
8685 * running, we're sure to observe its full state.
8686 */
8687 raw_spin_lock_irq(&task->pi_lock);
8688 /*
8689 * Avoid calling sched_move_task() before wake_up_new_task()
8690 * has happened. This would lead to problems with PELT, due to
8691 * move wanting to detach+attach while we're not attached yet.
8692 */
8693 if (task->state == TASK_NEW) {
8694 ret = -EINVAL;
8695 }
8696 raw_spin_unlock_irq(&task->pi_lock);
8697
8698 if (ret) {
8699 break;
8700 }
8701 }
8702 return ret;
8703 }
8704
8705 #if defined(CONFIG_UCLAMP_TASK_GROUP) && defined(CONFIG_SCHED_RTG_CGROUP)
8706 static void schedgp_attach(struct cgroup_taskset *tset)
8707 {
8708 struct task_struct *task;
8709 struct cgroup_subsys_state *css;
8710 bool colocate;
8711 struct task_group *tg;
8712
8713 cgroup_taskset_first(tset, &css);
8714 tg = css_tg(css);
8715
8716 colocate = tg->colocate;
8717
8718 cgroup_taskset_for_each(task, css, tset)
8719 sync_cgroup_colocation(task, colocate);
8720 }
8721 #else
8722 static void schedgp_attach(struct cgroup_taskset *tset)
8723 {
8724 }
8725 #endif
8726 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8727 {
8728 struct task_struct *task;
8729 struct cgroup_subsys_state *css;
8730
8731 cgroup_taskset_for_each(task, css, tset) sched_move_task(task);
8732
8733 schedgp_attach(tset);
8734 }
8735
8736 #ifdef CONFIG_UCLAMP_TASK_GROUP
8737 static void cpu_util_update_eff(struct cgroup_subsys_state *css)
8738 {
8739 struct cgroup_subsys_state *top_css = css;
8740 struct uclamp_se *uc_parent = NULL;
8741 struct uclamp_se *uc_se = NULL;
8742 unsigned int eff[UCLAMP_CNT];
8743 enum uclamp_id clamp_id;
8744 unsigned int clamps;
8745
8746 lockdep_assert_held(&uclamp_mutex);
8747 SCHED_WARN_ON(!rcu_read_lock_held());
8748
8749 css_for_each_descendant_pre(css, top_css)
8750 {
8751 uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL;
8752
8753 cycle_each_clamp_id(clamp_id) {
8754 /* Assume effective clamps matches requested clamps */
8755 eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
8756 /* Cap effective clamps with parent's effective clamps */
8757 if (uc_parent && eff[clamp_id] > uc_parent[clamp_id].value) {
8758 eff[clamp_id] = uc_parent[clamp_id].value;
8759 }
8760 }
8761 /* Ensure protection is always capped by limit */
8762 eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
8763
8764 /* Propagate most restrictive effective clamps */
8765 clamps = 0x0;
8766 uc_se = css_tg(css)->uclamp;
8767 cycle_each_clamp_id(clamp_id) {
8768 if (eff[clamp_id] == uc_se[clamp_id].value) {
8769 continue;
8770 }
8771 uc_se[clamp_id].value = eff[clamp_id];
8772 uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
8773 clamps |= (0x1 << clamp_id);
8774 }
8775 if (!clamps) {
8776 css = css_rightmost_descendant(css);
8777 continue;
8778 }
8779
8780 /* Immediately update descendants RUNNABLE tasks */
8781 uclamp_update_active_tasks(css);
8782 }
8783 }
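/*
 * Worked example (illustrative numbers): suppose a child group requests
 * uclamp.min = 80% (util ~819) while its parent's effective uclamp.max is
 * 50% (util 512). Per clamp the child's effective value is first capped by
 * the parent's effective value, and the
 * eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]) step then ensures
 * the protection can never exceed the limit, so the child's effective
 * minimum ends up no higher than 512.
 */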
8784
8785 /*
8786 * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
8787 * C expression. Since there is no way to convert a macro argument (N) into a
8788 * character constant, use two levels of macros.
8789 */
8790 #define EXP_POW10(exp) ((unsigned int)1e##exp)
8791 #define POW10(exp) EXP_POW10(exp)
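/*
 * For example, POW10(2) == 100, so UCLAMP_PERCENT_SCALE below works out to
 * 100 * POW10(2) == 10000, i.e. percentages carry two decimal places.
 */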
8792
8793 struct uclamp_request {
8794 #define UCLAMP_PERCENT_SHIFT 2
8795 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
8796 s64 percent;
8797 u64 util;
8798 int ret;
8799 };
8800
8801 static inline struct uclamp_request capacity_from_percent(char *buf)
8802 {
8803 struct uclamp_request req = {
8804 .percent = UCLAMP_PERCENT_SCALE,
8805 .util = SCHED_CAPACITY_SCALE,
8806 .ret = 0,
8807 };
8808
8809 buf = strim(buf);
8810 if (strcmp(buf, "max")) {
8811 req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, &req.percent);
8812 if (req.ret) {
8813 return req;
8814 }
8815 if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
8816 req.ret = -ERANGE;
8817 return req;
8818 }
8819
8820 req.util = req.percent << SCHED_CAPACITY_SHIFT;
8821 req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
8822 }
8823
8824 return req;
8825 }
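/*
 * Worked example: a write of "50.00" is parsed by cgroup_parse_float() into
 * percent = 5000 (two fractional digits, see UCLAMP_PERCENT_SHIFT), and
 * util = DIV_ROUND_CLOSEST_ULL(5000 << SCHED_CAPACITY_SHIFT, 10000) = 512,
 * i.e. half of SCHED_CAPACITY_SCALE. The literal "max" short-circuits to
 * SCHED_CAPACITY_SCALE directly.
 */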
8826
8827 static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
8828 size_t nbytes, loff_t off,
8829 enum uclamp_id clamp_id)
8830 {
8831 struct uclamp_request req;
8832 struct task_group *tg;
8833
8834 req = capacity_from_percent(buf);
8835 if (req.ret) {
8836 return req.ret;
8837 }
8838
8839 static_branch_enable(&sched_uclamp_used);
8840
8841 mutex_lock(&uclamp_mutex);
8842 rcu_read_lock();
8843
8844 tg = css_tg(of_css(of));
8845 if (tg->uclamp_req[clamp_id].value != req.util) {
8846 uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
8847 }
8848
8849 /*
8850 * Because the conversion rounding is not recoverable, we keep track of
8851 * the exact requested value.
8852 */
8853 tg->uclamp_pct[clamp_id] = req.percent;
8854
8855 /* Update effective clamps to track the most restrictive value */
8856 cpu_util_update_eff(of_css(of));
8857
8858 rcu_read_unlock();
8859 mutex_unlock(&uclamp_mutex);
8860
8861 return nbytes;
8862 }
8863
8864 static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, char *buf,
8865 size_t nbytes, loff_t off)
8866 {
8867 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
8868 }
8869
8870 static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, char *buf,
8871 size_t nbytes, loff_t off)
8872 {
8873 return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
8874 }
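/*
 * Typical usage from userspace (in this tree the uclamp files are exposed on
 * both the legacy and default cgroup hierarchies; the paths below are just
 * examples):
 *
 *	# request at least ~50% capacity for tasks in this group
 *	echo "50.00" > /sys/fs/cgroup/mygroup/cpu.uclamp.min
 *	# remove any utilization ceiling
 *	echo "max"   > /sys/fs/cgroup/mygroup/cpu.uclamp.max
 *
 * Both writes end up in cpu_uclamp_write() above and propagate through
 * cpu_util_update_eff().
 */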
8875
8876 static inline void cpu_uclamp_print(struct seq_file *sf,
8877 enum uclamp_id clamp_id)
8878 {
8879 struct task_group *tg;
8880 u64 util_clamp;
8881 u64 percent;
8882 u32 rem;
8883
8884 rcu_read_lock();
8885 tg = css_tg(seq_css(sf));
8886 util_clamp = tg->uclamp_req[clamp_id].value;
8887 rcu_read_unlock();
8888
8889 if (util_clamp == SCHED_CAPACITY_SCALE) {
8890 seq_puts(sf, "max\n");
8891 return;
8892 }
8893
8894 percent = tg->uclamp_pct[clamp_id];
8895 percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
8896 seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
8897 }
8898
8899 static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
8900 {
8901 cpu_uclamp_print(sf, UCLAMP_MIN);
8902 return 0;
8903 }
8904
8905 static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
8906 {
8907 cpu_uclamp_print(sf, UCLAMP_MAX);
8908 return 0;
8909 }
8910
8911 #ifdef CONFIG_SCHED_RTG_CGROUP
8912 static u64 sched_colocate_read(struct cgroup_subsys_state *css,
8913 struct cftype *cft)
8914 {
8915 struct task_group *tg = css_tg(css);
8916
8917 return (u64)tg->colocate;
8918 }
8919
8920 static int sched_colocate_write(struct cgroup_subsys_state *css,
8921 struct cftype *cft, u64 colocate)
8922 {
8923 struct task_group *tg = css_tg(css);
8924
8925 if (tg->colocate_update_disabled) {
8926 return -EPERM;
8927 }
8928
8929 tg->colocate = !!colocate;
8930 tg->colocate_update_disabled = true;
8931
8932 return 0;
8933 }
8934 #endif /* CONFIG_SCHED_RTG_CGROUP */
8935 #endif /* CONFIG_UCLAMP_TASK_GROUP */
8936
8937 #ifdef CONFIG_FAIR_GROUP_SCHED
8938 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8939 struct cftype *cftype, u64 shareval)
8940 {
8941 if (shareval > scale_load_down(ULONG_MAX)) {
8942 shareval = MAX_SHARES;
8943 }
8944 return sched_group_set_shares(css_tg(css), scale_load(shareval));
8945 }
8946
8947 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8948 struct cftype *cft)
8949 {
8950 struct task_group *tg = css_tg(css);
8951
8952 return (u64)scale_load_down(tg->shares);
8953 }
8954
8955 #ifdef CONFIG_CFS_BANDWIDTH
8956 static DEFINE_MUTEX(cfs_constraints_mutex);
8957
8958 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
8959 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8960 /* More than 203 days if BW_SHIFT equals 20. */
8961 static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
8962
8963 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8964
8965 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8966 {
8967 int i, ret = 0, runtime_enabled, runtime_was_enabled;
8968 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8969
8970 if (tg == &root_task_group) {
8971 return -EINVAL;
8972 }
8973
8974 /*
8975 * Ensure we have at least some amount of bandwidth every period. This is
8976 * to prevent reaching a state of large arrears when throttled via
8977 * entity_tick() resulting in prolonged exit starvation.
8978 */
8979 if (quota < min_cfs_quota_period || period < min_cfs_quota_period) {
8980 return -EINVAL;
8981 }
8982
8983 /*
8984 * Likewise, bound things on the other side by preventing insane quota
8985 * periods. This also allows us to normalize in computing quota
8986 * feasibility.
8987 */
8988 if (period > max_cfs_quota_period) {
8989 return -EINVAL;
8990 }
8991
8992 /*
8993 * Bound quota to defend quota against overflow during bandwidth shift.
8994 */
8995 if (quota != RUNTIME_INF && quota > max_cfs_runtime) {
8996 return -EINVAL;
8997 }
8998
8999 /*
9000 * Prevent race between setting of cfs_rq->runtime_enabled and
9001 * unthrottle_offline_cfs_rqs().
9002 */
9003 get_online_cpus();
9004 mutex_lock(&cfs_constraints_mutex);
9005 ret = __cfs_schedulable(tg, period, quota);
9006 if (ret) {
9007 goto out_unlock;
9008 }
9009
9010 runtime_enabled = quota != RUNTIME_INF;
9011 runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9012 /*
9013 * If we need to toggle cfs_bandwidth_used, off->on must occur
9014 * before making related changes, and on->off must occur afterwards
9015 */
9016 if (runtime_enabled && !runtime_was_enabled) {
9017 cfs_bandwidth_usage_inc();
9018 }
9019 raw_spin_lock_irq(&cfs_b->lock);
9020 cfs_b->period = ns_to_ktime(period);
9021 cfs_b->quota = quota;
9022
9023 __refill_cfs_bandwidth_runtime(cfs_b);
9024
9025 /* Restart the period timer (if active) to handle new period expiry: */
9026 if (runtime_enabled) {
9027 start_cfs_bandwidth(cfs_b);
9028 }
9029
9030 raw_spin_unlock_irq(&cfs_b->lock);
9031
9032 for_each_online_cpu(i)
9033 {
9034 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9035 struct rq *rq = cfs_rq->rq;
9036 struct rq_flags rf;
9037
9038 rq_lock_irq(rq, &rf);
9039 cfs_rq->runtime_enabled = runtime_enabled;
9040 cfs_rq->runtime_remaining = 0;
9041
9042 if (cfs_rq->throttled) {
9043 unthrottle_cfs_rq(cfs_rq);
9044 }
9045 rq_unlock_irq(rq, &rf);
9046 }
9047 if (runtime_was_enabled && !runtime_enabled) {
9048 cfs_bandwidth_usage_dec();
9049 }
9050 out_unlock:
9051 mutex_unlock(&cfs_constraints_mutex);
9052 put_online_cpus();
9053
9054 return ret;
9055 }
9056
9057 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9058 {
9059 u64 quota, period;
9060
9061 period = ktime_to_ns(tg->cfs_bandwidth.period);
9062 if (cfs_quota_us < 0) {
9063 quota = RUNTIME_INF;
9064 } else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) {
9065 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9066 } else {
9067 return -EINVAL;
9068 }
9069
9070 return tg_set_cfs_bandwidth(tg, period, quota);
9071 }
9072
9073 static long tg_get_cfs_quota(struct task_group *tg)
9074 {
9075 u64 quota_us;
9076
9077 if (tg->cfs_bandwidth.quota == RUNTIME_INF) {
9078 return -1;
9079 }
9080
9081 quota_us = tg->cfs_bandwidth.quota;
9082 do_div(quota_us, NSEC_PER_USEC);
9083
9084 return quota_us;
9085 }
9086
9087 static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9088 {
9089 u64 quota, period;
9090
9091 if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) {
9092 return -EINVAL;
9093 }
9094
9095 period = (u64)cfs_period_us * NSEC_PER_USEC;
9096 quota = tg->cfs_bandwidth.quota;
9097
9098 return tg_set_cfs_bandwidth(tg, period, quota);
9099 }
9100
9101 static long tg_get_cfs_period(struct task_group *tg)
9102 {
9103 u64 cfs_period_us;
9104
9105 cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9106 do_div(cfs_period_us, NSEC_PER_USEC);
9107
9108 return cfs_period_us;
9109 }
9110
9111 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
9112 struct cftype *cft)
9113 {
9114 return tg_get_cfs_quota(css_tg(css));
9115 }
9116
9117 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
9118 struct cftype *cftype, s64 cfs_quota_us)
9119 {
9120 return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
9121 }
9122
9123 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
9124 struct cftype *cft)
9125 {
9126 return tg_get_cfs_period(css_tg(css));
9127 }
9128
9129 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
9130 struct cftype *cftype, u64 cfs_period_us)
9131 {
9132 return tg_set_cfs_period(css_tg(css), cfs_period_us);
9133 }
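/*
 * Worked example (cgroup v1 knobs above; paths are illustrative): limiting
 * a group to half a CPU is done by granting 50ms of runtime every 100ms:
 *
 *	echo 100000 > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_period_us
 *	echo 50000  > /sys/fs/cgroup/cpu/mygroup/cpu.cfs_quota_us
 *
 * tg_set_cfs_bandwidth() then stores period = 100ms and quota = 50ms in
 * nanoseconds; writing -1 to cpu.cfs_quota_us restores RUNTIME_INF
 * (no limit).
 */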
9134
9135 struct cfs_schedulable_data {
9136 struct task_group *tg;
9137 u64 period, quota;
9138 };
9139
9140 /*
9141 * normalize group quota/period to be quota/max_period
9142 * note: units are usecs
9143 */
9144 static u64 normalize_cfs_quota(struct task_group *tg,
9145 struct cfs_schedulable_data *d)
9146 {
9147 u64 quota, period;
9148
9149 if (tg == d->tg) {
9150 period = d->period;
9151 quota = d->quota;
9152 } else {
9153 period = tg_get_cfs_period(tg);
9154 quota = tg_get_cfs_quota(tg);
9155 }
9156
9157 /* note: these should typically be equivalent */
9158 if (quota == RUNTIME_INF || quota == -1) {
9159 return RUNTIME_INF;
9160 }
9161
9162 return to_ratio(period, quota);
9163 }
9164
9165 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9166 {
9167 struct cfs_schedulable_data *d = data;
9168 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9169 s64 quota = 0, parent_quota = -1;
9170
9171 if (!tg->parent) {
9172 quota = RUNTIME_INF;
9173 } else {
9174 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9175
9176 quota = normalize_cfs_quota(tg, d);
9177 parent_quota = parent_b->hierarchical_quota;
9178
9179 /*
9180 * Ensure max(child_quota) <= parent_quota. On cgroup2,
9181 * always take the min. On cgroup1, only inherit when no
9182 * limit is set:
9183 */
9184 if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
9185 quota = min(quota, parent_quota);
9186 } else {
9187 if (quota == RUNTIME_INF) {
9188 quota = parent_quota;
9189 } else if (parent_quota != RUNTIME_INF && quota > parent_quota) {
9190 return -EINVAL;
9191 }
9192 }
9193 }
9194 cfs_b->hierarchical_quota = quota;
9195
9196 return 0;
9197 }
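/*
 * Illustrative example: on cgroup1, if a parent has quota/period equivalent
 * to 50% of a CPU and a child asks for 80%, normalize_cfs_quota() converts
 * both to ratios over the same period and the quota > parent_quota check
 * above rejects the child's request with -EINVAL. On cgroup2 the child is
 * simply clamped to min(child, parent) instead.
 */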
9198
9199 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9200 {
9201 int ret;
9202 struct cfs_schedulable_data data = {
9203 .tg = tg,
9204 .period = period,
9205 .quota = quota,
9206 };
9207
9208 if (quota != RUNTIME_INF) {
9209 do_div(data.period, NSEC_PER_USEC);
9210 do_div(data.quota, NSEC_PER_USEC);
9211 }
9212
9213 rcu_read_lock();
9214 ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9215 rcu_read_unlock();
9216
9217 return ret;
9218 }
9219
9220 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
9221 {
9222 struct task_group *tg = css_tg(seq_css(sf));
9223 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9224
9225 seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
9226 seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
9227 seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
9228
9229 if (schedstat_enabled() && tg != &root_task_group) {
9230 u64 ws = 0;
9231 int i;
9232
9233 for_each_possible_cpu(i) ws +=
9234 schedstat_val(tg->se[i]->statistics.wait_sum);
9235
9236 seq_printf(sf, "wait_sum %llu\n", ws);
9237 }
9238
9239 return 0;
9240 }
9241 #endif /* CONFIG_CFS_BANDWIDTH */
9242 #endif /* CONFIG_FAIR_GROUP_SCHED */
9243
9244 #ifdef CONFIG_RT_GROUP_SCHED
9245 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
9246 struct cftype *cft, s64 val)
9247 {
9248 return sched_group_set_rt_runtime(css_tg(css), val);
9249 }
9250
9251 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
9252 struct cftype *cft)
9253 {
9254 return sched_group_rt_runtime(css_tg(css));
9255 }
9256
9257 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
9258 struct cftype *cftype, u64 rt_period_us)
9259 {
9260 return sched_group_set_rt_period(css_tg(css), rt_period_us);
9261 }
9262
9263 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
9264 struct cftype *cft)
9265 {
9266 return sched_group_rt_period(css_tg(css));
9267 }
9268 #endif /* CONFIG_RT_GROUP_SCHED */
9269
9270 static struct cftype cpu_legacy_files[] = {
9271 #ifdef CONFIG_FAIR_GROUP_SCHED
9272 {
9273 .name = "shares",
9274 .read_u64 = cpu_shares_read_u64,
9275 .write_u64 = cpu_shares_write_u64,
9276 },
9277 #endif
9278 #ifdef CONFIG_CFS_BANDWIDTH
9279 {
9280 .name = "cfs_quota_us",
9281 .read_s64 = cpu_cfs_quota_read_s64,
9282 .write_s64 = cpu_cfs_quota_write_s64,
9283 },
9284 {
9285 .name = "cfs_period_us",
9286 .read_u64 = cpu_cfs_period_read_u64,
9287 .write_u64 = cpu_cfs_period_write_u64,
9288 },
9289 {
9290 .name = "stat",
9291 .seq_show = cpu_cfs_stat_show,
9292 },
9293 #endif
9294 #ifdef CONFIG_RT_GROUP_SCHED
9295 {
9296 .name = "rt_runtime_us",
9297 .read_s64 = cpu_rt_runtime_read,
9298 .write_s64 = cpu_rt_runtime_write,
9299 },
9300 {
9301 .name = "rt_period_us",
9302 .read_u64 = cpu_rt_period_read_uint,
9303 .write_u64 = cpu_rt_period_write_uint,
9304 },
9305 #endif
9306 #ifdef CONFIG_UCLAMP_TASK_GROUP
9307 {
9308 .name = "uclamp.min",
9309 .flags = CFTYPE_NOT_ON_ROOT,
9310 .seq_show = cpu_uclamp_min_show,
9311 .write = cpu_uclamp_min_write,
9312 },
9313 {
9314 .name = "uclamp.max",
9315 .flags = CFTYPE_NOT_ON_ROOT,
9316 .seq_show = cpu_uclamp_max_show,
9317 .write = cpu_uclamp_max_write,
9318 },
9319 #ifdef CONFIG_SCHED_RTG_CGROUP
9320 {
9321 .name = "uclamp.colocate",
9322 .flags = CFTYPE_NOT_ON_ROOT,
9323 .read_u64 = sched_colocate_read,
9324 .write_u64 = sched_colocate_write,
9325 },
9326 #endif
9327 #endif
9328 {} /* Terminate */
9329 };
9330
9331 static int cpu_extra_stat_show(struct seq_file *sf,
9332 struct cgroup_subsys_state *css)
9333 {
9334 #ifdef CONFIG_CFS_BANDWIDTH
9335 {
9336 struct task_group *tg = css_tg(css);
9337 struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9338 u64 throttled_usec;
9339
9340 throttled_usec = cfs_b->throttled_time;
9341 do_div(throttled_usec, NSEC_PER_USEC);
9342
9343 seq_printf(sf,
9344 "nr_periods %d\n"
9345 "nr_throttled %d\n"
9346 "throttled_usec %llu\n",
9347 cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec);
9348 }
9349 #endif
9350 return 0;
9351 }
9352
9353 #ifdef CONFIG_FAIR_GROUP_SCHED
9354 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
9355 struct cftype *cft)
9356 {
9357 struct task_group *tg = css_tg(css);
9358 u64 weight = scale_load_down(tg->shares);
9359
9360 return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024);
9361 }
9362
9363 static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
9364 struct cftype *cft, u64 weight)
9365 {
9366 /*
9367 * cgroup weight knobs should use the common MIN, DFL and MAX
9368 * values which are 1, 100 and 10000 respectively. While it loses
9369 * a bit of range on both ends, it maps pretty well onto the shares
9370 * value used by the scheduler, and the round-trip conversions preserve
9371 * the original value over the entire range.
9372 */
9373 if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) {
9374 return -ERANGE;
9375 }
9376
9377 weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL);
9378
9379 return sched_group_set_shares(css_tg(css), scale_load(weight));
9380 }
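/*
 * Worked conversion example: the default cgroup v2 weight of 100 maps to
 * shares of DIV_ROUND_CLOSEST_ULL(100 * 1024, 100) = 1024, and the maximum
 * weight of 10000 maps to 102400. Reading back via cpu_weight_read_u64()
 * performs the inverse scaling, so round-tripping a valid weight returns
 * the original value.
 */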
9381
9382 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
9383 struct cftype *cft)
9384 {
9385 unsigned long weight = scale_load_down(css_tg(css)->shares);
9386 int last_delta = INT_MAX;
9387 int prio, delta;
9388
9389 /* find the closest nice value to the current weight */
9390 for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
9391 delta = abs(sched_prio_to_weight[prio] - weight);
9392 if (delta >= last_delta) {
9393 break;
9394 }
9395 last_delta = delta;
9396 }
9397
9398 return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
9399 }
9400
9401 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
9402 struct cftype *cft, s64 nice)
9403 {
9404 unsigned long weight;
9405 int idx;
9406
9407 if (nice < MIN_NICE || nice > MAX_NICE) {
9408 return -ERANGE;
9409 }
9410
9411 idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
9412 idx = array_index_nospec(idx, 0x28);
9413 weight = sched_prio_to_weight[idx];
9414
9415 return sched_group_set_shares(css_tg(css), scale_load(weight));
9416 }
9417 #endif

static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
						  long period, long quota)
{
	if (quota < 0) {
		seq_puts(sf, "max");
	} else {
		seq_printf(sf, "%ld", quota);
	}

	seq_printf(sf, " %ld\n", period);
}

/* caller should put the current value in *@periodp before calling */
static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *periodp,
						 u64 *quotap)
{
	char tok[21];	/* U64_MAX */

	if (sscanf(buf, "%20s %llu", tok, periodp) < 1) {
		return -EINVAL;
	}

	*periodp *= NSEC_PER_USEC;

	if (sscanf(tok, "%llu", quotap)) {
		*quotap *= NSEC_PER_USEC;
	} else if (!strcmp(tok, "max")) {
		*quotap = RUNTIME_INF;
	} else {
		return -EINVAL;
	}

	return 0;
}
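
/*
 * Illustrative inputs and results for the parser above (the strings carry
 * microseconds, the parsed values are converted to nanoseconds):
 *
 *	"50000 100000" -> *quotap = 50000 * NSEC_PER_USEC, *periodp = 100000 * NSEC_PER_USEC
 *	"max 100000"   -> *quotap = RUNTIME_INF,           *periodp = 100000 * NSEC_PER_USEC
 *	"max"          -> *quotap = RUNTIME_INF,           *periodp = caller's value * NSEC_PER_USEC
 */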

#ifdef CONFIG_CFS_BANDWIDTH
static int cpu_max_show(struct seq_file *sf, void *v)
{
	struct task_group *tg = css_tg(seq_css(sf));

	cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
	return 0;
}

static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf,
			     size_t nbytes, loff_t off)
{
	struct task_group *tg = css_tg(of_css(of));
	u64 period = tg_get_cfs_period(tg);
	u64 quota;
	int ret;

	ret = cpu_period_quota_parse(buf, &period, &quota);
	if (!ret) {
		ret = tg_set_cfs_bandwidth(tg, period, quota);
	}
	return ret ?: nbytes;
}
#endif

static struct cftype cpu_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
	{
		.name = "weight",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = cpu_weight_read_u64,
		.write_u64 = cpu_weight_write_u64,
	},
	{
		.name = "weight.nice",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_s64 = cpu_weight_nice_read_s64,
		.write_s64 = cpu_weight_nice_write_s64,
	},
#endif
#ifdef CONFIG_CFS_BANDWIDTH
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_max_show,
		.write = cpu_max_write,
	},
#endif
#ifdef CONFIG_UCLAMP_TASK_GROUP
	{
		.name = "uclamp.min",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_min_show,
		.write = cpu_uclamp_min_write,
	},
	{
		.name = "uclamp.max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = cpu_uclamp_max_show,
		.write = cpu_uclamp_max_write,
	},
#endif
	{} /* terminate */
};
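
/*
 * Example semantics of the two main knobs registered above (illustrative
 * values): writing 200 to cpu.weight gives the group twice the CPU share
 * of a sibling left at the default 100, and writing "50000 100000" to
 * cpu.max caps the group at 50ms of CPU time per 100ms period, i.e. half
 * a CPU's worth of runtime.
 */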

struct cgroup_subsys cpu_cgrp_subsys = {
	.css_alloc = cpu_cgroup_css_alloc,
	.css_online = cpu_cgroup_css_online,
	.css_released = cpu_cgroup_css_released,
	.css_free = cpu_cgroup_css_free,
	.css_extra_stat_show = cpu_extra_stat_show,
	.fork = cpu_cgroup_fork,
	.can_attach = cpu_cgroup_can_attach,
	.attach = cpu_cgroup_attach,
	.legacy_cftypes = cpu_legacy_files,
	.dfl_cftypes = cpu_files,
	.early_init = true,
	.threaded = true,
};

#endif /* CONFIG_CGROUP_SCHED */

void dump_cpu_task(int cpu)
{
	pr_info("Task dump for CPU %d:\n", cpu);
	sched_show_task(cpu_curr(cpu));
}

/*
 * Nice levels are multiplicative, with a gentle 10% change for every
 * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
 * nice 1, it will get ~10% less CPU time than another CPU-bound task
 * that remained on nice 0.
 *
 * The "10% effect" is relative and cumulative: from _any_ nice level,
 * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
 * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
 * If a task goes up by ~10% and another task goes down by ~10% then
 * the relative distance between them is ~25%.)
 */
const int sched_prio_to_weight[40] = {
	88761, 71755, 56483, 46273, 36291, /* -20 */
	29154, 23254, 18705, 14949, 11916, /* -15 */
	9548, 7620, 6100, 4904, 3906, /* -10 */
	3121, 2501, 1991, 1586, 1277, /* -5 */
	1024, 820, 655, 526, 423, /* 0 */
	335, 272, 215, 172, 137, /* 5 */
	110, 87, 70, 56, 45, /* 10 */
	36, 29, 23, 18, 15, /* 15 */
};
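
/*
 * Worked example against the table above: a nice-0 task (weight 1024)
 * sharing one CPU with a nice-5 task (weight 335) receives roughly
 * 1024 / (1024 + 335) ~= 75% of the CPU, the nice-5 task ~= 25%.
 * Each single nice step scales the weight by ~1.25 (e.g. 1024 / 820),
 * which is where the ~10% per-level effect described above comes from.
 */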

/*
 * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
 *
 * In cases where the weight does not change often, we can use the
 * precalculated inverse to speed up arithmetics by turning divisions
 * into multiplications:
 */
const u32 sched_prio_to_wmult[40] = {
	48388, 59856, 76040, 92818, 118348, /* -20 */
	147320, 184698, 229616, 287308, 360437, /* -15 */
	449829, 563644, 704093, 875809, 1099582, /* -10 */
	1376151, 1717300, 2157191, 2708050, 3363326, /* -5 */
	4194304, 5237765, 6557202, 8165337, 10153587, /* 0 */
	12820798, 15790321, 19976592, 24970740, 31350126, /* 5 */
	39045157, 49367440, 61356676, 76695844, 95443717, /* 10 */
	119304647, 148102320, 186737708, 238609294, 286331153, /* 15 */
};
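
/*
 * Sketch of how the inverses are meant to be used: instead of dividing by
 * the task weight, e.g.
 *
 *	delta_exec * NICE_0_LOAD / weight
 *
 * the scheduler can multiply by the precomputed inverse and shift:
 *
 *	(delta_exec * NICE_0_LOAD * sched_prio_to_wmult[prio]) >> 32
 *
 * For nice 0, sched_prio_to_wmult[20] == 4194304 == 2^32 / 1024.
 */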

#ifdef CONFIG_SCHED_LATENCY_NICE
/*
 * latency weight for wakeup preemption
 */
const int sched_latency_to_weight[40] = {
	1024, 973, 922, 870, 819, /* -20 */
	768, 717, 666, 614, 563, /* -15 */
	512, 461, 410, 358, 307, /* -10 */
	256, 205, 154, 102, 51, /* -5 */
	0, -51, -102, -154, -205, /* 0 */
	-256, -307, -358, -410, -461, /* 5 */
	-512, -563, -614, -666, -717, /* 10 */
	-768, -819, -870, -922, -973, /* 15 */
};
#endif

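/*
 * Thin out-of-line wrapper: this appears to exist so that helpers inlined
 * in sched.h (e.g. add_nr_running()) can fire the nr_running tracepoint
 * without pulling the tracepoint definitions into the header.
 */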
void call_trace_sched_update_nr_running(struct rq *rq, int count)
{
	trace_sched_update_nr_running_tp(rq, count);
}

#ifdef CONFIG_SCHED_WALT
/*
 * sched_exit() - Set EXITING_TASK_MARKER in the task's ravg.sum_history
 *
 * Stop accounting the (exiting) task's future CPU usage.
 *
 * We need this so that reset_all_window_stats() can function correctly.
 * reset_all_window_stats() depends on the do_each_thread/for_each_thread
 * task iterators to reset *all* tasks' statistics. Exiting tasks, however,
 * become invisible to those iterators. sched_exit() is called on an exiting
 * task prior to being removed from task_list, which lets
 * reset_all_window_stats() function correctly.
 */
void sched_exit(struct task_struct *p)
{
	struct rq_flags rf;
	struct rq *rq;
	u64 wallclock;

#ifdef CONFIG_SCHED_RTG
	sched_set_group_id(p, 0);
#endif

	rq = task_rq_lock(p, &rf);

	/* rq->curr == p */
	wallclock = sched_ktime_clock();
	update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
	dequeue_task(rq, p, 0);
	/*
	 * The task's contribution is already removed from the
	 * cumulative window demand in dequeue. As the task's
	 * stats are reset, the next enqueue does not change the
	 * cumulative window demand.
	 */
	reset_task_stats(p);
	p->ravg.mark_start = wallclock;
	p->ravg.sum_history[0] = EXITING_TASK_MARKER;

	enqueue_task(rq, p, 0);
	task_rq_unlock(rq, p, &rf);
	free_task_load_ptrs(p);
}
#endif /* CONFIG_SCHED_WALT */