• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  kernel/sched/core.c
4  *
5  *  Core kernel scheduler code and related syscalls
6  *
7  *  Copyright (C) 1991-2002  Linus Torvalds
8  */
9 #define CREATE_TRACE_POINTS
10 #include <trace/events/sched.h>
11 #undef CREATE_TRACE_POINTS
12 
13 #include "sched.h"
14 
15 #include <linux/nospec.h>
16 
17 #include <linux/kcov.h>
18 #include <linux/scs.h>
19 #include <linux/irq.h>
20 #include <linux/delay.h>
21 
22 #include <asm/switch_to.h>
23 #include <asm/tlb.h>
24 
25 #include "../workqueue_internal.h"
26 #include "../../io_uring/io-wq.h"
27 #include "../smpboot.h"
28 
29 #include "pelt.h"
30 #include "smp.h"
31 #include "walt.h"
32 #include "rtg/rtg.h"
33 
34 /*
35  * Export tracepoints that act as a bare tracehook (ie: have no trace event
36  * associated with them) to allow external modules to probe them.
37  */
38 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_cfs_tp);
39 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_rt_tp);
40 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_dl_tp);
41 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_irq_tp);
42 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_se_tp);
43 EXPORT_TRACEPOINT_SYMBOL_GPL(pelt_thermal_tp);
44 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_cpu_capacity_tp);
45 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp);
46 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp);
47 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp);
48 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp);
49 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_waking);
50 #ifdef CONFIG_SCHEDSTATS
51 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_sleep);
52 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_wait);
53 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_iowait);
54 EXPORT_TRACEPOINT_SYMBOL_GPL(sched_stat_blocked);
55 #endif
56 
57 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
58 
59 #ifdef CONFIG_SCHED_DEBUG
60 /*
61  * Debugging: various feature bits
62  *
63  * If SCHED_DEBUG is disabled, each compilation unit has its own copy of
64  * sysctl_sched_features, defined in sched.h, to allow constants propagation
65  * at compile time and compiler optimization based on features default.
66  */
67 #define SCHED_FEAT(name, enabled) (1UL << __SCHED_FEAT_##name) * enabled |
68 const_debug unsigned int sysctl_sched_features =
69 #include "features.h"
70     0;
71 #undef SCHED_FEAT
72 #endif
73 
74 /*
75  * Number of tasks to iterate in a single balance run.
76  * Limited because this is done with IRQs disabled.
77  */
78 const_debug unsigned int sysctl_sched_nr_migrate = 32;
79 
80 /*
81  * period over which we measure -rt task CPU usage in us.
82  * default: 1s
83  */
84 unsigned int sysctl_sched_rt_period = 1000000;
85 
86 __read_mostly int scheduler_running;
87 
88 /*
89  * part of the period that we allow rt tasks to run in us.
90  * default: 0.95s
91  */
92 int sysctl_sched_rt_runtime = 950000;
93 
94 /*
95  * Serialization rules
96  *
97  * Lock order
98  *
99  *   p->pi_lock
100  *     rq->lock
101  *       hrtimer_cpu_base->lock (hrtimer_start() for bandwidth controls)
102  *
103  *  rq1->lock
104  *    rq2->lock  where: rq1 < rq2
105  *
106  * Regular state
107  *
108  * Normal scheduling state is serialized by rq->lock. __schedule() takes the
109  * local CPU's rq->lock, it optionally removes the task from the runqueue and
110  * always looks at the local rq data structures to find the most elegible task
111  * to run next.
112  *
113  * Task enqueue is also under rq->lock, possibly taken from another CPU.
114  * Wakeups from another LLC domain might use an IPI to transfer the enqueue to
115  * the local CPU to avoid bouncing the runqueue state around [ see
116  * ttwu_queue_wakelist() ]
117  *
118  * Task wakeup, specifically wakeups that involve migration, are horribly
119  * complicated to avoid having to take two rq->locks.
120  *
121  * Special state
122  *
123  * System-calls and anything external will use task_rq_lock() which acquires
124  * both p->pi_lock and rq->lock. As a consequence the state they change is
125  * stable while holding either lock
126  *
127  *  - sched_setaffinity()/
128  *    set_cpus_allowed_ptr():    p->cpus_ptr, p->nr_cpus_allowed
129  *  - set_user_nice():        p->se.load, p->*prio
130  *  - __sched_setscheduler():    p->sched_class, p->policy, p->*prio,
131  *                p->se.load, p->rt_priority,
132  *                p->dl.dl_{runtime, deadline, period, flags, bw, density}
133  *  - sched_setnuma():        p->numa_preferred_nid
134  *  - sched_move_task()/
135  *    cpu_cgroup_fork():    p->sched_task_group
136  *  - uclamp_update_active()    p->uclamp*
137  *
138  * p->state <- TASK_*
139  *
140  *   is changed locklessly using set_current_state(), __set_current_state() or
141  *   set_special_state(), see their respective comments, or by
142  *   try_to_wake_up(). This latter uses p->pi_lock to serialize against
143  *   concurrent self.
144  *
145  * p->on_rq <- { 0, 1 = TASK_ON_RQ_QUEUED, 2 = TASK_ON_RQ_MIGRATING }:
146  *
147  *   is set by activate_task() and cleared by deactivate_task(), under
148  *   rq->lock. Non-zero indicates the task is runnable, the special
149  *   ON_RQ_MIGRATING state is used for migration without holding both
150  *   rq->locks. It indicates task_cpu() is not stable, see task_rq_lock().
151  *
152  * p->on_cpu <- { 0, 1 }
153  *
154  *   is set by prepare_task() and cleared by finish_task() such that it will be
155  *   set before p is scheduled-in and cleared after p is scheduled-out, both
156  *   under rq->lock. Non-zero indicates the task is running on its CPU.
157  *
158  *   [ The astute reader will observe that it is possible for two tasks on one
159  *     CPU to have ->on_cpu = 1 at the same time. ]
160  *
161  * task_cpu(p): is changed by set_task_cpu(), the rules are:
162  *
163  *  - Don't call set_task_cpu() on a blocked task:
164  *
165  *    We don't care what CPU we're not running on, this simplifies hotplug,
166  *    the CPU assignment of blocked tasks isn't required to be valid.
167  *
168  *  - for try_to_wake_up(), called under p->pi_lock:
169  *
170  *    This allows try_to_wake_up() to only take one rq->lock, see its comment.
171  *
172  *  - for migration called under rq->lock:
173  *    [ see task_on_rq_migrating() in task_rq_lock() ]
174  *
175  *    o move_queued_task()
176  *    o detach_task()
177  *
178  *  - for migration called under double_rq_lock()
179  *
180  *    o __migrate_swap_task()
181  *    o push_rt_task() / pull_rt_task()
182  *    o push_dl_task() / pull_dl_task()
183  *    o dl_task_offline_migration()
184  *
185  */
186 
187 /*
188  * __task_rq_lock - lock the rq @p resides on.
189  */
__task_rq_lock(struct task_struct * p,struct rq_flags * rf)190 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
191     __acquires(rq->lock)
192 {
193     struct rq *rq;
194 
195     lockdep_assert_held(&p->pi_lock);
196 
197     for (;;) {
198         rq = task_rq(p);
199         raw_spin_lock(&rq->lock);
200         if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
201             rq_pin_lock(rq, rf);
202             return rq;
203         }
204         raw_spin_unlock(&rq->lock);
205 
206         while (unlikely(task_on_rq_migrating(p))) {
207             cpu_relax();
208         }
209     }
210 }
211 
212 /*
213  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
214  */
task_rq_lock(struct task_struct * p,struct rq_flags * rf)215 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
216     __acquires(p->pi_lock) __acquires(rq->lock)
217 {
218     struct rq *rq;
219 
220     for (;;) {
221         raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
222         rq = task_rq(p);
223         raw_spin_lock(&rq->lock);
224         /*
225          *    move_queued_task()        task_rq_lock()
226          *
227          *    ACQUIRE (rq->lock)
228          *    [S] ->on_rq = MIGRATING        [L] rq = task_rq()
229          *    WMB (__set_task_cpu())        ACQUIRE (rq->lock);
230          *    [S] ->cpu = new_cpu        [L] task_rq()
231          *                    [L] ->on_rq
232          *    RELEASE (rq->lock)
233          *
234          * If we observe the old CPU in task_rq_lock(), the acquire of
235          * the old rq->lock will fully serialize against the stores.
236          *
237          * If we observe the new CPU in task_rq_lock(), the address
238          * dependency headed by '[L] rq = task_rq()' and the acquire
239          * will pair with the WMB to ensure we then also see migrating.
240          */
241         if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
242             rq_pin_lock(rq, rf);
243             return rq;
244         }
245         raw_spin_unlock(&rq->lock);
246         raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
247 
248         while (unlikely(task_on_rq_migrating(p))) {
249             cpu_relax();
250         }
251     }
252 }
253 
254 /*
255  * RQ-clock updating methods
256  */
257 
update_rq_clock_task(struct rq * rq,s64 delta)258 static void update_rq_clock_task(struct rq *rq, s64 delta)
259 {
260     /*
261      * In theory, the compile should just see 0 here, and optimize out the call
262      * to sched_rt_avg_update. But I don't trust it...
263      */
264     s64 __maybe_unused steal = 0, irq_delta = 0;
265 
266 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
267     irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
268     /*
269      * Since irq_time is only updated on {soft,}irq_exit, we might run into
270      * this case when a previous update_rq_clock() happened inside a
271      * {soft,}irq region.
272      *
273      * When this happens, we stop ->clock_task and only update the
274      * prev_irq_time stamp to account for the part that fit, so that a next
275      * update will consume the rest. This ensures ->clock_task is
276      * monotonic.
277      *
278      * It does however cause some slight miss-attribution of {soft,}irq
279      * time, a more accurate solution would be to update the irq_time using
280      * the current rq->clock timestamp, except that would require using
281      * atomic ops.
282      */
283     if (irq_delta > delta) {
284         irq_delta = delta;
285     }
286 
287     rq->prev_irq_time += irq_delta;
288     delta -= irq_delta;
289 #endif
290 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
291     if (static_key_false((&paravirt_steal_rq_enabled))) {
292         steal = paravirt_steal_clock(cpu_of(rq));
293         steal -= rq->prev_steal_time_rq;
294 
295         if (unlikely(steal > delta)) {
296             steal = delta;
297         }
298 
299         rq->prev_steal_time_rq += steal;
300         delta -= steal;
301     }
302 #endif
303 
304     rq->clock_task += delta;
305 
306 #ifdef CONFIG_HAVE_SCHED_AVG_IRQ
307     if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY)) {
308         update_irq_load_avg(rq, irq_delta + steal);
309     }
310 #endif
311     update_rq_clock_pelt(rq, delta);
312 }
313 
update_rq_clock(struct rq * rq)314 void update_rq_clock(struct rq *rq)
315 {
316     s64 delta;
317 
318     lockdep_assert_held(&rq->lock);
319 
320     if (rq->clock_update_flags & RQCF_ACT_SKIP) {
321         return;
322     }
323 
324 #ifdef CONFIG_SCHED_DEBUG
325     if (sched_feat(WARN_DOUBLE_CLOCK)) {
326         SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
327     }
328     rq->clock_update_flags |= RQCF_UPDATED;
329 #endif
330 
331     delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
332     if (delta < 0) {
333         return;
334     }
335     rq->clock += delta;
336     update_rq_clock_task(rq, delta);
337 }
338 
rq_csd_init(struct rq * rq,struct __call_single_data * csd,smp_call_func_t func)339 static inline void rq_csd_init(struct rq *rq, struct __call_single_data *csd,
340                                smp_call_func_t func)
341 {
342     csd->flags = 0;
343     csd->func = func;
344     csd->info = rq;
345 }
346 
347 #ifdef CONFIG_SCHED_HRTICK
348 /*
349  * Use HR-timers to deliver accurate preemption points.
350  */
351 
hrtick_clear(struct rq * rq)352 static void hrtick_clear(struct rq *rq)
353 {
354     if (hrtimer_active(&rq->hrtick_timer)) {
355         hrtimer_cancel(&rq->hrtick_timer);
356     }
357 }
358 
359 /*
360  * High-resolution timer tick.
361  * Runs from hardirq context with interrupts disabled.
362  */
hrtick(struct hrtimer * timer)363 static enum hrtimer_restart hrtick(struct hrtimer *timer)
364 {
365     struct rq *rq = container_of(timer, struct rq, hrtick_timer);
366     struct rq_flags rf;
367 
368     WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
369 
370     rq_lock(rq, &rf);
371     update_rq_clock(rq);
372     rq->curr->sched_class->task_tick(rq, rq->curr, 1);
373     rq_unlock(rq, &rf);
374 
375     return HRTIMER_NORESTART;
376 }
377 
378 #ifdef CONFIG_SMP
379 
__hrtick_restart(struct rq * rq)380 static void __hrtick_restart(struct rq *rq)
381 {
382     struct hrtimer *timer = &rq->hrtick_timer;
383     ktime_t time = rq->hrtick_time;
384 
385     hrtimer_start(timer, time, HRTIMER_MODE_ABS_PINNED_HARD);
386 }
387 
388 /*
389  * called from hardirq (IPI) context
390  */
__hrtick_start(void * arg)391 static void __hrtick_start(void *arg)
392 {
393     struct rq *rq = arg;
394     struct rq_flags rf;
395 
396     rq_lock(rq, &rf);
397     __hrtick_restart(rq);
398     rq_unlock(rq, &rf);
399 }
400 
401 /*
402  * Called to set the hrtick timer state.
403  *
404  * called with rq->lock held and irqs disabled
405  */
hrtick_start(struct rq * rq,u64 delay)406 void hrtick_start(struct rq *rq, u64 delay)
407 {
408     struct hrtimer *timer = &rq->hrtick_timer;
409     s64 delta;
410 
411     /*
412      * Don't schedule slices shorter than 10000ns, that just
413      * doesn't make sense and can cause timer DoS.
414      */
415     delta = max_t(s64, delay, 10000LL);
416     rq->hrtick_time = ktime_add_ns(timer->base->get_time(), delta);
417 
418     if (rq == this_rq()) {
419         __hrtick_restart(rq);
420     } else {
421         smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
422     }
423 }
424 
425 #else
426 /*
427  * Called to set the hrtick timer state.
428  *
429  * called with rq->lock held and irqs disabled
430  */
hrtick_start(struct rq * rq,u64 delay)431 void hrtick_start(struct rq *rq, u64 delay)
432 {
433     /*
434      * Don't schedule slices shorter than 10000ns, that just
435      * doesn't make sense. Rely on vruntime for fairness.
436      */
437     delay = max_t(u64, delay, 10000LL);
438     hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
439                   HRTIMER_MODE_REL_PINNED_HARD);
440 }
441 
442 #endif /* CONFIG_SMP */
443 
hrtick_rq_init(struct rq * rq)444 static void hrtick_rq_init(struct rq *rq)
445 {
446 #ifdef CONFIG_SMP
447     rq_csd_init(rq, &rq->hrtick_csd, __hrtick_start);
448 #endif
449     hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
450     rq->hrtick_timer.function = hrtick;
451 }
452 #else  /* CONFIG_SCHED_HRTICK */
hrtick_clear(struct rq * rq)453 static inline void hrtick_clear(struct rq *rq)
454 {
455 }
456 
hrtick_rq_init(struct rq * rq)457 static inline void hrtick_rq_init(struct rq *rq)
458 {
459 }
460 #endif /* CONFIG_SCHED_HRTICK */
461 
462 /*
463  * cmpxchg based fetch_or, macro so it works for different integer types
464  */
465 #define fetch_or(ptr, mask)                                                    \
466     ( {                                                                        \
467         typeof(ptr) _ptr = (ptr);                                              \
468         typeof(mask) _mask = (mask);                                           \
469         typeof(*_ptr) _old, _val = *_ptr;                                      \
470                                                                                \
471         for ( ; ; ) {                                                             \
472             _old = cmpxchg(_ptr, _val, _val | _mask);                          \
473             if (_old == _val)                                                  \
474                 break;                                                         \
475             _val = _old;                                                       \
476         }                                                                      \
477         _old;                                                                  \
478     })
479 
480 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
481 /*
482  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
483  * this avoids any races wrt polling state changes and thereby avoids
484  * spurious IPIs.
485  */
set_nr_and_not_polling(struct task_struct * p)486 static bool set_nr_and_not_polling(struct task_struct *p)
487 {
488     struct thread_info *ti = task_thread_info(p);
489     return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
490 }
491 
492 /*
493  * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
494  *
495  * If this returns true, then the idle task promises to call
496  * sched_ttwu_pending() and reschedule soon.
497  */
set_nr_if_polling(struct task_struct * p)498 static bool set_nr_if_polling(struct task_struct *p)
499 {
500     struct thread_info *ti = task_thread_info(p);
501     typeof(ti->flags) old, val = READ_ONCE(ti->flags);
502 
503     for (;;) {
504         if (!(val & _TIF_POLLING_NRFLAG)) {
505             return false;
506         }
507         if (val & _TIF_NEED_RESCHED) {
508             return true;
509         }
510         old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
511         if (old == val) {
512             break;
513         }
514         val = old;
515     }
516     return true;
517 }
518 
519 #else
set_nr_and_not_polling(struct task_struct * p)520 static bool set_nr_and_not_polling(struct task_struct *p)
521 {
522     set_tsk_need_resched(p);
523     return true;
524 }
525 
526 #ifdef CONFIG_SMP
set_nr_if_polling(struct task_struct * p)527 static bool set_nr_if_polling(struct task_struct *p)
528 {
529     return false;
530 }
531 #endif
532 #endif
533 
__wake_q_add(struct wake_q_head * head,struct task_struct * task)534 static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task)
535 {
536     struct wake_q_node *node = &task->wake_q;
537 
538     /*
539      * Atomically grab the task, if ->wake_q is !nil already it means
540      * its already queued (either by us or someone else) and will get the
541      * wakeup due to that.
542      *
543      * In order to ensure that a pending wakeup will observe our pending
544      * state, even in the failed case, an explicit smp_mb() must be used.
545      */
546     smp_mb__before_atomic();
547     if (unlikely(cmpxchg_relaxed(&node->next, NULL, WAKE_Q_TAIL))) {
548         return false;
549     }
550 
551     /*
552      * The head is context local, there can be no concurrency.
553      */
554     *head->lastp = node;
555     head->lastp = &node->next;
556     return true;
557 }
558 
559 /**
560  * wake_q_add() - queue a wakeup for 'later' waking.
561  * @head: the wake_q_head to add @task to
562  * @task: the task to queue for 'later' wakeup
563  *
564  * Queue a task for later wakeup, most likely by the wake_up_q() call in the
565  * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
566  * instantly.
567  *
568  * This function must be used as-if it were wake_up_process(); IOW the task
569  * must be ready to be woken at this location.
570  */
wake_q_add(struct wake_q_head * head,struct task_struct * task)571 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
572 {
573     if (__wake_q_add(head, task)) {
574         get_task_struct(task);
575     }
576 }
577 
578 /**
579  * wake_q_add_safe() - safely queue a wakeup for 'later' waking.
580  * @head: the wake_q_head to add @task to
581  * @task: the task to queue for 'later' wakeup
582  *
583  * Queue a task for later wakeup, most likely by the wake_up_q() call in the
584  * same context, _HOWEVER_ this is not guaranteed, the wakeup can come
585  * instantly.
586  *
587  * This function must be used as-if it were wake_up_process(); IOW the task
588  * must be ready to be woken at this location.
589  *
590  * This function is essentially a task-safe equivalent to wake_q_add(). Callers
591  * that already hold reference to @task can call the 'safe' version and trust
592  * wake_q to do the right thing depending whether or not the @task is already
593  * queued for wakeup.
594  */
wake_q_add_safe(struct wake_q_head * head,struct task_struct * task)595 void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task)
596 {
597     if (!__wake_q_add(head, task)) {
598         put_task_struct(task);
599     }
600 }
601 
wake_up_q(struct wake_q_head * head)602 void wake_up_q(struct wake_q_head *head)
603 {
604     struct wake_q_node *node = head->first;
605 
606     while (node != WAKE_Q_TAIL) {
607         struct task_struct *task;
608 
609         task = container_of(node, struct task_struct, wake_q);
610         BUG_ON(!task);
611         /* Task can safely be re-inserted now: */
612         node = node->next;
613         task->wake_q.next = NULL;
614 
615         /*
616          * wake_up_process() executes a full barrier, which pairs with
617          * the queueing in wake_q_add() so as not to miss wakeups.
618          */
619         wake_up_process(task);
620         put_task_struct(task);
621     }
622 }
623 
624 /*
625  * resched_curr - mark rq's current task 'to be rescheduled now'.
626  *
627  * On UP this means the setting of the need_resched flag, on SMP it
628  * might also involve a cross-CPU call to trigger the scheduler on
629  * the target CPU.
630  */
resched_curr(struct rq * rq)631 void resched_curr(struct rq *rq)
632 {
633     struct task_struct *curr = rq->curr;
634     int cpu;
635 
636     lockdep_assert_held(&rq->lock);
637 
638     if (test_tsk_need_resched(curr)) {
639         return;
640     }
641 
642     cpu = cpu_of(rq);
643     if (cpu == smp_processor_id()) {
644         set_tsk_need_resched(curr);
645         set_preempt_need_resched();
646         return;
647     }
648 
649     if (set_nr_and_not_polling(curr)) {
650         smp_send_reschedule(cpu);
651     } else {
652         trace_sched_wake_idle_without_ipi(cpu);
653     }
654 }
655 
resched_cpu(int cpu)656 void resched_cpu(int cpu)
657 {
658     struct rq *rq = cpu_rq(cpu);
659     unsigned long flags;
660 
661     raw_spin_lock_irqsave(&rq->lock, flags);
662     if (cpu_online(cpu) || cpu == smp_processor_id()) {
663         resched_curr(rq);
664     }
665     raw_spin_unlock_irqrestore(&rq->lock, flags);
666 }
667 
668 #ifdef CONFIG_SMP
669 #ifdef CONFIG_NO_HZ_COMMON
670 /*
671  * In the semi idle case, use the nearest busy CPU for migrating timers
672  * from an idle CPU.  This is good for power-savings.
673  *
674  * We don't do similar optimization for completely idle system, as
675  * selecting an idle CPU will add more delays to the timers than intended
676  * (as that CPU's timer base may not be uptodate wrt jiffies etc).
677  */
get_nohz_timer_target(void)678 int get_nohz_timer_target(void)
679 {
680     int i, cpu = smp_processor_id(), default_cpu = -1;
681     struct sched_domain *sd;
682 
683     if (housekeeping_cpu(cpu, HK_FLAG_TIMER) && cpu_active(cpu)) {
684         if (!idle_cpu(cpu)) {
685             return cpu;
686         }
687         default_cpu = cpu;
688     }
689 
690     rcu_read_lock();
691     for_each_domain(cpu, sd)
692     {
693         for_each_cpu_and(i, sched_domain_span(sd),
694                          housekeeping_cpumask(HK_FLAG_TIMER))
695         {
696             if (cpu == i) {
697                 continue;
698             }
699 
700             if (!idle_cpu(i)) {
701                 cpu = i;
702                 goto unlock;
703             }
704         }
705     }
706 
707     if (default_cpu == -1) {
708         for_each_cpu_and(i, cpu_active_mask,
709                          housekeeping_cpumask(HK_FLAG_TIMER))
710         {
711             if (cpu == i) {
712                 continue;
713             }
714 
715             if (!idle_cpu(i)) {
716                 cpu = i;
717                 goto unlock;
718             }
719         }
720 
721         /* no active, not-idle, housekpeeing CPU found. */
722         default_cpu = cpumask_any(cpu_active_mask);
723         if (unlikely(default_cpu >= nr_cpu_ids)) {
724             goto unlock;
725         }
726     }
727 
728     cpu = default_cpu;
729 unlock:
730     rcu_read_unlock();
731     return cpu;
732 }
733 
734 /*
735  * When add_timer_on() enqueues a timer into the timer wheel of an
736  * idle CPU then this timer might expire before the next timer event
737  * which is scheduled to wake up that CPU. In case of a completely
738  * idle system the next event might even be infinite time into the
739  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
740  * leaves the inner idle loop so the newly added timer is taken into
741  * account when the CPU goes back to idle and evaluates the timer
742  * wheel for the next timer event.
743  */
wake_up_idle_cpu(int cpu)744 static void wake_up_idle_cpu(int cpu)
745 {
746     struct rq *rq = cpu_rq(cpu);
747 
748     if (cpu == smp_processor_id()) {
749         return;
750     }
751 
752     if (set_nr_and_not_polling(rq->idle)) {
753         smp_send_reschedule(cpu);
754     } else {
755         trace_sched_wake_idle_without_ipi(cpu);
756     }
757 }
758 
wake_up_full_nohz_cpu(int cpu)759 static bool wake_up_full_nohz_cpu(int cpu)
760 {
761     /*
762      * We just need the target to call irq_exit() and re-evaluate
763      * the next tick. The nohz full kick at least implies that.
764      * If needed we can still optimize that later with an
765      * empty IRQ.
766      */
767     if (cpu_is_offline(cpu)) {
768         return true; /* Don't try to wake offline CPUs. */
769     }
770     if (tick_nohz_full_cpu(cpu)) {
771         if (cpu != smp_processor_id() || tick_nohz_tick_stopped()) {
772             tick_nohz_full_kick_cpu(cpu);
773         }
774         return true;
775     }
776 
777     return false;
778 }
779 
780 /*
781  * Wake up the specified CPU.  If the CPU is going offline, it is the
782  * caller's responsibility to deal with the lost wakeup, for example,
783  * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
784  */
wake_up_nohz_cpu(int cpu)785 void wake_up_nohz_cpu(int cpu)
786 {
787     if (!wake_up_full_nohz_cpu(cpu)) {
788         wake_up_idle_cpu(cpu);
789     }
790 }
791 
nohz_csd_func(void * info)792 static void nohz_csd_func(void *info)
793 {
794     struct rq *rq = info;
795     int cpu = cpu_of(rq);
796     unsigned int flags;
797 
798     /*
799      * Release the rq::nohz_csd.
800      */
801     flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(cpu));
802     WARN_ON(!(flags & NOHZ_KICK_MASK));
803 
804     rq->idle_balance = idle_cpu(cpu);
805     if (rq->idle_balance && !need_resched()) {
806         rq->nohz_idle_balance = flags;
807         raise_softirq_irqoff(SCHED_SOFTIRQ);
808     }
809 }
810 
811 #endif /* CONFIG_NO_HZ_COMMON */
812 
813 #ifdef CONFIG_NO_HZ_FULL
sched_can_stop_tick(struct rq * rq)814 bool sched_can_stop_tick(struct rq *rq)
815 {
816     int fifo_nr_running;
817 
818     /* Deadline tasks, even if single, need the tick */
819     if (rq->dl.dl_nr_running) {
820         return false;
821     }
822 
823     /*
824      * If there are more than one RR tasks, we need the tick to effect the
825      * actual RR behaviour.
826      */
827     if (rq->rt.rr_nr_running) {
828         if (rq->rt.rr_nr_running == 1) {
829             return true;
830         } else {
831             return false;
832         }
833     }
834 
835     /*
836      * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
837      * forced preemption between FIFO tasks.
838      */
839     fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
840     if (fifo_nr_running) {
841         return true;
842     }
843 
844     /*
845      * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
846      * if there's more than one we need the tick for involuntary
847      * preemption.
848      */
849     if (rq->nr_running > 1) {
850         return false;
851     }
852 
853     return true;
854 }
855 #endif /* CONFIG_NO_HZ_FULL */
856 #endif /* CONFIG_SMP */
857 
858 #if defined(CONFIG_RT_GROUP_SCHED) ||                                          \
859     (defined(CONFIG_FAIR_GROUP_SCHED) &&                                       \
860      (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
861 /*
862  * Iterate task_group tree rooted at *from, calling @down when first entering a
863  * node and @up when leaving it for the final time.
864  *
865  * Caller must hold rcu_lock or sufficient equivalent.
866  */
walk_tg_tree_from(struct task_group * from,tg_visitor down,tg_visitor up,void * data)867 int walk_tg_tree_from(struct task_group *from, tg_visitor down, tg_visitor up,
868                       void *data)
869 {
870     struct task_group *parent, *child;
871     int ret;
872 
873     parent = from;
874 
875 down:
876     ret = (*down)(parent, data);
877     if (ret) {
878         goto out;
879     }
880     list_for_each_entry_rcu(child, &parent->children, siblings)
881     {
882         parent = child;
883         goto down;
884 
885     up:
886         continue;
887     }
888     ret = (*up)(parent, data);
889     if (ret || parent == from) {
890         goto out;
891     }
892 
893     child = parent;
894     parent = parent->parent;
895     if (parent) {
896         goto up;
897     }
898 out:
899     return ret;
900 }
901 
tg_nop(struct task_group * tg,void * data)902 int tg_nop(struct task_group *tg, void *data)
903 {
904     return 0;
905 }
906 #endif
907 
set_load_weight(struct task_struct * p)908 static void set_load_weight(struct task_struct *p)
909 {
910     bool update_load = !(READ_ONCE(p->state) & TASK_NEW);
911     int prio = p->static_prio - MAX_RT_PRIO;
912     struct load_weight *load = &p->se.load;
913 
914     /*
915      * SCHED_IDLE tasks get minimal weight:
916      */
917     if (task_has_idle_policy(p)) {
918         load->weight = scale_load(WEIGHT_IDLEPRIO);
919         load->inv_weight = WMULT_IDLEPRIO;
920         return;
921     }
922 
923     /*
924      * SCHED_OTHER tasks have to update their load when changing their
925      * weight
926      */
927     if (update_load && p->sched_class == &fair_sched_class) {
928         reweight_task(p, prio);
929     } else {
930         load->weight = scale_load(sched_prio_to_weight[prio]);
931         load->inv_weight = sched_prio_to_wmult[prio];
932     }
933 }
934 
935 #ifdef CONFIG_SCHED_LATENCY_NICE
set_latency_weight(struct task_struct * p)936 static void set_latency_weight(struct task_struct *p)
937 {
938     p->se.latency_weight = sched_latency_to_weight[p->latency_prio];
939 }
940 
__setscheduler_latency(struct task_struct * p,const struct sched_attr * attr)941 static void __setscheduler_latency(struct task_struct *p,
942                                    const struct sched_attr *attr)
943 {
944     if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
945         p->latency_prio = NICE_TO_LATENCY(attr->sched_latency_nice);
946         set_latency_weight(p);
947     }
948 }
949 
latency_nice_validate(struct task_struct * p,bool user,const struct sched_attr * attr)950 static int latency_nice_validate(struct task_struct *p, bool user,
951                                  const struct sched_attr *attr)
952 {
953     if (attr->sched_latency_nice > MAX_LATENCY_NICE) {
954         return -EINVAL;
955     }
956     if (attr->sched_latency_nice < MIN_LATENCY_NICE) {
957         return -EINVAL;
958     }
959     /* Use the same security checks as NICE */
960     if (user && attr->sched_latency_nice < LATENCY_TO_NICE(p->latency_prio) &&
961         !capable(CAP_SYS_NICE)) {
962         return -EPERM;
963     }
964 
965     return 0;
966 }
967 #else
__setscheduler_latency(struct task_struct * p,const struct sched_attr * attr)968 static void __setscheduler_latency(struct task_struct *p,
969                                    const struct sched_attr *attr)
970 {
971 }
972 
latency_nice_validate(struct task_struct * p,bool user,const struct sched_attr * attr)973 static inline int latency_nice_validate(struct task_struct *p, bool user,
974                                         const struct sched_attr *attr)
975 {
976     return -EOPNOTSUPP;
977 }
978 #endif
979 
980 #ifdef CONFIG_UCLAMP_TASK
981 /*
982  * Serializes updates of utilization clamp values
983  *
984  * The (slow-path) user-space triggers utilization clamp value updates which
985  * can require updates on (fast-path) scheduler's data structures used to
986  * support enqueue/dequeue operations.
987  * While the per-CPU rq lock protects fast-path update operations, user-space
988  * requests are serialized using a mutex to reduce the risk of conflicting
989  * updates or API abuses.
990  */
991 static DEFINE_MUTEX(uclamp_mutex);
992 
993 /* Max allowed minimum utilization */
994 unsigned int sysctl_sched_uclamp_util_min = SCHED_CAPACITY_SCALE;
995 
996 /* Max allowed maximum utilization */
997 unsigned int sysctl_sched_uclamp_util_max = SCHED_CAPACITY_SCALE;
998 
999 /*
1000  * By default RT tasks run at the maximum performance point/capacity of the
1001  * system. Uclamp enforces this by always setting UCLAMP_MIN of RT tasks to
1002  * SCHED_CAPACITY_SCALE.
1003  *
1004  * This knob allows admins to change the default behavior when uclamp is being
1005  * used. In battery powered devices, particularly, running at the maximum
1006  * capacity and frequency will increase energy consumption and shorten the
1007  * battery life.
1008  *
1009  * This knob only affects RT tasks that their uclamp_se->user_defined == false.
1010  *
1011  * This knob will not override the system default sched_util_clamp_min defined
1012  * above.
1013  */
1014 unsigned int sysctl_sched_uclamp_util_min_rt_default = SCHED_CAPACITY_SCALE;
1015 
1016 /* All clamps are required to be less or equal than these values */
1017 static struct uclamp_se uclamp_default[UCLAMP_CNT];
1018 
1019 /*
1020  * This static key is used to reduce the uclamp overhead in the fast path. It
1021  * primarily disables the call to uclamp_rq_{inc, dec}() in
1022  * enqueue/dequeue_task().
1023  *
1024  * This allows users to continue to enable uclamp in their kernel config with
1025  * minimum uclamp overhead in the fast path.
1026  *
1027  * As soon as userspace modifies any of the uclamp knobs, the static key is
1028  * enabled, since we have an actual users that make use of uclamp
1029  * functionality.
1030  *
1031  * The knobs that would enable this static key are:
1032  *
1033  *   * A task modifying its uclamp value with sched_setattr().
1034  *   * An admin modifying the sysctl_sched_uclamp_{min, max} via procfs.
1035  *   * An admin modifying the cgroup cpu.uclamp.{min, max}
1036  */
1037 DEFINE_STATIC_KEY_FALSE(sched_uclamp_used);
1038 
1039 /* Integer rounded range for each bucket */
1040 #define UCLAMP_BUCKET_DELTA                                                    \
1041     DIV_ROUND_CLOSEST(SCHED_CAPACITY_SCALE, UCLAMP_BUCKETS)
1042 
1043 #define cycle_each_clamp_id(clamp_id)                                            \
1044     for ((clamp_id) = 0; (clamp_id) < UCLAMP_CNT; (clamp_id)++)
1045 
uclamp_bucket_id(unsigned int clamp_value)1046 static inline unsigned int uclamp_bucket_id(unsigned int clamp_value)
1047 {
1048     return min_t(unsigned int, clamp_value / UCLAMP_BUCKET_DELTA,
1049                  UCLAMP_BUCKETS - 1);
1050 }
1051 
uclamp_none(enum uclamp_id clamp_id)1052 static inline unsigned int uclamp_none(enum uclamp_id clamp_id)
1053 {
1054     if (clamp_id == UCLAMP_MIN) {
1055         return 0;
1056     }
1057     return SCHED_CAPACITY_SCALE;
1058 }
1059 
uclamp_se_set(struct uclamp_se * uc_se,unsigned int value,bool user_defined)1060 static inline void uclamp_se_set(struct uclamp_se *uc_se, unsigned int value,
1061                                  bool user_defined)
1062 {
1063     uc_se->value = value;
1064     uc_se->bucket_id = uclamp_bucket_id(value);
1065     uc_se->user_defined = user_defined;
1066 }
1067 
uclamp_idle_value(struct rq * rq,enum uclamp_id clamp_id,unsigned int clamp_value)1068 static inline unsigned int uclamp_idle_value(struct rq *rq,
1069                                              enum uclamp_id clamp_id,
1070                                              unsigned int clamp_value)
1071 {
1072     /*
1073      * Avoid blocked utilization pushing up the frequency when we go
1074      * idle (which drops the max-clamp) by retaining the last known
1075      * max-clamp.
1076      */
1077     if (clamp_id == UCLAMP_MAX) {
1078         rq->uclamp_flags |= UCLAMP_FLAG_IDLE;
1079         return clamp_value;
1080     }
1081 
1082     return uclamp_none(UCLAMP_MIN);
1083 }
1084 
uclamp_idle_reset(struct rq * rq,enum uclamp_id clamp_id,unsigned int clamp_value)1085 static inline void uclamp_idle_reset(struct rq *rq, enum uclamp_id clamp_id,
1086                                      unsigned int clamp_value)
1087 {
1088     /* Reset max-clamp retention only on idle exit */
1089     if (!(rq->uclamp_flags & UCLAMP_FLAG_IDLE)) {
1090         return;
1091     }
1092 
1093     WRITE_ONCE(rq->uclamp[clamp_id].value, clamp_value);
1094 }
1095 
uclamp_rq_max_value(struct rq * rq,enum uclamp_id clamp_id,unsigned int clamp_value)1096 static inline unsigned int uclamp_rq_max_value(struct rq *rq,
1097                                                enum uclamp_id clamp_id,
1098                                                unsigned int clamp_value)
1099 {
1100     struct uclamp_bucket *bucket = rq->uclamp[clamp_id].bucket;
1101     int bucket_id = UCLAMP_BUCKETS - 1;
1102 
1103     /*
1104      * Since both min and max clamps are max aggregated, find the
1105      * top most bucket with tasks in.
1106      */
1107     for (; bucket_id >= 0; bucket_id--) {
1108         if (!bucket[bucket_id].tasks) {
1109             continue;
1110         }
1111         return bucket[bucket_id].value;
1112     }
1113 
1114     /* No tasks -- default clamp values */
1115     return uclamp_idle_value(rq, clamp_id, clamp_value);
1116 }
1117 
__uclamp_update_util_min_rt_default(struct task_struct * p)1118 static void __uclamp_update_util_min_rt_default(struct task_struct *p)
1119 {
1120     unsigned int default_util_min;
1121     struct uclamp_se *uc_se;
1122 
1123     lockdep_assert_held(&p->pi_lock);
1124 
1125     uc_se = &p->uclamp_req[UCLAMP_MIN];
1126 
1127     /* Only sync if user didn't override the default */
1128     if (uc_se->user_defined) {
1129         return;
1130     }
1131 
1132     default_util_min = sysctl_sched_uclamp_util_min_rt_default;
1133     uclamp_se_set(uc_se, default_util_min, false);
1134 }
1135 
uclamp_update_util_min_rt_default(struct task_struct * p)1136 static void uclamp_update_util_min_rt_default(struct task_struct *p)
1137 {
1138     struct rq_flags rf;
1139     struct rq *rq;
1140 
1141     if (!rt_task(p)) {
1142         return;
1143     }
1144 
1145     /* Protect updates to p->uclamp_* */
1146     rq = task_rq_lock(p, &rf);
1147     __uclamp_update_util_min_rt_default(p);
1148     task_rq_unlock(rq, p, &rf);
1149 }
1150 
uclamp_sync_util_min_rt_default(void)1151 static void uclamp_sync_util_min_rt_default(void)
1152 {
1153     struct task_struct *g, *p;
1154 
1155     /*
1156      * copy_process()            sysctl_uclamp
1157      *                      uclamp_min_rt = X;
1158      *   write_lock(&tasklist_lock)          read_lock(&tasklist_lock)
1159      *   // link thread              smp_mb__after_spinlock()
1160      *   write_unlock(&tasklist_lock)      read_unlock(&tasklist_lock);
1161      *   sched_post_fork()              for_each_process_thread()
1162      *     __uclamp_sync_rt()            __uclamp_sync_rt()
1163      *
1164      * Ensures that either sched_post_fork() will observe the new
1165      * uclamp_min_rt or for_each_process_thread() will observe the new
1166      * task.
1167      */
1168     read_lock(&tasklist_lock);
1169     smp_mb__after_spinlock();
1170     read_unlock(&tasklist_lock);
1171 
1172     rcu_read_lock();
1173     for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p);
1174     rcu_read_unlock();
1175 }
1176 
uclamp_tg_restrict(struct task_struct * p,enum uclamp_id clamp_id)1177 static inline struct uclamp_se uclamp_tg_restrict(struct task_struct *p,
1178                                                   enum uclamp_id clamp_id)
1179 {
1180     /* Copy by value as we could modify it */
1181     struct uclamp_se uc_req = p->uclamp_req[clamp_id];
1182 #ifdef CONFIG_UCLAMP_TASK_GROUP
1183     unsigned int tg_min, tg_max, value;
1184 
1185     /*
1186      * Tasks in autogroups or root task group will be
1187      * restricted by system defaults.
1188      */
1189     if (task_group_is_autogroup(task_group(p))) {
1190         return uc_req;
1191     }
1192     if (task_group(p) == &root_task_group) {
1193         return uc_req;
1194     }
1195 
1196     tg_min = task_group(p)->uclamp[UCLAMP_MIN].value;
1197     tg_max = task_group(p)->uclamp[UCLAMP_MAX].value;
1198     value = uc_req.value;
1199     value = clamp(value, tg_min, tg_max);
1200     uclamp_se_set(&uc_req, value, false);
1201 #endif
1202 
1203     return uc_req;
1204 }
1205 
1206 /*
1207  * The effective clamp bucket index of a task depends on, by increasing
1208  * priority:
1209  * - the task specific clamp value, when explicitly requested from userspace
1210  * - the task group effective clamp value, for tasks not either in the root
1211  *   group or in an autogroup
1212  * - the system default clamp value, defined by the sysadmin
1213  */
uclamp_eff_get(struct task_struct * p,enum uclamp_id clamp_id)1214 static inline struct uclamp_se uclamp_eff_get(struct task_struct *p,
1215                                               enum uclamp_id clamp_id)
1216 {
1217     struct uclamp_se uc_req = uclamp_tg_restrict(p, clamp_id);
1218     struct uclamp_se uc_max = uclamp_default[clamp_id];
1219 
1220     /* System default restrictions always apply */
1221     if (unlikely(uc_req.value > uc_max.value)) {
1222         return uc_max;
1223     }
1224 
1225     return uc_req;
1226 }
1227 
uclamp_eff_value(struct task_struct * p,enum uclamp_id clamp_id)1228 unsigned long uclamp_eff_value(struct task_struct *p, enum uclamp_id clamp_id)
1229 {
1230     struct uclamp_se uc_eff;
1231 
1232     /* Task currently refcounted: use back-annotated (effective) value */
1233     if (p->uclamp[clamp_id].active) {
1234         return (unsigned long)p->uclamp[clamp_id].value;
1235     }
1236 
1237     uc_eff = uclamp_eff_get(p, clamp_id);
1238 
1239     return (unsigned long)uc_eff.value;
1240 }
1241 
1242 /*
1243  * When a task is enqueued on a rq, the clamp bucket currently defined by the
1244  * task's uclamp::bucket_id is refcounted on that rq. This also immediately
1245  * updates the rq's clamp value if required.
1246  *
1247  * Tasks can have a task-specific value requested from user-space, track
1248  * within each bucket the maximum value for tasks refcounted in it.
1249  * This "local max aggregation" allows to track the exact "requested" value
1250  * for each bucket when all its RUNNABLE tasks require the same clamp.
1251  */
uclamp_rq_inc_id(struct rq * rq,struct task_struct * p,enum uclamp_id clamp_id)1252 static inline void uclamp_rq_inc_id(struct rq *rq, struct task_struct *p,
1253                                     enum uclamp_id clamp_id)
1254 {
1255     struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1256     struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1257     struct uclamp_bucket *bucket;
1258 
1259     lockdep_assert_held(&rq->lock);
1260 
1261     /* Update task effective clamp */
1262     p->uclamp[clamp_id] = uclamp_eff_get(p, clamp_id);
1263 
1264     bucket = &uc_rq->bucket[uc_se->bucket_id];
1265     bucket->tasks++;
1266     uc_se->active = true;
1267 
1268     uclamp_idle_reset(rq, clamp_id, uc_se->value);
1269 
1270     /*
1271      * Local max aggregation: rq buckets always track the max
1272      * "requested" clamp value of its RUNNABLE tasks.
1273      */
1274     if (bucket->tasks == 1 || uc_se->value > bucket->value) {
1275         bucket->value = uc_se->value;
1276     }
1277 
1278     if (uc_se->value > READ_ONCE(uc_rq->value)) {
1279         WRITE_ONCE(uc_rq->value, uc_se->value);
1280     }
1281 }
1282 
1283 /*
1284  * When a task is dequeued from a rq, the clamp bucket refcounted by the task
1285  * is released. If this is the last task reference counting the rq's max
1286  * active clamp value, then the rq's clamp value is updated.
1287  *
1288  * Both refcounted tasks and rq's cached clamp values are expected to be
1289  * always valid. If it's detected they are not, as defensive programming,
1290  * enforce the expected state and warn.
1291  */
uclamp_rq_dec_id(struct rq * rq,struct task_struct * p,enum uclamp_id clamp_id)1292 static inline void uclamp_rq_dec_id(struct rq *rq, struct task_struct *p,
1293                                     enum uclamp_id clamp_id)
1294 {
1295     struct uclamp_rq *uc_rq = &rq->uclamp[clamp_id];
1296     struct uclamp_se *uc_se = &p->uclamp[clamp_id];
1297     struct uclamp_bucket *bucket;
1298     unsigned int bkt_clamp;
1299     unsigned int rq_clamp;
1300 
1301     lockdep_assert_held(&rq->lock);
1302 
1303     /*
1304      * If sched_uclamp_used was enabled after task @p was enqueued,
1305      * we could end up with unbalanced call to uclamp_rq_dec_id().
1306      *
1307      * In this case the uc_se->active flag should be false since no uclamp
1308      * accounting was performed at enqueue time and we can just return
1309      * here.
1310      *
1311      * Need to be careful of the following enqeueue/dequeue ordering
1312      * problem too
1313      *
1314      *    enqueue(taskA)
1315      *    // sched_uclamp_used gets enabled
1316      *    enqueue(taskB)
1317      *    dequeue(taskA)
1318      *    // Must not decrement bukcet->tasks here
1319      *    dequeue(taskB)
1320      *
1321      * where we could end up with stale data in uc_se and
1322      * bucket[uc_se->bucket_id].
1323      *
1324      * The following check here eliminates the possibility of such race.
1325      */
1326     if (unlikely(!uc_se->active)) {
1327         return;
1328     }
1329 
1330     bucket = &uc_rq->bucket[uc_se->bucket_id];
1331 
1332     SCHED_WARN_ON(!bucket->tasks);
1333     if (likely(bucket->tasks)) {
1334         bucket->tasks--;
1335     }
1336 
1337     uc_se->active = false;
1338 
1339     /*
1340      * Keep "local max aggregation" simple and accept to (possibly)
1341      * overboost some RUNNABLE tasks in the same bucket.
1342      * The rq clamp bucket value is reset to its base value whenever
1343      * there are no more RUNNABLE tasks refcounting it.
1344      */
1345     if (likely(bucket->tasks)) {
1346         return;
1347     }
1348 
1349     rq_clamp = READ_ONCE(uc_rq->value);
1350     /*
1351      * Defensive programming: this should never happen. If it happens,
1352      * e.g. due to future modification, warn and fixup the expected value.
1353      */
1354     SCHED_WARN_ON(bucket->value > rq_clamp);
1355     if (bucket->value >= rq_clamp) {
1356         bkt_clamp = uclamp_rq_max_value(rq, clamp_id, uc_se->value);
1357         WRITE_ONCE(uc_rq->value, bkt_clamp);
1358     }
1359 }
1360 
uclamp_rq_inc(struct rq * rq,struct task_struct * p)1361 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1362 {
1363     enum uclamp_id clamp_id;
1364 
1365     /*
1366      * Avoid any overhead until uclamp is actually used by the userspace.
1367      *
1368      * The condition is constructed such that a NOP is generated when
1369      * sched_uclamp_used is disabled.
1370      */
1371     if (!static_branch_unlikely(&sched_uclamp_used)) {
1372         return;
1373     }
1374 
1375     if (unlikely(!p->sched_class->uclamp_enabled)) {
1376         return;
1377     }
1378 
1379     cycle_each_clamp_id(clamp_id) uclamp_rq_inc_id(rq, p, clamp_id);
1380 
1381     /* Reset clamp idle holding when there is one RUNNABLE task */
1382     if (rq->uclamp_flags & UCLAMP_FLAG_IDLE) {
1383         rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1384     }
1385 }
1386 
uclamp_rq_dec(struct rq * rq,struct task_struct * p)1387 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1388 {
1389     enum uclamp_id clamp_id;
1390 
1391     /*
1392      * Avoid any overhead until uclamp is actually used by the userspace.
1393      *
1394      * The condition is constructed such that a NOP is generated when
1395      * sched_uclamp_used is disabled.
1396      */
1397     if (!static_branch_unlikely(&sched_uclamp_used)) {
1398         return;
1399     }
1400 
1401     if (unlikely(!p->sched_class->uclamp_enabled)) {
1402         return;
1403     }
1404 
1405     cycle_each_clamp_id(clamp_id) uclamp_rq_dec_id(rq, p, clamp_id);
1406 }
1407 
uclamp_rq_reinc_id(struct rq * rq,struct task_struct * p,enum uclamp_id clamp_id)1408 static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
1409                                       enum uclamp_id clamp_id)
1410 {
1411     if (!p->uclamp[clamp_id].active) {
1412         return;
1413     }
1414 
1415     uclamp_rq_dec_id(rq, p, clamp_id);
1416     uclamp_rq_inc_id(rq, p, clamp_id);
1417 
1418     /*
1419      * Make sure to clear the idle flag if we've transiently reached 0
1420      * active tasks on rq.
1421      */
1422     if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE)) {
1423         rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
1424     }
1425 }
1426 
uclamp_update_active(struct task_struct * p)1427 static inline void uclamp_update_active(struct task_struct *p)
1428 {
1429     enum uclamp_id clamp_id;
1430     struct rq_flags rf;
1431     struct rq *rq;
1432 
1433     /*
1434      * Lock the task and the rq where the task is (or was) queued.
1435      *
1436      * We might lock the (previous) rq of a !RUNNABLE task, but that's the
1437      * price to pay to safely serialize util_{min,max} updates with
1438      * enqueues, dequeues and migration operations.
1439      * This is the same locking schema used by __set_cpus_allowed_ptr().
1440      */
1441     rq = task_rq_lock(p, &rf);
1442 
1443     /*
1444      * Setting the clamp bucket is serialized by task_rq_lock().
1445      * If the task is not yet RUNNABLE and its task_struct is not
1446      * affecting a valid clamp bucket, the next time it's enqueued,
1447      * it will already see the updated clamp bucket value.
1448      */
1449     cycle_each_clamp_id(clamp_id) uclamp_rq_reinc_id(rq, p, clamp_id);
1450 
1451     task_rq_unlock(rq, p, &rf);
1452 }
1453 
1454 #ifdef CONFIG_UCLAMP_TASK_GROUP
uclamp_update_active_tasks(struct cgroup_subsys_state * css)1455 static inline void uclamp_update_active_tasks(struct cgroup_subsys_state *css)
1456 {
1457     struct css_task_iter it;
1458     struct task_struct *p;
1459 
1460     css_task_iter_start(css, 0, &it);
1461     while ((p = css_task_iter_next(&it))) {
1462         uclamp_update_active(p);
1463     }
1464     css_task_iter_end(&it);
1465 }
1466 
1467 static void cpu_util_update_eff(struct cgroup_subsys_state *css);
uclamp_update_root_tg(void)1468 static void uclamp_update_root_tg(void)
1469 {
1470     struct task_group *tg = &root_task_group;
1471 
1472     uclamp_se_set(&tg->uclamp_req[UCLAMP_MIN], sysctl_sched_uclamp_util_min,
1473                   false);
1474     uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max,
1475                   false);
1476 
1477     rcu_read_lock();
1478     cpu_util_update_eff(&root_task_group.css);
1479     rcu_read_unlock();
1480 }
1481 #else
uclamp_update_root_tg(void)1482 static void uclamp_update_root_tg(void)
1483 {
1484 }
1485 #endif
1486 
sysctl_sched_uclamp_handler(struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)1487 int sysctl_sched_uclamp_handler(struct ctl_table *table, int write,
1488                                 void *buffer, size_t *lenp, loff_t *ppos)
1489 {
1490     bool update_root_tg = false;
1491     int old_min, old_max, old_min_rt;
1492     int result;
1493 
1494     mutex_lock(&uclamp_mutex);
1495     old_min = sysctl_sched_uclamp_util_min;
1496     old_max = sysctl_sched_uclamp_util_max;
1497     old_min_rt = sysctl_sched_uclamp_util_min_rt_default;
1498 
1499     result = proc_dointvec(table, write, buffer, lenp, ppos);
1500     if (result) {
1501         goto undo;
1502     }
1503     if (!write) {
1504         goto done;
1505     }
1506 
1507     if (sysctl_sched_uclamp_util_min > sysctl_sched_uclamp_util_max ||
1508         sysctl_sched_uclamp_util_max > SCHED_CAPACITY_SCALE ||
1509         sysctl_sched_uclamp_util_min_rt_default > SCHED_CAPACITY_SCALE) {
1510         result = -EINVAL;
1511         goto undo;
1512     }
1513 
1514     if (old_min != sysctl_sched_uclamp_util_min) {
1515         uclamp_se_set(&uclamp_default[UCLAMP_MIN], sysctl_sched_uclamp_util_min,
1516                       false);
1517         update_root_tg = true;
1518     }
1519     if (old_max != sysctl_sched_uclamp_util_max) {
1520         uclamp_se_set(&uclamp_default[UCLAMP_MAX], sysctl_sched_uclamp_util_max,
1521                       false);
1522         update_root_tg = true;
1523     }
1524 
1525     if (update_root_tg) {
1526         static_branch_enable(&sched_uclamp_used);
1527         uclamp_update_root_tg();
1528     }
1529 
1530     if (old_min_rt != sysctl_sched_uclamp_util_min_rt_default) {
1531         static_branch_enable(&sched_uclamp_used);
1532         uclamp_sync_util_min_rt_default();
1533     }
1534 
1535     /*
1536      * We update all RUNNABLE tasks only when task groups are in use.
1537      * Otherwise, keep it simple and do just a lazy update at each next
1538      * task enqueue time.
1539      */
1540 
1541     goto done;
1542 
1543 undo:
1544     sysctl_sched_uclamp_util_min = old_min;
1545     sysctl_sched_uclamp_util_max = old_max;
1546     sysctl_sched_uclamp_util_min_rt_default = old_min_rt;
1547 done:
1548     mutex_unlock(&uclamp_mutex);
1549 
1550     return result;
1551 }
1552 
uclamp_validate(struct task_struct * p,const struct sched_attr * attr)1553 static int uclamp_validate(struct task_struct *p, const struct sched_attr *attr)
1554 {
1555     unsigned int lower_bound = p->uclamp_req[UCLAMP_MIN].value;
1556     unsigned int upper_bound = p->uclamp_req[UCLAMP_MAX].value;
1557 
1558     if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1559         lower_bound = attr->sched_util_min;
1560     }
1561     if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1562         upper_bound = attr->sched_util_max;
1563     }
1564 
1565     if (lower_bound > upper_bound) {
1566         return -EINVAL;
1567     }
1568     if (upper_bound > SCHED_CAPACITY_SCALE) {
1569         return -EINVAL;
1570     }
1571 
1572     /*
1573      * We have valid uclamp attributes; make sure uclamp is enabled.
1574      *
1575      * We need to do that here, because enabling static branches is a
1576      * blocking operation which obviously cannot be done while holding
1577      * scheduler locks.
1578      */
1579     static_branch_enable(&sched_uclamp_used);
1580 
1581     return 0;
1582 }
1583 
__setscheduler_uclamp(struct task_struct * p,const struct sched_attr * attr)1584 static void __setscheduler_uclamp(struct task_struct *p,
1585                                   const struct sched_attr *attr)
1586 {
1587     enum uclamp_id clamp_id;
1588 
1589     /*
1590      * On scheduling class change, reset to default clamps for tasks
1591      * without a task-specific value.
1592      */
1593     cycle_each_clamp_id(clamp_id) {
1594         struct uclamp_se *uc_se = &p->uclamp_req[clamp_id];
1595 
1596         /* Keep using defined clamps across class changes */
1597         if (uc_se->user_defined) {
1598             continue;
1599         }
1600 
1601         /*
1602          * RT by default have a 100% boost value that could be modified
1603          * at runtime.
1604          */
1605         if (unlikely(rt_task(p) && clamp_id == UCLAMP_MIN)) {
1606             __uclamp_update_util_min_rt_default(p);
1607         } else {
1608             uclamp_se_set(uc_se, uclamp_none(clamp_id), false);
1609         }
1610     }
1611 
1612     if (likely(!(attr->sched_flags & SCHED_FLAG_UTIL_CLAMP))) {
1613         return;
1614     }
1615 
1616     if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MIN) {
1617         uclamp_se_set(&p->uclamp_req[UCLAMP_MIN], attr->sched_util_min, true);
1618     }
1619 
1620     if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP_MAX) {
1621         uclamp_se_set(&p->uclamp_req[UCLAMP_MAX], attr->sched_util_max, true);
1622     }
1623 }
1624 
uclamp_fork(struct task_struct * p)1625 static void uclamp_fork(struct task_struct *p)
1626 {
1627     enum uclamp_id clamp_id;
1628 
1629     /*
1630      * We don't need to hold task_rq_lock() when updating p->uclamp_* here
1631      * as the task is still at its early fork stages.
1632      */
1633     cycle_each_clamp_id(clamp_id) p->uclamp[clamp_id].active = false;
1634 
1635     if (likely(!p->sched_reset_on_fork)) {
1636         return;
1637     }
1638 
1639     cycle_each_clamp_id(clamp_id) {
1640         uclamp_se_set(&p->uclamp_req[clamp_id], uclamp_none(clamp_id), false);
1641     }
1642 }
1643 
uclamp_post_fork(struct task_struct * p)1644 static void uclamp_post_fork(struct task_struct *p)
1645 {
1646     uclamp_update_util_min_rt_default(p);
1647 }
1648 
init_uclamp_rq(struct rq * rq)1649 static void __init init_uclamp_rq(struct rq *rq)
1650 {
1651     enum uclamp_id clamp_id;
1652     struct uclamp_rq *uc_rq = rq->uclamp;
1653 
1654     cycle_each_clamp_id(clamp_id) {
1655         uc_rq[clamp_id] = (struct uclamp_rq) {.value = uclamp_none(clamp_id)};
1656     }
1657 
1658     rq->uclamp_flags = UCLAMP_FLAG_IDLE;
1659 }
1660 
init_uclamp(void)1661 static void __init init_uclamp(void)
1662 {
1663     struct uclamp_se uc_max = {};
1664     enum uclamp_id clamp_id;
1665     int cpu;
1666 
1667     for_each_possible_cpu(cpu) init_uclamp_rq(cpu_rq(cpu));
1668 
1669     cycle_each_clamp_id(clamp_id)
1670     {
1671         uclamp_se_set(&init_task.uclamp_req[clamp_id], uclamp_none(clamp_id),
1672                       false);
1673     }
1674 
1675     /* System defaults allow max clamp values for both indexes */
1676     uclamp_se_set(&uc_max, uclamp_none(UCLAMP_MAX), false);
1677     cycle_each_clamp_id(clamp_id)
1678     {
1679         uclamp_default[clamp_id] = uc_max;
1680 #ifdef CONFIG_UCLAMP_TASK_GROUP
1681         root_task_group.uclamp_req[clamp_id] = uc_max;
1682         root_task_group.uclamp[clamp_id] = uc_max;
1683 #endif
1684     }
1685 }
1686 
1687 #else  /* CONFIG_UCLAMP_TASK */
uclamp_rq_inc(struct rq * rq,struct task_struct * p)1688 static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p)
1689 {
1690 }
uclamp_rq_dec(struct rq * rq,struct task_struct * p)1691 static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
1692 {
1693 }
uclamp_validate(struct task_struct * p,const struct sched_attr * attr)1694 static inline int uclamp_validate(struct task_struct *p,
1695                                   const struct sched_attr *attr)
1696 {
1697     return -EOPNOTSUPP;
1698 }
__setscheduler_uclamp(struct task_struct * p,const struct sched_attr * attr)1699 static void __setscheduler_uclamp(struct task_struct *p,
1700                                   const struct sched_attr *attr)
1701 {
1702 }
uclamp_fork(struct task_struct * p)1703 static inline void uclamp_fork(struct task_struct *p)
1704 {
1705 }
uclamp_post_fork(struct task_struct * p)1706 static inline void uclamp_post_fork(struct task_struct *p)
1707 {
1708 }
init_uclamp(void)1709 static inline void init_uclamp(void)
1710 {
1711 }
1712 #endif /* CONFIG_UCLAMP_TASK */
1713 
enqueue_task(struct rq * rq,struct task_struct * p,int flags)1714 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
1715 {
1716     if (!(flags & ENQUEUE_NOCLOCK)) {
1717         update_rq_clock(rq);
1718     }
1719 
1720     if (!(flags & ENQUEUE_RESTORE)) {
1721         sched_info_queued(rq, p);
1722         psi_enqueue(p, flags & ENQUEUE_WAKEUP);
1723     }
1724 
1725     uclamp_rq_inc(rq, p);
1726     p->sched_class->enqueue_task(rq, p, flags);
1727 }
1728 
dequeue_task(struct rq * rq,struct task_struct * p,int flags)1729 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
1730 {
1731     if (!(flags & DEQUEUE_NOCLOCK)) {
1732         update_rq_clock(rq);
1733     }
1734 
1735     if (!(flags & DEQUEUE_SAVE)) {
1736         sched_info_dequeued(rq, p);
1737         psi_dequeue(p, flags & DEQUEUE_SLEEP);
1738     }
1739 
1740     uclamp_rq_dec(rq, p);
1741     p->sched_class->dequeue_task(rq, p, flags);
1742 }
1743 
activate_task(struct rq * rq,struct task_struct * p,int flags)1744 void activate_task(struct rq *rq, struct task_struct *p, int flags)
1745 {
1746     enqueue_task(rq, p, flags);
1747 
1748     p->on_rq = TASK_ON_RQ_QUEUED;
1749 }
1750 
deactivate_task(struct rq * rq,struct task_struct * p,int flags)1751 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
1752 {
1753     p->on_rq = (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING;
1754 
1755     dequeue_task(rq, p, flags);
1756 }
1757 
__normal_prio(int policy,int rt_prio,int nice)1758 static inline int __normal_prio(int policy, int rt_prio, int nice)
1759 {
1760     int prio;
1761 
1762     if (dl_policy(policy)) {
1763         prio = MAX_DL_PRIO - 1;
1764     } else if (rt_policy(policy)) {
1765         prio = MAX_RT_PRIO - 1 - rt_prio;
1766     } else {
1767         prio = NICE_TO_PRIO(nice);
1768     }
1769 
1770     return prio;
1771 }
1772 
1773 /*
1774  * Calculate the expected normal priority: i.e. priority
1775  * without taking RT-inheritance into account. Might be
1776  * boosted by interactivity modifiers. Changes upon fork,
1777  * setprio syscalls, and whenever the interactivity
1778  * estimator recalculates.
1779  */
normal_prio(struct task_struct * p)1780 static inline int normal_prio(struct task_struct *p)
1781 {
1782     return __normal_prio(p->policy, p->rt_priority,
1783                          PRIO_TO_NICE(p->static_prio));
1784 }
1785 
1786 /*
1787  * Calculate the current priority, i.e. the priority
1788  * taken into account by the scheduler. This value might
1789  * be boosted by RT tasks, or might be boosted by
1790  * interactivity modifiers. Will be RT if the task got
1791  * RT-boosted. If not then it returns p->normal_prio.
1792  */
effective_prio(struct task_struct * p)1793 static int effective_prio(struct task_struct *p)
1794 {
1795     p->normal_prio = normal_prio(p);
1796     /*
1797      * If we are RT tasks or we were boosted to RT priority,
1798      * keep the priority unchanged. Otherwise, update priority
1799      * to the normal priority:
1800      */
1801     if (!rt_prio(p->prio)) {
1802         return p->normal_prio;
1803     }
1804     return p->prio;
1805 }
1806 
1807 /**
1808  * task_curr - is this task currently executing on a CPU?
1809  * @p: the task in question.
1810  *
1811  * Return: 1 if the task is currently executing. 0 otherwise.
1812  */
task_curr(const struct task_struct * p)1813 inline int task_curr(const struct task_struct *p)
1814 {
1815     return cpu_curr(task_cpu(p)) == p;
1816 }
1817 
1818 /*
1819  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
1820  * use the balance_callback list if you want balancing.
1821  *
1822  * this means any call to check_class_changed() must be followed by a call to
1823  * balance_callback().
1824  */
check_class_changed(struct rq * rq,struct task_struct * p,const struct sched_class * prev_class,int oldprio)1825 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1826                                        const struct sched_class *prev_class,
1827                                        int oldprio)
1828 {
1829     if (prev_class != p->sched_class) {
1830         if (prev_class->switched_from) {
1831             prev_class->switched_from(rq, p);
1832         }
1833 
1834         p->sched_class->switched_to(rq, p);
1835     } else if (oldprio != p->prio || dl_task(p)) {
1836         p->sched_class->prio_changed(rq, p, oldprio);
1837     }
1838 }
1839 
check_preempt_curr(struct rq * rq,struct task_struct * p,int flags)1840 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
1841 {
1842     if (p->sched_class == rq->curr->sched_class) {
1843         rq->curr->sched_class->check_preempt_curr(rq, p, flags);
1844     } else if (p->sched_class > rq->curr->sched_class) {
1845         resched_curr(rq);
1846     }
1847 
1848     /*
1849      * A queue event has occurred, and we're going to schedule.  In
1850      * this case, we can save a useless back to back clock update.
1851      */
1852     if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr)) {
1853         rq_clock_skip_update(rq);
1854     }
1855 }
1856 
1857 #ifdef CONFIG_SMP
1858 
1859 /*
1860  * Per-CPU kthreads are allowed to run on !active && online CPUs, see
1861  * __set_cpus_allowed_ptr() and select_fallback_rq().
1862  */
is_cpu_allowed(struct task_struct * p,int cpu)1863 static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
1864 {
1865     if (!cpumask_test_cpu(cpu, p->cpus_ptr)) {
1866         return false;
1867     }
1868 
1869     if (is_per_cpu_kthread(p)) {
1870         return cpu_online(cpu);
1871     }
1872 
1873     if (!cpu_active(cpu)) {
1874         return false;
1875     }
1876 
1877     return cpumask_test_cpu(cpu, task_cpu_possible_mask(p));
1878 }
1879 
1880 /*
1881  * This is how migration works
1882  *
1883  * 1) we invoke migration_cpu_stop() on the target CPU using
1884  *    stop_one_cpu().
1885  * 2) stopper starts to run (implicitly forcing the migrated thread
1886  *    off the CPU)
1887  * 3) it checks whether the migrated task is still in the wrong runqueue.
1888  * 4) if it's in the wrong runqueue then the migration thread removes
1889  *    it and puts it into the right queue.
1890  * 5) stopper completes and stop_one_cpu() returns and the migration
1891  *    is done.
1892  */
1893 
1894 /*
1895  * move_queued_task - move a queued task to new rq.
1896  *
1897  * Returns (locked) new rq. Old rq's lock is released.
1898  */
move_queued_task(struct rq * rq,struct rq_flags * rf,struct task_struct * p,int new_cpu)1899 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
1900                                    struct task_struct *p, int new_cpu)
1901 {
1902     lockdep_assert_held(&rq->lock);
1903 
1904     deactivate_task(rq, p, DEQUEUE_NOCLOCK);
1905 #ifdef CONFIG_SCHED_WALT
1906     double_lock_balance(rq, cpu_rq(new_cpu));
1907     if (!(rq->clock_update_flags & RQCF_UPDATED)) {
1908         update_rq_clock(rq);
1909     }
1910 #endif
1911     set_task_cpu(p, new_cpu);
1912 #ifdef CONFIG_SCHED_WALT
1913     double_rq_unlock(cpu_rq(new_cpu), rq);
1914 #else
1915     rq_unlock(rq, rf);
1916 #endif
1917 
1918     rq = cpu_rq(new_cpu);
1919 
1920     rq_lock(rq, rf);
1921     BUG_ON(task_cpu(p) != new_cpu);
1922     activate_task(rq, p, 0);
1923     check_preempt_curr(rq, p, 0);
1924 
1925     return rq;
1926 }
1927 
1928 struct migration_arg {
1929     struct task_struct *task;
1930     int dest_cpu;
1931 };
1932 
1933 /*
1934  * Move (not current) task off this CPU, onto the destination CPU. We're doing
1935  * this because either it can't run here any more (set_cpus_allowed()
1936  * away from this CPU, or CPU going down), or because we're
1937  * attempting to rebalance this task on exec (sched_exec).
1938  *
1939  * So we race with normal scheduler movements, but that's OK, as long
1940  * as the task is no longer on this CPU.
1941  */
__migrate_task(struct rq * rq,struct rq_flags * rf,struct task_struct * p,int dest_cpu)1942 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
1943                                  struct task_struct *p, int dest_cpu)
1944 {
1945     /* Affinity changed (again). */
1946     if (!is_cpu_allowed(p, dest_cpu)) {
1947         return rq;
1948     }
1949 
1950     update_rq_clock(rq);
1951     rq = move_queued_task(rq, rf, p, dest_cpu);
1952 
1953     return rq;
1954 }
1955 
1956 /*
1957  * migration_cpu_stop - this will be executed by a highprio stopper thread
1958  * and performs thread migration by bumping thread off CPU then
1959  * 'pushing' onto another runqueue.
1960  */
migration_cpu_stop(void * data)1961 static int migration_cpu_stop(void *data)
1962 {
1963     struct migration_arg *arg = data;
1964     struct task_struct *p = arg->task;
1965     struct rq *rq = this_rq();
1966     struct rq_flags rf;
1967 
1968     /*
1969      * The original target CPU might have gone down and we might
1970      * be on another CPU but it doesn't matter.
1971      */
1972     local_irq_disable();
1973     /*
1974      * We need to explicitly wake pending tasks before running
1975      * __migrate_task() such that we will not miss enforcing cpus_ptr
1976      * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
1977      */
1978     flush_smp_call_function_from_idle();
1979 
1980     raw_spin_lock(&p->pi_lock);
1981     rq_lock(rq, &rf);
1982     /*
1983      * If task_rq(p) != rq, it cannot be migrated here, because we're
1984      * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1985      * we're holding p->pi_lock.
1986      */
1987     if (task_rq(p) == rq) {
1988         if (task_on_rq_queued(p)) {
1989             rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1990         } else {
1991             p->wake_cpu = arg->dest_cpu;
1992         }
1993     }
1994     rq_unlock(rq, &rf);
1995     raw_spin_unlock(&p->pi_lock);
1996 
1997     local_irq_enable();
1998     return 0;
1999 }
2000 
2001 /*
2002  * sched_class::set_cpus_allowed must do the below, but is not required to
2003  * actually call this function.
2004  */
set_cpus_allowed_common(struct task_struct * p,const struct cpumask * new_mask)2005 void set_cpus_allowed_common(struct task_struct *p,
2006                              const struct cpumask *new_mask)
2007 {
2008     cpumask_copy(&p->cpus_mask, new_mask);
2009     p->nr_cpus_allowed = cpumask_weight(new_mask);
2010 }
2011 
do_set_cpus_allowed(struct task_struct * p,const struct cpumask * new_mask)2012 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
2013 {
2014     struct rq *rq = task_rq(p);
2015     bool queued, running;
2016 
2017     lockdep_assert_held(&p->pi_lock);
2018 
2019     queued = task_on_rq_queued(p);
2020     running = task_current(rq, p);
2021 
2022     if (queued) {
2023         /*
2024          * Because __kthread_bind() calls this on blocked tasks without
2025          * holding rq->lock.
2026          */
2027         lockdep_assert_held(&rq->lock);
2028         dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
2029     }
2030     if (running) {
2031         put_prev_task(rq, p);
2032     }
2033 
2034     p->sched_class->set_cpus_allowed(p, new_mask);
2035 
2036     if (queued) {
2037         enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
2038     }
2039     if (running) {
2040         set_next_task(rq, p);
2041     }
2042 }
2043 
2044 /*
2045  * Change a given task's CPU affinity. Migrate the thread to a
2046  * proper CPU and schedule it away if the CPU it's executing on
2047  * is removed from the allowed bitmask.
2048  *
2049  * NOTE: the caller must have a valid reference to the task, the
2050  * task must not exit() & deallocate itself prematurely. The
2051  * call is not atomic; no spinlocks may be held.
2052  */
__set_cpus_allowed_ptr(struct task_struct * p,const struct cpumask * new_mask,bool check)2053 static int __set_cpus_allowed_ptr(struct task_struct *p,
2054                                   const struct cpumask *new_mask, bool check)
2055 {
2056     const struct cpumask *cpu_valid_mask = cpu_active_mask;
2057     unsigned int dest_cpu;
2058     struct rq_flags rf;
2059     struct rq *rq;
2060     int ret = 0;
2061 #ifdef CONFIG_CPU_ISOLATION_OPT
2062     cpumask_t allowed_mask;
2063 #endif
2064 
2065     rq = task_rq_lock(p, &rf);
2066     update_rq_clock(rq);
2067 
2068     if (p->flags & PF_KTHREAD) {
2069         /*
2070          * Kernel threads are allowed on online && !active CPUs
2071          */
2072         cpu_valid_mask = cpu_online_mask;
2073     }
2074 
2075     /*
2076      * Must re-check here, to close a race against __kthread_bind(),
2077      * sched_setaffinity() is not guaranteed to observe the flag.
2078      */
2079     if (check && (p->flags & PF_NO_SETAFFINITY)) {
2080         ret = -EINVAL;
2081         goto out;
2082     }
2083 
2084     if (cpumask_equal(&p->cpus_mask, new_mask)) {
2085         goto out;
2086     }
2087 
2088 #ifdef CONFIG_CPU_ISOLATION_OPT
2089     cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
2090     cpumask_and(&allowed_mask, &allowed_mask, cpu_valid_mask);
2091 
2092     dest_cpu = cpumask_any(&allowed_mask);
2093     if (dest_cpu >= nr_cpu_ids) {
2094         cpumask_and(&allowed_mask, cpu_valid_mask, new_mask);
2095         dest_cpu = cpumask_any(&allowed_mask);
2096         if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
2097             ret = -EINVAL;
2098             goto out;
2099         }
2100     }
2101 #else
2102     /*
2103      * Picking a ~random cpu helps in cases where we are changing affinity
2104      * for groups of tasks (ie. cpuset), so that load balancing is not
2105      * immediately required to distribute the tasks within their new mask.
2106      */
2107     dest_cpu = cpumask_any_and_distribute(cpu_valid_mask, new_mask);
2108     if (dest_cpu >= nr_cpu_ids) {
2109         ret = -EINVAL;
2110         goto out;
2111     }
2112 #endif
2113 
2114     do_set_cpus_allowed(p, new_mask);
2115 
2116     if (p->flags & PF_KTHREAD) {
2117         /*
2118          * For kernel threads that do indeed end up on online &&
2119          * !active we want to ensure they are strict per-CPU threads.
2120          */
2121         WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
2122                 !cpumask_intersects(new_mask, cpu_active_mask) &&
2123                 p->nr_cpus_allowed != 1);
2124     }
2125 
2126     /* Can the task run on the task's current CPU? If so, we're done */
2127 #ifdef CONFIG_CPU_ISOLATION_OPT
2128     if (cpumask_test_cpu(task_cpu(p), &allowed_mask)) {
2129         goto out;
2130     }
2131 #else
2132     if (cpumask_test_cpu(task_cpu(p), new_mask)) {
2133         goto out;
2134     }
2135 #endif
2136 
2137     if (task_running(rq, p) || p->state == TASK_WAKING) {
2138         struct migration_arg arg = {p, dest_cpu};
2139         /* Need help from migration thread: drop lock and wait. */
2140         task_rq_unlock(rq, p, &rf);
2141         stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
2142         return 0;
2143     } else if (task_on_rq_queued(p)) {
2144         /*
2145          * OK, since we're going to drop the lock immediately
2146          * afterwards anyway.
2147          */
2148         rq = move_queued_task(rq, &rf, p, dest_cpu);
2149     }
2150 out:
2151     task_rq_unlock(rq, p, &rf);
2152 
2153     return ret;
2154 }
2155 
set_cpus_allowed_ptr(struct task_struct * p,const struct cpumask * new_mask)2156 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
2157 {
2158     return __set_cpus_allowed_ptr(p, new_mask, false);
2159 }
2160 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
2161 
set_task_cpu(struct task_struct * p,unsigned int new_cpu)2162 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
2163 {
2164 #ifdef CONFIG_SCHED_DEBUG
2165     /*
2166      * We should never call set_task_cpu() on a blocked task,
2167      * ttwu() will sort out the placement.
2168      */
2169     WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
2170                  !p->on_rq);
2171 
2172     /*
2173      * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
2174      * because schedstat_wait_{start,end} rebase migrating task's wait_start
2175      * time relying on p->on_rq.
2176      */
2177     WARN_ON_ONCE(p->state == TASK_RUNNING &&
2178                  p->sched_class == &fair_sched_class &&
2179                  (p->on_rq && !task_on_rq_migrating(p)));
2180 
2181 #ifdef CONFIG_LOCKDEP
2182     /*
2183      * The caller should hold either p->pi_lock or rq->lock, when changing
2184      * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
2185      *
2186      * sched_move_task() holds both and thus holding either pins the cgroup,
2187      * see task_group().
2188      *
2189      * Furthermore, all task_rq users should acquire both locks, see
2190      * task_rq_lock().
2191      */
2192     WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
2193                                   lockdep_is_held(&task_rq(p)->lock)));
2194 #endif
2195     /*
2196      * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
2197      */
2198     WARN_ON_ONCE(!cpu_online(new_cpu));
2199 #endif
2200 
2201     trace_sched_migrate_task(p, new_cpu);
2202 
2203     if (task_cpu(p) != new_cpu) {
2204         if (p->sched_class->migrate_task_rq) {
2205             p->sched_class->migrate_task_rq(p, new_cpu);
2206         }
2207         p->se.nr_migrations++;
2208         rseq_migrate(p);
2209         perf_event_task_migrate(p);
2210         fixup_busy_time(p, new_cpu);
2211     }
2212 
2213     __set_task_cpu(p, new_cpu);
2214 }
2215 
2216 #ifdef CONFIG_NUMA_BALANCING
__migrate_swap_task(struct task_struct * p,int cpu)2217 static void __migrate_swap_task(struct task_struct *p, int cpu)
2218 {
2219     if (task_on_rq_queued(p)) {
2220         struct rq *src_rq, *dst_rq;
2221         struct rq_flags srf, drf;
2222 
2223         src_rq = task_rq(p);
2224         dst_rq = cpu_rq(cpu);
2225 
2226         rq_pin_lock(src_rq, &srf);
2227         rq_pin_lock(dst_rq, &drf);
2228 
2229         deactivate_task(src_rq, p, 0);
2230         set_task_cpu(p, cpu);
2231         activate_task(dst_rq, p, 0);
2232         check_preempt_curr(dst_rq, p, 0);
2233 
2234         rq_unpin_lock(dst_rq, &drf);
2235         rq_unpin_lock(src_rq, &srf);
2236     } else {
2237         /*
2238          * Task isn't running anymore; make it appear like we migrated
2239          * it before it went to sleep. This means on wakeup we make the
2240          * previous CPU our target instead of where it really is.
2241          */
2242         p->wake_cpu = cpu;
2243     }
2244 }
2245 
2246 struct migration_swap_arg {
2247     struct task_struct *src_task, *dst_task;
2248     int src_cpu, dst_cpu;
2249 };
2250 
migrate_swap_stop(void * data)2251 static int migrate_swap_stop(void *data)
2252 {
2253     struct migration_swap_arg *arg = data;
2254     struct rq *src_rq, *dst_rq;
2255     int ret = -EAGAIN;
2256 
2257     if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu)) {
2258         return -EAGAIN;
2259     }
2260 
2261     src_rq = cpu_rq(arg->src_cpu);
2262     dst_rq = cpu_rq(arg->dst_cpu);
2263 
2264     double_raw_lock(&arg->src_task->pi_lock, &arg->dst_task->pi_lock);
2265     double_rq_lock(src_rq, dst_rq);
2266 
2267     if (task_cpu(arg->dst_task) != arg->dst_cpu) {
2268         goto unlock;
2269     }
2270 
2271     if (task_cpu(arg->src_task) != arg->src_cpu) {
2272         goto unlock;
2273     }
2274 
2275     if (!cpumask_test_cpu(arg->dst_cpu, arg->src_task->cpus_ptr)) {
2276         goto unlock;
2277     }
2278 
2279     if (!cpumask_test_cpu(arg->src_cpu, arg->dst_task->cpus_ptr)) {
2280         goto unlock;
2281     }
2282 
2283     __migrate_swap_task(arg->src_task, arg->dst_cpu);
2284     __migrate_swap_task(arg->dst_task, arg->src_cpu);
2285 
2286     ret = 0;
2287 
2288 unlock:
2289     double_rq_unlock(src_rq, dst_rq);
2290     raw_spin_unlock(&arg->dst_task->pi_lock);
2291     raw_spin_unlock(&arg->src_task->pi_lock);
2292 
2293     return ret;
2294 }
2295 
2296 /*
2297  * Cross migrate two tasks
2298  */
migrate_swap(struct task_struct * cur,struct task_struct * p,int target_cpu,int curr_cpu)2299 int migrate_swap(struct task_struct *cur, struct task_struct *p, int target_cpu,
2300                  int curr_cpu)
2301 {
2302     struct migration_swap_arg arg;
2303     int ret = -EINVAL;
2304 
2305     arg = (struct migration_swap_arg) {
2306         .src_task = cur,
2307         .src_cpu = curr_cpu,
2308         .dst_task = p,
2309         .dst_cpu = target_cpu,
2310     };
2311 
2312     if (arg.src_cpu == arg.dst_cpu) {
2313         goto out;
2314     }
2315 
2316     /*
2317      * These three tests are all lockless; this is OK since all of them
2318      * will be re-checked with proper locks held further down the line.
2319      */
2320     if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu)) {
2321         goto out;
2322     }
2323 
2324     if (!cpumask_test_cpu(arg.dst_cpu, arg.src_task->cpus_ptr)) {
2325         goto out;
2326     }
2327 
2328     if (!cpumask_test_cpu(arg.src_cpu, arg.dst_task->cpus_ptr)) {
2329         goto out;
2330     }
2331 
2332     trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
2333     ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
2334 
2335 out:
2336     return ret;
2337 }
2338 #endif /* CONFIG_NUMA_BALANCING */
2339 
2340 /*
2341  * wait_task_inactive - wait for a thread to unschedule.
2342  *
2343  * If @match_state is nonzero, it's the @p->state value just checked and
2344  * not expected to change.  If it changes, i.e. @p might have woken up,
2345  * then return zero.  When we succeed in waiting for @p to be off its CPU,
2346  * we return a positive number (its total switch count).  If a second call
2347  * a short while later returns the same number, the caller can be sure that
2348  * @p has remained unscheduled the whole time.
2349  *
2350  * The caller must ensure that the task *will* unschedule sometime soon,
2351  * else this function might spin for a *long* time. This function can't
2352  * be called with interrupts off, or it may introduce deadlock with
2353  * smp_call_function() if an IPI is sent by the same process we are
2354  * waiting to become inactive.
2355  */
wait_task_inactive(struct task_struct * p,long match_state)2356 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
2357 {
2358     int running, queued;
2359     struct rq_flags rf;
2360     unsigned long ncsw;
2361     struct rq *rq;
2362 
2363     for (;;) {
2364         /*
2365          * We do the initial early heuristics without holding
2366          * any task-queue locks at all. We'll only try to get
2367          * the runqueue lock when things look like they will
2368          * work out!
2369          */
2370         rq = task_rq(p);
2371 
2372         /*
2373          * If the task is actively running on another CPU
2374          * still, just relax and busy-wait without holding
2375          * any locks.
2376          *
2377          * NOTE! Since we don't hold any locks, it's not
2378          * even sure that "rq" stays as the right runqueue!
2379          * But we don't care, since "task_running()" will
2380          * return false if the runqueue has changed and p
2381          * is actually now running somewhere else!
2382          */
2383         while (task_running(rq, p)) {
2384             if (match_state && unlikely(p->state != match_state)) {
2385                 return 0;
2386             }
2387             cpu_relax();
2388         }
2389 
2390         /*
2391          * Ok, time to look more closely! We need the rq
2392          * lock now, to be *sure*. If we're wrong, we'll
2393          * just go back and repeat.
2394          */
2395         rq = task_rq_lock(p, &rf);
2396         trace_sched_wait_task(p);
2397         running = task_running(rq, p);
2398         queued = task_on_rq_queued(p);
2399         ncsw = 0;
2400         if (!match_state || p->state == match_state) {
2401             ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
2402         }
2403         task_rq_unlock(rq, p, &rf);
2404 
2405         /*
2406          * If it changed from the expected state, bail out now.
2407          */
2408         if (unlikely(!ncsw)) {
2409             break;
2410         }
2411 
2412         /*
2413          * Was it really running after all now that we
2414          * checked with the proper locks actually held?
2415          *
2416          * Oops. Go back and try again..
2417          */
2418         if (unlikely(running)) {
2419             cpu_relax();
2420             continue;
2421         }
2422 
2423         /*
2424          * It's not enough that it's not actively running,
2425          * it must be off the runqueue _entirely_, and not
2426          * preempted!
2427          *
2428          * So if it was still runnable (but just not actively
2429          * running right now), it's preempted, and we should
2430          * yield - it could be a while.
2431          */
2432         if (unlikely(queued)) {
2433             ktime_t to = NSEC_PER_SEC / HZ;
2434 
2435             set_current_state(TASK_UNINTERRUPTIBLE);
2436             schedule_hrtimeout(&to, HRTIMER_MODE_REL);
2437             continue;
2438         }
2439 
2440         /*
2441          * Ahh, all good. It wasn't running, and it wasn't
2442          * runnable, which means that it will never become
2443          * running in the future either. We're all done!
2444          */
2445         break;
2446     }
2447 
2448     return ncsw;
2449 }
2450 
2451 /***
2452  * kick_process - kick a running thread to enter/exit the kernel
2453  * @p: the to-be-kicked thread
2454  *
2455  * Cause a process which is running on another CPU to enter
2456  * kernel-mode, without any delay. (to get signals handled.)
2457  *
2458  * NOTE: this function doesn't have to take the runqueue lock,
2459  * because all it wants to ensure is that the remote task enters
2460  * the kernel. If the IPI races and the task has been migrated
2461  * to another CPU then no harm is done and the purpose has been
2462  * achieved as well.
2463  */
kick_process(struct task_struct * p)2464 void kick_process(struct task_struct *p)
2465 {
2466     int cpu;
2467 
2468     preempt_disable();
2469     cpu = task_cpu(p);
2470     if ((cpu != smp_processor_id()) && task_curr(p)) {
2471         smp_send_reschedule(cpu);
2472     }
2473     preempt_enable();
2474 }
2475 EXPORT_SYMBOL_GPL(kick_process);
2476 
2477 /*
2478  * ->cpus_ptr is protected by both rq->lock and p->pi_lock
2479  *
2480  * A few notes on cpu_active vs cpu_online:
2481  *
2482  *  - cpu_active must be a subset of cpu_online
2483  *
2484  *  - on CPU-up we allow per-CPU kthreads on the online && !active CPU,
2485  *    see __set_cpus_allowed_ptr(). At this point the newly online
2486  *    CPU isn't yet part of the sched domains, and balancing will not
2487  *    see it.
2488  *
2489  *  - on CPU-down we clear cpu_active() to mask the sched domains and
2490  *    avoid the load balancer to place new tasks on the to be removed
2491  *    CPU. Existing tasks will remain running there and will be taken
2492  *    off.
2493  *
2494  * This means that fallback selection must not select !active CPUs.
2495  * And can assume that any active CPU must be online. Conversely
2496  * select_task_rq() below may allow selection of !active CPUs in order
2497  * to satisfy the above rules.
2498  */
2499 #ifdef CONFIG_CPU_ISOLATION_OPT
select_fallback_rq(int cpu,struct task_struct * p,bool allow_iso)2500 static int select_fallback_rq(int cpu, struct task_struct *p, bool allow_iso)
2501 #else
2502 static int select_fallback_rq(int cpu, struct task_struct *p)
2503 #endif
2504 {
2505     int nid = cpu_to_node(cpu);
2506     const struct cpumask *nodemask = NULL;
2507     enum { cpuset, possible, fail, bug } state = cpuset;
2508     int dest_cpu;
2509 #ifdef CONFIG_CPU_ISOLATION_OPT
2510     int isolated_candidate = -1;
2511 #endif
2512 
2513     /*
2514      * If the node that the CPU is on has been offlined, cpu_to_node()
2515      * will return -1. There is no CPU on the node, and we should
2516      * select the CPU on the other node.
2517      */
2518     if (nid != -1) {
2519         nodemask = cpumask_of_node(nid);
2520 
2521         /* Look for allowed, online CPU in same node. */
2522         for_each_cpu(dest_cpu, nodemask)
2523         {
2524             if (!cpu_active(dest_cpu)) {
2525                 continue;
2526             }
2527             if (cpu_isolated(dest_cpu)) {
2528                 continue;
2529             }
2530             if (cpumask_test_cpu(dest_cpu, p->cpus_ptr)) {
2531                 return dest_cpu;
2532             }
2533         }
2534     }
2535 
2536     for (;;) {
2537         /* Any allowed, online CPU? */
2538         for_each_cpu(dest_cpu, p->cpus_ptr)
2539         {
2540             if (!is_cpu_allowed(p, dest_cpu)) {
2541                 continue;
2542             }
2543 #ifdef CONFIG_CPU_ISOLATION_OPT
2544             if (cpu_isolated(dest_cpu)) {
2545                 if (allow_iso) {
2546                     isolated_candidate = dest_cpu;
2547                 }
2548                 continue;
2549             }
2550             goto out;
2551         }
2552 
2553         if (isolated_candidate != -1) {
2554             dest_cpu = isolated_candidate;
2555 #endif
2556             goto out;
2557         }
2558 
2559         /* No more Mr. Nice Guy. */
2560         switch (state) {
2561             case cpuset:
2562                 if (IS_ENABLED(CONFIG_CPUSETS)) {
2563                     cpuset_cpus_allowed_fallback(p);
2564                     state = possible;
2565                     break;
2566                 }
2567                 fallthrough;
2568             case possible:
2569                 do_set_cpus_allowed(p, task_cpu_possible_mask(p));
2570                 state = fail;
2571                 break;
2572             case fail:
2573 #ifdef CONFIG_CPU_ISOLATION_OPT
2574                 allow_iso = true;
2575                 state = bug;
2576                 break;
2577 #else
2578 #endif
2579 
2580             case bug:
2581                 BUG();
2582                 break;
2583         }
2584     }
2585 
2586 out:
2587     if (state != cpuset) {
2588         /*
2589          * Don't tell them about moving exiting tasks or
2590          * kernel threads (both mm NULL), since they never
2591          * leave kernel.
2592          */
2593         if (p->mm && printk_ratelimit()) {
2594             printk_deferred("process %d (%s) no longer affine to cpu%d\n",
2595                             task_pid_nr(p), p->comm, cpu);
2596         }
2597     }
2598 
2599     return dest_cpu;
2600 }
2601 
2602 /*
2603  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_ptr is stable.
2604  */
select_task_rq(struct task_struct * p,int cpu,int sd_flags,int wake_flags)2605 static inline int select_task_rq(struct task_struct *p, int cpu, int sd_flags,
2606                                  int wake_flags)
2607 {
2608 #ifdef CONFIG_CPU_ISOLATION_OPT
2609     bool allow_isolated = (p->flags & PF_KTHREAD);
2610 #endif
2611 
2612     lockdep_assert_held(&p->pi_lock);
2613 
2614     if (p->nr_cpus_allowed > 1) {
2615         cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
2616     } else {
2617         cpu = cpumask_any(p->cpus_ptr);
2618     }
2619 
2620     /*
2621      * In order not to call set_task_cpu() on a blocking task we need
2622      * to rely on ttwu() to place the task on a valid ->cpus_ptr
2623      * CPU.
2624      *
2625      * Since this is common to all placement strategies, this lives here.
2626      *
2627      * [ this allows ->select_task() to simply return task_cpu(p) and
2628      *   not worry about this generic constraint ]
2629      */
2630 #ifdef CONFIG_CPU_ISOLATION_OPT
2631     if (unlikely(!is_cpu_allowed(p, cpu)) ||
2632         (cpu_isolated(cpu) && !allow_isolated)) {
2633         cpu = select_fallback_rq(task_cpu(p), p, allow_isolated);
2634     }
2635 #else
2636     if (unlikely(!is_cpu_allowed(p, cpu))) {
2637         cpu = select_fallback_rq(task_cpu(p), p);
2638     }
2639 #endif
2640 
2641     return cpu;
2642 }
2643 
sched_set_stop_task(int cpu,struct task_struct * stop)2644 void sched_set_stop_task(int cpu, struct task_struct *stop)
2645 {
2646     struct sched_param param = {.sched_priority = MAX_RT_PRIO - 1};
2647     struct task_struct *old_stop = cpu_rq(cpu)->stop;
2648 
2649     if (stop) {
2650         /*
2651          * Make it appear like a SCHED_FIFO task, its something
2652          * userspace knows about and won't get confused about.
2653          *
2654          * Also, it will make PI more or less work without too
2655          * much confusion -- but then, stop work should not
2656          * rely on PI working anyway.
2657          */
2658         sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
2659 
2660         stop->sched_class = &stop_sched_class;
2661     }
2662 
2663     cpu_rq(cpu)->stop = stop;
2664 
2665     if (old_stop) {
2666         /*
2667          * Reset it back to a normal scheduling class so that
2668          * it can die in pieces.
2669          */
2670         old_stop->sched_class = &rt_sched_class;
2671     }
2672 }
2673 
2674 #else
2675 
__set_cpus_allowed_ptr(struct task_struct * p,const struct cpumask * new_mask,bool check)2676 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
2677                                          const struct cpumask *new_mask,
2678                                          bool check)
2679 {
2680     return set_cpus_allowed_ptr(p, new_mask);
2681 }
2682 
2683 #endif /* CONFIG_SMP */
2684 
ttwu_stat(struct task_struct * p,int cpu,int wake_flags)2685 static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
2686 {
2687     struct rq *rq;
2688 
2689     if (!schedstat_enabled()) {
2690         return;
2691     }
2692 
2693     rq = this_rq();
2694 #ifdef CONFIG_SMP
2695     if (cpu == rq->cpu) {
2696         __schedstat_inc(rq->ttwu_local);
2697         __schedstat_inc(p->se.statistics.nr_wakeups_local);
2698     } else {
2699         struct sched_domain *sd;
2700 
2701         __schedstat_inc(p->se.statistics.nr_wakeups_remote);
2702         rcu_read_lock();
2703         for_each_domain(rq->cpu, sd)
2704         {
2705             if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
2706                 __schedstat_inc(sd->ttwu_wake_remote);
2707                 break;
2708             }
2709         }
2710         rcu_read_unlock();
2711     }
2712 
2713     if (wake_flags & WF_MIGRATED) {
2714         __schedstat_inc(p->se.statistics.nr_wakeups_migrate);
2715     }
2716 #endif /* CONFIG_SMP */
2717 
2718     __schedstat_inc(rq->ttwu_count);
2719     __schedstat_inc(p->se.statistics.nr_wakeups);
2720 
2721     if (wake_flags & WF_SYNC) {
2722         __schedstat_inc(p->se.statistics.nr_wakeups_sync);
2723     }
2724 }
2725 
2726 /*
2727  * Mark the task runnable and perform wakeup-preemption.
2728  */
ttwu_do_wakeup(struct rq * rq,struct task_struct * p,int wake_flags,struct rq_flags * rf)2729 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
2730                            struct rq_flags *rf)
2731 {
2732     check_preempt_curr(rq, p, wake_flags);
2733     p->state = TASK_RUNNING;
2734     trace_sched_wakeup(p);
2735 
2736 #ifdef CONFIG_SMP
2737     if (p->sched_class->task_woken) {
2738         /*
2739          * Our task @p is fully woken up and running; so its safe to
2740          * drop the rq->lock, hereafter rq is only used for statistics.
2741          */
2742         rq_unpin_lock(rq, rf);
2743         p->sched_class->task_woken(rq, p);
2744         rq_repin_lock(rq, rf);
2745     }
2746 
2747     if (rq->idle_stamp) {
2748         u64 delta = rq_clock(rq) - rq->idle_stamp;
2749         u64 max = 2 * rq->max_idle_balance_cost;
2750 
2751         update_avg(&rq->avg_idle, delta);
2752 
2753         if (rq->avg_idle > max) {
2754             rq->avg_idle = max;
2755         }
2756 
2757         rq->idle_stamp = 0;
2758     }
2759 #endif
2760 }
2761 
ttwu_do_activate(struct rq * rq,struct task_struct * p,int wake_flags,struct rq_flags * rf)2762 static void ttwu_do_activate(struct rq *rq, struct task_struct *p,
2763                              int wake_flags, struct rq_flags *rf)
2764 {
2765     int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
2766 
2767     if (wake_flags & WF_SYNC) {
2768         en_flags |= ENQUEUE_WAKEUP_SYNC;
2769     }
2770 
2771     lockdep_assert_held(&rq->lock);
2772 
2773     if (p->sched_contributes_to_load) {
2774         rq->nr_uninterruptible--;
2775     }
2776 
2777 #ifdef CONFIG_SMP
2778     if (wake_flags & WF_MIGRATED) {
2779         en_flags |= ENQUEUE_MIGRATED;
2780     } else
2781 #endif
2782         if (p->in_iowait) {
2783         delayacct_blkio_end(p);
2784         atomic_dec(&task_rq(p)->nr_iowait);
2785     }
2786 
2787     activate_task(rq, p, en_flags);
2788     ttwu_do_wakeup(rq, p, wake_flags, rf);
2789 }
2790 
2791 /*
2792  * Consider @p being inside a wait loop:
2793  *
2794  *   for (;;) {
2795  *      set_current_state(TASK_UNINTERRUPTIBLE);
2796  *
2797  *      if (CONDITION)
2798  *         break;
2799  *
2800  *      schedule();
2801  *   }
2802  *   __set_current_state(TASK_RUNNING);
2803  *
2804  * between set_current_state() and schedule(). In this case @p is still
2805  * runnable, so all that needs doing is change p->state back to TASK_RUNNING in
2806  * an atomic manner.
2807  *
2808  * By taking task_rq(p)->lock we serialize against schedule(), if @p->on_rq
2809  * then schedule() must still happen and p->state can be changed to
2810  * TASK_RUNNING. Otherwise we lost the race, schedule() has happened, and we
2811  * need to do a full wakeup with enqueue.
2812  *
2813  * Returns: %true when the wakeup is done,
2814  *          %false otherwise.
2815  */
ttwu_runnable(struct task_struct * p,int wake_flags)2816 static int ttwu_runnable(struct task_struct *p, int wake_flags)
2817 {
2818     struct rq_flags rf;
2819     struct rq *rq;
2820     int ret = 0;
2821 
2822     rq = __task_rq_lock(p, &rf);
2823     if (task_on_rq_queued(p)) {
2824         /* check_preempt_curr() may use rq clock */
2825         update_rq_clock(rq);
2826         ttwu_do_wakeup(rq, p, wake_flags, &rf);
2827         ret = 1;
2828     }
2829     __task_rq_unlock(rq, &rf);
2830 
2831     return ret;
2832 }
2833 
2834 #ifdef CONFIG_SMP
sched_ttwu_pending(void * arg)2835 void sched_ttwu_pending(void *arg)
2836 {
2837     struct llist_node *llist = arg;
2838     struct rq *rq = this_rq();
2839     struct task_struct *p, *t;
2840     struct rq_flags rf;
2841 
2842     if (!llist) {
2843         return;
2844     }
2845 
2846     /*
2847      * rq::ttwu_pending racy indication of out-standing wakeups.
2848      * Races such that false-negatives are possible, since they
2849      * are shorter lived that false-positives would be.
2850      */
2851     WRITE_ONCE(rq->ttwu_pending, 0);
2852 
2853     rq_lock_irqsave(rq, &rf);
2854     update_rq_clock(rq);
2855 
2856     llist_for_each_entry_safe(p, t, llist, wake_entry.llist)
2857     {
2858         if (WARN_ON_ONCE(p->on_cpu)) {
2859             smp_cond_load_acquire(&p->on_cpu, !VAL);
2860         }
2861 
2862         if (WARN_ON_ONCE(task_cpu(p) != cpu_of(rq))) {
2863             set_task_cpu(p, cpu_of(rq));
2864         }
2865 
2866         ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
2867     }
2868 
2869     rq_unlock_irqrestore(rq, &rf);
2870 }
2871 
send_call_function_single_ipi(int cpu)2872 void send_call_function_single_ipi(int cpu)
2873 {
2874     struct rq *rq = cpu_rq(cpu);
2875 
2876     if (!set_nr_if_polling(rq->idle)) {
2877         arch_send_call_function_single_ipi(cpu);
2878     } else {
2879         trace_sched_wake_idle_without_ipi(cpu);
2880     }
2881 }
2882 
2883 /*
2884  * Queue a task on the target CPUs wake_list and wake the CPU via IPI if
2885  * necessary. The wakee CPU on receipt of the IPI will queue the task
2886  * via sched_ttwu_wakeup() for activation so the wakee incurs the cost
2887  * of the wakeup instead of the waker.
2888  */
__ttwu_queue_wakelist(struct task_struct * p,int cpu,int wake_flags)2889 static void __ttwu_queue_wakelist(struct task_struct *p, int cpu,
2890                                   int wake_flags)
2891 {
2892     struct rq *rq = cpu_rq(cpu);
2893 
2894     p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
2895 
2896     WRITE_ONCE(rq->ttwu_pending, 1);
2897     __smp_call_single_queue(cpu, &p->wake_entry.llist);
2898 }
2899 
wake_up_if_idle(int cpu)2900 void wake_up_if_idle(int cpu)
2901 {
2902     struct rq *rq = cpu_rq(cpu);
2903     struct rq_flags rf;
2904 
2905     rcu_read_lock();
2906 
2907     if (!is_idle_task(rcu_dereference(rq->curr))) {
2908         goto out;
2909     }
2910 
2911     if (set_nr_if_polling(rq->idle)) {
2912         trace_sched_wake_idle_without_ipi(cpu);
2913     } else {
2914         rq_lock_irqsave(rq, &rf);
2915         if (is_idle_task(rq->curr)) {
2916             smp_send_reschedule(cpu);
2917         }
2918         /* Else CPU is not idle, do nothing here: */
2919         rq_unlock_irqrestore(rq, &rf);
2920     }
2921 
2922 out:
2923     rcu_read_unlock();
2924 }
2925 
cpus_share_cache(int this_cpu,int that_cpu)2926 bool cpus_share_cache(int this_cpu, int that_cpu)
2927 {
2928     if (this_cpu == that_cpu) {
2929         return true;
2930     }
2931 
2932     return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
2933 }
2934 
ttwu_queue_cond(int cpu,int wake_flags)2935 static inline bool ttwu_queue_cond(int cpu, int wake_flags)
2936 {
2937     /*
2938      * If the CPU does not share cache, then queue the task on the
2939      * remote rqs wakelist to avoid accessing remote data.
2940      */
2941     if (!cpus_share_cache(smp_processor_id(), cpu)) {
2942         return true;
2943     }
2944 
2945     /*
2946      * If the task is descheduling and the only running task on the
2947      * CPU then use the wakelist to offload the task activation to
2948      * the soon-to-be-idle CPU as the current CPU is likely busy.
2949      * nr_running is checked to avoid unnecessary task stacking.
2950      */
2951     if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) {
2952         return true;
2953     }
2954 
2955     return false;
2956 }
2957 
ttwu_queue_wakelist(struct task_struct * p,int cpu,int wake_flags)2958 static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
2959 {
2960     if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
2961         if (WARN_ON_ONCE(cpu == smp_processor_id())) {
2962             return false;
2963         }
2964 
2965         sched_clock_cpu(cpu); /* Sync clocks across CPUs */
2966         __ttwu_queue_wakelist(p, cpu, wake_flags);
2967         return true;
2968     }
2969 
2970     return false;
2971 }
2972 
2973 #else /* !CONFIG_SMP */
2974 
ttwu_queue_wakelist(struct task_struct * p,int cpu,int wake_flags)2975 static inline bool ttwu_queue_wakelist(struct task_struct *p, int cpu,
2976                                        int wake_flags)
2977 {
2978     return false;
2979 }
2980 
2981 #endif /* CONFIG_SMP */
2982 
ttwu_queue(struct task_struct * p,int cpu,int wake_flags)2983 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
2984 {
2985     struct rq *rq = cpu_rq(cpu);
2986     struct rq_flags rf;
2987 
2988     if (ttwu_queue_wakelist(p, cpu, wake_flags)) {
2989         return;
2990     }
2991 
2992     rq_lock(rq, &rf);
2993     update_rq_clock(rq);
2994     ttwu_do_activate(rq, p, wake_flags, &rf);
2995     rq_unlock(rq, &rf);
2996 }
2997 
2998 /*
2999  * Notes on Program-Order guarantees on SMP systems.
3000  *
3001  *  MIGRATION
3002  *
3003  * The basic program-order guarantee on SMP systems is that when a task [t]
3004  * migrates, all its activity on its old CPU [c0] happens-before any subsequent
3005  * execution on its new CPU [c1].
3006  *
3007  * For migration (of runnable tasks) this is provided by the following means:
3008  *
3009  *  A) UNLOCK of the rq(c0)->lock scheduling out task t
3010  *  B) migration for t is required to synchronize *both* rq(c0)->lock and
3011  *     rq(c1)->lock (if not at the same time, then in that order).
3012  *  C) LOCK of the rq(c1)->lock scheduling in task
3013  *
3014  * Release/acquire chaining guarantees that B happens after A and C after B.
3015  * Note: the CPU doing B need not be c0 or c1
3016  *
3017  * Example:
3018  *
3019  *   CPU0            CPU1            CPU2
3020  *
3021  *   LOCK rq(0)->lock
3022  *   sched-out X
3023  *   sched-in Y
3024  *   UNLOCK rq(0)->lock
3025  *
3026  *                                   LOCK rq(0)->lock // orders against CPU0
3027  *                                   dequeue X
3028  *                                   UNLOCK rq(0)->lock
3029  *
3030  *                                   LOCK rq(1)->lock
3031  *                                   enqueue X
3032  *                                   UNLOCK rq(1)->lock
3033  *
3034  *                   LOCK rq(1)->lock // orders against CPU2
3035  *                   sched-out Z
3036  *                   sched-in X
3037  *                   UNLOCK rq(1)->lock
3038  *
3039  *
3040  *  BLOCKING -- aka. SLEEP + WAKEUP
3041  *
3042  * For blocking we (obviously) need to provide the same guarantee as for
3043  * migration. However the means are completely different as there is no lock
3044  * chain to provide order. Instead we do:
3045  *
3046  *   1) smp_store_release(X->on_cpu, 0)   -- finish_task()
3047  *   2) smp_cond_load_acquire(!X->on_cpu) -- try_to_wake_up()
3048  *
3049  * Example:
3050  *
3051  *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
3052  *
3053  *   LOCK rq(0)->lock LOCK X->pi_lock
3054  *   dequeue X
3055  *   sched-out X
3056  *   smp_store_release(X->on_cpu, 0);
3057  *
3058  *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
3059  *                    X->state = WAKING
3060  *                    set_task_cpu(X,2)
3061  *
3062  *                    LOCK rq(2)->lock
3063  *                    enqueue X
3064  *                    X->state = RUNNING
3065  *                    UNLOCK rq(2)->lock
3066  *
3067  *                                          LOCK rq(2)->lock // orders against
3068  * CPU1 sched-out Z sched-in X UNLOCK rq(2)->lock
3069  *
3070  *                    UNLOCK X->pi_lock
3071  *   UNLOCK rq(0)->lock
3072  *
3073  *
3074  * However, for wakeups there is a second guarantee we must provide, namely we
3075  * must ensure that CONDITION=1 done by the caller can not be reordered with
3076  * accesses to the task state; see try_to_wake_up() and set_current_state().
3077  */
3078 
3079 #ifdef CONFIG_SMP
3080 #ifdef CONFIG_SCHED_WALT
3081 /* utility function to update walt signals at wakeup */
walt_try_to_wake_up(struct task_struct * p)3082 static inline void walt_try_to_wake_up(struct task_struct *p)
3083 {
3084     struct rq *rq = cpu_rq(task_cpu(p));
3085     struct rq_flags rf;
3086     u64 wallclock;
3087 
3088     rq_lock_irqsave(rq, &rf);
3089     wallclock = sched_ktime_clock();
3090     update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
3091     update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
3092     rq_unlock_irqrestore(rq, &rf);
3093 }
3094 #else
3095 #define walt_try_to_wake_up(a)                                                 \
3096     {                                                                          \
3097     }
3098 #endif
3099 #endif
3100 
3101 /**
3102  * try_to_wake_up - wake up a thread
3103  * @p: the thread to be awakened
3104  * @state: the mask of task states that can be woken
3105  * @wake_flags: wake modifier flags (WF_*)
3106  *
3107  * Conceptually does
3108  *
3109  *   If (@state & @p->state) @p->state = TASK_RUNNING.
3110  *
3111  * If the task was not queued/runnable, also place it back on a runqueue.
3112  *
3113  * This function is atomic against schedule() which would dequeue the task.
3114  *
3115  * It issues a full memory barrier before accessing @p->state, see the comment
3116  * with set_current_state().
3117  *
3118  * Uses p->pi_lock to serialize against concurrent wake-ups.
3119  *
3120  * Relies on p->pi_lock stabilizing:
3121  *  - p->sched_class
3122  *  - p->cpus_ptr
3123  *  - p->sched_task_group
3124  * in order to do migration, see its use of select_task_rq()/set_task_cpu().
3125  *
3126  * Tries really hard to only take one task_rq(p)->lock for performance.
3127  * Takes rq->lock in:
3128  *  - ttwu_runnable()    -- old rq, unavoidable, see comment there;
3129  *  - ttwu_queue()       -- new rq, for enqueue of the task;
3130  *  - psi_ttwu_dequeue() -- much sadness :-( accounting will kill us.
3131  *
3132  * As a consequence we race really badly with just about everything. See the
3133  * many memory barriers and their comments for details.
3134  *
3135  * Return: %true if @p->state changes (an actual wakeup was done),
3136  *       %false otherwise.
3137  */
try_to_wake_up(struct task_struct * p,unsigned int state,int wake_flags)3138 static int try_to_wake_up(struct task_struct *p, unsigned int state,
3139                           int wake_flags)
3140 {
3141     unsigned long flags;
3142     int cpu, success = 0;
3143 
3144     preempt_disable();
3145     if (p == current) {
3146         /*
3147          * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
3148          * == smp_processor_id()'. Together this means we can special
3149          * case the whole 'p->on_rq && ttwu_runnable()' case below
3150          * without taking any locks.
3151          *
3152          * In particular:
3153          *  - we rely on Program-Order guarantees for all the ordering,
3154          *  - we're serialized against set_special_state() by virtue of
3155          *    it disabling IRQs (this allows not taking ->pi_lock).
3156          */
3157         if (!(p->state & state)) {
3158             goto out;
3159         }
3160 
3161         success = 1;
3162         trace_sched_waking(p);
3163         p->state = TASK_RUNNING;
3164         trace_sched_wakeup(p);
3165         goto out;
3166     }
3167 
3168     /*
3169      * If we are going to wake up a thread waiting for CONDITION we
3170      * need to ensure that CONDITION=1 done by the caller can not be
3171      * reordered with p->state check below. This pairs with smp_store_mb()
3172      * in set_current_state() that the waiting thread does.
3173      */
3174     raw_spin_lock_irqsave(&p->pi_lock, flags);
3175     smp_mb__after_spinlock();
3176     if (!(p->state & state)) {
3177         goto unlock;
3178     }
3179 
3180 #ifdef CONFIG_FREEZER
3181     /*
3182      * If we're going to wake up a thread which may be frozen, then
3183      * we can only do so if we have an active CPU which is capable of
3184      * running it. This may not be the case when resuming from suspend,
3185      * as the secondary CPUs may not yet be back online. See __thaw_task()
3186      * for the actual wakeup.
3187      */
3188     if (unlikely(frozen_or_skipped(p)) &&
3189         !cpumask_intersects(cpu_active_mask, task_cpu_possible_mask(p))) {
3190         goto unlock;
3191     }
3192 #endif
3193 
3194     trace_sched_waking(p);
3195 
3196     /* We're going to change ->state: */
3197     success = 1;
3198 
3199     /*
3200      * Ensure we load p->on_rq _after_ p->state, otherwise it would
3201      * be possible to, falsely, observe p->on_rq == 0 and get stuck
3202      * in smp_cond_load_acquire() below.
3203      *
3204      * sched_ttwu_pending()            try_to_wake_up()
3205      *   STORE p->on_rq = 1              LOAD p->state
3206      *   UNLOCK rq->lock
3207      *
3208      * __schedule() (switch to task 'p')
3209      *   LOCK rq->lock              smp_rmb();
3210      *   smp_mb__after_spinlock();
3211      *   UNLOCK rq->lock
3212      *
3213      * [task p]
3214      *   STORE p->state = UNINTERRUPTIBLE      LOAD p->on_rq
3215      *
3216      * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3217      * __schedule().  See the comment for smp_mb__after_spinlock().
3218      *
3219      * A similar smb_rmb() lives in try_invoke_on_locked_down_task().
3220      */
3221     smp_rmb();
3222     if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) {
3223         goto unlock;
3224     }
3225 
3226 #ifdef CONFIG_SMP
3227     /*
3228      * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
3229      * possible to, falsely, observe p->on_cpu == 0.
3230      *
3231      * One must be running (->on_cpu == 1) in order to remove oneself
3232      * from the runqueue.
3233      *
3234      * __schedule() (switch to task 'p')    try_to_wake_up()
3235      *   STORE p->on_cpu = 1          LOAD p->on_rq
3236      *   UNLOCK rq->lock
3237      *
3238      * __schedule() (put 'p' to sleep)
3239      *   LOCK rq->lock              smp_rmb();
3240      *   smp_mb__after_spinlock();
3241      *   STORE p->on_rq = 0              LOAD p->on_cpu
3242      *
3243      * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in
3244      * __schedule().  See the comment for smp_mb__after_spinlock().
3245      *
3246      * Form a control-dep-acquire with p->on_rq == 0 above, to ensure
3247      * schedule()'s deactivate_task() has 'happened' and p will no longer
3248      * care about it's own p->state. See the comment in __schedule().
3249      */
3250     smp_acquire__after_ctrl_dep();
3251 
3252     walt_try_to_wake_up(p);
3253 
3254     /*
3255      * We're doing the wakeup (@success == 1), they did a dequeue (p->on_rq
3256      * == 0), which means we need to do an enqueue, change p->state to
3257      * TASK_WAKING such that we can unlock p->pi_lock before doing the
3258      * enqueue, such as ttwu_queue_wakelist().
3259      */
3260     p->state = TASK_WAKING;
3261 
3262     /*
3263      * If the owning (remote) CPU is still in the middle of schedule() with
3264      * this task as prev, considering queueing p on the remote CPUs wake_list
3265      * which potentially sends an IPI instead of spinning on p->on_cpu to
3266      * let the waker make forward progress. This is safe because IRQs are
3267      * disabled and the IPI will deliver after on_cpu is cleared.
3268      *
3269      * Ensure we load task_cpu(p) after p->on_cpu:
3270      *
3271      * set_task_cpu(p, cpu);
3272      *   STORE p->cpu = @cpu
3273      * __schedule() (switch to task 'p')
3274      *   LOCK rq->lock
3275      *   smp_mb__after_spin_lock()        smp_cond_load_acquire(&p->on_cpu)
3276      *   STORE p->on_cpu = 1        LOAD p->cpu
3277      *
3278      * to ensure we observe the correct CPU on which the task is currently
3279      * scheduling.
3280      */
3281     if (smp_load_acquire(&p->on_cpu) &&
3282         ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) {
3283         goto unlock;
3284     }
3285 
3286     /*
3287      * If the owning (remote) CPU is still in the middle of schedule() with
3288      * this task as prev, wait until its done referencing the task.
3289      *
3290      * Pairs with the smp_store_release() in finish_task().
3291      *
3292      * This ensures that tasks getting woken will be fully ordered against
3293      * their previous state and preserve Program Order.
3294      */
3295     smp_cond_load_acquire(&p->on_cpu, !VAL);
3296 
3297     cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
3298     if (task_cpu(p) != cpu) {
3299         if (p->in_iowait) {
3300             delayacct_blkio_end(p);
3301             atomic_dec(&task_rq(p)->nr_iowait);
3302         }
3303 
3304         wake_flags |= WF_MIGRATED;
3305         psi_ttwu_dequeue(p);
3306         set_task_cpu(p, cpu);
3307     }
3308 #else
3309     cpu = task_cpu(p);
3310 #endif /* CONFIG_SMP */
3311 
3312     ttwu_queue(p, cpu, wake_flags);
3313 unlock:
3314     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3315 out:
3316     if (success) {
3317         ttwu_stat(p, task_cpu(p), wake_flags);
3318     }
3319     preempt_enable();
3320 
3321     return success;
3322 }
3323 
3324 /**
3325  * try_invoke_on_locked_down_task - Invoke a function on task in fixed state
3326  * @p: Process for which the function is to be invoked, can be @current.
3327  * @func: Function to invoke.
3328  * @arg: Argument to function.
3329  *
3330  * If the specified task can be quickly locked into a definite state
3331  * (either sleeping or on a given runqueue), arrange to keep it in that
3332  * state while invoking @func(@arg).  This function can use ->on_rq and
3333  * task_curr() to work out what the state is, if required.  Given that
3334  * @func can be invoked with a runqueue lock held, it had better be quite
3335  * lightweight.
3336  *
3337  * Returns:
3338  *    @false if the task slipped out from under the locks.
3339  *    @true if the task was locked onto a runqueue or is sleeping.
3340  *        However, @func can override this by returning @false.
3341  */
try_invoke_on_locked_down_task(struct task_struct * p,bool (* func)(struct task_struct * t,void * arg),void * arg)3342 bool try_invoke_on_locked_down_task(struct task_struct *p,
3343                                     bool (*func)(struct task_struct *t,
3344                                                  void *arg),
3345                                     void *arg)
3346 {
3347     struct rq_flags rf;
3348     bool ret = false;
3349     struct rq *rq;
3350 
3351     raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3352     if (p->on_rq) {
3353         rq = __task_rq_lock(p, &rf);
3354         if (task_rq(p) == rq) {
3355             ret = func(p, arg);
3356         }
3357         rq_unlock(rq, &rf);
3358     } else {
3359         switch (p->state) {
3360             case TASK_RUNNING:
3361             case TASK_WAKING:
3362                 break;
3363             default:
3364                 smp_rmb(); // See smp_rmb() comment in try_to_wake_up().
3365                 if (!p->on_rq) {
3366                     ret = func(p, arg);
3367                 }
3368         }
3369     }
3370     raw_spin_unlock_irqrestore(&p->pi_lock, rf.flags);
3371     return ret;
3372 }
3373 
3374 /**
3375  * wake_up_process - Wake up a specific process
3376  * @p: The process to be woken up.
3377  *
3378  * Attempt to wake up the nominated process and move it to the set of runnable
3379  * processes.
3380  *
3381  * Return: 1 if the process was woken up, 0 if it was already running.
3382  *
3383  * This function executes a full memory barrier before accessing the task state.
3384  */
wake_up_process(struct task_struct * p)3385 int wake_up_process(struct task_struct *p)
3386 {
3387     return try_to_wake_up(p, TASK_NORMAL, 0);
3388 }
3389 EXPORT_SYMBOL(wake_up_process);
3390 
wake_up_state(struct task_struct * p,unsigned int state)3391 int wake_up_state(struct task_struct *p, unsigned int state)
3392 {
3393     return try_to_wake_up(p, state, 0);
3394 }
3395 
3396 /*
3397  * Perform scheduler related setup for a newly forked process p.
3398  * p is forked by current.
3399  *
3400  * __sched_fork() is basic setup used by init_idle() too:
3401  */
__sched_fork(unsigned long clone_flags,struct task_struct * p)3402 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
3403 {
3404     p->on_rq = 0;
3405 
3406     p->se.on_rq = 0;
3407     p->se.exec_start = 0;
3408     p->se.sum_exec_runtime = 0;
3409     p->se.prev_sum_exec_runtime = 0;
3410     p->se.nr_migrations = 0;
3411     p->se.vruntime = 0;
3412     INIT_LIST_HEAD(&p->se.group_node);
3413 
3414 #ifdef CONFIG_FAIR_GROUP_SCHED
3415     p->se.cfs_rq = NULL;
3416 #endif
3417 
3418 #ifdef CONFIG_SCHEDSTATS
3419     /* Even if schedstat is disabled, there should not be garbage */
3420     memset(&p->se.statistics, 0, sizeof(p->se.statistics));
3421 #endif
3422 
3423     RB_CLEAR_NODE(&p->dl.rb_node);
3424     init_dl_task_timer(&p->dl);
3425     init_dl_inactive_task_timer(&p->dl);
3426     __dl_clear_params(p);
3427 
3428     INIT_LIST_HEAD(&p->rt.run_list);
3429     p->rt.timeout = 0;
3430     p->rt.time_slice = sched_rr_timeslice;
3431     p->rt.on_rq = 0;
3432     p->rt.on_list = 0;
3433 
3434 #ifdef CONFIG_PREEMPT_NOTIFIERS
3435     INIT_HLIST_HEAD(&p->preempt_notifiers);
3436 #endif
3437 
3438 #ifdef CONFIG_COMPACTION
3439     p->capture_control = NULL;
3440 #endif
3441     init_numa_balancing(clone_flags, p);
3442 #ifdef CONFIG_SMP
3443     p->wake_entry.u_flags = CSD_TYPE_TTWU;
3444 #endif
3445 #ifdef CONFIG_SCHED_RTG
3446     p->rtg_depth = 0;
3447 #endif
3448 }
3449 
3450 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
3451 
3452 #ifdef CONFIG_NUMA_BALANCING
3453 
set_numabalancing_state(bool enabled)3454 void set_numabalancing_state(bool enabled)
3455 {
3456     if (enabled) {
3457         static_branch_enable(&sched_numa_balancing);
3458     } else {
3459         static_branch_disable(&sched_numa_balancing);
3460     }
3461 }
3462 
3463 #ifdef CONFIG_PROC_SYSCTL
sysctl_numa_balancing(struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)3464 int sysctl_numa_balancing(struct ctl_table *table, int write, void *buffer,
3465                           size_t *lenp, loff_t *ppos)
3466 {
3467     struct ctl_table t;
3468     int err;
3469     int state = static_branch_likely(&sched_numa_balancing);
3470 
3471     if (write && !capable(CAP_SYS_ADMIN)) {
3472         return -EPERM;
3473     }
3474 
3475     t = *table;
3476     t.data = &state;
3477     err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3478     if (err < 0) {
3479         return err;
3480     }
3481     if (write) {
3482         set_numabalancing_state(state);
3483     }
3484     return err;
3485 }
3486 #endif
3487 #endif
3488 
3489 #ifdef CONFIG_SCHEDSTATS
3490 
3491 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
3492 static bool __initdata __sched_schedstats = false;
3493 
set_schedstats(bool enabled)3494 static void set_schedstats(bool enabled)
3495 {
3496     if (enabled) {
3497         static_branch_enable(&sched_schedstats);
3498     } else {
3499         static_branch_disable(&sched_schedstats);
3500     }
3501 }
3502 
force_schedstat_enabled(void)3503 void force_schedstat_enabled(void)
3504 {
3505     if (!schedstat_enabled()) {
3506         pr_info("kernel profiling enabled schedstats, disable via "
3507                 "kernel.sched_schedstats.\n");
3508         static_branch_enable(&sched_schedstats);
3509     }
3510 }
3511 
setup_schedstats(char * str)3512 static int __init setup_schedstats(char *str)
3513 {
3514     int ret = 0;
3515     if (!str) {
3516         goto out;
3517     }
3518 
3519     /*
3520      * This code is called before jump labels have been set up, so we can't
3521      * change the static branch directly just yet.  Instead set a temporary
3522      * variable so init_schedstats() can do it later.
3523      */
3524     if (!strcmp(str, "enable")) {
3525         __sched_schedstats = true;
3526         ret = 1;
3527     } else if (!strcmp(str, "disable")) {
3528         __sched_schedstats = false;
3529         ret = 1;
3530     }
3531 out:
3532     if (!ret) {
3533         pr_warn("Unable to parse schedstats=\n");
3534     }
3535 
3536     return ret;
3537 }
3538 __setup("schedstats=", setup_schedstats);
3539 
init_schedstats(void)3540 static void __init init_schedstats(void)
3541 {
3542     set_schedstats(__sched_schedstats);
3543 }
3544 
3545 #ifdef CONFIG_PROC_SYSCTL
sysctl_schedstats(struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)3546 int sysctl_schedstats(struct ctl_table *table, int write, void *buffer,
3547                       size_t *lenp, loff_t *ppos)
3548 {
3549     struct ctl_table t;
3550     int err;
3551     int state = static_branch_likely(&sched_schedstats);
3552 
3553     if (write && !capable(CAP_SYS_ADMIN)) {
3554         return -EPERM;
3555     }
3556 
3557     t = *table;
3558     t.data = &state;
3559     err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
3560     if (err < 0) {
3561         return err;
3562     }
3563     if (write) {
3564         set_schedstats(state);
3565     }
3566     return err;
3567 }
3568 #endif /* CONFIG_PROC_SYSCTL */
3569 #else  /* !CONFIG_SCHEDSTATS */
init_schedstats(void)3570 static inline void init_schedstats(void)
3571 {
3572 }
3573 #endif /* CONFIG_SCHEDSTATS */
3574 
3575 /*
3576  * fork()/clone()-time setup
3577  */
sched_fork(unsigned long clone_flags,struct task_struct * p)3578 int sched_fork(unsigned long clone_flags, struct task_struct *p)
3579 {
3580     init_new_task_load(p);
3581     __sched_fork(clone_flags, p);
3582     /*
3583      * We mark the process as NEW here. This guarantees that
3584      * nobody will actually run it, and a signal or other external
3585      * event cannot wake it up and insert it on the runqueue either.
3586      */
3587     p->state = TASK_NEW;
3588 
3589     /*
3590      * Make sure we do not leak PI boosting priority to the child.
3591      */
3592     p->prio = current->normal_prio;
3593 
3594 #ifdef CONFIG_SCHED_LATENCY_NICE
3595     /* Propagate the parent's latency requirements to the child as well */
3596     p->latency_prio = current->latency_prio;
3597 #endif
3598 
3599     uclamp_fork(p);
3600 
3601     /*
3602      * Revert to default priority/policy on fork if requested.
3603      */
3604     if (unlikely(p->sched_reset_on_fork)) {
3605         if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3606             p->policy = SCHED_NORMAL;
3607 #ifdef CONFIG_SCHED_RTG
3608             if (current->rtg_depth != 0) {
3609                 p->static_prio = current->static_prio;
3610             } else {
3611                 p->static_prio = NICE_TO_PRIO(0);
3612             }
3613 #else
3614             p->static_prio = NICE_TO_PRIO(0);
3615 #endif
3616             p->rt_priority = 0;
3617         } else if (PRIO_TO_NICE(p->static_prio) < 0) {
3618             p->static_prio = NICE_TO_PRIO(0);
3619         }
3620 
3621         p->prio = p->normal_prio = p->static_prio;
3622         set_load_weight(p);
3623 
3624 #ifdef CONFIG_SCHED_LATENCY_NICE
3625         p->latency_prio = NICE_TO_LATENCY(0);
3626         set_latency_weight(p);
3627 #endif
3628 
3629         /*
3630          * We don't need the reset flag anymore after the fork. It has
3631          * fulfilled its duty:
3632          */
3633         p->sched_reset_on_fork = 0;
3634     }
3635 
3636     if (dl_prio(p->prio)) {
3637         return -EAGAIN;
3638     } else if (rt_prio(p->prio)) {
3639         p->sched_class = &rt_sched_class;
3640     } else {
3641         p->sched_class = &fair_sched_class;
3642     }
3643 
3644     init_entity_runnable_average(&p->se);
3645 
3646 #ifdef CONFIG_SCHED_INFO
3647     if (likely(sched_info_on())) {
3648         memset(&p->sched_info, 0, sizeof(p->sched_info));
3649     }
3650 #endif
3651 #if defined(CONFIG_SMP)
3652     p->on_cpu = 0;
3653 #endif
3654     init_task_preempt_count(p);
3655 #ifdef CONFIG_SMP
3656     plist_node_init(&p->pushable_tasks, MAX_PRIO);
3657     RB_CLEAR_NODE(&p->pushable_dl_tasks);
3658 #endif
3659     return 0;
3660 }
3661 
sched_post_fork(struct task_struct * p,struct kernel_clone_args * kargs)3662 void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs)
3663 {
3664     unsigned long flags;
3665 #ifdef CONFIG_CGROUP_SCHED
3666     struct task_group *tg;
3667 #endif
3668 
3669     raw_spin_lock_irqsave(&p->pi_lock, flags);
3670 #ifdef CONFIG_CGROUP_SCHED
3671     tg = container_of(kargs->cset->subsys[cpu_cgrp_id], struct task_group, css);
3672     p->sched_task_group = autogroup_task_group(p, tg);
3673 #endif
3674     rseq_migrate(p);
3675     /*
3676      * We're setting the CPU for the first time, we don't migrate,
3677      * so use __set_task_cpu().
3678      */
3679     __set_task_cpu(p, smp_processor_id());
3680     if (p->sched_class->task_fork) {
3681         p->sched_class->task_fork(p);
3682     }
3683     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
3684 
3685     uclamp_post_fork(p);
3686 }
3687 
to_ratio(u64 period,u64 runtime)3688 unsigned long to_ratio(u64 period, u64 runtime)
3689 {
3690     if (runtime == RUNTIME_INF) {
3691         return BW_UNIT;
3692     }
3693 
3694     /*
3695      * Doing this here saves a lot of checks in all
3696      * the calling paths, and returning zero seems
3697      * safe for them anyway.
3698      */
3699     if (period == 0) {
3700         return 0;
3701     }
3702 
3703     return div64_u64(runtime << BW_SHIFT, period);
3704 }
3705 
3706 /*
3707  * wake_up_new_task - wake up a newly created task for the first time.
3708  *
3709  * This function will do some initial scheduler statistics housekeeping
3710  * that must be done for every newly created context, then puts the task
3711  * on the runqueue and wakes it.
3712  */
wake_up_new_task(struct task_struct * p)3713 void wake_up_new_task(struct task_struct *p)
3714 {
3715     struct rq_flags rf;
3716     struct rq *rq;
3717 
3718     raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
3719     add_new_task_to_grp(p);
3720 
3721     p->state = TASK_RUNNING;
3722 #ifdef CONFIG_SMP
3723     /*
3724      * Fork balancing, do it here and not earlier because:
3725      *  - cpus_ptr can change in the fork path
3726      *  - any previously selected CPU might disappear through hotplug
3727      *
3728      * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
3729      * as we're not fully set-up yet.
3730      */
3731     p->recent_used_cpu = task_cpu(p);
3732     rseq_migrate(p);
3733     __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
3734 #endif
3735     rq = __task_rq_lock(p, &rf);
3736     update_rq_clock(rq);
3737     post_init_entity_util_avg(p);
3738 
3739     mark_task_starting(p);
3740 
3741     activate_task(rq, p, ENQUEUE_NOCLOCK);
3742     trace_sched_wakeup_new(p);
3743     check_preempt_curr(rq, p, WF_FORK);
3744 #ifdef CONFIG_SMP
3745     if (p->sched_class->task_woken) {
3746         /*
3747          * Nothing relies on rq->lock after this, so its fine to
3748          * drop it.
3749          */
3750         rq_unpin_lock(rq, &rf);
3751         p->sched_class->task_woken(rq, p);
3752         rq_repin_lock(rq, &rf);
3753     }
3754 #endif
3755     task_rq_unlock(rq, p, &rf);
3756 }
3757 
3758 #ifdef CONFIG_PREEMPT_NOTIFIERS
3759 
3760 static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key);
3761 
preempt_notifier_inc(void)3762 void preempt_notifier_inc(void)
3763 {
3764     static_branch_inc(&preempt_notifier_key);
3765 }
3766 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
3767 
preempt_notifier_dec(void)3768 void preempt_notifier_dec(void)
3769 {
3770     static_branch_dec(&preempt_notifier_key);
3771 }
3772 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
3773 
3774 /**
3775  * preempt_notifier_register - tell me when current is being preempted &
3776  * rescheduled
3777  * @notifier: notifier struct to register
3778  */
preempt_notifier_register(struct preempt_notifier * notifier)3779 void preempt_notifier_register(struct preempt_notifier *notifier)
3780 {
3781     if (!static_branch_unlikely(&preempt_notifier_key)) {
3782         WARN(1, "registering preempt_notifier while notifiers disabled\n");
3783     }
3784 
3785     hlist_add_head(&notifier->link, &current->preempt_notifiers);
3786 }
3787 EXPORT_SYMBOL_GPL(preempt_notifier_register);
3788 
3789 /**
3790  * preempt_notifier_unregister - no longer interested in preemption
3791  * notifications
3792  * @notifier: notifier struct to unregister
3793  *
3794  * This is *not* safe to call from within a preemption notifier.
3795  */
preempt_notifier_unregister(struct preempt_notifier * notifier)3796 void preempt_notifier_unregister(struct preempt_notifier *notifier)
3797 {
3798     hlist_del(&notifier->link);
3799 }
3800 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
3801 
__fire_sched_in_preempt_notifiers(struct task_struct * curr)3802 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
3803 {
3804     struct preempt_notifier *notifier;
3805 
3806     hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3807         notifier->ops->sched_in(notifier, raw_smp_processor_id());
3808 }
3809 
3810 static __always_inline void
fire_sched_in_preempt_notifiers(struct task_struct * curr)3811 fire_sched_in_preempt_notifiers(struct task_struct *curr)
3812 {
3813     if (static_branch_unlikely(&preempt_notifier_key)) {
3814         __fire_sched_in_preempt_notifiers(curr);
3815     }
3816 }
3817 
__fire_sched_out_preempt_notifiers(struct task_struct * curr,struct task_struct * next)3818 static void __fire_sched_out_preempt_notifiers(struct task_struct *curr,
3819                                                struct task_struct *next)
3820 {
3821     struct preempt_notifier *notifier;
3822 
3823     hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
3824         notifier->ops->sched_out(notifier, next);
3825 }
3826 
3827 static __always_inline void
fire_sched_out_preempt_notifiers(struct task_struct * curr,struct task_struct * next)3828 fire_sched_out_preempt_notifiers(struct task_struct *curr,
3829                                  struct task_struct *next)
3830 {
3831     if (static_branch_unlikely(&preempt_notifier_key)) {
3832         __fire_sched_out_preempt_notifiers(curr, next);
3833     }
3834 }
3835 
3836 #else /* !CONFIG_PREEMPT_NOTIFIERS */
3837 
fire_sched_in_preempt_notifiers(struct task_struct * curr)3838 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
3839 {
3840 }
3841 
fire_sched_out_preempt_notifiers(struct task_struct * curr,struct task_struct * next)3842 static inline void fire_sched_out_preempt_notifiers(struct task_struct *curr,
3843                                                     struct task_struct *next)
3844 {
3845 }
3846 
3847 #endif /* CONFIG_PREEMPT_NOTIFIERS */
3848 
prepare_task(struct task_struct * next)3849 static inline void prepare_task(struct task_struct *next)
3850 {
3851 #ifdef CONFIG_SMP
3852     /*
3853      * Claim the task as running, we do this before switching to it
3854      * such that any running task will have this set.
3855      *
3856      * See the ttwu() WF_ON_CPU case and its ordering comment.
3857      */
3858     WRITE_ONCE(next->on_cpu, 1);
3859 #endif
3860 }
3861 
finish_task(struct task_struct * prev)3862 static inline void finish_task(struct task_struct *prev)
3863 {
3864 #ifdef CONFIG_SMP
3865     /*
3866      * This must be the very last reference to @prev from this CPU. After
3867      * p->on_cpu is cleared, the task can be moved to a different CPU. We
3868      * must ensure this doesn't happen until the switch is completely
3869      * finished.
3870      *
3871      * In particular, the load of prev->state in finish_task_switch() must
3872      * happen before this.
3873      *
3874      * Pairs with the smp_cond_load_acquire() in try_to_wake_up().
3875      */
3876     smp_store_release(&prev->on_cpu, 0);
3877 #endif
3878 }
3879 
prepare_lock_switch(struct rq * rq,struct task_struct * next,struct rq_flags * rf)3880 static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next,
3881                                        struct rq_flags *rf)
3882 {
3883     /*
3884      * Since the runqueue lock will be released by the next
3885      * task (which is an invalid locking op but in the case
3886      * of the scheduler it's an obvious special-case), so we
3887      * do an early lockdep release here:
3888      */
3889     rq_unpin_lock(rq, rf);
3890     spin_release(&rq->lock.dep_map, _THIS_IP_);
3891 #ifdef CONFIG_DEBUG_SPINLOCK
3892     /* this is a valid case when another task releases the spinlock */
3893     rq->lock.owner = next;
3894 #endif
3895 }
3896 
finish_lock_switch(struct rq * rq)3897 static inline void finish_lock_switch(struct rq *rq)
3898 {
3899     /*
3900      * If we are tracking spinlock dependencies then we have to
3901      * fix up the runqueue lock - which gets 'carried over' from
3902      * prev into current:
3903      */
3904     spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
3905     raw_spin_unlock_irq(&rq->lock);
3906 }
3907 
3908 /*
3909  * NOP if the arch has not defined these:
3910  */
3911 
3912 #ifndef prepare_arch_switch
3913 #define prepare_arch_switch(next)                                              \
3914     do {                                                                       \
3915     } while (0)
3916 #endif
3917 
3918 #ifndef finish_arch_post_lock_switch
3919 #define finish_arch_post_lock_switch()                                         \
3920     do {                                                                       \
3921     } while (0)
3922 #endif
3923 
3924 /**
3925  * prepare_task_switch - prepare to switch tasks
3926  * @rq: the runqueue preparing to switch
3927  * @prev: the current task that is being switched out
3928  * @next: the task we are going to switch to.
3929  *
3930  * This is called with the rq lock held and interrupts off. It must
3931  * be paired with a subsequent finish_task_switch after the context
3932  * switch.
3933  *
3934  * prepare_task_switch sets up locking and calls architecture specific
3935  * hooks.
3936  */
prepare_task_switch(struct rq * rq,struct task_struct * prev,struct task_struct * next)3937 static inline void prepare_task_switch(struct rq *rq, struct task_struct *prev,
3938                                        struct task_struct *next)
3939 {
3940     kcov_prepare_switch(prev);
3941     sched_info_switch(rq, prev, next);
3942     perf_event_task_sched_out(prev, next);
3943     rseq_preempt(prev);
3944     fire_sched_out_preempt_notifiers(prev, next);
3945     prepare_task(next);
3946     prepare_arch_switch(next);
3947 }
3948 
3949 /**
3950  * finish_task_switch - clean up after a task-switch
3951  * @prev: the thread we just switched away from.
3952  *
3953  * finish_task_switch must be called after the context switch, paired
3954  * with a prepare_task_switch call before the context switch.
3955  * finish_task_switch will reconcile locking set up by prepare_task_switch,
3956  * and do any other architecture-specific cleanup actions.
3957  *
3958  * Note that we may have delayed dropping an mm in context_switch(). If
3959  * so, we finish that here outside of the runqueue lock. (Doing it
3960  * with the lock held can cause deadlocks; see schedule() for
3961  * details.)
3962  *
3963  * The context switch have flipped the stack from under us and restored the
3964  * local variables which were saved when this task called schedule() in the
3965  * past. prev == current is still correct but we need to recalculate this_rq
3966  * because prev may have moved to another CPU.
3967  */
finish_task_switch(struct task_struct * prev)3968 static struct rq *finish_task_switch(struct task_struct *prev)
3969     __releases(rq->lock)
3970 {
3971     struct rq *rq = this_rq();
3972     struct mm_struct *mm = rq->prev_mm;
3973     long prev_state;
3974 
3975     /*
3976      * The previous task will have left us with a preempt_count of 2
3977      * because it left us after:
3978      *
3979      *    schedule()
3980      *      preempt_disable();            // 1
3981      *      __schedule()
3982      *        raw_spin_lock_irq(&rq->lock)    // 2
3983      *
3984      * Also, see FORK_PREEMPT_COUNT.
3985      */
3986     if (WARN_ONCE(preempt_count() != 2 * PREEMPT_DISABLE_OFFSET,
3987                   "corrupted preempt_count: %s/%d/0x%x\n", current->comm,
3988                   current->pid, preempt_count())) {
3989         preempt_count_set(FORK_PREEMPT_COUNT);
3990     }
3991 
3992     rq->prev_mm = NULL;
3993 
3994     /*
3995      * A task struct has one reference for the use as "current".
3996      * If a task dies, then it sets TASK_DEAD in tsk->state and calls
3997      * schedule one last time. The schedule call will never return, and
3998      * the scheduled task must drop that reference.
3999      *
4000      * We must observe prev->state before clearing prev->on_cpu (in
4001      * finish_task), otherwise a concurrent wakeup can get prev
4002      * running on another CPU and we could rave with its RUNNING -> DEAD
4003      * transition, resulting in a double drop.
4004      */
4005     prev_state = prev->state;
4006     vtime_task_switch(prev);
4007     perf_event_task_sched_in(prev, current);
4008     finish_task(prev);
4009     finish_lock_switch(rq);
4010     finish_arch_post_lock_switch();
4011     kcov_finish_switch(current);
4012 
4013     fire_sched_in_preempt_notifiers(current);
4014     /*
4015      * When switching through a kernel thread, the loop in
4016      * membarrier_{private,global}_expedited() may have observed that
4017      * kernel thread and not issued an IPI. It is therefore possible to
4018      * schedule between user->kernel->user threads without passing though
4019      * switch_mm(). Membarrier requires a barrier after storing to
4020      * rq->curr, before returning to userspace, so provide them here:
4021      *
4022      * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly
4023      *   provided by mmdrop(),
4024      * - a sync_core for SYNC_CORE.
4025      */
4026     if (mm) {
4027         membarrier_mm_sync_core_before_usermode(mm);
4028         mmdrop(mm);
4029     }
4030     if (unlikely(prev_state == TASK_DEAD)) {
4031         if (prev->sched_class->task_dead) {
4032             prev->sched_class->task_dead(prev);
4033         }
4034 
4035         /*
4036          * Remove function-return probe instances associated with this
4037          * task and put them back on the free list.
4038          */
4039         kprobe_flush_task(prev);
4040 
4041         /* Task is done with its stack. */
4042         put_task_stack(prev);
4043 
4044         put_task_struct_rcu_user(prev);
4045     }
4046 
4047     tick_nohz_task_switch();
4048     return rq;
4049 }
4050 
4051 #ifdef CONFIG_SMP
4052 
4053 /* rq->lock is NOT held, but preemption is disabled */
__balance_callback(struct rq * rq)4054 static void __balance_callback(struct rq *rq)
4055 {
4056     struct callback_head *head, *next;
4057     void (*func)(struct rq * rq);
4058     unsigned long flags;
4059 
4060     raw_spin_lock_irqsave(&rq->lock, flags);
4061     head = rq->balance_callback;
4062     rq->balance_callback = NULL;
4063     while (head) {
4064         func = (void (*)(struct rq *))head->func;
4065         next = head->next;
4066         head->next = NULL;
4067         head = next;
4068 
4069         func(rq);
4070     }
4071     raw_spin_unlock_irqrestore(&rq->lock, flags);
4072 }
4073 
balance_callback(struct rq * rq)4074 static inline void balance_callback(struct rq *rq)
4075 {
4076     if (unlikely(rq->balance_callback)) {
4077         __balance_callback(rq);
4078     }
4079 }
4080 
4081 #else
4082 
balance_callback(struct rq * rq)4083 static inline void balance_callback(struct rq *rq)
4084 {
4085 }
4086 
4087 #endif
4088 
4089 /**
4090  * schedule_tail - first thing a freshly forked thread must call.
4091  * @prev: the thread we just switched away from.
4092  */
schedule_tail(struct task_struct * prev)4093 asmlinkage __visible void schedule_tail(struct task_struct *prev)
4094     __releases(rq->lock)
4095 {
4096     struct rq *rq;
4097 
4098     /*
4099      * New tasks start with FORK_PREEMPT_COUNT, see there and
4100      * finish_task_switch() for details.
4101      *
4102      * finish_task_switch() will drop rq->lock() and lower preempt_count
4103      * and the preempt_enable() will end up enabling preemption (on
4104      * PREEMPT_COUNT kernels).
4105      */
4106 
4107     rq = finish_task_switch(prev);
4108     balance_callback(rq);
4109     preempt_enable();
4110 
4111     if (current->set_child_tid) {
4112         put_user(task_pid_vnr(current), current->set_child_tid);
4113     }
4114 
4115     calculate_sigpending();
4116 }
4117 
4118 /*
4119  * context_switch - switch to the new MM and the new thread's register state.
4120  */
context_switch(struct rq * rq,struct task_struct * prev,struct task_struct * next,struct rq_flags * rf)4121 static __always_inline struct rq *context_switch(struct rq *rq,
4122                                                  struct task_struct *prev,
4123                                                  struct task_struct *next,
4124                                                  struct rq_flags *rf)
4125 {
4126     prepare_task_switch(rq, prev, next);
4127 
4128     /*
4129      * For paravirt, this is coupled with an exit in switch_to to
4130      * combine the page table reload and the switch backend into
4131      * one hypercall.
4132      */
4133     arch_start_context_switch(prev);
4134 
4135     /*
4136      * kernel -> kernel   lazy + transfer active
4137      *   user -> kernel   lazy + mmgrab() active
4138      *
4139      * kernel ->   user   switch + mmdrop() active
4140      *   user ->   user   switch
4141      */
4142     if (!next->mm) { // to kernel
4143         enter_lazy_tlb(prev->active_mm, next);
4144 
4145         next->active_mm = prev->active_mm;
4146         if (prev->mm) { // from user
4147             mmgrab(prev->active_mm);
4148         } else {
4149             prev->active_mm = NULL;
4150         }
4151     } else { // to user
4152         membarrier_switch_mm(rq, prev->active_mm, next->mm);
4153         /*
4154          * sys_membarrier() requires an smp_mb() between setting
4155          * rq->curr / membarrier_switch_mm() and returning to userspace.
4156          *
4157          * The below provides this either through switch_mm(), or in
4158          * case 'prev->active_mm == next->mm' through
4159          * finish_task_switch()'s mmdrop().
4160          */
4161         switch_mm_irqs_off(prev->active_mm, next->mm, next);
4162 
4163         if (!prev->mm) { // from kernel
4164             /* will mmdrop() in finish_task_switch(). */
4165             rq->prev_mm = prev->active_mm;
4166             prev->active_mm = NULL;
4167         }
4168     }
4169 
4170     rq->clock_update_flags &= ~(RQCF_ACT_SKIP | RQCF_REQ_SKIP);
4171 
4172     prepare_lock_switch(rq, next, rf);
4173 
4174     /* Here we just switch the register state and the stack. */
4175     switch_to(prev, next, prev);
4176     barrier();
4177 
4178     return finish_task_switch(prev);
4179 }
4180 
4181 /*
4182  * nr_running and nr_context_switches
4183  *
4184  * externally visible scheduler statistics: current number of runnable
4185  * threads, total number of context switches performed since bootup.
4186  */
nr_running(void)4187 unsigned long nr_running(void)
4188 {
4189     unsigned long i, sum = 0;
4190 
4191     for_each_online_cpu(i) sum += cpu_rq(i)->nr_running;
4192 
4193     return sum;
4194 }
4195 
4196 /*
4197  * Check if only the current task is running on the CPU.
4198  *
4199  * Caution: this function does not check that the caller has disabled
4200  * preemption, thus the result might have a time-of-check-to-time-of-use
4201  * race.  The caller is responsible to use it correctly, for example:
4202  *
4203  * - from a non-preemptible section (of course)
4204  *
4205  * - from a thread that is bound to a single CPU
4206  *
4207  * - in a loop with very short iterations (e.g. a polling loop)
4208  */
single_task_running(void)4209 bool single_task_running(void)
4210 {
4211     return raw_rq()->nr_running == 1;
4212 }
4213 EXPORT_SYMBOL(single_task_running);
4214 
nr_context_switches(void)4215 unsigned long long nr_context_switches(void)
4216 {
4217     int i;
4218     unsigned long long sum = 0;
4219 
4220     for_each_possible_cpu(i) sum += cpu_rq(i)->nr_switches;
4221 
4222     return sum;
4223 }
4224 
4225 /*
4226  * Consumers of these two interfaces, like for example the cpuidle menu
4227  * governor, are using nonsensical data. Preferring shallow idle state selection
4228  * for a CPU that has IO-wait which might not even end up running the task when
4229  * it does become runnable.
4230  */
4231 
nr_iowait_cpu(int cpu)4232 unsigned long nr_iowait_cpu(int cpu)
4233 {
4234     return atomic_read(&cpu_rq(cpu)->nr_iowait);
4235 }
4236 
4237 /*
4238  * IO-wait accounting, and how its mostly bollocks (on SMP).
4239  *
4240  * The idea behind IO-wait account is to account the idle time that we could
4241  * have spend running if it were not for IO. That is, if we were to improve the
4242  * storage performance, we'd have a proportional reduction in IO-wait time.
4243  *
4244  * This all works nicely on UP, where, when a task blocks on IO, we account
4245  * idle time as IO-wait, because if the storage were faster, it could've been
4246  * running and we'd not be idle.
4247  *
4248  * This has been extended to SMP, by doing the same for each CPU. This however
4249  * is broken.
4250  *
4251  * Imagine for instance the case where two tasks block on one CPU, only the one
4252  * CPU will have IO-wait accounted, while the other has regular idle. Even
4253  * though, if the storage were faster, both could've ran at the same time,
4254  * utilising both CPUs.
4255  *
4256  * This means, that when looking globally, the current IO-wait accounting on
4257  * SMP is a lower bound, by reason of under accounting.
4258  *
4259  * Worse, since the numbers are provided per CPU, they are sometimes
4260  * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
4261  * associated with any one particular CPU, it can wake to another CPU than it
4262  * blocked on. This means the per CPU IO-wait number is meaningless.
4263  *
4264  * Task CPU affinities can make all that even more 'interesting'.
4265  */
4266 
nr_iowait(void)4267 unsigned long nr_iowait(void)
4268 {
4269     unsigned long i, sum = 0;
4270 
4271     for_each_possible_cpu(i) sum += nr_iowait_cpu(i);
4272 
4273     return sum;
4274 }
4275 
4276 #ifdef CONFIG_SMP
4277 
4278 /*
4279  * sched_exec - execve() is a valuable balancing opportunity, because at
4280  * this point the task has the smallest effective memory and cache footprint.
4281  */
sched_exec(void)4282 void sched_exec(void)
4283 {
4284     struct task_struct *p = current;
4285     unsigned long flags;
4286     int dest_cpu;
4287 
4288     raw_spin_lock_irqsave(&p->pi_lock, flags);
4289     dest_cpu =
4290         p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
4291     if (dest_cpu == smp_processor_id()) {
4292         goto unlock;
4293     }
4294 
4295     if (likely(cpu_active(dest_cpu) && likely(!cpu_isolated(dest_cpu)))) {
4296         struct migration_arg arg = {p, dest_cpu};
4297 
4298         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4299         stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
4300         return;
4301     }
4302 unlock:
4303     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4304 }
4305 
4306 #endif
4307 
4308 DEFINE_PER_CPU(struct kernel_stat, kstat);
4309 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
4310 
4311 EXPORT_PER_CPU_SYMBOL(kstat);
4312 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
4313 
4314 /*
4315  * The function fair_sched_class.update_curr accesses the struct curr
4316  * and its field curr->exec_start; when called from task_sched_runtime(),
4317  * we observe a high rate of cache misses in practice.
4318  * Prefetching this data results in improved performance.
4319  */
prefetch_curr_exec_start(struct task_struct * p)4320 static inline void prefetch_curr_exec_start(struct task_struct *p)
4321 {
4322 #ifdef CONFIG_FAIR_GROUP_SCHED
4323     struct sched_entity *curr = (&p->se)->cfs_rq->curr;
4324 #else
4325     struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
4326 #endif
4327     prefetch(curr);
4328     prefetch(&curr->exec_start);
4329 }
4330 
4331 /*
4332  * Return accounted runtime for the task.
4333  * In case the task is currently running, return the runtime plus current's
4334  * pending runtime that have not been accounted yet.
4335  */
task_sched_runtime(struct task_struct * p)4336 unsigned long long task_sched_runtime(struct task_struct *p)
4337 {
4338     struct rq_flags rf;
4339     struct rq *rq;
4340     u64 ns;
4341 
4342 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
4343     /*
4344      * 64-bit doesn't need locks to atomically read a 64-bit value.
4345      * So we have a optimization chance when the task's delta_exec is 0.
4346      * Reading ->on_cpu is racy, but this is ok.
4347      *
4348      * If we race with it leaving CPU, we'll take a lock. So we're correct.
4349      * If we race with it entering CPU, unaccounted time is 0. This is
4350      * indistinguishable from the read occurring a few cycles earlier.
4351      * If we see ->on_cpu without ->on_rq, the task is leaving, and has
4352      * been accounted, so we're correct here as well.
4353      */
4354     if (!p->on_cpu || !task_on_rq_queued(p)) {
4355         return p->se.sum_exec_runtime;
4356     }
4357 #endif
4358 
4359     rq = task_rq_lock(p, &rf);
4360     /*
4361      * Must be ->curr _and_ ->on_rq.  If dequeued, we would
4362      * project cycles that may never be accounted to this
4363      * thread, breaking clock_gettime().
4364      */
4365     if (task_current(rq, p) && task_on_rq_queued(p)) {
4366         prefetch_curr_exec_start(p);
4367         update_rq_clock(rq);
4368         p->sched_class->update_curr(rq);
4369     }
4370     ns = p->se.sum_exec_runtime;
4371     task_rq_unlock(rq, p, &rf);
4372 
4373     return ns;
4374 }
4375 
4376 /*
4377  * This function gets called by the timer code, with HZ frequency.
4378  * We call it with interrupts disabled.
4379  */
scheduler_tick(void)4380 void scheduler_tick(void)
4381 {
4382     int cpu = smp_processor_id();
4383     struct rq *rq = cpu_rq(cpu);
4384     struct task_struct *curr = rq->curr;
4385     struct rq_flags rf;
4386     u64 wallclock;
4387     unsigned long thermal_pressure;
4388 
4389     arch_scale_freq_tick();
4390     sched_clock_tick();
4391 
4392     rq_lock(rq, &rf);
4393 
4394     set_window_start(rq);
4395     wallclock = sched_ktime_clock();
4396     update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
4397     update_rq_clock(rq);
4398     thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
4399     update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
4400     curr->sched_class->task_tick(rq, curr, 0);
4401     calc_global_load_tick(rq);
4402     psi_task_tick(rq);
4403 
4404     rq_unlock(rq, &rf);
4405 
4406 #ifdef CONFIG_SCHED_RTG
4407     sched_update_rtg_tick(curr);
4408 #endif
4409     perf_event_task_tick();
4410 
4411 #ifdef CONFIG_SMP
4412     rq->idle_balance = idle_cpu(cpu);
4413     trigger_load_balance(rq);
4414 
4415 #ifdef CONFIG_SCHED_EAS
4416     if (curr->sched_class->check_for_migration) {
4417         curr->sched_class->check_for_migration(rq, curr);
4418     }
4419 #endif
4420 #endif
4421 }
4422 
4423 #ifdef CONFIG_NO_HZ_FULL
4424 
4425 struct tick_work {
4426     int cpu;
4427     atomic_t state;
4428     struct delayed_work work;
4429 };
4430 /* Values for ->state, see diagram below. */
4431 #define TICK_SCHED_REMOTE_OFFLINE 0
4432 #define TICK_SCHED_REMOTE_OFFLINING 1
4433 #define TICK_SCHED_REMOTE_RUNNING 2
4434 
4435 /*
4436  * State diagram for ->state:
4437  *
4438  *
4439  *          TICK_SCHED_REMOTE_OFFLINE
4440  *                    |   ^
4441  *                    |   |
4442  *                    |   | sched_tick_remote()
4443  *                    |   |
4444  *                    |   |
4445  *                    +--TICK_SCHED_REMOTE_OFFLINING
4446  *                    |   ^
4447  *                    |   |
4448  * sched_tick_start() |   | sched_tick_stop()
4449  *                    |   |
4450  *                    V   |
4451  *          TICK_SCHED_REMOTE_RUNNING
4452  *
4453  *
4454  * Other transitions get WARN_ON_ONCE(), except that sched_tick_remote()
4455  * and sched_tick_start() are happy to leave the state in RUNNING.
4456  */
4457 
4458 static struct tick_work __percpu *tick_work_cpu;
4459 
sched_tick_remote(struct work_struct * work)4460 static void sched_tick_remote(struct work_struct *work)
4461 {
4462     struct delayed_work *dwork = to_delayed_work(work);
4463     struct tick_work *twork = container_of(dwork, struct tick_work, work);
4464     int cpu = twork->cpu;
4465     struct rq *rq = cpu_rq(cpu);
4466     struct task_struct *curr;
4467     struct rq_flags rf;
4468     u64 delta;
4469     int os;
4470 
4471     /*
4472      * Handle the tick only if it appears the remote CPU is running in full
4473      * dynticks mode. The check is racy by nature, but missing a tick or
4474      * having one too much is no big deal because the scheduler tick updates
4475      * statistics and checks timeslices in a time-independent way, regardless
4476      * of when exactly it is running.
4477      */
4478     if (!tick_nohz_tick_stopped_cpu(cpu)) {
4479         goto out_requeue;
4480     }
4481 
4482     rq_lock_irq(rq, &rf);
4483     curr = rq->curr;
4484     if (cpu_is_offline(cpu)) {
4485         goto out_unlock;
4486     }
4487 
4488     update_rq_clock(rq);
4489 
4490     if (!is_idle_task(curr)) {
4491         /*
4492          * Make sure the next tick runs within a reasonable
4493          * amount of time.
4494          */
4495         delta = rq_clock_task(rq) - curr->se.exec_start;
4496         WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 0x3);
4497     }
4498     curr->sched_class->task_tick(rq, curr, 0);
4499 
4500     calc_load_nohz_remote(rq);
4501 out_unlock:
4502     rq_unlock_irq(rq, &rf);
4503 out_requeue:
4504 
4505     /*
4506      * Run the remote tick once per second (1Hz). This arbitrary
4507      * frequency is large enough to avoid overload but short enough
4508      * to keep scheduler internal stats reasonably up to date.  But
4509      * first update state to reflect hotplug activity if required.
4510      */
4511     os = atomic_fetch_add_unless(&twork->state, -1, TICK_SCHED_REMOTE_RUNNING);
4512     WARN_ON_ONCE(os == TICK_SCHED_REMOTE_OFFLINE);
4513     if (os == TICK_SCHED_REMOTE_RUNNING) {
4514         queue_delayed_work(system_unbound_wq, dwork, HZ);
4515     }
4516 }
4517 
sched_tick_start(int cpu)4518 static void sched_tick_start(int cpu)
4519 {
4520     int os;
4521     struct tick_work *twork;
4522 
4523     if (housekeeping_cpu(cpu, HK_FLAG_TICK)) {
4524         return;
4525     }
4526 
4527     WARN_ON_ONCE(!tick_work_cpu);
4528 
4529     twork = per_cpu_ptr(tick_work_cpu, cpu);
4530     os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_RUNNING);
4531     WARN_ON_ONCE(os == TICK_SCHED_REMOTE_RUNNING);
4532     if (os == TICK_SCHED_REMOTE_OFFLINE) {
4533         twork->cpu = cpu;
4534         INIT_DELAYED_WORK(&twork->work, sched_tick_remote);
4535         queue_delayed_work(system_unbound_wq, &twork->work, HZ);
4536     }
4537 }
4538 
4539 #ifdef CONFIG_HOTPLUG_CPU
sched_tick_stop(int cpu)4540 static void sched_tick_stop(int cpu)
4541 {
4542     struct tick_work *twork;
4543     int os;
4544 
4545     if (housekeeping_cpu(cpu, HK_FLAG_TICK)) {
4546         return;
4547     }
4548 
4549     WARN_ON_ONCE(!tick_work_cpu);
4550 
4551     twork = per_cpu_ptr(tick_work_cpu, cpu);
4552     /* There cannot be competing actions, but don't rely on stop-machine. */
4553     os = atomic_xchg(&twork->state, TICK_SCHED_REMOTE_OFFLINING);
4554     WARN_ON_ONCE(os != TICK_SCHED_REMOTE_RUNNING);
4555     /* Don't cancel, as this would mess up the state machine. */
4556 }
4557 #endif /* CONFIG_HOTPLUG_CPU */
4558 
sched_tick_offload_init(void)4559 int __init sched_tick_offload_init(void)
4560 {
4561     tick_work_cpu = alloc_percpu(struct tick_work);
4562     BUG_ON(!tick_work_cpu);
4563     return 0;
4564 }
4565 
4566 #else /* !CONFIG_NO_HZ_FULL */
sched_tick_start(int cpu)4567 static inline void sched_tick_start(int cpu)
4568 {
4569 }
sched_tick_stop(int cpu)4570 static inline void sched_tick_stop(int cpu)
4571 {
4572 }
4573 #endif
4574 
4575 #if defined(CONFIG_PREEMPTION) &&                                              \
4576     (defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE))
4577 /*
4578  * If the value passed in is equal to the current preempt count
4579  * then we just disabled preemption. Start timing the latency.
4580  */
preempt_latency_start(int val)4581 static inline void preempt_latency_start(int val)
4582 {
4583     if (preempt_count() == val) {
4584         unsigned long ip = get_lock_parent_ip();
4585 #ifdef CONFIG_DEBUG_PREEMPT
4586         current->preempt_disable_ip = ip;
4587 #endif
4588         trace_preempt_off(CALLER_ADDR0, ip);
4589     }
4590 }
4591 
preempt_count_add(int val)4592 void preempt_count_add(int val)
4593 {
4594 #ifdef CONFIG_DEBUG_PREEMPT
4595     /*
4596      * Underflow?
4597      */
4598     if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) {
4599         return;
4600     }
4601 #endif
4602     __preempt_count_add(val);
4603 #ifdef CONFIG_DEBUG_PREEMPT
4604     /*
4605      * Spinlock count overflowing soon?
4606      */
4607     DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK - 0xa);
4608 #endif
4609     preempt_latency_start(val);
4610 }
4611 EXPORT_SYMBOL(preempt_count_add);
4612 NOKPROBE_SYMBOL(preempt_count_add);
4613 
4614 /*
4615  * If the value passed in equals to the current preempt count
4616  * then we just enabled preemption. Stop timing the latency.
4617  */
preempt_latency_stop(int val)4618 static inline void preempt_latency_stop(int val)
4619 {
4620     if (preempt_count() == val) {
4621         trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
4622     }
4623 }
4624 
preempt_count_sub(int val)4625 void preempt_count_sub(int val)
4626 {
4627 #ifdef CONFIG_DEBUG_PREEMPT
4628     /*
4629      * Underflow?
4630      */
4631     if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) {
4632         return;
4633     }
4634     /*
4635      * Is the spinlock portion underflowing?
4636      */
4637     if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4638                             !(preempt_count() & PREEMPT_MASK))) {
4639         return;
4640     }
4641 #endif
4642 
4643     preempt_latency_stop(val);
4644     __preempt_count_sub(val);
4645 }
4646 EXPORT_SYMBOL(preempt_count_sub);
4647 NOKPROBE_SYMBOL(preempt_count_sub);
4648 
4649 #else
preempt_latency_start(int val)4650 static inline void preempt_latency_start(int val)
4651 {
4652 }
preempt_latency_stop(int val)4653 static inline void preempt_latency_stop(int val)
4654 {
4655 }
4656 #endif
4657 
get_preempt_disable_ip(struct task_struct * p)4658 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
4659 {
4660 #ifdef CONFIG_DEBUG_PREEMPT
4661     return p->preempt_disable_ip;
4662 #else
4663     return 0;
4664 #endif
4665 }
4666 
4667 /*
4668  * Print scheduling while atomic bug:
4669  */
__schedule_bug(struct task_struct * prev)4670 static noinline void __schedule_bug(struct task_struct *prev)
4671 {
4672     /* Save this before calling printk(), since that will clobber it */
4673     unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
4674 
4675     if (oops_in_progress) {
4676         return;
4677     }
4678 
4679     printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", prev->comm,
4680            prev->pid, preempt_count());
4681 
4682     debug_show_held_locks(prev);
4683     print_modules();
4684     if (irqs_disabled()) {
4685         print_irqtrace_events(prev);
4686     }
4687     if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) && in_atomic_preempt_off()) {
4688         pr_err("Preemption disabled at:");
4689         print_ip_sym(KERN_ERR, preempt_disable_ip);
4690     }
4691     if (panic_on_warn) {
4692         panic("scheduling while atomic\n");
4693     }
4694 
4695     dump_stack();
4696     add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4697 }
4698 
4699 /*
4700  * Various schedule()-time debugging checks and statistics:
4701  */
schedule_debug(struct task_struct * prev,bool preempt)4702 static inline void schedule_debug(struct task_struct *prev, bool preempt)
4703 {
4704 #ifdef CONFIG_SCHED_STACK_END_CHECK
4705     if (task_stack_end_corrupted(prev)) {
4706         panic("corrupted stack end detected inside scheduler\n");
4707     }
4708 
4709     if (task_scs_end_corrupted(prev)) {
4710         panic("corrupted shadow stack detected inside scheduler\n");
4711     }
4712 #endif
4713 
4714 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
4715     if (!preempt && prev->state && prev->non_block_count) {
4716         printk(KERN_ERR "BUG: scheduling in a non-blocking section: %s/%d/%i\n",
4717                prev->comm, prev->pid, prev->non_block_count);
4718         dump_stack();
4719         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
4720     }
4721 #endif
4722 
4723     if (unlikely(in_atomic_preempt_off())) {
4724         __schedule_bug(prev);
4725         preempt_count_set(PREEMPT_DISABLED);
4726     }
4727     rcu_sleep_check();
4728 
4729     profile_hit(SCHED_PROFILING, __builtin_return_address(0));
4730 
4731     schedstat_inc(this_rq()->sched_count);
4732 }
4733 
put_prev_task_balance(struct rq * rq,struct task_struct * prev,struct rq_flags * rf)4734 static void put_prev_task_balance(struct rq *rq, struct task_struct *prev,
4735                                   struct rq_flags *rf)
4736 {
4737 #ifdef CONFIG_SMP
4738     const struct sched_class *class;
4739     /*
4740      * We must do the balancing pass before put_prev_task(), such
4741      * that when we release the rq->lock the task is in the same
4742      * state as before we took rq->lock.
4743      *
4744      * We can terminate the balance pass as soon as we know there is
4745      * a runnable task of @class priority or higher.
4746      */
4747     for_class_range(class, prev->sched_class, &idle_sched_class)
4748     {
4749         if (class->balance(rq, prev, rf)) {
4750             break;
4751         }
4752     }
4753 #endif
4754 
4755     put_prev_task(rq, prev);
4756 }
4757 
4758 /*
4759  * Pick up the highest-prio task:
4760  */
4761 static inline struct task_struct *
pick_next_task(struct rq * rq,struct task_struct * prev,struct rq_flags * rf)4762 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
4763 {
4764     const struct sched_class *class;
4765     struct task_struct *p;
4766 
4767     /*
4768      * Optimization: we know that if all tasks are in the fair class we can
4769      * call that function directly, but only if the @prev task wasn't of a
4770      * higher scheduling class, because otherwise those loose the
4771      * opportunity to pull in more work from other CPUs.
4772      */
4773     if (likely(prev->sched_class <= &fair_sched_class &&
4774                rq->nr_running == rq->cfs.h_nr_running)) {
4775         p = pick_next_task_fair(rq, prev, rf);
4776         if (unlikely(p == RETRY_TASK)) {
4777             goto restart;
4778         }
4779 
4780         /* Assumes fair_sched_class->next == idle_sched_class */
4781         if (!p) {
4782             put_prev_task(rq, prev);
4783             p = pick_next_task_idle(rq);
4784         }
4785 
4786         return p;
4787     }
4788 
4789 restart:
4790     put_prev_task_balance(rq, prev, rf);
4791 
4792     for_each_class(class)
4793     {
4794         p = class->pick_next_task(rq);
4795         if (p) {
4796             return p;
4797         }
4798     }
4799 
4800     /* The idle class should always have a runnable task: */
4801     BUG();
4802 }
4803 
4804 /*
4805  * __schedule() is the main scheduler function.
4806  *
4807  * The main means of driving the scheduler and thus entering this function are:
4808  *
4809  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
4810  *
4811  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
4812  *      paths. For example, see arch/x86/entry_64.S.
4813  *
4814  *      To drive preemption between tasks, the scheduler sets the flag in timer
4815  *      interrupt handler scheduler_tick().
4816  *
4817  *   3. Wakeups don't really cause entry into schedule(). They add a
4818  *      task to the run-queue and that's it.
4819  *
4820  *      Now, if the new task added to the run-queue preempts the current
4821  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
4822  *      called on the nearest possible occasion:
4823  *
4824  *       - If the kernel is preemptible (CONFIG_PREEMPTION=y):
4825  *
4826  *         - in syscall or exception context, at the next outmost
4827  *           preempt_enable(). (this might be as soon as the wake_up()'s
4828  *           spin_unlock()!)
4829  *
4830  *         - in IRQ context, return from interrupt-handler to
4831  *           preemptible context
4832  *
4833  *       - If the kernel is not preemptible (CONFIG_PREEMPTION is not set)
4834  *         then at the next:
4835  *
4836  *          - cond_resched() call
4837  *          - explicit schedule() call
4838  *          - return from syscall or exception to user-space
4839  *          - return from interrupt-handler to user-space
4840  *
4841  * WARNING: must be called with preemption disabled!
4842  */
__schedule(bool preempt)4843 static void __sched notrace __schedule(bool preempt)
4844 {
4845     struct task_struct *prev, *next;
4846     unsigned long *switch_count;
4847     unsigned long prev_state;
4848     struct rq_flags rf;
4849     struct rq *rq;
4850     int cpu;
4851     u64 wallclock;
4852 
4853     cpu = smp_processor_id();
4854     rq = cpu_rq(cpu);
4855     prev = rq->curr;
4856 
4857     schedule_debug(prev, preempt);
4858 
4859     if (sched_feat(HRTICK)) {
4860         hrtick_clear(rq);
4861     }
4862 
4863     local_irq_disable();
4864     rcu_note_context_switch(preempt);
4865 
4866     /*
4867      * Make sure that signal_pending_state()->signal_pending() below
4868      * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
4869      * done by the caller to avoid the race with signal_wake_up():
4870      *
4871      * __set_current_state(@state)        signal_wake_up()
4872      * schedule()                  set_tsk_thread_flag(p, TIF_SIGPENDING)
4873      *                      wake_up_state(p, state)
4874      *   LOCK rq->lock                LOCK p->pi_state
4875      *   smp_mb__after_spinlock()            smp_mb__after_spinlock()
4876      *     if (signal_pending_state())        if (p->state & @state)
4877      *
4878      * Also, the membarrier system call requires a full memory barrier
4879      * after coming from user-space, before storing to rq->curr.
4880      */
4881     rq_lock(rq, &rf);
4882     smp_mb__after_spinlock();
4883 
4884     /* Promote REQ to ACT */
4885     rq->clock_update_flags <<= 1;
4886     update_rq_clock(rq);
4887 
4888     switch_count = &prev->nivcsw;
4889 
4890     /*
4891      * We must load prev->state once (task_struct::state is volatile), such
4892      * that:
4893      *
4894      *  - we form a control dependency vs deactivate_task() below.
4895      *  - ptrace_{,un}freeze_traced() can change ->state underneath us.
4896      */
4897     prev_state = prev->state;
4898     if (!preempt && prev_state) {
4899         if (signal_pending_state(prev_state, prev)) {
4900             prev->state = TASK_RUNNING;
4901         } else {
4902             prev->sched_contributes_to_load =
4903                 (prev_state & TASK_UNINTERRUPTIBLE) &&
4904                 !(prev_state & TASK_NOLOAD) && !(prev->flags & PF_FROZEN);
4905 
4906             if (prev->sched_contributes_to_load) {
4907                 rq->nr_uninterruptible++;
4908             }
4909 
4910             /*
4911              * __schedule()            ttwu()
4912              *   prev_state = prev->state;    if (p->on_rq && ...)
4913              *   if (prev_state)            goto out;
4914              *     p->on_rq = 0;          smp_acquire__after_ctrl_dep();
4915              *                  p->state = TASK_WAKING
4916              *
4917              * Where __schedule() and ttwu() have matching control dependencies.
4918              *
4919              * After this, schedule() must not care about p->state any more.
4920              */
4921             deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
4922 
4923             if (prev->in_iowait) {
4924                 atomic_inc(&rq->nr_iowait);
4925                 delayacct_blkio_start();
4926             }
4927         }
4928         switch_count = &prev->nvcsw;
4929     }
4930 
4931     next = pick_next_task(rq, prev, &rf);
4932     clear_tsk_need_resched(prev);
4933     clear_preempt_need_resched();
4934 
4935     wallclock = sched_ktime_clock();
4936     if (likely(prev != next)) {
4937 #ifdef CONFIG_SCHED_WALT
4938         if (!prev->on_rq) {
4939             prev->last_sleep_ts = wallclock;
4940         }
4941 #endif
4942         update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
4943         update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
4944         rq->nr_switches++;
4945         /*
4946          * RCU users of rcu_dereference(rq->curr) may not see
4947          * changes to task_struct made by pick_next_task().
4948          */
4949         RCU_INIT_POINTER(rq->curr, next);
4950         /*
4951          * The membarrier system call requires each architecture
4952          * to have a full memory barrier after updating
4953          * rq->curr, before returning to user-space.
4954          *
4955          * Here are the schemes providing that barrier on the
4956          * various architectures:
4957          * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC.
4958          *   switch_mm() rely on membarrier_arch_switch_mm() on PowerPC.
4959          * - finish_lock_switch() for weakly-ordered
4960          *   architectures where spin_unlock is a full barrier,
4961          * - switch_to() for arm64 (weakly-ordered, spin_unlock
4962          *   is a RELEASE barrier),
4963          */
4964         ++*switch_count;
4965 
4966         psi_sched_switch(prev, next, !task_on_rq_queued(prev));
4967 
4968         trace_sched_switch(preempt, prev, next);
4969 
4970         /* Also unlocks the rq: */
4971         rq = context_switch(rq, prev, next, &rf);
4972     } else {
4973         update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0);
4974         rq->clock_update_flags &= ~(RQCF_ACT_SKIP | RQCF_REQ_SKIP);
4975         rq_unlock_irq(rq, &rf);
4976     }
4977 
4978     balance_callback(rq);
4979 }
4980 
do_task_dead(void)4981 void __noreturn do_task_dead(void)
4982 {
4983     /* Causes final put_task_struct in finish_task_switch(): */
4984     set_special_state(TASK_DEAD);
4985 
4986     /* Tell freezer to ignore us: */
4987     current->flags |= PF_NOFREEZE;
4988 
4989     __schedule(false);
4990     BUG();
4991 
4992     /* Avoid "noreturn function does return" - but don't continue if BUG() is a
4993      * NOP: */
4994     for (;;) {
4995         cpu_relax();
4996     }
4997 }
4998 
sched_submit_work(struct task_struct * tsk)4999 static inline void sched_submit_work(struct task_struct *tsk)
5000 {
5001     unsigned int task_flags;
5002 
5003     if (!tsk->state) {
5004         return;
5005     }
5006 
5007     task_flags = tsk->flags;
5008     /*
5009      * If a worker went to sleep, notify and ask workqueue whether
5010      * it wants to wake up a task to maintain concurrency.
5011      * As this function is called inside the schedule() context,
5012      * we disable preemption to avoid it calling schedule() again
5013      * in the possible wakeup of a kworker and because wq_worker_sleeping()
5014      * requires it.
5015      */
5016     if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5017         preempt_disable();
5018         if (task_flags & PF_WQ_WORKER) {
5019             wq_worker_sleeping(tsk);
5020         } else {
5021             io_wq_worker_sleeping(tsk);
5022         }
5023         preempt_enable_no_resched();
5024     }
5025 
5026     if (tsk_is_pi_blocked(tsk)) {
5027         return;
5028     }
5029 
5030     /*
5031      * If we are going to sleep and we have plugged IO queued,
5032      * make sure to submit it to avoid deadlocks.
5033      */
5034     if (blk_needs_flush_plug(tsk)) {
5035         blk_schedule_flush_plug(tsk);
5036     }
5037 }
5038 
sched_update_worker(struct task_struct * tsk)5039 static void sched_update_worker(struct task_struct *tsk)
5040 {
5041     if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
5042         if (tsk->flags & PF_WQ_WORKER) {
5043             wq_worker_running(tsk);
5044         } else {
5045             io_wq_worker_running(tsk);
5046         }
5047     }
5048 }
5049 
schedule(void)5050 asmlinkage __visible void __sched schedule(void)
5051 {
5052     struct task_struct *tsk = current;
5053 
5054     sched_submit_work(tsk);
5055     do {
5056         preempt_disable();
5057         __schedule(false);
5058         sched_preempt_enable_no_resched();
5059     } while (need_resched());
5060     sched_update_worker(tsk);
5061 }
5062 EXPORT_SYMBOL(schedule);
5063 
5064 /*
5065  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
5066  * state (have scheduled out non-voluntarily) by making sure that all
5067  * tasks have either left the run queue or have gone into user space.
5068  * As idle tasks do not do either, they must not ever be preempted
5069  * (schedule out non-voluntarily).
5070  *
5071  * schedule_idle() is similar to schedule_preempt_disable() except that it
5072  * never enables preemption because it does not call sched_submit_work().
5073  */
schedule_idle(void)5074 void __sched schedule_idle(void)
5075 {
5076     /*
5077      * As this skips calling sched_submit_work(), which the idle task does
5078      * regardless because that function is a nop when the task is in a
5079      * TASK_RUNNING state, make sure this isn't used someplace that the
5080      * current task can be in any other state. Note, idle is always in the
5081      * TASK_RUNNING state.
5082      */
5083     WARN_ON_ONCE(current->state);
5084     do {
5085         __schedule(false);
5086     } while (need_resched());
5087 }
5088 
5089 #ifdef CONFIG_CONTEXT_TRACKING
schedule_user(void)5090 asmlinkage __visible void __sched schedule_user(void)
5091 {
5092     /*
5093      * If we come here after a random call to set_need_resched(),
5094      * or we have been woken up remotely but the IPI has not yet arrived,
5095      * we haven't yet exited the RCU idle mode. Do it here manually until
5096      * we find a better solution.
5097      *
5098      * NB: There are buggy callers of this function.  Ideally we
5099      * should warn if prev_state != CONTEXT_USER, but that will trigger
5100      * too frequently to make sense yet.
5101      */
5102     enum ctx_state prev_state = exception_enter();
5103     schedule();
5104     exception_exit(prev_state);
5105 }
5106 #endif
5107 
5108 /**
5109  * schedule_preempt_disabled - called with preemption disabled
5110  *
5111  * Returns with preemption disabled. Note: preempt_count must be 1
5112  */
schedule_preempt_disabled(void)5113 void __sched schedule_preempt_disabled(void)
5114 {
5115     sched_preempt_enable_no_resched();
5116     schedule();
5117     preempt_disable();
5118 }
5119 
preempt_schedule_common(void)5120 static void __sched notrace preempt_schedule_common(void)
5121 {
5122     do {
5123         /*
5124          * Because the function tracer can trace preempt_count_sub()
5125          * and it also uses preempt_enable/disable_notrace(), if
5126          * NEED_RESCHED is set, the preempt_enable_notrace() called
5127          * by the function tracer will call this function again and
5128          * cause infinite recursion.
5129          *
5130          * Preemption must be disabled here before the function
5131          * tracer can trace. Break up preempt_disable() into two
5132          * calls. One to disable preemption without fear of being
5133          * traced. The other to still record the preemption latency,
5134          * which can also be traced by the function tracer.
5135          */
5136         preempt_disable_notrace();
5137         preempt_latency_start(1);
5138         __schedule(true);
5139         preempt_latency_stop(1);
5140         preempt_enable_no_resched_notrace();
5141 
5142         /*
5143          * Check again in case we missed a preemption opportunity
5144          * between schedule and now.
5145          */
5146     } while (need_resched());
5147 }
5148 
5149 #ifdef CONFIG_PREEMPTION
5150 /*
5151  * This is the entry point to schedule() from in-kernel preemption
5152  * off of preempt_enable.
5153  */
preempt_schedule(void)5154 asmlinkage __visible void __sched notrace preempt_schedule(void)
5155 {
5156     /*
5157      * If there is a non-zero preempt_count or interrupts are disabled,
5158      * we do not want to preempt the current task. Just return..
5159      */
5160     if (likely(!preemptible())) {
5161         return;
5162     }
5163 
5164     preempt_schedule_common();
5165 }
5166 NOKPROBE_SYMBOL(preempt_schedule);
5167 EXPORT_SYMBOL(preempt_schedule);
5168 
5169 /**
5170  * preempt_schedule_notrace - preempt_schedule called by tracing
5171  *
5172  * The tracing infrastructure uses preempt_enable_notrace to prevent
5173  * recursion and tracing preempt enabling caused by the tracing
5174  * infrastructure itself. But as tracing can happen in areas coming
5175  * from userspace or just about to enter userspace, a preempt enable
5176  * can occur before user_exit() is called. This will cause the scheduler
5177  * to be called when the system is still in usermode.
5178  *
5179  * To prevent this, the preempt_enable_notrace will use this function
5180  * instead of preempt_schedule() to exit user context if needed before
5181  * calling the scheduler.
5182  */
preempt_schedule_notrace(void)5183 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
5184 {
5185     enum ctx_state prev_ctx;
5186 
5187     if (likely(!preemptible())) {
5188         return;
5189     }
5190 
5191     do {
5192         /*
5193          * Because the function tracer can trace preempt_count_sub()
5194          * and it also uses preempt_enable/disable_notrace(), if
5195          * NEED_RESCHED is set, the preempt_enable_notrace() called
5196          * by the function tracer will call this function again and
5197          * cause infinite recursion.
5198          *
5199          * Preemption must be disabled here before the function
5200          * tracer can trace. Break up preempt_disable() into two
5201          * calls. One to disable preemption without fear of being
5202          * traced. The other to still record the preemption latency,
5203          * which can also be traced by the function tracer.
5204          */
5205         preempt_disable_notrace();
5206         preempt_latency_start(1);
5207         /*
5208          * Needs preempt disabled in case user_exit() is traced
5209          * and the tracer calls preempt_enable_notrace() causing
5210          * an infinite recursion.
5211          */
5212         prev_ctx = exception_enter();
5213         __schedule(true);
5214         exception_exit(prev_ctx);
5215 
5216         preempt_latency_stop(1);
5217         preempt_enable_no_resched_notrace();
5218     } while (need_resched());
5219 }
5220 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
5221 
5222 #endif /* CONFIG_PREEMPTION */
5223 
5224 /*
5225  * This is the entry point to schedule() from kernel preemption
5226  * off of irq context.
5227  * Note, that this is called and return with irqs disabled. This will
5228  * protect us against recursive calling from irq.
5229  */
preempt_schedule_irq(void)5230 asmlinkage __visible void __sched preempt_schedule_irq(void)
5231 {
5232     enum ctx_state prev_state;
5233 
5234     /* Catch callers which need to be fixed */
5235     BUG_ON(preempt_count() || !irqs_disabled());
5236 
5237     prev_state = exception_enter();
5238 
5239     do {
5240         preempt_disable();
5241         local_irq_enable();
5242         __schedule(true);
5243         local_irq_disable();
5244         sched_preempt_enable_no_resched();
5245     } while (need_resched());
5246 
5247     exception_exit(prev_state);
5248 }
5249 
default_wake_function(wait_queue_entry_t * curr,unsigned mode,int wake_flags,void * key)5250 int default_wake_function(wait_queue_entry_t *curr, unsigned mode,
5251                           int wake_flags, void *key)
5252 {
5253     WARN_ON_ONCE(IS_ENABLED(CONFIG_SCHED_DEBUG) && (wake_flags & ~(WF_SYNC)));
5254     return try_to_wake_up(curr->private, mode, wake_flags);
5255 }
5256 EXPORT_SYMBOL(default_wake_function);
5257 
__setscheduler_prio(struct task_struct * p,int prio)5258 static void __setscheduler_prio(struct task_struct *p, int prio)
5259 {
5260     if (dl_prio(prio)) {
5261         p->sched_class = &dl_sched_class;
5262     } else if (rt_prio(prio)) {
5263         p->sched_class = &rt_sched_class;
5264     } else {
5265         p->sched_class = &fair_sched_class;
5266     }
5267 
5268     p->prio = prio;
5269 }
5270 
5271 #ifdef CONFIG_RT_MUTEXES
5272 
__rt_effective_prio(struct task_struct * pi_task,int prio)5273 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
5274 {
5275     if (pi_task) {
5276         prio = min(prio, pi_task->prio);
5277     }
5278 
5279     return prio;
5280 }
5281 
rt_effective_prio(struct task_struct * p,int prio)5282 static inline int rt_effective_prio(struct task_struct *p, int prio)
5283 {
5284     struct task_struct *pi_task = rt_mutex_get_top_task(p);
5285 
5286     return __rt_effective_prio(pi_task, prio);
5287 }
5288 
5289 /*
5290  * rt_mutex_setprio - set the current priority of a task
5291  * @p: task to boost
5292  * @pi_task: donor task
5293  *
5294  * This function changes the 'effective' priority of a task. It does
5295  * not touch ->normal_prio like __setscheduler().
5296  *
5297  * Used by the rt_mutex code to implement priority inheritance
5298  * logic. Call site only calls if the priority of the task changed.
5299  */
rt_mutex_setprio(struct task_struct * p,struct task_struct * pi_task)5300 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
5301 {
5302     int prio, oldprio, queued, running,
5303         queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5304     const struct sched_class *prev_class;
5305     struct rq_flags rf;
5306     struct rq *rq;
5307 
5308     /* XXX used to be waiter->prio, not waiter->task->prio */
5309     prio = __rt_effective_prio(pi_task, p->normal_prio);
5310     /*
5311      * If nothing changed; bail early.
5312      */
5313     if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio)) {
5314         return;
5315     }
5316 
5317     rq = __task_rq_lock(p, &rf);
5318     update_rq_clock(rq);
5319     /*
5320      * Set under pi_lock && rq->lock, such that the value can be used under
5321      * either lock.
5322      *
5323      * Note that there is loads of tricky to make this pointer cache work
5324      * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
5325      * ensure a task is de-boosted (pi_task is set to NULL) before the
5326      * task is allowed to run again (and can exit). This ensures the pointer
5327      * points to a blocked task -- which guaratees the task is present.
5328      */
5329     p->pi_top_task = pi_task;
5330 
5331     /*
5332      * For FIFO/RR we only need to set prio, if that matches we're done.
5333      */
5334     if (prio == p->prio && !dl_prio(prio)) {
5335         goto out_unlock;
5336     }
5337 
5338     /*
5339      * Idle task boosting is a nono in general. There is one
5340      * exception, when PREEMPT_RT and NOHZ is active:
5341      *
5342      * The idle task calls get_next_timer_interrupt() and holds
5343      * the timer wheel base->lock on the CPU and another CPU wants
5344      * to access the timer (probably to cancel it). We can safely
5345      * ignore the boosting request, as the idle CPU runs this code
5346      * with interrupts disabled and will complete the lock
5347      * protected section without being interrupted. So there is no
5348      * real need to boost.
5349      */
5350     if (unlikely(p == rq->idle)) {
5351         WARN_ON(p != rq->curr);
5352         WARN_ON(p->pi_blocked_on);
5353         goto out_unlock;
5354     }
5355 
5356     trace_sched_pi_setprio(p, pi_task);
5357     oldprio = p->prio;
5358 
5359     if (oldprio == prio) {
5360         queue_flag &= ~DEQUEUE_MOVE;
5361     }
5362 
5363     prev_class = p->sched_class;
5364     queued = task_on_rq_queued(p);
5365     running = task_current(rq, p);
5366     if (queued) {
5367         dequeue_task(rq, p, queue_flag);
5368     }
5369     if (running) {
5370         put_prev_task(rq, p);
5371     }
5372 
5373     /*
5374      * Boosting condition are:
5375      * 1. -rt task is running and holds mutex A
5376      *      --> -dl task blocks on mutex A
5377      *
5378      * 2. -dl task is running and holds mutex A
5379      *      --> -dl task blocks on mutex A and could preempt the
5380      *          running task
5381      */
5382     if (dl_prio(prio)) {
5383         if (!dl_prio(p->normal_prio) ||
5384             (pi_task && dl_prio(pi_task->prio) &&
5385              dl_entity_preempt(&pi_task->dl, &p->dl))) {
5386             p->dl.pi_se = pi_task->dl.pi_se;
5387             queue_flag |= ENQUEUE_REPLENISH;
5388         } else {
5389             p->dl.pi_se = &p->dl;
5390         }
5391     } else if (rt_prio(prio)) {
5392         if (dl_prio(oldprio)) {
5393             p->dl.pi_se = &p->dl;
5394         }
5395         if (oldprio < prio) {
5396             queue_flag |= ENQUEUE_HEAD;
5397         }
5398     } else {
5399         if (dl_prio(oldprio)) {
5400             p->dl.pi_se = &p->dl;
5401         }
5402         if (rt_prio(oldprio)) {
5403             p->rt.timeout = 0;
5404         }
5405     }
5406 
5407     __setscheduler_prio(p, prio);
5408 
5409     if (queued) {
5410         enqueue_task(rq, p, queue_flag);
5411     }
5412     if (running) {
5413         set_next_task(rq, p);
5414     }
5415 
5416     check_class_changed(rq, p, prev_class, oldprio);
5417 out_unlock:
5418     /* Avoid rq from going away on us: */
5419     preempt_disable();
5420     __task_rq_unlock(rq, &rf);
5421 
5422     balance_callback(rq);
5423     preempt_enable();
5424 }
5425 #else
rt_effective_prio(struct task_struct * p,int prio)5426 static inline int rt_effective_prio(struct task_struct *p, int prio)
5427 {
5428     return prio;
5429 }
5430 #endif
5431 
set_user_nice(struct task_struct * p,long nice)5432 void set_user_nice(struct task_struct *p, long nice)
5433 {
5434     bool queued, running;
5435     int old_prio;
5436     struct rq_flags rf;
5437     struct rq *rq;
5438 
5439     if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) {
5440         return;
5441     }
5442     /*
5443      * We have to be careful, if called from sys_setpriority(),
5444      * the task might be in the middle of scheduling on another CPU.
5445      */
5446     rq = task_rq_lock(p, &rf);
5447     update_rq_clock(rq);
5448 
5449     /*
5450      * The RT priorities are set via sched_setscheduler(), but we still
5451      * allow the 'normal' nice value to be set - but as expected
5452      * it wont have any effect on scheduling until the task is
5453      * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
5454      */
5455     if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
5456         p->static_prio = NICE_TO_PRIO(nice);
5457         goto out_unlock;
5458     }
5459     queued = task_on_rq_queued(p);
5460     running = task_current(rq, p);
5461     if (queued) {
5462         dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
5463     }
5464     if (running) {
5465         put_prev_task(rq, p);
5466     }
5467 
5468     p->static_prio = NICE_TO_PRIO(nice);
5469     set_load_weight(p);
5470     old_prio = p->prio;
5471     p->prio = effective_prio(p);
5472 
5473     if (queued) {
5474         enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5475     }
5476     if (running) {
5477         set_next_task(rq, p);
5478     }
5479 
5480     /*
5481      * If the task increased its priority or is running and
5482      * lowered its priority, then reschedule its CPU:
5483      */
5484     p->sched_class->prio_changed(rq, p, old_prio);
5485 
5486 out_unlock:
5487     task_rq_unlock(rq, p, &rf);
5488 }
5489 EXPORT_SYMBOL(set_user_nice);
5490 
5491 /*
5492  * can_nice - check if a task can reduce its nice value
5493  * @p: task
5494  * @nice: nice value
5495  */
can_nice(const struct task_struct * p,const int nice)5496 int can_nice(const struct task_struct *p, const int nice)
5497 {
5498     /* Convert nice value [19,-20] to rlimit style value [1,40]: */
5499     int nice_rlim = nice_to_rlimit(nice);
5500 
5501     return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || capable(CAP_SYS_NICE));
5502 }
5503 
5504 #ifdef __ARCH_WANT_SYS_NICE
5505 
5506 /*
5507  * sys_nice - change the priority of the current process.
5508  * @increment: priority increment
5509  *
5510  * sys_setpriority is a more generic, but much slower function that
5511  * does similar things.
5512  */
SYSCALL_DEFINE1(nice,int,increment)5513 SYSCALL_DEFINE1(nice, int, increment)
5514 {
5515     long nice, retval;
5516 
5517     /*
5518      * Setpriority might change our priority at the same moment.
5519      * We don't have to worry. Conceptually one call occurs first
5520      * and we have a single winner.
5521      */
5522     increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
5523     nice = task_nice(current) + increment;
5524 
5525     nice = clamp_val(nice, MIN_NICE, MAX_NICE);
5526     if (increment < 0 && !can_nice(current, nice)) {
5527         return -EPERM;
5528     }
5529 
5530     retval = security_task_setnice(current, nice);
5531     if (retval) {
5532         return retval;
5533     }
5534 
5535     set_user_nice(current, nice);
5536     return 0;
5537 }
5538 
5539 #endif
5540 
5541 /**
5542  * task_prio - return the priority value of a given task.
5543  * @p: the task in question.
5544  *
5545  * Return: The priority value as seen by users in /proc.
5546  * RT tasks are offset by -200. Normal tasks are centered
5547  * around 0, value goes from -16 to +15.
5548  */
task_prio(const struct task_struct * p)5549 int task_prio(const struct task_struct *p)
5550 {
5551     return p->prio - MAX_RT_PRIO;
5552 }
5553 
5554 /**
5555  * idle_cpu - is a given CPU idle currently?
5556  * @cpu: the processor in question.
5557  *
5558  * Return: 1 if the CPU is currently idle. 0 otherwise.
5559  */
idle_cpu(int cpu)5560 int idle_cpu(int cpu)
5561 {
5562     struct rq *rq = cpu_rq(cpu);
5563 
5564     if (rq->curr != rq->idle) {
5565         return 0;
5566     }
5567 
5568     if (rq->nr_running) {
5569         return 0;
5570     }
5571 
5572 #ifdef CONFIG_SMP
5573     if (rq->ttwu_pending) {
5574         return 0;
5575     }
5576 #endif
5577 
5578     return 1;
5579 }
5580 
5581 /**
5582  * available_idle_cpu - is a given CPU idle for enqueuing work.
5583  * @cpu: the CPU in question.
5584  *
5585  * Return: 1 if the CPU is currently idle. 0 otherwise.
5586  */
available_idle_cpu(int cpu)5587 int available_idle_cpu(int cpu)
5588 {
5589     if (!idle_cpu(cpu)) {
5590         return 0;
5591     }
5592 
5593     if (vcpu_is_preempted(cpu)) {
5594         return 0;
5595     }
5596 
5597     return 1;
5598 }
5599 
5600 /**
5601  * idle_task - return the idle task for a given CPU.
5602  * @cpu: the processor in question.
5603  *
5604  * Return: The idle task for the CPU @cpu.
5605  */
idle_task(int cpu)5606 struct task_struct *idle_task(int cpu)
5607 {
5608     return cpu_rq(cpu)->idle;
5609 }
5610 
5611 /**
5612  * find_process_by_pid - find a process with a matching PID value.
5613  * @pid: the pid in question.
5614  *
5615  * The task of @pid, if found. %NULL otherwise.
5616  */
find_process_by_pid(pid_t pid)5617 static struct task_struct *find_process_by_pid(pid_t pid)
5618 {
5619     return pid ? find_task_by_vpid(pid) : current;
5620 }
5621 
5622 /*
5623  * sched_setparam() passes in -1 for its policy, to let the functions
5624  * it calls know not to change it.
5625  */
5626 #define SETPARAM_POLICY (-1)
5627 
__setscheduler_params(struct task_struct * p,const struct sched_attr * attr)5628 static void __setscheduler_params(struct task_struct *p,
5629                                   const struct sched_attr *attr)
5630 {
5631     int policy = attr->sched_policy;
5632 
5633     if (policy == SETPARAM_POLICY) {
5634         policy = p->policy;
5635     }
5636 
5637     p->policy = policy;
5638 
5639     if (dl_policy(policy)) {
5640         __setparam_dl(p, attr);
5641     } else if (fair_policy(policy)) {
5642         p->static_prio = NICE_TO_PRIO(attr->sched_nice);
5643     }
5644 
5645     /*
5646      * __sched_setscheduler() ensures attr->sched_priority == 0 when
5647      * !rt_policy. Always setting this ensures that things like
5648      * getparam()/getattr() don't report silly values for !rt tasks.
5649      */
5650     p->rt_priority = attr->sched_priority;
5651     p->normal_prio = normal_prio(p);
5652     set_load_weight(p);
5653 }
5654 
5655 /*
5656  * Check the target process has a UID that matches the current process's:
5657  */
check_same_owner(struct task_struct * p)5658 static bool check_same_owner(struct task_struct *p)
5659 {
5660     const struct cred *cred = current_cred(), *pcred;
5661     bool match;
5662 
5663     rcu_read_lock();
5664     pcred = __task_cred(p);
5665     match = (uid_eq(cred->euid, pcred->euid) || uid_eq(cred->euid, pcred->uid));
5666     rcu_read_unlock();
5667     return match;
5668 }
5669 
__sched_setscheduler(struct task_struct * p,const struct sched_attr * attr,bool user,bool pi)5670 static int __sched_setscheduler(struct task_struct *p,
5671                                 const struct sched_attr *attr, bool user,
5672                                 bool pi)
5673 {
5674     int oldpolicy = -1, policy = attr->sched_policy;
5675     int retval, oldprio, newprio, queued, running;
5676     const struct sched_class *prev_class;
5677     struct rq_flags rf;
5678     int reset_on_fork;
5679     int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
5680     struct rq *rq;
5681 
5682     /* The pi code expects interrupts enabled */
5683     BUG_ON(pi && in_interrupt());
5684     while (1) {
5685         /* Double check policy once rq lock held: */
5686         if (policy < 0) {
5687             reset_on_fork = p->sched_reset_on_fork;
5688             policy = oldpolicy = p->policy;
5689         } else {
5690             reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
5691 
5692             if (!valid_policy(policy)) {
5693                 return -EINVAL;
5694             }
5695         }
5696 
5697         if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) {
5698             return -EINVAL;
5699         }
5700 
5701         /*
5702          * Valid priorities for SCHED_FIFO and SCHED_RR are
5703          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
5704          * SCHED_BATCH and SCHED_IDLE is 0.
5705          */
5706         if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO - 1) ||
5707             (!p->mm && attr->sched_priority > MAX_RT_PRIO - 1)) {
5708             return -EINVAL;
5709         }
5710         if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
5711             (rt_policy(policy) != (attr->sched_priority != 0))) {
5712             return -EINVAL;
5713         }
5714 
5715         /*
5716          * Allow unprivileged RT tasks to decrease priority:
5717          */
5718         if (user && !capable(CAP_SYS_NICE)) {
5719             if (fair_policy(policy)) {
5720                 if (attr->sched_nice < task_nice(p) &&
5721                     !can_nice(p, attr->sched_nice)) {
5722                     return -EPERM;
5723                 }
5724             }
5725 
5726             if (rt_policy(policy)) {
5727                 unsigned long rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
5728                 /* Can't set/change the rt policy: */
5729                 if (policy != p->policy && !rlim_rtprio) {
5730                     return -EPERM;
5731                 }
5732 
5733                 /* Can't increase priority: */
5734                 if (attr->sched_priority > p->rt_priority &&
5735                     attr->sched_priority > rlim_rtprio) {
5736                     return -EPERM;
5737                 }
5738             }
5739 
5740             /*
5741              * Can't set/change SCHED_DEADLINE policy at all for now
5742              * (safest behavior); in the future we would like to allow
5743              * unprivileged DL tasks to increase their relative deadline
5744              * or reduce their runtime (both ways reducing utilization)
5745              */
5746             if (dl_policy(policy)) {
5747                 return -EPERM;
5748             }
5749 
5750             /*
5751              * Treat SCHED_IDLE as nice 20. Only allow a switch to
5752              * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
5753              */
5754             if (task_has_idle_policy(p) && !idle_policy(policy)) {
5755                 if (!can_nice(p, task_nice(p))) {
5756                     return -EPERM;
5757                 }
5758             }
5759 
5760             /* Can't change other user's priorities: */
5761             if (!check_same_owner(p)) {
5762                 return -EPERM;
5763             }
5764 
5765             /* Normal users shall not reset the sched_reset_on_fork flag: */
5766             if (p->sched_reset_on_fork && !reset_on_fork) {
5767                 return -EPERM;
5768             }
5769         }
5770 
5771         if (user) {
5772             if (attr->sched_flags & SCHED_FLAG_SUGOV) {
5773                 return -EINVAL;
5774             }
5775 
5776             retval = security_task_setscheduler(p);
5777             if (retval) {
5778                 return retval;
5779             }
5780         }
5781 
5782         /* Update task specific "requested" clamps */
5783         if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
5784             retval = uclamp_validate(p, attr);
5785             if (retval) {
5786                 return retval;
5787             }
5788         }
5789 
5790         if (attr->sched_flags & SCHED_FLAG_LATENCY_NICE) {
5791             retval = latency_nice_validate(p, user, attr);
5792             if (retval) {
5793                 return retval;
5794             }
5795         }
5796 
5797         if (pi) {
5798             cpuset_read_lock();
5799         }
5800 
5801         /*
5802          * Make sure no PI-waiters arrive (or leave) while we are
5803          * changing the priority of the task:
5804          *
5805          * To be able to change p->policy safely, the appropriate
5806          * runqueue lock must be held.
5807          */
5808         rq = task_rq_lock(p, &rf);
5809         update_rq_clock(rq);
5810 
5811         /*
5812          * Changing the policy of the stop threads its a very bad idea:
5813          */
5814         if (p == rq->stop) {
5815             retval = -EINVAL;
5816             goto unlock;
5817         }
5818 
5819         /*
5820          * If not changing anything there's no need to proceed further,
5821          * but store a possible modification of reset_on_fork.
5822          */
5823         if (unlikely(policy == p->policy)) {
5824             if (fair_policy(policy) && attr->sched_nice != task_nice(p)) {
5825                 goto change;
5826             }
5827             if (rt_policy(policy) && attr->sched_priority != p->rt_priority) {
5828                 goto change;
5829             }
5830             if (dl_policy(policy) && dl_param_changed(p, attr)) {
5831                 goto change;
5832             }
5833             if (attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) {
5834                 goto change;
5835             }
5836 #ifdef CONFIG_SCHED_LATENCY_NICE
5837             if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
5838                 (attr->sched_latency_nice != LATENCY_TO_NICE(p->latency_prio))) {
5839                 goto change;
5840             }
5841 #endif
5842 
5843             p->sched_reset_on_fork = reset_on_fork;
5844             retval = 0;
5845             goto unlock;
5846         }
5847     change:
5848 
5849         if (user) {
5850 #ifdef CONFIG_RT_GROUP_SCHED
5851             /*
5852              * Do not allow realtime tasks into groups that have no runtime
5853              * assigned.
5854              */
5855             if (rt_bandwidth_enabled() && rt_policy(policy) &&
5856                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
5857                 !task_group_is_autogroup(task_group(p))) {
5858                 retval = -EPERM;
5859                 goto unlock;
5860             }
5861 #endif
5862 #ifdef CONFIG_SMP
5863             if (dl_bandwidth_enabled() && dl_policy(policy) &&
5864                 !(attr->sched_flags & SCHED_FLAG_SUGOV)) {
5865                 cpumask_t *span = rq->rd->span;
5866 
5867                 /*
5868                  * Don't allow tasks with an affinity mask smaller than
5869                  * the entire root_domain to become SCHED_DEADLINE. We
5870                  * will also fail if there's no bandwidth available.
5871                  */
5872                 if (!cpumask_subset(span, p->cpus_ptr) ||
5873                     rq->rd->dl_bw.bw == 0) {
5874                     retval = -EPERM;
5875                     goto unlock;
5876                 }
5877             }
5878 #endif
5879         }
5880 
5881         /* Re-check policy now with rq lock held: */
5882         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
5883             policy = oldpolicy = -1;
5884             task_rq_unlock(rq, p, &rf);
5885             if (pi) {
5886                 cpuset_read_unlock();
5887             }
5888             continue;
5889         }
5890         break;
5891     }
5892 
5893     /*
5894      * If setscheduling to SCHED_DEADLINE (or changing the parameters
5895      * of a SCHED_DEADLINE task) we need to check if enough bandwidth
5896      * is available.
5897      */
5898     if ((dl_policy(policy) || dl_task(p)) &&
5899         sched_dl_overflow(p, policy, attr)) {
5900         retval = -EBUSY;
5901         goto unlock;
5902     }
5903 
5904     p->sched_reset_on_fork = reset_on_fork;
5905     oldprio = p->prio;
5906 
5907     newprio = __normal_prio(policy, attr->sched_priority, attr->sched_nice);
5908     if (pi) {
5909         /*
5910          * Take priority boosted tasks into account. If the new
5911          * effective priority is unchanged, we just store the new
5912          * normal parameters and do not touch the scheduler class and
5913          * the runqueue. This will be done when the task deboost
5914          * itself.
5915          */
5916         newprio = rt_effective_prio(p, newprio);
5917         if (newprio == oldprio) {
5918             queue_flags &= ~DEQUEUE_MOVE;
5919         }
5920     }
5921 
5922     queued = task_on_rq_queued(p);
5923     running = task_current(rq, p);
5924     if (queued) {
5925         dequeue_task(rq, p, queue_flags);
5926     }
5927     if (running) {
5928         put_prev_task(rq, p);
5929     }
5930 
5931     prev_class = p->sched_class;
5932 
5933     if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
5934         __setscheduler_params(p, attr);
5935         __setscheduler_prio(p, newprio);
5936     }
5937     __setscheduler_latency(p, attr);
5938     __setscheduler_uclamp(p, attr);
5939 
5940     if (queued) {
5941         /*
5942          * We enqueue to tail when the priority of a task is
5943          * increased (user space view).
5944          */
5945         if (oldprio < p->prio) {
5946             queue_flags |= ENQUEUE_HEAD;
5947         }
5948 
5949         enqueue_task(rq, p, queue_flags);
5950     }
5951     if (running) {
5952         set_next_task(rq, p);
5953     }
5954 
5955     check_class_changed(rq, p, prev_class, oldprio);
5956 
5957     /* Avoid rq from going away on us: */
5958     preempt_disable();
5959     task_rq_unlock(rq, p, &rf);
5960 
5961     if (pi) {
5962         cpuset_read_unlock();
5963         rt_mutex_adjust_pi(p);
5964     }
5965 
5966     /* Run balance callbacks after we've adjusted the PI chain: */
5967     balance_callback(rq);
5968     preempt_enable();
5969 
5970     return 0;
5971 
5972 unlock:
5973     task_rq_unlock(rq, p, &rf);
5974     if (pi) {
5975         cpuset_read_unlock();
5976     }
5977     return retval;
5978 }
5979 
_sched_setscheduler(struct task_struct * p,int policy,const struct sched_param * param,bool check)5980 static int _sched_setscheduler(struct task_struct *p, int policy,
5981                                const struct sched_param *param, bool check)
5982 {
5983     struct sched_attr attr = {
5984         .sched_policy = policy,
5985         .sched_priority = param->sched_priority,
5986         .sched_nice = PRIO_TO_NICE(p->static_prio),
5987     };
5988 
5989     /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
5990     if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
5991         attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
5992         policy &= ~SCHED_RESET_ON_FORK;
5993         attr.sched_policy = policy;
5994     }
5995 
5996     return __sched_setscheduler(p, &attr, check, true);
5997 }
5998 /**
5999  * sched_setscheduler - change the scheduling policy and/or RT priority of a
6000  * thread.
6001  * @p: the task in question.
6002  * @policy: new policy.
6003  * @param: structure containing the new RT priority.
6004  *
6005  * Use sched_set_fifo(), read its comment.
6006  *
6007  * Return: 0 on success. An error code otherwise.
6008  *
6009  * NOTE that the task may be already dead.
6010  */
sched_setscheduler(struct task_struct * p,int policy,const struct sched_param * param)6011 int sched_setscheduler(struct task_struct *p, int policy,
6012                        const struct sched_param *param)
6013 {
6014     return _sched_setscheduler(p, policy, param, true);
6015 }
6016 EXPORT_SYMBOL_GPL(sched_setscheduler);
6017 
sched_setattr(struct task_struct * p,const struct sched_attr * attr)6018 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
6019 {
6020     return __sched_setscheduler(p, attr, true, true);
6021 }
6022 EXPORT_SYMBOL_GPL(sched_setattr);
6023 
sched_setattr_nocheck(struct task_struct * p,const struct sched_attr * attr)6024 int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr)
6025 {
6026     return __sched_setscheduler(p, attr, false, true);
6027 }
6028 EXPORT_SYMBOL_GPL(sched_setattr_nocheck);
6029 
6030 /**
6031  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority
6032  * of a thread from kernelspace.
6033  * @p: the task in question.
6034  * @policy: new policy.
6035  * @param: structure containing the new RT priority.
6036  *
6037  * Just like sched_setscheduler, only don't bother checking if the
6038  * current context has permission.  For example, this is needed in
6039  * stop_machine(): we create temporary high priority worker threads,
6040  * but our caller might not have that capability.
6041  *
6042  * Return: 0 on success. An error code otherwise.
6043  */
sched_setscheduler_nocheck(struct task_struct * p,int policy,const struct sched_param * param)6044 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
6045                                const struct sched_param *param)
6046 {
6047     return _sched_setscheduler(p, policy, param, false);
6048 }
6049 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
6050 
6051 /*
6052  * SCHED_FIFO is a broken scheduler model; that is, it is fundamentally
6053  * incapable of resource management, which is the one thing an OS really should
6054  * be doing.
6055  *
6056  * This is of course the reason it is limited to privileged users only.
6057  *
6058  * Worse still; it is fundamentally impossible to compose static priority
6059  * workloads. You cannot take two correctly working static prio workloads
6060  * and smash them together and still expect them to work.
6061  *
6062  * For this reason 'all' FIFO tasks the kernel creates are basically at:
6063  *
6064  *   MAX_RT_PRIO / 2
6065  *
6066  * The administrator _MUST_ configure the system, the kernel simply doesn't
6067  * know enough information to make a sensible choice.
6068  */
sched_set_fifo(struct task_struct * p)6069 void sched_set_fifo(struct task_struct *p)
6070 {
6071     struct sched_param sp = {.sched_priority = MAX_RT_PRIO / 2};
6072     WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6073 }
6074 EXPORT_SYMBOL_GPL(sched_set_fifo);
6075 
6076 /*
6077  * For when you don't much care about FIFO, but want to be above SCHED_NORMAL.
6078  */
sched_set_fifo_low(struct task_struct * p)6079 void sched_set_fifo_low(struct task_struct *p)
6080 {
6081     struct sched_param sp = {.sched_priority = 1};
6082     WARN_ON_ONCE(sched_setscheduler_nocheck(p, SCHED_FIFO, &sp) != 0);
6083 }
6084 EXPORT_SYMBOL_GPL(sched_set_fifo_low);
6085 
sched_set_normal(struct task_struct * p,int nice)6086 void sched_set_normal(struct task_struct *p, int nice)
6087 {
6088     struct sched_attr attr = {
6089         .sched_policy = SCHED_NORMAL,
6090         .sched_nice = nice,
6091     };
6092     WARN_ON_ONCE(sched_setattr_nocheck(p, &attr) != 0);
6093 }
6094 EXPORT_SYMBOL_GPL(sched_set_normal);
6095 
do_sched_setscheduler(pid_t pid,int policy,struct sched_param __user * param)6096 static int do_sched_setscheduler(pid_t pid, int policy,
6097                                  struct sched_param __user *param)
6098 {
6099     struct sched_param lparam;
6100     struct task_struct *p;
6101     int retval;
6102 
6103     if (!param || pid < 0) {
6104         return -EINVAL;
6105     }
6106     if (copy_from_user(&lparam, param, sizeof(struct sched_param))) {
6107         return -EFAULT;
6108     }
6109 
6110     rcu_read_lock();
6111     retval = -ESRCH;
6112     p = find_process_by_pid(pid);
6113     if (likely(p)) {
6114         get_task_struct(p);
6115     }
6116     rcu_read_unlock();
6117 
6118     if (likely(p)) {
6119         retval = sched_setscheduler(p, policy, &lparam);
6120         put_task_struct(p);
6121     }
6122 
6123     return retval;
6124 }
6125 
6126 /*
6127  * Mimics kernel/events/core.c perf_copy_attr().
6128  */
sched_copy_attr(struct sched_attr __user * uattr,struct sched_attr * attr)6129 static int sched_copy_attr(struct sched_attr __user *uattr,
6130                            struct sched_attr *attr)
6131 {
6132     u32 size;
6133     int ret;
6134 
6135     /* Zero the full structure, so that a short copy will be nice: */
6136     memset(attr, 0, sizeof(*attr));
6137 
6138     ret = get_user(size, &uattr->size);
6139     if (ret) {
6140         return ret;
6141     }
6142 
6143     /* ABI compatibility quirk: */
6144     if (!size) {
6145         size = SCHED_ATTR_SIZE_VER0;
6146     }
6147     if (size < SCHED_ATTR_SIZE_VER0 || size > PAGE_SIZE) {
6148         goto err_size;
6149     }
6150 
6151     ret = copy_struct_from_user(attr, sizeof(*attr), uattr, size);
6152     if (ret) {
6153         if (ret == -E2BIG) {
6154             goto err_size;
6155         }
6156         return ret;
6157     }
6158 
6159     if ((attr->sched_flags & SCHED_FLAG_UTIL_CLAMP) &&
6160         size < SCHED_ATTR_SIZE_VER1) {
6161         return -EINVAL;
6162     }
6163 
6164 #ifdef CONFIG_SCHED_LATENCY_NICE
6165     if ((attr->sched_flags & SCHED_FLAG_LATENCY_NICE) &&
6166         size < SCHED_ATTR_SIZE_VER2) {
6167         return -EINVAL;
6168     }
6169 #endif
6170     /*
6171      * XXX: Do we want to be lenient like existing syscalls; or do we want
6172      * to be strict and return an error on out-of-bounds values?
6173      */
6174     attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
6175 
6176     return 0;
6177 
6178 err_size:
6179     put_user(sizeof(*attr), &uattr->size);
6180     return -E2BIG;
6181 }
6182 
6183 /**
6184  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
6185  * @pid: the pid in question.
6186  * @policy: new policy.
6187  * @param: structure containing the new RT priority.
6188  *
6189  * Return: 0 on success. An error code otherwise.
6190  */
SYSCALL_DEFINE3(sched_setscheduler,pid_t,pid,int,policy,struct sched_param __user *,param)6191 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
6192                 struct sched_param __user *, param)
6193 {
6194     if (policy < 0) {
6195         return -EINVAL;
6196     }
6197 
6198     return do_sched_setscheduler(pid, policy, param);
6199 }
6200 
6201 /**
6202  * sys_sched_setparam - set/change the RT priority of a thread
6203  * @pid: the pid in question.
6204  * @param: structure containing the new RT priority.
6205  *
6206  * Return: 0 on success. An error code otherwise.
6207  */
SYSCALL_DEFINE2(sched_setparam,pid_t,pid,struct sched_param __user *,param)6208 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
6209 {
6210     return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
6211 }
6212 
6213 /**
6214  * sys_sched_setattr - same as above, but with extended sched_attr
6215  * @pid: the pid in question.
6216  * @uattr: structure containing the extended parameters.
6217  * @flags: for future extension.
6218  */
SYSCALL_DEFINE3(sched_setattr,pid_t,pid,struct sched_attr __user *,uattr,unsigned int,flags)6219 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
6220                 unsigned int, flags)
6221 {
6222     struct sched_attr attr;
6223     struct task_struct *p;
6224     int retval;
6225 
6226     if (!uattr || pid < 0 || flags) {
6227         return -EINVAL;
6228     }
6229 
6230     retval = sched_copy_attr(uattr, &attr);
6231     if (retval) {
6232         return retval;
6233     }
6234 
6235     if ((int)attr.sched_policy < 0) {
6236         return -EINVAL;
6237     }
6238     if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) {
6239         attr.sched_policy = SETPARAM_POLICY;
6240     }
6241 
6242     rcu_read_lock();
6243     retval = -ESRCH;
6244     p = find_process_by_pid(pid);
6245     if (likely(p)) {
6246         get_task_struct(p);
6247     }
6248     rcu_read_unlock();
6249 
6250     if (likely(p)) {
6251         retval = sched_setattr(p, &attr);
6252         put_task_struct(p);
6253     }
6254 
6255     return retval;
6256 }
6257 
6258 /**
6259  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
6260  * @pid: the pid in question.
6261  *
6262  * Return: On success, the policy of the thread. Otherwise, a negative error
6263  * code.
6264  */
SYSCALL_DEFINE1(sched_getscheduler,pid_t,pid)6265 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
6266 {
6267     struct task_struct *p;
6268     int retval;
6269 
6270     if (pid < 0) {
6271         return -EINVAL;
6272     }
6273 
6274     retval = -ESRCH;
6275     rcu_read_lock();
6276     p = find_process_by_pid(pid);
6277     if (p) {
6278         retval = security_task_getscheduler(p);
6279         if (!retval) {
6280             retval =
6281                 p->policy | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
6282         }
6283     }
6284     rcu_read_unlock();
6285     return retval;
6286 }
6287 
6288 /**
6289  * sys_sched_getparam - get the RT priority of a thread
6290  * @pid: the pid in question.
6291  * @param: structure containing the RT priority.
6292  *
6293  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
6294  * code.
6295  */
SYSCALL_DEFINE2(sched_getparam,pid_t,pid,struct sched_param __user *,param)6296 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
6297 {
6298     struct sched_param lp = {.sched_priority = 0};
6299     struct task_struct *p;
6300     int retval;
6301 
6302     if (!param || pid < 0) {
6303         return -EINVAL;
6304     }
6305 
6306     rcu_read_lock();
6307     p = find_process_by_pid(pid);
6308     retval = -ESRCH;
6309     if (!p) {
6310         goto out_unlock;
6311     }
6312 
6313     retval = security_task_getscheduler(p);
6314     if (retval) {
6315         goto out_unlock;
6316     }
6317 
6318     if (task_has_rt_policy(p)) {
6319         lp.sched_priority = p->rt_priority;
6320     }
6321     rcu_read_unlock();
6322 
6323     /*
6324      * This one might sleep, we cannot do it with a spinlock held ...
6325      */
6326     retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
6327 
6328     return retval;
6329 
6330 out_unlock:
6331     rcu_read_unlock();
6332     return retval;
6333 }
6334 
6335 /*
6336  * Copy the kernel size attribute structure (which might be larger
6337  * than what user-space knows about) to user-space.
6338  *
6339  * Note that all cases are valid: user-space buffer can be larger or
6340  * smaller than the kernel-space buffer. The usual case is that both
6341  * have the same size.
6342  */
sched_attr_copy_to_user(struct sched_attr __user * uattr,struct sched_attr * kattr,unsigned int usize)6343 static int sched_attr_copy_to_user(struct sched_attr __user *uattr,
6344                                    struct sched_attr *kattr, unsigned int usize)
6345 {
6346     unsigned int ksize = sizeof(*kattr);
6347 
6348     if (!access_ok(uattr, usize)) {
6349         return -EFAULT;
6350     }
6351 
6352     /*
6353      * sched_getattr() ABI forwards and backwards compatibility:
6354      *
6355      * If usize == ksize then we just copy everything to user-space and all is
6356      * good.
6357      *
6358      * If usize < ksize then we only copy as much as user-space has space for,
6359      * this keeps ABI compatibility as well. We skip the rest.
6360      *
6361      * If usize > ksize then user-space is using a newer version of the ABI,
6362      * which part the kernel doesn't know about. Just ignore it - tooling can
6363      * detect the kernel's knowledge of attributes from the attr->size value
6364      * which is set to ksize in this case.
6365      */
6366     kattr->size = min(usize, ksize);
6367 
6368     if (copy_to_user(uattr, kattr, kattr->size)) {
6369         return -EFAULT;
6370     }
6371 
6372     return 0;
6373 }
6374 
6375 /**
6376  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
6377  * @pid: the pid in question.
6378  * @uattr: structure containing the extended parameters.
6379  * @usize: sizeof(attr) for fwd/bwd comp.
6380  * @flags: for future extension.
6381  */
SYSCALL_DEFINE4(sched_getattr,pid_t,pid,struct sched_attr __user *,uattr,unsigned int,usize,unsigned int,flags)6382 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
6383                 unsigned int, usize, unsigned int, flags)
6384 {
6385     struct sched_attr kattr = {};
6386     struct task_struct *p;
6387     int retval;
6388 
6389     if (!uattr || pid < 0 || usize > PAGE_SIZE ||
6390         usize < SCHED_ATTR_SIZE_VER0 || flags) {
6391         return -EINVAL;
6392     }
6393 
6394     rcu_read_lock();
6395     p = find_process_by_pid(pid);
6396     retval = -ESRCH;
6397     if (!p) {
6398         goto out_unlock;
6399     }
6400 
6401     retval = security_task_getscheduler(p);
6402     if (retval) {
6403         goto out_unlock;
6404     }
6405 
6406     kattr.sched_policy = p->policy;
6407     if (p->sched_reset_on_fork) {
6408         kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
6409     }
6410     if (task_has_dl_policy(p)) {
6411         __getparam_dl(p, &kattr);
6412     } else if (task_has_rt_policy(p)) {
6413         kattr.sched_priority = p->rt_priority;
6414     } else {
6415         kattr.sched_nice = task_nice(p);
6416     }
6417 
6418 #ifdef CONFIG_SCHED_LATENCY_NICE
6419     kattr.sched_latency_nice = LATENCY_TO_NICE(p->latency_prio);
6420 #endif
6421 
6422 #ifdef CONFIG_UCLAMP_TASK
6423     /*
6424      * This could race with another potential updater, but this is fine
6425      * because it'll correctly read the old or the new value. We don't need
6426      * to guarantee who wins the race as long as it doesn't return garbage.
6427      */
6428     kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value;
6429     kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value;
6430 #endif
6431 
6432     rcu_read_unlock();
6433 
6434     return sched_attr_copy_to_user(uattr, &kattr, usize);
6435 
6436 out_unlock:
6437     rcu_read_unlock();
6438     return retval;
6439 }
6440 
sched_setaffinity(pid_t pid,const struct cpumask * in_mask)6441 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
6442 {
6443     cpumask_var_t cpus_allowed, new_mask;
6444     struct task_struct *p;
6445     int retval;
6446 #ifdef CONFIG_CPU_ISOLATION_OPT
6447     int dest_cpu;
6448     cpumask_t allowed_mask;
6449 #endif
6450 
6451     rcu_read_lock();
6452 
6453     p = find_process_by_pid(pid);
6454     if (!p) {
6455         rcu_read_unlock();
6456         return -ESRCH;
6457     }
6458 
6459     /* Prevent p going away */
6460     get_task_struct(p);
6461     rcu_read_unlock();
6462 
6463     if (p->flags & PF_NO_SETAFFINITY) {
6464         retval = -EINVAL;
6465         goto out_put_task;
6466     }
6467     if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
6468         retval = -ENOMEM;
6469         goto out_put_task;
6470     }
6471     if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6472         retval = -ENOMEM;
6473         goto out_free_cpus_allowed;
6474     }
6475     retval = -EPERM;
6476     if (!check_same_owner(p)) {
6477         rcu_read_lock();
6478         if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
6479             rcu_read_unlock();
6480             goto out_free_new_mask;
6481         }
6482         rcu_read_unlock();
6483     }
6484 
6485     retval = security_task_setscheduler(p);
6486     if (retval) {
6487         goto out_free_new_mask;
6488     }
6489 
6490     cpuset_cpus_allowed(p, cpus_allowed);
6491     cpumask_and(new_mask, in_mask, cpus_allowed);
6492 
6493     /*
6494      * Since bandwidth control happens on root_domain basis,
6495      * if admission test is enabled, we only admit -deadline
6496      * tasks allowed to run on all the CPUs in the task's
6497      * root_domain.
6498      */
6499 #ifdef CONFIG_SMP
6500     if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
6501         rcu_read_lock();
6502         if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
6503             retval = -EBUSY;
6504             rcu_read_unlock();
6505             goto out_free_new_mask;
6506         }
6507         rcu_read_unlock();
6508     }
6509 #endif
6510     while (1) {
6511 #ifdef CONFIG_CPU_ISOLATION_OPT
6512         cpumask_andnot(&allowed_mask, new_mask, cpu_isolated_mask);
6513         dest_cpu = cpumask_any_and(cpu_active_mask, &allowed_mask);
6514         if (dest_cpu < nr_cpu_ids) {
6515 #endif
6516             retval = __set_cpus_allowed_ptr(p, new_mask, true);
6517             if (!retval) {
6518                 cpuset_cpus_allowed(p, cpus_allowed);
6519                 if (!cpumask_subset(new_mask, cpus_allowed)) {
6520                     /*
6521                      * We must have raced with a concurrent cpuset
6522                      * update. Just reset the cpus_allowed to the
6523                      * cpuset's cpus_allowed
6524                      */
6525                     cpumask_copy(new_mask, cpus_allowed);
6526                     continue;
6527                 }
6528             }
6529 #ifdef CONFIG_CPU_ISOLATION_OPT
6530         } else {
6531             retval = -EINVAL;
6532         }
6533 #endif
6534         break;
6535     }
6536 
6537 out_free_new_mask:
6538     free_cpumask_var(new_mask);
6539 out_free_cpus_allowed:
6540     free_cpumask_var(cpus_allowed);
6541 out_put_task:
6542     put_task_struct(p);
6543     return retval;
6544 }
6545 
get_user_cpu_mask(unsigned long __user * user_mask_ptr,unsigned len,struct cpumask * new_mask)6546 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
6547                              struct cpumask *new_mask)
6548 {
6549     if (len < cpumask_size()) {
6550         cpumask_clear(new_mask);
6551     } else if (len > cpumask_size()) {
6552         len = cpumask_size();
6553     }
6554 
6555     return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
6556 }
6557 
6558 /**
6559  * sys_sched_setaffinity - set the CPU affinity of a process
6560  * @pid: pid of the process
6561  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6562  * @user_mask_ptr: user-space pointer to the new CPU mask
6563  *
6564  * Return: 0 on success. An error code otherwise.
6565  */
SYSCALL_DEFINE3(sched_setaffinity,pid_t,pid,unsigned int,len,unsigned long __user *,user_mask_ptr)6566 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
6567                 unsigned long __user *, user_mask_ptr)
6568 {
6569     cpumask_var_t new_mask;
6570     int retval;
6571 
6572     if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
6573         return -ENOMEM;
6574     }
6575 
6576     retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
6577     if (retval == 0) {
6578         retval = sched_setaffinity(pid, new_mask);
6579     }
6580     free_cpumask_var(new_mask);
6581     return retval;
6582 }
6583 
sched_getaffinity(pid_t pid,struct cpumask * mask)6584 long sched_getaffinity(pid_t pid, struct cpumask *mask)
6585 {
6586     struct task_struct *p;
6587     unsigned long flags;
6588     int retval;
6589 
6590     rcu_read_lock();
6591 
6592     retval = -ESRCH;
6593     p = find_process_by_pid(pid);
6594     if (!p) {
6595         goto out_unlock;
6596     }
6597 
6598     retval = security_task_getscheduler(p);
6599     if (retval) {
6600         goto out_unlock;
6601     }
6602 
6603     raw_spin_lock_irqsave(&p->pi_lock, flags);
6604     cpumask_and(mask, &p->cpus_mask, cpu_active_mask);
6605 
6606 #ifdef CONFIG_CPU_ISOLATION_OPT
6607     /* The userspace tasks are forbidden to run on
6608      * isolated CPUs. So exclude isolated CPUs from
6609      * the getaffinity.
6610      */
6611     if (!(p->flags & PF_KTHREAD)) {
6612         cpumask_andnot(mask, mask, cpu_isolated_mask);
6613     }
6614 #endif
6615 
6616     raw_spin_unlock_irqrestore(&p->pi_lock, flags);
6617 
6618 out_unlock:
6619     rcu_read_unlock();
6620 
6621     return retval;
6622 }
6623 
6624 /**
6625  * sys_sched_getaffinity - get the CPU affinity of a process
6626  * @pid: pid of the process
6627  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
6628  * @user_mask_ptr: user-space pointer to hold the current CPU mask
6629  *
6630  * Return: size of CPU mask copied to user_mask_ptr on success. An
6631  * error code otherwise.
6632  */
SYSCALL_DEFINE3(sched_getaffinity,pid_t,pid,unsigned int,len,unsigned long __user *,user_mask_ptr)6633 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
6634                 unsigned long __user *, user_mask_ptr)
6635 {
6636     int ret;
6637     cpumask_var_t mask;
6638 
6639     if ((len * BITS_PER_BYTE) < nr_cpu_ids) {
6640         return -EINVAL;
6641     }
6642     if (len & (sizeof(unsigned long) - 1)) {
6643         return -EINVAL;
6644     }
6645 
6646     if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
6647         return -ENOMEM;
6648     }
6649 
6650     ret = sched_getaffinity(pid, mask);
6651     if (ret == 0) {
6652         unsigned int retlen = min(len, cpumask_size());
6653 
6654         if (copy_to_user(user_mask_ptr, mask, retlen)) {
6655             ret = -EFAULT;
6656         } else {
6657             ret = retlen;
6658         }
6659     }
6660     free_cpumask_var(mask);
6661 
6662     return ret;
6663 }
6664 
6665 /**
6666  * sys_sched_yield - yield the current processor to other threads.
6667  *
6668  * This function yields the current CPU to other tasks. If there are no
6669  * other threads running on this CPU then this function will return.
6670  *
6671  * Return: 0.
6672  */
do_sched_yield(void)6673 static void do_sched_yield(void)
6674 {
6675     struct rq_flags rf;
6676     struct rq *rq;
6677 
6678     rq = this_rq_lock_irq(&rf);
6679 
6680     schedstat_inc(rq->yld_count);
6681     current->sched_class->yield_task(rq);
6682 
6683     preempt_disable();
6684     rq_unlock_irq(rq, &rf);
6685     sched_preempt_enable_no_resched();
6686 
6687     schedule();
6688 }
6689 
SYSCALL_DEFINE0(sched_yield)6690 SYSCALL_DEFINE0(sched_yield)
6691 {
6692     do_sched_yield();
6693     return 0;
6694 }
6695 
6696 #ifndef CONFIG_PREEMPTION
_cond_resched(void)6697 int __sched _cond_resched(void)
6698 {
6699     if (should_resched(0)) {
6700         preempt_schedule_common();
6701         return 1;
6702     }
6703     rcu_all_qs();
6704     return 0;
6705 }
6706 EXPORT_SYMBOL(_cond_resched);
6707 #endif
6708 
6709 /*
6710  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
6711  * call schedule, and on return reacquire the lock.
6712  *
6713  * This works OK both with and without CONFIG_PREEMPTION. We do strange
6714  * low-level operations here to prevent schedule() from being called twice (once
6715  * via spin_unlock(), once by hand).
6716  */
__cond_resched_lock(spinlock_t * lock)6717 int __cond_resched_lock(spinlock_t *lock)
6718 {
6719     int resched = should_resched(PREEMPT_LOCK_OFFSET);
6720     int ret = 0;
6721 
6722     lockdep_assert_held(lock);
6723 
6724     if (spin_needbreak(lock) || resched) {
6725         spin_unlock(lock);
6726         if (resched) {
6727             preempt_schedule_common();
6728         } else {
6729             cpu_relax();
6730         }
6731         ret = 1;
6732         spin_lock(lock);
6733     }
6734     return ret;
6735 }
6736 EXPORT_SYMBOL(__cond_resched_lock);
6737 
6738 /**
6739  * yield - yield the current processor to other threads.
6740  *
6741  * Do not ever use this function, there's a 99% chance you're doing it wrong.
6742  *
6743  * The scheduler is at all times free to pick the calling task as the most
6744  * eligible task to run, if removing the yield() call from your code breaks
6745  * it, its already broken.
6746  *
6747  * Typical broken usage is:
6748  *
6749  * while (!event)
6750  *    yield();
6751  *
6752  * where one assumes that yield() will let 'the other' process run that will
6753  * make event true. If the current task is a SCHED_FIFO task that will never
6754  * happen. Never use yield() as a progress guarantee!!
6755  *
6756  * If you want to use yield() to wait for something, use wait_event().
6757  * If you want to use yield() to be 'nice' for others, use cond_resched().
6758  * If you still want to use yield(), do not!
6759  */
yield(void)6760 void __sched yield(void)
6761 {
6762     set_current_state(TASK_RUNNING);
6763     do_sched_yield();
6764 }
6765 EXPORT_SYMBOL(yield);
6766 
6767 /**
6768  * yield_to - yield the current processor to another thread in
6769  * your thread group, or accelerate that thread toward the
6770  * processor it's on.
6771  * @p: target task
6772  * @preempt: whether task preemption is allowed or not
6773  *
6774  * It's the caller's job to ensure that the target task struct
6775  * can't go away on us before we can do any checks.
6776  *
6777  * Return:
6778  *    true (>0) if we indeed boosted the target task.
6779  *    false (0) if we failed to boost the target.
6780  *    -ESRCH if there's no task to yield to.
6781  */
yield_to(struct task_struct * p,bool preempt)6782 int __sched yield_to(struct task_struct *p, bool preempt)
6783 {
6784     struct task_struct *curr = current;
6785     struct rq *rq, *p_rq;
6786     unsigned long flags;
6787     int yielded = 0;
6788 
6789     local_irq_save(flags);
6790     rq = this_rq();
6791 
6792 again:
6793     p_rq = task_rq(p);
6794     /*
6795      * If we're the only runnable task on the rq and target rq also
6796      * has only one task, there's absolutely no point in yielding.
6797      */
6798     if (rq->nr_running == 1 && p_rq->nr_running == 1) {
6799         yielded = -ESRCH;
6800         goto out_irq;
6801     }
6802 
6803     double_rq_lock(rq, p_rq);
6804     if (task_rq(p) != p_rq) {
6805         double_rq_unlock(rq, p_rq);
6806         goto again;
6807     }
6808 
6809     if (!curr->sched_class->yield_to_task) {
6810         goto out_unlock;
6811     }
6812 
6813     if (curr->sched_class != p->sched_class) {
6814         goto out_unlock;
6815     }
6816 
6817     if (task_running(p_rq, p) || p->state) {
6818         goto out_unlock;
6819     }
6820 
6821     yielded = curr->sched_class->yield_to_task(rq, p);
6822     if (yielded) {
6823         schedstat_inc(rq->yld_count);
6824         /*
6825          * Make p's CPU reschedule; pick_next_entity takes care of
6826          * fairness.
6827          */
6828         if (preempt && rq != p_rq) {
6829             resched_curr(p_rq);
6830         }
6831     }
6832 
6833 out_unlock:
6834     double_rq_unlock(rq, p_rq);
6835 out_irq:
6836     local_irq_restore(flags);
6837 
6838     if (yielded > 0) {
6839         schedule();
6840     }
6841 
6842     return yielded;
6843 }
6844 EXPORT_SYMBOL_GPL(yield_to);
6845 
io_schedule_prepare(void)6846 int io_schedule_prepare(void)
6847 {
6848     int old_iowait = current->in_iowait;
6849 
6850     current->in_iowait = 1;
6851     blk_schedule_flush_plug(current);
6852 
6853     return old_iowait;
6854 }
6855 
io_schedule_finish(int token)6856 void io_schedule_finish(int token)
6857 {
6858     current->in_iowait = token;
6859 }
6860 
6861 /*
6862  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
6863  * that process accounting knows that this is a task in IO wait state.
6864  */
io_schedule_timeout(long timeout)6865 long __sched io_schedule_timeout(long timeout)
6866 {
6867     int token;
6868     long ret;
6869 
6870     token = io_schedule_prepare();
6871     ret = schedule_timeout(timeout);
6872     io_schedule_finish(token);
6873 
6874     return ret;
6875 }
6876 EXPORT_SYMBOL(io_schedule_timeout);
6877 
io_schedule(void)6878 void __sched io_schedule(void)
6879 {
6880     int token;
6881 
6882     token = io_schedule_prepare();
6883     schedule();
6884     io_schedule_finish(token);
6885 }
6886 EXPORT_SYMBOL(io_schedule);
6887 
6888 /**
6889  * sys_sched_get_priority_max - return maximum RT priority.
6890  * @policy: scheduling class.
6891  *
6892  * Return: On success, this syscall returns the maximum
6893  * rt_priority that can be used by a given scheduling class.
6894  * On failure, a negative error code is returned.
6895  */
SYSCALL_DEFINE1(sched_get_priority_max,int,policy)6896 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
6897 {
6898     int ret = -EINVAL;
6899 
6900     switch (policy) {
6901         case SCHED_FIFO:
6902         case SCHED_RR:
6903             ret = MAX_USER_RT_PRIO - 1;
6904             break;
6905         case SCHED_DEADLINE:
6906         case SCHED_NORMAL:
6907         case SCHED_BATCH:
6908         case SCHED_IDLE:
6909             ret = 0;
6910             break;
6911     }
6912     return ret;
6913 }
6914 
6915 /**
6916  * sys_sched_get_priority_min - return minimum RT priority.
6917  * @policy: scheduling class.
6918  *
6919  * Return: On success, this syscall returns the minimum
6920  * rt_priority that can be used by a given scheduling class.
6921  * On failure, a negative error code is returned.
6922  */
SYSCALL_DEFINE1(sched_get_priority_min,int,policy)6923 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
6924 {
6925     int ret = -EINVAL;
6926 
6927     switch (policy) {
6928         case SCHED_FIFO:
6929         case SCHED_RR:
6930             ret = 1;
6931             break;
6932         case SCHED_DEADLINE:
6933         case SCHED_NORMAL:
6934         case SCHED_BATCH:
6935         case SCHED_IDLE:
6936             ret = 0;
6937     }
6938     return ret;
6939 }
6940 
sched_rr_get_interval(pid_t pid,struct timespec64 * t)6941 static int sched_rr_get_interval(pid_t pid, struct timespec64 *t)
6942 {
6943     struct task_struct *p;
6944     unsigned int time_slice;
6945     struct rq_flags rf;
6946     struct rq *rq;
6947     int retval;
6948 
6949     if (pid < 0) {
6950         return -EINVAL;
6951     }
6952 
6953     retval = -ESRCH;
6954     rcu_read_lock();
6955     p = find_process_by_pid(pid);
6956     if (!p) {
6957         goto out_unlock;
6958     }
6959 
6960     retval = security_task_getscheduler(p);
6961     if (retval) {
6962         goto out_unlock;
6963     }
6964 
6965     rq = task_rq_lock(p, &rf);
6966     time_slice = 0;
6967     if (p->sched_class->get_rr_interval) {
6968         time_slice = p->sched_class->get_rr_interval(rq, p);
6969     }
6970     task_rq_unlock(rq, p, &rf);
6971 
6972     rcu_read_unlock();
6973     jiffies_to_timespec64(time_slice, t);
6974     return 0;
6975 
6976 out_unlock:
6977     rcu_read_unlock();
6978     return retval;
6979 }
6980 
6981 /**
6982  * sys_sched_rr_get_interval - return the default timeslice of a process.
6983  * @pid: pid of the process.
6984  * @interval: userspace pointer to the timeslice value.
6985  *
6986  * this syscall writes the default timeslice value of a given process
6987  * into the user-space timespec buffer. A value of '0' means infinity.
6988  *
6989  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
6990  * an error code.
6991  */
SYSCALL_DEFINE2(sched_rr_get_interval,pid_t,pid,struct __kernel_timespec __user *,interval)6992 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
6993                 struct __kernel_timespec __user *, interval)
6994 {
6995     struct timespec64 t;
6996     int retval = sched_rr_get_interval(pid, &t);
6997 
6998     if (retval == 0) {
6999         retval = put_timespec64(&t, interval);
7000     }
7001 
7002     return retval;
7003 }
7004 
7005 #ifdef CONFIG_COMPAT_32BIT_TIME
SYSCALL_DEFINE2(sched_rr_get_interval_time32,pid_t,pid,struct old_timespec32 __user *,interval)7006 SYSCALL_DEFINE2(sched_rr_get_interval_time32, pid_t, pid,
7007                 struct old_timespec32 __user *, interval)
7008 {
7009     struct timespec64 t;
7010     int retval = sched_rr_get_interval(pid, &t);
7011 
7012     if (retval == 0) {
7013         retval = put_old_timespec32(&t, interval);
7014     }
7015     return retval;
7016 }
7017 #endif
7018 
sched_show_task(struct task_struct * p)7019 void sched_show_task(struct task_struct *p)
7020 {
7021     unsigned long free = 0;
7022     int ppid;
7023 
7024     if (!try_get_task_stack(p)) {
7025         return;
7026     }
7027 
7028     pr_info("task:%-15.15s state:%c", p->comm, task_state_to_char(p));
7029 
7030     if (p->state == TASK_RUNNING) {
7031         pr_cont("  running task    ");
7032     }
7033 #ifdef CONFIG_DEBUG_STACK_USAGE
7034     free = stack_not_used(p);
7035 #endif
7036     ppid = 0;
7037     rcu_read_lock();
7038     if (pid_alive(p)) {
7039         ppid = task_pid_nr(rcu_dereference(p->real_parent));
7040     }
7041     rcu_read_unlock();
7042     pr_cont(" stack:%5lu pid:%5d ppid:%6d flags:0x%08lx\n", free,
7043             task_pid_nr(p), ppid, (unsigned long)task_thread_info(p)->flags);
7044 
7045     print_worker_info(KERN_INFO, p);
7046     show_stack(p, NULL, KERN_INFO);
7047     put_task_stack(p);
7048 }
7049 EXPORT_SYMBOL_GPL(sched_show_task);
7050 
state_filter_match(unsigned long state_filter,struct task_struct * p)7051 static inline bool state_filter_match(unsigned long state_filter,
7052                                       struct task_struct *p)
7053 {
7054     /* no filter, everything matches */
7055     if (!state_filter) {
7056         return true;
7057     }
7058 
7059     /* filter, but doesn't match */
7060     if (!(p->state & state_filter)) {
7061         return false;
7062     }
7063 
7064     /*
7065      * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
7066      * TASK_KILLABLE).
7067      */
7068     if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) {
7069         return false;
7070     }
7071 
7072     return true;
7073 }
7074 
show_state_filter(unsigned long state_filter)7075 void show_state_filter(unsigned long state_filter)
7076 {
7077     struct task_struct *g, *p;
7078 
7079     rcu_read_lock();
7080     for_each_process_thread(g, p)
7081     {
7082         /*
7083          * reset the NMI-timeout, listing all files on a slow
7084          * console might take a lot of time:
7085          * Also, reset softlockup watchdogs on all CPUs, because
7086          * another CPU might be blocked waiting for us to process
7087          * an IPI.
7088          */
7089         touch_nmi_watchdog();
7090         touch_all_softlockup_watchdogs();
7091         if (state_filter_match(state_filter, p)) {
7092             sched_show_task(p);
7093         }
7094     }
7095 
7096 #ifdef CONFIG_SCHED_DEBUG
7097     if (!state_filter) {
7098         sysrq_sched_debug_show();
7099     }
7100 #endif
7101     rcu_read_unlock();
7102     /*
7103      * Only show locks if all tasks are dumped:
7104      */
7105     if (!state_filter) {
7106         debug_show_all_locks();
7107     }
7108 }
7109 
7110 /**
7111  * init_idle - set up an idle thread for a given CPU
7112  * @idle: task in question
7113  * @cpu: CPU the idle task belongs to
7114  *
7115  * NOTE: this function does not set the idle thread's NEED_RESCHED
7116  * flag, to make booting more robust.
7117  */
init_idle(struct task_struct * idle,int cpu)7118 void __init init_idle(struct task_struct *idle, int cpu)
7119 {
7120     struct rq *rq = cpu_rq(cpu);
7121     unsigned long flags;
7122 
7123     __sched_fork(0, idle);
7124 
7125     raw_spin_lock_irqsave(&idle->pi_lock, flags);
7126     raw_spin_lock(&rq->lock);
7127 
7128     idle->state = TASK_RUNNING;
7129     idle->se.exec_start = sched_clock();
7130     idle->flags |= PF_IDLE;
7131 
7132 
7133 #ifdef CONFIG_SMP
7134     /*
7135      * Its possible that init_idle() gets called multiple times on a task,
7136      * in that case do_set_cpus_allowed() will not do the right thing.
7137      *
7138      * And since this is boot we can forgo the serialization.
7139      */
7140     set_cpus_allowed_common(idle, cpumask_of(cpu));
7141 #endif
7142     /*
7143      * We're having a chicken and egg problem, even though we are
7144      * holding rq->lock, the CPU isn't yet set to this CPU so the
7145      * lockdep check in task_group() will fail.
7146      *
7147      * Similar case to sched_fork(). / Alternatively we could
7148      * use task_rq_lock() here and obtain the other rq->lock.
7149      *
7150      * Silence PROVE_RCU
7151      */
7152     rcu_read_lock();
7153     __set_task_cpu(idle, cpu);
7154     rcu_read_unlock();
7155 
7156     rq->idle = idle;
7157     rcu_assign_pointer(rq->curr, idle);
7158     idle->on_rq = TASK_ON_RQ_QUEUED;
7159 #ifdef CONFIG_SMP
7160     idle->on_cpu = 1;
7161 #endif
7162     raw_spin_unlock(&rq->lock);
7163     raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
7164 
7165     /* Set the preempt count _outside_ the spinlocks! */
7166     init_idle_preempt_count(idle, cpu);
7167 
7168     /*
7169      * The idle tasks have their own, simple scheduling class:
7170      */
7171     idle->sched_class = &idle_sched_class;
7172     ftrace_graph_init_idle_task(idle, cpu);
7173     vtime_init_idle(idle, cpu);
7174 #ifdef CONFIG_SMP
7175     sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
7176 #endif
7177 }
7178 
7179 #ifdef CONFIG_SMP
7180 
cpuset_cpumask_can_shrink(const struct cpumask * cur,const struct cpumask * trial)7181 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
7182                               const struct cpumask *trial)
7183 {
7184     int ret = 1;
7185 
7186     if (!cpumask_weight(cur)) {
7187         return ret;
7188     }
7189 
7190     ret = dl_cpuset_cpumask_can_shrink(cur, trial);
7191 
7192     return ret;
7193 }
7194 
task_can_attach(struct task_struct * p,const struct cpumask * cs_effective_cpus)7195 int task_can_attach(struct task_struct *p,
7196                     const struct cpumask *cs_effective_cpus)
7197 {
7198     int ret = 0;
7199 
7200     /*
7201      * Kthreads which disallow setaffinity shouldn't be moved
7202      * to a new cpuset; we don't want to change their CPU
7203      * affinity and isolating such threads by their set of
7204      * allowed nodes is unnecessary.  Thus, cpusets are not
7205      * applicable for such threads.  This prevents checking for
7206      * success of set_cpus_allowed_ptr() on all attached tasks
7207      * before cpus_mask may be changed.
7208      */
7209     if (p->flags & PF_NO_SETAFFINITY) {
7210         ret = -EINVAL;
7211         goto out;
7212     }
7213 
7214     if (dl_task(p) &&
7215         !cpumask_intersects(task_rq(p)->rd->span, cs_effective_cpus)) {
7216         int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
7217         if (unlikely(cpu >= nr_cpu_ids))
7218             return -EINVAL;
7219         ret = dl_cpu_busy(cpu, p);
7220     }
7221 
7222 out:
7223     return ret;
7224 }
7225 
7226 bool sched_smp_initialized __read_mostly;
7227 
7228 #ifdef CONFIG_NUMA_BALANCING
7229 /* Migrate current task p to target_cpu */
migrate_task_to(struct task_struct * p,int target_cpu)7230 int migrate_task_to(struct task_struct *p, int target_cpu)
7231 {
7232     struct migration_arg arg = {p, target_cpu};
7233     int curr_cpu = task_cpu(p);
7234     if (curr_cpu == target_cpu) {
7235         return 0;
7236     }
7237 
7238     if (!cpumask_test_cpu(target_cpu, p->cpus_ptr)) {
7239         return -EINVAL;
7240     }
7241 
7242     /* This is not properly updating schedstats */
7243 
7244     trace_sched_move_numa(p, curr_cpu, target_cpu);
7245     return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
7246 }
7247 
7248 /*
7249  * Requeue a task on a given node and accurately track the number of NUMA
7250  * tasks on the runqueues
7251  */
sched_setnuma(struct task_struct * p,int nid)7252 void sched_setnuma(struct task_struct *p, int nid)
7253 {
7254     bool queued, running;
7255     struct rq_flags rf;
7256     struct rq *rq;
7257 
7258     rq = task_rq_lock(p, &rf);
7259     queued = task_on_rq_queued(p);
7260     running = task_current(rq, p);
7261 
7262     if (queued) {
7263         dequeue_task(rq, p, DEQUEUE_SAVE);
7264     }
7265     if (running) {
7266         put_prev_task(rq, p);
7267     }
7268 
7269     p->numa_preferred_nid = nid;
7270 
7271     if (queued) {
7272         enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
7273     }
7274     if (running) {
7275         set_next_task(rq, p);
7276     }
7277     task_rq_unlock(rq, p, &rf);
7278 }
7279 #endif /* CONFIG_NUMA_BALANCING */
7280 
7281 #ifdef CONFIG_HOTPLUG_CPU
7282 /*
7283  * Ensure that the idle task is using init_mm right before its CPU goes
7284  * offline.
7285  */
idle_task_exit(void)7286 void idle_task_exit(void)
7287 {
7288     struct mm_struct *mm = current->active_mm;
7289 
7290     BUG_ON(cpu_online(smp_processor_id()));
7291     BUG_ON(current != this_rq()->idle);
7292 
7293     if (mm != &init_mm) {
7294         switch_mm(mm, &init_mm, current);
7295         finish_arch_post_lock_switch();
7296     }
7297 
7298     /* finish_cpu(), as ran on the BP, will clean up the active_mm state */
7299 }
7300 
7301 /*
7302  * Since this CPU is going 'away' for a while, fold any nr_active delta
7303  * we might have. Assumes we're called after migrate_tasks() so that the
7304  * nr_active count is stable. We need to take the teardown thread which
7305  * is calling this into account, so we hand in adjust = 1 to the load
7306  * calculation.
7307  *
7308  * Also see the comment "Global load-average calculations".
7309  */
calc_load_migrate(struct rq * rq)7310 static void calc_load_migrate(struct rq *rq)
7311 {
7312     long delta = calc_load_fold_active(rq, 1);
7313     if (delta) {
7314         atomic_long_add(delta, &calc_load_tasks);
7315     }
7316 }
7317 
__pick_migrate_task(struct rq * rq)7318 static struct task_struct *__pick_migrate_task(struct rq *rq)
7319 {
7320     const struct sched_class *class;
7321     struct task_struct *next;
7322 
7323     for_each_class(class)
7324     {
7325         next = class->pick_next_task(rq);
7326         if (next) {
7327             next->sched_class->put_prev_task(rq, next);
7328             return next;
7329         }
7330     }
7331 
7332     /* The idle class should always have a runnable task */
7333     BUG();
7334 }
7335 
7336 #ifdef CONFIG_CPU_ISOLATION_OPT
7337 /*
7338  * Remove a task from the runqueue and pretend that it's migrating. This
7339  * should prevent migrations for the detached task and disallow further
7340  * changes to tsk_cpus_allowed.
7341  */
detach_one_task_core(struct task_struct * p,struct rq * rq,struct list_head * tasks)7342 static void detach_one_task_core(struct task_struct *p, struct rq *rq,
7343                                  struct list_head *tasks)
7344 {
7345     lockdep_assert_held(&rq->lock);
7346 
7347     p->on_rq = TASK_ON_RQ_MIGRATING;
7348     deactivate_task(rq, p, 0);
7349     list_add(&p->se.group_node, tasks);
7350 }
7351 
attach_tasks_core(struct list_head * tasks,struct rq * rq)7352 static void attach_tasks_core(struct list_head *tasks, struct rq *rq)
7353 {
7354     struct task_struct *p;
7355 
7356     lockdep_assert_held(&rq->lock);
7357 
7358     while (!list_empty(tasks)) {
7359         p = list_first_entry(tasks, struct task_struct, se.group_node);
7360         list_del_init(&p->se.group_node);
7361 
7362         BUG_ON(task_rq(p) != rq);
7363         activate_task(rq, p, 0);
7364         p->on_rq = TASK_ON_RQ_QUEUED;
7365     }
7366 }
7367 
7368 #else
7369 
detach_one_task_core(struct task_struct * p,struct rq * rq,struct list_head * tasks)7370 static void detach_one_task_core(struct task_struct *p, struct rq *rq,
7371                                  struct list_head *tasks)
7372 {
7373 }
7374 
attach_tasks_core(struct list_head * tasks,struct rq * rq)7375 static void attach_tasks_core(struct list_head *tasks, struct rq *rq)
7376 {
7377 }
7378 
7379 #endif /* CONFIG_CPU_ISOLATION_OPT */
7380 
7381 /*
7382  * Migrate all tasks (not pinned if pinned argument say so) from the rq,
7383  * sleeping tasks will be migrated by try_to_wake_up()->select_task_rq().
7384  *
7385  * Called with rq->lock held even though we'er in stop_machine() and
7386  * there's no concurrency possible, we hold the required locks anyway
7387  * because of lock validation efforts.
7388  */
migrate_tasks(struct rq * dead_rq,struct rq_flags * rf,bool migrate_pinned_tasks)7389 void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf,
7390                    bool migrate_pinned_tasks)
7391 {
7392     struct rq *rq = dead_rq;
7393     struct task_struct *next, *stop = rq->stop;
7394     struct rq_flags orf = *rf;
7395     int dest_cpu;
7396     unsigned int num_pinned_kthreads = 1; /* this thread */
7397     LIST_HEAD(tasks);
7398     cpumask_t avail_cpus;
7399 
7400 #ifdef CONFIG_CPU_ISOLATION_OPT
7401     cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
7402 #else
7403     cpumask_copy(&avail_cpus, cpu_online_mask);
7404 #endif
7405 
7406     /*
7407      * Fudge the rq selection such that the below task selection loop
7408      * doesn't get stuck on the currently eligible stop task.
7409      *
7410      * We're currently inside stop_machine() and the rq is either stuck
7411      * in the stop_machine_cpu_stop() loop, or we're executing this code,
7412      * either way we should never end up calling schedule() until we're
7413      * done here.
7414      */
7415     rq->stop = NULL;
7416 
7417     /*
7418      * put_prev_task() and pick_next_task() sched
7419      * class method both need to have an up-to-date
7420      * value of rq->clock[_task]
7421      */
7422     update_rq_clock(rq);
7423 
7424 #ifdef CONFIG_SCHED_DEBUG
7425     /* note the clock update in orf */
7426     orf.clock_update_flags |= RQCF_UPDATED;
7427 #endif
7428 
7429     for (;;) {
7430         /*
7431          * There's this thread running, bail when that's the only
7432          * remaining thread.
7433          */
7434         if (rq->nr_running == 1) {
7435             break;
7436         }
7437 
7438         next = __pick_migrate_task(rq);
7439         if (!migrate_pinned_tasks && (next->flags & PF_KTHREAD) &&
7440             !cpumask_intersects(&avail_cpus, &next->cpus_mask)) {
7441             detach_one_task_core(next, rq, &tasks);
7442             num_pinned_kthreads += 1;
7443             continue;
7444         }
7445 
7446         /*
7447          * Rules for changing task_struct::cpus_mask are holding
7448          * both pi_lock and rq->lock, such that holding either
7449          * stabilizes the mask.
7450          *
7451          * Drop rq->lock is not quite as disastrous as it usually is
7452          * because !cpu_active at this point, which means load-balance
7453          * will not interfere. Also, stop-machine.
7454          */
7455         rq_unlock(rq, rf);
7456         raw_spin_lock(&next->pi_lock);
7457         rq_relock(rq, rf);
7458         if (!(rq->clock_update_flags & RQCF_UPDATED)) {
7459             update_rq_clock(rq);
7460         }
7461 
7462         /*
7463          * Since we're inside stop-machine, _nothing_ should have
7464          * changed the task, WARN if weird stuff happened, because in
7465          * that case the above rq->lock drop is a fail too.
7466          * However, during cpu isolation the load balancer might have
7467          * interferred since we don't stop all CPUs. Ignore warning for
7468          * this case.
7469          */
7470         if (task_rq(next) != rq || !task_on_rq_queued(next)) {
7471             WARN_ON(migrate_pinned_tasks);
7472             raw_spin_unlock(&next->pi_lock);
7473             continue;
7474         }
7475 
7476         /* Find suitable destination for @next, with force if needed. */
7477 #ifdef CONFIG_CPU_ISOLATION_OPT
7478         dest_cpu = select_fallback_rq(dead_rq->cpu, next, false);
7479 #else
7480         dest_cpu = select_fallback_rq(dead_rq->cpu, next);
7481 #endif
7482         rq = __migrate_task(rq, rf, next, dest_cpu);
7483         if (rq != dead_rq) {
7484             rq_unlock(rq, rf);
7485             rq = dead_rq;
7486             *rf = orf;
7487             rq_relock(rq, rf);
7488             if (!(rq->clock_update_flags & RQCF_UPDATED)) {
7489                 update_rq_clock(rq);
7490             }
7491         }
7492         raw_spin_unlock(&next->pi_lock);
7493     }
7494 
7495     rq->stop = stop;
7496 
7497     if (num_pinned_kthreads > 1) {
7498         attach_tasks_core(&tasks, rq);
7499     }
7500 }
7501 
7502 #ifdef CONFIG_SCHED_EAS
clear_eas_migration_request(int cpu)7503 static void clear_eas_migration_request(int cpu)
7504 {
7505     struct rq *rq = cpu_rq(cpu);
7506     unsigned long flags;
7507 
7508     clear_reserved(cpu);
7509     if (rq->push_task) {
7510         struct task_struct *push_task = NULL;
7511 
7512         raw_spin_lock_irqsave(&rq->lock, flags);
7513         if (rq->push_task) {
7514             clear_reserved(rq->push_cpu);
7515             push_task = rq->push_task;
7516             rq->push_task = NULL;
7517         }
7518         rq->active_balance = 0;
7519         raw_spin_unlock_irqrestore(&rq->lock, flags);
7520         if (push_task) {
7521             put_task_struct(push_task);
7522         }
7523     }
7524 }
7525 #else
clear_eas_migration_request(int cpu)7526 static inline void clear_eas_migration_request(int cpu)
7527 {
7528 }
7529 #endif
7530 
7531 #ifdef CONFIG_CPU_ISOLATION_OPT
do_isolation_work_cpu_stop(void * data)7532 int do_isolation_work_cpu_stop(void *data)
7533 {
7534     unsigned int cpu = smp_processor_id();
7535     struct rq *rq = cpu_rq(cpu);
7536     struct rq_flags rf;
7537 
7538     watchdog_disable(cpu);
7539 
7540     local_irq_disable();
7541 
7542     irq_migrate_all_off_this_cpu();
7543 
7544     flush_smp_call_function_from_idle();
7545 
7546     /* Update our root-domain */
7547     rq_lock(rq, &rf);
7548 
7549     /*
7550      * Temporarily mark the rq as offline. This will allow us to
7551      * move tasks off the CPU.
7552      */
7553     if (rq->rd) {
7554         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7555         set_rq_offline(rq);
7556     }
7557 
7558     migrate_tasks(rq, &rf, false);
7559 
7560     if (rq->rd) {
7561         set_rq_online(rq);
7562     }
7563     rq_unlock(rq, &rf);
7564 
7565     clear_eas_migration_request(cpu);
7566     local_irq_enable();
7567     return 0;
7568 }
7569 
do_unisolation_work_cpu_stop(void * data)7570 int do_unisolation_work_cpu_stop(void *data)
7571 {
7572     watchdog_enable(smp_processor_id());
7573     return 0;
7574 }
7575 
sched_update_group_capacities(int cpu)7576 static void sched_update_group_capacities(int cpu)
7577 {
7578     struct sched_domain *sd;
7579 
7580     mutex_lock(&sched_domains_mutex);
7581     rcu_read_lock();
7582 
7583     for_each_domain(cpu, sd)
7584     {
7585         int balance_cpu = group_balance_cpu(sd->groups);
7586 
7587         init_sched_groups_capacity(cpu, sd);
7588         /*
7589          * Need to ensure this is also called with balancing
7590          * cpu.
7591          */
7592         if (cpu != balance_cpu) {
7593             init_sched_groups_capacity(balance_cpu, sd);
7594         }
7595     }
7596 
7597     rcu_read_unlock();
7598     mutex_unlock(&sched_domains_mutex);
7599 }
7600 
7601 static unsigned int cpu_isolation_vote[NR_CPUS];
7602 
sched_isolate_count(const cpumask_t * mask,bool include_offline)7603 int sched_isolate_count(const cpumask_t *mask, bool include_offline)
7604 {
7605     cpumask_t count_mask = CPU_MASK_NONE;
7606 
7607     if (include_offline) {
7608         cpumask_complement(&count_mask, cpu_online_mask);
7609         cpumask_or(&count_mask, &count_mask, cpu_isolated_mask);
7610         cpumask_and(&count_mask, &count_mask, mask);
7611     } else {
7612         cpumask_and(&count_mask, mask, cpu_isolated_mask);
7613     }
7614 
7615     return cpumask_weight(&count_mask);
7616 }
7617 
7618 /*
7619  * 1) CPU is isolated and cpu is offlined:
7620  *    Unisolate the core.
7621  * 2) CPU is not isolated and CPU is offlined:
7622  *    No action taken.
7623  * 3) CPU is offline and request to isolate
7624  *    Request ignored.
7625  * 4) CPU is offline and isolated:
7626  *    Not a possible state.
7627  * 5) CPU is online and request to isolate
7628  *    Normal case: Isolate the CPU
7629  * 6) CPU is not isolated and comes back online
7630  *    Nothing to do
7631  *
7632  * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY
7633  * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
7634  * Client is also responsible for unisolating when a core goes offline
7635  * (after CPU is marked offline).
7636  */
sched_isolate_cpu(int cpu)7637 int sched_isolate_cpu(int cpu)
7638 {
7639     struct rq *rq;
7640     cpumask_t avail_cpus;
7641     int ret_code = 0;
7642     u64 start_time = 0;
7643 
7644     if (trace_sched_isolate_enabled()) {
7645         start_time = sched_clock();
7646     }
7647 
7648     cpu_maps_update_begin();
7649 
7650     cpumask_andnot(&avail_cpus, cpu_online_mask, cpu_isolated_mask);
7651 
7652     if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) ||
7653         !cpu_online(cpu) || cpu >= NR_CPUS) {
7654         ret_code = -EINVAL;
7655         goto out;
7656     }
7657 
7658     rq = cpu_rq(cpu);
7659 
7660     if (++cpu_isolation_vote[cpu] > 1) {
7661         goto out;
7662     }
7663 
7664     /* We cannot isolate ALL cpus in the system */
7665     if (cpumask_weight(&avail_cpus) == 1) {
7666         --cpu_isolation_vote[cpu];
7667         ret_code = -EINVAL;
7668         goto out;
7669     }
7670 
7671     /*
7672      * There is a race between watchdog being enabled by hotplug and
7673      * core isolation disabling the watchdog. When a CPU is hotplugged in
7674      * and the hotplug lock has been released the watchdog thread might
7675      * not have run yet to enable the watchdog.
7676      * We have to wait for the watchdog to be enabled before proceeding.
7677      */
7678     if (!watchdog_configured(cpu)) {
7679         msleep(0x14);
7680         if (!watchdog_configured(cpu)) {
7681             --cpu_isolation_vote[cpu];
7682             ret_code = -EBUSY;
7683             goto out;
7684         }
7685     }
7686 
7687     set_cpu_isolated(cpu, true);
7688     cpumask_clear_cpu(cpu, &avail_cpus);
7689 
7690     /* Migrate timers */
7691     smp_call_function_any(&avail_cpus, hrtimer_quiesce_cpu, &cpu, 1);
7692     smp_call_function_any(&avail_cpus, timer_quiesce_cpu, &cpu, 1);
7693 
7694     watchdog_disable(cpu);
7695     irq_lock_sparse();
7696     stop_cpus(cpumask_of(cpu), do_isolation_work_cpu_stop, 0);
7697     irq_unlock_sparse();
7698 
7699     calc_load_migrate(rq);
7700     update_max_interval();
7701     sched_update_group_capacities(cpu);
7702 
7703 out:
7704     cpu_maps_update_done();
7705     trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], start_time, 1);
7706     return ret_code;
7707 }
7708 
7709 /*
7710  * Note: The client calling sched_isolate_cpu() is repsonsible for ONLY
7711  * calling sched_unisolate_cpu() on a CPU that the client previously isolated.
7712  * Client is also responsible for unisolating when a core goes offline
7713  * (after CPU is marked offline).
7714  */
sched_unisolate_cpu_unlocked(int cpu)7715 int sched_unisolate_cpu_unlocked(int cpu)
7716 {
7717     int ret_code = 0;
7718     u64 start_time = 0;
7719 
7720     if (cpu < 0 || cpu >= nr_cpu_ids || !cpu_possible(cpu) || cpu >= NR_CPUS) {
7721         ret_code = -EINVAL;
7722         goto out;
7723     }
7724 
7725     if (trace_sched_isolate_enabled()) {
7726         start_time = sched_clock();
7727     }
7728 
7729     if (!cpu_isolation_vote[cpu]) {
7730         ret_code = -EINVAL;
7731         goto out;
7732     }
7733 
7734     if (--cpu_isolation_vote[cpu]) {
7735         goto out;
7736     }
7737 
7738     set_cpu_isolated(cpu, false);
7739     update_max_interval();
7740     sched_update_group_capacities(cpu);
7741 
7742     if (cpu_online(cpu)) {
7743         stop_cpus(cpumask_of(cpu), do_unisolation_work_cpu_stop, 0);
7744 
7745         /* Kick CPU to immediately do load balancing */
7746         if (!atomic_fetch_or(NOHZ_KICK_MASK, nohz_flags(cpu))) {
7747             smp_send_reschedule(cpu);
7748         }
7749     }
7750 
7751 out:
7752     trace_sched_isolate(cpu, cpumask_bits(cpu_isolated_mask)[0], start_time, 0);
7753     return ret_code;
7754 }
7755 
sched_unisolate_cpu(int cpu)7756 int sched_unisolate_cpu(int cpu)
7757 {
7758     int ret_code;
7759 
7760     cpu_maps_update_begin();
7761     ret_code = sched_unisolate_cpu_unlocked(cpu);
7762     cpu_maps_update_done();
7763     return ret_code;
7764 }
7765 
7766 #endif /* CONFIG_CPU_ISOLATION_OPT */
7767 
7768 #endif /* CONFIG_HOTPLUG_CPU */
7769 
set_rq_online(struct rq * rq)7770 void set_rq_online(struct rq *rq)
7771 {
7772     if (!rq->online) {
7773         const struct sched_class *class;
7774 
7775         cpumask_set_cpu(rq->cpu, rq->rd->online);
7776         rq->online = 1;
7777 
7778         for_each_class(class)
7779         {
7780             if (class->rq_online) {
7781                 class->rq_online(rq);
7782             }
7783         }
7784     }
7785 }
7786 
set_rq_offline(struct rq * rq)7787 void set_rq_offline(struct rq *rq)
7788 {
7789     if (rq->online) {
7790         const struct sched_class *class;
7791 
7792         for_each_class(class)
7793         {
7794             if (class->rq_offline) {
7795                 class->rq_offline(rq);
7796             }
7797         }
7798 
7799         cpumask_clear_cpu(rq->cpu, rq->rd->online);
7800         rq->online = 0;
7801     }
7802 }
7803 
7804 /*
7805  * used to mark begin/end of suspend/resume:
7806  */
7807 static int num_cpus_frozen;
7808 
7809 /*
7810  * Update cpusets according to cpu_active mask.  If cpusets are
7811  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
7812  * around partition_sched_domains().
7813  *
7814  * If we come here as part of a suspend/resume, don't touch cpusets because we
7815  * want to restore it back to its original state upon resume anyway.
7816  */
cpuset_cpu_active(void)7817 static void cpuset_cpu_active(void)
7818 {
7819     if (cpuhp_tasks_frozen) {
7820         /*
7821          * num_cpus_frozen tracks how many CPUs are involved in suspend
7822          * resume sequence. As long as this is not the last online
7823          * operation in the resume sequence, just build a single sched
7824          * domain, ignoring cpusets.
7825          */
7826         partition_sched_domains(1, NULL, NULL);
7827         if (--num_cpus_frozen) {
7828             return;
7829         }
7830         /*
7831          * This is the last CPU online operation. So fall through and
7832          * restore the original sched domains by considering the
7833          * cpuset configurations.
7834          */
7835         cpuset_force_rebuild();
7836     }
7837     cpuset_update_active_cpus();
7838 }
7839 
cpuset_cpu_inactive(unsigned int cpu)7840 static int cpuset_cpu_inactive(unsigned int cpu)
7841 {
7842     if (!cpuhp_tasks_frozen) {
7843         int ret = dl_cpu_busy(cpu, NULL);
7844         if (ret) {
7845             return ret;
7846         }
7847         cpuset_update_active_cpus();
7848     } else {
7849         num_cpus_frozen++;
7850         partition_sched_domains(1, NULL, NULL);
7851     }
7852     return 0;
7853 }
7854 
sched_cpu_activate(unsigned int cpu)7855 int sched_cpu_activate(unsigned int cpu)
7856 {
7857     struct rq *rq = cpu_rq(cpu);
7858     struct rq_flags rf;
7859 
7860 #ifdef CONFIG_SCHED_SMT
7861     /*
7862      * When going up, increment the number of cores with SMT present.
7863      */
7864     if (cpumask_weight(cpu_smt_mask(cpu)) == 0x2) {
7865         static_branch_inc_cpuslocked(&sched_smt_present);
7866     }
7867 #endif
7868     set_cpu_active(cpu, true);
7869 
7870     if (sched_smp_initialized) {
7871         sched_domains_numa_masks_set(cpu);
7872         cpuset_cpu_active();
7873     }
7874 
7875     /*
7876      * Put the rq online, if not already. This happens:
7877      *
7878      * 1) In the early boot process, because we build the real domains
7879      *    after all CPUs have been brought up.
7880      *
7881      * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
7882      *    domains.
7883      */
7884     rq_lock_irqsave(rq, &rf);
7885     if (rq->rd) {
7886         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7887         set_rq_online(rq);
7888     }
7889     rq_unlock_irqrestore(rq, &rf);
7890 
7891     return 0;
7892 }
7893 
sched_cpu_deactivate(unsigned int cpu)7894 int sched_cpu_deactivate(unsigned int cpu)
7895 {
7896     int ret;
7897 
7898     set_cpu_active(cpu, false);
7899     /*
7900      * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
7901      * users of this state to go away such that all new such users will
7902      * observe it.
7903      *
7904      * Do sync before park smpboot threads to take care the rcu boost case.
7905      */
7906     synchronize_rcu();
7907 
7908 #ifdef CONFIG_SCHED_SMT
7909     /*
7910      * When going down, decrement the number of cores with SMT present.
7911      */
7912     if (cpumask_weight(cpu_smt_mask(cpu)) == 0x2) {
7913         static_branch_dec_cpuslocked(&sched_smt_present);
7914     }
7915 #endif
7916 
7917     if (!sched_smp_initialized) {
7918         return 0;
7919     }
7920 
7921     ret = cpuset_cpu_inactive(cpu);
7922     if (ret) {
7923         set_cpu_active(cpu, true);
7924         return ret;
7925     }
7926     sched_domains_numa_masks_clear(cpu);
7927     return 0;
7928 }
7929 
sched_rq_cpu_starting(unsigned int cpu)7930 static void sched_rq_cpu_starting(unsigned int cpu)
7931 {
7932     struct rq *rq = cpu_rq(cpu);
7933     unsigned long flags;
7934 
7935     raw_spin_lock_irqsave(&rq->lock, flags);
7936     set_window_start(rq);
7937     raw_spin_unlock_irqrestore(&rq->lock, flags);
7938 
7939     rq->calc_load_update = calc_load_update;
7940     update_max_interval();
7941 }
7942 
sched_cpu_starting(unsigned int cpu)7943 int sched_cpu_starting(unsigned int cpu)
7944 {
7945     sched_rq_cpu_starting(cpu);
7946     sched_tick_start(cpu);
7947     clear_eas_migration_request(cpu);
7948     return 0;
7949 }
7950 
7951 #ifdef CONFIG_HOTPLUG_CPU
sched_cpu_dying(unsigned int cpu)7952 int sched_cpu_dying(unsigned int cpu)
7953 {
7954     struct rq *rq = cpu_rq(cpu);
7955     struct rq_flags rf;
7956 
7957     /* Handle pending wakeups and then migrate everything off */
7958     sched_tick_stop(cpu);
7959 
7960     rq_lock_irqsave(rq, &rf);
7961 
7962     if (rq->rd) {
7963         BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
7964         set_rq_offline(rq);
7965     }
7966     migrate_tasks(rq, &rf, true);
7967     BUG_ON(rq->nr_running != 1);
7968     rq_unlock_irqrestore(rq, &rf);
7969 
7970     clear_eas_migration_request(cpu);
7971 
7972     calc_load_migrate(rq);
7973     update_max_interval();
7974     nohz_balance_exit_idle(rq);
7975     hrtick_clear(rq);
7976     return 0;
7977 }
7978 #endif
7979 
sched_init_smp(void)7980 void __init sched_init_smp(void)
7981 {
7982     sched_init_numa();
7983 
7984     /*
7985      * There's no userspace yet to cause hotplug operations; hence all the
7986      * CPU masks are stable and all blatant races in the below code cannot
7987      * happen.
7988      */
7989     mutex_lock(&sched_domains_mutex);
7990     sched_init_domains(cpu_active_mask);
7991     mutex_unlock(&sched_domains_mutex);
7992 
7993     update_cluster_topology();
7994 
7995     /* Move init over to a non-isolated CPU */
7996     if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) <
7997         0) {
7998         BUG();
7999     }
8000     sched_init_granularity();
8001 
8002     init_sched_rt_class();
8003     init_sched_dl_class();
8004 
8005     sched_smp_initialized = true;
8006 }
8007 
migration_init(void)8008 static int __init migration_init(void)
8009 {
8010     sched_cpu_starting(smp_processor_id());
8011     return 0;
8012 }
8013 early_initcall(migration_init);
8014 
8015 #else
sched_init_smp(void)8016 void __init sched_init_smp(void)
8017 {
8018     sched_init_granularity();
8019 }
8020 #endif /* CONFIG_SMP */
8021 
in_sched_functions(unsigned long addr)8022 int in_sched_functions(unsigned long addr)
8023 {
8024     return in_lock_functions(addr) ||
8025            (addr >= (unsigned long)__sched_text_start &&
8026             addr < (unsigned long)__sched_text_end);
8027 }
8028 
8029 #ifdef CONFIG_CGROUP_SCHED
8030 /*
8031  * Default task group.
8032  * Every task in system belongs to this group at bootup.
8033  */
8034 struct task_group root_task_group;
8035 LIST_HEAD(task_groups);
8036 
8037 /* Cacheline aligned slab cache for task_group */
8038 static struct kmem_cache *task_group_cache __read_mostly;
8039 #endif
8040 
8041 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
8042 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
8043 
sched_init(void)8044 void __init sched_init(void)
8045 {
8046     unsigned long ptr = 0;
8047     int i;
8048 
8049     /* Make sure the linker didn't screw up */
8050     BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
8051            &fair_sched_class + 1 != &rt_sched_class ||
8052            &rt_sched_class + 1 != &dl_sched_class);
8053 #ifdef CONFIG_SMP
8054     BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
8055 #endif
8056 
8057     wait_bit_init();
8058 
8059     init_clusters();
8060 
8061 #ifdef CONFIG_FAIR_GROUP_SCHED
8062     ptr += 2 * nr_cpu_ids * sizeof(void **);
8063 #endif
8064 #ifdef CONFIG_RT_GROUP_SCHED
8065     ptr += 2 * nr_cpu_ids * sizeof(void **);
8066 #endif
8067     if (ptr) {
8068         ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT);
8069 
8070 #ifdef CONFIG_FAIR_GROUP_SCHED
8071         root_task_group.se = (struct sched_entity **)ptr;
8072         ptr += nr_cpu_ids * sizeof(void **);
8073 
8074         root_task_group.cfs_rq = (struct cfs_rq **)ptr;
8075         ptr += nr_cpu_ids * sizeof(void **);
8076 
8077         root_task_group.shares = ROOT_TASK_GROUP_LOAD;
8078         init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
8079 #endif /* CONFIG_FAIR_GROUP_SCHED */
8080 #ifdef CONFIG_RT_GROUP_SCHED
8081         root_task_group.rt_se = (struct sched_rt_entity **)ptr;
8082         ptr += nr_cpu_ids * sizeof(void **);
8083 
8084         root_task_group.rt_rq = (struct rt_rq **)ptr;
8085         ptr += nr_cpu_ids * sizeof(void **);
8086 
8087 #endif /* CONFIG_RT_GROUP_SCHED */
8088     }
8089 #ifdef CONFIG_CPUMASK_OFFSTACK
8090     for_each_possible_cpu(i)
8091     {
8092         per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
8093             cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8094         per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
8095             cpumask_size(), GFP_KERNEL, cpu_to_node(i));
8096     }
8097 #endif /* CONFIG_CPUMASK_OFFSTACK */
8098 
8099     init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(),
8100                       global_rt_runtime());
8101     init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(),
8102                       global_rt_runtime());
8103 
8104 #ifdef CONFIG_SMP
8105     init_defrootdomain();
8106 #endif
8107 
8108 #ifdef CONFIG_RT_GROUP_SCHED
8109     init_rt_bandwidth(&root_task_group.rt_bandwidth, global_rt_period(),
8110                       global_rt_runtime());
8111 #endif /* CONFIG_RT_GROUP_SCHED */
8112 
8113 #ifdef CONFIG_CGROUP_SCHED
8114     task_group_cache = KMEM_CACHE(task_group, 0);
8115 
8116     list_add(&root_task_group.list, &task_groups);
8117     INIT_LIST_HEAD(&root_task_group.children);
8118     INIT_LIST_HEAD(&root_task_group.siblings);
8119     autogroup_init(&init_task);
8120 #endif /* CONFIG_CGROUP_SCHED */
8121 
8122     for_each_possible_cpu(i)
8123     {
8124         struct rq *rq;
8125 
8126         rq = cpu_rq(i);
8127         raw_spin_lock_init(&rq->lock);
8128         rq->nr_running = 0;
8129         rq->calc_load_active = 0;
8130         rq->calc_load_update = jiffies + LOAD_FREQ;
8131         init_cfs_rq(&rq->cfs);
8132         init_rt_rq(&rq->rt);
8133         init_dl_rq(&rq->dl);
8134 #ifdef CONFIG_FAIR_GROUP_SCHED
8135         INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
8136         rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
8137         /*
8138          * How much CPU bandwidth does root_task_group get?
8139          *
8140          * In case of task-groups formed thr' the cgroup filesystem, it
8141          * gets 100% of the CPU resources in the system. This overall
8142          * system CPU resource is divided among the tasks of
8143          * root_task_group and its child task-groups in a fair manner,
8144          * based on each entity's (task or task-group's) weight
8145          * (se->load.weight).
8146          *
8147          * In other words, if root_task_group has 10 tasks of weight
8148          * 1024) and two child groups A0 and A1 (of weight 1024 each),
8149          * then A0's share of the CPU resource is:
8150          *
8151          *    A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
8152          *
8153          * We achieve this by letting root_task_group's tasks sit
8154          * directly in rq->cfs (i.e root_task_group->se[] = NULL).
8155          */
8156         init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
8157 #endif /* CONFIG_FAIR_GROUP_SCHED */
8158 
8159         rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
8160 #ifdef CONFIG_RT_GROUP_SCHED
8161         init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
8162 #endif
8163 #ifdef CONFIG_SMP
8164         rq->sd = NULL;
8165         rq->rd = NULL;
8166         rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
8167         rq->balance_callback = NULL;
8168         rq->active_balance = 0;
8169         rq->next_balance = jiffies;
8170         rq->push_cpu = 0;
8171         rq->cpu = i;
8172         rq->online = 0;
8173         rq->idle_stamp = 0;
8174         rq->avg_idle = 2 * sysctl_sched_migration_cost;
8175         rq->max_idle_balance_cost = sysctl_sched_migration_cost;
8176         walt_sched_init_rq(rq);
8177 
8178         INIT_LIST_HEAD(&rq->cfs_tasks);
8179 
8180         rq_attach_root(rq, &def_root_domain);
8181 #ifdef CONFIG_NO_HZ_COMMON
8182         rq->last_blocked_load_update_tick = jiffies;
8183         atomic_set(&rq->nohz_flags, 0);
8184 
8185         rq_csd_init(rq, &rq->nohz_csd, nohz_csd_func);
8186 #endif
8187 #endif /* CONFIG_SMP */
8188         hrtick_rq_init(rq);
8189         atomic_set(&rq->nr_iowait, 0);
8190     }
8191 
8192     BUG_ON(alloc_related_thread_groups());
8193     set_load_weight(&init_task);
8194     /*
8195      * The boot idle thread does lazy MMU switching as well:
8196      */
8197     mmgrab(&init_mm);
8198     enter_lazy_tlb(&init_mm, current);
8199 
8200     /*
8201      * Make us the idle thread. Technically, schedule() should not be
8202      * called from this thread, however somewhere below it might be,
8203      * but because we are the idle thread, we just pick up running again
8204      * when this runqueue becomes "idle".
8205      */
8206     init_idle(current, smp_processor_id());
8207     init_new_task_load(current);
8208 
8209     calc_load_update = jiffies + LOAD_FREQ;
8210 
8211 #ifdef CONFIG_SMP
8212     idle_thread_set_boot_cpu();
8213 #endif
8214     init_sched_fair_class();
8215 
8216     init_schedstats();
8217 
8218     psi_init();
8219 
8220     init_uclamp();
8221 
8222     scheduler_running = 1;
8223 }
8224 
8225 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
preempt_count_equals(int preempt_offset)8226 static inline int preempt_count_equals(int preempt_offset)
8227 {
8228     int nested = preempt_count() + rcu_preempt_depth();
8229 
8230     return (nested == preempt_offset);
8231 }
8232 
__might_sleep(const char * file,int line,int preempt_offset)8233 void __might_sleep(const char *file, int line, int preempt_offset)
8234 {
8235     /*
8236      * Blocking primitives will set (and therefore destroy) current->state,
8237      * since we will exit with TASK_RUNNING make sure we enter with it,
8238      * otherwise we will destroy state.
8239      */
8240     WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
8241               "do not call blocking ops when !TASK_RUNNING; "
8242               "state=%lx set at [<%p>] %pS\n",
8243               current->state, (void *)current->task_state_change,
8244               (void *)current->task_state_change);
8245 
8246     ___might_sleep(file, line, preempt_offset);
8247 }
8248 EXPORT_SYMBOL(__might_sleep);
8249 
___might_sleep(const char * file,int line,int preempt_offset)8250 void ___might_sleep(const char *file, int line, int preempt_offset)
8251 {
8252     /* Ratelimiting timestamp: */
8253     static unsigned long prev_jiffy;
8254 
8255     unsigned long preempt_disable_ip;
8256 
8257     /* WARN_ON_ONCE() by default, no rate limit required: */
8258     rcu_sleep_check();
8259 
8260     if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
8261          !is_idle_task(current) && !current->non_block_count) ||
8262         system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
8263         oops_in_progress) {
8264         return;
8265     }
8266 
8267     if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) {
8268         return;
8269     }
8270     prev_jiffy = jiffies;
8271 
8272     /* Save this before calling printk(), since that will clobber it: */
8273     preempt_disable_ip = get_preempt_disable_ip(current);
8274 
8275     printk(KERN_ERR
8276            "BUG: sleeping function called from invalid context at %s:%d\n",
8277            file, line);
8278     printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, non_block: %d, pid: "
8279                     "%d, name: %s\n",
8280            in_atomic(), irqs_disabled(), current->non_block_count, current->pid,
8281            current->comm);
8282 
8283     if (task_stack_end_corrupted(current)) {
8284         printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
8285     }
8286 
8287     debug_show_held_locks(current);
8288     if (irqs_disabled()) {
8289         print_irqtrace_events(current);
8290     }
8291     if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) &&
8292         !preempt_count_equals(preempt_offset)) {
8293         pr_err("Preemption disabled at:");
8294         print_ip_sym(KERN_ERR, preempt_disable_ip);
8295     }
8296     dump_stack();
8297     add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8298 }
8299 EXPORT_SYMBOL(___might_sleep);
8300 
__cant_sleep(const char * file,int line,int preempt_offset)8301 void __cant_sleep(const char *file, int line, int preempt_offset)
8302 {
8303     static unsigned long prev_jiffy;
8304 
8305     if (irqs_disabled()) {
8306         return;
8307     }
8308 
8309     if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) {
8310         return;
8311     }
8312 
8313     if (preempt_count() > preempt_offset) {
8314         return;
8315     }
8316 
8317     if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) {
8318         return;
8319     }
8320     prev_jiffy = jiffies;
8321 
8322     printk(KERN_ERR "BUG: assuming atomic context at %s:%d\n", file, line);
8323     printk(KERN_ERR "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8324            in_atomic(), irqs_disabled(), current->pid, current->comm);
8325 
8326     debug_show_held_locks(current);
8327     dump_stack();
8328     add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
8329 }
8330 EXPORT_SYMBOL_GPL(__cant_sleep);
8331 #endif
8332 
8333 #ifdef CONFIG_MAGIC_SYSRQ
normalize_rt_tasks(void)8334 void normalize_rt_tasks(void)
8335 {
8336     struct task_struct *g, *p;
8337     struct sched_attr attr = {
8338         .sched_policy = SCHED_NORMAL,
8339     };
8340 
8341     read_lock(&tasklist_lock);
8342     for_each_process_thread(g, p)
8343     {
8344         /*
8345          * Only normalize user tasks:
8346          */
8347         if (p->flags & PF_KTHREAD) {
8348             continue;
8349         }
8350 
8351         p->se.exec_start = 0;
8352         schedstat_set(p->se.statistics.wait_start, 0);
8353         schedstat_set(p->se.statistics.sleep_start, 0);
8354         schedstat_set(p->se.statistics.block_start, 0);
8355 
8356         if (!dl_task(p) && !rt_task(p)) {
8357             /*
8358              * Renice negative nice level userspace
8359              * tasks back to 0:
8360              */
8361             if (task_nice(p) < 0) {
8362                 set_user_nice(p, 0);
8363             }
8364             continue;
8365         }
8366 
8367         __sched_setscheduler(p, &attr, false, false);
8368     }
8369     read_unlock(&tasklist_lock);
8370 }
8371 
8372 #endif /* CONFIG_MAGIC_SYSRQ */
8373 
8374 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
8375 /*
8376  * These functions are only useful for the IA64 MCA handling, or kdb.
8377  *
8378  * They can only be called when the whole system has been
8379  * stopped - every CPU needs to be quiescent, and no scheduling
8380  * activity can take place. Using them for anything else would
8381  * be a serious bug, and as a result, they aren't even visible
8382  * under any other configuration.
8383  */
8384 
8385 /**
8386  * curr_task - return the current task for a given CPU.
8387  * @cpu: the processor in question.
8388  *
8389  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8390  *
8391  * Return: The current task for @cpu.
8392  */
curr_task(int cpu)8393 struct task_struct *curr_task(int cpu)
8394 {
8395     return cpu_curr(cpu);
8396 }
8397 
8398 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
8399 
8400 #ifdef CONFIG_IA64
8401 /**
8402  * ia64_set_curr_task - set the current task for a given CPU.
8403  * @cpu: the processor in question.
8404  * @p: the task pointer to set.
8405  *
8406  * Description: This function must only be used when non-maskable interrupts
8407  * are serviced on a separate stack. It allows the architecture to switch the
8408  * notion of the current task on a CPU in a non-blocking manner. This function
8409  * must be called with all CPU's synchronized, and interrupts disabled, the
8410  * and caller must save the original value of the current task (see
8411  * curr_task() above) and restore that value before reenabling interrupts and
8412  * re-starting the system.
8413  *
8414  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
8415  */
ia64_set_curr_task(int cpu,struct task_struct * p)8416 void ia64_set_curr_task(int cpu, struct task_struct *p)
8417 {
8418     cpu_curr(cpu) = p;
8419 }
8420 
8421 #endif
8422 
8423 #ifdef CONFIG_CGROUP_SCHED
8424 /* task_group_lock serializes the addition/removal of task groups */
8425 static DEFINE_SPINLOCK(task_group_lock);
8426 
alloc_uclamp_sched_group(struct task_group * tg,struct task_group * parent)8427 static inline void alloc_uclamp_sched_group(struct task_group *tg,
8428                                             struct task_group *parent)
8429 {
8430 #ifdef CONFIG_UCLAMP_TASK_GROUP
8431     enum uclamp_id clamp_id;
8432 
8433     cycle_each_clamp_id(clamp_id) {
8434         uclamp_se_set(&tg->uclamp_req[clamp_id], uclamp_none(clamp_id), false);
8435         tg->uclamp[clamp_id] = parent->uclamp[clamp_id];
8436     }
8437 #endif
8438 }
8439 
sched_free_group(struct task_group * tg)8440 static void sched_free_group(struct task_group *tg)
8441 {
8442     free_fair_sched_group(tg);
8443     free_rt_sched_group(tg);
8444     autogroup_free(tg);
8445     kmem_cache_free(task_group_cache, tg);
8446 }
8447 
8448 /* allocate runqueue etc for a new task group */
sched_create_group(struct task_group * parent)8449 struct task_group *sched_create_group(struct task_group *parent)
8450 {
8451     struct task_group *tg;
8452 
8453     tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
8454     if (!tg) {
8455         return ERR_PTR(-ENOMEM);
8456     }
8457 
8458     if (!alloc_fair_sched_group(tg, parent)) {
8459         goto err;
8460     }
8461 
8462     if (!alloc_rt_sched_group(tg, parent)) {
8463         goto err;
8464     }
8465 
8466     alloc_uclamp_sched_group(tg, parent);
8467 
8468     return tg;
8469 
8470 err:
8471     sched_free_group(tg);
8472     return ERR_PTR(-ENOMEM);
8473 }
8474 
sched_online_group(struct task_group * tg,struct task_group * parent)8475 void sched_online_group(struct task_group *tg, struct task_group *parent)
8476 {
8477     unsigned long flags;
8478 
8479     spin_lock_irqsave(&task_group_lock, flags);
8480     list_add_rcu(&tg->list, &task_groups);
8481 
8482     /* Root should already exist: */
8483     WARN_ON(!parent);
8484 
8485     tg->parent = parent;
8486     INIT_LIST_HEAD(&tg->children);
8487     list_add_rcu(&tg->siblings, &parent->children);
8488     spin_unlock_irqrestore(&task_group_lock, flags);
8489 
8490     online_fair_sched_group(tg);
8491 }
8492 
8493 /* rcu callback to free various structures associated with a task group */
sched_free_group_rcu(struct rcu_head * rhp)8494 static void sched_free_group_rcu(struct rcu_head *rhp)
8495 {
8496     /* Now it should be safe to free those cfs_rqs: */
8497     sched_free_group(container_of(rhp, struct task_group, rcu));
8498 }
8499 
sched_destroy_group(struct task_group * tg)8500 void sched_destroy_group(struct task_group *tg)
8501 {
8502     /* Wait for possible concurrent references to cfs_rqs complete: */
8503     call_rcu(&tg->rcu, sched_free_group_rcu);
8504 }
8505 
sched_offline_group(struct task_group * tg)8506 void sched_offline_group(struct task_group *tg)
8507 {
8508     unsigned long flags;
8509 
8510     /* End participation in shares distribution: */
8511     unregister_fair_sched_group(tg);
8512 
8513     spin_lock_irqsave(&task_group_lock, flags);
8514     list_del_rcu(&tg->list);
8515     list_del_rcu(&tg->siblings);
8516     spin_unlock_irqrestore(&task_group_lock, flags);
8517 }
8518 
sched_change_group(struct task_struct * tsk,int type)8519 static void sched_change_group(struct task_struct *tsk, int type)
8520 {
8521     struct task_group *tg;
8522 
8523     /*
8524      * All callers are synchronized by task_rq_lock(); we do not use RCU
8525      * which is pointless here. Thus, we pass "true" to task_css_check()
8526      * to prevent lockdep warnings.
8527      */
8528     tg = container_of(task_css_check(tsk, cpu_cgrp_id, true), struct task_group,
8529                       css);
8530     tg = autogroup_task_group(tsk, tg);
8531     tsk->sched_task_group = tg;
8532 
8533 #ifdef CONFIG_FAIR_GROUP_SCHED
8534     if (tsk->sched_class->task_change_group) {
8535         tsk->sched_class->task_change_group(tsk, type);
8536     } else
8537 #endif
8538         set_task_rq(tsk, task_cpu(tsk));
8539 }
8540 
8541 /*
8542  * Change task's runqueue when it moves between groups.
8543  *
8544  * The caller of this function should have put the task in its new group by
8545  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
8546  * its new group.
8547  */
sched_move_task(struct task_struct * tsk)8548 void sched_move_task(struct task_struct *tsk)
8549 {
8550     int queued, running,
8551         queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
8552     struct rq_flags rf;
8553     struct rq *rq;
8554 
8555     rq = task_rq_lock(tsk, &rf);
8556     update_rq_clock(rq);
8557 
8558     running = task_current(rq, tsk);
8559     queued = task_on_rq_queued(tsk);
8560     if (queued) {
8561         dequeue_task(rq, tsk, queue_flags);
8562     }
8563     if (running) {
8564         put_prev_task(rq, tsk);
8565     }
8566 
8567     sched_change_group(tsk, TASK_MOVE_GROUP);
8568 
8569     if (queued) {
8570         enqueue_task(rq, tsk, queue_flags);
8571     }
8572     if (running) {
8573         set_next_task(rq, tsk);
8574         /*
8575          * After changing group, the running task may have joined a
8576          * throttled one but it's still the running task. Trigger a
8577          * resched to make sure that task can still run.
8578          */
8579         resched_curr(rq);
8580     }
8581 
8582     task_rq_unlock(rq, tsk, &rf);
8583 }
8584 
css_tg(struct cgroup_subsys_state * css)8585 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
8586 {
8587     return css ? container_of(css, struct task_group, css) : NULL;
8588 }
8589 
8590 static struct cgroup_subsys_state *
cpu_cgroup_css_alloc(struct cgroup_subsys_state * parent_css)8591 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8592 {
8593     struct task_group *parent = css_tg(parent_css);
8594     struct task_group *tg;
8595 
8596     if (!parent) {
8597         /* This is early initialization for the top cgroup */
8598         return &root_task_group.css;
8599     }
8600 
8601     tg = sched_create_group(parent);
8602     if (IS_ERR(tg)) {
8603         return ERR_PTR(-ENOMEM);
8604     }
8605 
8606 #ifdef CONFIG_SCHED_RTG_CGROUP
8607     tg->colocate = false;
8608     tg->colocate_update_disabled = false;
8609 #endif
8610 
8611     return &tg->css;
8612 }
8613 
8614 /* Expose task group only after completing cgroup initialization */
cpu_cgroup_css_online(struct cgroup_subsys_state * css)8615 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
8616 {
8617     struct task_group *tg = css_tg(css);
8618     struct task_group *parent = css_tg(css->parent);
8619 
8620     if (parent) {
8621         sched_online_group(tg, parent);
8622     }
8623 
8624 #ifdef CONFIG_UCLAMP_TASK_GROUP
8625     /* Propagate the effective uclamp value for the new group */
8626     mutex_lock(&uclamp_mutex);
8627     rcu_read_lock();
8628     cpu_util_update_eff(css);
8629     rcu_read_unlock();
8630     mutex_unlock(&uclamp_mutex);
8631 #endif
8632 
8633     return 0;
8634 }
8635 
cpu_cgroup_css_released(struct cgroup_subsys_state * css)8636 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
8637 {
8638     struct task_group *tg = css_tg(css);
8639 
8640     sched_offline_group(tg);
8641 }
8642 
cpu_cgroup_css_free(struct cgroup_subsys_state * css)8643 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
8644 {
8645     struct task_group *tg = css_tg(css);
8646 
8647     /*
8648      * Relies on the RCU grace period between css_released() and this.
8649      */
8650     sched_free_group(tg);
8651 }
8652 
8653 /*
8654  * This is called before wake_up_new_task(), therefore we really only
8655  * have to set its group bits, all the other stuff does not apply.
8656  */
cpu_cgroup_fork(struct task_struct * task)8657 static void cpu_cgroup_fork(struct task_struct *task)
8658 {
8659     struct rq_flags rf;
8660     struct rq *rq;
8661 
8662     rq = task_rq_lock(task, &rf);
8663 
8664     update_rq_clock(rq);
8665     sched_change_group(task, TASK_SET_GROUP);
8666 
8667     task_rq_unlock(rq, task, &rf);
8668 }
8669 
cpu_cgroup_can_attach(struct cgroup_taskset * tset)8670 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
8671 {
8672     struct task_struct *task;
8673     struct cgroup_subsys_state *css;
8674     int ret = 0;
8675 
8676     cgroup_taskset_for_each(task, css, tset)
8677     {
8678 #ifdef CONFIG_RT_GROUP_SCHED
8679         if (!sched_rt_can_attach(css_tg(css), task)) {
8680             return -EINVAL;
8681         }
8682 #endif
8683         /*
8684          * Serialize against wake_up_new_task() such that if its
8685          * running, we're sure to observe its full state.
8686          */
8687         raw_spin_lock_irq(&task->pi_lock);
8688         /*
8689          * Avoid calling sched_move_task() before wake_up_new_task()
8690          * has happened. This would lead to problems with PELT, due to
8691          * move wanting to detach+attach while we're not attached yet.
8692          */
8693         if (task->state == TASK_NEW) {
8694             ret = -EINVAL;
8695         }
8696         raw_spin_unlock_irq(&task->pi_lock);
8697 
8698         if (ret) {
8699             break;
8700         }
8701     }
8702     return ret;
8703 }
8704 
8705 #if defined(CONFIG_UCLAMP_TASK_GROUP) && defined(CONFIG_SCHED_RTG_CGROUP)
schedgp_attach(struct cgroup_taskset * tset)8706 static void schedgp_attach(struct cgroup_taskset *tset)
8707 {
8708     struct task_struct *task;
8709     struct cgroup_subsys_state *css;
8710     bool colocate;
8711     struct task_group *tg;
8712 
8713     cgroup_taskset_first(tset, &css);
8714     tg = css_tg(css);
8715 
8716     colocate = tg->colocate;
8717 
8718     cgroup_taskset_for_each(task, css, tset)
8719         sync_cgroup_colocation(task, colocate);
8720 }
8721 #else
schedgp_attach(struct cgroup_taskset * tset)8722 static void schedgp_attach(struct cgroup_taskset *tset)
8723 {
8724 }
8725 #endif
cpu_cgroup_attach(struct cgroup_taskset * tset)8726 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
8727 {
8728     struct task_struct *task;
8729     struct cgroup_subsys_state *css;
8730 
8731     cgroup_taskset_for_each(task, css, tset) sched_move_task(task);
8732 
8733     schedgp_attach(tset);
8734 }
8735 
8736 #ifdef CONFIG_UCLAMP_TASK_GROUP
cpu_util_update_eff(struct cgroup_subsys_state * css)8737 static void cpu_util_update_eff(struct cgroup_subsys_state *css)
8738 {
8739     struct cgroup_subsys_state *top_css = css;
8740     struct uclamp_se *uc_parent = NULL;
8741     struct uclamp_se *uc_se = NULL;
8742     unsigned int eff[UCLAMP_CNT];
8743     enum uclamp_id clamp_id;
8744     unsigned int clamps;
8745 
8746     lockdep_assert_held(&uclamp_mutex);
8747     SCHED_WARN_ON(!rcu_read_lock_held());
8748 
8749     css_for_each_descendant_pre(css, top_css)
8750     {
8751         uc_parent = css_tg(css)->parent ? css_tg(css)->parent->uclamp : NULL;
8752 
8753         cycle_each_clamp_id(clamp_id) {
8754             /* Assume effective clamps matches requested clamps */
8755             eff[clamp_id] = css_tg(css)->uclamp_req[clamp_id].value;
8756             /* Cap effective clamps with parent's effective clamps */
8757             if (uc_parent && eff[clamp_id] > uc_parent[clamp_id].value) {
8758                 eff[clamp_id] = uc_parent[clamp_id].value;
8759             }
8760         }
8761         /* Ensure protection is always capped by limit */
8762         eff[UCLAMP_MIN] = min(eff[UCLAMP_MIN], eff[UCLAMP_MAX]);
8763 
8764         /* Propagate most restrictive effective clamps */
8765         clamps = 0x0;
8766         uc_se = css_tg(css)->uclamp;
8767         cycle_each_clamp_id(clamp_id) {
8768             if (eff[clamp_id] == uc_se[clamp_id].value) {
8769                 continue;
8770             }
8771             uc_se[clamp_id].value = eff[clamp_id];
8772             uc_se[clamp_id].bucket_id = uclamp_bucket_id(eff[clamp_id]);
8773             clamps |= (0x1 << clamp_id);
8774         }
8775         if (!clamps) {
8776             css = css_rightmost_descendant(css);
8777             continue;
8778         }
8779 
8780         /* Immediately update descendants RUNNABLE tasks */
8781         uclamp_update_active_tasks(css);
8782     }
8783 }
8784 
8785 /*
8786  * Integer 10^N with a given N exponent by casting to integer the literal "1eN"
8787  * C expression. Since there is no way to convert a macro argument (N) into a
8788  * character constant, use two levels of macros.
8789  */
8790 #define EXP_POW10(exp) ((unsigned int)1e##exp)
8791 #define POW10(exp) EXP_POW10(exp)
8792 
8793 struct uclamp_request {
8794 #define UCLAMP_PERCENT_SHIFT 2
8795 #define UCLAMP_PERCENT_SCALE (100 * POW10(UCLAMP_PERCENT_SHIFT))
8796     s64 percent;
8797     u64 util;
8798     int ret;
8799 };
8800 
capacity_from_percent(char * buf)8801 static inline struct uclamp_request capacity_from_percent(char *buf)
8802 {
8803     struct uclamp_request req = {
8804         .percent = UCLAMP_PERCENT_SCALE,
8805         .util = SCHED_CAPACITY_SCALE,
8806         .ret = 0,
8807     };
8808 
8809     buf = strim(buf);
8810     if (strcmp(buf, "max")) {
8811         req.ret = cgroup_parse_float(buf, UCLAMP_PERCENT_SHIFT, &req.percent);
8812         if (req.ret) {
8813             return req;
8814         }
8815         if ((u64)req.percent > UCLAMP_PERCENT_SCALE) {
8816             req.ret = -ERANGE;
8817             return req;
8818         }
8819 
8820         req.util = req.percent << SCHED_CAPACITY_SHIFT;
8821         req.util = DIV_ROUND_CLOSEST_ULL(req.util, UCLAMP_PERCENT_SCALE);
8822     }
8823 
8824     return req;
8825 }
8826 
cpu_uclamp_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off,enum uclamp_id clamp_id)8827 static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf,
8828                                 size_t nbytes, loff_t off,
8829                                 enum uclamp_id clamp_id)
8830 {
8831     struct uclamp_request req;
8832     struct task_group *tg;
8833 
8834     req = capacity_from_percent(buf);
8835     if (req.ret) {
8836         return req.ret;
8837     }
8838 
8839     static_branch_enable(&sched_uclamp_used);
8840 
8841     mutex_lock(&uclamp_mutex);
8842     rcu_read_lock();
8843 
8844     tg = css_tg(of_css(of));
8845     if (tg->uclamp_req[clamp_id].value != req.util) {
8846         uclamp_se_set(&tg->uclamp_req[clamp_id], req.util, false);
8847     }
8848 
8849     /*
8850      * Because of not recoverable conversion rounding we keep track of the
8851      * exact requested value
8852      */
8853     tg->uclamp_pct[clamp_id] = req.percent;
8854 
8855     /* Update effective clamps to track the most restrictive value */
8856     cpu_util_update_eff(of_css(of));
8857 
8858     rcu_read_unlock();
8859     mutex_unlock(&uclamp_mutex);
8860 
8861     return nbytes;
8862 }
8863 
cpu_uclamp_min_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)8864 static ssize_t cpu_uclamp_min_write(struct kernfs_open_file *of, char *buf,
8865                                     size_t nbytes, loff_t off)
8866 {
8867     return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MIN);
8868 }
8869 
cpu_uclamp_max_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)8870 static ssize_t cpu_uclamp_max_write(struct kernfs_open_file *of, char *buf,
8871                                     size_t nbytes, loff_t off)
8872 {
8873     return cpu_uclamp_write(of, buf, nbytes, off, UCLAMP_MAX);
8874 }
8875 
cpu_uclamp_print(struct seq_file * sf,enum uclamp_id clamp_id)8876 static inline void cpu_uclamp_print(struct seq_file *sf,
8877                                     enum uclamp_id clamp_id)
8878 {
8879     struct task_group *tg;
8880     u64 util_clamp;
8881     u64 percent;
8882     u32 rem;
8883 
8884     rcu_read_lock();
8885     tg = css_tg(seq_css(sf));
8886     util_clamp = tg->uclamp_req[clamp_id].value;
8887     rcu_read_unlock();
8888 
8889     if (util_clamp == SCHED_CAPACITY_SCALE) {
8890         seq_puts(sf, "max\n");
8891         return;
8892     }
8893 
8894     percent = tg->uclamp_pct[clamp_id];
8895     percent = div_u64_rem(percent, POW10(UCLAMP_PERCENT_SHIFT), &rem);
8896     seq_printf(sf, "%llu.%0*u\n", percent, UCLAMP_PERCENT_SHIFT, rem);
8897 }
8898 
cpu_uclamp_min_show(struct seq_file * sf,void * v)8899 static int cpu_uclamp_min_show(struct seq_file *sf, void *v)
8900 {
8901     cpu_uclamp_print(sf, UCLAMP_MIN);
8902     return 0;
8903 }
8904 
cpu_uclamp_max_show(struct seq_file * sf,void * v)8905 static int cpu_uclamp_max_show(struct seq_file *sf, void *v)
8906 {
8907     cpu_uclamp_print(sf, UCLAMP_MAX);
8908     return 0;
8909 }
8910 
8911 #ifdef CONFIG_SCHED_RTG_CGROUP
sched_colocate_read(struct cgroup_subsys_state * css,struct cftype * cft)8912 static u64 sched_colocate_read(struct cgroup_subsys_state *css,
8913                                struct cftype *cft)
8914 {
8915     struct task_group *tg = css_tg(css);
8916 
8917     return (u64)tg->colocate;
8918 }
8919 
sched_colocate_write(struct cgroup_subsys_state * css,struct cftype * cft,u64 colocate)8920 static int sched_colocate_write(struct cgroup_subsys_state *css,
8921                                 struct cftype *cft, u64 colocate)
8922 {
8923     struct task_group *tg = css_tg(css);
8924 
8925     if (tg->colocate_update_disabled) {
8926         return -EPERM;
8927     }
8928 
8929     tg->colocate = !!colocate;
8930     tg->colocate_update_disabled = true;
8931 
8932     return 0;
8933 }
8934 #endif /* CONFIG_SCHED_RTG_CGROUP */
8935 #endif /* CONFIG_UCLAMP_TASK_GROUP */
8936 
8937 #ifdef CONFIG_FAIR_GROUP_SCHED
cpu_shares_write_u64(struct cgroup_subsys_state * css,struct cftype * cftype,u64 shareval)8938 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
8939                                 struct cftype *cftype, u64 shareval)
8940 {
8941     if (shareval > scale_load_down(ULONG_MAX)) {
8942         shareval = MAX_SHARES;
8943     }
8944     return sched_group_set_shares(css_tg(css), scale_load(shareval));
8945 }
8946 
cpu_shares_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)8947 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
8948                                struct cftype *cft)
8949 {
8950     struct task_group *tg = css_tg(css);
8951 
8952     return (u64)scale_load_down(tg->shares);
8953 }
8954 
8955 #ifdef CONFIG_CFS_BANDWIDTH
8956 static DEFINE_MUTEX(cfs_constraints_mutex);
8957 
8958 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC;         /* 1s */
8959 static const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
8960 /* More than 203 days if BW_SHIFT equals 20. */
8961 static const u64 max_cfs_runtime = MAX_BW * NSEC_PER_USEC;
8962 
8963 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
8964 
tg_set_cfs_bandwidth(struct task_group * tg,u64 period,u64 quota)8965 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
8966 {
8967     int i, ret = 0, runtime_enabled, runtime_was_enabled;
8968     struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
8969 
8970     if (tg == &root_task_group) {
8971         return -EINVAL;
8972     }
8973 
8974     /*
8975      * Ensure we have at some amount of bandwidth every period.  This is
8976      * to prevent reaching a state of large arrears when throttled via
8977      * entity_tick() resulting in prolonged exit starvation.
8978      */
8979     if (quota < min_cfs_quota_period || period < min_cfs_quota_period) {
8980         return -EINVAL;
8981     }
8982 
8983     /*
8984      * Likewise, bound things on the otherside by preventing insane quota
8985      * periods.  This also allows us to normalize in computing quota
8986      * feasibility.
8987      */
8988     if (period > max_cfs_quota_period) {
8989         return -EINVAL;
8990     }
8991 
8992     /*
8993      * Bound quota to defend quota against overflow during bandwidth shift.
8994      */
8995     if (quota != RUNTIME_INF && quota > max_cfs_runtime) {
8996         return -EINVAL;
8997     }
8998 
8999     /*
9000      * Prevent race between setting of cfs_rq->runtime_enabled and
9001      * unthrottle_offline_cfs_rqs().
9002      */
9003     get_online_cpus();
9004     mutex_lock(&cfs_constraints_mutex);
9005     ret = __cfs_schedulable(tg, period, quota);
9006     if (ret) {
9007         goto out_unlock;
9008     }
9009 
9010     runtime_enabled = quota != RUNTIME_INF;
9011     runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
9012     /*
9013      * If we need to toggle cfs_bandwidth_used, off->on must occur
9014      * before making related changes, and on->off must occur afterwards
9015      */
9016     if (runtime_enabled && !runtime_was_enabled) {
9017         cfs_bandwidth_usage_inc();
9018     }
9019     raw_spin_lock_irq(&cfs_b->lock);
9020     cfs_b->period = ns_to_ktime(period);
9021     cfs_b->quota = quota;
9022 
9023     __refill_cfs_bandwidth_runtime(cfs_b);
9024 
9025     /* Restart the period timer (if active) to handle new period expiry: */
9026     if (runtime_enabled) {
9027         start_cfs_bandwidth(cfs_b);
9028     }
9029 
9030     raw_spin_unlock_irq(&cfs_b->lock);
9031 
9032     for_each_online_cpu(i)
9033     {
9034         struct cfs_rq *cfs_rq = tg->cfs_rq[i];
9035         struct rq *rq = cfs_rq->rq;
9036         struct rq_flags rf;
9037 
9038         rq_lock_irq(rq, &rf);
9039         cfs_rq->runtime_enabled = runtime_enabled;
9040         cfs_rq->runtime_remaining = 0;
9041 
9042         if (cfs_rq->throttled) {
9043             unthrottle_cfs_rq(cfs_rq);
9044         }
9045         rq_unlock_irq(rq, &rf);
9046     }
9047     if (runtime_was_enabled && !runtime_enabled) {
9048         cfs_bandwidth_usage_dec();
9049     }
9050 out_unlock:
9051     mutex_unlock(&cfs_constraints_mutex);
9052     put_online_cpus();
9053 
9054     return ret;
9055 }
9056 
tg_set_cfs_quota(struct task_group * tg,long cfs_quota_us)9057 static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
9058 {
9059     u64 quota, period;
9060 
9061     period = ktime_to_ns(tg->cfs_bandwidth.period);
9062     if (cfs_quota_us < 0) {
9063         quota = RUNTIME_INF;
9064     } else if ((u64)cfs_quota_us <= U64_MAX / NSEC_PER_USEC) {
9065         quota = (u64)cfs_quota_us * NSEC_PER_USEC;
9066     } else {
9067         return -EINVAL;
9068     }
9069 
9070     return tg_set_cfs_bandwidth(tg, period, quota);
9071 }
9072 
tg_get_cfs_quota(struct task_group * tg)9073 static long tg_get_cfs_quota(struct task_group *tg)
9074 {
9075     u64 quota_us;
9076 
9077     if (tg->cfs_bandwidth.quota == RUNTIME_INF) {
9078         return -1;
9079     }
9080 
9081     quota_us = tg->cfs_bandwidth.quota;
9082     do_div(quota_us, NSEC_PER_USEC);
9083 
9084     return quota_us;
9085 }
9086 
tg_set_cfs_period(struct task_group * tg,long cfs_period_us)9087 static int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
9088 {
9089     u64 quota, period;
9090 
9091     if ((u64)cfs_period_us > U64_MAX / NSEC_PER_USEC) {
9092         return -EINVAL;
9093     }
9094 
9095     period = (u64)cfs_period_us * NSEC_PER_USEC;
9096     quota = tg->cfs_bandwidth.quota;
9097 
9098     return tg_set_cfs_bandwidth(tg, period, quota);
9099 }
9100 
tg_get_cfs_period(struct task_group * tg)9101 static long tg_get_cfs_period(struct task_group *tg)
9102 {
9103     u64 cfs_period_us;
9104 
9105     cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
9106     do_div(cfs_period_us, NSEC_PER_USEC);
9107 
9108     return cfs_period_us;
9109 }
9110 
cpu_cfs_quota_read_s64(struct cgroup_subsys_state * css,struct cftype * cft)9111 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
9112                                   struct cftype *cft)
9113 {
9114     return tg_get_cfs_quota(css_tg(css));
9115 }
9116 
cpu_cfs_quota_write_s64(struct cgroup_subsys_state * css,struct cftype * cftype,s64 cfs_quota_us)9117 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
9118                                    struct cftype *cftype, s64 cfs_quota_us)
9119 {
9120     return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
9121 }
9122 
cpu_cfs_period_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)9123 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
9124                                    struct cftype *cft)
9125 {
9126     return tg_get_cfs_period(css_tg(css));
9127 }
9128 
cpu_cfs_period_write_u64(struct cgroup_subsys_state * css,struct cftype * cftype,u64 cfs_period_us)9129 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
9130                                     struct cftype *cftype, u64 cfs_period_us)
9131 {
9132     return tg_set_cfs_period(css_tg(css), cfs_period_us);
9133 }
9134 
9135 struct cfs_schedulable_data {
9136     struct task_group *tg;
9137     u64 period, quota;
9138 };
9139 
9140 /*
9141  * normalize group quota/period to be quota/max_period
9142  * note: units are usecs
9143  */
normalize_cfs_quota(struct task_group * tg,struct cfs_schedulable_data * d)9144 static u64 normalize_cfs_quota(struct task_group *tg,
9145                                struct cfs_schedulable_data *d)
9146 {
9147     u64 quota, period;
9148 
9149     if (tg == d->tg) {
9150         period = d->period;
9151         quota = d->quota;
9152     } else {
9153         period = tg_get_cfs_period(tg);
9154         quota = tg_get_cfs_quota(tg);
9155     }
9156 
9157     /* note: these should typically be equivalent */
9158     if (quota == RUNTIME_INF || quota == -1) {
9159         return RUNTIME_INF;
9160     }
9161 
9162     return to_ratio(period, quota);
9163 }
9164 
tg_cfs_schedulable_down(struct task_group * tg,void * data)9165 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
9166 {
9167     struct cfs_schedulable_data *d = data;
9168     struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9169     s64 quota = 0, parent_quota = -1;
9170 
9171     if (!tg->parent) {
9172         quota = RUNTIME_INF;
9173     } else {
9174         struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
9175 
9176         quota = normalize_cfs_quota(tg, d);
9177         parent_quota = parent_b->hierarchical_quota;
9178 
9179         /*
9180          * Ensure max(child_quota) <= parent_quota.  On cgroup2,
9181          * always take the min.  On cgroup1, only inherit when no
9182          * limit is set:
9183          */
9184         if (cgroup_subsys_on_dfl(cpu_cgrp_subsys)) {
9185             quota = min(quota, parent_quota);
9186         } else {
9187             if (quota == RUNTIME_INF) {
9188                 quota = parent_quota;
9189             } else if (parent_quota != RUNTIME_INF && quota > parent_quota) {
9190                 return -EINVAL;
9191             }
9192         }
9193     }
9194     cfs_b->hierarchical_quota = quota;
9195 
9196     return 0;
9197 }
9198 
__cfs_schedulable(struct task_group * tg,u64 period,u64 quota)9199 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
9200 {
9201     int ret;
9202     struct cfs_schedulable_data data = {
9203         .tg = tg,
9204         .period = period,
9205         .quota = quota,
9206     };
9207 
9208     if (quota != RUNTIME_INF) {
9209         do_div(data.period, NSEC_PER_USEC);
9210         do_div(data.quota, NSEC_PER_USEC);
9211     }
9212 
9213     rcu_read_lock();
9214     ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
9215     rcu_read_unlock();
9216 
9217     return ret;
9218 }
9219 
cpu_cfs_stat_show(struct seq_file * sf,void * v)9220 static int cpu_cfs_stat_show(struct seq_file *sf, void *v)
9221 {
9222     struct task_group *tg = css_tg(seq_css(sf));
9223     struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9224 
9225     seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
9226     seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
9227     seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
9228 
9229     if (schedstat_enabled() && tg != &root_task_group) {
9230         u64 ws = 0;
9231         int i;
9232 
9233         for_each_possible_cpu(i) ws +=
9234             schedstat_val(tg->se[i]->statistics.wait_sum);
9235 
9236         seq_printf(sf, "wait_sum %llu\n", ws);
9237     }
9238 
9239     return 0;
9240 }
9241 #endif /* CONFIG_CFS_BANDWIDTH */
9242 #endif /* CONFIG_FAIR_GROUP_SCHED */
9243 
9244 #ifdef CONFIG_RT_GROUP_SCHED
cpu_rt_runtime_write(struct cgroup_subsys_state * css,struct cftype * cft,s64 val)9245 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
9246                                 struct cftype *cft, s64 val)
9247 {
9248     return sched_group_set_rt_runtime(css_tg(css), val);
9249 }
9250 
cpu_rt_runtime_read(struct cgroup_subsys_state * css,struct cftype * cft)9251 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
9252                                struct cftype *cft)
9253 {
9254     return sched_group_rt_runtime(css_tg(css));
9255 }
9256 
cpu_rt_period_write_uint(struct cgroup_subsys_state * css,struct cftype * cftype,u64 rt_period_us)9257 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
9258                                     struct cftype *cftype, u64 rt_period_us)
9259 {
9260     return sched_group_set_rt_period(css_tg(css), rt_period_us);
9261 }
9262 
cpu_rt_period_read_uint(struct cgroup_subsys_state * css,struct cftype * cft)9263 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
9264                                    struct cftype *cft)
9265 {
9266     return sched_group_rt_period(css_tg(css));
9267 }
9268 #endif /* CONFIG_RT_GROUP_SCHED */
9269 
9270 static struct cftype cpu_legacy_files[] = {
9271 #ifdef CONFIG_FAIR_GROUP_SCHED
9272     {
9273         .name = "shares",
9274         .read_u64 = cpu_shares_read_u64,
9275         .write_u64 = cpu_shares_write_u64,
9276     },
9277 #endif
9278 #ifdef CONFIG_CFS_BANDWIDTH
9279     {
9280         .name = "cfs_quota_us",
9281         .read_s64 = cpu_cfs_quota_read_s64,
9282         .write_s64 = cpu_cfs_quota_write_s64,
9283     },
9284     {
9285         .name = "cfs_period_us",
9286         .read_u64 = cpu_cfs_period_read_u64,
9287         .write_u64 = cpu_cfs_period_write_u64,
9288     },
9289     {
9290         .name = "stat",
9291         .seq_show = cpu_cfs_stat_show,
9292     },
9293 #endif
9294 #ifdef CONFIG_RT_GROUP_SCHED
9295     {
9296         .name = "rt_runtime_us",
9297         .read_s64 = cpu_rt_runtime_read,
9298         .write_s64 = cpu_rt_runtime_write,
9299     },
9300     {
9301         .name = "rt_period_us",
9302         .read_u64 = cpu_rt_period_read_uint,
9303         .write_u64 = cpu_rt_period_write_uint,
9304     },
9305 #endif
9306 #ifdef CONFIG_UCLAMP_TASK_GROUP
9307     {
9308         .name = "uclamp.min",
9309         .flags = CFTYPE_NOT_ON_ROOT,
9310         .seq_show = cpu_uclamp_min_show,
9311         .write = cpu_uclamp_min_write,
9312     },
9313     {
9314         .name = "uclamp.max",
9315         .flags = CFTYPE_NOT_ON_ROOT,
9316         .seq_show = cpu_uclamp_max_show,
9317         .write = cpu_uclamp_max_write,
9318     },
9319 #ifdef CONFIG_SCHED_RTG_CGROUP
9320     {
9321         .name = "uclamp.colocate",
9322         .flags = CFTYPE_NOT_ON_ROOT,
9323         .read_u64 = sched_colocate_read,
9324         .write_u64 = sched_colocate_write,
9325     },
9326 #endif
9327 #endif
9328     {} /* Terminate */
9329 };
9330 
cpu_extra_stat_show(struct seq_file * sf,struct cgroup_subsys_state * css)9331 static int cpu_extra_stat_show(struct seq_file *sf,
9332                                struct cgroup_subsys_state *css)
9333 {
9334 #ifdef CONFIG_CFS_BANDWIDTH
9335     {
9336         struct task_group *tg = css_tg(css);
9337         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
9338         u64 throttled_usec;
9339 
9340         throttled_usec = cfs_b->throttled_time;
9341         do_div(throttled_usec, NSEC_PER_USEC);
9342 
9343         seq_printf(sf,
9344                    "nr_periods %d\n"
9345                    "nr_throttled %d\n"
9346                    "throttled_usec %llu\n",
9347                    cfs_b->nr_periods, cfs_b->nr_throttled, throttled_usec);
9348     }
9349 #endif
9350     return 0;
9351 }
9352 
9353 #ifdef CONFIG_FAIR_GROUP_SCHED
cpu_weight_read_u64(struct cgroup_subsys_state * css,struct cftype * cft)9354 static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css,
9355                                struct cftype *cft)
9356 {
9357     struct task_group *tg = css_tg(css);
9358     u64 weight = scale_load_down(tg->shares);
9359 
9360     return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 0x400);
9361 }
9362 
cpu_weight_write_u64(struct cgroup_subsys_state * css,struct cftype * cft,u64 weight)9363 static int cpu_weight_write_u64(struct cgroup_subsys_state *css,
9364                                 struct cftype *cft, u64 weight)
9365 {
9366     /*
9367      * cgroup weight knobs should use the common MIN, DFL and MAX
9368      * values which are 1, 100 and 10000 respectively.  While it loses
9369      * a bit of range on both ends, it maps pretty well onto the shares
9370      * value used by scheduler and the round-trip conversions preserve
9371      * the original value over the entire range.
9372      */
9373     if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) {
9374         return -ERANGE;
9375     }
9376 
9377     weight = DIV_ROUND_CLOSEST_ULL(weight * 0x400, CGROUP_WEIGHT_DFL);
9378 
9379     return sched_group_set_shares(css_tg(css), scale_load(weight));
9380 }
9381 
cpu_weight_nice_read_s64(struct cgroup_subsys_state * css,struct cftype * cft)9382 static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css,
9383                                     struct cftype *cft)
9384 {
9385     unsigned long weight = scale_load_down(css_tg(css)->shares);
9386     int last_delta = INT_MAX;
9387     int prio, delta;
9388 
9389     /* find the closest nice value to the current weight */
9390     for (prio = 0; prio < ARRAY_SIZE(sched_prio_to_weight); prio++) {
9391         delta = abs(sched_prio_to_weight[prio] - weight);
9392         if (delta >= last_delta) {
9393             break;
9394         }
9395         last_delta = delta;
9396     }
9397 
9398     return PRIO_TO_NICE(prio - 1 + MAX_RT_PRIO);
9399 }
9400 
cpu_weight_nice_write_s64(struct cgroup_subsys_state * css,struct cftype * cft,s64 nice)9401 static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css,
9402                                      struct cftype *cft, s64 nice)
9403 {
9404     unsigned long weight;
9405     int idx;
9406 
9407     if (nice < MIN_NICE || nice > MAX_NICE) {
9408         return -ERANGE;
9409     }
9410 
9411     idx = NICE_TO_PRIO(nice) - MAX_RT_PRIO;
9412     idx = array_index_nospec(idx, 0x28);
9413     weight = sched_prio_to_weight[idx];
9414 
9415     return sched_group_set_shares(css_tg(css), scale_load(weight));
9416 }
9417 #endif
9418 
cpu_period_quota_print(struct seq_file * sf,long period,long quota)9419 static void __maybe_unused cpu_period_quota_print(struct seq_file *sf,
9420                                                   long period, long quota)
9421 {
9422     if (quota < 0) {
9423         seq_puts(sf, "max");
9424     } else {
9425         seq_printf(sf, "%ld", quota);
9426     }
9427 
9428     seq_printf(sf, " %ld\n", period);
9429 }
9430 
9431 /* caller should put the current value in *@periodp before calling */
cpu_period_quota_parse(char * buf,u64 * periodp,u64 * quotap)9432 static int __maybe_unused cpu_period_quota_parse(char *buf, u64 *periodp,
9433                                                  u64 *quotap)
9434 {
9435     char tok[21]; /* U64_MAX */
9436 
9437     if (sscanf(buf, "%20s %llu", tok, periodp) < 1) {
9438         return -EINVAL;
9439     }
9440 
9441     *periodp *= NSEC_PER_USEC;
9442 
9443     if (sscanf(tok, "%llu", quotap)) {
9444         *quotap *= NSEC_PER_USEC;
9445     } else if (!strcmp(tok, "max")) {
9446         *quotap = RUNTIME_INF;
9447     } else {
9448         return -EINVAL;
9449     }
9450 
9451     return 0;
9452 }
9453 
9454 #ifdef CONFIG_CFS_BANDWIDTH
cpu_max_show(struct seq_file * sf,void * v)9455 static int cpu_max_show(struct seq_file *sf, void *v)
9456 {
9457     struct task_group *tg = css_tg(seq_css(sf));
9458 
9459     cpu_period_quota_print(sf, tg_get_cfs_period(tg), tg_get_cfs_quota(tg));
9460     return 0;
9461 }
9462 
cpu_max_write(struct kernfs_open_file * of,char * buf,size_t nbytes,loff_t off)9463 static ssize_t cpu_max_write(struct kernfs_open_file *of, char *buf,
9464                              size_t nbytes, loff_t off)
9465 {
9466     struct task_group *tg = css_tg(of_css(of));
9467     u64 period = tg_get_cfs_period(tg);
9468     u64 quota;
9469     int ret;
9470 
9471     ret = cpu_period_quota_parse(buf, &period, &quota);
9472     if (!ret) {
9473         ret = tg_set_cfs_bandwidth(tg, period, quota);
9474     }
9475     return ret ?: nbytes;
9476 }
9477 #endif
9478 
9479 static struct cftype cpu_files[] = {
9480 #ifdef CONFIG_FAIR_GROUP_SCHED
9481     {
9482         .name = "weight",
9483         .flags = CFTYPE_NOT_ON_ROOT,
9484         .read_u64 = cpu_weight_read_u64,
9485         .write_u64 = cpu_weight_write_u64,
9486     },
9487     {
9488         .name = "weight.nice",
9489         .flags = CFTYPE_NOT_ON_ROOT,
9490         .read_s64 = cpu_weight_nice_read_s64,
9491         .write_s64 = cpu_weight_nice_write_s64,
9492     },
9493 #endif
9494 #ifdef CONFIG_CFS_BANDWIDTH
9495     {
9496         .name = "max",
9497         .flags = CFTYPE_NOT_ON_ROOT,
9498         .seq_show = cpu_max_show,
9499         .write = cpu_max_write,
9500     },
9501 #endif
9502 #ifdef CONFIG_UCLAMP_TASK_GROUP
9503     {
9504         .name = "uclamp.min",
9505         .flags = CFTYPE_NOT_ON_ROOT,
9506         .seq_show = cpu_uclamp_min_show,
9507         .write = cpu_uclamp_min_write,
9508     },
9509     {
9510         .name = "uclamp.max",
9511         .flags = CFTYPE_NOT_ON_ROOT,
9512         .seq_show = cpu_uclamp_max_show,
9513         .write = cpu_uclamp_max_write,
9514     },
9515 #endif
9516     {} /* terminate */
9517 };
9518 
9519 struct cgroup_subsys cpu_cgrp_subsys = {
9520     .css_alloc = cpu_cgroup_css_alloc,
9521     .css_online = cpu_cgroup_css_online,
9522     .css_released = cpu_cgroup_css_released,
9523     .css_free = cpu_cgroup_css_free,
9524     .css_extra_stat_show = cpu_extra_stat_show,
9525     .fork = cpu_cgroup_fork,
9526     .can_attach = cpu_cgroup_can_attach,
9527     .attach = cpu_cgroup_attach,
9528     .legacy_cftypes = cpu_legacy_files,
9529     .dfl_cftypes = cpu_files,
9530     .early_init = true,
9531     .threaded = true,
9532 };
9533 
9534 #endif /* CONFIG_CGROUP_SCHED */
9535 
dump_cpu_task(int cpu)9536 void dump_cpu_task(int cpu)
9537 {
9538     pr_info("Task dump for CPU %d:\n", cpu);
9539     sched_show_task(cpu_curr(cpu));
9540 }
9541 
9542 /*
9543  * Nice levels are multiplicative, with a gentle 10% change for every
9544  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
9545  * nice 1, it will get ~10% less CPU time than another CPU-bound task
9546  * that remained on nice 0.
9547  *
9548  * The "10% effect" is relative and cumulative: from _any_ nice level,
9549  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
9550  * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
9551  * If a task goes up by ~10% and another task goes down by ~10% then
9552  * the relative distance between them is ~25%.)
9553  */
9554 const int sched_prio_to_weight[40] = {
9555     88761, 71755, 56483, 46273, 36291, /* -20 */
9556     29154, 23254, 18705, 14949, 11916, /* -15 */
9557     9548,  7620,  6100,  4904,  3906,  /* -10 */
9558     3121,  2501,  1991,  1586,  1277,  /*  -5 */
9559     1024,  820,   655,   526,   423,   /*   0 */
9560     335,   272,   215,   172,   137,   /*   5 */
9561     110,   87,    70,    56,    45,    /*  10 */
9562     36,    29,    23,    18,    15,    /*  15 */
9563 };
9564 
9565 /*
9566  * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
9567  *
9568  * In cases where the weight does not change often, we can use the
9569  * precalculated inverse to speed up arithmetics by turning divisions
9570  * into multiplications:
9571  */
9572 const u32 sched_prio_to_wmult[40] = {
9573     48388,     59856,     76040,     92818,     118348,     /* -20 */
9574     147320,    184698,    229616,    287308,    360437,     /* -15 */
9575     449829,    563644,    704093,    875809,    1099582,    /* -10 */
9576     1376151,   1717300,   2157191,   2708050,   3363326,    /*  -5 */
9577     4194304,   5237765,   6557202,   8165337,   10153587,   /*   0 */
9578     12820798,  15790321,  19976592,  24970740,  31350126,   /*   5 */
9579     39045157,  49367440,  61356676,  76695844,  95443717,   /*  10 */
9580     119304647, 148102320, 186737708, 238609294, 286331153,  /*  15 */
9581 };
9582 
9583 #ifdef CONFIG_SCHED_LATENCY_NICE
9584 /*
9585  * latency weight for wakeup preemption
9586  */
9587 const int sched_latency_to_weight[40] = {
9588     1024, 973,  922,  870,  819,  /* -20 */
9589     768,  717,  666,  614,  563,  /* -15 */
9590     512,  461,  410,  358,  307,  /* -10 */
9591     256,  205,  154,  102,  51,   /*  -5 */
9592     0,    -51,  -102, -154, -205, /*   0 */
9593     -256, -307, -358, -410, -461, /*   5 */
9594     -512, -563, -614, -666, -717, /*  10 */
9595     -768, -819, -870, -922, -973, /*  15 */
9596 };
9597 #endif
9598 
call_trace_sched_update_nr_running(struct rq * rq,int count)9599 void call_trace_sched_update_nr_running(struct rq *rq, int count)
9600 {
9601     trace_sched_update_nr_running_tp(rq, count);
9602 }
9603 
9604 #ifdef CONFIG_SCHED_WALT
9605 /*
9606  * sched_exit() - Set EXITING_TASK_MARKER in task's ravg.demand field
9607  *
9608  * Stop accounting (exiting) task's future cpu usage
9609  *
9610  * We need this so that reset_all_windows_stats() can function correctly.
9611  * reset_all_window_stats() depends on do_each_thread/for_each_thread task
9612  * iterators to reset *all* task's statistics. Exiting tasks however become
9613  * invisible to those iterators. sched_exit() is called on a exiting task prior
9614  * to being removed from task_list, which will let reset_all_window_stats()
9615  * function correctly.
9616  */
sched_exit(struct task_struct * p)9617 void sched_exit(struct task_struct *p)
9618 {
9619     struct rq_flags rf;
9620     struct rq *rq;
9621     u64 wallclock;
9622 
9623 #ifdef CONFIG_SCHED_RTG
9624     sched_set_group_id(p, 0);
9625 #endif
9626 
9627     rq = task_rq_lock(p, &rf);
9628 
9629     /* rq->curr == p */
9630     wallclock = sched_ktime_clock();
9631     update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
9632     dequeue_task(rq, p, 0);
9633     /*
9634      * task's contribution is already removed from the
9635      * cumulative window demand in dequeue. As the
9636      * task's stats are reset, the next enqueue does
9637      * not change the cumulative window demand.
9638      */
9639     reset_task_stats(p);
9640     p->ravg.mark_start = wallclock;
9641     p->ravg.sum_history[0] = EXITING_TASK_MARKER;
9642 
9643     enqueue_task(rq, p, 0);
9644     task_rq_unlock(rq, p, &rf);
9645     free_task_load_ptrs(p);
9646 }
9647 #endif /* CONFIG_SCHED_WALT */
9648