// SPDX-License-Identifier: GPL-2.0
/*
 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
 * policies)
 */

#include <trace/hooks/sched.h>
#undef TRACE_INCLUDE_PATH

int sched_rr_timeslice = RR_TIMESLICE;
/* More than 4 hours if BW_SHIFT equals 20. */
static const u64 max_rt_runtime = MAX_BW;

/*
 * period over which we measure -rt task CPU usage in us.
 * default: 1s
 */
int sysctl_sched_rt_period = 1000000;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

#ifdef CONFIG_SYSCTL
static int sysctl_sched_rr_timeslice = (MSEC_PER_SEC * RR_TIMESLICE) / HZ;
static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
		size_t *lenp, loff_t *ppos);
static struct ctl_table sched_rt_sysctls[] = {
	{
		.procname       = "sched_rt_period_us",
		.data           = &sysctl_sched_rt_period,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rt_handler,
		.extra1         = SYSCTL_ONE,
		.extra2         = SYSCTL_INT_MAX,
	},
	{
		.procname       = "sched_rt_runtime_us",
		.data           = &sysctl_sched_rt_runtime,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rt_handler,
		.extra1         = SYSCTL_NEG_ONE,
		.extra2         = (void *)&sysctl_sched_rt_period,
	},
	{
		.procname       = "sched_rr_timeslice_ms",
		.data           = &sysctl_sched_rr_timeslice,
		.maxlen         = sizeof(int),
		.mode           = 0644,
		.proc_handler   = sched_rr_handler,
	},
};

static int __init sched_rt_sysctl_init(void)
{
	register_sysctl_init("kernel", sched_rt_sysctls);
	return 0;
}
late_initcall(sched_rt_sysctl_init);
#endif

void init_rt_rq(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array;
	int i;

	array = &rt_rq->active;
	for (i = 0; i < MAX_RT_PRIO; i++) {
		INIT_LIST_HEAD(array->queue + i);
		__clear_bit(i, array->bitmap);
	}
	/* delimiter for bitsearch: */
	__set_bit(MAX_RT_PRIO, array->bitmap);

#if defined CONFIG_SMP
	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->highest_prio.next = MAX_RT_PRIO-1;
	rt_rq->overloaded = 0;
	plist_head_init(&rt_rq->pushable_tasks);
#endif /* CONFIG_SMP */
	/* We start in dequeued state, because no RT tasks are queued */
	rt_rq->rt_queued = 0;

#ifdef CONFIG_RT_GROUP_SCHED
	rt_rq->rt_time = 0;
	rt_rq->rt_throttled = 0;
	rt_rq->rt_runtime = 0;
	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
#endif
}

#ifdef CONFIG_RT_GROUP_SCHED

static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);

static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
	struct rt_bandwidth *rt_b =
		container_of(timer, struct rt_bandwidth, rt_period_timer);
	int idle = 0;
	int overrun;

	raw_spin_lock(&rt_b->rt_runtime_lock);
	for (;;) {
		overrun = hrtimer_forward_now(timer, rt_b->rt_period);
		if (!overrun)
			break;

		raw_spin_unlock(&rt_b->rt_runtime_lock);
		idle = do_sched_rt_period_timer(rt_b, overrun);
		raw_spin_lock(&rt_b->rt_runtime_lock);
	}
	if (idle)
		rt_b->rt_period_active = 0;
	raw_spin_unlock(&rt_b->rt_runtime_lock);

	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
}

void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
{
	rt_b->rt_period = ns_to_ktime(period);
	rt_b->rt_runtime = runtime;

	raw_spin_lock_init(&rt_b->rt_runtime_lock);

	hrtimer_init(&rt_b->rt_period_timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_REL_HARD);
	rt_b->rt_period_timer.function = sched_rt_period_timer;
}

static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	raw_spin_lock(&rt_b->rt_runtime_lock);
	if (!rt_b->rt_period_active) {
		rt_b->rt_period_active = 1;
		/*
		 * SCHED_DEADLINE updates the bandwidth, as a runaway
		 * RT task with a DL task could hog a CPU. But DL does
		 * not reset the period. If a deadline task was running
		 * without an RT task running, it can cause RT tasks to
		 * throttle when they start up. Kick the timer right away
		 * to update the period.
		 */
		hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
		hrtimer_start_expires(&rt_b->rt_period_timer,
				      HRTIMER_MODE_ABS_PINNED_HARD);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
		return;

	do_start_rt_bandwidth(rt_b);
}

static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
{
	hrtimer_cancel(&rt_b->rt_period_timer);
}

#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_SCHED_DEBUG
	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
#endif
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return rt_rq->rq;
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	return rt_se->rt_rq;
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = rt_se->rt_rq;

	return rt_rq->rq;
}

void unregister_rt_sched_group(struct task_group *tg)
{
	if (tg->rt_se)
		destroy_rt_bandwidth(&tg->rt_bandwidth);
}

void free_rt_sched_group(struct task_group *tg)
{
	int i;

	for_each_possible_cpu(i) {
		if (tg->rt_rq)
			kfree(tg->rt_rq[i]);
		if (tg->rt_se)
			kfree(tg->rt_se[i]);
	}

	kfree(tg->rt_rq);
	kfree(tg->rt_se);
}

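/*
 * Attach a per-CPU rt_rq and its scheduling entity to task group @tg and
 * link the entity into its parent hierarchy.
 */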
void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
		struct sched_rt_entity *rt_se, int cpu,
		struct sched_rt_entity *parent)
{
	struct rq *rq = cpu_rq(cpu);

	rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	rt_rq->rt_nr_boosted = 0;
	rt_rq->rq = rq;
	rt_rq->tg = tg;

	tg->rt_rq[cpu] = rt_rq;
	tg->rt_se[cpu] = rt_se;

	if (!rt_se)
		return;

	if (!parent)
		rt_se->rt_rq = &rq->rt;
	else
		rt_se->rt_rq = parent->my_q;

	rt_se->my_q = rt_rq;
	rt_se->parent = parent;
	INIT_LIST_HEAD(&rt_se->run_list);
}

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	struct rt_rq *rt_rq;
	struct sched_rt_entity *rt_se;
	int i;

	tg->rt_rq = kcalloc(nr_cpu_ids, sizeof(rt_rq), GFP_KERNEL);
	if (!tg->rt_rq)
		goto err;
	tg->rt_se = kcalloc(nr_cpu_ids, sizeof(rt_se), GFP_KERNEL);
	if (!tg->rt_se)
		goto err;

	init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0);

	for_each_possible_cpu(i) {
		rt_rq = kzalloc_node(sizeof(struct rt_rq),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_rq)
			goto err;

		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
				     GFP_KERNEL, cpu_to_node(i));
		if (!rt_se)
			goto err_free_rq;

		init_rt_rq(rt_rq);
		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
	}

	return 1;

err_free_rq:
	kfree(rt_rq);
err:
	return 0;
}

#else /* CONFIG_RT_GROUP_SCHED */

#define rt_entity_is_task(rt_se) (1)

static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
{
	return container_of(rt_se, struct task_struct, rt);
}

static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
	return container_of(rt_rq, struct rq, rt);
}

static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
{
	struct task_struct *p = rt_task_of(rt_se);

	return task_rq(p);
}

static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	return &rq->rt;
}

void unregister_rt_sched_group(struct task_group *tg) { }

void free_rt_sched_group(struct task_group *tg) { }

int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
	return 1;
}
#endif /* CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_SMP

static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
{
	/* Try to pull RT tasks here if we lower this rq's prio */
	return rq->online && rq->rt.highest_prio.curr > prev->prio;
}

static inline int rt_overloaded(struct rq *rq)
{
	return atomic_read(&rq->rd->rto_count);
}

static inline void rt_set_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
	/*
	 * Make sure the mask is visible before we set
	 * the overload count. That is checked to determine
	 * if we should look at the mask. It would be a shame
	 * if we looked at the mask, but the mask was not
	 * updated yet.
	 *
	 * Matched by the barrier in pull_rt_task().
	 */
	smp_wmb();
	atomic_inc(&rq->rd->rto_count);
}

static inline void rt_clear_overload(struct rq *rq)
{
	if (!rq->online)
		return;

	/* the order here really doesn't matter */
	atomic_dec(&rq->rd->rto_count);
	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
}

static inline int has_pushable_tasks(struct rq *rq)
{
	return !plist_head_empty(&rq->rt.pushable_tasks);
}

static DEFINE_PER_CPU(struct balance_callback, rt_push_head);
static DEFINE_PER_CPU(struct balance_callback, rt_pull_head);

static void push_rt_tasks(struct rq *);
static void pull_rt_task(struct rq *);

static inline void rt_queue_push_tasks(struct rq *rq)
{
	if (!has_pushable_tasks(rq))
		return;

	queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
}

static inline void rt_queue_pull_task(struct rq *rq)
{
	queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
}

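/*
 * Track @p on this runqueue's pushable list and flag the runqueue as
 * RT-overloaded so that other CPUs may pull work from it.
 */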
static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
	plist_node_init(&p->pushable_tasks, p->prio);
	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the highest prio pushable task */
	if (p->prio < rq->rt.highest_prio.next)
		rq->rt.highest_prio.next = p->prio;

	if (!rq->rt.overloaded) {
		rt_set_overload(rq);
		rq->rt.overloaded = 1;
	}
}

static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);

	/* Update the new highest prio pushable task */
	if (has_pushable_tasks(rq)) {
		p = plist_first_entry(&rq->rt.pushable_tasks,
				      struct task_struct, pushable_tasks);
		rq->rt.highest_prio.next = p->prio;
	} else {
		rq->rt.highest_prio.next = MAX_RT_PRIO-1;

		if (rq->rt.overloaded) {
			rt_clear_overload(rq);
			rq->rt.overloaded = 0;
		}
	}
}

#else

static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
{
}

static inline void rt_queue_push_tasks(struct rq *rq)
{
}
#endif /* CONFIG_SMP */

static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
static void dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count);

static inline int on_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->on_rq;
}

#ifdef CONFIG_UCLAMP_TASK
/*
 * Verify the fitness of task @p to run on @cpu taking into account the uclamp
 * settings.
 *
 * This check is only important for heterogeneous systems where the uclamp_min
 * value is higher than the capacity of a @cpu. For non-heterogeneous systems
 * this function will always return true.
 *
 * The function will return true if the capacity of the @cpu is >= the
 * uclamp_min and false otherwise.
 *
 * Note that uclamp_min will be clamped to uclamp_max if uclamp_min
 * > uclamp_max.
 */
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	unsigned int min_cap;
	unsigned int max_cap;
	unsigned int cpu_cap;

	/* Only heterogeneous systems can benefit from this check */
	if (!sched_asym_cpucap_active())
		return true;

	min_cap = uclamp_eff_value(p, UCLAMP_MIN);
	max_cap = uclamp_eff_value(p, UCLAMP_MAX);

	cpu_cap = arch_scale_cpu_capacity(cpu);

	return cpu_cap >= min(min_cap, max_cap);
}
#else
static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu)
{
	return true;
}
#endif

#ifdef CONFIG_RT_GROUP_SCHED

static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
{
	if (!rt_rq->tg)
		return RUNTIME_INF;

	return rt_rq->rt_runtime;
}

static inline u64 sched_rt_period(struct rt_rq *rt_rq)
{
	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}

typedef struct task_group *rt_rq_iter_t;

static inline struct task_group *next_task_group(struct task_group *tg)
{
	do {
		tg = list_entry_rcu(tg->list.next,
			typeof(struct task_group), list);
	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));

	if (&tg->list == &task_groups)
		tg = NULL;

	return tg;
}

#define for_each_rt_rq(rt_rq, iter, rq)					\
	for (iter = container_of(&task_groups, typeof(*iter), list);	\
		(iter = next_task_group(iter)) &&			\
		(rt_rq = iter->rt_rq[cpu_of(rq)]);)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = rt_se->parent)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return rt_se->my_q;
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);

static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct task_struct *donor = rq_of_rt_rq(rt_rq)->donor;
	struct rq *rq = rq_of_rt_rq(rt_rq);
	struct sched_rt_entity *rt_se;

	int cpu = cpu_of(rq);

	rt_se = rt_rq->tg->rt_se[cpu];

	if (rt_rq->rt_nr_running) {
		if (!rt_se)
			enqueue_top_rt_rq(rt_rq);
		else if (!on_rt_rq(rt_se))
			enqueue_rt_entity(rt_se, 0);

		if (rt_rq->highest_prio.curr < donor->prio)
			resched_curr(rq);
	}
}

static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	struct sched_rt_entity *rt_se;
	int cpu = cpu_of(rq_of_rt_rq(rt_rq));

	rt_se = rt_rq->tg->rt_se[cpu];

	if (!rt_se) {
		dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
		/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
		cpufreq_update_util(rq_of_rt_rq(rt_rq), 0);
	}
	else if (on_rt_rq(rt_se))
		dequeue_rt_entity(rt_se, 0);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
}

static int rt_se_boosted(struct sched_rt_entity *rt_se)
{
	struct rt_rq *rt_rq = group_rt_rq(rt_se);
	struct task_struct *p;

	if (rt_rq)
		return !!rt_rq->rt_nr_boosted;

	p = rt_task_of(rt_se);
	return p->prio != p->normal_prio;
}

#ifdef CONFIG_SMP
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return this_rq()->rd->span;
}
#else
static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}
#endif

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
}

static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
{
	return &rt_rq->tg->rt_bandwidth;
}

bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

	return (hrtimer_active(&rt_b->rt_period_timer) ||
		rt_rq->rt_time < rt_b->rt_runtime);
}

#ifdef CONFIG_SMP
/*
 * We ran out of runtime, see if we can borrow some from our neighbours.
 */
static void do_balance_runtime(struct rt_rq *rt_rq)
{
	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
	int i, weight;
	u64 rt_period;

	weight = cpumask_weight(rd->span);

	raw_spin_lock(&rt_b->rt_runtime_lock);
	rt_period = ktime_to_ns(rt_b->rt_period);
	for_each_cpu(i, rd->span) {
		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
		s64 diff;

		if (iter == rt_rq)
			continue;

		raw_spin_lock(&iter->rt_runtime_lock);
		/*
		 * Either all rqs have inf runtime and there's nothing to steal
		 * or __disable_runtime() below sets a specific rq to inf to
		 * indicate it's been disabled and disallow stealing.
		 */
		if (iter->rt_runtime == RUNTIME_INF)
			goto next;

		/*
		 * From runqueues with spare time, take 1/n part of their
		 * spare time, but no more than our period.
		 */
		diff = iter->rt_runtime - iter->rt_time;
		if (diff > 0) {
			diff = div_u64((u64)diff, weight);
			if (rt_rq->rt_runtime + diff > rt_period)
				diff = rt_period - rt_rq->rt_runtime;
			iter->rt_runtime -= diff;
			rt_rq->rt_runtime += diff;
			if (rt_rq->rt_runtime == rt_period) {
				raw_spin_unlock(&iter->rt_runtime_lock);
				break;
			}
		}
next:
		raw_spin_unlock(&iter->rt_runtime_lock);
	}
	raw_spin_unlock(&rt_b->rt_runtime_lock);
}

/*
 * Ensure this RQ takes back all the runtime it lent to its neighbours.
 */
static void __disable_runtime(struct rq *rq)
{
	struct root_domain *rd = rq->rd;
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
		s64 want;
		int i;

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * Either we're all inf and nobody needs to borrow, or we're
		 * already disabled and thus have nothing to do, or we have
		 * exactly the right amount of runtime to take out.
		 */
		if (rt_rq->rt_runtime == RUNTIME_INF ||
				rt_rq->rt_runtime == rt_b->rt_runtime)
			goto balanced;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);

		/*
		 * Calculate the difference between what we started out with
		 * and what we currently have; that's the amount of runtime
		 * we lent out and now have to reclaim.
		 */
		want = rt_b->rt_runtime - rt_rq->rt_runtime;

		/*
		 * Greedy reclaim, take back as much as we can.
		 */
		for_each_cpu(i, rd->span) {
			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
			s64 diff;

			/*
			 * Can't reclaim from ourselves or disabled runqueues.
			 */
			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
				continue;

			raw_spin_lock(&iter->rt_runtime_lock);
			if (want > 0) {
				diff = min_t(s64, iter->rt_runtime, want);
				iter->rt_runtime -= diff;
				want -= diff;
			} else {
				iter->rt_runtime -= want;
				want -= want;
			}
			raw_spin_unlock(&iter->rt_runtime_lock);

			if (!want)
				break;
		}

		raw_spin_lock(&rt_rq->rt_runtime_lock);
		/*
		 * We cannot be left wanting - that would mean some runtime
		 * leaked out of the system.
		 */
		WARN_ON_ONCE(want);
balanced:
		/*
		 * Disable all the borrow logic by pretending we have inf
		 * runtime - in which case borrowing doesn't make sense.
		 */
		rt_rq->rt_runtime = RUNTIME_INF;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);

		/* Make rt_rq available for pick_next_task() */
		sched_rt_rq_enqueue(rt_rq);
	}
}

static void __enable_runtime(struct rq *rq)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	if (unlikely(!scheduler_running))
		return;

	/*
	 * Reset each runqueue's bandwidth settings
	 */
	for_each_rt_rq(rt_rq, iter, rq) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		raw_spin_lock(&rt_b->rt_runtime_lock);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		rt_rq->rt_runtime = rt_b->rt_runtime;
		rt_rq->rt_time = 0;
		rt_rq->rt_throttled = 0;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		raw_spin_unlock(&rt_b->rt_runtime_lock);
	}
}

static void balance_runtime(struct rt_rq *rt_rq)
{
	if (!sched_feat(RT_RUNTIME_SHARE))
		return;

	if (rt_rq->rt_time > rt_rq->rt_runtime) {
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		do_balance_runtime(rt_rq);
		raw_spin_lock(&rt_rq->rt_runtime_lock);
	}
}
#else /* !CONFIG_SMP */
static inline void balance_runtime(struct rt_rq *rt_rq) {}
#endif /* CONFIG_SMP */

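/*
 * Run from the bandwidth period timer: replenish rt_time on every rt_rq
 * covered by @rt_b and re-enqueue queues that are no longer throttled.
 * Returns 1 when all queues are idle and the timer may stop.
 */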
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
{
	int i, idle = 1, throttled = 0;
	const struct cpumask *span;

	span = sched_rt_period_mask();

	/*
	 * FIXME: isolated CPUs should really leave the root task group,
	 * whether they are isolcpus or were isolated via cpusets, lest
	 * the timer run on a CPU which does not service all runqueues,
	 * potentially leaving other CPUs indefinitely throttled. If
	 * isolation is really required, the user will turn the throttle
	 * off to kill the perturbations it causes anyway. Meanwhile,
	 * this maintains functionality for boot and/or troubleshooting.
	 */
	if (rt_b == &root_task_group.rt_bandwidth)
		span = cpu_online_mask;

	for_each_cpu(i, span) {
		int enqueue = 0;
		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
		struct rq *rq = rq_of_rt_rq(rt_rq);
		struct rq_flags rf;
		int skip;

		/*
		 * When span == cpu_online_mask, taking each rq->lock
		 * can be time-consuming. Try to avoid it when possible.
		 */
		raw_spin_lock(&rt_rq->rt_runtime_lock);
		if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
			rt_rq->rt_runtime = rt_b->rt_runtime;
		skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
		raw_spin_unlock(&rt_rq->rt_runtime_lock);
		if (skip)
			continue;

		rq_lock(rq, &rf);
		update_rq_clock(rq);

		if (rt_rq->rt_time) {
			u64 runtime;

			raw_spin_lock(&rt_rq->rt_runtime_lock);
			if (rt_rq->rt_throttled)
				balance_runtime(rt_rq);
			runtime = rt_rq->rt_runtime;
			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
				rt_rq->rt_throttled = 0;
				enqueue = 1;

				/*
				 * When we're idle and a woken (rt) task is
				 * throttled wakeup_preempt() will set
				 * skip_update and the time between the wakeup
				 * and this unthrottle will get accounted as
				 * 'runtime'.
				 */
				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
					rq_clock_cancel_skipupdate(rq);
			}
			if (rt_rq->rt_time || rt_rq->rt_nr_running)
				idle = 0;
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
		} else if (rt_rq->rt_nr_running) {
			idle = 0;
			if (!rt_rq_throttled(rt_rq))
				enqueue = 1;
		}
		if (rt_rq->rt_throttled)
			throttled = 1;

		if (enqueue)
			sched_rt_rq_enqueue(rt_rq);
		rq_unlock(rq, &rf);
	}

	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
		return 1;

	return idle;
}

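/*
 * Check whether @rt_rq has consumed its runtime for the current period.
 * If so, mark it throttled and dequeue it; returns 1 when the queue is
 * throttled and the current task must stop running.
 */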
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
	u64 runtime = sched_rt_runtime(rt_rq);

	if (rt_rq->rt_throttled)
		return rt_rq_throttled(rt_rq);

	if (runtime >= sched_rt_period(rt_rq))
		return 0;

	balance_runtime(rt_rq);
	runtime = sched_rt_runtime(rt_rq);
	if (runtime == RUNTIME_INF)
		return 0;

	if (rt_rq->rt_time > runtime) {
		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

		/*
		 * Don't actually throttle groups that have no runtime assigned
		 * but accrue some time due to boosting.
		 */
		if (likely(rt_b->rt_runtime)) {
			rt_rq->rt_throttled = 1;
			printk_deferred_once("sched: RT throttling activated\n");

			trace_android_vh_dump_throttled_rt_tasks(
				raw_smp_processor_id(),
				rq_clock(rq_of_rt_rq(rt_rq)),
				sched_rt_period(rt_rq),
				runtime,
				hrtimer_get_expires_ns(&rt_b->rt_period_timer));
		} else {
			/*
			 * In case we did anyway, make it go away,
			 * replenishment is a joke, since it will replenish us
			 * with exactly 0 ns.
			 */
			rt_rq->rt_time = 0;
		}

		if (rt_rq_throttled(rt_rq)) {
			sched_rt_rq_dequeue(rt_rq);
			return 1;
		}
	}

	return 0;
}

#else /* !CONFIG_RT_GROUP_SCHED */

typedef struct rt_rq *rt_rq_iter_t;

#define for_each_rt_rq(rt_rq, iter, rq) \
	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)

#define for_each_sched_rt_entity(rt_se) \
	for (; rt_se; rt_se = NULL)

static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
{
	return NULL;
}

static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	if (!rt_rq->rt_nr_running)
		return;

	enqueue_top_rt_rq(rt_rq);
	resched_curr(rq);
}

static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
{
	dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running);
}

static inline int rt_rq_throttled(struct rt_rq *rt_rq)
{
	return false;
}

static inline const struct cpumask *sched_rt_period_mask(void)
{
	return cpu_online_mask;
}

static inline
struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
{
	return &cpu_rq(cpu)->rt;
}

#ifdef CONFIG_SMP
static void __enable_runtime(struct rq *rq) { }
static void __disable_runtime(struct rq *rq) { }
#endif

#endif /* CONFIG_RT_GROUP_SCHED */

static inline int rt_se_prio(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	struct rt_rq *rt_rq = group_rt_rq(rt_se);

	if (rt_rq)
		return rt_rq->highest_prio.curr;
#endif

	return rt_task_of(rt_se)->prio;
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static void update_curr_rt(struct rq *rq)
{
	struct task_struct *donor = rq->donor;
	s64 delta_exec;

	if (donor->sched_class != &rt_sched_class)
		return;

	delta_exec = update_curr_common(rq);
	if (unlikely(delta_exec <= 0))
		return;

#ifdef CONFIG_RT_GROUP_SCHED
	struct sched_rt_entity *rt_se = &donor->rt;

	if (!rt_bandwidth_enabled())
		return;

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
		int exceeded;

		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
			raw_spin_lock(&rt_rq->rt_runtime_lock);
			rt_rq->rt_time += delta_exec;
			exceeded = sched_rt_runtime_exceeded(rt_rq);
			if (exceeded)
				resched_curr(rq);
			raw_spin_unlock(&rt_rq->rt_runtime_lock);
			if (exceeded)
				do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
		}
	}
#endif
}

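/*
 * The top-level rt_rq is what feeds rq->nr_running: dequeue_top_rt_rq()
 * removes its contribution (e.g. when the whole class gets throttled),
 * enqueue_top_rt_rq() adds it back once the queue has runnable tasks again.
 */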
static void
dequeue_top_rt_rq(struct rt_rq *rt_rq, unsigned int count)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (!rt_rq->rt_queued)
		return;

	BUG_ON(!rq->nr_running);

	sub_nr_running(rq, count);
	rt_rq->rt_queued = 0;

}

static void
enqueue_top_rt_rq(struct rt_rq *rt_rq)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

	BUG_ON(&rq->rt != rt_rq);

	if (rt_rq->rt_queued)
		return;

	if (rt_rq_throttled(rt_rq))
		return;

	if (rt_rq->rt_nr_running) {
		add_nr_running(rq, rt_rq->rt_nr_running);
		rt_rq->rt_queued = 1;
	}

	/* Kick cpufreq (see the comment in kernel/sched/sched.h). */
	cpufreq_update_util(rq, 0);
}

#if defined CONFIG_SMP

static void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && prio < prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
}

static void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
	struct rq *rq = rq_of_rt_rq(rt_rq);

#ifdef CONFIG_RT_GROUP_SCHED
	/*
	 * Change rq's cpupri only if rt_rq is the top queue.
	 */
	if (&rq->rt != rt_rq)
		return;
#endif
	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
}

#else /* CONFIG_SMP */

static inline
void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
static inline
void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}

#endif /* CONFIG_SMP */

#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
static void
inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (prio < prev_prio)
		rt_rq->highest_prio.curr = prio;

	inc_rt_prio_smp(rt_rq, prio, prev_prio);
}

static void
dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
	int prev_prio = rt_rq->highest_prio.curr;

	if (rt_rq->rt_nr_running) {

		WARN_ON(prio < prev_prio);

		/*
		 * This may have been our highest task, and therefore
		 * we may have some re-computation to do
		 */
		if (prio == prev_prio) {
			struct rt_prio_array *array = &rt_rq->active;

			rt_rq->highest_prio.curr =
				sched_find_first_bit(array->bitmap);
		}

	} else {
		rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
	}

	dec_rt_prio_smp(rt_rq, prio, prev_prio);
}

#else

static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}

#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */

#ifdef CONFIG_RT_GROUP_SCHED

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted++;

	if (rt_rq->tg)
		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
}

static void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	if (rt_se_boosted(rt_se))
		rt_rq->rt_nr_boosted--;

	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
}

#else /* CONFIG_RT_GROUP_SCHED */

static void
inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}

static inline
void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}

#endif /* CONFIG_RT_GROUP_SCHED */

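/*
 * Number of runnable tasks an entity represents: the whole group's count
 * for a group entity, or 1 for a plain task.
 */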
static inline
unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);

	if (group_rq)
		return group_rq->rt_nr_running;
	else
		return 1;
}

static inline
unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
{
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct task_struct *tsk;

	if (group_rq)
		return group_rq->rr_nr_running;

	tsk = rt_task_of(rt_se);

	return (tsk->policy == SCHED_RR) ? 1 : 0;
}

static inline
void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	int prio = rt_se_prio(rt_se);

	WARN_ON(!rt_prio(prio));
	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);

	inc_rt_prio(rt_rq, prio);
	inc_rt_group(rt_se, rt_rq);
}

static inline
void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
	WARN_ON(!rt_rq->rt_nr_running);
	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
	rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);

	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
	dec_rt_group(rt_se, rt_rq);
}

/*
 * Change rt_se->run_list location unless SAVE && !MOVE
 *
 * assumes ENQUEUE/DEQUEUE flags match
 */
static inline bool move_entity(unsigned int flags)
{
	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
		return false;

	return true;
}

static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
{
	list_del_init(&rt_se->run_list);

	if (list_empty(array->queue + rt_se_prio(rt_se)))
		__clear_bit(rt_se_prio(rt_se), array->bitmap);

	rt_se->on_list = 0;
}

static inline struct sched_statistics *
__schedstats_from_rt_se(struct sched_rt_entity *rt_se)
{
#ifdef CONFIG_RT_GROUP_SCHED
	/* schedstats is not supported for rt group. */
	if (!rt_entity_is_task(rt_se))
		return NULL;
#endif

	return &rt_task_of(rt_se)->stats;
}

static inline void
update_stats_wait_start_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_start(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_sleeper_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_enqueue_sleeper(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_enqueue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	if (!schedstat_enabled())
		return;

	if (flags & ENQUEUE_WAKEUP)
		update_stats_enqueue_sleeper_rt(rt_rq, rt_se);
}

static inline void
update_stats_wait_end_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
{
	struct sched_statistics *stats;
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	stats = __schedstats_from_rt_se(rt_se);
	if (!stats)
		return;

	__update_stats_wait_end(rq_of_rt_rq(rt_rq), p, stats);
}

static inline void
update_stats_dequeue_rt(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
			int flags)
{
	struct task_struct *p = NULL;

	if (!schedstat_enabled())
		return;

	if (rt_entity_is_task(rt_se))
		p = rt_task_of(rt_se);

	if ((flags & DEQUEUE_SLEEP) && p) {
		unsigned int state;

		state = READ_ONCE(p->__state);
		if (state & TASK_INTERRUPTIBLE)
			__schedstat_set(p->stats.sleep_start,
					rq_clock(rq_of_rt_rq(rt_rq)));

		if (state & TASK_UNINTERRUPTIBLE)
			__schedstat_set(p->stats.block_start,
					rq_clock(rq_of_rt_rq(rt_rq)));
	}
}

static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;
	struct rt_rq *group_rq = group_rt_rq(rt_se);
	struct list_head *queue = array->queue + rt_se_prio(rt_se);

	/*
	 * Don't enqueue the group if it's throttled, or when empty.
	 * The latter is a consequence of the former when a child group
	 * gets throttled and the current group doesn't have any other
	 * active members.
	 */
	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
		if (rt_se->on_list)
			__delist_rt_entity(rt_se, array);
		return;
	}

	if (move_entity(flags)) {
		WARN_ON_ONCE(rt_se->on_list);
		if (flags & ENQUEUE_HEAD)
			list_add(&rt_se->run_list, queue);
		else
			list_add_tail(&rt_se->run_list, queue);

		__set_bit(rt_se_prio(rt_se), array->bitmap);
		rt_se->on_list = 1;
	}
	rt_se->on_rq = 1;

	inc_rt_tasks(rt_se, rt_rq);
}

static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
	struct rt_prio_array *array = &rt_rq->active;

	if (move_entity(flags)) {
		WARN_ON_ONCE(!rt_se->on_list);
		__delist_rt_entity(rt_se, array);
	}
	rt_se->on_rq = 0;

	dec_rt_tasks(rt_se, rt_rq);
}

/*
 * Because the prio of an upper entry depends on the lower
 * entries, we must remove entries top-down.
 */
static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct sched_rt_entity *back = NULL;
	unsigned int rt_nr_running;

	for_each_sched_rt_entity(rt_se) {
		rt_se->back = back;
		back = rt_se;
	}

	rt_nr_running = rt_rq_of_se(back)->rt_nr_running;

	for (rt_se = back; rt_se; rt_se = rt_se->back) {
		if (on_rt_rq(rt_se))
			__dequeue_rt_entity(rt_se, flags);
	}

	dequeue_top_rt_rq(rt_rq_of_se(back), rt_nr_running);
}

static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_enqueue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);
	for_each_sched_rt_entity(rt_se)
		__enqueue_rt_entity(rt_se, flags);
	enqueue_top_rt_rq(&rq->rt);
}

static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
{
	struct rq *rq = rq_of_rt_se(rt_se);

	update_stats_dequeue_rt(rt_rq_of_se(rt_se), rt_se, flags);

	dequeue_rt_stack(rt_se, flags);

	for_each_sched_rt_entity(rt_se) {
		struct rt_rq *rt_rq = group_rt_rq(rt_se);

		if (rt_rq && rt_rq->rt_nr_running)
			__enqueue_rt_entity(rt_se, flags);
	}
	enqueue_top_rt_rq(&rq->rt);
}

#ifdef CONFIG_SMP
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
					bool sync)
{
	/*
	 * If the waker is CFS, then an RT sync wakeup would preempt the waker
	 * and force it to run for a likely small time after the RT wakee is
	 * done. So, only honor RT sync wakeups from RT wakers.
	 */
	return sync && task_has_rt_policy(rq->curr) &&
		p->prio <= rq->rt.highest_prio.next &&
		rq->rt.rt_nr_running <= 2;
}
#else
static inline bool should_honor_rt_sync(struct rq *rq, struct task_struct *p,
					bool sync)
{
	return 0;
}
#endif

/*
 * Adding/removing a task to/from a priority array:
 */
static void
enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;
	bool sync = !!(flags & ENQUEUE_WAKEUP_SYNC);

	if (flags & ENQUEUE_WAKEUP)
		rt_se->timeout = 0;

	check_schedstat_required();
	update_stats_wait_start_rt(rt_rq_of_se(rt_se), rt_se);

	enqueue_rt_entity(rt_se, flags);

	if (should_honor_rt_sync(rq, p, sync))
		return;

	if (task_is_blocked(p))
		return;

	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);
	dequeue_rt_entity(rt_se, flags);

	dequeue_pushable_task(rq, p);

	return true;
}

/*
 * Put task to the head or the end of the run list without the overhead of
 * dequeue followed by enqueue.
 */
static void
requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
{
	if (on_rt_rq(rt_se)) {
		struct rt_prio_array *array = &rt_rq->active;
		struct list_head *queue = array->queue + rt_se_prio(rt_se);

		if (head)
			list_move(&rt_se->run_list, queue);
		else
			list_move_tail(&rt_se->run_list, queue);
	}
}

static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq;

	for_each_sched_rt_entity(rt_se) {
		rt_rq = rt_rq_of_se(rt_se);
		requeue_rt_entity(rt_rq, rt_se, head);
	}
}

static void yield_task_rt(struct rq *rq)
{
	requeue_task_rt(rq, rq->curr, 0);
}

#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *sched_ctx, struct task_struct *exec_ctx);

#ifdef CONFIG_RT_SOFTIRQ_AWARE_SCHED
/*
 * Return whether the given cpu is currently non-preemptible
 * while handling a potentially long softirq, or if the current
 * task is likely to block preemptions soon because it is a
 * ksoftirq thread that is handling softirqs.
 */
bool cpu_busy_with_softirqs(int cpu)
{
	u32 softirqs = per_cpu(active_softirqs, cpu) |
		       __cpu_softirq_pending(cpu);

	return softirqs & LONG_SOFTIRQ_MASK;
}
EXPORT_SYMBOL_GPL(cpu_busy_with_softirqs);
#endif /* CONFIG_RT_SOFTIRQ_AWARE_SCHED */

static bool rt_task_fits_cpu(struct task_struct *p, int cpu)
{
	return rt_task_fits_capacity(p, cpu) && !cpu_busy_with_softirqs(cpu);
}

static int
select_task_rq_rt(struct task_struct *p, int cpu, int flags)
{
	struct task_struct *curr, *donor;
	struct rq *rq;
	struct rq *this_cpu_rq;
	bool test;
	int target_cpu = -1;
	bool sync = !!(flags & WF_SYNC);
	int this_cpu;

	trace_android_rvh_select_task_rq_rt(p, cpu, flags & 0xF,
					    flags, &target_cpu);
	if (target_cpu >= 0)
		return target_cpu;

	/* For anything but wake ups, just return the task_cpu */
	if (!(flags & (WF_TTWU | WF_FORK)))
		goto out;

	rq = cpu_rq(cpu);

	rcu_read_lock();
	curr = READ_ONCE(rq->curr); /* unlocked access */
	donor = READ_ONCE(rq->donor);
	this_cpu = smp_processor_id();
	this_cpu_rq = cpu_rq(this_cpu);

	/*
	 * If the current task on @p's runqueue is an RT task, then
	 * try to see if we can wake this RT task up on another
	 * runqueue. Otherwise simply start this RT task
	 * on its current runqueue.
	 *
	 * We want to avoid overloading runqueues. If the woken
	 * task is a higher priority, then it will stay on this CPU
	 * and the lower prio task should be moved to another CPU.
	 * Even though this will probably make the lower prio task
	 * lose its cache, we do not want to bounce a higher task
	 * around just because it gave up its CPU, perhaps for a
	 * lock?
	 *
	 * For equal prio tasks, we just let the scheduler sort it out.
	 *
	 * Otherwise, just let it ride on the affine RQ and the
	 * post-schedule router will push the preempted task away.
	 *
	 * This test is optimistic, if we get it wrong the load-balancer
	 * will have to sort it out.
	 *
	 * We use rt_task_fits_cpu() to evaluate if the CPU is busy with
	 * potentially long-running softirq work, as well as take into
	 * account the capacity of the CPU to ensure it fits the
	 * requirement of the task - which is only important on
	 * heterogeneous systems like big.LITTLE.
	 */
	test = curr &&
	       unlikely(rt_task(donor)) &&
	       (curr->nr_cpus_allowed < 2 || donor->prio <= p->prio);

	/*
	 * Respect the sync flag as long as the task can run on this CPU.
	 */
	if (should_honor_rt_sync(this_cpu_rq, p, sync) &&
	    cpumask_test_cpu(this_cpu, p->cpus_ptr)) {
		cpu = this_cpu;
		goto out_unlock;
	}

	if (test || !rt_task_fits_cpu(p, cpu)) {
		int target = find_lowest_rq(p, p);

		/*
		 * Bail out if we were forcing a migration to find a better
		 * fitting CPU but our search failed.
		 */
		if (!test && target != -1 && !rt_task_fits_cpu(p, target))
			goto out_unlock;

		/*
		 * Don't bother moving it if the destination CPU is
		 * not running a lower priority task.
		 */
		if (target != -1 &&
		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
			cpu = target;
	}

out_unlock:
	rcu_read_unlock();

out:
	return cpu;
}

static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
	struct task_struct *exec_ctx = p;
	/*
	 * Current can't be migrated, useless to reschedule,
	 * let's hope p can move out.
	 */
	if (rq->curr->nr_cpus_allowed == 1 ||
	    !cpupri_find(&rq->rd->cpupri, rq->donor, rq->curr, NULL))
		return;

	/* No reason to preempt since rq->curr wouldn't change anyway */
	exec_ctx = find_exec_ctx(rq, p);
	if (task_current(rq, exec_ctx))
		return;

	/*
	 * p is migratable, so let's not schedule it and
	 * see if it is pushed or pulled somewhere else.
	 */
	if (p->nr_cpus_allowed != 1 &&
	    cpupri_find(&rq->rd->cpupri, p, exec_ctx, NULL))
		return;

	/*
	 * There appear to be other CPUs that can accept
	 * the current task but none can run 'p', so let's reschedule
	 * to try and push the current task away:
	 */
	requeue_task_rt(rq, p, 1);
	resched_curr(rq);
}

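/*
 * Balance callback for the RT class: if @p is leaving this runqueue with
 * higher priority than the RT work that remains, try to pull RT tasks from
 * other CPUs before the pick. Returns whether a stop, deadline or RT task
 * is runnable here.
 */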
static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf)
{
	if (!on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
		int done = 0;

		/*
		 * This is OK, because current is on_cpu, which avoids it being
		 * picked for load-balance and preemption/IRQs are still
		 * disabled avoiding further scheduler activity on it and we've
		 * not yet started the picking loop.
		 */
		rq_unpin_lock(rq, rf);
		trace_android_rvh_sched_balance_rt(rq, p, &done);
		if (!done)
			pull_rt_task(rq);
		rq_repin_lock(rq, rf);
	}

	return sched_stop_runnable(rq) || sched_dl_runnable(rq) || sched_rt_runnable(rq);
}
#endif /* CONFIG_SMP */

/*
 * Preempt the current task with a newly woken task if needed:
 */
static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags)
{
	struct task_struct *donor = rq->donor;

	if (p->prio < donor->prio) {
		resched_curr(rq);
		return;
	}

#ifdef CONFIG_SMP
	/*
	 * If:
	 *
	 * - the newly woken task is of equal priority to the current task
	 * - the newly woken task is non-migratable while current is migratable
	 * - current will be preempted on the next reschedule
	 *
	 * we should check to see if current can readily move to a different
	 * cpu. If so, we will reschedule to allow the push logic to try
	 * to move current somewhere else, making room for our non-migratable
	 * task.
	 */
	if (p->prio == donor->prio && !test_tsk_need_resched(rq->curr))
		check_preempt_equal_prio(rq, p);
#endif
}

static inline void set_next_task_rt(struct rq *rq, struct task_struct *p, bool first)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	p->se.exec_start = rq_clock_task(rq);
	if (on_rt_rq(&p->rt))
		update_stats_wait_end_rt(rt_rq, rt_se);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);

	if (!first)
		return;

	/*
	 * If the prev task was rt, put_prev_task() has already updated the
	 * utilization. We only care about the case where we start to schedule
	 * an rt task.
	 */
	if (rq->donor->sched_class != &rt_sched_class)
		update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
	trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 0);

	rt_queue_push_tasks(rq);
}

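/*
 * Pick the first entity queued at the highest set priority of @rt_rq.
 */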
static struct sched_rt_entity *pick_next_rt_entity(struct rt_rq *rt_rq)
{
	struct rt_prio_array *array = &rt_rq->active;
	struct sched_rt_entity *next = NULL;
	struct list_head *queue;
	int idx;

	idx = sched_find_first_bit(array->bitmap);
	BUG_ON(idx >= MAX_RT_PRIO);

	queue = array->queue + idx;
	if (SCHED_WARN_ON(list_empty(queue)))
		return NULL;
	next = list_entry(queue->next, struct sched_rt_entity, run_list);

	return next;
}

static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
	struct sched_rt_entity *rt_se;
	struct rt_rq *rt_rq = &rq->rt;

	do {
		rt_se = pick_next_rt_entity(rt_rq);
		if (unlikely(!rt_se))
			return NULL;
		rt_rq = group_rt_rq(rt_se);
	} while (rt_rq);

	return rt_task_of(rt_se);
}

static struct task_struct *pick_task_rt(struct rq *rq)
{
	struct task_struct *p;

	if (!sched_rt_runnable(rq))
		return NULL;

	p = _pick_next_task_rt(rq);

	return p;
}

static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next)
{
	struct sched_rt_entity *rt_se = &p->rt;
	struct rt_rq *rt_rq = &rq->rt;

	if (on_rt_rq(&p->rt))
		update_stats_wait_start_rt(rt_rq, rt_se);

	update_curr_rt(rq);

	update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
	trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 1);

	if (task_is_blocked(p))
		return;
	/*
	 * The previous task needs to be made eligible for pushing
	 * if it is still active
	 */
	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
		enqueue_pushable_task(rq, p);
}

#ifdef CONFIG_SMP

/* Only try algorithms three times */
#define RT_MAX_TRIES 3

/*
 * Return the highest pushable rq's task, which is suitable to be executed
 * on the CPU, NULL otherwise
 */
struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
{
	struct plist_head *head = &rq->rt.pushable_tasks;
	struct task_struct *p;

	if (!has_pushable_tasks(rq))
		return NULL;

	plist_for_each_entry(p, head, pushable_tasks) {
		if (task_is_pushable(rq, p, cpu) == 1)
			return p;
	}

	return NULL;
}
EXPORT_SYMBOL_GPL(pick_highest_pushable_task);

static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);

1898 static int find_lowest_rq(struct task_struct *sched_ctx, struct task_struct *exec_ctx)
1899 {
1900 struct sched_domain *sd;
1901 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1902 int this_cpu = smp_processor_id();
1903 int cpu = -1;
1904 int ret;
1905
1906 /* Make sure the mask is initialized first */
1907 if (unlikely(!lowest_mask))
1908 return -1;
1909
1910 if (exec_ctx && exec_ctx->nr_cpus_allowed == 1)
1911 return -1; /* No other targets possible */
1912
1913 /*
1914 * If we're using the softirq optimization or if we are
1915 * on an asymmetric-capacity system, ensure we consider the
1916 * softirq processing or the different CPU capacities when
1917 * searching for the lowest_mask.
1918 */
1919 if (IS_ENABLED(CONFIG_RT_SOFTIRQ_AWARE_SCHED) ||
1920 sched_asym_cpucap_active()) {
1921
1922 ret = cpupri_find_fitness(&task_rq(sched_ctx)->rd->cpupri,
1923 sched_ctx, exec_ctx, lowest_mask,
1924 rt_task_fits_cpu);
1925 } else {
1926
1927 ret = cpupri_find(&task_rq(sched_ctx)->rd->cpupri,
1928 sched_ctx, exec_ctx, lowest_mask);
1929 }
1930
1931 trace_android_rvh_find_lowest_rq(sched_ctx, exec_ctx, lowest_mask, ret, &cpu);
1932 if (cpu >= 0)
1933 return cpu;
1934
1935 if (!ret)
1936 return -1; /* No targets found */
1937
1938 cpu = task_cpu(sched_ctx);
1939
1940 /*
1941 * At this point we have built a mask of CPUs representing the
1942 * lowest priority tasks in the system. Now we want to elect
1943 * the best one based on our affinity and topology.
1944 *
1945 * We prioritize the last CPU that the task executed on since
1946 * it is most likely cache-hot in that location.
1947 */
1948 if (cpumask_test_cpu(cpu, lowest_mask))
1949 return cpu;
1950
1951 /*
1952 * Otherwise, we consult the sched_domains span maps to figure
1953 * out which CPU is logically closest to our hot cache data.
1954 */
1955 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1956 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1957
1958 rcu_read_lock();
1959 for_each_domain(cpu, sd) {
1960 if (sd->flags & SD_WAKE_AFFINE) {
1961 int best_cpu;
1962
1963 /*
1964 * "this_cpu" is cheaper to preempt than a
1965 * remote processor.
1966 */
1967 if (this_cpu != -1 &&
1968 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1969 rcu_read_unlock();
1970 return this_cpu;
1971 }
1972
1973 best_cpu = cpumask_any_and_distribute(lowest_mask,
1974 sched_domain_span(sd));
1975 if (best_cpu < nr_cpu_ids) {
1976 rcu_read_unlock();
1977 return best_cpu;
1978 }
1979 }
1980 }
1981 rcu_read_unlock();
1982
1983 /*
1984 * And finally, if there were no matches within the domains
1985 * just give the caller *something* to work with from the compatible
1986 * locations.
1987 */
1988 if (this_cpu != -1)
1989 return this_cpu;
1990
1991 cpu = cpumask_any_distribute(lowest_mask);
1992 if (cpu < nr_cpu_ids)
1993 return cpu;
1994
1995 return -1;
1996 }
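/*
 * The selection order above, restated: prefer the task's last CPU if it is in
 * lowest_mask (likely cache-hot), then this_cpu when it shares a wake-affine
 * domain with that CPU, then any compatible CPU. A stand-alone sketch of that
 * policy under simplified assumptions; pick_target_cpu() and share_domain()
 * are hypothetical helpers and the mask is modeled as a plain bitmask:
 *
 *	#include <stdbool.h>
 *
 *	static int pick_target_cpu(unsigned long lowest_mask, int task_cpu,
 *				   int this_cpu,
 *				   bool (*share_domain)(int a, int b))
 *	{
 *		int cpu;
 *
 *		if (!lowest_mask)
 *			return -1;			// no targets at all
 *		if (lowest_mask & (1UL << task_cpu))
 *			return task_cpu;		// cache-hot last CPU
 *		if ((lowest_mask & (1UL << this_cpu)) &&
 *		    share_domain(task_cpu, this_cpu))
 *			return this_cpu;		// cheap local preemption
 *		for (cpu = 0; cpu < (int)(8 * sizeof(lowest_mask)); cpu++)
 *			if (lowest_mask & (1UL << cpu))
 *				return cpu;		// anything compatible
 *		return -1;
 *	}
 */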
1997
1998 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1999 {
2000 struct plist_head *head = &rq->rt.pushable_tasks;
2001 struct task_struct *p, *push_task = NULL;
2002
2003 if (!has_pushable_tasks(rq))
2004 return NULL;
2005
2006 plist_for_each_entry(p, head, pushable_tasks) {
2007 if (task_is_pushable(rq, p, 0)) {
2008 push_task = p;
2009 break;
2010 }
2011 }
2012
2013 if (!push_task)
2014 return NULL;
2015
2016 BUG_ON(rq->cpu != task_cpu(push_task));
2017 BUG_ON(task_current(rq, push_task));
2018 BUG_ON(task_current_donor(rq, push_task));
2019
2020 BUG_ON(!task_on_rq_queued(push_task));
2021 BUG_ON(!rt_task(push_task));
2022
2023 return push_task;
2024 }
2025
2026 static inline bool __rt_revalidate_rq_state(struct task_struct *task, struct rq *rq,
2027 struct rq *lowest)
2028 {
2029 if (!rt_task(task))
2030 return false;
2031 return __revalidate_rq_state(task, rq, lowest);
2032 }
2033
2034 static inline bool rt_revalidate_rq_state(struct task_struct *task, struct rq *rq,
2035 struct rq *lowest, bool *retry)
2036 {
2037 if (!sched_proxy_exec())
2038 return __rt_revalidate_rq_state(task, rq, lowest);
2039 /*
2040 * Releasing the rq lock means we need to re-check pushability.
2041 * Some scenarios:
2042 * 1) If a migration from another CPU sent a task/chain to rq
2043 * that made task newly unpushable by completing a chain
2044 * from task to rq->curr, then we need to bail out and push something
2045 * else.
2046 * 2) If our chain led off this CPU or to a dequeued task, the last waiter
2047 * on this CPU might have acquired the lock and woken (or even migrated
2048 * & run, handed off the lock it held, etc...). This can invalidate the
2049 * result of find_lowest_rq() if our chain previously ended in a blocked
2050 * task whose affinity we could ignore, but now ends in an unblocked
2051 * task that can't run on lowest_rq.
2052 * 3) Race described at https://lore.kernel.org/all/1523536384-26781-2-git-send-email-huawei.libin@huawei.com/
2053 *
2054 * Notes on these:
2055 * - Scenario #2 is properly handled by rerunning find_lowest_rq
2056 * - Scenario #1 requires that we fail
2057 * - Scenario #3 can AFAICT only occur when rq is not this_rq(). And the
2058 * suggested fix is not universally correct now that push_cpu_stop() can
2059 * call this function.
2060 */
2061 if (!rt_task(task) || is_migration_disabled(task)) {
2062 return false;
2063 } else if (rq != this_rq()) {
2064 /*
2065 * If we are dealing with a remote rq, then all bets are off
2066 * because task might have run & then been dequeued since we
2067 * released the lock, at which point our normal checks can race
2068 * with migration, as described in
2069 * https://lore.kernel.org/all/1523536384-26781-2-git-send-email-huawei.libin@huawei.com/
2070 * Need to repick to ensure we avoid a race.
2071 * But re-picking would be unnecessary & incorrect in the
2072 * push_cpu_stop() path.
2073 */
2074 struct task_struct *next_task = pick_next_pushable_task(rq);
2075 struct task_struct *exec_ctx;
2076
2077 if (next_task != task)
2078 return false;
2079
2080 exec_ctx = find_exec_ctx(rq, next_task);
2081 *retry = (exec_ctx &&
2082 !cpumask_test_cpu(lowest->cpu,
2083 &exec_ctx->cpus_mask));
2084 } else {
2085 /*
2086 * Chain level balancing introduces new ways for our choice of
2087 * task & rq to become invalid when we release the rq lock, e.g.:
2088 * 1) Migration to rq from another CPU makes task newly unpushable
2089 * by completing a "blocked chain" from task to rq->curr.
2090 * Fail so a different task can be chosen for push.
2091 * 2) In cases where task's blocked chain led to a dequeued task
2092 * or one on another rq, the last waiter in the chain on this
2093 * rq might have acquired the lock and woken, meaning we must
2094 * pick a different rq if its affinity prevents running on
2095 * lowest rq.
2096 */
2097 int pushable = task_is_pushable(rq, task, lowest->cpu);
2098
2099 *retry = pushable == -1;
2100 if (!pushable)
2101 return false;
2102 }
2103
2104 return true;
2105 }
2106
2107 /* Will lock the rq it finds */
2108 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
2109 {
2110 struct task_struct *exec_ctx;
2111 struct rq *lowest_rq = NULL;
2112 bool retry;
2113 int tries;
2114 int cpu;
2115
2116 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
2117 retry = false;
2118 exec_ctx = find_exec_ctx(rq, task);
2119 cpu = find_lowest_rq(task, exec_ctx);
2120
2121 if ((cpu == -1) || (cpu == rq->cpu))
2122 break;
2123
2124 lowest_rq = cpu_rq(cpu);
2125
2126 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
2127 /*
2128 * Target rq has tasks of equal or higher priority,
2129 * retrying does not release any lock and is unlikely
2130 * to yield a different result.
2131 */
2132 lowest_rq = NULL;
2133 break;
2134 }
2135
2136 /* if the prio of this runqueue changed, try again */
2137 if (double_lock_balance(rq, lowest_rq)) {
2138 if (unlikely(!rt_revalidate_rq_state(task, rq, lowest_rq, &retry))) {
2139 double_unlock_balance(rq, lowest_rq);
2140 lowest_rq = NULL;
2141 break;
2142 }
2143 }
2144
2145 /* If this rq is still suitable, use it. */
2146 if (lowest_rq->rt.highest_prio.curr > task->prio && !retry)
2147 break;
2148
2149 /* try again */
2150 double_unlock_balance(rq, lowest_rq);
2151 lowest_rq = NULL;
2152 }
2153
2154 return lowest_rq;
2155 }
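/*
 * double_lock_balance() has to take a second rq lock while one is already
 * held, which can mean briefly dropping the held lock; that is why
 * rt_revalidate_rq_state() must re-check everything afterwards. A tiny
 * pthread illustration of the underlying ordered double-locking idea (a
 * sketch only, using pointer order as a pragmatic lock order; this is not
 * the kernel's implementation):
 *
 *	#include <pthread.h>
 *
 *	// Take two locks in a stable global order so that two threads locking
 *	// the same pair in opposite order cannot ABBA-deadlock.
 *	static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
 *	{
 *		if (a == b) {
 *			pthread_mutex_lock(a);
 *		} else if (a < b) {
 *			pthread_mutex_lock(a);
 *			pthread_mutex_lock(b);
 *		} else {
 *			pthread_mutex_lock(b);
 *			pthread_mutex_lock(a);
 *		}
 *	}
 *
 *	static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
 *	{
 *		pthread_mutex_unlock(a);
 *		if (a != b)
 *			pthread_mutex_unlock(b);
 *	}
 */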
2156
2157 /*
2158 * If the current CPU has more than one RT task, see if the non-
2159 * running task can migrate over to a CPU that is running a task
2160 * of lesser priority.
2161 */
2162 static int push_rt_task(struct rq *rq, bool pull)
2163 {
2164 struct task_struct *next_task;
2165 struct rq *lowest_rq;
2166 int ret = 0;
2167
2168 if (!rq->rt.overloaded)
2169 return 0;
2170
2171 next_task = pick_next_pushable_task(rq);
2172 if (!next_task)
2173 return 0;
2174
2175 retry:
2176 /*
2177 * It's possible that next_task slipped in with a
2178 * higher priority than current. If that's the case
2179 * just reschedule current.
2180 */
2181 if (unlikely(next_task->prio < rq->donor->prio)) {
2182 resched_curr(rq);
2183 return 0;
2184 }
2185
2186 if (is_migration_disabled(next_task)) {
2187 struct task_struct *push_task = NULL;
2188 int cpu;
2189
2190 if (!pull || rq->push_busy)
2191 return 0;
2192
2193 /*
2194 * Invoking find_lowest_rq() on anything but an RT task doesn't
2195 * make sense. Per the above priority check, curr has to
2196 * be of higher priority than next_task, so no need to
2197 * reschedule when bailing out.
2198 *
2199 * Note that the stoppers are masqueraded as SCHED_FIFO
2200 * (cf. sched_set_stop_task()), so we can't rely on rt_task().
2201 */
2202 if (rq->donor->sched_class != &rt_sched_class)
2203 return 0;
2204
2205 cpu = find_lowest_rq(rq->donor, rq->curr);
2206 if (cpu == -1 || cpu == rq->cpu)
2207 return 0;
2208
2209 /*
2210 * We found a CPU with lower priority than @next_task, so that
2211 * CPU should be running. However, since @next_task cannot be
2212 * migrated to it, attempt instead to push the currently
2213 * running task on this CPU away.
2214 */
2215 push_task = get_push_task(rq);
2216 if (push_task) {
2217 preempt_disable();
2218 raw_spin_rq_unlock(rq);
2219 stop_one_cpu_nowait(rq->cpu, push_cpu_stop,
2220 push_task, &rq->push_work);
2221 preempt_enable();
2222 raw_spin_rq_lock(rq);
2223 }
2224
2225 return 0;
2226 }
2227
2228 if (WARN_ON(next_task == rq->curr))
2229 return 0;
2230
2231 /* We might release rq lock */
2232 get_task_struct(next_task);
2233
2234 /* find_lock_lowest_rq locks the rq if found */
2235 lowest_rq = find_lock_lowest_rq(next_task, rq);
2236 if (!lowest_rq) {
2237 struct task_struct *task;
2238 /*
2239 * find_lock_lowest_rq releases rq->lock
2240 * so it is possible that next_task has migrated.
2241 *
2242 * We need to make sure that the task is still on the same
2243 * run-queue and is also still the next task eligible for
2244 * pushing.
2245 */
2246 task = pick_next_pushable_task(rq);
2247 if (task == next_task) {
2248 /*
2249 * The task hasn't migrated, and is still the next
2250 * eligible task, but we failed to find a run-queue
2251 * to push it to. Do not retry in this case, since
2252 * other CPUs will pull from us when ready.
2253 */
2254 goto out;
2255 }
2256
2257 if (!task)
2258 /* No more tasks, just exit */
2259 goto out;
2260
2261 /*
2262 * Something has shifted, try again.
2263 */
2264 put_task_struct(next_task);
2265 next_task = task;
2266 goto retry;
2267 }
2268
2269 move_queued_task_locked(rq, lowest_rq, next_task);
2270 resched_curr(lowest_rq);
2271 ret = 1;
2272
2273 double_unlock_balance(rq, lowest_rq);
2274 out:
2275 put_task_struct(next_task);
2276
2277 return ret;
2278 }
2279
2280 static void push_rt_tasks(struct rq *rq)
2281 {
2282 /* push_rt_task will return true if it moved an RT task */
2283 while (push_rt_task(rq, false))
2284 ;
2285 }
2286
2287 #ifdef HAVE_RT_PUSH_IPI
2288
2289 /*
2290 * When a high priority task schedules out from a CPU and a lower priority
2291 * task is scheduled in, a check is made to see if there's any RT tasks
2292 * on other CPUs that are waiting to run because a higher priority RT task
2293 * is currently running on its CPU. In this case, the CPU with multiple RT
2294 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2295 * up that may be able to run one of its non-running queued RT tasks.
2296 *
2297 * All CPUs with overloaded RT tasks need to be notified, as there is currently
2298 * no way to know which of these CPUs has the highest priority task waiting
2299 * to run. Instead of trying to take a spinlock on each of these CPUs,
2300 * which has been shown to cause large latency on machines with many
2301 * CPUs, an IPI is sent to the CPUs to have them push off their overloaded
2302 * RT tasks that are waiting to run.
2303 *
2304 * Just sending an IPI to each of the CPUs is also an issue, as on large
2305 * count CPU machines, this can cause an IPI storm on a CPU, especially
2306 * if it's the only CPU with multiple RT tasks queued, and a large number
2307 * of CPUs are scheduling a lower priority task at the same time.
2308 *
2309 * Each root domain has its own IRQ work function that can iterate over
2310 * all CPUs with overloaded RT tasks. Since every CPU with overloaded RT
2311 * tasks must be checked whenever one or more CPUs lower their priority,
2312 * a single IRQ work iterator is used that will try to
2313 * push off the RT tasks that are waiting to run.
2314 *
2315 * When a CPU schedules a lower priority task, it will kick off the
2316 * IRQ work iterator that will jump to each CPU with overloaded RT tasks.
2317 * As it only takes the first CPU that schedules a lower priority task
2318 * to start the process, the rto_start variable is incremented and if
2319 * the atomic result is one, then that CPU will try to take the rto_lock.
2320 * This prevents high contention on the lock as the process handles all
2321 * CPUs scheduling lower priority tasks.
2322 *
2323 * All CPUs that are scheduling a lower priority task will increment the
2324 * rto_loop_next variable. This makes sure that the IRQ work iterator
2325 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2326 * priority task, even if the iterator is in the middle of a scan. Incrementing
2327 * rto_loop_next will cause the iterator to perform another scan.
2328 *
2329 */
2330 static int rto_next_cpu(struct root_domain *rd)
2331 {
2332 int next;
2333 int cpu;
2334
2335 /*
2336 * When starting the IPI RT pushing, the rto_cpu is set to -1,
2337 * so rto_next_cpu() will simply return the first CPU found in
2338 * the rto_mask.
2339 *
2340 * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
2341 * will return the next CPU found in the rto_mask.
2342 *
2343 * If there are no more CPUs left in the rto_mask, then a check is made
2344 * against rto_loop and rto_loop_next. rto_loop is only updated with
2345 * the rto_lock held, but any CPU may increment the rto_loop_next
2346 * without any locking.
2347 */
2348 for (;;) {
2349
2350 /* When rto_cpu is -1 this acts like cpumask_first() */
2351 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2352
2353 /* cpu will be some CPU in rd->rto_mask; it may be a halted CPU, so allow the hook to update it */
2354 trace_android_rvh_rto_next_cpu(rd->rto_cpu, rd->rto_mask, &cpu);
2355
2356 rd->rto_cpu = cpu;
2357
2358 if (cpu < nr_cpu_ids)
2359 return cpu;
2360
2361 rd->rto_cpu = -1;
2362
2363 /*
2364 * ACQUIRE ensures we see the @rto_mask changes
2365 * made prior to the @next value observed.
2366 *
2367 * Matches WMB in rt_set_overload().
2368 */
2369 next = atomic_read_acquire(&rd->rto_loop_next);
2370
2371 if (rd->rto_loop == next)
2372 break;
2373
2374 rd->rto_loop = next;
2375 }
2376
2377 return -1;
2378 }
2379
2380 static inline bool rto_start_trylock(atomic_t *v)
2381 {
2382 return !atomic_cmpxchg_acquire(v, 0, 1);
2383 }
2384
2385 static inline void rto_start_unlock(atomic_t *v)
2386 {
2387 atomic_set_release(v, 0);
2388 }
2389
2390 static void tell_cpu_to_push(struct rq *rq)
2391 {
2392 int cpu = -1;
2393
2394 /* Keep the loop going if the IPI is currently active */
2395 atomic_inc(&rq->rd->rto_loop_next);
2396
2397 /* Only one CPU can initiate a loop at a time */
2398 if (!rto_start_trylock(&rq->rd->rto_loop_start))
2399 return;
2400
2401 raw_spin_lock(&rq->rd->rto_lock);
2402
2403 /*
2404 * The rto_cpu is updated under the lock; if it holds a valid CPU
2405 * then the IPI is still running and will continue due to the
2406 * update to loop_next, and nothing needs to be done here.
2407 * Otherwise it is finishing up and an IPI needs to be sent.
2408 */
2409 if (rq->rd->rto_cpu < 0)
2410 cpu = rto_next_cpu(rq->rd);
2411
2412 raw_spin_unlock(&rq->rd->rto_lock);
2413
2414 rto_start_unlock(&rq->rd->rto_loop_start);
2415
2416 if (cpu >= 0) {
2417 /* Make sure the rd does not get freed while pushing */
2418 sched_get_rd(rq->rd);
2419 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2420 }
2421 }
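/*
 * A user-space model of the handshake above: rto_loop_start acts as a
 * try-lock so only one CPU kicks off the IPI chain, while rto_loop_next is a
 * version counter that makes an already-running chain perform another full
 * pass. The names and structure below are illustrative only and use C11
 * atomics rather than the kernel's primitives:
 *
 *	#include <stdatomic.h>
 *	#include <stdbool.h>
 *
 *	struct rto_model {
 *		atomic_int loop_start;	// 0 = idle, 1 = a starter is active
 *		atomic_int loop_next;	// bumped for every requested pass
 *	};
 *
 *	static bool try_start(struct rto_model *m)
 *	{
 *		int zero = 0;
 *		// acquire: the starter must observe prior overload updates
 *		return atomic_compare_exchange_strong_explicit(&m->loop_start,
 *				&zero, 1, memory_order_acquire,
 *				memory_order_relaxed);
 *	}
 *
 *	static void request_push_pass(struct rto_model *m)
 *	{
 *		// keep an already-running chain going even if we lose the race
 *		atomic_fetch_add_explicit(&m->loop_next, 1, memory_order_relaxed);
 *		if (!try_start(m))
 *			return;
 *		// ... pick the first overloaded CPU and queue the irq_work ...
 *		atomic_store_explicit(&m->loop_start, 0, memory_order_release);
 *	}
 */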
2422
2423 /* Called from hardirq context */
2424 void rto_push_irq_work_func(struct irq_work *work)
2425 {
2426 struct root_domain *rd =
2427 container_of(work, struct root_domain, rto_push_work);
2428 struct rq *rq;
2429 int cpu;
2430
2431 rq = this_rq();
2432
2433 /*
2434 * We do not need to grab the lock to check for has_pushable_tasks.
2435 * When it gets updated, a check is made if a push is possible.
2436 */
2437 if (has_pushable_tasks(rq)) {
2438 raw_spin_rq_lock(rq);
2439 while (push_rt_task(rq, true))
2440 ;
2441 raw_spin_rq_unlock(rq);
2442 }
2443
2444 raw_spin_lock(&rd->rto_lock);
2445
2446 /* Pass the IPI to the next rt overloaded queue */
2447 cpu = rto_next_cpu(rd);
2448
2449 raw_spin_unlock(&rd->rto_lock);
2450
2451 if (cpu < 0) {
2452 sched_put_rd(rd);
2453 return;
2454 }
2455
2456 /* Try the next RT overloaded CPU */
2457 irq_work_queue_on(&rd->rto_push_work, cpu);
2458 }
2459 #endif /* HAVE_RT_PUSH_IPI */
2460
2461 static void pull_rt_task(struct rq *this_rq)
2462 {
2463 int this_cpu = this_rq->cpu, cpu;
2464 bool resched = false;
2465 struct task_struct *p, *push_task;
2466 struct rq *src_rq;
2467 int rt_overload_count = rt_overloaded(this_rq);
2468
2469 if (likely(!rt_overload_count))
2470 return;
2471
2472 /*
2473 * Match the barrier from rt_set_overload(); this guarantees that if we
2474 * see overloaded we must also see the rto_mask bit.
2475 */
2476 smp_rmb();
2477
2478 /* If we are the only overloaded CPU do nothing */
2479 if (rt_overload_count == 1 &&
2480 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2481 return;
2482
2483 #ifdef HAVE_RT_PUSH_IPI
2484 if (sched_feat(RT_PUSH_IPI)) {
2485 tell_cpu_to_push(this_rq);
2486 return;
2487 }
2488 #endif
2489
2490 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2491 if (this_cpu == cpu)
2492 continue;
2493
2494 src_rq = cpu_rq(cpu);
2495
2496 /*
2497 * Don't bother taking the src_rq->lock if the next highest
2498 * task is known to be lower-priority than our current task.
2499 * This may look racy, but if this value is about to go
2500 * logically higher, the src_rq will push this task away.
2501 * And if it's going logically lower, we do not care.
2502 */
2503 if (src_rq->rt.highest_prio.next >=
2504 this_rq->rt.highest_prio.curr)
2505 continue;
2506
2507 /*
2508 * We can potentially drop this_rq's lock in
2509 * double_lock_balance, and another CPU could
2510 * alter this_rq
2511 */
2512 push_task = NULL;
2513 double_lock_balance(this_rq, src_rq);
2514
2515 /*
2516 * We can only pull a task that is pushable
2517 * on its rq, and no others.
2518 */
2519 p = pick_highest_pushable_task(src_rq, this_cpu);
2520
2521 /*
2522 * Do we have an RT task that preempts
2523 * the to-be-scheduled task?
2524 */
2525 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2526 WARN_ON(p == src_rq->curr);
2527 WARN_ON(!task_on_rq_queued(p));
2528
2529 /*
2530 * There's a chance that p is higher in priority
2531 * than what's currently running on its CPU.
2532 * This can happen when p is waking up and hasn't
2533 * had a chance to schedule yet. We only pull
2534 * p if it is lower in priority than the
2535 * current task on the run queue.
2536 */
2537 if (p->prio < src_rq->donor->prio)
2538 goto skip;
2539
2540 if (is_migration_disabled(p)) {
2541 push_task = get_push_task(src_rq);
2542 } else {
2543 move_queued_task_locked(src_rq, this_rq, p);
2544 resched = true;
2545 }
2546 /*
2547 * We continue with the search, just in
2548 * case there's an even higher prio task
2549 * in another runqueue. (low likelihood
2550 * but possible)
2551 */
2552 }
2553 skip:
2554 double_unlock_balance(this_rq, src_rq);
2555
2556 if (push_task) {
2557 preempt_disable();
2558 raw_spin_rq_unlock(this_rq);
2559 stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop,
2560 push_task, &src_rq->push_work);
2561 preempt_enable();
2562 raw_spin_rq_lock(this_rq);
2563 }
2564 }
2565
2566 if (resched)
2567 resched_curr(this_rq);
2568 }
2569
2570 /*
2571 * If we are not running and we are not going to reschedule soon, we should
2572 * try to push tasks away now
2573 */
2574 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2575 {
2576 bool need_to_push = !task_on_cpu(rq, p) &&
2577 !test_tsk_need_resched(rq->curr) &&
2578 p->nr_cpus_allowed > 1 &&
2579 (dl_task(rq->donor) || rt_task(rq->donor)) &&
2580 (rq->curr->nr_cpus_allowed < 2 ||
2581 rq->donor->prio <= p->prio);
2582
2583 if (need_to_push)
2584 push_rt_tasks(rq);
2585 }
2586
2587 /* Assumes rq->lock is held */
2588 static void rq_online_rt(struct rq *rq)
2589 {
2590 if (rq->rt.overloaded)
2591 rt_set_overload(rq);
2592
2593 __enable_runtime(rq);
2594
2595 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2596 }
2597
2598 /* Assumes rq->lock is held */
2599 static void rq_offline_rt(struct rq *rq)
2600 {
2601 if (rq->rt.overloaded)
2602 rt_clear_overload(rq);
2603
2604 __disable_runtime(rq);
2605
2606 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2607 }
2608
2609 /*
2610 * When switching away from the RT queue, we bring ourselves to a position
2611 * where we might want to pull RT tasks from other runqueues.
2612 */
2613 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2614 {
2615 /*
2616 * If there are other RT tasks then we will reschedule
2617 * and the scheduling of the other RT tasks will handle
2618 * the balancing. But if we are the last RT task
2619 * we may need to handle the pulling of RT tasks
2620 * now.
2621 */
2622 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2623 return;
2624
2625 rt_queue_pull_task(rq);
2626 }
2627
2628 void __init init_sched_rt_class(void)
2629 {
2630 unsigned int i;
2631
2632 for_each_possible_cpu(i) {
2633 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2634 GFP_KERNEL, cpu_to_node(i));
2635 }
2636 }
2637 #endif /* CONFIG_SMP */
2638
2639 /*
2640 * When switching a task to RT, we may overload the runqueue
2641 * with RT tasks. In this case we try to push them off to
2642 * other runqueues.
2643 */
2644 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2645 {
2646 /*
2647 * If we are running, update the avg_rt tracking, as the running time
2648 * will from now on be accounted to it.
2649 */
2650 if (task_current(rq, p)) {
2651 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
2652 return;
2653 }
2654
2655 /*
2656 * If we are not running we may need to preempt the current
2657 * running task. If that current running task is also an RT task
2658 * then see if we can move to another run queue.
2659 */
2660 if (task_on_rq_queued(p)) {
2661 #ifdef CONFIG_SMP
2662 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2663 rt_queue_push_tasks(rq);
2664 #endif /* CONFIG_SMP */
2665 if (p->prio < rq->donor->prio && cpu_online(cpu_of(rq)))
2666 resched_curr(rq);
2667 }
2668 }
2669
2670 /*
2671 * Priority of the task has changed. This may cause
2672 * us to initiate a push or pull.
2673 */
2674 static void
2675 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2676 {
2677 if (!task_on_rq_queued(p))
2678 return;
2679
2680 if (task_current_donor(rq, p)) {
2681 #ifdef CONFIG_SMP
2682 /*
2683 * If our priority decreases while running, we
2684 * may need to pull tasks to this runqueue.
2685 */
2686 if (oldprio < p->prio)
2687 rt_queue_pull_task(rq);
2688
2689 /*
2690 * If there's a higher priority task waiting to run
2691 * then reschedule.
2692 */
2693 if (p->prio > rq->rt.highest_prio.curr)
2694 resched_curr(rq);
2695 #else
2696 /* For UP simply resched on drop of prio */
2697 if (oldprio < p->prio)
2698 resched_curr(rq);
2699 #endif /* CONFIG_SMP */
2700 } else {
2701 /*
2702 * This task is not running, but if its
2703 * priority is higher than the currently
2704 * running task's, then reschedule.
2705 */
2706 if (p->prio < rq->donor->prio)
2707 resched_curr(rq);
2708 }
2709 }
2710
2711 #ifdef CONFIG_POSIX_TIMERS
2712 static void watchdog(struct rq *rq, struct task_struct *p)
2713 {
2714 unsigned long soft, hard;
2715
2716 /* max may change after cur was read, this will be fixed next tick */
2717 soft = task_rlimit(p, RLIMIT_RTTIME);
2718 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2719
2720 if (soft != RLIM_INFINITY) {
2721 unsigned long next;
2722
2723 if (p->rt.watchdog_stamp != jiffies) {
2724 p->rt.timeout++;
2725 p->rt.watchdog_stamp = jiffies;
2726 }
2727
2728 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2729 if (p->rt.timeout > next) {
2730 posix_cputimers_rt_watchdog(&p->posix_cputimers,
2731 p->se.sum_exec_runtime);
2732 }
2733 }
2734 }
2735 #else
2736 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2737 #endif
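/*
 * The watchdog above enforces RLIMIT_RTTIME: once a SCHED_FIFO/SCHED_RR task
 * has consumed that many microseconds of CPU time without blocking, the POSIX
 * CPU-timer code delivers SIGXCPU, and SIGKILL once the hard limit is hit.
 * A user-space sketch of setting the limit (the values are arbitrary
 * examples, not recommendations):
 *
 *	#include <stdio.h>
 *	#include <sys/resource.h>
 *
 *	static int limit_rt_cpu_burst(void)
 *	{
 *		struct rlimit rl = {
 *			.rlim_cur = 200000,	// soft: SIGXCPU after ~200ms of RT CPU
 *			.rlim_max = 500000,	// hard: SIGKILL after ~500ms
 *		};
 *
 *		if (setrlimit(RLIMIT_RTTIME, &rl)) {
 *			perror("setrlimit(RLIMIT_RTTIME)");
 *			return -1;
 *		}
 *		return 0;
 *	}
 */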
2738
2739 /*
2740 * scheduler tick hitting a task of our scheduling class.
2741 *
2742 * NOTE: This function can be called remotely by the tick offload that
2743 * goes along full dynticks. Therefore no local assumption can be made
2744 * and everything must be accessed through the @rq and @curr passed in
2745 * parameters.
2746 */
2747 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2748 {
2749 struct sched_rt_entity *rt_se = &p->rt;
2750
2751 update_curr_rt(rq);
2752 update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 1);
2753 trace_android_rvh_update_rt_rq_load_avg(rq_clock_pelt(rq), rq, p, 1);
2754
2755 watchdog(rq, p);
2756
2757 /*
2758 * RR tasks need a special form of time-slice management.
2759 * FIFO tasks have no timeslices.
2760 */
2761 if (p->policy != SCHED_RR)
2762 return;
2763
2764 if (--p->rt.time_slice)
2765 return;
2766
2767 p->rt.time_slice = sched_rr_timeslice;
2768
2769 /*
2770 * Requeue to the end of queue if we (and all of our ancestors) are not
2771 * the only element on the queue
2772 */
2773 for_each_sched_rt_entity(rt_se) {
2774 if (rt_se->run_list.prev != rt_se->run_list.next) {
2775 requeue_task_rt(rq, p, 0);
2776 resched_curr(rq);
2777 return;
2778 }
2779 }
2780 }
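/*
 * Only SCHED_RR tasks are round-robined by the tick above; SCHED_FIFO tasks
 * run until they block, yield or are preempted. The effective RR quantum can
 * be read back from user space with sched_rr_get_interval(), for example:
 *
 *	#define _GNU_SOURCE
 *	#include <sched.h>
 *	#include <stdio.h>
 *	#include <time.h>
 *
 *	int main(void)
 *	{
 *		struct sched_param sp = { .sched_priority = 10 };
 *		struct timespec ts;
 *
 *		if (sched_setscheduler(0, SCHED_RR, &sp)) {	// needs CAP_SYS_NICE
 *			perror("sched_setscheduler");
 *			return 1;
 *		}
 *		if (sched_rr_get_interval(0, &ts)) {
 *			perror("sched_rr_get_interval");
 *			return 1;
 *		}
 *		printf("RR timeslice: %ld.%09ld s\n",
 *		       (long)ts.tv_sec, ts.tv_nsec);
 *		return 0;
 *	}
 */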
2781
2782 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2783 {
2784 /*
2785 * Time slice is 0 for SCHED_FIFO tasks
2786 */
2787 if (task->policy == SCHED_RR)
2788 return sched_rr_timeslice;
2789 else
2790 return 0;
2791 }
2792
2793 #ifdef CONFIG_SCHED_CORE
2794 static int task_is_throttled_rt(struct task_struct *p, int cpu)
2795 {
2796 struct rt_rq *rt_rq;
2797
2798 #ifdef CONFIG_RT_GROUP_SCHED
2799 rt_rq = task_group(p)->rt_rq[cpu];
2800 #else
2801 rt_rq = &cpu_rq(cpu)->rt;
2802 #endif
2803
2804 return rt_rq_throttled(rt_rq);
2805 }
2806 #endif
2807
2808 DEFINE_SCHED_CLASS(rt) = {
2809
2810 .enqueue_task = enqueue_task_rt,
2811 .dequeue_task = dequeue_task_rt,
2812 .yield_task = yield_task_rt,
2813
2814 .wakeup_preempt = wakeup_preempt_rt,
2815
2816 .pick_task = pick_task_rt,
2817 .put_prev_task = put_prev_task_rt,
2818 .set_next_task = set_next_task_rt,
2819
2820 #ifdef CONFIG_SMP
2821 .balance = balance_rt,
2822 .select_task_rq = select_task_rq_rt,
2823 .set_cpus_allowed = set_cpus_allowed_common,
2824 .rq_online = rq_online_rt,
2825 .rq_offline = rq_offline_rt,
2826 .task_woken = task_woken_rt,
2827 .switched_from = switched_from_rt,
2828 .find_lock_rq = find_lock_lowest_rq,
2829 #endif
2830
2831 .task_tick = task_tick_rt,
2832
2833 .get_rr_interval = get_rr_interval_rt,
2834
2835 .prio_changed = prio_changed_rt,
2836 .switched_to = switched_to_rt,
2837
2838 .update_curr = update_curr_rt,
2839
2840 #ifdef CONFIG_SCHED_CORE
2841 .task_is_throttled = task_is_throttled_rt,
2842 #endif
2843
2844 #ifdef CONFIG_UCLAMP_TASK
2845 .uclamp_enabled = 1,
2846 #endif
2847 };
2848
2849 #ifdef CONFIG_RT_GROUP_SCHED
2850 /*
2851 * Ensure that the real time constraints are schedulable.
2852 */
2853 static DEFINE_MUTEX(rt_constraints_mutex);
2854
2855 static inline int tg_has_rt_tasks(struct task_group *tg)
2856 {
2857 struct task_struct *task;
2858 struct css_task_iter it;
2859 int ret = 0;
2860
2861 /*
2862 * Autogroups do not have RT tasks; see autogroup_create().
2863 */
2864 if (task_group_is_autogroup(tg))
2865 return 0;
2866
2867 css_task_iter_start(&tg->css, 0, &it);
2868 while (!ret && (task = css_task_iter_next(&it)))
2869 ret |= rt_task(task);
2870 css_task_iter_end(&it);
2871
2872 return ret;
2873 }
2874
2875 struct rt_schedulable_data {
2876 struct task_group *tg;
2877 u64 rt_period;
2878 u64 rt_runtime;
2879 };
2880
2881 static int tg_rt_schedulable(struct task_group *tg, void *data)
2882 {
2883 struct rt_schedulable_data *d = data;
2884 struct task_group *child;
2885 unsigned long total, sum = 0;
2886 u64 period, runtime;
2887
2888 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2889 runtime = tg->rt_bandwidth.rt_runtime;
2890
2891 if (tg == d->tg) {
2892 period = d->rt_period;
2893 runtime = d->rt_runtime;
2894 }
2895
2896 /*
2897 * Cannot have more runtime than the period.
2898 */
2899 if (runtime > period && runtime != RUNTIME_INF)
2900 return -EINVAL;
2901
2902 /*
2903 * Ensure we don't starve existing RT tasks if runtime turns zero.
2904 */
2905 if (rt_bandwidth_enabled() && !runtime &&
2906 tg->rt_bandwidth.rt_runtime && tg_has_rt_tasks(tg))
2907 return -EBUSY;
2908
2909 total = to_ratio(period, runtime);
2910
2911 /*
2912 * Nobody can have more than the global setting allows.
2913 */
2914 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2915 return -EINVAL;
2916
2917 /*
2918 * The sum of our children's runtime should not exceed our own.
2919 */
2920 list_for_each_entry_rcu(child, &tg->children, siblings) {
2921 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2922 runtime = child->rt_bandwidth.rt_runtime;
2923
2924 if (child == d->tg) {
2925 period = d->rt_period;
2926 runtime = d->rt_runtime;
2927 }
2928
2929 sum += to_ratio(period, runtime);
2930 }
2931
2932 if (sum > total)
2933 return -EINVAL;
2934
2935 return 0;
2936 }
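/*
 * The checks above compare runtime/period ratios in fixed point: to_ratio()
 * is runtime << BW_SHIFT divided by period, so budgets with different periods
 * can be summed and compared without dividing by each period separately. A
 * self-contained model of that arithmetic; ratio() is a hypothetical stand-in
 * and BW_SHIFT is assumed to be 20, consistent with the max_rt_runtime
 * comment at the top of this file:
 *
 *	#include <stdint.h>
 *	#include <stdio.h>
 *
 *	#define EXAMPLE_BW_SHIFT 20
 *
 *	static uint64_t ratio(uint64_t period_ns, uint64_t runtime_ns)
 *	{
 *		return period_ns ? (runtime_ns << EXAMPLE_BW_SHIFT) / period_ns : 0;
 *	}
 *
 *	int main(void)
 *	{
 *		// global default: 950ms of runtime out of a 1s period
 *		uint64_t global = ratio(1000000000ULL, 950000000ULL);
 *		// a child group asking for 400ms out of a 500ms period
 *		uint64_t child  = ratio(500000000ULL, 400000000ULL);
 *
 *		printf("global=%llu child=%llu -> %s\n",
 *		       (unsigned long long)global, (unsigned long long)child,
 *		       child <= global ? "schedulable" : "rejected");
 *		return 0;
 *	}
 */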
2937
2938 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2939 {
2940 int ret;
2941
2942 struct rt_schedulable_data data = {
2943 .tg = tg,
2944 .rt_period = period,
2945 .rt_runtime = runtime,
2946 };
2947
2948 rcu_read_lock();
2949 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2950 rcu_read_unlock();
2951
2952 return ret;
2953 }
2954
2955 static int tg_set_rt_bandwidth(struct task_group *tg,
2956 u64 rt_period, u64 rt_runtime)
2957 {
2958 int i, err = 0;
2959
2960 /*
2961 * Disallowing the root group RT runtime is BAD; it would prevent the
2962 * kernel from creating (and/or operating) RT threads.
2963 */
2964 if (tg == &root_task_group && rt_runtime == 0)
2965 return -EINVAL;
2966
2967 /* A zero period doesn't make any sense. */
2968 if (rt_period == 0)
2969 return -EINVAL;
2970
2971 /*
2972 * Bound quota to defend quota against overflow during bandwidth shift.
2973 */
2974 if (rt_runtime != RUNTIME_INF && rt_runtime > max_rt_runtime)
2975 return -EINVAL;
2976
2977 mutex_lock(&rt_constraints_mutex);
2978 err = __rt_schedulable(tg, rt_period, rt_runtime);
2979 if (err)
2980 goto unlock;
2981
2982 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2983 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2984 tg->rt_bandwidth.rt_runtime = rt_runtime;
2985
2986 for_each_possible_cpu(i) {
2987 struct rt_rq *rt_rq = tg->rt_rq[i];
2988
2989 raw_spin_lock(&rt_rq->rt_runtime_lock);
2990 rt_rq->rt_runtime = rt_runtime;
2991 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2992 }
2993 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2994 unlock:
2995 mutex_unlock(&rt_constraints_mutex);
2996
2997 return err;
2998 }
2999
3000 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
3001 {
3002 u64 rt_runtime, rt_period;
3003
3004 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
3005 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
3006 if (rt_runtime_us < 0)
3007 rt_runtime = RUNTIME_INF;
3008 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
3009 return -EINVAL;
3010
3011 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3012 }
3013
3014 long sched_group_rt_runtime(struct task_group *tg)
3015 {
3016 u64 rt_runtime_us;
3017
3018 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
3019 return -1;
3020
3021 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
3022 do_div(rt_runtime_us, NSEC_PER_USEC);
3023 return rt_runtime_us;
3024 }
3025
3026 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
3027 {
3028 u64 rt_runtime, rt_period;
3029
3030 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
3031 return -EINVAL;
3032
3033 rt_period = rt_period_us * NSEC_PER_USEC;
3034 rt_runtime = tg->rt_bandwidth.rt_runtime;
3035
3036 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3037 }
3038
3039 long sched_group_rt_period(struct task_group *tg)
3040 {
3041 u64 rt_period_us;
3042
3043 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
3044 do_div(rt_period_us, NSEC_PER_USEC);
3045 return rt_period_us;
3046 }
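/*
 * With CONFIG_RT_GROUP_SCHED these setters and getters back the
 * cpu.rt_runtime_us and cpu.rt_period_us files of the legacy (v1) cpu cgroup
 * controller. A user-space sketch of writing the runtime knob; the mount
 * point and group name below are assumptions for illustration only:
 *
 *	#include <stdio.h>
 *
 *	// Assumes a v1 "cpu" hierarchy at /sys/fs/cgroup/cpu and an existing
 *	// group named "rtgroup"; adjust the path for the local setup.
 *	static int set_group_rt_budget(long runtime_us)
 *	{
 *		FILE *f = fopen("/sys/fs/cgroup/cpu/rtgroup/cpu.rt_runtime_us", "w");
 *
 *		if (!f)
 *			return -1;
 *		fprintf(f, "%ld\n", runtime_us);	// -1 selects RUNTIME_INF
 *		return fclose(f);
 *	}
 */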
3047
3048 #ifdef CONFIG_SYSCTL
3049 static int sched_rt_global_constraints(void)
3050 {
3051 int ret = 0;
3052
3053 mutex_lock(&rt_constraints_mutex);
3054 ret = __rt_schedulable(NULL, 0, 0);
3055 mutex_unlock(&rt_constraints_mutex);
3056
3057 return ret;
3058 }
3059 #endif /* CONFIG_SYSCTL */
3060
3061 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
3062 {
3063 /* Don't accept real-time tasks when there is no way for them to run */
3064 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
3065 return 0;
3066
3067 return 1;
3068 }
3069
3070 #else /* !CONFIG_RT_GROUP_SCHED */
3071
3072 #ifdef CONFIG_SYSCTL
3073 static int sched_rt_global_constraints(void)
3074 {
3075 return 0;
3076 }
3077 #endif /* CONFIG_SYSCTL */
3078 #endif /* CONFIG_RT_GROUP_SCHED */
3079
3080 #ifdef CONFIG_SYSCTL
3081 static int sched_rt_global_validate(void)
3082 {
3083 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
3084 ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) ||
3085 ((u64)sysctl_sched_rt_runtime *
3086 NSEC_PER_USEC > max_rt_runtime)))
3087 return -EINVAL;
3088
3089 return 0;
3090 }
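/*
 * Restated: a negative runtime means "unlimited"; otherwise the runtime must
 * not exceed the period, and its value in nanoseconds must stay below
 * max_rt_runtime so that later shifts by BW_SHIFT cannot overflow. A
 * stand-alone restatement of the check; the limit constant below is an
 * assumption (roughly (1ULL << 44) - 1 ns, i.e. a bit under five hours):
 *
 *	#include <stdbool.h>
 *	#include <stdint.h>
 *
 *	#define EXAMPLE_MAX_RT_RUNTIME_NS	((1ULL << 44) - 1)
 *
 *	static bool rt_globals_valid(int runtime_us, int period_us)
 *	{
 *		if (runtime_us < 0)			// -1: unlimited
 *			return true;
 *		if (runtime_us > period_us)
 *			return false;
 *		return (uint64_t)runtime_us * 1000ULL <= EXAMPLE_MAX_RT_RUNTIME_NS;
 *	}
 */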
3091
3092 static void sched_rt_do_global(void)
3093 {
3094 }
3095
3096 static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer,
3097 size_t *lenp, loff_t *ppos)
3098 {
3099 int old_period, old_runtime;
3100 static DEFINE_MUTEX(mutex);
3101 int ret;
3102
3103 mutex_lock(&mutex);
3104 old_period = sysctl_sched_rt_period;
3105 old_runtime = sysctl_sched_rt_runtime;
3106
3107 ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
3108
3109 if (!ret && write) {
3110 ret = sched_rt_global_validate();
3111 if (ret)
3112 goto undo;
3113
3114 ret = sched_dl_global_validate();
3115 if (ret)
3116 goto undo;
3117
3118 ret = sched_rt_global_constraints();
3119 if (ret)
3120 goto undo;
3121
3122 sched_rt_do_global();
3123 sched_dl_do_global();
3124 }
3125 if (0) {
3126 undo:
3127 sysctl_sched_rt_period = old_period;
3128 sysctl_sched_rt_runtime = old_runtime;
3129 }
3130 mutex_unlock(&mutex);
3131
3132 /*
3133 * After changing the maximum available bandwidth for DEADLINE, we need to
3134 * recompute the per-root-domain and per-CPU variables accordingly.
3135 */
3136 rebuild_sched_domains();
3137
3138 return ret;
3139 }
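/*
 * This handler backs /proc/sys/kernel/sched_rt_period_us and
 * sched_rt_runtime_us (sched_rr_timeslice_ms is handled just below). A
 * user-space sketch of disabling RT throttling by writing -1 to the runtime
 * file, with the usual caveat that a runaway RT task can then monopolize a
 * CPU:
 *
 *	#include <stdio.h>
 *
 *	static int disable_rt_throttling(void)
 *	{
 *		FILE *f = fopen("/proc/sys/kernel/sched_rt_runtime_us", "w");
 *
 *		if (!f)
 *			return -1;
 *		fputs("-1\n", f);	// -1: RT tasks may use 100% of a CPU
 *		return fclose(f);
 *	}
 */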
3140
3141 static int sched_rr_handler(const struct ctl_table *table, int write, void *buffer,
3142 size_t *lenp, loff_t *ppos)
3143 {
3144 int ret;
3145 static DEFINE_MUTEX(mutex);
3146
3147 mutex_lock(&mutex);
3148 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3149 /*
3150 * Make sure that internally we keep jiffies.
3151 * Also, writing zero resets the time-slice to default:
3152 */
3153 if (!ret && write) {
3154 sched_rr_timeslice =
3155 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3156 msecs_to_jiffies(sysctl_sched_rr_timeslice);
3157
3158 if (sysctl_sched_rr_timeslice <= 0)
3159 sysctl_sched_rr_timeslice = jiffies_to_msecs(RR_TIMESLICE);
3160 }
3161 mutex_unlock(&mutex);
3162
3163 return ret;
3164 }
3165 #endif /* CONFIG_SYSCTL */
3166
3167 #ifdef CONFIG_SCHED_DEBUG
3168 void print_rt_stats(struct seq_file *m, int cpu)
3169 {
3170 rt_rq_iter_t iter;
3171 struct rt_rq *rt_rq;
3172
3173 rcu_read_lock();
3174 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3175 print_rt_rq(m, cpu, rt_rq);
3176 rcu_read_unlock();
3177 }
3178 #endif /* CONFIG_SCHED_DEBUG */
3179