1 /*
2 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3 * policies)
4 */
5
6 #include "sched.h"
7
8 #include <linux/slab.h>
9
10 #include "walt.h"
11
12 int sched_rr_timeslice = RR_TIMESLICE;
13
14 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
15
16 struct rt_bandwidth def_rt_bandwidth;
17
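/*
 * Replenishment timer for an rt_bandwidth.  Every time it fires we forward
 * it by one rt_period and let do_sched_rt_period_timer() refill the
 * affected runqueues; it is stopped once all of them have gone idle.
 */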
18 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
19 {
20 struct rt_bandwidth *rt_b =
21 container_of(timer, struct rt_bandwidth, rt_period_timer);
22 ktime_t now;
23 int overrun;
24 int idle = 0;
25
26 for (;;) {
27 now = hrtimer_cb_get_time(timer);
28 overrun = hrtimer_forward(timer, now, rt_b->rt_period);
29
30 if (!overrun)
31 break;
32
33 idle = do_sched_rt_period_timer(rt_b, overrun);
34 }
35
36 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
37 }
38
39 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
40 {
41 rt_b->rt_period = ns_to_ktime(period);
42 rt_b->rt_runtime = runtime;
43
44 raw_spin_lock_init(&rt_b->rt_runtime_lock);
45
46 hrtimer_init(&rt_b->rt_period_timer,
47 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
48 rt_b->rt_period_timer.function = sched_rt_period_timer;
49 }
50
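/*
 * Arm the period timer when RT entities are queued, unless it is already
 * running, throttling is disabled, or the runtime is infinite.
 */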
51 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
52 {
53 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
54 return;
55
56 if (hrtimer_active(&rt_b->rt_period_timer))
57 return;
58
59 raw_spin_lock(&rt_b->rt_runtime_lock);
60 start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
61 raw_spin_unlock(&rt_b->rt_runtime_lock);
62 }
63
64 void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
65 {
66 struct rt_prio_array *array;
67 int i;
68
69 array = &rt_rq->active;
70 for (i = 0; i < MAX_RT_PRIO; i++) {
71 INIT_LIST_HEAD(array->queue + i);
72 __clear_bit(i, array->bitmap);
73 }
74 /* delimiter for bitsearch: */
75 __set_bit(MAX_RT_PRIO, array->bitmap);
76
77 #if defined CONFIG_SMP
78 rt_rq->highest_prio.curr = MAX_RT_PRIO;
79 rt_rq->highest_prio.next = MAX_RT_PRIO;
80 rt_rq->rt_nr_migratory = 0;
81 rt_rq->overloaded = 0;
82 plist_head_init(&rt_rq->pushable_tasks);
83 #endif
84 /* We start in dequeued state, because no RT tasks are queued */
85 rt_rq->rt_queued = 0;
86
87 rt_rq->rt_time = 0;
88 rt_rq->rt_throttled = 0;
89 rt_rq->rt_runtime = 0;
90 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
91 }
92
93 #ifdef CONFIG_RT_GROUP_SCHED
94 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
95 {
96 hrtimer_cancel(&rt_b->rt_period_timer);
97 }
98
99 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
100
101 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
102 {
103 #ifdef CONFIG_SCHED_DEBUG
104 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
105 #endif
106 return container_of(rt_se, struct task_struct, rt);
107 }
108
109 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
110 {
111 return rt_rq->rq;
112 }
113
114 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
115 {
116 return rt_se->rt_rq;
117 }
118
119 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
120 {
121 struct rt_rq *rt_rq = rt_se->rt_rq;
122
123 return rt_rq->rq;
124 }
125
126 void free_rt_sched_group(struct task_group *tg)
127 {
128 int i;
129
130 if (tg->rt_se)
131 destroy_rt_bandwidth(&tg->rt_bandwidth);
132
133 for_each_possible_cpu(i) {
134 if (tg->rt_rq)
135 kfree(tg->rt_rq[i]);
136 if (tg->rt_se)
137 kfree(tg->rt_se[i]);
138 }
139
140 kfree(tg->rt_rq);
141 kfree(tg->rt_se);
142 }
143
144 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
145 struct sched_rt_entity *rt_se, int cpu,
146 struct sched_rt_entity *parent)
147 {
148 struct rq *rq = cpu_rq(cpu);
149
150 rt_rq->highest_prio.curr = MAX_RT_PRIO;
151 rt_rq->rt_nr_boosted = 0;
152 rt_rq->rq = rq;
153 rt_rq->tg = tg;
154
155 tg->rt_rq[cpu] = rt_rq;
156 tg->rt_se[cpu] = rt_se;
157
158 if (!rt_se)
159 return;
160
161 if (!parent)
162 rt_se->rt_rq = &rq->rt;
163 else
164 rt_se->rt_rq = parent->my_q;
165
166 rt_se->my_q = rt_rq;
167 rt_se->parent = parent;
168 INIT_LIST_HEAD(&rt_se->run_list);
169 }
170
171 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
172 {
173 struct rt_rq *rt_rq;
174 struct sched_rt_entity *rt_se;
175 int i;
176
177 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
178 if (!tg->rt_rq)
179 goto err;
180 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
181 if (!tg->rt_se)
182 goto err;
183
184 init_rt_bandwidth(&tg->rt_bandwidth,
185 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
186
187 for_each_possible_cpu(i) {
188 rt_rq = kzalloc_node(sizeof(struct rt_rq),
189 GFP_KERNEL, cpu_to_node(i));
190 if (!rt_rq)
191 goto err;
192
193 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
194 GFP_KERNEL, cpu_to_node(i));
195 if (!rt_se)
196 goto err_free_rq;
197
198 init_rt_rq(rt_rq, cpu_rq(i));
199 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
200 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
201 }
202
203 return 1;
204
205 err_free_rq:
206 kfree(rt_rq);
207 err:
208 return 0;
209 }
210
211 #else /* CONFIG_RT_GROUP_SCHED */
212
213 #define rt_entity_is_task(rt_se) (1)
214
215 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
216 {
217 return container_of(rt_se, struct task_struct, rt);
218 }
219
220 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
221 {
222 return container_of(rt_rq, struct rq, rt);
223 }
224
225 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
226 {
227 struct task_struct *p = rt_task_of(rt_se);
228
229 return task_rq(p);
230 }
231
232 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
233 {
234 struct rq *rq = rq_of_rt_se(rt_se);
235
236 return &rq->rt;
237 }
238
239 void free_rt_sched_group(struct task_group *tg) { }
240
241 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
242 {
243 return 1;
244 }
245 #endif /* CONFIG_RT_GROUP_SCHED */
246
247 #ifdef CONFIG_SMP
248
249 static int pull_rt_task(struct rq *this_rq);
250
251 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
252 {
253 /* Try to pull RT tasks here if we lower this rq's prio */
254 return rq->rt.highest_prio.curr > prev->prio;
255 }
256
257 static inline int rt_overloaded(struct rq *rq)
258 {
259 return atomic_read(&rq->rd->rto_count);
260 }
261
262 static inline void rt_set_overload(struct rq *rq)
263 {
264 if (!rq->online)
265 return;
266
267 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
268 /*
269 * Make sure the mask is visible before we set
270 * the overload count. That is checked to determine
271 * if we should look at the mask. It would be a shame
272 * if we looked at the mask, but the mask was not
273 * updated yet.
274 *
275 * Matched by the barrier in pull_rt_task().
276 */
277 smp_wmb();
278 atomic_inc(&rq->rd->rto_count);
279 }
280
281 static inline void rt_clear_overload(struct rq *rq)
282 {
283 if (!rq->online)
284 return;
285
286 /* the order here really doesn't matter */
287 atomic_dec(&rq->rd->rto_count);
288 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
289 }
290
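/*
 * An rq is "RT overloaded" once it has more than one RT task queued and at
 * least one of them can migrate.  Record that state in the root domain so
 * other CPUs know there is something worth pulling here.
 */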
291 static void update_rt_migration(struct rt_rq *rt_rq)
292 {
293 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
294 if (!rt_rq->overloaded) {
295 rt_set_overload(rq_of_rt_rq(rt_rq));
296 rt_rq->overloaded = 1;
297 }
298 } else if (rt_rq->overloaded) {
299 rt_clear_overload(rq_of_rt_rq(rt_rq));
300 rt_rq->overloaded = 0;
301 }
302 }
303
304 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
305 {
306 struct task_struct *p;
307
308 if (!rt_entity_is_task(rt_se))
309 return;
310
311 p = rt_task_of(rt_se);
312 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
313
314 rt_rq->rt_nr_total++;
315 if (p->nr_cpus_allowed > 1)
316 rt_rq->rt_nr_migratory++;
317
318 update_rt_migration(rt_rq);
319 }
320
321 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
322 {
323 struct task_struct *p;
324
325 if (!rt_entity_is_task(rt_se))
326 return;
327
328 p = rt_task_of(rt_se);
329 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
330
331 rt_rq->rt_nr_total--;
332 if (p->nr_cpus_allowed > 1)
333 rt_rq->rt_nr_migratory--;
334
335 update_rt_migration(rt_rq);
336 }
337
338 static inline int has_pushable_tasks(struct rq *rq)
339 {
340 return !plist_head_empty(&rq->rt.pushable_tasks);
341 }
342
343 static inline void set_post_schedule(struct rq *rq)
344 {
345 /*
346 * We detect this state here so that we can avoid taking the RQ
347 * lock again later if there is no need to push
348 */
349 rq->post_schedule = has_pushable_tasks(rq);
350 }
351
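/*
 * Maintain the priority-ordered plist of tasks that could be pushed to
 * another CPU, and keep highest_prio.next pointing at its head.
 */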
352 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
353 {
354 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
355 plist_node_init(&p->pushable_tasks, p->prio);
356 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
357
358 /* Update the highest prio pushable task */
359 if (p->prio < rq->rt.highest_prio.next)
360 rq->rt.highest_prio.next = p->prio;
361 }
362
363 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
364 {
365 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
366
367 /* Update the new highest prio pushable task */
368 if (has_pushable_tasks(rq)) {
369 p = plist_first_entry(&rq->rt.pushable_tasks,
370 struct task_struct, pushable_tasks);
371 rq->rt.highest_prio.next = p->prio;
372 } else
373 rq->rt.highest_prio.next = MAX_RT_PRIO;
374 }
375
376 #else
377
378 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
379 {
380 }
381
382 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
383 {
384 }
385
386 static inline
387 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
388 {
389 }
390
391 static inline
392 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
393 {
394 }
395
396 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
397 {
398 return false;
399 }
400
401 static inline int pull_rt_task(struct rq *this_rq)
402 {
403 return 0;
404 }
405
406 static inline void set_post_schedule(struct rq *rq)
407 {
408 }
409 #endif /* CONFIG_SMP */
410
411 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
412 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
413
414 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
415 {
416 return !list_empty(&rt_se->run_list);
417 }
418
419 #ifdef CONFIG_RT_GROUP_SCHED
420
421 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
422 {
423 if (!rt_rq->tg)
424 return RUNTIME_INF;
425
426 return rt_rq->rt_runtime;
427 }
428
429 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
430 {
431 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
432 }
433
434 typedef struct task_group *rt_rq_iter_t;
435
436 static inline struct task_group *next_task_group(struct task_group *tg)
437 {
438 do {
439 tg = list_entry_rcu(tg->list.next,
440 typeof(struct task_group), list);
441 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
442
443 if (&tg->list == &task_groups)
444 tg = NULL;
445
446 return tg;
447 }
448
449 #define for_each_rt_rq(rt_rq, iter, rq) \
450 for (iter = container_of(&task_groups, typeof(*iter), list); \
451 (iter = next_task_group(iter)) && \
452 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
453
454 #define for_each_sched_rt_entity(rt_se) \
455 for (; rt_se; rt_se = rt_se->parent)
456
457 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
458 {
459 return rt_se->my_q;
460 }
461
462 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
463 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
464
465 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
466 {
467 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
468 struct rq *rq = rq_of_rt_rq(rt_rq);
469 struct sched_rt_entity *rt_se;
470
471 int cpu = cpu_of(rq);
472
473 rt_se = rt_rq->tg->rt_se[cpu];
474
475 if (rt_rq->rt_nr_running) {
476 if (!rt_se)
477 enqueue_top_rt_rq(rt_rq);
478 else if (!on_rt_rq(rt_se))
479 enqueue_rt_entity(rt_se, false);
480
481 if (rt_rq->highest_prio.curr < curr->prio)
482 resched_curr(rq);
483 }
484 }
485
486 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
487 {
488 struct sched_rt_entity *rt_se;
489 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
490
491 rt_se = rt_rq->tg->rt_se[cpu];
492
493 if (!rt_se)
494 dequeue_top_rt_rq(rt_rq);
495 else if (on_rt_rq(rt_se))
496 dequeue_rt_entity(rt_se);
497 }
498
499 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
500 {
501 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
502 }
503
504 static int rt_se_boosted(struct sched_rt_entity *rt_se)
505 {
506 struct rt_rq *rt_rq = group_rt_rq(rt_se);
507 struct task_struct *p;
508
509 if (rt_rq)
510 return !!rt_rq->rt_nr_boosted;
511
512 p = rt_task_of(rt_se);
513 return p->prio != p->normal_prio;
514 }
515
516 #ifdef CONFIG_SMP
517 static inline const struct cpumask *sched_rt_period_mask(void)
518 {
519 return this_rq()->rd->span;
520 }
521 #else
522 static inline const struct cpumask *sched_rt_period_mask(void)
523 {
524 return cpu_online_mask;
525 }
526 #endif
527
528 static inline
529 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
530 {
531 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
532 }
533
534 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
535 {
536 return &rt_rq->tg->rt_bandwidth;
537 }
538
539 #else /* !CONFIG_RT_GROUP_SCHED */
540
541 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
542 {
543 return rt_rq->rt_runtime;
544 }
545
546 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
547 {
548 return ktime_to_ns(def_rt_bandwidth.rt_period);
549 }
550
551 typedef struct rt_rq *rt_rq_iter_t;
552
553 #define for_each_rt_rq(rt_rq, iter, rq) \
554 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
555
556 #define for_each_sched_rt_entity(rt_se) \
557 for (; rt_se; rt_se = NULL)
558
559 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
560 {
561 return NULL;
562 }
563
564 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
565 {
566 struct rq *rq = rq_of_rt_rq(rt_rq);
567
568 if (!rt_rq->rt_nr_running)
569 return;
570
571 enqueue_top_rt_rq(rt_rq);
572 resched_curr(rq);
573 }
574
575 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
576 {
577 dequeue_top_rt_rq(rt_rq);
578 }
579
580 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
581 {
582 return rt_rq->rt_throttled;
583 }
584
585 static inline const struct cpumask *sched_rt_period_mask(void)
586 {
587 return cpu_online_mask;
588 }
589
590 static inline
591 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
592 {
593 return &cpu_rq(cpu)->rt;
594 }
595
596 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
597 {
598 return &def_rt_bandwidth;
599 }
600
601 #endif /* CONFIG_RT_GROUP_SCHED */
602
603 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
604 {
605 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
606
607 return (hrtimer_active(&rt_b->rt_period_timer) ||
608 rt_rq->rt_time < rt_b->rt_runtime);
609 }
610
611 #ifdef CONFIG_SMP
612 /*
613 * We ran out of runtime, see if we can borrow some from our neighbours.
614 */
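/*
 * A starved rq takes 1/weight of each neighbour's unused runtime, where
 * weight is the number of CPUs in the root domain, and never accumulates
 * more than one full rt_period.  For example, with rt_period = 1000ms,
 * rt_runtime = 950ms and 4 CPUs, an exhausted rq may borrow up to
 * (950ms - time already consumed on that neighbour) / 4 from each of them.
 */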
615 static int do_balance_runtime(struct rt_rq *rt_rq)
616 {
617 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
618 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
619 int i, weight, more = 0;
620 u64 rt_period;
621
622 weight = cpumask_weight(rd->span);
623
624 raw_spin_lock(&rt_b->rt_runtime_lock);
625 rt_period = ktime_to_ns(rt_b->rt_period);
626 for_each_cpu(i, rd->span) {
627 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
628 s64 diff;
629
630 if (iter == rt_rq)
631 continue;
632
633 raw_spin_lock(&iter->rt_runtime_lock);
634 /*
635 * Either all rqs have inf runtime and there's nothing to steal
636 * or __disable_runtime() below sets a specific rq to inf to
637 * indicate it's been disabled and disallow stealing.
638 */
639 if (iter->rt_runtime == RUNTIME_INF)
640 goto next;
641
642 /*
643 * From runqueues with spare time, take 1/n part of their
644 * spare time, but no more than our period.
645 */
646 diff = iter->rt_runtime - iter->rt_time;
647 if (diff > 0) {
648 diff = div_u64((u64)diff, weight);
649 if (rt_rq->rt_runtime + diff > rt_period)
650 diff = rt_period - rt_rq->rt_runtime;
651 iter->rt_runtime -= diff;
652 rt_rq->rt_runtime += diff;
653 more = 1;
654 if (rt_rq->rt_runtime == rt_period) {
655 raw_spin_unlock(&iter->rt_runtime_lock);
656 break;
657 }
658 }
659 next:
660 raw_spin_unlock(&iter->rt_runtime_lock);
661 }
662 raw_spin_unlock(&rt_b->rt_runtime_lock);
663
664 return more;
665 }
666
667 /*
668 * Ensure this RQ takes back all the runtime it lent to its neighbours.
669 */
670 static void __disable_runtime(struct rq *rq)
671 {
672 struct root_domain *rd = rq->rd;
673 rt_rq_iter_t iter;
674 struct rt_rq *rt_rq;
675
676 if (unlikely(!scheduler_running))
677 return;
678
679 for_each_rt_rq(rt_rq, iter, rq) {
680 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
681 s64 want;
682 int i;
683
684 raw_spin_lock(&rt_b->rt_runtime_lock);
685 raw_spin_lock(&rt_rq->rt_runtime_lock);
686 /*
687 * Either we're all inf and nobody needs to borrow, or we're
688 * already disabled and thus have nothing to do, or we have
689 * exactly the right amount of runtime to take out.
690 */
691 if (rt_rq->rt_runtime == RUNTIME_INF ||
692 rt_rq->rt_runtime == rt_b->rt_runtime)
693 goto balanced;
694 raw_spin_unlock(&rt_rq->rt_runtime_lock);
695
696 /*
697 * Calculate the difference between what we started out with
698 * and what we currently have; that's the amount of runtime
699 * we lent and now have to reclaim.
700 */
701 want = rt_b->rt_runtime - rt_rq->rt_runtime;
702
703 /*
704 * Greedy reclaim, take back as much as we can.
705 */
706 for_each_cpu(i, rd->span) {
707 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
708 s64 diff;
709
710 /*
711 * Can't reclaim from ourselves or disabled runqueues.
712 */
713 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
714 continue;
715
716 raw_spin_lock(&iter->rt_runtime_lock);
717 if (want > 0) {
718 diff = min_t(s64, iter->rt_runtime, want);
719 iter->rt_runtime -= diff;
720 want -= diff;
721 } else {
722 iter->rt_runtime -= want;
723 want -= want;
724 }
725 raw_spin_unlock(&iter->rt_runtime_lock);
726
727 if (!want)
728 break;
729 }
730
731 raw_spin_lock(&rt_rq->rt_runtime_lock);
732 /*
733 * We cannot be left wanting - that would mean some runtime
734 * leaked out of the system.
735 */
736 BUG_ON(want);
737 balanced:
738 /*
739 * Disable all the borrow logic by pretending we have inf
740 * runtime - in which case borrowing doesn't make sense.
741 */
742 rt_rq->rt_runtime = RUNTIME_INF;
743 rt_rq->rt_throttled = 0;
744 raw_spin_unlock(&rt_rq->rt_runtime_lock);
745 raw_spin_unlock(&rt_b->rt_runtime_lock);
746
747 /* Make rt_rq available for pick_next_task() */
748 sched_rt_rq_enqueue(rt_rq);
749 }
750 }
751
752 static void __enable_runtime(struct rq *rq)
753 {
754 rt_rq_iter_t iter;
755 struct rt_rq *rt_rq;
756
757 if (unlikely(!scheduler_running))
758 return;
759
760 /*
761 * Reset each runqueue's bandwidth settings
762 */
763 for_each_rt_rq(rt_rq, iter, rq) {
764 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
765
766 raw_spin_lock(&rt_b->rt_runtime_lock);
767 raw_spin_lock(&rt_rq->rt_runtime_lock);
768 rt_rq->rt_runtime = rt_b->rt_runtime;
769 rt_rq->rt_time = 0;
770 rt_rq->rt_throttled = 0;
771 raw_spin_unlock(&rt_rq->rt_runtime_lock);
772 raw_spin_unlock(&rt_b->rt_runtime_lock);
773 }
774 }
775
776 static int balance_runtime(struct rt_rq *rt_rq)
777 {
778 int more = 0;
779
780 if (!sched_feat(RT_RUNTIME_SHARE))
781 return more;
782
783 if (rt_rq->rt_time > rt_rq->rt_runtime) {
784 raw_spin_unlock(&rt_rq->rt_runtime_lock);
785 more = do_balance_runtime(rt_rq);
786 raw_spin_lock(&rt_rq->rt_runtime_lock);
787 }
788
789 return more;
790 }
791 #else /* !CONFIG_SMP */
792 static inline int balance_runtime(struct rt_rq *rt_rq)
793 {
794 return 0;
795 }
796 #endif /* CONFIG_SMP */
797
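/*
 * Runs from the bandwidth timer once per elapsed period: discount each
 * rq's accumulated rt_time by the refilled runtime, unthrottle queues that
 * fit within their budget again, and return 1 when everything is idle so
 * the timer can stop.
 */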
798 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
799 {
800 int i, idle = 1, throttled = 0;
801 const struct cpumask *span;
802
803 span = sched_rt_period_mask();
804 #ifdef CONFIG_RT_GROUP_SCHED
805 /*
806 * FIXME: isolated CPUs should really leave the root task group,
807 * whether they are isolcpus or were isolated via cpusets, lest
808 * the timer run on a CPU which does not service all runqueues,
809 * potentially leaving other CPUs indefinitely throttled. If
810 * isolation is really required, the user will turn the throttle
811 * off to kill the perturbations it causes anyway. Meanwhile,
812 * this maintains functionality for boot and/or troubleshooting.
813 */
814 if (rt_b == &root_task_group.rt_bandwidth)
815 span = cpu_online_mask;
816 #endif
817 for_each_cpu(i, span) {
818 int enqueue = 0;
819 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
820 struct rq *rq = rq_of_rt_rq(rt_rq);
821
822 raw_spin_lock(&rq->lock);
823 if (rt_rq->rt_time) {
824 u64 runtime;
825
826 raw_spin_lock(&rt_rq->rt_runtime_lock);
827 if (rt_rq->rt_throttled)
828 balance_runtime(rt_rq);
829 runtime = rt_rq->rt_runtime;
830 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
831 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
832 rt_rq->rt_throttled = 0;
833 enqueue = 1;
834
835 /*
836 * Force a clock update if the CPU was idle,
837 * lest wakeup -> unthrottle time accumulate.
838 */
839 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
840 rq->skip_clock_update = -1;
841 }
842 if (rt_rq->rt_time || rt_rq->rt_nr_running)
843 idle = 0;
844 raw_spin_unlock(&rt_rq->rt_runtime_lock);
845 } else if (rt_rq->rt_nr_running) {
846 idle = 0;
847 if (!rt_rq_throttled(rt_rq))
848 enqueue = 1;
849 }
850 if (rt_rq->rt_throttled)
851 throttled = 1;
852
853 if (enqueue)
854 sched_rt_rq_enqueue(rt_rq);
855 raw_spin_unlock(&rq->lock);
856 }
857
858 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
859 return 1;
860
861 return idle;
862 }
863
864 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
865 {
866 #ifdef CONFIG_RT_GROUP_SCHED
867 struct rt_rq *rt_rq = group_rt_rq(rt_se);
868
869 if (rt_rq)
870 return rt_rq->highest_prio.curr;
871 #endif
872
873 return rt_task_of(rt_se)->prio;
874 }
875
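/*
 * Report which RT tasks were runnable on this rt_rq when throttling kicked
 * in, as a hint to the likely CPU hogs.  The caller makes sure this is
 * printed only once.
 */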
876 static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
877 {
878 struct rt_prio_array *array = &rt_rq->active;
879 struct sched_rt_entity *rt_se;
880 char buf[500];
881 char *pos = buf;
882 char *end = buf + sizeof(buf);
883 int idx;
884
885 pos += snprintf(pos, sizeof(buf),
886 "sched: RT throttling activated for rt_rq %p (cpu %d)\n",
887 rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
888
889 if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
890 goto out;
891
892 pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
893 idx = sched_find_first_bit(array->bitmap);
894 while (idx < MAX_RT_PRIO) {
895 list_for_each_entry(rt_se, array->queue + idx, run_list) {
896 struct task_struct *p;
897
898 if (!rt_entity_is_task(rt_se))
899 continue;
900
901 p = rt_task_of(rt_se);
902 if (pos < end)
903 pos += snprintf(pos, end - pos, "\t%s (%d)\n",
904 p->comm, p->pid);
905 }
906 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
907 }
908 out:
909 #ifdef CONFIG_PANIC_ON_RT_THROTTLING
910 /*
911 * Use pr_err() in the BUG() case since printk_sched() will
912 * not get flushed and deadlock is not a concern.
913 */
914 pr_err("%s", buf);
915 BUG();
916 #else
917 printk_deferred("%s", buf);
918 #endif
919 }
920
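/*
 * Charge-side check, called with rt_runtime_lock held: after trying to
 * borrow runtime from neighbours, decide whether this rt_rq has exhausted
 * its budget for the current period.  Returns nonzero when the queue is
 * throttled, telling update_curr_rt() to reschedule.
 */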
921 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
922 {
923 u64 runtime = sched_rt_runtime(rt_rq);
924
925 if (rt_rq->rt_throttled)
926 return rt_rq_throttled(rt_rq);
927
928 if (runtime >= sched_rt_period(rt_rq))
929 return 0;
930
931 balance_runtime(rt_rq);
932 runtime = sched_rt_runtime(rt_rq);
933 if (runtime == RUNTIME_INF)
934 return 0;
935
936 if (rt_rq->rt_time > runtime) {
937 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
938
939 /*
940 * Don't actually throttle groups that have no runtime assigned
941 * but accrue some time due to boosting.
942 */
943 if (likely(rt_b->rt_runtime)) {
944 static bool once = false;
945
946 rt_rq->rt_throttled = 1;
947
948 if (!once) {
949 once = true;
950 dump_throttled_rt_tasks(rt_rq);
951 }
952 } else {
953 /*
954 * In case we did anyway, make it go away,
955 * replenishment is a joke, since it will replenish us
956 * with exactly 0 ns.
957 */
958 rt_rq->rt_time = 0;
959 }
960
961 if (rt_rq_throttled(rt_rq)) {
962 sched_rt_rq_dequeue(rt_rq);
963 return 1;
964 }
965 }
966
967 return 0;
968 }
969
970 /*
971 * Update the current task's runtime statistics. Skip current tasks that
972 * are not in our scheduling class.
973 */
974 static void update_curr_rt(struct rq *rq)
975 {
976 struct task_struct *curr = rq->curr;
977 struct sched_rt_entity *rt_se = &curr->rt;
978 u64 delta_exec;
979
980 if (curr->sched_class != &rt_sched_class)
981 return;
982
983 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
984 if (unlikely((s64)delta_exec <= 0))
985 return;
986
987 schedstat_set(curr->se.statistics.exec_max,
988 max(curr->se.statistics.exec_max, delta_exec));
989
990 curr->se.sum_exec_runtime += delta_exec;
991 account_group_exec_runtime(curr, delta_exec);
992
993 curr->se.exec_start = rq_clock_task(rq);
994 cpuacct_charge(curr, delta_exec);
995
996 sched_rt_avg_update(rq, delta_exec);
997
998 if (!rt_bandwidth_enabled())
999 return;
1000
1001 for_each_sched_rt_entity(rt_se) {
1002 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1003
1004 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1005 raw_spin_lock(&rt_rq->rt_runtime_lock);
1006 rt_rq->rt_time += delta_exec;
1007 if (sched_rt_runtime_exceeded(rt_rq))
1008 resched_curr(rq);
1009 raw_spin_unlock(&rt_rq->rt_runtime_lock);
1010 }
1011 }
1012 }
1013
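/*
 * The root rt_rq contributes its rt_nr_running to the rq's global
 * nr_running only while it is "queued".  These helpers add or remove that
 * contribution as the RT class becomes runnable, empty or throttled.
 */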
1014 static void
1015 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1016 {
1017 struct rq *rq = rq_of_rt_rq(rt_rq);
1018
1019 BUG_ON(&rq->rt != rt_rq);
1020
1021 if (!rt_rq->rt_queued)
1022 return;
1023
1024 BUG_ON(!rq->nr_running);
1025
1026 sub_nr_running(rq, rt_rq->rt_nr_running);
1027 rt_rq->rt_queued = 0;
1028 }
1029
1030 static void
1031 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1032 {
1033 struct rq *rq = rq_of_rt_rq(rt_rq);
1034
1035 BUG_ON(&rq->rt != rt_rq);
1036
1037 if (rt_rq->rt_queued)
1038 return;
1039 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1040 return;
1041
1042 add_nr_running(rq, rt_rq->rt_nr_running);
1043 rt_rq->rt_queued = 1;
1044 }
1045
1046 #if defined CONFIG_SMP
1047
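/*
 * Propagate changes of the rq's highest RT priority into cpupri, so that
 * find_lowest_rq() on other CPUs sees an up-to-date picture.
 */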
1048 static void
1049 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1050 {
1051 struct rq *rq = rq_of_rt_rq(rt_rq);
1052
1053 #ifdef CONFIG_RT_GROUP_SCHED
1054 /*
1055 * Change rq's cpupri only if rt_rq is the top queue.
1056 */
1057 if (&rq->rt != rt_rq)
1058 return;
1059 #endif
1060 if (rq->online && prio < prev_prio)
1061 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1062 }
1063
1064 static void
1065 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1066 {
1067 struct rq *rq = rq_of_rt_rq(rt_rq);
1068
1069 #ifdef CONFIG_RT_GROUP_SCHED
1070 /*
1071 * Change rq's cpupri only if rt_rq is the top queue.
1072 */
1073 if (&rq->rt != rt_rq)
1074 return;
1075 #endif
1076 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1077 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1078 }
1079
1080 #else /* CONFIG_SMP */
1081
1082 static inline
1083 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1084 static inline
1085 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1086
1087 #endif /* CONFIG_SMP */
1088
1089 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1090 static void
1091 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1092 {
1093 int prev_prio = rt_rq->highest_prio.curr;
1094
1095 if (prio < prev_prio)
1096 rt_rq->highest_prio.curr = prio;
1097
1098 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1099 }
1100
1101 static void
1102 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1103 {
1104 int prev_prio = rt_rq->highest_prio.curr;
1105
1106 if (rt_rq->rt_nr_running) {
1107
1108 WARN_ON(prio < prev_prio);
1109
1110 /*
1111 * This may have been our highest task, and therefore
1112 * we may have some recomputation to do
1113 */
1114 if (prio == prev_prio) {
1115 struct rt_prio_array *array = &rt_rq->active;
1116
1117 rt_rq->highest_prio.curr =
1118 sched_find_first_bit(array->bitmap);
1119 }
1120
1121 } else
1122 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1123
1124 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1125 }
1126
1127 #else
1128
1129 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1130 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1131
1132 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1133
1134 #ifdef CONFIG_RT_GROUP_SCHED
1135
1136 static void
1137 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1138 {
1139 if (rt_se_boosted(rt_se))
1140 rt_rq->rt_nr_boosted++;
1141
1142 if (rt_rq->tg)
1143 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1144 }
1145
1146 static void
1147 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1148 {
1149 if (rt_se_boosted(rt_se))
1150 rt_rq->rt_nr_boosted--;
1151
1152 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1153 }
1154
1155 #else /* CONFIG_RT_GROUP_SCHED */
1156
1157 static void
1158 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1159 {
1160 start_rt_bandwidth(&def_rt_bandwidth);
1161 }
1162
1163 static inline
1164 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1165
1166 #endif /* CONFIG_RT_GROUP_SCHED */
1167
1168 static inline
1169 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1170 {
1171 struct rt_rq *group_rq = group_rt_rq(rt_se);
1172
1173 if (group_rq)
1174 return group_rq->rt_nr_running;
1175 else
1176 return 1;
1177 }
1178
1179 static inline
1180 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1181 {
1182 int prio = rt_se_prio(rt_se);
1183
1184 WARN_ON(!rt_prio(prio));
1185 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1186
1187 inc_rt_prio(rt_rq, prio);
1188 inc_rt_migration(rt_se, rt_rq);
1189 inc_rt_group(rt_se, rt_rq);
1190 }
1191
1192 static inline
1193 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1194 {
1195 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1196 WARN_ON(!rt_rq->rt_nr_running);
1197 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1198
1199 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1200 dec_rt_migration(rt_se, rt_rq);
1201 dec_rt_group(rt_se, rt_rq);
1202 }
1203
1204 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1205 {
1206 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1207 struct rt_prio_array *array = &rt_rq->active;
1208 struct rt_rq *group_rq = group_rt_rq(rt_se);
1209 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1210
1211 /*
1212 * Don't enqueue the group if it's throttled, or when empty.
1213 * The latter is a consequence of the former when a child group
1214 * gets throttled and the current group doesn't have any other
1215 * active members.
1216 */
1217 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1218 return;
1219
1220 if (head)
1221 list_add(&rt_se->run_list, queue);
1222 else
1223 list_add_tail(&rt_se->run_list, queue);
1224 __set_bit(rt_se_prio(rt_se), array->bitmap);
1225
1226 inc_rt_tasks(rt_se, rt_rq);
1227 }
1228
1229 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1230 {
1231 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1232 struct rt_prio_array *array = &rt_rq->active;
1233
1234 list_del_init(&rt_se->run_list);
1235 if (list_empty(array->queue + rt_se_prio(rt_se)))
1236 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1237
1238 dec_rt_tasks(rt_se, rt_rq);
1239 }
1240
1241 /*
1242 * Because the prio of an upper entry depends on the lower
1243 * entries, we must remove entries top - down.
1244 */
1245 static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1246 {
1247 struct sched_rt_entity *back = NULL;
1248
1249 for_each_sched_rt_entity(rt_se) {
1250 rt_se->back = back;
1251 back = rt_se;
1252 }
1253
1254 dequeue_top_rt_rq(rt_rq_of_se(back));
1255
1256 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1257 if (on_rt_rq(rt_se))
1258 __dequeue_rt_entity(rt_se);
1259 }
1260 }
1261
1262 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1263 {
1264 struct rq *rq = rq_of_rt_se(rt_se);
1265
1266 dequeue_rt_stack(rt_se);
1267 for_each_sched_rt_entity(rt_se)
1268 __enqueue_rt_entity(rt_se, head);
1269 enqueue_top_rt_rq(&rq->rt);
1270 }
1271
1272 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1273 {
1274 struct rq *rq = rq_of_rt_se(rt_se);
1275
1276 dequeue_rt_stack(rt_se);
1277
1278 for_each_sched_rt_entity(rt_se) {
1279 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1280
1281 if (rt_rq && rt_rq->rt_nr_running)
1282 __enqueue_rt_entity(rt_se, false);
1283 }
1284 enqueue_top_rt_rq(&rq->rt);
1285 }
1286
1287 /*
1288 * Adding/removing a task to/from a priority array:
1289 */
1290 static void
1291 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1292 {
1293 struct sched_rt_entity *rt_se = &p->rt;
1294
1295 if (flags & ENQUEUE_WAKEUP)
1296 rt_se->timeout = 0;
1297
1298 enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1299 walt_inc_cumulative_runnable_avg(rq, p);
1300
1301 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1302 enqueue_pushable_task(rq, p);
1303 }
1304
1305 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1306 {
1307 struct sched_rt_entity *rt_se = &p->rt;
1308
1309 update_curr_rt(rq);
1310 dequeue_rt_entity(rt_se);
1311 walt_dec_cumulative_runnable_avg(rq, p);
1312
1313 dequeue_pushable_task(rq, p);
1314 }
1315
1316 /*
1317 * Put the task at the head or the tail of the run list without the overhead of
1318 * dequeue followed by enqueue.
1319 */
1320 static void
1321 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1322 {
1323 if (on_rt_rq(rt_se)) {
1324 struct rt_prio_array *array = &rt_rq->active;
1325 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1326
1327 if (head)
1328 list_move(&rt_se->run_list, queue);
1329 else
1330 list_move_tail(&rt_se->run_list, queue);
1331 }
1332 }
1333
1334 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1335 {
1336 struct sched_rt_entity *rt_se = &p->rt;
1337 struct rt_rq *rt_rq;
1338
1339 for_each_sched_rt_entity(rt_se) {
1340 rt_rq = rt_rq_of_se(rt_se);
1341 requeue_rt_entity(rt_rq, rt_se, head);
1342 }
1343 }
1344
1345 static void yield_task_rt(struct rq *rq)
1346 {
1347 requeue_task_rt(rq, rq->curr, 0);
1348 }
1349
1350 #ifdef CONFIG_SMP
1351 static int find_lowest_rq(struct task_struct *task);
1352
1353 static int
1354 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1355 {
1356 struct task_struct *curr;
1357 struct rq *rq;
1358
1359 if (p->nr_cpus_allowed == 1)
1360 goto out;
1361
1362 /* For anything but wake ups, just return the task_cpu */
1363 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1364 goto out;
1365
1366 rq = cpu_rq(cpu);
1367
1368 rcu_read_lock();
1369 curr = READ_ONCE(rq->curr); /* unlocked access */
1370
1371 /*
1372 * If the current task on @p's runqueue is an RT task, then
1373 * try to see if we can wake this RT task up on another
1374 * runqueue. Otherwise simply start this RT task
1375 * on its current runqueue.
1376 *
1377 * We want to avoid overloading runqueues. If the woken
1378 * task is a higher priority, then it will stay on this CPU
1379 * and the lower prio task should be moved to another CPU.
1380 * Even though this will probably make the lower prio task
1381 * lose its cache, we do not want to bounce a higher task
1382 * around just because it gave up its CPU, perhaps for a
1383 * lock?
1384 *
1385 * For equal prio tasks, we just let the scheduler sort it out.
1386 *
1387 * Otherwise, just let it ride on the affined RQ and the
1388 * post-schedule router will push the preempted task away
1389 *
1390 * This test is optimistic; if we get it wrong, the load-balancer
1391 * will have to sort it out.
1392 */
1393 if (curr && unlikely(rt_task(curr)) &&
1394 (curr->nr_cpus_allowed < 2 ||
1395 curr->prio <= p->prio)) {
1396 int target = find_lowest_rq(p);
1397
1398 /*
1399 * Possible race. Don't bother moving it if the
1400 * destination CPU is not running a lower priority task.
1401 */
1402 if (target != -1 &&
1403 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1404 cpu = target;
1405 }
1406 rcu_read_unlock();
1407
1408 out:
1409 return cpu;
1410 }
1411
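/*
 * Equal-priority wakeup on SMP: when the woken task cannot go anywhere
 * else but current can, requeue the waker at the head and reschedule so
 * the push logic can move current to another CPU.
 */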
1412 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1413 {
1414 if (rq->curr->nr_cpus_allowed == 1)
1415 return;
1416
1417 if (p->nr_cpus_allowed != 1
1418 && cpupri_find(&rq->rd->cpupri, p, NULL))
1419 return;
1420
1421 if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1422 return;
1423
1424 /*
1425 * There appear to be other CPUs that can accept
1426 * current and none to run 'p', so let's reschedule
1427 * to try and push current away:
1428 */
1429 requeue_task_rt(rq, p, 1);
1430 resched_curr(rq);
1431 }
1432
1433 #endif /* CONFIG_SMP */
1434
1435 /*
1436 * Preempt the current task with a newly woken task if needed:
1437 */
1438 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1439 {
1440 if (p->prio < rq->curr->prio) {
1441 resched_curr(rq);
1442 return;
1443 }
1444
1445 #ifdef CONFIG_SMP
1446 /*
1447 * If:
1448 *
1449 * - the newly woken task is of equal priority to the current task
1450 * - the newly woken task is non-migratable while current is migratable
1451 * - current will be preempted on the next reschedule
1452 *
1453 * we should check to see if current can readily move to a different
1454 * cpu. If so, we will reschedule to allow the push logic to try
1455 * to move current somewhere else, making room for our non-migratable
1456 * task.
1457 */
1458 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1459 check_preempt_equal_prio(rq, p);
1460 #endif
1461 }
1462
1463 #ifdef CONFIG_SMP
1464 static void sched_rt_update_capacity_req(struct rq *rq)
1465 {
1466 u64 total, used, age_stamp, avg;
1467 s64 delta;
1468
1469 if (!sched_freq())
1470 return;
1471
1472 sched_avg_update(rq);
1473 /*
1474 * Since we're reading these variables without serialization make sure
1475 * we read them once before doing sanity checks on them.
1476 */
1477 age_stamp = READ_ONCE(rq->age_stamp);
1478 avg = READ_ONCE(rq->rt_avg);
1479 delta = rq_clock(rq) - age_stamp;
1480
1481 if (unlikely(delta < 0))
1482 delta = 0;
1483
1484 total = sched_avg_period() + delta;
1485
1486 used = div_u64(avg, total);
1487 if (unlikely(used > SCHED_CAPACITY_SCALE))
1488 used = SCHED_CAPACITY_SCALE;
1489
1490 set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
1491 }
1492 #else
1493 static inline void sched_rt_update_capacity_req(struct rq *rq)
1494 { }
1495
1496 #endif
1497
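/*
 * Return the first entity on the highest-priority non-empty list of the
 * given rt_rq; the caller guarantees there is at least one queued.
 */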
1498 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1499 struct rt_rq *rt_rq)
1500 {
1501 struct rt_prio_array *array = &rt_rq->active;
1502 struct sched_rt_entity *next = NULL;
1503 struct list_head *queue;
1504 int idx;
1505
1506 idx = sched_find_first_bit(array->bitmap);
1507 BUG_ON(idx >= MAX_RT_PRIO);
1508
1509 queue = array->queue + idx;
1510 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1511
1512 return next;
1513 }
1514
1515 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1516 {
1517 struct sched_rt_entity *rt_se;
1518 struct task_struct *p;
1519 struct rt_rq *rt_rq = &rq->rt;
1520
1521 do {
1522 rt_se = pick_next_rt_entity(rq, rt_rq);
1523 BUG_ON(!rt_se);
1524 rt_rq = group_rt_rq(rt_se);
1525 } while (rt_rq);
1526
1527 p = rt_task_of(rt_se);
1528 p->se.exec_start = rq_clock_task(rq);
1529
1530 return p;
1531 }
1532
1533 static struct task_struct *
1534 pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1535 {
1536 struct task_struct *p;
1537 struct rt_rq *rt_rq = &rq->rt;
1538
1539 if (need_pull_rt_task(rq, prev)) {
1540 pull_rt_task(rq);
1541 /*
1542 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1543 * means a dl or stop task can slip in, in which case we need
1544 * to re-start task selection.
1545 */
1546 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1547 rq->dl.dl_nr_running))
1548 return RETRY_TASK;
1549 }
1550
1551 /*
1552 * We may dequeue prev's rt_rq in put_prev_task().
1553 * So, we update time before rt_nr_running check.
1554 */
1555 if (prev->sched_class == &rt_sched_class)
1556 update_curr_rt(rq);
1557
1558 if (!rt_rq->rt_queued) {
1559 /*
1560 * The next task to be picked on this rq will have a lower
1561 * priority than rt tasks so we can spend some time to update
1562 * the capacity used by rt tasks based on the last activity.
1563 * This value will then be used as an estimation of the next
1564 * activity.
1565 */
1566 sched_rt_update_capacity_req(rq);
1567 return NULL;
1568 }
1569
1570 put_prev_task(rq, prev);
1571
1572 p = _pick_next_task_rt(rq);
1573
1574 /* The running task is never eligible for pushing */
1575 dequeue_pushable_task(rq, p);
1576
1577 set_post_schedule(rq);
1578
1579 return p;
1580 }
1581
1582 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1583 {
1584 update_curr_rt(rq);
1585
1586 /*
1587 * The previous task needs to be made eligible for pushing
1588 * if it is still active
1589 */
1590 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1591 enqueue_pushable_task(rq, p);
1592 }
1593
1594 #ifdef CONFIG_SMP
1595
1596 /* Only try algorithms three times */
1597 #define RT_MAX_TRIES 3
1598
1599 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1600 {
1601 if (!task_running(rq, p) &&
1602 cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1603 return 1;
1604 return 0;
1605 }
1606
1607 /*
1608 * Return the highest pushable rq's task, which is suitable to be executed
1609 * on the cpu, NULL otherwise
1610 */
1611 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1612 {
1613 struct plist_head *head = &rq->rt.pushable_tasks;
1614 struct task_struct *p;
1615
1616 if (!has_pushable_tasks(rq))
1617 return NULL;
1618
1619 plist_for_each_entry(p, head, pushable_tasks) {
1620 if (pick_rt_task(rq, p, cpu))
1621 return p;
1622 }
1623
1624 return NULL;
1625 }
1626
1627 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1628
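/*
 * Use cpupri to find CPUs whose current task has a lower priority than
 * @task, then prefer (1) the task's previous CPU for cache locality,
 * (2) a CPU in a wake-affine domain around that previous CPU, and
 * (3) any CPU in the mask.  Returns -1 if no suitable CPU exists.
 */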
1629 static int find_lowest_rq(struct task_struct *task)
1630 {
1631 struct sched_domain *sd;
1632 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1633 int this_cpu = smp_processor_id();
1634 int cpu = task_cpu(task);
1635
1636 /* Make sure the mask is initialized first */
1637 if (unlikely(!lowest_mask))
1638 return -1;
1639
1640 if (task->nr_cpus_allowed == 1)
1641 return -1; /* No other targets possible */
1642
1643 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1644 return -1; /* No targets found */
1645
1646 /*
1647 * At this point we have built a mask of cpus representing the
1648 * lowest priority tasks in the system. Now we want to elect
1649 * the best one based on our affinity and topology.
1650 *
1651 * We prioritize the last cpu that the task executed on since
1652 * it is most likely cache-hot in that location.
1653 */
1654 if (cpumask_test_cpu(cpu, lowest_mask))
1655 return cpu;
1656
1657 /*
1658 * Otherwise, we consult the sched_domains span maps to figure
1659 * out which cpu is logically closest to our hot cache data.
1660 */
1661 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1662 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1663
1664 rcu_read_lock();
1665 for_each_domain(cpu, sd) {
1666 if (sd->flags & SD_WAKE_AFFINE) {
1667 int best_cpu;
1668
1669 /*
1670 * "this_cpu" is cheaper to preempt than a
1671 * remote processor.
1672 */
1673 if (this_cpu != -1 &&
1674 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1675 rcu_read_unlock();
1676 return this_cpu;
1677 }
1678
1679 best_cpu = cpumask_first_and(lowest_mask,
1680 sched_domain_span(sd));
1681 if (best_cpu < nr_cpu_ids) {
1682 rcu_read_unlock();
1683 return best_cpu;
1684 }
1685 }
1686 }
1687 rcu_read_unlock();
1688
1689 /*
1690 * And finally, if there were no matches within the domains
1691 * just give the caller *something* to work with from the compatible
1692 * locations.
1693 */
1694 if (this_cpu != -1)
1695 return this_cpu;
1696
1697 cpu = cpumask_any(lowest_mask);
1698 if (cpu < nr_cpu_ids)
1699 return cpu;
1700 return -1;
1701 }
1702
1703 /* Will lock the rq it finds */
1704 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1705 {
1706 struct rq *lowest_rq = NULL;
1707 int tries;
1708 int cpu;
1709
1710 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1711 cpu = find_lowest_rq(task);
1712
1713 if ((cpu == -1) || (cpu == rq->cpu))
1714 break;
1715
1716 lowest_rq = cpu_rq(cpu);
1717
1718 /* if the prio of this runqueue changed, try again */
1719 if (double_lock_balance(rq, lowest_rq)) {
1720 /*
1721 * We had to unlock the run queue. In
1722 * the meantime, the task could have
1723 * migrated already or had its affinity changed.
1724 * Also make sure that it wasn't scheduled on its rq.
1725 */
1726 if (unlikely(task_rq(task) != rq ||
1727 !cpumask_test_cpu(lowest_rq->cpu,
1728 tsk_cpus_allowed(task)) ||
1729 task_running(rq, task) ||
1730 !task_on_rq_queued(task))) {
1731
1732 double_unlock_balance(rq, lowest_rq);
1733 lowest_rq = NULL;
1734 break;
1735 }
1736 }
1737
1738 /* If this rq is still suitable use it. */
1739 if (lowest_rq->rt.highest_prio.curr > task->prio)
1740 break;
1741
1742 /* try again */
1743 double_unlock_balance(rq, lowest_rq);
1744 lowest_rq = NULL;
1745 }
1746
1747 return lowest_rq;
1748 }
1749
1750 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1751 {
1752 struct task_struct *p;
1753
1754 if (!has_pushable_tasks(rq))
1755 return NULL;
1756
1757 p = plist_first_entry(&rq->rt.pushable_tasks,
1758 struct task_struct, pushable_tasks);
1759
1760 BUG_ON(rq->cpu != task_cpu(p));
1761 BUG_ON(task_current(rq, p));
1762 BUG_ON(p->nr_cpus_allowed <= 1);
1763
1764 BUG_ON(!task_on_rq_queued(p));
1765 BUG_ON(!rt_task(p));
1766
1767 return p;
1768 }
1769
1770 /*
1771 * If the current CPU has more than one RT task, see if the non
1772 * running task can migrate over to a CPU that is running a task
1773 * of lesser priority.
1774 */
1775 static int push_rt_task(struct rq *rq)
1776 {
1777 struct task_struct *next_task;
1778 struct rq *lowest_rq;
1779 int ret = 0;
1780
1781 if (!rq->rt.overloaded)
1782 return 0;
1783
1784 next_task = pick_next_pushable_task(rq);
1785 if (!next_task)
1786 return 0;
1787
1788 retry:
1789 if (unlikely(next_task == rq->curr)) {
1790 WARN_ON(1);
1791 return 0;
1792 }
1793
1794 /*
1795 * It's possible that the next_task slipped in of
1796 * higher priority than current. If that's the case
1797 * just reschedule current.
1798 */
1799 if (unlikely(next_task->prio < rq->curr->prio)) {
1800 resched_curr(rq);
1801 return 0;
1802 }
1803
1804 /* We might release rq lock */
1805 get_task_struct(next_task);
1806
1807 /* find_lock_lowest_rq locks the rq if found */
1808 lowest_rq = find_lock_lowest_rq(next_task, rq);
1809 if (!lowest_rq) {
1810 struct task_struct *task;
1811 /*
1812 * find_lock_lowest_rq releases rq->lock
1813 * so it is possible that next_task has migrated.
1814 *
1815 * We need to make sure that the task is still on the same
1816 * run-queue and is also still the next task eligible for
1817 * pushing.
1818 */
1819 task = pick_next_pushable_task(rq);
1820 if (task_cpu(next_task) == rq->cpu && task == next_task) {
1821 /*
1822 * The task hasn't migrated, and is still the next
1823 * eligible task, but we failed to find a run-queue
1824 * to push it to. Do not retry in this case, since
1825 * other cpus will pull from us when ready.
1826 */
1827 goto out;
1828 }
1829
1830 if (!task)
1831 /* No more tasks, just exit */
1832 goto out;
1833
1834 /*
1835 * Something has shifted, try again.
1836 */
1837 put_task_struct(next_task);
1838 next_task = task;
1839 goto retry;
1840 }
1841
1842 deactivate_task(rq, next_task, 0);
1843 set_task_cpu(next_task, lowest_rq->cpu);
1844 activate_task(lowest_rq, next_task, 0);
1845 ret = 1;
1846
1847 resched_curr(lowest_rq);
1848
1849 double_unlock_balance(rq, lowest_rq);
1850
1851 out:
1852 put_task_struct(next_task);
1853
1854 return ret;
1855 }
1856
1857 static void push_rt_tasks(struct rq *rq)
1858 {
1859 /* push_rt_task will return true if it moved an RT */
1860 while (push_rt_task(rq))
1861 ;
1862 }
1863
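/*
 * Scan the CPUs marked RT-overloaded in our root domain and pull over any
 * highest-priority pushable task that would preempt what this rq is about
 * to run.  Returns nonzero if at least one task was pulled.
 */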
1864 static int pull_rt_task(struct rq *this_rq)
1865 {
1866 int this_cpu = this_rq->cpu, ret = 0, cpu;
1867 struct task_struct *p;
1868 struct rq *src_rq;
1869
1870 if (likely(!rt_overloaded(this_rq)))
1871 return 0;
1872
1873 /*
1874 * Match the barrier from rt_set_overload(); this guarantees that if we
1875 * see overloaded we must also see the rto_mask bit.
1876 */
1877 smp_rmb();
1878
1879 for_each_cpu(cpu, this_rq->rd->rto_mask) {
1880 if (this_cpu == cpu)
1881 continue;
1882
1883 src_rq = cpu_rq(cpu);
1884
1885 /*
1886 * Don't bother taking the src_rq->lock if the next highest
1887 * task is known to be lower-priority than our current task.
1888 * This may look racy, but if this value is about to go
1889 * logically higher, the src_rq will push this task away.
1890 * And if it's going logically lower, we do not care.
1891 */
1892 if (src_rq->rt.highest_prio.next >=
1893 this_rq->rt.highest_prio.curr)
1894 continue;
1895
1896 /*
1897 * We can potentially drop this_rq's lock in
1898 * double_lock_balance, and another CPU could
1899 * alter this_rq
1900 */
1901 double_lock_balance(this_rq, src_rq);
1902
1903 /*
1904 * We can only pull a task that is pushable
1905 * on its rq, and no others.
1906 */
1907 p = pick_highest_pushable_task(src_rq, this_cpu);
1908
1909 /*
1910 * Do we have an RT task that preempts
1911 * the to-be-scheduled task?
1912 */
1913 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1914 WARN_ON(p == src_rq->curr);
1915 WARN_ON(!task_on_rq_queued(p));
1916
1917 /*
1918 * There's a chance that p is higher in priority
1919 * than what's currently running on its cpu.
1920 * This is just that p is waking up and hasn't
1921 * had a chance to schedule. We only pull
1922 * p if it is lower in priority than the
1923 * current task on the run queue
1924 */
1925 if (p->prio < src_rq->curr->prio)
1926 goto skip;
1927
1928 ret = 1;
1929
1930 deactivate_task(src_rq, p, 0);
1931 set_task_cpu(p, this_cpu);
1932 activate_task(this_rq, p, 0);
1933 /*
1934 * We continue with the search, just in
1935 * case there's an even higher prio task
1936 * in another runqueue. (low likelihood
1937 * but possible)
1938 */
1939 }
1940 skip:
1941 double_unlock_balance(this_rq, src_rq);
1942 }
1943
1944 return ret;
1945 }
1946
1947 static void post_schedule_rt(struct rq *rq)
1948 {
1949 push_rt_tasks(rq);
1950 }
1951
1952 /*
1953 * If we are not running and we are not going to reschedule soon, we should
1954 * try to push tasks away now
1955 */
1956 static void task_woken_rt(struct rq *rq, struct task_struct *p)
1957 {
1958 if (!task_running(rq, p) &&
1959 !test_tsk_need_resched(rq->curr) &&
1960 has_pushable_tasks(rq) &&
1961 p->nr_cpus_allowed > 1 &&
1962 (dl_task(rq->curr) || rt_task(rq->curr)) &&
1963 (rq->curr->nr_cpus_allowed < 2 ||
1964 rq->curr->prio <= p->prio))
1965 push_rt_tasks(rq);
1966 }
1967
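/*
 * The affinity mask of a queued RT task changed: keep rt_nr_migratory and
 * the pushable-tasks list in sync with whether the task can still migrate.
 */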
1968 static void set_cpus_allowed_rt(struct task_struct *p,
1969 const struct cpumask *new_mask)
1970 {
1971 struct rq *rq;
1972 int weight;
1973
1974 BUG_ON(!rt_task(p));
1975
1976 if (!task_on_rq_queued(p))
1977 return;
1978
1979 weight = cpumask_weight(new_mask);
1980
1981 /*
1982 	 * Only update if the process changes whether it can
1983 	 * migrate or not.
1984 */
1985 if ((p->nr_cpus_allowed > 1) == (weight > 1))
1986 return;
1987
1988 rq = task_rq(p);
1989
1990 /*
1991 * The process used to be able to migrate OR it can now migrate
1992 */
1993 if (weight <= 1) {
1994 if (!task_current(rq, p))
1995 dequeue_pushable_task(rq, p);
1996 BUG_ON(!rq->rt.rt_nr_migratory);
1997 rq->rt.rt_nr_migratory--;
1998 } else {
1999 if (!task_current(rq, p))
2000 enqueue_pushable_task(rq, p);
2001 rq->rt.rt_nr_migratory++;
2002 }
2003
2004 update_rt_migration(&rq->rt);
2005 }
2006
2007 /* Assumes rq->lock is held */
2008 static void rq_online_rt(struct rq *rq)
2009 {
2010 if (rq->rt.overloaded)
2011 rt_set_overload(rq);
2012
2013 __enable_runtime(rq);
2014
2015 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2016 }
2017
2018 /* Assumes rq->lock is held */
2019 static void rq_offline_rt(struct rq *rq)
2020 {
2021 if (rq->rt.overloaded)
2022 rt_clear_overload(rq);
2023
2024 __disable_runtime(rq);
2025
2026 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2027 }
2028
2029 /*
2030  * When switching from the rt queue, we bring ourselves to a position
2031  * where we might want to pull RT tasks from other runqueues.
2032 */
2033 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2034 {
2035 /*
2036 * If there are other RT tasks then we will reschedule
2037 * and the scheduling of the other RT tasks will handle
2038 * the balancing. But if we are the last RT task
2039 * we may need to handle the pulling of RT tasks
2040 * now.
2041 */
2042 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2043 return;
2044
2045 if (pull_rt_task(rq))
2046 resched_curr(rq);
2047 }
2048
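/*
 * Boot-time setup: allocate the per-CPU scratch cpumask (local_cpu_mask)
 * used by the push/pull logic when searching for a suitable target CPU.
 */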
2049 void __init init_sched_rt_class(void)
2050 {
2051 unsigned int i;
2052
2053 for_each_possible_cpu(i) {
2054 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2055 GFP_KERNEL, cpu_to_node(i));
2056 }
2057 }
2058 #endif /* CONFIG_SMP */
2059
2060 /*
2061 * When switching a task to RT, we may overload the runqueue
2062 * with RT tasks. In this case we try to push them off to
2063 * other runqueues.
2064 */
2065 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2066 {
2067 int check_resched = 1;
2068
2069 /*
2070 * If we are already running, then there's nothing
2071 	 * that needs to be done. But if we are not running,
2072 	 * we may need to preempt the currently running task.
2073 	 * If that currently running task is also an RT task
2074 	 * then see if we can move to another run queue.
2075 */
2076 if (task_on_rq_queued(p) && rq->curr != p) {
2077 #ifdef CONFIG_SMP
2078 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
2079 /* Don't resched if we changed runqueues */
2080 push_rt_task(rq) && rq != task_rq(p))
2081 check_resched = 0;
2082 #endif /* CONFIG_SMP */
2083 if (check_resched && p->prio < rq->curr->prio)
2084 resched_curr(rq);
2085 }
2086 }
2087
2088 /*
2089 * Priority of the task has changed. This may cause
2090 * us to initiate a push or pull.
2091 */
2092 static void
2093 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2094 {
2095 if (!task_on_rq_queued(p))
2096 return;
2097
2098 if (rq->curr == p) {
2099 #ifdef CONFIG_SMP
2100 /*
2101 * If our priority decreases while running, we
2102 * may need to pull tasks to this runqueue.
2103 */
2104 if (oldprio < p->prio)
2105 pull_rt_task(rq);
2106 /*
2107 * If there's a higher priority task waiting to run
2108 * then reschedule. Note, the above pull_rt_task
2109 * can release the rq lock and p could migrate.
2110 * Only reschedule if p is still on the same runqueue.
2111 */
2112 if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
2113 resched_curr(rq);
2114 #else
2115 /* For UP simply resched on drop of prio */
2116 if (oldprio < p->prio)
2117 resched_curr(rq);
2118 #endif /* CONFIG_SMP */
2119 } else {
2120 /*
2121 		 * This task is not running, but if its
2122 		 * priority is higher than that of the currently
2123 		 * running task then reschedule.
2124 */
2125 if (p->prio < rq->curr->prio)
2126 resched_curr(rq);
2127 }
2128 }
2129
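/*
 * RLIMIT_RTTIME watchdog: account at most one tick of RT runtime per jiffy
 * in p->rt.timeout and, once the soft limit is exceeded, arm a CPU-time
 * expiry so the POSIX CPU timer code can act on the overrun.
 */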
2130 static void watchdog(struct rq *rq, struct task_struct *p)
2131 {
2132 unsigned long soft, hard;
2133
2134 	/* The rlimit values may change after they are read; any skew is fixed up on the next tick */
2135 soft = task_rlimit(p, RLIMIT_RTTIME);
2136 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2137
2138 if (soft != RLIM_INFINITY) {
2139 unsigned long next;
2140
2141 if (p->rt.watchdog_stamp != jiffies) {
2142 p->rt.timeout++;
2143 p->rt.watchdog_stamp = jiffies;
2144 }
2145
2146 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2147 if (p->rt.timeout > next)
2148 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2149 }
2150 }
2151
2152 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2153 {
2154 struct sched_rt_entity *rt_se = &p->rt;
2155
2156 update_curr_rt(rq);
2157
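	/*
	 * While RT tasks are runnable, refresh the CPU capacity request
	 * (sched-freq/EAS hook carried in this tree).
	 */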
2158 if (rq->rt.rt_nr_running)
2159 sched_rt_update_capacity_req(rq);
2160
2161 watchdog(rq, p);
2162
2163 /*
2164 * RR tasks need a special form of timeslice management.
2165 * FIFO tasks have no timeslices.
2166 */
2167 if (p->policy != SCHED_RR)
2168 return;
2169
2170 if (--p->rt.time_slice)
2171 return;
2172
2173 p->rt.time_slice = sched_rr_timeslice;
2174
2175 /*
2176 	 * Requeue to the end of the queue if we (and all of our ancestors) are not
2177 * the only element on the queue
2178 */
2179 for_each_sched_rt_entity(rt_se) {
2180 if (rt_se->run_list.prev != rt_se->run_list.next) {
2181 requeue_task_rt(rq, p, 0);
2182 resched_curr(rq);
2183 return;
2184 }
2185 }
2186 }
2187
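/*
 * p just became rq->curr without going through pick_next_task_rt() (e.g.
 * after a policy change): restart its exec clock and take it off the
 * pushable list, since the running task is never a push candidate.
 */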
2188 static void set_curr_task_rt(struct rq *rq)
2189 {
2190 struct task_struct *p = rq->curr;
2191
2192 p->se.exec_start = rq_clock_task(rq);
2193
2194 /* The running task is never eligible for pushing */
2195 dequeue_pushable_task(rq, p);
2196 }
2197
2198 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2199 {
2200 /*
2201 * Time slice is 0 for SCHED_FIFO tasks
2202 */
2203 if (task->policy == SCHED_RR)
2204 return sched_rr_timeslice;
2205 else
2206 return 0;
2207 }
2208
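/*
 * RT scheduling class callbacks; ->next links to the next-lower class
 * (fair) in the pick_next_task() iteration order.
 */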
2209 const struct sched_class rt_sched_class = {
2210 .next = &fair_sched_class,
2211 .enqueue_task = enqueue_task_rt,
2212 .dequeue_task = dequeue_task_rt,
2213 .yield_task = yield_task_rt,
2214
2215 .check_preempt_curr = check_preempt_curr_rt,
2216
2217 .pick_next_task = pick_next_task_rt,
2218 .put_prev_task = put_prev_task_rt,
2219
2220 #ifdef CONFIG_SMP
2221 .select_task_rq = select_task_rq_rt,
2222
2223 .set_cpus_allowed = set_cpus_allowed_rt,
2224 .rq_online = rq_online_rt,
2225 .rq_offline = rq_offline_rt,
2226 .post_schedule = post_schedule_rt,
2227 .task_woken = task_woken_rt,
2228 .switched_from = switched_from_rt,
2229 #endif
2230
2231 .set_curr_task = set_curr_task_rt,
2232 .task_tick = task_tick_rt,
2233
2234 .get_rr_interval = get_rr_interval_rt,
2235
2236 .prio_changed = prio_changed_rt,
2237 .switched_to = switched_to_rt,
2238
2239 .update_curr = update_curr_rt,
2240 };
2241
2242 #ifdef CONFIG_SCHED_DEBUG
2243 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2244
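/* Dump every rt_rq reachable from @cpu's runqueue for the scheduler debug output. */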
2245 void print_rt_stats(struct seq_file *m, int cpu)
2246 {
2247 rt_rq_iter_t iter;
2248 struct rt_rq *rt_rq;
2249
2250 rcu_read_lock();
2251 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2252 print_rt_rq(m, cpu, rt_rq);
2253 rcu_read_unlock();
2254 }
2255 #endif /* CONFIG_SCHED_DEBUG */
2256