1 /*
2  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
3  * policies)
4  */
5 
6 #include "sched.h"
7 
8 #include <linux/slab.h>
9 
10 #include "walt.h"
11 
12 int sched_rr_timeslice = RR_TIMESLICE;
13 
14 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
15 
16 struct rt_bandwidth def_rt_bandwidth;
17 
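/*
 * Period timer for RT bandwidth control: each time it fires,
 * do_sched_rt_period_timer() replenishes runtime for the rt_rqs covered
 * by this rt_bandwidth and unthrottles them where possible. The timer
 * keeps restarting until a whole period passes with nothing left to do.
 */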
18 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
19 {
20 	struct rt_bandwidth *rt_b =
21 		container_of(timer, struct rt_bandwidth, rt_period_timer);
22 	ktime_t now;
23 	int overrun;
24 	int idle = 0;
25 
26 	for (;;) {
27 		now = hrtimer_cb_get_time(timer);
28 		overrun = hrtimer_forward(timer, now, rt_b->rt_period);
29 
30 		if (!overrun)
31 			break;
32 
33 		idle = do_sched_rt_period_timer(rt_b, overrun);
34 	}
35 
36 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
37 }
38 
39 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
40 {
41 	rt_b->rt_period = ns_to_ktime(period);
42 	rt_b->rt_runtime = runtime;
43 
44 	raw_spin_lock_init(&rt_b->rt_runtime_lock);
45 
46 	hrtimer_init(&rt_b->rt_period_timer,
47 			CLOCK_MONOTONIC, HRTIMER_MODE_REL);
48 	rt_b->rt_period_timer.function = sched_rt_period_timer;
49 }
50 
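/*
 * Arm the period timer lazily: only when RT bandwidth enforcement is
 * enabled, the runtime is finite, and the timer is not already active.
 */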
51 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
52 {
53 	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
54 		return;
55 
56 	if (hrtimer_active(&rt_b->rt_period_timer))
57 		return;
58 
59 	raw_spin_lock(&rt_b->rt_runtime_lock);
60 	start_bandwidth_timer(&rt_b->rt_period_timer, rt_b->rt_period);
61 	raw_spin_unlock(&rt_b->rt_runtime_lock);
62 }
63 
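/*
 * Initialise a per-CPU RT runqueue: empty priority array (with the
 * MAX_RT_PRIO delimiter bit set for the bitmap search), SMP push/pull
 * state and the bandwidth accounting fields.
 */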
64 void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
65 {
66 	struct rt_prio_array *array;
67 	int i;
68 
69 	array = &rt_rq->active;
70 	for (i = 0; i < MAX_RT_PRIO; i++) {
71 		INIT_LIST_HEAD(array->queue + i);
72 		__clear_bit(i, array->bitmap);
73 	}
74 	/* delimiter for bitsearch: */
75 	__set_bit(MAX_RT_PRIO, array->bitmap);
76 
77 #if defined CONFIG_SMP
78 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
79 	rt_rq->highest_prio.next = MAX_RT_PRIO;
80 	rt_rq->rt_nr_migratory = 0;
81 	rt_rq->overloaded = 0;
82 	plist_head_init(&rt_rq->pushable_tasks);
83 #endif
84 	/* We start in dequeued state, because no RT tasks are queued */
85 	rt_rq->rt_queued = 0;
86 
87 	rt_rq->rt_time = 0;
88 	rt_rq->rt_throttled = 0;
89 	rt_rq->rt_runtime = 0;
90 	raw_spin_lock_init(&rt_rq->rt_runtime_lock);
91 }
92 
93 #ifdef CONFIG_RT_GROUP_SCHED
94 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
95 {
96 	hrtimer_cancel(&rt_b->rt_period_timer);
97 }
98 
99 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
100 
101 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
102 {
103 #ifdef CONFIG_SCHED_DEBUG
104 	WARN_ON_ONCE(!rt_entity_is_task(rt_se));
105 #endif
106 	return container_of(rt_se, struct task_struct, rt);
107 }
108 
109 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
110 {
111 	return rt_rq->rq;
112 }
113 
114 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
115 {
116 	return rt_se->rt_rq;
117 }
118 
119 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
120 {
121 	struct rt_rq *rt_rq = rt_se->rt_rq;
122 
123 	return rt_rq->rq;
124 }
125 
126 void free_rt_sched_group(struct task_group *tg)
127 {
128 	int i;
129 
130 	if (tg->rt_se)
131 		destroy_rt_bandwidth(&tg->rt_bandwidth);
132 
133 	for_each_possible_cpu(i) {
134 		if (tg->rt_rq)
135 			kfree(tg->rt_rq[i]);
136 		if (tg->rt_se)
137 			kfree(tg->rt_se[i]);
138 	}
139 
140 	kfree(tg->rt_rq);
141 	kfree(tg->rt_se);
142 }
143 
144 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
145 		struct sched_rt_entity *rt_se, int cpu,
146 		struct sched_rt_entity *parent)
147 {
148 	struct rq *rq = cpu_rq(cpu);
149 
150 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
151 	rt_rq->rt_nr_boosted = 0;
152 	rt_rq->rq = rq;
153 	rt_rq->tg = tg;
154 
155 	tg->rt_rq[cpu] = rt_rq;
156 	tg->rt_se[cpu] = rt_se;
157 
158 	if (!rt_se)
159 		return;
160 
161 	if (!parent)
162 		rt_se->rt_rq = &rq->rt;
163 	else
164 		rt_se->rt_rq = parent->my_q;
165 
166 	rt_se->my_q = rt_rq;
167 	rt_se->parent = parent;
168 	INIT_LIST_HEAD(&rt_se->run_list);
169 }
170 
171 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
172 {
173 	struct rt_rq *rt_rq;
174 	struct sched_rt_entity *rt_se;
175 	int i;
176 
177 	tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
178 	if (!tg->rt_rq)
179 		goto err;
180 	tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
181 	if (!tg->rt_se)
182 		goto err;
183 
184 	init_rt_bandwidth(&tg->rt_bandwidth,
185 			ktime_to_ns(def_rt_bandwidth.rt_period), 0);
186 
187 	for_each_possible_cpu(i) {
188 		rt_rq = kzalloc_node(sizeof(struct rt_rq),
189 				     GFP_KERNEL, cpu_to_node(i));
190 		if (!rt_rq)
191 			goto err;
192 
193 		rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
194 				     GFP_KERNEL, cpu_to_node(i));
195 		if (!rt_se)
196 			goto err_free_rq;
197 
198 		init_rt_rq(rt_rq, cpu_rq(i));
199 		rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
200 		init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
201 	}
202 
203 	return 1;
204 
205 err_free_rq:
206 	kfree(rt_rq);
207 err:
208 	return 0;
209 }
210 
211 #else /* CONFIG_RT_GROUP_SCHED */
212 
213 #define rt_entity_is_task(rt_se) (1)
214 
215 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
216 {
217 	return container_of(rt_se, struct task_struct, rt);
218 }
219 
220 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
221 {
222 	return container_of(rt_rq, struct rq, rt);
223 }
224 
225 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
226 {
227 	struct task_struct *p = rt_task_of(rt_se);
228 
229 	return task_rq(p);
230 }
231 
232 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
233 {
234 	struct rq *rq = rq_of_rt_se(rt_se);
235 
236 	return &rq->rt;
237 }
238 
239 void free_rt_sched_group(struct task_group *tg) { }
240 
241 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
242 {
243 	return 1;
244 }
245 #endif /* CONFIG_RT_GROUP_SCHED */
246 
247 #ifdef CONFIG_SMP
248 
249 static int pull_rt_task(struct rq *this_rq);
250 
251 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
252 {
253 	/* Try to pull RT tasks here if we lower this rq's prio */
254 	return rq->rt.highest_prio.curr > prev->prio;
255 }
256 
257 static inline int rt_overloaded(struct rq *rq)
258 {
259 	return atomic_read(&rq->rd->rto_count);
260 }
261 
262 static inline void rt_set_overload(struct rq *rq)
263 {
264 	if (!rq->online)
265 		return;
266 
267 	cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
268 	/*
269 	 * Make sure the mask is visible before we set
270 	 * the overload count. That is checked to determine
271 	 * if we should look at the mask. It would be a shame
272 	 * if we looked at the mask, but the mask was not
273 	 * updated yet.
274 	 *
275 	 * Matched by the barrier in pull_rt_task().
276 	 */
277 	smp_wmb();
278 	atomic_inc(&rq->rd->rto_count);
279 }
280 
281 static inline void rt_clear_overload(struct rq *rq)
282 {
283 	if (!rq->online)
284 		return;
285 
286 	/* the order here really doesn't matter */
287 	atomic_dec(&rq->rd->rto_count);
288 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
289 }
290 
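/*
 * An rq counts as "RT overloaded" when it has more than one RT task
 * queued and at least one of them may migrate; the flag and rto_mask in
 * the root domain let other CPUs know there is work they could pull.
 */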
291 static void update_rt_migration(struct rt_rq *rt_rq)
292 {
293 	if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
294 		if (!rt_rq->overloaded) {
295 			rt_set_overload(rq_of_rt_rq(rt_rq));
296 			rt_rq->overloaded = 1;
297 		}
298 	} else if (rt_rq->overloaded) {
299 		rt_clear_overload(rq_of_rt_rq(rt_rq));
300 		rt_rq->overloaded = 0;
301 	}
302 }
303 
304 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
305 {
306 	struct task_struct *p;
307 
308 	if (!rt_entity_is_task(rt_se))
309 		return;
310 
311 	p = rt_task_of(rt_se);
312 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
313 
314 	rt_rq->rt_nr_total++;
315 	if (p->nr_cpus_allowed > 1)
316 		rt_rq->rt_nr_migratory++;
317 
318 	update_rt_migration(rt_rq);
319 }
320 
321 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
322 {
323 	struct task_struct *p;
324 
325 	if (!rt_entity_is_task(rt_se))
326 		return;
327 
328 	p = rt_task_of(rt_se);
329 	rt_rq = &rq_of_rt_rq(rt_rq)->rt;
330 
331 	rt_rq->rt_nr_total--;
332 	if (p->nr_cpus_allowed > 1)
333 		rt_rq->rt_nr_migratory--;
334 
335 	update_rt_migration(rt_rq);
336 }
337 
338 static inline int has_pushable_tasks(struct rq *rq)
339 {
340 	return !plist_head_empty(&rq->rt.pushable_tasks);
341 }
342 
343 static inline void set_post_schedule(struct rq *rq)
344 {
345 	/*
346 	 * We detect this state here so that we can avoid taking the RQ
347 	 * lock again later if there is no need to push
348 	 */
349 	rq->post_schedule = has_pushable_tasks(rq);
350 }
351 
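/*
 * Pushable tasks are kept on a per-rq plist ordered by priority, so the
 * push/pull logic can find the highest-priority migratable task quickly.
 */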
352 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
353 {
354 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
355 	plist_node_init(&p->pushable_tasks, p->prio);
356 	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
357 
358 	/* Update the highest prio pushable task */
359 	if (p->prio < rq->rt.highest_prio.next)
360 		rq->rt.highest_prio.next = p->prio;
361 }
362 
363 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
364 {
365 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
366 
367 	/* Update the new highest prio pushable task */
368 	if (has_pushable_tasks(rq)) {
369 		p = plist_first_entry(&rq->rt.pushable_tasks,
370 				      struct task_struct, pushable_tasks);
371 		rq->rt.highest_prio.next = p->prio;
372 	} else
373 		rq->rt.highest_prio.next = MAX_RT_PRIO;
374 }
375 
376 #else
377 
378 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
379 {
380 }
381 
382 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
383 {
384 }
385 
386 static inline
387 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
388 {
389 }
390 
391 static inline
392 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
393 {
394 }
395 
396 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
397 {
398 	return false;
399 }
400 
401 static inline int pull_rt_task(struct rq *this_rq)
402 {
403 	return 0;
404 }
405 
406 static inline void set_post_schedule(struct rq *rq)
407 {
408 }
409 #endif /* CONFIG_SMP */
410 
411 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
412 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
413 
414 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
415 {
416 	return !list_empty(&rt_se->run_list);
417 }
418 
419 #ifdef CONFIG_RT_GROUP_SCHED
420 
421 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
422 {
423 	if (!rt_rq->tg)
424 		return RUNTIME_INF;
425 
426 	return rt_rq->rt_runtime;
427 }
428 
429 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
430 {
431 	return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
432 }
433 
434 typedef struct task_group *rt_rq_iter_t;
435 
436 static inline struct task_group *next_task_group(struct task_group *tg)
437 {
438 	do {
439 		tg = list_entry_rcu(tg->list.next,
440 			typeof(struct task_group), list);
441 	} while (&tg->list != &task_groups && task_group_is_autogroup(tg));
442 
443 	if (&tg->list == &task_groups)
444 		tg = NULL;
445 
446 	return tg;
447 }
448 
449 #define for_each_rt_rq(rt_rq, iter, rq)					\
450 	for (iter = container_of(&task_groups, typeof(*iter), list);	\
451 		(iter = next_task_group(iter)) &&			\
452 		(rt_rq = iter->rt_rq[cpu_of(rq)]);)
453 
454 #define for_each_sched_rt_entity(rt_se) \
455 	for (; rt_se; rt_se = rt_se->parent)
456 
457 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
458 {
459 	return rt_se->my_q;
460 }
461 
462 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
463 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
464 
465 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
466 {
467 	struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
468 	struct rq *rq = rq_of_rt_rq(rt_rq);
469 	struct sched_rt_entity *rt_se;
470 
471 	int cpu = cpu_of(rq);
472 
473 	rt_se = rt_rq->tg->rt_se[cpu];
474 
475 	if (rt_rq->rt_nr_running) {
476 		if (!rt_se)
477 			enqueue_top_rt_rq(rt_rq);
478 		else if (!on_rt_rq(rt_se))
479 			enqueue_rt_entity(rt_se, false);
480 
481 		if (rt_rq->highest_prio.curr < curr->prio)
482 			resched_curr(rq);
483 	}
484 }
485 
486 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
487 {
488 	struct sched_rt_entity *rt_se;
489 	int cpu = cpu_of(rq_of_rt_rq(rt_rq));
490 
491 	rt_se = rt_rq->tg->rt_se[cpu];
492 
493 	if (!rt_se)
494 		dequeue_top_rt_rq(rt_rq);
495 	else if (on_rt_rq(rt_se))
496 		dequeue_rt_entity(rt_se);
497 }
498 
499 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
500 {
501 	return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
502 }
503 
504 static int rt_se_boosted(struct sched_rt_entity *rt_se)
505 {
506 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
507 	struct task_struct *p;
508 
509 	if (rt_rq)
510 		return !!rt_rq->rt_nr_boosted;
511 
512 	p = rt_task_of(rt_se);
513 	return p->prio != p->normal_prio;
514 }
515 
516 #ifdef CONFIG_SMP
517 static inline const struct cpumask *sched_rt_period_mask(void)
518 {
519 	return this_rq()->rd->span;
520 }
521 #else
522 static inline const struct cpumask *sched_rt_period_mask(void)
523 {
524 	return cpu_online_mask;
525 }
526 #endif
527 
528 static inline
529 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
530 {
531 	return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
532 }
533 
534 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
535 {
536 	return &rt_rq->tg->rt_bandwidth;
537 }
538 
539 #else /* !CONFIG_RT_GROUP_SCHED */
540 
541 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
542 {
543 	return rt_rq->rt_runtime;
544 }
545 
546 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
547 {
548 	return ktime_to_ns(def_rt_bandwidth.rt_period);
549 }
550 
551 typedef struct rt_rq *rt_rq_iter_t;
552 
553 #define for_each_rt_rq(rt_rq, iter, rq) \
554 	for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
555 
556 #define for_each_sched_rt_entity(rt_se) \
557 	for (; rt_se; rt_se = NULL)
558 
559 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
560 {
561 	return NULL;
562 }
563 
564 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
565 {
566 	struct rq *rq = rq_of_rt_rq(rt_rq);
567 
568 	if (!rt_rq->rt_nr_running)
569 		return;
570 
571 	enqueue_top_rt_rq(rt_rq);
572 	resched_curr(rq);
573 }
574 
575 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
576 {
577 	dequeue_top_rt_rq(rt_rq);
578 }
579 
580 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
581 {
582 	return rt_rq->rt_throttled;
583 }
584 
585 static inline const struct cpumask *sched_rt_period_mask(void)
586 {
587 	return cpu_online_mask;
588 }
589 
590 static inline
591 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
592 {
593 	return &cpu_rq(cpu)->rt;
594 }
595 
596 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
597 {
598 	return &def_rt_bandwidth;
599 }
600 
601 #endif /* CONFIG_RT_GROUP_SCHED */
602 
603 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
604 {
605 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
606 
607 	return (hrtimer_active(&rt_b->rt_period_timer) ||
608 		rt_rq->rt_time < rt_b->rt_runtime);
609 }
610 
611 #ifdef CONFIG_SMP
612 /*
613  * We ran out of runtime, see if we can borrow some from our neighbours.
614  */
615 static int do_balance_runtime(struct rt_rq *rt_rq)
616 {
617 	struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
618 	struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
619 	int i, weight, more = 0;
620 	u64 rt_period;
621 
622 	weight = cpumask_weight(rd->span);
623 
624 	raw_spin_lock(&rt_b->rt_runtime_lock);
625 	rt_period = ktime_to_ns(rt_b->rt_period);
626 	for_each_cpu(i, rd->span) {
627 		struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
628 		s64 diff;
629 
630 		if (iter == rt_rq)
631 			continue;
632 
633 		raw_spin_lock(&iter->rt_runtime_lock);
634 		/*
635 		 * Either all rqs have inf runtime and there's nothing to steal
636 		 * or __disable_runtime() below sets a specific rq to inf to
637 		 * indicate it's been disabled and disallow stealing.
638 		 */
639 		if (iter->rt_runtime == RUNTIME_INF)
640 			goto next;
641 
642 		/*
643 		 * From runqueues with spare time, take 1/n part of their
644 		 * spare time, but no more than our period.
645 		 */
646 		diff = iter->rt_runtime - iter->rt_time;
647 		if (diff > 0) {
648 			diff = div_u64((u64)diff, weight);
649 			if (rt_rq->rt_runtime + diff > rt_period)
650 				diff = rt_period - rt_rq->rt_runtime;
651 			iter->rt_runtime -= diff;
652 			rt_rq->rt_runtime += diff;
653 			more = 1;
654 			if (rt_rq->rt_runtime == rt_period) {
655 				raw_spin_unlock(&iter->rt_runtime_lock);
656 				break;
657 			}
658 		}
659 next:
660 		raw_spin_unlock(&iter->rt_runtime_lock);
661 	}
662 	raw_spin_unlock(&rt_b->rt_runtime_lock);
663 
664 	return more;
665 }
666 
667 /*
668  * Ensure this RQ takes back all the runtime it lent to its neighbours.
669  */
670 static void __disable_runtime(struct rq *rq)
671 {
672 	struct root_domain *rd = rq->rd;
673 	rt_rq_iter_t iter;
674 	struct rt_rq *rt_rq;
675 
676 	if (unlikely(!scheduler_running))
677 		return;
678 
679 	for_each_rt_rq(rt_rq, iter, rq) {
680 		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
681 		s64 want;
682 		int i;
683 
684 		raw_spin_lock(&rt_b->rt_runtime_lock);
685 		raw_spin_lock(&rt_rq->rt_runtime_lock);
686 		/*
687 		 * Either we're all inf and nobody needs to borrow, or we're
688 		 * already disabled and thus have nothing to do, or we have
689 		 * exactly the right amount of runtime to take out.
690 		 */
691 		if (rt_rq->rt_runtime == RUNTIME_INF ||
692 				rt_rq->rt_runtime == rt_b->rt_runtime)
693 			goto balanced;
694 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
695 
696 		/*
697 		 * Calculate the difference between what we started out with
698 		 * and what we current have, that's the amount of runtime
699 		 * we lend and now have to reclaim.
700 		 */
701 		want = rt_b->rt_runtime - rt_rq->rt_runtime;
702 
703 		/*
704 		 * Greedy reclaim, take back as much as we can.
705 		 */
706 		for_each_cpu(i, rd->span) {
707 			struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
708 			s64 diff;
709 
710 			/*
711 			 * Can't reclaim from ourselves or disabled runqueues.
712 			 */
713 			if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
714 				continue;
715 
716 			raw_spin_lock(&iter->rt_runtime_lock);
717 			if (want > 0) {
718 				diff = min_t(s64, iter->rt_runtime, want);
719 				iter->rt_runtime -= diff;
720 				want -= diff;
721 			} else {
722 				iter->rt_runtime -= want;
723 				want -= want;
724 			}
725 			raw_spin_unlock(&iter->rt_runtime_lock);
726 
727 			if (!want)
728 				break;
729 		}
730 
731 		raw_spin_lock(&rt_rq->rt_runtime_lock);
732 		/*
733 		 * We cannot be left wanting - that would mean some runtime
734 		 * leaked out of the system.
735 		 */
736 		BUG_ON(want);
737 balanced:
738 		/*
739 		 * Disable all the borrow logic by pretending we have inf
740 		 * runtime - in which case borrowing doesn't make sense.
741 		 */
742 		rt_rq->rt_runtime = RUNTIME_INF;
743 		rt_rq->rt_throttled = 0;
744 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
745 		raw_spin_unlock(&rt_b->rt_runtime_lock);
746 
747 		/* Make rt_rq available for pick_next_task() */
748 		sched_rt_rq_enqueue(rt_rq);
749 	}
750 }
751 
752 static void __enable_runtime(struct rq *rq)
753 {
754 	rt_rq_iter_t iter;
755 	struct rt_rq *rt_rq;
756 
757 	if (unlikely(!scheduler_running))
758 		return;
759 
760 	/*
761 	 * Reset each runqueue's bandwidth settings
762 	 */
763 	for_each_rt_rq(rt_rq, iter, rq) {
764 		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
765 
766 		raw_spin_lock(&rt_b->rt_runtime_lock);
767 		raw_spin_lock(&rt_rq->rt_runtime_lock);
768 		rt_rq->rt_runtime = rt_b->rt_runtime;
769 		rt_rq->rt_time = 0;
770 		rt_rq->rt_throttled = 0;
771 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
772 		raw_spin_unlock(&rt_b->rt_runtime_lock);
773 	}
774 }
775 
776 static int balance_runtime(struct rt_rq *rt_rq)
777 {
778 	int more = 0;
779 
780 	if (!sched_feat(RT_RUNTIME_SHARE))
781 		return more;
782 
783 	if (rt_rq->rt_time > rt_rq->rt_runtime) {
784 		raw_spin_unlock(&rt_rq->rt_runtime_lock);
785 		more = do_balance_runtime(rt_rq);
786 		raw_spin_lock(&rt_rq->rt_runtime_lock);
787 	}
788 
789 	return more;
790 }
791 #else /* !CONFIG_SMP */
792 static inline int balance_runtime(struct rt_rq *rt_rq)
793 {
794 	return 0;
795 }
796 #endif /* CONFIG_SMP */
797 
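/*
 * Called from the period timer: for each rt_rq served by this bandwidth
 * pool, pay back up to 'overrun' periods worth of runtime, unthrottle
 * the rt_rq if it now fits in its budget, and report whether the timer
 * still has work to do (returns 1 when it can go idle).
 */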
798 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
799 {
800 	int i, idle = 1, throttled = 0;
801 	const struct cpumask *span;
802 
803 	span = sched_rt_period_mask();
804 #ifdef CONFIG_RT_GROUP_SCHED
805 	/*
806 	 * FIXME: isolated CPUs should really leave the root task group,
807 	 * whether they are isolcpus or were isolated via cpusets, lest
808 	 * the timer run on a CPU which does not service all runqueues,
809 	 * potentially leaving other CPUs indefinitely throttled.  If
810 	 * isolation is really required, the user will turn the throttle
811 	 * off to kill the perturbations it causes anyway.  Meanwhile,
812 	 * this maintains functionality for boot and/or troubleshooting.
813 	 */
814 	if (rt_b == &root_task_group.rt_bandwidth)
815 		span = cpu_online_mask;
816 #endif
817 	for_each_cpu(i, span) {
818 		int enqueue = 0;
819 		struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
820 		struct rq *rq = rq_of_rt_rq(rt_rq);
821 
822 		raw_spin_lock(&rq->lock);
823 		if (rt_rq->rt_time) {
824 			u64 runtime;
825 
826 			raw_spin_lock(&rt_rq->rt_runtime_lock);
827 			if (rt_rq->rt_throttled)
828 				balance_runtime(rt_rq);
829 			runtime = rt_rq->rt_runtime;
830 			rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
831 			if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
832 				rt_rq->rt_throttled = 0;
833 				enqueue = 1;
834 
835 				/*
836 				 * Force a clock update if the CPU was idle,
837 				 * lest wakeup -> unthrottle time accumulate.
838 				 */
839 				if (rt_rq->rt_nr_running && rq->curr == rq->idle)
840 					rq->skip_clock_update = -1;
841 			}
842 			if (rt_rq->rt_time || rt_rq->rt_nr_running)
843 				idle = 0;
844 			raw_spin_unlock(&rt_rq->rt_runtime_lock);
845 		} else if (rt_rq->rt_nr_running) {
846 			idle = 0;
847 			if (!rt_rq_throttled(rt_rq))
848 				enqueue = 1;
849 		}
850 		if (rt_rq->rt_throttled)
851 			throttled = 1;
852 
853 		if (enqueue)
854 			sched_rt_rq_enqueue(rt_rq);
855 		raw_spin_unlock(&rq->lock);
856 	}
857 
858 	if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
859 		return 1;
860 
861 	return idle;
862 }
863 
864 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
865 {
866 #ifdef CONFIG_RT_GROUP_SCHED
867 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
868 
869 	if (rt_rq)
870 		return rt_rq->highest_prio.curr;
871 #endif
872 
873 	return rt_task_of(rt_se)->prio;
874 }
875 
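/*
 * Print the throttled rt_rq and the runnable RT tasks on it so potential
 * CPU hogs can be spotted; with CONFIG_PANIC_ON_RT_THROTTLING it calls
 * BUG() after dumping instead of just logging.
 */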
876 static void dump_throttled_rt_tasks(struct rt_rq *rt_rq)
877 {
878 	struct rt_prio_array *array = &rt_rq->active;
879 	struct sched_rt_entity *rt_se;
880 	char buf[500];
881 	char *pos = buf;
882 	char *end = buf + sizeof(buf);
883 	int idx;
884 
885 	pos += snprintf(pos, sizeof(buf),
886 		"sched: RT throttling activated for rt_rq %p (cpu %d)\n",
887 		rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
888 
889 	if (bitmap_empty(array->bitmap, MAX_RT_PRIO))
890 		goto out;
891 
892 	pos += snprintf(pos, end - pos, "potential CPU hogs:\n");
893 	idx = sched_find_first_bit(array->bitmap);
894 	while (idx < MAX_RT_PRIO) {
895 		list_for_each_entry(rt_se, array->queue + idx, run_list) {
896 			struct task_struct *p;
897 
898 			if (!rt_entity_is_task(rt_se))
899 				continue;
900 
901 			p = rt_task_of(rt_se);
902 			if (pos < end)
903 				pos += snprintf(pos, end - pos, "\t%s (%d)\n",
904 					p->comm, p->pid);
905 		}
906 		idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
907 	}
908 out:
909 #ifdef CONFIG_PANIC_ON_RT_THROTTLING
910 	/*
911 	 * Use pr_err() in the BUG() case since printk_sched() will
912 	 * not get flushed and deadlock is not a concern.
913 	 */
914 	pr_err("%s", buf);
915 	BUG();
916 #else
917 	printk_deferred("%s", buf);
918 #endif
919 }
920 
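/*
 * Check whether this rt_rq has used up its runtime budget for the
 * current period. If so, throttle and dequeue it and return 1 so the
 * caller can resched; groups that have no runtime assigned and only
 * accrued time through PI boosting are not throttled.
 */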
921 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
922 {
923 	u64 runtime = sched_rt_runtime(rt_rq);
924 
925 	if (rt_rq->rt_throttled)
926 		return rt_rq_throttled(rt_rq);
927 
928 	if (runtime >= sched_rt_period(rt_rq))
929 		return 0;
930 
931 	balance_runtime(rt_rq);
932 	runtime = sched_rt_runtime(rt_rq);
933 	if (runtime == RUNTIME_INF)
934 		return 0;
935 
936 	if (rt_rq->rt_time > runtime) {
937 		struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
938 
939 		/*
940 		 * Don't actually throttle groups that have no runtime assigned
941 		 * but accrue some time due to boosting.
942 		 */
943 		if (likely(rt_b->rt_runtime)) {
944 			static bool once = false;
945 
946 			rt_rq->rt_throttled = 1;
947 
948 			if (!once) {
949 				once = true;
950 				dump_throttled_rt_tasks(rt_rq);
951 			}
952 		} else {
953 			/*
954 			 * In case we did anyway, make it go away,
955 			 * replenishment is a joke, since it will replenish us
956 			 * with exactly 0 ns.
957 			 */
958 			rt_rq->rt_time = 0;
959 		}
960 
961 		if (rt_rq_throttled(rt_rq)) {
962 			sched_rt_rq_dequeue(rt_rq);
963 			return 1;
964 		}
965 	}
966 
967 	return 0;
968 }
969 
970 /*
971  * Update the current task's runtime statistics. Skip current tasks that
972  * are not in our scheduling class.
973  */
974 static void update_curr_rt(struct rq *rq)
975 {
976 	struct task_struct *curr = rq->curr;
977 	struct sched_rt_entity *rt_se = &curr->rt;
978 	u64 delta_exec;
979 
980 	if (curr->sched_class != &rt_sched_class)
981 		return;
982 
983 	delta_exec = rq_clock_task(rq) - curr->se.exec_start;
984 	if (unlikely((s64)delta_exec <= 0))
985 		return;
986 
987 	schedstat_set(curr->se.statistics.exec_max,
988 		      max(curr->se.statistics.exec_max, delta_exec));
989 
990 	curr->se.sum_exec_runtime += delta_exec;
991 	account_group_exec_runtime(curr, delta_exec);
992 
993 	curr->se.exec_start = rq_clock_task(rq);
994 	cpuacct_charge(curr, delta_exec);
995 
996 	sched_rt_avg_update(rq, delta_exec);
997 
998 	if (!rt_bandwidth_enabled())
999 		return;
1000 
1001 	for_each_sched_rt_entity(rt_se) {
1002 		struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1003 
1004 		if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1005 			raw_spin_lock(&rt_rq->rt_runtime_lock);
1006 			rt_rq->rt_time += delta_exec;
1007 			if (sched_rt_runtime_exceeded(rt_rq))
1008 				resched_curr(rq);
1009 			raw_spin_unlock(&rt_rq->rt_runtime_lock);
1010 		}
1011 	}
1012 }
1013 
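/*
 * The root rt_rq contributes its rt_nr_running to the rq's nr_running
 * only while it is "queued"; these helpers add or remove that
 * contribution when the RT class as a whole is (un)throttled or becomes
 * (non-)empty.
 */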
1014 static void
1015 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1016 {
1017 	struct rq *rq = rq_of_rt_rq(rt_rq);
1018 
1019 	BUG_ON(&rq->rt != rt_rq);
1020 
1021 	if (!rt_rq->rt_queued)
1022 		return;
1023 
1024 	BUG_ON(!rq->nr_running);
1025 
1026 	sub_nr_running(rq, rt_rq->rt_nr_running);
1027 	rt_rq->rt_queued = 0;
1028 }
1029 
1030 static void
1031 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1032 {
1033 	struct rq *rq = rq_of_rt_rq(rt_rq);
1034 
1035 	BUG_ON(&rq->rt != rt_rq);
1036 
1037 	if (rt_rq->rt_queued)
1038 		return;
1039 	if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1040 		return;
1041 
1042 	add_nr_running(rq, rt_rq->rt_nr_running);
1043 	rt_rq->rt_queued = 1;
1044 }
1045 
1046 #if defined CONFIG_SMP
1047 
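/*
 * Keep the root domain's cpupri map in sync with the highest RT
 * priority queued on this rq, so push/pull can quickly locate CPUs
 * running lower-priority work.
 */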
1048 static void
1049 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1050 {
1051 	struct rq *rq = rq_of_rt_rq(rt_rq);
1052 
1053 #ifdef CONFIG_RT_GROUP_SCHED
1054 	/*
1055 	 * Change rq's cpupri only if rt_rq is the top queue.
1056 	 */
1057 	if (&rq->rt != rt_rq)
1058 		return;
1059 #endif
1060 	if (rq->online && prio < prev_prio)
1061 		cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1062 }
1063 
1064 static void
1065 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1066 {
1067 	struct rq *rq = rq_of_rt_rq(rt_rq);
1068 
1069 #ifdef CONFIG_RT_GROUP_SCHED
1070 	/*
1071 	 * Change rq's cpupri only if rt_rq is the top queue.
1072 	 */
1073 	if (&rq->rt != rt_rq)
1074 		return;
1075 #endif
1076 	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1077 		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1078 }
1079 
1080 #else /* CONFIG_SMP */
1081 
1082 static inline
1083 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1084 static inline
1085 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1086 
1087 #endif /* CONFIG_SMP */
1088 
1089 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1090 static void
1091 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1092 {
1093 	int prev_prio = rt_rq->highest_prio.curr;
1094 
1095 	if (prio < prev_prio)
1096 		rt_rq->highest_prio.curr = prio;
1097 
1098 	inc_rt_prio_smp(rt_rq, prio, prev_prio);
1099 }
1100 
1101 static void
1102 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1103 {
1104 	int prev_prio = rt_rq->highest_prio.curr;
1105 
1106 	if (rt_rq->rt_nr_running) {
1107 
1108 		WARN_ON(prio < prev_prio);
1109 
1110 		/*
1111 		 * This may have been our highest task, and therefore
1112 		 * we may have some recomputation to do
1113 		 */
1114 		if (prio == prev_prio) {
1115 			struct rt_prio_array *array = &rt_rq->active;
1116 
1117 			rt_rq->highest_prio.curr =
1118 				sched_find_first_bit(array->bitmap);
1119 		}
1120 
1121 	} else
1122 		rt_rq->highest_prio.curr = MAX_RT_PRIO;
1123 
1124 	dec_rt_prio_smp(rt_rq, prio, prev_prio);
1125 }
1126 
1127 #else
1128 
1129 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1130 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1131 
1132 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1133 
1134 #ifdef CONFIG_RT_GROUP_SCHED
1135 
1136 static void
1137 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1138 {
1139 	if (rt_se_boosted(rt_se))
1140 		rt_rq->rt_nr_boosted++;
1141 
1142 	if (rt_rq->tg)
1143 		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1144 }
1145 
1146 static void
1147 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1148 {
1149 	if (rt_se_boosted(rt_se))
1150 		rt_rq->rt_nr_boosted--;
1151 
1152 	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1153 }
1154 
1155 #else /* CONFIG_RT_GROUP_SCHED */
1156 
1157 static void
1158 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1159 {
1160 	start_rt_bandwidth(&def_rt_bandwidth);
1161 }
1162 
1163 static inline
1164 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1165 
1166 #endif /* CONFIG_RT_GROUP_SCHED */
1167 
1168 static inline
1169 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1170 {
1171 	struct rt_rq *group_rq = group_rt_rq(rt_se);
1172 
1173 	if (group_rq)
1174 		return group_rq->rt_nr_running;
1175 	else
1176 		return 1;
1177 }
1178 
1179 static inline
1180 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1181 {
1182 	int prio = rt_se_prio(rt_se);
1183 
1184 	WARN_ON(!rt_prio(prio));
1185 	rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1186 
1187 	inc_rt_prio(rt_rq, prio);
1188 	inc_rt_migration(rt_se, rt_rq);
1189 	inc_rt_group(rt_se, rt_rq);
1190 }
1191 
1192 static inline
1193 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1194 {
1195 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1196 	WARN_ON(!rt_rq->rt_nr_running);
1197 	rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1198 
1199 	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1200 	dec_rt_migration(rt_se, rt_rq);
1201 	dec_rt_group(rt_se, rt_rq);
1202 }
1203 
1204 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1205 {
1206 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1207 	struct rt_prio_array *array = &rt_rq->active;
1208 	struct rt_rq *group_rq = group_rt_rq(rt_se);
1209 	struct list_head *queue = array->queue + rt_se_prio(rt_se);
1210 
1211 	/*
1212 	 * Don't enqueue the group if it's throttled, or when empty.
1213 	 * The latter is a consequence of the former when a child group
1214 	 * gets throttled and the current group doesn't have any other
1215 	 * active members.
1216 	 */
1217 	if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
1218 		return;
1219 
1220 	if (head)
1221 		list_add(&rt_se->run_list, queue);
1222 	else
1223 		list_add_tail(&rt_se->run_list, queue);
1224 	__set_bit(rt_se_prio(rt_se), array->bitmap);
1225 
1226 	inc_rt_tasks(rt_se, rt_rq);
1227 }
1228 
1229 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
1230 {
1231 	struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1232 	struct rt_prio_array *array = &rt_rq->active;
1233 
1234 	list_del_init(&rt_se->run_list);
1235 	if (list_empty(array->queue + rt_se_prio(rt_se)))
1236 		__clear_bit(rt_se_prio(rt_se), array->bitmap);
1237 
1238 	dec_rt_tasks(rt_se, rt_rq);
1239 }
1240 
1241 /*
1242  * Because the prio of an upper entry depends on the lower
1243  * entries, we must remove entries top-down.
1244  */
1245 static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
1246 {
1247 	struct sched_rt_entity *back = NULL;
1248 
1249 	for_each_sched_rt_entity(rt_se) {
1250 		rt_se->back = back;
1251 		back = rt_se;
1252 	}
1253 
1254 	dequeue_top_rt_rq(rt_rq_of_se(back));
1255 
1256 	for (rt_se = back; rt_se; rt_se = rt_se->back) {
1257 		if (on_rt_rq(rt_se))
1258 			__dequeue_rt_entity(rt_se);
1259 	}
1260 }
1261 
1262 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
1263 {
1264 	struct rq *rq = rq_of_rt_se(rt_se);
1265 
1266 	dequeue_rt_stack(rt_se);
1267 	for_each_sched_rt_entity(rt_se)
1268 		__enqueue_rt_entity(rt_se, head);
1269 	enqueue_top_rt_rq(&rq->rt);
1270 }
1271 
1272 static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
1273 {
1274 	struct rq *rq = rq_of_rt_se(rt_se);
1275 
1276 	dequeue_rt_stack(rt_se);
1277 
1278 	for_each_sched_rt_entity(rt_se) {
1279 		struct rt_rq *rt_rq = group_rt_rq(rt_se);
1280 
1281 		if (rt_rq && rt_rq->rt_nr_running)
1282 			__enqueue_rt_entity(rt_se, false);
1283 	}
1284 	enqueue_top_rt_rq(&rq->rt);
1285 }
1286 
1287 /*
1288  * Adding/removing a task to/from a priority array:
1289  */
1290 static void
1291 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1292 {
1293 	struct sched_rt_entity *rt_se = &p->rt;
1294 
1295 	if (flags & ENQUEUE_WAKEUP)
1296 		rt_se->timeout = 0;
1297 
1298 	enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
1299 	walt_inc_cumulative_runnable_avg(rq, p);
1300 
1301 	if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1302 		enqueue_pushable_task(rq, p);
1303 }
1304 
1305 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1306 {
1307 	struct sched_rt_entity *rt_se = &p->rt;
1308 
1309 	update_curr_rt(rq);
1310 	dequeue_rt_entity(rt_se);
1311 	walt_dec_cumulative_runnable_avg(rq, p);
1312 
1313 	dequeue_pushable_task(rq, p);
1314 }
1315 
1316 /*
1317  * Put task to the head or the end of the run list without the overhead of
1318  * dequeue followed by enqueue.
1319  */
1320 static void
1321 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1322 {
1323 	if (on_rt_rq(rt_se)) {
1324 		struct rt_prio_array *array = &rt_rq->active;
1325 		struct list_head *queue = array->queue + rt_se_prio(rt_se);
1326 
1327 		if (head)
1328 			list_move(&rt_se->run_list, queue);
1329 		else
1330 			list_move_tail(&rt_se->run_list, queue);
1331 	}
1332 }
1333 
1334 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1335 {
1336 	struct sched_rt_entity *rt_se = &p->rt;
1337 	struct rt_rq *rt_rq;
1338 
1339 	for_each_sched_rt_entity(rt_se) {
1340 		rt_rq = rt_rq_of_se(rt_se);
1341 		requeue_rt_entity(rt_rq, rt_se, head);
1342 	}
1343 }
1344 
1345 static void yield_task_rt(struct rq *rq)
1346 {
1347 	requeue_task_rt(rq, rq->curr, 0);
1348 }
1349 
1350 #ifdef CONFIG_SMP
1351 static int find_lowest_rq(struct task_struct *task);
1352 
1353 static int
1354 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
1355 {
1356 	struct task_struct *curr;
1357 	struct rq *rq;
1358 
1359 	if (p->nr_cpus_allowed == 1)
1360 		goto out;
1361 
1362 	/* For anything but wake ups, just return the task_cpu */
1363 	if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1364 		goto out;
1365 
1366 	rq = cpu_rq(cpu);
1367 
1368 	rcu_read_lock();
1369 	curr = READ_ONCE(rq->curr); /* unlocked access */
1370 
1371 	/*
1372 	 * If the current task on @p's runqueue is an RT task, then
1373 	 * try to see if we can wake this RT task up on another
1374 	 * runqueue. Otherwise simply start this RT task
1375 	 * on its current runqueue.
1376 	 *
1377 	 * We want to avoid overloading runqueues. If the woken
1378 	 * task is a higher priority, then it will stay on this CPU
1379 	 * and the lower prio task should be moved to another CPU.
1380 	 * Even though this will probably make the lower prio task
1381 	 * lose its cache, we do not want to bounce a higher task
1382 	 * around just because it gave up its CPU, perhaps for a
1383 	 * lock?
1384 	 *
1385 	 * For equal prio tasks, we just let the scheduler sort it out.
1386 	 *
1387 	 * Otherwise, just let it ride on the affined RQ and the
1388 	 * post-schedule router will push the preempted task away
1389 	 *
1390 	 * This test is optimistic, if we get it wrong the load-balancer
1391 	 * will have to sort it out.
1392 	 */
1393 	if (curr && unlikely(rt_task(curr)) &&
1394 	    (curr->nr_cpus_allowed < 2 ||
1395 	     curr->prio <= p->prio)) {
1396 		int target = find_lowest_rq(p);
1397 
1398 		/*
1399 		 * Possible race. Don't bother moving it if the
1400 		 * destination CPU is not running a lower priority task.
1401 		 */
1402 		if (target != -1 &&
1403 		    p->prio < cpu_rq(target)->rt.highest_prio.curr)
1404 			cpu = target;
1405 	}
1406 	rcu_read_unlock();
1407 
1408 out:
1409 	return cpu;
1410 }
1411 
1412 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1413 {
1414 	if (rq->curr->nr_cpus_allowed == 1)
1415 		return;
1416 
1417 	if (p->nr_cpus_allowed != 1
1418 	    && cpupri_find(&rq->rd->cpupri, p, NULL))
1419 		return;
1420 
1421 	if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1422 		return;
1423 
1424 	/*
1425 	 * There appear to be other CPUs that can accept
1426 	 * current and none to run 'p', so let's reschedule
1427 	 * to try and push current away:
1428 	 */
1429 	requeue_task_rt(rq, p, 1);
1430 	resched_curr(rq);
1431 }
1432 
1433 #endif /* CONFIG_SMP */
1434 
1435 /*
1436  * Preempt the current task with a newly woken task if needed:
1437  */
1438 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1439 {
1440 	if (p->prio < rq->curr->prio) {
1441 		resched_curr(rq);
1442 		return;
1443 	}
1444 
1445 #ifdef CONFIG_SMP
1446 	/*
1447 	 * If:
1448 	 *
1449 	 * - the newly woken task is of equal priority to the current task
1450 	 * - the newly woken task is non-migratable while current is migratable
1451 	 * - current will be preempted on the next reschedule
1452 	 *
1453 	 * we should check to see if current can readily move to a different
1454 	 * cpu.  If so, we will reschedule to allow the push logic to try
1455 	 * to move current somewhere else, making room for our non-migratable
1456 	 * task.
1457 	 */
1458 	if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1459 		check_preempt_equal_prio(rq, p);
1460 #endif
1461 }
1462 
1463 #ifdef CONFIG_SMP
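/*
 * Report the fraction of capacity consumed by RT tasks (rt_avg over the
 * averaging window) to the frequency selection code. sched_freq() and
 * set_rt_cpu_capacity() appear to come from the sched-freq/EAS patches
 * carried in this tree rather than the mainline scheduler.
 */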
1464 static void sched_rt_update_capacity_req(struct rq *rq)
1465 {
1466 	u64 total, used, age_stamp, avg;
1467 	s64 delta;
1468 
1469 	if (!sched_freq())
1470 		return;
1471 
1472 	sched_avg_update(rq);
1473 	/*
1474 	 * Since we're reading these variables without serialization make sure
1475 	 * we read them once before doing sanity checks on them.
1476 	 */
1477 	age_stamp = READ_ONCE(rq->age_stamp);
1478 	avg = READ_ONCE(rq->rt_avg);
1479 	delta = rq_clock(rq) - age_stamp;
1480 
1481 	if (unlikely(delta < 0))
1482 		delta = 0;
1483 
1484 	total = sched_avg_period() + delta;
1485 
1486 	used = div_u64(avg, total);
1487 	if (unlikely(used > SCHED_CAPACITY_SCALE))
1488 		used = SCHED_CAPACITY_SCALE;
1489 
1490 	set_rt_cpu_capacity(rq->cpu, 1, (unsigned long)(used));
1491 }
1492 #else
1493 static inline void sched_rt_update_capacity_req(struct rq *rq)
1494 { }
1495 
1496 #endif
1497 
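/*
 * Pick the first entity on the highest-priority non-empty queue of this
 * rt_rq; sched_find_first_bit() makes the lookup O(1).
 */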
1498 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1499 						   struct rt_rq *rt_rq)
1500 {
1501 	struct rt_prio_array *array = &rt_rq->active;
1502 	struct sched_rt_entity *next = NULL;
1503 	struct list_head *queue;
1504 	int idx;
1505 
1506 	idx = sched_find_first_bit(array->bitmap);
1507 	BUG_ON(idx >= MAX_RT_PRIO);
1508 
1509 	queue = array->queue + idx;
1510 	next = list_entry(queue->next, struct sched_rt_entity, run_list);
1511 
1512 	return next;
1513 }
1514 
1515 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1516 {
1517 	struct sched_rt_entity *rt_se;
1518 	struct task_struct *p;
1519 	struct rt_rq *rt_rq  = &rq->rt;
1520 
1521 	do {
1522 		rt_se = pick_next_rt_entity(rq, rt_rq);
1523 		BUG_ON(!rt_se);
1524 		rt_rq = group_rt_rq(rt_se);
1525 	} while (rt_rq);
1526 
1527 	p = rt_task_of(rt_se);
1528 	p->se.exec_start = rq_clock_task(rq);
1529 
1530 	return p;
1531 }
1532 
1533 static struct task_struct *
1534 pick_next_task_rt(struct rq *rq, struct task_struct *prev)
1535 {
1536 	struct task_struct *p;
1537 	struct rt_rq *rt_rq = &rq->rt;
1538 
1539 	if (need_pull_rt_task(rq, prev)) {
1540 		pull_rt_task(rq);
1541 		/*
1542 		 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1543 		 * means a dl or stop task can slip in, in which case we need
1544 		 * to re-start task selection.
1545 		 */
1546 		if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1547 			     rq->dl.dl_nr_running))
1548 			return RETRY_TASK;
1549 	}
1550 
1551 	/*
1552 	 * We may dequeue prev's rt_rq in put_prev_task().
1553 	 * So, update the time before the rt_nr_running check.
1554 	 */
1555 	if (prev->sched_class == &rt_sched_class)
1556 		update_curr_rt(rq);
1557 
1558 	if (!rt_rq->rt_queued) {
1559 		/*
1560 		 * The next task to be picked on this rq will have a lower
1561 		 * priority than rt tasks so we can spend some time to update
1562 		 * the capacity used by rt tasks based on the last activity.
1563 	 * This value will then be used as an estimate of the next
1564 		 * activity.
1565 		 */
1566 		sched_rt_update_capacity_req(rq);
1567 		return NULL;
1568 	}
1569 
1570 	put_prev_task(rq, prev);
1571 
1572 	p = _pick_next_task_rt(rq);
1573 
1574 	/* The running task is never eligible for pushing */
1575 	dequeue_pushable_task(rq, p);
1576 
1577 	set_post_schedule(rq);
1578 
1579 	return p;
1580 }
1581 
1582 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1583 {
1584 	update_curr_rt(rq);
1585 
1586 	/*
1587 	 * The previous task needs to be made eligible for pushing
1588 	 * if it is still active
1589 	 */
1590 	if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1591 		enqueue_pushable_task(rq, p);
1592 }
1593 
1594 #ifdef CONFIG_SMP
1595 
1596 /* Only try algorithms three times */
1597 #define RT_MAX_TRIES 3
1598 
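/*
 * A task is a candidate for pushing/pulling to 'cpu' if it is not
 * currently running and 'cpu' is in its allowed mask.
 */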
1599 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1600 {
1601 	if (!task_running(rq, p) &&
1602 	    cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
1603 		return 1;
1604 	return 0;
1605 }
1606 
1607 /*
1608  * Return the highest pushable rq's task, which is suitable to be executed
1609  * on the cpu, NULL otherwise
1610  */
1611 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1612 {
1613 	struct plist_head *head = &rq->rt.pushable_tasks;
1614 	struct task_struct *p;
1615 
1616 	if (!has_pushable_tasks(rq))
1617 		return NULL;
1618 
1619 	plist_for_each_entry(p, head, pushable_tasks) {
1620 		if (pick_rt_task(rq, p, cpu))
1621 			return p;
1622 	}
1623 
1624 	return NULL;
1625 }
1626 
1627 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1628 
1629 static int find_lowest_rq(struct task_struct *task)
1630 {
1631 	struct sched_domain *sd;
1632 	struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1633 	int this_cpu = smp_processor_id();
1634 	int cpu      = task_cpu(task);
1635 
1636 	/* Make sure the mask is initialized first */
1637 	if (unlikely(!lowest_mask))
1638 		return -1;
1639 
1640 	if (task->nr_cpus_allowed == 1)
1641 		return -1; /* No other targets possible */
1642 
1643 	if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1644 		return -1; /* No targets found */
1645 
1646 	/*
1647 	 * At this point we have built a mask of cpus representing the
1648 	 * lowest priority tasks in the system.  Now we want to elect
1649 	 * the best one based on our affinity and topology.
1650 	 *
1651 	 * We prioritize the last cpu that the task executed on since
1652 	 * it is most likely cache-hot in that location.
1653 	 */
1654 	if (cpumask_test_cpu(cpu, lowest_mask))
1655 		return cpu;
1656 
1657 	/*
1658 	 * Otherwise, we consult the sched_domains span maps to figure
1659 	 * out which cpu is logically closest to our hot cache data.
1660 	 */
1661 	if (!cpumask_test_cpu(this_cpu, lowest_mask))
1662 		this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1663 
1664 	rcu_read_lock();
1665 	for_each_domain(cpu, sd) {
1666 		if (sd->flags & SD_WAKE_AFFINE) {
1667 			int best_cpu;
1668 
1669 			/*
1670 			 * "this_cpu" is cheaper to preempt than a
1671 			 * remote processor.
1672 			 */
1673 			if (this_cpu != -1 &&
1674 			    cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1675 				rcu_read_unlock();
1676 				return this_cpu;
1677 			}
1678 
1679 			best_cpu = cpumask_first_and(lowest_mask,
1680 						     sched_domain_span(sd));
1681 			if (best_cpu < nr_cpu_ids) {
1682 				rcu_read_unlock();
1683 				return best_cpu;
1684 			}
1685 		}
1686 	}
1687 	rcu_read_unlock();
1688 
1689 	/*
1690 	 * And finally, if there were no matches within the domains
1691 	 * just give the caller *something* to work with from the compatible
1692 	 * locations.
1693 	 */
1694 	if (this_cpu != -1)
1695 		return this_cpu;
1696 
1697 	cpu = cpumask_any(lowest_mask);
1698 	if (cpu < nr_cpu_ids)
1699 		return cpu;
1700 	return -1;
1701 }
1702 
1703 /* Will lock the rq it finds */
1704 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1705 {
1706 	struct rq *lowest_rq = NULL;
1707 	int tries;
1708 	int cpu;
1709 
1710 	for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1711 		cpu = find_lowest_rq(task);
1712 
1713 		if ((cpu == -1) || (cpu == rq->cpu))
1714 			break;
1715 
1716 		lowest_rq = cpu_rq(cpu);
1717 
1718 		/* if the prio of this runqueue changed, try again */
1719 		if (double_lock_balance(rq, lowest_rq)) {
1720 			/*
1721 			 * We had to unlock the run queue. In
1722 			 * the meantime, the task could have
1723 			 * migrated already or had its affinity changed.
1724 			 * Also make sure that it wasn't scheduled on its rq.
1725 			 */
1726 			if (unlikely(task_rq(task) != rq ||
1727 				     !cpumask_test_cpu(lowest_rq->cpu,
1728 						       tsk_cpus_allowed(task)) ||
1729 				     task_running(rq, task) ||
1730 				     !task_on_rq_queued(task))) {
1731 
1732 				double_unlock_balance(rq, lowest_rq);
1733 				lowest_rq = NULL;
1734 				break;
1735 			}
1736 		}
1737 
1738 		/* If this rq is still suitable use it. */
1739 		if (lowest_rq->rt.highest_prio.curr > task->prio)
1740 			break;
1741 
1742 		/* try again */
1743 		double_unlock_balance(rq, lowest_rq);
1744 		lowest_rq = NULL;
1745 	}
1746 
1747 	return lowest_rq;
1748 }
1749 
1750 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1751 {
1752 	struct task_struct *p;
1753 
1754 	if (!has_pushable_tasks(rq))
1755 		return NULL;
1756 
1757 	p = plist_first_entry(&rq->rt.pushable_tasks,
1758 			      struct task_struct, pushable_tasks);
1759 
1760 	BUG_ON(rq->cpu != task_cpu(p));
1761 	BUG_ON(task_current(rq, p));
1762 	BUG_ON(p->nr_cpus_allowed <= 1);
1763 
1764 	BUG_ON(!task_on_rq_queued(p));
1765 	BUG_ON(!rt_task(p));
1766 
1767 	return p;
1768 }
1769 
1770 /*
1771  * If the current CPU has more than one RT task, see if the non
1772  * running task can migrate over to a CPU that is running a task
1773  * of lesser priority.
1774  */
1775 static int push_rt_task(struct rq *rq)
1776 {
1777 	struct task_struct *next_task;
1778 	struct rq *lowest_rq;
1779 	int ret = 0;
1780 
1781 	if (!rq->rt.overloaded)
1782 		return 0;
1783 
1784 	next_task = pick_next_pushable_task(rq);
1785 	if (!next_task)
1786 		return 0;
1787 
1788 retry:
1789 	if (unlikely(next_task == rq->curr)) {
1790 		WARN_ON(1);
1791 		return 0;
1792 	}
1793 
1794 	/*
1795 	 * It's possible that the next_task slipped in with a
1796 	 * higher priority than current. If that's the case
1797 	 * just reschedule current.
1798 	 */
1799 	if (unlikely(next_task->prio < rq->curr->prio)) {
1800 		resched_curr(rq);
1801 		return 0;
1802 	}
1803 
1804 	/* We might release rq lock */
1805 	get_task_struct(next_task);
1806 
1807 	/* find_lock_lowest_rq locks the rq if found */
1808 	lowest_rq = find_lock_lowest_rq(next_task, rq);
1809 	if (!lowest_rq) {
1810 		struct task_struct *task;
1811 		/*
1812 		 * find_lock_lowest_rq releases rq->lock
1813 		 * so it is possible that next_task has migrated.
1814 		 *
1815 		 * We need to make sure that the task is still on the same
1816 		 * run-queue and is also still the next task eligible for
1817 		 * pushing.
1818 		 */
1819 		task = pick_next_pushable_task(rq);
1820 		if (task_cpu(next_task) == rq->cpu && task == next_task) {
1821 			/*
1822 			 * The task hasn't migrated, and is still the next
1823 			 * eligible task, but we failed to find a run-queue
1824 			 * to push it to.  Do not retry in this case, since
1825 			 * other cpus will pull from us when ready.
1826 			 */
1827 			goto out;
1828 		}
1829 
1830 		if (!task)
1831 			/* No more tasks, just exit */
1832 			goto out;
1833 
1834 		/*
1835 		 * Something has shifted, try again.
1836 		 */
1837 		put_task_struct(next_task);
1838 		next_task = task;
1839 		goto retry;
1840 	}
1841 
1842 	deactivate_task(rq, next_task, 0);
1843 	set_task_cpu(next_task, lowest_rq->cpu);
1844 	activate_task(lowest_rq, next_task, 0);
1845 	ret = 1;
1846 
1847 	resched_curr(lowest_rq);
1848 
1849 	double_unlock_balance(rq, lowest_rq);
1850 
1851 out:
1852 	put_task_struct(next_task);
1853 
1854 	return ret;
1855 }
1856 
1857 static void push_rt_tasks(struct rq *rq)
1858 {
1859 	/* push_rt_task will return true if it moved an RT */
1860 	while (push_rt_task(rq))
1861 		;
1862 }
1863 
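/*
 * Scan the CPUs marked RT-overloaded in our root domain and pull over
 * any queued RT task with a higher priority than what this rq is about
 * to run. Returns 1 if at least one task was pulled.
 */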
1864 static int pull_rt_task(struct rq *this_rq)
1865 {
1866 	int this_cpu = this_rq->cpu, ret = 0, cpu;
1867 	struct task_struct *p;
1868 	struct rq *src_rq;
1869 
1870 	if (likely(!rt_overloaded(this_rq)))
1871 		return 0;
1872 
1873 	/*
1874 	 * Match the barrier from rt_set_overload(); this guarantees that if we
1875 	 * see overloaded we must also see the rto_mask bit.
1876 	 */
1877 	smp_rmb();
1878 
1879 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
1880 		if (this_cpu == cpu)
1881 			continue;
1882 
1883 		src_rq = cpu_rq(cpu);
1884 
1885 		/*
1886 		 * Don't bother taking the src_rq->lock if the next highest
1887 		 * task is known to be lower-priority than our current task.
1888 		 * This may look racy, but if this value is about to go
1889 		 * logically higher, the src_rq will push this task away.
1890 		 * And if its going logically lower, we do not care
1891 		 */
1892 		if (src_rq->rt.highest_prio.next >=
1893 		    this_rq->rt.highest_prio.curr)
1894 			continue;
1895 
1896 		/*
1897 		 * We can potentially drop this_rq's lock in
1898 		 * double_lock_balance, and another CPU could
1899 		 * alter this_rq
1900 		 */
1901 		double_lock_balance(this_rq, src_rq);
1902 
1903 		/*
1904 		 * We can pull only a task, which is pushable
1905 		 * on its rq, and no others.
1906 		 */
1907 		p = pick_highest_pushable_task(src_rq, this_cpu);
1908 
1909 		/*
1910 		 * Do we have an RT task that preempts
1911 		 * the to-be-scheduled task?
1912 		 */
1913 		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
1914 			WARN_ON(p == src_rq->curr);
1915 			WARN_ON(!task_on_rq_queued(p));
1916 
1917 			/*
1918 			 * There's a chance that p is higher in priority
1919 			 * than what's currently running on its cpu.
1920 			 * This is just that p is wakeing up and hasn't
1921 			 * had a chance to schedule. We only pull
1922 			 * p if it is lower in priority than the
1923 			 * current task on the run queue
1924 			 */
1925 			if (p->prio < src_rq->curr->prio)
1926 				goto skip;
1927 
1928 			ret = 1;
1929 
1930 			deactivate_task(src_rq, p, 0);
1931 			set_task_cpu(p, this_cpu);
1932 			activate_task(this_rq, p, 0);
1933 			/*
1934 			 * We continue with the search, just in
1935 			 * case there's an even higher prio task
1936 			 * in another runqueue. (low likelihood
1937 			 * but possible)
1938 			 */
1939 		}
1940 skip:
1941 		double_unlock_balance(this_rq, src_rq);
1942 	}
1943 
1944 	return ret;
1945 }
1946 
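/*
 * Called from the core scheduler's post-schedule path (via the
 * ->post_schedule hook installed in rt_sched_class below), once it is
 * safe to drop and retake runqueue locks; any pushable tasks left over
 * from the switch are pushed out here.
 */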
static void post_schedule_rt(struct rq *rq)
{
	push_rt_tasks(rq);
}

/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now
 */
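/*
 * In other words: the wakee cannot run here right away (current is an RT
 * or deadline task that is either pinned to this CPU or of equal or higher
 * priority), but the wakee may run elsewhere, so try to push it to a CPU
 * where it can run immediately.
 */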
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    has_pushable_tasks(rq) &&
	    p->nr_cpus_allowed > 1 &&
	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}

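/*
 * Affinity changes feed the rt_nr_migratory/overload accounting: a task
 * that can only run on one CPU is of no use to the push/pull machinery,
 * so it is taken off the pushable list, and vice versa when it regains
 * the ability to migrate.
 */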
static void set_cpus_allowed_rt(struct task_struct *p,
				const struct cpumask *new_mask)
{
	struct rq *rq;
	int weight;

	BUG_ON(!rt_task(p));

	if (!task_on_rq_queued(p))
		return;

	weight = cpumask_weight(new_mask);

	/*
	 * Only update if the task's migratability actually changes, i.e.
	 * it goes from migratable to pinned or vice versa.
	 */
	if ((p->nr_cpus_allowed > 1) == (weight > 1))
		return;

	rq = task_rq(p);

	/*
	 * The task used to be able to migrate OR it can now migrate
	 */
	if (weight <= 1) {
		if (!task_current(rq, p))
			dequeue_pushable_task(rq, p);
		BUG_ON(!rq->rt.rt_nr_migratory);
		rq->rt.rt_nr_migratory--;
	} else {
		if (!task_current(rq, p))
			enqueue_pushable_task(rq, p);
		rq->rt.rt_nr_migratory++;
	}

	update_rt_migration(&rq->rt);
}

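/*
 * rq_online_rt()/rq_offline_rt() are invoked when a runqueue is attached
 * to or detached from a root domain (e.g. on CPU hotplug or cpuset
 * changes); they keep the rd->rto_mask and the cpupri bookkeeping in sync
 * with this rq's state.
 */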
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}

/*
 * When switching from the rt queue, we bring ourselves to a position
 * where we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	if (pull_rt_task(rq))
		resched_curr(rq);
}

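/*
 * local_cpu_mask is the scratch cpumask that find_lowest_rq() hands to
 * cpupri_find(); one is allocated per possible CPU, on that CPU's node.
 */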
void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */

/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
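/*
 * A typical way to get here is sched_setscheduler() moving a CFS task to
 * SCHED_FIFO/SCHED_RR, or a priority-inheritance boost promoting it into
 * the RT class; the core scheduler then calls ->switched_to.
 */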
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	int check_resched = 1;

	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded &&
		    /* Don't resched if we changed runqueues */
		    push_rt_task(rq) && rq != task_rq(p))
			check_resched = 0;
#endif /* CONFIG_SMP */
		if (check_resched && p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
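/*
 * This runs, for instance, when sched_setparam() changes an RT task's
 * rt_priority or when rt_mutex_setprio() boosts/deboosts it within the RT
 * range; oldprio lets us tell a priority rise from a drop.
 */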
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			pull_rt_task(rq);
		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule. Note, the above pull_rt_task
		 * can release the rq lock and p could migrate.
		 * Only reschedule if p is still on the same runqueue.
		 */
		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if its priority is
		 * higher than that of the currently running task,
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}

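/*
 * RLIMIT_RTTIME is expressed in microseconds of CPU time an RT task may
 * consume without blocking. p->rt.timeout counts ticks spent running;
 * once it exceeds the limit (converted to ticks below), the posix CPU
 * timer code is armed via cputime_expires.sched_exp so the task gets
 * signalled (typically SIGXCPU, then SIGKILL at the hard limit).
 */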
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* The limits may change after they are read; that is fixed up on the next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}

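/*
 * Per-tick housekeeping for the running RT task: account runtime, refresh
 * the capacity request, run the RLIMIT_RTTIME watchdog, and handle
 * round-robin timeslices. sched_rr_timeslice defaults to RR_TIMESLICE
 * (roughly 100ms worth of ticks) and is tunable via sched_rr_timeslice_ms.
 */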
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	if (rq->rt.rt_nr_running)
		sched_rt_update_capacity_req(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of the queue if we (and all of our ancestors)
	 * are not the only element on the queue.
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}

static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

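/*
 * Backs the sched_rr_get_interval() system call: the returned value is in
 * jiffies and is converted to a timespec by the caller.
 */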
static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}

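/*
 * The RT class sits between the deadline class and the fair class in the
 * picking order; ->next chains to fair_sched_class so that a runqueue
 * with no runnable RT tasks falls through to CFS.
 */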
const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed       = set_cpus_allowed_rt,
	.rq_online              = rq_online_rt,
	.rq_offline             = rq_offline_rt,
	.post_schedule		= post_schedule_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task          = set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};

#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */