1 /*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21 */
22
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
34
35 #include <trace/events/sched.h>
36
37 #include "sched.h"
38 #include "tune.h"
39 #include "walt.h"
40
41 /*
42 * Targeted preemption latency for CPU-bound tasks:
43 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44 *
45 * NOTE: this latency value is not the same as the concept of
46 * 'timeslice length' - timeslices in CFS are of variable length
47 * and have no persistent notion like in traditional, time-slice
48 * based scheduling concepts.
49 *
50 * (to see the precise effective timeslice length of your workload,
51 * run vmstat and monitor the context-switches (cs) field)
52 */
53 unsigned int sysctl_sched_latency = 6000000ULL;
54 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55
56 unsigned int sysctl_sched_sync_hint_enable = 1;
57 unsigned int sysctl_sched_cstate_aware = 1;
58
59 #ifdef CONFIG_SCHED_WALT
60 unsigned int sysctl_sched_use_walt_cpu_util = 1;
61 unsigned int sysctl_sched_use_walt_task_util = 1;
62 __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
63 (10 * NSEC_PER_MSEC);
64 #endif
65 /*
66 * The initial- and re-scaling of tunables is configurable
67 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
68 *
69 * Options are:
70 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
71 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus)
72 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
73 */
74 enum sched_tunable_scaling sysctl_sched_tunable_scaling
75 = SCHED_TUNABLESCALING_LOG;
76
77 /*
78 * Minimal preemption granularity for CPU-bound tasks:
79 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
80 */
81 unsigned int sysctl_sched_min_granularity = 750000ULL;
82 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
83
84 /*
85 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity.
86 */
87 static unsigned int sched_nr_latency = 8;
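/*
 * (Illustrative: with the unscaled defaults above, 6ms latency divided by
 * 0.75ms minimum granularity gives sched_nr_latency = 8, i.e. up to 8
 * runnable tasks still fit within one latency period.)
 */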
88
89 /*
90 * After fork, child runs first. If set to 0 (default) then
91 * parent will (try to) run first.
92 */
93 unsigned int sysctl_sched_child_runs_first __read_mostly;
94
95 /*
96 * SCHED_OTHER wake-up granularity.
97 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
98 *
99 * This option delays the preemption effects of decoupled workloads
100 * and reduces their over-scheduling. Synchronous workloads will still
101 * have immediate wakeup/sleep latencies.
102 */
103 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
104 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
105
106 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
107
108 /*
109 * The exponential sliding window over which load is averaged for shares
110 * distribution.
111 * (default: 10msec)
112 */
113 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114
115 #ifdef CONFIG_CFS_BANDWIDTH
116 /*
117 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
118 * each time a cfs_rq requests quota.
119 *
120 * Note: in the case that the slice exceeds the runtime remaining (either due
121 * to consumption or the quota being specified to be smaller than the slice)
122 * we will always only issue the remaining available time.
123 *
124 * default: 5 msec, units: microseconds
125 */
126 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
127 #endif
128
129 /*
130 * The margin used when comparing utilization with CPU capacity:
131 * util * margin < capacity * 1024
132 */
133 unsigned int capacity_margin = 1280; /* ~20% */
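/*
 * Worked example of the margin check above: with capacity_margin = 1280,
 * "util * margin < capacity * 1024" is the same as
 * util < capacity * 1024/1280 = 0.8 * capacity, i.e. a CPU is treated as
 * having spare capacity only while roughly 20% headroom remains.
 */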
134
135 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
136 {
137 lw->weight += inc;
138 lw->inv_weight = 0;
139 }
140
141 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
142 {
143 lw->weight -= dec;
144 lw->inv_weight = 0;
145 }
146
147 static inline void update_load_set(struct load_weight *lw, unsigned long w)
148 {
149 lw->weight = w;
150 lw->inv_weight = 0;
151 }
152
153 /*
154 * Increase the granularity value when there are more CPUs,
155 * because with more CPUs the 'effective latency' as visible
156 * to users decreases. But the relationship is not linear,
157 * so pick a second-best guess by going with the log2 of the
158 * number of CPUs.
159 *
160 * This idea comes from the SD scheduler of Con Kolivas:
161 */
162 static unsigned int get_update_sysctl_factor(void)
163 {
164 unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
165 unsigned int factor;
166
167 switch (sysctl_sched_tunable_scaling) {
168 case SCHED_TUNABLESCALING_NONE:
169 factor = 1;
170 break;
171 case SCHED_TUNABLESCALING_LINEAR:
172 factor = cpus;
173 break;
174 case SCHED_TUNABLESCALING_LOG:
175 default:
176 factor = 1 + ilog2(cpus);
177 break;
178 }
179
180 return factor;
181 }
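/*
 * Illustrative factors produced above (cpus is clamped to 8):
 *
 *   online CPUs:                  1   2   4   8   16+
 *   SCHED_TUNABLESCALING_NONE:    1   1   1   1   1
 *   SCHED_TUNABLESCALING_LOG:     1   2   3   4   4
 *   SCHED_TUNABLESCALING_LINEAR:  1   2   4   8   8
 *
 * With the default LOG scaling on an 8-CPU system, update_sysctl() below
 * turns the normalized 6ms latency into an effective 24ms.
 */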
182
183 static void update_sysctl(void)
184 {
185 unsigned int factor = get_update_sysctl_factor();
186
187 #define SET_SYSCTL(name) \
188 (sysctl_##name = (factor) * normalized_sysctl_##name)
189 SET_SYSCTL(sched_min_granularity);
190 SET_SYSCTL(sched_latency);
191 SET_SYSCTL(sched_wakeup_granularity);
192 #undef SET_SYSCTL
193 }
194
195 void sched_init_granularity(void)
196 {
197 update_sysctl();
198 }
199
200 #define WMULT_CONST (~0U)
201 #define WMULT_SHIFT 32
202
203 static void __update_inv_weight(struct load_weight *lw)
204 {
205 unsigned long w;
206
207 if (likely(lw->inv_weight))
208 return;
209
210 w = scale_load_down(lw->weight);
211
212 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
213 lw->inv_weight = 1;
214 else if (unlikely(!w))
215 lw->inv_weight = WMULT_CONST;
216 else
217 lw->inv_weight = WMULT_CONST / w;
218 }
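/*
 * Example of the fixed-point inverse above, assuming the default load
 * resolution where a nice-0 weight scales down to 1024:
 * inv_weight = (2^32 - 1) / 1024 ~= 2^22, so a later multiply by
 * inv_weight and shift by WMULT_SHIFT approximates a divide by the weight.
 */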
219
220 /*
221 * delta_exec * weight / lw.weight
222 * OR
223 * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
224 *
225 * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
226 * we're guaranteed shift stays positive because inv_weight is guaranteed to
227 * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
228 *
229 * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
230 * weight/lw.weight <= 1, and therefore our shift will also be positive.
231 */
232 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
233 {
234 u64 fact = scale_load_down(weight);
235 int shift = WMULT_SHIFT;
236
237 __update_inv_weight(lw);
238
239 if (unlikely(fact >> 32)) {
240 while (fact >> 32) {
241 fact >>= 1;
242 shift--;
243 }
244 }
245
246 /* hint to use a 32x32->64 mul */
247 fact = (u64)(u32)fact * lw->inv_weight;
248
249 while (fact >> 32) {
250 fact >>= 1;
251 shift--;
252 }
253
254 return mul_u64_u32_shr(delta_exec, fact, shift);
255 }
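/*
 * Worked example of __calc_delta(), again assuming the default load
 * resolution: a nice-0 entity (scaled weight 1024) on a queue whose total
 * weight is 2048 has lw->inv_weight ~= 2^32/2048 = 2^21, so
 *
 *   fact  = 1024 * 2^21 ~= 2^31          (fits in 32 bits, no extra shift)
 *   delta = delta_exec * fact >> 32 ~= delta_exec / 2
 *
 * which matches the exact ratio weight/lw.weight = 1/2.
 */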
256
257
258 const struct sched_class fair_sched_class;
259
260 /**************************************************************
261 * CFS operations on generic schedulable entities:
262 */
263
264 #ifdef CONFIG_FAIR_GROUP_SCHED
265
266 /* cpu runqueue to which this cfs_rq is attached */
267 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 {
269 return cfs_rq->rq;
270 }
271
272 /* An entity is a task if it doesn't "own" a runqueue */
273 #define entity_is_task(se) (!se->my_q)
274
275 static inline struct task_struct *task_of(struct sched_entity *se)
276 {
277 #ifdef CONFIG_SCHED_DEBUG
278 WARN_ON_ONCE(!entity_is_task(se));
279 #endif
280 return container_of(se, struct task_struct, se);
281 }
282
283 /* Walk up scheduling entities hierarchy */
284 #define for_each_sched_entity(se) \
285 for (; se; se = se->parent)
286
287 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 {
289 return p->se.cfs_rq;
290 }
291
292 /* runqueue on which this entity is (to be) queued */
293 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 {
295 return se->cfs_rq;
296 }
297
298 /* runqueue "owned" by this group */
299 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 {
301 return grp->my_q;
302 }
303
304 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
305 {
306 if (!cfs_rq->on_list) {
307 struct rq *rq = rq_of(cfs_rq);
308 int cpu = cpu_of(rq);
309 /*
310 * Ensure we either appear before our parent (if already
311 * enqueued) or force our parent to appear after us when it is
312 * enqueued. The fact that we always enqueue bottom-up
313 * reduces this to two cases and a special case for the root
314 * cfs_rq. Furthermore, it also means that we will always reset
315 * tmp_alone_branch either when the branch is connected
316 * to a tree or when we reach the beginning of the tree.
317 */
318 if (cfs_rq->tg->parent &&
319 cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
320 /*
321 * If parent is already on the list, we add the child
322 * just before. Thanks to circular linked property of
323 * the list, this means to put the child at the tail
324 * of the list that starts by parent.
325 */
326 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
327 &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
328 /*
329 * The branch is now connected to its tree so we can
330 * reset tmp_alone_branch to the beginning of the
331 * list.
332 */
333 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
334 } else if (!cfs_rq->tg->parent) {
335 /*
336 * cfs rq without parent should be put
337 * at the tail of the list.
338 */
339 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
340 &rq->leaf_cfs_rq_list);
341 /*
342 * We have reached the beginning of a tree so we can reset
343 * tmp_alone_branch to the beginning of the list.
344 */
345 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
346 } else {
347 /*
348 * The parent has not already been added so we want to
349 * make sure that it will be put after us.
350 * tmp_alone_branch points to the beginning of the branch
351 * where we will add parent.
352 */
353 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
354 rq->tmp_alone_branch);
355 /*
356 * Update tmp_alone_branch to point to the new beginning
357 * of the branch.
358 */
359 rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
360 }
361
362 cfs_rq->on_list = 1;
363 }
364 }
365
366 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
367 {
368 if (cfs_rq->on_list) {
369 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 cfs_rq->on_list = 0;
371 }
372 }
373
374 /* Iterate through all leaf cfs_rqs on a runqueue */
375 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
376 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
377
378 /* Do the two (enqueued) entities belong to the same group ? */
379 static inline struct cfs_rq *
380 is_same_group(struct sched_entity *se, struct sched_entity *pse)
381 {
382 if (se->cfs_rq == pse->cfs_rq)
383 return se->cfs_rq;
384
385 return NULL;
386 }
387
388 static inline struct sched_entity *parent_entity(struct sched_entity *se)
389 {
390 return se->parent;
391 }
392
393 static void
394 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
395 {
396 int se_depth, pse_depth;
397
398 /*
399 * The preemption test can be made between sibling entities that are in the
400 * same cfs_rq, i.e. that have a common parent. Walk up the hierarchy of
401 * both tasks until we find ancestors that are siblings under a common
402 * parent.
403 */
404
405 /* First walk up until both entities are at same depth */
406 se_depth = (*se)->depth;
407 pse_depth = (*pse)->depth;
408
409 while (se_depth > pse_depth) {
410 se_depth--;
411 *se = parent_entity(*se);
412 }
413
414 while (pse_depth > se_depth) {
415 pse_depth--;
416 *pse = parent_entity(*pse);
417 }
418
419 while (!is_same_group(*se, *pse)) {
420 *se = parent_entity(*se);
421 *pse = parent_entity(*pse);
422 }
423 }
424
425 #else /* !CONFIG_FAIR_GROUP_SCHED */
426
427 static inline struct task_struct *task_of(struct sched_entity *se)
428 {
429 return container_of(se, struct task_struct, se);
430 }
431
432 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
433 {
434 return container_of(cfs_rq, struct rq, cfs);
435 }
436
437 #define entity_is_task(se) 1
438
439 #define for_each_sched_entity(se) \
440 for (; se; se = NULL)
441
442 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
443 {
444 return &task_rq(p)->cfs;
445 }
446
447 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
448 {
449 struct task_struct *p = task_of(se);
450 struct rq *rq = task_rq(p);
451
452 return &rq->cfs;
453 }
454
455 /* runqueue "owned" by this group */
456 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 {
458 return NULL;
459 }
460
461 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
462 {
463 }
464
465 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
466 {
467 }
468
469 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
470 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
471
472 static inline struct sched_entity *parent_entity(struct sched_entity *se)
473 {
474 return NULL;
475 }
476
477 static inline void
478 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
479 {
480 }
481
482 #endif /* CONFIG_FAIR_GROUP_SCHED */
483
484 static __always_inline
485 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
486
487 /**************************************************************
488 * Scheduling class tree data structure manipulation methods:
489 */
490
491 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
492 {
493 s64 delta = (s64)(vruntime - max_vruntime);
494 if (delta > 0)
495 max_vruntime = vruntime;
496
497 return max_vruntime;
498 }
499
500 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
501 {
502 s64 delta = (s64)(vruntime - min_vruntime);
503 if (delta < 0)
504 min_vruntime = vruntime;
505
506 return min_vruntime;
507 }
508
509 static inline int entity_before(struct sched_entity *a,
510 struct sched_entity *b)
511 {
512 return (s64)(a->vruntime - b->vruntime) < 0;
513 }
514
515 static void update_min_vruntime(struct cfs_rq *cfs_rq)
516 {
517 u64 vruntime = cfs_rq->min_vruntime;
518
519 if (cfs_rq->curr)
520 vruntime = cfs_rq->curr->vruntime;
521
522 if (cfs_rq->rb_leftmost) {
523 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 struct sched_entity,
525 run_node);
526
527 if (!cfs_rq->curr)
528 vruntime = se->vruntime;
529 else
530 vruntime = min_vruntime(vruntime, se->vruntime);
531 }
532
533 /* ensure we never gain time by being placed backwards. */
534 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
535 #ifndef CONFIG_64BIT
536 smp_wmb();
537 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 #endif
539 }
540
541 /*
542 * Enqueue an entity into the rb-tree:
543 */
544 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
545 {
546 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
547 struct rb_node *parent = NULL;
548 struct sched_entity *entry;
549 int leftmost = 1;
550
551 /*
552 * Find the right place in the rbtree:
553 */
554 while (*link) {
555 parent = *link;
556 entry = rb_entry(parent, struct sched_entity, run_node);
557 /*
558 * We don't care about collisions. Nodes with
559 * the same key stay together.
560 */
561 if (entity_before(se, entry)) {
562 link = &parent->rb_left;
563 } else {
564 link = &parent->rb_right;
565 leftmost = 0;
566 }
567 }
568
569 /*
570 * Maintain a cache of leftmost tree entries (it is frequently
571 * used):
572 */
573 if (leftmost)
574 cfs_rq->rb_leftmost = &se->run_node;
575
576 rb_link_node(&se->run_node, parent, link);
577 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
578 }
579
580 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
581 {
582 if (cfs_rq->rb_leftmost == &se->run_node) {
583 struct rb_node *next_node;
584
585 next_node = rb_next(&se->run_node);
586 cfs_rq->rb_leftmost = next_node;
587 }
588
589 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
590 }
591
592 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
593 {
594 struct rb_node *left = cfs_rq->rb_leftmost;
595
596 if (!left)
597 return NULL;
598
599 return rb_entry(left, struct sched_entity, run_node);
600 }
601
602 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
603 {
604 struct rb_node *next = rb_next(&se->run_node);
605
606 if (!next)
607 return NULL;
608
609 return rb_entry(next, struct sched_entity, run_node);
610 }
611
612 #ifdef CONFIG_SCHED_DEBUG
613 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
614 {
615 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616
617 if (!last)
618 return NULL;
619
620 return rb_entry(last, struct sched_entity, run_node);
621 }
622
623 /**************************************************************
624 * Scheduling class statistics methods:
625 */
626
627 int sched_proc_update_handler(struct ctl_table *table, int write,
628 void __user *buffer, size_t *lenp,
629 loff_t *ppos)
630 {
631 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
632 unsigned int factor = get_update_sysctl_factor();
633
634 if (ret || !write)
635 return ret;
636
637 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
638 sysctl_sched_min_granularity);
639
640 #define WRT_SYSCTL(name) \
641 (normalized_sysctl_##name = sysctl_##name / (factor))
642 WRT_SYSCTL(sched_min_granularity);
643 WRT_SYSCTL(sched_latency);
644 WRT_SYSCTL(sched_wakeup_granularity);
645 #undef WRT_SYSCTL
646
647 return 0;
648 }
649 #endif
650
651 /*
652 * delta /= w
653 */
654 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
655 {
656 if (unlikely(se->load.weight != NICE_0_LOAD))
657 delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
658
659 return delta;
660 }
661
662 /*
663 * The idea is to set a period in which each task runs once.
664 *
665 * When there are too many tasks (sched_nr_latency) we have to stretch
666 * this period because otherwise the slices get too small.
667 *
668 * p = (nr <= nl) ? l : l*nr/nl
669 */
670 static u64 __sched_period(unsigned long nr_running)
671 {
672 if (unlikely(nr_running > sched_nr_latency))
673 return nr_running * sysctl_sched_min_granularity;
674 else
675 return sysctl_sched_latency;
676 }
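/*
 * Example, using the unscaled defaults (6ms latency, 0.75ms minimum
 * granularity, sched_nr_latency = 8):
 *
 *   nr_running =  5  ->  period = 6ms (all tasks fit in the latency target)
 *   nr_running = 16  ->  period = 16 * 0.75ms = 12ms (period is stretched)
 */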
677
678 /*
679 * We calculate the wall-time slice from the period by taking a part
680 * proportional to the weight.
681 *
682 * s = p*P[w/rw]
683 */
684 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
685 {
686 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
687
688 for_each_sched_entity(se) {
689 struct load_weight *load;
690 struct load_weight lw;
691
692 cfs_rq = cfs_rq_of(se);
693 load = &cfs_rq->load;
694
695 if (unlikely(!se->on_rq)) {
696 lw = cfs_rq->load;
697
698 update_load_add(&lw, se->load.weight);
699 load = &lw;
700 }
701 slice = __calc_delta(slice, se->load.weight, load);
702 }
703 return slice;
704 }
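/*
 * Example: two runnable nice-0 tasks split the unscaled 6ms period evenly,
 * so each gets a ~3ms wall-time slice. If one task's weight were doubled
 * (2048 vs 1024 out of 3072), its slice would grow to ~4ms and the
 * other's would shrink to ~2ms.
 */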
705
706 /*
707 * We calculate the vruntime slice of a to-be-inserted task.
708 *
709 * vs = s/w
710 */
711 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
712 {
713 return calc_delta_fair(sched_slice(cfs_rq, se), se);
714 }
715
716 #ifdef CONFIG_SMP
717 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
718 static unsigned long task_h_load(struct task_struct *p);
719
720 /*
721 * We choose a half-life close to 1 scheduling period.
722 * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
723 * dependent on this value.
724 */
725 #define LOAD_AVG_PERIOD 32
726 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
727 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
728
729 /* Give a new sched_entity starting runnable values so it appears heavily loaded during its infancy */
730 void init_entity_runnable_average(struct sched_entity *se)
731 {
732 struct sched_avg *sa = &se->avg;
733
734 sa->last_update_time = 0;
735 /*
736 * sched_avg's period_contrib should be strictly less than 1024, so
737 * we give it 1023 to make sure it is almost a period (1024us), and
738 * will definitely be updated (after enqueue).
739 */
740 sa->period_contrib = 1023;
741 /*
742 * Tasks are initialized with full load to be seen as heavy tasks until
743 * they get a chance to stabilize to their real load level.
744 * Group entities are initialized with zero load to reflect the fact that
745 * nothing has been attached to the task group yet.
746 */
747 if (entity_is_task(se))
748 sa->load_avg = scale_load_down(se->load.weight);
749 sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
750 /*
751 * In previous Android versions, we used to have:
752 * sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
753 * sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
754 * However, that functionality has been moved to enqueue.
755 * It is unclear if we should restore this in enqueue.
756 */
757 /*
758 * At this point, util_avg won't be used in select_task_rq_fair anyway
759 */
760 sa->util_avg = 0;
761 sa->util_sum = 0;
762 /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
763 }
764
765 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
766 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
767 static void attach_entity_cfs_rq(struct sched_entity *se);
768 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
769
770 /*
771 * With new tasks being created, their initial util_avgs are extrapolated
772 * based on the cfs_rq's current util_avg:
773 *
774 * util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
775 *
776 * However, in many cases, the above util_avg does not give a desired
777 * value. Moreover, the sum of the util_avgs may be divergent, such
778 * as when the series is a harmonic series.
779 *
780 * To solve this problem, we also cap the util_avg of successive tasks to
781 * only 1/2 of the left utilization budget:
782 *
783 * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
784 *
785 * where n denotes the nth task.
786 *
787 * For example, a simplest series from the beginning would be like:
788 *
789 * task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
790 * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
791 *
792 * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
793 * if util_avg > util_avg_cap.
794 */
795 void post_init_entity_util_avg(struct sched_entity *se)
796 {
797 struct cfs_rq *cfs_rq = cfs_rq_of(se);
798 struct sched_avg *sa = &se->avg;
799 long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
800
801 if (cap > 0) {
802 if (cfs_rq->avg.util_avg != 0) {
803 sa->util_avg = cfs_rq->avg.util_avg * se->load.weight;
804 sa->util_avg /= (cfs_rq->avg.load_avg + 1);
805
806 if (sa->util_avg > cap)
807 sa->util_avg = cap;
808 } else {
809 sa->util_avg = cap;
810 }
811 /*
812 * If we wish to restore tuning via setting initial util,
813 * this is where we should do it.
814 */
815 sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
816 }
817
818 if (entity_is_task(se)) {
819 struct task_struct *p = task_of(se);
820 if (p->sched_class != &fair_sched_class) {
821 /*
822 * For !fair tasks do:
823 *
824 update_cfs_rq_load_avg(now, cfs_rq, false);
825 attach_entity_load_avg(cfs_rq, se);
826 switched_from_fair(rq, p);
827 *
828 * such that the next switched_to_fair() has the
829 * expected state.
830 */
831 se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 return;
833 }
834 }
835
836 attach_entity_cfs_rq(se);
837 }
838
839 #else /* !CONFIG_SMP */
840 void init_entity_runnable_average(struct sched_entity *se)
841 {
842 }
843 void post_init_entity_util_avg(struct sched_entity *se)
844 {
845 }
846 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
847 {
848 }
849 #endif /* CONFIG_SMP */
850
851 /*
852 * Update the current task's runtime statistics.
853 */
854 static void update_curr(struct cfs_rq *cfs_rq)
855 {
856 struct sched_entity *curr = cfs_rq->curr;
857 u64 now = rq_clock_task(rq_of(cfs_rq));
858 u64 delta_exec;
859
860 if (unlikely(!curr))
861 return;
862
863 delta_exec = now - curr->exec_start;
864 if (unlikely((s64)delta_exec <= 0))
865 return;
866
867 curr->exec_start = now;
868
869 schedstat_set(curr->statistics.exec_max,
870 max(delta_exec, curr->statistics.exec_max));
871
872 curr->sum_exec_runtime += delta_exec;
873 schedstat_add(cfs_rq, exec_clock, delta_exec);
874
875 curr->vruntime += calc_delta_fair(delta_exec, curr);
876 update_min_vruntime(cfs_rq);
877
878 if (entity_is_task(curr)) {
879 struct task_struct *curtask = task_of(curr);
880
881 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
882 cpuacct_charge(curtask, delta_exec);
883 account_group_exec_runtime(curtask, delta_exec);
884 }
885
886 account_cfs_rq_runtime(cfs_rq, delta_exec);
887 }
888
889 static void update_curr_fair(struct rq *rq)
890 {
891 update_curr(cfs_rq_of(&rq->curr->se));
892 }
893
894 static inline void
895 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
896 {
897 schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
898 }
899
900 /*
901 * Task is being enqueued - update stats:
902 */
903 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
904 {
905 /*
906 * Are we enqueueing a waiting task? (for current tasks
907 * a dequeue/enqueue event is a NOP)
908 */
909 if (se != cfs_rq->curr)
910 update_stats_wait_start(cfs_rq, se);
911 }
912
913 static void
914 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
915 {
916 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
917 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
918 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
919 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
920 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
921 #ifdef CONFIG_SCHEDSTATS
922 if (entity_is_task(se)) {
923 trace_sched_stat_wait(task_of(se),
924 rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
925 }
926 #endif
927 schedstat_set(se->statistics.wait_start, 0);
928 }
929
930 static inline void
931 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
932 {
933 /*
934 * Mark the end of the wait period if dequeueing a
935 * waiting task:
936 */
937 if (se != cfs_rq->curr)
938 update_stats_wait_end(cfs_rq, se);
939 }
940
941 /*
942 * We are picking a new current task - update its stats:
943 */
944 static inline void
945 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 {
947 /*
948 * We are starting a new run period:
949 */
950 se->exec_start = rq_clock_task(rq_of(cfs_rq));
951 }
952
953 /**************************************************
954 * Scheduling class queueing methods:
955 */
956
957 #ifdef CONFIG_NUMA_BALANCING
958 /*
959 * Approximate time to scan a full NUMA task in ms. The task scan period is
960 * calculated based on the task's virtual memory size and
961 * numa_balancing_scan_size.
962 */
963 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
964 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
965
966 /* Portion of address space to scan in MB */
967 unsigned int sysctl_numa_balancing_scan_size = 256;
968
969 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
970 unsigned int sysctl_numa_balancing_scan_delay = 1000;
971
972 static unsigned int task_nr_scan_windows(struct task_struct *p)
973 {
974 unsigned long rss = 0;
975 unsigned long nr_scan_pages;
976
977 /*
978 * Calculations based on RSS as non-present and empty pages are skipped
979 * by the PTE scanner and NUMA hinting faults should be trapped based
980 * on resident pages
981 */
982 nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
983 rss = get_mm_rss(p->mm);
984 if (!rss)
985 rss = nr_scan_pages;
986
987 rss = round_up(rss, nr_scan_pages);
988 return rss / nr_scan_pages;
989 }
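/*
 * Example of the window arithmetic above, assuming 4KB pages: the default
 * 256MB scan size gives nr_scan_pages = 256 << (20 - 12) = 65536 pages,
 * so a task with a 1GB RSS (262144 pages) covers its resident memory in
 * 262144 / 65536 = 4 scan windows.
 */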
990
991 /* For sanity's sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
992 #define MAX_SCAN_WINDOW 2560
993
994 static unsigned int task_scan_min(struct task_struct *p)
995 {
996 unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
997 unsigned int scan, floor;
998 unsigned int windows = 1;
999
1000 if (scan_size < MAX_SCAN_WINDOW)
1001 windows = MAX_SCAN_WINDOW / scan_size;
1002 floor = 1000 / windows;
1003
1004 scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1005 return max_t(unsigned int, floor, scan);
1006 }
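/*
 * Example: with the default 256MB scan size, windows = 2560/256 = 10 and
 * floor = 1000/10 = 100ms. For the 4-window task from the example above,
 * scan = 1000ms / 4 = 250ms, so task_scan_min() returns 250ms; a much
 * larger task can be pushed down to, but never below, the 100ms floor.
 */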
1007
1008 static unsigned int task_scan_max(struct task_struct *p)
1009 {
1010 unsigned int smin = task_scan_min(p);
1011 unsigned int smax;
1012
1013 /* Watch for min being lower than max due to floor calculations */
1014 smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1015 return max(smin, smax);
1016 }
1017
1018 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1019 {
1020 rq->nr_numa_running += (p->numa_preferred_nid != -1);
1021 rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1022 }
1023
1024 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1025 {
1026 rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1027 rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1028 }
1029
1030 struct numa_group {
1031 atomic_t refcount;
1032
1033 spinlock_t lock; /* nr_tasks, tasks */
1034 int nr_tasks;
1035 pid_t gid;
1036
1037 struct rcu_head rcu;
1038 nodemask_t active_nodes;
1039 unsigned long total_faults;
1040 /*
1041 * Faults_cpu is used to decide whether memory should move
1042 * towards the CPU. As a consequence, these stats are weighted
1043 * more by CPU use than by memory faults.
1044 */
1045 unsigned long *faults_cpu;
1046 unsigned long faults[0];
1047 };
1048
1049 /* Shared or private faults. */
1050 #define NR_NUMA_HINT_FAULT_TYPES 2
1051
1052 /* Memory and CPU locality */
1053 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1054
1055 /* Averaged statistics, and temporary buffers. */
1056 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1057
1058 pid_t task_numa_group_id(struct task_struct *p)
1059 {
1060 return p->numa_group ? p->numa_group->gid : 0;
1061 }
1062
1063 /*
1064 * The averaged statistics, shared & private, memory & cpu,
1065 * occupy the first half of the array. The second half of the
1066 * array is for current counters, which are averaged into the
1067 * first set by task_numa_placement.
1068 */
1069 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1070 {
1071 return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1072 }
1073
1074 static inline unsigned long task_faults(struct task_struct *p, int nid)
1075 {
1076 if (!p->numa_faults)
1077 return 0;
1078
1079 return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1080 p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1081 }
1082
1083 static inline unsigned long group_faults(struct task_struct *p, int nid)
1084 {
1085 if (!p->numa_group)
1086 return 0;
1087
1088 return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1089 p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1090 }
1091
1092 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1093 {
1094 return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1095 group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1096 }
1097
1098 /* Handle placement on systems where not all nodes are directly connected. */
1099 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1100 int maxdist, bool task)
1101 {
1102 unsigned long score = 0;
1103 int node;
1104
1105 /*
1106 * All nodes are directly connected, and the same distance
1107 * from each other. No need for fancy placement algorithms.
1108 */
1109 if (sched_numa_topology_type == NUMA_DIRECT)
1110 return 0;
1111
1112 /*
1113 * This code is called for each node, introducing N^2 complexity,
1114 * which should be ok given the number of nodes rarely exceeds 8.
1115 */
1116 for_each_online_node(node) {
1117 unsigned long faults;
1118 int dist = node_distance(nid, node);
1119
1120 /*
1121 * The furthest away nodes in the system are not interesting
1122 * for placement; nid was already counted.
1123 */
1124 if (dist == sched_max_numa_distance || node == nid)
1125 continue;
1126
1127 /*
1128 * On systems with a backplane NUMA topology, compare groups
1129 * of nodes, and move tasks towards the group with the most
1130 * memory accesses. When comparing two nodes at distance
1131 * "hoplimit", only nodes closer by than "hoplimit" are part
1132 * of each group. Skip other nodes.
1133 */
1134 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1135 dist > maxdist)
1136 continue;
1137
1138 /* Add up the faults from nearby nodes. */
1139 if (task)
1140 faults = task_faults(p, node);
1141 else
1142 faults = group_faults(p, node);
1143
1144 /*
1145 * On systems with a glueless mesh NUMA topology, there are
1146 * no fixed "groups of nodes". Instead, nodes that are not
1147 * directly connected bounce traffic through intermediate
1148 * nodes; a numa_group can occupy any set of nodes.
1149 * The further away a node is, the less the faults count.
1150 * This seems to result in good task placement.
1151 */
1152 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1153 faults *= (sched_max_numa_distance - dist);
1154 faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1155 }
1156
1157 score += faults;
1158 }
1159
1160 return score;
1161 }
1162
1163 /*
1164 * These return the fraction of accesses done by a particular task, or
1165 * task group, on a particular numa node. The group weight is given a
1166 * larger multiplier, in order to group tasks together that are almost
1167 * evenly spread out between numa nodes.
1168 */
1169 static inline unsigned long task_weight(struct task_struct *p, int nid,
1170 int dist)
1171 {
1172 unsigned long faults, total_faults;
1173
1174 if (!p->numa_faults)
1175 return 0;
1176
1177 total_faults = p->total_numa_faults;
1178
1179 if (!total_faults)
1180 return 0;
1181
1182 faults = task_faults(p, nid);
1183 faults += score_nearby_nodes(p, nid, dist, true);
1184
1185 return 1000 * faults / total_faults;
1186 }
1187
1188 static inline unsigned long group_weight(struct task_struct *p, int nid,
1189 int dist)
1190 {
1191 unsigned long faults, total_faults;
1192
1193 if (!p->numa_group)
1194 return 0;
1195
1196 total_faults = p->numa_group->total_faults;
1197
1198 if (!total_faults)
1199 return 0;
1200
1201 faults = group_faults(p, nid);
1202 faults += score_nearby_nodes(p, nid, dist, false);
1203
1204 return 1000 * faults / total_faults;
1205 }
1206
1207 bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
1208 int src_nid, int dst_cpu)
1209 {
1210 struct numa_group *ng = p->numa_group;
1211 int dst_nid = cpu_to_node(dst_cpu);
1212 int last_cpupid, this_cpupid;
1213
1214 this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1215
1216 /*
1217 * Multi-stage node selection is used in conjunction with a periodic
1218 * migration fault to build a temporal task<->page relation. By using
1219 * a two-stage filter we remove short/unlikely relations.
1220 *
1221 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1222 * a task's usage of a particular page (n_p) per total usage of this
1223 * page (n_t) (in a given time-span) to a probability.
1224 *
1225 * Our periodic faults will sample this probability and getting the
1226 * same result twice in a row, given these samples are fully
1227 * independent, is then given by P(n)^2, provided our sample period
1228 * is sufficiently short compared to the usage pattern.
1229 *
1230 * This quadratic squishes small probabilities, making it less likely we
1231 * act on an unlikely task<->page relation.
1232 */
1233 last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1234 if (!cpupid_pid_unset(last_cpupid) &&
1235 cpupid_to_nid(last_cpupid) != dst_nid)
1236 return false;
1237
1238 /* Always allow migrate on private faults */
1239 if (cpupid_match_pid(p, last_cpupid))
1240 return true;
1241
1242 /* A shared fault, but p->numa_group has not been set up yet. */
1243 if (!ng)
1244 return true;
1245
1246 /*
1247 * Do not migrate if the destination is not a node that
1248 * is actively used by this numa group.
1249 */
1250 if (!node_isset(dst_nid, ng->active_nodes))
1251 return false;
1252
1253 /*
1254 * Source is a node that is not actively used by this
1255 * numa group, while the destination is. Migrate.
1256 */
1257 if (!node_isset(src_nid, ng->active_nodes))
1258 return true;
1259
1260 /*
1261 * Both source and destination are nodes in active
1262 * use by this numa group. Maximize memory bandwidth
1263 * by migrating from more heavily used groups, to less
1264 * heavily used ones, spreading the load around.
1265 * Use a 1/4 hysteresis to avoid spurious page movement.
1266 */
1267 return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1268 }
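/*
 * Worked example of the 1/4 hysteresis above: if the group has 100 faults
 * on the source node, the page is only migrated while the group's faults
 * on the destination node are below 100 * 3/4 = 75, which avoids
 * ping-ponging pages between nodes with nearly equal usage.
 */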
1269
1270 static unsigned long weighted_cpuload(const int cpu);
1271 static unsigned long source_load(int cpu, int type);
1272 static unsigned long target_load(int cpu, int type);
1273 static unsigned long capacity_of(int cpu);
1274 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1275
1276 /* Cached statistics for all CPUs within a node */
1277 struct numa_stats {
1278 unsigned long nr_running;
1279 unsigned long load;
1280
1281 /* Total compute capacity of CPUs on a node */
1282 unsigned long compute_capacity;
1283
1284 /* Approximate capacity in terms of runnable tasks on a node */
1285 unsigned long task_capacity;
1286 int has_free_capacity;
1287 };
1288
1289 /*
1290 * XXX borrowed from update_sg_lb_stats
1291 */
1292 static void update_numa_stats(struct numa_stats *ns, int nid)
1293 {
1294 int smt, cpu, cpus = 0;
1295 unsigned long capacity;
1296
1297 memset(ns, 0, sizeof(*ns));
1298 for_each_cpu(cpu, cpumask_of_node(nid)) {
1299 struct rq *rq = cpu_rq(cpu);
1300
1301 ns->nr_running += rq->nr_running;
1302 ns->load += weighted_cpuload(cpu);
1303 ns->compute_capacity += capacity_of(cpu);
1304
1305 cpus++;
1306 }
1307
1308 /*
1309 * If we raced with hotplug and there are no CPUs left in our mask
1310 * the @ns structure is NULL'ed and task_numa_compare() will
1311 * not find this node attractive.
1312 *
1313 * We'll either bail at !has_free_capacity, or we'll detect a huge
1314 * imbalance and bail there.
1315 */
1316 if (!cpus)
1317 return;
1318
1319 /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1320 smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1321 capacity = cpus / smt; /* cores */
1322
1323 ns->task_capacity = min_t(unsigned, capacity,
1324 DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1325 ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1326 }
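/*
 * Illustrative numbers for the capacity estimate above: a node with 8
 * logical CPUs where each SMT sibling contributes ~589 capacity units has
 * compute_capacity ~= 4712, so smt = DIV_ROUND_UP(1024 * 8, 4712) = 2,
 * capacity = 8 / 2 = 4 cores, task_capacity = 4, and has_free_capacity
 * stays set while fewer than 4 tasks run on the node.
 */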
1327
1328 struct task_numa_env {
1329 struct task_struct *p;
1330
1331 int src_cpu, src_nid;
1332 int dst_cpu, dst_nid;
1333
1334 struct numa_stats src_stats, dst_stats;
1335
1336 int imbalance_pct;
1337 int dist;
1338
1339 struct task_struct *best_task;
1340 long best_imp;
1341 int best_cpu;
1342 };
1343
1344 static void task_numa_assign(struct task_numa_env *env,
1345 struct task_struct *p, long imp)
1346 {
1347 if (env->best_task)
1348 put_task_struct(env->best_task);
1349
1350 env->best_task = p;
1351 env->best_imp = imp;
1352 env->best_cpu = env->dst_cpu;
1353 }
1354
1355 static bool load_too_imbalanced(long src_load, long dst_load,
1356 struct task_numa_env *env)
1357 {
1358 long imb, old_imb;
1359 long orig_src_load, orig_dst_load;
1360 long src_capacity, dst_capacity;
1361
1362 /*
1363 * The load is corrected for the CPU capacity available on each node.
1364 *
1365 * src_load dst_load
1366 * ------------ vs ---------
1367 * src_capacity dst_capacity
1368 */
1369 src_capacity = env->src_stats.compute_capacity;
1370 dst_capacity = env->dst_stats.compute_capacity;
1371
1372 /* We care about the slope of the imbalance, not the direction. */
1373 if (dst_load < src_load)
1374 swap(dst_load, src_load);
1375
1376 /* Is the difference below the threshold? */
1377 imb = dst_load * src_capacity * 100 -
1378 src_load * dst_capacity * env->imbalance_pct;
1379 if (imb <= 0)
1380 return false;
1381
1382 /*
1383 * The imbalance is above the allowed threshold.
1384 * Compare it with the old imbalance.
1385 */
1386 orig_src_load = env->src_stats.load;
1387 orig_dst_load = env->dst_stats.load;
1388
1389 if (orig_dst_load < orig_src_load)
1390 swap(orig_dst_load, orig_src_load);
1391
1392 old_imb = orig_dst_load * src_capacity * 100 -
1393 orig_src_load * dst_capacity * env->imbalance_pct;
1394
1395 /* Would this change make things worse? */
1396 return (imb > old_imb);
1397 }
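/*
 * Numeric example of the check above, with the default imbalance_pct of
 * 112 and equal node capacities of 1024: a move that leaves
 * dst_load = 1200 vs src_load = 1000 gives
 * imb = 1200*1024*100 - 1000*1024*112 > 0, i.e. a ~20% imbalance exceeds
 * the ~12% threshold and is only tolerated if the pre-move imbalance
 * (old_imb) was already at least as bad.
 */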
1398
1399 /*
1400 * This checks if the overall compute and NUMA accesses of the system would
1401 * be improved if the source tasks was migrated to the target dst_cpu taking
1402 * into account that it might be best if task running on the dst_cpu should
1403 * be exchanged with the source task
1404 */
1405 static void task_numa_compare(struct task_numa_env *env,
1406 long taskimp, long groupimp)
1407 {
1408 struct rq *src_rq = cpu_rq(env->src_cpu);
1409 struct rq *dst_rq = cpu_rq(env->dst_cpu);
1410 struct task_struct *cur;
1411 long src_load, dst_load;
1412 long load;
1413 long imp = env->p->numa_group ? groupimp : taskimp;
1414 long moveimp = imp;
1415 int dist = env->dist;
1416 bool assigned = false;
1417
1418 rcu_read_lock();
1419
1420 raw_spin_lock_irq(&dst_rq->lock);
1421 cur = dst_rq->curr;
1422 /*
1423 * No need to move the exiting task or idle task.
1424 */
1425 if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1426 cur = NULL;
1427 else {
1428 /*
1429 * The task_struct must be protected here to protect the
1430 * p->numa_faults access in the task_weight since the
1431 * numa_faults could already be freed in the following path:
1432 * finish_task_switch()
1433 * --> put_task_struct()
1434 * --> __put_task_struct()
1435 * --> task_numa_free()
1436 */
1437 get_task_struct(cur);
1438 }
1439
1440 raw_spin_unlock_irq(&dst_rq->lock);
1441
1442 /*
1443 * Because we have preemption enabled we can get migrated around and
1444 * end up trying to select ourselves (current == env->p) as a swap candidate.
1445 */
1446 if (cur == env->p)
1447 goto unlock;
1448
1449 /*
1450 * "imp" is the fault differential for the source task between the
1451 * source and destination node. Calculate the total differential for
1452 * the source task and potential destination task. The more negative
1453 * the value is, the more remote accesses that would be expected to
1454 * be incurred if the tasks were swapped.
1455 */
1456 if (cur) {
1457 /* Skip this swap candidate if cannot move to the source cpu */
1458 if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1459 goto unlock;
1460
1461 /*
1462 * If dst and source tasks are in the same NUMA group, or not
1463 * in any group then look only at task weights.
1464 */
1465 if (cur->numa_group == env->p->numa_group) {
1466 imp = taskimp + task_weight(cur, env->src_nid, dist) -
1467 task_weight(cur, env->dst_nid, dist);
1468 /*
1469 * Add some hysteresis to prevent swapping the
1470 * tasks within a group over tiny differences.
1471 */
1472 if (cur->numa_group)
1473 imp -= imp/16;
1474 } else {
1475 /*
1476 * Compare the group weights. If a task is all by
1477 * itself (not part of a group), use the task weight
1478 * instead.
1479 */
1480 if (cur->numa_group)
1481 imp += group_weight(cur, env->src_nid, dist) -
1482 group_weight(cur, env->dst_nid, dist);
1483 else
1484 imp += task_weight(cur, env->src_nid, dist) -
1485 task_weight(cur, env->dst_nid, dist);
1486 }
1487 }
1488
1489 if (imp <= env->best_imp && moveimp <= env->best_imp)
1490 goto unlock;
1491
1492 if (!cur) {
1493 /* Is there capacity at our destination? */
1494 if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1495 !env->dst_stats.has_free_capacity)
1496 goto unlock;
1497
1498 goto balance;
1499 }
1500
1501 /* Balance doesn't matter much if we're running a task per cpu */
1502 if (imp > env->best_imp && src_rq->nr_running == 1 &&
1503 dst_rq->nr_running == 1)
1504 goto assign;
1505
1506 /*
1507 * In the overloaded case, try and keep the load balanced.
1508 */
1509 balance:
1510 load = task_h_load(env->p);
1511 dst_load = env->dst_stats.load + load;
1512 src_load = env->src_stats.load - load;
1513
1514 if (moveimp > imp && moveimp > env->best_imp) {
1515 /*
1516 * If the improvement from just moving env->p direction is
1517 * better than swapping tasks around, check if a move is
1518 * possible. Store a slightly smaller score than moveimp,
1519 * so an actually idle CPU will win.
1520 */
1521 if (!load_too_imbalanced(src_load, dst_load, env)) {
1522 imp = moveimp - 1;
1523 put_task_struct(cur);
1524 cur = NULL;
1525 goto assign;
1526 }
1527 }
1528
1529 if (imp <= env->best_imp)
1530 goto unlock;
1531
1532 if (cur) {
1533 load = task_h_load(cur);
1534 dst_load -= load;
1535 src_load += load;
1536 }
1537
1538 if (load_too_imbalanced(src_load, dst_load, env))
1539 goto unlock;
1540
1541 /*
1542 * One idle CPU per node is evaluated for a task numa move.
1543 * Call select_idle_sibling to maybe find a better one.
1544 */
1545 if (!cur)
1546 env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1547 env->dst_cpu);
1548
1549 assign:
1550 assigned = true;
1551 task_numa_assign(env, cur, imp);
1552 unlock:
1553 rcu_read_unlock();
1554 /*
1555 * If cur was not assigned as the swap candidate above, the extra
1556 * task_struct reference taken earlier is no longer needed; drop it.
1557 */
1558 if (cur && !assigned)
1559 put_task_struct(cur);
1560 }
1561
1562 static void task_numa_find_cpu(struct task_numa_env *env,
1563 long taskimp, long groupimp)
1564 {
1565 int cpu;
1566
1567 for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1568 /* Skip this CPU if the source task cannot migrate */
1569 if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1570 continue;
1571
1572 env->dst_cpu = cpu;
1573 task_numa_compare(env, taskimp, groupimp);
1574 }
1575 }
1576
1577 /* Only move tasks to a NUMA node less busy than the current node. */
1578 static bool numa_has_capacity(struct task_numa_env *env)
1579 {
1580 struct numa_stats *src = &env->src_stats;
1581 struct numa_stats *dst = &env->dst_stats;
1582
1583 if (src->has_free_capacity && !dst->has_free_capacity)
1584 return false;
1585
1586 /*
1587 * Only consider a task move if the source has a higher load
1588 * than the destination, corrected for CPU capacity on each node.
1589 *
1590 * src->load dst->load
1591 * --------------------- vs ---------------------
1592 * src->compute_capacity dst->compute_capacity
1593 */
1594 if (src->load * dst->compute_capacity * env->imbalance_pct >
1595
1596 dst->load * src->compute_capacity * 100)
1597 return true;
1598
1599 return false;
1600 }
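/*
 * Example: with imbalance_pct = 112 and equal compute capacity on both
 * nodes, the condition above reduces to src->load * 112 > dst->load * 100,
 * i.e. a task move is only considered when the source node is more than
 * roughly 12% busier than the destination.
 */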
1601
1602 static int task_numa_migrate(struct task_struct *p)
1603 {
1604 struct task_numa_env env = {
1605 .p = p,
1606
1607 .src_cpu = task_cpu(p),
1608 .src_nid = task_node(p),
1609
1610 .imbalance_pct = 112,
1611
1612 .best_task = NULL,
1613 .best_imp = 0,
1614 .best_cpu = -1
1615 };
1616 struct sched_domain *sd;
1617 unsigned long taskweight, groupweight;
1618 int nid, ret, dist;
1619 long taskimp, groupimp;
1620
1621 /*
1622 * Pick the lowest SD_NUMA domain, as that would have the smallest
1623 * imbalance and would be the first to start moving tasks about.
1624 *
1625 * And we want to avoid any moving of tasks about, as that would create
1626 * random movement of tasks -- counter the numa conditions we're trying
1627 * to satisfy here.
1628 */
1629 rcu_read_lock();
1630 sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1631 if (sd)
1632 env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1633 rcu_read_unlock();
1634
1635 /*
1636 * Cpusets can break the scheduler domain tree into smaller
1637 * balance domains, some of which do not cross NUMA boundaries.
1638 * Tasks that are "trapped" in such domains cannot be migrated
1639 * elsewhere, so there is no point in (re)trying.
1640 */
1641 if (unlikely(!sd)) {
1642 p->numa_preferred_nid = task_node(p);
1643 return -EINVAL;
1644 }
1645
1646 env.dst_nid = p->numa_preferred_nid;
1647 dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1648 taskweight = task_weight(p, env.src_nid, dist);
1649 groupweight = group_weight(p, env.src_nid, dist);
1650 update_numa_stats(&env.src_stats, env.src_nid);
1651 taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1652 groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1653 update_numa_stats(&env.dst_stats, env.dst_nid);
1654
1655 /* Try to find a spot on the preferred nid. */
1656 if (numa_has_capacity(&env))
1657 task_numa_find_cpu(&env, taskimp, groupimp);
1658
1659 /*
1660 * Look at other nodes in these cases:
1661 * - there is no space available on the preferred_nid
1662 * - the task is part of a numa_group that is interleaved across
1663 * multiple NUMA nodes; in order to better consolidate the group,
1664 * we need to check other locations.
1665 */
1666 if (env.best_cpu == -1 || (p->numa_group &&
1667 nodes_weight(p->numa_group->active_nodes) > 1)) {
1668 for_each_online_node(nid) {
1669 if (nid == env.src_nid || nid == p->numa_preferred_nid)
1670 continue;
1671
1672 dist = node_distance(env.src_nid, env.dst_nid);
1673 if (sched_numa_topology_type == NUMA_BACKPLANE &&
1674 dist != env.dist) {
1675 taskweight = task_weight(p, env.src_nid, dist);
1676 groupweight = group_weight(p, env.src_nid, dist);
1677 }
1678
1679 /* Only consider nodes where both task and groups benefit */
1680 taskimp = task_weight(p, nid, dist) - taskweight;
1681 groupimp = group_weight(p, nid, dist) - groupweight;
1682 if (taskimp < 0 && groupimp < 0)
1683 continue;
1684
1685 env.dist = dist;
1686 env.dst_nid = nid;
1687 update_numa_stats(&env.dst_stats, env.dst_nid);
1688 if (numa_has_capacity(&env))
1689 task_numa_find_cpu(&env, taskimp, groupimp);
1690 }
1691 }
1692
1693 /*
1694 * If the task is part of a workload that spans multiple NUMA nodes,
1695 * and is migrating into one of the workload's active nodes, remember
1696 * this node as the task's preferred numa node, so the workload can
1697 * settle down.
1698 * A task that migrated to a second choice node will be better off
1699 * trying for a better one later. Do not set the preferred node here.
1700 */
1701 if (p->numa_group) {
1702 if (env.best_cpu == -1)
1703 nid = env.src_nid;
1704 else
1705 nid = env.dst_nid;
1706
1707 if (node_isset(nid, p->numa_group->active_nodes))
1708 sched_setnuma(p, env.dst_nid);
1709 }
1710
1711 /* No better CPU than the current one was found. */
1712 if (env.best_cpu == -1)
1713 return -EAGAIN;
1714
1715 /*
1716 * Reset the scan period if the task is being rescheduled on an
1717 * alternative node to recheck if the task is now properly placed.
1718 */
1719 p->numa_scan_period = task_scan_min(p);
1720
1721 if (env.best_task == NULL) {
1722 ret = migrate_task_to(p, env.best_cpu);
1723 if (ret != 0)
1724 trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1725 return ret;
1726 }
1727
1728 ret = migrate_swap(p, env.best_task);
1729 if (ret != 0)
1730 trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1731 put_task_struct(env.best_task);
1732 return ret;
1733 }
1734
1735 /* Attempt to migrate a task to a CPU on the preferred node. */
1736 static void numa_migrate_preferred(struct task_struct *p)
1737 {
1738 unsigned long interval = HZ;
1739
1740 /* This task has no NUMA fault statistics yet */
1741 if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1742 return;
1743
1744 /* Periodically retry migrating the task to the preferred node */
1745 interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1746 p->numa_migrate_retry = jiffies + interval;
1747
1748 /* Success if task is already running on preferred CPU */
1749 if (task_node(p) == p->numa_preferred_nid)
1750 return;
1751
1752 /* Otherwise, try migrate to a CPU on the preferred node */
1753 task_numa_migrate(p);
1754 }
1755
1756 /*
1757 * Find the nodes on which the workload is actively running. We do this by
1758 * tracking the nodes from which NUMA hinting faults are triggered. This can
1759 * be different from the set of nodes where the workload's memory is currently
1760 * located.
1761 *
1762 * The bitmask is used to make smarter decisions on when to do NUMA page
1763 * migrations. To prevent flip-flopping and excessive page migrations, nodes
1764 * are added when they cause over 6/16 of the maximum number of faults, but
1765 * only removed when they drop below 3/16.
1766 */
1767 static void update_numa_active_node_mask(struct numa_group *numa_group)
1768 {
1769 unsigned long faults, max_faults = 0;
1770 int nid;
1771
1772 for_each_online_node(nid) {
1773 faults = group_faults_cpu(numa_group, nid);
1774 if (faults > max_faults)
1775 max_faults = faults;
1776 }
1777
1778 for_each_online_node(nid) {
1779 faults = group_faults_cpu(numa_group, nid);
1780 if (!node_isset(nid, numa_group->active_nodes)) {
1781 if (faults > max_faults * 6 / 16)
1782 node_set(nid, numa_group->active_nodes);
1783 } else if (faults < max_faults * 3 / 16)
1784 node_clear(nid, numa_group->active_nodes);
1785 }
1786 }
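
/*
 * Illustrative worked example (not part of the original source): with the
 * 6/16 and 3/16 thresholds above, suppose max_faults == 1600. A node is
 * added to active_nodes only once it accounts for more than
 * 1600 * 6/16 = 600 faults, and is removed again only when it drops below
 * 1600 * 3/16 = 300 faults. A node oscillating between, say, 350 and 550
 * faults therefore keeps whatever state it already has instead of
 * flip-flopping, and page migrations toward or away from it are not
 * repeatedly reversed.
 */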
1787
1788 /*
1789 * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1790 * increments. The more local the fault statistics are, the higher the scan
1791 * period will be for the next scan window. If local/(local+remote) ratio is
1792 * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1793 * the scan period will decrease. Aim for 70% local accesses.
1794 */
1795 #define NUMA_PERIOD_SLOTS 10
1796 #define NUMA_PERIOD_THRESHOLD 7
1797
1798 /*
1799 * Increase the scan period (slow down scanning) if the majority of
1800 * our memory is already on our local node, or if the majority of
1801 * the page accesses are shared with other processes.
1802 * Otherwise, decrease the scan period.
1803 */
1804 static void update_task_scan_period(struct task_struct *p,
1805 unsigned long shared, unsigned long private)
1806 {
1807 unsigned int period_slot;
1808 int ratio;
1809 int diff;
1810
1811 unsigned long remote = p->numa_faults_locality[0];
1812 unsigned long local = p->numa_faults_locality[1];
1813
1814 /*
1815 * If there were no recorded hinting faults then either the task is
1816 * completely idle or all activity is in areas that are not of interest
1817 * to automatic numa balancing. Related to that, if there were failed
1818 * migrations then it implies we are migrating too quickly or the local
1819 * node is overloaded. In either case, scan slower.
1820 */
1821 if (local + shared == 0 || p->numa_faults_locality[2]) {
1822 p->numa_scan_period = min(p->numa_scan_period_max,
1823 p->numa_scan_period << 1);
1824
1825 p->mm->numa_next_scan = jiffies +
1826 msecs_to_jiffies(p->numa_scan_period);
1827
1828 return;
1829 }
1830
1831 /*
1832 * Prepare to scale scan period relative to the current period.
1833 * == NUMA_PERIOD_THRESHOLD scan period stays the same
1834 * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1835 * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1836 */
1837 period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1838 ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1839 if (ratio >= NUMA_PERIOD_THRESHOLD) {
1840 int slot = ratio - NUMA_PERIOD_THRESHOLD;
1841 if (!slot)
1842 slot = 1;
1843 diff = slot * period_slot;
1844 } else {
1845 diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1846
1847 /*
1848 * Scale scan rate increases based on sharing. There is an
1849 * inverse relationship between the degree of sharing and
1850 * the adjustment made to the scanning period. Broadly
1851 * speaking the intent is that there is little point
1852 * scanning faster if shared accesses dominate as it may
1853 * simply bounce migrations uselessly
1854 */
1855 ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1856 diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1857 }
1858
1859 p->numa_scan_period = clamp(p->numa_scan_period + diff,
1860 task_scan_min(p), task_scan_max(p));
1861 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1862 }
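
/*
 * Illustrative worked example (not part of the original source), using
 * hypothetical numbers: assume p->numa_scan_period == 1000ms, so
 * period_slot = DIV_ROUND_UP(1000, 10) = 100.
 *
 *  - Mostly-local faults: local = 800, remote = 200 gives
 *    ratio = 800 * 10 / 1000 = 8 >= NUMA_PERIOD_THRESHOLD, slot = 1,
 *    diff = +100, so the next scan period becomes ~1100ms (scan slower).
 *
 *  - Mostly-remote but shared faults: local = 300, remote = 700 gives
 *    ratio = 3, diff = -(7 - 3) * 100 = -400; with private = 100 and
 *    shared = 900 the sharing ratio is DIV_ROUND_UP(100 * 10, 1001) = 1,
 *    so diff is scaled to -400 * 1 / 10 = -40 and the period only drops
 *    to ~960ms, since scanning faster would mostly bounce shared pages.
 *
 * The result is finally clamped to [task_scan_min(p), task_scan_max(p)].
 */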
1863
1864 /*
1865 * Get the fraction of time the task has been running since the last
1866 * NUMA placement cycle. The scheduler keeps similar statistics, but
1867 * decays those on a 32ms period, which is orders of magnitude off
1868 * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1869 * stats only if the task is so new there are no NUMA statistics yet.
1870 */
1871 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1872 {
1873 u64 runtime, delta, now;
1874 /* Use the start of this time slice to avoid calculations. */
1875 now = p->se.exec_start;
1876 runtime = p->se.sum_exec_runtime;
1877
1878 if (p->last_task_numa_placement) {
1879 delta = runtime - p->last_sum_exec_runtime;
1880 *period = now - p->last_task_numa_placement;
1881
1882 /* Avoid time going backwards, prevent potential divide error: */
1883 if (unlikely((s64)*period < 0))
1884 *period = 0;
1885 } else {
1886 delta = p->se.avg.load_sum / p->se.load.weight;
1887 *period = LOAD_AVG_MAX;
1888 }
1889
1890 p->last_sum_exec_runtime = runtime;
1891 p->last_task_numa_placement = now;
1892
1893 return delta;
1894 }
1895
1896 /*
1897 * Determine the preferred nid for a task in a numa_group. This needs to
1898 * be done in a way that produces consistent results with group_weight,
1899 * otherwise workloads might not converge.
1900 */
1901 static int preferred_group_nid(struct task_struct *p, int nid)
1902 {
1903 nodemask_t nodes;
1904 int dist;
1905
1906 /* Direct connections between all NUMA nodes. */
1907 if (sched_numa_topology_type == NUMA_DIRECT)
1908 return nid;
1909
1910 /*
1911 * On a system with glueless mesh NUMA topology, group_weight
1912 * scores nodes according to the number of NUMA hinting faults on
1913 * both the node itself, and on nearby nodes.
1914 */
1915 if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1916 unsigned long score, max_score = 0;
1917 int node, max_node = nid;
1918
1919 dist = sched_max_numa_distance;
1920
1921 for_each_online_node(node) {
1922 score = group_weight(p, node, dist);
1923 if (score > max_score) {
1924 max_score = score;
1925 max_node = node;
1926 }
1927 }
1928 return max_node;
1929 }
1930
1931 /*
1932 * Finding the preferred nid in a system with NUMA backplane
1933 * interconnect topology is more involved. The goal is to locate
1934 * tasks from numa_groups near each other in the system, and
1935 * untangle workloads from different sides of the system. This requires
1936 * searching down the hierarchy of node groups, recursively searching
1937 * inside the highest scoring group of nodes. The nodemask tricks
1938 * keep the complexity of the search down.
1939 */
1940 nodes = node_online_map;
1941 for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1942 unsigned long max_faults = 0;
1943 nodemask_t max_group = NODE_MASK_NONE;
1944 int a, b;
1945
1946 /* Are there nodes at this distance from each other? */
1947 if (!find_numa_distance(dist))
1948 continue;
1949
1950 for_each_node_mask(a, nodes) {
1951 unsigned long faults = 0;
1952 nodemask_t this_group;
1953 nodes_clear(this_group);
1954
1955 /* Sum group's NUMA faults; includes a==b case. */
1956 for_each_node_mask(b, nodes) {
1957 if (node_distance(a, b) < dist) {
1958 faults += group_faults(p, b);
1959 node_set(b, this_group);
1960 node_clear(b, nodes);
1961 }
1962 }
1963
1964 /* Remember the top group. */
1965 if (faults > max_faults) {
1966 max_faults = faults;
1967 max_group = this_group;
1968 /*
1969 * subtle: at the smallest distance there is
1970 * just one node left in each "group", the
1971 * winner is the preferred nid.
1972 */
1973 nid = a;
1974 }
1975 }
1976 /* Next round, evaluate the nodes within max_group. */
1977 if (!max_faults)
1978 break;
1979 nodes = max_group;
1980 }
1981 return nid;
1982 }
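
/*
 * Illustrative example (not part of the original source): on a
 * hypothetical 8-node backplane machine built from two 4-node "drawers",
 * the first pass of the loop above (at the largest distance) partitions
 * the online nodes into the two drawers, sums group_faults() over each,
 * and keeps only the drawer with more faults in 'nodes'. The next pass,
 * at a smaller distance, repeats the split inside that drawer, and so on
 * until a single node remains; that node is returned as the preferred
 * nid. Clearing visited nodes from 'nodes' as groups are formed is what
 * keeps the search roughly linear in the number of nodes per distance
 * level.
 */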
1983
1984 static void task_numa_placement(struct task_struct *p)
1985 {
1986 int seq, nid, max_nid = -1, max_group_nid = -1;
1987 unsigned long max_faults = 0, max_group_faults = 0;
1988 unsigned long fault_types[2] = { 0, 0 };
1989 unsigned long total_faults;
1990 u64 runtime, period;
1991 spinlock_t *group_lock = NULL;
1992
1993 /*
1994 * The p->mm->numa_scan_seq field gets updated without
1995 * exclusive access. Use READ_ONCE() here to ensure
1996 * that the field is read in a single access:
1997 */
1998 seq = READ_ONCE(p->mm->numa_scan_seq);
1999 if (p->numa_scan_seq == seq)
2000 return;
2001 p->numa_scan_seq = seq;
2002 p->numa_scan_period_max = task_scan_max(p);
2003
2004 total_faults = p->numa_faults_locality[0] +
2005 p->numa_faults_locality[1];
2006 runtime = numa_get_avg_runtime(p, &period);
2007
2008 /* If the task is part of a group prevent parallel updates to group stats */
2009 if (p->numa_group) {
2010 group_lock = &p->numa_group->lock;
2011 spin_lock_irq(group_lock);
2012 }
2013
2014 /* Find the node with the highest number of faults */
2015 for_each_online_node(nid) {
2016 /* Keep track of the offsets in numa_faults array */
2017 int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2018 unsigned long faults = 0, group_faults = 0;
2019 int priv;
2020
2021 for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2022 long diff, f_diff, f_weight;
2023
2024 mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2025 membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2026 cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2027 cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2028
2029 /* Decay existing window, copy faults since last scan */
2030 diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2031 fault_types[priv] += p->numa_faults[membuf_idx];
2032 p->numa_faults[membuf_idx] = 0;
2033
2034 /*
2035 * Normalize the faults_from, so all tasks in a group
2036 * count according to CPU use, instead of by the raw
2037 * number of faults. Tasks with little runtime have
2038 * little over-all impact on throughput, and thus their
2039 * faults are less important.
2040 */
2041 f_weight = div64_u64(runtime << 16, period + 1);
2042 f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2043 (total_faults + 1);
2044 f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2045 p->numa_faults[cpubuf_idx] = 0;
2046
2047 p->numa_faults[mem_idx] += diff;
2048 p->numa_faults[cpu_idx] += f_diff;
2049 faults += p->numa_faults[mem_idx];
2050 p->total_numa_faults += diff;
2051 if (p->numa_group) {
2052 /*
2053 * safe because we can only change our own group
2054 *
2055 * mem_idx represents the offset for a given
2056 * nid and priv in a specific region because it
2057 * is at the beginning of the numa_faults array.
2058 */
2059 p->numa_group->faults[mem_idx] += diff;
2060 p->numa_group->faults_cpu[mem_idx] += f_diff;
2061 p->numa_group->total_faults += diff;
2062 group_faults += p->numa_group->faults[mem_idx];
2063 }
2064 }
2065
2066 if (faults > max_faults) {
2067 max_faults = faults;
2068 max_nid = nid;
2069 }
2070
2071 if (group_faults > max_group_faults) {
2072 max_group_faults = group_faults;
2073 max_group_nid = nid;
2074 }
2075 }
2076
2077 update_task_scan_period(p, fault_types[0], fault_types[1]);
2078
2079 if (p->numa_group) {
2080 update_numa_active_node_mask(p->numa_group);
2081 spin_unlock_irq(group_lock);
2082 max_nid = preferred_group_nid(p, max_group_nid);
2083 }
2084
2085 if (max_faults) {
2086 /* Set the new preferred node */
2087 if (max_nid != p->numa_preferred_nid)
2088 sched_setnuma(p, max_nid);
2089
2090 if (task_node(p) != p->numa_preferred_nid)
2091 numa_migrate_preferred(p);
2092 }
2093 }
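
/*
 * Illustrative worked example (not part of the original source), using
 * hypothetical numbers for one (nid, priv) bucket: if the previous window
 * left numa_faults[mem_idx] == 400 and the scan buffered
 * numa_faults[membuf_idx] == 100 new faults, then
 * diff = 100 - 400/2 = -100 and the decayed total becomes 300, i.e. half
 * of the old history plus the new samples. For the CPU-use weighting, a
 * task that ran runtime = 250ms out of a period = 1000ms placement
 * interval gets f_weight ~= (runtime << 16) / (period + 1) ~= 16384, a
 * 16.16 fixed-point value of ~0.25, so its buffered CPU faults count
 * roughly a quarter as much as those of a task busy for the whole
 * interval.
 */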
2094
2095 static inline int get_numa_group(struct numa_group *grp)
2096 {
2097 return atomic_inc_not_zero(&grp->refcount);
2098 }
2099
2100 static inline void put_numa_group(struct numa_group *grp)
2101 {
2102 if (atomic_dec_and_test(&grp->refcount))
2103 kfree_rcu(grp, rcu);
2104 }
2105
2106 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2107 int *priv)
2108 {
2109 struct numa_group *grp, *my_grp;
2110 struct task_struct *tsk;
2111 bool join = false;
2112 int cpu = cpupid_to_cpu(cpupid);
2113 int i;
2114
2115 if (unlikely(!p->numa_group)) {
2116 unsigned int size = sizeof(struct numa_group) +
2117 4*nr_node_ids*sizeof(unsigned long);
2118
2119 grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2120 if (!grp)
2121 return;
2122
2123 atomic_set(&grp->refcount, 1);
2124 spin_lock_init(&grp->lock);
2125 grp->gid = p->pid;
2126 /* Second half of the array tracks nids where faults happen */
2127 grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2128 nr_node_ids;
2129
2130 node_set(task_node(current), grp->active_nodes);
2131
2132 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2133 grp->faults[i] = p->numa_faults[i];
2134
2135 grp->total_faults = p->total_numa_faults;
2136
2137 grp->nr_tasks++;
2138 rcu_assign_pointer(p->numa_group, grp);
2139 }
2140
2141 rcu_read_lock();
2142 tsk = READ_ONCE(cpu_rq(cpu)->curr);
2143
2144 if (!cpupid_match_pid(tsk, cpupid))
2145 goto no_join;
2146
2147 grp = rcu_dereference(tsk->numa_group);
2148 if (!grp)
2149 goto no_join;
2150
2151 my_grp = p->numa_group;
2152 if (grp == my_grp)
2153 goto no_join;
2154
2155 /*
2156 * Only join the other group if it's bigger; if we're the bigger group,
2157 * the other task will join us.
2158 */
2159 if (my_grp->nr_tasks > grp->nr_tasks)
2160 goto no_join;
2161
2162 /*
2163 * Tie-break on the grp address.
2164 */
2165 if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2166 goto no_join;
2167
2168 /* Always join threads in the same process. */
2169 if (tsk->mm == current->mm)
2170 join = true;
2171
2172 /* Simple filter to avoid false positives due to PID collisions */
2173 if (flags & TNF_SHARED)
2174 join = true;
2175
2176 /* Update priv based on whether false sharing was detected */
2177 *priv = !join;
2178
2179 if (join && !get_numa_group(grp))
2180 goto no_join;
2181
2182 rcu_read_unlock();
2183
2184 if (!join)
2185 return;
2186
2187 BUG_ON(irqs_disabled());
2188 double_lock_irq(&my_grp->lock, &grp->lock);
2189
2190 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2191 my_grp->faults[i] -= p->numa_faults[i];
2192 grp->faults[i] += p->numa_faults[i];
2193 }
2194 my_grp->total_faults -= p->total_numa_faults;
2195 grp->total_faults += p->total_numa_faults;
2196
2197 my_grp->nr_tasks--;
2198 grp->nr_tasks++;
2199
2200 spin_unlock(&my_grp->lock);
2201 spin_unlock_irq(&grp->lock);
2202
2203 rcu_assign_pointer(p->numa_group, grp);
2204
2205 put_numa_group(my_grp);
2206 return;
2207
2208 no_join:
2209 rcu_read_unlock();
2210 return;
2211 }
2212
2213 /*
2214 * Get rid of NUMA statistics associated with a task (either current or dead).
2215 * If @final is set, the task is dead and has reached refcount zero, so we can
2216 * safely free all relevant data structures. Otherwise, there might be
2217 * concurrent reads from places like load balancing and procfs, and we should
2218 * reset the data back to default state without freeing ->numa_faults.
2219 */
2220 void task_numa_free(struct task_struct *p, bool final)
2221 {
2222 struct numa_group *grp = p->numa_group;
2223 unsigned long *numa_faults = p->numa_faults;
2224 unsigned long flags;
2225 int i;
2226
2227 if (!numa_faults)
2228 return;
2229
2230 if (grp) {
2231 spin_lock_irqsave(&grp->lock, flags);
2232 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2233 grp->faults[i] -= p->numa_faults[i];
2234 grp->total_faults -= p->total_numa_faults;
2235
2236 grp->nr_tasks--;
2237 spin_unlock_irqrestore(&grp->lock, flags);
2238 RCU_INIT_POINTER(p->numa_group, NULL);
2239 put_numa_group(grp);
2240 }
2241
2242 if (final) {
2243 p->numa_faults = NULL;
2244 kfree(numa_faults);
2245 } else {
2246 p->total_numa_faults = 0;
2247 for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2248 numa_faults[i] = 0;
2249 }
2250 }
2251
2252 /*
2253 * Got a PROT_NONE fault for a page on @node.
2254 */
2255 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2256 {
2257 struct task_struct *p = current;
2258 bool migrated = flags & TNF_MIGRATED;
2259 int cpu_node = task_node(current);
2260 int local = !!(flags & TNF_FAULT_LOCAL);
2261 int priv;
2262
2263 if (!static_branch_likely(&sched_numa_balancing))
2264 return;
2265
2266 /* for example, ksmd faulting in a user's mm */
2267 if (!p->mm)
2268 return;
2269
2270 /* Allocate buffer to track faults on a per-node basis */
2271 if (unlikely(!p->numa_faults)) {
2272 int size = sizeof(*p->numa_faults) *
2273 NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2274
2275 p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2276 if (!p->numa_faults)
2277 return;
2278
2279 p->total_numa_faults = 0;
2280 memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2281 }
2282
2283 /*
2284 * First accesses are treated as private, otherwise consider accesses
2285 * to be private if the accessing pid has not changed
2286 */
2287 if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2288 priv = 1;
2289 } else {
2290 priv = cpupid_match_pid(p, last_cpupid);
2291 if (!priv && !(flags & TNF_NO_GROUP))
2292 task_numa_group(p, last_cpupid, flags, &priv);
2293 }
2294
2295 /*
2296 * If a workload spans multiple NUMA nodes, a shared fault that
2297 * occurs wholly within the set of nodes that the workload is
2298 * actively using should be counted as local. This allows the
2299 * scan rate to slow down when a workload has settled down.
2300 */
2301 if (!priv && !local && p->numa_group &&
2302 node_isset(cpu_node, p->numa_group->active_nodes) &&
2303 node_isset(mem_node, p->numa_group->active_nodes))
2304 local = 1;
2305
2306 task_numa_placement(p);
2307
2308 /*
2309 * Retry migrating the task to its preferred node periodically, in
2310 * case it previously failed, or the scheduler moved us.
2311 */
2312 if (time_after(jiffies, p->numa_migrate_retry))
2313 numa_migrate_preferred(p);
2314
2315 if (migrated)
2316 p->numa_pages_migrated += pages;
2317 if (flags & TNF_MIGRATE_FAIL)
2318 p->numa_faults_locality[2] += pages;
2319
2320 p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2321 p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2322 p->numa_faults_locality[local] += pages;
2323 }
2324
2325 static void reset_ptenuma_scan(struct task_struct *p)
2326 {
2327 /*
2328 * We only did a read acquisition of the mmap sem, so
2329 * p->mm->numa_scan_seq is written to without exclusive access
2330 * and the update is not guaranteed to be atomic. That's not
2331 * much of an issue though, since this is just used for
2332 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2333 * expensive, to avoid any form of compiler optimizations:
2334 */
2335 WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2336 p->mm->numa_scan_offset = 0;
2337 }
2338
2339 /*
2340 * The expensive part of numa migration is done from task_work context.
2341 * Triggered from task_tick_numa().
2342 */
2343 void task_numa_work(struct callback_head *work)
2344 {
2345 unsigned long migrate, next_scan, now = jiffies;
2346 struct task_struct *p = current;
2347 struct mm_struct *mm = p->mm;
2348 struct vm_area_struct *vma;
2349 unsigned long start, end;
2350 unsigned long nr_pte_updates = 0;
2351 long pages, virtpages;
2352
2353 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2354
2355 work->next = work; /* protect against double add */
2356 /*
2357 * Who cares about NUMA placement when they're dying.
2358 *
2359 * NOTE: make sure not to dereference p->mm before this check,
2360 * exit_task_work() happens _after_ exit_mm() so we could be called
2361 * without p->mm even though we still had it when we enqueued this
2362 * work.
2363 */
2364 if (p->flags & PF_EXITING)
2365 return;
2366
2367 if (!mm->numa_next_scan) {
2368 mm->numa_next_scan = now +
2369 msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2370 }
2371
2372 /*
2373 * Enforce maximal scan/migration frequency..
2374 */
2375 migrate = mm->numa_next_scan;
2376 if (time_before(now, migrate))
2377 return;
2378
2379 if (p->numa_scan_period == 0) {
2380 p->numa_scan_period_max = task_scan_max(p);
2381 p->numa_scan_period = task_scan_min(p);
2382 }
2383
2384 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2385 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2386 return;
2387
2388 /*
2389 * Delay this task enough that another task of this mm will likely win
2390 * the next time around.
2391 */
2392 p->node_stamp += 2 * TICK_NSEC;
2393
2394 start = mm->numa_scan_offset;
2395 pages = sysctl_numa_balancing_scan_size;
2396 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2397 virtpages = pages * 8; /* Scan up to this much virtual space */
2398 if (!pages)
2399 return;
2400
2401
2402 if (!down_read_trylock(&mm->mmap_sem))
2403 return;
2404 vma = find_vma(mm, start);
2405 if (!vma) {
2406 reset_ptenuma_scan(p);
2407 start = 0;
2408 vma = mm->mmap;
2409 }
2410 for (; vma; vma = vma->vm_next) {
2411 if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2412 is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2413 continue;
2414 }
2415
2416 /*
2417 * Shared library pages mapped by multiple processes are not
2418 * migrated as it is expected they are cache replicated. Avoid
2419 * hinting faults in read-only file-backed mappings or the vdso
2420 * as migrating the pages will be of marginal benefit.
2421 */
2422 if (!vma->vm_mm ||
2423 (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2424 continue;
2425
2426 /*
2427 * Skip inaccessible VMAs to avoid any confusion between
2428 * PROT_NONE and NUMA hinting ptes
2429 */
2430 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2431 continue;
2432
2433 do {
2434 start = max(start, vma->vm_start);
2435 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2436 end = min(end, vma->vm_end);
2437 nr_pte_updates = change_prot_numa(vma, start, end);
2438
2439 /*
2440 * Try to scan sysctl_numa_balancing_scan_size worth of
2441 * hpages that have at least one present PTE that
2442 * is not already pte-numa. If the VMA contains
2443 * areas that are unused or already full of prot_numa
2444 * PTEs, scan up to virtpages, to skip through those
2445 * areas faster.
2446 */
2447 if (nr_pte_updates)
2448 pages -= (end - start) >> PAGE_SHIFT;
2449 virtpages -= (end - start) >> PAGE_SHIFT;
2450
2451 start = end;
2452 if (pages <= 0 || virtpages <= 0)
2453 goto out;
2454
2455 cond_resched();
2456 } while (end != vma->vm_end);
2457 }
2458
2459 out:
2460 /*
2461 * It is possible to reach the end of the VMA list but the last few
2462 * VMAs are not guaranteed to be vma_migratable. If they are not, we
2463 * would find the !migratable VMA on the next scan but not reset the
2464 * scanner to the start so check it now.
2465 */
2466 if (vma)
2467 mm->numa_scan_offset = start;
2468 else
2469 reset_ptenuma_scan(p);
2470 up_read(&mm->mmap_sem);
2471 }
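
/*
 * Illustrative worked example (not part of the original source), assuming
 * the default sysctl_numa_balancing_scan_size of 256 (MB) and 4KiB pages:
 * pages = 256 << (20 - 12) = 65536 PTEs per scan window and
 * virtpages = 8 * 65536 = 524288, i.e. up to ~2GiB of virtual address
 * space may be walked. The scan therefore stops as soon as either 256MB
 * worth of PTEs have actually been made prot_numa, or ~2GiB of (possibly
 * sparse or already-marked) virtual space has been covered, and
 * mm->numa_scan_offset remembers where to resume next time.
 */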
2472
2473 /*
2474 * Drive the periodic memory faults..
2475 */
2476 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2477 {
2478 struct callback_head *work = &curr->numa_work;
2479 u64 period, now;
2480
2481 /*
2482 * We don't care about NUMA placement if we don't have memory.
2483 */
2484 if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2485 return;
2486
2487 /*
2488 * Using runtime rather than walltime has the dual advantage that
2489 * we (mostly) drive the selection from busy threads and that the
2490 * task needs to have done some actual work before we bother with
2491 * NUMA placement.
2492 */
2493 now = curr->se.sum_exec_runtime;
2494 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2495
2496 if (now > curr->node_stamp + period) {
2497 if (!curr->node_stamp)
2498 curr->numa_scan_period = task_scan_min(curr);
2499 curr->node_stamp += period;
2500
2501 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2502 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2503 task_work_add(curr, work, true);
2504 }
2505 }
2506 }
2507 #else
2508 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2509 {
2510 }
2511
2512 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2513 {
2514 }
2515
2516 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2517 {
2518 }
2519 #endif /* CONFIG_NUMA_BALANCING */
2520
2521 static void
2522 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2523 {
2524 update_load_add(&cfs_rq->load, se->load.weight);
2525 if (!parent_entity(se))
2526 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2527 #ifdef CONFIG_SMP
2528 if (entity_is_task(se)) {
2529 struct rq *rq = rq_of(cfs_rq);
2530
2531 account_numa_enqueue(rq, task_of(se));
2532 list_add(&se->group_node, &rq->cfs_tasks);
2533 }
2534 #endif
2535 cfs_rq->nr_running++;
2536 }
2537
2538 static void
2539 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2540 {
2541 update_load_sub(&cfs_rq->load, se->load.weight);
2542 if (!parent_entity(se))
2543 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2544 if (entity_is_task(se)) {
2545 account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2546 list_del_init(&se->group_node);
2547 }
2548 cfs_rq->nr_running--;
2549 }
2550
2551 #ifdef CONFIG_FAIR_GROUP_SCHED
2552 # ifdef CONFIG_SMP
2553 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2554 {
2555 long tg_weight, load, shares;
2556
2557 /*
2558 * This really should be: cfs_rq->avg.load_avg, but instead we use
2559 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2560 * the shares for small weight interactive tasks.
2561 */
2562 load = scale_load_down(cfs_rq->load.weight);
2563
2564 tg_weight = atomic_long_read(&tg->load_avg);
2565
2566 /* Ensure tg_weight >= load */
2567 tg_weight -= cfs_rq->tg_load_avg_contrib;
2568 tg_weight += load;
2569
2570 shares = (tg->shares * load);
2571 if (tg_weight)
2572 shares /= tg_weight;
2573
2574 if (shares < MIN_SHARES)
2575 shares = MIN_SHARES;
2576 if (shares > tg->shares)
2577 shares = tg->shares;
2578
2579 return shares;
2580 }
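
/*
 * Illustrative worked example (not part of the original source), with
 * hypothetical numbers: tg->shares = 1024 and the group runs on two CPUs
 * whose cfs_rqs contributed 600 and 200 to tg->load_avg (total 800). For
 * the first cfs_rq, load = 600 and tg_weight = 800 - 600 + 600 = 800, so
 * shares = 1024 * 600 / 800 = 768; the other CPU's cfs_rq would get
 * 1024 * 200 / 800 = 256. Each result is then clamped to
 * [MIN_SHARES, tg->shares].
 */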
2581 # else /* CONFIG_SMP */
2582 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2583 {
2584 return tg->shares;
2585 }
2586 # endif /* CONFIG_SMP */
2587
2588 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2589 unsigned long weight)
2590 {
2591 if (se->on_rq) {
2592 /* commit outstanding execution time */
2593 if (cfs_rq->curr == se)
2594 update_curr(cfs_rq);
2595 account_entity_dequeue(cfs_rq, se);
2596 }
2597
2598 update_load_set(&se->load, weight);
2599
2600 if (se->on_rq)
2601 account_entity_enqueue(cfs_rq, se);
2602 }
2603
2604 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2605
2606 static void update_cfs_shares(struct sched_entity *se)
2607 {
2608 struct cfs_rq *cfs_rq = group_cfs_rq(se);
2609 struct task_group *tg;
2610 long shares;
2611
2612 if (!cfs_rq)
2613 return;
2614
2615 if (throttled_hierarchy(cfs_rq))
2616 return;
2617
2618 tg = cfs_rq->tg;
2619
2620 #ifndef CONFIG_SMP
2621 if (likely(se->load.weight == tg->shares))
2622 return;
2623 #endif
2624 shares = calc_cfs_shares(cfs_rq, tg);
2625
2626 reweight_entity(cfs_rq_of(se), se, shares);
2627 }
2628
2629 #else /* CONFIG_FAIR_GROUP_SCHED */
2630 static inline void update_cfs_shares(struct sched_entity *se)
2631 {
2632 }
2633 #endif /* CONFIG_FAIR_GROUP_SCHED */
2634
2635 #ifdef CONFIG_SMP
2636 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2637 static const u32 runnable_avg_yN_inv[] = {
2638 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2639 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2640 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2641 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2642 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2643 0x85aac367, 0x82cd8698,
2644 };
2645
2646 /*
2647 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
2648 * over-estimates when re-combining.
2649 */
2650 static const u32 runnable_avg_yN_sum[] = {
2651 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2652 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2653 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2654 };
2655
2656 /*
2657 * Approximate:
2658 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
2659 */
2660 static __always_inline u64 decay_load(u64 val, u64 n)
2661 {
2662 unsigned int local_n;
2663
2664 if (!n)
2665 return val;
2666 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2667 return 0;
2668
2669 /* after bounds checking we can collapse to 32-bit */
2670 local_n = n;
2671
2672 /*
2673 * As y^PERIOD = 1/2, we can combine
2674 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2675 * With a look-up table which covers y^n (n<PERIOD)
2676 *
2677 * To achieve constant time decay_load.
2678 */
2679 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2680 val >>= local_n / LOAD_AVG_PERIOD;
2681 local_n %= LOAD_AVG_PERIOD;
2682 }
2683
2684 val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2685 return val;
2686 }
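
/*
 * Illustrative worked example (not part of the original source): with
 * LOAD_AVG_PERIOD = 32 and y^32 = 1/2, decaying val = 4096 over n = 70
 * periods first applies 70/32 = 2 whole half-lives (4096 >> 2 = 1024) and
 * then the remaining 70 % 32 = 6 periods via the lookup table:
 * runnable_avg_yN_inv[6] / 2^32 ~= 0.878, giving ~899 as the final value.
 */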
2687
2688 /*
2689 * For updates fully spanning n periods, the contribution to runnable
2690 * average will be: \Sum 1024*y^n
2691 *
2692 * We can compute this reasonably efficiently by combining:
2693 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
2694 */
2695 static u32 __compute_runnable_contrib(u64 n)
2696 {
2697 u32 contrib = 0;
2698
2699 if (likely(n <= LOAD_AVG_PERIOD))
2700 return runnable_avg_yN_sum[n];
2701 else if (unlikely(n >= LOAD_AVG_MAX_N))
2702 return LOAD_AVG_MAX;
2703
2704 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2705 do {
2706 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2707 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2708
2709 n -= LOAD_AVG_PERIOD;
2710 } while (n > LOAD_AVG_PERIOD);
2711
2712 contrib = decay_load(contrib, n);
2713 return contrib + runnable_avg_yN_sum[n];
2714 }
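
/*
 * Illustrative worked example (not part of the original source): for
 * n = 40 fully-elapsed periods the loop above runs once, leaving
 * contrib = runnable_avg_yN_sum[32] = 23371 and n = 8. That partial sum
 * is then decayed by y^8 (~0.841) to ~19653, and runnable_avg_yN_sum[8]
 * = 7437 is added for the most recent periods, giving ~27090 out of a
 * possible LOAD_AVG_MAX of 47742.
 */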
2715
2716 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2717 #error "load tracking assumes 2^10 as unit"
2718 #endif
2719
2720 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2721
2722 /*
2723 * We can represent the historical contribution to runnable average as the
2724 * coefficients of a geometric series. To do this we sub-divide our runnable
2725 * history into segments of approximately 1ms (1024us); label the segment that
2726 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2727 *
2728 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2729 * p0 p1 p2
2730 * (now) (~1ms ago) (~2ms ago)
2731 *
2732 * Let u_i denote the fraction of p_i that the entity was runnable.
2733 *
2734 * We then designate the fractions u_i as our co-efficients, yielding the
2735 * following representation of historical load:
2736 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2737 *
2738 * We choose y based on the width of a reasonable scheduling period, fixing:
2739 * y^32 = 0.5
2740 *
2741 * This means that the contribution to load ~32ms ago (u_32) will be weighted
2742 * approximately half as much as the contribution to load within the last ms
2743 * (u_0).
2744 *
2745 * When a period "rolls over" and we have new u_0`, multiplying the previous
2746 * sum again by y is sufficient to update:
2747 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2748 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2749 */
2750 static __always_inline int
2751 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2752 unsigned long weight, int running, struct cfs_rq *cfs_rq)
2753 {
2754 u64 delta, scaled_delta, periods;
2755 u32 contrib;
2756 unsigned int delta_w, scaled_delta_w, decayed = 0;
2757 unsigned long scale_freq, scale_cpu;
2758
2759 delta = now - sa->last_update_time;
2760 /*
2761 * This should only happen when time goes backwards, which it
2762 * unfortunately does during sched clock init when we swap over to TSC.
2763 */
2764 if ((s64)delta < 0) {
2765 sa->last_update_time = now;
2766 return 0;
2767 }
2768
2769 /*
2770 * Use 1024ns as the unit of measurement since it's a reasonable
2771 * approximation of 1us and fast to compute.
2772 */
2773 delta >>= 10;
2774 if (!delta)
2775 return 0;
2776 sa->last_update_time = now;
2777
2778 scale_freq = arch_scale_freq_capacity(NULL, cpu);
2779 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2780 trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
2781
2782 /* delta_w is the amount already accumulated against our next period */
2783 delta_w = sa->period_contrib;
2784 if (delta + delta_w >= 1024) {
2785 decayed = 1;
2786
2787 /* how much left for next period will start over, we don't know yet */
2788 sa->period_contrib = 0;
2789
2790 /*
2791 * Now that we know we're crossing a period boundary, figure
2792 * out how much from delta we need to complete the current
2793 * period and accrue it.
2794 */
2795 delta_w = 1024 - delta_w;
2796 scaled_delta_w = cap_scale(delta_w, scale_freq);
2797 if (weight) {
2798 sa->load_sum += weight * scaled_delta_w;
2799 if (cfs_rq) {
2800 cfs_rq->runnable_load_sum +=
2801 weight * scaled_delta_w;
2802 }
2803 }
2804 if (running)
2805 sa->util_sum += scaled_delta_w * scale_cpu;
2806
2807 delta -= delta_w;
2808
2809 /* Figure out how many additional periods this update spans */
2810 periods = delta / 1024;
2811 delta %= 1024;
2812
2813 sa->load_sum = decay_load(sa->load_sum, periods + 1);
2814 if (cfs_rq) {
2815 cfs_rq->runnable_load_sum =
2816 decay_load(cfs_rq->runnable_load_sum, periods + 1);
2817 }
2818 sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2819
2820 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
2821 contrib = __compute_runnable_contrib(periods);
2822 contrib = cap_scale(contrib, scale_freq);
2823 if (weight) {
2824 sa->load_sum += weight * contrib;
2825 if (cfs_rq)
2826 cfs_rq->runnable_load_sum += weight * contrib;
2827 }
2828 if (running)
2829 sa->util_sum += contrib * scale_cpu;
2830 }
2831
2832 /* Remainder of delta accrued against u_0` */
2833 scaled_delta = cap_scale(delta, scale_freq);
2834 if (weight) {
2835 sa->load_sum += weight * scaled_delta;
2836 if (cfs_rq)
2837 cfs_rq->runnable_load_sum += weight * scaled_delta;
2838 }
2839 if (running)
2840 sa->util_sum += scaled_delta * scale_cpu;
2841
2842 sa->period_contrib += delta;
2843
2844 if (decayed) {
2845 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2846 if (cfs_rq) {
2847 cfs_rq->runnable_load_avg =
2848 div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2849 }
2850 sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2851 }
2852
2853 return decayed;
2854 }
2855
2856 /*
2857 * Signed add and clamp on underflow.
2858 *
2859 * Explicitly do a load-store to ensure the intermediate value never hits
2860 * memory. This allows lockless observations without ever seeing the negative
2861 * values.
2862 */
2863 #define add_positive(_ptr, _val) do { \
2864 typeof(_ptr) ptr = (_ptr); \
2865 typeof(_val) val = (_val); \
2866 typeof(*ptr) res, var = READ_ONCE(*ptr); \
2867 \
2868 res = var + val; \
2869 \
2870 if (val < 0 && res > var) \
2871 res = 0; \
2872 \
2873 WRITE_ONCE(*ptr, res); \
2874 } while (0)
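
/*
 * Illustrative usage sketch (not part of the original source): the macro
 * above is meant for signed deltas applied to unsigned averages, e.g.
 *
 *	add_positive(&cfs_rq->avg.load_avg, delta);
 *
 * If load_avg were 50 and delta were -80, the unsigned addition would
 * wrap around to a huge value; the "val < 0 && res > var" test detects
 * that underflow and stores 0 instead, and the single WRITE_ONCE() means
 * lockless readers never observe the wrapped intermediate value.
 */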
2875
2876 #ifdef CONFIG_FAIR_GROUP_SCHED
2877 /**
2878 * update_tg_load_avg - update the tg's load avg
2879 * @cfs_rq: the cfs_rq whose avg changed
2880 * @force: update regardless of how small the difference
2881 *
2882 * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2883 * However, because tg->load_avg is a global value there are performance
2884 * considerations.
2885 *
2886 * In order to avoid having to look at the other cfs_rq's, we use a
2887 * differential update where we store the last value we propagated. This in
2888 * turn allows skipping updates if the differential is 'small'.
2889 *
2890 * Updating tg's load_avg is necessary before update_cfs_share() (which is
2891 * done) and effective_load() (which is not done because it is too costly).
2892 */
2893 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2894 {
2895 long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2896
2897 /*
2898 * No need to update load_avg for root_task_group as it is not used.
2899 */
2900 if (cfs_rq->tg == &root_task_group)
2901 return;
2902
2903 if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2904 atomic_long_add(delta, &cfs_rq->tg->load_avg);
2905 cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2906 }
2907 }
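
/*
 * Illustrative worked example (not part of the original source): if
 * cfs_rq->tg_load_avg_contrib == 6400 and the freshly computed
 * cfs_rq->avg.load_avg == 6450, then delta = 50, below 6400 / 64 = 100,
 * so the contended global tg->load_avg is left untouched. Only once the
 * drift exceeds ~1.5% of the last propagated value (or force is set) is
 * the delta folded into tg->load_avg and the tg_load_avg_contrib
 * snapshot refreshed.
 */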
2908
2909 /*
2910 * Called within set_task_rq() right before setting a task's cpu. The
2911 * caller only guarantees p->pi_lock is held; no other assumptions,
2912 * including the state of rq->lock, should be made.
2913 */
2914 void set_task_rq_fair(struct sched_entity *se,
2915 struct cfs_rq *prev, struct cfs_rq *next)
2916 {
2917 if (!sched_feat(ATTACH_AGE_LOAD))
2918 return;
2919
2920 /*
2921 * We are supposed to update the task to "current" time, so that it is up to
2922 * date and ready to go to the new CPU/cfs_rq. But we have difficulty in
2923 * getting what the current time is, so simply throw away the out-of-date
2924 * time. This will result in the wakee task being less decayed, but giving
2925 * the wakee more load does not sound bad.
2926 */
2927 if (se->avg.last_update_time && prev) {
2928 u64 p_last_update_time;
2929 u64 n_last_update_time;
2930
2931 #ifndef CONFIG_64BIT
2932 u64 p_last_update_time_copy;
2933 u64 n_last_update_time_copy;
2934
2935 do {
2936 p_last_update_time_copy = prev->load_last_update_time_copy;
2937 n_last_update_time_copy = next->load_last_update_time_copy;
2938
2939 smp_rmb();
2940
2941 p_last_update_time = prev->avg.last_update_time;
2942 n_last_update_time = next->avg.last_update_time;
2943
2944 } while (p_last_update_time != p_last_update_time_copy ||
2945 n_last_update_time != n_last_update_time_copy);
2946 #else
2947 p_last_update_time = prev->avg.last_update_time;
2948 n_last_update_time = next->avg.last_update_time;
2949 #endif
2950 __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2951 &se->avg, 0, 0, NULL);
2952 se->avg.last_update_time = n_last_update_time;
2953 }
2954 }
2955
2956 /* Take into account change of utilization of a child task group */
2957 static inline void
2958 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
2959 {
2960 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2961 long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
2962
2963 /* Nothing to update */
2964 if (!delta)
2965 return;
2966
2967 /* Set new sched_entity's utilization */
2968 se->avg.util_avg = gcfs_rq->avg.util_avg;
2969 se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
2970
2971 /* Update parent cfs_rq utilization */
2972 add_positive(&cfs_rq->avg.util_avg, delta);
2973 cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
2974 }
2975
2976 /* Take into account change of load of a child task group */
2977 static inline void
2978 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
2979 {
2980 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2981 long delta, load = gcfs_rq->avg.load_avg;
2982
2983 /*
2984 * If the load of group cfs_rq is null, the load of the
2985 * sched_entity will also be null so we can skip the formula
2986 */
2987 if (load) {
2988 long tg_load;
2989
2990 /* Get tg's load and ensure tg_load > 0 */
2991 tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
2992
2993 /* Ensure tg_load >= load and is updated with the current load */
2994 tg_load -= gcfs_rq->tg_load_avg_contrib;
2995 tg_load += load;
2996
2997 /*
2998 * We need to compute a correction term in the case that the
2999 * task group is consuming more CPU than a task of equal
3000 * weight. A task with a weight equal to tg->shares will have
3001 * a load less than or equal to scale_load_down(tg->shares).
3002 * Similarly, the sched_entities that represent the task group
3003 * at parent level can't have a load higher than
3004 * scale_load_down(tg->shares). And the sum of sched_entities'
3005 * load must be <= scale_load_down(tg->shares).
3006 */
3007 if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3008 /* scale gcfs_rq's load into tg's shares*/
3009 load *= scale_load_down(gcfs_rq->tg->shares);
3010 load /= tg_load;
3011 }
3012 }
3013
3014 delta = load - se->avg.load_avg;
3015
3016 /* Nothing to update */
3017 if (!delta)
3018 return;
3019
3020 /* Set new sched_entity's load */
3021 se->avg.load_avg = load;
3022 se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3023
3024 /* Update parent cfs_rq load */
3025 add_positive(&cfs_rq->avg.load_avg, delta);
3026 cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3027
3028 /*
3029 * If the sched_entity is already enqueued, we also have to update the
3030 * runnable load avg.
3031 */
3032 if (se->on_rq) {
3033 /* Update parent cfs_rq runnable_load_avg */
3034 add_positive(&cfs_rq->runnable_load_avg, delta);
3035 cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3036 }
3037 }
3038
3039 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
3040 {
3041 cfs_rq->propagate_avg = 1;
3042 }
3043
3044 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3045 {
3046 struct cfs_rq *cfs_rq = group_cfs_rq(se);
3047
3048 if (!cfs_rq->propagate_avg)
3049 return 0;
3050
3051 cfs_rq->propagate_avg = 0;
3052 return 1;
3053 }
3054
3055 /* Update task and its cfs_rq load average */
3056 static inline int propagate_entity_load_avg(struct sched_entity *se)
3057 {
3058 struct cfs_rq *cfs_rq;
3059
3060 if (entity_is_task(se))
3061 return 0;
3062
3063 if (!test_and_clear_tg_cfs_propagate(se))
3064 return 0;
3065
3066 cfs_rq = cfs_rq_of(se);
3067
3068 set_tg_cfs_propagate(cfs_rq);
3069
3070 update_tg_cfs_util(cfs_rq, se);
3071 update_tg_cfs_load(cfs_rq, se);
3072
3073 return 1;
3074 }
3075
3076 /*
3077 * Check if we need to update the load and the utilization of a blocked
3078 * group_entity:
3079 */
3080 static inline bool skip_blocked_update(struct sched_entity *se)
3081 {
3082 struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3083
3084 /*
3085 * If the sched_entity still has non-zero load or utilization, we have to
3086 * decay it:
3087 */
3088 if (se->avg.load_avg || se->avg.util_avg)
3089 return false;
3090
3091 /*
3092 * If there is a pending propagation, we have to update the load and
3093 * the utilization of the sched_entity:
3094 */
3095 if (gcfs_rq->propagate_avg)
3096 return false;
3097
3098 /*
3099 * Otherwise, the load and the utilization of the sched_entity are
3100 * already zero and there is no pending propagation, so it will be a
3101 * waste of time to try to decay it:
3102 */
3103 return true;
3104 }
3105
3106 #else /* CONFIG_FAIR_GROUP_SCHED */
3107
3108 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3109
3110 static inline int propagate_entity_load_avg(struct sched_entity *se)
3111 {
3112 return 0;
3113 }
3114
3115 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
3116
3117 #endif /* CONFIG_FAIR_GROUP_SCHED */
3118
3119 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3120 {
3121 if (&this_rq()->cfs == cfs_rq) {
3122 /*
3123 * There are a few boundary cases this might miss but it should
3124 * get called often enough that that should (hopefully) not be
3125 * a real problem -- added to that it only calls on the local
3126 * CPU, so if we enqueue remotely we'll miss an update, but
3127 * the next tick/schedule should update.
3128 *
3129 * It will not get called when we go idle, because the idle
3130 * thread is a different class (!fair), nor will the utilization
3131 * number include things like RT tasks.
3132 *
3133 * As is, the util number is not freq-invariant (we'd have to
3134 * implement arch_scale_freq_capacity() for that).
3135 *
3136 * See cpu_util().
3137 */
3138 cpufreq_update_util(rq_of(cfs_rq), 0);
3139 }
3140 }
3141
3142 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
3143
3144 /*
3145 * Unsigned subtract and clamp on underflow.
3146 *
3147 * Explicitly do a load-store to ensure the intermediate value never hits
3148 * memory. This allows lockless observations without ever seeing the negative
3149 * values.
3150 */
3151 #define sub_positive(_ptr, _val) do { \
3152 typeof(_ptr) ptr = (_ptr); \
3153 typeof(*ptr) val = (_val); \
3154 typeof(*ptr) res, var = READ_ONCE(*ptr); \
3155 res = var - val; \
3156 if (res > var) \
3157 res = 0; \
3158 WRITE_ONCE(*ptr, res); \
3159 } while (0)
3160
3161 /**
3162 * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3163 * @now: current time, as per cfs_rq_clock_task()
3164 * @cfs_rq: cfs_rq to update
3165 * @update_freq: should we call cfs_rq_util_change() or will the caller do so
3166 *
3167 * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3168 * avg. The immediate corollary is that all (fair) tasks must be attached, see
3169 * post_init_entity_util_avg().
3170 *
3171 * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3172 *
3173 * Returns true if the load decayed or we removed load.
3174 *
3175 * Since both these conditions indicate a changed cfs_rq->avg.load we should
3176 * call update_tg_load_avg() when this function returns true.
3177 */
3178 static inline int
3179 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3180 {
3181 struct sched_avg *sa = &cfs_rq->avg;
3182 int decayed, removed = 0, removed_util = 0;
3183
3184 if (atomic_long_read(&cfs_rq->removed_load_avg)) {
3185 s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
3186 sub_positive(&sa->load_avg, r);
3187 sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
3188 removed = 1;
3189 set_tg_cfs_propagate(cfs_rq);
3190 }
3191
3192 if (atomic_long_read(&cfs_rq->removed_util_avg)) {
3193 long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
3194 sub_positive(&sa->util_avg, r);
3195 sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
3196 removed_util = 1;
3197 set_tg_cfs_propagate(cfs_rq);
3198 }
3199
3200 decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3201 scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3202
3203 #ifndef CONFIG_64BIT
3204 smp_wmb();
3205 cfs_rq->load_last_update_time_copy = sa->last_update_time;
3206 #endif
3207
3208 /* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
3209 if (cfs_rq == &rq_of(cfs_rq)->cfs)
3210 trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3211
3212 if (update_freq && (decayed || removed_util))
3213 cfs_rq_util_change(cfs_rq);
3214
3215 return decayed || removed;
3216 }
3217
3218 /*
3219 * Optional action to be done while updating the load average
3220 */
3221 #define UPDATE_TG 0x1
3222 #define SKIP_AGE_LOAD 0x2
3223
3224 /* Update task and its cfs_rq load average */
3225 static inline void update_load_avg(struct sched_entity *se, int flags)
3226 {
3227 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3228 u64 now = cfs_rq_clock_task(cfs_rq);
3229 int cpu = cpu_of(rq_of(cfs_rq));
3230 int decayed;
3231 void *ptr = NULL;
3232
3233 /*
3234 * Track task load average for carrying it to new CPU after migrated, and
3235 * track group sched_entity load average for task_h_load calc in migration
3236 */
3237 if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
3238 __update_load_avg(now, cpu, &se->avg,
3239 se->on_rq * scale_load_down(se->load.weight),
3240 cfs_rq->curr == se, NULL);
3241 }
3242
3243 decayed = update_cfs_rq_load_avg(now, cfs_rq, true);
3244 decayed |= propagate_entity_load_avg(se);
3245
3246 if (decayed && (flags & UPDATE_TG))
3247 update_tg_load_avg(cfs_rq, 0);
3248
3249 if (entity_is_task(se)) {
3250 #ifdef CONFIG_SCHED_WALT
3251 ptr = (void *)&(task_of(se)->ravg);
3252 #endif
3253 trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
3254 }
3255 }
3256
3257 /**
3258 * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3259 * @cfs_rq: cfs_rq to attach to
3260 * @se: sched_entity to attach
3261 *
3262 * Must call update_cfs_rq_load_avg() before this, since we rely on
3263 * cfs_rq->avg.last_update_time being current.
3264 */
3265 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3266 {
3267 se->avg.last_update_time = cfs_rq->avg.last_update_time;
3268 cfs_rq->avg.load_avg += se->avg.load_avg;
3269 cfs_rq->avg.load_sum += se->avg.load_sum;
3270 cfs_rq->avg.util_avg += se->avg.util_avg;
3271 cfs_rq->avg.util_sum += se->avg.util_sum;
3272 set_tg_cfs_propagate(cfs_rq);
3273
3274 cfs_rq_util_change(cfs_rq);
3275 }
3276
3277 /**
3278 * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3279 * @cfs_rq: cfs_rq to detach from
3280 * @se: sched_entity to detach
3281 *
3282 * Must call update_cfs_rq_load_avg() before this, since we rely on
3283 * cfs_rq->avg.last_update_time being current.
3284 */
3285 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3286 {
3287
3288 sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3289 sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3290 sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3291 sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3292 set_tg_cfs_propagate(cfs_rq);
3293
3294 cfs_rq_util_change(cfs_rq);
3295 }
3296
3297 /* Add the load generated by se into cfs_rq's load average */
3298 static inline void
3299 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3300 {
3301 struct sched_avg *sa = &se->avg;
3302
3303 cfs_rq->runnable_load_avg += sa->load_avg;
3304 cfs_rq->runnable_load_sum += sa->load_sum;
3305
3306 if (!sa->last_update_time) {
3307 attach_entity_load_avg(cfs_rq, se);
3308 update_tg_load_avg(cfs_rq, 0);
3309 }
3310 }
3311
3312 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3313 static inline void
3314 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3315 {
3316 cfs_rq->runnable_load_avg =
3317 max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3318 cfs_rq->runnable_load_sum =
3319 max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3320 }
3321
3322 #ifndef CONFIG_64BIT
3323 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3324 {
3325 u64 last_update_time_copy;
3326 u64 last_update_time;
3327
3328 do {
3329 last_update_time_copy = cfs_rq->load_last_update_time_copy;
3330 smp_rmb();
3331 last_update_time = cfs_rq->avg.last_update_time;
3332 } while (last_update_time != last_update_time_copy);
3333
3334 return last_update_time;
3335 }
3336 #else
3337 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3338 {
3339 return cfs_rq->avg.last_update_time;
3340 }
3341 #endif
3342
3343 /*
3344 * Synchronize entity load avg of dequeued entity without locking
3345 * the previous rq.
3346 */
3347 void sync_entity_load_avg(struct sched_entity *se)
3348 {
3349 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3350 u64 last_update_time;
3351
3352 last_update_time = cfs_rq_last_update_time(cfs_rq);
3353 __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3354 }
3355
3356 /*
3357 * Task first catches up with cfs_rq, and then subtract
3358 * itself from the cfs_rq (task must be off the queue now).
3359 */
3360 void remove_entity_load_avg(struct sched_entity *se)
3361 {
3362 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3363
3364 /*
3365 * tasks cannot exit without having gone through wake_up_new_task() ->
3366 * post_init_entity_util_avg() which will have added things to the
3367 * cfs_rq, so we can remove unconditionally.
3368 *
3369 * Similarly for groups, they will have passed through
3370 * post_init_entity_util_avg() before unregister_sched_fair_group()
3371 * calls this.
3372 */
3373
3374 sync_entity_load_avg(se);
3375 atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3376 atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3377 }
3378
3379 /*
3380 * Update the rq's load with the elapsed running time before entering
3381 * idle. If the last scheduled task is not a CFS task, idle_enter will
3382 * be the only way to update the runnable statistic.
3383 */
3384 void idle_enter_fair(struct rq *this_rq)
3385 {
3386 }
3387
3388 /*
3389 * Update the rq's load with the elapsed idle time before a task is
3390 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
3391 * be the only way to update the runnable statistic.
3392 */
3393 void idle_exit_fair(struct rq *this_rq)
3394 {
3395 }
3396
3397 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3398 {
3399 return cfs_rq->runnable_load_avg;
3400 }
3401
3402 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3403 {
3404 return cfs_rq->avg.load_avg;
3405 }
3406
3407 static int idle_balance(struct rq *this_rq);
3408
3409 #else /* CONFIG_SMP */
3410
3411 static inline int
3412 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3413 {
3414 return 0;
3415 }
3416
3417 #define UPDATE_TG 0x0
3418 #define SKIP_AGE_LOAD 0x0
3419
3420 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
3421 static inline void
3422 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3423 static inline void
3424 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3425 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3426
3427 static inline void
3428 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3429 static inline void
3430 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3431
3432 static inline int idle_balance(struct rq *rq)
3433 {
3434 return 0;
3435 }
3436
3437 #endif /* CONFIG_SMP */
3438
3439 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3440 {
3441 #ifdef CONFIG_SCHEDSTATS
3442 struct task_struct *tsk = NULL;
3443
3444 if (entity_is_task(se))
3445 tsk = task_of(se);
3446
3447 if (se->statistics.sleep_start) {
3448 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3449
3450 if ((s64)delta < 0)
3451 delta = 0;
3452
3453 if (unlikely(delta > se->statistics.sleep_max))
3454 se->statistics.sleep_max = delta;
3455
3456 se->statistics.sleep_start = 0;
3457 se->statistics.sum_sleep_runtime += delta;
3458
3459 if (tsk) {
3460 account_scheduler_latency(tsk, delta >> 10, 1);
3461 trace_sched_stat_sleep(tsk, delta);
3462 }
3463 }
3464 if (se->statistics.block_start) {
3465 u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3466
3467 if ((s64)delta < 0)
3468 delta = 0;
3469
3470 if (unlikely(delta > se->statistics.block_max))
3471 se->statistics.block_max = delta;
3472
3473 se->statistics.block_start = 0;
3474 se->statistics.sum_sleep_runtime += delta;
3475
3476 if (tsk) {
3477 if (tsk->in_iowait) {
3478 se->statistics.iowait_sum += delta;
3479 se->statistics.iowait_count++;
3480 trace_sched_stat_iowait(tsk, delta);
3481 }
3482
3483 trace_sched_stat_blocked(tsk, delta);
3484 trace_sched_blocked_reason(tsk);
3485
3486 /*
3487 * Blocking time is in units of nanosecs, so shift by
3488 * 20 to get a milliseconds-range estimation of the
3489 * amount of time that the task spent sleeping:
3490 */
3491 if (unlikely(prof_on == SLEEP_PROFILING)) {
3492 profile_hits(SLEEP_PROFILING,
3493 (void *)get_wchan(tsk),
3494 delta >> 20);
3495 }
3496 account_scheduler_latency(tsk, delta >> 10, 0);
3497 }
3498 }
3499 #endif
3500 }
3501
3502 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3503 {
3504 #ifdef CONFIG_SCHED_DEBUG
3505 s64 d = se->vruntime - cfs_rq->min_vruntime;
3506
3507 if (d < 0)
3508 d = -d;
3509
3510 if (d > 3*sysctl_sched_latency)
3511 schedstat_inc(cfs_rq, nr_spread_over);
3512 #endif
3513 }
3514
3515 static void
3516 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3517 {
3518 u64 vruntime = cfs_rq->min_vruntime;
3519
3520 /*
3521 * The 'current' period is already promised to the current tasks,
3522 * however the extra weight of the new task will slow them down a
3523 * little, place the new task so that it fits in the slot that
3524 * stays open at the end.
3525 */
3526 if (initial && sched_feat(START_DEBIT))
3527 vruntime += sched_vslice(cfs_rq, se);
3528
3529 /* sleeps up to a single latency don't count. */
3530 if (!initial) {
3531 unsigned long thresh = sysctl_sched_latency;
3532
3533 /*
3534 * Halve their sleep time's effect, to allow
3535 * for a gentler effect of sleepers:
3536 */
3537 if (sched_feat(GENTLE_FAIR_SLEEPERS))
3538 thresh >>= 1;
3539
3540 vruntime -= thresh;
3541 }
3542
3543 /* ensure we never gain time by being placed backwards. */
3544 se->vruntime = max_vruntime(se->vruntime, vruntime);
3545 }
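/*
 * Illustrative walk-through of the placement above (numbers are
 * hypothetical, assuming a 6ms sched_latency with GENTLE_FAIR_SLEEPERS):
 * with cfs_rq->min_vruntime at 100ms, a waking task is offered a target
 * of 100ms - 3ms = 97ms. A task that slept for a long time (vruntime
 * 50ms) is pulled forward to 97ms, while a task that only napped
 * (vruntime 99ms) keeps its 99ms because of the max_vruntime() clamp.
 * A freshly forked task with START_DEBIT instead starts at
 * 100ms + sched_vslice(), i.e. behind the already-running tasks.
 */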
3546
3547 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3548
3549 static void
3550 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3551 {
3552 /*
3553 * Update the normalized vruntime before updating min_vruntime
3554 * through calling update_curr().
3555 */
3556 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3557 se->vruntime += cfs_rq->min_vruntime;
3558
3559 /*
3560 * Update run-time statistics of the 'current'.
3561 */
3562 update_curr(cfs_rq);
3563 update_load_avg(se, UPDATE_TG);
3564 enqueue_entity_load_avg(cfs_rq, se);
3565 update_cfs_shares(se);
3566 account_entity_enqueue(cfs_rq, se);
3567
3568 if (flags & ENQUEUE_WAKEUP) {
3569 place_entity(cfs_rq, se, 0);
3570 enqueue_sleeper(cfs_rq, se);
3571 }
3572
3573 update_stats_enqueue(cfs_rq, se);
3574 check_spread(cfs_rq, se);
3575 if (se != cfs_rq->curr)
3576 __enqueue_entity(cfs_rq, se);
3577 se->on_rq = 1;
3578
3579 if (cfs_rq->nr_running == 1) {
3580 list_add_leaf_cfs_rq(cfs_rq);
3581 check_enqueue_throttle(cfs_rq);
3582 }
3583 }
3584
3585 static void __clear_buddies_last(struct sched_entity *se)
3586 {
3587 for_each_sched_entity(se) {
3588 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3589 if (cfs_rq->last != se)
3590 break;
3591
3592 cfs_rq->last = NULL;
3593 }
3594 }
3595
3596 static void __clear_buddies_next(struct sched_entity *se)
3597 {
3598 for_each_sched_entity(se) {
3599 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3600 if (cfs_rq->next != se)
3601 break;
3602
3603 cfs_rq->next = NULL;
3604 }
3605 }
3606
3607 static void __clear_buddies_skip(struct sched_entity *se)
3608 {
3609 for_each_sched_entity(se) {
3610 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3611 if (cfs_rq->skip != se)
3612 break;
3613
3614 cfs_rq->skip = NULL;
3615 }
3616 }
3617
3618 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3619 {
3620 if (cfs_rq->last == se)
3621 __clear_buddies_last(se);
3622
3623 if (cfs_rq->next == se)
3624 __clear_buddies_next(se);
3625
3626 if (cfs_rq->skip == se)
3627 __clear_buddies_skip(se);
3628 }
3629
3630 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3631
3632 static void
3633 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3634 {
3635 /*
3636 * Update run-time statistics of the 'current'.
3637 */
3638 update_curr(cfs_rq);
3639
3640 /*
3641 * When dequeuing a sched_entity, we must:
3642 * - Update loads to have both entity and cfs_rq synced with now.
3643 * - Subtract its load from the cfs_rq->runnable_avg.
3644 * - Subtract its previous weight from cfs_rq->load.weight.
3645 * - For group entity, update its weight to reflect the new share
3646 * of its group cfs_rq.
3647 */
3648 update_load_avg(se, UPDATE_TG);
3649 dequeue_entity_load_avg(cfs_rq, se);
3650
3651 update_stats_dequeue(cfs_rq, se);
3652 if (flags & DEQUEUE_SLEEP) {
3653 #ifdef CONFIG_SCHEDSTATS
3654 if (entity_is_task(se)) {
3655 struct task_struct *tsk = task_of(se);
3656
3657 if (tsk->state & TASK_INTERRUPTIBLE)
3658 se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3659 if (tsk->state & TASK_UNINTERRUPTIBLE)
3660 se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3661 }
3662 #endif
3663 }
3664
3665 clear_buddies(cfs_rq, se);
3666
3667 if (se != cfs_rq->curr)
3668 __dequeue_entity(cfs_rq, se);
3669 se->on_rq = 0;
3670 account_entity_dequeue(cfs_rq, se);
3671
3672 /*
3673 * Normalize the entity after updating the min_vruntime because the
3674 * update can refer to the ->curr item and we need to reflect this
3675 * movement in our normalized position.
3676 */
3677 if (!(flags & DEQUEUE_SLEEP))
3678 se->vruntime -= cfs_rq->min_vruntime;
3679
3680 /* return excess runtime on last dequeue */
3681 return_cfs_rq_runtime(cfs_rq);
3682
3683 update_min_vruntime(cfs_rq);
3684 update_cfs_shares(se);
3685 }
3686
3687 /*
3688 * Preempt the current task with a newly woken task if needed:
3689 */
3690 static void
3691 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3692 {
3693 unsigned long ideal_runtime, delta_exec;
3694 struct sched_entity *se;
3695 s64 delta;
3696
3697 ideal_runtime = sched_slice(cfs_rq, curr);
3698 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3699 if (delta_exec > ideal_runtime) {
3700 resched_curr(rq_of(cfs_rq));
3701 /*
3702 * The current task ran long enough, ensure it doesn't get
3703 * re-elected due to buddy favours.
3704 */
3705 clear_buddies(cfs_rq, curr);
3706 return;
3707 }
3708
3709 /*
3710 * Ensure that a task that missed wakeup preemption by a
3711 * narrow margin doesn't have to wait for a full slice.
3712 * This also mitigates buddy induced latencies under load.
3713 */
3714 if (delta_exec < sysctl_sched_min_granularity)
3715 return;
3716
3717 se = __pick_first_entity(cfs_rq);
3718 delta = curr->vruntime - se->vruntime;
3719
3720 if (delta < 0)
3721 return;
3722
3723 if (delta > ideal_runtime)
3724 resched_curr(rq_of(cfs_rq));
3725 }
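/*
 * Worked example (hypothetical numbers, default tunables assumed): with
 * three equally weighted tasks and a 6ms latency target, sched_slice()
 * hands each task an ideal_runtime of ~2ms. Once curr has run 2ms past
 * prev_sum_exec_runtime it is rescheduled. If it has run less than the
 * 0.75ms minimum granularity we never preempt here, and in between we
 * only preempt when curr's vruntime leads the leftmost entity by more
 * than ideal_runtime.
 */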
3726
3727 static void
3728 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3729 {
3730 /* 'current' is not kept within the tree. */
3731 if (se->on_rq) {
3732 /*
3733 * Any task has to be enqueued before it gets to execute on
3734 * a CPU. So account for the time it spent waiting on the
3735 * runqueue.
3736 */
3737 update_stats_wait_end(cfs_rq, se);
3738 __dequeue_entity(cfs_rq, se);
3739 update_load_avg(se, UPDATE_TG);
3740 }
3741
3742 update_stats_curr_start(cfs_rq, se);
3743 cfs_rq->curr = se;
3744 #ifdef CONFIG_SCHEDSTATS
3745 /*
3746 * Track our maximum slice length, if the CPU's load is at
3747 * least twice that of our own weight (i.e. don't track it
3748 * when there are only lesser-weight tasks around):
3749 */
3750 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3751 se->statistics.slice_max = max(se->statistics.slice_max,
3752 se->sum_exec_runtime - se->prev_sum_exec_runtime);
3753 }
3754 #endif
3755 se->prev_sum_exec_runtime = se->sum_exec_runtime;
3756 }
3757
3758 static int
3759 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3760
3761 /*
3762 * Pick the next process, keeping these things in mind, in this order:
3763 * 1) keep things fair between processes/task groups
3764 * 2) pick the "next" process, since someone really wants that to run
3765 * 3) pick the "last" process, for cache locality
3766 * 4) do not run the "skip" process, if something else is available
3767 */
3768 static struct sched_entity *
3769 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3770 {
3771 struct sched_entity *left = __pick_first_entity(cfs_rq);
3772 struct sched_entity *se;
3773
3774 /*
3775 * If curr is set we have to see if its left of the leftmost entity
3776 * still in the tree, provided there was anything in the tree at all.
3777 */
3778 if (!left || (curr && entity_before(curr, left)))
3779 left = curr;
3780
3781 se = left; /* ideally we run the leftmost entity */
3782
3783 /*
3784 * Avoid running the skip buddy, if running something else can
3785 * be done without getting too unfair.
3786 */
3787 if (cfs_rq->skip == se) {
3788 struct sched_entity *second;
3789
3790 if (se == curr) {
3791 second = __pick_first_entity(cfs_rq);
3792 } else {
3793 second = __pick_next_entity(se);
3794 if (!second || (curr && entity_before(curr, second)))
3795 second = curr;
3796 }
3797
3798 if (second && wakeup_preempt_entity(second, left) < 1)
3799 se = second;
3800 }
3801
3802 /*
3803 * Prefer last buddy, try to return the CPU to a preempted task.
3804 */
3805 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3806 se = cfs_rq->last;
3807
3808 /*
3809 * Someone really wants this to run. If it's not unfair, run it.
3810 */
3811 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3812 se = cfs_rq->next;
3813
3814 clear_buddies(cfs_rq, se);
3815
3816 return se;
3817 }
3818
3819 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3820
3821 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3822 {
3823 /*
3824 * If still on the runqueue then deactivate_task()
3825 * was not called and update_curr() has to be done:
3826 */
3827 if (prev->on_rq)
3828 update_curr(cfs_rq);
3829
3830 /* throttle cfs_rqs exceeding runtime */
3831 check_cfs_rq_runtime(cfs_rq);
3832
3833 check_spread(cfs_rq, prev);
3834 if (prev->on_rq) {
3835 update_stats_wait_start(cfs_rq, prev);
3836 /* Put 'current' back into the tree. */
3837 __enqueue_entity(cfs_rq, prev);
3838 /* in !on_rq case, update occurred at dequeue */
3839 update_load_avg(prev, 0);
3840 }
3841 cfs_rq->curr = NULL;
3842 }
3843
3844 static void
3845 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3846 {
3847 /*
3848 * Update run-time statistics of the 'current'.
3849 */
3850 update_curr(cfs_rq);
3851
3852 /*
3853 * Ensure that runnable average is periodically updated.
3854 */
3855 update_load_avg(curr, UPDATE_TG);
3856 update_cfs_shares(curr);
3857
3858 #ifdef CONFIG_SCHED_HRTICK
3859 /*
3860 * queued ticks are scheduled to match the slice, so don't bother
3861 * validating it and just reschedule.
3862 */
3863 if (queued) {
3864 resched_curr(rq_of(cfs_rq));
3865 return;
3866 }
3867 /*
3868 * don't let the period tick interfere with the hrtick preemption
3869 */
3870 if (!sched_feat(DOUBLE_TICK) &&
3871 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3872 return;
3873 #endif
3874
3875 if (cfs_rq->nr_running > 1)
3876 check_preempt_tick(cfs_rq, curr);
3877 }
3878
3879
3880 /**************************************************
3881 * CFS bandwidth control machinery
3882 */
3883
3884 #ifdef CONFIG_CFS_BANDWIDTH
3885
3886 #ifdef HAVE_JUMP_LABEL
3887 static struct static_key __cfs_bandwidth_used;
3888
3889 static inline bool cfs_bandwidth_used(void)
3890 {
3891 return static_key_false(&__cfs_bandwidth_used);
3892 }
3893
3894 void cfs_bandwidth_usage_inc(void)
3895 {
3896 static_key_slow_inc(&__cfs_bandwidth_used);
3897 }
3898
3899 void cfs_bandwidth_usage_dec(void)
3900 {
3901 static_key_slow_dec(&__cfs_bandwidth_used);
3902 }
3903 #else /* HAVE_JUMP_LABEL */
3904 static bool cfs_bandwidth_used(void)
3905 {
3906 return true;
3907 }
3908
3909 void cfs_bandwidth_usage_inc(void) {}
3910 void cfs_bandwidth_usage_dec(void) {}
3911 #endif /* HAVE_JUMP_LABEL */
3912
3913 /*
3914 * default period for cfs group bandwidth.
3915 * default: 0.1s, units: nanoseconds
3916 */
3917 static inline u64 default_cfs_period(void)
3918 {
3919 return 100000000ULL;
3920 }
3921
3922 static inline u64 sched_cfs_bandwidth_slice(void)
3923 {
3924 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3925 }
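/*
 * Example of how these two knobs combine (values are illustrative, not
 * taken from this file): a group with cfs_quota_us = 50000 against the
 * default 100ms period may consume 50ms of CPU time per period, i.e.
 * half a CPU. Each cfs_rq then pulls runtime from the global pool in
 * slices of sysctl_sched_cfs_bandwidth_slice (5ms by default), which
 * bounds how much unused runtime a single runqueue can hoard locally.
 */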
3926
3927 /*
3928 * Replenish runtime according to assigned quota and update expiration time.
3929 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3930 * additional synchronization around rq->lock.
3931 *
3932 * requires cfs_b->lock
3933 */
3934 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3935 {
3936 u64 now;
3937
3938 if (cfs_b->quota == RUNTIME_INF)
3939 return;
3940
3941 now = sched_clock_cpu(smp_processor_id());
3942 cfs_b->runtime = cfs_b->quota;
3943 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3944 }
3945
3946 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3947 {
3948 return &tg->cfs_bandwidth;
3949 }
3950
3951 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
3952 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3953 {
3954 if (unlikely(cfs_rq->throttle_count))
3955 return cfs_rq->throttled_clock_task;
3956
3957 return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3958 }
3959
3960 /* returns 0 on failure to allocate runtime */
3961 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3962 {
3963 struct task_group *tg = cfs_rq->tg;
3964 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3965 u64 amount = 0, min_amount, expires;
3966
3967 /* note: this is a positive sum as runtime_remaining <= 0 */
3968 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3969
3970 raw_spin_lock(&cfs_b->lock);
3971 if (cfs_b->quota == RUNTIME_INF)
3972 amount = min_amount;
3973 else {
3974 start_cfs_bandwidth(cfs_b);
3975
3976 if (cfs_b->runtime > 0) {
3977 amount = min(cfs_b->runtime, min_amount);
3978 cfs_b->runtime -= amount;
3979 cfs_b->idle = 0;
3980 }
3981 }
3982 expires = cfs_b->runtime_expires;
3983 raw_spin_unlock(&cfs_b->lock);
3984
3985 cfs_rq->runtime_remaining += amount;
3986 /*
3987 * we may have advanced our local expiration to account for allowed
3988 * spread between our sched_clock and the one on which runtime was
3989 * issued.
3990 */
3991 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3992 cfs_rq->runtime_expires = expires;
3993
3994 return cfs_rq->runtime_remaining > 0;
3995 }
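/*
 * Illustrative numbers (assuming the default 5ms slice): if a cfs_rq
 * arrives here with runtime_remaining at -2ms, min_amount becomes
 * 5ms - (-2ms) = 7ms, so it asks the global pool for enough to clear
 * the deficit plus one fresh slice. If the pool only has 3ms left it
 * gets those 3ms, ends up at +1ms remaining and may keep running; with
 * an empty pool it stays negative and the caller reschedules so the
 * hierarchy can be throttled.
 */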
3996
3997 /*
3998 * Note: This depends on the synchronization provided by sched_clock and the
3999 * fact that rq->clock snapshots this value.
4000 */
4001 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4002 {
4003 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4004
4005 /* if the deadline is ahead of our clock, nothing to do */
4006 if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4007 return;
4008
4009 if (cfs_rq->runtime_remaining < 0)
4010 return;
4011
4012 /*
4013 * If the local deadline has passed we have to consider the
4014 * possibility that our sched_clock is 'fast' and the global deadline
4015 * has not truly expired.
4016 *
4017 * Fortunately we can determine whether this is the case by checking
4018 * whether the global deadline has advanced. It is valid to compare
4019 * cfs_b->runtime_expires without any locks since we only care about
4020 * exact equality, so a partial write will still work.
4021 */
4022
4023 if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
4024 /* extend local deadline, drift is bounded above by 2 ticks */
4025 cfs_rq->runtime_expires += TICK_NSEC;
4026 } else {
4027 /* global deadline is ahead, expiration has passed */
4028 cfs_rq->runtime_remaining = 0;
4029 }
4030 }
4031
4032 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4033 {
4034 /* dock delta_exec before expiring quota (as it could span periods) */
4035 cfs_rq->runtime_remaining -= delta_exec;
4036 expire_cfs_rq_runtime(cfs_rq);
4037
4038 if (likely(cfs_rq->runtime_remaining > 0))
4039 return;
4040
4041 /*
4042 * if we're unable to extend our runtime we resched so that the active
4043 * hierarchy can be throttled
4044 */
4045 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4046 resched_curr(rq_of(cfs_rq));
4047 }
4048
4049 static __always_inline
4050 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4051 {
4052 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4053 return;
4054
4055 __account_cfs_rq_runtime(cfs_rq, delta_exec);
4056 }
4057
4058 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4059 {
4060 return cfs_bandwidth_used() && cfs_rq->throttled;
4061 }
4062
4063 /* check whether cfs_rq, or any parent, is throttled */
4064 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4065 {
4066 return cfs_bandwidth_used() && cfs_rq->throttle_count;
4067 }
4068
4069 /*
4070 * Ensure that neither of the group entities corresponding to src_cpu or
4071 * dest_cpu are members of a throttled hierarchy when performing group
4072 * load-balance operations.
4073 */
4074 static inline int throttled_lb_pair(struct task_group *tg,
4075 int src_cpu, int dest_cpu)
4076 {
4077 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4078
4079 src_cfs_rq = tg->cfs_rq[src_cpu];
4080 dest_cfs_rq = tg->cfs_rq[dest_cpu];
4081
4082 return throttled_hierarchy(src_cfs_rq) ||
4083 throttled_hierarchy(dest_cfs_rq);
4084 }
4085
4086 /* updated child weight may affect parent so we have to do this bottom up */
4087 static int tg_unthrottle_up(struct task_group *tg, void *data)
4088 {
4089 struct rq *rq = data;
4090 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4091
4092 cfs_rq->throttle_count--;
4093 #ifdef CONFIG_SMP
4094 if (!cfs_rq->throttle_count) {
4095 /* adjust cfs_rq_clock_task() */
4096 cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4097 cfs_rq->throttled_clock_task;
4098 }
4099 #endif
4100
4101 return 0;
4102 }
4103
4104 static int tg_throttle_down(struct task_group *tg, void *data)
4105 {
4106 struct rq *rq = data;
4107 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4108
4109 /* group is entering throttled state, stop time */
4110 if (!cfs_rq->throttle_count)
4111 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4112 cfs_rq->throttle_count++;
4113
4114 return 0;
4115 }
4116
4117 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4118 {
4119 struct rq *rq = rq_of(cfs_rq);
4120 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4121 struct sched_entity *se;
4122 long task_delta, dequeue = 1;
4123 bool empty;
4124
4125 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4126
4127 /* freeze hierarchy runnable averages while throttled */
4128 rcu_read_lock();
4129 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4130 rcu_read_unlock();
4131
4132 task_delta = cfs_rq->h_nr_running;
4133 for_each_sched_entity(se) {
4134 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4135 /* throttled entity or throttle-on-deactivate */
4136 if (!se->on_rq)
4137 break;
4138
4139 if (dequeue)
4140 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4141 qcfs_rq->h_nr_running -= task_delta;
4142
4143 if (qcfs_rq->load.weight)
4144 dequeue = 0;
4145 }
4146
4147 if (!se)
4148 sub_nr_running(rq, task_delta);
4149
4150 cfs_rq->throttled = 1;
4151 cfs_rq->throttled_clock = rq_clock(rq);
4152 raw_spin_lock(&cfs_b->lock);
4153 empty = list_empty(&cfs_b->throttled_cfs_rq);
4154
4155 /*
4156 * Add to the _head_ of the list, so that an already-started
4157 * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
4158 * not running, add to the tail so that later runqueues don't get starved.
4159 */
4160 if (cfs_b->distribute_running)
4161 list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4162 else
4163 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4164
4165 /*
4166 * If we're the first throttled task, make sure the bandwidth
4167 * timer is running.
4168 */
4169 if (empty)
4170 start_cfs_bandwidth(cfs_b);
4171
4172 raw_spin_unlock(&cfs_b->lock);
4173 }
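/*
 * Shape of a throttle, with made-up numbers: a group running 3 tasks on
 * this CPU has its group se dequeued from each ancestor cfs_rq in turn,
 * and every level's h_nr_running drops by 3. The walk stops dequeuing
 * at the first ancestor that still has other runnable entities
 * (load.weight != 0) but keeps fixing up h_nr_running above it, and,
 * provided no already-throttled ancestor cut the walk short,
 * rq->nr_running is reduced by the same 3 at the end.
 */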
4174
4175 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4176 {
4177 struct rq *rq = rq_of(cfs_rq);
4178 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4179 struct sched_entity *se;
4180 int enqueue = 1;
4181 long task_delta;
4182
4183 se = cfs_rq->tg->se[cpu_of(rq)];
4184
4185 cfs_rq->throttled = 0;
4186
4187 update_rq_clock(rq);
4188
4189 raw_spin_lock(&cfs_b->lock);
4190 cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4191 list_del_rcu(&cfs_rq->throttled_list);
4192 raw_spin_unlock(&cfs_b->lock);
4193
4194 /* update hierarchical throttle state */
4195 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4196
4197 if (!cfs_rq->load.weight)
4198 return;
4199
4200 task_delta = cfs_rq->h_nr_running;
4201 for_each_sched_entity(se) {
4202 if (se->on_rq)
4203 enqueue = 0;
4204
4205 cfs_rq = cfs_rq_of(se);
4206 if (enqueue)
4207 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4208 cfs_rq->h_nr_running += task_delta;
4209
4210 if (cfs_rq_throttled(cfs_rq))
4211 break;
4212 }
4213
4214 if (!se)
4215 add_nr_running(rq, task_delta);
4216
4217 /* determine whether we need to wake up potentially idle cpu */
4218 if (rq->curr == rq->idle && rq->cfs.nr_running)
4219 resched_curr(rq);
4220 }
4221
4222 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4223 u64 remaining, u64 expires)
4224 {
4225 struct cfs_rq *cfs_rq;
4226 u64 runtime;
4227 u64 starting_runtime = remaining;
4228
4229 rcu_read_lock();
4230 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4231 throttled_list) {
4232 struct rq *rq = rq_of(cfs_rq);
4233
4234 raw_spin_lock(&rq->lock);
4235 if (!cfs_rq_throttled(cfs_rq))
4236 goto next;
4237
4238 runtime = -cfs_rq->runtime_remaining + 1;
4239 if (runtime > remaining)
4240 runtime = remaining;
4241 remaining -= runtime;
4242
4243 cfs_rq->runtime_remaining += runtime;
4244 cfs_rq->runtime_expires = expires;
4245
4246 /* we check whether we're throttled above */
4247 if (cfs_rq->runtime_remaining > 0)
4248 unthrottle_cfs_rq(cfs_rq);
4249
4250 next:
4251 raw_spin_unlock(&rq->lock);
4252
4253 if (!remaining)
4254 break;
4255 }
4256 rcu_read_unlock();
4257
4258 return starting_runtime - remaining;
4259 }
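/*
 * Rough sketch of a distribution pass (hypothetical values): with 10ms
 * in the pool and three throttled runqueues whose runtime_remaining
 * deficits are 3ms, 4ms and 6ms, the first two receive just enough to
 * go positive (a hair over 3ms and 4ms) and are unthrottled; the third
 * only gets the ~3ms that is left, stays non-positive and remains
 * throttled until the next period or slack distribution. The return
 * value is the total handed out, which the callers subtract from
 * cfs_b->runtime.
 */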
4260
4261 /*
4262 * Responsible for refilling a task_group's bandwidth and unthrottling its
4263 * cfs_rqs as appropriate. If there has been no activity within the last
4264 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4265 * used to track this state.
4266 */
4267 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4268 {
4269 u64 runtime, runtime_expires;
4270 int throttled;
4271
4272 /* no need to continue the timer with no bandwidth constraint */
4273 if (cfs_b->quota == RUNTIME_INF)
4274 goto out_deactivate;
4275
4276 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4277 cfs_b->nr_periods += overrun;
4278
4279 /*
4280 * idle depends on !throttled (for the case of a large deficit), and if
4281 * we're going inactive then everything else can be deferred
4282 */
4283 if (cfs_b->idle && !throttled)
4284 goto out_deactivate;
4285
4286 __refill_cfs_bandwidth_runtime(cfs_b);
4287
4288 if (!throttled) {
4289 /* mark as potentially idle for the upcoming period */
4290 cfs_b->idle = 1;
4291 return 0;
4292 }
4293
4294 /* account preceding periods in which throttling occurred */
4295 cfs_b->nr_throttled += overrun;
4296
4297 runtime_expires = cfs_b->runtime_expires;
4298
4299 /*
4300 * This check is repeated as we are holding onto the new bandwidth while
4301 * we unthrottle. This can potentially race with an unthrottled group
4302 * trying to acquire new bandwidth from the global pool. This can result
4303 * in us over-using our runtime if it is all used during this loop, but
4304 * only by limited amounts in that extreme case.
4305 */
4306 while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4307 runtime = cfs_b->runtime;
4308 cfs_b->distribute_running = 1;
4309 raw_spin_unlock(&cfs_b->lock);
4310 /* we can't nest cfs_b->lock while distributing bandwidth */
4311 runtime = distribute_cfs_runtime(cfs_b, runtime,
4312 runtime_expires);
4313 raw_spin_lock(&cfs_b->lock);
4314
4315 cfs_b->distribute_running = 0;
4316 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4317
4318 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4319 }
4320
4321 /*
4322 * While we are ensured activity in the period following an
4323 * unthrottle, this also covers the case in which the new bandwidth is
4324 * insufficient to cover the existing bandwidth deficit. (Forcing the
4325 * timer to remain active while there are any throttled entities.)
4326 */
4327 cfs_b->idle = 0;
4328
4329 return 0;
4330
4331 out_deactivate:
4332 return 1;
4333 }
4334
4335 /* a cfs_rq won't donate quota below this amount */
4336 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4337 /* minimum remaining period time to redistribute slack quota */
4338 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4339 /* how long we wait to gather additional slack before distributing */
4340 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4341
4342 /*
4343 * Are we near the end of the current quota period?
4344 *
4345 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4346 * hrtimer base being cleared by hrtimer_start. In the case of
4347 * migrate_hrtimers, base is never cleared, so we are fine.
4348 */
4349 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4350 {
4351 struct hrtimer *refresh_timer = &cfs_b->period_timer;
4352 s64 remaining;
4353
4354 /* if the call-back is running a quota refresh is already occurring */
4355 if (hrtimer_callback_running(refresh_timer))
4356 return 1;
4357
4358 /* is a quota refresh about to occur? */
4359 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4360 if (remaining < (s64)min_expire)
4361 return 1;
4362
4363 return 0;
4364 }
4365
4366 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4367 {
4368 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4369
4370 /* if there's a quota refresh soon don't bother with slack */
4371 if (runtime_refresh_within(cfs_b, min_left))
4372 return;
4373
4374 hrtimer_start(&cfs_b->slack_timer,
4375 ns_to_ktime(cfs_bandwidth_slack_period),
4376 HRTIMER_MODE_REL);
4377 }
4378
4379 /* we know any runtime found here is valid as update_curr() precedes return */
4380 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4381 {
4382 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4383 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4384
4385 if (slack_runtime <= 0)
4386 return;
4387
4388 raw_spin_lock(&cfs_b->lock);
4389 if (cfs_b->quota != RUNTIME_INF &&
4390 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4391 cfs_b->runtime += slack_runtime;
4392
4393 /* we are under rq->lock, defer unthrottling using a timer */
4394 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4395 !list_empty(&cfs_b->throttled_cfs_rq))
4396 start_cfs_slack_bandwidth(cfs_b);
4397 }
4398 raw_spin_unlock(&cfs_b->lock);
4399
4400 /* even if it's not valid for return we don't want to try again */
4401 cfs_rq->runtime_remaining -= slack_runtime;
4402 }
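/*
 * Numeric sketch (illustrative only): a cfs_rq that dequeues its last
 * task while holding 4ms of runtime keeps min_cfs_rq_runtime (1ms) for
 * a quick re-wakeup and returns the 3ms of slack to the global pool.
 * If that brings cfs_b->runtime above one bandwidth slice and someone
 * is throttled, the slack timer is armed ~5ms out, unless a regular
 * period refresh is due in roughly the same window anyway.
 */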
4403
4404 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4405 {
4406 if (!cfs_bandwidth_used())
4407 return;
4408
4409 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4410 return;
4411
4412 __return_cfs_rq_runtime(cfs_rq);
4413 }
4414
4415 /*
4416 * This is done with a timer (instead of inline with bandwidth return) since
4417 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4418 */
4419 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4420 {
4421 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4422 u64 expires;
4423
4424 /* confirm we're still not at a refresh boundary */
4425 raw_spin_lock(&cfs_b->lock);
4426 if (cfs_b->distribute_running) {
4427 raw_spin_unlock(&cfs_b->lock);
4428 return;
4429 }
4430
4431 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4432 raw_spin_unlock(&cfs_b->lock);
4433 return;
4434 }
4435
4436 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4437 runtime = cfs_b->runtime;
4438
4439 expires = cfs_b->runtime_expires;
4440 if (runtime)
4441 cfs_b->distribute_running = 1;
4442
4443 raw_spin_unlock(&cfs_b->lock);
4444
4445 if (!runtime)
4446 return;
4447
4448 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4449
4450 raw_spin_lock(&cfs_b->lock);
4451 if (expires == cfs_b->runtime_expires)
4452 cfs_b->runtime -= min(runtime, cfs_b->runtime);
4453 cfs_b->distribute_running = 0;
4454 raw_spin_unlock(&cfs_b->lock);
4455 }
4456
4457 /*
4458 * When a group wakes up we want to make sure that its quota is not already
4459 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4460 * runtime as update_curr() throttling cannot trigger until it's on-rq.
4461 */
4462 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4463 {
4464 if (!cfs_bandwidth_used())
4465 return;
4466
4467 /* Synchronize hierarchical throttle counter: */
4468 if (unlikely(!cfs_rq->throttle_uptodate)) {
4469 struct rq *rq = rq_of(cfs_rq);
4470 struct cfs_rq *pcfs_rq;
4471 struct task_group *tg;
4472
4473 cfs_rq->throttle_uptodate = 1;
4474
4475 /* Get closest up-to-date node, because leaves go first: */
4476 for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
4477 pcfs_rq = tg->cfs_rq[cpu_of(rq)];
4478 if (pcfs_rq->throttle_uptodate)
4479 break;
4480 }
4481 if (tg) {
4482 cfs_rq->throttle_count = pcfs_rq->throttle_count;
4483 cfs_rq->throttled_clock_task = rq_clock_task(rq);
4484 }
4485 }
4486
4487 /* an active group must be handled by the update_curr()->put() path */
4488 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4489 return;
4490
4491 /* ensure the group is not already throttled */
4492 if (cfs_rq_throttled(cfs_rq))
4493 return;
4494
4495 /* update runtime allocation */
4496 account_cfs_rq_runtime(cfs_rq, 0);
4497 if (cfs_rq->runtime_remaining <= 0)
4498 throttle_cfs_rq(cfs_rq);
4499 }
4500
4501 /* conditionally throttle active cfs_rq's from put_prev_entity() */
4502 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4503 {
4504 if (!cfs_bandwidth_used())
4505 return false;
4506
4507 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4508 return false;
4509
4510 /*
4511 * it's possible for a throttled entity to be forced into a running
4512 * state (e.g. set_curr_task), in this case we're finished.
4513 */
4514 if (cfs_rq_throttled(cfs_rq))
4515 return true;
4516
4517 throttle_cfs_rq(cfs_rq);
4518 return true;
4519 }
4520
4521 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4522 {
4523 struct cfs_bandwidth *cfs_b =
4524 container_of(timer, struct cfs_bandwidth, slack_timer);
4525
4526 do_sched_cfs_slack_timer(cfs_b);
4527
4528 return HRTIMER_NORESTART;
4529 }
4530
4531 extern const u64 max_cfs_quota_period;
4532
4533 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4534 {
4535 struct cfs_bandwidth *cfs_b =
4536 container_of(timer, struct cfs_bandwidth, period_timer);
4537 int overrun;
4538 int idle = 0;
4539 int count = 0;
4540
4541 raw_spin_lock(&cfs_b->lock);
4542 for (;;) {
4543 overrun = hrtimer_forward_now(timer, cfs_b->period);
4544 if (!overrun)
4545 break;
4546
4547 if (++count > 3) {
4548 u64 new, old = ktime_to_ns(cfs_b->period);
4549
4550 /*
4551 * Grow period by a factor of 2 to avoid losing precision.
4552 * Precision loss in the quota/period ratio can cause __cfs_schedulable
4553 * to fail.
4554 */
4555 new = old * 2;
4556 if (new < max_cfs_quota_period) {
4557 cfs_b->period = ns_to_ktime(new);
4558 cfs_b->quota *= 2;
4559
4560 pr_warn_ratelimited(
4561 "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4562 smp_processor_id(),
4563 div_u64(new, NSEC_PER_USEC),
4564 div_u64(cfs_b->quota, NSEC_PER_USEC));
4565 } else {
4566 pr_warn_ratelimited(
4567 "cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4568 smp_processor_id(),
4569 div_u64(old, NSEC_PER_USEC),
4570 div_u64(cfs_b->quota, NSEC_PER_USEC));
4571 }
4572
4573 /* reset count so we don't come right back in here */
4574 count = 0;
4575 }
4576
4577 idle = do_sched_cfs_period_timer(cfs_b, overrun);
4578 }
4579 if (idle)
4580 cfs_b->period_active = 0;
4581 raw_spin_unlock(&cfs_b->lock);
4582
4583 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4584 }
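/*
 * Example of the scaling above (made-up starting point): a group
 * configured with a very short cfs_period_us = 100 and cfs_quota_us = 50
 * can make the timer re-expire faster than it can be serviced; after a
 * few consecutive overruns both values are doubled to 200/100, then
 * 400/200, and so on. The quota/period ratio, i.e. the fraction of a
 * CPU the group may use, is unchanged, and growth stops once the period
 * would exceed max_cfs_quota_period.
 */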
4585
4586 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4587 {
4588 raw_spin_lock_init(&cfs_b->lock);
4589 cfs_b->runtime = 0;
4590 cfs_b->quota = RUNTIME_INF;
4591 cfs_b->period = ns_to_ktime(default_cfs_period());
4592
4593 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4594 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4595 cfs_b->period_timer.function = sched_cfs_period_timer;
4596 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4597 cfs_b->slack_timer.function = sched_cfs_slack_timer;
4598 cfs_b->distribute_running = 0;
4599 }
4600
4601 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4602 {
4603 cfs_rq->runtime_enabled = 0;
4604 INIT_LIST_HEAD(&cfs_rq->throttled_list);
4605 }
4606
4607 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4608 {
4609 lockdep_assert_held(&cfs_b->lock);
4610
4611 if (!cfs_b->period_active) {
4612 cfs_b->period_active = 1;
4613 hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4614 hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4615 }
4616 }
4617
4618 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4619 {
4620 /* init_cfs_bandwidth() was not called */
4621 if (!cfs_b->throttled_cfs_rq.next)
4622 return;
4623
4624 hrtimer_cancel(&cfs_b->period_timer);
4625 hrtimer_cancel(&cfs_b->slack_timer);
4626 }
4627
4628 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4629 {
4630 struct cfs_rq *cfs_rq;
4631
4632 for_each_leaf_cfs_rq(rq, cfs_rq) {
4633 struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4634
4635 raw_spin_lock(&cfs_b->lock);
4636 cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4637 raw_spin_unlock(&cfs_b->lock);
4638 }
4639 }
4640
4641 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4642 {
4643 struct cfs_rq *cfs_rq;
4644
4645 for_each_leaf_cfs_rq(rq, cfs_rq) {
4646 if (!cfs_rq->runtime_enabled)
4647 continue;
4648
4649 /*
4650 * clock_task is not advancing so we just need to make sure
4651 * there's some valid quota amount
4652 */
4653 cfs_rq->runtime_remaining = 1;
4654 /*
4655 * Offline rq is schedulable till cpu is completely disabled
4656 * in take_cpu_down(), so we prevent new cfs throttling here.
4657 */
4658 cfs_rq->runtime_enabled = 0;
4659
4660 if (cfs_rq_throttled(cfs_rq))
4661 unthrottle_cfs_rq(cfs_rq);
4662 }
4663 }
4664
4665 #else /* CONFIG_CFS_BANDWIDTH */
4666 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4667 {
4668 return rq_clock_task(rq_of(cfs_rq));
4669 }
4670
4671 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
4672 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
4673 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
4674 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4675
4676 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4677 {
4678 return 0;
4679 }
4680
4681 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4682 {
4683 return 0;
4684 }
4685
4686 static inline int throttled_lb_pair(struct task_group *tg,
4687 int src_cpu, int dest_cpu)
4688 {
4689 return 0;
4690 }
4691
4692 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4693
4694 #ifdef CONFIG_FAIR_GROUP_SCHED
4695 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4696 #endif
4697
4698 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4699 {
4700 return NULL;
4701 }
4702 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4703 static inline void update_runtime_enabled(struct rq *rq) {}
4704 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4705
4706 #endif /* CONFIG_CFS_BANDWIDTH */
4707
4708 /**************************************************
4709 * CFS operations on tasks:
4710 */
4711
4712 #ifdef CONFIG_SCHED_HRTICK
4713 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4714 {
4715 struct sched_entity *se = &p->se;
4716 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4717
4718 WARN_ON(task_rq(p) != rq);
4719
4720 if (cfs_rq->nr_running > 1) {
4721 u64 slice = sched_slice(cfs_rq, se);
4722 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4723 s64 delta = slice - ran;
4724
4725 if (delta < 0) {
4726 if (rq->curr == p)
4727 resched_curr(rq);
4728 return;
4729 }
4730 hrtick_start(rq, delta);
4731 }
4732 }
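/*
 * Example (hypothetical numbers): if sched_slice() gives the current
 * task 4ms and it has already run 1.5ms since it was last put on the
 * CPU, the hrtimer is programmed 2.5ms out so the preemption check
 * fires right at the slice boundary instead of waiting for the next
 * regular tick. If the slice is already overrun (delta < 0) the task
 * is rescheduled immediately.
 */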
4733
4734 /*
4735 * called from enqueue/dequeue and updates the hrtick when the
4736 * current task is from our class and nr_running is low enough
4737 * to matter.
4738 */
4739 static void hrtick_update(struct rq *rq)
4740 {
4741 struct task_struct *curr = rq->curr;
4742
4743 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4744 return;
4745
4746 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4747 hrtick_start_fair(rq, curr);
4748 }
4749 #else /* !CONFIG_SCHED_HRTICK */
4750 static inline void
4751 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4752 {
4753 }
4754
4755 static inline void hrtick_update(struct rq *rq)
4756 {
4757 }
4758 #endif
4759
4760 #ifdef CONFIG_SMP
4761 static bool __cpu_overutilized(int cpu, int delta);
4762 static bool cpu_overutilized(int cpu);
4763 unsigned long boosted_cpu_util(int cpu);
4764 #else
4765 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
4766 #endif
4767
4768 /*
4769 * The enqueue_task method is called before nr_running is
4770 * increased. Here we update the fair scheduling stats and
4771 * then put the task into the rbtree:
4772 */
4773 static void
4774 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4775 {
4776 struct cfs_rq *cfs_rq;
4777 struct sched_entity *se = &p->se;
4778 #ifdef CONFIG_SMP
4779 int task_new = flags & ENQUEUE_WAKEUP_NEW;
4780 #endif
4781
4782 /*
4783 * If in_iowait is set, the code below may not trigger any cpufreq
4784 * utilization updates, so do it here explicitly with the IOWAIT flag
4785 * passed.
4786 */
4787 if (p->in_iowait)
4788 cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4789
4790 for_each_sched_entity(se) {
4791 if (se->on_rq)
4792 break;
4793 cfs_rq = cfs_rq_of(se);
4794 enqueue_entity(cfs_rq, se, flags);
4795
4796 /*
4797 * end evaluation on encountering a throttled cfs_rq
4798 *
4799 * note: in the case of encountering a throttled cfs_rq we will
4800 * post the final h_nr_running increment below.
4801 */
4802 if (cfs_rq_throttled(cfs_rq))
4803 break;
4804 cfs_rq->h_nr_running++;
4805 walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
4806
4807 flags = ENQUEUE_WAKEUP;
4808 }
4809
4810 for_each_sched_entity(se) {
4811 cfs_rq = cfs_rq_of(se);
4812 cfs_rq->h_nr_running++;
4813 walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
4814
4815 if (cfs_rq_throttled(cfs_rq))
4816 break;
4817
4818 update_load_avg(se, UPDATE_TG);
4819 update_cfs_shares(se);
4820 }
4821
4822 if (!se)
4823 add_nr_running(rq, 1);
4824
4825 #ifdef CONFIG_SMP
4826
4827 /*
4828 * Update SchedTune accounting.
4829 *
4830 * We do it before updating the CPU capacity to ensure the
4831 * boost value of the current task is accounted for in the
4832 * selection of the OPP.
4833 *
4834 * We do it also in the case where we enqueue a throttled task;
4835 * we could argue that a throttled task should not boost a CPU,
4836 * however:
4837 * a) properly implementing CPU boosting considering throttled
4838 * tasks will increase a lot the complexity of the solution
4839 * b) it's not easy to quantify the benefits introduced by
4840 * such a more complex solution.
4841 * Thus, for the time being we go for the simple solution and boost
4842 * also for throttled RQs.
4843 */
4844 schedtune_enqueue_task(p, cpu_of(rq));
4845
4846 if (!se) {
4847 walt_inc_cumulative_runnable_avg(rq, p);
4848 if (!task_new && !rq->rd->overutilized &&
4849 cpu_overutilized(rq->cpu)) {
4850 rq->rd->overutilized = true;
4851 trace_sched_overutilized(true);
4852 }
4853 }
4854
4855 #endif /* CONFIG_SMP */
4856 hrtick_update(rq);
4857 }
4858
4859 static void set_next_buddy(struct sched_entity *se);
4860
4861 /*
4862 * The dequeue_task method is called before nr_running is
4863 * decreased. We remove the task from the rbtree and
4864 * update the fair scheduling stats:
4865 */
4866 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4867 {
4868 struct cfs_rq *cfs_rq;
4869 struct sched_entity *se = &p->se;
4870 int task_sleep = flags & DEQUEUE_SLEEP;
4871
4872 for_each_sched_entity(se) {
4873 cfs_rq = cfs_rq_of(se);
4874 dequeue_entity(cfs_rq, se, flags);
4875
4876 /*
4877 * end evaluation on encountering a throttled cfs_rq
4878 *
4879 * note: in the case of encountering a throttled cfs_rq we will
4880 * post the final h_nr_running decrement below.
4881 */
4882 if (cfs_rq_throttled(cfs_rq))
4883 break;
4884 cfs_rq->h_nr_running--;
4885 walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
4886
4887 /* Don't dequeue parent if it has other entities besides us */
4888 if (cfs_rq->load.weight) {
4889 /* Avoid re-evaluating load for this entity: */
4890 se = parent_entity(se);
4891 /*
4892 * Bias pick_next to pick a task from this cfs_rq, as
4893 * p is sleeping when it is within its sched_slice.
4894 */
4895 if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4896 set_next_buddy(se);
4897 break;
4898 }
4899 flags |= DEQUEUE_SLEEP;
4900 }
4901
4902 for_each_sched_entity(se) {
4903 cfs_rq = cfs_rq_of(se);
4904 cfs_rq->h_nr_running--;
4905 walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
4906
4907 if (cfs_rq_throttled(cfs_rq))
4908 break;
4909
4910 update_load_avg(se, UPDATE_TG);
4911 update_cfs_shares(se);
4912 }
4913
4914 if (!se)
4915 sub_nr_running(rq, 1);
4916
4917 #ifdef CONFIG_SMP
4918
4919 /*
4920 * Update SchedTune accounting
4921 *
4922 * We do it before updating the CPU capacity to ensure the
4923 * boost value of the current task is accounted for in the
4924 * selection of the OPP.
4925 */
4926 schedtune_dequeue_task(p, cpu_of(rq));
4927
4928 if (!se)
4929 walt_dec_cumulative_runnable_avg(rq, p);
4930 #endif /* CONFIG_SMP */
4931
4932 hrtick_update(rq);
4933 }
4934
4935 #ifdef CONFIG_SMP
4936
4937 /*
4938 * per rq 'load' array crap; XXX kill this.
4939 */
4940
4941 /*
4942 * The exact cpuload at various idx values, calculated at every tick would be
4943 * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4944 *
4945 * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4946 * on nth tick when cpu may be busy, then we have:
4947 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4948 * load = ((2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4949 *
4950 * decay_load_missed() below does efficient calculation of
4951 * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4952 * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4953 *
4954 * The calculation is approximated on a 128 point scale.
4955 * degrade_zero_ticks is the number of ticks after which load at any
4956 * particular idx is approximated to be zero.
4957 * degrade_factor is a precomputed table, a row for each load idx.
4958 * Each column corresponds to degradation factor for a power of two ticks,
4959 * based on 128 point scale.
4960 * Example:
4961 * row 2, col 3 (=12) says that the degradation at load idx 2 after
4962 * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4963 *
4964 * With these power-of-2 load factors, we can degrade the load n times
4965 * by looking at 1 bits in n and doing as many mult/shift instead of
4966 * n mult/shifts needed by the exact degradation.
4967 */
4968 #define DEGRADE_SHIFT 7
4969 static const unsigned char
4970 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4971 static const unsigned char
4972 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4973 {0, 0, 0, 0, 0, 0, 0, 0},
4974 {64, 32, 8, 0, 0, 0, 0, 0},
4975 {96, 72, 40, 12, 1, 0, 0},
4976 {112, 98, 75, 43, 15, 1, 0},
4977 {120, 112, 98, 76, 45, 16, 2} };
4978
4979 /*
4980 * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4981 * would be when CPU is idle and so we just decay the old load without
4982 * adding any new load.
4983 */
4984 static unsigned long
4985 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4986 {
4987 int j = 0;
4988
4989 if (!missed_updates)
4990 return load;
4991
4992 if (missed_updates >= degrade_zero_ticks[idx])
4993 return 0;
4994
4995 if (idx == 1)
4996 return load >> missed_updates;
4997
4998 while (missed_updates) {
4999 if (missed_updates % 2)
5000 load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5001
5002 missed_updates >>= 1;
5003 j++;
5004 }
5005 return load;
5006 }
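/*
 * Worked example of the bit-wise decay (numbers only): decaying load at
 * idx 2 across 5 missed ticks walks the set bits of 5 = 0b101,
 * multiplying by degrade_factor[2][0] = 96 for the 1-tick column and
 * degrade_factor[2][2] = 40 for the 4-tick column:
 *
 *   load * 96 / 128 * 40 / 128  ~=  load * 0.234
 *
 * versus the exact (3/4)^5 ~= 0.237, i.e. two mult/shifts instead of
 * five.
 */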
5007
5008 /*
5009 * Update rq->cpu_load[] statistics. This function is usually called every
5010 * scheduler tick (TICK_NSEC). With tickless idle this will not be called
5011 * every tick. We fix it up based on jiffies.
5012 */
5013 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
5014 unsigned long pending_updates)
5015 {
5016 int i, scale;
5017
5018 this_rq->nr_load_updates++;
5019
5020 /* Update our load: */
5021 this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5022 for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5023 unsigned long old_load, new_load;
5024
5025 /* scale is effectively 1 << i now, and >> i divides by scale */
5026
5027 old_load = this_rq->cpu_load[i];
5028 old_load = decay_load_missed(old_load, pending_updates - 1, i);
5029 new_load = this_load;
5030 /*
5031 * Round up the averaging division if load is increasing. This
5032 * prevents us from getting stuck on 9 if the load is 10, for
5033 * example.
5034 */
5035 if (new_load > old_load)
5036 new_load += scale - 1;
5037
5038 this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5039 }
5040
5041 sched_avg_update(this_rq);
5042 }
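/*
 * Example of one update at idx 2 (scale = 4), with made-up numbers: an
 * old cpu_load[2] of 400 and a current load of 0 decays to
 * (400 * 3 + 0) >> 2 = 300, then 225, 168, ... i.e. roughly a 3/4
 * geometric decay per tick. idx 0 tracks the instantaneous load while
 * higher indices give progressively smoother views, used by the
 * source_load()/target_load() biasing below.
 */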
5043
5044 /* Used instead of source_load when we know the type == 0 */
5045 static unsigned long weighted_cpuload(const int cpu)
5046 {
5047 return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
5048 }
5049
5050 #ifdef CONFIG_NO_HZ_COMMON
5051 /*
5052 * There is no sane way to deal with nohz on smp when using jiffies because the
5053 * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5054 * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5055 *
5056 * Therefore we cannot use the delta approach from the regular tick since that
5057 * would seriously skew the load calculation. However we'll make do for those
5058 * updates happening while idle (nohz_idle_balance) or coming out of idle
5059 * (tick_nohz_idle_exit).
5060 *
5061 * This means we might still be one tick off for nohz periods.
5062 */
5063
5064 /*
5065 * Called from nohz_idle_balance() to update the load ratings before doing the
5066 * idle balance.
5067 */
5068 static void update_idle_cpu_load(struct rq *this_rq)
5069 {
5070 unsigned long curr_jiffies = READ_ONCE(jiffies);
5071 unsigned long load = weighted_cpuload(cpu_of(this_rq));
5072 unsigned long pending_updates;
5073
5074 /*
5075 * bail if there's load or we're actually up-to-date.
5076 */
5077 if (load || curr_jiffies == this_rq->last_load_update_tick)
5078 return;
5079
5080 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5081 this_rq->last_load_update_tick = curr_jiffies;
5082
5083 __update_cpu_load(this_rq, load, pending_updates);
5084 }
5085
5086 /*
5087 * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
5088 */
5089 void update_cpu_load_nohz(void)
5090 {
5091 struct rq *this_rq = this_rq();
5092 unsigned long curr_jiffies = READ_ONCE(jiffies);
5093 unsigned long pending_updates;
5094
5095 if (curr_jiffies == this_rq->last_load_update_tick)
5096 return;
5097
5098 raw_spin_lock(&this_rq->lock);
5099 pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5100 if (pending_updates) {
5101 this_rq->last_load_update_tick = curr_jiffies;
5102 /*
5103 * We were idle, this means load 0, the current load might be
5104 * !0 due to remote wakeups and the like.
5105 */
5106 __update_cpu_load(this_rq, 0, pending_updates);
5107 }
5108 raw_spin_unlock(&this_rq->lock);
5109 }
5110 #endif /* CONFIG_NO_HZ */
5111
5112 /*
5113 * Called from scheduler_tick()
5114 */
5115 void update_cpu_load_active(struct rq *this_rq)
5116 {
5117 unsigned long load = weighted_cpuload(cpu_of(this_rq));
5118 /*
5119 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
5120 */
5121 this_rq->last_load_update_tick = jiffies;
5122 __update_cpu_load(this_rq, load, 1);
5123 }
5124
5125 /*
5126 * Return a low guess at the load of a migration-source cpu weighted
5127 * according to the scheduling class and "nice" value.
5128 *
5129 * We want to under-estimate the load of migration sources, to
5130 * balance conservatively.
5131 */
5132 static unsigned long source_load(int cpu, int type)
5133 {
5134 struct rq *rq = cpu_rq(cpu);
5135 unsigned long total = weighted_cpuload(cpu);
5136
5137 if (type == 0 || !sched_feat(LB_BIAS))
5138 return total;
5139
5140 return min(rq->cpu_load[type-1], total);
5141 }
5142
5143 /*
5144 * Return a high guess at the load of a migration-target cpu weighted
5145 * according to the scheduling class and "nice" value.
5146 */
5147 static unsigned long target_load(int cpu, int type)
5148 {
5149 struct rq *rq = cpu_rq(cpu);
5150 unsigned long total = weighted_cpuload(cpu);
5151
5152 if (type == 0 || !sched_feat(LB_BIAS))
5153 return total;
5154
5155 return max(rq->cpu_load[type-1], total);
5156 }
5157
5158
5159 static unsigned long cpu_avg_load_per_task(int cpu)
5160 {
5161 struct rq *rq = cpu_rq(cpu);
5162 unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5163 unsigned long load_avg = weighted_cpuload(cpu);
5164
5165 if (nr_running)
5166 return load_avg / nr_running;
5167
5168 return 0;
5169 }
5170
5171 static void record_wakee(struct task_struct *p)
5172 {
5173 /*
5174 * Rough decay (wiping) for cost saving, don't worry
5175 * about the boundary, really active task won't care
5176 * about the loss.
5177 */
5178 if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5179 current->wakee_flips >>= 1;
5180 current->wakee_flip_decay_ts = jiffies;
5181 }
5182
5183 if (current->last_wakee != p) {
5184 current->last_wakee = p;
5185 current->wakee_flips++;
5186 }
5187 }
5188
5189 static void task_waking_fair(struct task_struct *p)
5190 {
5191 struct sched_entity *se = &p->se;
5192 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5193 u64 min_vruntime;
5194
5195 #ifndef CONFIG_64BIT
5196 u64 min_vruntime_copy;
5197
5198 do {
5199 min_vruntime_copy = cfs_rq->min_vruntime_copy;
5200 smp_rmb();
5201 min_vruntime = cfs_rq->min_vruntime;
5202 } while (min_vruntime != min_vruntime_copy);
5203 #else
5204 min_vruntime = cfs_rq->min_vruntime;
5205 #endif
5206
5207 se->vruntime -= min_vruntime;
5208 record_wakee(p);
5209 }
5210
5211 #ifdef CONFIG_FAIR_GROUP_SCHED
5212 /*
5213 * effective_load() calculates the load change as seen from the root_task_group
5214 *
5215 * Adding load to a group doesn't make a group heavier, but can cause movement
5216 * of group shares between cpus. Assuming the shares were perfectly aligned one
5217 * can calculate the shift in shares.
5218 *
5219 * Calculate the effective load difference if @wl is added (subtracted) to @tg
5220 * on this @cpu and results in a total addition (subtraction) of @wg to the
5221 * total group weight.
5222 *
5223 * Given a runqueue weight distribution (rw_i) we can compute a shares
5224 * distribution (s_i) using:
5225 *
5226 * s_i = rw_i / \Sum rw_j (1)
5227 *
5228 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
5229 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
5230 * shares distribution (s_i):
5231 *
5232 * rw_i = { 2, 4, 1, 0 }
5233 * s_i = { 2/7, 4/7, 1/7, 0 }
5234 *
5235 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
5236 * task used to run on and the CPU the waker is running on), we need to
5237 * compute the effect of waking a task on either CPU and, in case of a sync
5238 * wakeup, compute the effect of the current task going to sleep.
5239 *
5240 * So for a change of @wl to the local @cpu with an overall group weight change
5241 * of @wl we can compute the new shares distribution (s'_i) using:
5242 *
5243 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
5244 *
5245 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
5246 * differences in waking a task to CPU 0. The additional task changes the
5247 * weight and shares distributions like:
5248 *
5249 * rw'_i = { 3, 4, 1, 0 }
5250 * s'_i = { 3/8, 4/8, 1/8, 0 }
5251 *
5252 * We can then compute the difference in effective weight by using:
5253 *
5254 * dw_i = S * (s'_i - s_i) (3)
5255 *
5256 * Where 'S' is the group weight as seen by its parent.
5257 *
5258 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
5259 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
5260 * 4/7) times the weight of the group.
5261 */
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5263 {
5264 struct sched_entity *se = tg->se[cpu];
5265
5266 if (!tg->parent) /* the trivial, non-cgroup case */
5267 return wl;
5268
5269 for_each_sched_entity(se) {
5270 struct cfs_rq *cfs_rq = se->my_q;
5271 long W, w = cfs_rq_load_avg(cfs_rq);
5272
5273 tg = cfs_rq->tg;
5274
5275 /*
5276 * W = @wg + \Sum rw_j
5277 */
5278 W = wg + atomic_long_read(&tg->load_avg);
5279
5280 /* Ensure \Sum rw_j >= rw_i */
5281 W -= cfs_rq->tg_load_avg_contrib;
5282 W += w;
5283
5284 /*
5285 * w = rw_i + @wl
5286 */
5287 w += wl;
5288
5289 /*
5290 * wl = S * s'_i; see (2)
5291 */
5292 if (W > 0 && w < W)
5293 wl = (w * (long)tg->shares) / W;
5294 else
5295 wl = tg->shares;
5296
5297 /*
5298 * Per the above, wl is the new se->load.weight value; since
5299 * those are clipped to [MIN_SHARES, ...) do so now. See
5300 * calc_cfs_shares().
5301 */
5302 if (wl < MIN_SHARES)
5303 wl = MIN_SHARES;
5304
5305 /*
5306 * wl = dw_i = S * (s'_i - s_i); see (3)
5307 */
5308 wl -= se->avg.load_avg;
5309
5310 /*
5311 * Recursively apply this logic to all parent groups to compute
5312 * the final effective load change on the root group. Since
5313 * only the @tg group gets extra weight, all parent groups can
5314 * only redistribute existing shares. @wl is the shift in shares
5315 * resulting from this level per the above.
5316 */
5317 wg = 0;
5318 }
5319
5320 return wl;
5321 }
5322 #else
5323
static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5325 {
5326 return wl;
5327 }
5328
5329 #endif
5330
5331 /*
5332 * Returns the current capacity of cpu after applying both
5333 * cpu and freq scaling.
5334 */
unsigned long capacity_curr_of(int cpu)
5336 {
5337 return cpu_rq(cpu)->cpu_capacity_orig *
5338 arch_scale_freq_capacity(NULL, cpu)
5339 >> SCHED_CAPACITY_SHIFT;
5340 }
5341
static inline bool energy_aware(void)
5343 {
5344 return sched_feat(ENERGY_AWARE);
5345 }
5346
5347 struct energy_env {
5348 struct sched_group *sg_top;
5349 struct sched_group *sg_cap;
5350 int cap_idx;
5351 int util_delta;
5352 int src_cpu;
5353 int dst_cpu;
5354 int trg_cpu;
5355 int energy;
5356 int payoff;
5357 struct task_struct *task;
5358 struct {
5359 int before;
5360 int after;
5361 int delta;
5362 int diff;
5363 } nrg;
5364 struct {
5365 int before;
5366 int after;
5367 int delta;
5368 } cap;
5369 };
5370
5371 static int cpu_util_wake(int cpu, struct task_struct *p);
5372
5373 /*
5374 * __cpu_norm_util() returns the cpu util relative to a specific capacity,
 * i.e. its busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
5376 * energy calculations.
5377 *
5378 * Since util is a scale-invariant utilization defined as:
5379 *
5380 * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
5381 *
5382 * the normalized util can be found using the specific capacity.
5383 *
5384 * capacity = capacity_orig * curr_freq/max_freq
5385 *
5386 * norm_util = running_time/time ~ util/capacity
5387 */
static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
5389 {
5390 if (util >= capacity)
5391 return SCHED_CAPACITY_SCALE;
5392
5393 return (util << SCHED_CAPACITY_SHIFT)/capacity;
5394 }
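/*
 * Illustrative example: with SCHED_CAPACITY_SHIFT = 10, a CPU currently
 * running at capacity = 512 with util = 256 gives
 * __cpu_norm_util() = (256 << 10) / 512 = 512, i.e. a 50% busy ratio;
 * once util >= capacity the result saturates at SCHED_CAPACITY_SCALE (1024).
 */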
5395
static unsigned long group_max_util(struct energy_env *eenv)
5397 {
5398 unsigned long max_util = 0;
5399 unsigned long util;
5400 int cpu;
5401
5402 for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
5403 util = cpu_util_wake(cpu, eenv->task);
5404
5405 /*
5406 * If we are looking at the target CPU specified by the eenv,
5407 * then we should add the (estimated) utilization of the task
5408 * assuming we will wake it up on that CPU.
5409 */
5410 if (unlikely(cpu == eenv->trg_cpu))
5411 util += eenv->util_delta;
5412
5413 max_util = max(max_util, util);
5414 }
5415
5416 return max_util;
5417 }
5418
5419 /*
 * group_norm_util() returns the approximated group util relative to its
5421 * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
5422 * in energy calculations.
5423 *
5424 * Since task executions may or may not overlap in time in the group the true
5425 * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
5426 * when iterating over all CPUs in the group.
5427 * The latter estimate is used as it leads to a more pessimistic energy
5428 * estimate (more busy).
5429 */
static unsigned
long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
5432 {
5433 unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
5434 unsigned long util, util_sum = 0;
5435 int cpu;
5436
5437 for_each_cpu(cpu, sched_group_cpus(sg)) {
5438 util = cpu_util_wake(cpu, eenv->task);
5439
5440 /*
5441 * If we are looking at the target CPU specified by the eenv,
5442 * then we should add the (estimated) utilization of the task
5443 * assuming we will wake it up on that CPU.
5444 */
5445 if (unlikely(cpu == eenv->trg_cpu))
5446 util += eenv->util_delta;
5447
5448 util_sum += __cpu_norm_util(util, capacity);
5449 }
5450
5451 return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
5452 }
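/*
 * Illustrative example: for a group of two CPUs whose selected capacity
 * state is 512, with post-wakeup utilizations of 100 and 200,
 * group_norm_util() = (100 << 10) / 512 + (200 << 10) / 512 = 200 + 400 = 600,
 * i.e. the group is treated as roughly 59% busy for the energy estimate.
 */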
5453
static int find_new_capacity(struct energy_env *eenv,
	const struct sched_group_energy * const sge)
5456 {
5457 int idx, max_idx = sge->nr_cap_states - 1;
5458 unsigned long util = group_max_util(eenv);
5459
5460 /* default is max_cap if we don't find a match */
5461 eenv->cap_idx = max_idx;
5462
5463 for (idx = 0; idx < sge->nr_cap_states; idx++) {
5464 if (sge->cap_states[idx].cap >= util) {
5465 eenv->cap_idx = idx;
5466 break;
5467 }
5468 }
5469
5470 return eenv->cap_idx;
5471 }
5472
static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
5474 {
5475 int i, state = INT_MAX;
5476 int src_in_grp, dst_in_grp;
5477 long grp_util = 0;
5478
5479 /* Find the shallowest idle state in the sched group. */
5480 for_each_cpu(i, sched_group_cpus(sg))
5481 state = min(state, idle_get_state_idx(cpu_rq(i)));
5482
5483 /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
5484 state++;
5485
5486 src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
5487 dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
5488 if (src_in_grp == dst_in_grp) {
5489 /* both CPUs under consideration are in the same group or not in
5490 * either group, migration should leave idle state the same.
5491 */
5492 goto end;
5493 }
5494
5495 /*
5496 * Try to estimate if a deeper idle state is
5497 * achievable when we move the task.
5498 */
5499 for_each_cpu(i, sched_group_cpus(sg)) {
5500 grp_util += cpu_util_wake(i, eenv->task);
5501 if (unlikely(i == eenv->trg_cpu))
5502 grp_util += eenv->util_delta;
5503 }
5504
5505 if (grp_util <=
5506 ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
5507 /* after moving, this group is at most partly
5508 * occupied, so it should have some idle time.
5509 */
5510 int max_idle_state_idx = sg->sge->nr_idle_states - 2;
5511 int new_state = grp_util * max_idle_state_idx;
5512 if (grp_util <= 0)
5513 /* group will have no util, use lowest state */
5514 new_state = max_idle_state_idx + 1;
5515 else {
5516 /* for partially idle, linearly map util to idle
5517 * states, excluding the lowest one. This does not
5518 * correspond to the state we expect to enter in
5519 * reality, but an indication of what might happen.
5520 */
5521 new_state = min(max_idle_state_idx, (int)
5522 (new_state / sg->sgc->max_capacity));
5523 new_state = max_idle_state_idx - new_state;
5524 }
5525 state = new_state;
5526 } else {
5527 /* After moving, the group will be fully occupied
5528 * so assume it will not be idle at all.
5529 */
5530 state = 0;
5531 }
5532 end:
5533 return state;
5534 }
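/*
 * Illustrative example: for a group of 4 CPUs with max_capacity = 1024 and
 * nr_idle_states = 3, max_idle_state_idx = 1. A post-move grp_util of 1024
 * (one CPU's worth) maps to new_state = min(1, (1024 * 1) / 1024) = 1 and
 * hence state = 1 - 1 = 0, while a nearly idle group (grp_util close to 0
 * but > 0) maps to state = 1; more utilization predicts a shallower group
 * idle state.
 */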
5535
5536 /*
5537 * sched_group_energy(): Computes the absolute energy consumption of cpus
5538 * belonging to the sched_group including shared resources shared only by
5539 * members of the group. Iterates over all cpus in the hierarchy below the
 * sched_group starting from the bottom working its way up before going to
5541 * the next cpu until all cpus are covered at all levels. The current
5542 * implementation is likely to gather the same util statistics multiple times.
5543 * This can probably be done in a faster but more complex way.
5544 * Note: sched_group_energy() may fail when racing with sched_domain updates.
5545 */
static int sched_group_energy(struct energy_env *eenv)
5547 {
5548 struct cpumask visit_cpus;
5549 u64 total_energy = 0;
5550 int cpu_count;
5551
5552 WARN_ON(!eenv->sg_top->sge);
5553
5554 cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
5555 /* If a cpu is hotplugged in while we are in this function,
5556 * it does not appear in the existing visit_cpus mask
5557 * which came from the sched_group pointer of the
5558 * sched_domain pointed at by sd_ea for either the prev
5559 * or next cpu and was dereferenced in __energy_diff.
5560 * Since we will dereference sd_scs later as we iterate
5561 * through the CPUs we expect to visit, new CPUs can
5562 * be present which are not in the visit_cpus mask.
5563 * Guard this with cpu_count.
5564 */
5565 cpu_count = cpumask_weight(&visit_cpus);
5566
5567 while (!cpumask_empty(&visit_cpus)) {
5568 struct sched_group *sg_shared_cap = NULL;
5569 int cpu = cpumask_first(&visit_cpus);
5570 struct sched_domain *sd;
5571
5572 /*
5573 * Is the group utilization affected by cpus outside this
5574 * sched_group?
5575 * This sd may have groups with cpus which were not present
5576 * when we took visit_cpus.
5577 */
5578 sd = rcu_dereference(per_cpu(sd_scs, cpu));
5579
5580 if (sd && sd->parent)
5581 sg_shared_cap = sd->parent->groups;
5582
5583 for_each_domain(cpu, sd) {
5584 struct sched_group *sg = sd->groups;
5585
5586 /* Has this sched_domain already been visited? */
5587 if (sd->child && group_first_cpu(sg) != cpu)
5588 break;
5589
5590 do {
5591 unsigned long group_util;
5592 int sg_busy_energy, sg_idle_energy;
5593 int cap_idx, idle_idx;
5594
5595 if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
5596 eenv->sg_cap = sg_shared_cap;
5597 else
5598 eenv->sg_cap = sg;
5599
5600 cap_idx = find_new_capacity(eenv, sg->sge);
5601
5602 if (sg->group_weight == 1) {
5603 /* Remove capacity of src CPU (before task move) */
5604 if (eenv->trg_cpu == eenv->src_cpu &&
5605 cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
5606 eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
5607 eenv->cap.delta -= eenv->cap.before;
5608 }
5609 /* Add capacity of dst CPU (after task move) */
5610 if (eenv->trg_cpu == eenv->dst_cpu &&
5611 cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
5612 eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
5613 eenv->cap.delta += eenv->cap.after;
5614 }
5615 }
5616
5617 idle_idx = group_idle_state(eenv, sg);
5618 group_util = group_norm_util(eenv, sg);
5619
5620 sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
5621 sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
5622 * sg->sge->idle_states[idle_idx].power);
5623
5624 total_energy += sg_busy_energy + sg_idle_energy;
5625
5626 if (!sd->child) {
5627 /*
5628 * cpu_count here is the number of
5629 * cpus we expect to visit in this
5630 * calculation. If we race against
5631 * hotplug, we can have extra cpus
5632 * added to the groups we are
5633 * iterating which do not appear in
5634 * the visit_cpus mask. In that case
5635 * we are not able to calculate energy
5636 * without restarting so we will bail
5637 * out and use prev_cpu this time.
5638 */
5639 if (!cpu_count)
5640 return -EINVAL;
5641 cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
5642 cpu_count--;
5643 }
5644
5645 if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
5646 goto next_cpu;
5647
5648 } while (sg = sg->next, sg != sd->groups);
5649 }
5650
5651 /*
5652 * If we raced with hotplug and got an sd NULL-pointer;
5653 * returning a wrong energy estimation is better than
5654 * entering an infinite loop.
5655 * Specifically: If a cpu is unplugged after we took
5656 * the visit_cpus mask, it no longer has an sd_scs
5657 * pointer, so when we dereference it, we get NULL.
5658 */
5659 if (cpumask_test_cpu(cpu, &visit_cpus))
5660 return -EINVAL;
5661 next_cpu:
5662 cpumask_clear_cpu(cpu, &visit_cpus);
5663 continue;
5664 }
5665
5666 eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
5667 return 0;
5668 }
5669
static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
5671 {
5672 return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
5673 }
5674
5675 static inline unsigned long task_util(struct task_struct *p);
5676
5677 /*
5678 * energy_diff(): Estimate the energy impact of changing the utilization
5679 * distribution. eenv specifies the change: utilisation amount, source, and
5680 * destination cpu. Source or destination cpu may be -1 in which case the
5681 * utilization is removed from or added to the system (e.g. task wake-up). If
5682 * both are specified, the utilization is migrated.
5683 */
static inline int __energy_diff(struct energy_env *eenv)
5685 {
5686 struct sched_domain *sd;
5687 struct sched_group *sg;
5688 int sd_cpu = -1, energy_before = 0, energy_after = 0;
5689 int diff, margin;
5690
5691 struct energy_env eenv_before = {
5692 .util_delta = task_util(eenv->task),
5693 .src_cpu = eenv->src_cpu,
5694 .dst_cpu = eenv->dst_cpu,
5695 .trg_cpu = eenv->src_cpu,
5696 .nrg = { 0, 0, 0, 0},
5697 .cap = { 0, 0, 0 },
5698 .task = eenv->task,
5699 };
5700
5701 if (eenv->src_cpu == eenv->dst_cpu)
5702 return 0;
5703
5704 sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
5705 sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
5706
5707 if (!sd)
5708 return 0; /* Error */
5709
5710 sg = sd->groups;
5711
5712 do {
5713 if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
5714 eenv_before.sg_top = eenv->sg_top = sg;
5715
5716 if (sched_group_energy(&eenv_before))
5717 return 0; /* Invalid result abort */
5718 energy_before += eenv_before.energy;
5719
5720 /* Keep track of SRC cpu (before) capacity */
5721 eenv->cap.before = eenv_before.cap.before;
5722 eenv->cap.delta = eenv_before.cap.delta;
5723
5724 if (sched_group_energy(eenv))
5725 return 0; /* Invalid result abort */
5726 energy_after += eenv->energy;
5727 }
5728 } while (sg = sg->next, sg != sd->groups);
5729
5730 eenv->nrg.before = energy_before;
5731 eenv->nrg.after = energy_after;
5732 eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
5733 eenv->payoff = 0;
5734 #ifndef CONFIG_SCHED_TUNE
5735 trace_sched_energy_diff(eenv->task,
5736 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5737 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5738 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5739 eenv->nrg.delta, eenv->payoff);
5740 #endif
5741 /*
5742 * Dead-zone margin preventing too many migrations.
5743 */
5744
5745 margin = eenv->nrg.before >> 6; /* ~1.56% */
5746
5747 diff = eenv->nrg.after - eenv->nrg.before;
5748
5749 eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
5750
5751 return eenv->nrg.diff;
5752 }
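/*
 * Illustrative note on the dead-zone in __energy_diff(): with
 * nrg.before = 6400 the margin is 6400 >> 6 = 100, so any estimated saving
 * or cost smaller than 100 energy units (~1.56%) is reported as 0 and does
 * not, by itself, trigger a migration.
 */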
5753
5754 #ifdef CONFIG_SCHED_TUNE
5755
5756 struct target_nrg schedtune_target_nrg;
5757
5758 #ifdef CONFIG_CGROUP_SCHEDTUNE
5759 extern bool schedtune_initialized;
5760 #endif /* CONFIG_CGROUP_SCHEDTUNE */
5761
5762 /*
5763 * System energy normalization
5764 * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
5765 * corresponding to the specified energy variation.
5766 */
5767 static inline int
normalize_energy(int energy_diff)
5769 {
5770 u32 normalized_nrg;
5771
5772 #ifdef CONFIG_CGROUP_SCHEDTUNE
5773 /* during early setup, we don't know the extents */
5774 if (unlikely(!schedtune_initialized))
5775 return energy_diff < 0 ? -1 : 1 ;
5776 #endif /* CONFIG_CGROUP_SCHEDTUNE */
5777
5778 #ifdef CONFIG_SCHED_DEBUG
5779 {
5780 int max_delta;
5781
5782 /* Check for boundaries */
5783 max_delta = schedtune_target_nrg.max_power;
5784 max_delta -= schedtune_target_nrg.min_power;
5785 WARN_ON(abs(energy_diff) >= max_delta);
5786 }
5787 #endif
5788
5789 /* Do scaling using positive numbers to increase the range */
5790 normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
5791
5792 /* Scale by energy magnitude */
5793 normalized_nrg <<= SCHED_CAPACITY_SHIFT;
5794
5795 /* Normalize on max energy for target platform */
5796 normalized_nrg = reciprocal_divide(
5797 normalized_nrg, schedtune_target_nrg.rdiv);
5798
5799 return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
5800 }
5801
5802 static inline int
energy_diff(struct energy_env *eenv)
5804 {
5805 int boost = schedtune_task_boost(eenv->task);
5806 int nrg_delta;
5807
	/* Compute "absolute" energy diff */
5809 __energy_diff(eenv);
5810
5811 /* Return energy diff when boost margin is 0 */
5812 if (boost == 0) {
5813 trace_sched_energy_diff(eenv->task,
5814 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5815 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5816 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5817 0, -eenv->nrg.diff);
5818 return eenv->nrg.diff;
5819 }
5820
5821 /* Compute normalized energy diff */
5822 nrg_delta = normalize_energy(eenv->nrg.diff);
5823 eenv->nrg.delta = nrg_delta;
5824
5825 eenv->payoff = schedtune_accept_deltas(
5826 eenv->nrg.delta,
5827 eenv->cap.delta,
5828 eenv->task);
5829
5830 trace_sched_energy_diff(eenv->task,
5831 eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5832 eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5833 eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5834 eenv->nrg.delta, eenv->payoff);
5835
5836 /*
5837 * When SchedTune is enabled, the energy_diff() function will return
5838 * the computed energy payoff value. Since the energy_diff() return
5839 * value is expected to be negative by its callers, this evaluation
 * function returns a negative value each time the evaluation returns a
 * positive payoff, which is the condition for the acceptance of
 * a scheduling decision.
5843 */
5844 return -eenv->payoff;
5845 }
5846 #else /* CONFIG_SCHED_TUNE */
5847 #define energy_diff(eenv) __energy_diff(eenv)
5848 #endif
5849
5850 /*
5851 * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5852 * A waker of many should wake a different task than the one last awakened
5853 * at a frequency roughly N times higher than one of its wakees. In order
 * to determine whether we should let the load spread vs consolidating to
 * shared cache, we look for a minimum 'flip' frequency of llc_size in one
 * partner, and a factor of llc_size higher frequency in the other. With
5857 * both conditions met, we can be relatively sure that the relationship is
5858 * non-monogamous, with partner count exceeding socket size. Waker/wakee
5859 * being client/server, worker/dispatcher, interrupt source or whatever is
 * irrelevant; the spread criterion is simply that the apparent partner
 * count exceeds the socket size.
5861 */
static int wake_wide(struct task_struct *p, int sibling_count_hint)
5863 {
5864 unsigned int master = current->wakee_flips;
5865 unsigned int slave = p->wakee_flips;
5866 int llc_size = this_cpu_read(sd_llc_size);
5867
5868 if (sibling_count_hint >= llc_size)
5869 return 1;
5870
5871 if (master < slave)
5872 swap(master, slave);
5873 if (slave < llc_size || master < slave * llc_size)
5874 return 0;
5875 return 1;
5876 }
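/*
 * Illustrative example: with llc_size = 4, a waker with wakee_flips = 20
 * waking a task with wakee_flips = 5 gives master = 20, slave = 5; since
 * slave >= llc_size and master >= slave * llc_size (20 >= 20), wake_wide()
 * returns 1 and the wakeup is spread rather than pulled onto the waker's LLC.
 */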
5877
static int wake_affine(struct sched_domain *sd, struct task_struct *p,
	int prev_cpu, int sync)
5880 {
5881 s64 this_load, load;
5882 s64 this_eff_load, prev_eff_load;
5883 int idx, this_cpu;
5884 struct task_group *tg;
5885 unsigned long weight;
5886 int balanced;
5887
5888 idx = sd->wake_idx;
5889 this_cpu = smp_processor_id();
5890 load = source_load(prev_cpu, idx);
5891 this_load = target_load(this_cpu, idx);
5892
5893 /*
5894 * If sync wakeup then subtract the (maximum possible)
5895 * effect of the currently running task from the load
5896 * of the current CPU:
5897 */
5898 if (sync) {
5899 tg = task_group(current);
5900 weight = current->se.avg.load_avg;
5901
5902 this_load += effective_load(tg, this_cpu, -weight, -weight);
5903 load += effective_load(tg, prev_cpu, 0, -weight);
5904 }
5905
5906 tg = task_group(p);
5907 weight = p->se.avg.load_avg;
5908
5909 /*
5910 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5911 * due to the sync cause above having dropped this_load to 0, we'll
5912 * always have an imbalance, but there's really nothing you can do
5913 * about that, so that's good too.
5914 *
5915 * Otherwise check if either cpus are near enough in load to allow this
5916 * task to be woken on this_cpu.
5917 */
5918 this_eff_load = 100;
5919 this_eff_load *= capacity_of(prev_cpu);
5920
5921 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5922 prev_eff_load *= capacity_of(this_cpu);
5923
5924 if (this_load > 0) {
5925 this_eff_load *= this_load +
5926 effective_load(tg, this_cpu, weight, weight);
5927
5928 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5929 }
5930
5931 balanced = this_eff_load <= prev_eff_load;
5932
5933 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
5934
5935 if (!balanced)
5936 return 0;
5937
5938 schedstat_inc(sd, ttwu_move_affine);
5939 schedstat_inc(p, se.statistics.nr_wakeups_affine);
5940
5941 return 1;
5942 }
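/*
 * Illustrative note on the wake_affine() balance check: assuming an
 * imbalance_pct of 125, the comparison above is effectively
 * 100 * capacity(prev_cpu) * this_load <= 112 * capacity(this_cpu) * prev_load,
 * i.e. the waking CPU is accepted as long as its (capacity-scaled) load is
 * within roughly 12% of the previous CPU's load.
 */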
5943
static inline unsigned long task_util(struct task_struct *p)
5945 {
5946 #ifdef CONFIG_SCHED_WALT
5947 if (!walt_disabled && sysctl_sched_use_walt_task_util) {
5948 unsigned long demand = p->ravg.demand;
5949 return (demand << 10) / walt_ravg_window;
5950 }
5951 #endif
5952 return p->se.avg.util_avg;
5953 }
5954
5955 static inline unsigned long boosted_task_util(struct task_struct *task);
5956
static inline bool __task_fits(struct task_struct *p, int cpu, int util)
5958 {
5959 unsigned long capacity = capacity_of(cpu);
5960
5961 util += boosted_task_util(p);
5962
5963 return (capacity * 1024) > (util * capacity_margin);
5964 }
5965
static inline bool task_fits_max(struct task_struct *p, int cpu)
5967 {
5968 unsigned long capacity = capacity_of(cpu);
5969 unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
5970
5971 if (capacity == max_capacity)
5972 return true;
5973
5974 if (capacity * capacity_margin > max_capacity * 1024)
5975 return true;
5976
5977 return __task_fits(p, cpu, 0);
5978 }
5979
static bool __cpu_overutilized(int cpu, int delta)
5981 {
5982 return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
5983 }
5984
static bool cpu_overutilized(int cpu)
5986 {
5987 return __cpu_overutilized(cpu, 0);
5988 }
5989
5990 #ifdef CONFIG_SCHED_TUNE
5991
5992 struct reciprocal_value schedtune_spc_rdiv;
5993
5994 static long
schedtune_margin(unsigned long signal, long boost)
5996 {
5997 long long margin = 0;
5998
5999 /*
6000 * Signal proportional compensation (SPC)
6001 *
6002 * The Boost (B) value is used to compute a Margin (M) which is
6003 * proportional to the complement of the original Signal (S):
6004 * M = B * (SCHED_CAPACITY_SCALE - S)
6005 * The obtained M could be used by the caller to "boost" S.
6006 */
6007 if (boost >= 0) {
6008 margin = SCHED_CAPACITY_SCALE - signal;
6009 margin *= boost;
6010 } else
6011 margin = -signal * boost;
6012
6013 margin = reciprocal_divide(margin, schedtune_spc_rdiv);
6014
6015 if (boost < 0)
6016 margin *= -1;
6017 return margin;
6018 }
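/*
 * Illustrative example (assuming schedtune_spc_rdiv encodes a divide by 100,
 * i.e. boost is a percentage): for boost = 10 and signal = 512,
 * margin = 10 * (1024 - 512) / 100 = 51, so the boosted signal becomes
 * 512 + 51 = 563; for boost = -10 the result is -51, shrinking the signal
 * towards 0 instead.
 */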
6019
6020 static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
6022 {
6023 int boost = schedtune_cpu_boost(cpu);
6024
6025 if (boost == 0)
6026 return 0;
6027
6028 return schedtune_margin(util, boost);
6029 }
6030
6031 static inline long
schedtune_task_margin(struct task_struct *task)
6033 {
6034 int boost = schedtune_task_boost(task);
6035 unsigned long util;
6036 long margin;
6037
6038 if (boost == 0)
6039 return 0;
6040
6041 util = task_util(task);
6042 margin = schedtune_margin(util, boost);
6043
6044 return margin;
6045 }
6046
6047 #else /* CONFIG_SCHED_TUNE */
6048
6049 static inline int
schedtune_cpu_margin(unsigned long util, int cpu)
6051 {
6052 return 0;
6053 }
6054
6055 static inline int
schedtune_task_margin(struct task_struct *task)
6057 {
6058 return 0;
6059 }
6060
6061 #endif /* CONFIG_SCHED_TUNE */
6062
6063 unsigned long
boosted_cpu_util(int cpu)
6065 {
6066 unsigned long util = cpu_util_freq(cpu);
6067 long margin = schedtune_cpu_margin(util, cpu);
6068
6069 trace_sched_boost_cpu(cpu, util, margin);
6070
6071 return util + margin;
6072 }
6073
6074 static inline unsigned long
boosted_task_util(struct task_struct *task)
6076 {
6077 unsigned long util = task_util(task);
6078 long margin = schedtune_task_margin(task);
6079
6080 trace_sched_boost_task(task, util, margin);
6081
6082 return util + margin;
6083 }
6084
static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
6086 {
6087 return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
6088 }
6089
6090 /*
6091 * find_idlest_group finds and returns the least busy CPU group within the
6092 * domain.
6093 *
6094 * Assumes p is allowed on at least one CPU in sd.
6095 */
6096 static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
	int this_cpu, int sd_flag)
6099 {
6100 struct sched_group *idlest = NULL, *group = sd->groups;
6101 struct sched_group *most_spare_sg = NULL;
6102 unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
6103 unsigned long most_spare = 0, this_spare = 0;
6104 int load_idx = sd->forkexec_idx;
6105 int imbalance = 100 + (sd->imbalance_pct-100)/2;
6106
6107 if (sd_flag & SD_BALANCE_WAKE)
6108 load_idx = sd->wake_idx;
6109
6110 do {
6111 unsigned long load, avg_load, spare_cap, max_spare_cap;
6112 int local_group;
6113 int i;
6114
6115 /* Skip over this group if it has no CPUs allowed */
6116 if (!cpumask_intersects(sched_group_cpus(group),
6117 tsk_cpus_allowed(p)))
6118 continue;
6119
6120 local_group = cpumask_test_cpu(this_cpu,
6121 sched_group_cpus(group));
6122
6123 /*
6124 * Tally up the load of all CPUs in the group and find
6125 * the group containing the CPU with most spare capacity.
6126 */
6127 avg_load = 0;
6128 max_spare_cap = 0;
6129
6130 for_each_cpu(i, sched_group_cpus(group)) {
6131 /* Bias balancing toward cpus of our domain */
6132 if (local_group)
6133 load = source_load(i, load_idx);
6134 else
6135 load = target_load(i, load_idx);
6136
6137 avg_load += load;
6138
6139 spare_cap = capacity_spare_wake(i, p);
6140
6141 if (spare_cap > max_spare_cap)
6142 max_spare_cap = spare_cap;
6143 }
6144
6145 /* Adjust by relative CPU capacity of the group */
6146 avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
6147
6148 if (local_group) {
6149 this_load = avg_load;
6150 this_spare = max_spare_cap;
6151 } else {
6152 if (avg_load < min_load) {
6153 min_load = avg_load;
6154 idlest = group;
6155 }
6156
6157 if (most_spare < max_spare_cap) {
6158 most_spare = max_spare_cap;
6159 most_spare_sg = group;
6160 }
6161 }
6162 } while (group = group->next, group != sd->groups);
6163
6164 /*
6165 * The cross-over point between using spare capacity or least load
6166 * is too conservative for high utilization tasks on partially
6167 * utilized systems if we require spare_capacity > task_util(p),
6168 * so we allow for some task stuffing by using
6169 * spare_capacity > task_util(p)/2.
6170 *
6171 * Spare capacity can't be used for fork because the utilization has
6172 * not been set yet, we must first select a rq to compute the initial
6173 * utilization.
6174 */
6175 if (sd_flag & SD_BALANCE_FORK)
6176 goto skip_spare;
6177
6178 if (this_spare > task_util(p) / 2 &&
6179 imbalance*this_spare > 100*most_spare)
6180 return NULL;
6181 else if (most_spare > task_util(p) / 2)
6182 return most_spare_sg;
6183
6184 skip_spare:
6185 if (!idlest || 100*this_load < imbalance*min_load)
6186 return NULL;
6187 return idlest;
6188 }
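/*
 * Illustrative example (with imbalance_pct = 125, i.e. imbalance = 112): for
 * a task with util 200, a local group with this_spare = 150 and a remote
 * group with most_spare = 120, this_spare > task_util/2 and
 * 112 * 150 > 100 * 120, so we stay local and return NULL; if this_spare
 * dropped to 90, most_spare > task_util/2 would make us return the remote
 * most_spare_sg instead.
 */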
6189
6190 /*
6191 * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
6192 */
6193 static int
find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6195 {
6196 unsigned long load, min_load = ULONG_MAX;
6197 unsigned int min_exit_latency = UINT_MAX;
6198 u64 latest_idle_timestamp = 0;
6199 int least_loaded_cpu = this_cpu;
6200 int shallowest_idle_cpu = -1;
6201 int i;
6202
6203 /* Check if we have any choice: */
6204 if (group->group_weight == 1)
6205 return cpumask_first(sched_group_cpus(group));
6206
6207 /* Traverse only the allowed CPUs */
6208 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
6209 if (idle_cpu(i)) {
6210 struct rq *rq = cpu_rq(i);
6211 struct cpuidle_state *idle = idle_get_state(rq);
6212 if (idle && idle->exit_latency < min_exit_latency) {
6213 /*
6214 * We give priority to a CPU whose idle state
6215 * has the smallest exit latency irrespective
6216 * of any idle timestamp.
6217 */
6218 min_exit_latency = idle->exit_latency;
6219 latest_idle_timestamp = rq->idle_stamp;
6220 shallowest_idle_cpu = i;
6221 } else if ((!idle || idle->exit_latency == min_exit_latency) &&
6222 rq->idle_stamp > latest_idle_timestamp) {
6223 /*
6224 * If equal or no active idle state, then
6225 * the most recently idled CPU might have
6226 * a warmer cache.
6227 */
6228 latest_idle_timestamp = rq->idle_stamp;
6229 shallowest_idle_cpu = i;
6230 }
6231 } else if (shallowest_idle_cpu == -1) {
6232 load = weighted_cpuload(i);
6233 if (load < min_load || (load == min_load && i == this_cpu)) {
6234 min_load = load;
6235 least_loaded_cpu = i;
6236 }
6237 }
6238 }
6239
6240 return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6241 }
6242
static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
	int cpu, int prev_cpu, int sd_flag)
6245 {
6246 int new_cpu = cpu;
6247 int wu = sd_flag & SD_BALANCE_WAKE;
6248 int cas_cpu = -1;
6249
6250 if (wu) {
6251 schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
6252 schedstat_inc(this_rq(), eas_stats.cas_attempts);
6253 }
6254
6255 if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6256 return prev_cpu;
6257
6258 while (sd) {
6259 struct sched_group *group;
6260 struct sched_domain *tmp;
6261 int weight;
6262
6263 if (wu)
6264 schedstat_inc(sd, eas_stats.cas_attempts);
6265
6266 if (!(sd->flags & sd_flag)) {
6267 sd = sd->child;
6268 continue;
6269 }
6270
6271 group = find_idlest_group(sd, p, cpu, sd_flag);
6272 if (!group) {
6273 sd = sd->child;
6274 continue;
6275 }
6276
6277 new_cpu = find_idlest_group_cpu(group, p, cpu);
6278 if (new_cpu == cpu) {
6279 /* Now try balancing at a lower domain level of cpu */
6280 sd = sd->child;
6281 continue;
6282 }
6283
6284 /* Now try balancing at a lower domain level of new_cpu */
6285 cpu = cas_cpu = new_cpu;
6286 weight = sd->span_weight;
6287 sd = NULL;
6288 for_each_domain(cpu, tmp) {
6289 if (weight <= tmp->span_weight)
6290 break;
6291 if (tmp->flags & sd_flag)
6292 sd = tmp;
6293 }
6294 /* while loop will break here if sd == NULL */
6295 }
6296
6297 if (wu && (cas_cpu >= 0)) {
6298 schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
6299 schedstat_inc(this_rq(), eas_stats.cas_count);
6300 }
6301
6302 return new_cpu;
6303 }
6304
6305 /*
6306 * Try and locate an idle CPU in the sched_domain.
6307 */
static int select_idle_sibling(struct task_struct *p, int prev, int target)
6309 {
6310 struct sched_domain *sd;
6311 struct sched_group *sg;
6312 int best_idle_cpu = -1;
6313 int best_idle_cstate = INT_MAX;
6314 unsigned long best_idle_capacity = ULONG_MAX;
6315
6316 schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
6317 schedstat_inc(this_rq(), eas_stats.sis_attempts);
6318
6319 if (!sysctl_sched_cstate_aware) {
6320 if (idle_cpu(target)) {
6321 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
6322 schedstat_inc(this_rq(), eas_stats.sis_idle);
6323 return target;
6324 }
6325
6326 /*
 * If the previous cpu is cache affine and idle, don't be stupid.
6328 */
6329 if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
6330 schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
6331 schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
6332 return prev;
6333 }
6334 }
6335
6336 /*
 * Otherwise, iterate the domains and find an eligible idle cpu.
6338 */
6339 sd = rcu_dereference(per_cpu(sd_llc, target));
6340 for_each_lower_domain(sd) {
6341 sg = sd->groups;
6342 do {
6343 int i;
6344 if (!cpumask_intersects(sched_group_cpus(sg),
6345 tsk_cpus_allowed(p)))
6346 goto next;
6347
6348 if (sysctl_sched_cstate_aware) {
6349 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
6350 int idle_idx = idle_get_state_idx(cpu_rq(i));
6351 unsigned long new_usage = boosted_task_util(p);
6352 unsigned long capacity_orig = capacity_orig_of(i);
6353
6354 if (new_usage > capacity_orig || !idle_cpu(i))
6355 goto next;
6356
6357 if (i == target && new_usage <= capacity_curr_of(target)) {
6358 schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
6359 schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
6360 schedstat_inc(sd, eas_stats.sis_suff_cap);
6361 return target;
6362 }
6363
6364 if (idle_idx < best_idle_cstate &&
6365 capacity_orig <= best_idle_capacity) {
6366 best_idle_cpu = i;
6367 best_idle_cstate = idle_idx;
6368 best_idle_capacity = capacity_orig;
6369 }
6370 }
6371 } else {
6372 for_each_cpu(i, sched_group_cpus(sg)) {
6373 if (i == target || !idle_cpu(i))
6374 goto next;
6375 }
6376
6377 target = cpumask_first_and(sched_group_cpus(sg),
6378 tsk_cpus_allowed(p));
6379 schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
6380 schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
6381 schedstat_inc(sd, eas_stats.sis_idle_cpu);
6382 goto done;
6383 }
6384 next:
6385 sg = sg->next;
6386 } while (sg != sd->groups);
6387 }
6388
6389 if (best_idle_cpu >= 0)
6390 target = best_idle_cpu;
6391
6392 done:
6393 schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
6394 schedstat_inc(this_rq(), eas_stats.sis_count);
6395
6396 return target;
6397 }
6398
6399 /*
6400 * cpu_util_wake: Compute cpu utilization with any contributions from
6401 * the waking task p removed. check_for_migration() looks for a better CPU of
6402 * rq->curr. For that case we should return cpu util with contributions from
6403 * currently running task p removed.
6404 */
static int cpu_util_wake(int cpu, struct task_struct *p)
6406 {
6407 unsigned long util, capacity;
6408
6409 #ifdef CONFIG_SCHED_WALT
6410 /*
6411 * WALT does not decay idle tasks in the same manner
6412 * as PELT, so it makes little sense to subtract task
6413 * utilization from cpu utilization. Instead just use
6414 * cpu_util for this case.
6415 */
6416 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
6417 p->state == TASK_WAKING)
6418 return cpu_util(cpu);
6419 #endif
6420 /* Task has no contribution or is new */
6421 if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
6422 return cpu_util(cpu);
6423
6424 capacity = capacity_orig_of(cpu);
6425 util = max_t(long, cpu_util(cpu) - task_util(p), 0);
6426
6427 return (util >= capacity) ? capacity : util;
6428 }
6429
static int start_cpu(bool boosted)
6431 {
6432 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6433
6434 return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
6435 }
6436
static inline int find_best_target(struct task_struct *p, int *backup_cpu,
	bool boosted, bool prefer_idle)
6439 {
6440 unsigned long best_idle_min_cap_orig = ULONG_MAX;
6441 unsigned long min_util = boosted_task_util(p);
6442 unsigned long target_capacity = ULONG_MAX;
6443 unsigned long min_wake_util = ULONG_MAX;
6444 unsigned long target_max_spare_cap = 0;
6445 unsigned long best_active_util = ULONG_MAX;
6446 int best_idle_cstate = INT_MAX;
6447 struct sched_domain *sd;
6448 struct sched_group *sg;
6449 int best_active_cpu = -1;
6450 int best_idle_cpu = -1;
6451 int target_cpu = -1;
6452 int cpu, i;
6453
6454 *backup_cpu = -1;
6455
6456 schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
6457 schedstat_inc(this_rq(), eas_stats.fbt_attempts);
6458
6459 /* Find start CPU based on boost value */
6460 cpu = start_cpu(boosted);
6461 if (cpu < 0) {
6462 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
6463 schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
6464 return -1;
6465 }
6466
6467 /* Find SD for the start CPU */
6468 sd = rcu_dereference(per_cpu(sd_ea, cpu));
6469 if (!sd) {
6470 schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
6471 schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
6472 return -1;
6473 }
6474
6475 /* Scan CPUs in all SDs */
6476 sg = sd->groups;
6477 do {
6478 for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
6479 unsigned long capacity_curr = capacity_curr_of(i);
6480 unsigned long capacity_orig = capacity_orig_of(i);
6481 unsigned long wake_util, new_util;
6482
6483 if (!cpu_online(i))
6484 continue;
6485
6486 if (walt_cpu_high_irqload(i))
6487 continue;
6488
6489 /*
6490 * p's blocked utilization is still accounted for on prev_cpu
6491 * so prev_cpu will receive a negative bias due to the double
6492 * accounting. However, the blocked utilization may be zero.
6493 */
6494 wake_util = cpu_util_wake(i, p);
6495 new_util = wake_util + task_util(p);
6496
6497 /*
6498 * Ensure minimum capacity to grant the required boost.
6499 * The target CPU can be already at a capacity level higher
6500 * than the one required to boost the task.
6501 */
6502 new_util = max(min_util, new_util);
6503 if (new_util > capacity_orig)
6504 continue;
6505
6506 /*
6507 * Case A) Latency sensitive tasks
6508 *
6509 * Unconditionally favoring tasks that prefer idle CPU to
6510 * improve latency.
6511 *
6512 * Looking for:
6513 * - an idle CPU, whatever its idle_state is, since
6514 * the first CPUs we explore are more likely to be
6515 * reserved for latency sensitive tasks.
6516 * - a non idle CPU where the task fits in its current
6517 * capacity and has the maximum spare capacity.
6518 * - a non idle CPU with lower contention from other
6519 * tasks and running at the lowest possible OPP.
6520 *
 * The last two goals try to favor a non idle CPU
6522 * where the task can run as if it is "almost alone".
6523 * A maximum spare capacity CPU is favoured since
6524 * the task already fits into that CPU's capacity
 * without waiting for an OPP change.
6526 *
6527 * The following code path is the only one in the CPUs
6528 * exploration loop which is always used by
 * prefer_idle tasks. It exits the loop with either a
6530 * best_active_cpu or a target_cpu which should
6531 * represent an optimal choice for latency sensitive
6532 * tasks.
6533 */
6534 if (prefer_idle) {
6535
6536 /*
6537 * Case A.1: IDLE CPU
6538 * Return the first IDLE CPU we find.
6539 */
6540 if (idle_cpu(i)) {
6541 schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
6542 schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
6543
6544 trace_sched_find_best_target(p,
6545 prefer_idle, min_util,
6546 cpu, best_idle_cpu,
6547 best_active_cpu, i);
6548
6549 return i;
6550 }
6551
6552 /*
6553 * Case A.2: Target ACTIVE CPU
6554 * Favor CPUs with max spare capacity.
6555 */
6556 if ((capacity_curr > new_util) &&
6557 (capacity_orig - new_util > target_max_spare_cap)) {
6558 target_max_spare_cap = capacity_orig - new_util;
6559 target_cpu = i;
6560 continue;
6561 }
6562 if (target_cpu != -1)
6563 continue;
6564
6565
6566 /*
6567 * Case A.3: Backup ACTIVE CPU
6568 * Favor CPUs with:
6569 * - lower utilization due to other tasks
6570 * - lower utilization with the task in
6571 */
6572 if (wake_util > min_wake_util)
6573 continue;
6574 if (new_util > best_active_util)
6575 continue;
6576 min_wake_util = wake_util;
6577 best_active_util = new_util;
6578 best_active_cpu = i;
6579 continue;
6580 }
6581
6582 /*
6583 * Enforce EAS mode
6584 *
6585 * For non latency sensitive tasks, skip CPUs that
6586 * will be overutilized by moving the task there.
6587 *
6588 * The goal here is to remain in EAS mode as long as
6589 * possible at least for !prefer_idle tasks.
6590 */
6591 if ((new_util * capacity_margin) >
6592 (capacity_orig * SCHED_CAPACITY_SCALE))
6593 continue;
6594
6595 /*
6596 * Case B) Non latency sensitive tasks on IDLE CPUs.
6597 *
6598 * Find an optimal backup IDLE CPU for non latency
6599 * sensitive tasks.
6600 *
6601 * Looking for:
6602 * - minimizing the capacity_orig,
6603 * i.e. preferring LITTLE CPUs
6604 * - favoring shallowest idle states
 * i.e. avoid waking up deep-idle CPUs
6606 *
6607 * The following code path is used by non latency
 * sensitive tasks if IDLE CPUs are available. If at
 * least one such CPU is available, it sets
 * best_idle_cpu to the most suitable idle CPU to be
 * selected.
6612 *
6613 * If idle CPUs are available, favour these CPUs to
 * improve performance by spreading tasks.
6615 * Indeed, the energy_diff() computed by the caller
6616 * will take care to ensure the minimization of energy
 * consumption without affecting performance.
6618 */
6619 if (idle_cpu(i)) {
6620 int idle_idx = idle_get_state_idx(cpu_rq(i));
6621
6622 /* Select idle CPU with lower cap_orig */
6623 if (capacity_orig > best_idle_min_cap_orig)
6624 continue;
6625
6626 /*
6627 * Skip CPUs in deeper idle state, but only
6628 * if they are also less energy efficient.
6629 * IOW, prefer a deep IDLE LITTLE CPU vs a
6630 * shallow idle big CPU.
6631 */
6632 if (sysctl_sched_cstate_aware &&
6633 best_idle_cstate <= idle_idx)
6634 continue;
6635
6636 /* Keep track of best idle CPU */
6637 best_idle_min_cap_orig = capacity_orig;
6638 best_idle_cstate = idle_idx;
6639 best_idle_cpu = i;
6640 continue;
6641 }
6642
6643 /*
6644 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6645 *
6646 * Pack tasks in the most energy efficient capacities.
6647 *
6648 * This task packing strategy prefers more energy
6649 * efficient CPUs (i.e. pack on smaller maximum
6650 * capacity CPUs) while also trying to spread tasks to
6651 * run them all at the lower OPP.
6652 *
6653 * This assumes for example that it's more energy
6654 * efficient to run two tasks on two CPUs at a lower
6655 * OPP than packing both on a single CPU but running
 * that CPU at a higher OPP.
6657 *
 * Thus, this case keeps track of the CPU with the
6659 * smallest maximum capacity and highest spare maximum
6660 * capacity.
6661 */
6662
6663 /* Favor CPUs with smaller capacity */
6664 if (capacity_orig > target_capacity)
6665 continue;
6666
6667 /* Favor CPUs with maximum spare capacity */
6668 if ((capacity_orig - new_util) < target_max_spare_cap)
6669 continue;
6670
6671 target_max_spare_cap = capacity_orig - new_util;
6672 target_capacity = capacity_orig;
6673 target_cpu = i;
6674 }
6675
6676 } while (sg = sg->next, sg != sd->groups);
6677
6678 /*
6679 * For non latency sensitive tasks, cases B and C in the previous loop,
 * we pick the best IDLE CPU only if we were not able to find a target
6681 * ACTIVE CPU.
6682 *
6683 * Policies priorities:
6684 *
6685 * - prefer_idle tasks:
6686 *
6687 * a) IDLE CPU available, we return immediately
6688 * b) ACTIVE CPU where task fits and has the bigger maximum spare
6689 * capacity (i.e. target_cpu)
6690 * c) ACTIVE CPU with less contention due to other tasks
6691 * (i.e. best_active_cpu)
6692 *
6693 * - NON prefer_idle tasks:
6694 *
6695 * a) ACTIVE CPU: target_cpu
6696 * b) IDLE CPU: best_idle_cpu
6697 */
6698 if (target_cpu == -1)
6699 target_cpu = prefer_idle
6700 ? best_active_cpu
6701 : best_idle_cpu;
6702 else
6703 *backup_cpu = prefer_idle
6704 ? best_active_cpu
6705 : best_idle_cpu;
6706
6707 trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
6708 best_idle_cpu, best_active_cpu,
6709 target_cpu);
6710
6711 schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
6712 schedstat_inc(this_rq(), eas_stats.fbt_count);
6713
6714 return target_cpu;
6715 }
6716
6717 /*
6718 * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6719 * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6720 *
6721 * In that case WAKE_AFFINE doesn't make sense and we'll let
6722 * BALANCE_WAKE sort things out.
6723 */
static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6725 {
6726 long min_cap, max_cap;
6727
6728 min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6729 max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6730
6731 /* Minimum capacity is close to max, no need to abort wake_affine */
6732 if (max_cap - min_cap < max_cap >> 3)
6733 return 0;
6734
6735 /* Bring task utilization in sync with prev_cpu */
6736 sync_entity_load_avg(&p->se);
6737
6738 return min_cap * 1024 < task_util(p) * capacity_margin;
6739 }
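/*
 * Illustrative example (assuming capacity_margin = 1280, i.e. ~25% required
 * headroom): on a system with capacity_orig values of 1024 and 512,
 * max_cap - min_cap = 512 >= (1024 >> 3), so the utilization check is
 * reached. A task with util 300 still fits the smaller CPU
 * (512 * 1024 >= 300 * 1280) and wake_affine stays enabled, while a task
 * with util 450 does not (450 * 1280 > 512 * 1024) and we fall back to
 * BALANCE_WAKE instead.
 */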
6740
static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
6742 {
6743 struct sched_domain *sd;
6744 int target_cpu = prev_cpu, tmp_target, tmp_backup;
6745 bool boosted, prefer_idle;
6746
6747 schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
6748 schedstat_inc(this_rq(), eas_stats.secb_attempts);
6749
6750 if (sysctl_sched_sync_hint_enable && sync) {
6751 int cpu = smp_processor_id();
6752
6753 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6754 schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
6755 schedstat_inc(this_rq(), eas_stats.secb_sync);
6756 return cpu;
6757 }
6758 }
6759
6760 rcu_read_lock();
6761 #ifdef CONFIG_CGROUP_SCHEDTUNE
6762 boosted = schedtune_task_boost(p) > 0;
6763 prefer_idle = schedtune_prefer_idle(p) > 0;
6764 #else
6765 boosted = get_sysctl_sched_cfs_boost() > 0;
6766 prefer_idle = 0;
6767 #endif
6768
6769 sync_entity_load_avg(&p->se);
6770
6771 sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
6772 /* Find a cpu with sufficient capacity */
6773 tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
6774
6775 if (!sd)
6776 goto unlock;
6777 if (tmp_target >= 0) {
6778 target_cpu = tmp_target;
6779 if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
6780 schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
6781 schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
6782 goto unlock;
6783 }
6784 }
6785
6786 if (target_cpu != prev_cpu) {
6787 int delta = 0;
6788 struct energy_env eenv = {
6789 .util_delta = task_util(p),
6790 .src_cpu = prev_cpu,
6791 .dst_cpu = target_cpu,
6792 .task = p,
6793 .trg_cpu = target_cpu,
6794 };
6795
6796
6797 #ifdef CONFIG_SCHED_WALT
6798 if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
6799 p->state == TASK_WAKING)
6800 delta = task_util(p);
6801 #endif
6802 /* Not enough spare capacity on previous cpu */
6803 if (__cpu_overutilized(prev_cpu, delta)) {
6804 schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
6805 schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
6806 goto unlock;
6807 }
6808
6809 if (energy_diff(&eenv) >= 0) {
6810 /* No energy saving for target_cpu, try backup */
6811 target_cpu = tmp_backup;
6812 eenv.dst_cpu = target_cpu;
6813 eenv.trg_cpu = target_cpu;
6814 if (tmp_backup < 0 ||
6815 tmp_backup == prev_cpu ||
6816 energy_diff(&eenv) >= 0) {
6817 schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
6818 schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
6819 target_cpu = prev_cpu;
6820 goto unlock;
6821 }
6822 }
6823
6824 schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
6825 schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
6826 goto unlock;
6827 }
6828
6829 schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
6830 schedstat_inc(this_rq(), eas_stats.secb_count);
6831
6832 unlock:
6833 rcu_read_unlock();
6834
6835 return target_cpu;
6836 }
6837
6838 /*
6839 * select_task_rq_fair: Select target runqueue for the waking task in domains
6840 * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6841 * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6842 *
6843 * Balances load by selecting the idlest cpu in the idlest group, or under
6844 * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
6845 *
6846 * Returns the target cpu number.
6847 *
6848 * preempt must be disabled.
6849 */
6850 static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
	int sibling_count_hint)
6853 {
6854 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
6855 int cpu = smp_processor_id();
6856 int new_cpu = prev_cpu;
6857 int want_affine = 0;
6858 int sync = wake_flags & WF_SYNC;
6859
6860 if (sd_flag & SD_BALANCE_WAKE) {
6861 record_wakee(p);
6862 want_affine = !wake_wide(p, sibling_count_hint) &&
6863 !wake_cap(p, cpu, prev_cpu) &&
6864 cpumask_test_cpu(cpu, &p->cpus_allowed);
6865 }
6866
6867 if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
6868 return select_energy_cpu_brute(p, prev_cpu, sync);
6869
6870 rcu_read_lock();
6871 for_each_domain(cpu, tmp) {
6872 if (!(tmp->flags & SD_LOAD_BALANCE))
6873 break;
6874
6875 /*
6876 * If both cpu and prev_cpu are part of this domain,
6877 * cpu is a valid SD_WAKE_AFFINE target.
6878 */
6879 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6880 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6881 affine_sd = tmp;
6882 break;
6883 }
6884
6885 if (tmp->flags & sd_flag)
6886 sd = tmp;
6887 else if (!want_affine)
6888 break;
6889 }
6890
6891 if (affine_sd) {
6892 sd = NULL; /* Prefer wake_affine over balance flags */
6893 if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
6894 new_cpu = cpu;
6895 }
6896
6897 if (sd && !(sd_flag & SD_BALANCE_FORK)) {
6898 /*
6899 * We're going to need the task's util for capacity_spare_wake
6900 * in find_idlest_group. Sync it up to prev_cpu's
6901 * last_update_time.
6902 */
6903 sync_entity_load_avg(&p->se);
6904 }
6905
6906 if (!sd) {
6907 if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
6908 new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6909
6910 } else {
6911 new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6912 }
6913 rcu_read_unlock();
6914
6915 return new_cpu;
6916 }
6917
6918 /*
6919 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6920 * cfs_rq_of(p) references at time of call are still valid and identify the
6921 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
6922 * other assumptions, including the state of rq->lock, should be made.
6923 */
static void migrate_task_rq_fair(struct task_struct *p)
6925 {
6926 /*
 * We are supposed to update the task to "current" time, so that it is up to
 * date and ready to go to its new CPU/cfs_rq. But we have difficulty getting
 * what the current time is, so simply throw away the out-of-date time. This
 * will result in the wakee task being less decayed, but giving the wakee more
 * load is not a bad trade-off.
6932 */
6933 remove_entity_load_avg(&p->se);
6934
6935 /* Tell new CPU we are migrated */
6936 p->se.avg.last_update_time = 0;
6937
6938 /* We have migrated, no longer consider this task hot */
6939 p->se.exec_start = 0;
6940 }
6941
static void task_dead_fair(struct task_struct *p)
6943 {
6944 remove_entity_load_avg(&p->se);
6945 }
6946 #else
6947 #define task_fits_max(p, cpu) true
6948 #endif /* CONFIG_SMP */
6949
6950 static unsigned long
wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
6952 {
6953 unsigned long gran = sysctl_sched_wakeup_granularity;
6954
6955 /*
 * Since it's curr that is running now, convert the gran from real-time
 * to virtual-time in its units.
6958 *
6959 * By using 'se' instead of 'curr' we penalize light tasks, so
6960 * they get preempted easier. That is, if 'se' < 'curr' then
6961 * the resulting gran will be larger, therefore penalizing the
6962 * lighter, if otoh 'se' > 'curr' then the resulting gran will
6963 * be smaller, again penalizing the lighter task.
6964 *
6965 * This is especially important for buddies when the leftmost
6966 * task is higher priority than the buddy.
6967 */
6968 return calc_delta_fair(gran, se);
6969 }
6970
6971 /*
6972 * Should 'se' preempt 'curr'.
6973 *
6974 * |s1
6975 * |s2
6976 * |s3
6977 * g
6978 * |<--->|c
6979 *
6980 * w(c, s1) = -1
6981 * w(c, s2) = 0
6982 * w(c, s3) = 1
6983 *
6984 */
6985 static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6987 {
6988 s64 gran, vdiff = curr->vruntime - se->vruntime;
6989
6990 if (vdiff <= 0)
6991 return -1;
6992
6993 gran = wakeup_gran(curr, se);
6994 if (vdiff > gran)
6995 return 1;
6996
6997 return 0;
6998 }
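/*
 * In other words (a sketch, using gran from wakeup_gran() above):
 *
 *   vdiff = curr->vruntime - se->vruntime
 *
 *   vdiff <= 0        -> -1   (se is not behind curr, no preemption)
 *   0 < vdiff <= gran ->  0   (se is behind, but within the granularity)
 *   vdiff > gran      ->  1   (se is behind by more than gran: preempt)
 *
 * which is the w(c, s1) / w(c, s2) / w(c, s3) mapping in the diagram above.
 */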
6999
7000 static void set_last_buddy(struct sched_entity *se)
7001 {
7002 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7003 return;
7004
7005 for_each_sched_entity(se)
7006 cfs_rq_of(se)->last = se;
7007 }
7008
7009 static void set_next_buddy(struct sched_entity *se)
7010 {
7011 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7012 return;
7013
7014 for_each_sched_entity(se)
7015 cfs_rq_of(se)->next = se;
7016 }
7017
7018 static void set_skip_buddy(struct sched_entity *se)
7019 {
7020 for_each_sched_entity(se)
7021 cfs_rq_of(se)->skip = se;
7022 }
7023
7024 /*
7025 * Preempt the current task with a newly woken task if needed:
7026 */
7027 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
7028 {
7029 struct task_struct *curr = rq->curr;
7030 struct sched_entity *se = &curr->se, *pse = &p->se;
7031 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7032 int scale = cfs_rq->nr_running >= sched_nr_latency;
7033 int next_buddy_marked = 0;
7034
7035 if (unlikely(se == pse))
7036 return;
7037
7038 /*
7039 * This is possible from callers such as attach_tasks(), in which we
7040 	 * unconditionally check_preempt_curr() after an enqueue (which may have
7041 	 * led to a throttle).  This both saves work and prevents false
7042 * next-buddy nomination below.
7043 */
7044 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7045 return;
7046
7047 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
7048 set_next_buddy(pse);
7049 next_buddy_marked = 1;
7050 }
7051
7052 /*
7053 * We can come here with TIF_NEED_RESCHED already set from new task
7054 * wake up path.
7055 *
7056 * Note: this also catches the edge-case of curr being in a throttled
7057 * group (e.g. via set_curr_task), since update_curr() (in the
7058 * enqueue of curr) will have resulted in resched being set. This
7059 * prevents us from potentially nominating it as a false LAST_BUDDY
7060 * below.
7061 */
7062 if (test_tsk_need_resched(curr))
7063 return;
7064
7065 /* Idle tasks are by definition preempted by non-idle tasks. */
7066 if (unlikely(curr->policy == SCHED_IDLE) &&
7067 likely(p->policy != SCHED_IDLE))
7068 goto preempt;
7069
7070 /*
7071 * Batch and idle tasks do not preempt non-idle tasks (their preemption
7072 * is driven by the tick):
7073 */
7074 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
7075 return;
7076
7077 find_matching_se(&se, &pse);
7078 update_curr(cfs_rq_of(se));
7079 BUG_ON(!pse);
7080 if (wakeup_preempt_entity(se, pse) == 1) {
7081 /*
7082 * Bias pick_next to pick the sched entity that is
7083 * triggering this preemption.
7084 */
7085 if (!next_buddy_marked)
7086 set_next_buddy(pse);
7087 goto preempt;
7088 }
7089
7090 return;
7091
7092 preempt:
7093 resched_curr(rq);
7094 /*
7095 * Only set the backward buddy when the current task is still
7096 * on the rq. This can happen when a wakeup gets interleaved
7097 * with schedule on the ->pre_schedule() or idle_balance()
7098 	 * point, either of which can drop the rq lock.
7099 *
7100 * Also, during early boot the idle thread is in the fair class,
7101 	 * for obvious reasons it's a bad idea to schedule back to it.
7102 */
7103 if (unlikely(!se->on_rq || curr == rq->idle))
7104 return;
7105
7106 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7107 set_last_buddy(se);
7108 }
7109
7110 static struct task_struct *
7111 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
7112 {
7113 struct cfs_rq *cfs_rq = &rq->cfs;
7114 struct sched_entity *se;
7115 struct task_struct *p;
7116 int new_tasks;
7117
7118 again:
7119 #ifdef CONFIG_FAIR_GROUP_SCHED
7120 if (!cfs_rq->nr_running)
7121 goto idle;
7122
7123 if (prev->sched_class != &fair_sched_class)
7124 goto simple;
7125
7126 /*
7127 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7128 * likely that a next task is from the same cgroup as the current.
7129 *
7130 * Therefore attempt to avoid putting and setting the entire cgroup
7131 * hierarchy, only change the part that actually changes.
7132 */
7133
7134 do {
7135 struct sched_entity *curr = cfs_rq->curr;
7136
7137 /*
7138 * Since we got here without doing put_prev_entity() we also
7139 * have to consider cfs_rq->curr. If it is still a runnable
7140 * entity, update_curr() will update its vruntime, otherwise
7141 * forget we've ever seen it.
7142 */
7143 if (curr) {
7144 if (curr->on_rq)
7145 update_curr(cfs_rq);
7146 else
7147 curr = NULL;
7148
7149 /*
7150 * This call to check_cfs_rq_runtime() will do the
7151 * throttle and dequeue its entity in the parent(s).
7152 * Therefore the 'simple' nr_running test will indeed
7153 * be correct.
7154 */
7155 if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7156 goto simple;
7157 }
7158
7159 se = pick_next_entity(cfs_rq, curr);
7160 cfs_rq = group_cfs_rq(se);
7161 } while (cfs_rq);
7162
7163 p = task_of(se);
7164
7165 /*
7166 * Since we haven't yet done put_prev_entity and if the selected task
7167 * is a different task than we started out with, try and touch the
7168 * least amount of cfs_rqs.
7169 */
7170 if (prev != p) {
7171 struct sched_entity *pse = &prev->se;
7172
7173 while (!(cfs_rq = is_same_group(se, pse))) {
7174 int se_depth = se->depth;
7175 int pse_depth = pse->depth;
7176
7177 if (se_depth <= pse_depth) {
7178 put_prev_entity(cfs_rq_of(pse), pse);
7179 pse = parent_entity(pse);
7180 }
7181 if (se_depth >= pse_depth) {
7182 set_next_entity(cfs_rq_of(se), se);
7183 se = parent_entity(se);
7184 }
7185 }
7186
7187 put_prev_entity(cfs_rq, pse);
7188 set_next_entity(cfs_rq, se);
7189 }
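	/*
	 * Worked example of the walk above (hypothetical hierarchy): prev runs
	 * in cgroup A/B (se depth 2) and p runs in cgroup A (se depth 1).  The
	 * first iteration only does put_prev_entity() on B's cfs_rq and lifts
	 * pse to B's group entity (depth 1); the next is_same_group() check
	 * then finds both entities on A's cfs_rq, so the loop exits and the
	 * final put_prev_entity()/set_next_entity() pair runs there.  A's own
	 * group entity on the root cfs_rq is never touched, which is exactly
	 * the "least amount of cfs_rqs" being aimed for.
	 */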
7190
7191 if (hrtick_enabled(rq))
7192 hrtick_start_fair(rq, p);
7193
7194 rq->misfit_task = !task_fits_max(p, rq->cpu);
7195
7196 return p;
7197 simple:
7198 cfs_rq = &rq->cfs;
7199 #endif
7200
7201 if (!cfs_rq->nr_running)
7202 goto idle;
7203
7204 put_prev_task(rq, prev);
7205
7206 do {
7207 se = pick_next_entity(cfs_rq, NULL);
7208 set_next_entity(cfs_rq, se);
7209 cfs_rq = group_cfs_rq(se);
7210 } while (cfs_rq);
7211
7212 p = task_of(se);
7213
7214 if (hrtick_enabled(rq))
7215 hrtick_start_fair(rq, p);
7216
7217 rq->misfit_task = !task_fits_max(p, rq->cpu);
7218
7219 return p;
7220
7221 idle:
7222 rq->misfit_task = 0;
7223 /*
7224 * This is OK, because current is on_cpu, which avoids it being picked
7225 * for load-balance and preemption/IRQs are still disabled avoiding
7226 * further scheduler activity on it and we're being very careful to
7227 * re-start the picking loop.
7228 */
7229 lockdep_unpin_lock(&rq->lock);
7230 new_tasks = idle_balance(rq);
7231 lockdep_pin_lock(&rq->lock);
7232 /*
7233 * Because idle_balance() releases (and re-acquires) rq->lock, it is
7234 * possible for any higher priority task to appear. In that case we
7235 * must re-start the pick_next_entity() loop.
7236 */
7237 if (new_tasks < 0)
7238 return RETRY_TASK;
7239
7240 if (new_tasks > 0)
7241 goto again;
7242
7243 return NULL;
7244 }
7245
7246 /*
7247 * Account for a descheduled task:
7248 */
7249 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7250 {
7251 struct sched_entity *se = &prev->se;
7252 struct cfs_rq *cfs_rq;
7253
7254 for_each_sched_entity(se) {
7255 cfs_rq = cfs_rq_of(se);
7256 put_prev_entity(cfs_rq, se);
7257 }
7258 }
7259
7260 /*
7261 * sched_yield() is very simple
7262 *
7263 * The magic of dealing with the ->skip buddy is in pick_next_entity.
7264 */
7265 static void yield_task_fair(struct rq *rq)
7266 {
7267 struct task_struct *curr = rq->curr;
7268 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7269 struct sched_entity *se = &curr->se;
7270
7271 /*
7272 * Are we the only task in the tree?
7273 */
7274 if (unlikely(rq->nr_running == 1))
7275 return;
7276
7277 clear_buddies(cfs_rq, se);
7278
7279 if (curr->policy != SCHED_BATCH) {
7280 update_rq_clock(rq);
7281 /*
7282 * Update run-time statistics of the 'current'.
7283 */
7284 update_curr(cfs_rq);
7285 /*
7286 * Tell update_rq_clock() that we've just updated,
7287 * so we don't do microscopic update in schedule()
7288 * and double the fastpath cost.
7289 */
7290 rq_clock_skip_update(rq, true);
7291 }
7292
7293 set_skip_buddy(se);
7294 }
7295
7296 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7297 {
7298 struct sched_entity *se = &p->se;
7299
7300 /* throttled hierarchies are not runnable */
7301 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7302 return false;
7303
7304 /* Tell the scheduler that we'd really like pse to run next. */
7305 set_next_buddy(se);
7306
7307 yield_task_fair(rq);
7308
7309 return true;
7310 }
7311
7312 #ifdef CONFIG_SMP
7313 /**************************************************
7314 * Fair scheduling class load-balancing methods.
7315 *
7316 * BASICS
7317 *
7318 * The purpose of load-balancing is to achieve the same basic fairness the
7319 * per-cpu scheduler provides, namely provide a proportional amount of compute
7320 * time to each task. This is expressed in the following equation:
7321 *
7322 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
7323 *
7324 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
7325 * W_i,0 is defined as:
7326 *
7327 * W_i,0 = \Sum_j w_i,j (2)
7328 *
7329 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
7330 * is derived from the nice value as per prio_to_weight[].
7331 *
7332 * The weight average is an exponential decay average of the instantaneous
7333 * weight:
7334 *
7335 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
7336 *
7337 * C_i is the compute capacity of cpu i, typically it is the
7338 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7339 * can also include other factors [XXX].
7340 *
7341 * To achieve this balance we define a measure of imbalance which follows
7342 * directly from (1):
7343 *
7344 * imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j } (4)
7345 *
7346  * We then move tasks around to minimize the imbalance. In the continuous
7347 * function space it is obvious this converges, in the discrete case we get
7348 * a few fun cases generally called infeasible weight scenarios.
7349 *
7350 * [XXX expand on:
7351 * - infeasible weights;
7352 * - local vs global optima in the discrete case. ]
7353 *
7354 *
7355 * SCHED DOMAINS
7356 *
7357 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
7358 * for all i,j solution, we create a tree of cpus that follows the hardware
7359 * topology where each level pairs two lower groups (or better). This results
7360 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
7361 * tree to only the first of the previous level and we decrease the frequency
7362 * of load-balance at each level inv. proportional to the number of cpus in
7363 * the groups.
7364 *
7365 * This yields:
7366 *
7367 * log_2 n 1 n
7368 * \Sum { --- * --- * 2^i } = O(n) (5)
7369 * i = 0 2^i 2^i
7370 * `- size of each group
7371 * | | `- number of cpus doing load-balance
7372 * | `- freq
7373 * `- sum over all levels
7374 *
7375 * Coupled with a limit on how many tasks we can migrate every balance pass,
7376 * this makes (5) the runtime complexity of the balancer.
7377 *
7378 * An important property here is that each CPU is still (indirectly) connected
7379 * to every other cpu in at most O(log n) steps:
7380 *
7381 * The adjacency matrix of the resulting graph is given by:
7382 *
7383 * log_2 n
7384 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
7385 * k = 0
7386 *
7387 * And you'll find that:
7388 *
7389 * A^(log_2 n)_i,j != 0 for all i,j (7)
7390 *
7391 * Showing there's indeed a path between every cpu in at most O(log n) steps.
7392 * The task movement gives a factor of O(m), giving a convergence complexity
7393 * of:
7394 *
7395 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
7396 *
7397 *
7398 * WORK CONSERVING
7399 *
7400 * In order to avoid CPUs going idle while there's still work to do, new idle
7401 * balancing is more aggressive and has the newly idle cpu iterate up the domain
7402 * tree itself instead of relying on other CPUs to bring it work.
7403 *
7404 * This adds some complexity to both (5) and (8) but it reduces the total idle
7405 * time.
7406 *
7407 * [XXX more?]
7408 *
7409 *
7410 * CGROUPS
7411 *
7412 * Cgroups make a horror show out of (2), instead of a simple sum we get:
7413 *
7414 * s_k,i
7415 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
7416 * S_k
7417 *
7418 * Where
7419 *
7420 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
7421 *
7422 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
7423 *
7424  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
7425 * property.
7426 *
7427 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7428 * rewrite all of this once again.]
7429 */
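/*
 * A minimal, illustrative sketch of equation (4) for just two cpus, with
 * made-up numbers (not part of the scheduler proper, hence under #if 0):
 */
#if 0
static unsigned long example_imbalance(unsigned long w_i, unsigned long c_i,
				       unsigned long w_j, unsigned long c_j)
{
	/* W/C ratios in SCHED_CAPACITY_SCALE (1024) fixed point. */
	unsigned long r_i = (w_i << SCHED_CAPACITY_SHIFT) / c_i;
	unsigned long r_j = (w_j << SCHED_CAPACITY_SHIFT) / c_j;
	unsigned long avg = (r_i + r_j) / 2;

	return max(avg, r_i) - min(avg, r_j);	/* equation (4) */
}
/*
 * With C_i = C_j = 1024 and W_i = 3072 (three nice-0 tasks) vs W_j = 1024
 * (one task): r_i = 3072, r_j = 1024, avg = 2048 and the imbalance is 2048.
 * Moving one task's worth of weight (1024) from i to j gives r_i = r_j =
 * avg = 2048 and the imbalance drops to 0, which is what the balancer
 * iterates towards.
 */
#endif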
7430
7431 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7432
7433 enum fbq_type { regular, remote, all };
7434
7435 enum group_type {
7436 group_other = 0,
7437 group_misfit_task,
7438 group_imbalanced,
7439 group_overloaded,
7440 };
7441
7442 #define LBF_ALL_PINNED 0x01
7443 #define LBF_NEED_BREAK 0x02
7444 #define LBF_DST_PINNED 0x04
7445 #define LBF_SOME_PINNED 0x08
7446
7447 struct lb_env {
7448 struct sched_domain *sd;
7449
7450 struct rq *src_rq;
7451 int src_cpu;
7452
7453 int dst_cpu;
7454 struct rq *dst_rq;
7455
7456 struct cpumask *dst_grpmask;
7457 int new_dst_cpu;
7458 enum cpu_idle_type idle;
7459 long imbalance;
7460 unsigned int src_grp_nr_running;
7461 /* The set of CPUs under consideration for load-balancing */
7462 struct cpumask *cpus;
7463
7464 unsigned int flags;
7465
7466 unsigned int loop;
7467 unsigned int loop_break;
7468 unsigned int loop_max;
7469
7470 enum fbq_type fbq_type;
7471 enum group_type busiest_group_type;
7472 struct list_head tasks;
7473 };
7474
7475 /*
7476 * Is this task likely cache-hot:
7477 */
7478 static int task_hot(struct task_struct *p, struct lb_env *env)
7479 {
7480 s64 delta;
7481
7482 lockdep_assert_held(&env->src_rq->lock);
7483
7484 if (p->sched_class != &fair_sched_class)
7485 return 0;
7486
7487 if (unlikely(p->policy == SCHED_IDLE))
7488 return 0;
7489
7490 /*
7491 * Buddy candidates are cache hot:
7492 */
7493 if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7494 (&p->se == cfs_rq_of(&p->se)->next ||
7495 &p->se == cfs_rq_of(&p->se)->last))
7496 return 1;
7497
7498 if (sysctl_sched_migration_cost == -1)
7499 return 1;
7500 if (sysctl_sched_migration_cost == 0)
7501 return 0;
7502
7503 delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7504
7505 return delta < (s64)sysctl_sched_migration_cost;
7506 }
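/*
 * For a rough sense of scale: with the usual 0.5 ms default for
 * sysctl_sched_migration_cost, a fair task that last started executing
 * within the previous 0.5 ms of the source rq's task clock is treated as
 * cache hot and is only migrated reluctantly; setting the sysctl to -1
 * marks every task hot, setting it to 0 marks none.
 */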
7507
7508 #ifdef CONFIG_NUMA_BALANCING
7509 /*
7510 * Returns 1, if task migration degrades locality
7511 * Returns 0, if task migration improves locality i.e migration preferred.
7512 * Returns -1, if task migration is not affected by locality.
7513 */
7514 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7515 {
7516 struct numa_group *numa_group = rcu_dereference(p->numa_group);
7517 unsigned long src_faults, dst_faults;
7518 int src_nid, dst_nid;
7519
7520 if (!static_branch_likely(&sched_numa_balancing))
7521 return -1;
7522
7523 if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7524 return -1;
7525
7526 src_nid = cpu_to_node(env->src_cpu);
7527 dst_nid = cpu_to_node(env->dst_cpu);
7528
7529 if (src_nid == dst_nid)
7530 return -1;
7531
7532 /* Migrating away from the preferred node is always bad. */
7533 if (src_nid == p->numa_preferred_nid) {
7534 if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7535 return 1;
7536 else
7537 return -1;
7538 }
7539
7540 /* Encourage migration to the preferred node. */
7541 if (dst_nid == p->numa_preferred_nid)
7542 return 0;
7543
7544 if (numa_group) {
7545 src_faults = group_faults(p, src_nid);
7546 dst_faults = group_faults(p, dst_nid);
7547 } else {
7548 src_faults = task_faults(p, src_nid);
7549 dst_faults = task_faults(p, dst_nid);
7550 }
7551
7552 return dst_faults < src_faults;
7553 }
7554
7555 #else
7556 static inline int migrate_degrades_locality(struct task_struct *p,
7557 struct lb_env *env)
7558 {
7559 return -1;
7560 }
7561 #endif
7562
7563 /*
7564 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7565 */
7566 static
7567 int can_migrate_task(struct task_struct *p, struct lb_env *env)
7568 {
7569 int tsk_cache_hot;
7570
7571 lockdep_assert_held(&env->src_rq->lock);
7572
7573 /*
7574 * We do not migrate tasks that are:
7575 * 1) throttled_lb_pair, or
7576 * 2) cannot be migrated to this CPU due to cpus_allowed, or
7577 * 3) running (obviously), or
7578 * 4) are cache-hot on their current CPU.
7579 */
7580 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7581 return 0;
7582
7583 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
7584 int cpu;
7585
7586 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
7587
7588 env->flags |= LBF_SOME_PINNED;
7589
7590 /*
7591 * Remember if this task can be migrated to any other cpu in
7592 * our sched_group. We may want to revisit it if we couldn't
7593 * meet load balance goals by pulling other tasks on src_cpu.
7594 *
7595 * Also avoid computing new_dst_cpu if we have already computed
7596 * one in current iteration.
7597 */
7598 if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
7599 return 0;
7600
7601 /* Prevent to re-select dst_cpu via env's cpus */
7602 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7603 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7604 env->flags |= LBF_DST_PINNED;
7605 env->new_dst_cpu = cpu;
7606 break;
7607 }
7608 }
7609
7610 return 0;
7611 }
7612
7613 	/* Record that we found at least one task that could run on dst_cpu */
7614 env->flags &= ~LBF_ALL_PINNED;
7615
7616 if (task_running(env->src_rq, p)) {
7617 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
7618 return 0;
7619 }
7620
7621 /*
7622 * Aggressive migration if:
7623 * 1) destination numa is preferred
7624 * 2) task is cache cold, or
7625 * 3) too many balance attempts have failed.
7626 */
7627 tsk_cache_hot = migrate_degrades_locality(p, env);
7628 if (tsk_cache_hot == -1)
7629 tsk_cache_hot = task_hot(p, env);
7630
7631 if (tsk_cache_hot <= 0 ||
7632 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7633 if (tsk_cache_hot == 1) {
7634 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
7635 schedstat_inc(p, se.statistics.nr_forced_migrations);
7636 }
7637 return 1;
7638 }
7639
7640 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
7641 return 0;
7642 }
7643
7644 /*
7645 * detach_task() -- detach the task for the migration specified in env
7646 */
7647 static void detach_task(struct task_struct *p, struct lb_env *env)
7648 {
7649 lockdep_assert_held(&env->src_rq->lock);
7650
7651 deactivate_task(env->src_rq, p, 0);
7652 p->on_rq = TASK_ON_RQ_MIGRATING;
7653 double_lock_balance(env->src_rq, env->dst_rq);
7654 set_task_cpu(p, env->dst_cpu);
7655 double_unlock_balance(env->src_rq, env->dst_rq);
7656 }
7657
7658 /*
7659 * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
7660 * part of active balancing operations within "domain".
7661 *
7662 * Returns a task if successful and NULL otherwise.
7663 */
7664 static struct task_struct *detach_one_task(struct lb_env *env)
7665 {
7666 struct task_struct *p, *n;
7667
7668 lockdep_assert_held(&env->src_rq->lock);
7669
7670 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
7671 if (!can_migrate_task(p, env))
7672 continue;
7673
7674 detach_task(p, env);
7675
7676 /*
7677 * Right now, this is only the second place where
7678 * lb_gained[env->idle] is updated (other is detach_tasks)
7679 * so we can safely collect stats here rather than
7680 * inside detach_tasks().
7681 */
7682 schedstat_inc(env->sd, lb_gained[env->idle]);
7683 return p;
7684 }
7685 return NULL;
7686 }
7687
7688 static const unsigned int sched_nr_migrate_break = 32;
7689
7690 /*
7691 * detach_tasks() -- tries to detach up to imbalance weighted load from
7692 * busiest_rq, as part of a balancing operation within domain "sd".
7693 *
7694 * Returns number of detached tasks if successful and 0 otherwise.
7695 */
7696 static int detach_tasks(struct lb_env *env)
7697 {
7698 struct list_head *tasks = &env->src_rq->cfs_tasks;
7699 struct task_struct *p;
7700 unsigned long load;
7701 int detached = 0;
7702
7703 lockdep_assert_held(&env->src_rq->lock);
7704
7705 if (env->imbalance <= 0)
7706 return 0;
7707
7708 while (!list_empty(tasks)) {
7709 /*
7710 		 * We don't want to steal all the tasks; otherwise we may be treated
7711 		 * likewise, which could at worst lead to a livelock.
7712 */
7713 if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7714 break;
7715
7716 p = list_first_entry(tasks, struct task_struct, se.group_node);
7717
7718 env->loop++;
7719 /* We've more or less seen every task there is, call it quits */
7720 if (env->loop > env->loop_max)
7721 break;
7722
7723 /* take a breather every nr_migrate tasks */
7724 if (env->loop > env->loop_break) {
7725 env->loop_break += sched_nr_migrate_break;
7726 env->flags |= LBF_NEED_BREAK;
7727 break;
7728 }
7729
7730 if (!can_migrate_task(p, env))
7731 goto next;
7732
7733 /*
7734 		 * Depending on the number of CPUs and tasks and the
7735 * cgroup hierarchy, task_h_load() can return a null
7736 * value. Make sure that env->imbalance decreases
7737 * otherwise detach_tasks() will stop only after
7738 * detaching up to loop_max tasks.
7739 */
7740 load = max_t(unsigned long, task_h_load(p), 1);
7741
7742
7743 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7744 goto next;
7745
7746 if ((load / 2) > env->imbalance)
7747 goto next;
7748
7749 detach_task(p, env);
7750 list_add(&p->se.group_node, &env->tasks);
7751
7752 detached++;
7753 env->imbalance -= load;
7754
7755 #ifdef CONFIG_PREEMPT
7756 /*
7757 * NEWIDLE balancing is a source of latency, so preemptible
7758 * kernels will stop after the first task is detached to minimize
7759 * the critical section.
7760 */
7761 if (env->idle == CPU_NEWLY_IDLE)
7762 break;
7763 #endif
7764
7765 /*
7766 * We only want to steal up to the prescribed amount of
7767 * weighted load.
7768 */
7769 if (env->imbalance <= 0)
7770 break;
7771
7772 continue;
7773 next:
7774 list_move_tail(&p->se.group_node, tasks);
7775 }
7776
7777 /*
7778 * Right now, this is one of only two places we collect this stat
7779 * so we can safely collect detach_one_task() stats here rather
7780 * than inside detach_one_task().
7781 */
7782 schedstat_add(env->sd, lb_gained[env->idle], detached);
7783
7784 return detached;
7785 }
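/*
 * Worked example of the accounting above (made-up numbers): with
 * env->imbalance = 1000 and candidate tasks whose task_h_load() is 3000,
 * 800 and 600 in list order, the first is skipped (3000/2 > 1000), the
 * second is detached (imbalance drops to 200), the third is skipped again
 * (600/2 > 200), and the loop keeps scanning until the list, loop_max or
 * the break threshold is exhausted.
 */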
7786
7787 /*
7788 * attach_task() -- attach the task detached by detach_task() to its new rq.
7789 */
7790 static void attach_task(struct rq *rq, struct task_struct *p)
7791 {
7792 lockdep_assert_held(&rq->lock);
7793
7794 BUG_ON(task_rq(p) != rq);
7795 p->on_rq = TASK_ON_RQ_QUEUED;
7796 activate_task(rq, p, 0);
7797 check_preempt_curr(rq, p, 0);
7798 }
7799
7800 /*
7801 * attach_one_task() -- attaches the task returned from detach_one_task() to
7802 * its new rq.
7803 */
7804 static void attach_one_task(struct rq *rq, struct task_struct *p)
7805 {
7806 raw_spin_lock(&rq->lock);
7807 attach_task(rq, p);
7808 raw_spin_unlock(&rq->lock);
7809 }
7810
7811 /*
7812 * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7813 * new rq.
7814 */
7815 static void attach_tasks(struct lb_env *env)
7816 {
7817 struct list_head *tasks = &env->tasks;
7818 struct task_struct *p;
7819
7820 raw_spin_lock(&env->dst_rq->lock);
7821
7822 while (!list_empty(tasks)) {
7823 p = list_first_entry(tasks, struct task_struct, se.group_node);
7824 list_del_init(&p->se.group_node);
7825
7826 attach_task(env->dst_rq, p);
7827 }
7828
7829 raw_spin_unlock(&env->dst_rq->lock);
7830 }
7831
7832 #ifdef CONFIG_FAIR_GROUP_SCHED
7833 static void update_blocked_averages(int cpu)
7834 {
7835 struct rq *rq = cpu_rq(cpu);
7836 struct cfs_rq *cfs_rq;
7837 unsigned long flags;
7838
7839 raw_spin_lock_irqsave(&rq->lock, flags);
7840 update_rq_clock(rq);
7841
7842 /*
7843 * Iterates the task_group tree in a bottom up fashion, see
7844 * list_add_leaf_cfs_rq() for details.
7845 */
7846 for_each_leaf_cfs_rq(rq, cfs_rq) {
7847 struct sched_entity *se;
7848
7849 /* throttled entities do not contribute to load */
7850 if (throttled_hierarchy(cfs_rq))
7851 continue;
7852
7853 if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
7854 true))
7855 update_tg_load_avg(cfs_rq, 0);
7856
7857 /* Propagate pending load changes to the parent, if any: */
7858 se = cfs_rq->tg->se[cpu];
7859 if (se && !skip_blocked_update(se))
7860 update_load_avg(se, 0);
7861 }
7862 raw_spin_unlock_irqrestore(&rq->lock, flags);
7863 }
7864
7865 /*
7866 * Compute the hierarchical load factor for cfs_rq and all its ascendants.
7867 * This needs to be done in a top-down fashion because the load of a child
7868 * group is a fraction of its parents load.
7869 */
7870 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7871 {
7872 struct rq *rq = rq_of(cfs_rq);
7873 struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7874 unsigned long now = jiffies;
7875 unsigned long load;
7876
7877 if (cfs_rq->last_h_load_update == now)
7878 return;
7879
7880 WRITE_ONCE(cfs_rq->h_load_next, NULL);
7881 for_each_sched_entity(se) {
7882 cfs_rq = cfs_rq_of(se);
7883 WRITE_ONCE(cfs_rq->h_load_next, se);
7884 if (cfs_rq->last_h_load_update == now)
7885 break;
7886 }
7887
7888 if (!se) {
7889 cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7890 cfs_rq->last_h_load_update = now;
7891 }
7892
7893 while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7894 load = cfs_rq->h_load;
7895 load = div64_ul(load * se->avg.load_avg,
7896 cfs_rq_load_avg(cfs_rq) + 1);
7897 cfs_rq = group_cfs_rq(se);
7898 cfs_rq->h_load = load;
7899 cfs_rq->last_h_load_update = now;
7900 }
7901 }
7902
7903 static unsigned long task_h_load(struct task_struct *p)
7904 {
7905 struct cfs_rq *cfs_rq = task_cfs_rq(p);
7906
7907 update_cfs_rq_h_load(cfs_rq);
7908 return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7909 cfs_rq_load_avg(cfs_rq) + 1);
7910 }
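/*
 * Worked example (made-up numbers): a task with se.avg.load_avg = 512
 * queued on a group cfs_rq whose h_load is 1024 and whose load average is
 * 2048 contributes roughly 512 * 1024 / 2049 ~= 255 to the hierarchical
 * load, i.e. only the fraction of its group's root-level weight that this
 * task itself represents.
 */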
7911 #else
7912 static inline void update_blocked_averages(int cpu)
7913 {
7914 struct rq *rq = cpu_rq(cpu);
7915 struct cfs_rq *cfs_rq = &rq->cfs;
7916 unsigned long flags;
7917
7918 raw_spin_lock_irqsave(&rq->lock, flags);
7919 update_rq_clock(rq);
7920 update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
7921 raw_spin_unlock_irqrestore(&rq->lock, flags);
7922 }
7923
7924 static unsigned long task_h_load(struct task_struct *p)
7925 {
7926 return p->se.avg.load_avg;
7927 }
7928 #endif
7929
7930 /********** Helpers for find_busiest_group ************************/
7931
7932 /*
7933 * sg_lb_stats - stats of a sched_group required for load_balancing
7934 */
7935 struct sg_lb_stats {
7936 unsigned long avg_load; /*Avg load across the CPUs of the group */
7937 unsigned long group_load; /* Total load over the CPUs of the group */
7938 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7939 unsigned long load_per_task;
7940 unsigned long group_capacity;
7941 unsigned long group_util; /* Total utilization of the group */
7942 unsigned int sum_nr_running; /* Nr tasks running in the group */
7943 unsigned int idle_cpus;
7944 unsigned int group_weight;
7945 enum group_type group_type;
7946 int group_no_capacity;
7947 int group_misfit_task; /* A cpu has a task too big for its capacity */
7948 #ifdef CONFIG_NUMA_BALANCING
7949 unsigned int nr_numa_running;
7950 unsigned int nr_preferred_running;
7951 #endif
7952 };
7953
7954 /*
7955 * sd_lb_stats - Structure to store the statistics of a sched_domain
7956 * during load balancing.
7957 */
7958 struct sd_lb_stats {
7959 struct sched_group *busiest; /* Busiest group in this sd */
7960 struct sched_group *local; /* Local group in this sd */
7961 unsigned long total_load; /* Total load of all groups in sd */
7962 unsigned long total_capacity; /* Total capacity of all groups in sd */
7963 unsigned long avg_load; /* Average load across all groups in sd */
7964
7965 struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
7966 struct sg_lb_stats local_stat; /* Statistics of the local group */
7967 };
7968
7969 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7970 {
7971 /*
7972 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7973 * local_stat because update_sg_lb_stats() does a full clear/assignment.
7974 * We must however clear busiest_stat::avg_load because
7975 * update_sd_pick_busiest() reads this before assignment.
7976 */
7977 *sds = (struct sd_lb_stats){
7978 .busiest = NULL,
7979 .local = NULL,
7980 .total_load = 0UL,
7981 .total_capacity = 0UL,
7982 .busiest_stat = {
7983 .avg_load = 0UL,
7984 .sum_nr_running = 0,
7985 .group_type = group_other,
7986 },
7987 };
7988 }
7989
7990 /**
7991 * get_sd_load_idx - Obtain the load index for a given sched domain.
7992 * @sd: The sched_domain whose load_idx is to be obtained.
7993 * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7994 *
7995 * Return: The load index.
7996 */
7997 static inline int get_sd_load_idx(struct sched_domain *sd,
7998 enum cpu_idle_type idle)
7999 {
8000 int load_idx;
8001
8002 switch (idle) {
8003 case CPU_NOT_IDLE:
8004 load_idx = sd->busy_idx;
8005 break;
8006
8007 case CPU_NEWLY_IDLE:
8008 load_idx = sd->newidle_idx;
8009 break;
8010 default:
8011 load_idx = sd->idle_idx;
8012 break;
8013 }
8014
8015 return load_idx;
8016 }
8017
8018 static unsigned long scale_rt_capacity(int cpu)
8019 {
8020 struct rq *rq = cpu_rq(cpu);
8021 u64 total, used, age_stamp, avg;
8022 s64 delta;
8023
8024 /*
8025 * Since we're reading these variables without serialization make sure
8026 * we read them once before doing sanity checks on them.
8027 */
8028 age_stamp = READ_ONCE(rq->age_stamp);
8029 avg = READ_ONCE(rq->rt_avg);
8030 delta = __rq_clock_broken(rq) - age_stamp;
8031
8032 if (unlikely(delta < 0))
8033 delta = 0;
8034
8035 total = sched_avg_period() + delta;
8036
8037 used = div_u64(avg, total);
8038
8039 /*
8040 * deadline bandwidth is defined at system level so we must
8041 * weight this bandwidth with the max capacity of the system.
8042 	 * As a reminder, avg_bw is 20 bits wide and
8043 	 * scale_cpu_capacity is 10 bits wide.
8044 */
8045 used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
8046
8047 if (likely(used < SCHED_CAPACITY_SCALE))
8048 return SCHED_CAPACITY_SCALE - used;
8049
8050 return 1;
8051 }
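/*
 * As a rough illustration: if RT/IRQ activity consumed about a quarter of
 * the averaging window, 'used' works out to roughly 256 of
 * SCHED_CAPACITY_SCALE (1024), so (before the deadline bandwidth term is
 * added) this returns about 768, leaving CFS ~75% of the cpu's original
 * capacity once update_cpu_capacity() applies the scaling below.
 */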
8052
8053 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
8054 {
8055 raw_spin_lock_init(&mcc->lock);
8056 mcc->val = 0;
8057 mcc->cpu = -1;
8058 }
8059
8060 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8061 {
8062 unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8063 struct sched_group *sdg = sd->groups;
8064 struct max_cpu_capacity *mcc;
8065 unsigned long max_capacity;
8066 int max_cap_cpu;
8067 unsigned long flags;
8068
8069 cpu_rq(cpu)->cpu_capacity_orig = capacity;
8070
8071 mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8072
8073 raw_spin_lock_irqsave(&mcc->lock, flags);
8074 max_capacity = mcc->val;
8075 max_cap_cpu = mcc->cpu;
8076
8077 if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8078 (max_capacity < capacity)) {
8079 mcc->val = capacity;
8080 mcc->cpu = cpu;
8081 #ifdef CONFIG_SCHED_DEBUG
8082 raw_spin_unlock_irqrestore(&mcc->lock, flags);
8083 printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8084 cpu, capacity);
8085 goto skip_unlock;
8086 #endif
8087 }
8088 raw_spin_unlock_irqrestore(&mcc->lock, flags);
8089
8090 skip_unlock: __attribute__ ((unused));
8091 capacity *= scale_rt_capacity(cpu);
8092 capacity >>= SCHED_CAPACITY_SHIFT;
8093
8094 if (!capacity)
8095 capacity = 1;
8096
8097 cpu_rq(cpu)->cpu_capacity = capacity;
8098 sdg->sgc->capacity = capacity;
8099 sdg->sgc->max_capacity = capacity;
8100 sdg->sgc->min_capacity = capacity;
8101 }
8102
8103 void update_group_capacity(struct sched_domain *sd, int cpu)
8104 {
8105 struct sched_domain *child = sd->child;
8106 struct sched_group *group, *sdg = sd->groups;
8107 unsigned long capacity, max_capacity, min_capacity;
8108 unsigned long interval;
8109
8110 interval = msecs_to_jiffies(sd->balance_interval);
8111 interval = clamp(interval, 1UL, max_load_balance_interval);
8112 sdg->sgc->next_update = jiffies + interval;
8113
8114 if (!child) {
8115 update_cpu_capacity(sd, cpu);
8116 return;
8117 }
8118
8119 capacity = 0;
8120 max_capacity = 0;
8121 min_capacity = ULONG_MAX;
8122
8123 if (child->flags & SD_OVERLAP) {
8124 /*
8125 * SD_OVERLAP domains cannot assume that child groups
8126 * span the current group.
8127 */
8128
8129 for_each_cpu(cpu, sched_group_cpus(sdg)) {
8130 struct sched_group_capacity *sgc;
8131 struct rq *rq = cpu_rq(cpu);
8132
8133 /*
8134 * build_sched_domains() -> init_sched_groups_capacity()
8135 * gets here before we've attached the domains to the
8136 * runqueues.
8137 *
8138 * Use capacity_of(), which is set irrespective of domains
8139 * in update_cpu_capacity().
8140 *
8141 * This avoids capacity from being 0 and
8142 * causing divide-by-zero issues on boot.
8143 */
8144 if (unlikely(!rq->sd)) {
8145 capacity += capacity_of(cpu);
8146 } else {
8147 sgc = rq->sd->groups->sgc;
8148 capacity += sgc->capacity;
8149 }
8150
8151 max_capacity = max(capacity, max_capacity);
8152 min_capacity = min(capacity, min_capacity);
8153 }
8154 } else {
8155 /*
8156 * !SD_OVERLAP domains can assume that child groups
8157 * span the current group.
8158 */
8159
8160 group = child->groups;
8161 do {
8162 struct sched_group_capacity *sgc = group->sgc;
8163
8164 capacity += sgc->capacity;
8165 max_capacity = max(sgc->max_capacity, max_capacity);
8166 min_capacity = min(sgc->min_capacity, min_capacity);
8167 group = group->next;
8168 } while (group != child->groups);
8169 }
8170
8171 sdg->sgc->capacity = capacity;
8172 sdg->sgc->max_capacity = max_capacity;
8173 sdg->sgc->min_capacity = min_capacity;
8174 }
8175
8176 /*
8177 * Check whether the capacity of the rq has been noticeably reduced by side
8178 * activity. The imbalance_pct is used for the threshold.
8179  * Return true if the capacity is reduced.
8180 */
8181 static inline int
8182 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8183 {
8184 return ((rq->cpu_capacity * sd->imbalance_pct) <
8185 (rq->cpu_capacity_orig * 100));
8186 }
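/*
 * Example (assuming the common imbalance_pct of 125 and a cpu_capacity_orig
 * of 1024): the check fires once cpu_capacity drops below
 * 1024 * 100 / 125 = 819, i.e. once more than roughly 20% of the cpu is
 * being eaten by RT/IRQ pressure.
 */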
8187
8188 /*
8189 * Group imbalance indicates (and tries to solve) the problem where balancing
8190 * groups is inadequate due to tsk_cpus_allowed() constraints.
8191 *
8192 * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
8193 * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
8194 * Something like:
8195 *
8196 * { 0 1 2 3 } { 4 5 6 7 }
8197 * * * * *
8198 *
8199 * If we were to balance group-wise we'd place two tasks in the first group and
8200 * two tasks in the second group. Clearly this is undesired as it will overload
8201 * cpu 3 and leave one of the cpus in the second group unused.
8202 *
8203 * The current solution to this issue is detecting the skew in the first group
8204 * by noticing the lower domain failed to reach balance and had difficulty
8205 * moving tasks due to affinity constraints.
8206 *
8207 * When this is so detected; this group becomes a candidate for busiest; see
8208 * update_sd_pick_busiest(). And calculate_imbalance() and
8209 * find_busiest_group() avoid some of the usual balance conditions to allow it
8210 * to create an effective group imbalance.
8211 *
8212 * This is a somewhat tricky proposition since the next run might not find the
8213 * group imbalance and decide the groups need to be balanced again. A most
8214 * subtle and fragile situation.
8215 */
8216
8217 static inline int sg_imbalanced(struct sched_group *group)
8218 {
8219 return group->sgc->imbalance;
8220 }
8221
8222 /*
8223 * group_has_capacity returns true if the group has spare capacity that could
8224 * be used by some tasks.
8225  * We consider that a group has spare capacity if the number of tasks is
8226 * smaller than the number of CPUs or if the utilization is lower than the
8227 * available capacity for CFS tasks.
8228 * For the latter, we use a threshold to stabilize the state, to take into
8229 * account the variance of the tasks' load and to return true if the available
8230  * capacity is meaningful for the load balancer.
8231  * As an example, an available capacity of 1% can appear but provides no
8232  * real benefit to the load balancer.
8233 */
8234 static inline bool
8235 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8236 {
8237 if (sgs->sum_nr_running < sgs->group_weight)
8238 return true;
8239
8240 if ((sgs->group_capacity * 100) >
8241 (sgs->group_util * env->sd->imbalance_pct))
8242 return true;
8243
8244 return false;
8245 }
8246
8247 /*
8248 * group_is_overloaded returns true if the group has more tasks than it can
8249 * handle.
8250  * group_is_overloaded is not equal to !group_has_capacity because a group
8251  * with exactly the right number of tasks has no spare capacity left but is
8252  * not overloaded, so both group_has_capacity and group_is_overloaded return
8253  * false.
8254 */
8255 static inline bool
8256 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8257 {
8258 if (sgs->sum_nr_running <= sgs->group_weight)
8259 return false;
8260
8261 if ((sgs->group_capacity * 100) <
8262 (sgs->group_util * env->sd->imbalance_pct))
8263 return true;
8264
8265 return false;
8266 }
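/*
 * Example (assuming a 4-cpu group with group_capacity = 4096 and an
 * imbalance_pct of 125): group_has_capacity() is true while fewer than 4
 * tasks run or while group_util stays below 4096 * 100 / 125 = 3276 (~80%);
 * group_is_overloaded() only becomes true with more than 4 tasks *and*
 * group_util above that same threshold.  A group with exactly 4 tasks and
 * group_util of, say, 3500 hits the middle state described above: no spare
 * capacity, yet not overloaded.
 */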
8267
8268
8269 /*
8270 * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
8271 * per-cpu capacity than sched_group ref.
8272 */
8273 static inline bool
8274 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8275 {
8276 return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
8277 ref->sgc->max_capacity;
8278 }
8279
8280 static inline enum
8281 group_type group_classify(struct sched_group *group,
8282 struct sg_lb_stats *sgs)
8283 {
8284 if (sgs->group_no_capacity)
8285 return group_overloaded;
8286
8287 if (sg_imbalanced(group))
8288 return group_imbalanced;
8289
8290 if (sgs->group_misfit_task)
8291 return group_misfit_task;
8292
8293 return group_other;
8294 }
8295
8296 #ifdef CONFIG_NO_HZ_COMMON
8297 /*
8298 * idle load balancing data
8299 * - used by the nohz balance, but we want it available here
8300 * so that we can see which CPUs have no tick.
8301 */
8302 static struct {
8303 cpumask_var_t idle_cpus_mask;
8304 atomic_t nr_cpus;
8305 unsigned long next_balance; /* in jiffy units */
8306 } nohz ____cacheline_aligned;
8307
8308 static inline void update_cpu_stats_if_tickless(struct rq *rq)
8309 {
8310 /* only called from update_sg_lb_stats when irqs are disabled */
8311 if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
8312 /* rate limit updates to once-per-jiffie at most */
8313 if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
8314 return;
8315
8316 raw_spin_lock(&rq->lock);
8317 update_rq_clock(rq);
8318 update_idle_cpu_load(rq);
8319 update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
8320 raw_spin_unlock(&rq->lock);
8321 }
8322 }
8323
8324 #else
8325 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
8326 #endif
8327
8328 /**
8329 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8330 * @env: The load balancing environment.
8331 * @group: sched_group whose statistics are to be updated.
8332 * @load_idx: Load index of sched_domain of this_cpu for load calc.
8333 * @local_group: Does group contain this_cpu.
8334 * @sgs: variable to hold the statistics for this group.
8335 * @overload: Indicate more than one runnable task for any CPU.
8336 * @overutilized: Indicate overutilization for any CPU.
8337 */
8338 static inline void update_sg_lb_stats(struct lb_env *env,
8339 struct sched_group *group, int load_idx,
8340 int local_group, struct sg_lb_stats *sgs,
8341 bool *overload, bool *overutilized)
8342 {
8343 unsigned long load;
8344 int i, nr_running;
8345
8346 memset(sgs, 0, sizeof(*sgs));
8347
8348 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
8349 struct rq *rq = cpu_rq(i);
8350
8351 /* if we are entering idle and there are CPUs with
8352 * their tick stopped, do an update for them
8353 */
8354 if (env->idle == CPU_NEWLY_IDLE)
8355 update_cpu_stats_if_tickless(rq);
8356
8357 /* Bias balancing toward cpus of our domain */
8358 if (local_group)
8359 load = target_load(i, load_idx);
8360 else
8361 load = source_load(i, load_idx);
8362
8363 sgs->group_load += load;
8364 sgs->group_util += cpu_util(i);
8365 sgs->sum_nr_running += rq->cfs.h_nr_running;
8366
8367 nr_running = rq->nr_running;
8368 if (nr_running > 1)
8369 *overload = true;
8370
8371 #ifdef CONFIG_NUMA_BALANCING
8372 sgs->nr_numa_running += rq->nr_numa_running;
8373 sgs->nr_preferred_running += rq->nr_preferred_running;
8374 #endif
8375 sgs->sum_weighted_load += weighted_cpuload(i);
8376 /*
8377 * No need to call idle_cpu() if nr_running is not 0
8378 */
8379 if (!nr_running && idle_cpu(i))
8380 sgs->idle_cpus++;
8381
8382 if (cpu_overutilized(i)) {
8383 *overutilized = true;
8384 if (!sgs->group_misfit_task && rq->misfit_task)
8385 sgs->group_misfit_task = capacity_of(i);
8386 }
8387 }
8388
8389 /* Adjust by relative CPU capacity of the group */
8390 sgs->group_capacity = group->sgc->capacity;
8391 sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8392
8393 if (sgs->sum_nr_running)
8394 sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8395
8396 sgs->group_weight = group->group_weight;
8397
8398 sgs->group_no_capacity = group_is_overloaded(env, sgs);
8399 sgs->group_type = group_classify(group, sgs);
8400 }
8401
8402 /**
8403 * update_sd_pick_busiest - return 1 on busiest group
8404 * @env: The load balancing environment.
8405 * @sds: sched_domain statistics
8406 * @sg: sched_group candidate to be checked for being the busiest
8407 * @sgs: sched_group statistics
8408 *
8409 * Determine if @sg is a busier group than the previously selected
8410 * busiest group.
8411 *
8412 * Return: %true if @sg is a busier group than the previously selected
8413 * busiest group. %false otherwise.
8414 */
8415 static bool update_sd_pick_busiest(struct lb_env *env,
8416 struct sd_lb_stats *sds,
8417 struct sched_group *sg,
8418 struct sg_lb_stats *sgs)
8419 {
8420 struct sg_lb_stats *busiest = &sds->busiest_stat;
8421
8422 if (sgs->group_type > busiest->group_type)
8423 return true;
8424
8425 if (sgs->group_type < busiest->group_type)
8426 return false;
8427
8428 /*
8429 * Candidate sg doesn't face any serious load-balance problems
8430 * so don't pick it if the local sg is already filled up.
8431 */
8432 if (sgs->group_type == group_other &&
8433 !group_has_capacity(env, &sds->local_stat))
8434 return false;
8435
8436 if (sgs->avg_load <= busiest->avg_load)
8437 return false;
8438
8439 if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8440 goto asym_packing;
8441
8442 /*
8443 * Candidate sg has no more than one task per CPU and
8444 * has higher per-CPU capacity. Migrating tasks to less
8445 * capable CPUs may harm throughput. Maximize throughput,
8446 * power/energy consequences are not considered.
8447 */
8448 if (sgs->sum_nr_running <= sgs->group_weight &&
8449 group_smaller_cpu_capacity(sds->local, sg))
8450 return false;
8451
8452 asym_packing:
8453 /* This is the busiest node in its class. */
8454 if (!(env->sd->flags & SD_ASYM_PACKING))
8455 return true;
8456
8457 /*
8458 * ASYM_PACKING needs to move all the work to the lowest
8459 * numbered CPUs in the group, therefore mark all groups
8460 * higher than ourself as busy.
8461 */
8462 if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
8463 if (!sds->busiest)
8464 return true;
8465
8466 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
8467 return true;
8468 }
8469
8470 return false;
8471 }
8472
8473 #ifdef CONFIG_NUMA_BALANCING
8474 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8475 {
8476 if (sgs->sum_nr_running > sgs->nr_numa_running)
8477 return regular;
8478 if (sgs->sum_nr_running > sgs->nr_preferred_running)
8479 return remote;
8480 return all;
8481 }
8482
8483 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8484 {
8485 if (rq->nr_running > rq->nr_numa_running)
8486 return regular;
8487 if (rq->nr_running > rq->nr_preferred_running)
8488 return remote;
8489 return all;
8490 }
8491 #else
8492 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8493 {
8494 return all;
8495 }
8496
8497 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8498 {
8499 return regular;
8500 }
8501 #endif /* CONFIG_NUMA_BALANCING */
8502
8503 #define lb_sd_parent(sd) \
8504 (sd->parent && sd->parent->groups != sd->parent->groups->next)
8505
8506 /**
8507 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
8508 * @env: The load balancing environment.
8509 * @sds: variable to hold the statistics for this sched_domain.
8510 */
8511 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8512 {
8513 struct sched_domain *child = env->sd->child;
8514 struct sched_group *sg = env->sd->groups;
8515 struct sg_lb_stats tmp_sgs;
8516 int load_idx, prefer_sibling = 0;
8517 bool overload = false, overutilized = false;
8518
8519 if (child && child->flags & SD_PREFER_SIBLING)
8520 prefer_sibling = 1;
8521
8522 load_idx = get_sd_load_idx(env->sd, env->idle);
8523
8524 do {
8525 struct sg_lb_stats *sgs = &tmp_sgs;
8526 int local_group;
8527
8528 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
8529 if (local_group) {
8530 sds->local = sg;
8531 sgs = &sds->local_stat;
8532
8533 if (env->idle != CPU_NEWLY_IDLE ||
8534 time_after_eq(jiffies, sg->sgc->next_update))
8535 update_group_capacity(env->sd, env->dst_cpu);
8536 }
8537
8538 update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8539 &overload, &overutilized);
8540
8541 if (local_group)
8542 goto next_group;
8543
8544 /*
8545 * In case the child domain prefers tasks go to siblings
8546 * first, lower the sg capacity so that we'll try
8547 * and move all the excess tasks away. We lower the capacity
8548 * of a group only if the local group has the capacity to fit
8549 * these excess tasks. The extra check prevents the case where
8550 * you always pull from the heaviest group when it is already
8551 		 * under-utilized (possible when a large weight task outweighs
8552 		 * the rest of the tasks on the system).
8553 */
8554 if (prefer_sibling && sds->local &&
8555 group_has_capacity(env, &sds->local_stat) &&
8556 (sgs->sum_nr_running > 1)) {
8557 sgs->group_no_capacity = 1;
8558 sgs->group_type = group_classify(sg, sgs);
8559 }
8560
8561 /*
8562 * Ignore task groups with misfit tasks if local group has no
8563 * capacity or if per-cpu capacity isn't higher.
8564 */
8565 if (sgs->group_type == group_misfit_task &&
8566 (!group_has_capacity(env, &sds->local_stat) ||
8567 !group_smaller_cpu_capacity(sg, sds->local)))
8568 sgs->group_type = group_other;
8569
8570 if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8571 sds->busiest = sg;
8572 sds->busiest_stat = *sgs;
8573 }
8574
8575 next_group:
8576 /* Now, start updating sd_lb_stats */
8577 sds->total_load += sgs->group_load;
8578 sds->total_capacity += sgs->group_capacity;
8579
8580 sg = sg->next;
8581 } while (sg != env->sd->groups);
8582
8583 if (env->sd->flags & SD_NUMA)
8584 env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8585
8586 env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
8587
8588 if (!lb_sd_parent(env->sd)) {
8589 /* update overload indicator if we are at root domain */
8590 if (env->dst_rq->rd->overload != overload)
8591 env->dst_rq->rd->overload = overload;
8592
8593 /* Update over-utilization (tipping point, U >= 0) indicator */
8594 if (env->dst_rq->rd->overutilized != overutilized) {
8595 env->dst_rq->rd->overutilized = overutilized;
8596 trace_sched_overutilized(overutilized);
8597 }
8598 } else {
8599 if (!env->dst_rq->rd->overutilized && overutilized) {
8600 env->dst_rq->rd->overutilized = true;
8601 trace_sched_overutilized(true);
8602 }
8603 }
8604
8605 }
8606
8607 /**
8608 * check_asym_packing - Check to see if the group is packed into the
8609  *			sched domain.
8610 *
8611  * This is primarily intended to be used at the sibling level.  Some
8612 * cores like POWER7 prefer to use lower numbered SMT threads. In the
8613 * case of POWER7, it can move to lower SMT modes only when higher
8614 * threads are idle. When in lower SMT modes, the threads will
8615 * perform better since they share less core resources. Hence when we
8616 * have idle threads, we want them to be the higher ones.
8617 *
8618 * This packing function is run on idle threads. It checks to see if
8619 * the busiest CPU in this domain (core in the P7 case) has a higher
8620 * CPU number than the packing function is being run on. Here we are
8621 * assuming lower CPU number will be equivalent to lower a SMT thread
8622  * assuming a lower CPU number will be equivalent to a lower SMT thread
8623 *
8624 * Return: 1 when packing is required and a task should be moved to
8625 * this CPU. The amount of the imbalance is returned in *imbalance.
8626 *
8627 * @env: The load balancing environment.
8628 * @sds: Statistics of the sched_domain which is to be packed
8629 */
8630 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8631 {
8632 int busiest_cpu;
8633
8634 if (!(env->sd->flags & SD_ASYM_PACKING))
8635 return 0;
8636
8637 if (!sds->busiest)
8638 return 0;
8639
8640 busiest_cpu = group_first_cpu(sds->busiest);
8641 if (env->dst_cpu > busiest_cpu)
8642 return 0;
8643
8644 env->imbalance = DIV_ROUND_CLOSEST(
8645 sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8646 SCHED_CAPACITY_SCALE);
8647
8648 return 1;
8649 }
8650
8651 /**
8652 * fix_small_imbalance - Calculate the minor imbalance that exists
8653 * amongst the groups of a sched_domain, during
8654 * load balancing.
8655 * @env: The load balancing environment.
8656 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
8657 */
8658 static inline
8659 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8660 {
8661 unsigned long tmp, capa_now = 0, capa_move = 0;
8662 unsigned int imbn = 2;
8663 unsigned long scaled_busy_load_per_task;
8664 struct sg_lb_stats *local, *busiest;
8665
8666 local = &sds->local_stat;
8667 busiest = &sds->busiest_stat;
8668
8669 if (!local->sum_nr_running)
8670 local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8671 else if (busiest->load_per_task > local->load_per_task)
8672 imbn = 1;
8673
8674 scaled_busy_load_per_task =
8675 (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8676 busiest->group_capacity;
8677
8678 if (busiest->avg_load + scaled_busy_load_per_task >=
8679 local->avg_load + (scaled_busy_load_per_task * imbn)) {
8680 env->imbalance = busiest->load_per_task;
8681 return;
8682 }
8683
8684 /*
8685 * OK, we don't have enough imbalance to justify moving tasks,
8686 * however we may be able to increase total CPU capacity used by
8687 * moving them.
8688 */
8689
8690 capa_now += busiest->group_capacity *
8691 min(busiest->load_per_task, busiest->avg_load);
8692 capa_now += local->group_capacity *
8693 min(local->load_per_task, local->avg_load);
8694 capa_now /= SCHED_CAPACITY_SCALE;
8695
8696 /* Amount of load we'd subtract */
8697 if (busiest->avg_load > scaled_busy_load_per_task) {
8698 capa_move += busiest->group_capacity *
8699 min(busiest->load_per_task,
8700 busiest->avg_load - scaled_busy_load_per_task);
8701 }
8702
8703 /* Amount of load we'd add */
8704 if (busiest->avg_load * busiest->group_capacity <
8705 busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8706 tmp = (busiest->avg_load * busiest->group_capacity) /
8707 local->group_capacity;
8708 } else {
8709 tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8710 local->group_capacity;
8711 }
8712 capa_move += local->group_capacity *
8713 min(local->load_per_task, local->avg_load + tmp);
8714 capa_move /= SCHED_CAPACITY_SCALE;
8715
8716 /* Move if we gain throughput */
8717 if (capa_move > capa_now)
8718 env->imbalance = busiest->load_per_task;
8719 }
8720
8721 /**
8722 * calculate_imbalance - Calculate the amount of imbalance present within the
8723 * groups of a given sched_domain during load balance.
8724 * @env: load balance environment
8725 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
8726 */
8727 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8728 {
8729 unsigned long max_pull, load_above_capacity = ~0UL;
8730 struct sg_lb_stats *local, *busiest;
8731
8732 local = &sds->local_stat;
8733 busiest = &sds->busiest_stat;
8734
8735 if (busiest->group_type == group_imbalanced) {
8736 /*
8737 * In the group_imb case we cannot rely on group-wide averages
8738 * to ensure cpu-load equilibrium, look at wider averages. XXX
8739 */
8740 busiest->load_per_task =
8741 min(busiest->load_per_task, sds->avg_load);
8742 }
8743
8744 /*
8745 * In the presence of smp nice balancing, certain scenarios can have
8746 	 * max load less than avg load (as we skip the groups at or below
8747 	 * its cpu_capacity while calculating max_load).
8748 */
8749 if (busiest->avg_load <= sds->avg_load ||
8750 local->avg_load >= sds->avg_load) {
8751 /* Misfitting tasks should be migrated in any case */
8752 if (busiest->group_type == group_misfit_task) {
8753 env->imbalance = busiest->group_misfit_task;
8754 return;
8755 }
8756
8757 /*
8758 * Busiest group is overloaded, local is not, use the spare
8759 * cycles to maximize throughput
8760 */
8761 if (busiest->group_type == group_overloaded &&
8762 local->group_type <= group_misfit_task) {
8763 env->imbalance = busiest->load_per_task;
8764 return;
8765 }
8766
8767 env->imbalance = 0;
8768 return fix_small_imbalance(env, sds);
8769 }
8770
8771 /*
8772 * If there aren't any idle cpus, avoid creating some.
8773 */
8774 if (busiest->group_type == group_overloaded &&
8775 local->group_type == group_overloaded) {
8776 load_above_capacity = busiest->sum_nr_running *
8777 SCHED_LOAD_SCALE;
8778 if (load_above_capacity > busiest->group_capacity)
8779 load_above_capacity -= busiest->group_capacity;
8780 else
8781 load_above_capacity = ~0UL;
8782 }
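/*
 * Worked example with illustrative numbers (SCHED_LOAD_SCALE assumed to be
 * 1024): a busiest group running 6 tasks on a group_capacity of 2048
 * (roughly two full CPUs) gets
 *   load_above_capacity = 6 * 1024 - 2048 = 4096
 * i.e. the portion of its load that exceeds what the group can serve.
 */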
8783
8784 /*
8785 * We're trying to get all the cpus to the average_load, so we don't
8786 * want to push ourselves above the average load, nor do we wish to
8787 * reduce the max loaded cpu below the average load. At the same time,
8788 * we also don't want to reduce the group load below the group capacity
8789 * (so that we can implement power-savings policies etc). Thus we look
8790 * for the minimum possible imbalance.
8791 */
8792 max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8793
8794 /* How much load to actually move to equalise the imbalance */
8795 env->imbalance = min(
8796 max_pull * busiest->group_capacity,
8797 (sds->avg_load - local->avg_load) * local->group_capacity
8798 ) / SCHED_CAPACITY_SCALE;
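/*
 * Illustrative numbers (assuming SCHED_CAPACITY_SCALE == 1024 and no limit
 * from load_above_capacity): with busiest->avg_load = 1536,
 * sds->avg_load = 1024, local->avg_load = 512 and both group capacities at
 * 1024,
 *   max_pull  = 1536 - 1024 = 512
 *   imbalance = min(512 * 1024, (1024 - 512) * 1024) / 1024 = 512
 * so we pull the busiest group down towards the domain average without
 * pushing the local group above it.
 */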
8799
8800 /* Boost imbalance to allow misfit task to be balanced. */
8801 if (busiest->group_type == group_misfit_task)
8802 env->imbalance = max_t(long, env->imbalance,
8803 busiest->group_misfit_task);
8804
8805 /*
8806 * if *imbalance is less than the average load per runnable task
8807 * there is no guarantee that any tasks will be moved so we'll have
8808 * a think about bumping its value to force at least one task to be
8809 * moved
8810 */
8811 if (env->imbalance < busiest->load_per_task)
8812 return fix_small_imbalance(env, sds);
8813 }
8814
8815 /******* find_busiest_group() helpers end here *********************/
8816
8817 /**
8818 * find_busiest_group - Returns the busiest group within the sched_domain
8819 * if there is an imbalance. If there isn't an imbalance, and
8820 * the user has opted for power-savings, it returns a group whose
8821 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
8822 * such a group exists.
8823 *
8824 * Also calculates the amount of weighted load which should be moved
8825 * to restore balance.
8826 *
8827 * @env: The load balancing environment.
8828 *
8829 * Return: - The busiest group if imbalance exists.
8830 * - If no imbalance and user has opted for power-savings balance,
8831 * return the least loaded group whose CPUs can be
8832 * put to idle by rebalancing its tasks onto our group.
8833 */
8834 static struct sched_group *find_busiest_group(struct lb_env *env)
8835 {
8836 struct sg_lb_stats *local, *busiest;
8837 struct sd_lb_stats sds;
8838
8839 init_sd_lb_stats(&sds);
8840
8841 /*
8842 * Compute the various statistics relevant for load balancing at
8843 * this level.
8844 */
8845 update_sd_lb_stats(env, &sds);
8846
8847 if (energy_aware() && !env->dst_rq->rd->overutilized)
8848 goto out_balanced;
8849
8850 local = &sds.local_stat;
8851 busiest = &sds.busiest_stat;
8852
8853 /* ASYM feature bypasses nice load balance check */
8854 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
8855 check_asym_packing(env, &sds))
8856 return sds.busiest;
8857
8858 /* There is no busy sibling group to pull tasks from */
8859 if (!sds.busiest || busiest->sum_nr_running == 0)
8860 goto out_balanced;
8861
8862 sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8863 / sds.total_capacity;
8864
8865 /*
8866 * If the busiest group is imbalanced the below checks don't
8867 * work because they assume all things are equal, which typically
8868 * isn't true due to cpus_allowed constraints and the like.
8869 */
8870 if (busiest->group_type == group_imbalanced)
8871 goto force_balance;
8872
8873 /*
8874 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8875 * capacities from resulting in underutilization due to avg_load.
8876 */
8877 if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8878 busiest->group_no_capacity)
8879 goto force_balance;
8880
8881 /* Misfitting tasks should be dealt with regardless of the avg load */
8882 if (busiest->group_type == group_misfit_task) {
8883 goto force_balance;
8884 }
8885
8886 /*
8887 * If the local group is busier than the selected busiest group
8888 * don't try and pull any tasks.
8889 */
8890 if (local->avg_load >= busiest->avg_load)
8891 goto out_balanced;
8892
8893 /*
8894 * Don't pull any tasks if this group is already above the domain
8895 * average load.
8896 */
8897 if (local->avg_load >= sds.avg_load)
8898 goto out_balanced;
8899
8900 if (env->idle == CPU_IDLE) {
8901 /*
8902 * This cpu is idle. If the busiest group is not overloaded
8903 * and there is no imbalance between this and busiest group
8904 * wrt idle cpus, it is balanced. The imbalance becomes
8905 * significant if the diff is greater than 1; otherwise we
8906 * might end up just moving the imbalance to another group
8907 */
8908 if ((busiest->group_type != group_overloaded) &&
8909 (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
8910 !group_smaller_cpu_capacity(sds.busiest, sds.local))
8911 goto out_balanced;
8912 } else {
8913 /*
8914 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8915 * imbalance_pct to be conservative.
8916 */
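/*
 * For instance, with a typical imbalance_pct of 125 an imbalance is only
 * acted upon when the busiest group carries more than 25% extra load:
 * busiest->avg_load = 1300 vs. local->avg_load = 1000 proceeds to
 * force_balance, while 1200 vs. 1000 is treated as balanced.
 */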
8917 if (100 * busiest->avg_load <=
8918 env->sd->imbalance_pct * local->avg_load)
8919 goto out_balanced;
8920 }
8921
8922 force_balance:
8923 env->busiest_group_type = busiest->group_type;
8924 /* Looks like there is an imbalance. Compute it */
8925 calculate_imbalance(env, &sds);
8926 return sds.busiest;
8927
8928 out_balanced:
8929 env->imbalance = 0;
8930 return NULL;
8931 }
8932
8933 /*
8934 * find_busiest_queue - find the busiest runqueue among the cpus in group.
8935 */
8936 static struct rq *find_busiest_queue(struct lb_env *env,
8937 struct sched_group *group)
8938 {
8939 struct rq *busiest = NULL, *rq;
8940 unsigned long busiest_load = 0, busiest_capacity = 1;
8941 int i;
8942
8943 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
8944 unsigned long capacity, wl;
8945 enum fbq_type rt;
8946
8947 rq = cpu_rq(i);
8948 rt = fbq_classify_rq(rq);
8949
8950 /*
8951 * We classify groups/runqueues into three groups:
8952 * - regular: there are !numa tasks
8953 * - remote: there are numa tasks that run on the 'wrong' node
8954 * - all: there is no distinction
8955 *
8956 * In order to avoid migrating ideally placed numa tasks,
8957 * ignore those when there are better options.
8958 *
8959 * If we ignore the actual busiest queue to migrate another
8960 * task, the next balance pass can still reduce the busiest
8961 * queue by moving tasks around inside the node.
8962 *
8963 * If we cannot move enough load due to this classification
8964 * the next pass will adjust the group classification and
8965 * allow migration of more tasks.
8966 *
8967 * Both cases only affect the total convergence complexity.
8968 */
8969 if (rt > env->fbq_type)
8970 continue;
8971
8972 capacity = capacity_of(i);
8973
8974 wl = weighted_cpuload(i);
8975
8976 /*
8977 * When comparing with imbalance, use weighted_cpuload()
8978 * which is not scaled with the cpu capacity.
8979 */
8980
8981 if (rq->nr_running == 1 && wl > env->imbalance &&
8982 !check_cpu_capacity(rq, env->sd) &&
8983 env->busiest_group_type != group_misfit_task)
8984 continue;
8985
8986 /*
8987 * For the load comparisons with the other cpus, consider
8988 * the weighted_cpuload() scaled with the cpu capacity, so
8989 * that the load can be moved away from the cpu that is
8990 * potentially running at a lower capacity.
8991 *
8992 * Thus we're looking for max(wl_i / capacity_i), crosswise
8993 * multiplication to rid ourselves of the division works out
8994 * to: wl_i * capacity_j > wl_j * capacity_i; where j is
8995 * our previous maximum.
8996 */
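/*
 * Quick illustration with made-up numbers: a CPU with wl = 800 and
 * capacity = 512 is busier than one with wl = 1000 and capacity = 1024,
 * since 800 * 1024 > 1000 * 512 (about 1.56 vs. 0.98 load per unit of
 * capacity).
 */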
8997 if (wl * busiest_capacity > busiest_load * capacity) {
8998 busiest_load = wl;
8999 busiest_capacity = capacity;
9000 busiest = rq;
9001 }
9002 }
9003
9004 return busiest;
9005 }
9006
9007 /*
9008 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
9009 * so long as it is large enough.
9010 */
9011 #define MAX_PINNED_INTERVAL 512
9012
9013 /* Working cpumask for load_balance and load_balance_newidle. */
9014 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
9015
9016 static int need_active_balance(struct lb_env *env)
9017 {
9018 struct sched_domain *sd = env->sd;
9019
9020 if (env->idle == CPU_NEWLY_IDLE) {
9021
9022 /*
9023 * ASYM_PACKING needs to force migrate tasks from busy but
9024 * higher numbered CPUs in order to pack all tasks in the
9025 * lowest numbered CPUs.
9026 */
9027 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
9028 return 1;
9029 }
9030
9031 /*
9032 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
9033 * It's worth migrating the task if the src_cpu's capacity is reduced
9034 * because of other sched_class or IRQs if more capacity stays
9035 * available on dst_cpu.
9036 */
9037 if ((env->idle != CPU_NOT_IDLE) &&
9038 (env->src_rq->cfs.h_nr_running == 1)) {
9039 if ((check_cpu_capacity(env->src_rq, sd)) &&
9040 (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9041 return 1;
9042 }
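/*
 * Example, assuming a typical imbalance_pct of 125: if RT/IRQ pressure has
 * reduced the src CPU's capacity to 600 while the dst CPU still offers
 * 1024, then 600 * 125 < 1024 * 100 (75000 < 102400) and the lone CFS task
 * is considered worth actively migrating.
 */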
9043
9044 if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9045 ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
9046 env->src_rq->cfs.h_nr_running == 1 &&
9047 cpu_overutilized(env->src_cpu) &&
9048 !cpu_overutilized(env->dst_cpu)) {
9049 return 1;
9050 }
9051
9052 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
9053 }
9054
9055 static int active_load_balance_cpu_stop(void *data);
9056
9057 static int should_we_balance(struct lb_env *env)
9058 {
9059 struct sched_group *sg = env->sd->groups;
9060 struct cpumask *sg_cpus, *sg_mask;
9061 int cpu, balance_cpu = -1;
9062
9063 /*
9064 * In the newly idle case, we will allow all the cpus
9065 * to do the newly idle load balance.
9066 */
9067 if (env->idle == CPU_NEWLY_IDLE)
9068 return 1;
9069
9070 sg_cpus = sched_group_cpus(sg);
9071 sg_mask = sched_group_mask(sg);
9072 /* Try to find first idle cpu */
9073 for_each_cpu_and(cpu, sg_cpus, env->cpus) {
9074 if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
9075 continue;
9076
9077 balance_cpu = cpu;
9078 break;
9079 }
9080
9081 if (balance_cpu == -1)
9082 balance_cpu = group_balance_cpu(sg);
9083
9084 /*
9085 * First idle cpu or the first cpu (busiest) in this sched group
9086 * is eligible for doing load balancing at this and above domains.
9087 */
9088 return balance_cpu == env->dst_cpu;
9089 }
9090
9091 /*
9092 * Check this_cpu to ensure it is balanced within domain. Attempt to move
9093 * tasks if there is an imbalance.
9094 */
9095 static int load_balance(int this_cpu, struct rq *this_rq,
9096 struct sched_domain *sd, enum cpu_idle_type idle,
9097 int *continue_balancing)
9098 {
9099 int ld_moved, cur_ld_moved, active_balance = 0;
9100 struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
9101 struct sched_group *group;
9102 struct rq *busiest;
9103 unsigned long flags;
9104 struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9105
9106 struct lb_env env = {
9107 .sd = sd,
9108 .dst_cpu = this_cpu,
9109 .dst_rq = this_rq,
9110 .dst_grpmask = sched_group_cpus(sd->groups),
9111 .idle = idle,
9112 .loop_break = sched_nr_migrate_break,
9113 .cpus = cpus,
9114 .fbq_type = all,
9115 .tasks = LIST_HEAD_INIT(env.tasks),
9116 };
9117
9118 /*
9119 * For NEWLY_IDLE load_balancing, we don't need to consider
9120 * other cpus in our group
9121 */
9122 if (idle == CPU_NEWLY_IDLE)
9123 env.dst_grpmask = NULL;
9124
9125 cpumask_copy(cpus, cpu_active_mask);
9126
9127 schedstat_inc(sd, lb_count[idle]);
9128
9129 redo:
9130 if (!should_we_balance(&env)) {
9131 *continue_balancing = 0;
9132 goto out_balanced;
9133 }
9134
9135 group = find_busiest_group(&env);
9136 if (!group) {
9137 schedstat_inc(sd, lb_nobusyg[idle]);
9138 goto out_balanced;
9139 }
9140
9141 busiest = find_busiest_queue(&env, group);
9142 if (!busiest) {
9143 schedstat_inc(sd, lb_nobusyq[idle]);
9144 goto out_balanced;
9145 }
9146
9147 BUG_ON(busiest == env.dst_rq);
9148
9149 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
9150
9151 env.src_cpu = busiest->cpu;
9152 env.src_rq = busiest;
9153
9154 ld_moved = 0;
9155 if (busiest->nr_running > 1) {
9156 /*
9157 * Attempt to move tasks. If find_busiest_group has found
9158 * an imbalance but busiest->nr_running <= 1, the group is
9159 * still unbalanced. ld_moved simply stays zero, so it is
9160 * correctly treated as an imbalance.
9161 */
9162 env.flags |= LBF_ALL_PINNED;
9163 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
9164
9165 more_balance:
9166 raw_spin_lock_irqsave(&busiest->lock, flags);
9167 update_rq_clock(busiest);
9168
9169 /*
9170 * cur_ld_moved - load moved in current iteration
9171 * ld_moved - cumulative load moved across iterations
9172 */
9173 cur_ld_moved = detach_tasks(&env);
9174
9175 /*
9176 * We've detached some tasks from busiest_rq. Every
9177 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9178 * unlock busiest->lock, and we are able to be sure
9179 * that nobody can manipulate the tasks in parallel.
9180 * See task_rq_lock() family for the details.
9181 */
9182
9183 raw_spin_unlock(&busiest->lock);
9184
9185 if (cur_ld_moved) {
9186 attach_tasks(&env);
9187 ld_moved += cur_ld_moved;
9188 }
9189
9190 local_irq_restore(flags);
9191
9192 if (env.flags & LBF_NEED_BREAK) {
9193 env.flags &= ~LBF_NEED_BREAK;
9194 goto more_balance;
9195 }
9196
9197 /*
9198 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9199 * us and move them to an alternate dst_cpu in our sched_group
9200 * where they can run. The upper limit on how many times we
9201 * iterate on same src_cpu is dependent on number of cpus in our
9202 * sched_group.
9203 *
9204 * This changes load balance semantics a bit on who can move
9205 * load to a given_cpu. In addition to the given_cpu itself
9206 * (or an ilb_cpu acting on its behalf where given_cpu is
9207 * nohz-idle), we now have balance_cpu in a position to move
9208 * load to given_cpu. In rare situations, this may cause
9209 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9210 * _independently_ and at _same_ time to move some load to
9211 * given_cpu) causing excess load to be moved to given_cpu.
9212 * This however should not happen so much in practice and
9213 * moreover subsequent load balance cycles should correct the
9214 * excess load moved.
9215 */
9216 if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9217
9218 /* Prevent re-selecting dst_cpu via env's cpus */
9219 cpumask_clear_cpu(env.dst_cpu, env.cpus);
9220
9221 env.dst_rq = cpu_rq(env.new_dst_cpu);
9222 env.dst_cpu = env.new_dst_cpu;
9223 env.flags &= ~LBF_DST_PINNED;
9224 env.loop = 0;
9225 env.loop_break = sched_nr_migrate_break;
9226
9227 /*
9228 * Go back to "more_balance" rather than "redo" since we
9229 * need to continue with same src_cpu.
9230 */
9231 goto more_balance;
9232 }
9233
9234 /*
9235 * We failed to reach balance because of affinity.
9236 */
9237 if (sd_parent) {
9238 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9239
9240 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9241 *group_imbalance = 1;
9242 }
9243
9244 /* All tasks on this runqueue were pinned by CPU affinity */
9245 if (unlikely(env.flags & LBF_ALL_PINNED)) {
9246 cpumask_clear_cpu(cpu_of(busiest), cpus);
9247 if (!cpumask_empty(cpus)) {
9248 env.loop = 0;
9249 env.loop_break = sched_nr_migrate_break;
9250 goto redo;
9251 }
9252 goto out_all_pinned;
9253 }
9254 }
9255
9256 if (!ld_moved) {
9257 schedstat_inc(sd, lb_failed[idle]);
9258 /*
9259 * Increment the failure counter only on periodic balance.
9260 * We do not want newidle balance, which can be very
9261 * frequent, to pollute the failure counter, causing
9262 * excessive cache_hot migrations and active balances.
9263 */
9264 if (idle != CPU_NEWLY_IDLE)
9265 if (env.src_grp_nr_running > 1)
9266 sd->nr_balance_failed++;
9267
9268 if (need_active_balance(&env)) {
9269 raw_spin_lock_irqsave(&busiest->lock, flags);
9270
9271 /* don't kick the active_load_balance_cpu_stop,
9272 * if the curr task on busiest cpu can't be
9273 * moved to this_cpu
9274 */
9275 if (!cpumask_test_cpu(this_cpu,
9276 tsk_cpus_allowed(busiest->curr))) {
9277 raw_spin_unlock_irqrestore(&busiest->lock,
9278 flags);
9279 env.flags |= LBF_ALL_PINNED;
9280 goto out_one_pinned;
9281 }
9282
9283 /*
9284 * ->active_balance synchronizes accesses to
9285 * ->active_balance_work. Once set, it's cleared
9286 * only after active load balance is finished.
9287 */
9288 if (!busiest->active_balance) {
9289 busiest->active_balance = 1;
9290 busiest->push_cpu = this_cpu;
9291 active_balance = 1;
9292 }
9293 raw_spin_unlock_irqrestore(&busiest->lock, flags);
9294
9295 if (active_balance) {
9296 stop_one_cpu_nowait(cpu_of(busiest),
9297 active_load_balance_cpu_stop, busiest,
9298 &busiest->active_balance_work);
9299 }
9300
9301 /*
9302 * We've kicked active balancing, reset the failure
9303 * counter.
9304 */
9305 sd->nr_balance_failed = sd->cache_nice_tries+1;
9306 }
9307 } else
9308 sd->nr_balance_failed = 0;
9309
9310 if (likely(!active_balance)) {
9311 /* We were unbalanced, so reset the balancing interval */
9312 sd->balance_interval = sd->min_interval;
9313 } else {
9314 /*
9315 * If we've begun active balancing, start to back off. This
9316 * case may not be covered by the all_pinned logic if there
9317 * is only 1 task on the busy runqueue (because we don't call
9318 * detach_tasks).
9319 */
9320 if (sd->balance_interval < sd->max_interval)
9321 sd->balance_interval *= 2;
9322 }
9323
9324 goto out;
9325
9326 out_balanced:
9327 /*
9328 * We reach balance although we may have faced some affinity
9329 * constraints. Clear the imbalance flag only if other tasks got
9330 * a chance to move and fix the imbalance.
9331 */
9332 if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9333 int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9334
9335 if (*group_imbalance)
9336 *group_imbalance = 0;
9337 }
9338
9339 out_all_pinned:
9340 /*
9341 * We reach balance because all tasks are pinned at this level so
9342 * we can't migrate them. Leave the imbalance flag set so the parent
9343 * level can try to migrate them.
9344 */
9345 schedstat_inc(sd, lb_balanced[idle]);
9346
9347 sd->nr_balance_failed = 0;
9348
9349 out_one_pinned:
9350 ld_moved = 0;
9351
9352 /*
9353 * idle_balance() disregards balance intervals, so we could repeatedly
9354 * reach this code, which would lead to balance_interval skyrocketing
9355 * in a short amount of time. Skip the balance_interval increase logic
9356 * to avoid that.
9357 */
9358 if (env.idle == CPU_NEWLY_IDLE)
9359 goto out;
9360
9361 /* tune up the balancing interval */
9362 if (((env.flags & LBF_ALL_PINNED) &&
9363 sd->balance_interval < MAX_PINNED_INTERVAL) ||
9364 (sd->balance_interval < sd->max_interval))
9365 sd->balance_interval *= 2;
9366 out:
9367 return ld_moved;
9368 }
9369
9370 static inline unsigned long
9371 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9372 {
9373 unsigned long interval = sd->balance_interval;
9374
9375 if (cpu_busy)
9376 interval *= sd->busy_factor;
9377
9378 /* scale ms to jiffies */
9379 interval = msecs_to_jiffies(interval);
9380 interval = clamp(interval, 1UL, max_load_balance_interval);
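/*
 * Illustrative numbers: with balance_interval = 8 (ms) and a busy_factor
 * of 32, a busy CPU is rebalanced roughly every 256 ms; the conversion and
 * clamp above bound that to at least one jiffy and at most
 * max_load_balance_interval.
 */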
9381
9382 return interval;
9383 }
9384
9385 static inline void
9386 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
9387 {
9388 unsigned long interval, next;
9389
9390 interval = get_sd_balance_interval(sd, cpu_busy);
9391 next = sd->last_balance + interval;
9392
9393 if (time_after(*next_balance, next))
9394 *next_balance = next;
9395 }
9396
9397 /*
9398 * idle_balance is called by schedule() if this_cpu is about to become
9399 * idle. Attempts to pull tasks from other CPUs.
9400 */
9401 static int idle_balance(struct rq *this_rq)
9402 {
9403 unsigned long next_balance = jiffies + HZ;
9404 int this_cpu = this_rq->cpu;
9405 struct sched_domain *sd;
9406 int pulled_task = 0;
9407 u64 curr_cost = 0;
9408
9409 idle_enter_fair(this_rq);
9410
9411 /*
9412 * We must set idle_stamp _before_ calling idle_balance(), such that we
9413 * measure the duration of idle_balance() as idle time.
9414 */
9415 this_rq->idle_stamp = rq_clock(this_rq);
9416
9417 if (!energy_aware() &&
9418 (this_rq->avg_idle < sysctl_sched_migration_cost ||
9419 !this_rq->rd->overload)) {
9420 rcu_read_lock();
9421 sd = rcu_dereference_check_sched_domain(this_rq->sd);
9422 if (sd)
9423 update_next_balance(sd, 0, &next_balance);
9424 rcu_read_unlock();
9425
9426 goto out;
9427 }
9428
9429 raw_spin_unlock(&this_rq->lock);
9430
9431 update_blocked_averages(this_cpu);
9432 rcu_read_lock();
9433 for_each_domain(this_cpu, sd) {
9434 int continue_balancing = 1;
9435 u64 t0, domain_cost;
9436
9437 if (!(sd->flags & SD_LOAD_BALANCE))
9438 continue;
9439
9440 if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9441 update_next_balance(sd, 0, &next_balance);
9442 break;
9443 }
9444
9445 if (sd->flags & SD_BALANCE_NEWIDLE) {
9446 t0 = sched_clock_cpu(this_cpu);
9447
9448 pulled_task = load_balance(this_cpu, this_rq,
9449 sd, CPU_NEWLY_IDLE,
9450 &continue_balancing);
9451
9452 domain_cost = sched_clock_cpu(this_cpu) - t0;
9453 if (domain_cost > sd->max_newidle_lb_cost)
9454 sd->max_newidle_lb_cost = domain_cost;
9455
9456 curr_cost += domain_cost;
9457 }
9458
9459 update_next_balance(sd, 0, &next_balance);
9460
9461 /*
9462 * Stop searching for tasks to pull if there are
9463 * now runnable tasks on this rq.
9464 */
9465 if (pulled_task || this_rq->nr_running > 0)
9466 break;
9467 }
9468 rcu_read_unlock();
9469
9470 raw_spin_lock(&this_rq->lock);
9471
9472 if (curr_cost > this_rq->max_idle_balance_cost)
9473 this_rq->max_idle_balance_cost = curr_cost;
9474
9475 /*
9476 * While browsing the domains, we released the rq lock, a task could
9477 * have been enqueued in the meantime. Since we're not going idle,
9478 * pretend we pulled a task.
9479 */
9480 if (this_rq->cfs.h_nr_running && !pulled_task)
9481 pulled_task = 1;
9482
9483 out:
9484 /* Move the next balance forward */
9485 if (time_after(this_rq->next_balance, next_balance))
9486 this_rq->next_balance = next_balance;
9487
9488 /* Is there a task of a high priority class? */
9489 if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9490 pulled_task = -1;
9491
9492 if (pulled_task) {
9493 idle_exit_fair(this_rq);
9494 this_rq->idle_stamp = 0;
9495 }
9496
9497 return pulled_task;
9498 }
9499
9500 /*
9501 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
9502 * running tasks off the busiest CPU onto idle CPUs. It requires at
9503 * least 1 task to be running on each physical CPU where possible, and
9504 * avoids physical / logical imbalances.
9505 */
9506 static int active_load_balance_cpu_stop(void *data)
9507 {
9508 struct rq *busiest_rq = data;
9509 int busiest_cpu = cpu_of(busiest_rq);
9510 int target_cpu = busiest_rq->push_cpu;
9511 struct rq *target_rq = cpu_rq(target_cpu);
9512 struct sched_domain *sd = NULL;
9513 struct task_struct *p = NULL;
9514 struct task_struct *push_task = NULL;
9515 int push_task_detached = 0;
9516 struct lb_env env = {
9517 .sd = sd,
9518 .dst_cpu = target_cpu,
9519 .dst_rq = target_rq,
9520 .src_cpu = busiest_rq->cpu,
9521 .src_rq = busiest_rq,
9522 .idle = CPU_IDLE,
9523 };
9524
9525 raw_spin_lock_irq(&busiest_rq->lock);
9526
9527 /* make sure the requested cpu hasn't gone down in the meantime */
9528 if (unlikely(busiest_cpu != smp_processor_id() ||
9529 !busiest_rq->active_balance))
9530 goto out_unlock;
9531
9532 /* Is there any task to move? */
9533 if (busiest_rq->nr_running <= 1)
9534 goto out_unlock;
9535
9536 /*
9537 * This condition is "impossible"; if it occurs
9538 * we need to fix it. Originally reported by
9539 * Bjorn Helgaas on a 128-cpu setup.
9540 */
9541 BUG_ON(busiest_rq == target_rq);
9542
9543 push_task = busiest_rq->push_task;
9544 if (push_task) {
9545 if (task_on_rq_queued(push_task) &&
9546 task_cpu(push_task) == busiest_cpu &&
9547 cpu_online(target_cpu)) {
9548 detach_task(push_task, &env);
9549 push_task_detached = 1;
9550 }
9551 goto out_unlock;
9552 }
9553
9554 /* Search for an sd spanning us and the target CPU. */
9555 rcu_read_lock();
9556 for_each_domain(target_cpu, sd) {
9557 if ((sd->flags & SD_LOAD_BALANCE) &&
9558 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9559 break;
9560 }
9561
9562 if (likely(sd)) {
9563 env.sd = sd;
9564 schedstat_inc(sd, alb_count);
9565 update_rq_clock(busiest_rq);
9566
9567 p = detach_one_task(&env);
9568 if (p)
9569 schedstat_inc(sd, alb_pushed);
9570 else
9571 schedstat_inc(sd, alb_failed);
9572 }
9573 rcu_read_unlock();
9574 out_unlock:
9575 busiest_rq->active_balance = 0;
9576
9577 if (push_task)
9578 busiest_rq->push_task = NULL;
9579
9580 raw_spin_unlock(&busiest_rq->lock);
9581
9582 if (push_task) {
9583 if (push_task_detached)
9584 attach_one_task(target_rq, push_task);
9585 put_task_struct(push_task);
9586 }
9587
9588 if (p)
9589 attach_one_task(target_rq, p);
9590
9591 local_irq_enable();
9592
9593 return 0;
9594 }
9595
9596 static inline int on_null_domain(struct rq *rq)
9597 {
9598 return unlikely(!rcu_dereference_sched(rq->sd));
9599 }
9600
9601 #ifdef CONFIG_NO_HZ_COMMON
9602 /*
9603 * idle load balancing details
9604 * - When one of the busy CPUs notices that an idle rebalance may be
9605 * needed, it kicks the idle load balancer, which then does idle
9606 * load balancing for all the idle CPUs.
9607 */
9608 static inline int find_new_ilb(void)
9609 {
9610 int ilb = cpumask_first(nohz.idle_cpus_mask);
9611
9612 if (ilb < nr_cpu_ids && idle_cpu(ilb))
9613 return ilb;
9614
9615 return nr_cpu_ids;
9616 }
9617
9618 /*
9619 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9620 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
9621 * CPU (if there is one).
9622 */
9623 static void nohz_balancer_kick(void)
9624 {
9625 int ilb_cpu;
9626
9627 nohz.next_balance++;
9628
9629 ilb_cpu = find_new_ilb();
9630
9631 if (ilb_cpu >= nr_cpu_ids)
9632 return;
9633
9634 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9635 return;
9636 /*
9637 * Use smp_send_reschedule() instead of resched_cpu().
9638 * This way we generate a sched IPI on the target cpu which
9639 * is idle. And the softirq performing nohz idle load balance
9640 * will be run before returning from the IPI.
9641 */
9642 smp_send_reschedule(ilb_cpu);
9643 return;
9644 }
9645
9646 static inline void nohz_balance_exit_idle(int cpu)
9647 {
9648 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
9649 /*
9650 * Completely isolated CPUs never set this bit, so we must test it.
9651 */
9652 if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9653 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9654 atomic_dec(&nohz.nr_cpus);
9655 }
9656 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9657 }
9658 }
9659
9660 static inline void set_cpu_sd_state_busy(void)
9661 {
9662 struct sched_domain *sd;
9663 int cpu = smp_processor_id();
9664
9665 rcu_read_lock();
9666 sd = rcu_dereference(per_cpu(sd_busy, cpu));
9667
9668 if (!sd || !sd->nohz_idle)
9669 goto unlock;
9670 sd->nohz_idle = 0;
9671
9672 atomic_inc(&sd->groups->sgc->nr_busy_cpus);
9673 unlock:
9674 rcu_read_unlock();
9675 }
9676
9677 void set_cpu_sd_state_idle(void)
9678 {
9679 struct sched_domain *sd;
9680 int cpu = smp_processor_id();
9681
9682 rcu_read_lock();
9683 sd = rcu_dereference(per_cpu(sd_busy, cpu));
9684
9685 if (!sd || sd->nohz_idle)
9686 goto unlock;
9687 sd->nohz_idle = 1;
9688
9689 atomic_dec(&sd->groups->sgc->nr_busy_cpus);
9690 unlock:
9691 rcu_read_unlock();
9692 }
9693
9694 /*
9695 * This routine will record that the cpu is going idle with tick stopped.
9696 * This info will be used in performing idle load balancing in the future.
9697 */
9698 void nohz_balance_enter_idle(int cpu)
9699 {
9700 /*
9701 * If this cpu is going down, then nothing needs to be done.
9702 */
9703 if (!cpu_active(cpu))
9704 return;
9705
9706 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9707 return;
9708
9709 /*
9710 * If we're a completely isolated CPU, we don't play.
9711 */
9712 if (on_null_domain(cpu_rq(cpu)))
9713 return;
9714
9715 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9716 atomic_inc(&nohz.nr_cpus);
9717 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9718 }
9719
9720 static int sched_ilb_notifier(struct notifier_block *nfb,
9721 unsigned long action, void *hcpu)
9722 {
9723 switch (action & ~CPU_TASKS_FROZEN) {
9724 case CPU_DYING:
9725 nohz_balance_exit_idle(smp_processor_id());
9726 return NOTIFY_OK;
9727 default:
9728 return NOTIFY_DONE;
9729 }
9730 }
9731 #endif
9732
9733 static DEFINE_SPINLOCK(balancing);
9734
9735 /*
9736 * Scale the max load_balance interval with the number of CPUs in the system.
9737 * This trades load-balance latency on larger machines for less cross talk.
9738 */
9739 void update_max_interval(void)
9740 {
9741 max_load_balance_interval = HZ*num_online_cpus()/10;
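/*
 * e.g. with a hypothetical HZ = 250 and 8 CPUs online this allows balance
 * intervals of up to 200 jiffies, i.e. about 800 ms.
 */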
9742 }
9743
9744 /*
9745 * It checks each scheduling domain to see if it is due to be balanced,
9746 * and initiates a balancing operation if so.
9747 *
9748 * Balancing parameters are set up in init_sched_domains.
9749 */
9750 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9751 {
9752 int continue_balancing = 1;
9753 int cpu = rq->cpu;
9754 unsigned long interval;
9755 struct sched_domain *sd;
9756 /* Earliest time when we have to do rebalance again */
9757 unsigned long next_balance = jiffies + 60*HZ;
9758 int update_next_balance = 0;
9759 int need_serialize, need_decay = 0;
9760 u64 max_cost = 0;
9761
9762 update_blocked_averages(cpu);
9763
9764 rcu_read_lock();
9765 for_each_domain(cpu, sd) {
9766 /*
9767 * Decay the newidle max times here because this is a regular
9768 * visit to all the domains. Decay ~1% per second.
9769 */
9770 if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9771 sd->max_newidle_lb_cost =
9772 (sd->max_newidle_lb_cost * 253) / 256;
9773 sd->next_decay_max_lb_cost = jiffies + HZ;
9774 need_decay = 1;
9775 }
9776 max_cost += sd->max_newidle_lb_cost;
9777
9778 if (!(sd->flags & SD_LOAD_BALANCE))
9779 continue;
9780
9781 /*
9782 * Stop the load balance at this level. There is another
9783 * CPU in our sched group which is doing load balancing more
9784 * actively.
9785 */
9786 if (!continue_balancing) {
9787 if (need_decay)
9788 continue;
9789 break;
9790 }
9791
9792 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9793
9794 need_serialize = sd->flags & SD_SERIALIZE;
9795 if (need_serialize) {
9796 if (!spin_trylock(&balancing))
9797 goto out;
9798 }
9799
9800 if (time_after_eq(jiffies, sd->last_balance + interval)) {
9801 if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9802 /*
9803 * The LBF_DST_PINNED logic could have changed
9804 * env->dst_cpu, so we can't know our idle
9805 * state even if we migrated tasks. Update it.
9806 */
9807 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9808 }
9809 sd->last_balance = jiffies;
9810 interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9811 }
9812 if (need_serialize)
9813 spin_unlock(&balancing);
9814 out:
9815 if (time_after(next_balance, sd->last_balance + interval)) {
9816 next_balance = sd->last_balance + interval;
9817 update_next_balance = 1;
9818 }
9819 }
9820 if (need_decay) {
9821 /*
9822 * Ensure the rq-wide value also decays but keep it at a
9823 * reasonable floor to avoid funnies with rq->avg_idle.
9824 */
9825 rq->max_idle_balance_cost =
9826 max((u64)sysctl_sched_migration_cost, max_cost);
9827 }
9828 rcu_read_unlock();
9829
9830 /*
9831 * next_balance will be updated only when there is a need.
9832 * When the cpu is attached to a null domain, for example, it will not be
9833 * updated.
9834 */
9835 if (likely(update_next_balance)) {
9836 rq->next_balance = next_balance;
9837
9838 #ifdef CONFIG_NO_HZ_COMMON
9839 /*
9840 * If this CPU has been elected to perform the nohz idle
9841 * balance. Other idle CPUs have already rebalanced with
9842 * nohz_idle_balance() and nohz.next_balance has been
9843 * updated accordingly. This CPU is now running the idle load
9844 * balance for itself and we need to update the
9845 * nohz.next_balance accordingly.
9846 */
9847 if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9848 nohz.next_balance = rq->next_balance;
9849 #endif
9850 }
9851 }
9852
9853 #ifdef CONFIG_NO_HZ_COMMON
9854 /*
9855 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9856 * rebalancing for all the cpus for whom scheduler ticks are stopped.
9857 */
9858 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9859 {
9860 int this_cpu = this_rq->cpu;
9861 struct rq *rq;
9862 int balance_cpu;
9863 /* Earliest time when we have to do rebalance again */
9864 unsigned long next_balance = jiffies + 60*HZ;
9865 int update_next_balance = 0;
9866
9867 if (idle != CPU_IDLE ||
9868 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
9869 goto end;
9870
9871 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9872 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9873 continue;
9874
9875 /*
9876 * If this cpu gets work to do, stop the load balancing
9877 * work being done for other cpus. Next load
9878 * balancing owner will pick it up.
9879 */
9880 if (need_resched())
9881 break;
9882
9883 rq = cpu_rq(balance_cpu);
9884
9885 /*
9886 * If time for next balance is due,
9887 * do the balance.
9888 */
9889 if (time_after_eq(jiffies, rq->next_balance)) {
9890 raw_spin_lock_irq(&rq->lock);
9891 update_rq_clock(rq);
9892 update_idle_cpu_load(rq);
9893 raw_spin_unlock_irq(&rq->lock);
9894 rebalance_domains(rq, CPU_IDLE);
9895 }
9896
9897 if (time_after(next_balance, rq->next_balance)) {
9898 next_balance = rq->next_balance;
9899 update_next_balance = 1;
9900 }
9901 }
9902
9903 /*
9904 * next_balance will be updated only when there is a need.
9905 * When the CPU is attached to a null domain, for example, it will not be
9906 * updated.
9907 */
9908 if (likely(update_next_balance))
9909 nohz.next_balance = next_balance;
9910 end:
9911 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9912 }
9913
9914 /*
9915 * Current heuristic for kicking the idle load balancer in the presence
9916 * of an idle cpu in the system.
9917 * - This rq has more than one task.
9918 * - This rq has at least one CFS task and the capacity of the CPU is
9919 * significantly reduced because of RT tasks or IRQs.
9920 * - At parent of LLC scheduler domain level, this cpu's scheduler group has
9921 * multiple busy cpus.
9922 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9923 * domain span are idle.
9924 */
9925 static inline bool nohz_kick_needed(struct rq *rq)
9926 {
9927 unsigned long now = jiffies;
9928 struct sched_domain *sd;
9929 struct sched_group_capacity *sgc;
9930 int nr_busy, cpu = rq->cpu;
9931 bool kick = false;
9932
9933 if (unlikely(rq->idle_balance))
9934 return false;
9935
9936 /*
9937 * We may be recently in ticked or tickless idle mode. At the first
9938 * busy tick after returning from idle, we will update the busy stats.
9939 */
9940 set_cpu_sd_state_busy();
9941 nohz_balance_exit_idle(cpu);
9942
9943 /*
9944 * None are in tickless mode and hence no need for NOHZ idle load
9945 * balancing.
9946 */
9947 if (likely(!atomic_read(&nohz.nr_cpus)))
9948 return false;
9949
9950 if (time_before(now, nohz.next_balance))
9951 return false;
9952
9953 if (rq->nr_running >= 2 &&
9954 (!energy_aware() || cpu_overutilized(cpu)))
9955 return true;
9956
9957 /* Do idle load balance if there is a misfit task */
9958 if (energy_aware())
9959 return rq->misfit_task;
9960
9961 rcu_read_lock();
9962 sd = rcu_dereference(per_cpu(sd_busy, cpu));
9963 if (sd) {
9964 sgc = sd->groups->sgc;
9965 nr_busy = atomic_read(&sgc->nr_busy_cpus);
9966
9967 if (nr_busy > 1) {
9968 kick = true;
9969 goto unlock;
9970 }
9971
9972 }
9973
9974 sd = rcu_dereference(rq->sd);
9975 if (sd) {
9976 if ((rq->cfs.h_nr_running >= 1) &&
9977 check_cpu_capacity(rq, sd)) {
9978 kick = true;
9979 goto unlock;
9980 }
9981 }
9982
9983 sd = rcu_dereference(per_cpu(sd_asym, cpu));
9984 if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
9985 sched_domain_span(sd)) < cpu)) {
9986 kick = true;
9987 goto unlock;
9988 }
9989
9990 unlock:
9991 rcu_read_unlock();
9992 return kick;
9993 }
9994 #else
9995 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9996 #endif
9997
9998 /*
9999 * run_rebalance_domains is triggered when needed from the scheduler tick.
10000 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
10001 */
10002 static void run_rebalance_domains(struct softirq_action *h)
10003 {
10004 struct rq *this_rq = this_rq();
10005 enum cpu_idle_type idle = this_rq->idle_balance ?
10006 CPU_IDLE : CPU_NOT_IDLE;
10007
10008 /*
10009 * If this cpu has a pending nohz_balance_kick, then do the
10010 * balancing on behalf of the other idle cpus whose ticks are
10011 * stopped. Do nohz_idle_balance *before* rebalance_domains to
10012 * give the idle cpus a chance to load balance. Else we may
10013 * load balance only within the local sched_domain hierarchy
10014 * and abort nohz_idle_balance altogether if we pull some load.
10015 */
10016 nohz_idle_balance(this_rq, idle);
10017 rebalance_domains(this_rq, idle);
10018 }
10019
10020 /*
10021 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
10022 */
10023 void trigger_load_balance(struct rq *rq)
10024 {
10025 /* Don't need to rebalance while attached to NULL domain */
10026 if (unlikely(on_null_domain(rq)))
10027 return;
10028
10029 if (time_after_eq(jiffies, rq->next_balance))
10030 raise_softirq(SCHED_SOFTIRQ);
10031 #ifdef CONFIG_NO_HZ_COMMON
10032 if (nohz_kick_needed(rq))
10033 nohz_balancer_kick();
10034 #endif
10035 }
10036
10037 static void rq_online_fair(struct rq *rq)
10038 {
10039 update_sysctl();
10040
10041 update_runtime_enabled(rq);
10042 }
10043
10044 static void rq_offline_fair(struct rq *rq)
10045 {
10046 update_sysctl();
10047
10048 /* Ensure any throttled groups are reachable by pick_next_task */
10049 unthrottle_offline_cfs_rqs(rq);
10050 }
10051
10052 static inline int
10053 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
10054 {
10055 int rc = 0;
10056
10057 /* Invoke active balance to force migrate currently running task */
10058 raw_spin_lock(&rq->lock);
10059 if (!rq->active_balance) {
10060 rq->active_balance = 1;
10061 rq->push_cpu = new_cpu;
10062 get_task_struct(p);
10063 rq->push_task = p;
10064 rc = 1;
10065 }
10066 raw_spin_unlock(&rq->lock);
10067
10068 return rc;
10069 }
10070
10071 void check_for_migration(struct rq *rq, struct task_struct *p)
10072 {
10073 int new_cpu;
10074 int active_balance;
10075 int cpu = task_cpu(p);
10076
10077 if (energy_aware() && rq->misfit_task) {
10078 if (rq->curr->state != TASK_RUNNING ||
10079 rq->curr->nr_cpus_allowed == 1)
10080 return;
10081
10082 new_cpu = select_energy_cpu_brute(p, cpu, 0);
10083 if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
10084 active_balance = kick_active_balance(rq, p, new_cpu);
10085 if (active_balance)
10086 stop_one_cpu_nowait(cpu,
10087 active_load_balance_cpu_stop,
10088 rq, &rq->active_balance_work);
10089 }
10090 }
10091 }
10092
10093 #endif /* CONFIG_SMP */
10094
10095 /*
10096 * scheduler tick hitting a task of our scheduling class:
10097 */
10098 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10099 {
10100 struct cfs_rq *cfs_rq;
10101 struct sched_entity *se = &curr->se;
10102
10103 for_each_sched_entity(se) {
10104 cfs_rq = cfs_rq_of(se);
10105 entity_tick(cfs_rq, se, queued);
10106 }
10107
10108 if (static_branch_unlikely(&sched_numa_balancing))
10109 task_tick_numa(rq, curr);
10110
10111 #ifdef CONFIG_SMP
10112 if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
10113 rq->rd->overutilized = true;
10114 trace_sched_overutilized(true);
10115 }
10116
10117 rq->misfit_task = !task_fits_max(curr, rq->cpu);
10118 #endif
10119
10120 }
10121
10122 /*
10123 * called on fork with the child task as argument from the parent's context
10124 * - child not yet on the tasklist
10125 * - preemption disabled
10126 */
10127 static void task_fork_fair(struct task_struct *p)
10128 {
10129 struct cfs_rq *cfs_rq;
10130 struct sched_entity *se = &p->se, *curr;
10131 struct rq *rq = this_rq();
10132
10133 raw_spin_lock(&rq->lock);
10134 update_rq_clock(rq);
10135
10136 cfs_rq = task_cfs_rq(current);
10137 curr = cfs_rq->curr;
10138 if (curr) {
10139 update_curr(cfs_rq);
10140 se->vruntime = curr->vruntime;
10141 }
10142 place_entity(cfs_rq, se, 1);
10143
10144 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10145 /*
10146 * Upon rescheduling, sched_class::put_prev_task() will place
10147 * 'current' within the tree based on its new key value.
10148 */
10149 swap(curr->vruntime, se->vruntime);
10150 resched_curr(rq);
10151 }
10152
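/*
 * Informal note: the child may be woken on a different CPU by
 * wake_up_new_task(), so its vruntime is stored relative to this cfs_rq's
 * min_vruntime here; the enqueue on the destination CPU adds that CPU's
 * min_vruntime back (see place_entity()/enqueue_entity()).
 */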
10153 se->vruntime -= cfs_rq->min_vruntime;
10154 raw_spin_unlock(&rq->lock);
10155 }
10156
10157 /*
10158 * Priority of the task has changed. Check to see if we preempt
10159 * the current task.
10160 */
10161 static void
10162 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10163 {
10164 if (!task_on_rq_queued(p))
10165 return;
10166
10167 /*
10168 * Reschedule if we are currently running on this runqueue and
10169 * our priority decreased, or if we are not currently running on
10170 * this runqueue and our priority is higher than the current's
10171 */
10172 if (rq->curr == p) {
10173 if (p->prio > oldprio)
10174 resched_curr(rq);
10175 } else
10176 check_preempt_curr(rq, p, 0);
10177 }
10178
10179 static inline bool vruntime_normalized(struct task_struct *p)
10180 {
10181 struct sched_entity *se = &p->se;
10182
10183 /*
10184 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10185 * the dequeue_entity(.flags=0) will already have normalized the
10186 * vruntime.
10187 */
10188 if (p->on_rq)
10189 return true;
10190
10191 /*
10192 * When !on_rq, vruntime of the task has usually NOT been normalized.
10193 * But there are some cases where it has already been normalized:
10194 *
10195 * - A forked child which is waiting for being woken up by
10196 * wake_up_new_task().
10197 * - A task which has been woken up by try_to_wake_up() and
10198 * waiting for actually being woken up by sched_ttwu_pending().
10199 */
10200 if (!se->sum_exec_runtime || p->state == TASK_WAKING)
10201 return true;
10202
10203 return false;
10204 }
10205
10206 #ifdef CONFIG_FAIR_GROUP_SCHED
10207 /*
10208 * Propagate the changes of the sched_entity across the tg tree to make them
10209 * visible to the root
10210 */
10211 static void propagate_entity_cfs_rq(struct sched_entity *se)
10212 {
10213 struct cfs_rq *cfs_rq;
10214
10215 /* Start to propagate at parent */
10216 se = se->parent;
10217
10218 for_each_sched_entity(se) {
10219 cfs_rq = cfs_rq_of(se);
10220
10221 if (cfs_rq_throttled(cfs_rq))
10222 break;
10223
10224 update_load_avg(se, UPDATE_TG);
10225 }
10226 }
10227 #else
10228 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10229 #endif
10230
10231 static void detach_entity_cfs_rq(struct sched_entity *se)
10232 {
10233 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10234
10235 /* Catch up with the cfs_rq and remove our load when we leave */
10236 update_load_avg(se, 0);
10237 detach_entity_load_avg(cfs_rq, se);
10238 update_tg_load_avg(cfs_rq, false);
10239 propagate_entity_cfs_rq(se);
10240 }
10241
10242 static void attach_entity_cfs_rq(struct sched_entity *se)
10243 {
10244 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10245
10246 #ifdef CONFIG_FAIR_GROUP_SCHED
10247 /*
10248 * Since the real-depth could have been changed (only the FAIR
10249 * class maintains the depth value), reset depth properly.
10250 */
10251 se->depth = se->parent ? se->parent->depth + 1 : 0;
10252 #endif
10253
10254 /* Synchronize entity with its cfs_rq */
10255 update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10256 attach_entity_load_avg(cfs_rq, se);
10257 update_tg_load_avg(cfs_rq, false);
10258 propagate_entity_cfs_rq(se);
10259 }
10260
10261 static void detach_task_cfs_rq(struct task_struct *p)
10262 {
10263 struct sched_entity *se = &p->se;
10264 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10265
10266 if (!vruntime_normalized(p)) {
10267 /*
10268 * Fix up our vruntime so that the current sleep doesn't
10269 * cause 'unlimited' sleep bonus.
10270 */
10271 place_entity(cfs_rq, se, 0);
10272 se->vruntime -= cfs_rq->min_vruntime;
10273 }
10274
10275 detach_entity_cfs_rq(se);
10276 }
10277
10278 static void attach_task_cfs_rq(struct task_struct *p)
10279 {
10280 struct sched_entity *se = &p->se;
10281 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10282
10283 attach_entity_cfs_rq(se);
10284
10285 if (!vruntime_normalized(p))
10286 se->vruntime += cfs_rq->min_vruntime;
10287 }
10288
10289 static void switched_from_fair(struct rq *rq, struct task_struct *p)
10290 {
10291 detach_task_cfs_rq(p);
10292 }
10293
10294 static void switched_to_fair(struct rq *rq, struct task_struct *p)
10295 {
10296 attach_task_cfs_rq(p);
10297
10298 if (task_on_rq_queued(p)) {
10299 /*
10300 * We were most likely switched from sched_rt, so
10301 * kick off the schedule if running, otherwise just see
10302 * if we can still preempt the current task.
10303 */
10304 if (rq->curr == p)
10305 resched_curr(rq);
10306 else
10307 check_preempt_curr(rq, p, 0);
10308 }
10309 }
10310
10311 /* Account for a task changing its policy or group.
10312 *
10313 * This routine is mostly called to set cfs_rq->curr field when a task
10314 * migrates between groups/classes.
10315 */
10316 static void set_curr_task_fair(struct rq *rq)
10317 {
10318 struct sched_entity *se = &rq->curr->se;
10319
10320 for_each_sched_entity(se) {
10321 struct cfs_rq *cfs_rq = cfs_rq_of(se);
10322
10323 set_next_entity(cfs_rq, se);
10324 /* ensure bandwidth has been allocated on our new cfs_rq */
10325 account_cfs_rq_runtime(cfs_rq, 0);
10326 }
10327 }
10328
10329 void init_cfs_rq(struct cfs_rq *cfs_rq)
10330 {
10331 cfs_rq->tasks_timeline = RB_ROOT;
10332 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
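/*
 * Side note: this starts min_vruntime roughly 1 ms of vruntime below the
 * u64 wrap point, a common interpretation being that it makes wraparound
 * bugs in the vruntime comparisons surface early rather than only after
 * very long uptimes.
 */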
10333 #ifndef CONFIG_64BIT
10334 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10335 #endif
10336 #ifdef CONFIG_SMP
10337 #ifdef CONFIG_FAIR_GROUP_SCHED
10338 cfs_rq->propagate_avg = 0;
10339 #endif
10340 atomic_long_set(&cfs_rq->removed_load_avg, 0);
10341 atomic_long_set(&cfs_rq->removed_util_avg, 0);
10342 #endif
10343 }
10344
10345 #ifdef CONFIG_FAIR_GROUP_SCHED
10346 static void task_set_group_fair(struct task_struct *p)
10347 {
10348 struct sched_entity *se = &p->se;
10349
10350 set_task_rq(p, task_cpu(p));
10351 se->depth = se->parent ? se->parent->depth + 1 : 0;
10352 }
10353
10354 static void task_move_group_fair(struct task_struct *p)
10355 {
10356 detach_task_cfs_rq(p);
10357 set_task_rq(p, task_cpu(p));
10358
10359 #ifdef CONFIG_SMP
10360 /* Tell se's cfs_rq has been changed -- migrated */
10361 p->se.avg.last_update_time = 0;
10362 #endif
10363 attach_task_cfs_rq(p);
10364 }
10365
10366 static void task_change_group_fair(struct task_struct *p, int type)
10367 {
10368 switch (type) {
10369 case TASK_SET_GROUP:
10370 task_set_group_fair(p);
10371 break;
10372
10373 case TASK_MOVE_GROUP:
10374 task_move_group_fair(p);
10375 break;
10376 }
10377 }
10378
10379 void free_fair_sched_group(struct task_group *tg)
10380 {
10381 int i;
10382
10383 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10384
10385 for_each_possible_cpu(i) {
10386 if (tg->cfs_rq)
10387 kfree(tg->cfs_rq[i]);
10388 if (tg->se)
10389 kfree(tg->se[i]);
10390 }
10391
10392 kfree(tg->cfs_rq);
10393 kfree(tg->se);
10394 }
10395
10396 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10397 {
10398 struct sched_entity *se;
10399 struct cfs_rq *cfs_rq;
10400 struct rq *rq;
10401 int i;
10402
10403 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
10404 if (!tg->cfs_rq)
10405 goto err;
10406 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
10407 if (!tg->se)
10408 goto err;
10409
10410 tg->shares = NICE_0_LOAD;
10411
10412 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10413
10414 for_each_possible_cpu(i) {
10415 rq = cpu_rq(i);
10416
10417 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10418 GFP_KERNEL, cpu_to_node(i));
10419 if (!cfs_rq)
10420 goto err;
10421
10422 se = kzalloc_node(sizeof(struct sched_entity),
10423 GFP_KERNEL, cpu_to_node(i));
10424 if (!se)
10425 goto err_free_rq;
10426
10427 init_cfs_rq(cfs_rq);
10428 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10429 init_entity_runnable_average(se);
10430
10431 raw_spin_lock_irq(&rq->lock);
10432 post_init_entity_util_avg(se);
10433 raw_spin_unlock_irq(&rq->lock);
10434 }
10435
10436 return 1;
10437
10438 err_free_rq:
10439 kfree(cfs_rq);
10440 err:
10441 return 0;
10442 }
10443
10444 void unregister_fair_sched_group(struct task_group *tg)
10445 {
10446 unsigned long flags;
10447 struct rq *rq;
10448 int cpu;
10449
10450 for_each_possible_cpu(cpu) {
10451 if (tg->se[cpu])
10452 remove_entity_load_avg(tg->se[cpu]);
10453
10454 /*
10455 * Only empty task groups can be destroyed, so we can speculatively
10456 * check on_list without danger of it being re-added.
10457 */
10458 if (!tg->cfs_rq[cpu]->on_list)
10459 continue;
10460
10461 rq = cpu_rq(cpu);
10462
10463 raw_spin_lock_irqsave(&rq->lock, flags);
10464 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10465 raw_spin_unlock_irqrestore(&rq->lock, flags);
10466 }
10467 }
10468
10469 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10470 struct sched_entity *se, int cpu,
10471 struct sched_entity *parent)
10472 {
10473 struct rq *rq = cpu_rq(cpu);
10474
10475 cfs_rq->tg = tg;
10476 cfs_rq->rq = rq;
10477 init_cfs_rq_runtime(cfs_rq);
10478
10479 tg->cfs_rq[cpu] = cfs_rq;
10480 tg->se[cpu] = se;
10481
10482 /* se could be NULL for root_task_group */
10483 if (!se)
10484 return;
10485
10486 if (!parent) {
10487 se->cfs_rq = &rq->cfs;
10488 se->depth = 0;
10489 } else {
10490 se->cfs_rq = parent->my_q;
10491 se->depth = parent->depth + 1;
10492 }
10493
10494 se->my_q = cfs_rq;
10495 /* guarantee group entities always have weight */
10496 update_load_set(&se->load, NICE_0_LOAD);
10497 se->parent = parent;
10498 }
10499
10500 static DEFINE_MUTEX(shares_mutex);
10501
10502 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10503 {
10504 int i;
10505 unsigned long flags;
10506
10507 /*
10508 * We can't change the weight of the root cgroup.
10509 */
10510 if (!tg->se[0])
10511 return -EINVAL;
10512
10513 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10514
10515 mutex_lock(&shares_mutex);
10516 if (tg->shares == shares)
10517 goto done;
10518
10519 tg->shares = shares;
10520 for_each_possible_cpu(i) {
10521 struct rq *rq = cpu_rq(i);
10522 struct sched_entity *se;
10523
10524 se = tg->se[i];
10525 /* Propagate contribution to hierarchy */
10526 raw_spin_lock_irqsave(&rq->lock, flags);
10527
10528 /* Possible calls to update_curr() need rq clock */
10529 update_rq_clock(rq);
10530 for_each_sched_entity(se) {
10531 update_load_avg(se, UPDATE_TG);
10532 update_cfs_shares(se);
10533 }
10534 raw_spin_unlock_irqrestore(&rq->lock, flags);
10535 }
10536
10537 done:
10538 mutex_unlock(&shares_mutex);
10539 return 0;
10540 }
10541 #else /* CONFIG_FAIR_GROUP_SCHED */
10542
10543 void free_fair_sched_group(struct task_group *tg) { }
10544
10545 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10546 {
10547 return 1;
10548 }
10549
10550 void unregister_fair_sched_group(struct task_group *tg) { }
10551
10552 #endif /* CONFIG_FAIR_GROUP_SCHED */
10553
10554
10555 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10556 {
10557 struct sched_entity *se = &task->se;
10558 unsigned int rr_interval = 0;
10559
10560 /*
10561 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10562 * idle runqueue:
10563 */
10564 if (rq->cfs.load.weight)
10565 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10566
10567 return rr_interval;
10568 }
10569
10570 /*
10571 * All the scheduling class methods:
10572 */
10573 const struct sched_class fair_sched_class = {
10574 .next = &idle_sched_class,
10575 .enqueue_task = enqueue_task_fair,
10576 .dequeue_task = dequeue_task_fair,
10577 .yield_task = yield_task_fair,
10578 .yield_to_task = yield_to_task_fair,
10579
10580 .check_preempt_curr = check_preempt_wakeup,
10581
10582 .pick_next_task = pick_next_task_fair,
10583 .put_prev_task = put_prev_task_fair,
10584
10585 #ifdef CONFIG_SMP
10586 .select_task_rq = select_task_rq_fair,
10587 .migrate_task_rq = migrate_task_rq_fair,
10588
10589 .rq_online = rq_online_fair,
10590 .rq_offline = rq_offline_fair,
10591
10592 .task_waking = task_waking_fair,
10593 .task_dead = task_dead_fair,
10594 .set_cpus_allowed = set_cpus_allowed_common,
10595 #endif
10596
10597 .set_curr_task = set_curr_task_fair,
10598 .task_tick = task_tick_fair,
10599 .task_fork = task_fork_fair,
10600
10601 .prio_changed = prio_changed_fair,
10602 .switched_from = switched_from_fair,
10603 .switched_to = switched_to_fair,
10604
10605 .get_rr_interval = get_rr_interval_fair,
10606
10607 .update_curr = update_curr_fair,
10608
10609 #ifdef CONFIG_FAIR_GROUP_SCHED
10610 .task_change_group = task_change_group_fair,
10611 #endif
10612 };
10613
10614 #ifdef CONFIG_SCHED_DEBUG
10615 void print_cfs_stats(struct seq_file *m, int cpu)
10616 {
10617 struct cfs_rq *cfs_rq;
10618
10619 rcu_read_lock();
10620 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
10621 print_cfs_rq(m, cpu, cfs_rq);
10622 rcu_read_unlock();
10623 }
10624
10625 #ifdef CONFIG_NUMA_BALANCING
10626 void show_numa_stats(struct task_struct *p, struct seq_file *m)
10627 {
10628 int node;
10629 unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10630
10631 for_each_online_node(node) {
10632 if (p->numa_faults) {
10633 tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10634 tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10635 }
10636 if (p->numa_group) {
10637 gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10638 gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10639 }
10640 print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10641 }
10642 }
10643 #endif /* CONFIG_NUMA_BALANCING */
10644 #endif /* CONFIG_SCHED_DEBUG */
10645
10646 __init void init_sched_fair_class(void)
10647 {
10648 #ifdef CONFIG_SMP
10649 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10650
10651 #ifdef CONFIG_NO_HZ_COMMON
10652 nohz.next_balance = jiffies;
10653 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10654 cpu_notifier(sched_ilb_notifier, 0);
10655 #endif
10656 #endif /* SMP */
10657
10658 }
10659