• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3  *
4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5  *
6  *  Interactivity improvements by Mike Galbraith
7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
8  *
9  *  Various enhancements by Dmitry Adamushko.
10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11  *
12  *  Group scheduling enhancements by Srivatsa Vaddagiri
13  *  Copyright IBM Corporation, 2007
14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15  *
16  *  Scaled math optimizations by Thomas Gleixner
17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18  *
19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
21  */
22 
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/cpuidle.h>
27 #include <linux/slab.h>
28 #include <linux/profile.h>
29 #include <linux/interrupt.h>
30 #include <linux/mempolicy.h>
31 #include <linux/migrate.h>
32 #include <linux/task_work.h>
33 #include <linux/module.h>
34 
35 #include <trace/events/sched.h>
36 
37 #include "sched.h"
38 #include "tune.h"
39 #include "walt.h"
40 
41 /*
42  * Targeted preemption latency for CPU-bound tasks:
43  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
44  *
45  * NOTE: this latency value is not the same as the concept of
46  * 'timeslice length' - timeslices in CFS are of variable length
47  * and have no persistent notion like in traditional, time-slice
48  * based scheduling concepts.
49  *
50  * (to see the precise effective timeslice length of your workload,
51  *  run vmstat and monitor the context-switches (cs) field)
52  */
53 unsigned int sysctl_sched_latency = 6000000ULL;
54 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
55 
56 unsigned int sysctl_sched_sync_hint_enable = 1;
57 unsigned int sysctl_sched_cstate_aware = 1;
58 
59 #ifdef CONFIG_SCHED_WALT
60 unsigned int sysctl_sched_use_walt_cpu_util = 1;
61 unsigned int sysctl_sched_use_walt_task_util = 1;
62 __read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
63     (10 * NSEC_PER_MSEC);
64 #endif
65 /*
66  * The initial- and re-scaling of tunables is configurable
67  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
68  *
69  * Options are:
70  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
71  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
72  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
73  */
74 enum sched_tunable_scaling sysctl_sched_tunable_scaling
75 	= SCHED_TUNABLESCALING_LOG;
76 
77 /*
78  * Minimal preemption granularity for CPU-bound tasks:
79  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
80  */
81 unsigned int sysctl_sched_min_granularity = 750000ULL;
82 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
83 
84 /*
85  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
86  */
87 static unsigned int sched_nr_latency = 8;
88 
89 /*
90  * After fork, child runs first. If set to 0 (default) then
91  * parent will (try to) run first.
92  */
93 unsigned int sysctl_sched_child_runs_first __read_mostly;
94 
95 /*
96  * SCHED_OTHER wake-up granularity.
97  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
98  *
99  * This option delays the preemption effects of decoupled workloads
100  * and reduces their over-scheduling. Synchronous workloads will still
101  * have immediate wakeup/sleep latencies.
102  */
103 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
104 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
105 
106 const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
107 
108 /*
109  * The exponential sliding  window over which load is averaged for shares
110  * distribution.
111  * (default: 10msec)
112  */
113 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
114 
115 #ifdef CONFIG_CFS_BANDWIDTH
116 /*
117  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
118  * each time a cfs_rq requests quota.
119  *
120  * Note: in the case that the slice exceeds the runtime remaining (either due
121  * to consumption or the quota being specified to be smaller than the slice)
122  * we will always only issue the remaining available time.
123  *
124  * default: 5 msec, units: microseconds
125   */
126 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
127 #endif
128 
129 /*
130  * The margin used when comparing utilization with CPU capacity:
131  * util * margin < capacity * 1024
132  */
133 unsigned int capacity_margin = 1280; /* ~20% */
134 
update_load_add(struct load_weight * lw,unsigned long inc)135 static inline void update_load_add(struct load_weight *lw, unsigned long inc)
136 {
137 	lw->weight += inc;
138 	lw->inv_weight = 0;
139 }
140 
update_load_sub(struct load_weight * lw,unsigned long dec)141 static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
142 {
143 	lw->weight -= dec;
144 	lw->inv_weight = 0;
145 }
146 
update_load_set(struct load_weight * lw,unsigned long w)147 static inline void update_load_set(struct load_weight *lw, unsigned long w)
148 {
149 	lw->weight = w;
150 	lw->inv_weight = 0;
151 }
152 
153 /*
154  * Increase the granularity value when there are more CPUs,
155  * because with more CPUs the 'effective latency' as visible
156  * to users decreases. But the relationship is not linear,
157  * so pick a second-best guess by going with the log2 of the
158  * number of CPUs.
159  *
160  * This idea comes from the SD scheduler of Con Kolivas:
161  */
get_update_sysctl_factor(void)162 static unsigned int get_update_sysctl_factor(void)
163 {
164 	unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8);
165 	unsigned int factor;
166 
167 	switch (sysctl_sched_tunable_scaling) {
168 	case SCHED_TUNABLESCALING_NONE:
169 		factor = 1;
170 		break;
171 	case SCHED_TUNABLESCALING_LINEAR:
172 		factor = cpus;
173 		break;
174 	case SCHED_TUNABLESCALING_LOG:
175 	default:
176 		factor = 1 + ilog2(cpus);
177 		break;
178 	}
179 
180 	return factor;
181 }
182 
update_sysctl(void)183 static void update_sysctl(void)
184 {
185 	unsigned int factor = get_update_sysctl_factor();
186 
187 #define SET_SYSCTL(name) \
188 	(sysctl_##name = (factor) * normalized_sysctl_##name)
189 	SET_SYSCTL(sched_min_granularity);
190 	SET_SYSCTL(sched_latency);
191 	SET_SYSCTL(sched_wakeup_granularity);
192 #undef SET_SYSCTL
193 }
194 
sched_init_granularity(void)195 void sched_init_granularity(void)
196 {
197 	update_sysctl();
198 }
199 
200 #define WMULT_CONST	(~0U)
201 #define WMULT_SHIFT	32
202 
__update_inv_weight(struct load_weight * lw)203 static void __update_inv_weight(struct load_weight *lw)
204 {
205 	unsigned long w;
206 
207 	if (likely(lw->inv_weight))
208 		return;
209 
210 	w = scale_load_down(lw->weight);
211 
212 	if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
213 		lw->inv_weight = 1;
214 	else if (unlikely(!w))
215 		lw->inv_weight = WMULT_CONST;
216 	else
217 		lw->inv_weight = WMULT_CONST / w;
218 }
219 
220 /*
221  * delta_exec * weight / lw.weight
222  *   OR
223  * (delta_exec * (weight * lw->inv_weight)) >> WMULT_SHIFT
224  *
225  * Either weight := NICE_0_LOAD and lw \e prio_to_wmult[], in which case
226  * we're guaranteed shift stays positive because inv_weight is guaranteed to
227  * fit 32 bits, and NICE_0_LOAD gives another 10 bits; therefore shift >= 22.
228  *
229  * Or, weight =< lw.weight (because lw.weight is the runqueue weight), thus
230  * weight/lw.weight <= 1, and therefore our shift will also be positive.
231  */
__calc_delta(u64 delta_exec,unsigned long weight,struct load_weight * lw)232 static u64 __calc_delta(u64 delta_exec, unsigned long weight, struct load_weight *lw)
233 {
234 	u64 fact = scale_load_down(weight);
235 	int shift = WMULT_SHIFT;
236 
237 	__update_inv_weight(lw);
238 
239 	if (unlikely(fact >> 32)) {
240 		while (fact >> 32) {
241 			fact >>= 1;
242 			shift--;
243 		}
244 	}
245 
246 	/* hint to use a 32x32->64 mul */
247 	fact = (u64)(u32)fact * lw->inv_weight;
248 
249 	while (fact >> 32) {
250 		fact >>= 1;
251 		shift--;
252 	}
253 
254 	return mul_u64_u32_shr(delta_exec, fact, shift);
255 }
256 
257 
258 const struct sched_class fair_sched_class;
259 
260 /**************************************************************
261  * CFS operations on generic schedulable entities:
262  */
263 
264 #ifdef CONFIG_FAIR_GROUP_SCHED
265 
266 /* cpu runqueue to which this cfs_rq is attached */
rq_of(struct cfs_rq * cfs_rq)267 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
268 {
269 	return cfs_rq->rq;
270 }
271 
272 /* An entity is a task if it doesn't "own" a runqueue */
273 #define entity_is_task(se)	(!se->my_q)
274 
task_of(struct sched_entity * se)275 static inline struct task_struct *task_of(struct sched_entity *se)
276 {
277 #ifdef CONFIG_SCHED_DEBUG
278 	WARN_ON_ONCE(!entity_is_task(se));
279 #endif
280 	return container_of(se, struct task_struct, se);
281 }
282 
283 /* Walk up scheduling entities hierarchy */
284 #define for_each_sched_entity(se) \
285 		for (; se; se = se->parent)
286 
task_cfs_rq(struct task_struct * p)287 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
288 {
289 	return p->se.cfs_rq;
290 }
291 
292 /* runqueue on which this entity is (to be) queued */
cfs_rq_of(struct sched_entity * se)293 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
294 {
295 	return se->cfs_rq;
296 }
297 
298 /* runqueue "owned" by this group */
group_cfs_rq(struct sched_entity * grp)299 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
300 {
301 	return grp->my_q;
302 }
303 
list_add_leaf_cfs_rq(struct cfs_rq * cfs_rq)304 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
305 {
306 	if (!cfs_rq->on_list) {
307 		struct rq *rq = rq_of(cfs_rq);
308 		int cpu = cpu_of(rq);
309 		/*
310 		 * Ensure we either appear before our parent (if already
311 		 * enqueued) or force our parent to appear after us when it is
312 		 * enqueued. The fact that we always enqueue bottom-up
313 		 * reduces this to two cases and a special case for the root
314 		 * cfs_rq. Furthermore, it also means that we will always reset
315 		 * tmp_alone_branch either when the branch is connected
316 		 * to a tree or when we reach the beg of the tree
317 		 */
318 		if (cfs_rq->tg->parent &&
319 		    cfs_rq->tg->parent->cfs_rq[cpu]->on_list) {
320 			/*
321 			 * If parent is already on the list, we add the child
322 			 * just before. Thanks to circular linked property of
323 			 * the list, this means to put the child at the tail
324 			 * of the list that starts by parent.
325 			 */
326 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
327 				&(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list));
328 			/*
329 			 * The branch is now connected to its tree so we can
330 			 * reset tmp_alone_branch to the beginning of the
331 			 * list.
332 			 */
333 			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
334 		} else if (!cfs_rq->tg->parent) {
335 			/*
336 			 * cfs rq without parent should be put
337 			 * at the tail of the list.
338 			 */
339 			list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
340 				&rq->leaf_cfs_rq_list);
341 			/*
342 			 * We have reach the beg of a tree so we can reset
343 			 * tmp_alone_branch to the beginning of the list.
344 			 */
345 			rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
346 		} else {
347 			/*
348 			 * The parent has not already been added so we want to
349 			 * make sure that it will be put after us.
350 			 * tmp_alone_branch points to the beg of the branch
351 			 * where we will add parent.
352 			 */
353 			list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
354 				rq->tmp_alone_branch);
355 			/*
356 			 * update tmp_alone_branch to points to the new beg
357 			 * of the branch
358 			 */
359 			rq->tmp_alone_branch = &cfs_rq->leaf_cfs_rq_list;
360 		}
361 
362 		cfs_rq->on_list = 1;
363 	}
364 }
365 
list_del_leaf_cfs_rq(struct cfs_rq * cfs_rq)366 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
367 {
368 	if (cfs_rq->on_list) {
369 		list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
370 		cfs_rq->on_list = 0;
371 	}
372 }
373 
374 /* Iterate thr' all leaf cfs_rq's on a runqueue */
375 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
376 	list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
377 
378 /* Do the two (enqueued) entities belong to the same group ? */
379 static inline struct cfs_rq *
is_same_group(struct sched_entity * se,struct sched_entity * pse)380 is_same_group(struct sched_entity *se, struct sched_entity *pse)
381 {
382 	if (se->cfs_rq == pse->cfs_rq)
383 		return se->cfs_rq;
384 
385 	return NULL;
386 }
387 
parent_entity(struct sched_entity * se)388 static inline struct sched_entity *parent_entity(struct sched_entity *se)
389 {
390 	return se->parent;
391 }
392 
393 static void
find_matching_se(struct sched_entity ** se,struct sched_entity ** pse)394 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
395 {
396 	int se_depth, pse_depth;
397 
398 	/*
399 	 * preemption test can be made between sibling entities who are in the
400 	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
401 	 * both tasks until we find their ancestors who are siblings of common
402 	 * parent.
403 	 */
404 
405 	/* First walk up until both entities are at same depth */
406 	se_depth = (*se)->depth;
407 	pse_depth = (*pse)->depth;
408 
409 	while (se_depth > pse_depth) {
410 		se_depth--;
411 		*se = parent_entity(*se);
412 	}
413 
414 	while (pse_depth > se_depth) {
415 		pse_depth--;
416 		*pse = parent_entity(*pse);
417 	}
418 
419 	while (!is_same_group(*se, *pse)) {
420 		*se = parent_entity(*se);
421 		*pse = parent_entity(*pse);
422 	}
423 }
424 
425 #else	/* !CONFIG_FAIR_GROUP_SCHED */
426 
task_of(struct sched_entity * se)427 static inline struct task_struct *task_of(struct sched_entity *se)
428 {
429 	return container_of(se, struct task_struct, se);
430 }
431 
rq_of(struct cfs_rq * cfs_rq)432 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
433 {
434 	return container_of(cfs_rq, struct rq, cfs);
435 }
436 
437 #define entity_is_task(se)	1
438 
439 #define for_each_sched_entity(se) \
440 		for (; se; se = NULL)
441 
task_cfs_rq(struct task_struct * p)442 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
443 {
444 	return &task_rq(p)->cfs;
445 }
446 
cfs_rq_of(struct sched_entity * se)447 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
448 {
449 	struct task_struct *p = task_of(se);
450 	struct rq *rq = task_rq(p);
451 
452 	return &rq->cfs;
453 }
454 
455 /* runqueue "owned" by this group */
group_cfs_rq(struct sched_entity * grp)456 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
457 {
458 	return NULL;
459 }
460 
list_add_leaf_cfs_rq(struct cfs_rq * cfs_rq)461 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
462 {
463 }
464 
list_del_leaf_cfs_rq(struct cfs_rq * cfs_rq)465 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
466 {
467 }
468 
469 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
470 		for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
471 
parent_entity(struct sched_entity * se)472 static inline struct sched_entity *parent_entity(struct sched_entity *se)
473 {
474 	return NULL;
475 }
476 
477 static inline void
find_matching_se(struct sched_entity ** se,struct sched_entity ** pse)478 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
479 {
480 }
481 
482 #endif	/* CONFIG_FAIR_GROUP_SCHED */
483 
484 static __always_inline
485 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
486 
487 /**************************************************************
488  * Scheduling class tree data structure manipulation methods:
489  */
490 
max_vruntime(u64 max_vruntime,u64 vruntime)491 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
492 {
493 	s64 delta = (s64)(vruntime - max_vruntime);
494 	if (delta > 0)
495 		max_vruntime = vruntime;
496 
497 	return max_vruntime;
498 }
499 
min_vruntime(u64 min_vruntime,u64 vruntime)500 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
501 {
502 	s64 delta = (s64)(vruntime - min_vruntime);
503 	if (delta < 0)
504 		min_vruntime = vruntime;
505 
506 	return min_vruntime;
507 }
508 
entity_before(struct sched_entity * a,struct sched_entity * b)509 static inline int entity_before(struct sched_entity *a,
510 				struct sched_entity *b)
511 {
512 	return (s64)(a->vruntime - b->vruntime) < 0;
513 }
514 
update_min_vruntime(struct cfs_rq * cfs_rq)515 static void update_min_vruntime(struct cfs_rq *cfs_rq)
516 {
517 	u64 vruntime = cfs_rq->min_vruntime;
518 
519 	if (cfs_rq->curr)
520 		vruntime = cfs_rq->curr->vruntime;
521 
522 	if (cfs_rq->rb_leftmost) {
523 		struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
524 						   struct sched_entity,
525 						   run_node);
526 
527 		if (!cfs_rq->curr)
528 			vruntime = se->vruntime;
529 		else
530 			vruntime = min_vruntime(vruntime, se->vruntime);
531 	}
532 
533 	/* ensure we never gain time by being placed backwards. */
534 	cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
535 #ifndef CONFIG_64BIT
536 	smp_wmb();
537 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
538 #endif
539 }
540 
541 /*
542  * Enqueue an entity into the rb-tree:
543  */
__enqueue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se)544 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
545 {
546 	struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
547 	struct rb_node *parent = NULL;
548 	struct sched_entity *entry;
549 	int leftmost = 1;
550 
551 	/*
552 	 * Find the right place in the rbtree:
553 	 */
554 	while (*link) {
555 		parent = *link;
556 		entry = rb_entry(parent, struct sched_entity, run_node);
557 		/*
558 		 * We dont care about collisions. Nodes with
559 		 * the same key stay together.
560 		 */
561 		if (entity_before(se, entry)) {
562 			link = &parent->rb_left;
563 		} else {
564 			link = &parent->rb_right;
565 			leftmost = 0;
566 		}
567 	}
568 
569 	/*
570 	 * Maintain a cache of leftmost tree entries (it is frequently
571 	 * used):
572 	 */
573 	if (leftmost)
574 		cfs_rq->rb_leftmost = &se->run_node;
575 
576 	rb_link_node(&se->run_node, parent, link);
577 	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
578 }
579 
__dequeue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se)580 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
581 {
582 	if (cfs_rq->rb_leftmost == &se->run_node) {
583 		struct rb_node *next_node;
584 
585 		next_node = rb_next(&se->run_node);
586 		cfs_rq->rb_leftmost = next_node;
587 	}
588 
589 	rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
590 }
591 
__pick_first_entity(struct cfs_rq * cfs_rq)592 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
593 {
594 	struct rb_node *left = cfs_rq->rb_leftmost;
595 
596 	if (!left)
597 		return NULL;
598 
599 	return rb_entry(left, struct sched_entity, run_node);
600 }
601 
__pick_next_entity(struct sched_entity * se)602 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
603 {
604 	struct rb_node *next = rb_next(&se->run_node);
605 
606 	if (!next)
607 		return NULL;
608 
609 	return rb_entry(next, struct sched_entity, run_node);
610 }
611 
612 #ifdef CONFIG_SCHED_DEBUG
__pick_last_entity(struct cfs_rq * cfs_rq)613 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
614 {
615 	struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
616 
617 	if (!last)
618 		return NULL;
619 
620 	return rb_entry(last, struct sched_entity, run_node);
621 }
622 
623 /**************************************************************
624  * Scheduling class statistics methods:
625  */
626 
sched_proc_update_handler(struct ctl_table * table,int write,void __user * buffer,size_t * lenp,loff_t * ppos)627 int sched_proc_update_handler(struct ctl_table *table, int write,
628 		void __user *buffer, size_t *lenp,
629 		loff_t *ppos)
630 {
631 	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
632 	unsigned int factor = get_update_sysctl_factor();
633 
634 	if (ret || !write)
635 		return ret;
636 
637 	sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
638 					sysctl_sched_min_granularity);
639 
640 #define WRT_SYSCTL(name) \
641 	(normalized_sysctl_##name = sysctl_##name / (factor))
642 	WRT_SYSCTL(sched_min_granularity);
643 	WRT_SYSCTL(sched_latency);
644 	WRT_SYSCTL(sched_wakeup_granularity);
645 #undef WRT_SYSCTL
646 
647 	return 0;
648 }
649 #endif
650 
651 /*
652  * delta /= w
653  */
calc_delta_fair(u64 delta,struct sched_entity * se)654 static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
655 {
656 	if (unlikely(se->load.weight != NICE_0_LOAD))
657 		delta = __calc_delta(delta, NICE_0_LOAD, &se->load);
658 
659 	return delta;
660 }
661 
662 /*
663  * The idea is to set a period in which each task runs once.
664  *
665  * When there are too many tasks (sched_nr_latency) we have to stretch
666  * this period because otherwise the slices get too small.
667  *
668  * p = (nr <= nl) ? l : l*nr/nl
669  */
__sched_period(unsigned long nr_running)670 static u64 __sched_period(unsigned long nr_running)
671 {
672 	if (unlikely(nr_running > sched_nr_latency))
673 		return nr_running * sysctl_sched_min_granularity;
674 	else
675 		return sysctl_sched_latency;
676 }
677 
678 /*
679  * We calculate the wall-time slice from the period by taking a part
680  * proportional to the weight.
681  *
682  * s = p*P[w/rw]
683  */
sched_slice(struct cfs_rq * cfs_rq,struct sched_entity * se)684 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
685 {
686 	u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
687 
688 	for_each_sched_entity(se) {
689 		struct load_weight *load;
690 		struct load_weight lw;
691 
692 		cfs_rq = cfs_rq_of(se);
693 		load = &cfs_rq->load;
694 
695 		if (unlikely(!se->on_rq)) {
696 			lw = cfs_rq->load;
697 
698 			update_load_add(&lw, se->load.weight);
699 			load = &lw;
700 		}
701 		slice = __calc_delta(slice, se->load.weight, load);
702 	}
703 	return slice;
704 }
705 
706 /*
707  * We calculate the vruntime slice of a to-be-inserted task.
708  *
709  * vs = s/w
710  */
sched_vslice(struct cfs_rq * cfs_rq,struct sched_entity * se)711 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
712 {
713 	return calc_delta_fair(sched_slice(cfs_rq, se), se);
714 }
715 
716 #ifdef CONFIG_SMP
717 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
718 static unsigned long task_h_load(struct task_struct *p);
719 
720 /*
721  * We choose a half-life close to 1 scheduling period.
722  * Note: The tables runnable_avg_yN_inv and runnable_avg_yN_sum are
723  * dependent on this value.
724  */
725 #define LOAD_AVG_PERIOD 32
726 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
727 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
728 
729 /* Give new sched_entity start runnable values to heavy its load in infant time */
init_entity_runnable_average(struct sched_entity * se)730 void init_entity_runnable_average(struct sched_entity *se)
731 {
732 	struct sched_avg *sa = &se->avg;
733 
734 	sa->last_update_time = 0;
735 	/*
736 	 * sched_avg's period_contrib should be strictly less then 1024, so
737 	 * we give it 1023 to make sure it is almost a period (1024us), and
738 	 * will definitely be update (after enqueue).
739 	 */
740 	sa->period_contrib = 1023;
741 	/*
742 	 * Tasks are intialized with full load to be seen as heavy tasks until
743 	 * they get a chance to stabilize to their real load level.
744 	 * Group entities are intialized with zero load to reflect the fact that
745 	 * nothing has been attached to the task group yet.
746 	 */
747 	if (entity_is_task(se))
748 		sa->load_avg = scale_load_down(se->load.weight);
749 	sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
750 	/*
751 	 * In previous Android versions, we used to have:
752 	 * 	sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
753 	 * 	sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
754 	 * However, that functionality has been moved to enqueue.
755 	 * It is unclear if we should restore this in enqueue.
756 	 */
757 	/*
758 	 * At this point, util_avg won't be used in select_task_rq_fair anyway
759 	 */
760 	sa->util_avg = 0;
761 	sa->util_sum = 0;
762 	/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
763 }
764 
765 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
766 static int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq);
767 static void attach_entity_cfs_rq(struct sched_entity *se);
768 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se);
769 
770 /*
771  * With new tasks being created, their initial util_avgs are extrapolated
772  * based on the cfs_rq's current util_avg:
773  *
774  *   util_avg = cfs_rq->util_avg / (cfs_rq->load_avg + 1) * se.load.weight
775  *
776  * However, in many cases, the above util_avg does not give a desired
777  * value. Moreover, the sum of the util_avgs may be divergent, such
778  * as when the series is a harmonic series.
779  *
780  * To solve this problem, we also cap the util_avg of successive tasks to
781  * only 1/2 of the left utilization budget:
782  *
783  *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
784  *
785  * where n denotes the nth task.
786  *
787  * For example, a simplest series from the beginning would be like:
788  *
789  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
790  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
791  *
792  * Finally, that extrapolated util_avg is clamped to the cap (util_avg_cap)
793  * if util_avg > util_avg_cap.
794  */
post_init_entity_util_avg(struct sched_entity * se)795 void post_init_entity_util_avg(struct sched_entity *se)
796 {
797 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
798 	struct sched_avg *sa = &se->avg;
799 	long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
800 
801 	if (cap > 0) {
802 		if (cfs_rq->avg.util_avg != 0) {
803 			sa->util_avg  = cfs_rq->avg.util_avg * se->load.weight;
804 			sa->util_avg /= (cfs_rq->avg.load_avg + 1);
805 
806 			if (sa->util_avg > cap)
807 				sa->util_avg = cap;
808 		} else {
809 			sa->util_avg = cap;
810 		}
811 		/*
812 		 * If we wish to restore tuning via setting initial util,
813 		 * this is where we should do it.
814 		 */
815 		sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
816 	}
817 
818 	if (entity_is_task(se)) {
819 		struct task_struct *p = task_of(se);
820 		if (p->sched_class != &fair_sched_class) {
821 			/*
822 			 * For !fair tasks do:
823 			 *
824 			update_cfs_rq_load_avg(now, cfs_rq, false);
825 			attach_entity_load_avg(cfs_rq, se);
826 			switched_from_fair(rq, p);
827 			 *
828 			 * such that the next switched_to_fair() has the
829 			 * expected state.
830 			 */
831 			se->avg.last_update_time = cfs_rq_clock_task(cfs_rq);
832 			return;
833 		}
834 	}
835 
836 	attach_entity_cfs_rq(se);
837 }
838 
839 #else /* !CONFIG_SMP */
init_entity_runnable_average(struct sched_entity * se)840 void init_entity_runnable_average(struct sched_entity *se)
841 {
842 }
post_init_entity_util_avg(struct sched_entity * se)843 void post_init_entity_util_avg(struct sched_entity *se)
844 {
845 }
update_tg_load_avg(struct cfs_rq * cfs_rq,int force)846 static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
847 {
848 }
849 #endif /* CONFIG_SMP */
850 
851 /*
852  * Update the current task's runtime statistics.
853  */
update_curr(struct cfs_rq * cfs_rq)854 static void update_curr(struct cfs_rq *cfs_rq)
855 {
856 	struct sched_entity *curr = cfs_rq->curr;
857 	u64 now = rq_clock_task(rq_of(cfs_rq));
858 	u64 delta_exec;
859 
860 	if (unlikely(!curr))
861 		return;
862 
863 	delta_exec = now - curr->exec_start;
864 	if (unlikely((s64)delta_exec <= 0))
865 		return;
866 
867 	curr->exec_start = now;
868 
869 	schedstat_set(curr->statistics.exec_max,
870 		      max(delta_exec, curr->statistics.exec_max));
871 
872 	curr->sum_exec_runtime += delta_exec;
873 	schedstat_add(cfs_rq, exec_clock, delta_exec);
874 
875 	curr->vruntime += calc_delta_fair(delta_exec, curr);
876 	update_min_vruntime(cfs_rq);
877 
878 	if (entity_is_task(curr)) {
879 		struct task_struct *curtask = task_of(curr);
880 
881 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
882 		cpuacct_charge(curtask, delta_exec);
883 		account_group_exec_runtime(curtask, delta_exec);
884 	}
885 
886 	account_cfs_rq_runtime(cfs_rq, delta_exec);
887 }
888 
update_curr_fair(struct rq * rq)889 static void update_curr_fair(struct rq *rq)
890 {
891 	update_curr(cfs_rq_of(&rq->curr->se));
892 }
893 
894 static inline void
update_stats_wait_start(struct cfs_rq * cfs_rq,struct sched_entity * se)895 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
896 {
897 	schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq)));
898 }
899 
900 /*
901  * Task is being enqueued - update stats:
902  */
update_stats_enqueue(struct cfs_rq * cfs_rq,struct sched_entity * se)903 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
904 {
905 	/*
906 	 * Are we enqueueing a waiting task? (for current tasks
907 	 * a dequeue/enqueue event is a NOP)
908 	 */
909 	if (se != cfs_rq->curr)
910 		update_stats_wait_start(cfs_rq, se);
911 }
912 
913 static void
update_stats_wait_end(struct cfs_rq * cfs_rq,struct sched_entity * se)914 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
915 {
916 	schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
917 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start));
918 	schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
919 	schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
920 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
921 #ifdef CONFIG_SCHEDSTATS
922 	if (entity_is_task(se)) {
923 		trace_sched_stat_wait(task_of(se),
924 			rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start);
925 	}
926 #endif
927 	schedstat_set(se->statistics.wait_start, 0);
928 }
929 
930 static inline void
update_stats_dequeue(struct cfs_rq * cfs_rq,struct sched_entity * se)931 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
932 {
933 	/*
934 	 * Mark the end of the wait period if dequeueing a
935 	 * waiting task:
936 	 */
937 	if (se != cfs_rq->curr)
938 		update_stats_wait_end(cfs_rq, se);
939 }
940 
941 /*
942  * We are picking a new current task - update its stats:
943  */
944 static inline void
update_stats_curr_start(struct cfs_rq * cfs_rq,struct sched_entity * se)945 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
946 {
947 	/*
948 	 * We are starting a new run period:
949 	 */
950 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
951 }
952 
953 /**************************************************
954  * Scheduling class queueing methods:
955  */
956 
957 #ifdef CONFIG_NUMA_BALANCING
958 /*
959  * Approximate time to scan a full NUMA task in ms. The task scan period is
960  * calculated based on the tasks virtual memory size and
961  * numa_balancing_scan_size.
962  */
963 unsigned int sysctl_numa_balancing_scan_period_min = 1000;
964 unsigned int sysctl_numa_balancing_scan_period_max = 60000;
965 
966 /* Portion of address space to scan in MB */
967 unsigned int sysctl_numa_balancing_scan_size = 256;
968 
969 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
970 unsigned int sysctl_numa_balancing_scan_delay = 1000;
971 
task_nr_scan_windows(struct task_struct * p)972 static unsigned int task_nr_scan_windows(struct task_struct *p)
973 {
974 	unsigned long rss = 0;
975 	unsigned long nr_scan_pages;
976 
977 	/*
978 	 * Calculations based on RSS as non-present and empty pages are skipped
979 	 * by the PTE scanner and NUMA hinting faults should be trapped based
980 	 * on resident pages
981 	 */
982 	nr_scan_pages = sysctl_numa_balancing_scan_size << (20 - PAGE_SHIFT);
983 	rss = get_mm_rss(p->mm);
984 	if (!rss)
985 		rss = nr_scan_pages;
986 
987 	rss = round_up(rss, nr_scan_pages);
988 	return rss / nr_scan_pages;
989 }
990 
991 /* For sanitys sake, never scan more PTEs than MAX_SCAN_WINDOW MB/sec. */
992 #define MAX_SCAN_WINDOW 2560
993 
task_scan_min(struct task_struct * p)994 static unsigned int task_scan_min(struct task_struct *p)
995 {
996 	unsigned int scan_size = READ_ONCE(sysctl_numa_balancing_scan_size);
997 	unsigned int scan, floor;
998 	unsigned int windows = 1;
999 
1000 	if (scan_size < MAX_SCAN_WINDOW)
1001 		windows = MAX_SCAN_WINDOW / scan_size;
1002 	floor = 1000 / windows;
1003 
1004 	scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
1005 	return max_t(unsigned int, floor, scan);
1006 }
1007 
task_scan_max(struct task_struct * p)1008 static unsigned int task_scan_max(struct task_struct *p)
1009 {
1010 	unsigned int smin = task_scan_min(p);
1011 	unsigned int smax;
1012 
1013 	/* Watch for min being lower than max due to floor calculations */
1014 	smax = sysctl_numa_balancing_scan_period_max / task_nr_scan_windows(p);
1015 	return max(smin, smax);
1016 }
1017 
account_numa_enqueue(struct rq * rq,struct task_struct * p)1018 static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
1019 {
1020 	rq->nr_numa_running += (p->numa_preferred_nid != -1);
1021 	rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
1022 }
1023 
account_numa_dequeue(struct rq * rq,struct task_struct * p)1024 static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
1025 {
1026 	rq->nr_numa_running -= (p->numa_preferred_nid != -1);
1027 	rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
1028 }
1029 
1030 struct numa_group {
1031 	atomic_t refcount;
1032 
1033 	spinlock_t lock; /* nr_tasks, tasks */
1034 	int nr_tasks;
1035 	pid_t gid;
1036 
1037 	struct rcu_head rcu;
1038 	nodemask_t active_nodes;
1039 	unsigned long total_faults;
1040 	/*
1041 	 * Faults_cpu is used to decide whether memory should move
1042 	 * towards the CPU. As a consequence, these stats are weighted
1043 	 * more by CPU use than by memory faults.
1044 	 */
1045 	unsigned long *faults_cpu;
1046 	unsigned long faults[0];
1047 };
1048 
1049 /* Shared or private faults. */
1050 #define NR_NUMA_HINT_FAULT_TYPES 2
1051 
1052 /* Memory and CPU locality */
1053 #define NR_NUMA_HINT_FAULT_STATS (NR_NUMA_HINT_FAULT_TYPES * 2)
1054 
1055 /* Averaged statistics, and temporary buffers. */
1056 #define NR_NUMA_HINT_FAULT_BUCKETS (NR_NUMA_HINT_FAULT_STATS * 2)
1057 
task_numa_group_id(struct task_struct * p)1058 pid_t task_numa_group_id(struct task_struct *p)
1059 {
1060 	return p->numa_group ? p->numa_group->gid : 0;
1061 }
1062 
1063 /*
1064  * The averaged statistics, shared & private, memory & cpu,
1065  * occupy the first half of the array. The second half of the
1066  * array is for current counters, which are averaged into the
1067  * first set by task_numa_placement.
1068  */
task_faults_idx(enum numa_faults_stats s,int nid,int priv)1069 static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
1070 {
1071 	return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
1072 }
1073 
task_faults(struct task_struct * p,int nid)1074 static inline unsigned long task_faults(struct task_struct *p, int nid)
1075 {
1076 	if (!p->numa_faults)
1077 		return 0;
1078 
1079 	return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1080 		p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
1081 }
1082 
group_faults(struct task_struct * p,int nid)1083 static inline unsigned long group_faults(struct task_struct *p, int nid)
1084 {
1085 	if (!p->numa_group)
1086 		return 0;
1087 
1088 	return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
1089 		p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
1090 }
1091 
group_faults_cpu(struct numa_group * group,int nid)1092 static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
1093 {
1094 	return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
1095 		group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
1096 }
1097 
1098 /* Handle placement on systems where not all nodes are directly connected. */
score_nearby_nodes(struct task_struct * p,int nid,int maxdist,bool task)1099 static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
1100 					int maxdist, bool task)
1101 {
1102 	unsigned long score = 0;
1103 	int node;
1104 
1105 	/*
1106 	 * All nodes are directly connected, and the same distance
1107 	 * from each other. No need for fancy placement algorithms.
1108 	 */
1109 	if (sched_numa_topology_type == NUMA_DIRECT)
1110 		return 0;
1111 
1112 	/*
1113 	 * This code is called for each node, introducing N^2 complexity,
1114 	 * which should be ok given the number of nodes rarely exceeds 8.
1115 	 */
1116 	for_each_online_node(node) {
1117 		unsigned long faults;
1118 		int dist = node_distance(nid, node);
1119 
1120 		/*
1121 		 * The furthest away nodes in the system are not interesting
1122 		 * for placement; nid was already counted.
1123 		 */
1124 		if (dist == sched_max_numa_distance || node == nid)
1125 			continue;
1126 
1127 		/*
1128 		 * On systems with a backplane NUMA topology, compare groups
1129 		 * of nodes, and move tasks towards the group with the most
1130 		 * memory accesses. When comparing two nodes at distance
1131 		 * "hoplimit", only nodes closer by than "hoplimit" are part
1132 		 * of each group. Skip other nodes.
1133 		 */
1134 		if (sched_numa_topology_type == NUMA_BACKPLANE &&
1135 					dist > maxdist)
1136 			continue;
1137 
1138 		/* Add up the faults from nearby nodes. */
1139 		if (task)
1140 			faults = task_faults(p, node);
1141 		else
1142 			faults = group_faults(p, node);
1143 
1144 		/*
1145 		 * On systems with a glueless mesh NUMA topology, there are
1146 		 * no fixed "groups of nodes". Instead, nodes that are not
1147 		 * directly connected bounce traffic through intermediate
1148 		 * nodes; a numa_group can occupy any set of nodes.
1149 		 * The further away a node is, the less the faults count.
1150 		 * This seems to result in good task placement.
1151 		 */
1152 		if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1153 			faults *= (sched_max_numa_distance - dist);
1154 			faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
1155 		}
1156 
1157 		score += faults;
1158 	}
1159 
1160 	return score;
1161 }
1162 
1163 /*
1164  * These return the fraction of accesses done by a particular task, or
1165  * task group, on a particular numa node.  The group weight is given a
1166  * larger multiplier, in order to group tasks together that are almost
1167  * evenly spread out between numa nodes.
1168  */
task_weight(struct task_struct * p,int nid,int dist)1169 static inline unsigned long task_weight(struct task_struct *p, int nid,
1170 					int dist)
1171 {
1172 	unsigned long faults, total_faults;
1173 
1174 	if (!p->numa_faults)
1175 		return 0;
1176 
1177 	total_faults = p->total_numa_faults;
1178 
1179 	if (!total_faults)
1180 		return 0;
1181 
1182 	faults = task_faults(p, nid);
1183 	faults += score_nearby_nodes(p, nid, dist, true);
1184 
1185 	return 1000 * faults / total_faults;
1186 }
1187 
group_weight(struct task_struct * p,int nid,int dist)1188 static inline unsigned long group_weight(struct task_struct *p, int nid,
1189 					 int dist)
1190 {
1191 	unsigned long faults, total_faults;
1192 
1193 	if (!p->numa_group)
1194 		return 0;
1195 
1196 	total_faults = p->numa_group->total_faults;
1197 
1198 	if (!total_faults)
1199 		return 0;
1200 
1201 	faults = group_faults(p, nid);
1202 	faults += score_nearby_nodes(p, nid, dist, false);
1203 
1204 	return 1000 * faults / total_faults;
1205 }
1206 
should_numa_migrate_memory(struct task_struct * p,struct page * page,int src_nid,int dst_cpu)1207 bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
1208 				int src_nid, int dst_cpu)
1209 {
1210 	struct numa_group *ng = p->numa_group;
1211 	int dst_nid = cpu_to_node(dst_cpu);
1212 	int last_cpupid, this_cpupid;
1213 
1214 	this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
1215 
1216 	/*
1217 	 * Multi-stage node selection is used in conjunction with a periodic
1218 	 * migration fault to build a temporal task<->page relation. By using
1219 	 * a two-stage filter we remove short/unlikely relations.
1220 	 *
1221 	 * Using P(p) ~ n_p / n_t as per frequentist probability, we can equate
1222 	 * a task's usage of a particular page (n_p) per total usage of this
1223 	 * page (n_t) (in a given time-span) to a probability.
1224 	 *
1225 	 * Our periodic faults will sample this probability and getting the
1226 	 * same result twice in a row, given these samples are fully
1227 	 * independent, is then given by P(n)^2, provided our sample period
1228 	 * is sufficiently short compared to the usage pattern.
1229 	 *
1230 	 * This quadric squishes small probabilities, making it less likely we
1231 	 * act on an unlikely task<->page relation.
1232 	 */
1233 	last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
1234 	if (!cpupid_pid_unset(last_cpupid) &&
1235 				cpupid_to_nid(last_cpupid) != dst_nid)
1236 		return false;
1237 
1238 	/* Always allow migrate on private faults */
1239 	if (cpupid_match_pid(p, last_cpupid))
1240 		return true;
1241 
1242 	/* A shared fault, but p->numa_group has not been set up yet. */
1243 	if (!ng)
1244 		return true;
1245 
1246 	/*
1247 	 * Do not migrate if the destination is not a node that
1248 	 * is actively used by this numa group.
1249 	 */
1250 	if (!node_isset(dst_nid, ng->active_nodes))
1251 		return false;
1252 
1253 	/*
1254 	 * Source is a node that is not actively used by this
1255 	 * numa group, while the destination is. Migrate.
1256 	 */
1257 	if (!node_isset(src_nid, ng->active_nodes))
1258 		return true;
1259 
1260 	/*
1261 	 * Both source and destination are nodes in active
1262 	 * use by this numa group. Maximize memory bandwidth
1263 	 * by migrating from more heavily used groups, to less
1264 	 * heavily used ones, spreading the load around.
1265 	 * Use a 1/4 hysteresis to avoid spurious page movement.
1266 	 */
1267 	return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4);
1268 }
1269 
1270 static unsigned long weighted_cpuload(const int cpu);
1271 static unsigned long source_load(int cpu, int type);
1272 static unsigned long target_load(int cpu, int type);
1273 static unsigned long capacity_of(int cpu);
1274 static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
1275 
1276 /* Cached statistics for all CPUs within a node */
1277 struct numa_stats {
1278 	unsigned long nr_running;
1279 	unsigned long load;
1280 
1281 	/* Total compute capacity of CPUs on a node */
1282 	unsigned long compute_capacity;
1283 
1284 	/* Approximate capacity in terms of runnable tasks on a node */
1285 	unsigned long task_capacity;
1286 	int has_free_capacity;
1287 };
1288 
1289 /*
1290  * XXX borrowed from update_sg_lb_stats
1291  */
update_numa_stats(struct numa_stats * ns,int nid)1292 static void update_numa_stats(struct numa_stats *ns, int nid)
1293 {
1294 	int smt, cpu, cpus = 0;
1295 	unsigned long capacity;
1296 
1297 	memset(ns, 0, sizeof(*ns));
1298 	for_each_cpu(cpu, cpumask_of_node(nid)) {
1299 		struct rq *rq = cpu_rq(cpu);
1300 
1301 		ns->nr_running += rq->nr_running;
1302 		ns->load += weighted_cpuload(cpu);
1303 		ns->compute_capacity += capacity_of(cpu);
1304 
1305 		cpus++;
1306 	}
1307 
1308 	/*
1309 	 * If we raced with hotplug and there are no CPUs left in our mask
1310 	 * the @ns structure is NULL'ed and task_numa_compare() will
1311 	 * not find this node attractive.
1312 	 *
1313 	 * We'll either bail at !has_free_capacity, or we'll detect a huge
1314 	 * imbalance and bail there.
1315 	 */
1316 	if (!cpus)
1317 		return;
1318 
1319 	/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
1320 	smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
1321 	capacity = cpus / smt; /* cores */
1322 
1323 	ns->task_capacity = min_t(unsigned, capacity,
1324 		DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
1325 	ns->has_free_capacity = (ns->nr_running < ns->task_capacity);
1326 }
1327 
1328 struct task_numa_env {
1329 	struct task_struct *p;
1330 
1331 	int src_cpu, src_nid;
1332 	int dst_cpu, dst_nid;
1333 
1334 	struct numa_stats src_stats, dst_stats;
1335 
1336 	int imbalance_pct;
1337 	int dist;
1338 
1339 	struct task_struct *best_task;
1340 	long best_imp;
1341 	int best_cpu;
1342 };
1343 
task_numa_assign(struct task_numa_env * env,struct task_struct * p,long imp)1344 static void task_numa_assign(struct task_numa_env *env,
1345 			     struct task_struct *p, long imp)
1346 {
1347 	if (env->best_task)
1348 		put_task_struct(env->best_task);
1349 
1350 	env->best_task = p;
1351 	env->best_imp = imp;
1352 	env->best_cpu = env->dst_cpu;
1353 }
1354 
load_too_imbalanced(long src_load,long dst_load,struct task_numa_env * env)1355 static bool load_too_imbalanced(long src_load, long dst_load,
1356 				struct task_numa_env *env)
1357 {
1358 	long imb, old_imb;
1359 	long orig_src_load, orig_dst_load;
1360 	long src_capacity, dst_capacity;
1361 
1362 	/*
1363 	 * The load is corrected for the CPU capacity available on each node.
1364 	 *
1365 	 * src_load        dst_load
1366 	 * ------------ vs ---------
1367 	 * src_capacity    dst_capacity
1368 	 */
1369 	src_capacity = env->src_stats.compute_capacity;
1370 	dst_capacity = env->dst_stats.compute_capacity;
1371 
1372 	/* We care about the slope of the imbalance, not the direction. */
1373 	if (dst_load < src_load)
1374 		swap(dst_load, src_load);
1375 
1376 	/* Is the difference below the threshold? */
1377 	imb = dst_load * src_capacity * 100 -
1378 	      src_load * dst_capacity * env->imbalance_pct;
1379 	if (imb <= 0)
1380 		return false;
1381 
1382 	/*
1383 	 * The imbalance is above the allowed threshold.
1384 	 * Compare it with the old imbalance.
1385 	 */
1386 	orig_src_load = env->src_stats.load;
1387 	orig_dst_load = env->dst_stats.load;
1388 
1389 	if (orig_dst_load < orig_src_load)
1390 		swap(orig_dst_load, orig_src_load);
1391 
1392 	old_imb = orig_dst_load * src_capacity * 100 -
1393 		  orig_src_load * dst_capacity * env->imbalance_pct;
1394 
1395 	/* Would this change make things worse? */
1396 	return (imb > old_imb);
1397 }
1398 
1399 /*
1400  * This checks if the overall compute and NUMA accesses of the system would
1401  * be improved if the source tasks was migrated to the target dst_cpu taking
1402  * into account that it might be best if task running on the dst_cpu should
1403  * be exchanged with the source task
1404  */
task_numa_compare(struct task_numa_env * env,long taskimp,long groupimp)1405 static void task_numa_compare(struct task_numa_env *env,
1406 			      long taskimp, long groupimp)
1407 {
1408 	struct rq *src_rq = cpu_rq(env->src_cpu);
1409 	struct rq *dst_rq = cpu_rq(env->dst_cpu);
1410 	struct task_struct *cur;
1411 	long src_load, dst_load;
1412 	long load;
1413 	long imp = env->p->numa_group ? groupimp : taskimp;
1414 	long moveimp = imp;
1415 	int dist = env->dist;
1416 	bool assigned = false;
1417 
1418 	rcu_read_lock();
1419 
1420 	raw_spin_lock_irq(&dst_rq->lock);
1421 	cur = dst_rq->curr;
1422 	/*
1423 	 * No need to move the exiting task or idle task.
1424 	 */
1425 	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
1426 		cur = NULL;
1427 	else {
1428 		/*
1429 		 * The task_struct must be protected here to protect the
1430 		 * p->numa_faults access in the task_weight since the
1431 		 * numa_faults could already be freed in the following path:
1432 		 * finish_task_switch()
1433 		 *     --> put_task_struct()
1434 		 *         --> __put_task_struct()
1435 		 *             --> task_numa_free()
1436 		 */
1437 		get_task_struct(cur);
1438 	}
1439 
1440 	raw_spin_unlock_irq(&dst_rq->lock);
1441 
1442 	/*
1443 	 * Because we have preemption enabled we can get migrated around and
1444 	 * end try selecting ourselves (current == env->p) as a swap candidate.
1445 	 */
1446 	if (cur == env->p)
1447 		goto unlock;
1448 
1449 	/*
1450 	 * "imp" is the fault differential for the source task between the
1451 	 * source and destination node. Calculate the total differential for
1452 	 * the source task and potential destination task. The more negative
1453 	 * the value is, the more rmeote accesses that would be expected to
1454 	 * be incurred if the tasks were swapped.
1455 	 */
1456 	if (cur) {
1457 		/* Skip this swap candidate if cannot move to the source cpu */
1458 		if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
1459 			goto unlock;
1460 
1461 		/*
1462 		 * If dst and source tasks are in the same NUMA group, or not
1463 		 * in any group then look only at task weights.
1464 		 */
1465 		if (cur->numa_group == env->p->numa_group) {
1466 			imp = taskimp + task_weight(cur, env->src_nid, dist) -
1467 			      task_weight(cur, env->dst_nid, dist);
1468 			/*
1469 			 * Add some hysteresis to prevent swapping the
1470 			 * tasks within a group over tiny differences.
1471 			 */
1472 			if (cur->numa_group)
1473 				imp -= imp/16;
1474 		} else {
1475 			/*
1476 			 * Compare the group weights. If a task is all by
1477 			 * itself (not part of a group), use the task weight
1478 			 * instead.
1479 			 */
1480 			if (cur->numa_group)
1481 				imp += group_weight(cur, env->src_nid, dist) -
1482 				       group_weight(cur, env->dst_nid, dist);
1483 			else
1484 				imp += task_weight(cur, env->src_nid, dist) -
1485 				       task_weight(cur, env->dst_nid, dist);
1486 		}
1487 	}
1488 
1489 	if (imp <= env->best_imp && moveimp <= env->best_imp)
1490 		goto unlock;
1491 
1492 	if (!cur) {
1493 		/* Is there capacity at our destination? */
1494 		if (env->src_stats.nr_running <= env->src_stats.task_capacity &&
1495 		    !env->dst_stats.has_free_capacity)
1496 			goto unlock;
1497 
1498 		goto balance;
1499 	}
1500 
1501 	/* Balance doesn't matter much if we're running a task per cpu */
1502 	if (imp > env->best_imp && src_rq->nr_running == 1 &&
1503 			dst_rq->nr_running == 1)
1504 		goto assign;
1505 
1506 	/*
1507 	 * In the overloaded case, try and keep the load balanced.
1508 	 */
1509 balance:
1510 	load = task_h_load(env->p);
1511 	dst_load = env->dst_stats.load + load;
1512 	src_load = env->src_stats.load - load;
1513 
1514 	if (moveimp > imp && moveimp > env->best_imp) {
1515 		/*
1516 		 * If the improvement from just moving env->p direction is
1517 		 * better than swapping tasks around, check if a move is
1518 		 * possible. Store a slightly smaller score than moveimp,
1519 		 * so an actually idle CPU will win.
1520 		 */
1521 		if (!load_too_imbalanced(src_load, dst_load, env)) {
1522 			imp = moveimp - 1;
1523 			put_task_struct(cur);
1524 			cur = NULL;
1525 			goto assign;
1526 		}
1527 	}
1528 
1529 	if (imp <= env->best_imp)
1530 		goto unlock;
1531 
1532 	if (cur) {
1533 		load = task_h_load(cur);
1534 		dst_load -= load;
1535 		src_load += load;
1536 	}
1537 
1538 	if (load_too_imbalanced(src_load, dst_load, env))
1539 		goto unlock;
1540 
1541 	/*
1542 	 * One idle CPU per node is evaluated for a task numa move.
1543 	 * Call select_idle_sibling to maybe find a better one.
1544 	 */
1545 	if (!cur)
1546 		env->dst_cpu = select_idle_sibling(env->p, env->src_cpu,
1547 						   env->dst_cpu);
1548 
1549 assign:
1550 	assigned = true;
1551 	task_numa_assign(env, cur, imp);
1552 unlock:
1553 	rcu_read_unlock();
1554 	/*
1555 	 * The dst_rq->curr isn't assigned. The protection for task_struct is
1556 	 * finished.
1557 	 */
1558 	if (cur && !assigned)
1559 		put_task_struct(cur);
1560 }
1561 
task_numa_find_cpu(struct task_numa_env * env,long taskimp,long groupimp)1562 static void task_numa_find_cpu(struct task_numa_env *env,
1563 				long taskimp, long groupimp)
1564 {
1565 	int cpu;
1566 
1567 	for_each_cpu(cpu, cpumask_of_node(env->dst_nid)) {
1568 		/* Skip this CPU if the source task cannot migrate */
1569 		if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(env->p)))
1570 			continue;
1571 
1572 		env->dst_cpu = cpu;
1573 		task_numa_compare(env, taskimp, groupimp);
1574 	}
1575 }
1576 
1577 /* Only move tasks to a NUMA node less busy than the current node. */
numa_has_capacity(struct task_numa_env * env)1578 static bool numa_has_capacity(struct task_numa_env *env)
1579 {
1580 	struct numa_stats *src = &env->src_stats;
1581 	struct numa_stats *dst = &env->dst_stats;
1582 
1583 	if (src->has_free_capacity && !dst->has_free_capacity)
1584 		return false;
1585 
1586 	/*
1587 	 * Only consider a task move if the source has a higher load
1588 	 * than the destination, corrected for CPU capacity on each node.
1589 	 *
1590 	 *      src->load                dst->load
1591 	 * --------------------- vs ---------------------
1592 	 * src->compute_capacity    dst->compute_capacity
1593 	 */
1594 	if (src->load * dst->compute_capacity * env->imbalance_pct >
1595 
1596 	    dst->load * src->compute_capacity * 100)
1597 		return true;
1598 
1599 	return false;
1600 }
1601 
task_numa_migrate(struct task_struct * p)1602 static int task_numa_migrate(struct task_struct *p)
1603 {
1604 	struct task_numa_env env = {
1605 		.p = p,
1606 
1607 		.src_cpu = task_cpu(p),
1608 		.src_nid = task_node(p),
1609 
1610 		.imbalance_pct = 112,
1611 
1612 		.best_task = NULL,
1613 		.best_imp = 0,
1614 		.best_cpu = -1
1615 	};
1616 	struct sched_domain *sd;
1617 	unsigned long taskweight, groupweight;
1618 	int nid, ret, dist;
1619 	long taskimp, groupimp;
1620 
1621 	/*
1622 	 * Pick the lowest SD_NUMA domain, as that would have the smallest
1623 	 * imbalance and would be the first to start moving tasks about.
1624 	 *
1625 	 * And we want to avoid any moving of tasks about, as that would create
1626 	 * random movement of tasks -- counter the numa conditions we're trying
1627 	 * to satisfy here.
1628 	 */
1629 	rcu_read_lock();
1630 	sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
1631 	if (sd)
1632 		env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
1633 	rcu_read_unlock();
1634 
1635 	/*
1636 	 * Cpusets can break the scheduler domain tree into smaller
1637 	 * balance domains, some of which do not cross NUMA boundaries.
1638 	 * Tasks that are "trapped" in such domains cannot be migrated
1639 	 * elsewhere, so there is no point in (re)trying.
1640 	 */
1641 	if (unlikely(!sd)) {
1642 		p->numa_preferred_nid = task_node(p);
1643 		return -EINVAL;
1644 	}
1645 
1646 	env.dst_nid = p->numa_preferred_nid;
1647 	dist = env.dist = node_distance(env.src_nid, env.dst_nid);
1648 	taskweight = task_weight(p, env.src_nid, dist);
1649 	groupweight = group_weight(p, env.src_nid, dist);
1650 	update_numa_stats(&env.src_stats, env.src_nid);
1651 	taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
1652 	groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
1653 	update_numa_stats(&env.dst_stats, env.dst_nid);
1654 
1655 	/* Try to find a spot on the preferred nid. */
1656 	if (numa_has_capacity(&env))
1657 		task_numa_find_cpu(&env, taskimp, groupimp);
1658 
1659 	/*
1660 	 * Look at other nodes in these cases:
1661 	 * - there is no space available on the preferred_nid
1662 	 * - the task is part of a numa_group that is interleaved across
1663 	 *   multiple NUMA nodes; in order to better consolidate the group,
1664 	 *   we need to check other locations.
1665 	 */
1666 	if (env.best_cpu == -1 || (p->numa_group &&
1667 			nodes_weight(p->numa_group->active_nodes) > 1)) {
1668 		for_each_online_node(nid) {
1669 			if (nid == env.src_nid || nid == p->numa_preferred_nid)
1670 				continue;
1671 
1672 			dist = node_distance(env.src_nid, env.dst_nid);
1673 			if (sched_numa_topology_type == NUMA_BACKPLANE &&
1674 						dist != env.dist) {
1675 				taskweight = task_weight(p, env.src_nid, dist);
1676 				groupweight = group_weight(p, env.src_nid, dist);
1677 			}
1678 
1679 			/* Only consider nodes where both task and groups benefit */
1680 			taskimp = task_weight(p, nid, dist) - taskweight;
1681 			groupimp = group_weight(p, nid, dist) - groupweight;
1682 			if (taskimp < 0 && groupimp < 0)
1683 				continue;
1684 
1685 			env.dist = dist;
1686 			env.dst_nid = nid;
1687 			update_numa_stats(&env.dst_stats, env.dst_nid);
1688 			if (numa_has_capacity(&env))
1689 				task_numa_find_cpu(&env, taskimp, groupimp);
1690 		}
1691 	}
1692 
1693 	/*
1694 	 * If the task is part of a workload that spans multiple NUMA nodes,
1695 	 * and is migrating into one of the workload's active nodes, remember
1696 	 * this node as the task's preferred numa node, so the workload can
1697 	 * settle down.
1698 	 * A task that migrated to a second choice node will be better off
1699 	 * trying for a better one later. Do not set the preferred node here.
1700 	 */
1701 	if (p->numa_group) {
1702 		if (env.best_cpu == -1)
1703 			nid = env.src_nid;
1704 		else
1705 			nid = env.dst_nid;
1706 
1707 		if (node_isset(nid, p->numa_group->active_nodes))
1708 			sched_setnuma(p, env.dst_nid);
1709 	}
1710 
1711 	/* No better CPU than the current one was found. */
1712 	if (env.best_cpu == -1)
1713 		return -EAGAIN;
1714 
1715 	/*
1716 	 * Reset the scan period if the task is being rescheduled on an
1717 	 * alternative node to recheck if the tasks is now properly placed.
1718 	 */
1719 	p->numa_scan_period = task_scan_min(p);
1720 
1721 	if (env.best_task == NULL) {
1722 		ret = migrate_task_to(p, env.best_cpu);
1723 		if (ret != 0)
1724 			trace_sched_stick_numa(p, env.src_cpu, env.best_cpu);
1725 		return ret;
1726 	}
1727 
1728 	ret = migrate_swap(p, env.best_task);
1729 	if (ret != 0)
1730 		trace_sched_stick_numa(p, env.src_cpu, task_cpu(env.best_task));
1731 	put_task_struct(env.best_task);
1732 	return ret;
1733 }
1734 
1735 /* Attempt to migrate a task to a CPU on the preferred node. */
numa_migrate_preferred(struct task_struct * p)1736 static void numa_migrate_preferred(struct task_struct *p)
1737 {
1738 	unsigned long interval = HZ;
1739 
1740 	/* This task has no NUMA fault statistics yet */
1741 	if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
1742 		return;
1743 
1744 	/* Periodically retry migrating the task to the preferred node */
1745 	interval = min(interval, msecs_to_jiffies(p->numa_scan_period) / 16);
1746 	p->numa_migrate_retry = jiffies + interval;
1747 
1748 	/* Success if task is already running on preferred CPU */
1749 	if (task_node(p) == p->numa_preferred_nid)
1750 		return;
1751 
1752 	/* Otherwise, try migrate to a CPU on the preferred node */
1753 	task_numa_migrate(p);
1754 }
1755 
1756 /*
1757  * Find the nodes on which the workload is actively running. We do this by
1758  * tracking the nodes from which NUMA hinting faults are triggered. This can
1759  * be different from the set of nodes where the workload's memory is currently
1760  * located.
1761  *
1762  * The bitmask is used to make smarter decisions on when to do NUMA page
1763  * migrations, To prevent flip-flopping, and excessive page migrations, nodes
1764  * are added when they cause over 6/16 of the maximum number of faults, but
1765  * only removed when they drop below 3/16.
1766  */
update_numa_active_node_mask(struct numa_group * numa_group)1767 static void update_numa_active_node_mask(struct numa_group *numa_group)
1768 {
1769 	unsigned long faults, max_faults = 0;
1770 	int nid;
1771 
1772 	for_each_online_node(nid) {
1773 		faults = group_faults_cpu(numa_group, nid);
1774 		if (faults > max_faults)
1775 			max_faults = faults;
1776 	}
1777 
1778 	for_each_online_node(nid) {
1779 		faults = group_faults_cpu(numa_group, nid);
1780 		if (!node_isset(nid, numa_group->active_nodes)) {
1781 			if (faults > max_faults * 6 / 16)
1782 				node_set(nid, numa_group->active_nodes);
1783 		} else if (faults < max_faults * 3 / 16)
1784 			node_clear(nid, numa_group->active_nodes);
1785 	}
1786 }
1787 
1788 /*
1789  * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
1790  * increments. The more local the fault statistics are, the higher the scan
1791  * period will be for the next scan window. If local/(local+remote) ratio is
1792  * below NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS)
1793  * the scan period will decrease. Aim for 70% local accesses.
1794  */
1795 #define NUMA_PERIOD_SLOTS 10
1796 #define NUMA_PERIOD_THRESHOLD 7
1797 
1798 /*
1799  * Increase the scan period (slow down scanning) if the majority of
1800  * our memory is already on our local node, or if the majority of
1801  * the page accesses are shared with other processes.
1802  * Otherwise, decrease the scan period.
1803  */
update_task_scan_period(struct task_struct * p,unsigned long shared,unsigned long private)1804 static void update_task_scan_period(struct task_struct *p,
1805 			unsigned long shared, unsigned long private)
1806 {
1807 	unsigned int period_slot;
1808 	int ratio;
1809 	int diff;
1810 
1811 	unsigned long remote = p->numa_faults_locality[0];
1812 	unsigned long local = p->numa_faults_locality[1];
1813 
1814 	/*
1815 	 * If there were no record hinting faults then either the task is
1816 	 * completely idle or all activity is areas that are not of interest
1817 	 * to automatic numa balancing. Related to that, if there were failed
1818 	 * migration then it implies we are migrating too quickly or the local
1819 	 * node is overloaded. In either case, scan slower
1820 	 */
1821 	if (local + shared == 0 || p->numa_faults_locality[2]) {
1822 		p->numa_scan_period = min(p->numa_scan_period_max,
1823 			p->numa_scan_period << 1);
1824 
1825 		p->mm->numa_next_scan = jiffies +
1826 			msecs_to_jiffies(p->numa_scan_period);
1827 
1828 		return;
1829 	}
1830 
1831 	/*
1832 	 * Prepare to scale scan period relative to the current period.
1833 	 *	 == NUMA_PERIOD_THRESHOLD scan period stays the same
1834 	 *       <  NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
1835 	 *	 >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
1836 	 */
1837 	period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
1838 	ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
1839 	if (ratio >= NUMA_PERIOD_THRESHOLD) {
1840 		int slot = ratio - NUMA_PERIOD_THRESHOLD;
1841 		if (!slot)
1842 			slot = 1;
1843 		diff = slot * period_slot;
1844 	} else {
1845 		diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
1846 
1847 		/*
1848 		 * Scale scan rate increases based on sharing. There is an
1849 		 * inverse relationship between the degree of sharing and
1850 		 * the adjustment made to the scanning period. Broadly
1851 		 * speaking the intent is that there is little point
1852 		 * scanning faster if shared accesses dominate as it may
1853 		 * simply bounce migrations uselessly
1854 		 */
1855 		ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
1856 		diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
1857 	}
1858 
1859 	p->numa_scan_period = clamp(p->numa_scan_period + diff,
1860 			task_scan_min(p), task_scan_max(p));
1861 	memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
1862 }
1863 
1864 /*
1865  * Get the fraction of time the task has been running since the last
1866  * NUMA placement cycle. The scheduler keeps similar statistics, but
1867  * decays those on a 32ms period, which is orders of magnitude off
1868  * from the dozens-of-seconds NUMA balancing period. Use the scheduler
1869  * stats only if the task is so new there are no NUMA statistics yet.
1870  */
numa_get_avg_runtime(struct task_struct * p,u64 * period)1871 static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
1872 {
1873 	u64 runtime, delta, now;
1874 	/* Use the start of this time slice to avoid calculations. */
1875 	now = p->se.exec_start;
1876 	runtime = p->se.sum_exec_runtime;
1877 
1878 	if (p->last_task_numa_placement) {
1879 		delta = runtime - p->last_sum_exec_runtime;
1880 		*period = now - p->last_task_numa_placement;
1881 
1882 		/* Avoid time going backwards, prevent potential divide error: */
1883 		if (unlikely((s64)*period < 0))
1884 			*period = 0;
1885 	} else {
1886 		delta = p->se.avg.load_sum / p->se.load.weight;
1887 		*period = LOAD_AVG_MAX;
1888 	}
1889 
1890 	p->last_sum_exec_runtime = runtime;
1891 	p->last_task_numa_placement = now;
1892 
1893 	return delta;
1894 }
1895 
1896 /*
1897  * Determine the preferred nid for a task in a numa_group. This needs to
1898  * be done in a way that produces consistent results with group_weight,
1899  * otherwise workloads might not converge.
1900  */
preferred_group_nid(struct task_struct * p,int nid)1901 static int preferred_group_nid(struct task_struct *p, int nid)
1902 {
1903 	nodemask_t nodes;
1904 	int dist;
1905 
1906 	/* Direct connections between all NUMA nodes. */
1907 	if (sched_numa_topology_type == NUMA_DIRECT)
1908 		return nid;
1909 
1910 	/*
1911 	 * On a system with glueless mesh NUMA topology, group_weight
1912 	 * scores nodes according to the number of NUMA hinting faults on
1913 	 * both the node itself, and on nearby nodes.
1914 	 */
1915 	if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
1916 		unsigned long score, max_score = 0;
1917 		int node, max_node = nid;
1918 
1919 		dist = sched_max_numa_distance;
1920 
1921 		for_each_online_node(node) {
1922 			score = group_weight(p, node, dist);
1923 			if (score > max_score) {
1924 				max_score = score;
1925 				max_node = node;
1926 			}
1927 		}
1928 		return max_node;
1929 	}
1930 
1931 	/*
1932 	 * Finding the preferred nid in a system with NUMA backplane
1933 	 * interconnect topology is more involved. The goal is to locate
1934 	 * tasks from numa_groups near each other in the system, and
1935 	 * untangle workloads from different sides of the system. This requires
1936 	 * searching down the hierarchy of node groups, recursively searching
1937 	 * inside the highest scoring group of nodes. The nodemask tricks
1938 	 * keep the complexity of the search down.
1939 	 */
1940 	nodes = node_online_map;
1941 	for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
1942 		unsigned long max_faults = 0;
1943 		nodemask_t max_group = NODE_MASK_NONE;
1944 		int a, b;
1945 
1946 		/* Are there nodes at this distance from each other? */
1947 		if (!find_numa_distance(dist))
1948 			continue;
1949 
1950 		for_each_node_mask(a, nodes) {
1951 			unsigned long faults = 0;
1952 			nodemask_t this_group;
1953 			nodes_clear(this_group);
1954 
1955 			/* Sum group's NUMA faults; includes a==b case. */
1956 			for_each_node_mask(b, nodes) {
1957 				if (node_distance(a, b) < dist) {
1958 					faults += group_faults(p, b);
1959 					node_set(b, this_group);
1960 					node_clear(b, nodes);
1961 				}
1962 			}
1963 
1964 			/* Remember the top group. */
1965 			if (faults > max_faults) {
1966 				max_faults = faults;
1967 				max_group = this_group;
1968 				/*
1969 				 * subtle: at the smallest distance there is
1970 				 * just one node left in each "group", the
1971 				 * winner is the preferred nid.
1972 				 */
1973 				nid = a;
1974 			}
1975 		}
1976 		/* Next round, evaluate the nodes within max_group. */
1977 		if (!max_faults)
1978 			break;
1979 		nodes = max_group;
1980 	}
1981 	return nid;
1982 }
1983 
task_numa_placement(struct task_struct * p)1984 static void task_numa_placement(struct task_struct *p)
1985 {
1986 	int seq, nid, max_nid = -1, max_group_nid = -1;
1987 	unsigned long max_faults = 0, max_group_faults = 0;
1988 	unsigned long fault_types[2] = { 0, 0 };
1989 	unsigned long total_faults;
1990 	u64 runtime, period;
1991 	spinlock_t *group_lock = NULL;
1992 
1993 	/*
1994 	 * The p->mm->numa_scan_seq field gets updated without
1995 	 * exclusive access. Use READ_ONCE() here to ensure
1996 	 * that the field is read in a single access:
1997 	 */
1998 	seq = READ_ONCE(p->mm->numa_scan_seq);
1999 	if (p->numa_scan_seq == seq)
2000 		return;
2001 	p->numa_scan_seq = seq;
2002 	p->numa_scan_period_max = task_scan_max(p);
2003 
2004 	total_faults = p->numa_faults_locality[0] +
2005 		       p->numa_faults_locality[1];
2006 	runtime = numa_get_avg_runtime(p, &period);
2007 
2008 	/* If the task is part of a group prevent parallel updates to group stats */
2009 	if (p->numa_group) {
2010 		group_lock = &p->numa_group->lock;
2011 		spin_lock_irq(group_lock);
2012 	}
2013 
2014 	/* Find the node with the highest number of faults */
2015 	for_each_online_node(nid) {
2016 		/* Keep track of the offsets in numa_faults array */
2017 		int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
2018 		unsigned long faults = 0, group_faults = 0;
2019 		int priv;
2020 
2021 		for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
2022 			long diff, f_diff, f_weight;
2023 
2024 			mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
2025 			membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
2026 			cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
2027 			cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
2028 
2029 			/* Decay existing window, copy faults since last scan */
2030 			diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
2031 			fault_types[priv] += p->numa_faults[membuf_idx];
2032 			p->numa_faults[membuf_idx] = 0;
2033 
2034 			/*
2035 			 * Normalize the faults_from, so all tasks in a group
2036 			 * count according to CPU use, instead of by the raw
2037 			 * number of faults. Tasks with little runtime have
2038 			 * little over-all impact on throughput, and thus their
2039 			 * faults are less important.
2040 			 */
2041 			f_weight = div64_u64(runtime << 16, period + 1);
2042 			f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
2043 				   (total_faults + 1);
2044 			f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
2045 			p->numa_faults[cpubuf_idx] = 0;
2046 
2047 			p->numa_faults[mem_idx] += diff;
2048 			p->numa_faults[cpu_idx] += f_diff;
2049 			faults += p->numa_faults[mem_idx];
2050 			p->total_numa_faults += diff;
2051 			if (p->numa_group) {
2052 				/*
2053 				 * safe because we can only change our own group
2054 				 *
2055 				 * mem_idx represents the offset for a given
2056 				 * nid and priv in a specific region because it
2057 				 * is at the beginning of the numa_faults array.
2058 				 */
2059 				p->numa_group->faults[mem_idx] += diff;
2060 				p->numa_group->faults_cpu[mem_idx] += f_diff;
2061 				p->numa_group->total_faults += diff;
2062 				group_faults += p->numa_group->faults[mem_idx];
2063 			}
2064 		}
2065 
2066 		if (faults > max_faults) {
2067 			max_faults = faults;
2068 			max_nid = nid;
2069 		}
2070 
2071 		if (group_faults > max_group_faults) {
2072 			max_group_faults = group_faults;
2073 			max_group_nid = nid;
2074 		}
2075 	}
2076 
2077 	update_task_scan_period(p, fault_types[0], fault_types[1]);
2078 
2079 	if (p->numa_group) {
2080 		update_numa_active_node_mask(p->numa_group);
2081 		spin_unlock_irq(group_lock);
2082 		max_nid = preferred_group_nid(p, max_group_nid);
2083 	}
2084 
2085 	if (max_faults) {
2086 		/* Set the new preferred node */
2087 		if (max_nid != p->numa_preferred_nid)
2088 			sched_setnuma(p, max_nid);
2089 
2090 		if (task_node(p) != p->numa_preferred_nid)
2091 			numa_migrate_preferred(p);
2092 	}
2093 }
2094 
get_numa_group(struct numa_group * grp)2095 static inline int get_numa_group(struct numa_group *grp)
2096 {
2097 	return atomic_inc_not_zero(&grp->refcount);
2098 }
2099 
put_numa_group(struct numa_group * grp)2100 static inline void put_numa_group(struct numa_group *grp)
2101 {
2102 	if (atomic_dec_and_test(&grp->refcount))
2103 		kfree_rcu(grp, rcu);
2104 }
2105 
task_numa_group(struct task_struct * p,int cpupid,int flags,int * priv)2106 static void task_numa_group(struct task_struct *p, int cpupid, int flags,
2107 			int *priv)
2108 {
2109 	struct numa_group *grp, *my_grp;
2110 	struct task_struct *tsk;
2111 	bool join = false;
2112 	int cpu = cpupid_to_cpu(cpupid);
2113 	int i;
2114 
2115 	if (unlikely(!p->numa_group)) {
2116 		unsigned int size = sizeof(struct numa_group) +
2117 				    4*nr_node_ids*sizeof(unsigned long);
2118 
2119 		grp = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
2120 		if (!grp)
2121 			return;
2122 
2123 		atomic_set(&grp->refcount, 1);
2124 		spin_lock_init(&grp->lock);
2125 		grp->gid = p->pid;
2126 		/* Second half of the array tracks nids where faults happen */
2127 		grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
2128 						nr_node_ids;
2129 
2130 		node_set(task_node(current), grp->active_nodes);
2131 
2132 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2133 			grp->faults[i] = p->numa_faults[i];
2134 
2135 		grp->total_faults = p->total_numa_faults;
2136 
2137 		grp->nr_tasks++;
2138 		rcu_assign_pointer(p->numa_group, grp);
2139 	}
2140 
2141 	rcu_read_lock();
2142 	tsk = READ_ONCE(cpu_rq(cpu)->curr);
2143 
2144 	if (!cpupid_match_pid(tsk, cpupid))
2145 		goto no_join;
2146 
2147 	grp = rcu_dereference(tsk->numa_group);
2148 	if (!grp)
2149 		goto no_join;
2150 
2151 	my_grp = p->numa_group;
2152 	if (grp == my_grp)
2153 		goto no_join;
2154 
2155 	/*
2156 	 * Only join the other group if its bigger; if we're the bigger group,
2157 	 * the other task will join us.
2158 	 */
2159 	if (my_grp->nr_tasks > grp->nr_tasks)
2160 		goto no_join;
2161 
2162 	/*
2163 	 * Tie-break on the grp address.
2164 	 */
2165 	if (my_grp->nr_tasks == grp->nr_tasks && my_grp > grp)
2166 		goto no_join;
2167 
2168 	/* Always join threads in the same process. */
2169 	if (tsk->mm == current->mm)
2170 		join = true;
2171 
2172 	/* Simple filter to avoid false positives due to PID collisions */
2173 	if (flags & TNF_SHARED)
2174 		join = true;
2175 
2176 	/* Update priv based on whether false sharing was detected */
2177 	*priv = !join;
2178 
2179 	if (join && !get_numa_group(grp))
2180 		goto no_join;
2181 
2182 	rcu_read_unlock();
2183 
2184 	if (!join)
2185 		return;
2186 
2187 	BUG_ON(irqs_disabled());
2188 	double_lock_irq(&my_grp->lock, &grp->lock);
2189 
2190 	for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
2191 		my_grp->faults[i] -= p->numa_faults[i];
2192 		grp->faults[i] += p->numa_faults[i];
2193 	}
2194 	my_grp->total_faults -= p->total_numa_faults;
2195 	grp->total_faults += p->total_numa_faults;
2196 
2197 	my_grp->nr_tasks--;
2198 	grp->nr_tasks++;
2199 
2200 	spin_unlock(&my_grp->lock);
2201 	spin_unlock_irq(&grp->lock);
2202 
2203 	rcu_assign_pointer(p->numa_group, grp);
2204 
2205 	put_numa_group(my_grp);
2206 	return;
2207 
2208 no_join:
2209 	rcu_read_unlock();
2210 	return;
2211 }
2212 
2213 /*
2214  * Get rid of NUMA staticstics associated with a task (either current or dead).
2215  * If @final is set, the task is dead and has reached refcount zero, so we can
2216  * safely free all relevant data structures. Otherwise, there might be
2217  * concurrent reads from places like load balancing and procfs, and we should
2218  * reset the data back to default state without freeing ->numa_faults.
2219  */
task_numa_free(struct task_struct * p,bool final)2220 void task_numa_free(struct task_struct *p, bool final)
2221 {
2222 	struct numa_group *grp = p->numa_group;
2223 	unsigned long *numa_faults = p->numa_faults;
2224 	unsigned long flags;
2225 	int i;
2226 
2227 	if (!numa_faults)
2228 		return;
2229 
2230 	if (grp) {
2231 		spin_lock_irqsave(&grp->lock, flags);
2232 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2233 			grp->faults[i] -= p->numa_faults[i];
2234 		grp->total_faults -= p->total_numa_faults;
2235 
2236 		grp->nr_tasks--;
2237 		spin_unlock_irqrestore(&grp->lock, flags);
2238 		RCU_INIT_POINTER(p->numa_group, NULL);
2239 		put_numa_group(grp);
2240 	}
2241 
2242 	if (final) {
2243 		p->numa_faults = NULL;
2244 		kfree(numa_faults);
2245 	} else {
2246 		p->total_numa_faults = 0;
2247 		for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
2248 			numa_faults[i] = 0;
2249 	}
2250 }
2251 
2252 /*
2253  * Got a PROT_NONE fault for a page on @node.
2254  */
task_numa_fault(int last_cpupid,int mem_node,int pages,int flags)2255 void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
2256 {
2257 	struct task_struct *p = current;
2258 	bool migrated = flags & TNF_MIGRATED;
2259 	int cpu_node = task_node(current);
2260 	int local = !!(flags & TNF_FAULT_LOCAL);
2261 	int priv;
2262 
2263 	if (!static_branch_likely(&sched_numa_balancing))
2264 		return;
2265 
2266 	/* for example, ksmd faulting in a user's mm */
2267 	if (!p->mm)
2268 		return;
2269 
2270 	/* Allocate buffer to track faults on a per-node basis */
2271 	if (unlikely(!p->numa_faults)) {
2272 		int size = sizeof(*p->numa_faults) *
2273 			   NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
2274 
2275 		p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
2276 		if (!p->numa_faults)
2277 			return;
2278 
2279 		p->total_numa_faults = 0;
2280 		memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
2281 	}
2282 
2283 	/*
2284 	 * First accesses are treated as private, otherwise consider accesses
2285 	 * to be private if the accessing pid has not changed
2286 	 */
2287 	if (unlikely(last_cpupid == (-1 & LAST_CPUPID_MASK))) {
2288 		priv = 1;
2289 	} else {
2290 		priv = cpupid_match_pid(p, last_cpupid);
2291 		if (!priv && !(flags & TNF_NO_GROUP))
2292 			task_numa_group(p, last_cpupid, flags, &priv);
2293 	}
2294 
2295 	/*
2296 	 * If a workload spans multiple NUMA nodes, a shared fault that
2297 	 * occurs wholly within the set of nodes that the workload is
2298 	 * actively using should be counted as local. This allows the
2299 	 * scan rate to slow down when a workload has settled down.
2300 	 */
2301 	if (!priv && !local && p->numa_group &&
2302 			node_isset(cpu_node, p->numa_group->active_nodes) &&
2303 			node_isset(mem_node, p->numa_group->active_nodes))
2304 		local = 1;
2305 
2306 	task_numa_placement(p);
2307 
2308 	/*
2309 	 * Retry task to preferred node migration periodically, in case it
2310 	 * case it previously failed, or the scheduler moved us.
2311 	 */
2312 	if (time_after(jiffies, p->numa_migrate_retry))
2313 		numa_migrate_preferred(p);
2314 
2315 	if (migrated)
2316 		p->numa_pages_migrated += pages;
2317 	if (flags & TNF_MIGRATE_FAIL)
2318 		p->numa_faults_locality[2] += pages;
2319 
2320 	p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
2321 	p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
2322 	p->numa_faults_locality[local] += pages;
2323 }
2324 
reset_ptenuma_scan(struct task_struct * p)2325 static void reset_ptenuma_scan(struct task_struct *p)
2326 {
2327 	/*
2328 	 * We only did a read acquisition of the mmap sem, so
2329 	 * p->mm->numa_scan_seq is written to without exclusive access
2330 	 * and the update is not guaranteed to be atomic. That's not
2331 	 * much of an issue though, since this is just used for
2332 	 * statistical sampling. Use READ_ONCE/WRITE_ONCE, which are not
2333 	 * expensive, to avoid any form of compiler optimizations:
2334 	 */
2335 	WRITE_ONCE(p->mm->numa_scan_seq, READ_ONCE(p->mm->numa_scan_seq) + 1);
2336 	p->mm->numa_scan_offset = 0;
2337 }
2338 
2339 /*
2340  * The expensive part of numa migration is done from task_work context.
2341  * Triggered from task_tick_numa().
2342  */
task_numa_work(struct callback_head * work)2343 void task_numa_work(struct callback_head *work)
2344 {
2345 	unsigned long migrate, next_scan, now = jiffies;
2346 	struct task_struct *p = current;
2347 	struct mm_struct *mm = p->mm;
2348 	struct vm_area_struct *vma;
2349 	unsigned long start, end;
2350 	unsigned long nr_pte_updates = 0;
2351 	long pages, virtpages;
2352 
2353 	WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
2354 
2355 	work->next = work; /* protect against double add */
2356 	/*
2357 	 * Who cares about NUMA placement when they're dying.
2358 	 *
2359 	 * NOTE: make sure not to dereference p->mm before this check,
2360 	 * exit_task_work() happens _after_ exit_mm() so we could be called
2361 	 * without p->mm even though we still had it when we enqueued this
2362 	 * work.
2363 	 */
2364 	if (p->flags & PF_EXITING)
2365 		return;
2366 
2367 	if (!mm->numa_next_scan) {
2368 		mm->numa_next_scan = now +
2369 			msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2370 	}
2371 
2372 	/*
2373 	 * Enforce maximal scan/migration frequency..
2374 	 */
2375 	migrate = mm->numa_next_scan;
2376 	if (time_before(now, migrate))
2377 		return;
2378 
2379 	if (p->numa_scan_period == 0) {
2380 		p->numa_scan_period_max = task_scan_max(p);
2381 		p->numa_scan_period = task_scan_min(p);
2382 	}
2383 
2384 	next_scan = now + msecs_to_jiffies(p->numa_scan_period);
2385 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
2386 		return;
2387 
2388 	/*
2389 	 * Delay this task enough that another task of this mm will likely win
2390 	 * the next time around.
2391 	 */
2392 	p->node_stamp += 2 * TICK_NSEC;
2393 
2394 	start = mm->numa_scan_offset;
2395 	pages = sysctl_numa_balancing_scan_size;
2396 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
2397 	virtpages = pages * 8;	   /* Scan up to this much virtual space */
2398 	if (!pages)
2399 		return;
2400 
2401 
2402 	if (!down_read_trylock(&mm->mmap_sem))
2403 		return;
2404 	vma = find_vma(mm, start);
2405 	if (!vma) {
2406 		reset_ptenuma_scan(p);
2407 		start = 0;
2408 		vma = mm->mmap;
2409 	}
2410 	for (; vma; vma = vma->vm_next) {
2411 		if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
2412 			is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) {
2413 			continue;
2414 		}
2415 
2416 		/*
2417 		 * Shared library pages mapped by multiple processes are not
2418 		 * migrated as it is expected they are cache replicated. Avoid
2419 		 * hinting faults in read-only file-backed mappings or the vdso
2420 		 * as migrating the pages will be of marginal benefit.
2421 		 */
2422 		if (!vma->vm_mm ||
2423 		    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
2424 			continue;
2425 
2426 		/*
2427 		 * Skip inaccessible VMAs to avoid any confusion between
2428 		 * PROT_NONE and NUMA hinting ptes
2429 		 */
2430 		if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
2431 			continue;
2432 
2433 		do {
2434 			start = max(start, vma->vm_start);
2435 			end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
2436 			end = min(end, vma->vm_end);
2437 			nr_pte_updates = change_prot_numa(vma, start, end);
2438 
2439 			/*
2440 			 * Try to scan sysctl_numa_balancing_size worth of
2441 			 * hpages that have at least one present PTE that
2442 			 * is not already pte-numa. If the VMA contains
2443 			 * areas that are unused or already full of prot_numa
2444 			 * PTEs, scan up to virtpages, to skip through those
2445 			 * areas faster.
2446 			 */
2447 			if (nr_pte_updates)
2448 				pages -= (end - start) >> PAGE_SHIFT;
2449 			virtpages -= (end - start) >> PAGE_SHIFT;
2450 
2451 			start = end;
2452 			if (pages <= 0 || virtpages <= 0)
2453 				goto out;
2454 
2455 			cond_resched();
2456 		} while (end != vma->vm_end);
2457 	}
2458 
2459 out:
2460 	/*
2461 	 * It is possible to reach the end of the VMA list but the last few
2462 	 * VMAs are not guaranteed to the vma_migratable. If they are not, we
2463 	 * would find the !migratable VMA on the next scan but not reset the
2464 	 * scanner to the start so check it now.
2465 	 */
2466 	if (vma)
2467 		mm->numa_scan_offset = start;
2468 	else
2469 		reset_ptenuma_scan(p);
2470 	up_read(&mm->mmap_sem);
2471 }
2472 
2473 /*
2474  * Drive the periodic memory faults..
2475  */
task_tick_numa(struct rq * rq,struct task_struct * curr)2476 void task_tick_numa(struct rq *rq, struct task_struct *curr)
2477 {
2478 	struct callback_head *work = &curr->numa_work;
2479 	u64 period, now;
2480 
2481 	/*
2482 	 * We don't care about NUMA placement if we don't have memory.
2483 	 */
2484 	if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
2485 		return;
2486 
2487 	/*
2488 	 * Using runtime rather than walltime has the dual advantage that
2489 	 * we (mostly) drive the selection from busy threads and that the
2490 	 * task needs to have done some actual work before we bother with
2491 	 * NUMA placement.
2492 	 */
2493 	now = curr->se.sum_exec_runtime;
2494 	period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
2495 
2496 	if (now > curr->node_stamp + period) {
2497 		if (!curr->node_stamp)
2498 			curr->numa_scan_period = task_scan_min(curr);
2499 		curr->node_stamp += period;
2500 
2501 		if (!time_before(jiffies, curr->mm->numa_next_scan)) {
2502 			init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
2503 			task_work_add(curr, work, true);
2504 		}
2505 	}
2506 }
2507 #else
task_tick_numa(struct rq * rq,struct task_struct * curr)2508 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
2509 {
2510 }
2511 
account_numa_enqueue(struct rq * rq,struct task_struct * p)2512 static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
2513 {
2514 }
2515 
account_numa_dequeue(struct rq * rq,struct task_struct * p)2516 static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
2517 {
2518 }
2519 #endif /* CONFIG_NUMA_BALANCING */
2520 
2521 static void
account_entity_enqueue(struct cfs_rq * cfs_rq,struct sched_entity * se)2522 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2523 {
2524 	update_load_add(&cfs_rq->load, se->load.weight);
2525 	if (!parent_entity(se))
2526 		update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
2527 #ifdef CONFIG_SMP
2528 	if (entity_is_task(se)) {
2529 		struct rq *rq = rq_of(cfs_rq);
2530 
2531 		account_numa_enqueue(rq, task_of(se));
2532 		list_add(&se->group_node, &rq->cfs_tasks);
2533 	}
2534 #endif
2535 	cfs_rq->nr_running++;
2536 }
2537 
2538 static void
account_entity_dequeue(struct cfs_rq * cfs_rq,struct sched_entity * se)2539 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
2540 {
2541 	update_load_sub(&cfs_rq->load, se->load.weight);
2542 	if (!parent_entity(se))
2543 		update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
2544 	if (entity_is_task(se)) {
2545 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
2546 		list_del_init(&se->group_node);
2547 	}
2548 	cfs_rq->nr_running--;
2549 }
2550 
2551 #ifdef CONFIG_FAIR_GROUP_SCHED
2552 # ifdef CONFIG_SMP
calc_cfs_shares(struct cfs_rq * cfs_rq,struct task_group * tg)2553 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2554 {
2555 	long tg_weight, load, shares;
2556 
2557 	/*
2558 	 * This really should be: cfs_rq->avg.load_avg, but instead we use
2559 	 * cfs_rq->load.weight, which is its upper bound. This helps ramp up
2560 	 * the shares for small weight interactive tasks.
2561 	 */
2562 	load = scale_load_down(cfs_rq->load.weight);
2563 
2564 	tg_weight = atomic_long_read(&tg->load_avg);
2565 
2566 	/* Ensure tg_weight >= load */
2567 	tg_weight -= cfs_rq->tg_load_avg_contrib;
2568 	tg_weight += load;
2569 
2570 	shares = (tg->shares * load);
2571 	if (tg_weight)
2572 		shares /= tg_weight;
2573 
2574 	if (shares < MIN_SHARES)
2575 		shares = MIN_SHARES;
2576 	if (shares > tg->shares)
2577 		shares = tg->shares;
2578 
2579 	return shares;
2580 }
2581 # else /* CONFIG_SMP */
calc_cfs_shares(struct cfs_rq * cfs_rq,struct task_group * tg)2582 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
2583 {
2584 	return tg->shares;
2585 }
2586 # endif /* CONFIG_SMP */
2587 
reweight_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,unsigned long weight)2588 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
2589 			    unsigned long weight)
2590 {
2591 	if (se->on_rq) {
2592 		/* commit outstanding execution time */
2593 		if (cfs_rq->curr == se)
2594 			update_curr(cfs_rq);
2595 		account_entity_dequeue(cfs_rq, se);
2596 	}
2597 
2598 	update_load_set(&se->load, weight);
2599 
2600 	if (se->on_rq)
2601 		account_entity_enqueue(cfs_rq, se);
2602 }
2603 
2604 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
2605 
update_cfs_shares(struct sched_entity * se)2606 static void update_cfs_shares(struct sched_entity *se)
2607 {
2608 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
2609 	struct task_group *tg;
2610 	long shares;
2611 
2612 	if (!cfs_rq)
2613 		return;
2614 
2615 	if (throttled_hierarchy(cfs_rq))
2616 		return;
2617 
2618 	tg = cfs_rq->tg;
2619 
2620 #ifndef CONFIG_SMP
2621 	if (likely(se->load.weight == tg->shares))
2622 		return;
2623 #endif
2624 	shares = calc_cfs_shares(cfs_rq, tg);
2625 
2626 	reweight_entity(cfs_rq_of(se), se, shares);
2627 }
2628 
2629 #else /* CONFIG_FAIR_GROUP_SCHED */
update_cfs_shares(struct sched_entity * se)2630 static inline void update_cfs_shares(struct sched_entity *se)
2631 {
2632 }
2633 #endif /* CONFIG_FAIR_GROUP_SCHED */
2634 
2635 #ifdef CONFIG_SMP
2636 /* Precomputed fixed inverse multiplies for multiplication by y^n */
2637 static const u32 runnable_avg_yN_inv[] = {
2638 	0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
2639 	0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
2640 	0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
2641 	0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
2642 	0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
2643 	0x85aac367, 0x82cd8698,
2644 };
2645 
2646 /*
2647  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
2648  * over-estimates when re-combining.
2649  */
2650 static const u32 runnable_avg_yN_sum[] = {
2651 	    0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
2652 	 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
2653 	17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
2654 };
2655 
2656 /*
2657  * Approximate:
2658  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
2659  */
decay_load(u64 val,u64 n)2660 static __always_inline u64 decay_load(u64 val, u64 n)
2661 {
2662 	unsigned int local_n;
2663 
2664 	if (!n)
2665 		return val;
2666 	else if (unlikely(n > LOAD_AVG_PERIOD * 63))
2667 		return 0;
2668 
2669 	/* after bounds checking we can collapse to 32-bit */
2670 	local_n = n;
2671 
2672 	/*
2673 	 * As y^PERIOD = 1/2, we can combine
2674 	 *    y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
2675 	 * With a look-up table which covers y^n (n<PERIOD)
2676 	 *
2677 	 * To achieve constant time decay_load.
2678 	 */
2679 	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
2680 		val >>= local_n / LOAD_AVG_PERIOD;
2681 		local_n %= LOAD_AVG_PERIOD;
2682 	}
2683 
2684 	val = mul_u64_u32_shr(val, runnable_avg_yN_inv[local_n], 32);
2685 	return val;
2686 }
2687 
2688 /*
2689  * For updates fully spanning n periods, the contribution to runnable
2690  * average will be: \Sum 1024*y^n
2691  *
2692  * We can compute this reasonably efficiently by combining:
2693  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
2694  */
__compute_runnable_contrib(u64 n)2695 static u32 __compute_runnable_contrib(u64 n)
2696 {
2697 	u32 contrib = 0;
2698 
2699 	if (likely(n <= LOAD_AVG_PERIOD))
2700 		return runnable_avg_yN_sum[n];
2701 	else if (unlikely(n >= LOAD_AVG_MAX_N))
2702 		return LOAD_AVG_MAX;
2703 
2704 	/* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
2705 	do {
2706 		contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
2707 		contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
2708 
2709 		n -= LOAD_AVG_PERIOD;
2710 	} while (n > LOAD_AVG_PERIOD);
2711 
2712 	contrib = decay_load(contrib, n);
2713 	return contrib + runnable_avg_yN_sum[n];
2714 }
2715 
2716 #if (SCHED_LOAD_SHIFT - SCHED_LOAD_RESOLUTION) != 10 || SCHED_CAPACITY_SHIFT != 10
2717 #error "load tracking assumes 2^10 as unit"
2718 #endif
2719 
2720 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
2721 
2722 /*
2723  * We can represent the historical contribution to runnable average as the
2724  * coefficients of a geometric series.  To do this we sub-divide our runnable
2725  * history into segments of approximately 1ms (1024us); label the segment that
2726  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
2727  *
2728  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
2729  *      p0            p1           p2
2730  *     (now)       (~1ms ago)  (~2ms ago)
2731  *
2732  * Let u_i denote the fraction of p_i that the entity was runnable.
2733  *
2734  * We then designate the fractions u_i as our co-efficients, yielding the
2735  * following representation of historical load:
2736  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
2737  *
2738  * We choose y based on the with of a reasonably scheduling period, fixing:
2739  *   y^32 = 0.5
2740  *
2741  * This means that the contribution to load ~32ms ago (u_32) will be weighted
2742  * approximately half as much as the contribution to load within the last ms
2743  * (u_0).
2744  *
2745  * When a period "rolls over" and we have new u_0`, multiplying the previous
2746  * sum again by y is sufficient to update:
2747  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
2748  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
2749  */
2750 static __always_inline int
__update_load_avg(u64 now,int cpu,struct sched_avg * sa,unsigned long weight,int running,struct cfs_rq * cfs_rq)2751 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
2752 		  unsigned long weight, int running, struct cfs_rq *cfs_rq)
2753 {
2754 	u64 delta, scaled_delta, periods;
2755 	u32 contrib;
2756 	unsigned int delta_w, scaled_delta_w, decayed = 0;
2757 	unsigned long scale_freq, scale_cpu;
2758 
2759 	delta = now - sa->last_update_time;
2760 	/*
2761 	 * This should only happen when time goes backwards, which it
2762 	 * unfortunately does during sched clock init when we swap over to TSC.
2763 	 */
2764 	if ((s64)delta < 0) {
2765 		sa->last_update_time = now;
2766 		return 0;
2767 	}
2768 
2769 	/*
2770 	 * Use 1024ns as the unit of measurement since it's a reasonable
2771 	 * approximation of 1us and fast to compute.
2772 	 */
2773 	delta >>= 10;
2774 	if (!delta)
2775 		return 0;
2776 	sa->last_update_time = now;
2777 
2778 	scale_freq = arch_scale_freq_capacity(NULL, cpu);
2779 	scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
2780 	trace_sched_contrib_scale_f(cpu, scale_freq, scale_cpu);
2781 
2782 	/* delta_w is the amount already accumulated against our next period */
2783 	delta_w = sa->period_contrib;
2784 	if (delta + delta_w >= 1024) {
2785 		decayed = 1;
2786 
2787 		/* how much left for next period will start over, we don't know yet */
2788 		sa->period_contrib = 0;
2789 
2790 		/*
2791 		 * Now that we know we're crossing a period boundary, figure
2792 		 * out how much from delta we need to complete the current
2793 		 * period and accrue it.
2794 		 */
2795 		delta_w = 1024 - delta_w;
2796 		scaled_delta_w = cap_scale(delta_w, scale_freq);
2797 		if (weight) {
2798 			sa->load_sum += weight * scaled_delta_w;
2799 			if (cfs_rq) {
2800 				cfs_rq->runnable_load_sum +=
2801 						weight * scaled_delta_w;
2802 			}
2803 		}
2804 		if (running)
2805 			sa->util_sum += scaled_delta_w * scale_cpu;
2806 
2807 		delta -= delta_w;
2808 
2809 		/* Figure out how many additional periods this update spans */
2810 		periods = delta / 1024;
2811 		delta %= 1024;
2812 
2813 		sa->load_sum = decay_load(sa->load_sum, periods + 1);
2814 		if (cfs_rq) {
2815 			cfs_rq->runnable_load_sum =
2816 				decay_load(cfs_rq->runnable_load_sum, periods + 1);
2817 		}
2818 		sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
2819 
2820 		/* Efficiently calculate \sum (1..n_period) 1024*y^i */
2821 		contrib = __compute_runnable_contrib(periods);
2822 		contrib = cap_scale(contrib, scale_freq);
2823 		if (weight) {
2824 			sa->load_sum += weight * contrib;
2825 			if (cfs_rq)
2826 				cfs_rq->runnable_load_sum += weight * contrib;
2827 		}
2828 		if (running)
2829 			sa->util_sum += contrib * scale_cpu;
2830 	}
2831 
2832 	/* Remainder of delta accrued against u_0` */
2833 	scaled_delta = cap_scale(delta, scale_freq);
2834 	if (weight) {
2835 		sa->load_sum += weight * scaled_delta;
2836 		if (cfs_rq)
2837 			cfs_rq->runnable_load_sum += weight * scaled_delta;
2838 	}
2839 	if (running)
2840 		sa->util_sum += scaled_delta * scale_cpu;
2841 
2842 	sa->period_contrib += delta;
2843 
2844 	if (decayed) {
2845 		sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
2846 		if (cfs_rq) {
2847 			cfs_rq->runnable_load_avg =
2848 				div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
2849 		}
2850 		sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
2851 	}
2852 
2853 	return decayed;
2854 }
2855 
2856 /*
2857  * Signed add and clamp on underflow.
2858  *
2859  * Explicitly do a load-store to ensure the intermediate value never hits
2860  * memory. This allows lockless observations without ever seeing the negative
2861  * values.
2862  */
2863 #define add_positive(_ptr, _val) do {                           \
2864 	typeof(_ptr) ptr = (_ptr);                              \
2865 	typeof(_val) val = (_val);                              \
2866 	typeof(*ptr) res, var = READ_ONCE(*ptr);                \
2867 								\
2868 	res = var + val;                                        \
2869 								\
2870 	if (val < 0 && res > var)                               \
2871 		res = 0;                                        \
2872 								\
2873 	WRITE_ONCE(*ptr, res);                                  \
2874 } while (0)
2875 
2876 #ifdef CONFIG_FAIR_GROUP_SCHED
2877 /**
2878  * update_tg_load_avg - update the tg's load avg
2879  * @cfs_rq: the cfs_rq whose avg changed
2880  * @force: update regardless of how small the difference
2881  *
2882  * This function 'ensures': tg->load_avg := \Sum tg->cfs_rq[]->avg.load.
2883  * However, because tg->load_avg is a global value there are performance
2884  * considerations.
2885  *
2886  * In order to avoid having to look at the other cfs_rq's, we use a
2887  * differential update where we store the last value we propagated. This in
2888  * turn allows skipping updates if the differential is 'small'.
2889  *
2890  * Updating tg's load_avg is necessary before update_cfs_share() (which is
2891  * done) and effective_load() (which is not done because it is too costly).
2892  */
update_tg_load_avg(struct cfs_rq * cfs_rq,int force)2893 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
2894 {
2895 	long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib;
2896 
2897 	/*
2898 	 * No need to update load_avg for root_task_group as it is not used.
2899 	 */
2900 	if (cfs_rq->tg == &root_task_group)
2901 		return;
2902 
2903 	if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) {
2904 		atomic_long_add(delta, &cfs_rq->tg->load_avg);
2905 		cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
2906 	}
2907 }
2908 
2909 /*
2910  * Called within set_task_rq() right before setting a task's cpu. The
2911  * caller only guarantees p->pi_lock is held; no other assumptions,
2912  * including the state of rq->lock, should be made.
2913  */
set_task_rq_fair(struct sched_entity * se,struct cfs_rq * prev,struct cfs_rq * next)2914 void set_task_rq_fair(struct sched_entity *se,
2915 		      struct cfs_rq *prev, struct cfs_rq *next)
2916 {
2917 	if (!sched_feat(ATTACH_AGE_LOAD))
2918 		return;
2919 
2920 	/*
2921 	 * We are supposed to update the task to "current" time, then its up to
2922 	 * date and ready to go to new CPU/cfs_rq. But we have difficulty in
2923 	 * getting what current time is, so simply throw away the out-of-date
2924 	 * time. This will result in the wakee task is less decayed, but giving
2925 	 * the wakee more load sounds not bad.
2926 	 */
2927 	if (se->avg.last_update_time && prev) {
2928 		u64 p_last_update_time;
2929 		u64 n_last_update_time;
2930 
2931 #ifndef CONFIG_64BIT
2932 		u64 p_last_update_time_copy;
2933 		u64 n_last_update_time_copy;
2934 
2935 		do {
2936 			p_last_update_time_copy = prev->load_last_update_time_copy;
2937 			n_last_update_time_copy = next->load_last_update_time_copy;
2938 
2939 			smp_rmb();
2940 
2941 			p_last_update_time = prev->avg.last_update_time;
2942 			n_last_update_time = next->avg.last_update_time;
2943 
2944 		} while (p_last_update_time != p_last_update_time_copy ||
2945 			 n_last_update_time != n_last_update_time_copy);
2946 #else
2947 		p_last_update_time = prev->avg.last_update_time;
2948 		n_last_update_time = next->avg.last_update_time;
2949 #endif
2950 		__update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
2951 				  &se->avg, 0, 0, NULL);
2952 		se->avg.last_update_time = n_last_update_time;
2953 	}
2954 }
2955 
2956 /* Take into account change of utilization of a child task group */
2957 static inline void
update_tg_cfs_util(struct cfs_rq * cfs_rq,struct sched_entity * se)2958 update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se)
2959 {
2960 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2961 	long delta = gcfs_rq->avg.util_avg - se->avg.util_avg;
2962 
2963 	/* Nothing to update */
2964 	if (!delta)
2965 		return;
2966 
2967 	/* Set new sched_entity's utilization */
2968 	se->avg.util_avg = gcfs_rq->avg.util_avg;
2969 	se->avg.util_sum = se->avg.util_avg * LOAD_AVG_MAX;
2970 
2971 	/* Update parent cfs_rq utilization */
2972 	add_positive(&cfs_rq->avg.util_avg, delta);
2973 	cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
2974 }
2975 
2976 /* Take into account change of load of a child task group */
2977 static inline void
update_tg_cfs_load(struct cfs_rq * cfs_rq,struct sched_entity * se)2978 update_tg_cfs_load(struct cfs_rq *cfs_rq, struct sched_entity *se)
2979 {
2980 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
2981 	long delta, load = gcfs_rq->avg.load_avg;
2982 
2983 	/*
2984 	 * If the load of group cfs_rq is null, the load of the
2985 	 * sched_entity will also be null so we can skip the formula
2986 	 */
2987 	if (load) {
2988 		long tg_load;
2989 
2990 		/* Get tg's load and ensure tg_load > 0 */
2991 		tg_load = atomic_long_read(&gcfs_rq->tg->load_avg) + 1;
2992 
2993 		/* Ensure tg_load >= load and updated with current load*/
2994 		tg_load -= gcfs_rq->tg_load_avg_contrib;
2995 		tg_load += load;
2996 
2997 		/*
2998 		 * We need to compute a correction term in the case that the
2999 		 * task group is consuming more CPU than a task of equal
3000 		 * weight. A task with a weight equals to tg->shares will have
3001 		 * a load less or equal to scale_load_down(tg->shares).
3002 		 * Similarly, the sched_entities that represent the task group
3003 		 * at parent level, can't have a load higher than
3004 		 * scale_load_down(tg->shares). And the Sum of sched_entities'
3005 		 * load must be <= scale_load_down(tg->shares).
3006 		 */
3007 		if (tg_load > scale_load_down(gcfs_rq->tg->shares)) {
3008 			/* scale gcfs_rq's load into tg's shares*/
3009 			load *= scale_load_down(gcfs_rq->tg->shares);
3010 			load /= tg_load;
3011 		}
3012 	}
3013 
3014 	delta = load - se->avg.load_avg;
3015 
3016 	/* Nothing to update */
3017 	if (!delta)
3018 		return;
3019 
3020 	/* Set new sched_entity's load */
3021 	se->avg.load_avg = load;
3022 	se->avg.load_sum = se->avg.load_avg * LOAD_AVG_MAX;
3023 
3024 	/* Update parent cfs_rq load */
3025 	add_positive(&cfs_rq->avg.load_avg, delta);
3026 	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * LOAD_AVG_MAX;
3027 
3028 	/*
3029 	 * If the sched_entity is already enqueued, we also have to update the
3030 	 * runnable load avg.
3031 	 */
3032 	if (se->on_rq) {
3033 		/* Update parent cfs_rq runnable_load_avg */
3034 		add_positive(&cfs_rq->runnable_load_avg, delta);
3035 		cfs_rq->runnable_load_sum = cfs_rq->runnable_load_avg * LOAD_AVG_MAX;
3036 	}
3037 }
3038 
set_tg_cfs_propagate(struct cfs_rq * cfs_rq)3039 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq)
3040 {
3041 	cfs_rq->propagate_avg = 1;
3042 }
3043 
test_and_clear_tg_cfs_propagate(struct sched_entity * se)3044 static inline int test_and_clear_tg_cfs_propagate(struct sched_entity *se)
3045 {
3046 	struct cfs_rq *cfs_rq = group_cfs_rq(se);
3047 
3048 	if (!cfs_rq->propagate_avg)
3049 		return 0;
3050 
3051 	cfs_rq->propagate_avg = 0;
3052 	return 1;
3053 }
3054 
3055 /* Update task and its cfs_rq load average */
propagate_entity_load_avg(struct sched_entity * se)3056 static inline int propagate_entity_load_avg(struct sched_entity *se)
3057 {
3058 	struct cfs_rq *cfs_rq;
3059 
3060 	if (entity_is_task(se))
3061 		return 0;
3062 
3063 	if (!test_and_clear_tg_cfs_propagate(se))
3064 		return 0;
3065 
3066 	cfs_rq = cfs_rq_of(se);
3067 
3068 	set_tg_cfs_propagate(cfs_rq);
3069 
3070 	update_tg_cfs_util(cfs_rq, se);
3071 	update_tg_cfs_load(cfs_rq, se);
3072 
3073 	return 1;
3074 }
3075 
3076 /*
3077  * Check if we need to update the load and the utilization of a blocked
3078  * group_entity:
3079  */
skip_blocked_update(struct sched_entity * se)3080 static inline bool skip_blocked_update(struct sched_entity *se)
3081 {
3082 	struct cfs_rq *gcfs_rq = group_cfs_rq(se);
3083 
3084 	/*
3085 	 * If sched_entity still have not zero load or utilization, we have to
3086 	 * decay it:
3087 	 */
3088 	if (se->avg.load_avg || se->avg.util_avg)
3089 		return false;
3090 
3091 	/*
3092 	 * If there is a pending propagation, we have to update the load and
3093 	 * the utilization of the sched_entity:
3094 	 */
3095 	if (gcfs_rq->propagate_avg)
3096 		return false;
3097 
3098 	/*
3099 	 * Otherwise, the load and the utilization of the sched_entity is
3100 	 * already zero and there is no pending propagation, so it will be a
3101 	 * waste of time to try to decay it:
3102 	 */
3103 	return true;
3104 }
3105 
3106 #else /* CONFIG_FAIR_GROUP_SCHED */
3107 
update_tg_load_avg(struct cfs_rq * cfs_rq,int force)3108 static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
3109 
propagate_entity_load_avg(struct sched_entity * se)3110 static inline int propagate_entity_load_avg(struct sched_entity *se)
3111 {
3112 	return 0;
3113 }
3114 
set_tg_cfs_propagate(struct cfs_rq * cfs_rq)3115 static inline void set_tg_cfs_propagate(struct cfs_rq *cfs_rq) {}
3116 
3117 #endif /* CONFIG_FAIR_GROUP_SCHED */
3118 
cfs_rq_util_change(struct cfs_rq * cfs_rq)3119 static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
3120 {
3121         if (&this_rq()->cfs == cfs_rq) {
3122                 /*
3123                  * There are a few boundary cases this might miss but it should
3124                  * get called often enough that that should (hopefully) not be
3125                  * a real problem -- added to that it only calls on the local
3126                  * CPU, so if we enqueue remotely we'll miss an update, but
3127                  * the next tick/schedule should update.
3128                  *
3129                  * It will not get called when we go idle, because the idle
3130                  * thread is a different class (!fair), nor will the utilization
3131                  * number include things like RT tasks.
3132                  *
3133                  * As is, the util number is not freq-invariant (we'd have to
3134                  * implement arch_scale_freq_capacity() for that).
3135                  *
3136                  * See cpu_util().
3137                  */
3138                 cpufreq_update_util(rq_of(cfs_rq), 0);
3139         }
3140 }
3141 
3142 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
3143 
3144 /*
3145  * Unsigned subtract and clamp on underflow.
3146  *
3147  * Explicitly do a load-store to ensure the intermediate value never hits
3148  * memory. This allows lockless observations without ever seeing the negative
3149  * values.
3150  */
3151 #define sub_positive(_ptr, _val) do {				\
3152 	typeof(_ptr) ptr = (_ptr);				\
3153 	typeof(*ptr) val = (_val);				\
3154 	typeof(*ptr) res, var = READ_ONCE(*ptr);		\
3155 	res = var - val;					\
3156 	if (res > var)						\
3157 		res = 0;					\
3158 	WRITE_ONCE(*ptr, res);					\
3159 } while (0)
3160 
3161 /**
3162  * update_cfs_rq_load_avg - update the cfs_rq's load/util averages
3163  * @now: current time, as per cfs_rq_clock_task()
3164  * @cfs_rq: cfs_rq to update
3165  * @update_freq: should we call cfs_rq_util_change() or will the call do so
3166  *
3167  * The cfs_rq avg is the direct sum of all its entities (blocked and runnable)
3168  * avg. The immediate corollary is that all (fair) tasks must be attached, see
3169  * post_init_entity_util_avg().
3170  *
3171  * cfs_rq->avg is used for task_h_load() and update_cfs_share() for example.
3172  *
3173  * Returns true if the load decayed or we removed load.
3174  *
3175  * Since both these conditions indicate a changed cfs_rq->avg.load we should
3176  * call update_tg_load_avg() when this function returns true.
3177  */
3178 static inline int
update_cfs_rq_load_avg(u64 now,struct cfs_rq * cfs_rq,bool update_freq)3179 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3180 {
3181 	struct sched_avg *sa = &cfs_rq->avg;
3182 	int decayed, removed = 0, removed_util = 0;
3183 
3184 	if (atomic_long_read(&cfs_rq->removed_load_avg)) {
3185 		s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
3186 		sub_positive(&sa->load_avg, r);
3187 		sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
3188 		removed = 1;
3189 		set_tg_cfs_propagate(cfs_rq);
3190 	}
3191 
3192 	if (atomic_long_read(&cfs_rq->removed_util_avg)) {
3193 		long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
3194 		sub_positive(&sa->util_avg, r);
3195 		sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
3196 		removed_util = 1;
3197 		set_tg_cfs_propagate(cfs_rq);
3198 	}
3199 
3200 	decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
3201 		scale_load_down(cfs_rq->load.weight), cfs_rq->curr != NULL, cfs_rq);
3202 
3203 #ifndef CONFIG_64BIT
3204 	smp_wmb();
3205 	cfs_rq->load_last_update_time_copy = sa->last_update_time;
3206 #endif
3207 
3208 	/* Trace CPU load, unless cfs_rq belongs to a non-root task_group */
3209 	if (cfs_rq == &rq_of(cfs_rq)->cfs)
3210 		trace_sched_load_avg_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
3211 
3212 	if (update_freq && (decayed || removed_util))
3213 		cfs_rq_util_change(cfs_rq);
3214 
3215 	return decayed || removed;
3216 }
3217 
3218 /*
3219  * Optional action to be done while updating the load average
3220  */
3221 #define UPDATE_TG	0x1
3222 #define SKIP_AGE_LOAD	0x2
3223 
3224 /* Update task and its cfs_rq load average */
update_load_avg(struct sched_entity * se,int flags)3225 static inline void update_load_avg(struct sched_entity *se, int flags)
3226 {
3227 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3228 	u64 now = cfs_rq_clock_task(cfs_rq);
3229 	int cpu = cpu_of(rq_of(cfs_rq));
3230 	int decayed;
3231 	void *ptr = NULL;
3232 
3233 	/*
3234 	 * Track task load average for carrying it to new CPU after migrated, and
3235 	 * track group sched_entity load average for task_h_load calc in migration
3236 	 */
3237 	if (se->avg.last_update_time && !(flags & SKIP_AGE_LOAD)) {
3238 		__update_load_avg(now, cpu, &se->avg,
3239 			  se->on_rq * scale_load_down(se->load.weight),
3240 			  cfs_rq->curr == se, NULL);
3241 	}
3242 
3243 	decayed  = update_cfs_rq_load_avg(now, cfs_rq, true);
3244 	decayed |= propagate_entity_load_avg(se);
3245 
3246 	if (decayed && (flags & UPDATE_TG))
3247 		update_tg_load_avg(cfs_rq, 0);
3248 
3249 	if (entity_is_task(se)) {
3250 #ifdef CONFIG_SCHED_WALT
3251 		ptr = (void *)&(task_of(se)->ravg);
3252 #endif
3253 		trace_sched_load_avg_task(task_of(se), &se->avg, ptr);
3254 	}
3255 }
3256 
3257 /**
3258  * attach_entity_load_avg - attach this entity to its cfs_rq load avg
3259  * @cfs_rq: cfs_rq to attach to
3260  * @se: sched_entity to attach
3261  *
3262  * Must call update_cfs_rq_load_avg() before this, since we rely on
3263  * cfs_rq->avg.last_update_time being current.
3264  */
attach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3265 static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3266 {
3267 	se->avg.last_update_time = cfs_rq->avg.last_update_time;
3268 	cfs_rq->avg.load_avg += se->avg.load_avg;
3269 	cfs_rq->avg.load_sum += se->avg.load_sum;
3270 	cfs_rq->avg.util_avg += se->avg.util_avg;
3271 	cfs_rq->avg.util_sum += se->avg.util_sum;
3272 	set_tg_cfs_propagate(cfs_rq);
3273 
3274 	cfs_rq_util_change(cfs_rq);
3275 }
3276 
3277 /**
3278  * detach_entity_load_avg - detach this entity from its cfs_rq load avg
3279  * @cfs_rq: cfs_rq to detach from
3280  * @se: sched_entity to detach
3281  *
3282  * Must call update_cfs_rq_load_avg() before this, since we rely on
3283  * cfs_rq->avg.last_update_time being current.
3284  */
detach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3285 static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3286 {
3287 
3288 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
3289 	sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
3290 	sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
3291 	sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
3292 	set_tg_cfs_propagate(cfs_rq);
3293 
3294 	cfs_rq_util_change(cfs_rq);
3295 }
3296 
3297 /* Add the load generated by se into cfs_rq's load average */
3298 static inline void
enqueue_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3299 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3300 {
3301 	struct sched_avg *sa = &se->avg;
3302 
3303 	cfs_rq->runnable_load_avg += sa->load_avg;
3304 	cfs_rq->runnable_load_sum += sa->load_sum;
3305 
3306 	if (!sa->last_update_time) {
3307 		attach_entity_load_avg(cfs_rq, se);
3308 		update_tg_load_avg(cfs_rq, 0);
3309 	}
3310 }
3311 
3312 /* Remove the runnable load generated by se from cfs_rq's runnable load average */
3313 static inline void
dequeue_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3314 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
3315 {
3316 	cfs_rq->runnable_load_avg =
3317 		max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
3318 	cfs_rq->runnable_load_sum =
3319 		max_t(s64,  cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
3320 }
3321 
3322 #ifndef CONFIG_64BIT
cfs_rq_last_update_time(struct cfs_rq * cfs_rq)3323 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3324 {
3325 	u64 last_update_time_copy;
3326 	u64 last_update_time;
3327 
3328 	do {
3329 		last_update_time_copy = cfs_rq->load_last_update_time_copy;
3330 		smp_rmb();
3331 		last_update_time = cfs_rq->avg.last_update_time;
3332 	} while (last_update_time != last_update_time_copy);
3333 
3334 	return last_update_time;
3335 }
3336 #else
cfs_rq_last_update_time(struct cfs_rq * cfs_rq)3337 static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq)
3338 {
3339 	return cfs_rq->avg.last_update_time;
3340 }
3341 #endif
3342 
3343 /*
3344  * Synchronize entity load avg of dequeued entity without locking
3345  * the previous rq.
3346  */
sync_entity_load_avg(struct sched_entity * se)3347 void sync_entity_load_avg(struct sched_entity *se)
3348 {
3349 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3350 	u64 last_update_time;
3351 
3352 	last_update_time = cfs_rq_last_update_time(cfs_rq);
3353 	__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
3354 }
3355 
3356 /*
3357  * Task first catches up with cfs_rq, and then subtract
3358  * itself from the cfs_rq (task must be off the queue now).
3359  */
remove_entity_load_avg(struct sched_entity * se)3360 void remove_entity_load_avg(struct sched_entity *se)
3361 {
3362 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
3363 
3364 	/*
3365 	 * tasks cannot exit without having gone through wake_up_new_task() ->
3366 	 * post_init_entity_util_avg() which will have added things to the
3367 	 * cfs_rq, so we can remove unconditionally.
3368 	 *
3369 	 * Similarly for groups, they will have passed through
3370 	 * post_init_entity_util_avg() before unregister_sched_fair_group()
3371 	 * calls this.
3372 	 */
3373 
3374 	sync_entity_load_avg(se);
3375 	atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
3376 	atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
3377 }
3378 
3379 /*
3380  * Update the rq's load with the elapsed running time before entering
3381  * idle. if the last scheduled task is not a CFS task, idle_enter will
3382  * be the only way to update the runnable statistic.
3383  */
idle_enter_fair(struct rq * this_rq)3384 void idle_enter_fair(struct rq *this_rq)
3385 {
3386 }
3387 
3388 /*
3389  * Update the rq's load with the elapsed idle time before a task is
3390  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
3391  * be the only way to update the runnable statistic.
3392  */
idle_exit_fair(struct rq * this_rq)3393 void idle_exit_fair(struct rq *this_rq)
3394 {
3395 }
3396 
cfs_rq_runnable_load_avg(struct cfs_rq * cfs_rq)3397 static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq)
3398 {
3399 	return cfs_rq->runnable_load_avg;
3400 }
3401 
cfs_rq_load_avg(struct cfs_rq * cfs_rq)3402 static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
3403 {
3404 	return cfs_rq->avg.load_avg;
3405 }
3406 
3407 static int idle_balance(struct rq *this_rq);
3408 
3409 #else /* CONFIG_SMP */
3410 
3411 static inline int
update_cfs_rq_load_avg(u64 now,struct cfs_rq * cfs_rq,bool update_freq)3412 update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
3413 {
3414 	return 0;
3415 }
3416 
3417 #define UPDATE_TG	0x0
3418 #define SKIP_AGE_LOAD	0x0
3419 
update_load_avg(struct sched_entity * se,int not_used1)3420 static inline void update_load_avg(struct sched_entity *se, int not_used1){}
3421 static inline void
enqueue_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3422 enqueue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3423 static inline void
dequeue_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3424 dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
remove_entity_load_avg(struct sched_entity * se)3425 static inline void remove_entity_load_avg(struct sched_entity *se) {}
3426 
3427 static inline void
attach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3428 attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3429 static inline void
detach_entity_load_avg(struct cfs_rq * cfs_rq,struct sched_entity * se)3430 detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) {}
3431 
idle_balance(struct rq * rq)3432 static inline int idle_balance(struct rq *rq)
3433 {
3434 	return 0;
3435 }
3436 
3437 #endif /* CONFIG_SMP */
3438 
enqueue_sleeper(struct cfs_rq * cfs_rq,struct sched_entity * se)3439 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
3440 {
3441 #ifdef CONFIG_SCHEDSTATS
3442 	struct task_struct *tsk = NULL;
3443 
3444 	if (entity_is_task(se))
3445 		tsk = task_of(se);
3446 
3447 	if (se->statistics.sleep_start) {
3448 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.sleep_start;
3449 
3450 		if ((s64)delta < 0)
3451 			delta = 0;
3452 
3453 		if (unlikely(delta > se->statistics.sleep_max))
3454 			se->statistics.sleep_max = delta;
3455 
3456 		se->statistics.sleep_start = 0;
3457 		se->statistics.sum_sleep_runtime += delta;
3458 
3459 		if (tsk) {
3460 			account_scheduler_latency(tsk, delta >> 10, 1);
3461 			trace_sched_stat_sleep(tsk, delta);
3462 		}
3463 	}
3464 	if (se->statistics.block_start) {
3465 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics.block_start;
3466 
3467 		if ((s64)delta < 0)
3468 			delta = 0;
3469 
3470 		if (unlikely(delta > se->statistics.block_max))
3471 			se->statistics.block_max = delta;
3472 
3473 		se->statistics.block_start = 0;
3474 		se->statistics.sum_sleep_runtime += delta;
3475 
3476 		if (tsk) {
3477 			if (tsk->in_iowait) {
3478 				se->statistics.iowait_sum += delta;
3479 				se->statistics.iowait_count++;
3480 				trace_sched_stat_iowait(tsk, delta);
3481 			}
3482 
3483 			trace_sched_stat_blocked(tsk, delta);
3484 			trace_sched_blocked_reason(tsk);
3485 
3486 			/*
3487 			 * Blocking time is in units of nanosecs, so shift by
3488 			 * 20 to get a milliseconds-range estimation of the
3489 			 * amount of time that the task spent sleeping:
3490 			 */
3491 			if (unlikely(prof_on == SLEEP_PROFILING)) {
3492 				profile_hits(SLEEP_PROFILING,
3493 						(void *)get_wchan(tsk),
3494 						delta >> 20);
3495 			}
3496 			account_scheduler_latency(tsk, delta >> 10, 0);
3497 		}
3498 	}
3499 #endif
3500 }
3501 
check_spread(struct cfs_rq * cfs_rq,struct sched_entity * se)3502 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
3503 {
3504 #ifdef CONFIG_SCHED_DEBUG
3505 	s64 d = se->vruntime - cfs_rq->min_vruntime;
3506 
3507 	if (d < 0)
3508 		d = -d;
3509 
3510 	if (d > 3*sysctl_sched_latency)
3511 		schedstat_inc(cfs_rq, nr_spread_over);
3512 #endif
3513 }
3514 
3515 static void
place_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int initial)3516 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
3517 {
3518 	u64 vruntime = cfs_rq->min_vruntime;
3519 
3520 	/*
3521 	 * The 'current' period is already promised to the current tasks,
3522 	 * however the extra weight of the new task will slow them down a
3523 	 * little, place the new task so that it fits in the slot that
3524 	 * stays open at the end.
3525 	 */
3526 	if (initial && sched_feat(START_DEBIT))
3527 		vruntime += sched_vslice(cfs_rq, se);
3528 
3529 	/* sleeps up to a single latency don't count. */
3530 	if (!initial) {
3531 		unsigned long thresh = sysctl_sched_latency;
3532 
3533 		/*
3534 		 * Halve their sleep time's effect, to allow
3535 		 * for a gentler effect of sleepers:
3536 		 */
3537 		if (sched_feat(GENTLE_FAIR_SLEEPERS))
3538 			thresh >>= 1;
3539 
3540 		vruntime -= thresh;
3541 	}
3542 
3543 	/* ensure we never gain time by being placed backwards. */
3544 	se->vruntime = max_vruntime(se->vruntime, vruntime);
3545 }
3546 
3547 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
3548 
3549 static void
enqueue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)3550 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3551 {
3552 	/*
3553 	 * Update the normalized vruntime before updating min_vruntime
3554 	 * through calling update_curr().
3555 	 */
3556 	if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
3557 		se->vruntime += cfs_rq->min_vruntime;
3558 
3559 	/*
3560 	 * Update run-time statistics of the 'current'.
3561 	 */
3562 	update_curr(cfs_rq);
3563 	update_load_avg(se, UPDATE_TG);
3564 	enqueue_entity_load_avg(cfs_rq, se);
3565 	update_cfs_shares(se);
3566 	account_entity_enqueue(cfs_rq, se);
3567 
3568 	if (flags & ENQUEUE_WAKEUP) {
3569 		place_entity(cfs_rq, se, 0);
3570 		enqueue_sleeper(cfs_rq, se);
3571 	}
3572 
3573 	update_stats_enqueue(cfs_rq, se);
3574 	check_spread(cfs_rq, se);
3575 	if (se != cfs_rq->curr)
3576 		__enqueue_entity(cfs_rq, se);
3577 	se->on_rq = 1;
3578 
3579 	if (cfs_rq->nr_running == 1) {
3580 		list_add_leaf_cfs_rq(cfs_rq);
3581 		check_enqueue_throttle(cfs_rq);
3582 	}
3583 }
3584 
__clear_buddies_last(struct sched_entity * se)3585 static void __clear_buddies_last(struct sched_entity *se)
3586 {
3587 	for_each_sched_entity(se) {
3588 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3589 		if (cfs_rq->last != se)
3590 			break;
3591 
3592 		cfs_rq->last = NULL;
3593 	}
3594 }
3595 
__clear_buddies_next(struct sched_entity * se)3596 static void __clear_buddies_next(struct sched_entity *se)
3597 {
3598 	for_each_sched_entity(se) {
3599 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3600 		if (cfs_rq->next != se)
3601 			break;
3602 
3603 		cfs_rq->next = NULL;
3604 	}
3605 }
3606 
__clear_buddies_skip(struct sched_entity * se)3607 static void __clear_buddies_skip(struct sched_entity *se)
3608 {
3609 	for_each_sched_entity(se) {
3610 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
3611 		if (cfs_rq->skip != se)
3612 			break;
3613 
3614 		cfs_rq->skip = NULL;
3615 	}
3616 }
3617 
clear_buddies(struct cfs_rq * cfs_rq,struct sched_entity * se)3618 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
3619 {
3620 	if (cfs_rq->last == se)
3621 		__clear_buddies_last(se);
3622 
3623 	if (cfs_rq->next == se)
3624 		__clear_buddies_next(se);
3625 
3626 	if (cfs_rq->skip == se)
3627 		__clear_buddies_skip(se);
3628 }
3629 
3630 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3631 
3632 static void
dequeue_entity(struct cfs_rq * cfs_rq,struct sched_entity * se,int flags)3633 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
3634 {
3635 	/*
3636 	 * Update run-time statistics of the 'current'.
3637 	 */
3638 	update_curr(cfs_rq);
3639 
3640 	/*
3641 	 * When dequeuing a sched_entity, we must:
3642 	 *   - Update loads to have both entity and cfs_rq synced with now.
3643 	 *   - Substract its load from the cfs_rq->runnable_avg.
3644 	 *   - Substract its previous weight from cfs_rq->load.weight.
3645 	 *   - For group entity, update its weight to reflect the new share
3646 	 *     of its group cfs_rq.
3647 	 */
3648 	update_load_avg(se, UPDATE_TG);
3649 	dequeue_entity_load_avg(cfs_rq, se);
3650 
3651 	update_stats_dequeue(cfs_rq, se);
3652 	if (flags & DEQUEUE_SLEEP) {
3653 #ifdef CONFIG_SCHEDSTATS
3654 		if (entity_is_task(se)) {
3655 			struct task_struct *tsk = task_of(se);
3656 
3657 			if (tsk->state & TASK_INTERRUPTIBLE)
3658 				se->statistics.sleep_start = rq_clock(rq_of(cfs_rq));
3659 			if (tsk->state & TASK_UNINTERRUPTIBLE)
3660 				se->statistics.block_start = rq_clock(rq_of(cfs_rq));
3661 		}
3662 #endif
3663 	}
3664 
3665 	clear_buddies(cfs_rq, se);
3666 
3667 	if (se != cfs_rq->curr)
3668 		__dequeue_entity(cfs_rq, se);
3669 	se->on_rq = 0;
3670 	account_entity_dequeue(cfs_rq, se);
3671 
3672 	/*
3673 	 * Normalize the entity after updating the min_vruntime because the
3674 	 * update can refer to the ->curr item and we need to reflect this
3675 	 * movement in our normalized position.
3676 	 */
3677 	if (!(flags & DEQUEUE_SLEEP))
3678 		se->vruntime -= cfs_rq->min_vruntime;
3679 
3680 	/* return excess runtime on last dequeue */
3681 	return_cfs_rq_runtime(cfs_rq);
3682 
3683 	update_min_vruntime(cfs_rq);
3684 	update_cfs_shares(se);
3685 }
3686 
3687 /*
3688  * Preempt the current task with a newly woken task if needed:
3689  */
3690 static void
check_preempt_tick(struct cfs_rq * cfs_rq,struct sched_entity * curr)3691 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3692 {
3693 	unsigned long ideal_runtime, delta_exec;
3694 	struct sched_entity *se;
3695 	s64 delta;
3696 
3697 	ideal_runtime = sched_slice(cfs_rq, curr);
3698 	delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
3699 	if (delta_exec > ideal_runtime) {
3700 		resched_curr(rq_of(cfs_rq));
3701 		/*
3702 		 * The current task ran long enough, ensure it doesn't get
3703 		 * re-elected due to buddy favours.
3704 		 */
3705 		clear_buddies(cfs_rq, curr);
3706 		return;
3707 	}
3708 
3709 	/*
3710 	 * Ensure that a task that missed wakeup preemption by a
3711 	 * narrow margin doesn't have to wait for a full slice.
3712 	 * This also mitigates buddy induced latencies under load.
3713 	 */
3714 	if (delta_exec < sysctl_sched_min_granularity)
3715 		return;
3716 
3717 	se = __pick_first_entity(cfs_rq);
3718 	delta = curr->vruntime - se->vruntime;
3719 
3720 	if (delta < 0)
3721 		return;
3722 
3723 	if (delta > ideal_runtime)
3724 		resched_curr(rq_of(cfs_rq));
3725 }
3726 
3727 static void
set_next_entity(struct cfs_rq * cfs_rq,struct sched_entity * se)3728 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
3729 {
3730 	/* 'current' is not kept within the tree. */
3731 	if (se->on_rq) {
3732 		/*
3733 		 * Any task has to be enqueued before it get to execute on
3734 		 * a CPU. So account for the time it spent waiting on the
3735 		 * runqueue.
3736 		 */
3737 		update_stats_wait_end(cfs_rq, se);
3738 		__dequeue_entity(cfs_rq, se);
3739 		update_load_avg(se, UPDATE_TG);
3740 	}
3741 
3742 	update_stats_curr_start(cfs_rq, se);
3743 	cfs_rq->curr = se;
3744 #ifdef CONFIG_SCHEDSTATS
3745 	/*
3746 	 * Track our maximum slice length, if the CPU's load is at
3747 	 * least twice that of our own weight (i.e. dont track it
3748 	 * when there are only lesser-weight tasks around):
3749 	 */
3750 	if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
3751 		se->statistics.slice_max = max(se->statistics.slice_max,
3752 			se->sum_exec_runtime - se->prev_sum_exec_runtime);
3753 	}
3754 #endif
3755 	se->prev_sum_exec_runtime = se->sum_exec_runtime;
3756 }
3757 
3758 static int
3759 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
3760 
3761 /*
3762  * Pick the next process, keeping these things in mind, in this order:
3763  * 1) keep things fair between processes/task groups
3764  * 2) pick the "next" process, since someone really wants that to run
3765  * 3) pick the "last" process, for cache locality
3766  * 4) do not run the "skip" process, if something else is available
3767  */
3768 static struct sched_entity *
pick_next_entity(struct cfs_rq * cfs_rq,struct sched_entity * curr)3769 pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr)
3770 {
3771 	struct sched_entity *left = __pick_first_entity(cfs_rq);
3772 	struct sched_entity *se;
3773 
3774 	/*
3775 	 * If curr is set we have to see if its left of the leftmost entity
3776 	 * still in the tree, provided there was anything in the tree at all.
3777 	 */
3778 	if (!left || (curr && entity_before(curr, left)))
3779 		left = curr;
3780 
3781 	se = left; /* ideally we run the leftmost entity */
3782 
3783 	/*
3784 	 * Avoid running the skip buddy, if running something else can
3785 	 * be done without getting too unfair.
3786 	 */
3787 	if (cfs_rq->skip == se) {
3788 		struct sched_entity *second;
3789 
3790 		if (se == curr) {
3791 			second = __pick_first_entity(cfs_rq);
3792 		} else {
3793 			second = __pick_next_entity(se);
3794 			if (!second || (curr && entity_before(curr, second)))
3795 				second = curr;
3796 		}
3797 
3798 		if (second && wakeup_preempt_entity(second, left) < 1)
3799 			se = second;
3800 	}
3801 
3802 	/*
3803 	 * Prefer last buddy, try to return the CPU to a preempted task.
3804 	 */
3805 	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
3806 		se = cfs_rq->last;
3807 
3808 	/*
3809 	 * Someone really wants this to run. If it's not unfair, run it.
3810 	 */
3811 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
3812 		se = cfs_rq->next;
3813 
3814 	clear_buddies(cfs_rq, se);
3815 
3816 	return se;
3817 }
3818 
3819 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
3820 
put_prev_entity(struct cfs_rq * cfs_rq,struct sched_entity * prev)3821 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
3822 {
3823 	/*
3824 	 * If still on the runqueue then deactivate_task()
3825 	 * was not called and update_curr() has to be done:
3826 	 */
3827 	if (prev->on_rq)
3828 		update_curr(cfs_rq);
3829 
3830 	/* throttle cfs_rqs exceeding runtime */
3831 	check_cfs_rq_runtime(cfs_rq);
3832 
3833 	check_spread(cfs_rq, prev);
3834 	if (prev->on_rq) {
3835 		update_stats_wait_start(cfs_rq, prev);
3836 		/* Put 'current' back into the tree. */
3837 		__enqueue_entity(cfs_rq, prev);
3838 		/* in !on_rq case, update occurred at dequeue */
3839 		update_load_avg(prev, 0);
3840 	}
3841 	cfs_rq->curr = NULL;
3842 }
3843 
3844 static void
entity_tick(struct cfs_rq * cfs_rq,struct sched_entity * curr,int queued)3845 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
3846 {
3847 	/*
3848 	 * Update run-time statistics of the 'current'.
3849 	 */
3850 	update_curr(cfs_rq);
3851 
3852 	/*
3853 	 * Ensure that runnable average is periodically updated.
3854 	 */
3855 	update_load_avg(curr, UPDATE_TG);
3856 	update_cfs_shares(curr);
3857 
3858 #ifdef CONFIG_SCHED_HRTICK
3859 	/*
3860 	 * queued ticks are scheduled to match the slice, so don't bother
3861 	 * validating it and just reschedule.
3862 	 */
3863 	if (queued) {
3864 		resched_curr(rq_of(cfs_rq));
3865 		return;
3866 	}
3867 	/*
3868 	 * don't let the period tick interfere with the hrtick preemption
3869 	 */
3870 	if (!sched_feat(DOUBLE_TICK) &&
3871 			hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
3872 		return;
3873 #endif
3874 
3875 	if (cfs_rq->nr_running > 1)
3876 		check_preempt_tick(cfs_rq, curr);
3877 }
3878 
3879 
3880 /**************************************************
3881  * CFS bandwidth control machinery
3882  */
3883 
3884 #ifdef CONFIG_CFS_BANDWIDTH
3885 
3886 #ifdef HAVE_JUMP_LABEL
3887 static struct static_key __cfs_bandwidth_used;
3888 
cfs_bandwidth_used(void)3889 static inline bool cfs_bandwidth_used(void)
3890 {
3891 	return static_key_false(&__cfs_bandwidth_used);
3892 }
3893 
cfs_bandwidth_usage_inc(void)3894 void cfs_bandwidth_usage_inc(void)
3895 {
3896 	static_key_slow_inc(&__cfs_bandwidth_used);
3897 }
3898 
cfs_bandwidth_usage_dec(void)3899 void cfs_bandwidth_usage_dec(void)
3900 {
3901 	static_key_slow_dec(&__cfs_bandwidth_used);
3902 }
3903 #else /* HAVE_JUMP_LABEL */
cfs_bandwidth_used(void)3904 static bool cfs_bandwidth_used(void)
3905 {
3906 	return true;
3907 }
3908 
cfs_bandwidth_usage_inc(void)3909 void cfs_bandwidth_usage_inc(void) {}
cfs_bandwidth_usage_dec(void)3910 void cfs_bandwidth_usage_dec(void) {}
3911 #endif /* HAVE_JUMP_LABEL */
3912 
3913 /*
3914  * default period for cfs group bandwidth.
3915  * default: 0.1s, units: nanoseconds
3916  */
default_cfs_period(void)3917 static inline u64 default_cfs_period(void)
3918 {
3919 	return 100000000ULL;
3920 }
3921 
sched_cfs_bandwidth_slice(void)3922 static inline u64 sched_cfs_bandwidth_slice(void)
3923 {
3924 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
3925 }
3926 
3927 /*
3928  * Replenish runtime according to assigned quota and update expiration time.
3929  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
3930  * additional synchronization around rq->lock.
3931  *
3932  * requires cfs_b->lock
3933  */
__refill_cfs_bandwidth_runtime(struct cfs_bandwidth * cfs_b)3934 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
3935 {
3936 	u64 now;
3937 
3938 	if (cfs_b->quota == RUNTIME_INF)
3939 		return;
3940 
3941 	now = sched_clock_cpu(smp_processor_id());
3942 	cfs_b->runtime = cfs_b->quota;
3943 	cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
3944 }
3945 
tg_cfs_bandwidth(struct task_group * tg)3946 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3947 {
3948 	return &tg->cfs_bandwidth;
3949 }
3950 
3951 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
cfs_rq_clock_task(struct cfs_rq * cfs_rq)3952 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3953 {
3954 	if (unlikely(cfs_rq->throttle_count))
3955 		return cfs_rq->throttled_clock_task;
3956 
3957 	return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
3958 }
3959 
3960 /* returns 0 on failure to allocate runtime */
assign_cfs_rq_runtime(struct cfs_rq * cfs_rq)3961 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3962 {
3963 	struct task_group *tg = cfs_rq->tg;
3964 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
3965 	u64 amount = 0, min_amount, expires;
3966 
3967 	/* note: this is a positive sum as runtime_remaining <= 0 */
3968 	min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
3969 
3970 	raw_spin_lock(&cfs_b->lock);
3971 	if (cfs_b->quota == RUNTIME_INF)
3972 		amount = min_amount;
3973 	else {
3974 		start_cfs_bandwidth(cfs_b);
3975 
3976 		if (cfs_b->runtime > 0) {
3977 			amount = min(cfs_b->runtime, min_amount);
3978 			cfs_b->runtime -= amount;
3979 			cfs_b->idle = 0;
3980 		}
3981 	}
3982 	expires = cfs_b->runtime_expires;
3983 	raw_spin_unlock(&cfs_b->lock);
3984 
3985 	cfs_rq->runtime_remaining += amount;
3986 	/*
3987 	 * we may have advanced our local expiration to account for allowed
3988 	 * spread between our sched_clock and the one on which runtime was
3989 	 * issued.
3990 	 */
3991 	if ((s64)(expires - cfs_rq->runtime_expires) > 0)
3992 		cfs_rq->runtime_expires = expires;
3993 
3994 	return cfs_rq->runtime_remaining > 0;
3995 }
3996 
3997 /*
3998  * Note: This depends on the synchronization provided by sched_clock and the
3999  * fact that rq->clock snapshots this value.
4000  */
expire_cfs_rq_runtime(struct cfs_rq * cfs_rq)4001 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4002 {
4003 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4004 
4005 	/* if the deadline is ahead of our clock, nothing to do */
4006 	if (likely((s64)(rq_clock(rq_of(cfs_rq)) - cfs_rq->runtime_expires) < 0))
4007 		return;
4008 
4009 	if (cfs_rq->runtime_remaining < 0)
4010 		return;
4011 
4012 	/*
4013 	 * If the local deadline has passed we have to consider the
4014 	 * possibility that our sched_clock is 'fast' and the global deadline
4015 	 * has not truly expired.
4016 	 *
4017 	 * Fortunately we can check determine whether this the case by checking
4018 	 * whether the global deadline has advanced. It is valid to compare
4019 	 * cfs_b->runtime_expires without any locks since we only care about
4020 	 * exact equality, so a partial write will still work.
4021 	 */
4022 
4023 	if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
4024 		/* extend local deadline, drift is bounded above by 2 ticks */
4025 		cfs_rq->runtime_expires += TICK_NSEC;
4026 	} else {
4027 		/* global deadline is ahead, expiration has passed */
4028 		cfs_rq->runtime_remaining = 0;
4029 	}
4030 }
4031 
__account_cfs_rq_runtime(struct cfs_rq * cfs_rq,u64 delta_exec)4032 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4033 {
4034 	/* dock delta_exec before expiring quota (as it could span periods) */
4035 	cfs_rq->runtime_remaining -= delta_exec;
4036 	expire_cfs_rq_runtime(cfs_rq);
4037 
4038 	if (likely(cfs_rq->runtime_remaining > 0))
4039 		return;
4040 
4041 	/*
4042 	 * if we're unable to extend our runtime we resched so that the active
4043 	 * hierarchy can be throttled
4044 	 */
4045 	if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
4046 		resched_curr(rq_of(cfs_rq));
4047 }
4048 
4049 static __always_inline
account_cfs_rq_runtime(struct cfs_rq * cfs_rq,u64 delta_exec)4050 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
4051 {
4052 	if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
4053 		return;
4054 
4055 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
4056 }
4057 
cfs_rq_throttled(struct cfs_rq * cfs_rq)4058 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4059 {
4060 	return cfs_bandwidth_used() && cfs_rq->throttled;
4061 }
4062 
4063 /* check whether cfs_rq, or any parent, is throttled */
throttled_hierarchy(struct cfs_rq * cfs_rq)4064 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4065 {
4066 	return cfs_bandwidth_used() && cfs_rq->throttle_count;
4067 }
4068 
4069 /*
4070  * Ensure that neither of the group entities corresponding to src_cpu or
4071  * dest_cpu are members of a throttled hierarchy when performing group
4072  * load-balance operations.
4073  */
throttled_lb_pair(struct task_group * tg,int src_cpu,int dest_cpu)4074 static inline int throttled_lb_pair(struct task_group *tg,
4075 				    int src_cpu, int dest_cpu)
4076 {
4077 	struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
4078 
4079 	src_cfs_rq = tg->cfs_rq[src_cpu];
4080 	dest_cfs_rq = tg->cfs_rq[dest_cpu];
4081 
4082 	return throttled_hierarchy(src_cfs_rq) ||
4083 	       throttled_hierarchy(dest_cfs_rq);
4084 }
4085 
4086 /* updated child weight may affect parent so we have to do this bottom up */
tg_unthrottle_up(struct task_group * tg,void * data)4087 static int tg_unthrottle_up(struct task_group *tg, void *data)
4088 {
4089 	struct rq *rq = data;
4090 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4091 
4092 	cfs_rq->throttle_count--;
4093 #ifdef CONFIG_SMP
4094 	if (!cfs_rq->throttle_count) {
4095 		/* adjust cfs_rq_clock_task() */
4096 		cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
4097 					     cfs_rq->throttled_clock_task;
4098 	}
4099 #endif
4100 
4101 	return 0;
4102 }
4103 
tg_throttle_down(struct task_group * tg,void * data)4104 static int tg_throttle_down(struct task_group *tg, void *data)
4105 {
4106 	struct rq *rq = data;
4107 	struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
4108 
4109 	/* group is entering throttled state, stop time */
4110 	if (!cfs_rq->throttle_count)
4111 		cfs_rq->throttled_clock_task = rq_clock_task(rq);
4112 	cfs_rq->throttle_count++;
4113 
4114 	return 0;
4115 }
4116 
throttle_cfs_rq(struct cfs_rq * cfs_rq)4117 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
4118 {
4119 	struct rq *rq = rq_of(cfs_rq);
4120 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4121 	struct sched_entity *se;
4122 	long task_delta, dequeue = 1;
4123 	bool empty;
4124 
4125 	se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
4126 
4127 	/* freeze hierarchy runnable averages while throttled */
4128 	rcu_read_lock();
4129 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
4130 	rcu_read_unlock();
4131 
4132 	task_delta = cfs_rq->h_nr_running;
4133 	for_each_sched_entity(se) {
4134 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
4135 		/* throttled entity or throttle-on-deactivate */
4136 		if (!se->on_rq)
4137 			break;
4138 
4139 		if (dequeue)
4140 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
4141 		qcfs_rq->h_nr_running -= task_delta;
4142 
4143 		if (qcfs_rq->load.weight)
4144 			dequeue = 0;
4145 	}
4146 
4147 	if (!se)
4148 		sub_nr_running(rq, task_delta);
4149 
4150 	cfs_rq->throttled = 1;
4151 	cfs_rq->throttled_clock = rq_clock(rq);
4152 	raw_spin_lock(&cfs_b->lock);
4153 	empty = list_empty(&cfs_b->throttled_cfs_rq);
4154 
4155 	/*
4156 	 * Add to the _head_ of the list, so that an already-started
4157 	 * distribute_cfs_runtime will not see us. If disribute_cfs_runtime is
4158 	 * not running add to the tail so that later runqueues don't get starved.
4159 	 */
4160 	if (cfs_b->distribute_running)
4161 		list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4162 	else
4163 		list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
4164 
4165 	/*
4166 	 * If we're the first throttled task, make sure the bandwidth
4167 	 * timer is running.
4168 	 */
4169 	if (empty)
4170 		start_cfs_bandwidth(cfs_b);
4171 
4172 	raw_spin_unlock(&cfs_b->lock);
4173 }
4174 
unthrottle_cfs_rq(struct cfs_rq * cfs_rq)4175 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
4176 {
4177 	struct rq *rq = rq_of(cfs_rq);
4178 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4179 	struct sched_entity *se;
4180 	int enqueue = 1;
4181 	long task_delta;
4182 
4183 	se = cfs_rq->tg->se[cpu_of(rq)];
4184 
4185 	cfs_rq->throttled = 0;
4186 
4187 	update_rq_clock(rq);
4188 
4189 	raw_spin_lock(&cfs_b->lock);
4190 	cfs_b->throttled_time += rq_clock(rq) - cfs_rq->throttled_clock;
4191 	list_del_rcu(&cfs_rq->throttled_list);
4192 	raw_spin_unlock(&cfs_b->lock);
4193 
4194 	/* update hierarchical throttle state */
4195 	walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
4196 
4197 	if (!cfs_rq->load.weight)
4198 		return;
4199 
4200 	task_delta = cfs_rq->h_nr_running;
4201 	for_each_sched_entity(se) {
4202 		if (se->on_rq)
4203 			enqueue = 0;
4204 
4205 		cfs_rq = cfs_rq_of(se);
4206 		if (enqueue)
4207 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
4208 		cfs_rq->h_nr_running += task_delta;
4209 
4210 		if (cfs_rq_throttled(cfs_rq))
4211 			break;
4212 	}
4213 
4214 	if (!se)
4215 		add_nr_running(rq, task_delta);
4216 
4217 	/* determine whether we need to wake up potentially idle cpu */
4218 	if (rq->curr == rq->idle && rq->cfs.nr_running)
4219 		resched_curr(rq);
4220 }
4221 
distribute_cfs_runtime(struct cfs_bandwidth * cfs_b,u64 remaining,u64 expires)4222 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
4223 		u64 remaining, u64 expires)
4224 {
4225 	struct cfs_rq *cfs_rq;
4226 	u64 runtime;
4227 	u64 starting_runtime = remaining;
4228 
4229 	rcu_read_lock();
4230 	list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
4231 				throttled_list) {
4232 		struct rq *rq = rq_of(cfs_rq);
4233 
4234 		raw_spin_lock(&rq->lock);
4235 		if (!cfs_rq_throttled(cfs_rq))
4236 			goto next;
4237 
4238 		runtime = -cfs_rq->runtime_remaining + 1;
4239 		if (runtime > remaining)
4240 			runtime = remaining;
4241 		remaining -= runtime;
4242 
4243 		cfs_rq->runtime_remaining += runtime;
4244 		cfs_rq->runtime_expires = expires;
4245 
4246 		/* we check whether we're throttled above */
4247 		if (cfs_rq->runtime_remaining > 0)
4248 			unthrottle_cfs_rq(cfs_rq);
4249 
4250 next:
4251 		raw_spin_unlock(&rq->lock);
4252 
4253 		if (!remaining)
4254 			break;
4255 	}
4256 	rcu_read_unlock();
4257 
4258 	return starting_runtime - remaining;
4259 }
4260 
4261 /*
4262  * Responsible for refilling a task_group's bandwidth and unthrottling its
4263  * cfs_rqs as appropriate. If there has been no activity within the last
4264  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
4265  * used to track this state.
4266  */
do_sched_cfs_period_timer(struct cfs_bandwidth * cfs_b,int overrun)4267 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
4268 {
4269 	u64 runtime, runtime_expires;
4270 	int throttled;
4271 
4272 	/* no need to continue the timer with no bandwidth constraint */
4273 	if (cfs_b->quota == RUNTIME_INF)
4274 		goto out_deactivate;
4275 
4276 	throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4277 	cfs_b->nr_periods += overrun;
4278 
4279 	/*
4280 	 * idle depends on !throttled (for the case of a large deficit), and if
4281 	 * we're going inactive then everything else can be deferred
4282 	 */
4283 	if (cfs_b->idle && !throttled)
4284 		goto out_deactivate;
4285 
4286 	__refill_cfs_bandwidth_runtime(cfs_b);
4287 
4288 	if (!throttled) {
4289 		/* mark as potentially idle for the upcoming period */
4290 		cfs_b->idle = 1;
4291 		return 0;
4292 	}
4293 
4294 	/* account preceding periods in which throttling occurred */
4295 	cfs_b->nr_throttled += overrun;
4296 
4297 	runtime_expires = cfs_b->runtime_expires;
4298 
4299 	/*
4300 	 * This check is repeated as we are holding onto the new bandwidth while
4301 	 * we unthrottle. This can potentially race with an unthrottled group
4302 	 * trying to acquire new bandwidth from the global pool. This can result
4303 	 * in us over-using our runtime if it is all used during this loop, but
4304 	 * only by limited amounts in that extreme case.
4305 	 */
4306 	while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
4307 		runtime = cfs_b->runtime;
4308 		cfs_b->distribute_running = 1;
4309 		raw_spin_unlock(&cfs_b->lock);
4310 		/* we can't nest cfs_b->lock while distributing bandwidth */
4311 		runtime = distribute_cfs_runtime(cfs_b, runtime,
4312 						 runtime_expires);
4313 		raw_spin_lock(&cfs_b->lock);
4314 
4315 		cfs_b->distribute_running = 0;
4316 		throttled = !list_empty(&cfs_b->throttled_cfs_rq);
4317 
4318 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4319 	}
4320 
4321 	/*
4322 	 * While we are ensured activity in the period following an
4323 	 * unthrottle, this also covers the case in which the new bandwidth is
4324 	 * insufficient to cover the existing bandwidth deficit.  (Forcing the
4325 	 * timer to remain active while there are any throttled entities.)
4326 	 */
4327 	cfs_b->idle = 0;
4328 
4329 	return 0;
4330 
4331 out_deactivate:
4332 	return 1;
4333 }
4334 
4335 /* a cfs_rq won't donate quota below this amount */
4336 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
4337 /* minimum remaining period time to redistribute slack quota */
4338 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
4339 /* how long we wait to gather additional slack before distributing */
4340 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
4341 
4342 /*
4343  * Are we near the end of the current quota period?
4344  *
4345  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
4346  * hrtimer base being cleared by hrtimer_start. In the case of
4347  * migrate_hrtimers, base is never cleared, so we are fine.
4348  */
runtime_refresh_within(struct cfs_bandwidth * cfs_b,u64 min_expire)4349 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
4350 {
4351 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
4352 	s64 remaining;
4353 
4354 	/* if the call-back is running a quota refresh is already occurring */
4355 	if (hrtimer_callback_running(refresh_timer))
4356 		return 1;
4357 
4358 	/* is a quota refresh about to occur? */
4359 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
4360 	if (remaining < (s64)min_expire)
4361 		return 1;
4362 
4363 	return 0;
4364 }
4365 
start_cfs_slack_bandwidth(struct cfs_bandwidth * cfs_b)4366 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
4367 {
4368 	u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
4369 
4370 	/* if there's a quota refresh soon don't bother with slack */
4371 	if (runtime_refresh_within(cfs_b, min_left))
4372 		return;
4373 
4374 	hrtimer_start(&cfs_b->slack_timer,
4375 			ns_to_ktime(cfs_bandwidth_slack_period),
4376 			HRTIMER_MODE_REL);
4377 }
4378 
4379 /* we know any runtime found here is valid as update_curr() precedes return */
__return_cfs_rq_runtime(struct cfs_rq * cfs_rq)4380 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4381 {
4382 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
4383 	s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
4384 
4385 	if (slack_runtime <= 0)
4386 		return;
4387 
4388 	raw_spin_lock(&cfs_b->lock);
4389 	if (cfs_b->quota != RUNTIME_INF &&
4390 	    cfs_rq->runtime_expires == cfs_b->runtime_expires) {
4391 		cfs_b->runtime += slack_runtime;
4392 
4393 		/* we are under rq->lock, defer unthrottling using a timer */
4394 		if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
4395 		    !list_empty(&cfs_b->throttled_cfs_rq))
4396 			start_cfs_slack_bandwidth(cfs_b);
4397 	}
4398 	raw_spin_unlock(&cfs_b->lock);
4399 
4400 	/* even if it's not valid for return we don't want to try again */
4401 	cfs_rq->runtime_remaining -= slack_runtime;
4402 }
4403 
return_cfs_rq_runtime(struct cfs_rq * cfs_rq)4404 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4405 {
4406 	if (!cfs_bandwidth_used())
4407 		return;
4408 
4409 	if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
4410 		return;
4411 
4412 	__return_cfs_rq_runtime(cfs_rq);
4413 }
4414 
4415 /*
4416  * This is done with a timer (instead of inline with bandwidth return) since
4417  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
4418  */
do_sched_cfs_slack_timer(struct cfs_bandwidth * cfs_b)4419 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
4420 {
4421 	u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
4422 	u64 expires;
4423 
4424 	/* confirm we're still not at a refresh boundary */
4425 	raw_spin_lock(&cfs_b->lock);
4426 	if (cfs_b->distribute_running) {
4427 		raw_spin_unlock(&cfs_b->lock);
4428 		return;
4429 	}
4430 
4431 	if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
4432 		raw_spin_unlock(&cfs_b->lock);
4433 		return;
4434 	}
4435 
4436 	if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice)
4437 		runtime = cfs_b->runtime;
4438 
4439 	expires = cfs_b->runtime_expires;
4440 	if (runtime)
4441 		cfs_b->distribute_running = 1;
4442 
4443 	raw_spin_unlock(&cfs_b->lock);
4444 
4445 	if (!runtime)
4446 		return;
4447 
4448 	runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
4449 
4450 	raw_spin_lock(&cfs_b->lock);
4451 	if (expires == cfs_b->runtime_expires)
4452 		cfs_b->runtime -= min(runtime, cfs_b->runtime);
4453 	cfs_b->distribute_running = 0;
4454 	raw_spin_unlock(&cfs_b->lock);
4455 }
4456 
4457 /*
4458  * When a group wakes up we want to make sure that its quota is not already
4459  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
4460  * runtime as update_curr() throttling can not not trigger until it's on-rq.
4461  */
check_enqueue_throttle(struct cfs_rq * cfs_rq)4462 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
4463 {
4464 	if (!cfs_bandwidth_used())
4465 		return;
4466 
4467 	/* Synchronize hierarchical throttle counter: */
4468 	if (unlikely(!cfs_rq->throttle_uptodate)) {
4469 		struct rq *rq = rq_of(cfs_rq);
4470 		struct cfs_rq *pcfs_rq;
4471 		struct task_group *tg;
4472 
4473 		cfs_rq->throttle_uptodate = 1;
4474 
4475 		/* Get closest up-to-date node, because leaves go first: */
4476 		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
4477 			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
4478 			if (pcfs_rq->throttle_uptodate)
4479 				break;
4480 		}
4481 		if (tg) {
4482 			cfs_rq->throttle_count = pcfs_rq->throttle_count;
4483 			cfs_rq->throttled_clock_task = rq_clock_task(rq);
4484 		}
4485 	}
4486 
4487 	/* an active group must be handled by the update_curr()->put() path */
4488 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
4489 		return;
4490 
4491 	/* ensure the group is not already throttled */
4492 	if (cfs_rq_throttled(cfs_rq))
4493 		return;
4494 
4495 	/* update runtime allocation */
4496 	account_cfs_rq_runtime(cfs_rq, 0);
4497 	if (cfs_rq->runtime_remaining <= 0)
4498 		throttle_cfs_rq(cfs_rq);
4499 }
4500 
4501 /* conditionally throttle active cfs_rq's from put_prev_entity() */
check_cfs_rq_runtime(struct cfs_rq * cfs_rq)4502 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4503 {
4504 	if (!cfs_bandwidth_used())
4505 		return false;
4506 
4507 	if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
4508 		return false;
4509 
4510 	/*
4511 	 * it's possible for a throttled entity to be forced into a running
4512 	 * state (e.g. set_curr_task), in this case we're finished.
4513 	 */
4514 	if (cfs_rq_throttled(cfs_rq))
4515 		return true;
4516 
4517 	throttle_cfs_rq(cfs_rq);
4518 	return true;
4519 }
4520 
sched_cfs_slack_timer(struct hrtimer * timer)4521 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
4522 {
4523 	struct cfs_bandwidth *cfs_b =
4524 		container_of(timer, struct cfs_bandwidth, slack_timer);
4525 
4526 	do_sched_cfs_slack_timer(cfs_b);
4527 
4528 	return HRTIMER_NORESTART;
4529 }
4530 
4531 extern const u64 max_cfs_quota_period;
4532 
sched_cfs_period_timer(struct hrtimer * timer)4533 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
4534 {
4535 	struct cfs_bandwidth *cfs_b =
4536 		container_of(timer, struct cfs_bandwidth, period_timer);
4537 	int overrun;
4538 	int idle = 0;
4539 	int count = 0;
4540 
4541 	raw_spin_lock(&cfs_b->lock);
4542 	for (;;) {
4543 		overrun = hrtimer_forward_now(timer, cfs_b->period);
4544 		if (!overrun)
4545 			break;
4546 
4547 		if (++count > 3) {
4548 			u64 new, old = ktime_to_ns(cfs_b->period);
4549 
4550 			/*
4551 			 * Grow period by a factor of 2 to avoid losing precision.
4552 			 * Precision loss in the quota/period ratio can cause __cfs_schedulable
4553 			 * to fail.
4554 			 */
4555 			new = old * 2;
4556 			if (new < max_cfs_quota_period) {
4557 				cfs_b->period = ns_to_ktime(new);
4558 				cfs_b->quota *= 2;
4559 
4560 				pr_warn_ratelimited(
4561 	"cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4562 					smp_processor_id(),
4563 					div_u64(new, NSEC_PER_USEC),
4564 					div_u64(cfs_b->quota, NSEC_PER_USEC));
4565 			} else {
4566 				pr_warn_ratelimited(
4567 	"cfs_period_timer[cpu%d]: period too short, but cannot scale up without losing precision (cfs_period_us = %lld, cfs_quota_us = %lld)\n",
4568 					smp_processor_id(),
4569 					div_u64(old, NSEC_PER_USEC),
4570 					div_u64(cfs_b->quota, NSEC_PER_USEC));
4571 			}
4572 
4573 			/* reset count so we don't come right back in here */
4574 			count = 0;
4575 		}
4576 
4577 		idle = do_sched_cfs_period_timer(cfs_b, overrun);
4578 	}
4579 	if (idle)
4580 		cfs_b->period_active = 0;
4581 	raw_spin_unlock(&cfs_b->lock);
4582 
4583 	return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
4584 }
4585 
init_cfs_bandwidth(struct cfs_bandwidth * cfs_b)4586 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4587 {
4588 	raw_spin_lock_init(&cfs_b->lock);
4589 	cfs_b->runtime = 0;
4590 	cfs_b->quota = RUNTIME_INF;
4591 	cfs_b->period = ns_to_ktime(default_cfs_period());
4592 
4593 	INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
4594 	hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
4595 	cfs_b->period_timer.function = sched_cfs_period_timer;
4596 	hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
4597 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
4598 	cfs_b->distribute_running = 0;
4599 }
4600 
init_cfs_rq_runtime(struct cfs_rq * cfs_rq)4601 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
4602 {
4603 	cfs_rq->runtime_enabled = 0;
4604 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
4605 }
4606 
start_cfs_bandwidth(struct cfs_bandwidth * cfs_b)4607 void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4608 {
4609 	lockdep_assert_held(&cfs_b->lock);
4610 
4611 	if (!cfs_b->period_active) {
4612 		cfs_b->period_active = 1;
4613 		hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period);
4614 		hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED);
4615 	}
4616 }
4617 
destroy_cfs_bandwidth(struct cfs_bandwidth * cfs_b)4618 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
4619 {
4620 	/* init_cfs_bandwidth() was not called */
4621 	if (!cfs_b->throttled_cfs_rq.next)
4622 		return;
4623 
4624 	hrtimer_cancel(&cfs_b->period_timer);
4625 	hrtimer_cancel(&cfs_b->slack_timer);
4626 }
4627 
update_runtime_enabled(struct rq * rq)4628 static void __maybe_unused update_runtime_enabled(struct rq *rq)
4629 {
4630 	struct cfs_rq *cfs_rq;
4631 
4632 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4633 		struct cfs_bandwidth *cfs_b = &cfs_rq->tg->cfs_bandwidth;
4634 
4635 		raw_spin_lock(&cfs_b->lock);
4636 		cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
4637 		raw_spin_unlock(&cfs_b->lock);
4638 	}
4639 }
4640 
unthrottle_offline_cfs_rqs(struct rq * rq)4641 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
4642 {
4643 	struct cfs_rq *cfs_rq;
4644 
4645 	for_each_leaf_cfs_rq(rq, cfs_rq) {
4646 		if (!cfs_rq->runtime_enabled)
4647 			continue;
4648 
4649 		/*
4650 		 * clock_task is not advancing so we just need to make sure
4651 		 * there's some valid quota amount
4652 		 */
4653 		cfs_rq->runtime_remaining = 1;
4654 		/*
4655 		 * Offline rq is schedulable till cpu is completely disabled
4656 		 * in take_cpu_down(), so we prevent new cfs throttling here.
4657 		 */
4658 		cfs_rq->runtime_enabled = 0;
4659 
4660 		if (cfs_rq_throttled(cfs_rq))
4661 			unthrottle_cfs_rq(cfs_rq);
4662 	}
4663 }
4664 
4665 #else /* CONFIG_CFS_BANDWIDTH */
cfs_rq_clock_task(struct cfs_rq * cfs_rq)4666 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
4667 {
4668 	return rq_clock_task(rq_of(cfs_rq));
4669 }
4670 
account_cfs_rq_runtime(struct cfs_rq * cfs_rq,u64 delta_exec)4671 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
check_cfs_rq_runtime(struct cfs_rq * cfs_rq)4672 static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; }
check_enqueue_throttle(struct cfs_rq * cfs_rq)4673 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
return_cfs_rq_runtime(struct cfs_rq * cfs_rq)4674 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4675 
cfs_rq_throttled(struct cfs_rq * cfs_rq)4676 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
4677 {
4678 	return 0;
4679 }
4680 
throttled_hierarchy(struct cfs_rq * cfs_rq)4681 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
4682 {
4683 	return 0;
4684 }
4685 
throttled_lb_pair(struct task_group * tg,int src_cpu,int dest_cpu)4686 static inline int throttled_lb_pair(struct task_group *tg,
4687 				    int src_cpu, int dest_cpu)
4688 {
4689 	return 0;
4690 }
4691 
init_cfs_bandwidth(struct cfs_bandwidth * cfs_b)4692 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
4693 
4694 #ifdef CONFIG_FAIR_GROUP_SCHED
init_cfs_rq_runtime(struct cfs_rq * cfs_rq)4695 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
4696 #endif
4697 
tg_cfs_bandwidth(struct task_group * tg)4698 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
4699 {
4700 	return NULL;
4701 }
destroy_cfs_bandwidth(struct cfs_bandwidth * cfs_b)4702 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
update_runtime_enabled(struct rq * rq)4703 static inline void update_runtime_enabled(struct rq *rq) {}
unthrottle_offline_cfs_rqs(struct rq * rq)4704 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
4705 
4706 #endif /* CONFIG_CFS_BANDWIDTH */
4707 
4708 /**************************************************
4709  * CFS operations on tasks:
4710  */
4711 
4712 #ifdef CONFIG_SCHED_HRTICK
hrtick_start_fair(struct rq * rq,struct task_struct * p)4713 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
4714 {
4715 	struct sched_entity *se = &p->se;
4716 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
4717 
4718 	WARN_ON(task_rq(p) != rq);
4719 
4720 	if (cfs_rq->nr_running > 1) {
4721 		u64 slice = sched_slice(cfs_rq, se);
4722 		u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
4723 		s64 delta = slice - ran;
4724 
4725 		if (delta < 0) {
4726 			if (rq->curr == p)
4727 				resched_curr(rq);
4728 			return;
4729 		}
4730 		hrtick_start(rq, delta);
4731 	}
4732 }
4733 
4734 /*
4735  * called from enqueue/dequeue and updates the hrtick when the
4736  * current task is from our class and nr_running is low enough
4737  * to matter.
4738  */
hrtick_update(struct rq * rq)4739 static void hrtick_update(struct rq *rq)
4740 {
4741 	struct task_struct *curr = rq->curr;
4742 
4743 	if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
4744 		return;
4745 
4746 	if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
4747 		hrtick_start_fair(rq, curr);
4748 }
4749 #else /* !CONFIG_SCHED_HRTICK */
4750 static inline void
hrtick_start_fair(struct rq * rq,struct task_struct * p)4751 hrtick_start_fair(struct rq *rq, struct task_struct *p)
4752 {
4753 }
4754 
hrtick_update(struct rq * rq)4755 static inline void hrtick_update(struct rq *rq)
4756 {
4757 }
4758 #endif
4759 
4760 #ifdef CONFIG_SMP
4761 static bool __cpu_overutilized(int cpu, int delta);
4762 static bool cpu_overutilized(int cpu);
4763 unsigned long boosted_cpu_util(int cpu);
4764 #else
4765 #define boosted_cpu_util(cpu) cpu_util_freq(cpu)
4766 #endif
4767 
4768 /*
4769  * The enqueue_task method is called before nr_running is
4770  * increased. Here we update the fair scheduling stats and
4771  * then put the task into the rbtree:
4772  */
4773 static void
enqueue_task_fair(struct rq * rq,struct task_struct * p,int flags)4774 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4775 {
4776 	struct cfs_rq *cfs_rq;
4777 	struct sched_entity *se = &p->se;
4778 #ifdef CONFIG_SMP
4779 	int task_new = flags & ENQUEUE_WAKEUP_NEW;
4780 #endif
4781 
4782 	/*
4783 	 * If in_iowait is set, the code below may not trigger any cpufreq
4784 	 * utilization updates, so do it here explicitly with the IOWAIT flag
4785 	 * passed.
4786 	 */
4787 	if (p->in_iowait)
4788 		cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_IOWAIT);
4789 
4790 	for_each_sched_entity(se) {
4791 		if (se->on_rq)
4792 			break;
4793 		cfs_rq = cfs_rq_of(se);
4794 		enqueue_entity(cfs_rq, se, flags);
4795 
4796 		/*
4797 		 * end evaluation on encountering a throttled cfs_rq
4798 		 *
4799 		 * note: in the case of encountering a throttled cfs_rq we will
4800 		 * post the final h_nr_running increment below.
4801 		 */
4802 		if (cfs_rq_throttled(cfs_rq))
4803 			break;
4804 		cfs_rq->h_nr_running++;
4805 		walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
4806 
4807 		flags = ENQUEUE_WAKEUP;
4808 	}
4809 
4810 	for_each_sched_entity(se) {
4811 		cfs_rq = cfs_rq_of(se);
4812 		cfs_rq->h_nr_running++;
4813 		walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
4814 
4815 		if (cfs_rq_throttled(cfs_rq))
4816 			break;
4817 
4818 		update_load_avg(se, UPDATE_TG);
4819 		update_cfs_shares(se);
4820 	}
4821 
4822 	if (!se)
4823 		add_nr_running(rq, 1);
4824 
4825 #ifdef CONFIG_SMP
4826 
4827 	/*
4828 	 * Update SchedTune accounting.
4829 	 *
4830 	 * We do it before updating the CPU capacity to ensure the
4831 	 * boost value of the current task is accounted for in the
4832 	 * selection of the OPP.
4833 	 *
4834 	 * We do it also in the case where we enqueue a throttled task;
4835 	 * we could argue that a throttled task should not boost a CPU,
4836 	 * however:
4837 	 * a) properly implementing CPU boosting considering throttled
4838 	 *    tasks will increase a lot the complexity of the solution
4839 	 * b) it's not easy to quantify the benefits introduced by
4840 	 *    such a more complex solution.
4841 	 * Thus, for the time being we go for the simple solution and boost
4842 	 * also for throttled RQs.
4843 	 */
4844 	schedtune_enqueue_task(p, cpu_of(rq));
4845 
4846 	if (!se) {
4847 		walt_inc_cumulative_runnable_avg(rq, p);
4848 		if (!task_new && !rq->rd->overutilized &&
4849 		    cpu_overutilized(rq->cpu)) {
4850 			rq->rd->overutilized = true;
4851 			trace_sched_overutilized(true);
4852 		}
4853 	}
4854 
4855 #endif /* CONFIG_SMP */
4856 	hrtick_update(rq);
4857 }
4858 
4859 static void set_next_buddy(struct sched_entity *se);
4860 
4861 /*
4862  * The dequeue_task method is called before nr_running is
4863  * decreased. We remove the task from the rbtree and
4864  * update the fair scheduling stats:
4865  */
dequeue_task_fair(struct rq * rq,struct task_struct * p,int flags)4866 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4867 {
4868 	struct cfs_rq *cfs_rq;
4869 	struct sched_entity *se = &p->se;
4870 	int task_sleep = flags & DEQUEUE_SLEEP;
4871 
4872 	for_each_sched_entity(se) {
4873 		cfs_rq = cfs_rq_of(se);
4874 		dequeue_entity(cfs_rq, se, flags);
4875 
4876 		/*
4877 		 * end evaluation on encountering a throttled cfs_rq
4878 		 *
4879 		 * note: in the case of encountering a throttled cfs_rq we will
4880 		 * post the final h_nr_running decrement below.
4881 		*/
4882 		if (cfs_rq_throttled(cfs_rq))
4883 			break;
4884 		cfs_rq->h_nr_running--;
4885 		walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
4886 
4887 		/* Don't dequeue parent if it has other entities besides us */
4888 		if (cfs_rq->load.weight) {
4889 			/* Avoid re-evaluating load for this entity: */
4890 			se = parent_entity(se);
4891 			/*
4892 			 * Bias pick_next to pick a task from this cfs_rq, as
4893 			 * p is sleeping when it is within its sched_slice.
4894 			 */
4895 			if (task_sleep && se && !throttled_hierarchy(cfs_rq))
4896 				set_next_buddy(se);
4897 			break;
4898 		}
4899 		flags |= DEQUEUE_SLEEP;
4900 	}
4901 
4902 	for_each_sched_entity(se) {
4903 		cfs_rq = cfs_rq_of(se);
4904 		cfs_rq->h_nr_running--;
4905 		walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
4906 
4907 		if (cfs_rq_throttled(cfs_rq))
4908 			break;
4909 
4910 		update_load_avg(se, UPDATE_TG);
4911 		update_cfs_shares(se);
4912 	}
4913 
4914 	if (!se)
4915 		sub_nr_running(rq, 1);
4916 
4917 #ifdef CONFIG_SMP
4918 
4919 	/*
4920 	 * Update SchedTune accounting
4921 	 *
4922 	 * We do it before updating the CPU capacity to ensure the
4923 	 * boost value of the current task is accounted for in the
4924 	 * selection of the OPP.
4925 	 */
4926 	schedtune_dequeue_task(p, cpu_of(rq));
4927 
4928 	if (!se)
4929 		walt_dec_cumulative_runnable_avg(rq, p);
4930 #endif /* CONFIG_SMP */
4931 
4932 	hrtick_update(rq);
4933 }
4934 
4935 #ifdef CONFIG_SMP
4936 
4937 /*
4938  * per rq 'load' arrray crap; XXX kill this.
4939  */
4940 
4941 /*
4942  * The exact cpuload at various idx values, calculated at every tick would be
4943  * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load
4944  *
4945  * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called
4946  * on nth tick when cpu may be busy, then we have:
4947  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4948  * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load
4949  *
4950  * decay_load_missed() below does efficient calculation of
4951  * load = ((2^idx - 1) / 2^idx)^(n-1) * load
4952  * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load
4953  *
4954  * The calculation is approximated on a 128 point scale.
4955  * degrade_zero_ticks is the number of ticks after which load at any
4956  * particular idx is approximated to be zero.
4957  * degrade_factor is a precomputed table, a row for each load idx.
4958  * Each column corresponds to degradation factor for a power of two ticks,
4959  * based on 128 point scale.
4960  * Example:
4961  * row 2, col 3 (=12) says that the degradation at load idx 2 after
4962  * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8).
4963  *
4964  * With this power of 2 load factors, we can degrade the load n times
4965  * by looking at 1 bits in n and doing as many mult/shift instead of
4966  * n mult/shifts needed by the exact degradation.
4967  */
4968 #define DEGRADE_SHIFT		7
4969 static const unsigned char
4970 		degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128};
4971 static const unsigned char
4972 		degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = {
4973 					{0, 0, 0, 0, 0, 0, 0, 0},
4974 					{64, 32, 8, 0, 0, 0, 0, 0},
4975 					{96, 72, 40, 12, 1, 0, 0},
4976 					{112, 98, 75, 43, 15, 1, 0},
4977 					{120, 112, 98, 76, 45, 16, 2} };
4978 
4979 /*
4980  * Update cpu_load for any missed ticks, due to tickless idle. The backlog
4981  * would be when CPU is idle and so we just decay the old load without
4982  * adding any new load.
4983  */
4984 static unsigned long
decay_load_missed(unsigned long load,unsigned long missed_updates,int idx)4985 decay_load_missed(unsigned long load, unsigned long missed_updates, int idx)
4986 {
4987 	int j = 0;
4988 
4989 	if (!missed_updates)
4990 		return load;
4991 
4992 	if (missed_updates >= degrade_zero_ticks[idx])
4993 		return 0;
4994 
4995 	if (idx == 1)
4996 		return load >> missed_updates;
4997 
4998 	while (missed_updates) {
4999 		if (missed_updates % 2)
5000 			load = (load * degrade_factor[idx][j]) >> DEGRADE_SHIFT;
5001 
5002 		missed_updates >>= 1;
5003 		j++;
5004 	}
5005 	return load;
5006 }
5007 
5008 /*
5009  * Update rq->cpu_load[] statistics. This function is usually called every
5010  * scheduler tick (TICK_NSEC). With tickless idle this will not be called
5011  * every tick. We fix it up based on jiffies.
5012  */
__update_cpu_load(struct rq * this_rq,unsigned long this_load,unsigned long pending_updates)5013 static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
5014 			      unsigned long pending_updates)
5015 {
5016 	int i, scale;
5017 
5018 	this_rq->nr_load_updates++;
5019 
5020 	/* Update our load: */
5021 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
5022 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
5023 		unsigned long old_load, new_load;
5024 
5025 		/* scale is effectively 1 << i now, and >> i divides by scale */
5026 
5027 		old_load = this_rq->cpu_load[i];
5028 		old_load = decay_load_missed(old_load, pending_updates - 1, i);
5029 		new_load = this_load;
5030 		/*
5031 		 * Round up the averaging division if load is increasing. This
5032 		 * prevents us from getting stuck on 9 if the load is 10, for
5033 		 * example.
5034 		 */
5035 		if (new_load > old_load)
5036 			new_load += scale - 1;
5037 
5038 		this_rq->cpu_load[i] = (old_load * (scale - 1) + new_load) >> i;
5039 	}
5040 
5041 	sched_avg_update(this_rq);
5042 }
5043 
5044 /* Used instead of source_load when we know the type == 0 */
weighted_cpuload(const int cpu)5045 static unsigned long weighted_cpuload(const int cpu)
5046 {
5047 	return cfs_rq_runnable_load_avg(&cpu_rq(cpu)->cfs);
5048 }
5049 
5050 #ifdef CONFIG_NO_HZ_COMMON
5051 /*
5052  * There is no sane way to deal with nohz on smp when using jiffies because the
5053  * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading
5054  * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}.
5055  *
5056  * Therefore we cannot use the delta approach from the regular tick since that
5057  * would seriously skew the load calculation. However we'll make do for those
5058  * updates happening while idle (nohz_idle_balance) or coming out of idle
5059  * (tick_nohz_idle_exit).
5060  *
5061  * This means we might still be one tick off for nohz periods.
5062  */
5063 
5064 /*
5065  * Called from nohz_idle_balance() to update the load ratings before doing the
5066  * idle balance.
5067  */
update_idle_cpu_load(struct rq * this_rq)5068 static void update_idle_cpu_load(struct rq *this_rq)
5069 {
5070 	unsigned long curr_jiffies = READ_ONCE(jiffies);
5071 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
5072 	unsigned long pending_updates;
5073 
5074 	/*
5075 	 * bail if there's load or we're actually up-to-date.
5076 	 */
5077 	if (load || curr_jiffies == this_rq->last_load_update_tick)
5078 		return;
5079 
5080 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5081 	this_rq->last_load_update_tick = curr_jiffies;
5082 
5083 	__update_cpu_load(this_rq, load, pending_updates);
5084 }
5085 
5086 /*
5087  * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed.
5088  */
update_cpu_load_nohz(void)5089 void update_cpu_load_nohz(void)
5090 {
5091 	struct rq *this_rq = this_rq();
5092 	unsigned long curr_jiffies = READ_ONCE(jiffies);
5093 	unsigned long pending_updates;
5094 
5095 	if (curr_jiffies == this_rq->last_load_update_tick)
5096 		return;
5097 
5098 	raw_spin_lock(&this_rq->lock);
5099 	pending_updates = curr_jiffies - this_rq->last_load_update_tick;
5100 	if (pending_updates) {
5101 		this_rq->last_load_update_tick = curr_jiffies;
5102 		/*
5103 		 * We were idle, this means load 0, the current load might be
5104 		 * !0 due to remote wakeups and the sort.
5105 		 */
5106 		__update_cpu_load(this_rq, 0, pending_updates);
5107 	}
5108 	raw_spin_unlock(&this_rq->lock);
5109 }
5110 #endif /* CONFIG_NO_HZ */
5111 
5112 /*
5113  * Called from scheduler_tick()
5114  */
update_cpu_load_active(struct rq * this_rq)5115 void update_cpu_load_active(struct rq *this_rq)
5116 {
5117 	unsigned long load = weighted_cpuload(cpu_of(this_rq));
5118 	/*
5119 	 * See the mess around update_idle_cpu_load() / update_cpu_load_nohz().
5120 	 */
5121 	this_rq->last_load_update_tick = jiffies;
5122 	__update_cpu_load(this_rq, load, 1);
5123 }
5124 
5125 /*
5126  * Return a low guess at the load of a migration-source cpu weighted
5127  * according to the scheduling class and "nice" value.
5128  *
5129  * We want to under-estimate the load of migration sources, to
5130  * balance conservatively.
5131  */
source_load(int cpu,int type)5132 static unsigned long source_load(int cpu, int type)
5133 {
5134 	struct rq *rq = cpu_rq(cpu);
5135 	unsigned long total = weighted_cpuload(cpu);
5136 
5137 	if (type == 0 || !sched_feat(LB_BIAS))
5138 		return total;
5139 
5140 	return min(rq->cpu_load[type-1], total);
5141 }
5142 
5143 /*
5144  * Return a high guess at the load of a migration-target cpu weighted
5145  * according to the scheduling class and "nice" value.
5146  */
target_load(int cpu,int type)5147 static unsigned long target_load(int cpu, int type)
5148 {
5149 	struct rq *rq = cpu_rq(cpu);
5150 	unsigned long total = weighted_cpuload(cpu);
5151 
5152 	if (type == 0 || !sched_feat(LB_BIAS))
5153 		return total;
5154 
5155 	return max(rq->cpu_load[type-1], total);
5156 }
5157 
5158 
cpu_avg_load_per_task(int cpu)5159 static unsigned long cpu_avg_load_per_task(int cpu)
5160 {
5161 	struct rq *rq = cpu_rq(cpu);
5162 	unsigned long nr_running = READ_ONCE(rq->cfs.h_nr_running);
5163 	unsigned long load_avg = weighted_cpuload(cpu);
5164 
5165 	if (nr_running)
5166 		return load_avg / nr_running;
5167 
5168 	return 0;
5169 }
5170 
record_wakee(struct task_struct * p)5171 static void record_wakee(struct task_struct *p)
5172 {
5173 	/*
5174 	 * Rough decay (wiping) for cost saving, don't worry
5175 	 * about the boundary, really active task won't care
5176 	 * about the loss.
5177 	 */
5178 	if (time_after(jiffies, current->wakee_flip_decay_ts + HZ)) {
5179 		current->wakee_flips >>= 1;
5180 		current->wakee_flip_decay_ts = jiffies;
5181 	}
5182 
5183 	if (current->last_wakee != p) {
5184 		current->last_wakee = p;
5185 		current->wakee_flips++;
5186 	}
5187 }
5188 
task_waking_fair(struct task_struct * p)5189 static void task_waking_fair(struct task_struct *p)
5190 {
5191 	struct sched_entity *se = &p->se;
5192 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
5193 	u64 min_vruntime;
5194 
5195 #ifndef CONFIG_64BIT
5196 	u64 min_vruntime_copy;
5197 
5198 	do {
5199 		min_vruntime_copy = cfs_rq->min_vruntime_copy;
5200 		smp_rmb();
5201 		min_vruntime = cfs_rq->min_vruntime;
5202 	} while (min_vruntime != min_vruntime_copy);
5203 #else
5204 	min_vruntime = cfs_rq->min_vruntime;
5205 #endif
5206 
5207 	se->vruntime -= min_vruntime;
5208 	record_wakee(p);
5209 }
5210 
5211 #ifdef CONFIG_FAIR_GROUP_SCHED
5212 /*
5213  * effective_load() calculates the load change as seen from the root_task_group
5214  *
5215  * Adding load to a group doesn't make a group heavier, but can cause movement
5216  * of group shares between cpus. Assuming the shares were perfectly aligned one
5217  * can calculate the shift in shares.
5218  *
5219  * Calculate the effective load difference if @wl is added (subtracted) to @tg
5220  * on this @cpu and results in a total addition (subtraction) of @wg to the
5221  * total group weight.
5222  *
5223  * Given a runqueue weight distribution (rw_i) we can compute a shares
5224  * distribution (s_i) using:
5225  *
5226  *   s_i = rw_i / \Sum rw_j						(1)
5227  *
5228  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
5229  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
5230  * shares distribution (s_i):
5231  *
5232  *   rw_i = {   2,   4,   1,   0 }
5233  *   s_i  = { 2/7, 4/7, 1/7,   0 }
5234  *
5235  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
5236  * task used to run on and the CPU the waker is running on), we need to
5237  * compute the effect of waking a task on either CPU and, in case of a sync
5238  * wakeup, compute the effect of the current task going to sleep.
5239  *
5240  * So for a change of @wl to the local @cpu with an overall group weight change
5241  * of @wl we can compute the new shares distribution (s'_i) using:
5242  *
5243  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)				(2)
5244  *
5245  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
5246  * differences in waking a task to CPU 0. The additional task changes the
5247  * weight and shares distributions like:
5248  *
5249  *   rw'_i = {   3,   4,   1,   0 }
5250  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
5251  *
5252  * We can then compute the difference in effective weight by using:
5253  *
5254  *   dw_i = S * (s'_i - s_i)						(3)
5255  *
5256  * Where 'S' is the group weight as seen by its parent.
5257  *
5258  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
5259  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
5260  * 4/7) times the weight of the group.
5261  */
effective_load(struct task_group * tg,int cpu,long wl,long wg)5262 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5263 {
5264 	struct sched_entity *se = tg->se[cpu];
5265 
5266 	if (!tg->parent)	/* the trivial, non-cgroup case */
5267 		return wl;
5268 
5269 	for_each_sched_entity(se) {
5270 		struct cfs_rq *cfs_rq = se->my_q;
5271 		long W, w = cfs_rq_load_avg(cfs_rq);
5272 
5273 		tg = cfs_rq->tg;
5274 
5275 		/*
5276 		 * W = @wg + \Sum rw_j
5277 		 */
5278 		W = wg + atomic_long_read(&tg->load_avg);
5279 
5280 		/* Ensure \Sum rw_j >= rw_i */
5281 		W -= cfs_rq->tg_load_avg_contrib;
5282 		W += w;
5283 
5284 		/*
5285 		 * w = rw_i + @wl
5286 		 */
5287 		w += wl;
5288 
5289 		/*
5290 		 * wl = S * s'_i; see (2)
5291 		 */
5292 		if (W > 0 && w < W)
5293 			wl = (w * (long)tg->shares) / W;
5294 		else
5295 			wl = tg->shares;
5296 
5297 		/*
5298 		 * Per the above, wl is the new se->load.weight value; since
5299 		 * those are clipped to [MIN_SHARES, ...) do so now. See
5300 		 * calc_cfs_shares().
5301 		 */
5302 		if (wl < MIN_SHARES)
5303 			wl = MIN_SHARES;
5304 
5305 		/*
5306 		 * wl = dw_i = S * (s'_i - s_i); see (3)
5307 		 */
5308 		wl -= se->avg.load_avg;
5309 
5310 		/*
5311 		 * Recursively apply this logic to all parent groups to compute
5312 		 * the final effective load change on the root group. Since
5313 		 * only the @tg group gets extra weight, all parent groups can
5314 		 * only redistribute existing shares. @wl is the shift in shares
5315 		 * resulting from this level per the above.
5316 		 */
5317 		wg = 0;
5318 	}
5319 
5320 	return wl;
5321 }
5322 #else
5323 
effective_load(struct task_group * tg,int cpu,long wl,long wg)5324 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
5325 {
5326 	return wl;
5327 }
5328 
5329 #endif
5330 
5331 /*
5332  * Returns the current capacity of cpu after applying both
5333  * cpu and freq scaling.
5334  */
capacity_curr_of(int cpu)5335 unsigned long capacity_curr_of(int cpu)
5336 {
5337 	return cpu_rq(cpu)->cpu_capacity_orig *
5338 	       arch_scale_freq_capacity(NULL, cpu)
5339 	       >> SCHED_CAPACITY_SHIFT;
5340 }
5341 
energy_aware(void)5342 static inline bool energy_aware(void)
5343 {
5344 	return sched_feat(ENERGY_AWARE);
5345 }
5346 
5347 struct energy_env {
5348 	struct sched_group	*sg_top;
5349 	struct sched_group	*sg_cap;
5350 	int			cap_idx;
5351 	int			util_delta;
5352 	int			src_cpu;
5353 	int			dst_cpu;
5354 	int			trg_cpu;
5355 	int			energy;
5356 	int			payoff;
5357 	struct task_struct	*task;
5358 	struct {
5359 		int before;
5360 		int after;
5361 		int delta;
5362 		int diff;
5363 	} nrg;
5364 	struct {
5365 		int before;
5366 		int after;
5367 		int delta;
5368 	} cap;
5369 };
5370 
5371 static int cpu_util_wake(int cpu, struct task_struct *p);
5372 
5373 /*
5374  * __cpu_norm_util() returns the cpu util relative to a specific capacity,
5375  * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for
5376  * energy calculations.
5377  *
5378  * Since util is a scale-invariant utilization defined as:
5379  *
5380  *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
5381  *
5382  * the normalized util can be found using the specific capacity.
5383  *
5384  *   capacity = capacity_orig * curr_freq/max_freq
5385  *
5386  *   norm_util = running_time/time ~ util/capacity
5387  */
__cpu_norm_util(unsigned long util,unsigned long capacity)5388 static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
5389 {
5390 	if (util >= capacity)
5391 		return SCHED_CAPACITY_SCALE;
5392 
5393 	return (util << SCHED_CAPACITY_SHIFT)/capacity;
5394 }
5395 
group_max_util(struct energy_env * eenv)5396 static unsigned long group_max_util(struct energy_env *eenv)
5397 {
5398 	unsigned long max_util = 0;
5399 	unsigned long util;
5400 	int cpu;
5401 
5402 	for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) {
5403 		util = cpu_util_wake(cpu, eenv->task);
5404 
5405 		/*
5406 		 * If we are looking at the target CPU specified by the eenv,
5407 		 * then we should add the (estimated) utilization of the task
5408 		 * assuming we will wake it up on that CPU.
5409 		 */
5410 		if (unlikely(cpu == eenv->trg_cpu))
5411 			util += eenv->util_delta;
5412 
5413 		max_util = max(max_util, util);
5414 	}
5415 
5416 	return max_util;
5417 }
5418 
5419 /*
5420  * group_norm_util() returns the approximated group util relative to it's
5421  * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use
5422  * in energy calculations.
5423  *
5424  * Since task executions may or may not overlap in time in the group the true
5425  * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i))
5426  * when iterating over all CPUs in the group.
5427  * The latter estimate is used as it leads to a more pessimistic energy
5428  * estimate (more busy).
5429  */
5430 static unsigned
group_norm_util(struct energy_env * eenv,struct sched_group * sg)5431 long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
5432 {
5433 	unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
5434 	unsigned long util, util_sum = 0;
5435 	int cpu;
5436 
5437 	for_each_cpu(cpu, sched_group_cpus(sg)) {
5438 		util = cpu_util_wake(cpu, eenv->task);
5439 
5440 		/*
5441 		 * If we are looking at the target CPU specified by the eenv,
5442 		 * then we should add the (estimated) utilization of the task
5443 		 * assuming we will wake it up on that CPU.
5444 		 */
5445 		if (unlikely(cpu == eenv->trg_cpu))
5446 			util += eenv->util_delta;
5447 
5448 		util_sum += __cpu_norm_util(util, capacity);
5449 	}
5450 
5451 	return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
5452 }
5453 
find_new_capacity(struct energy_env * eenv,const struct sched_group_energy * const sge)5454 static int find_new_capacity(struct energy_env *eenv,
5455 	const struct sched_group_energy * const sge)
5456 {
5457 	int idx, max_idx = sge->nr_cap_states - 1;
5458 	unsigned long util = group_max_util(eenv);
5459 
5460 	/* default is max_cap if we don't find a match */
5461 	eenv->cap_idx = max_idx;
5462 
5463 	for (idx = 0; idx < sge->nr_cap_states; idx++) {
5464 		if (sge->cap_states[idx].cap >= util) {
5465 			eenv->cap_idx = idx;
5466 			break;
5467 		}
5468 	}
5469 
5470 	return eenv->cap_idx;
5471 }
5472 
group_idle_state(struct energy_env * eenv,struct sched_group * sg)5473 static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
5474 {
5475 	int i, state = INT_MAX;
5476 	int src_in_grp, dst_in_grp;
5477 	long grp_util = 0;
5478 
5479 	/* Find the shallowest idle state in the sched group. */
5480 	for_each_cpu(i, sched_group_cpus(sg))
5481 		state = min(state, idle_get_state_idx(cpu_rq(i)));
5482 
5483 	/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
5484 	state++;
5485 
5486 	src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
5487 	dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
5488 	if (src_in_grp == dst_in_grp) {
5489 		/* both CPUs under consideration are in the same group or not in
5490 		 * either group, migration should leave idle state the same.
5491 		 */
5492 		goto end;
5493 	}
5494 
5495 	/*
5496 	 * Try to estimate if a deeper idle state is
5497 	 * achievable when we move the task.
5498 	 */
5499 	for_each_cpu(i, sched_group_cpus(sg)) {
5500 		grp_util += cpu_util_wake(i, eenv->task);
5501 		if (unlikely(i == eenv->trg_cpu))
5502 			grp_util += eenv->util_delta;
5503 	}
5504 
5505 	if (grp_util <=
5506 		((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
5507 		/* after moving, this group is at most partly
5508 		 * occupied, so it should have some idle time.
5509 		 */
5510 		int max_idle_state_idx = sg->sge->nr_idle_states - 2;
5511 		int new_state = grp_util * max_idle_state_idx;
5512 		if (grp_util <= 0)
5513 			/* group will have no util, use lowest state */
5514 			new_state = max_idle_state_idx + 1;
5515 		else {
5516 			/* for partially idle, linearly map util to idle
5517 			 * states, excluding the lowest one. This does not
5518 			 * correspond to the state we expect to enter in
5519 			 * reality, but an indication of what might happen.
5520 			 */
5521 			new_state = min(max_idle_state_idx, (int)
5522 					(new_state / sg->sgc->max_capacity));
5523 			new_state = max_idle_state_idx - new_state;
5524 		}
5525 		state = new_state;
5526 	} else {
5527 		/* After moving, the group will be fully occupied
5528 		 * so assume it will not be idle at all.
5529 		 */
5530 		state = 0;
5531 	}
5532 end:
5533 	return state;
5534 }
5535 
5536 /*
5537  * sched_group_energy(): Computes the absolute energy consumption of cpus
5538  * belonging to the sched_group including shared resources shared only by
5539  * members of the group. Iterates over all cpus in the hierarchy below the
5540  * sched_group starting from the bottom working it's way up before going to
5541  * the next cpu until all cpus are covered at all levels. The current
5542  * implementation is likely to gather the same util statistics multiple times.
5543  * This can probably be done in a faster but more complex way.
5544  * Note: sched_group_energy() may fail when racing with sched_domain updates.
5545  */
sched_group_energy(struct energy_env * eenv)5546 static int sched_group_energy(struct energy_env *eenv)
5547 {
5548 	struct cpumask visit_cpus;
5549 	u64 total_energy = 0;
5550 	int cpu_count;
5551 
5552 	WARN_ON(!eenv->sg_top->sge);
5553 
5554 	cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
5555 	/* If a cpu is hotplugged in while we are in this function,
5556 	 * it does not appear in the existing visit_cpus mask
5557 	 * which came from the sched_group pointer of the
5558 	 * sched_domain pointed at by sd_ea for either the prev
5559 	 * or next cpu and was dereferenced in __energy_diff.
5560 	 * Since we will dereference sd_scs later as we iterate
5561 	 * through the CPUs we expect to visit, new CPUs can
5562 	 * be present which are not in the visit_cpus mask.
5563 	 * Guard this with cpu_count.
5564 	 */
5565 	cpu_count = cpumask_weight(&visit_cpus);
5566 
5567 	while (!cpumask_empty(&visit_cpus)) {
5568 		struct sched_group *sg_shared_cap = NULL;
5569 		int cpu = cpumask_first(&visit_cpus);
5570 		struct sched_domain *sd;
5571 
5572 		/*
5573 		 * Is the group utilization affected by cpus outside this
5574 		 * sched_group?
5575 		 * This sd may have groups with cpus which were not present
5576 		 * when we took visit_cpus.
5577 		 */
5578 		sd = rcu_dereference(per_cpu(sd_scs, cpu));
5579 
5580 		if (sd && sd->parent)
5581 			sg_shared_cap = sd->parent->groups;
5582 
5583 		for_each_domain(cpu, sd) {
5584 			struct sched_group *sg = sd->groups;
5585 
5586 			/* Has this sched_domain already been visited? */
5587 			if (sd->child && group_first_cpu(sg) != cpu)
5588 				break;
5589 
5590 			do {
5591 				unsigned long group_util;
5592 				int sg_busy_energy, sg_idle_energy;
5593 				int cap_idx, idle_idx;
5594 
5595 				if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
5596 					eenv->sg_cap = sg_shared_cap;
5597 				else
5598 					eenv->sg_cap = sg;
5599 
5600 				cap_idx = find_new_capacity(eenv, sg->sge);
5601 
5602 				if (sg->group_weight == 1) {
5603 					/* Remove capacity of src CPU (before task move) */
5604 					if (eenv->trg_cpu == eenv->src_cpu &&
5605 					    cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
5606 						eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
5607 						eenv->cap.delta -= eenv->cap.before;
5608 					}
5609 					/* Add capacity of dst CPU  (after task move) */
5610 					if (eenv->trg_cpu == eenv->dst_cpu &&
5611 					    cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
5612 						eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
5613 						eenv->cap.delta += eenv->cap.after;
5614 					}
5615 				}
5616 
5617 				idle_idx = group_idle_state(eenv, sg);
5618 				group_util = group_norm_util(eenv, sg);
5619 
5620 				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
5621 				sg_idle_energy = ((SCHED_LOAD_SCALE-group_util)
5622 								* sg->sge->idle_states[idle_idx].power);
5623 
5624 				total_energy += sg_busy_energy + sg_idle_energy;
5625 
5626 				if (!sd->child) {
5627 					/*
5628 					 * cpu_count here is the number of
5629 					 * cpus we expect to visit in this
5630 					 * calculation. If we race against
5631 					 * hotplug, we can have extra cpus
5632 					 * added to the groups we are
5633 					 * iterating which do not appear in
5634 					 * the visit_cpus mask. In that case
5635 					 * we are not able to calculate energy
5636 					 * without restarting so we will bail
5637 					 * out and use prev_cpu this time.
5638 					 */
5639 					if (!cpu_count)
5640 						return -EINVAL;
5641 					cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
5642 					cpu_count--;
5643 				}
5644 
5645 				if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
5646 					goto next_cpu;
5647 
5648 			} while (sg = sg->next, sg != sd->groups);
5649 		}
5650 
5651 		/*
5652 		 * If we raced with hotplug and got an sd NULL-pointer;
5653 		 * returning a wrong energy estimation is better than
5654 		 * entering an infinite loop.
5655 		 * Specifically: If a cpu is unplugged after we took
5656 		 * the visit_cpus mask, it no longer has an sd_scs
5657 		 * pointer, so when we dereference it, we get NULL.
5658 		 */
5659 		if (cpumask_test_cpu(cpu, &visit_cpus))
5660 			return -EINVAL;
5661 next_cpu:
5662 		cpumask_clear_cpu(cpu, &visit_cpus);
5663 		continue;
5664 	}
5665 
5666 	eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT;
5667 	return 0;
5668 }
5669 
cpu_in_sg(struct sched_group * sg,int cpu)5670 static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
5671 {
5672 	return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
5673 }
5674 
5675 static inline unsigned long task_util(struct task_struct *p);
5676 
5677 /*
5678  * energy_diff(): Estimate the energy impact of changing the utilization
5679  * distribution. eenv specifies the change: utilisation amount, source, and
5680  * destination cpu. Source or destination cpu may be -1 in which case the
5681  * utilization is removed from or added to the system (e.g. task wake-up). If
5682  * both are specified, the utilization is migrated.
5683  */
__energy_diff(struct energy_env * eenv)5684 static inline int __energy_diff(struct energy_env *eenv)
5685 {
5686 	struct sched_domain *sd;
5687 	struct sched_group *sg;
5688 	int sd_cpu = -1, energy_before = 0, energy_after = 0;
5689 	int diff, margin;
5690 
5691 	struct energy_env eenv_before = {
5692 		.util_delta	= task_util(eenv->task),
5693 		.src_cpu	= eenv->src_cpu,
5694 		.dst_cpu	= eenv->dst_cpu,
5695 		.trg_cpu	= eenv->src_cpu,
5696 		.nrg		= { 0, 0, 0, 0},
5697 		.cap		= { 0, 0, 0 },
5698 		.task		= eenv->task,
5699 	};
5700 
5701 	if (eenv->src_cpu == eenv->dst_cpu)
5702 		return 0;
5703 
5704 	sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
5705 	sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
5706 
5707 	if (!sd)
5708 		return 0; /* Error */
5709 
5710 	sg = sd->groups;
5711 
5712 	do {
5713 		if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
5714 			eenv_before.sg_top = eenv->sg_top = sg;
5715 
5716 			if (sched_group_energy(&eenv_before))
5717 				return 0; /* Invalid result abort */
5718 			energy_before += eenv_before.energy;
5719 
5720 			/* Keep track of SRC cpu (before) capacity */
5721 			eenv->cap.before = eenv_before.cap.before;
5722 			eenv->cap.delta = eenv_before.cap.delta;
5723 
5724 			if (sched_group_energy(eenv))
5725 				return 0; /* Invalid result abort */
5726 			energy_after += eenv->energy;
5727 		}
5728 	} while (sg = sg->next, sg != sd->groups);
5729 
5730 	eenv->nrg.before = energy_before;
5731 	eenv->nrg.after = energy_after;
5732 	eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
5733 	eenv->payoff = 0;
5734 #ifndef CONFIG_SCHED_TUNE
5735 	trace_sched_energy_diff(eenv->task,
5736 			eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5737 			eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5738 			eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5739 			eenv->nrg.delta, eenv->payoff);
5740 #endif
5741 	/*
5742 	 * Dead-zone margin preventing too many migrations.
5743 	 */
5744 
5745 	margin = eenv->nrg.before >> 6; /* ~1.56% */
5746 
5747 	diff = eenv->nrg.after - eenv->nrg.before;
5748 
5749 	eenv->nrg.diff = (abs(diff) < margin) ? 0 : eenv->nrg.diff;
5750 
5751 	return eenv->nrg.diff;
5752 }
5753 
5754 #ifdef CONFIG_SCHED_TUNE
5755 
5756 struct target_nrg schedtune_target_nrg;
5757 
5758 #ifdef CONFIG_CGROUP_SCHEDTUNE
5759 extern bool schedtune_initialized;
5760 #endif /* CONFIG_CGROUP_SCHEDTUNE */
5761 
5762 /*
5763  * System energy normalization
5764  * Returns the normalized value, in the range [0..SCHED_CAPACITY_SCALE],
5765  * corresponding to the specified energy variation.
5766  */
5767 static inline int
normalize_energy(int energy_diff)5768 normalize_energy(int energy_diff)
5769 {
5770 	u32 normalized_nrg;
5771 
5772 #ifdef CONFIG_CGROUP_SCHEDTUNE
5773 	/* during early setup, we don't know the extents */
5774 	if (unlikely(!schedtune_initialized))
5775 		return energy_diff < 0 ? -1 : 1 ;
5776 #endif /* CONFIG_CGROUP_SCHEDTUNE */
5777 
5778 #ifdef CONFIG_SCHED_DEBUG
5779 	{
5780 	int max_delta;
5781 
5782 	/* Check for boundaries */
5783 	max_delta  = schedtune_target_nrg.max_power;
5784 	max_delta -= schedtune_target_nrg.min_power;
5785 	WARN_ON(abs(energy_diff) >= max_delta);
5786 	}
5787 #endif
5788 
5789 	/* Do scaling using positive numbers to increase the range */
5790 	normalized_nrg = (energy_diff < 0) ? -energy_diff : energy_diff;
5791 
5792 	/* Scale by energy magnitude */
5793 	normalized_nrg <<= SCHED_CAPACITY_SHIFT;
5794 
5795 	/* Normalize on max energy for target platform */
5796 	normalized_nrg = reciprocal_divide(
5797 			normalized_nrg, schedtune_target_nrg.rdiv);
5798 
5799 	return (energy_diff < 0) ? -normalized_nrg : normalized_nrg;
5800 }
5801 
5802 static inline int
energy_diff(struct energy_env * eenv)5803 energy_diff(struct energy_env *eenv)
5804 {
5805 	int boost = schedtune_task_boost(eenv->task);
5806 	int nrg_delta;
5807 
5808 	/* Conpute "absolute" energy diff */
5809 	__energy_diff(eenv);
5810 
5811 	/* Return energy diff when boost margin is 0 */
5812 	if (boost == 0) {
5813 		trace_sched_energy_diff(eenv->task,
5814 				eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5815 				eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5816 				eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5817 				0, -eenv->nrg.diff);
5818 		return eenv->nrg.diff;
5819 	}
5820 
5821 	/* Compute normalized energy diff */
5822 	nrg_delta = normalize_energy(eenv->nrg.diff);
5823 	eenv->nrg.delta = nrg_delta;
5824 
5825 	eenv->payoff = schedtune_accept_deltas(
5826 			eenv->nrg.delta,
5827 			eenv->cap.delta,
5828 			eenv->task);
5829 
5830 	trace_sched_energy_diff(eenv->task,
5831 			eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
5832 			eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
5833 			eenv->cap.before, eenv->cap.after, eenv->cap.delta,
5834 			eenv->nrg.delta, eenv->payoff);
5835 
5836 	/*
5837 	 * When SchedTune is enabled, the energy_diff() function will return
5838 	 * the computed energy payoff value. Since the energy_diff() return
5839 	 * value is expected to be negative by its callers, this evaluation
5840 	 * function return a negative value each time the evaluation return a
5841 	 * positive payoff, which is the condition for the acceptance of
5842 	 * a scheduling decision
5843 	 */
5844 	return -eenv->payoff;
5845 }
5846 #else /* CONFIG_SCHED_TUNE */
5847 #define energy_diff(eenv) __energy_diff(eenv)
5848 #endif
5849 
5850 /*
5851  * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
5852  * A waker of many should wake a different task than the one last awakened
5853  * at a frequency roughly N times higher than one of its wakees.  In order
5854  * to determine whether we should let the load spread vs consolodating to
5855  * shared cache, we look for a minimum 'flip' frequency of llc_size in one
5856  * partner, and a factor of lls_size higher frequency in the other.  With
5857  * both conditions met, we can be relatively sure that the relationship is
5858  * non-monogamous, with partner count exceeding socket size.  Waker/wakee
5859  * being client/server, worker/dispatcher, interrupt source or whatever is
5860  * irrelevant, spread criteria is apparent partner count exceeds socket size.
5861  */
wake_wide(struct task_struct * p,int sibling_count_hint)5862 static int wake_wide(struct task_struct *p, int sibling_count_hint)
5863 {
5864 	unsigned int master = current->wakee_flips;
5865 	unsigned int slave = p->wakee_flips;
5866 	int llc_size = this_cpu_read(sd_llc_size);
5867 
5868 	if (sibling_count_hint >= llc_size)
5869 		return 1;
5870 
5871 	if (master < slave)
5872 		swap(master, slave);
5873 	if (slave < llc_size || master < slave * llc_size)
5874 		return 0;
5875 	return 1;
5876 }
5877 
wake_affine(struct sched_domain * sd,struct task_struct * p,int prev_cpu,int sync)5878 static int wake_affine(struct sched_domain *sd, struct task_struct *p,
5879 		       int prev_cpu, int sync)
5880 {
5881 	s64 this_load, load;
5882 	s64 this_eff_load, prev_eff_load;
5883 	int idx, this_cpu;
5884 	struct task_group *tg;
5885 	unsigned long weight;
5886 	int balanced;
5887 
5888 	idx	  = sd->wake_idx;
5889 	this_cpu  = smp_processor_id();
5890 	load	  = source_load(prev_cpu, idx);
5891 	this_load = target_load(this_cpu, idx);
5892 
5893 	/*
5894 	 * If sync wakeup then subtract the (maximum possible)
5895 	 * effect of the currently running task from the load
5896 	 * of the current CPU:
5897 	 */
5898 	if (sync) {
5899 		tg = task_group(current);
5900 		weight = current->se.avg.load_avg;
5901 
5902 		this_load += effective_load(tg, this_cpu, -weight, -weight);
5903 		load += effective_load(tg, prev_cpu, 0, -weight);
5904 	}
5905 
5906 	tg = task_group(p);
5907 	weight = p->se.avg.load_avg;
5908 
5909 	/*
5910 	 * In low-load situations, where prev_cpu is idle and this_cpu is idle
5911 	 * due to the sync cause above having dropped this_load to 0, we'll
5912 	 * always have an imbalance, but there's really nothing you can do
5913 	 * about that, so that's good too.
5914 	 *
5915 	 * Otherwise check if either cpus are near enough in load to allow this
5916 	 * task to be woken on this_cpu.
5917 	 */
5918 	this_eff_load = 100;
5919 	this_eff_load *= capacity_of(prev_cpu);
5920 
5921 	prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
5922 	prev_eff_load *= capacity_of(this_cpu);
5923 
5924 	if (this_load > 0) {
5925 		this_eff_load *= this_load +
5926 			effective_load(tg, this_cpu, weight, weight);
5927 
5928 		prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
5929 	}
5930 
5931 	balanced = this_eff_load <= prev_eff_load;
5932 
5933 	schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
5934 
5935 	if (!balanced)
5936 		return 0;
5937 
5938 	schedstat_inc(sd, ttwu_move_affine);
5939 	schedstat_inc(p, se.statistics.nr_wakeups_affine);
5940 
5941 	return 1;
5942 }
5943 
task_util(struct task_struct * p)5944 static inline unsigned long task_util(struct task_struct *p)
5945 {
5946 #ifdef CONFIG_SCHED_WALT
5947 	if (!walt_disabled && sysctl_sched_use_walt_task_util) {
5948 		unsigned long demand = p->ravg.demand;
5949 		return (demand << 10) / walt_ravg_window;
5950 	}
5951 #endif
5952 	return p->se.avg.util_avg;
5953 }
5954 
5955 static inline unsigned long boosted_task_util(struct task_struct *task);
5956 
__task_fits(struct task_struct * p,int cpu,int util)5957 static inline bool __task_fits(struct task_struct *p, int cpu, int util)
5958 {
5959 	unsigned long capacity = capacity_of(cpu);
5960 
5961 	util += boosted_task_util(p);
5962 
5963 	return (capacity * 1024) > (util * capacity_margin);
5964 }
5965 
task_fits_max(struct task_struct * p,int cpu)5966 static inline bool task_fits_max(struct task_struct *p, int cpu)
5967 {
5968 	unsigned long capacity = capacity_of(cpu);
5969 	unsigned long max_capacity = cpu_rq(cpu)->rd->max_cpu_capacity.val;
5970 
5971 	if (capacity == max_capacity)
5972 		return true;
5973 
5974 	if (capacity * capacity_margin > max_capacity * 1024)
5975 		return true;
5976 
5977 	return __task_fits(p, cpu, 0);
5978 }
5979 
__cpu_overutilized(int cpu,int delta)5980 static bool __cpu_overutilized(int cpu, int delta)
5981 {
5982 	return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin);
5983 }
5984 
cpu_overutilized(int cpu)5985 static bool cpu_overutilized(int cpu)
5986 {
5987 	return __cpu_overutilized(cpu, 0);
5988 }
5989 
5990 #ifdef CONFIG_SCHED_TUNE
5991 
5992 struct reciprocal_value schedtune_spc_rdiv;
5993 
5994 static long
schedtune_margin(unsigned long signal,long boost)5995 schedtune_margin(unsigned long signal, long boost)
5996 {
5997 	long long margin = 0;
5998 
5999 	/*
6000 	 * Signal proportional compensation (SPC)
6001 	 *
6002 	 * The Boost (B) value is used to compute a Margin (M) which is
6003 	 * proportional to the complement of the original Signal (S):
6004 	 *   M = B * (SCHED_CAPACITY_SCALE - S)
6005 	 * The obtained M could be used by the caller to "boost" S.
6006 	 */
6007 	if (boost >= 0) {
6008 		margin  = SCHED_CAPACITY_SCALE - signal;
6009 		margin *= boost;
6010 	} else
6011 		margin = -signal * boost;
6012 
6013 	margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
6014 
6015 	if (boost < 0)
6016 		margin *= -1;
6017 	return margin;
6018 }
6019 
6020 static inline int
schedtune_cpu_margin(unsigned long util,int cpu)6021 schedtune_cpu_margin(unsigned long util, int cpu)
6022 {
6023 	int boost = schedtune_cpu_boost(cpu);
6024 
6025 	if (boost == 0)
6026 		return 0;
6027 
6028 	return schedtune_margin(util, boost);
6029 }
6030 
6031 static inline long
schedtune_task_margin(struct task_struct * task)6032 schedtune_task_margin(struct task_struct *task)
6033 {
6034 	int boost = schedtune_task_boost(task);
6035 	unsigned long util;
6036 	long margin;
6037 
6038 	if (boost == 0)
6039 		return 0;
6040 
6041 	util = task_util(task);
6042 	margin = schedtune_margin(util, boost);
6043 
6044 	return margin;
6045 }
6046 
6047 #else /* CONFIG_SCHED_TUNE */
6048 
6049 static inline int
schedtune_cpu_margin(unsigned long util,int cpu)6050 schedtune_cpu_margin(unsigned long util, int cpu)
6051 {
6052 	return 0;
6053 }
6054 
6055 static inline int
schedtune_task_margin(struct task_struct * task)6056 schedtune_task_margin(struct task_struct *task)
6057 {
6058 	return 0;
6059 }
6060 
6061 #endif /* CONFIG_SCHED_TUNE */
6062 
6063 unsigned long
boosted_cpu_util(int cpu)6064 boosted_cpu_util(int cpu)
6065 {
6066 	unsigned long util = cpu_util_freq(cpu);
6067 	long margin = schedtune_cpu_margin(util, cpu);
6068 
6069 	trace_sched_boost_cpu(cpu, util, margin);
6070 
6071 	return util + margin;
6072 }
6073 
6074 static inline unsigned long
boosted_task_util(struct task_struct * task)6075 boosted_task_util(struct task_struct *task)
6076 {
6077 	unsigned long util = task_util(task);
6078 	long margin = schedtune_task_margin(task);
6079 
6080 	trace_sched_boost_task(task, util, margin);
6081 
6082 	return util + margin;
6083 }
6084 
capacity_spare_wake(int cpu,struct task_struct * p)6085 static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
6086 {
6087 	return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
6088 }
6089 
6090 /*
6091  * find_idlest_group finds and returns the least busy CPU group within the
6092  * domain.
6093  *
6094  * Assumes p is allowed on at least one CPU in sd.
6095  */
6096 static struct sched_group *
find_idlest_group(struct sched_domain * sd,struct task_struct * p,int this_cpu,int sd_flag)6097 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
6098 		  int this_cpu, int sd_flag)
6099 {
6100 	struct sched_group *idlest = NULL, *group = sd->groups;
6101 	struct sched_group *most_spare_sg = NULL;
6102 	unsigned long min_load = ULONG_MAX, this_load = ULONG_MAX;
6103 	unsigned long most_spare = 0, this_spare = 0;
6104 	int load_idx = sd->forkexec_idx;
6105 	int imbalance = 100 + (sd->imbalance_pct-100)/2;
6106 
6107 	if (sd_flag & SD_BALANCE_WAKE)
6108 		load_idx = sd->wake_idx;
6109 
6110 	do {
6111 		unsigned long load, avg_load, spare_cap, max_spare_cap;
6112 		int local_group;
6113 		int i;
6114 
6115 		/* Skip over this group if it has no CPUs allowed */
6116 		if (!cpumask_intersects(sched_group_cpus(group),
6117 					tsk_cpus_allowed(p)))
6118 			continue;
6119 
6120 		local_group = cpumask_test_cpu(this_cpu,
6121 					       sched_group_cpus(group));
6122 
6123 		/*
6124 		 * Tally up the load of all CPUs in the group and find
6125 		 * the group containing the CPU with most spare capacity.
6126 		 */
6127 		avg_load = 0;
6128 		max_spare_cap = 0;
6129 
6130 		for_each_cpu(i, sched_group_cpus(group)) {
6131 			/* Bias balancing toward cpus of our domain */
6132 			if (local_group)
6133 				load = source_load(i, load_idx);
6134 			else
6135 				load = target_load(i, load_idx);
6136 
6137 			avg_load += load;
6138 
6139 			spare_cap = capacity_spare_wake(i, p);
6140 
6141 			if (spare_cap > max_spare_cap)
6142 				max_spare_cap = spare_cap;
6143 		}
6144 
6145 		/* Adjust by relative CPU capacity of the group */
6146 		avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
6147 
6148 		if (local_group) {
6149 			this_load = avg_load;
6150 			this_spare = max_spare_cap;
6151 		} else {
6152 			if (avg_load < min_load) {
6153 				min_load = avg_load;
6154 				idlest = group;
6155 			}
6156 
6157 			if (most_spare < max_spare_cap) {
6158 				most_spare = max_spare_cap;
6159 				most_spare_sg = group;
6160 			}
6161 		}
6162 	} while (group = group->next, group != sd->groups);
6163 
6164 	/*
6165 	 * The cross-over point between using spare capacity or least load
6166 	 * is too conservative for high utilization tasks on partially
6167 	 * utilized systems if we require spare_capacity > task_util(p),
6168 	 * so we allow for some task stuffing by using
6169 	 * spare_capacity > task_util(p)/2.
6170 	 *
6171 	 * Spare capacity can't be used for fork because the utilization has
6172 	 * not been set yet, we must first select a rq to compute the initial
6173 	 * utilization.
6174 	 */
6175 	if (sd_flag & SD_BALANCE_FORK)
6176 		goto skip_spare;
6177 
6178 	if (this_spare > task_util(p) / 2 &&
6179 	    imbalance*this_spare > 100*most_spare)
6180 		return NULL;
6181 	else if (most_spare > task_util(p) / 2)
6182 		return most_spare_sg;
6183 
6184 skip_spare:
6185 	if (!idlest || 100*this_load < imbalance*min_load)
6186 		return NULL;
6187 	return idlest;
6188 }
6189 
6190 /*
6191  * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
6192  */
6193 static int
find_idlest_group_cpu(struct sched_group * group,struct task_struct * p,int this_cpu)6194 find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
6195 {
6196 	unsigned long load, min_load = ULONG_MAX;
6197 	unsigned int min_exit_latency = UINT_MAX;
6198 	u64 latest_idle_timestamp = 0;
6199 	int least_loaded_cpu = this_cpu;
6200 	int shallowest_idle_cpu = -1;
6201 	int i;
6202 
6203 	/* Check if we have any choice: */
6204 	if (group->group_weight == 1)
6205 		return cpumask_first(sched_group_cpus(group));
6206 
6207 	/* Traverse only the allowed CPUs */
6208 	for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
6209 		if (idle_cpu(i)) {
6210 			struct rq *rq = cpu_rq(i);
6211 			struct cpuidle_state *idle = idle_get_state(rq);
6212 			if (idle && idle->exit_latency < min_exit_latency) {
6213 				/*
6214 				 * We give priority to a CPU whose idle state
6215 				 * has the smallest exit latency irrespective
6216 				 * of any idle timestamp.
6217 				 */
6218 				min_exit_latency = idle->exit_latency;
6219 				latest_idle_timestamp = rq->idle_stamp;
6220 				shallowest_idle_cpu = i;
6221 			} else if ((!idle || idle->exit_latency == min_exit_latency) &&
6222 				   rq->idle_stamp > latest_idle_timestamp) {
6223 				/*
6224 				 * If equal or no active idle state, then
6225 				 * the most recently idled CPU might have
6226 				 * a warmer cache.
6227 				 */
6228 				latest_idle_timestamp = rq->idle_stamp;
6229 				shallowest_idle_cpu = i;
6230 			}
6231 		} else if (shallowest_idle_cpu == -1) {
6232 			load = weighted_cpuload(i);
6233 			if (load < min_load || (load == min_load && i == this_cpu)) {
6234 				min_load = load;
6235 				least_loaded_cpu = i;
6236 			}
6237 		}
6238 	}
6239 
6240 	return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
6241  }
6242 
find_idlest_cpu(struct sched_domain * sd,struct task_struct * p,int cpu,int prev_cpu,int sd_flag)6243 static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
6244 				  int cpu, int prev_cpu, int sd_flag)
6245 {
6246 	int new_cpu = cpu;
6247 	int wu = sd_flag & SD_BALANCE_WAKE;
6248 	int cas_cpu = -1;
6249 
6250 	if (wu) {
6251 		schedstat_inc(p, se.statistics.nr_wakeups_cas_attempts);
6252 		schedstat_inc(this_rq(), eas_stats.cas_attempts);
6253 	}
6254 
6255 	if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
6256 		return prev_cpu;
6257 
6258 	while (sd) {
6259 		struct sched_group *group;
6260 		struct sched_domain *tmp;
6261 		int weight;
6262 
6263 		if (wu)
6264 			schedstat_inc(sd, eas_stats.cas_attempts);
6265 
6266 		if (!(sd->flags & sd_flag)) {
6267 			sd = sd->child;
6268 			continue;
6269 		}
6270 
6271 		group = find_idlest_group(sd, p, cpu, sd_flag);
6272 		if (!group) {
6273 			sd = sd->child;
6274 			continue;
6275 		}
6276 
6277 		new_cpu = find_idlest_group_cpu(group, p, cpu);
6278 		if (new_cpu == cpu) {
6279 			/* Now try balancing at a lower domain level of cpu */
6280 			sd = sd->child;
6281 			continue;
6282 		}
6283 
6284 		/* Now try balancing at a lower domain level of new_cpu */
6285 		cpu = cas_cpu = new_cpu;
6286 		weight = sd->span_weight;
6287 		sd = NULL;
6288 		for_each_domain(cpu, tmp) {
6289 			if (weight <= tmp->span_weight)
6290 				break;
6291 			if (tmp->flags & sd_flag)
6292 				sd = tmp;
6293 		}
6294 		/* while loop will break here if sd == NULL */
6295 	}
6296 
6297 	if (wu && (cas_cpu >= 0)) {
6298 		schedstat_inc(p, se.statistics.nr_wakeups_cas_count);
6299 		schedstat_inc(this_rq(), eas_stats.cas_count);
6300 	}
6301 
6302 	return new_cpu;
6303 }
6304 
6305 /*
6306  * Try and locate an idle CPU in the sched_domain.
6307  */
select_idle_sibling(struct task_struct * p,int prev,int target)6308 static int select_idle_sibling(struct task_struct *p, int prev, int target)
6309 {
6310 	struct sched_domain *sd;
6311 	struct sched_group *sg;
6312 	int best_idle_cpu = -1;
6313 	int best_idle_cstate = INT_MAX;
6314 	unsigned long best_idle_capacity = ULONG_MAX;
6315 
6316 	schedstat_inc(p, se.statistics.nr_wakeups_sis_attempts);
6317 	schedstat_inc(this_rq(), eas_stats.sis_attempts);
6318 
6319 	if (!sysctl_sched_cstate_aware) {
6320 		if (idle_cpu(target)) {
6321 			schedstat_inc(p, se.statistics.nr_wakeups_sis_idle);
6322 			schedstat_inc(this_rq(), eas_stats.sis_idle);
6323 			return target;
6324 		}
6325 
6326 		/*
6327 		 * If the prevous cpu is cache affine and idle, don't be stupid.
6328 		 */
6329 		if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) {
6330 			schedstat_inc(p, se.statistics.nr_wakeups_sis_cache_affine);
6331 			schedstat_inc(this_rq(), eas_stats.sis_cache_affine);
6332 			return prev;
6333 		}
6334 	}
6335 
6336 	/*
6337 	 * Otherwise, iterate the domains and find an elegible idle cpu.
6338 	 */
6339 	sd = rcu_dereference(per_cpu(sd_llc, target));
6340 	for_each_lower_domain(sd) {
6341 		sg = sd->groups;
6342 		do {
6343 			int i;
6344 			if (!cpumask_intersects(sched_group_cpus(sg),
6345 						tsk_cpus_allowed(p)))
6346 				goto next;
6347 
6348 			if (sysctl_sched_cstate_aware) {
6349 				for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
6350 					int idle_idx = idle_get_state_idx(cpu_rq(i));
6351 					unsigned long new_usage = boosted_task_util(p);
6352 					unsigned long capacity_orig = capacity_orig_of(i);
6353 
6354 					if (new_usage > capacity_orig || !idle_cpu(i))
6355 						goto next;
6356 
6357 					if (i == target && new_usage <= capacity_curr_of(target)) {
6358 						schedstat_inc(p, se.statistics.nr_wakeups_sis_suff_cap);
6359 						schedstat_inc(this_rq(), eas_stats.sis_suff_cap);
6360 						schedstat_inc(sd, eas_stats.sis_suff_cap);
6361 						return target;
6362 					}
6363 
6364 					if (idle_idx < best_idle_cstate &&
6365 					    capacity_orig <= best_idle_capacity) {
6366 						best_idle_cpu = i;
6367 						best_idle_cstate = idle_idx;
6368 						best_idle_capacity = capacity_orig;
6369 					}
6370 				}
6371 			} else {
6372 				for_each_cpu(i, sched_group_cpus(sg)) {
6373 					if (i == target || !idle_cpu(i))
6374 						goto next;
6375 				}
6376 
6377 				target = cpumask_first_and(sched_group_cpus(sg),
6378 					tsk_cpus_allowed(p));
6379 				schedstat_inc(p, se.statistics.nr_wakeups_sis_idle_cpu);
6380 				schedstat_inc(this_rq(), eas_stats.sis_idle_cpu);
6381 				schedstat_inc(sd, eas_stats.sis_idle_cpu);
6382 				goto done;
6383 			}
6384 next:
6385 			sg = sg->next;
6386 		} while (sg != sd->groups);
6387 	}
6388 
6389 	if (best_idle_cpu >= 0)
6390 		target = best_idle_cpu;
6391 
6392 done:
6393 	schedstat_inc(p, se.statistics.nr_wakeups_sis_count);
6394 	schedstat_inc(this_rq(), eas_stats.sis_count);
6395 
6396 	return target;
6397 }
6398 
6399 /*
6400  * cpu_util_wake: Compute cpu utilization with any contributions from
6401  * the waking task p removed.  check_for_migration() looks for a better CPU of
6402  * rq->curr. For that case we should return cpu util with contributions from
6403  * currently running task p removed.
6404  */
cpu_util_wake(int cpu,struct task_struct * p)6405 static int cpu_util_wake(int cpu, struct task_struct *p)
6406 {
6407 	unsigned long util, capacity;
6408 
6409 #ifdef CONFIG_SCHED_WALT
6410 	/*
6411 	 * WALT does not decay idle tasks in the same manner
6412 	 * as PELT, so it makes little sense to subtract task
6413 	 * utilization from cpu utilization. Instead just use
6414 	 * cpu_util for this case.
6415 	 */
6416 	if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
6417 	    p->state == TASK_WAKING)
6418 		return cpu_util(cpu);
6419 #endif
6420 	/* Task has no contribution or is new */
6421 	if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
6422 		return cpu_util(cpu);
6423 
6424 	capacity = capacity_orig_of(cpu);
6425 	util = max_t(long, cpu_util(cpu) - task_util(p), 0);
6426 
6427 	return (util >= capacity) ? capacity : util;
6428 }
6429 
start_cpu(bool boosted)6430 static int start_cpu(bool boosted)
6431 {
6432 	struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
6433 
6434 	return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
6435 }
6436 
find_best_target(struct task_struct * p,int * backup_cpu,bool boosted,bool prefer_idle)6437 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
6438 				   bool boosted, bool prefer_idle)
6439 {
6440 	unsigned long best_idle_min_cap_orig = ULONG_MAX;
6441 	unsigned long min_util = boosted_task_util(p);
6442 	unsigned long target_capacity = ULONG_MAX;
6443 	unsigned long min_wake_util = ULONG_MAX;
6444 	unsigned long target_max_spare_cap = 0;
6445 	unsigned long best_active_util = ULONG_MAX;
6446 	int best_idle_cstate = INT_MAX;
6447 	struct sched_domain *sd;
6448 	struct sched_group *sg;
6449 	int best_active_cpu = -1;
6450 	int best_idle_cpu = -1;
6451 	int target_cpu = -1;
6452 	int cpu, i;
6453 
6454 	*backup_cpu = -1;
6455 
6456 	schedstat_inc(p, se.statistics.nr_wakeups_fbt_attempts);
6457 	schedstat_inc(this_rq(), eas_stats.fbt_attempts);
6458 
6459 	/* Find start CPU based on boost value */
6460 	cpu = start_cpu(boosted);
6461 	if (cpu < 0) {
6462 		schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_cpu);
6463 		schedstat_inc(this_rq(), eas_stats.fbt_no_cpu);
6464 		return -1;
6465 	}
6466 
6467 	/* Find SD for the start CPU */
6468 	sd = rcu_dereference(per_cpu(sd_ea, cpu));
6469 	if (!sd) {
6470 		schedstat_inc(p, se.statistics.nr_wakeups_fbt_no_sd);
6471 		schedstat_inc(this_rq(), eas_stats.fbt_no_sd);
6472 		return -1;
6473 	}
6474 
6475 	/* Scan CPUs in all SDs */
6476 	sg = sd->groups;
6477 	do {
6478 		for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) {
6479 			unsigned long capacity_curr = capacity_curr_of(i);
6480 			unsigned long capacity_orig = capacity_orig_of(i);
6481 			unsigned long wake_util, new_util;
6482 
6483 			if (!cpu_online(i))
6484 				continue;
6485 
6486 			if (walt_cpu_high_irqload(i))
6487 				continue;
6488 
6489 			/*
6490 			 * p's blocked utilization is still accounted for on prev_cpu
6491 			 * so prev_cpu will receive a negative bias due to the double
6492 			 * accounting. However, the blocked utilization may be zero.
6493 			 */
6494 			wake_util = cpu_util_wake(i, p);
6495 			new_util = wake_util + task_util(p);
6496 
6497 			/*
6498 			 * Ensure minimum capacity to grant the required boost.
6499 			 * The target CPU can be already at a capacity level higher
6500 			 * than the one required to boost the task.
6501 			 */
6502 			new_util = max(min_util, new_util);
6503 			if (new_util > capacity_orig)
6504 				continue;
6505 
6506 			/*
6507 			 * Case A) Latency sensitive tasks
6508 			 *
6509 			 * Unconditionally favoring tasks that prefer idle CPU to
6510 			 * improve latency.
6511 			 *
6512 			 * Looking for:
6513 			 * - an idle CPU, whatever its idle_state is, since
6514 			 *   the first CPUs we explore are more likely to be
6515 			 *   reserved for latency sensitive tasks.
6516 			 * - a non idle CPU where the task fits in its current
6517 			 *   capacity and has the maximum spare capacity.
6518 			 * - a non idle CPU with lower contention from other
6519 			 *   tasks and running at the lowest possible OPP.
6520 			 *
6521 			 * The last two goals tries to favor a non idle CPU
6522 			 * where the task can run as if it is "almost alone".
6523 			 * A maximum spare capacity CPU is favoured since
6524 			 * the task already fits into that CPU's capacity
6525 			 * without waiting for an OPP chance.
6526 			 *
6527 			 * The following code path is the only one in the CPUs
6528 			 * exploration loop which is always used by
6529 			 * prefer_idle tasks. It exits the loop with wither a
6530 			 * best_active_cpu or a target_cpu which should
6531 			 * represent an optimal choice for latency sensitive
6532 			 * tasks.
6533 			 */
6534 			if (prefer_idle) {
6535 
6536 				/*
6537 				 * Case A.1: IDLE CPU
6538 				 * Return the first IDLE CPU we find.
6539 				 */
6540 				if (idle_cpu(i)) {
6541 					schedstat_inc(p, se.statistics.nr_wakeups_fbt_pref_idle);
6542 					schedstat_inc(this_rq(), eas_stats.fbt_pref_idle);
6543 
6544 					trace_sched_find_best_target(p,
6545 							prefer_idle, min_util,
6546 							cpu, best_idle_cpu,
6547 							best_active_cpu, i);
6548 
6549 					return i;
6550 				}
6551 
6552 				/*
6553 				 * Case A.2: Target ACTIVE CPU
6554 				 * Favor CPUs with max spare capacity.
6555 				 */
6556 				if ((capacity_curr > new_util) &&
6557 					(capacity_orig - new_util > target_max_spare_cap)) {
6558 					target_max_spare_cap = capacity_orig - new_util;
6559 					target_cpu = i;
6560 					continue;
6561 				}
6562 				if (target_cpu != -1)
6563 					continue;
6564 
6565 
6566 				/*
6567 				 * Case A.3: Backup ACTIVE CPU
6568 				 * Favor CPUs with:
6569 				 * - lower utilization due to other tasks
6570 				 * - lower utilization with the task in
6571 				 */
6572 				if (wake_util > min_wake_util)
6573 					continue;
6574 				if (new_util > best_active_util)
6575 					continue;
6576 				min_wake_util = wake_util;
6577 				best_active_util = new_util;
6578 				best_active_cpu = i;
6579 				continue;
6580 			}
6581 
6582 			/*
6583 			 * Enforce EAS mode
6584 			 *
6585 			 * For non latency sensitive tasks, skip CPUs that
6586 			 * will be overutilized by moving the task there.
6587 			 *
6588 			 * The goal here is to remain in EAS mode as long as
6589 			 * possible at least for !prefer_idle tasks.
6590 			 */
6591 			if ((new_util * capacity_margin) >
6592 			    (capacity_orig * SCHED_CAPACITY_SCALE))
6593 				continue;
6594 
6595 			/*
6596 			 * Case B) Non latency sensitive tasks on IDLE CPUs.
6597 			 *
6598 			 * Find an optimal backup IDLE CPU for non latency
6599 			 * sensitive tasks.
6600 			 *
6601 			 * Looking for:
6602 			 * - minimizing the capacity_orig,
6603 			 *   i.e. preferring LITTLE CPUs
6604 			 * - favoring shallowest idle states
6605 			 *   i.e. avoid to wakeup deep-idle CPUs
6606 			 *
6607 			 * The following code path is used by non latency
6608 			 * sensitive tasks if IDLE CPUs are available. If at
6609 			 * least one of such CPUs are available it sets the
6610 			 * best_idle_cpu to the most suitable idle CPU to be
6611 			 * selected.
6612 			 *
6613 			 * If idle CPUs are available, favour these CPUs to
6614 			 * improve performances by spreading tasks.
6615 			 * Indeed, the energy_diff() computed by the caller
6616 			 * will take care to ensure the minimization of energy
6617 			 * consumptions without affecting performance.
6618 			 */
6619 			if (idle_cpu(i)) {
6620 				int idle_idx = idle_get_state_idx(cpu_rq(i));
6621 
6622 				/* Select idle CPU with lower cap_orig */
6623 				if (capacity_orig > best_idle_min_cap_orig)
6624 					continue;
6625 
6626 				/*
6627 				 * Skip CPUs in deeper idle state, but only
6628 				 * if they are also less energy efficient.
6629 				 * IOW, prefer a deep IDLE LITTLE CPU vs a
6630 				 * shallow idle big CPU.
6631 				 */
6632 				if (sysctl_sched_cstate_aware &&
6633 				    best_idle_cstate <= idle_idx)
6634 					continue;
6635 
6636 				/* Keep track of best idle CPU */
6637 				best_idle_min_cap_orig = capacity_orig;
6638 				best_idle_cstate = idle_idx;
6639 				best_idle_cpu = i;
6640 				continue;
6641 			}
6642 
6643 			/*
6644 			 * Case C) Non latency sensitive tasks on ACTIVE CPUs.
6645 			 *
6646 			 * Pack tasks in the most energy efficient capacities.
6647 			 *
6648 			 * This task packing strategy prefers more energy
6649 			 * efficient CPUs (i.e. pack on smaller maximum
6650 			 * capacity CPUs) while also trying to spread tasks to
6651 			 * run them all at the lower OPP.
6652 			 *
6653 			 * This assumes for example that it's more energy
6654 			 * efficient to run two tasks on two CPUs at a lower
6655 			 * OPP than packing both on a single CPU but running
6656 			 * that CPU at an higher OPP.
6657 			 *
6658 			 * Thus, this case keep track of the CPU with the
6659 			 * smallest maximum capacity and highest spare maximum
6660 			 * capacity.
6661 			 */
6662 
6663 			/* Favor CPUs with smaller capacity */
6664 			if (capacity_orig > target_capacity)
6665 				continue;
6666 
6667 			/* Favor CPUs with maximum spare capacity */
6668 			if ((capacity_orig - new_util) < target_max_spare_cap)
6669 				continue;
6670 
6671 			target_max_spare_cap = capacity_orig - new_util;
6672 			target_capacity = capacity_orig;
6673 			target_cpu = i;
6674 		}
6675 
6676 	} while (sg = sg->next, sg != sd->groups);
6677 
6678 	/*
6679 	 * For non latency sensitive tasks, cases B and C in the previous loop,
6680 	 * we pick the best IDLE CPU only if we was not able to find a target
6681 	 * ACTIVE CPU.
6682 	 *
6683 	 * Policies priorities:
6684 	 *
6685 	 * - prefer_idle tasks:
6686 	 *
6687 	 *   a) IDLE CPU available, we return immediately
6688 	 *   b) ACTIVE CPU where task fits and has the bigger maximum spare
6689 	 *      capacity (i.e. target_cpu)
6690 	 *   c) ACTIVE CPU with less contention due to other tasks
6691 	 *      (i.e. best_active_cpu)
6692 	 *
6693 	 * - NON prefer_idle tasks:
6694 	 *
6695 	 *   a) ACTIVE CPU: target_cpu
6696 	 *   b) IDLE CPU: best_idle_cpu
6697 	 */
6698 	if (target_cpu == -1)
6699 		target_cpu = prefer_idle
6700 			? best_active_cpu
6701 			: best_idle_cpu;
6702 	else
6703 		*backup_cpu = prefer_idle
6704 		? best_active_cpu
6705 		: best_idle_cpu;
6706 
6707 	trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
6708 				     best_idle_cpu, best_active_cpu,
6709 				     target_cpu);
6710 
6711 	schedstat_inc(p, se.statistics.nr_wakeups_fbt_count);
6712 	schedstat_inc(this_rq(), eas_stats.fbt_count);
6713 
6714 	return target_cpu;
6715 }
6716 
6717 /*
6718  * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
6719  * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
6720  *
6721  * In that case WAKE_AFFINE doesn't make sense and we'll let
6722  * BALANCE_WAKE sort things out.
6723  */
wake_cap(struct task_struct * p,int cpu,int prev_cpu)6724 static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
6725 {
6726 	long min_cap, max_cap;
6727 
6728 	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
6729 	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
6730 
6731 	/* Minimum capacity is close to max, no need to abort wake_affine */
6732 	if (max_cap - min_cap < max_cap >> 3)
6733 		return 0;
6734 
6735 	/* Bring task utilization in sync with prev_cpu */
6736 	sync_entity_load_avg(&p->se);
6737 
6738 	return min_cap * 1024 < task_util(p) * capacity_margin;
6739 }
6740 
select_energy_cpu_brute(struct task_struct * p,int prev_cpu,int sync)6741 static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync)
6742 {
6743 	struct sched_domain *sd;
6744 	int target_cpu = prev_cpu, tmp_target, tmp_backup;
6745 	bool boosted, prefer_idle;
6746 
6747 	schedstat_inc(p, se.statistics.nr_wakeups_secb_attempts);
6748 	schedstat_inc(this_rq(), eas_stats.secb_attempts);
6749 
6750 	if (sysctl_sched_sync_hint_enable && sync) {
6751 		int cpu = smp_processor_id();
6752 
6753 		if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
6754 			schedstat_inc(p, se.statistics.nr_wakeups_secb_sync);
6755 			schedstat_inc(this_rq(), eas_stats.secb_sync);
6756 			return cpu;
6757 		}
6758 	}
6759 
6760 	rcu_read_lock();
6761 #ifdef CONFIG_CGROUP_SCHEDTUNE
6762 	boosted = schedtune_task_boost(p) > 0;
6763 	prefer_idle = schedtune_prefer_idle(p) > 0;
6764 #else
6765 	boosted = get_sysctl_sched_cfs_boost() > 0;
6766 	prefer_idle = 0;
6767 #endif
6768 
6769 	sync_entity_load_avg(&p->se);
6770 
6771 	sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
6772 	/* Find a cpu with sufficient capacity */
6773 	tmp_target = find_best_target(p, &tmp_backup, boosted, prefer_idle);
6774 
6775 	if (!sd)
6776 		goto unlock;
6777 	if (tmp_target >= 0) {
6778 		target_cpu = tmp_target;
6779 		if ((boosted || prefer_idle) && idle_cpu(target_cpu)) {
6780 			schedstat_inc(p, se.statistics.nr_wakeups_secb_idle_bt);
6781 			schedstat_inc(this_rq(), eas_stats.secb_idle_bt);
6782 			goto unlock;
6783 		}
6784 	}
6785 
6786 	if (target_cpu != prev_cpu) {
6787 		int delta = 0;
6788 		struct energy_env eenv = {
6789 			.util_delta     = task_util(p),
6790 			.src_cpu        = prev_cpu,
6791 			.dst_cpu        = target_cpu,
6792 			.task           = p,
6793 			.trg_cpu	= target_cpu,
6794 		};
6795 
6796 
6797 #ifdef CONFIG_SCHED_WALT
6798 		if (!walt_disabled && sysctl_sched_use_walt_cpu_util &&
6799 			p->state == TASK_WAKING)
6800 			delta = task_util(p);
6801 #endif
6802 		/* Not enough spare capacity on previous cpu */
6803 		if (__cpu_overutilized(prev_cpu, delta)) {
6804 			schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap);
6805 			schedstat_inc(this_rq(), eas_stats.secb_insuff_cap);
6806 			goto unlock;
6807 		}
6808 
6809 		if (energy_diff(&eenv) >= 0) {
6810 			/* No energy saving for target_cpu, try backup */
6811 			target_cpu = tmp_backup;
6812 			eenv.dst_cpu = target_cpu;
6813 			eenv.trg_cpu = target_cpu;
6814 			if (tmp_backup < 0 ||
6815 			    tmp_backup == prev_cpu ||
6816 			    energy_diff(&eenv) >= 0) {
6817 				schedstat_inc(p, se.statistics.nr_wakeups_secb_no_nrg_sav);
6818 				schedstat_inc(this_rq(), eas_stats.secb_no_nrg_sav);
6819 				target_cpu = prev_cpu;
6820 				goto unlock;
6821 			}
6822 		}
6823 
6824 		schedstat_inc(p, se.statistics.nr_wakeups_secb_nrg_sav);
6825 		schedstat_inc(this_rq(), eas_stats.secb_nrg_sav);
6826 		goto unlock;
6827 	}
6828 
6829 	schedstat_inc(p, se.statistics.nr_wakeups_secb_count);
6830 	schedstat_inc(this_rq(), eas_stats.secb_count);
6831 
6832 unlock:
6833 	rcu_read_unlock();
6834 
6835 	return target_cpu;
6836 }
6837 
6838 /*
6839  * select_task_rq_fair: Select target runqueue for the waking task in domains
6840  * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
6841  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
6842  *
6843  * Balances load by selecting the idlest cpu in the idlest group, or under
6844  * certain conditions an idle sibling cpu if the domain has SD_WAKE_AFFINE set.
6845  *
6846  * Returns the target cpu number.
6847  *
6848  * preempt must be disabled.
6849  */
6850 static int
select_task_rq_fair(struct task_struct * p,int prev_cpu,int sd_flag,int wake_flags,int sibling_count_hint)6851 select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
6852 		    int sibling_count_hint)
6853 {
6854 	struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
6855 	int cpu = smp_processor_id();
6856 	int new_cpu = prev_cpu;
6857 	int want_affine = 0;
6858 	int sync = wake_flags & WF_SYNC;
6859 
6860 	if (sd_flag & SD_BALANCE_WAKE) {
6861 		record_wakee(p);
6862 		want_affine = !wake_wide(p, sibling_count_hint) &&
6863 			      !wake_cap(p, cpu, prev_cpu) &&
6864 			      cpumask_test_cpu(cpu, &p->cpus_allowed);
6865 	}
6866 
6867 	if (energy_aware() && !(cpu_rq(prev_cpu)->rd->overutilized))
6868 		return select_energy_cpu_brute(p, prev_cpu, sync);
6869 
6870 	rcu_read_lock();
6871 	for_each_domain(cpu, tmp) {
6872 		if (!(tmp->flags & SD_LOAD_BALANCE))
6873 			break;
6874 
6875 		/*
6876 		 * If both cpu and prev_cpu are part of this domain,
6877 		 * cpu is a valid SD_WAKE_AFFINE target.
6878 		 */
6879 		if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
6880 		    cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
6881 			affine_sd = tmp;
6882 			break;
6883 		}
6884 
6885 		if (tmp->flags & sd_flag)
6886 			sd = tmp;
6887 		else if (!want_affine)
6888 			break;
6889 	}
6890 
6891 	if (affine_sd) {
6892 		sd = NULL; /* Prefer wake_affine over balance flags */
6893 		if (cpu != prev_cpu && wake_affine(affine_sd, p, prev_cpu, sync))
6894 			new_cpu = cpu;
6895 	}
6896 
6897 	if (sd && !(sd_flag & SD_BALANCE_FORK)) {
6898 		/*
6899 		 * We're going to need the task's util for capacity_spare_wake
6900 		 * in find_idlest_group. Sync it up to prev_cpu's
6901 		 * last_update_time.
6902 		 */
6903 		sync_entity_load_avg(&p->se);
6904 	}
6905 
6906 	if (!sd) {
6907 		if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
6908 			new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
6909 
6910 	} else {
6911 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
6912 	}
6913 	rcu_read_unlock();
6914 
6915 	return new_cpu;
6916 }
6917 
6918 /*
6919  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
6920  * cfs_rq_of(p) references at time of call are still valid and identify the
6921  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
6922  * other assumptions, including the state of rq->lock, should be made.
6923  */
migrate_task_rq_fair(struct task_struct * p)6924 static void migrate_task_rq_fair(struct task_struct *p)
6925 {
6926 	/*
6927 	 * We are supposed to update the task to "current" time, then its up to date
6928 	 * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
6929 	 * what current time is, so simply throw away the out-of-date time. This
6930 	 * will result in the wakee task is less decayed, but giving the wakee more
6931 	 * load sounds not bad.
6932 	 */
6933 	remove_entity_load_avg(&p->se);
6934 
6935 	/* Tell new CPU we are migrated */
6936 	p->se.avg.last_update_time = 0;
6937 
6938 	/* We have migrated, no longer consider this task hot */
6939 	p->se.exec_start = 0;
6940 }
6941 
task_dead_fair(struct task_struct * p)6942 static void task_dead_fair(struct task_struct *p)
6943 {
6944 	remove_entity_load_avg(&p->se);
6945 }
6946 #else
6947 #define task_fits_max(p, cpu) true
6948 #endif /* CONFIG_SMP */
6949 
6950 static unsigned long
wakeup_gran(struct sched_entity * curr,struct sched_entity * se)6951 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
6952 {
6953 	unsigned long gran = sysctl_sched_wakeup_granularity;
6954 
6955 	/*
6956 	 * Since its curr running now, convert the gran from real-time
6957 	 * to virtual-time in his units.
6958 	 *
6959 	 * By using 'se' instead of 'curr' we penalize light tasks, so
6960 	 * they get preempted easier. That is, if 'se' < 'curr' then
6961 	 * the resulting gran will be larger, therefore penalizing the
6962 	 * lighter, if otoh 'se' > 'curr' then the resulting gran will
6963 	 * be smaller, again penalizing the lighter task.
6964 	 *
6965 	 * This is especially important for buddies when the leftmost
6966 	 * task is higher priority than the buddy.
6967 	 */
6968 	return calc_delta_fair(gran, se);
6969 }
6970 
6971 /*
6972  * Should 'se' preempt 'curr'.
6973  *
6974  *             |s1
6975  *        |s2
6976  *   |s3
6977  *         g
6978  *      |<--->|c
6979  *
6980  *  w(c, s1) = -1
6981  *  w(c, s2) =  0
6982  *  w(c, s3) =  1
6983  *
6984  */
6985 static int
wakeup_preempt_entity(struct sched_entity * curr,struct sched_entity * se)6986 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
6987 {
6988 	s64 gran, vdiff = curr->vruntime - se->vruntime;
6989 
6990 	if (vdiff <= 0)
6991 		return -1;
6992 
6993 	gran = wakeup_gran(curr, se);
6994 	if (vdiff > gran)
6995 		return 1;
6996 
6997 	return 0;
6998 }
6999 
set_last_buddy(struct sched_entity * se)7000 static void set_last_buddy(struct sched_entity *se)
7001 {
7002 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7003 		return;
7004 
7005 	for_each_sched_entity(se)
7006 		cfs_rq_of(se)->last = se;
7007 }
7008 
set_next_buddy(struct sched_entity * se)7009 static void set_next_buddy(struct sched_entity *se)
7010 {
7011 	if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
7012 		return;
7013 
7014 	for_each_sched_entity(se)
7015 		cfs_rq_of(se)->next = se;
7016 }
7017 
set_skip_buddy(struct sched_entity * se)7018 static void set_skip_buddy(struct sched_entity *se)
7019 {
7020 	for_each_sched_entity(se)
7021 		cfs_rq_of(se)->skip = se;
7022 }
7023 
7024 /*
7025  * Preempt the current task with a newly woken task if needed:
7026  */
check_preempt_wakeup(struct rq * rq,struct task_struct * p,int wake_flags)7027 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
7028 {
7029 	struct task_struct *curr = rq->curr;
7030 	struct sched_entity *se = &curr->se, *pse = &p->se;
7031 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7032 	int scale = cfs_rq->nr_running >= sched_nr_latency;
7033 	int next_buddy_marked = 0;
7034 
7035 	if (unlikely(se == pse))
7036 		return;
7037 
7038 	/*
7039 	 * This is possible from callers such as attach_tasks(), in which we
7040 	 * unconditionally check_prempt_curr() after an enqueue (which may have
7041 	 * lead to a throttle).  This both saves work and prevents false
7042 	 * next-buddy nomination below.
7043 	 */
7044 	if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
7045 		return;
7046 
7047 	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
7048 		set_next_buddy(pse);
7049 		next_buddy_marked = 1;
7050 	}
7051 
7052 	/*
7053 	 * We can come here with TIF_NEED_RESCHED already set from new task
7054 	 * wake up path.
7055 	 *
7056 	 * Note: this also catches the edge-case of curr being in a throttled
7057 	 * group (e.g. via set_curr_task), since update_curr() (in the
7058 	 * enqueue of curr) will have resulted in resched being set.  This
7059 	 * prevents us from potentially nominating it as a false LAST_BUDDY
7060 	 * below.
7061 	 */
7062 	if (test_tsk_need_resched(curr))
7063 		return;
7064 
7065 	/* Idle tasks are by definition preempted by non-idle tasks. */
7066 	if (unlikely(curr->policy == SCHED_IDLE) &&
7067 	    likely(p->policy != SCHED_IDLE))
7068 		goto preempt;
7069 
7070 	/*
7071 	 * Batch and idle tasks do not preempt non-idle tasks (their preemption
7072 	 * is driven by the tick):
7073 	 */
7074 	if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
7075 		return;
7076 
7077 	find_matching_se(&se, &pse);
7078 	update_curr(cfs_rq_of(se));
7079 	BUG_ON(!pse);
7080 	if (wakeup_preempt_entity(se, pse) == 1) {
7081 		/*
7082 		 * Bias pick_next to pick the sched entity that is
7083 		 * triggering this preemption.
7084 		 */
7085 		if (!next_buddy_marked)
7086 			set_next_buddy(pse);
7087 		goto preempt;
7088 	}
7089 
7090 	return;
7091 
7092 preempt:
7093 	resched_curr(rq);
7094 	/*
7095 	 * Only set the backward buddy when the current task is still
7096 	 * on the rq. This can happen when a wakeup gets interleaved
7097 	 * with schedule on the ->pre_schedule() or idle_balance()
7098 	 * point, either of which can * drop the rq lock.
7099 	 *
7100 	 * Also, during early boot the idle thread is in the fair class,
7101 	 * for obvious reasons its a bad idea to schedule back to it.
7102 	 */
7103 	if (unlikely(!se->on_rq || curr == rq->idle))
7104 		return;
7105 
7106 	if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
7107 		set_last_buddy(se);
7108 }
7109 
7110 static struct task_struct *
pick_next_task_fair(struct rq * rq,struct task_struct * prev)7111 pick_next_task_fair(struct rq *rq, struct task_struct *prev)
7112 {
7113 	struct cfs_rq *cfs_rq = &rq->cfs;
7114 	struct sched_entity *se;
7115 	struct task_struct *p;
7116 	int new_tasks;
7117 
7118 again:
7119 #ifdef CONFIG_FAIR_GROUP_SCHED
7120 	if (!cfs_rq->nr_running)
7121 		goto idle;
7122 
7123 	if (prev->sched_class != &fair_sched_class)
7124 		goto simple;
7125 
7126 	/*
7127 	 * Because of the set_next_buddy() in dequeue_task_fair() it is rather
7128 	 * likely that a next task is from the same cgroup as the current.
7129 	 *
7130 	 * Therefore attempt to avoid putting and setting the entire cgroup
7131 	 * hierarchy, only change the part that actually changes.
7132 	 */
7133 
7134 	do {
7135 		struct sched_entity *curr = cfs_rq->curr;
7136 
7137 		/*
7138 		 * Since we got here without doing put_prev_entity() we also
7139 		 * have to consider cfs_rq->curr. If it is still a runnable
7140 		 * entity, update_curr() will update its vruntime, otherwise
7141 		 * forget we've ever seen it.
7142 		 */
7143 		if (curr) {
7144 			if (curr->on_rq)
7145 				update_curr(cfs_rq);
7146 			else
7147 				curr = NULL;
7148 
7149 			/*
7150 			 * This call to check_cfs_rq_runtime() will do the
7151 			 * throttle and dequeue its entity in the parent(s).
7152 			 * Therefore the 'simple' nr_running test will indeed
7153 			 * be correct.
7154 			 */
7155 			if (unlikely(check_cfs_rq_runtime(cfs_rq)))
7156 				goto simple;
7157 		}
7158 
7159 		se = pick_next_entity(cfs_rq, curr);
7160 		cfs_rq = group_cfs_rq(se);
7161 	} while (cfs_rq);
7162 
7163 	p = task_of(se);
7164 
7165 	/*
7166 	 * Since we haven't yet done put_prev_entity and if the selected task
7167 	 * is a different task than we started out with, try and touch the
7168 	 * least amount of cfs_rqs.
7169 	 */
7170 	if (prev != p) {
7171 		struct sched_entity *pse = &prev->se;
7172 
7173 		while (!(cfs_rq = is_same_group(se, pse))) {
7174 			int se_depth = se->depth;
7175 			int pse_depth = pse->depth;
7176 
7177 			if (se_depth <= pse_depth) {
7178 				put_prev_entity(cfs_rq_of(pse), pse);
7179 				pse = parent_entity(pse);
7180 			}
7181 			if (se_depth >= pse_depth) {
7182 				set_next_entity(cfs_rq_of(se), se);
7183 				se = parent_entity(se);
7184 			}
7185 		}
7186 
7187 		put_prev_entity(cfs_rq, pse);
7188 		set_next_entity(cfs_rq, se);
7189 	}
7190 
7191 	if (hrtick_enabled(rq))
7192 		hrtick_start_fair(rq, p);
7193 
7194 	rq->misfit_task = !task_fits_max(p, rq->cpu);
7195 
7196 	return p;
7197 simple:
7198 	cfs_rq = &rq->cfs;
7199 #endif
7200 
7201 	if (!cfs_rq->nr_running)
7202 		goto idle;
7203 
7204 	put_prev_task(rq, prev);
7205 
7206 	do {
7207 		se = pick_next_entity(cfs_rq, NULL);
7208 		set_next_entity(cfs_rq, se);
7209 		cfs_rq = group_cfs_rq(se);
7210 	} while (cfs_rq);
7211 
7212 	p = task_of(se);
7213 
7214 	if (hrtick_enabled(rq))
7215 		hrtick_start_fair(rq, p);
7216 
7217 	rq->misfit_task = !task_fits_max(p, rq->cpu);
7218 
7219 	return p;
7220 
7221 idle:
7222 	rq->misfit_task = 0;
7223 	/*
7224 	 * This is OK, because current is on_cpu, which avoids it being picked
7225 	 * for load-balance and preemption/IRQs are still disabled avoiding
7226 	 * further scheduler activity on it and we're being very careful to
7227 	 * re-start the picking loop.
7228 	 */
7229 	lockdep_unpin_lock(&rq->lock);
7230 	new_tasks = idle_balance(rq);
7231 	lockdep_pin_lock(&rq->lock);
7232 	/*
7233 	 * Because idle_balance() releases (and re-acquires) rq->lock, it is
7234 	 * possible for any higher priority task to appear. In that case we
7235 	 * must re-start the pick_next_entity() loop.
7236 	 */
7237 	if (new_tasks < 0)
7238 		return RETRY_TASK;
7239 
7240 	if (new_tasks > 0)
7241 		goto again;
7242 
7243 	return NULL;
7244 }
7245 
7246 /*
7247  * Account for a descheduled task:
7248  */
put_prev_task_fair(struct rq * rq,struct task_struct * prev)7249 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
7250 {
7251 	struct sched_entity *se = &prev->se;
7252 	struct cfs_rq *cfs_rq;
7253 
7254 	for_each_sched_entity(se) {
7255 		cfs_rq = cfs_rq_of(se);
7256 		put_prev_entity(cfs_rq, se);
7257 	}
7258 }
7259 
7260 /*
7261  * sched_yield() is very simple
7262  *
7263  * The magic of dealing with the ->skip buddy is in pick_next_entity.
7264  */
yield_task_fair(struct rq * rq)7265 static void yield_task_fair(struct rq *rq)
7266 {
7267 	struct task_struct *curr = rq->curr;
7268 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
7269 	struct sched_entity *se = &curr->se;
7270 
7271 	/*
7272 	 * Are we the only task in the tree?
7273 	 */
7274 	if (unlikely(rq->nr_running == 1))
7275 		return;
7276 
7277 	clear_buddies(cfs_rq, se);
7278 
7279 	if (curr->policy != SCHED_BATCH) {
7280 		update_rq_clock(rq);
7281 		/*
7282 		 * Update run-time statistics of the 'current'.
7283 		 */
7284 		update_curr(cfs_rq);
7285 		/*
7286 		 * Tell update_rq_clock() that we've just updated,
7287 		 * so we don't do microscopic update in schedule()
7288 		 * and double the fastpath cost.
7289 		 */
7290 		rq_clock_skip_update(rq, true);
7291 	}
7292 
7293 	set_skip_buddy(se);
7294 }
7295 
yield_to_task_fair(struct rq * rq,struct task_struct * p,bool preempt)7296 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
7297 {
7298 	struct sched_entity *se = &p->se;
7299 
7300 	/* throttled hierarchies are not runnable */
7301 	if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
7302 		return false;
7303 
7304 	/* Tell the scheduler that we'd really like pse to run next. */
7305 	set_next_buddy(se);
7306 
7307 	yield_task_fair(rq);
7308 
7309 	return true;
7310 }
7311 
7312 #ifdef CONFIG_SMP
7313 /**************************************************
7314  * Fair scheduling class load-balancing methods.
7315  *
7316  * BASICS
7317  *
7318  * The purpose of load-balancing is to achieve the same basic fairness the
7319  * per-cpu scheduler provides, namely provide a proportional amount of compute
7320  * time to each task. This is expressed in the following equation:
7321  *
7322  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
7323  *
7324  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
7325  * W_i,0 is defined as:
7326  *
7327  *   W_i,0 = \Sum_j w_i,j                                             (2)
7328  *
7329  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
7330  * is derived from the nice value as per prio_to_weight[].
7331  *
7332  * The weight average is an exponential decay average of the instantaneous
7333  * weight:
7334  *
7335  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
7336  *
7337  * C_i is the compute capacity of cpu i, typically it is the
7338  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
7339  * can also include other factors [XXX].
7340  *
7341  * To achieve this balance we define a measure of imbalance which follows
7342  * directly from (1):
7343  *
7344  *   imb_i,j = max{ avg(W/C), W_i/C_i } - min{ avg(W/C), W_j/C_j }    (4)
7345  *
7346  * We them move tasks around to minimize the imbalance. In the continuous
7347  * function space it is obvious this converges, in the discrete case we get
7348  * a few fun cases generally called infeasible weight scenarios.
7349  *
7350  * [XXX expand on:
7351  *     - infeasible weights;
7352  *     - local vs global optima in the discrete case. ]
7353  *
7354  *
7355  * SCHED DOMAINS
7356  *
7357  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
7358  * for all i,j solution, we create a tree of cpus that follows the hardware
7359  * topology where each level pairs two lower groups (or better). This results
7360  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
7361  * tree to only the first of the previous level and we decrease the frequency
7362  * of load-balance at each level inv. proportional to the number of cpus in
7363  * the groups.
7364  *
7365  * This yields:
7366  *
7367  *     log_2 n     1     n
7368  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
7369  *     i = 0      2^i   2^i
7370  *                               `- size of each group
7371  *         |         |     `- number of cpus doing load-balance
7372  *         |         `- freq
7373  *         `- sum over all levels
7374  *
7375  * Coupled with a limit on how many tasks we can migrate every balance pass,
7376  * this makes (5) the runtime complexity of the balancer.
7377  *
7378  * An important property here is that each CPU is still (indirectly) connected
7379  * to every other cpu in at most O(log n) steps:
7380  *
7381  * The adjacency matrix of the resulting graph is given by:
7382  *
7383  *             log_2 n
7384  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
7385  *             k = 0
7386  *
7387  * And you'll find that:
7388  *
7389  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
7390  *
7391  * Showing there's indeed a path between every cpu in at most O(log n) steps.
7392  * The task movement gives a factor of O(m), giving a convergence complexity
7393  * of:
7394  *
7395  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
7396  *
7397  *
7398  * WORK CONSERVING
7399  *
7400  * In order to avoid CPUs going idle while there's still work to do, new idle
7401  * balancing is more aggressive and has the newly idle cpu iterate up the domain
7402  * tree itself instead of relying on other CPUs to bring it work.
7403  *
7404  * This adds some complexity to both (5) and (8) but it reduces the total idle
7405  * time.
7406  *
7407  * [XXX more?]
7408  *
7409  *
7410  * CGROUPS
7411  *
7412  * Cgroups make a horror show out of (2), instead of a simple sum we get:
7413  *
7414  *                                s_k,i
7415  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
7416  *                                 S_k
7417  *
7418  * Where
7419  *
7420  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
7421  *
7422  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
7423  *
7424  * The big problem is S_k, its a global sum needed to compute a local (W_i)
7425  * property.
7426  *
7427  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
7428  *      rewrite all of this once again.]
7429  */
7430 
7431 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
7432 
7433 enum fbq_type { regular, remote, all };
7434 
7435 enum group_type {
7436 	group_other = 0,
7437 	group_misfit_task,
7438 	group_imbalanced,
7439 	group_overloaded,
7440 };
7441 
7442 #define LBF_ALL_PINNED	0x01
7443 #define LBF_NEED_BREAK	0x02
7444 #define LBF_DST_PINNED  0x04
7445 #define LBF_SOME_PINNED	0x08
7446 
7447 struct lb_env {
7448 	struct sched_domain	*sd;
7449 
7450 	struct rq		*src_rq;
7451 	int			src_cpu;
7452 
7453 	int			dst_cpu;
7454 	struct rq		*dst_rq;
7455 
7456 	struct cpumask		*dst_grpmask;
7457 	int			new_dst_cpu;
7458 	enum cpu_idle_type	idle;
7459 	long			imbalance;
7460 	unsigned int		src_grp_nr_running;
7461 	/* The set of CPUs under consideration for load-balancing */
7462 	struct cpumask		*cpus;
7463 
7464 	unsigned int		flags;
7465 
7466 	unsigned int		loop;
7467 	unsigned int		loop_break;
7468 	unsigned int		loop_max;
7469 
7470 	enum fbq_type		fbq_type;
7471 	enum group_type		busiest_group_type;
7472 	struct list_head	tasks;
7473 };
7474 
7475 /*
7476  * Is this task likely cache-hot:
7477  */
task_hot(struct task_struct * p,struct lb_env * env)7478 static int task_hot(struct task_struct *p, struct lb_env *env)
7479 {
7480 	s64 delta;
7481 
7482 	lockdep_assert_held(&env->src_rq->lock);
7483 
7484 	if (p->sched_class != &fair_sched_class)
7485 		return 0;
7486 
7487 	if (unlikely(p->policy == SCHED_IDLE))
7488 		return 0;
7489 
7490 	/*
7491 	 * Buddy candidates are cache hot:
7492 	 */
7493 	if (sched_feat(CACHE_HOT_BUDDY) && env->dst_rq->nr_running &&
7494 			(&p->se == cfs_rq_of(&p->se)->next ||
7495 			 &p->se == cfs_rq_of(&p->se)->last))
7496 		return 1;
7497 
7498 	if (sysctl_sched_migration_cost == -1)
7499 		return 1;
7500 	if (sysctl_sched_migration_cost == 0)
7501 		return 0;
7502 
7503 	delta = rq_clock_task(env->src_rq) - p->se.exec_start;
7504 
7505 	return delta < (s64)sysctl_sched_migration_cost;
7506 }
7507 
7508 #ifdef CONFIG_NUMA_BALANCING
7509 /*
7510  * Returns 1, if task migration degrades locality
7511  * Returns 0, if task migration improves locality i.e migration preferred.
7512  * Returns -1, if task migration is not affected by locality.
7513  */
migrate_degrades_locality(struct task_struct * p,struct lb_env * env)7514 static int migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
7515 {
7516 	struct numa_group *numa_group = rcu_dereference(p->numa_group);
7517 	unsigned long src_faults, dst_faults;
7518 	int src_nid, dst_nid;
7519 
7520 	if (!static_branch_likely(&sched_numa_balancing))
7521 		return -1;
7522 
7523 	if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
7524 		return -1;
7525 
7526 	src_nid = cpu_to_node(env->src_cpu);
7527 	dst_nid = cpu_to_node(env->dst_cpu);
7528 
7529 	if (src_nid == dst_nid)
7530 		return -1;
7531 
7532 	/* Migrating away from the preferred node is always bad. */
7533 	if (src_nid == p->numa_preferred_nid) {
7534 		if (env->src_rq->nr_running > env->src_rq->nr_preferred_running)
7535 			return 1;
7536 		else
7537 			return -1;
7538 	}
7539 
7540 	/* Encourage migration to the preferred node. */
7541 	if (dst_nid == p->numa_preferred_nid)
7542 		return 0;
7543 
7544 	if (numa_group) {
7545 		src_faults = group_faults(p, src_nid);
7546 		dst_faults = group_faults(p, dst_nid);
7547 	} else {
7548 		src_faults = task_faults(p, src_nid);
7549 		dst_faults = task_faults(p, dst_nid);
7550 	}
7551 
7552 	return dst_faults < src_faults;
7553 }
7554 
7555 #else
migrate_degrades_locality(struct task_struct * p,struct lb_env * env)7556 static inline int migrate_degrades_locality(struct task_struct *p,
7557 					     struct lb_env *env)
7558 {
7559 	return -1;
7560 }
7561 #endif
7562 
7563 /*
7564  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
7565  */
7566 static
can_migrate_task(struct task_struct * p,struct lb_env * env)7567 int can_migrate_task(struct task_struct *p, struct lb_env *env)
7568 {
7569 	int tsk_cache_hot;
7570 
7571 	lockdep_assert_held(&env->src_rq->lock);
7572 
7573 	/*
7574 	 * We do not migrate tasks that are:
7575 	 * 1) throttled_lb_pair, or
7576 	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
7577 	 * 3) running (obviously), or
7578 	 * 4) are cache-hot on their current CPU.
7579 	 */
7580 	if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
7581 		return 0;
7582 
7583 	if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
7584 		int cpu;
7585 
7586 		schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
7587 
7588 		env->flags |= LBF_SOME_PINNED;
7589 
7590 		/*
7591 		 * Remember if this task can be migrated to any other cpu in
7592 		 * our sched_group. We may want to revisit it if we couldn't
7593 		 * meet load balance goals by pulling other tasks on src_cpu.
7594 		 *
7595 		 * Also avoid computing new_dst_cpu if we have already computed
7596 		 * one in current iteration.
7597 		 */
7598 		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
7599 			return 0;
7600 
7601 		/* Prevent to re-select dst_cpu via env's cpus */
7602 		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
7603 			if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
7604 				env->flags |= LBF_DST_PINNED;
7605 				env->new_dst_cpu = cpu;
7606 				break;
7607 			}
7608 		}
7609 
7610 		return 0;
7611 	}
7612 
7613 	/* Record that we found atleast one task that could run on dst_cpu */
7614 	env->flags &= ~LBF_ALL_PINNED;
7615 
7616 	if (task_running(env->src_rq, p)) {
7617 		schedstat_inc(p, se.statistics.nr_failed_migrations_running);
7618 		return 0;
7619 	}
7620 
7621 	/*
7622 	 * Aggressive migration if:
7623 	 * 1) destination numa is preferred
7624 	 * 2) task is cache cold, or
7625 	 * 3) too many balance attempts have failed.
7626 	 */
7627 	tsk_cache_hot = migrate_degrades_locality(p, env);
7628 	if (tsk_cache_hot == -1)
7629 		tsk_cache_hot = task_hot(p, env);
7630 
7631 	if (tsk_cache_hot <= 0 ||
7632 	    env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
7633 		if (tsk_cache_hot == 1) {
7634 			schedstat_inc(env->sd, lb_hot_gained[env->idle]);
7635 			schedstat_inc(p, se.statistics.nr_forced_migrations);
7636 		}
7637 		return 1;
7638 	}
7639 
7640 	schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
7641 	return 0;
7642 }
7643 
7644 /*
7645  * detach_task() -- detach the task for the migration specified in env
7646  */
detach_task(struct task_struct * p,struct lb_env * env)7647 static void detach_task(struct task_struct *p, struct lb_env *env)
7648 {
7649 	lockdep_assert_held(&env->src_rq->lock);
7650 
7651 	deactivate_task(env->src_rq, p, 0);
7652 	p->on_rq = TASK_ON_RQ_MIGRATING;
7653 	double_lock_balance(env->src_rq, env->dst_rq);
7654 	set_task_cpu(p, env->dst_cpu);
7655 	double_unlock_balance(env->src_rq, env->dst_rq);
7656 }
7657 
7658 /*
7659  * detach_one_task() -- tries to dequeue exactly one task from env->src_rq, as
7660  * part of active balancing operations within "domain".
7661  *
7662  * Returns a task if successful and NULL otherwise.
7663  */
detach_one_task(struct lb_env * env)7664 static struct task_struct *detach_one_task(struct lb_env *env)
7665 {
7666 	struct task_struct *p, *n;
7667 
7668 	lockdep_assert_held(&env->src_rq->lock);
7669 
7670 	list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
7671 		if (!can_migrate_task(p, env))
7672 			continue;
7673 
7674 		detach_task(p, env);
7675 
7676 		/*
7677 		 * Right now, this is only the second place where
7678 		 * lb_gained[env->idle] is updated (other is detach_tasks)
7679 		 * so we can safely collect stats here rather than
7680 		 * inside detach_tasks().
7681 		 */
7682 		schedstat_inc(env->sd, lb_gained[env->idle]);
7683 		return p;
7684 	}
7685 	return NULL;
7686 }
7687 
7688 static const unsigned int sched_nr_migrate_break = 32;
7689 
7690 /*
7691  * detach_tasks() -- tries to detach up to imbalance weighted load from
7692  * busiest_rq, as part of a balancing operation within domain "sd".
7693  *
7694  * Returns number of detached tasks if successful and 0 otherwise.
7695  */
detach_tasks(struct lb_env * env)7696 static int detach_tasks(struct lb_env *env)
7697 {
7698 	struct list_head *tasks = &env->src_rq->cfs_tasks;
7699 	struct task_struct *p;
7700 	unsigned long load;
7701 	int detached = 0;
7702 
7703 	lockdep_assert_held(&env->src_rq->lock);
7704 
7705 	if (env->imbalance <= 0)
7706 		return 0;
7707 
7708 	while (!list_empty(tasks)) {
7709 		/*
7710 		 * We don't want to steal all, otherwise we may be treated likewise,
7711 		 * which could at worst lead to a livelock crash.
7712 		 */
7713 		if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
7714 			break;
7715 
7716 		p = list_first_entry(tasks, struct task_struct, se.group_node);
7717 
7718 		env->loop++;
7719 		/* We've more or less seen every task there is, call it quits */
7720 		if (env->loop > env->loop_max)
7721 			break;
7722 
7723 		/* take a breather every nr_migrate tasks */
7724 		if (env->loop > env->loop_break) {
7725 			env->loop_break += sched_nr_migrate_break;
7726 			env->flags |= LBF_NEED_BREAK;
7727 			break;
7728 		}
7729 
7730 		if (!can_migrate_task(p, env))
7731 			goto next;
7732 
7733 		/*
7734 		 * Depending of the number of CPUs and tasks and the
7735 		 * cgroup hierarchy, task_h_load() can return a null
7736 		 * value. Make sure that env->imbalance decreases
7737 		 * otherwise detach_tasks() will stop only after
7738 		 * detaching up to loop_max tasks.
7739 		 */
7740 		load = max_t(unsigned long, task_h_load(p), 1);
7741 
7742 
7743 		if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
7744 			goto next;
7745 
7746 		if ((load / 2) > env->imbalance)
7747 			goto next;
7748 
7749 		detach_task(p, env);
7750 		list_add(&p->se.group_node, &env->tasks);
7751 
7752 		detached++;
7753 		env->imbalance -= load;
7754 
7755 #ifdef CONFIG_PREEMPT
7756 		/*
7757 		 * NEWIDLE balancing is a source of latency, so preemptible
7758 		 * kernels will stop after the first task is detached to minimize
7759 		 * the critical section.
7760 		 */
7761 		if (env->idle == CPU_NEWLY_IDLE)
7762 			break;
7763 #endif
7764 
7765 		/*
7766 		 * We only want to steal up to the prescribed amount of
7767 		 * weighted load.
7768 		 */
7769 		if (env->imbalance <= 0)
7770 			break;
7771 
7772 		continue;
7773 next:
7774 		list_move_tail(&p->se.group_node, tasks);
7775 	}
7776 
7777 	/*
7778 	 * Right now, this is one of only two places we collect this stat
7779 	 * so we can safely collect detach_one_task() stats here rather
7780 	 * than inside detach_one_task().
7781 	 */
7782 	schedstat_add(env->sd, lb_gained[env->idle], detached);
7783 
7784 	return detached;
7785 }
7786 
7787 /*
7788  * attach_task() -- attach the task detached by detach_task() to its new rq.
7789  */
attach_task(struct rq * rq,struct task_struct * p)7790 static void attach_task(struct rq *rq, struct task_struct *p)
7791 {
7792 	lockdep_assert_held(&rq->lock);
7793 
7794 	BUG_ON(task_rq(p) != rq);
7795 	p->on_rq = TASK_ON_RQ_QUEUED;
7796 	activate_task(rq, p, 0);
7797 	check_preempt_curr(rq, p, 0);
7798 }
7799 
7800 /*
7801  * attach_one_task() -- attaches the task returned from detach_one_task() to
7802  * its new rq.
7803  */
attach_one_task(struct rq * rq,struct task_struct * p)7804 static void attach_one_task(struct rq *rq, struct task_struct *p)
7805 {
7806 	raw_spin_lock(&rq->lock);
7807 	attach_task(rq, p);
7808 	raw_spin_unlock(&rq->lock);
7809 }
7810 
7811 /*
7812  * attach_tasks() -- attaches all tasks detached by detach_tasks() to their
7813  * new rq.
7814  */
attach_tasks(struct lb_env * env)7815 static void attach_tasks(struct lb_env *env)
7816 {
7817 	struct list_head *tasks = &env->tasks;
7818 	struct task_struct *p;
7819 
7820 	raw_spin_lock(&env->dst_rq->lock);
7821 
7822 	while (!list_empty(tasks)) {
7823 		p = list_first_entry(tasks, struct task_struct, se.group_node);
7824 		list_del_init(&p->se.group_node);
7825 
7826 		attach_task(env->dst_rq, p);
7827 	}
7828 
7829 	raw_spin_unlock(&env->dst_rq->lock);
7830 }
7831 
7832 #ifdef CONFIG_FAIR_GROUP_SCHED
update_blocked_averages(int cpu)7833 static void update_blocked_averages(int cpu)
7834 {
7835 	struct rq *rq = cpu_rq(cpu);
7836 	struct cfs_rq *cfs_rq;
7837 	unsigned long flags;
7838 
7839 	raw_spin_lock_irqsave(&rq->lock, flags);
7840 	update_rq_clock(rq);
7841 
7842 	/*
7843 	 * Iterates the task_group tree in a bottom up fashion, see
7844 	 * list_add_leaf_cfs_rq() for details.
7845 	 */
7846 	for_each_leaf_cfs_rq(rq, cfs_rq) {
7847 		struct sched_entity *se;
7848 
7849 		/* throttled entities do not contribute to load */
7850 		if (throttled_hierarchy(cfs_rq))
7851 			continue;
7852 
7853 		if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq,
7854 					   true))
7855 			update_tg_load_avg(cfs_rq, 0);
7856 
7857 		/* Propagate pending load changes to the parent, if any: */
7858 		se = cfs_rq->tg->se[cpu];
7859 		if (se && !skip_blocked_update(se))
7860 			update_load_avg(se, 0);
7861 	}
7862 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7863 }
7864 
7865 /*
7866  * Compute the hierarchical load factor for cfs_rq and all its ascendants.
7867  * This needs to be done in a top-down fashion because the load of a child
7868  * group is a fraction of its parents load.
7869  */
update_cfs_rq_h_load(struct cfs_rq * cfs_rq)7870 static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
7871 {
7872 	struct rq *rq = rq_of(cfs_rq);
7873 	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
7874 	unsigned long now = jiffies;
7875 	unsigned long load;
7876 
7877 	if (cfs_rq->last_h_load_update == now)
7878 		return;
7879 
7880 	WRITE_ONCE(cfs_rq->h_load_next, NULL);
7881 	for_each_sched_entity(se) {
7882 		cfs_rq = cfs_rq_of(se);
7883 		WRITE_ONCE(cfs_rq->h_load_next, se);
7884 		if (cfs_rq->last_h_load_update == now)
7885 			break;
7886 	}
7887 
7888 	if (!se) {
7889 		cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
7890 		cfs_rq->last_h_load_update = now;
7891 	}
7892 
7893 	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
7894 		load = cfs_rq->h_load;
7895 		load = div64_ul(load * se->avg.load_avg,
7896 			cfs_rq_load_avg(cfs_rq) + 1);
7897 		cfs_rq = group_cfs_rq(se);
7898 		cfs_rq->h_load = load;
7899 		cfs_rq->last_h_load_update = now;
7900 	}
7901 }
7902 
task_h_load(struct task_struct * p)7903 static unsigned long task_h_load(struct task_struct *p)
7904 {
7905 	struct cfs_rq *cfs_rq = task_cfs_rq(p);
7906 
7907 	update_cfs_rq_h_load(cfs_rq);
7908 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
7909 			cfs_rq_load_avg(cfs_rq) + 1);
7910 }
7911 #else
update_blocked_averages(int cpu)7912 static inline void update_blocked_averages(int cpu)
7913 {
7914 	struct rq *rq = cpu_rq(cpu);
7915 	struct cfs_rq *cfs_rq = &rq->cfs;
7916 	unsigned long flags;
7917 
7918 	raw_spin_lock_irqsave(&rq->lock, flags);
7919 	update_rq_clock(rq);
7920 	update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
7921 	raw_spin_unlock_irqrestore(&rq->lock, flags);
7922 }
7923 
task_h_load(struct task_struct * p)7924 static unsigned long task_h_load(struct task_struct *p)
7925 {
7926 	return p->se.avg.load_avg;
7927 }
7928 #endif
7929 
7930 /********** Helpers for find_busiest_group ************************/
7931 
7932 /*
7933  * sg_lb_stats - stats of a sched_group required for load_balancing
7934  */
7935 struct sg_lb_stats {
7936 	unsigned long avg_load; /*Avg load across the CPUs of the group */
7937 	unsigned long group_load; /* Total load over the CPUs of the group */
7938 	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
7939 	unsigned long load_per_task;
7940 	unsigned long group_capacity;
7941 	unsigned long group_util; /* Total utilization of the group */
7942 	unsigned int sum_nr_running; /* Nr tasks running in the group */
7943 	unsigned int idle_cpus;
7944 	unsigned int group_weight;
7945 	enum group_type group_type;
7946 	int group_no_capacity;
7947 	int group_misfit_task; /* A cpu has a task too big for its capacity */
7948 #ifdef CONFIG_NUMA_BALANCING
7949 	unsigned int nr_numa_running;
7950 	unsigned int nr_preferred_running;
7951 #endif
7952 };
7953 
7954 /*
7955  * sd_lb_stats - Structure to store the statistics of a sched_domain
7956  *		 during load balancing.
7957  */
7958 struct sd_lb_stats {
7959 	struct sched_group *busiest;	/* Busiest group in this sd */
7960 	struct sched_group *local;	/* Local group in this sd */
7961 	unsigned long total_load;	/* Total load of all groups in sd */
7962 	unsigned long total_capacity;	/* Total capacity of all groups in sd */
7963 	unsigned long avg_load;	/* Average load across all groups in sd */
7964 
7965 	struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
7966 	struct sg_lb_stats local_stat;	/* Statistics of the local group */
7967 };
7968 
init_sd_lb_stats(struct sd_lb_stats * sds)7969 static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
7970 {
7971 	/*
7972 	 * Skimp on the clearing to avoid duplicate work. We can avoid clearing
7973 	 * local_stat because update_sg_lb_stats() does a full clear/assignment.
7974 	 * We must however clear busiest_stat::avg_load because
7975 	 * update_sd_pick_busiest() reads this before assignment.
7976 	 */
7977 	*sds = (struct sd_lb_stats){
7978 		.busiest = NULL,
7979 		.local = NULL,
7980 		.total_load = 0UL,
7981 		.total_capacity = 0UL,
7982 		.busiest_stat = {
7983 			.avg_load = 0UL,
7984 			.sum_nr_running = 0,
7985 			.group_type = group_other,
7986 		},
7987 	};
7988 }
7989 
7990 /**
7991  * get_sd_load_idx - Obtain the load index for a given sched domain.
7992  * @sd: The sched_domain whose load_idx is to be obtained.
7993  * @idle: The idle status of the CPU for whose sd load_idx is obtained.
7994  *
7995  * Return: The load index.
7996  */
get_sd_load_idx(struct sched_domain * sd,enum cpu_idle_type idle)7997 static inline int get_sd_load_idx(struct sched_domain *sd,
7998 					enum cpu_idle_type idle)
7999 {
8000 	int load_idx;
8001 
8002 	switch (idle) {
8003 	case CPU_NOT_IDLE:
8004 		load_idx = sd->busy_idx;
8005 		break;
8006 
8007 	case CPU_NEWLY_IDLE:
8008 		load_idx = sd->newidle_idx;
8009 		break;
8010 	default:
8011 		load_idx = sd->idle_idx;
8012 		break;
8013 	}
8014 
8015 	return load_idx;
8016 }
8017 
scale_rt_capacity(int cpu)8018 static unsigned long scale_rt_capacity(int cpu)
8019 {
8020 	struct rq *rq = cpu_rq(cpu);
8021 	u64 total, used, age_stamp, avg;
8022 	s64 delta;
8023 
8024 	/*
8025 	 * Since we're reading these variables without serialization make sure
8026 	 * we read them once before doing sanity checks on them.
8027 	 */
8028 	age_stamp = READ_ONCE(rq->age_stamp);
8029 	avg = READ_ONCE(rq->rt_avg);
8030 	delta = __rq_clock_broken(rq) - age_stamp;
8031 
8032 	if (unlikely(delta < 0))
8033 		delta = 0;
8034 
8035 	total = sched_avg_period() + delta;
8036 
8037 	used = div_u64(avg, total);
8038 
8039 	/*
8040 	 * deadline bandwidth is defined at system level so we must
8041 	 * weight this bandwidth with the max capacity of the system.
8042 	 * As a reminder, avg_bw is 20bits width and
8043 	 * scale_cpu_capacity is 10 bits width
8044 	 */
8045 	used += div_u64(rq->dl.avg_bw, arch_scale_cpu_capacity(NULL, cpu));
8046 
8047 	if (likely(used < SCHED_CAPACITY_SCALE))
8048 		return SCHED_CAPACITY_SCALE - used;
8049 
8050 	return 1;
8051 }
8052 
init_max_cpu_capacity(struct max_cpu_capacity * mcc)8053 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
8054 {
8055 	raw_spin_lock_init(&mcc->lock);
8056 	mcc->val = 0;
8057 	mcc->cpu = -1;
8058 }
8059 
update_cpu_capacity(struct sched_domain * sd,int cpu)8060 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
8061 {
8062 	unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
8063 	struct sched_group *sdg = sd->groups;
8064 	struct max_cpu_capacity *mcc;
8065 	unsigned long max_capacity;
8066 	int max_cap_cpu;
8067 	unsigned long flags;
8068 
8069 	cpu_rq(cpu)->cpu_capacity_orig = capacity;
8070 
8071 	mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
8072 
8073 	raw_spin_lock_irqsave(&mcc->lock, flags);
8074 	max_capacity = mcc->val;
8075 	max_cap_cpu = mcc->cpu;
8076 
8077 	if ((max_capacity > capacity && max_cap_cpu == cpu) ||
8078 	    (max_capacity < capacity)) {
8079 		mcc->val = capacity;
8080 		mcc->cpu = cpu;
8081 #ifdef CONFIG_SCHED_DEBUG
8082 		raw_spin_unlock_irqrestore(&mcc->lock, flags);
8083 		printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
8084 				cpu, capacity);
8085 		goto skip_unlock;
8086 #endif
8087 	}
8088 	raw_spin_unlock_irqrestore(&mcc->lock, flags);
8089 
8090 skip_unlock: __attribute__ ((unused));
8091 	capacity *= scale_rt_capacity(cpu);
8092 	capacity >>= SCHED_CAPACITY_SHIFT;
8093 
8094 	if (!capacity)
8095 		capacity = 1;
8096 
8097 	cpu_rq(cpu)->cpu_capacity = capacity;
8098 	sdg->sgc->capacity = capacity;
8099 	sdg->sgc->max_capacity = capacity;
8100 	sdg->sgc->min_capacity = capacity;
8101 }
8102 
update_group_capacity(struct sched_domain * sd,int cpu)8103 void update_group_capacity(struct sched_domain *sd, int cpu)
8104 {
8105 	struct sched_domain *child = sd->child;
8106 	struct sched_group *group, *sdg = sd->groups;
8107 	unsigned long capacity, max_capacity, min_capacity;
8108 	unsigned long interval;
8109 
8110 	interval = msecs_to_jiffies(sd->balance_interval);
8111 	interval = clamp(interval, 1UL, max_load_balance_interval);
8112 	sdg->sgc->next_update = jiffies + interval;
8113 
8114 	if (!child) {
8115 		update_cpu_capacity(sd, cpu);
8116 		return;
8117 	}
8118 
8119 	capacity = 0;
8120 	max_capacity = 0;
8121 	min_capacity = ULONG_MAX;
8122 
8123 	if (child->flags & SD_OVERLAP) {
8124 		/*
8125 		 * SD_OVERLAP domains cannot assume that child groups
8126 		 * span the current group.
8127 		 */
8128 
8129 		for_each_cpu(cpu, sched_group_cpus(sdg)) {
8130 			struct sched_group_capacity *sgc;
8131 			struct rq *rq = cpu_rq(cpu);
8132 
8133 			/*
8134 			 * build_sched_domains() -> init_sched_groups_capacity()
8135 			 * gets here before we've attached the domains to the
8136 			 * runqueues.
8137 			 *
8138 			 * Use capacity_of(), which is set irrespective of domains
8139 			 * in update_cpu_capacity().
8140 			 *
8141 			 * This avoids capacity from being 0 and
8142 			 * causing divide-by-zero issues on boot.
8143 			 */
8144 			if (unlikely(!rq->sd)) {
8145 				capacity += capacity_of(cpu);
8146 			} else {
8147 				sgc = rq->sd->groups->sgc;
8148 				capacity += sgc->capacity;
8149 			}
8150 
8151 			max_capacity = max(capacity, max_capacity);
8152 			min_capacity = min(capacity, min_capacity);
8153 		}
8154 	} else  {
8155 		/*
8156 		 * !SD_OVERLAP domains can assume that child groups
8157 		 * span the current group.
8158 		 */
8159 
8160 		group = child->groups;
8161 		do {
8162 			struct sched_group_capacity *sgc = group->sgc;
8163 
8164 			capacity += sgc->capacity;
8165 			max_capacity = max(sgc->max_capacity, max_capacity);
8166 			min_capacity = min(sgc->min_capacity, min_capacity);
8167 			group = group->next;
8168 		} while (group != child->groups);
8169 	}
8170 
8171 	sdg->sgc->capacity = capacity;
8172 	sdg->sgc->max_capacity = max_capacity;
8173 	sdg->sgc->min_capacity = min_capacity;
8174 }
8175 
8176 /*
8177  * Check whether the capacity of the rq has been noticeably reduced by side
8178  * activity. The imbalance_pct is used for the threshold.
8179  * Return true is the capacity is reduced
8180  */
8181 static inline int
check_cpu_capacity(struct rq * rq,struct sched_domain * sd)8182 check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
8183 {
8184 	return ((rq->cpu_capacity * sd->imbalance_pct) <
8185 				(rq->cpu_capacity_orig * 100));
8186 }
8187 
8188 /*
8189  * Group imbalance indicates (and tries to solve) the problem where balancing
8190  * groups is inadequate due to tsk_cpus_allowed() constraints.
8191  *
8192  * Imagine a situation of two groups of 4 cpus each and 4 tasks each with a
8193  * cpumask covering 1 cpu of the first group and 3 cpus of the second group.
8194  * Something like:
8195  *
8196  * 	{ 0 1 2 3 } { 4 5 6 7 }
8197  * 	        *     * * *
8198  *
8199  * If we were to balance group-wise we'd place two tasks in the first group and
8200  * two tasks in the second group. Clearly this is undesired as it will overload
8201  * cpu 3 and leave one of the cpus in the second group unused.
8202  *
8203  * The current solution to this issue is detecting the skew in the first group
8204  * by noticing the lower domain failed to reach balance and had difficulty
8205  * moving tasks due to affinity constraints.
8206  *
8207  * When this is so detected; this group becomes a candidate for busiest; see
8208  * update_sd_pick_busiest(). And calculate_imbalance() and
8209  * find_busiest_group() avoid some of the usual balance conditions to allow it
8210  * to create an effective group imbalance.
8211  *
8212  * This is a somewhat tricky proposition since the next run might not find the
8213  * group imbalance and decide the groups need to be balanced again. A most
8214  * subtle and fragile situation.
8215  */
8216 
sg_imbalanced(struct sched_group * group)8217 static inline int sg_imbalanced(struct sched_group *group)
8218 {
8219 	return group->sgc->imbalance;
8220 }
8221 
8222 /*
8223  * group_has_capacity returns true if the group has spare capacity that could
8224  * be used by some tasks.
8225  * We consider that a group has spare capacity if the  * number of task is
8226  * smaller than the number of CPUs or if the utilization is lower than the
8227  * available capacity for CFS tasks.
8228  * For the latter, we use a threshold to stabilize the state, to take into
8229  * account the variance of the tasks' load and to return true if the available
8230  * capacity in meaningful for the load balancer.
8231  * As an example, an available capacity of 1% can appear but it doesn't make
8232  * any benefit for the load balance.
8233  */
8234 static inline bool
group_has_capacity(struct lb_env * env,struct sg_lb_stats * sgs)8235 group_has_capacity(struct lb_env *env, struct sg_lb_stats *sgs)
8236 {
8237 	if (sgs->sum_nr_running < sgs->group_weight)
8238 		return true;
8239 
8240 	if ((sgs->group_capacity * 100) >
8241 			(sgs->group_util * env->sd->imbalance_pct))
8242 		return true;
8243 
8244 	return false;
8245 }
8246 
8247 /*
8248  *  group_is_overloaded returns true if the group has more tasks than it can
8249  *  handle.
8250  *  group_is_overloaded is not equals to !group_has_capacity because a group
8251  *  with the exact right number of tasks, has no more spare capacity but is not
8252  *  overloaded so both group_has_capacity and group_is_overloaded return
8253  *  false.
8254  */
8255 static inline bool
group_is_overloaded(struct lb_env * env,struct sg_lb_stats * sgs)8256 group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
8257 {
8258 	if (sgs->sum_nr_running <= sgs->group_weight)
8259 		return false;
8260 
8261 	if ((sgs->group_capacity * 100) <
8262 			(sgs->group_util * env->sd->imbalance_pct))
8263 		return true;
8264 
8265 	return false;
8266 }
8267 
8268 
8269 /*
8270  * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
8271  * per-cpu capacity than sched_group ref.
8272  */
8273 static inline bool
group_smaller_cpu_capacity(struct sched_group * sg,struct sched_group * ref)8274 group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
8275 {
8276 	return sg->sgc->max_capacity + capacity_margin - SCHED_LOAD_SCALE <
8277 							ref->sgc->max_capacity;
8278 }
8279 
8280 static inline enum
group_classify(struct sched_group * group,struct sg_lb_stats * sgs)8281 group_type group_classify(struct sched_group *group,
8282 			  struct sg_lb_stats *sgs)
8283 {
8284 	if (sgs->group_no_capacity)
8285 		return group_overloaded;
8286 
8287 	if (sg_imbalanced(group))
8288 		return group_imbalanced;
8289 
8290 	if (sgs->group_misfit_task)
8291 		return group_misfit_task;
8292 
8293 	return group_other;
8294 }
8295 
8296 #ifdef CONFIG_NO_HZ_COMMON
8297 /*
8298  * idle load balancing data
8299  *  - used by the nohz balance, but we want it available here
8300  *    so that we can see which CPUs have no tick.
8301  */
8302 static struct {
8303 	cpumask_var_t idle_cpus_mask;
8304 	atomic_t nr_cpus;
8305 	unsigned long next_balance;     /* in jiffy units */
8306 } nohz ____cacheline_aligned;
8307 
update_cpu_stats_if_tickless(struct rq * rq)8308 static inline void update_cpu_stats_if_tickless(struct rq *rq)
8309 {
8310 	/* only called from update_sg_lb_stats when irqs are disabled */
8311 	if (cpumask_test_cpu(rq->cpu, nohz.idle_cpus_mask)) {
8312 		/* rate limit updates to once-per-jiffie at most */
8313 		if (READ_ONCE(jiffies) <= rq->last_load_update_tick)
8314 			return;
8315 
8316 		raw_spin_lock(&rq->lock);
8317 		update_rq_clock(rq);
8318 		update_idle_cpu_load(rq);
8319 		update_cfs_rq_load_avg(rq->clock_task, &rq->cfs, false);
8320 		raw_spin_unlock(&rq->lock);
8321 	}
8322 }
8323 
8324 #else
update_cpu_stats_if_tickless(struct rq * rq)8325 static inline void update_cpu_stats_if_tickless(struct rq *rq) { }
8326 #endif
8327 
8328 /**
8329  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
8330  * @env: The load balancing environment.
8331  * @group: sched_group whose statistics are to be updated.
8332  * @load_idx: Load index of sched_domain of this_cpu for load calc.
8333  * @local_group: Does group contain this_cpu.
8334  * @sgs: variable to hold the statistics for this group.
8335  * @overload: Indicate more than one runnable task for any CPU.
8336  * @overutilized: Indicate overutilization for any CPU.
8337  */
update_sg_lb_stats(struct lb_env * env,struct sched_group * group,int load_idx,int local_group,struct sg_lb_stats * sgs,bool * overload,bool * overutilized)8338 static inline void update_sg_lb_stats(struct lb_env *env,
8339 			struct sched_group *group, int load_idx,
8340 			int local_group, struct sg_lb_stats *sgs,
8341 			bool *overload, bool *overutilized)
8342 {
8343 	unsigned long load;
8344 	int i, nr_running;
8345 
8346 	memset(sgs, 0, sizeof(*sgs));
8347 
8348 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
8349 		struct rq *rq = cpu_rq(i);
8350 
8351 		/* if we are entering idle and there are CPUs with
8352 		 * their tick stopped, do an update for them
8353 		 */
8354 		if (env->idle == CPU_NEWLY_IDLE)
8355 			update_cpu_stats_if_tickless(rq);
8356 
8357 		/* Bias balancing toward cpus of our domain */
8358 		if (local_group)
8359 			load = target_load(i, load_idx);
8360 		else
8361 			load = source_load(i, load_idx);
8362 
8363 		sgs->group_load += load;
8364 		sgs->group_util += cpu_util(i);
8365 		sgs->sum_nr_running += rq->cfs.h_nr_running;
8366 
8367 		nr_running = rq->nr_running;
8368 		if (nr_running > 1)
8369 			*overload = true;
8370 
8371 #ifdef CONFIG_NUMA_BALANCING
8372 		sgs->nr_numa_running += rq->nr_numa_running;
8373 		sgs->nr_preferred_running += rq->nr_preferred_running;
8374 #endif
8375 		sgs->sum_weighted_load += weighted_cpuload(i);
8376 		/*
8377 		 * No need to call idle_cpu() if nr_running is not 0
8378 		 */
8379 		if (!nr_running && idle_cpu(i))
8380 			sgs->idle_cpus++;
8381 
8382 		if (cpu_overutilized(i)) {
8383 			*overutilized = true;
8384 			if (!sgs->group_misfit_task && rq->misfit_task)
8385 				sgs->group_misfit_task = capacity_of(i);
8386 		}
8387 	}
8388 
8389 	/* Adjust by relative CPU capacity of the group */
8390 	sgs->group_capacity = group->sgc->capacity;
8391 	sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
8392 
8393 	if (sgs->sum_nr_running)
8394 		sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
8395 
8396 	sgs->group_weight = group->group_weight;
8397 
8398 	sgs->group_no_capacity = group_is_overloaded(env, sgs);
8399 	sgs->group_type = group_classify(group, sgs);
8400 }
8401 
8402 /**
8403  * update_sd_pick_busiest - return 1 on busiest group
8404  * @env: The load balancing environment.
8405  * @sds: sched_domain statistics
8406  * @sg: sched_group candidate to be checked for being the busiest
8407  * @sgs: sched_group statistics
8408  *
8409  * Determine if @sg is a busier group than the previously selected
8410  * busiest group.
8411  *
8412  * Return: %true if @sg is a busier group than the previously selected
8413  * busiest group. %false otherwise.
8414  */
update_sd_pick_busiest(struct lb_env * env,struct sd_lb_stats * sds,struct sched_group * sg,struct sg_lb_stats * sgs)8415 static bool update_sd_pick_busiest(struct lb_env *env,
8416 				   struct sd_lb_stats *sds,
8417 				   struct sched_group *sg,
8418 				   struct sg_lb_stats *sgs)
8419 {
8420 	struct sg_lb_stats *busiest = &sds->busiest_stat;
8421 
8422 	if (sgs->group_type > busiest->group_type)
8423 		return true;
8424 
8425 	if (sgs->group_type < busiest->group_type)
8426 		return false;
8427 
8428 	/*
8429 	 * Candidate sg doesn't face any serious load-balance problems
8430 	 * so don't pick it if the local sg is already filled up.
8431 	 */
8432 	if (sgs->group_type == group_other &&
8433 	    !group_has_capacity(env, &sds->local_stat))
8434 		return false;
8435 
8436 	if (sgs->avg_load <= busiest->avg_load)
8437 		return false;
8438 
8439 	if (!(env->sd->flags & SD_ASYM_CPUCAPACITY))
8440 		goto asym_packing;
8441 
8442 	/*
8443 	 * Candidate sg has no more than one task per CPU and
8444 	 * has higher per-CPU capacity. Migrating tasks to less
8445 	 * capable CPUs may harm throughput. Maximize throughput,
8446 	 * power/energy consequences are not considered.
8447 	 */
8448 	if (sgs->sum_nr_running <= sgs->group_weight &&
8449 	    group_smaller_cpu_capacity(sds->local, sg))
8450 		return false;
8451 
8452 asym_packing:
8453 	/* This is the busiest node in its class. */
8454 	if (!(env->sd->flags & SD_ASYM_PACKING))
8455 		return true;
8456 
8457 	/*
8458 	 * ASYM_PACKING needs to move all the work to the lowest
8459 	 * numbered CPUs in the group, therefore mark all groups
8460 	 * higher than ourself as busy.
8461 	 */
8462 	if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
8463 		if (!sds->busiest)
8464 			return true;
8465 
8466 		if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
8467 			return true;
8468 	}
8469 
8470 	return false;
8471 }
8472 
8473 #ifdef CONFIG_NUMA_BALANCING
fbq_classify_group(struct sg_lb_stats * sgs)8474 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8475 {
8476 	if (sgs->sum_nr_running > sgs->nr_numa_running)
8477 		return regular;
8478 	if (sgs->sum_nr_running > sgs->nr_preferred_running)
8479 		return remote;
8480 	return all;
8481 }
8482 
fbq_classify_rq(struct rq * rq)8483 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8484 {
8485 	if (rq->nr_running > rq->nr_numa_running)
8486 		return regular;
8487 	if (rq->nr_running > rq->nr_preferred_running)
8488 		return remote;
8489 	return all;
8490 }
8491 #else
fbq_classify_group(struct sg_lb_stats * sgs)8492 static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
8493 {
8494 	return all;
8495 }
8496 
fbq_classify_rq(struct rq * rq)8497 static inline enum fbq_type fbq_classify_rq(struct rq *rq)
8498 {
8499 	return regular;
8500 }
8501 #endif /* CONFIG_NUMA_BALANCING */
8502 
8503 #define lb_sd_parent(sd) \
8504 	(sd->parent && sd->parent->groups != sd->parent->groups->next)
8505 
8506 /**
8507  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
8508  * @env: The load balancing environment.
8509  * @sds: variable to hold the statistics for this sched_domain.
8510  */
update_sd_lb_stats(struct lb_env * env,struct sd_lb_stats * sds)8511 static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
8512 {
8513 	struct sched_domain *child = env->sd->child;
8514 	struct sched_group *sg = env->sd->groups;
8515 	struct sg_lb_stats tmp_sgs;
8516 	int load_idx, prefer_sibling = 0;
8517 	bool overload = false, overutilized = false;
8518 
8519 	if (child && child->flags & SD_PREFER_SIBLING)
8520 		prefer_sibling = 1;
8521 
8522 	load_idx = get_sd_load_idx(env->sd, env->idle);
8523 
8524 	do {
8525 		struct sg_lb_stats *sgs = &tmp_sgs;
8526 		int local_group;
8527 
8528 		local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
8529 		if (local_group) {
8530 			sds->local = sg;
8531 			sgs = &sds->local_stat;
8532 
8533 			if (env->idle != CPU_NEWLY_IDLE ||
8534 			    time_after_eq(jiffies, sg->sgc->next_update))
8535 				update_group_capacity(env->sd, env->dst_cpu);
8536 		}
8537 
8538 		update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
8539 						&overload, &overutilized);
8540 
8541 		if (local_group)
8542 			goto next_group;
8543 
8544 		/*
8545 		 * In case the child domain prefers tasks go to siblings
8546 		 * first, lower the sg capacity so that we'll try
8547 		 * and move all the excess tasks away. We lower the capacity
8548 		 * of a group only if the local group has the capacity to fit
8549 		 * these excess tasks. The extra check prevents the case where
8550 		 * you always pull from the heaviest group when it is already
8551 		 * under-utilized (possible with a large weight task outweighs
8552 		 * the tasks on the system).
8553 		 */
8554 		if (prefer_sibling && sds->local &&
8555 		    group_has_capacity(env, &sds->local_stat) &&
8556 		    (sgs->sum_nr_running > 1)) {
8557 			sgs->group_no_capacity = 1;
8558 			sgs->group_type = group_classify(sg, sgs);
8559 		}
8560 
8561 		/*
8562 		 * Ignore task groups with misfit tasks if local group has no
8563 		 * capacity or if per-cpu capacity isn't higher.
8564 		 */
8565 		if (sgs->group_type == group_misfit_task &&
8566 		    (!group_has_capacity(env, &sds->local_stat) ||
8567 		     !group_smaller_cpu_capacity(sg, sds->local)))
8568 			sgs->group_type = group_other;
8569 
8570 		if (update_sd_pick_busiest(env, sds, sg, sgs)) {
8571 			sds->busiest = sg;
8572 			sds->busiest_stat = *sgs;
8573 		}
8574 
8575 next_group:
8576 		/* Now, start updating sd_lb_stats */
8577 		sds->total_load += sgs->group_load;
8578 		sds->total_capacity += sgs->group_capacity;
8579 
8580 		sg = sg->next;
8581 	} while (sg != env->sd->groups);
8582 
8583 	if (env->sd->flags & SD_NUMA)
8584 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
8585 
8586 	env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
8587 
8588 	if (!lb_sd_parent(env->sd)) {
8589 		/* update overload indicator if we are at root domain */
8590 		if (env->dst_rq->rd->overload != overload)
8591 			env->dst_rq->rd->overload = overload;
8592 
8593 		/* Update over-utilization (tipping point, U >= 0) indicator */
8594 		if (env->dst_rq->rd->overutilized != overutilized) {
8595 			env->dst_rq->rd->overutilized = overutilized;
8596 			trace_sched_overutilized(overutilized);
8597 		}
8598 	} else {
8599 		if (!env->dst_rq->rd->overutilized && overutilized) {
8600 			env->dst_rq->rd->overutilized = true;
8601 			trace_sched_overutilized(true);
8602 		}
8603 	}
8604 
8605 }
8606 
8607 /**
8608  * check_asym_packing - Check to see if the group is packed into the
8609  *			sched doman.
8610  *
8611  * This is primarily intended to used at the sibling level.  Some
8612  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
8613  * case of POWER7, it can move to lower SMT modes only when higher
8614  * threads are idle.  When in lower SMT modes, the threads will
8615  * perform better since they share less core resources.  Hence when we
8616  * have idle threads, we want them to be the higher ones.
8617  *
8618  * This packing function is run on idle threads.  It checks to see if
8619  * the busiest CPU in this domain (core in the P7 case) has a higher
8620  * CPU number than the packing function is being run on.  Here we are
8621  * assuming lower CPU number will be equivalent to lower a SMT thread
8622  * number.
8623  *
8624  * Return: 1 when packing is required and a task should be moved to
8625  * this CPU.  The amount of the imbalance is returned in *imbalance.
8626  *
8627  * @env: The load balancing environment.
8628  * @sds: Statistics of the sched_domain which is to be packed
8629  */
check_asym_packing(struct lb_env * env,struct sd_lb_stats * sds)8630 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
8631 {
8632 	int busiest_cpu;
8633 
8634 	if (!(env->sd->flags & SD_ASYM_PACKING))
8635 		return 0;
8636 
8637 	if (!sds->busiest)
8638 		return 0;
8639 
8640 	busiest_cpu = group_first_cpu(sds->busiest);
8641 	if (env->dst_cpu > busiest_cpu)
8642 		return 0;
8643 
8644 	env->imbalance = DIV_ROUND_CLOSEST(
8645 		sds->busiest_stat.avg_load * sds->busiest_stat.group_capacity,
8646 		SCHED_CAPACITY_SCALE);
8647 
8648 	return 1;
8649 }
8650 
8651 /**
8652  * fix_small_imbalance - Calculate the minor imbalance that exists
8653  *			amongst the groups of a sched_domain, during
8654  *			load balancing.
8655  * @env: The load balancing environment.
8656  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
8657  */
8658 static inline
fix_small_imbalance(struct lb_env * env,struct sd_lb_stats * sds)8659 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8660 {
8661 	unsigned long tmp, capa_now = 0, capa_move = 0;
8662 	unsigned int imbn = 2;
8663 	unsigned long scaled_busy_load_per_task;
8664 	struct sg_lb_stats *local, *busiest;
8665 
8666 	local = &sds->local_stat;
8667 	busiest = &sds->busiest_stat;
8668 
8669 	if (!local->sum_nr_running)
8670 		local->load_per_task = cpu_avg_load_per_task(env->dst_cpu);
8671 	else if (busiest->load_per_task > local->load_per_task)
8672 		imbn = 1;
8673 
8674 	scaled_busy_load_per_task =
8675 		(busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8676 		busiest->group_capacity;
8677 
8678 	if (busiest->avg_load + scaled_busy_load_per_task >=
8679 	    local->avg_load + (scaled_busy_load_per_task * imbn)) {
8680 		env->imbalance = busiest->load_per_task;
8681 		return;
8682 	}
8683 
8684 	/*
8685 	 * OK, we don't have enough imbalance to justify moving tasks,
8686 	 * however we may be able to increase total CPU capacity used by
8687 	 * moving them.
8688 	 */
8689 
8690 	capa_now += busiest->group_capacity *
8691 			min(busiest->load_per_task, busiest->avg_load);
8692 	capa_now += local->group_capacity *
8693 			min(local->load_per_task, local->avg_load);
8694 	capa_now /= SCHED_CAPACITY_SCALE;
8695 
8696 	/* Amount of load we'd subtract */
8697 	if (busiest->avg_load > scaled_busy_load_per_task) {
8698 		capa_move += busiest->group_capacity *
8699 			    min(busiest->load_per_task,
8700 				busiest->avg_load - scaled_busy_load_per_task);
8701 	}
8702 
8703 	/* Amount of load we'd add */
8704 	if (busiest->avg_load * busiest->group_capacity <
8705 	    busiest->load_per_task * SCHED_CAPACITY_SCALE) {
8706 		tmp = (busiest->avg_load * busiest->group_capacity) /
8707 		      local->group_capacity;
8708 	} else {
8709 		tmp = (busiest->load_per_task * SCHED_CAPACITY_SCALE) /
8710 		      local->group_capacity;
8711 	}
8712 	capa_move += local->group_capacity *
8713 		    min(local->load_per_task, local->avg_load + tmp);
8714 	capa_move /= SCHED_CAPACITY_SCALE;
8715 
8716 	/* Move if we gain throughput */
8717 	if (capa_move > capa_now)
8718 		env->imbalance = busiest->load_per_task;
8719 }
8720 
8721 /**
8722  * calculate_imbalance - Calculate the amount of imbalance present within the
8723  *			 groups of a given sched_domain during load balance.
8724  * @env: load balance environment
8725  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
8726  */
calculate_imbalance(struct lb_env * env,struct sd_lb_stats * sds)8727 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
8728 {
8729 	unsigned long max_pull, load_above_capacity = ~0UL;
8730 	struct sg_lb_stats *local, *busiest;
8731 
8732 	local = &sds->local_stat;
8733 	busiest = &sds->busiest_stat;
8734 
8735 	if (busiest->group_type == group_imbalanced) {
8736 		/*
8737 		 * In the group_imb case we cannot rely on group-wide averages
8738 		 * to ensure cpu-load equilibrium, look at wider averages. XXX
8739 		 */
8740 		busiest->load_per_task =
8741 			min(busiest->load_per_task, sds->avg_load);
8742 	}
8743 
8744 	/*
8745 	 * In the presence of smp nice balancing, certain scenarios can have
8746 	 * max load less than avg load(as we skip the groups at or below
8747 	 * its cpu_capacity, while calculating max_load..)
8748 	 */
8749 	if (busiest->avg_load <= sds->avg_load ||
8750 	    local->avg_load >= sds->avg_load) {
8751 		/* Misfitting tasks should be migrated in any case */
8752 		if (busiest->group_type == group_misfit_task) {
8753 			env->imbalance = busiest->group_misfit_task;
8754 			return;
8755 		}
8756 
8757 		/*
8758 		 * Busiest group is overloaded, local is not, use the spare
8759 		 * cycles to maximize throughput
8760 		 */
8761 		if (busiest->group_type == group_overloaded &&
8762 		    local->group_type <= group_misfit_task) {
8763 			env->imbalance = busiest->load_per_task;
8764 			return;
8765 		}
8766 
8767 		env->imbalance = 0;
8768 		return fix_small_imbalance(env, sds);
8769 	}
8770 
8771 	/*
8772 	 * If there aren't any idle cpus, avoid creating some.
8773 	 */
8774 	if (busiest->group_type == group_overloaded &&
8775 	    local->group_type   == group_overloaded) {
8776 		load_above_capacity = busiest->sum_nr_running *
8777 					SCHED_LOAD_SCALE;
8778 		if (load_above_capacity > busiest->group_capacity)
8779 			load_above_capacity -= busiest->group_capacity;
8780 		else
8781 			load_above_capacity = ~0UL;
8782 	}
8783 
8784 	/*
8785 	 * We're trying to get all the cpus to the average_load, so we don't
8786 	 * want to push ourselves above the average load, nor do we wish to
8787 	 * reduce the max loaded cpu below the average load. At the same time,
8788 	 * we also don't want to reduce the group load below the group capacity
8789 	 * (so that we can implement power-savings policies etc). Thus we look
8790 	 * for the minimum possible imbalance.
8791 	 */
8792 	max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
8793 
8794 	/* How much load to actually move to equalise the imbalance */
8795 	env->imbalance = min(
8796 		max_pull * busiest->group_capacity,
8797 		(sds->avg_load - local->avg_load) * local->group_capacity
8798 	) / SCHED_CAPACITY_SCALE;
8799 
8800 	/* Boost imbalance to allow misfit task to be balanced. */
8801 	if (busiest->group_type == group_misfit_task)
8802 		env->imbalance = max_t(long, env->imbalance,
8803 				     busiest->group_misfit_task);
8804 
8805 	/*
8806 	 * if *imbalance is less than the average load per runnable task
8807 	 * there is no guarantee that any tasks will be moved so we'll have
8808 	 * a think about bumping its value to force at least one task to be
8809 	 * moved
8810 	 */
8811 	if (env->imbalance < busiest->load_per_task)
8812 		return fix_small_imbalance(env, sds);
8813 }
8814 
8815 /******* find_busiest_group() helpers end here *********************/
8816 
8817 /**
8818  * find_busiest_group - Returns the busiest group within the sched_domain
8819  * if there is an imbalance. If there isn't an imbalance, and
8820  * the user has opted for power-savings, it returns a group whose
8821  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
8822  * such a group exists.
8823  *
8824  * Also calculates the amount of weighted load which should be moved
8825  * to restore balance.
8826  *
8827  * @env: The load balancing environment.
8828  *
8829  * Return:	- The busiest group if imbalance exists.
8830  *		- If no imbalance and user has opted for power-savings balance,
8831  *		   return the least loaded group whose CPUs can be
8832  *		   put to idle by rebalancing its tasks onto our group.
8833  */
find_busiest_group(struct lb_env * env)8834 static struct sched_group *find_busiest_group(struct lb_env *env)
8835 {
8836 	struct sg_lb_stats *local, *busiest;
8837 	struct sd_lb_stats sds;
8838 
8839 	init_sd_lb_stats(&sds);
8840 
8841 	/*
8842 	 * Compute the various statistics relavent for load balancing at
8843 	 * this level.
8844 	 */
8845 	update_sd_lb_stats(env, &sds);
8846 
8847 	if (energy_aware() && !env->dst_rq->rd->overutilized)
8848 		goto out_balanced;
8849 
8850 	local = &sds.local_stat;
8851 	busiest = &sds.busiest_stat;
8852 
8853 	/* ASYM feature bypasses nice load balance check */
8854 	if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
8855 	    check_asym_packing(env, &sds))
8856 		return sds.busiest;
8857 
8858 	/* There is no busy sibling group to pull tasks from */
8859 	if (!sds.busiest || busiest->sum_nr_running == 0)
8860 		goto out_balanced;
8861 
8862 	sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
8863 						/ sds.total_capacity;
8864 
8865 	/*
8866 	 * If the busiest group is imbalanced the below checks don't
8867 	 * work because they assume all things are equal, which typically
8868 	 * isn't true due to cpus_allowed constraints and the like.
8869 	 */
8870 	if (busiest->group_type == group_imbalanced)
8871 		goto force_balance;
8872 
8873 	/*
8874 	 * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
8875 	 * capacities from resulting in underutilization due to avg_load.
8876 	 */
8877 	if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
8878 	    busiest->group_no_capacity)
8879 		goto force_balance;
8880 
8881 	/* Misfitting tasks should be dealt with regardless of the avg load */
8882 	if (busiest->group_type == group_misfit_task) {
8883 		goto force_balance;
8884 	}
8885 
8886 	/*
8887 	 * If the local group is busier than the selected busiest group
8888 	 * don't try and pull any tasks.
8889 	 */
8890 	if (local->avg_load >= busiest->avg_load)
8891 		goto out_balanced;
8892 
8893 	/*
8894 	 * Don't pull any tasks if this group is already above the domain
8895 	 * average load.
8896 	 */
8897 	if (local->avg_load >= sds.avg_load)
8898 		goto out_balanced;
8899 
8900 	if (env->idle == CPU_IDLE) {
8901 		/*
8902 		 * This cpu is idle. If the busiest group is not overloaded
8903 		 * and there is no imbalance between this and busiest group
8904 		 * wrt idle cpus, it is balanced. The imbalance becomes
8905 		 * significant if the diff is greater than 1 otherwise we
8906 		 * might end up to just move the imbalance on another group
8907 		 */
8908 		if ((busiest->group_type != group_overloaded) &&
8909 		    (local->idle_cpus <= (busiest->idle_cpus + 1)) &&
8910 		    !group_smaller_cpu_capacity(sds.busiest, sds.local))
8911 			goto out_balanced;
8912 	} else {
8913 		/*
8914 		 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
8915 		 * imbalance_pct to be conservative.
8916 		 */
8917 		if (100 * busiest->avg_load <=
8918 				env->sd->imbalance_pct * local->avg_load)
8919 			goto out_balanced;
8920 	}
8921 
8922 force_balance:
8923 	env->busiest_group_type = busiest->group_type;
8924 	/* Looks like there is an imbalance. Compute it */
8925 	calculate_imbalance(env, &sds);
8926 	return sds.busiest;
8927 
8928 out_balanced:
8929 	env->imbalance = 0;
8930 	return NULL;
8931 }
8932 
8933 /*
8934  * find_busiest_queue - find the busiest runqueue among the cpus in group.
8935  */
find_busiest_queue(struct lb_env * env,struct sched_group * group)8936 static struct rq *find_busiest_queue(struct lb_env *env,
8937 				     struct sched_group *group)
8938 {
8939 	struct rq *busiest = NULL, *rq;
8940 	unsigned long busiest_load = 0, busiest_capacity = 1;
8941 	int i;
8942 
8943 	for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
8944 		unsigned long capacity, wl;
8945 		enum fbq_type rt;
8946 
8947 		rq = cpu_rq(i);
8948 		rt = fbq_classify_rq(rq);
8949 
8950 		/*
8951 		 * We classify groups/runqueues into three groups:
8952 		 *  - regular: there are !numa tasks
8953 		 *  - remote:  there are numa tasks that run on the 'wrong' node
8954 		 *  - all:     there is no distinction
8955 		 *
8956 		 * In order to avoid migrating ideally placed numa tasks,
8957 		 * ignore those when there's better options.
8958 		 *
8959 		 * If we ignore the actual busiest queue to migrate another
8960 		 * task, the next balance pass can still reduce the busiest
8961 		 * queue by moving tasks around inside the node.
8962 		 *
8963 		 * If we cannot move enough load due to this classification
8964 		 * the next pass will adjust the group classification and
8965 		 * allow migration of more tasks.
8966 		 *
8967 		 * Both cases only affect the total convergence complexity.
8968 		 */
8969 		if (rt > env->fbq_type)
8970 			continue;
8971 
8972 		capacity = capacity_of(i);
8973 
8974 		wl = weighted_cpuload(i);
8975 
8976 		/*
8977 		 * When comparing with imbalance, use weighted_cpuload()
8978 		 * which is not scaled with the cpu capacity.
8979 		 */
8980 
8981 		if (rq->nr_running == 1 && wl > env->imbalance &&
8982 		    !check_cpu_capacity(rq, env->sd) &&
8983 		    env->busiest_group_type != group_misfit_task)
8984 			continue;
8985 
8986 		/*
8987 		 * For the load comparisons with the other cpu's, consider
8988 		 * the weighted_cpuload() scaled with the cpu capacity, so
8989 		 * that the load can be moved away from the cpu that is
8990 		 * potentially running at a lower capacity.
8991 		 *
8992 		 * Thus we're looking for max(wl_i / capacity_i), crosswise
8993 		 * multiplication to rid ourselves of the division works out
8994 		 * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
8995 		 * our previous maximum.
8996 		 */
8997 		if (wl * busiest_capacity > busiest_load * capacity) {
8998 			busiest_load = wl;
8999 			busiest_capacity = capacity;
9000 			busiest = rq;
9001 		}
9002 	}
9003 
9004 	return busiest;
9005 }
9006 
9007 /*
9008  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
9009  * so long as it is large enough.
9010  */
9011 #define MAX_PINNED_INTERVAL	512
9012 
9013 /* Working cpumask for load_balance and load_balance_newidle. */
9014 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
9015 
need_active_balance(struct lb_env * env)9016 static int need_active_balance(struct lb_env *env)
9017 {
9018 	struct sched_domain *sd = env->sd;
9019 
9020 	if (env->idle == CPU_NEWLY_IDLE) {
9021 
9022 		/*
9023 		 * ASYM_PACKING needs to force migrate tasks from busy but
9024 		 * higher numbered CPUs in order to pack all tasks in the
9025 		 * lowest numbered CPUs.
9026 		 */
9027 		if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
9028 			return 1;
9029 	}
9030 
9031 	/*
9032 	 * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task.
9033 	 * It's worth migrating the task if the src_cpu's capacity is reduced
9034 	 * because of other sched_class or IRQs if more capacity stays
9035 	 * available on dst_cpu.
9036 	 */
9037 	if ((env->idle != CPU_NOT_IDLE) &&
9038 	    (env->src_rq->cfs.h_nr_running == 1)) {
9039 		if ((check_cpu_capacity(env->src_rq, sd)) &&
9040 		    (capacity_of(env->src_cpu)*sd->imbalance_pct < capacity_of(env->dst_cpu)*100))
9041 			return 1;
9042 	}
9043 
9044 	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
9045 	    ((capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu))) &&
9046 				env->src_rq->cfs.h_nr_running == 1 &&
9047 				cpu_overutilized(env->src_cpu) &&
9048 				!cpu_overutilized(env->dst_cpu)) {
9049 			return 1;
9050 	}
9051 
9052 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
9053 }
9054 
9055 static int active_load_balance_cpu_stop(void *data);
9056 
should_we_balance(struct lb_env * env)9057 static int should_we_balance(struct lb_env *env)
9058 {
9059 	struct sched_group *sg = env->sd->groups;
9060 	struct cpumask *sg_cpus, *sg_mask;
9061 	int cpu, balance_cpu = -1;
9062 
9063 	/*
9064 	 * In the newly idle case, we will allow all the cpu's
9065 	 * to do the newly idle load balance.
9066 	 */
9067 	if (env->idle == CPU_NEWLY_IDLE)
9068 		return 1;
9069 
9070 	sg_cpus = sched_group_cpus(sg);
9071 	sg_mask = sched_group_mask(sg);
9072 	/* Try to find first idle cpu */
9073 	for_each_cpu_and(cpu, sg_cpus, env->cpus) {
9074 		if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
9075 			continue;
9076 
9077 		balance_cpu = cpu;
9078 		break;
9079 	}
9080 
9081 	if (balance_cpu == -1)
9082 		balance_cpu = group_balance_cpu(sg);
9083 
9084 	/*
9085 	 * First idle cpu or the first cpu(busiest) in this sched group
9086 	 * is eligible for doing load balancing at this and above domains.
9087 	 */
9088 	return balance_cpu == env->dst_cpu;
9089 }
9090 
9091 /*
9092  * Check this_cpu to ensure it is balanced within domain. Attempt to move
9093  * tasks if there is an imbalance.
9094  */
load_balance(int this_cpu,struct rq * this_rq,struct sched_domain * sd,enum cpu_idle_type idle,int * continue_balancing)9095 static int load_balance(int this_cpu, struct rq *this_rq,
9096 			struct sched_domain *sd, enum cpu_idle_type idle,
9097 			int *continue_balancing)
9098 {
9099 	int ld_moved, cur_ld_moved, active_balance = 0;
9100 	struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
9101 	struct sched_group *group;
9102 	struct rq *busiest;
9103 	unsigned long flags;
9104 	struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
9105 
9106 	struct lb_env env = {
9107 		.sd		= sd,
9108 		.dst_cpu	= this_cpu,
9109 		.dst_rq		= this_rq,
9110 		.dst_grpmask    = sched_group_cpus(sd->groups),
9111 		.idle		= idle,
9112 		.loop_break	= sched_nr_migrate_break,
9113 		.cpus		= cpus,
9114 		.fbq_type	= all,
9115 		.tasks		= LIST_HEAD_INIT(env.tasks),
9116 	};
9117 
9118 	/*
9119 	 * For NEWLY_IDLE load_balancing, we don't need to consider
9120 	 * other cpus in our group
9121 	 */
9122 	if (idle == CPU_NEWLY_IDLE)
9123 		env.dst_grpmask = NULL;
9124 
9125 	cpumask_copy(cpus, cpu_active_mask);
9126 
9127 	schedstat_inc(sd, lb_count[idle]);
9128 
9129 redo:
9130 	if (!should_we_balance(&env)) {
9131 		*continue_balancing = 0;
9132 		goto out_balanced;
9133 	}
9134 
9135 	group = find_busiest_group(&env);
9136 	if (!group) {
9137 		schedstat_inc(sd, lb_nobusyg[idle]);
9138 		goto out_balanced;
9139 	}
9140 
9141 	busiest = find_busiest_queue(&env, group);
9142 	if (!busiest) {
9143 		schedstat_inc(sd, lb_nobusyq[idle]);
9144 		goto out_balanced;
9145 	}
9146 
9147 	BUG_ON(busiest == env.dst_rq);
9148 
9149 	schedstat_add(sd, lb_imbalance[idle], env.imbalance);
9150 
9151 	env.src_cpu = busiest->cpu;
9152 	env.src_rq = busiest;
9153 
9154 	ld_moved = 0;
9155 	if (busiest->nr_running > 1) {
9156 		/*
9157 		 * Attempt to move tasks. If find_busiest_group has found
9158 		 * an imbalance but busiest->nr_running <= 1, the group is
9159 		 * still unbalanced. ld_moved simply stays zero, so it is
9160 		 * correctly treated as an imbalance.
9161 		 */
9162 		env.flags |= LBF_ALL_PINNED;
9163 		env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
9164 
9165 more_balance:
9166 		raw_spin_lock_irqsave(&busiest->lock, flags);
9167 		update_rq_clock(busiest);
9168 
9169 		/*
9170 		 * cur_ld_moved - load moved in current iteration
9171 		 * ld_moved     - cumulative load moved across iterations
9172 		 */
9173 		cur_ld_moved = detach_tasks(&env);
9174 
9175 		/*
9176 		 * We've detached some tasks from busiest_rq. Every
9177 		 * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
9178 		 * unlock busiest->lock, and we are able to be sure
9179 		 * that nobody can manipulate the tasks in parallel.
9180 		 * See task_rq_lock() family for the details.
9181 		 */
9182 
9183 		raw_spin_unlock(&busiest->lock);
9184 
9185 		if (cur_ld_moved) {
9186 			attach_tasks(&env);
9187 			ld_moved += cur_ld_moved;
9188 		}
9189 
9190 		local_irq_restore(flags);
9191 
9192 		if (env.flags & LBF_NEED_BREAK) {
9193 			env.flags &= ~LBF_NEED_BREAK;
9194 			goto more_balance;
9195 		}
9196 
9197 		/*
9198 		 * Revisit (affine) tasks on src_cpu that couldn't be moved to
9199 		 * us and move them to an alternate dst_cpu in our sched_group
9200 		 * where they can run. The upper limit on how many times we
9201 		 * iterate on same src_cpu is dependent on number of cpus in our
9202 		 * sched_group.
9203 		 *
9204 		 * This changes load balance semantics a bit on who can move
9205 		 * load to a given_cpu. In addition to the given_cpu itself
9206 		 * (or a ilb_cpu acting on its behalf where given_cpu is
9207 		 * nohz-idle), we now have balance_cpu in a position to move
9208 		 * load to given_cpu. In rare situations, this may cause
9209 		 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
9210 		 * _independently_ and at _same_ time to move some load to
9211 		 * given_cpu) causing exceess load to be moved to given_cpu.
9212 		 * This however should not happen so much in practice and
9213 		 * moreover subsequent load balance cycles should correct the
9214 		 * excess load moved.
9215 		 */
9216 		if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
9217 
9218 			/* Prevent to re-select dst_cpu via env's cpus */
9219 			cpumask_clear_cpu(env.dst_cpu, env.cpus);
9220 
9221 			env.dst_rq	 = cpu_rq(env.new_dst_cpu);
9222 			env.dst_cpu	 = env.new_dst_cpu;
9223 			env.flags	&= ~LBF_DST_PINNED;
9224 			env.loop	 = 0;
9225 			env.loop_break	 = sched_nr_migrate_break;
9226 
9227 			/*
9228 			 * Go back to "more_balance" rather than "redo" since we
9229 			 * need to continue with same src_cpu.
9230 			 */
9231 			goto more_balance;
9232 		}
9233 
9234 		/*
9235 		 * We failed to reach balance because of affinity.
9236 		 */
9237 		if (sd_parent) {
9238 			int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9239 
9240 			if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
9241 				*group_imbalance = 1;
9242 		}
9243 
9244 		/* All tasks on this runqueue were pinned by CPU affinity */
9245 		if (unlikely(env.flags & LBF_ALL_PINNED)) {
9246 			cpumask_clear_cpu(cpu_of(busiest), cpus);
9247 			if (!cpumask_empty(cpus)) {
9248 				env.loop = 0;
9249 				env.loop_break = sched_nr_migrate_break;
9250 				goto redo;
9251 			}
9252 			goto out_all_pinned;
9253 		}
9254 	}
9255 
9256 	if (!ld_moved) {
9257 		schedstat_inc(sd, lb_failed[idle]);
9258 		/*
9259 		 * Increment the failure counter only on periodic balance.
9260 		 * We do not want newidle balance, which can be very
9261 		 * frequent, pollute the failure counter causing
9262 		 * excessive cache_hot migrations and active balances.
9263 		 */
9264 		if (idle != CPU_NEWLY_IDLE)
9265 			if (env.src_grp_nr_running > 1)
9266 				sd->nr_balance_failed++;
9267 
9268 		if (need_active_balance(&env)) {
9269 			raw_spin_lock_irqsave(&busiest->lock, flags);
9270 
9271 			/* don't kick the active_load_balance_cpu_stop,
9272 			 * if the curr task on busiest cpu can't be
9273 			 * moved to this_cpu
9274 			 */
9275 			if (!cpumask_test_cpu(this_cpu,
9276 					tsk_cpus_allowed(busiest->curr))) {
9277 				raw_spin_unlock_irqrestore(&busiest->lock,
9278 							    flags);
9279 				env.flags |= LBF_ALL_PINNED;
9280 				goto out_one_pinned;
9281 			}
9282 
9283 			/*
9284 			 * ->active_balance synchronizes accesses to
9285 			 * ->active_balance_work.  Once set, it's cleared
9286 			 * only after active load balance is finished.
9287 			 */
9288 			if (!busiest->active_balance) {
9289 				busiest->active_balance = 1;
9290 				busiest->push_cpu = this_cpu;
9291 				active_balance = 1;
9292 			}
9293 			raw_spin_unlock_irqrestore(&busiest->lock, flags);
9294 
9295 			if (active_balance) {
9296 				stop_one_cpu_nowait(cpu_of(busiest),
9297 					active_load_balance_cpu_stop, busiest,
9298 					&busiest->active_balance_work);
9299 			}
9300 
9301 			/*
9302 			 * We've kicked active balancing, reset the failure
9303 			 * counter.
9304 			 */
9305 			sd->nr_balance_failed = sd->cache_nice_tries+1;
9306 		}
9307 	} else
9308 		sd->nr_balance_failed = 0;
9309 
9310 	if (likely(!active_balance)) {
9311 		/* We were unbalanced, so reset the balancing interval */
9312 		sd->balance_interval = sd->min_interval;
9313 	} else {
9314 		/*
9315 		 * If we've begun active balancing, start to back off. This
9316 		 * case may not be covered by the all_pinned logic if there
9317 		 * is only 1 task on the busy runqueue (because we don't call
9318 		 * detach_tasks).
9319 		 */
9320 		if (sd->balance_interval < sd->max_interval)
9321 			sd->balance_interval *= 2;
9322 	}
9323 
9324 	goto out;
9325 
9326 out_balanced:
9327 	/*
9328 	 * We reach balance although we may have faced some affinity
9329 	 * constraints. Clear the imbalance flag only if other tasks got
9330 	 * a chance to move and fix the imbalance.
9331 	 */
9332 	if (sd_parent && !(env.flags & LBF_ALL_PINNED)) {
9333 		int *group_imbalance = &sd_parent->groups->sgc->imbalance;
9334 
9335 		if (*group_imbalance)
9336 			*group_imbalance = 0;
9337 	}
9338 
9339 out_all_pinned:
9340 	/*
9341 	 * We reach balance because all tasks are pinned at this level so
9342 	 * we can't migrate them. Let the imbalance flag set so parent level
9343 	 * can try to migrate them.
9344 	 */
9345 	schedstat_inc(sd, lb_balanced[idle]);
9346 
9347 	sd->nr_balance_failed = 0;
9348 
9349 out_one_pinned:
9350 	ld_moved = 0;
9351 
9352 	/*
9353 	 * idle_balance() disregards balance intervals, so we could repeatedly
9354 	 * reach this code, which would lead to balance_interval skyrocketting
9355 	 * in a short amount of time. Skip the balance_interval increase logic
9356 	 * to avoid that.
9357 	 */
9358 	if (env.idle == CPU_NEWLY_IDLE)
9359 		goto out;
9360 
9361 	/* tune up the balancing interval */
9362 	if (((env.flags & LBF_ALL_PINNED) &&
9363 			sd->balance_interval < MAX_PINNED_INTERVAL) ||
9364 			(sd->balance_interval < sd->max_interval))
9365 		sd->balance_interval *= 2;
9366 out:
9367 	return ld_moved;
9368 }
9369 
9370 static inline unsigned long
get_sd_balance_interval(struct sched_domain * sd,int cpu_busy)9371 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
9372 {
9373 	unsigned long interval = sd->balance_interval;
9374 
9375 	if (cpu_busy)
9376 		interval *= sd->busy_factor;
9377 
9378 	/* scale ms to jiffies */
9379 	interval = msecs_to_jiffies(interval);
9380 	interval = clamp(interval, 1UL, max_load_balance_interval);
9381 
9382 	return interval;
9383 }
9384 
9385 static inline void
update_next_balance(struct sched_domain * sd,int cpu_busy,unsigned long * next_balance)9386 update_next_balance(struct sched_domain *sd, int cpu_busy, unsigned long *next_balance)
9387 {
9388 	unsigned long interval, next;
9389 
9390 	interval = get_sd_balance_interval(sd, cpu_busy);
9391 	next = sd->last_balance + interval;
9392 
9393 	if (time_after(*next_balance, next))
9394 		*next_balance = next;
9395 }
9396 
9397 /*
9398  * idle_balance is called by schedule() if this_cpu is about to become
9399  * idle. Attempts to pull tasks from other CPUs.
9400  */
idle_balance(struct rq * this_rq)9401 static int idle_balance(struct rq *this_rq)
9402 {
9403 	unsigned long next_balance = jiffies + HZ;
9404 	int this_cpu = this_rq->cpu;
9405 	struct sched_domain *sd;
9406 	int pulled_task = 0;
9407 	u64 curr_cost = 0;
9408 
9409 	idle_enter_fair(this_rq);
9410 
9411 	/*
9412 	 * We must set idle_stamp _before_ calling idle_balance(), such that we
9413 	 * measure the duration of idle_balance() as idle time.
9414 	 */
9415 	this_rq->idle_stamp = rq_clock(this_rq);
9416 
9417 	if (!energy_aware() &&
9418 	    (this_rq->avg_idle < sysctl_sched_migration_cost ||
9419 	     !this_rq->rd->overload)) {
9420 		rcu_read_lock();
9421 		sd = rcu_dereference_check_sched_domain(this_rq->sd);
9422 		if (sd)
9423 			update_next_balance(sd, 0, &next_balance);
9424 		rcu_read_unlock();
9425 
9426 		goto out;
9427 	}
9428 
9429 	raw_spin_unlock(&this_rq->lock);
9430 
9431 	update_blocked_averages(this_cpu);
9432 	rcu_read_lock();
9433 	for_each_domain(this_cpu, sd) {
9434 		int continue_balancing = 1;
9435 		u64 t0, domain_cost;
9436 
9437 		if (!(sd->flags & SD_LOAD_BALANCE))
9438 			continue;
9439 
9440 		if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
9441 			update_next_balance(sd, 0, &next_balance);
9442 			break;
9443 		}
9444 
9445 		if (sd->flags & SD_BALANCE_NEWIDLE) {
9446 			t0 = sched_clock_cpu(this_cpu);
9447 
9448 			pulled_task = load_balance(this_cpu, this_rq,
9449 						   sd, CPU_NEWLY_IDLE,
9450 						   &continue_balancing);
9451 
9452 			domain_cost = sched_clock_cpu(this_cpu) - t0;
9453 			if (domain_cost > sd->max_newidle_lb_cost)
9454 				sd->max_newidle_lb_cost = domain_cost;
9455 
9456 			curr_cost += domain_cost;
9457 		}
9458 
9459 		update_next_balance(sd, 0, &next_balance);
9460 
9461 		/*
9462 		 * Stop searching for tasks to pull if there are
9463 		 * now runnable tasks on this rq.
9464 		 */
9465 		if (pulled_task || this_rq->nr_running > 0)
9466 			break;
9467 	}
9468 	rcu_read_unlock();
9469 
9470 	raw_spin_lock(&this_rq->lock);
9471 
9472 	if (curr_cost > this_rq->max_idle_balance_cost)
9473 		this_rq->max_idle_balance_cost = curr_cost;
9474 
9475 	/*
9476 	 * While browsing the domains, we released the rq lock, a task could
9477 	 * have been enqueued in the meantime. Since we're not going idle,
9478 	 * pretend we pulled a task.
9479 	 */
9480 	if (this_rq->cfs.h_nr_running && !pulled_task)
9481 		pulled_task = 1;
9482 
9483 out:
9484 	/* Move the next balance forward */
9485 	if (time_after(this_rq->next_balance, next_balance))
9486 		this_rq->next_balance = next_balance;
9487 
9488 	/* Is there a task of a high priority class? */
9489 	if (this_rq->nr_running != this_rq->cfs.h_nr_running)
9490 		pulled_task = -1;
9491 
9492 	if (pulled_task) {
9493 		idle_exit_fair(this_rq);
9494 		this_rq->idle_stamp = 0;
9495 	}
9496 
9497 	return pulled_task;
9498 }
9499 
9500 /*
9501  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
9502  * running tasks off the busiest CPU onto idle CPUs. It requires at
9503  * least 1 task to be running on each physical CPU where possible, and
9504  * avoids physical / logical imbalances.
9505  */
active_load_balance_cpu_stop(void * data)9506 static int active_load_balance_cpu_stop(void *data)
9507 {
9508 	struct rq *busiest_rq = data;
9509 	int busiest_cpu = cpu_of(busiest_rq);
9510 	int target_cpu = busiest_rq->push_cpu;
9511 	struct rq *target_rq = cpu_rq(target_cpu);
9512 	struct sched_domain *sd = NULL;
9513 	struct task_struct *p = NULL;
9514 	struct task_struct *push_task = NULL;
9515 	int push_task_detached = 0;
9516 	struct lb_env env = {
9517 		.sd		= sd,
9518 		.dst_cpu	= target_cpu,
9519 		.dst_rq		= target_rq,
9520 		.src_cpu	= busiest_rq->cpu,
9521 		.src_rq		= busiest_rq,
9522 		.idle		= CPU_IDLE,
9523 	};
9524 
9525 	raw_spin_lock_irq(&busiest_rq->lock);
9526 
9527 	/* make sure the requested cpu hasn't gone down in the meantime */
9528 	if (unlikely(busiest_cpu != smp_processor_id() ||
9529 		     !busiest_rq->active_balance))
9530 		goto out_unlock;
9531 
9532 	/* Is there any task to move? */
9533 	if (busiest_rq->nr_running <= 1)
9534 		goto out_unlock;
9535 
9536 	/*
9537 	 * This condition is "impossible", if it occurs
9538 	 * we need to fix it. Originally reported by
9539 	 * Bjorn Helgaas on a 128-cpu setup.
9540 	 */
9541 	BUG_ON(busiest_rq == target_rq);
9542 
9543 	push_task = busiest_rq->push_task;
9544 	if (push_task) {
9545 		if (task_on_rq_queued(push_task) &&
9546 			task_cpu(push_task) == busiest_cpu &&
9547 					cpu_online(target_cpu)) {
9548 			detach_task(push_task, &env);
9549 			push_task_detached = 1;
9550 		}
9551 		goto out_unlock;
9552 	}
9553 
9554 	/* Search for an sd spanning us and the target CPU. */
9555 	rcu_read_lock();
9556 	for_each_domain(target_cpu, sd) {
9557 		if ((sd->flags & SD_LOAD_BALANCE) &&
9558 		    cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9559 				break;
9560 	}
9561 
9562 	if (likely(sd)) {
9563 		env.sd = sd;
9564 		schedstat_inc(sd, alb_count);
9565 		update_rq_clock(busiest_rq);
9566 
9567 		p = detach_one_task(&env);
9568 		if (p)
9569 			schedstat_inc(sd, alb_pushed);
9570 		else
9571 			schedstat_inc(sd, alb_failed);
9572 	}
9573 	rcu_read_unlock();
9574 out_unlock:
9575 	busiest_rq->active_balance = 0;
9576 
9577 	if (push_task)
9578 		busiest_rq->push_task = NULL;
9579 
9580 	raw_spin_unlock(&busiest_rq->lock);
9581 
9582 	if (push_task) {
9583 		if (push_task_detached)
9584 			attach_one_task(target_rq, push_task);
9585 		put_task_struct(push_task);
9586 	}
9587 
9588 	if (p)
9589 		attach_one_task(target_rq, p);
9590 
9591 	local_irq_enable();
9592 
9593 	return 0;
9594 }
9595 
on_null_domain(struct rq * rq)9596 static inline int on_null_domain(struct rq *rq)
9597 {
9598 	return unlikely(!rcu_dereference_sched(rq->sd));
9599 }
9600 
9601 #ifdef CONFIG_NO_HZ_COMMON
9602 /*
9603  * idle load balancing details
9604  * - When one of the busy CPUs notice that there may be an idle rebalancing
9605  *   needed, they will kick the idle load balancer, which then does idle
9606  *   load balancing for all the idle CPUs.
9607  */
find_new_ilb(void)9608 static inline int find_new_ilb(void)
9609 {
9610 	int ilb = cpumask_first(nohz.idle_cpus_mask);
9611 
9612 	if (ilb < nr_cpu_ids && idle_cpu(ilb))
9613 		return ilb;
9614 
9615 	return nr_cpu_ids;
9616 }
9617 
9618 /*
9619  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
9620  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
9621  * CPU (if there is one).
9622  */
nohz_balancer_kick(void)9623 static void nohz_balancer_kick(void)
9624 {
9625 	int ilb_cpu;
9626 
9627 	nohz.next_balance++;
9628 
9629 	ilb_cpu = find_new_ilb();
9630 
9631 	if (ilb_cpu >= nr_cpu_ids)
9632 		return;
9633 
9634 	if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
9635 		return;
9636 	/*
9637 	 * Use smp_send_reschedule() instead of resched_cpu().
9638 	 * This way we generate a sched IPI on the target cpu which
9639 	 * is idle. And the softirq performing nohz idle load balance
9640 	 * will be run before returning from the IPI.
9641 	 */
9642 	smp_send_reschedule(ilb_cpu);
9643 	return;
9644 }
9645 
nohz_balance_exit_idle(int cpu)9646 static inline void nohz_balance_exit_idle(int cpu)
9647 {
9648 	if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
9649 		/*
9650 		 * Completely isolated CPUs don't ever set, so we must test.
9651 		 */
9652 		if (likely(cpumask_test_cpu(cpu, nohz.idle_cpus_mask))) {
9653 			cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
9654 			atomic_dec(&nohz.nr_cpus);
9655 		}
9656 		clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9657 	}
9658 }
9659 
set_cpu_sd_state_busy(void)9660 static inline void set_cpu_sd_state_busy(void)
9661 {
9662 	struct sched_domain *sd;
9663 	int cpu = smp_processor_id();
9664 
9665 	rcu_read_lock();
9666 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
9667 
9668 	if (!sd || !sd->nohz_idle)
9669 		goto unlock;
9670 	sd->nohz_idle = 0;
9671 
9672 	atomic_inc(&sd->groups->sgc->nr_busy_cpus);
9673 unlock:
9674 	rcu_read_unlock();
9675 }
9676 
set_cpu_sd_state_idle(void)9677 void set_cpu_sd_state_idle(void)
9678 {
9679 	struct sched_domain *sd;
9680 	int cpu = smp_processor_id();
9681 
9682 	rcu_read_lock();
9683 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
9684 
9685 	if (!sd || sd->nohz_idle)
9686 		goto unlock;
9687 	sd->nohz_idle = 1;
9688 
9689 	atomic_dec(&sd->groups->sgc->nr_busy_cpus);
9690 unlock:
9691 	rcu_read_unlock();
9692 }
9693 
9694 /*
9695  * This routine will record that the cpu is going idle with tick stopped.
9696  * This info will be used in performing idle load balancing in the future.
9697  */
nohz_balance_enter_idle(int cpu)9698 void nohz_balance_enter_idle(int cpu)
9699 {
9700 	/*
9701 	 * If this cpu is going down, then nothing needs to be done.
9702 	 */
9703 	if (!cpu_active(cpu))
9704 		return;
9705 
9706 	if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
9707 		return;
9708 
9709 	/*
9710 	 * If we're a completely isolated CPU, we don't play.
9711 	 */
9712 	if (on_null_domain(cpu_rq(cpu)))
9713 		return;
9714 
9715 	cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
9716 	atomic_inc(&nohz.nr_cpus);
9717 	set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
9718 }
9719 
sched_ilb_notifier(struct notifier_block * nfb,unsigned long action,void * hcpu)9720 static int sched_ilb_notifier(struct notifier_block *nfb,
9721 					unsigned long action, void *hcpu)
9722 {
9723 	switch (action & ~CPU_TASKS_FROZEN) {
9724 	case CPU_DYING:
9725 		nohz_balance_exit_idle(smp_processor_id());
9726 		return NOTIFY_OK;
9727 	default:
9728 		return NOTIFY_DONE;
9729 	}
9730 }
9731 #endif
9732 
9733 static DEFINE_SPINLOCK(balancing);
9734 
9735 /*
9736  * Scale the max load_balance interval with the number of CPUs in the system.
9737  * This trades load-balance latency on larger machines for less cross talk.
9738  */
update_max_interval(void)9739 void update_max_interval(void)
9740 {
9741 	max_load_balance_interval = HZ*num_online_cpus()/10;
9742 }
9743 
9744 /*
9745  * It checks each scheduling domain to see if it is due to be balanced,
9746  * and initiates a balancing operation if so.
9747  *
9748  * Balancing parameters are set up in init_sched_domains.
9749  */
rebalance_domains(struct rq * rq,enum cpu_idle_type idle)9750 static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
9751 {
9752 	int continue_balancing = 1;
9753 	int cpu = rq->cpu;
9754 	unsigned long interval;
9755 	struct sched_domain *sd;
9756 	/* Earliest time when we have to do rebalance again */
9757 	unsigned long next_balance = jiffies + 60*HZ;
9758 	int update_next_balance = 0;
9759 	int need_serialize, need_decay = 0;
9760 	u64 max_cost = 0;
9761 
9762 	update_blocked_averages(cpu);
9763 
9764 	rcu_read_lock();
9765 	for_each_domain(cpu, sd) {
9766 		/*
9767 		 * Decay the newidle max times here because this is a regular
9768 		 * visit to all the domains. Decay ~1% per second.
9769 		 */
9770 		if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
9771 			sd->max_newidle_lb_cost =
9772 				(sd->max_newidle_lb_cost * 253) / 256;
9773 			sd->next_decay_max_lb_cost = jiffies + HZ;
9774 			need_decay = 1;
9775 		}
9776 		max_cost += sd->max_newidle_lb_cost;
9777 
9778 		if (!(sd->flags & SD_LOAD_BALANCE))
9779 			continue;
9780 
9781 		/*
9782 		 * Stop the load balance at this level. There is another
9783 		 * CPU in our sched group which is doing load balancing more
9784 		 * actively.
9785 		 */
9786 		if (!continue_balancing) {
9787 			if (need_decay)
9788 				continue;
9789 			break;
9790 		}
9791 
9792 		interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9793 
9794 		need_serialize = sd->flags & SD_SERIALIZE;
9795 		if (need_serialize) {
9796 			if (!spin_trylock(&balancing))
9797 				goto out;
9798 		}
9799 
9800 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
9801 			if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
9802 				/*
9803 				 * The LBF_DST_PINNED logic could have changed
9804 				 * env->dst_cpu, so we can't know our idle
9805 				 * state even if we migrated tasks. Update it.
9806 				 */
9807 				idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
9808 			}
9809 			sd->last_balance = jiffies;
9810 			interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
9811 		}
9812 		if (need_serialize)
9813 			spin_unlock(&balancing);
9814 out:
9815 		if (time_after(next_balance, sd->last_balance + interval)) {
9816 			next_balance = sd->last_balance + interval;
9817 			update_next_balance = 1;
9818 		}
9819 	}
9820 	if (need_decay) {
9821 		/*
9822 		 * Ensure the rq-wide value also decays but keep it at a
9823 		 * reasonable floor to avoid funnies with rq->avg_idle.
9824 		 */
9825 		rq->max_idle_balance_cost =
9826 			max((u64)sysctl_sched_migration_cost, max_cost);
9827 	}
9828 	rcu_read_unlock();
9829 
9830 	/*
9831 	 * next_balance will be updated only when there is a need.
9832 	 * When the cpu is attached to null domain for ex, it will not be
9833 	 * updated.
9834 	 */
9835 	if (likely(update_next_balance)) {
9836 		rq->next_balance = next_balance;
9837 
9838 #ifdef CONFIG_NO_HZ_COMMON
9839 		/*
9840 		 * If this CPU has been elected to perform the nohz idle
9841 		 * balance. Other idle CPUs have already rebalanced with
9842 		 * nohz_idle_balance() and nohz.next_balance has been
9843 		 * updated accordingly. This CPU is now running the idle load
9844 		 * balance for itself and we need to update the
9845 		 * nohz.next_balance accordingly.
9846 		 */
9847 		if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
9848 			nohz.next_balance = rq->next_balance;
9849 #endif
9850 	}
9851 }
9852 
9853 #ifdef CONFIG_NO_HZ_COMMON
9854 /*
9855  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
9856  * rebalancing for all the cpus for whom scheduler ticks are stopped.
9857  */
nohz_idle_balance(struct rq * this_rq,enum cpu_idle_type idle)9858 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
9859 {
9860 	int this_cpu = this_rq->cpu;
9861 	struct rq *rq;
9862 	int balance_cpu;
9863 	/* Earliest time when we have to do rebalance again */
9864 	unsigned long next_balance = jiffies + 60*HZ;
9865 	int update_next_balance = 0;
9866 
9867 	if (idle != CPU_IDLE ||
9868 	    !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
9869 		goto end;
9870 
9871 	for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
9872 		if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
9873 			continue;
9874 
9875 		/*
9876 		 * If this cpu gets work to do, stop the load balancing
9877 		 * work being done for other cpus. Next load
9878 		 * balancing owner will pick it up.
9879 		 */
9880 		if (need_resched())
9881 			break;
9882 
9883 		rq = cpu_rq(balance_cpu);
9884 
9885 		/*
9886 		 * If time for next balance is due,
9887 		 * do the balance.
9888 		 */
9889 		if (time_after_eq(jiffies, rq->next_balance)) {
9890 			raw_spin_lock_irq(&rq->lock);
9891 			update_rq_clock(rq);
9892 			update_idle_cpu_load(rq);
9893 			raw_spin_unlock_irq(&rq->lock);
9894 			rebalance_domains(rq, CPU_IDLE);
9895 		}
9896 
9897 		if (time_after(next_balance, rq->next_balance)) {
9898 			next_balance = rq->next_balance;
9899 			update_next_balance = 1;
9900 		}
9901 	}
9902 
9903 	/*
9904 	 * next_balance will be updated only when there is a need.
9905 	 * When the CPU is attached to null domain for ex, it will not be
9906 	 * updated.
9907 	 */
9908 	if (likely(update_next_balance))
9909 		nohz.next_balance = next_balance;
9910 end:
9911 	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
9912 }
9913 
9914 /*
9915  * Current heuristic for kicking the idle load balancer in the presence
9916  * of an idle cpu in the system.
9917  *   - This rq has more than one task.
9918  *   - This rq has at least one CFS task and the capacity of the CPU is
9919  *     significantly reduced because of RT tasks or IRQs.
9920  *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
9921  *     multiple busy cpu.
9922  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
9923  *     domain span are idle.
9924  */
nohz_kick_needed(struct rq * rq)9925 static inline bool nohz_kick_needed(struct rq *rq)
9926 {
9927 	unsigned long now = jiffies;
9928 	struct sched_domain *sd;
9929 	struct sched_group_capacity *sgc;
9930 	int nr_busy, cpu = rq->cpu;
9931 	bool kick = false;
9932 
9933 	if (unlikely(rq->idle_balance))
9934 		return false;
9935 
9936        /*
9937 	* We may be recently in ticked or tickless idle mode. At the first
9938 	* busy tick after returning from idle, we will update the busy stats.
9939 	*/
9940 	set_cpu_sd_state_busy();
9941 	nohz_balance_exit_idle(cpu);
9942 
9943 	/*
9944 	 * None are in tickless mode and hence no need for NOHZ idle load
9945 	 * balancing.
9946 	 */
9947 	if (likely(!atomic_read(&nohz.nr_cpus)))
9948 		return false;
9949 
9950 	if (time_before(now, nohz.next_balance))
9951 		return false;
9952 
9953 	if (rq->nr_running >= 2 &&
9954 	    (!energy_aware() || cpu_overutilized(cpu)))
9955 		return true;
9956 
9957 	/* Do idle load balance if there have misfit task */
9958 	if (energy_aware())
9959 		return rq->misfit_task;
9960 
9961 	rcu_read_lock();
9962 	sd = rcu_dereference(per_cpu(sd_busy, cpu));
9963 	if (sd) {
9964 		sgc = sd->groups->sgc;
9965 		nr_busy = atomic_read(&sgc->nr_busy_cpus);
9966 
9967 		if (nr_busy > 1) {
9968 			kick = true;
9969 			goto unlock;
9970 		}
9971 
9972 	}
9973 
9974 	sd = rcu_dereference(rq->sd);
9975 	if (sd) {
9976 		if ((rq->cfs.h_nr_running >= 1) &&
9977 				check_cpu_capacity(rq, sd)) {
9978 			kick = true;
9979 			goto unlock;
9980 		}
9981 	}
9982 
9983 	sd = rcu_dereference(per_cpu(sd_asym, cpu));
9984 	if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
9985 				  sched_domain_span(sd)) < cpu)) {
9986 		kick = true;
9987 		goto unlock;
9988 	}
9989 
9990 unlock:
9991 	rcu_read_unlock();
9992 	return kick;
9993 }
9994 #else
nohz_idle_balance(struct rq * this_rq,enum cpu_idle_type idle)9995 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
9996 #endif
9997 
9998 /*
9999  * run_rebalance_domains is triggered when needed from the scheduler tick.
10000  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
10001  */
run_rebalance_domains(struct softirq_action * h)10002 static void run_rebalance_domains(struct softirq_action *h)
10003 {
10004 	struct rq *this_rq = this_rq();
10005 	enum cpu_idle_type idle = this_rq->idle_balance ?
10006 						CPU_IDLE : CPU_NOT_IDLE;
10007 
10008 	/*
10009 	 * If this cpu has a pending nohz_balance_kick, then do the
10010 	 * balancing on behalf of the other idle cpus whose ticks are
10011 	 * stopped. Do nohz_idle_balance *before* rebalance_domains to
10012 	 * give the idle cpus a chance to load balance. Else we may
10013 	 * load balance only within the local sched_domain hierarchy
10014 	 * and abort nohz_idle_balance altogether if we pull some load.
10015 	 */
10016 	nohz_idle_balance(this_rq, idle);
10017 	rebalance_domains(this_rq, idle);
10018 }
10019 
10020 /*
10021  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
10022  */
trigger_load_balance(struct rq * rq)10023 void trigger_load_balance(struct rq *rq)
10024 {
10025 	/* Don't need to rebalance while attached to NULL domain */
10026 	if (unlikely(on_null_domain(rq)))
10027 		return;
10028 
10029 	if (time_after_eq(jiffies, rq->next_balance))
10030 		raise_softirq(SCHED_SOFTIRQ);
10031 #ifdef CONFIG_NO_HZ_COMMON
10032 	if (nohz_kick_needed(rq))
10033 		nohz_balancer_kick();
10034 #endif
10035 }
10036 
rq_online_fair(struct rq * rq)10037 static void rq_online_fair(struct rq *rq)
10038 {
10039 	update_sysctl();
10040 
10041 	update_runtime_enabled(rq);
10042 }
10043 
rq_offline_fair(struct rq * rq)10044 static void rq_offline_fair(struct rq *rq)
10045 {
10046 	update_sysctl();
10047 
10048 	/* Ensure any throttled groups are reachable by pick_next_task */
10049 	unthrottle_offline_cfs_rqs(rq);
10050 }
10051 
10052 static inline int
kick_active_balance(struct rq * rq,struct task_struct * p,int new_cpu)10053 kick_active_balance(struct rq *rq, struct task_struct *p, int new_cpu)
10054 {
10055 	int rc = 0;
10056 
10057 	/* Invoke active balance to force migrate currently running task */
10058 	raw_spin_lock(&rq->lock);
10059 	if (!rq->active_balance) {
10060 		rq->active_balance = 1;
10061 		rq->push_cpu = new_cpu;
10062 		get_task_struct(p);
10063 		rq->push_task = p;
10064 		rc = 1;
10065 	}
10066 	raw_spin_unlock(&rq->lock);
10067 
10068 	return rc;
10069 }
10070 
check_for_migration(struct rq * rq,struct task_struct * p)10071 void check_for_migration(struct rq *rq, struct task_struct *p)
10072 {
10073 	int new_cpu;
10074 	int active_balance;
10075 	int cpu = task_cpu(p);
10076 
10077 	if (energy_aware() && rq->misfit_task) {
10078 		if (rq->curr->state != TASK_RUNNING ||
10079 		    rq->curr->nr_cpus_allowed == 1)
10080 			return;
10081 
10082 		new_cpu = select_energy_cpu_brute(p, cpu, 0);
10083 		if (capacity_orig_of(new_cpu) > capacity_orig_of(cpu)) {
10084 			active_balance = kick_active_balance(rq, p, new_cpu);
10085 			if (active_balance)
10086 				stop_one_cpu_nowait(cpu,
10087 						active_load_balance_cpu_stop,
10088 						rq, &rq->active_balance_work);
10089 		}
10090 	}
10091 }
10092 
10093 #endif /* CONFIG_SMP */
10094 
10095 /*
10096  * scheduler tick hitting a task of our scheduling class:
10097  */
task_tick_fair(struct rq * rq,struct task_struct * curr,int queued)10098 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
10099 {
10100 	struct cfs_rq *cfs_rq;
10101 	struct sched_entity *se = &curr->se;
10102 
10103 	for_each_sched_entity(se) {
10104 		cfs_rq = cfs_rq_of(se);
10105 		entity_tick(cfs_rq, se, queued);
10106 	}
10107 
10108 	if (static_branch_unlikely(&sched_numa_balancing))
10109 		task_tick_numa(rq, curr);
10110 
10111 #ifdef CONFIG_SMP
10112 	if (!rq->rd->overutilized && cpu_overutilized(task_cpu(curr))) {
10113 		rq->rd->overutilized = true;
10114 		trace_sched_overutilized(true);
10115 	}
10116 
10117 	rq->misfit_task = !task_fits_max(curr, rq->cpu);
10118 #endif
10119 
10120 }
10121 
10122 /*
10123  * called on fork with the child task as argument from the parent's context
10124  *  - child not yet on the tasklist
10125  *  - preemption disabled
10126  */
task_fork_fair(struct task_struct * p)10127 static void task_fork_fair(struct task_struct *p)
10128 {
10129 	struct cfs_rq *cfs_rq;
10130 	struct sched_entity *se = &p->se, *curr;
10131 	struct rq *rq = this_rq();
10132 
10133 	raw_spin_lock(&rq->lock);
10134 	update_rq_clock(rq);
10135 
10136 	cfs_rq = task_cfs_rq(current);
10137 	curr = cfs_rq->curr;
10138 	if (curr) {
10139 		update_curr(cfs_rq);
10140 		se->vruntime = curr->vruntime;
10141 	}
10142 	place_entity(cfs_rq, se, 1);
10143 
10144 	if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
10145 		/*
10146 		 * Upon rescheduling, sched_class::put_prev_task() will place
10147 		 * 'current' within the tree based on its new key value.
10148 		 */
10149 		swap(curr->vruntime, se->vruntime);
10150 		resched_curr(rq);
10151 	}
10152 
10153 	se->vruntime -= cfs_rq->min_vruntime;
10154 	raw_spin_unlock(&rq->lock);
10155 }
10156 
10157 /*
10158  * Priority of the task has changed. Check to see if we preempt
10159  * the current task.
10160  */
10161 static void
prio_changed_fair(struct rq * rq,struct task_struct * p,int oldprio)10162 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
10163 {
10164 	if (!task_on_rq_queued(p))
10165 		return;
10166 
10167 	/*
10168 	 * Reschedule if we are currently running on this runqueue and
10169 	 * our priority decreased, or if we are not currently running on
10170 	 * this runqueue and our priority is higher than the current's
10171 	 */
10172 	if (rq->curr == p) {
10173 		if (p->prio > oldprio)
10174 			resched_curr(rq);
10175 	} else
10176 		check_preempt_curr(rq, p, 0);
10177 }
10178 
vruntime_normalized(struct task_struct * p)10179 static inline bool vruntime_normalized(struct task_struct *p)
10180 {
10181 	struct sched_entity *se = &p->se;
10182 
10183 	/*
10184 	 * In both the TASK_ON_RQ_QUEUED and TASK_ON_RQ_MIGRATING cases,
10185 	 * the dequeue_entity(.flags=0) will already have normalized the
10186 	 * vruntime.
10187 	 */
10188 	if (p->on_rq)
10189 		return true;
10190 
10191 	/*
10192 	 * When !on_rq, vruntime of the task has usually NOT been normalized.
10193 	 * But there are some cases where it has already been normalized:
10194 	 *
10195 	 * - A forked child which is waiting for being woken up by
10196 	 *   wake_up_new_task().
10197 	 * - A task which has been woken up by try_to_wake_up() and
10198 	 *   waiting for actually being woken up by sched_ttwu_pending().
10199 	 */
10200 	if (!se->sum_exec_runtime || p->state == TASK_WAKING)
10201 		return true;
10202 
10203 	return false;
10204 }
10205 
10206 #ifdef CONFIG_FAIR_GROUP_SCHED
10207 /*
10208  * Propagate the changes of the sched_entity across the tg tree to make it
10209  * visible to the root
10210  */
propagate_entity_cfs_rq(struct sched_entity * se)10211 static void propagate_entity_cfs_rq(struct sched_entity *se)
10212 {
10213 	struct cfs_rq *cfs_rq;
10214 
10215 	/* Start to propagate at parent */
10216 	se = se->parent;
10217 
10218 	for_each_sched_entity(se) {
10219 		cfs_rq = cfs_rq_of(se);
10220 
10221 		if (cfs_rq_throttled(cfs_rq))
10222 			break;
10223 
10224 		update_load_avg(se, UPDATE_TG);
10225 	}
10226 }
10227 #else
propagate_entity_cfs_rq(struct sched_entity * se)10228 static void propagate_entity_cfs_rq(struct sched_entity *se) { }
10229 #endif
10230 
detach_entity_cfs_rq(struct sched_entity * se)10231 static void detach_entity_cfs_rq(struct sched_entity *se)
10232 {
10233 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10234 
10235 	/* Catch up with the cfs_rq and remove our load when we leave */
10236 	update_load_avg(se, 0);
10237 	detach_entity_load_avg(cfs_rq, se);
10238 	update_tg_load_avg(cfs_rq, false);
10239 	propagate_entity_cfs_rq(se);
10240 }
10241 
attach_entity_cfs_rq(struct sched_entity * se)10242 static void attach_entity_cfs_rq(struct sched_entity *se)
10243 {
10244 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10245 
10246 #ifdef CONFIG_FAIR_GROUP_SCHED
10247 	/*
10248 	 * Since the real-depth could have been changed (only FAIR
10249 	 * class maintain depth value), reset depth properly.
10250 	 */
10251 	se->depth = se->parent ? se->parent->depth + 1 : 0;
10252 #endif
10253 
10254 	/* Synchronize entity with its cfs_rq */
10255 	update_load_avg(se, sched_feat(ATTACH_AGE_LOAD) ? 0 : SKIP_AGE_LOAD);
10256 	attach_entity_load_avg(cfs_rq, se);
10257 	update_tg_load_avg(cfs_rq, false);
10258 	propagate_entity_cfs_rq(se);
10259 }
10260 
detach_task_cfs_rq(struct task_struct * p)10261 static void detach_task_cfs_rq(struct task_struct *p)
10262 {
10263 	struct sched_entity *se = &p->se;
10264 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10265 
10266 	if (!vruntime_normalized(p)) {
10267 		/*
10268 		 * Fix up our vruntime so that the current sleep doesn't
10269 		 * cause 'unlimited' sleep bonus.
10270 		 */
10271 		place_entity(cfs_rq, se, 0);
10272 		se->vruntime -= cfs_rq->min_vruntime;
10273 	}
10274 
10275 	detach_entity_cfs_rq(se);
10276 }
10277 
attach_task_cfs_rq(struct task_struct * p)10278 static void attach_task_cfs_rq(struct task_struct *p)
10279 {
10280 	struct sched_entity *se = &p->se;
10281 	struct cfs_rq *cfs_rq = cfs_rq_of(se);
10282 
10283 	attach_entity_cfs_rq(se);
10284 
10285 	if (!vruntime_normalized(p))
10286 		se->vruntime += cfs_rq->min_vruntime;
10287 }
10288 
switched_from_fair(struct rq * rq,struct task_struct * p)10289 static void switched_from_fair(struct rq *rq, struct task_struct *p)
10290 {
10291 	detach_task_cfs_rq(p);
10292 }
10293 
switched_to_fair(struct rq * rq,struct task_struct * p)10294 static void switched_to_fair(struct rq *rq, struct task_struct *p)
10295 {
10296 	attach_task_cfs_rq(p);
10297 
10298 	if (task_on_rq_queued(p)) {
10299 		/*
10300 		 * We were most likely switched from sched_rt, so
10301 		 * kick off the schedule if running, otherwise just see
10302 		 * if we can still preempt the current task.
10303 		 */
10304 		if (rq->curr == p)
10305 			resched_curr(rq);
10306 		else
10307 			check_preempt_curr(rq, p, 0);
10308 	}
10309 }
10310 
10311 /* Account for a task changing its policy or group.
10312  *
10313  * This routine is mostly called to set cfs_rq->curr field when a task
10314  * migrates between groups/classes.
10315  */
set_curr_task_fair(struct rq * rq)10316 static void set_curr_task_fair(struct rq *rq)
10317 {
10318 	struct sched_entity *se = &rq->curr->se;
10319 
10320 	for_each_sched_entity(se) {
10321 		struct cfs_rq *cfs_rq = cfs_rq_of(se);
10322 
10323 		set_next_entity(cfs_rq, se);
10324 		/* ensure bandwidth has been allocated on our new cfs_rq */
10325 		account_cfs_rq_runtime(cfs_rq, 0);
10326 	}
10327 }
10328 
init_cfs_rq(struct cfs_rq * cfs_rq)10329 void init_cfs_rq(struct cfs_rq *cfs_rq)
10330 {
10331 	cfs_rq->tasks_timeline = RB_ROOT;
10332 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
10333 #ifndef CONFIG_64BIT
10334 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
10335 #endif
10336 #ifdef CONFIG_SMP
10337 #ifdef CONFIG_FAIR_GROUP_SCHED
10338 	cfs_rq->propagate_avg = 0;
10339 #endif
10340 	atomic_long_set(&cfs_rq->removed_load_avg, 0);
10341 	atomic_long_set(&cfs_rq->removed_util_avg, 0);
10342 #endif
10343 }
10344 
10345 #ifdef CONFIG_FAIR_GROUP_SCHED
task_set_group_fair(struct task_struct * p)10346 static void task_set_group_fair(struct task_struct *p)
10347 {
10348 	struct sched_entity *se = &p->se;
10349 
10350 	set_task_rq(p, task_cpu(p));
10351 	se->depth = se->parent ? se->parent->depth + 1 : 0;
10352 }
10353 
task_move_group_fair(struct task_struct * p)10354 static void task_move_group_fair(struct task_struct *p)
10355 {
10356 	detach_task_cfs_rq(p);
10357 	set_task_rq(p, task_cpu(p));
10358 
10359 #ifdef CONFIG_SMP
10360 	/* Tell se's cfs_rq has been changed -- migrated */
10361 	p->se.avg.last_update_time = 0;
10362 #endif
10363 	attach_task_cfs_rq(p);
10364 }
10365 
task_change_group_fair(struct task_struct * p,int type)10366 static void task_change_group_fair(struct task_struct *p, int type)
10367 {
10368 	switch (type) {
10369 	case TASK_SET_GROUP:
10370 		task_set_group_fair(p);
10371 		break;
10372 
10373 	case TASK_MOVE_GROUP:
10374 		task_move_group_fair(p);
10375 		break;
10376 	}
10377 }
10378 
free_fair_sched_group(struct task_group * tg)10379 void free_fair_sched_group(struct task_group *tg)
10380 {
10381 	int i;
10382 
10383 	destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
10384 
10385 	for_each_possible_cpu(i) {
10386 		if (tg->cfs_rq)
10387 			kfree(tg->cfs_rq[i]);
10388 		if (tg->se)
10389 			kfree(tg->se[i]);
10390 	}
10391 
10392 	kfree(tg->cfs_rq);
10393 	kfree(tg->se);
10394 }
10395 
alloc_fair_sched_group(struct task_group * tg,struct task_group * parent)10396 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10397 {
10398 	struct sched_entity *se;
10399 	struct cfs_rq *cfs_rq;
10400 	struct rq *rq;
10401 	int i;
10402 
10403 	tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
10404 	if (!tg->cfs_rq)
10405 		goto err;
10406 	tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
10407 	if (!tg->se)
10408 		goto err;
10409 
10410 	tg->shares = NICE_0_LOAD;
10411 
10412 	init_cfs_bandwidth(tg_cfs_bandwidth(tg));
10413 
10414 	for_each_possible_cpu(i) {
10415 		rq = cpu_rq(i);
10416 
10417 		cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
10418 				      GFP_KERNEL, cpu_to_node(i));
10419 		if (!cfs_rq)
10420 			goto err;
10421 
10422 		se = kzalloc_node(sizeof(struct sched_entity),
10423 				  GFP_KERNEL, cpu_to_node(i));
10424 		if (!se)
10425 			goto err_free_rq;
10426 
10427 		init_cfs_rq(cfs_rq);
10428 		init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
10429 		init_entity_runnable_average(se);
10430 
10431 		raw_spin_lock_irq(&rq->lock);
10432 		post_init_entity_util_avg(se);
10433 		raw_spin_unlock_irq(&rq->lock);
10434 	}
10435 
10436 	return 1;
10437 
10438 err_free_rq:
10439 	kfree(cfs_rq);
10440 err:
10441 	return 0;
10442 }
10443 
unregister_fair_sched_group(struct task_group * tg)10444 void unregister_fair_sched_group(struct task_group *tg)
10445 {
10446 	unsigned long flags;
10447 	struct rq *rq;
10448 	int cpu;
10449 
10450 	for_each_possible_cpu(cpu) {
10451 		if (tg->se[cpu])
10452 			remove_entity_load_avg(tg->se[cpu]);
10453 
10454 		/*
10455 		 * Only empty task groups can be destroyed; so we can speculatively
10456 		 * check on_list without danger of it being re-added.
10457 		 */
10458 		if (!tg->cfs_rq[cpu]->on_list)
10459 			continue;
10460 
10461 		rq = cpu_rq(cpu);
10462 
10463 		raw_spin_lock_irqsave(&rq->lock, flags);
10464 		list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
10465 		raw_spin_unlock_irqrestore(&rq->lock, flags);
10466 	}
10467 }
10468 
init_tg_cfs_entry(struct task_group * tg,struct cfs_rq * cfs_rq,struct sched_entity * se,int cpu,struct sched_entity * parent)10469 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
10470 			struct sched_entity *se, int cpu,
10471 			struct sched_entity *parent)
10472 {
10473 	struct rq *rq = cpu_rq(cpu);
10474 
10475 	cfs_rq->tg = tg;
10476 	cfs_rq->rq = rq;
10477 	init_cfs_rq_runtime(cfs_rq);
10478 
10479 	tg->cfs_rq[cpu] = cfs_rq;
10480 	tg->se[cpu] = se;
10481 
10482 	/* se could be NULL for root_task_group */
10483 	if (!se)
10484 		return;
10485 
10486 	if (!parent) {
10487 		se->cfs_rq = &rq->cfs;
10488 		se->depth = 0;
10489 	} else {
10490 		se->cfs_rq = parent->my_q;
10491 		se->depth = parent->depth + 1;
10492 	}
10493 
10494 	se->my_q = cfs_rq;
10495 	/* guarantee group entities always have weight */
10496 	update_load_set(&se->load, NICE_0_LOAD);
10497 	se->parent = parent;
10498 }
10499 
10500 static DEFINE_MUTEX(shares_mutex);
10501 
sched_group_set_shares(struct task_group * tg,unsigned long shares)10502 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
10503 {
10504 	int i;
10505 	unsigned long flags;
10506 
10507 	/*
10508 	 * We can't change the weight of the root cgroup.
10509 	 */
10510 	if (!tg->se[0])
10511 		return -EINVAL;
10512 
10513 	shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
10514 
10515 	mutex_lock(&shares_mutex);
10516 	if (tg->shares == shares)
10517 		goto done;
10518 
10519 	tg->shares = shares;
10520 	for_each_possible_cpu(i) {
10521 		struct rq *rq = cpu_rq(i);
10522 		struct sched_entity *se;
10523 
10524 		se = tg->se[i];
10525 		/* Propagate contribution to hierarchy */
10526 		raw_spin_lock_irqsave(&rq->lock, flags);
10527 
10528 		/* Possible calls to update_curr() need rq clock */
10529 		update_rq_clock(rq);
10530 		for_each_sched_entity(se) {
10531 			update_load_avg(se, UPDATE_TG);
10532 			update_cfs_shares(se);
10533 		}
10534 		raw_spin_unlock_irqrestore(&rq->lock, flags);
10535 	}
10536 
10537 done:
10538 	mutex_unlock(&shares_mutex);
10539 	return 0;
10540 }
10541 #else /* CONFIG_FAIR_GROUP_SCHED */
10542 
free_fair_sched_group(struct task_group * tg)10543 void free_fair_sched_group(struct task_group *tg) { }
10544 
alloc_fair_sched_group(struct task_group * tg,struct task_group * parent)10545 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
10546 {
10547 	return 1;
10548 }
10549 
unregister_fair_sched_group(struct task_group * tg)10550 void unregister_fair_sched_group(struct task_group *tg) { }
10551 
10552 #endif /* CONFIG_FAIR_GROUP_SCHED */
10553 
10554 
get_rr_interval_fair(struct rq * rq,struct task_struct * task)10555 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
10556 {
10557 	struct sched_entity *se = &task->se;
10558 	unsigned int rr_interval = 0;
10559 
10560 	/*
10561 	 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
10562 	 * idle runqueue:
10563 	 */
10564 	if (rq->cfs.load.weight)
10565 		rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
10566 
10567 	return rr_interval;
10568 }
10569 
10570 /*
10571  * All the scheduling class methods:
10572  */
10573 const struct sched_class fair_sched_class = {
10574 	.next			= &idle_sched_class,
10575 	.enqueue_task		= enqueue_task_fair,
10576 	.dequeue_task		= dequeue_task_fair,
10577 	.yield_task		= yield_task_fair,
10578 	.yield_to_task		= yield_to_task_fair,
10579 
10580 	.check_preempt_curr	= check_preempt_wakeup,
10581 
10582 	.pick_next_task		= pick_next_task_fair,
10583 	.put_prev_task		= put_prev_task_fair,
10584 
10585 #ifdef CONFIG_SMP
10586 	.select_task_rq		= select_task_rq_fair,
10587 	.migrate_task_rq	= migrate_task_rq_fair,
10588 
10589 	.rq_online		= rq_online_fair,
10590 	.rq_offline		= rq_offline_fair,
10591 
10592 	.task_waking		= task_waking_fair,
10593 	.task_dead		= task_dead_fair,
10594 	.set_cpus_allowed	= set_cpus_allowed_common,
10595 #endif
10596 
10597 	.set_curr_task          = set_curr_task_fair,
10598 	.task_tick		= task_tick_fair,
10599 	.task_fork		= task_fork_fair,
10600 
10601 	.prio_changed		= prio_changed_fair,
10602 	.switched_from		= switched_from_fair,
10603 	.switched_to		= switched_to_fair,
10604 
10605 	.get_rr_interval	= get_rr_interval_fair,
10606 
10607 	.update_curr		= update_curr_fair,
10608 
10609 #ifdef CONFIG_FAIR_GROUP_SCHED
10610 	.task_change_group	= task_change_group_fair,
10611 #endif
10612 };
10613 
10614 #ifdef CONFIG_SCHED_DEBUG
print_cfs_stats(struct seq_file * m,int cpu)10615 void print_cfs_stats(struct seq_file *m, int cpu)
10616 {
10617 	struct cfs_rq *cfs_rq;
10618 
10619 	rcu_read_lock();
10620 	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
10621 		print_cfs_rq(m, cpu, cfs_rq);
10622 	rcu_read_unlock();
10623 }
10624 
10625 #ifdef CONFIG_NUMA_BALANCING
show_numa_stats(struct task_struct * p,struct seq_file * m)10626 void show_numa_stats(struct task_struct *p, struct seq_file *m)
10627 {
10628 	int node;
10629 	unsigned long tsf = 0, tpf = 0, gsf = 0, gpf = 0;
10630 
10631 	for_each_online_node(node) {
10632 		if (p->numa_faults) {
10633 			tsf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 0)];
10634 			tpf = p->numa_faults[task_faults_idx(NUMA_MEM, node, 1)];
10635 		}
10636 		if (p->numa_group) {
10637 			gsf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 0)],
10638 			gpf = p->numa_group->faults[task_faults_idx(NUMA_MEM, node, 1)];
10639 		}
10640 		print_numa_stats(m, node, tsf, tpf, gsf, gpf);
10641 	}
10642 }
10643 #endif /* CONFIG_NUMA_BALANCING */
10644 #endif /* CONFIG_SCHED_DEBUG */
10645 
init_sched_fair_class(void)10646 __init void init_sched_fair_class(void)
10647 {
10648 #ifdef CONFIG_SMP
10649 	open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10650 
10651 #ifdef CONFIG_NO_HZ_COMMON
10652 	nohz.next_balance = jiffies;
10653 	zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10654 	cpu_notifier(sched_ilb_notifier, 0);
10655 #endif
10656 #endif /* SMP */
10657 
10658 }
10659