1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Scheduler topology setup/handling methods
4 */
5 #include <linux/sched.h>
6 #include <linux/mutex.h>
7
8 #include "sched.h"
9
10 DEFINE_MUTEX(sched_domains_mutex);
11
12 /* Protected by sched_domains_mutex: */
13 cpumask_var_t sched_domains_tmpmask;
14 cpumask_var_t sched_domains_tmpmask2;
15
16 #ifdef CONFIG_SCHED_DEBUG
17
18 static int __init sched_debug_setup(char *str)
19 {
20 sched_debug_enabled = true;
21
22 return 0;
23 }
24 early_param("sched_debug", sched_debug_setup);
25
26 static inline bool sched_debug(void)
27 {
28 return sched_debug_enabled;
29 }
30
31 static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
32 struct cpumask *groupmask)
33 {
34 struct sched_group *group = sd->groups;
35
36 cpumask_clear(groupmask);
37
38 printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
39
40 if (!(sd->flags & SD_LOAD_BALANCE)) {
41 printk("does not load-balance\n");
42 return -1;
43 }
44
45 printk(KERN_CONT "span=%*pbl level=%s\n",
46 cpumask_pr_args(sched_domain_span(sd)), sd->name);
47
48 if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
49 printk(KERN_ERR "ERROR: domain->span does not contain "
50 "CPU%d\n", cpu);
51 }
52 if (!cpumask_test_cpu(cpu, sched_group_span(group))) {
53 printk(KERN_ERR "ERROR: domain->groups does not contain"
54 " CPU%d\n", cpu);
55 }
56
57 printk(KERN_DEBUG "%*s groups:", level + 1, "");
58 do {
59 if (!group) {
60 printk("\n");
61 printk(KERN_ERR "ERROR: group is NULL\n");
62 break;
63 }
64
65 if (!cpumask_weight(sched_group_span(group))) {
66 printk(KERN_CONT "\n");
67 printk(KERN_ERR "ERROR: empty group\n");
68 break;
69 }
70
71 if (!(sd->flags & SD_OVERLAP) &&
72 cpumask_intersects(groupmask, sched_group_span(group))) {
73 printk(KERN_CONT "\n");
74 printk(KERN_ERR "ERROR: repeated CPUs\n");
75 break;
76 }
77
78 cpumask_or(groupmask, groupmask, sched_group_span(group));
79
80 printk(KERN_CONT " %d:{ span=%*pbl",
81 group->sgc->id,
82 cpumask_pr_args(sched_group_span(group)));
83
84 if ((sd->flags & SD_OVERLAP) &&
85 !cpumask_equal(group_balance_mask(group), sched_group_span(group))) {
86 printk(KERN_CONT " mask=%*pbl",
87 cpumask_pr_args(group_balance_mask(group)));
88 }
89
90 if (group->sgc->capacity != SCHED_CAPACITY_SCALE)
91 printk(KERN_CONT " cap=%lu", group->sgc->capacity);
92
93 if (group == sd->groups && sd->child &&
94 !cpumask_equal(sched_domain_span(sd->child),
95 sched_group_span(group))) {
96 printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n");
97 }
98
99 printk(KERN_CONT " }");
100
101 group = group->next;
102
103 if (group != sd->groups)
104 printk(KERN_CONT ",");
105
106 } while (group != sd->groups);
107 printk(KERN_CONT "\n");
108
109 if (!cpumask_equal(sched_domain_span(sd), groupmask))
110 printk(KERN_ERR "ERROR: groups don't span domain->span\n");
111
112 if (sd->parent &&
113 !cpumask_subset(groupmask, sched_domain_span(sd->parent)))
114 printk(KERN_ERR "ERROR: parent span is not a superset "
115 "of domain->span\n");
116 return 0;
117 }
118
119 static void sched_domain_debug(struct sched_domain *sd, int cpu)
120 {
121 int level = 0;
122
123 if (!sched_debug_enabled)
124 return;
125
126 if (!sd) {
127 printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n", cpu);
128 return;
129 }
130
131 printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n", cpu);
132
133 for (;;) {
134 if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
135 break;
136 level++;
137 sd = sd->parent;
138 if (!sd)
139 break;
140 }
141 }
142 #else /* !CONFIG_SCHED_DEBUG */
143
144 # define sched_debug_enabled 0
145 # define sched_domain_debug(sd, cpu) do { } while (0)
146 static inline bool sched_debug(void)
147 {
148 return false;
149 }
150 #endif /* CONFIG_SCHED_DEBUG */
151
152 static int sd_degenerate(struct sched_domain *sd)
153 {
154 if (cpumask_weight(sched_domain_span(sd)) == 1) {
155 if (sd->groups->sge)
156 sd->flags &= ~SD_LOAD_BALANCE;
157 else
158 return 1;
159 }
160
161 /* Following flags need at least 2 groups */
162 if (sd->flags & (SD_LOAD_BALANCE |
163 SD_BALANCE_NEWIDLE |
164 SD_BALANCE_FORK |
165 SD_BALANCE_EXEC |
166 SD_SHARE_CPUCAPACITY |
167 SD_ASYM_CPUCAPACITY |
168 SD_SHARE_PKG_RESOURCES |
169 SD_SHARE_POWERDOMAIN |
170 SD_SHARE_CAP_STATES)) {
171 if (sd->groups != sd->groups->next)
172 return 0;
173 }
174
175 /* Following flags don't use groups */
176 if (sd->flags & (SD_WAKE_AFFINE))
177 return 0;
178
179 return 1;
180 }
181
182 static int
183 sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
184 {
185 unsigned long cflags = sd->flags, pflags = parent->flags;
186
187 if (sd_degenerate(parent))
188 return 1;
189
190 if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
191 return 0;
192
193 /* Flags needing groups don't count if only 1 group in parent */
194 if (parent->groups == parent->groups->next) {
195 pflags &= ~(SD_LOAD_BALANCE |
196 SD_BALANCE_NEWIDLE |
197 SD_BALANCE_FORK |
198 SD_BALANCE_EXEC |
199 SD_ASYM_CPUCAPACITY |
200 SD_SHARE_CPUCAPACITY |
201 SD_SHARE_PKG_RESOURCES |
202 SD_PREFER_SIBLING |
203 SD_SHARE_POWERDOMAIN |
204 SD_SHARE_CAP_STATES);
205 if (parent->groups->sge) {
206 parent->flags &= ~SD_LOAD_BALANCE;
207 return 0;
208 }
209 if (nr_node_ids == 1)
210 pflags &= ~SD_SERIALIZE;
211 }
212 if (~cflags & pflags)
213 return 0;
214
215 return 1;
216 }
217
218 static void free_rootdomain(struct rcu_head *rcu)
219 {
220 struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
221
222 cpupri_cleanup(&rd->cpupri);
223 cpudl_cleanup(&rd->cpudl);
224 free_cpumask_var(rd->dlo_mask);
225 free_cpumask_var(rd->rto_mask);
226 free_cpumask_var(rd->online);
227 free_cpumask_var(rd->span);
228 kfree(rd);
229 }
230
231 void rq_attach_root(struct rq *rq, struct root_domain *rd)
232 {
233 struct root_domain *old_rd = NULL;
234 unsigned long flags;
235
236 raw_spin_lock_irqsave(&rq->lock, flags);
237
238 if (rq->rd) {
239 old_rd = rq->rd;
240
241 if (cpumask_test_cpu(rq->cpu, old_rd->online))
242 set_rq_offline(rq);
243
244 cpumask_clear_cpu(rq->cpu, old_rd->span);
245
246 /*
247 * If we don't want to free the old_rd yet then
248 * set old_rd to NULL to skip the freeing later
249 * in this function:
250 */
251 if (!atomic_dec_and_test(&old_rd->refcount))
252 old_rd = NULL;
253 }
254
255 atomic_inc(&rd->refcount);
256 rq->rd = rd;
257
258 cpumask_set_cpu(rq->cpu, rd->span);
259 if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
260 set_rq_online(rq);
261
262 raw_spin_unlock_irqrestore(&rq->lock, flags);
263
264 if (old_rd)
265 call_rcu_sched(&old_rd->rcu, free_rootdomain);
266 }
267
268 void sched_get_rd(struct root_domain *rd)
269 {
270 atomic_inc(&rd->refcount);
271 }
272
273 void sched_put_rd(struct root_domain *rd)
274 {
275 if (!atomic_dec_and_test(&rd->refcount))
276 return;
277
278 call_rcu_sched(&rd->rcu, free_rootdomain);
279 }
280
281 static int init_rootdomain(struct root_domain *rd)
282 {
283 if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
284 goto out;
285 if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
286 goto free_span;
287 if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
288 goto free_online;
289 if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
290 goto free_dlo_mask;
291
292 #ifdef HAVE_RT_PUSH_IPI
293 rd->rto_cpu = -1;
294 raw_spin_lock_init(&rd->rto_lock);
295 init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
296 #endif
297
298 init_dl_bw(&rd->dl_bw);
299 if (cpudl_init(&rd->cpudl) != 0)
300 goto free_rto_mask;
301
302 if (cpupri_init(&rd->cpupri) != 0)
303 goto free_cpudl;
304
305 rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
306
307 init_max_cpu_capacity(&rd->max_cpu_capacity);
308
309 return 0;
310
311 free_cpudl:
312 cpudl_cleanup(&rd->cpudl);
313 free_rto_mask:
314 free_cpumask_var(rd->rto_mask);
315 free_dlo_mask:
316 free_cpumask_var(rd->dlo_mask);
317 free_online:
318 free_cpumask_var(rd->online);
319 free_span:
320 free_cpumask_var(rd->span);
321 out:
322 return -ENOMEM;
323 }
324
325 /*
326 * By default the system creates a single root-domain with all CPUs as
327 * members (mimicking the global state we have today).
328 */
329 struct root_domain def_root_domain;
330
331 void init_defrootdomain(void)
332 {
333 init_rootdomain(&def_root_domain);
334
335 atomic_set(&def_root_domain.refcount, 1);
336 }
337
338 static struct root_domain *alloc_rootdomain(void)
339 {
340 struct root_domain *rd;
341
342 rd = kzalloc(sizeof(*rd), GFP_KERNEL);
343 if (!rd)
344 return NULL;
345
346 if (init_rootdomain(rd) != 0) {
347 kfree(rd);
348 return NULL;
349 }
350
351 return rd;
352 }
353
354 static void free_sched_groups(struct sched_group *sg, int free_sgc)
355 {
356 struct sched_group *tmp, *first;
357
358 if (!sg)
359 return;
360
361 first = sg;
362 do {
363 tmp = sg->next;
364
365 if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
366 kfree(sg->sgc);
367
368 if (atomic_dec_and_test(&sg->ref))
369 kfree(sg);
370 sg = tmp;
371 } while (sg != first);
372 }
373
374 static void destroy_sched_domain(struct sched_domain *sd)
375 {
376 /*
377 * A normal sched domain may have multiple group references; an
378 * overlapping domain, having private groups, has only one. Iterate,
379 * dropping group/capacity references and freeing where none remain.
380 */
381 free_sched_groups(sd->groups, 1);
382
383 if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
384 kfree(sd->shared);
385 kfree(sd);
386 }
387
388 static void destroy_sched_domains_rcu(struct rcu_head *rcu)
389 {
390 struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
391
392 while (sd) {
393 struct sched_domain *parent = sd->parent;
394 destroy_sched_domain(sd);
395 sd = parent;
396 }
397 }
398
399 static void destroy_sched_domains(struct sched_domain *sd)
400 {
401 if (sd)
402 call_rcu(&sd->rcu, destroy_sched_domains_rcu);
403 }
404
405 /*
406 * Keep a special pointer to the highest sched_domain that has
407 * SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
408 * allows us to avoid some pointer chasing in select_idle_sibling().
409 *
410 * Also keep a unique ID per domain (we use the first CPU number in
411 * the cpumask of the domain); this allows us to quickly tell if
412 * two CPUs are in the same cache domain, see cpus_share_cache().
413 */
414 DEFINE_PER_CPU(struct sched_domain *, sd_llc);
415 DEFINE_PER_CPU(int, sd_llc_size);
416 DEFINE_PER_CPU(int, sd_llc_id);
417 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
418 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
419 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
420 DEFINE_PER_CPU(struct sched_domain *, sd_ea);
421 DEFINE_PER_CPU(struct sched_domain *, sd_scs);
422 DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
423
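/*
 * Illustrative sketch, not part of this file: with the per-CPU LLC id
 * cached above, a cache-sharing test reduces to an integer compare instead
 * of a domain-tree walk. The helper below is an assumption for the purpose
 * of illustration (the real one lives elsewhere in the scheduler):
 *
 *	bool cpus_share_cache(int this_cpu, int that_cpu)
 *	{
 *		return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
 *	}
 */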
424 static void update_top_cache_domain(int cpu)
425 {
426 struct sched_domain_shared *sds = NULL;
427 struct sched_domain *sd;
428 struct sched_domain *ea_sd = NULL;
429 int id = cpu;
430 int size = 1;
431
432 sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
433 if (sd) {
434 id = cpumask_first(sched_domain_span(sd));
435 size = cpumask_weight(sched_domain_span(sd));
436 sds = sd->shared;
437 }
438
439 rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
440 per_cpu(sd_llc_size, cpu) = size;
441 per_cpu(sd_llc_id, cpu) = id;
442 rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds);
443
444 sd = lowest_flag_domain(cpu, SD_NUMA);
445 rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
446
447 sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
448 rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
449
450 for_each_domain(cpu, sd) {
451 if (sd->groups->sge)
452 ea_sd = sd;
453 else
454 break;
455 }
456 rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
457
458 sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
459 rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
460 }
461
462 static void update_asym_cpucapacity(int cpu)
463 {
464 int enable = false;
465
466 rcu_read_lock();
467 if (lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY))
468 enable = true;
469 rcu_read_unlock();
470
471 if (enable) {
472 /* This expects to be hotplug-safe */
473 static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
474 }
475 }
476
477 /*
478 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
479 * hold the hotplug lock.
480 */
481 static void
482 cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
483 {
484 struct rq *rq = cpu_rq(cpu);
485 struct sched_domain *tmp;
486
487 /* Remove the sched domains which do not contribute to scheduling. */
488 for (tmp = sd; tmp; ) {
489 struct sched_domain *parent = tmp->parent;
490 if (!parent)
491 break;
492
493 if (sd_parent_degenerate(tmp, parent)) {
494 tmp->parent = parent->parent;
495 if (parent->parent)
496 parent->parent->child = tmp;
497 /*
498 * Transfer SD_PREFER_SIBLING down in case of a
499 * degenerate parent; the spans match for this
500 * so the property transfers.
501 */
502 if (parent->flags & SD_PREFER_SIBLING)
503 tmp->flags |= SD_PREFER_SIBLING;
504 destroy_sched_domain(parent);
505 } else
506 tmp = tmp->parent;
507 }
508
509 if (sd && sd_degenerate(sd)) {
510 tmp = sd;
511 sd = sd->parent;
512 destroy_sched_domain(tmp);
513 if (sd)
514 sd->child = NULL;
515 }
516
517 sched_domain_debug(sd, cpu);
518
519 rq_attach_root(rq, rd);
520 tmp = rq->sd;
521 rcu_assign_pointer(rq->sd, sd);
522 dirty_sched_domain_sysctl(cpu);
523 destroy_sched_domains(tmp);
524
525 update_top_cache_domain(cpu);
526 }
527
528 /* Setup the mask of CPUs configured for isolated domains */
529 static int __init isolated_cpu_setup(char *str)
530 {
531 int ret;
532
533 alloc_bootmem_cpumask_var(&cpu_isolated_map);
534 ret = cpulist_parse(str, cpu_isolated_map);
535 if (ret) {
536 pr_err("sched: Error, all isolcpus= values must be between 0 and %u\n", nr_cpu_ids);
537 return 0;
538 }
539 return 1;
540 }
541 __setup("isolcpus=", isolated_cpu_setup);
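/*
 * Example (kernel command line, values are made up): booting with
 * "isolcpus=1-3,7" parses those CPUs into cpu_isolated_map here;
 * sched_init_domains() below then builds domains only over the remaining
 * CPUs, so the isolated ones are never load-balanced.
 */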
542
543 struct s_data {
544 struct sched_domain * __percpu *sd;
545 struct root_domain *rd;
546 };
547
548 enum s_alloc {
549 sa_rootdomain,
550 sa_sd,
551 sa_sd_storage,
552 sa_none,
553 };
554
555 /*
556 * Return the canonical balance CPU for this group; this is the first CPU
557 * of this group that's also in the balance mask.
558 *
559 * The balance mask contains all those CPUs that could actually end up at this
560 * group. See build_balance_mask().
561 *
562 * Also see should_we_balance().
563 */
564 int group_balance_cpu(struct sched_group *sg)
565 {
566 return cpumask_first(group_balance_mask(sg));
567 }
568
569
570 /*
571 * NUMA topology (first read the regular topology blurb below)
572 *
573 * Given a node-distance table, for example:
574 *
575 * node 0 1 2 3
576 * 0: 10 20 30 20
577 * 1: 20 10 20 30
578 * 2: 30 20 10 20
579 * 3: 20 30 20 10
580 *
581 * which represents a 4 node ring topology like:
582 *
583 * 0 ----- 1
584 * | |
585 * | |
586 * | |
587 * 3 ----- 2
588 *
589 * We want to construct domains and groups to represent this. The way we go
590 * about doing this is to build the domains on 'hops'. For each NUMA level we
591 * construct the mask of all nodes reachable in @level hops.
592 *
593 * For the above NUMA topology that gives 3 levels:
594 *
595 * NUMA-2 0-3 0-3 0-3 0-3
596 * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2}
597 *
598 * NUMA-1 0-1,3 0-2 1-3 0,2-3
599 * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3}
600 *
601 * NUMA-0 0 1 2 3
602 *
603 *
604 * As can be seen; things don't nicely line up as with the regular topology.
605 * When we iterate a domain in child domain chunks some nodes can be
606 * represented multiple times -- hence the "overlap" naming for this part of
607 * the topology.
608 *
609 * In order to minimize this overlap, we only build enough groups to cover the
610 * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3.
611 *
612 * Because:
613 *
614 * - the first group of each domain is its child domain; this
615 * gets us the first 0-1,3
616 * - the only uncovered node is 2, whose child domain is 1-3.
617 *
618 * However, because of the overlap, computing a unique CPU for each group is
619 * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both
620 * groups include the CPUs of Node-0, while those CPUs would not in fact ever
621 * end up at those groups (they would end up in group: 0-1,3).
622 *
623 * To correct this we have to introduce the group balance mask. This mask
624 * will contain those CPUs in the group that can reach this group given the
625 * (child) domain tree.
626 *
627 * With this we can once again compute balance_cpu and sched_group_capacity
628 * relations.
629 *
630 * XXX include words on how balance_cpu is unique and therefore can be
631 * used for sched_group_capacity links.
632 *
633 *
634 * Another 'interesting' topology is:
635 *
636 * node 0 1 2 3
637 * 0: 10 20 20 30
638 * 1: 20 10 20 20
639 * 2: 20 20 10 20
640 * 3: 30 20 20 10
641 *
642 * Which looks a little like:
643 *
644 * 0 ----- 1
645 * | / |
646 * | / |
647 * | / |
648 * 2 ----- 3
649 *
650 * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3
651 * are not.
652 *
653 * This leads to a few particularly weird cases where the number of
654 * sched_domains is not the same for each CPU. Consider:
655 *
656 * NUMA-2 0-3 0-3
657 * groups: {0-2},{1-3} {1-3},{0-2}
658 *
659 * NUMA-1 0-2 0-3 0-3 1-3
660 *
661 * NUMA-0 0 1 2 3
662 *
663 */
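/*
 * A minimal standalone sketch of the "nodes reachable in @level hops" idea
 * described above, using the 4-node ring distance table as input. The
 * names (ring_distance, nodes_within) are hypothetical and purely
 * illustrative; the in-kernel construction is sched_init_numa(), which
 * builds cpumasks rather than node bitmaps.
 *
 *	static const int ring_distance[4][4] = {
 *		{ 10, 20, 30, 20 },
 *		{ 20, 10, 20, 30 },
 *		{ 30, 20, 10, 20 },
 *		{ 20, 30, 20, 10 },
 *	};
 *
 *	// Bitmap of nodes no further than @max_dist from @node.
 *	static unsigned int nodes_within(int node, int max_dist)
 *	{
 *		unsigned int mask = 0;
 *		int i;
 *
 *		for (i = 0; i < 4; i++)
 *			if (ring_distance[node][i] <= max_dist)
 *				mask |= 1U << i;
 *		return mask;
 *	}
 *
 * nodes_within(0, 10) == 0x1 (node 0 only), nodes_within(0, 20) == 0xb
 * (nodes 0-1,3) and nodes_within(0, 30) == 0xf (nodes 0-3), matching the
 * NUMA-0/1/2 spans listed above for node 0.
 */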
664
665
666 /*
667 * Build the balance mask; it contains only those CPUs that can arrive at this
668 * group and should be considered to continue balancing.
669 *
670 * We do this during the group creation pass, therefore the group information
671 * isn't complete yet, however since each group represents a (child) domain we
672 * can fully construct this using the sched_domain bits (which are already
673 * complete).
674 */
675 static void
676 build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask)
677 {
678 const struct cpumask *sg_span = sched_group_span(sg);
679 struct sd_data *sdd = sd->private;
680 struct sched_domain *sibling;
681 int i;
682
683 cpumask_clear(mask);
684
685 for_each_cpu(i, sg_span) {
686 sibling = *per_cpu_ptr(sdd->sd, i);
687
688 /*
689 * Can happen in the asymmetric case, where these siblings are
690 * unused. The mask will not be empty because those CPUs that
691 * do have the top domain _should_ span the domain.
692 */
693 if (!sibling->child)
694 continue;
695
696 /* If we would not end up here, we can't continue from here */
697 if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
698 continue;
699
700 cpumask_set_cpu(i, mask);
701 }
702
703 /* We must not have empty masks here */
704 WARN_ON_ONCE(cpumask_empty(mask));
705 }
706
707 /*
708 * XXX: This creates per-node group entries; since the load-balancer will
709 * immediately access remote memory to construct this group's load-balance
710 * statistics, having the groups node-local is of dubious benefit.
711 */
712 static struct sched_group *
713 build_group_from_child_sched_domain(struct sched_domain *sd, int cpu)
714 {
715 struct sched_group *sg;
716 struct cpumask *sg_span;
717
718 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
719 GFP_KERNEL, cpu_to_node(cpu));
720
721 if (!sg)
722 return NULL;
723
724 sg_span = sched_group_span(sg);
725 if (sd->child)
726 cpumask_copy(sg_span, sched_domain_span(sd->child));
727 else
728 cpumask_copy(sg_span, sched_domain_span(sd));
729
730 atomic_inc(&sg->ref);
731 return sg;
732 }
733
734 static void init_overlap_sched_group(struct sched_domain *sd,
735 struct sched_group *sg)
736 {
737 struct cpumask *mask = sched_domains_tmpmask2;
738 struct sd_data *sdd = sd->private;
739 struct cpumask *sg_span;
740 int cpu;
741
742 build_balance_mask(sd, sg, mask);
743 cpu = cpumask_first_and(sched_group_span(sg), mask);
744
745 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
746 if (atomic_inc_return(&sg->sgc->ref) == 1)
747 cpumask_copy(group_balance_mask(sg), mask);
748 else
749 WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask));
750
751 /*
752 * Initialize sgc->capacity such that even if we mess up the
753 * domains and no possible iteration will get us here, we won't
754 * die on a /0 trap.
755 */
756 sg_span = sched_group_span(sg);
757 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
758 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
759 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
760 }
761
762 static int
763 build_overlap_sched_groups(struct sched_domain *sd, int cpu)
764 {
765 struct sched_group *first = NULL, *last = NULL, *sg;
766 const struct cpumask *span = sched_domain_span(sd);
767 struct cpumask *covered = sched_domains_tmpmask;
768 struct sd_data *sdd = sd->private;
769 struct sched_domain *sibling;
770 int i;
771
772 cpumask_clear(covered);
773
774 for_each_cpu_wrap(i, span, cpu) {
775 struct cpumask *sg_span;
776
777 if (cpumask_test_cpu(i, covered))
778 continue;
779
780 sibling = *per_cpu_ptr(sdd->sd, i);
781
782 /*
783 * Asymmetric node setups can result in situations where the
784 * domain tree is of unequal depth, make sure to skip domains
785 * that already cover the entire range.
786 *
787 * In that case build_sched_domains() will have terminated the
788 * iteration early and our sibling sd spans will be empty.
789 * Domains should always include the CPU they're built on, so
790 * check that.
791 */
792 if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
793 continue;
794
795 sg = build_group_from_child_sched_domain(sibling, cpu);
796 if (!sg)
797 goto fail;
798
799 sg_span = sched_group_span(sg);
800 cpumask_or(covered, covered, sg_span);
801
802 init_overlap_sched_group(sd, sg);
803
804 if (!first)
805 first = sg;
806 if (last)
807 last->next = sg;
808 last = sg;
809 last->next = first;
810 }
811 sd->groups = first;
812
813 return 0;
814
815 fail:
816 free_sched_groups(first, 0);
817
818 return -ENOMEM;
819 }
820
821
822 /*
823 * Package topology (also see the load-balance blurb in fair.c)
824 *
825 * The scheduler builds a tree structure to represent a number of important
826 * topology features. By default (default_topology[]) these include:
827 *
828 * - Simultaneous multithreading (SMT)
829 * - Multi-Core Cache (MC)
830 * - Package (DIE)
831 *
832 * Where the last one more or less denotes everything up to a NUMA node.
833 *
834 * The tree consists of 3 primary data structures:
835 *
836 * sched_domain -> sched_group -> sched_group_capacity
837 * ^ ^ ^ ^
838 * `-' `-'
839 *
840 * The sched_domains are per-cpu and have a two way link (parent & child) and
841 * denote the ever growing mask of CPUs belonging to that level of topology.
842 *
843 * Each sched_domain has a circular (double) linked list of sched_group's, each
844 * denoting the domains of the level below (or individual CPUs in case of the
845 * first domain level). The sched_group linked by a sched_domain includes the
846 * CPU of that sched_domain [*].
847 *
848 * Take for instance a 2 threaded, 2 core, 2 cache cluster part:
849 *
850 * CPU 0 1 2 3 4 5 6 7
851 *
852 * DIE [ ]
853 * MC [ ] [ ]
854 * SMT [ ] [ ] [ ] [ ]
855 *
856 * - or -
857 *
858 * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7
859 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7
860 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7
861 *
862 * CPU 0 1 2 3 4 5 6 7
863 *
864 * One way to think about it is: sched_domain moves you up and down among these
865 * topology levels, while sched_group moves you sideways through it, at child
866 * domain granularity.
867 *
868 * sched_group_capacity ensures each unique sched_group has shared storage.
869 *
870 * There are two related construction problems, both of which require a CPU
871 * that uniquely identifies each group (for a given domain):
872 *
873 * - The first is the balance_cpu (see should_we_balance() and the
874 * load-balance blurb in fair.c); for each group we only want 1 CPU to
875 * continue balancing at a higher domain.
876 *
877 * - The second is the sched_group_capacity; we want all identical groups
878 * to share a single sched_group_capacity.
879 *
880 * These topologies are exclusive by construction: it is impossible for an
881 * SMT thread to belong to multiple cores, or for a core to be part of
882 * multiple caches. There is a very clear and unique location for each CPU
883 * in the hierarchy.
884 *
885 * Therefore computing a unique CPU for each group is trivial (the iteration
886 * mask is redundant and set to all 1s; all CPUs in a group will end up at
887 * _that_ group): we can simply pick the first CPU in each group.
888 *
889 *
890 * [*] in other words, the first group of each domain is its child domain.
891 */
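/*
 * Worked example for the 8-CPU part drawn above (an assumption for
 * illustration): in CPU5's MC domain the groups are the child SMT spans
 * {4,5} and {6,7}; get_group() below picks cpumask_first() of each child
 * span, so CPU4 and CPU6 end up as the unique balance/capacity CPUs for
 * those groups on every CPU in 4-7.
 */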
892
893 static struct sched_group *get_group(int cpu, struct sd_data *sdd)
894 {
895 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
896 struct sched_domain *child = sd->child;
897 struct sched_group *sg;
898
899 if (child)
900 cpu = cpumask_first(sched_domain_span(child));
901
902 sg = *per_cpu_ptr(sdd->sg, cpu);
903 sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
904
905 /* For claim_allocations: */
906 atomic_inc(&sg->ref);
907 atomic_inc(&sg->sgc->ref);
908
909 if (child) {
910 cpumask_copy(sched_group_span(sg), sched_domain_span(child));
911 cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
912 } else {
913 cpumask_set_cpu(cpu, sched_group_span(sg));
914 cpumask_set_cpu(cpu, group_balance_mask(sg));
915 }
916
917 sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
918 sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
919 sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
920
921 return sg;
922 }
923
924 /*
925 * build_sched_groups will build a circular linked list of the groups
926 * covered by the given span, and will set each group's ->cpumask correctly,
927 * and ->cpu_capacity to 0.
928 *
929 * Assumes the sched_domain tree is fully constructed
930 */
931 static int
932 build_sched_groups(struct sched_domain *sd, int cpu)
933 {
934 struct sched_group *first = NULL, *last = NULL;
935 struct sd_data *sdd = sd->private;
936 const struct cpumask *span = sched_domain_span(sd);
937 struct cpumask *covered;
938 int i;
939
940 lockdep_assert_held(&sched_domains_mutex);
941 covered = sched_domains_tmpmask;
942
943 cpumask_clear(covered);
944
945 for_each_cpu_wrap(i, span, cpu) {
946 struct sched_group *sg;
947
948 if (cpumask_test_cpu(i, covered))
949 continue;
950
951 sg = get_group(i, sdd);
952
953 cpumask_or(covered, covered, sched_group_span(sg));
954
955 if (!first)
956 first = sg;
957 if (last)
958 last->next = sg;
959 last = sg;
960 }
961 last->next = first;
962 sd->groups = first;
963
964 return 0;
965 }
966
967 /*
968 * Initialize sched groups cpu_capacity.
969 *
970 * cpu_capacity indicates the capacity of a sched group, which is used while
971 * distributing the load between different sched groups in a sched domain.
972 * Typically cpu_capacity for all the groups in a sched domain will be the same
973 * unless there are asymmetries in the topology. If there are asymmetries,
974 * a group with more cpu_capacity will pick up more load than a group with
975 * less cpu_capacity.
976 */
977 static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
978 {
979 struct sched_group *sg = sd->groups;
980
981 WARN_ON(!sg);
982
983 do {
984 int cpu, max_cpu = -1;
985
986 sg->group_weight = cpumask_weight(sched_group_span(sg));
987
988 if (!(sd->flags & SD_ASYM_PACKING))
989 goto next;
990
991 for_each_cpu(cpu, sched_group_span(sg)) {
992 if (max_cpu < 0)
993 max_cpu = cpu;
994 else if (sched_asym_prefer(cpu, max_cpu))
995 max_cpu = cpu;
996 }
997 sg->asym_prefer_cpu = max_cpu;
998
999 next:
1000 sg = sg->next;
1001 } while (sg != sd->groups);
1002
1003 if (cpu != group_balance_cpu(sg))
1004 return;
1005
1006 update_group_capacity(sd, cpu);
1007 }
1008
1009 #define cap_state_power(s,i) (s->cap_states[i].power)
1010 #define cap_state_cap(s,i) (s->cap_states[i].cap)
1011 #define idle_state_power(s,i) (s->idle_states[i].power)
1012
1013 static inline int sched_group_energy_equal(const struct sched_group_energy *a,
1014 const struct sched_group_energy *b)
1015 {
1016 int i;
1017
1018 /* check pointers first */
1019 if (a == b)
1020 return true;
1021
1022 /* check contents are equivalent */
1023 if (a->nr_cap_states != b->nr_cap_states)
1024 return false;
1025 if (a->nr_idle_states != b->nr_idle_states)
1026 return false;
1027 for (i = 0; i < a->nr_cap_states; i++) {
1028 if (cap_state_power(a, i) !=
1029 cap_state_power(b, i))
1030 return false;
1031 if (cap_state_cap(a, i) !=
1032 cap_state_cap(b, i))
1033 return false;
1034 }
1035 for (i = 0; i < a->nr_idle_states; i++) {
1036 if (idle_state_power(a, i) !=
1037 idle_state_power(b, i))
1038 return false;
1039 }
1040
1041 return true;
1042 }
1043
1044 #define energy_eff(e, n) \
1045 ((e->cap_states[n].cap << SCHED_CAPACITY_SHIFT)/e->cap_states[n].power)
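/*
 * Worked example of the efficiency check below (numbers are made up): a
 * capacity state with cap=512 at power=100 gives energy_eff = (512 << 10)
 * / 100 = 5242, while the next state with cap=1024 at power=300 gives
 * (1024 << 10) / 300 = 3495; efficiency decreases with the higher index,
 * so no warning is emitted for this pair.
 */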
1046
1047 static void init_sched_groups_energy(int cpu, struct sched_domain *sd,
1048 sched_domain_energy_f fn)
1049 {
1050 struct sched_group *sg = sd->groups;
1051 const struct sched_group_energy *sge;
1052 int i;
1053
1054 if (!(fn && fn(cpu)))
1055 return;
1056
1057 if (cpu != group_balance_cpu(sg))
1058 return;
1059
1060 if (sd->flags & SD_OVERLAP) {
1061 pr_err("BUG: EAS does not support overlapping sd spans\n");
1062 #ifdef CONFIG_SCHED_DEBUG
1063 pr_err(" the %s domain has SD_OVERLAP set\n", sd->name);
1064 #endif
1065 return;
1066 }
1067
1068 if (sd->child && !sd->child->groups->sge) {
1069 pr_err("BUG: EAS setup borken for CPU%d\n", cpu);
1070 #ifdef CONFIG_SCHED_DEBUG
1071 pr_err(" energy data on %s but not on %s domain\n",
1072 sd->name, sd->child->name);
1073 #endif
1074 return;
1075 }
1076
1077 sge = fn(cpu);
1078
1079 /*
1080 * Check that the per-cpu provided sd energy data is consistent for all
1081 * cpus within the mask.
1082 */
1083 if (cpumask_weight(sched_group_span(sg)) > 1) {
1084 struct cpumask mask;
1085
1086 cpumask_xor(&mask, sched_group_span(sg), get_cpu_mask(cpu));
1087
1088 for_each_cpu(i, &mask)
1089 BUG_ON(!sched_group_energy_equal(sge, fn(i)));
1090 }
1091
1092 /* Check that energy efficiency (capacity/power) is monotonically
1093 * decreasing in the capacity state vector with higher indexes
1094 */
1095 for (i = 0; i < (sge->nr_cap_states - 1); i++) {
1096 if (energy_eff(sge, i) > energy_eff(sge, i+1))
1097 continue;
1098 #ifdef CONFIG_SCHED_DEBUG
1099 pr_warn("WARN: cpu=%d, domain=%s: incr. energy eff %lu[%d]->%lu[%d]\n",
1100 cpu, sd->name, energy_eff(sge, i), i,
1101 energy_eff(sge, i+1), i+1);
1102 #else
1103 pr_warn("WARN: cpu=%d: incr. energy eff %lu[%d]->%lu[%d]\n",
1104 cpu, energy_eff(sge, i), i, energy_eff(sge, i+1), i+1);
1105 #endif
1106 }
1107
1108 sd->groups->sge = fn(cpu);
1109 }
1110
1111 /*
1112 * Initializers for schedule domains
1113 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
1114 */
1115
1116 static int default_relax_domain_level = -1;
1117 int sched_domain_level_max;
1118
1119 static int __init setup_relax_domain_level(char *str)
1120 {
1121 if (kstrtoint(str, 0, &default_relax_domain_level))
1122 pr_warn("Unable to set relax_domain_level\n");
1123
1124 return 1;
1125 }
1126 __setup("relax_domain_level=", setup_relax_domain_level);
1127
1128 static void set_domain_attribute(struct sched_domain *sd,
1129 struct sched_domain_attr *attr)
1130 {
1131 int request;
1132
1133 if (!attr || attr->relax_domain_level < 0) {
1134 if (default_relax_domain_level < 0)
1135 return;
1136 else
1137 request = default_relax_domain_level;
1138 } else
1139 request = attr->relax_domain_level;
1140 if (request < sd->level) {
1141 /* Turn off idle balance on this domain: */
1142 sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1143 } else {
1144 /* Turn on idle balance on this domain: */
1145 sd->flags |= (SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE);
1146 }
1147 }
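/*
 * Example (boot parameter, values are made up): "relax_domain_level=1"
 * makes the code above turn SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on for
 * domains at level 0 and 1 and off for every wider domain, bounding how
 * far wake/newidle balancing may roam.
 */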
1148
1149 static void __sdt_free(const struct cpumask *cpu_map);
1150 static int __sdt_alloc(const struct cpumask *cpu_map);
1151
1152 static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
1153 const struct cpumask *cpu_map)
1154 {
1155 switch (what) {
1156 case sa_rootdomain:
1157 if (!atomic_read(&d->rd->refcount))
1158 free_rootdomain(&d->rd->rcu);
1159 /* Fall through */
1160 case sa_sd:
1161 free_percpu(d->sd);
1162 /* Fall through */
1163 case sa_sd_storage:
1164 __sdt_free(cpu_map);
1165 /* Fall through */
1166 case sa_none:
1167 break;
1168 }
1169 }
1170
1171 static enum s_alloc
1172 __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map)
1173 {
1174 memset(d, 0, sizeof(*d));
1175
1176 if (__sdt_alloc(cpu_map))
1177 return sa_sd_storage;
1178 d->sd = alloc_percpu(struct sched_domain *);
1179 if (!d->sd)
1180 return sa_sd_storage;
1181 d->rd = alloc_rootdomain();
1182 if (!d->rd)
1183 return sa_sd;
1184 return sa_rootdomain;
1185 }
1186
1187 /*
1188 * NULL the sd_data elements we've used to build the sched_domain and
1189 * sched_group structure so that the subsequent __free_domain_allocs()
1190 * will not free the data we're using.
1191 */
1192 static void claim_allocations(int cpu, struct sched_domain *sd)
1193 {
1194 struct sd_data *sdd = sd->private;
1195
1196 WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
1197 *per_cpu_ptr(sdd->sd, cpu) = NULL;
1198
1199 if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
1200 *per_cpu_ptr(sdd->sds, cpu) = NULL;
1201
1202 if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
1203 *per_cpu_ptr(sdd->sg, cpu) = NULL;
1204
1205 if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
1206 *per_cpu_ptr(sdd->sgc, cpu) = NULL;
1207 }
1208
1209 #ifdef CONFIG_NUMA
1210 static int sched_domains_numa_levels;
1211 enum numa_topology_type sched_numa_topology_type;
1212 static int *sched_domains_numa_distance;
1213 int sched_max_numa_distance;
1214 static struct cpumask ***sched_domains_numa_masks;
1215 static int sched_domains_curr_level;
1216 #endif
1217
1218 /*
1219 * SD_flags allowed in topology descriptions.
1220 *
1221 * These flags are purely descriptive of the topology and do not prescribe
1222 * behaviour. Behaviour is artificial and mapped in the below sd_init()
1223 * function:
1224 *
1225 * SD_SHARE_CPUCAPACITY - describes SMT topologies
1226 * SD_SHARE_PKG_RESOURCES - describes shared caches
1227 * SD_NUMA - describes NUMA topologies
1228 * SD_SHARE_POWERDOMAIN - describes shared power domain
1229 * SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
1230 * SD_SHARE_CAP_STATES - describes shared capacity states
1231 *
1232 * The odd one out, which besides describing the topology also
1233 * prescribes the desired behaviour that goes along with it:
1234 *
1235 * SD_ASYM_PACKING - describes SMT quirks
1236 */
1237 #define TOPOLOGY_SD_FLAGS \
1238 (SD_SHARE_CPUCAPACITY | \
1239 SD_SHARE_PKG_RESOURCES | \
1240 SD_NUMA | \
1241 SD_ASYM_PACKING | \
1242 SD_ASYM_CPUCAPACITY | \
1243 SD_SHARE_POWERDOMAIN | \
1244 SD_SHARE_CAP_STATES)
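/*
 * Sketch of a topology-level flags callback (the name cpu_cluster_flags is
 * hypothetical): such a callback may only return bits from
 * TOPOLOGY_SD_FLAGS, e.g. an arch describing a shared-cache cluster level
 * could provide:
 *
 *	static inline int cpu_cluster_flags(void)
 *	{
 *		return SD_SHARE_PKG_RESOURCES;
 *	}
 *
 * Any other bit is warned about and masked off by sd_init() below.
 */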
1245
1246 static struct sched_domain *
1247 sd_init(struct sched_domain_topology_level *tl,
1248 const struct cpumask *cpu_map,
1249 struct sched_domain *child, int cpu)
1250 {
1251 struct sd_data *sdd = &tl->data;
1252 struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
1253 int sd_id, sd_weight, sd_flags = 0;
1254
1255 #ifdef CONFIG_NUMA
1256 /*
1257 * Ugly hack to pass state to sd_numa_mask()...
1258 */
1259 sched_domains_curr_level = tl->numa_level;
1260 #endif
1261
1262 sd_weight = cpumask_weight(tl->mask(cpu));
1263
1264 if (tl->sd_flags)
1265 sd_flags = (*tl->sd_flags)();
1266 if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS,
1267 "wrong sd_flags in topology description\n"))
1268 sd_flags &= ~TOPOLOGY_SD_FLAGS;
1269
1270 *sd = (struct sched_domain){
1271 .min_interval = sd_weight,
1272 .max_interval = 2*sd_weight,
1273 .busy_factor = 32,
1274 .imbalance_pct = 125,
1275
1276 .cache_nice_tries = 0,
1277 .busy_idx = 0,
1278 .idle_idx = 0,
1279 .newidle_idx = 0,
1280 .wake_idx = 0,
1281 .forkexec_idx = 0,
1282
1283 .flags = 1*SD_LOAD_BALANCE
1284 | 1*SD_BALANCE_NEWIDLE
1285 | 1*SD_BALANCE_EXEC
1286 | 1*SD_BALANCE_FORK
1287 | 0*SD_BALANCE_WAKE
1288 | 1*SD_WAKE_AFFINE
1289 | 0*SD_SHARE_CPUCAPACITY
1290 | 0*SD_SHARE_PKG_RESOURCES
1291 | 0*SD_SERIALIZE
1292 | 1*SD_PREFER_SIBLING
1293 | 0*SD_NUMA
1294 | sd_flags
1295 ,
1296
1297 .last_balance = jiffies,
1298 .balance_interval = sd_weight,
1299 .smt_gain = 0,
1300 .max_newidle_lb_cost = 0,
1301 .next_decay_max_lb_cost = jiffies,
1302 .child = child,
1303 #ifdef CONFIG_SCHED_DEBUG
1304 .name = tl->name,
1305 #endif
1306 };
1307
1308 cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
1309 sd_id = cpumask_first(sched_domain_span(sd));
1310
1311 /*
1312 * Check if cpu_map eclipses cpu capacity asymmetry.
1313 */
1314
1315 if (sd->flags & SD_ASYM_CPUCAPACITY) {
1316 long capacity = arch_scale_cpu_capacity(NULL, sd_id);
1317 bool disable = true;
1318 int i;
1319
1320 for_each_cpu(i, sched_domain_span(sd)) {
1321 if (capacity != arch_scale_cpu_capacity(NULL, i)) {
1322 disable = false;
1323 break;
1324 }
1325 }
1326
1327 if (disable)
1328 sd->flags &= ~SD_ASYM_CPUCAPACITY;
1329 }
1330
1331 /*
1332 * Convert topological properties into behaviour.
1333 */
1334
1335 if (sd->flags & SD_ASYM_CPUCAPACITY) {
1336 struct sched_domain *t = sd;
1337
1338 /*
1339 * Don't attempt to spread across cpus of different capacities.
1340 */
1341 if (sd->child)
1342 sd->child->flags &= ~SD_PREFER_SIBLING;
1343
1344 for_each_lower_domain(t)
1345 t->flags |= SD_BALANCE_WAKE;
1346 }
1347
1348 if (sd->flags & SD_SHARE_CPUCAPACITY) {
1349 sd->imbalance_pct = 110;
1350 sd->smt_gain = 1178; /* ~15% */
1351
1352 } else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
1353 sd->imbalance_pct = 117;
1354 sd->cache_nice_tries = 1;
1355 sd->busy_idx = 2;
1356
1357 #ifdef CONFIG_NUMA
1358 } else if (sd->flags & SD_NUMA) {
1359 sd->cache_nice_tries = 2;
1360 sd->busy_idx = 3;
1361 sd->idle_idx = 2;
1362
1363 sd->flags &= ~SD_PREFER_SIBLING;
1364 sd->flags |= SD_SERIALIZE;
1365 if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
1366 sd->flags &= ~(SD_BALANCE_EXEC |
1367 SD_BALANCE_FORK |
1368 SD_WAKE_AFFINE);
1369 }
1370
1371 #endif
1372 } else {
1373 sd->cache_nice_tries = 1;
1374 sd->busy_idx = 2;
1375 sd->idle_idx = 1;
1376 }
1377
1378 sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
1379 atomic_inc(&sd->shared->ref);
1380
1381 if (sd->flags & SD_SHARE_PKG_RESOURCES)
1382 atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1383
1384 sd->private = sdd;
1385
1386 return sd;
1387 }
1388
1389 /*
1390 * Topology list, bottom-up.
1391 */
1392 static struct sched_domain_topology_level default_topology[] = {
1393 #ifdef CONFIG_SCHED_SMT
1394 { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
1395 #endif
1396 #ifdef CONFIG_SCHED_MC
1397 { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
1398 #endif
1399 { cpu_cpu_mask, SD_INIT_NAME(DIE) },
1400 { NULL, },
1401 };
1402
1403 static struct sched_domain_topology_level *sched_domain_topology =
1404 default_topology;
1405
1406 #define for_each_sd_topology(tl) \
1407 for (tl = sched_domain_topology; tl->mask; tl++)
1408
1409 void set_sched_topology(struct sched_domain_topology_level *tl)
1410 {
1411 if (WARN_ON_ONCE(sched_smp_initialized))
1412 return;
1413
1414 sched_domain_topology = tl;
1415 }
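/*
 * Usage sketch (hypothetical table, modelled on what arch code does): an
 * architecture can install its own bottom-up table before SMP init:
 *
 *	static struct sched_domain_topology_level my_topology[] = {
 *	#ifdef CONFIG_SCHED_MC
 *		{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
 *	#endif
 *		{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
 *		{ NULL, },
 *	};
 *
 *	set_sched_topology(my_topology);
 *
 * The WARN_ON_ONCE() above guards against swapping tables once
 * sched_smp_initialized is set.
 */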
1416
1417 #ifdef CONFIG_NUMA
1418
1419 static const struct cpumask *sd_numa_mask(int cpu)
1420 {
1421 return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)];
1422 }
1423
1424 static void sched_numa_warn(const char *str)
1425 {
1426 static int done = false;
1427 int i,j;
1428
1429 if (done)
1430 return;
1431
1432 done = true;
1433
1434 printk(KERN_WARNING "ERROR: %s\n\n", str);
1435
1436 for (i = 0; i < nr_node_ids; i++) {
1437 printk(KERN_WARNING " ");
1438 for (j = 0; j < nr_node_ids; j++)
1439 printk(KERN_CONT "%02d ", node_distance(i,j));
1440 printk(KERN_CONT "\n");
1441 }
1442 printk(KERN_WARNING "\n");
1443 }
1444
1445 bool find_numa_distance(int distance)
1446 {
1447 int i;
1448
1449 if (distance == node_distance(0, 0))
1450 return true;
1451
1452 for (i = 0; i < sched_domains_numa_levels; i++) {
1453 if (sched_domains_numa_distance[i] == distance)
1454 return true;
1455 }
1456
1457 return false;
1458 }
1459
1460 /*
1461 * A system can have three types of NUMA topology:
1462 * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
1463 * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
1464 * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
1465 *
1466 * The difference between a glueless mesh topology and a backplane
1467 * topology lies in whether communication between not directly
1468 * connected nodes goes through intermediary nodes (where programs
1469 * could run), or through backplane controllers. This affects
1470 * placement of programs.
1471 *
1472 * The type of topology can be discerned with the following tests:
1473 * - If the maximum distance between any nodes is 1 hop, the system
1474 * is directly connected.
1475 * - If for two nodes A and B, located N > 1 hops away from each other,
1476 * there is an intermediary node C, which is < N hops away from both
1477 * nodes A and B, the system is a glueless mesh.
1478 */
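/*
 * Example: for the 4-node ring earlier in this file the maximum distance
 * is 30 (nodes 0 and 2), and node 1 sits at distance 20 < 30 from both,
 * so init_numa_topology_type() classifies it as NUMA_GLUELESS_MESH; with
 * no such intermediary it would be NUMA_BACKPLANE, and a single level
 * means NUMA_DIRECT.
 */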
1479 static void init_numa_topology_type(void)
1480 {
1481 int a, b, c, n;
1482
1483 n = sched_max_numa_distance;
1484
1485 if (sched_domains_numa_levels <= 1) {
1486 sched_numa_topology_type = NUMA_DIRECT;
1487 return;
1488 }
1489
1490 for_each_online_node(a) {
1491 for_each_online_node(b) {
1492 /* Find two nodes furthest removed from each other. */
1493 if (node_distance(a, b) < n)
1494 continue;
1495
1496 /* Is there an intermediary node between a and b? */
1497 for_each_online_node(c) {
1498 if (node_distance(a, c) < n &&
1499 node_distance(b, c) < n) {
1500 sched_numa_topology_type =
1501 NUMA_GLUELESS_MESH;
1502 return;
1503 }
1504 }
1505
1506 sched_numa_topology_type = NUMA_BACKPLANE;
1507 return;
1508 }
1509 }
1510 }
1511
1512 void sched_init_numa(void)
1513 {
1514 int next_distance, curr_distance = node_distance(0, 0);
1515 struct sched_domain_topology_level *tl;
1516 int level = 0;
1517 int i, j, k;
1518
1519 sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
1520 if (!sched_domains_numa_distance)
1521 return;
1522
1523 /*
1524 * O(nr_nodes^2) deduplicating selection sort -- in order to find the
1525 * unique distances in the node_distance() table.
1526 *
1527 * Assumes node_distance(0,j) includes all distances in
1528 * node_distance(i,j) in order to avoid cubic time.
1529 */
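/*
 * For the 4-node ring example earlier in this file, this scan finds the
 * unique non-identity distances {20, 30}, so 'level' ends up as 2 and
 * sched_domains_numa_distance[] = { 20, 30 }.
 */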
1530 next_distance = curr_distance;
1531 for (i = 0; i < nr_node_ids; i++) {
1532 for (j = 0; j < nr_node_ids; j++) {
1533 for (k = 0; k < nr_node_ids; k++) {
1534 int distance = node_distance(i, k);
1535
1536 if (distance > curr_distance &&
1537 (distance < next_distance ||
1538 next_distance == curr_distance))
1539 next_distance = distance;
1540
1541 /*
1542 * While not a strong assumption, it would be nice to know
1543 * about cases where node A is connected to B but B is not
1544 * equally connected to A.
1545 */
1546 if (sched_debug() && node_distance(k, i) != distance)
1547 sched_numa_warn("Node-distance not symmetric");
1548
1549 if (sched_debug() && i && !find_numa_distance(distance))
1550 sched_numa_warn("Node-0 not representative");
1551 }
1552 if (next_distance != curr_distance) {
1553 sched_domains_numa_distance[level++] = next_distance;
1554 sched_domains_numa_levels = level;
1555 curr_distance = next_distance;
1556 } else break;
1557 }
1558
1559 /*
1560 * In case of sched_debug() we verify the above assumption.
1561 */
1562 if (!sched_debug())
1563 break;
1564 }
1565
1566 if (!level)
1567 return;
1568
1569 /*
1570 * 'level' contains the number of unique distances, excluding the
1571 * identity distance node_distance(i,i).
1572 *
1573 * The sched_domains_numa_distance[] array includes the actual distance
1574 * numbers.
1575 */
1576
1577 /*
1578 * Here, we should temporarily reset sched_domains_numa_levels to 0.
1579 * If we fail to allocate memory for the sched_domains_numa_masks[][]
1580 * array, it will contain fewer than 'level' members. This could be
1581 * dangerous when we use it to iterate over sched_domains_numa_masks[][]
1582 * in other functions.
1583 *
1584 * We reset it to 'level' at the end of this function.
1585 */
1586 sched_domains_numa_levels = 0;
1587
1588 sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
1589 if (!sched_domains_numa_masks)
1590 return;
1591
1592 /*
1593 * Now for each level, construct a mask per node which contains all
1594 * CPUs of nodes that are that many hops away from us.
1595 */
1596 for (i = 0; i < level; i++) {
1597 sched_domains_numa_masks[i] =
1598 kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
1599 if (!sched_domains_numa_masks[i])
1600 return;
1601
1602 for (j = 0; j < nr_node_ids; j++) {
1603 struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
1604 if (!mask)
1605 return;
1606
1607 sched_domains_numa_masks[i][j] = mask;
1608
1609 for_each_node(k) {
1610 if (node_distance(j, k) > sched_domains_numa_distance[i])
1611 continue;
1612
1613 cpumask_or(mask, mask, cpumask_of_node(k));
1614 }
1615 }
1616 }
1617
1618 /* Compute default topology size */
1619 for (i = 0; sched_domain_topology[i].mask; i++);
1620
1621 tl = kzalloc((i + level + 1) *
1622 sizeof(struct sched_domain_topology_level), GFP_KERNEL);
1623 if (!tl)
1624 return;
1625
1626 /*
1627 * Copy the default topology bits..
1628 */
1629 for (i = 0; sched_domain_topology[i].mask; i++)
1630 tl[i] = sched_domain_topology[i];
1631
1632 /*
1633 * .. and append 'j' levels of NUMA goodness.
1634 */
1635 for (j = 0; j < level; i++, j++) {
1636 tl[i] = (struct sched_domain_topology_level){
1637 .mask = sd_numa_mask,
1638 .sd_flags = cpu_numa_flags,
1639 .flags = SDTL_OVERLAP,
1640 .numa_level = j,
1641 SD_INIT_NAME(NUMA)
1642 };
1643 }
1644
1645 sched_domain_topology = tl;
1646
1647 sched_domains_numa_levels = level;
1648 sched_max_numa_distance = sched_domains_numa_distance[level - 1];
1649
1650 init_numa_topology_type();
1651 }
1652
1653 void sched_domains_numa_masks_set(unsigned int cpu)
1654 {
1655 int node = cpu_to_node(cpu);
1656 int i, j;
1657
1658 for (i = 0; i < sched_domains_numa_levels; i++) {
1659 for (j = 0; j < nr_node_ids; j++) {
1660 if (node_distance(j, node) <= sched_domains_numa_distance[i])
1661 cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
1662 }
1663 }
1664 }
1665
1666 void sched_domains_numa_masks_clear(unsigned int cpu)
1667 {
1668 int i, j;
1669
1670 for (i = 0; i < sched_domains_numa_levels; i++) {
1671 for (j = 0; j < nr_node_ids; j++)
1672 cpumask_clear_cpu(cpu, sched_domains_numa_masks[i][j]);
1673 }
1674 }
1675
1676 #endif /* CONFIG_NUMA */
1677
1678 static int __sdt_alloc(const struct cpumask *cpu_map)
1679 {
1680 struct sched_domain_topology_level *tl;
1681 int j;
1682
1683 for_each_sd_topology(tl) {
1684 struct sd_data *sdd = &tl->data;
1685
1686 sdd->sd = alloc_percpu(struct sched_domain *);
1687 if (!sdd->sd)
1688 return -ENOMEM;
1689
1690 sdd->sds = alloc_percpu(struct sched_domain_shared *);
1691 if (!sdd->sds)
1692 return -ENOMEM;
1693
1694 sdd->sg = alloc_percpu(struct sched_group *);
1695 if (!sdd->sg)
1696 return -ENOMEM;
1697
1698 sdd->sgc = alloc_percpu(struct sched_group_capacity *);
1699 if (!sdd->sgc)
1700 return -ENOMEM;
1701
1702 for_each_cpu(j, cpu_map) {
1703 struct sched_domain *sd;
1704 struct sched_domain_shared *sds;
1705 struct sched_group *sg;
1706 struct sched_group_capacity *sgc;
1707
1708 sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
1709 GFP_KERNEL, cpu_to_node(j));
1710 if (!sd)
1711 return -ENOMEM;
1712
1713 *per_cpu_ptr(sdd->sd, j) = sd;
1714
1715 sds = kzalloc_node(sizeof(struct sched_domain_shared),
1716 GFP_KERNEL, cpu_to_node(j));
1717 if (!sds)
1718 return -ENOMEM;
1719
1720 *per_cpu_ptr(sdd->sds, j) = sds;
1721
1722 sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
1723 GFP_KERNEL, cpu_to_node(j));
1724 if (!sg)
1725 return -ENOMEM;
1726
1727 sg->next = sg;
1728
1729 *per_cpu_ptr(sdd->sg, j) = sg;
1730
1731 sgc = kzalloc_node(sizeof(struct sched_group_capacity) + cpumask_size(),
1732 GFP_KERNEL, cpu_to_node(j));
1733 if (!sgc)
1734 return -ENOMEM;
1735
1736 #ifdef CONFIG_SCHED_DEBUG
1737 sgc->id = j;
1738 #endif
1739
1740 *per_cpu_ptr(sdd->sgc, j) = sgc;
1741 }
1742 }
1743
1744 return 0;
1745 }
1746
1747 static void __sdt_free(const struct cpumask *cpu_map)
1748 {
1749 struct sched_domain_topology_level *tl;
1750 int j;
1751
1752 for_each_sd_topology(tl) {
1753 struct sd_data *sdd = &tl->data;
1754
1755 for_each_cpu(j, cpu_map) {
1756 struct sched_domain *sd;
1757
1758 if (sdd->sd) {
1759 sd = *per_cpu_ptr(sdd->sd, j);
1760 if (sd && (sd->flags & SD_OVERLAP))
1761 free_sched_groups(sd->groups, 0);
1762 kfree(*per_cpu_ptr(sdd->sd, j));
1763 }
1764
1765 if (sdd->sds)
1766 kfree(*per_cpu_ptr(sdd->sds, j));
1767 if (sdd->sg)
1768 kfree(*per_cpu_ptr(sdd->sg, j));
1769 if (sdd->sgc)
1770 kfree(*per_cpu_ptr(sdd->sgc, j));
1771 }
1772 free_percpu(sdd->sd);
1773 sdd->sd = NULL;
1774 free_percpu(sdd->sds);
1775 sdd->sds = NULL;
1776 free_percpu(sdd->sg);
1777 sdd->sg = NULL;
1778 free_percpu(sdd->sgc);
1779 sdd->sgc = NULL;
1780 }
1781 }
1782
1783 static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
1784 const struct cpumask *cpu_map, struct sched_domain_attr *attr,
1785 struct sched_domain *child, int cpu)
1786 {
1787 struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
1788
1789 if (child) {
1790 sd->level = child->level + 1;
1791 sched_domain_level_max = max(sched_domain_level_max, sd->level);
1792 child->parent = sd;
1793
1794 if (!cpumask_subset(sched_domain_span(child),
1795 sched_domain_span(sd))) {
1796 pr_err("BUG: arch topology borken\n");
1797 #ifdef CONFIG_SCHED_DEBUG
1798 pr_err(" the %s domain not a subset of the %s domain\n",
1799 child->name, sd->name);
1800 #endif
1801 /* Fixup, ensure @sd has at least @child cpus. */
1802 cpumask_or(sched_domain_span(sd),
1803 sched_domain_span(sd),
1804 sched_domain_span(child));
1805 }
1806
1807 }
1808 set_domain_attribute(sd, attr);
1809
1810 return sd;
1811 }
1812
1813 /*
1814 * Build sched domains for a given set of CPUs and attach the sched domains
1815 * to the individual CPUs
1816 */
1817 static int
1818 build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr)
1819 {
1820 enum s_alloc alloc_state;
1821 struct sched_domain *sd;
1822 struct s_data d;
1823 int i, ret = -ENOMEM;
1824
1825 alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
1826 if (alloc_state != sa_rootdomain)
1827 goto error;
1828
1829 /* Set up domains for CPUs specified by the cpu_map: */
1830 for_each_cpu(i, cpu_map) {
1831 struct sched_domain_topology_level *tl;
1832
1833 sd = NULL;
1834 for_each_sd_topology(tl) {
1835 sd = build_sched_domain(tl, cpu_map, attr, sd, i);
1836 if (tl == sched_domain_topology)
1837 *per_cpu_ptr(d.sd, i) = sd;
1838 if (tl->flags & SDTL_OVERLAP)
1839 sd->flags |= SD_OVERLAP;
1840 }
1841 }
1842
1843 /* Build the groups for the domains */
1844 for_each_cpu(i, cpu_map) {
1845 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
1846 sd->span_weight = cpumask_weight(sched_domain_span(sd));
1847 if (sd->flags & SD_OVERLAP) {
1848 if (build_overlap_sched_groups(sd, i))
1849 goto error;
1850 } else {
1851 if (build_sched_groups(sd, i))
1852 goto error;
1853 }
1854 }
1855 }
1856
1857 /* Calculate CPU capacity for physical packages and nodes */
1858 for (i = nr_cpumask_bits-1; i >= 0; i--) {
1859 struct sched_domain_topology_level *tl = sched_domain_topology;
1860
1861 if (!cpumask_test_cpu(i, cpu_map))
1862 continue;
1863
1864 for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
1865 init_sched_groups_energy(i, sd, tl->energy);
1866 claim_allocations(i, sd);
1867 init_sched_groups_capacity(i, sd);
1868 }
1869 }
1870
1871 /* Attach the domains */
1872 rcu_read_lock();
1873 for_each_cpu(i, cpu_map) {
1874 int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
1875 int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
1876
1877 sd = *per_cpu_ptr(d.sd, i);
1878
1879 if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
1880 cpu_rq(max_cpu)->cpu_capacity_orig))
1881 WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
1882
1883 if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
1884 cpu_rq(min_cpu)->cpu_capacity_orig))
1885 WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
1886
1887 cpu_attach_domain(sd, d.rd, i);
1888 }
1889 rcu_read_unlock();
1890
1891 if (!cpumask_empty(cpu_map))
1892 update_asym_cpucapacity(cpumask_first(cpu_map));
1893
1894 ret = 0;
1895 error:
1896 __free_domain_allocs(&d, alloc_state, cpu_map);
1897 return ret;
1898 }
1899
1900 /* Current sched domains: */
1901 static cpumask_var_t *doms_cur;
1902
1903 /* Number of sched domains in 'doms_cur': */
1904 static int ndoms_cur;
1905
1906 /* Attributes of custom domains in 'doms_cur' */
1907 static struct sched_domain_attr *dattr_cur;
1908
1909 /*
1910 * Special case: If a kmalloc() of a doms_cur partition (array of
1911 * cpumask) fails, then fallback to a single sched domain,
1912 * as determined by the single cpumask fallback_doms.
1913 */
1914 static cpumask_var_t fallback_doms;
1915
1916 /*
1917 * arch_update_cpu_topology lets virtualized architectures update the
1918 * CPU core maps. It is supposed to return 1 if the topology changed
1919 * or 0 if it stayed the same.
1920 */
1921 int __weak arch_update_cpu_topology(void)
1922 {
1923 return 0;
1924 }
1925
1926 cpumask_var_t *alloc_sched_domains(unsigned int ndoms)
1927 {
1928 int i;
1929 cpumask_var_t *doms;
1930
1931 doms = kmalloc(sizeof(*doms) * ndoms, GFP_KERNEL);
1932 if (!doms)
1933 return NULL;
1934 for (i = 0; i < ndoms; i++) {
1935 if (!alloc_cpumask_var(&doms[i], GFP_KERNEL)) {
1936 free_sched_domains(doms, i);
1937 return NULL;
1938 }
1939 }
1940 return doms;
1941 }
1942
1943 void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
1944 {
1945 unsigned int i;
1946 for (i = 0; i < ndoms; i++)
1947 free_cpumask_var(doms[i]);
1948 kfree(doms);
1949 }
1950
1951 /*
1952 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
1953 * For now this just excludes isolated CPUs, but could be used to
1954 * exclude other special cases in the future.
1955 */
1956 int sched_init_domains(const struct cpumask *cpu_map)
1957 {
1958 int err;
1959
1960 zalloc_cpumask_var(&sched_domains_tmpmask, GFP_KERNEL);
1961 zalloc_cpumask_var(&sched_domains_tmpmask2, GFP_KERNEL);
1962 zalloc_cpumask_var(&fallback_doms, GFP_KERNEL);
1963
1964 arch_update_cpu_topology();
1965 ndoms_cur = 1;
1966 doms_cur = alloc_sched_domains(ndoms_cur);
1967 if (!doms_cur)
1968 doms_cur = &fallback_doms;
1969 cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
1970 err = build_sched_domains(doms_cur[0], NULL);
1971 register_sched_domain_sysctl();
1972
1973 return err;
1974 }
1975
1976 /*
1977 * Detach sched domains from a group of CPUs specified in cpu_map.
1978 * These CPUs will now be attached to the NULL domain.
1979 */
1980 static void detach_destroy_domains(const struct cpumask *cpu_map)
1981 {
1982 int i;
1983
1984 rcu_read_lock();
1985 for_each_cpu(i, cpu_map)
1986 cpu_attach_domain(NULL, &def_root_domain, i);
1987 rcu_read_unlock();
1988 }
1989
1990 /* handle null as "default" */
1991 static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
1992 struct sched_domain_attr *new, int idx_new)
1993 {
1994 struct sched_domain_attr tmp;
1995
1996 /* Fast path: */
1997 if (!new && !cur)
1998 return 1;
1999
2000 tmp = SD_ATTR_INIT;
2001 return !memcmp(cur ? (cur + idx_cur) : &tmp,
2002 new ? (new + idx_new) : &tmp,
2003 sizeof(struct sched_domain_attr));
2004 }
2005
2006 /*
2007 * Partition sched domains as specified by the 'ndoms_new'
2008 * cpumasks in the array doms_new[]. This compares
2009 * doms_new[] to the current sched domain partitioning, doms_cur[].
2010 * It destroys each deleted domain and builds each new domain.
2011 *
2012 * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'.
2013 * The masks don't intersect (don't overlap). We should set up one
2014 * sched domain for each mask. CPUs not in any of the cpumasks will
2015 * not be load balanced. If the same cpumask appears both in the
2016 * current 'doms_cur' domains and in the new 'doms_new', we can leave
2017 * it as it is.
2018 *
2019 * The passed in 'doms_new' should be allocated using
2020 * alloc_sched_domains. This routine takes ownership of it and will
2021 * free_sched_domains it when done with it. If the caller failed the
2022 * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1,
2023 * and partition_sched_domains() will fall back to the single partition
2024 * 'fallback_doms'; it also forces the domains to be rebuilt.
2025 *
2026 * If doms_new == NULL it will be replaced with cpu_online_mask.
2027 * ndoms_new == 0 is a special case for destroying existing domains,
2028 * and it will not create the default domain.
2029 *
2030 * Call with hotplug lock held
2031 */
2032 void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
2033 struct sched_domain_attr *dattr_new)
2034 {
2035 int i, j, n;
2036 int new_topology;
2037
2038 mutex_lock(&sched_domains_mutex);
2039
2040 /* Always unregister in case we don't destroy any domains: */
2041 unregister_sched_domain_sysctl();
2042
2043 /* Let the architecture update CPU core mappings: */
2044 new_topology = arch_update_cpu_topology();
2045
2046 if (!doms_new) {
2047 WARN_ON_ONCE(dattr_new);
2048 n = 0;
2049 doms_new = alloc_sched_domains(1);
2050 if (doms_new) {
2051 n = 1;
2052 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
2053 }
2054 } else {
2055 n = ndoms_new;
2056 }
2057
2058 /* Destroy deleted domains: */
2059 for (i = 0; i < ndoms_cur; i++) {
2060 for (j = 0; j < n && !new_topology; j++) {
2061 if (cpumask_equal(doms_cur[i], doms_new[j])
2062 && dattrs_equal(dattr_cur, i, dattr_new, j))
2063 goto match1;
2064 }
2065 /* No match - a current sched domain not in new doms_new[] */
2066 detach_destroy_domains(doms_cur[i]);
2067 match1:
2068 ;
2069 }
2070
2071 n = ndoms_cur;
2072 if (!doms_new) {
2073 n = 0;
2074 doms_new = &fallback_doms;
2075 cpumask_andnot(doms_new[0], cpu_active_mask, cpu_isolated_map);
2076 }
2077
2078 /* Build new domains: */
2079 for (i = 0; i < ndoms_new; i++) {
2080 for (j = 0; j < n && !new_topology; j++) {
2081 if (cpumask_equal(doms_new[i], doms_cur[j])
2082 && dattrs_equal(dattr_new, i, dattr_cur, j))
2083 goto match2;
2084 }
2085 /* No match - add a new doms_new */
2086 build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
2087 match2:
2088 ;
2089 }
2090
2091 /* Remember the new sched domains: */
2092 if (doms_cur != &fallback_doms)
2093 free_sched_domains(doms_cur, ndoms_cur);
2094
2095 kfree(dattr_cur);
2096 doms_cur = doms_new;
2097 dattr_cur = dattr_new;
2098 ndoms_cur = ndoms_new;
2099
2100 register_sched_domain_sysctl();
2101
2102 mutex_unlock(&sched_domains_mutex);
2103 }
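/*
 * Usage sketch (hypothetical caller, modelled on the cpuset code): build a
 * two-way partition and hand it over; ownership of 'doms' transfers to
 * partition_sched_domains(). mask_a/mask_b stand in for caller-owned,
 * non-intersecting cpumasks.
 *
 *	cpumask_var_t *doms = alloc_sched_domains(2);
 *
 *	if (doms) {
 *		cpumask_copy(doms[0], mask_a);
 *		cpumask_copy(doms[1], mask_b);
 *		get_online_cpus();
 *		partition_sched_domains(2, doms, NULL);
 *		put_online_cpus();
 *	}
 */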
2104