#include <linux/cgroup.h>
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/percpu.h>
#include <linux/printk.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

#include <trace/events/sched.h>

#include "sched.h"
#include "tune.h"

#ifdef CONFIG_CGROUP_SCHEDTUNE
bool schedtune_initialized = false;
#endif

unsigned int sysctl_sched_cfs_boost __read_mostly;

extern struct reciprocal_value schedtune_spc_rdiv;
struct target_nrg schedtune_target_nrg;

/* Performance Boost region (B) threshold params */
static int perf_boost_idx;

/* Performance Constraint region (C) threshold params */
static int perf_constrain_idx;

/**
 * Performance-Energy (P-E) space threshold constants
 */
struct threshold_params {
	int nrg_gain;
	int cap_gain;
};

/*
 * System-specific P-E space threshold constants
 */
static struct threshold_params
threshold_gains[] = {
	{ 0, 5 }, /*   < 10% */
	{ 1, 5 }, /*   < 20% */
	{ 2, 5 }, /*   < 30% */
	{ 3, 5 }, /*   < 40% */
	{ 4, 5 }, /*   < 50% */
	{ 5, 4 }, /*   < 60% */
	{ 5, 3 }, /*   < 70% */
	{ 5, 2 }, /*   < 80% */
	{ 5, 1 }, /*   < 90% */
	{ 5, 0 }  /* <= 100% */
};
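
/*
 * Example (illustrative): a 25% boost selects index 25 / 10 = 2, i.e.
 * { nrg_gain = 2, cap_gain = 5 }. With the payoff formula below this means
 * an energy increase is accepted only when the capacity gain is
 * comparatively large (cap_delta / nrg_delta > 5 / 2); as the boost grows
 * the energy term is weighted less and less, until at 100% any capacity
 * gain is accepted regardless of the energy cost.
 */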

static int
__schedtune_accept_deltas(int nrg_delta, int cap_delta,
			  int perf_boost_idx, int perf_constrain_idx)
{
	int payoff = -INT_MAX;
	int gain_idx = -1;

	/* Performance Boost (B) region */
	if (nrg_delta >= 0 && cap_delta > 0)
		gain_idx = perf_boost_idx;
	/* Performance Constraint (C) region */
	else if (nrg_delta < 0 && cap_delta <= 0)
		gain_idx = perf_constrain_idx;

	/* Default: reject schedule candidate */
	if (gain_idx == -1)
		return payoff;

	/*
	 * Evaluate "Performance Boost" vs "Energy Increase"
	 *
	 * - Performance Boost (B) region
	 *
	 *   Condition: nrg_delta >= 0 && cap_delta > 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  < cap_delta / nrg_delta
	 *   which is equivalent to:
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since both nrg_gain and nrg_delta are positive, the
	 *   direction of the inequality does not change. Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * - Performance Constraint (C) region
	 *
	 *   Condition: nrg_delta < 0 && cap_delta <= 0
	 *   Payoff criteria:
	 *     cap_gain / nrg_gain  > cap_delta / nrg_delta
	 *   which is equivalent to:
	 *     cap_gain * nrg_delta < cap_delta * nrg_gain
	 *   Note that since nrg_gain > 0 while nrg_delta < 0, the direction
	 *   of the inequality is reversed when multiplying by nrg_delta.
	 *   Thus:
	 *
	 *     payoff = (cap_delta * nrg_gain) - (cap_gain * nrg_delta)
	 *
	 * This means that, when the same positive {cap,nrg}_gain values are
	 * defined for both the B and C regions, we can use the same payoff
	 * formula, where a positive value represents the accept condition.
	 */
	payoff  = cap_delta * threshold_gains[gain_idx].nrg_gain;
	payoff -= nrg_delta * threshold_gains[gain_idx].cap_gain;

	return payoff;
}
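
/*
 * Worked example (illustrative): with gain_idx selecting { nrg_gain = 2,
 * cap_gain = 5 }, a candidate with nrg_delta = 10 and cap_delta = 30 gives
 * payoff = 30 * 2 - 10 * 5 = 10 > 0 and is accepted; with cap_delta = 20
 * the payoff is 20 * 2 - 10 * 5 = -10 and the candidate is rejected.
 */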

#ifdef CONFIG_CGROUP_SCHEDTUNE

/*
 * EAS scheduler tunables for task groups.
 */

/* SchedTune tunables for a group of tasks */
struct schedtune {
	/* SchedTune CGroup subsystem */
	struct cgroup_subsys_state css;

	/* Boost group allocated ID */
	int idx;

	/* Boost value for tasks on that SchedTune CGroup */
	int boost;

	/* Performance Boost (B) region threshold params */
	int perf_boost_idx;

	/* Performance Constraint (C) region threshold params */
	int perf_constrain_idx;

	/* Hint to bias scheduling of tasks on that SchedTune CGroup
	 * towards idle CPUs */
	int prefer_idle;
};

static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
{
	return css ? container_of(css, struct schedtune, css) : NULL;
}

static inline struct schedtune *task_schedtune(struct task_struct *tsk)
{
	return css_st(task_css(tsk, schedtune_cgrp_id));
}

static inline struct schedtune *parent_st(struct schedtune *st)
{
	return css_st(st->css.parent);
}

/*
 * SchedTune root control group
 * The root control group is used to define a system-wide boost tuning,
 * which is applied to all tasks in the system.
 * Task-specific boost tuning can be specified by creating and configuring
 * a child control group under the root one.
 * By default, system-wide boosting is disabled, i.e. no boosting is applied
 * to tasks that are not in a child control group.
 */
static struct schedtune
root_schedtune = {
	.boost	= 0,
	.perf_boost_idx = 0,
	.perf_constrain_idx = 0,
	.prefer_idle = 0,
};

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	struct schedtune *ct;
	int perf_boost_idx;
	int perf_constrain_idx;

	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	/* Get task specific perf Boost/Constraints indexes */
	rcu_read_lock();
	ct = task_schedtune(task);
	perf_boost_idx = ct->perf_boost_idx;
	perf_constrain_idx = ct->perf_constrain_idx;
	rcu_read_unlock();

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

/*
 * Maximum number of boost groups to support
 * When per-task boosting is used we still allow only a limited number of
 * boost groups, for two main reasons:
 * 1. on a real system we usually have only a few classes of workloads that
 *    it makes sense to boost with different values (e.g. background vs
 *    foreground tasks, interactive vs low-priority tasks)
 * 2. a limited number allows for a simpler and more memory/time efficient
 *    implementation, especially for the computation of the per-CPU boost
 *    value
 */
#define BOOSTGROUPS_COUNT 5

/* Array of configured boostgroups */
static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
	&root_schedtune,
	NULL,
};

/* SchedTune boost groups
 * Keep track of all the boost groups which impact a CPU, for example when a
 * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
 * likely with different boost values.
 * Since on each system we expect only a limited number of boost groups, here
 * we use a simple array to keep track of the metrics required to compute the
 * maximum per-CPU boosting value.
 */
struct boost_groups {
	bool idle;
	/* Maximum boost value for all RUNNABLE tasks on a CPU */
	int boost_max;
	struct {
		/* The boost for tasks on that boost group */
		int boost;
		/* Count of RUNNABLE tasks on that boost group */
		unsigned tasks;
	} group[BOOSTGROUPS_COUNT];
	/* CPU's boost group locking */
	raw_spinlock_t lock;
};

/* Boost groups affecting each CPU in the system */
DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);

static void
schedtune_cpu_update(int cpu)
{
	struct boost_groups *bg;
	int boost_max;
	int idx;

	bg = &per_cpu(cpu_boost_groups, cpu);

	/* The root boost group is always active */
	boost_max = bg->group[0].boost;
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		/*
		 * A boost group affects a CPU only if it has
		 * RUNNABLE tasks on that CPU
		 */
		if (bg->group[idx].tasks == 0)
			continue;

		boost_max = max(boost_max, bg->group[idx].boost);
	}

	/*
	 * Ensure boost_max is non-negative when all cgroup boost values
	 * are negative. This avoids under-accounting of CPU capacity, which
	 * may cause task stacking and frequency spikes.
	 */
	boost_max = max(boost_max, 0);
	bg->boost_max = boost_max;
}
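
/*
 * Example (illustrative): with group boosts { 10, 50, 20, 0, -30 } and
 * RUNNABLE task counts { 2, 0, 1, 0, 3 }, the root group (index 0) is
 * always considered while groups 1 and 3 are skipped, so
 * boost_max = max(10, 20, -30) = 20; the final clamp against 0 only
 * matters when every contributing boost value is negative.
 */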

static int
schedtune_boostgroup_update(int idx, int boost)
{
	struct boost_groups *bg;
	int cur_boost_max;
	int old_boost;
	int cpu;

	/* Update per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);

		/*
		 * Keep track of current boost values to compute the per CPU
		 * maximum only when it has been affected by the new value of
		 * the updated boost group
		 */
		cur_boost_max = bg->boost_max;
		old_boost = bg->group[idx].boost;

		/* Update the boost value of this boost group */
		bg->group[idx].boost = boost;

		/* Check if this update increases the current max */
		if (boost > cur_boost_max && bg->group[idx].tasks) {
			bg->boost_max = boost;
			trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
			continue;
		}

		/* Check if this update has decreased the current max */
		if (cur_boost_max == old_boost && old_boost > boost) {
			schedtune_cpu_update(cpu);
			trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
			continue;
		}

		trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
	}

	return 0;
}

#define ENQUEUE_TASK  1
#define DEQUEUE_TASK -1

static inline void
schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	int tasks = bg->group[idx].tasks + task_count;

	/* Update the boosted tasks count while avoiding making it negative */
	bg->group[idx].tasks = max(0, tasks);

	trace_sched_tune_tasks_update(p, cpu, tasks, idx,
			bg->group[idx].boost, bg->boost_max);

	/* Boost group activation or deactivation on that RQ */
	if (tasks == 1 || tasks == 0)
		schedtune_cpu_update(cpu);
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_enqueue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions, for example on
	 * do_exit()::cgroup_exit() and task migration.
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

int schedtune_can_attach(struct cgroup_taskset *tset)
{
	struct task_struct *task;
	struct cgroup_subsys_state *css;
	struct boost_groups *bg;
	struct rq_flags irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int src_bg; /* Source boost group index */
	int dst_bg; /* Destination boost group index */
	int tasks;

	if (!unlikely(schedtune_initialized))
		return 0;

	cgroup_taskset_for_each(task, css, tset) {

		/*
		 * Lock the RQ of the CPU the task is enqueued on, to avoid
		 * race conditions with migration code while the task is
		 * being accounted.
		 */
		rq = lock_rq_of(task, &irq_flags);

		if (!task->on_rq) {
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * Boost group accounting is protected by a per-cpu lock and
		 * requires interrupts to be disabled to avoid race
		 * conditions on...
		 */
		cpu = cpu_of(rq);
		bg = &per_cpu(cpu_boost_groups, cpu);
		raw_spin_lock(&bg->lock);

		dst_bg = css_st(css)->idx;
		src_bg = task_schedtune(task)->idx;

		/*
		 * The current task is not changing boost group, which can
		 * happen when the new hierarchy is in use.
		 */
		if (unlikely(dst_bg == src_bg)) {
			raw_spin_unlock(&bg->lock);
			unlock_rq_of(rq, task, &irq_flags);
			continue;
		}

		/*
		 * This is the case of a RUNNABLE task which is switching its
		 * current boost group.
		 */

		/* Move task from src to dst boost group */
		tasks = bg->group[src_bg].tasks - 1;
		bg->group[src_bg].tasks = max(0, tasks);
		bg->group[dst_bg].tasks += 1;

		raw_spin_unlock(&bg->lock);
		unlock_rq_of(rq, task, &irq_flags);

		/* Update CPU boost group */
		if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
			schedtune_cpu_update(task_cpu(task));
	}

	return 0;
}

void schedtune_cancel_attach(struct cgroup_taskset *tset)
{
	/*
	 * This can happen only if the SchedTune controller is mounted along
	 * with other hierarchies and one of them fails. Since SchedTune is
	 * usually mounted on its own hierarchy, for the time being we do not
	 * implement a proper rollback mechanism.
	 */
	WARN(1, "SchedTune cancel attach not implemented");
}

/*
 * NOTE: This function must be called while holding the lock on the CPU RQ
 */
void schedtune_dequeue_task(struct task_struct *p, int cpu)
{
	struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
	unsigned long irq_flags;
	struct schedtune *st;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	/*
	 * When a task is marked PF_EXITING by do_exit() it's going to be
	 * dequeued and enqueued multiple times in the exit path.
	 * Thus we avoid any further update, since we do not want to change
	 * CPU boosting while the task is exiting.
	 * The last dequeue is already enforced by the do_exit() code path
	 * via schedtune_exit_task().
	 */
	if (p->flags & PF_EXITING)
		return;

	/*
	 * Boost group accounting is protected by a per-cpu lock and requires
	 * interrupts to be disabled to avoid race conditions on...
	 */
	raw_spin_lock_irqsave(&bg->lock, irq_flags);
	rcu_read_lock();

	st = task_schedtune(p);
	idx = st->idx;

	schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
}

void schedtune_exit_task(struct task_struct *tsk)
{
	struct schedtune *st;
	struct rq_flags irq_flags;
	unsigned int cpu;
	struct rq *rq;
	int idx;

	if (!unlikely(schedtune_initialized))
		return;

	rq = lock_rq_of(tsk, &irq_flags);
	rcu_read_lock();

	cpu = cpu_of(rq);
	st = task_schedtune(tsk);
	idx = st->idx;
	schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);

	rcu_read_unlock();
	unlock_rq_of(rq, tsk, &irq_flags);
}

int schedtune_cpu_boost(int cpu)
{
	struct boost_groups *bg;

	bg = &per_cpu(cpu_boost_groups, cpu);
	return bg->boost_max;
}

int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int task_boost;

	if (!unlikely(schedtune_initialized))
		return 0;

	/* Get task boost value */
	rcu_read_lock();
	st = task_schedtune(p);
	task_boost = st->boost;
	rcu_read_unlock();

	return task_boost;
}

int schedtune_prefer_idle(struct task_struct *p)
{
	struct schedtune *st;
	int prefer_idle;

	if (!unlikely(schedtune_initialized))
		return 0;

	/* Get prefer_idle value */
	rcu_read_lock();
	st = task_schedtune(p);
	prefer_idle = st->prefer_idle;
	rcu_read_unlock();

	return prefer_idle;
}

static u64
prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->prefer_idle;
}

static int
prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    u64 prefer_idle)
{
	struct schedtune *st = css_st(css);

	st->prefer_idle = prefer_idle;

	return 0;
}

static s64
boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
	struct schedtune *st = css_st(css);

	return st->boost;
}

static int
boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
	    s64 boost)
{
	struct schedtune *st = css_st(css);
	unsigned threshold_idx;
	int boost_pct;

	if (boost < -100 || boost > 100)
		return -EINVAL;
	boost_pct = boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * B and C regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	st->perf_boost_idx = threshold_idx;
	st->perf_constrain_idx = threshold_idx;

	st->boost = boost;
	if (css == &root_schedtune.css) {
		sysctl_sched_cfs_boost = boost;
		perf_boost_idx = threshold_idx;
		perf_constrain_idx = threshold_idx;
	}

	/* Update CPU boost */
	schedtune_boostgroup_update(st->idx, st->boost);

	trace_sched_tune_config(st->boost);

	return 0;
}

static struct cftype files[] = {
	{
		.name = "boost",
		.read_s64 = boost_read,
		.write_s64 = boost_write,
	},
	{
		.name = "prefer_idle",
		.read_u64 = prefer_idle_read,
		.write_u64 = prefer_idle_write,
	},
	{ }	/* terminate */
};
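
/*
 * These attributes are exposed through the legacy cgroup filesystem as
 * schedtune.boost and schedtune.prefer_idle. Illustrative usage, assuming
 * the controller is mounted at /dev/stune (a common but not mandatory
 * mount point on Android systems):
 *
 *   mkdir /dev/stune/foreground
 *   echo 10 > /dev/stune/foreground/schedtune.boost
 *   echo 1  > /dev/stune/foreground/schedtune.prefer_idle
 */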

static int
schedtune_boostgroup_init(struct schedtune *st)
{
	struct boost_groups *bg;
	int cpu;

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = st;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		bg->group[st->idx].boost = 0;
		bg->group[st->idx].tasks = 0;
	}

	return 0;
}

static struct cgroup_subsys_state *
schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
{
	struct schedtune *st;
	int idx;

	if (!parent_css)
		return &root_schedtune.css;

	/* Allow only single-level hierarchies */
	if (parent_css != &root_schedtune.css) {
		pr_err("Nested SchedTune boosting groups not allowed\n");
		return ERR_PTR(-ENOMEM);
	}

	/* Allow only a limited number of boosting groups */
	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
		if (!allocated_group[idx])
			break;
	if (idx == BOOSTGROUPS_COUNT) {
		pr_err("Trying to create more than %d SchedTune boosting groups\n",
		       BOOSTGROUPS_COUNT);
		return ERR_PTR(-ENOSPC);
	}

	st = kzalloc(sizeof(*st), GFP_KERNEL);
	if (!st)
		goto out;

	/* Initialize per CPUs boost group support */
	st->idx = idx;
	if (schedtune_boostgroup_init(st))
		goto release;

	return &st->css;

release:
	kfree(st);
out:
	return ERR_PTR(-ENOMEM);
}

static void
schedtune_boostgroup_release(struct schedtune *st)
{
	/* Reset this boost group */
	schedtune_boostgroup_update(st->idx, 0);

	/* Keep track of allocated boost groups */
	allocated_group[st->idx] = NULL;
}

static void
schedtune_css_free(struct cgroup_subsys_state *css)
{
	struct schedtune *st = css_st(css);

	schedtune_boostgroup_release(st);
	kfree(st);
}

struct cgroup_subsys schedtune_cgrp_subsys = {
	.css_alloc	= schedtune_css_alloc,
	.css_free	= schedtune_css_free,
	.can_attach	= schedtune_can_attach,
	.cancel_attach	= schedtune_cancel_attach,
	.legacy_cftypes	= files,
	.early_init	= 1,
};

static inline void
schedtune_init_cgroups(void)
{
	struct boost_groups *bg;
	int cpu;

	/* Initialize the per CPU boost groups */
	for_each_possible_cpu(cpu) {
		bg = &per_cpu(cpu_boost_groups, cpu);
		memset(bg, 0, sizeof(struct boost_groups));
		raw_spin_lock_init(&bg->lock);
	}

	pr_info("schedtune: configured to support %d boost groups\n",
		BOOSTGROUPS_COUNT);

	schedtune_initialized = true;
}

#else /* CONFIG_CGROUP_SCHEDTUNE */

int
schedtune_accept_deltas(int nrg_delta, int cap_delta,
			struct task_struct *task)
{
	/* Optimal (O) region */
	if (nrg_delta < 0 && cap_delta > 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, 1, 0);
		return INT_MAX;
	}

	/* Suboptimal (S) region */
	if (nrg_delta > 0 && cap_delta < 0) {
		trace_sched_tune_filter(nrg_delta, cap_delta, 0, 0, -1, 5);
		return -INT_MAX;
	}

	return __schedtune_accept_deltas(nrg_delta, cap_delta,
			perf_boost_idx, perf_constrain_idx);
}

#endif /* CONFIG_CGROUP_SCHEDTUNE */

int
sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
			       void __user *buffer, size_t *lenp,
			       loff_t *ppos)
{
	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
	unsigned threshold_idx;
	int boost_pct;

	if (ret || !write)
		return ret;

	if (sysctl_sched_cfs_boost < -100 || sysctl_sched_cfs_boost > 100)
		return -EINVAL;
	boost_pct = sysctl_sched_cfs_boost;

	/*
	 * Update threshold params for Performance Boost (B)
	 * and Performance Constraint (C) regions.
	 * The current implementation uses the same cuts for both
	 * B and C regions.
	 */
	threshold_idx = clamp(boost_pct, 0, 99) / 10;
	perf_boost_idx = threshold_idx;
	perf_constrain_idx = threshold_idx;

	return 0;
}
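
/*
 * Illustrative usage (assuming the sysctl is registered as
 * "sched_cfs_boost" under /proc/sys/kernel by the corresponding
 * kernel/sysctl.c entry): writing a value in [-100, 100] updates the
 * global boost and the B/C threshold indexes, e.g.:
 *
 *   echo 10 > /proc/sys/kernel/sched_cfs_boost
 */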

#ifdef CONFIG_SCHED_DEBUG
static void
schedtune_test_nrg(unsigned long delta_pwr)
{
	unsigned long test_delta_pwr;
	unsigned long test_norm_pwr;
	int idx;

	/*
	 * Check normalization constants using some constant system
	 * energy values
	 */
	pr_info("schedtune: verify normalization constants...\n");
	for (idx = 0; idx < 6; ++idx) {
		test_delta_pwr = delta_pwr >> idx;

		/* Normalize on max energy for target platform */
		test_norm_pwr = reciprocal_divide(
					test_delta_pwr << SCHED_CAPACITY_SHIFT,
					schedtune_target_nrg.rdiv);

		pr_info("schedtune: max_pwr/2^%d: %4lu => norm_pwr: %5lu\n",
			idx, test_delta_pwr, test_norm_pwr);
	}
}
#else
#define schedtune_test_nrg(delta_pwr)
#endif
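
/*
 * Normalization sketch (illustrative numbers): with min_power = 200 and
 * max_power = 1224, delta_pwr = 1024 and rdiv encodes a division by 1024,
 * so an energy delta d is normalized as (d << SCHED_CAPACITY_SHIFT) / 1024,
 * i.e. energy deltas are rescaled onto the same 0..SCHED_CAPACITY_SCALE
 * range used elsewhere in the scheduler.
 */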

/*
 * Compute the min/max power consumption of a cluster and all its CPUs
 */
static void
schedtune_add_cluster_nrg(
		struct sched_domain *sd,
		struct sched_group *sg,
		struct target_nrg *ste)
{
	struct sched_domain *sd2;
	struct sched_group *sg2;

	struct cpumask *cluster_cpus;
	char str[32];

	unsigned long min_pwr;
	unsigned long max_pwr;
	int cpu;

	/* Get Cluster energy using EM data for the first CPU */
	cluster_cpus = sched_group_cpus(sg);
	snprintf(str, 32, "CLUSTER[%*pbl]",
		 cpumask_pr_args(cluster_cpus));

	min_pwr = sg->sge->idle_states[sg->sge->nr_idle_states - 1].power;
	max_pwr = sg->sge->cap_states[sg->sge->nr_cap_states - 1].power;
	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		str, min_pwr, max_pwr);

	/*
	 * Keep track of this cluster's energy in the computation of the
	 * overall system energy
	 */
	ste->min_power += min_pwr;
	ste->max_power += max_pwr;

	/* Get CPU energy using EM data for each CPU in the group */
	for_each_cpu(cpu, cluster_cpus) {
		/* Get a SD view for the specific CPU */
		for_each_domain(cpu, sd2) {
			/* Get the CPU group */
			sg2 = sd2->groups;
			min_pwr = sg2->sge->idle_states[sg2->sge->nr_idle_states - 1].power;
			max_pwr = sg2->sge->cap_states[sg2->sge->nr_cap_states - 1].power;

			ste->min_power += min_pwr;
			ste->max_power += max_pwr;

			snprintf(str, 32, "CPU[%d]", cpu);
			pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
				str, min_pwr, max_pwr);

			/*
			 * Assume we have EM data only at the CPU and
			 * the upper CLUSTER level
			 */
			BUG_ON(!cpumask_equal(
				sched_group_cpus(sg),
				sched_group_cpus(sd2->parent->groups)
				));
			break;
		}
	}
}

/*
 * Initialize the constants required to compute normalized energy.
 * The values of these constants depend on the EM data for the specific
 * target system and topology.
 * Thus, this function is expected to be called by the code that binds
 * the EM to the topology information.
 */
static int
schedtune_init(void)
{
	struct target_nrg *ste = &schedtune_target_nrg;
	unsigned long delta_pwr = 0;
	struct sched_domain *sd;
	struct sched_group *sg;

	pr_info("schedtune: init normalization constants...\n");
	ste->max_power = 0;
	ste->min_power = 0;

	rcu_read_lock();

	/*
	 * When EAS is in use, we always have a pointer to the highest SD
	 * which provides EM data.
	 */
	sd = rcu_dereference(per_cpu(sd_ea, cpumask_first(cpu_online_mask)));
	if (!sd) {
		pr_info("schedtune: no energy model data\n");
		goto nodata;
	}

	sg = sd->groups;
	do {
		schedtune_add_cluster_nrg(sd, sg, ste);
	} while (sg = sg->next, sg != sd->groups);

	rcu_read_unlock();

	pr_info("schedtune: %-17s min_pwr: %5lu max_pwr: %5lu\n",
		"SYSTEM", ste->min_power, ste->max_power);

	/* Compute normalization constants */
	delta_pwr = ste->max_power - ste->min_power;
	ste->rdiv = reciprocal_value(delta_pwr);
	pr_info("schedtune: using normalization constants mul: %u sh1: %u sh2: %u\n",
		ste->rdiv.m, ste->rdiv.sh1, ste->rdiv.sh2);

	schedtune_test_nrg(delta_pwr);

#ifdef CONFIG_CGROUP_SCHEDTUNE
	schedtune_init_cgroups();
#else
	pr_info("schedtune: configured to support global boosting only\n");
#endif

	schedtune_spc_rdiv = reciprocal_value(100);

	return 0;

nodata:
	pr_warning("schedtune: disabled!\n");
	rcu_read_unlock();
	return -EINVAL;
}
postcore_initcall(schedtune_init);