1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * walt.c
4 *
5 * Window Assistant Load Tracking
6 *
7 * This software is licensed under the terms of the GNU General Public
8 * License version 2, as published by the Free Software Foundation, and
9 * may be copied, distributed, and modified under those terms.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 */
17
18 #include <linux/syscore_ops.h>
19 #include <linux/cpufreq.h>
20 #include <linux/list_sort.h>
21 #include <linux/jiffies.h>
22 #include <linux/sched/stat.h>
23 #include <trace/events/sched.h>
24 #include "sched.h"
25 #include "walt.h"
26 #include "core_ctl.h"
27 #include "rtg/rtg.h"
28 #define CREATE_TRACE_POINTS
29 #include <trace/events/walt.h>
30 #undef CREATE_TRACE_POINTS
31
/*
 * Printable names for the WALT task events, one string per event.
 * NOTE(review): presumably indexed by the task event enumeration declared
 * outside this file section - confirm the order matches.
 */
const char *task_event_names[] = {"PUT_PREV_TASK", "PICK_NEXT_TASK",
				  "TASK_WAKE", "TASK_MIGRATE", "TASK_UPDATE",
				  "IRQ_UPDATE"};
/*
 * Printable names for the load migration types.
 * NOTE(review): presumably indexed by a migrate-type enumeration - confirm.
 */
const char *migrate_type_names[] = {"GROUP_TO_RQ", "RQ_TO_GROUP",
				    "RQ_TO_RQ", "GROUP_TO_GROUP"};
37
/*
 * Value returned by account_busy_for_cpu_time() for time a task spent
 * queued but not running (0: such time is not CPU busy time).
 */
#define SCHED_FREQ_ACCOUNT_WAIT_TIME 0
/*
 * Value returned by account_busy_for_task_demand() for time a task spent
 * queued but not running (1: such time counts towards task demand).
 */
#define SCHED_ACCOUNT_WAIT_TIME 1

/* Timestamp captured by sched_suspend(); reported while suspended. */
static ktime_t ktime_last;
/* Set by sched_suspend(), cleared by sched_resume(). */
static bool sched_ktime_suspended;
DEFINE_MUTEX(cluster_lock);
/* Seeded with the first window start in set_window_start(). */
static atomic64_t walt_irq_work_lastq_ws;
/* Initialised from walt_irq_work_lastq_ws in set_window_start(). */
u64 walt_load_reported_window;

static struct irq_work walt_cpufreq_irq_work;
/* Queued by fixup_busy_time() when a task changes frequency domain. */
static struct irq_work walt_migration_irq_work;
49
sched_ktime_clock(void)50 u64 sched_ktime_clock(void)
51 {
52 if (unlikely(sched_ktime_suspended))
53 return ktime_to_ns(ktime_last);
54 return ktime_get_ns();
55 }
56
sched_resume(void)57 static void sched_resume(void)
58 {
59 sched_ktime_suspended = false;
60 }
61
sched_suspend(void)62 static int sched_suspend(void)
63 {
64 ktime_last = ktime_get();
65 sched_ktime_suspended = true;
66 return 0;
67 }
68
69 static struct syscore_ops sched_syscore_ops = {
70 .resume = sched_resume,
71 .suspend = sched_suspend
72 };
73
sched_init_ops(void)74 static int __init sched_init_ops(void)
75 {
76 register_syscore_ops(&sched_syscore_ops);
77 return 0;
78 }
79 late_initcall(sched_init_ops);
80
acquire_rq_locks_irqsave(const cpumask_t * cpus,unsigned long * flags)81 static void acquire_rq_locks_irqsave(const cpumask_t *cpus,
82 unsigned long *flags)
83 {
84 int cpu;
85 int level = 0;
86
87 local_irq_save(*flags);
88 for_each_cpu(cpu, cpus) {
89 if (level == 0)
90 raw_spin_lock(&cpu_rq(cpu)->lock);
91 else
92 raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
93 level++;
94 }
95 }
96
release_rq_locks_irqrestore(const cpumask_t * cpus,unsigned long * flags)97 static void release_rq_locks_irqrestore(const cpumask_t *cpus,
98 unsigned long *flags)
99 {
100 int cpu;
101
102 for_each_cpu(cpu, cpus)
103 raw_spin_unlock(&cpu_rq(cpu)->lock);
104 local_irq_restore(*flags);
105 }
106
#ifdef CONFIG_HZ_300
/*
 * With HZ=300 the tick interval is rounded to 3333333 ns, so the minimum
 * window is expressed as six ticks (19999998 ns, just under 20ms).
 */
#define MIN_SCHED_RAVG_WINDOW (3333333 * 6)
#else
/* Min window size (in ns) = 20ms */
#define MIN_SCHED_RAVG_WINDOW 20000000
#endif

/* Max window size (in ns) = 1s */
#define MAX_SCHED_RAVG_WINDOW 1000000000
120
/* 1 -> use PELT based load stats, 0 -> use window-based load stats */
unsigned int __read_mostly walt_disabled;

__read_mostly unsigned int sysctl_sched_cpu_high_irqload = (10 * NSEC_PER_MSEC);

/*
 * sched_window_stats_policy and sched_ravg_hist_size have a 'sysctl' copy
 * associated with them. This is required for atomic update of those variables
 * when being modified via the sysctl interface.
 *
 * IMPORTANT: Initialize both copies to the same value!
 */

__read_mostly unsigned int sched_ravg_hist_size = 5;
__read_mostly unsigned int sysctl_sched_ravg_hist_size = 5;

__read_mostly unsigned int sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;
__read_mostly unsigned int sysctl_sched_window_stats_policy = WINDOW_STATS_MAX_RECENT_AVG;

/* When non-zero, cpu_is_waiting_on_io() reports the rq's nr_iowait count. */
static __read_mostly unsigned int sched_io_is_busy = 1;

unsigned int sysctl_sched_use_walt_cpu_util = 1;
unsigned int sysctl_sched_use_walt_task_util = 1;
unsigned int sysctl_sched_walt_init_task_load_pct = 15;
__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload = (10 * NSEC_PER_MSEC);

/* Window size (in ns) */
__read_mostly unsigned int sched_ravg_window = MIN_SCHED_RAVG_WINDOW;

/*
 * An after-boot constant divisor for cpu_util_freq_walt() to apply the load
 * boost.
 */
__read_mostly unsigned int walt_cpu_util_freq_divisor;

/* Initial task load. Newly created tasks are assigned this load. */
unsigned int __read_mostly sched_init_task_load_windows;
unsigned int __read_mostly sched_init_task_load_windows_scaled;
unsigned int __read_mostly sysctl_sched_init_task_load_pct = 15;

/*
 * Maximum possible frequency across all cpus. Task demand and cpu
 * capacity (cpu_power) metrics are scaled in reference to it.
 */
unsigned int max_possible_freq = 1;

/*
 * Minimum possible max_freq across all cpus. This will be the same as
 * max_possible_freq on homogeneous systems and could be different from
 * max_possible_freq on heterogeneous systems.
 * NOTE(review): the original comment ended mid-sentence with "min_max_freq
 * is used to derive"; what it is used to derive is not visible here.
 */
unsigned int min_max_freq = 1;

unsigned int max_capacity = 1024; /* max(rq->capacity) */
unsigned int min_capacity = 1024; /* min(rq->capacity) */
unsigned int max_possible_capacity = 1024; /* max(rq->max_possible_capacity) */
unsigned int
min_max_possible_capacity = 1024; /* min(rq->max_possible_capacity) */

/* Temporarily disable window-stats activity on all cpus */
unsigned int __read_mostly sched_disable_window_stats;

/*
 * This governs what load needs to be used when reporting CPU busy time
 * to the cpufreq governor.
 */
__read_mostly unsigned int sysctl_sched_freq_reporting_policy;
188
set_sched_ravg_window(char * str)189 static int __init set_sched_ravg_window(char *str)
190 {
191 unsigned int window_size;
192
193 get_option(&str, &window_size);
194
195 if (window_size < MIN_SCHED_RAVG_WINDOW ||
196 window_size > MAX_SCHED_RAVG_WINDOW) {
197 WARN_ON(1);
198 return -EINVAL;
199 }
200
201 sched_ravg_window = window_size;
202 return 0;
203 }
204 early_param("sched_ravg_window", set_sched_ravg_window);
205
/*
 * Divisor used by scale_demand() to turn a demand value into its scaled
 * form.
 * NOTE(review): it is not assigned anywhere in this part of the file; it
 * must be non-zero before scale_demand() is evaluated - confirm where it
 * is initialised.
 */
__read_mostly unsigned int walt_scale_demand_divisor;
#define scale_demand(d) ((d)/walt_scale_demand_divisor)
208
/*
 * Add task @p to the WALT statistics of runqueue @rq by forwarding to
 * walt_inc_cumulative_runnable_avg().
 */
void inc_rq_walt_stats(struct rq *rq, struct task_struct *p)
{
	walt_inc_cumulative_runnable_avg(rq, p);
}
213
/*
 * Remove task @p from the WALT statistics of runqueue @rq by forwarding to
 * walt_dec_cumulative_runnable_avg().
 */
void dec_rq_walt_stats(struct rq *rq, struct task_struct *p)
{
	walt_dec_cumulative_runnable_avg(rq, p);
}
218
/*
 * Propagate a change of @p's scaled demand into @rq's aggregates.
 *
 * The signed difference between @updated_demand_scaled and the task's
 * currently recorded demand_scaled is applied to both the cumulative
 * runnable average and the cumulative window demand of @rq. The task's own
 * demand_scaled field is not modified here.
 */
void fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p,
				   u16 updated_demand_scaled)
{
	s64 demand_diff = (s64)updated_demand_scaled;

	demand_diff -= p->ravg.demand_scaled;

	fixup_cumulative_runnable_avg(&rq->walt_stats, demand_diff);
	walt_fixup_cum_window_demand(rq, demand_diff);
}
229
230 static u64
update_window_start(struct rq * rq,u64 wallclock,int event)231 update_window_start(struct rq *rq, u64 wallclock, int event)
232 {
233 s64 delta;
234 int nr_windows;
235 u64 old_window_start = rq->window_start;
236
237 delta = wallclock - rq->window_start;
238 BUG_ON(delta < 0);
239 if (delta < sched_ravg_window)
240 return old_window_start;
241
242 nr_windows = div64_u64(delta, sched_ravg_window);
243 rq->window_start += (u64)nr_windows * (u64)sched_ravg_window;
244
245 rq->cum_window_demand_scaled =
246 rq->walt_stats.cumulative_runnable_avg_scaled;
247
248 return old_window_start;
249 }
250
/*
 * Account @delta ns of interrupt time against @cpu.
 *
 * @curr:      task that was current when the interrupt time accrued.
 * @delta:     interrupt time; extended by the time elapsed since @wallclock.
 * @wallclock: sched_clock() based timestamp at which @delta was sampled.
 *
 * If @curr is the idle task, the time is also fed into window accounting
 * via update_task_ravg(IRQ_UPDATE). The per-rq irqload average is decayed
 * based on the jiffies elapsed since the last update and the interrupt
 * time is added to the current irqload. Runs under rq->lock with
 * interrupts disabled.
 */
void sched_account_irqtime(int cpu, struct task_struct *curr,
			   u64 delta, u64 wallclock)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags, elapsed_jiffies;
	u64 now_jiffies;

	raw_spin_lock_irqsave(&rq->lock, flags);

	/* @wallclock comes from sched_clock(), so measure the gap with it too. */
	delta += sched_clock() - wallclock;
	now_jiffies = get_jiffies_64();

	if (is_idle_task(curr))
		update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(),
				 delta);

	elapsed_jiffies = now_jiffies - rq->irqload_ts;

	if (elapsed_jiffies) {
		if (elapsed_jiffies >= 10) {
			/* Too old to matter: drop the history entirely. */
			rq->avg_irqload = 0;
		} else {
			/*
			 * Decay the CPU's irqload.
			 * NOTE(review): the elapsed count appears in both the
			 * multiplier and the divisor, so the net factor is
			 * 3/4 regardless of how many jiffies passed.
			 */
			rq->avg_irqload *= (3 * elapsed_jiffies);
			rq->avg_irqload = div64_u64(rq->avg_irqload,
						    4 * elapsed_jiffies);
		}

		/* Fold the finished sample into the average and restart it. */
		rq->avg_irqload += rq->cur_irqload;
		rq->cur_irqload = 0;
	}

	rq->cur_irqload += delta;
	rq->irqload_ts = now_jiffies;

	raw_spin_unlock_irqrestore(&rq->lock, flags);
}
290
291 static int
account_busy_for_task_demand(struct rq * rq,struct task_struct * p,int event)292 account_busy_for_task_demand(struct rq *rq, struct task_struct *p, int event)
293 {
294 /*
295 * No need to bother updating task demand for exiting tasks
296 * or the idle task.
297 */
298 if (exiting_task(p) || is_idle_task(p))
299 return 0;
300
301 /*
302 * When a task is waking up it is completing a segment of non-busy
303 * time. Likewise, if wait time is not treated as busy time, then
304 * when a task begins to run or is migrated, it is not running and
305 * is completing a segment of non-busy time.
306 */
307 if (event == TASK_WAKE || (!SCHED_ACCOUNT_WAIT_TIME &&
308 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
309 return 0;
310
311 /*
312 * The idle exit time is not accounted for the first task _picked_ up to
313 * run on the idle CPU.
314 */
315 if (event == PICK_NEXT_TASK && rq->curr == rq->idle)
316 return 0;
317
318 /*
319 * TASK_UPDATE can be called on sleeping task, when its moved between
320 * related groups
321 */
322 if (event == TASK_UPDATE) {
323 if (rq->curr == p)
324 return 1;
325
326 return p->on_rq ? SCHED_ACCOUNT_WAIT_TIME : 0;
327 }
328
329 return 1;
330 }
331
332 /*
333 * In this function we match the accumulated subtractions with the current
334 * and previous windows we are operating with. Ignore any entries where
335 * the window start in the load_subtraction struct does not match either
336 * the curent or the previous window. This could happen whenever CPUs
337 * become idle or busy with interrupts disabled for an extended period.
338 */
account_load_subtractions(struct rq * rq)339 static inline void account_load_subtractions(struct rq *rq)
340 {
341 u64 ws = rq->window_start;
342 u64 prev_ws = ws - sched_ravg_window;
343 struct load_subtractions *ls = rq->load_subs;
344 int i;
345
346 for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
347 if (ls[i].window_start == ws) {
348 rq->curr_runnable_sum -= ls[i].subs;
349 rq->nt_curr_runnable_sum -= ls[i].new_subs;
350 } else if (ls[i].window_start == prev_ws) {
351 rq->prev_runnable_sum -= ls[i].subs;
352 rq->nt_prev_runnable_sum -= ls[i].new_subs;
353 }
354
355 ls[i].subs = 0;
356 ls[i].new_subs = 0;
357 }
358
359 BUG_ON((s64)rq->prev_runnable_sum < 0);
360 BUG_ON((s64)rq->curr_runnable_sum < 0);
361 BUG_ON((s64)rq->nt_prev_runnable_sum < 0);
362 BUG_ON((s64)rq->nt_curr_runnable_sum < 0);
363 }
364
/*
 * Re-initialise slot @index of rq->load_subs[] so that it tracks the
 * window starting at @ws with no pending subtractions.
 */
static inline void create_subtraction_entry(struct rq *rq, u64 ws, int index)
{
	struct load_subtractions *entry = &rq->load_subs[index];

	entry->window_start = ws;
	entry->subs = 0;
	entry->new_subs = 0;
}
371
/*
 * Find the rq->load_subs[] slot that tracks the window starting at @ws.
 *
 * Returns the index of the slot whose window_start equals @ws. If no slot
 * matches, the slot with the smallest window_start is re-initialised for
 * @ws through create_subtraction_entry() and its index is returned.
 *
 * The function returns int, not bool: the result is an array index, and a
 * bool return type would collapse any index greater than 1 to 1.
 */
static int get_subtraction_index(struct rq *rq, u64 ws)
{
	int i;
	u64 oldest = ULLONG_MAX;
	int oldest_index = 0;

	for (i = 0; i < NUM_TRACKED_WINDOWS; i++) {
		u64 entry_ws = rq->load_subs[i].window_start;

		if (ws == entry_ws)
			return i;

		/* Remember the stalest slot as the eviction candidate. */
		if (entry_ws < oldest) {
			oldest = entry_ws;
			oldest_index = i;
		}
	}

	create_subtraction_entry(rq, ws, oldest_index);
	return oldest_index;
}
393
/*
 * Queue @sub_load for later subtraction in slot @index of rq->load_subs[].
 * When @new_task is set, the load is also queued against the slot's
 * new-task counter.
 */
static void update_rq_load_subtractions(int index, struct rq *rq,
					u32 sub_load, bool new_task)
{
	struct load_subtractions *entry = &rq->load_subs[index];

	entry->subs += sub_load;
	if (new_task)
		entry->new_subs += sub_load;
}
401
/*
 * Queue the removal of @p's per-CPU window contributions from every other
 * CPU in @cpu's cluster.
 *
 * @p:        task whose contributions are being removed.
 * @cpu:      CPU excluded from the walk (its cluster is the one processed).
 * @ws:       start of the current window; the previous window is derived
 *            from it using sched_ravg_window.
 * @new_task: also record the load against the new-task counters.
 *
 * For each remaining CPU, a non-zero current/previous window contribution
 * is recorded in that CPU's load_subs[] table and then zeroed in the task.
 * The walk runs under the cluster's load_lock.
 */
void update_cluster_load_subtractions(struct task_struct *p,
				      int cpu, u64 ws, bool new_task)
{
	struct sched_cluster *cluster = cpu_cluster(cpu);
	struct cpumask siblings = cluster->cpus;
	u64 last_ws = ws - sched_ravg_window;
	int sibling;

	cpumask_clear_cpu(cpu, &siblings);

	raw_spin_lock(&cluster->load_lock);

	for_each_cpu(sibling, &siblings) {
		struct rq *rq = cpu_rq(sibling);
		u32 *curr_contrib = &p->ravg.curr_window_cpu[sibling];
		u32 *prev_contrib = &p->ravg.prev_window_cpu[sibling];
		int slot;

		if (*curr_contrib) {
			slot = get_subtraction_index(rq, ws);
			update_rq_load_subtractions(slot, rq, *curr_contrib,
						    new_task);
			*curr_contrib = 0;
		}

		if (*prev_contrib) {
			slot = get_subtraction_index(rq, last_ws);
			update_rq_load_subtractions(slot, rq, *prev_contrib,
						    new_task);
			*prev_contrib = 0;
		}
	}

	raw_spin_unlock(&cluster->load_lock);
}
434
/*
 * Transfer @p's busy-time contribution from @task_cpu's runqueue to
 * @new_cpu's runqueue when the two CPUs are in different frequency
 * domains; nothing is done when they share one.
 *
 * The destination receives the task's whole current/previous window
 * totals. The source gives up only what the task accumulated on
 * @task_cpu itself; contributions left on the other CPUs of the source
 * cluster are queued for removal via update_cluster_load_subtractions().
 * When @new_task is set, the new-task sums are adjusted the same way.
 * It is a BUG for any source sum to go negative.
 */
static inline void inter_cluster_migration_fixup
	(struct task_struct *p, int new_cpu, int task_cpu, bool new_task)
{
	struct rq *dst = cpu_rq(new_cpu);
	struct rq *src = cpu_rq(task_cpu);
	u32 *curr_cpu = p->ravg.curr_window_cpu;
	u32 *prev_cpu = p->ravg.prev_window_cpu;

	if (same_freq_domain(new_cpu, task_cpu))
		return;

	/* The destination CPU now owns the task's entire window totals. */
	curr_cpu[new_cpu] = p->ravg.curr_window;
	prev_cpu[new_cpu] = p->ravg.prev_window;

	dst->curr_runnable_sum += p->ravg.curr_window;
	dst->prev_runnable_sum += p->ravg.prev_window;

	src->curr_runnable_sum -= curr_cpu[task_cpu];
	src->prev_runnable_sum -= prev_cpu[task_cpu];

	if (new_task) {
		dst->nt_curr_runnable_sum += p->ravg.curr_window;
		dst->nt_prev_runnable_sum += p->ravg.prev_window;

		src->nt_curr_runnable_sum -= curr_cpu[task_cpu];
		src->nt_prev_runnable_sum -= prev_cpu[task_cpu];
	}

	curr_cpu[task_cpu] = 0;
	prev_cpu[task_cpu] = 0;

	update_cluster_load_subtractions(p, task_cpu,
					 src->window_start, new_task);

	BUG_ON((s64)src->prev_runnable_sum < 0);
	BUG_ON((s64)src->curr_runnable_sum < 0);
	BUG_ON((s64)src->nt_prev_runnable_sum < 0);
	BUG_ON((s64)src->nt_curr_runnable_sum < 0);
}
474
/*
 * Move task @p's window-based busy-time accounting from its current
 * runqueue to the runqueue of @new_cpu ahead of a migration.
 *
 * Nothing is done for exiting tasks, or for tasks that are neither queued
 * nor in TASK_WAKING state. For a TASK_WAKING task both runqueue locks are
 * taken here and released before returning.
 * NOTE(review): for the other case the caller presumably already holds the
 * required runqueue locks - confirm against the call sites.
 *
 * If the two CPUs are in different frequency domains, the migration irq
 * work is queued once the sums have been fixed up.
 */
void fixup_busy_time(struct task_struct *p, int new_cpu)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dest_rq = cpu_rq(new_cpu);
	u64 wallclock;
	bool new_task;
#ifdef CONFIG_SCHED_RTG
	u64 *src_curr_runnable_sum, *dst_curr_runnable_sum;
	u64 *src_prev_runnable_sum, *dst_prev_runnable_sum;
	u64 *src_nt_curr_runnable_sum, *dst_nt_curr_runnable_sum;
	u64 *src_nt_prev_runnable_sum, *dst_nt_prev_runnable_sum;
	struct related_thread_group *grp;
#endif

	if (!p->on_rq && p->state != TASK_WAKING)
		return;

	if (exiting_task(p))
		return;

	if (p->state == TASK_WAKING)
		double_rq_lock(src_rq, dest_rq);

	if (sched_disable_window_stats)
		goto done;

	wallclock = sched_ktime_clock();

	/*
	 * Bring the running task of both runqueues, and then the migrating
	 * task itself, up to date at the same wallclock before moving sums.
	 */
	update_task_ravg(task_rq(p)->curr, task_rq(p),
			 TASK_UPDATE,
			 wallclock, 0);
	update_task_ravg(dest_rq->curr, dest_rq,
			 TASK_UPDATE, wallclock, 0);

	update_task_ravg(p, task_rq(p), TASK_MIGRATE,
			 wallclock, 0);

	/*
	 * When a task is migrating during the wakeup, adjust
	 * the task's contribution towards cumulative window
	 * demand.
	 */
	if (p->state == TASK_WAKING && p->last_sleep_ts >=
				       src_rq->window_start) {
		walt_fixup_cum_window_demand(src_rq,
					     -(s64)p->ravg.demand_scaled);
		walt_fixup_cum_window_demand(dest_rq, p->ravg.demand_scaled);
	}

	new_task = is_new_task(p);
#ifdef CONFIG_SCHED_RTG
	/* Protected by rq_lock */
	grp = task_related_thread_group(p);

	/*
	 * For frequency aggregation, we continue to do migration fixups
	 * even for intra cluster migrations. This is because the aggregated
	 * load has to be reported on a single CPU regardless.
	 */
	if (grp) {
		struct group_cpu_time *cpu_time;

		/* Grouped tasks are accounted in the per-rq group sums. */
		cpu_time = &src_rq->grp_time;
		src_curr_runnable_sum = &cpu_time->curr_runnable_sum;
		src_prev_runnable_sum = &cpu_time->prev_runnable_sum;
		src_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
		src_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;

		cpu_time = &dest_rq->grp_time;
		dst_curr_runnable_sum = &cpu_time->curr_runnable_sum;
		dst_prev_runnable_sum = &cpu_time->prev_runnable_sum;
		dst_nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
		dst_nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;

		/* Move the current-window contribution, if any. */
		if (p->ravg.curr_window) {
			*src_curr_runnable_sum -= p->ravg.curr_window;
			*dst_curr_runnable_sum += p->ravg.curr_window;
			if (new_task) {
				*src_nt_curr_runnable_sum -=
							p->ravg.curr_window;
				*dst_nt_curr_runnable_sum +=
							p->ravg.curr_window;
			}
		}

		/* Move the previous-window contribution, if any. */
		if (p->ravg.prev_window) {
			*src_prev_runnable_sum -= p->ravg.prev_window;
			*dst_prev_runnable_sum += p->ravg.prev_window;
			if (new_task) {
				*src_nt_prev_runnable_sum -=
							p->ravg.prev_window;
				*dst_nt_prev_runnable_sum +=
							p->ravg.prev_window;
			}
		}
	} else {
#endif
		/* Ungrouped task: fix up the plain per-rq sums. */
		inter_cluster_migration_fixup(p, new_cpu,
					      task_cpu(p), new_task);
#ifdef CONFIG_SCHED_RTG
	}
#endif

	if (!same_freq_domain(new_cpu, task_cpu(p)))
		irq_work_queue(&walt_migration_irq_work);

done:
	if (p->state == TASK_WAKING)
		double_rq_unlock(src_rq, dest_rq);
}
585
/*
 * Give @rq its initial window_start; a no-op once it is non-zero.
 *
 * The very first caller sets window_start to 1 and seeds
 * walt_irq_work_lastq_ws and walt_load_reported_window from it. Later
 * callers copy window_start from the runqueue of an online CPU and zero
 * their own busy-time sums.
 *
 * Entered with rq->lock held. In the copy path the lock is dropped and
 * re-taken together with the reference runqueue's lock; only the reference
 * lock is released again, so rq->lock is still held on return.
 * NOTE(review): this relies on the CPU picked by cpumask_any() not being
 * @rq's own CPU - confirm the calling context guarantees that.
 */
void set_window_start(struct rq *rq)
{
	/* Set once the first runqueue has established a window start. */
	static int sync_cpu_available;

	if (likely(rq->window_start))
		return;

	if (!sync_cpu_available) {
		rq->window_start = 1;
		sync_cpu_available = 1;
		atomic64_set(&walt_irq_work_lastq_ws, rq->window_start);
		walt_load_reported_window =
					atomic64_read(&walt_irq_work_lastq_ws);

	} else {
		struct rq *sync_rq = cpu_rq(cpumask_any(cpu_online_mask));

		/* Re-acquire through double_rq_lock() to take both locks. */
		raw_spin_unlock(&rq->lock);
		double_rq_lock(rq, sync_rq);
		rq->window_start = sync_rq->window_start;
		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
		rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
		raw_spin_unlock(&sync_rq->lock);
	}

	rq->curr->ravg.mark_start = rq->window_start;
}
613
/*
 * Called when a new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 *
 * @runtime is pushed 'samples' times onto the front of the task's
 * sum_history[], older entries are shifted towards the end, and the task's
 * demand is recomputed according to sched_window_stats_policy. Windows
 * without activity, the idle task and exiting tasks leave the history
 * untouched; the tracepoint is emitted in every case.
 */
static void update_history(struct rq *rq, struct task_struct *p,
			   u32 runtime, int samples, int event)
{
	u32 *hist = &p->ravg.sum_history[0];
	int ridx, widx;
	u32 max = 0, avg, demand;
	u64 sum = 0;
	u16 demand_scaled;

	/* Ignore windows where task had no activity */
	if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
		goto done;

	/*
	 * Push new 'runtime' value onto stack: first shift the surviving
	 * entries 'samples' places towards the end, tracking their sum and
	 * maximum along the way.
	 */
	widx = sched_ravg_hist_size - 1;
	ridx = widx - samples;
	for (; ridx >= 0; --widx, --ridx) {
		hist[widx] = hist[ridx];
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	/* Then fill the freed leading slots with the new sample. */
	for (widx = 0; widx < samples && widx < sched_ravg_hist_size; widx++) {
		hist[widx] = runtime;
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	p->ravg.sum = 0;

	/* Derive the demand from the history according to the policy. */
	if (sched_window_stats_policy == WINDOW_STATS_RECENT) {
		demand = runtime;
	} else if (sched_window_stats_policy == WINDOW_STATS_MAX) {
		demand = max;
	} else {
		avg = div64_u64(sum, sched_ravg_hist_size);
		if (sched_window_stats_policy == WINDOW_STATS_AVG)
			demand = avg;
		else
			demand = max(avg, runtime);
	}
	demand_scaled = scale_demand(demand);

	/*
	 * A throttled deadline sched class task gets dequeued without
	 * changing p->on_rq. Since the dequeue decrements walt stats
	 * avoid decrementing it here again.
	 *
	 * When window is rolled over, the cumulative window demand
	 * is reset to the cumulative runnable average (contribution from
	 * the tasks on the runqueue). If the current task is dequeued
	 * already, its demand is not included in the cumulative runnable
	 * average. So add the task demand separately to cumulative window
	 * demand.
	 */
	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
		if (task_on_rq_queued(p)
		    && p->sched_class->fixup_walt_sched_stats)
			p->sched_class->fixup_walt_sched_stats(rq, p,
							       demand_scaled);
		else if (rq->curr == p)
			walt_fixup_cum_window_demand(rq, demand_scaled);
	}

	/* Publish the new demand only after the rq stats were fixed up. */
	p->ravg.demand = demand;
	p->ravg.demand_scaled = demand_scaled;

done:
	trace_sched_update_history(rq, p, runtime, samples, event);
}
692
/*
 * Divide X by Y with div64_u64(), rounding the quotient up.
 * Both arguments are fully parenthesised so that expression arguments
 * (e.g. "1 << n") expand correctly. Y is evaluated twice, so it must not
 * have side effects.
 */
#define DIV64_U64_ROUNDUP(X, Y) div64_u64((X) + ((Y) - 1), (Y))
694
/*
 * Scale @delta with scale_exec_time() for @rq and add it to the task's
 * running window sum, clamping the sum at one window (sched_ravg_window).
 *
 * Returns the scaled delta, before any clamping of the sum.
 */
static u64 add_to_task_demand(struct rq *rq, struct task_struct *p, u64 delta)
{
	u64 scaled = scale_exec_time(delta, rq);

	p->ravg.sum += scaled;

	/* A task cannot have been busy for longer than the window itself. */
	if (unlikely(p->ravg.sum > sched_ravg_window))
		p->ravg.sum = sched_ravg_window;

	return scaled;
}
704
/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *	a) Task event is contained within one window.
 *		window_start < mark_start < wallclock
 *
 *		ws   ms  wc
 *		|    |   |
 *		V    V   V
 *		|---------------|
 *
 *	In this case, p->ravg.sum is updated *iff* event is appropriate
 *	(ex: event == PUT_PREV_TASK)
 *
 *	b) Task event spans two windows.
 *		mark_start < window_start < wallclock
 *
 *		ms   ws   wc
 *		|    |    |
 *		V    V    V
 *		-----|-------------------
 *
 *	In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *	is appropriate, then a new window sample is recorded followed
 *	by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *	c) Task event spans more than two windows.
 *
 *		ms ws_tmp                          ws  wc
 *		|  |                               |   |
 *		V  V                               V   V
 *		---|-------|-------|-------|-------|------
 *		   |                               |
 *		   |<------ nr_full_windows ------>|
 *
 *	In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *	event is appropriate, window sample of p->ravg.sum is recorded,
 *	'nr_full_window' samples of window_size is also recorded *iff*
 *	event is appropriate and finally p->ravg.sum is set to (wc - ws)
 *	*iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 *
 * Returns the scaled busy time accounted by this call, or 0 when the event
 * does not count as busy time for task demand.
 */
static u64 update_task_demand(struct task_struct *p, struct rq *rq,
			      int event, u64 wallclock)
{
	u64 mark_start = p->ravg.mark_start;
	u64 delta, window_start = rq->window_start;
	int new_window, nr_full_windows;
	u32 window_size = sched_ravg_window;
	u64 runtime;

#ifdef CONFIG_SCHED_RTG
	update_group_demand(p, rq, event, wallclock);
#endif

	/* A window boundary was crossed since the task was last marked. */
	new_window = mark_start < window_start;
	if (!account_busy_for_task_demand(rq, p, event)) {
		if (new_window)
			/*
			 * If the time accounted isn't being accounted as
			 * busy time, and a new window started, only the
			 * previous window need be closed out with the
			 * pre-existing demand. Multiple windows may have
			 * elapsed, but since empty windows are dropped,
			 * it is not necessary to account those.
			 */
			update_history(rq, p, p->ravg.sum, 1, event);
		return 0;
	}

	if (!new_window) {
		/*
		 * The simple case - busy time contained within the existing
		 * window.
		 */
		return add_to_task_demand(rq, p, wallclock - mark_start);
	}

	/*
	 * Busy time spans at least two windows. Temporarily rewind
	 * window_start to first window boundary after mark_start.
	 */
	delta = window_start - mark_start;
	nr_full_windows = div64_u64(delta, window_size);
	window_start -= (u64)nr_full_windows * (u64)window_size;

	/* Process (window_start - mark_start) first */
	runtime = add_to_task_demand(rq, p, window_start - mark_start);

	/* Push new sample(s) into task's demand history */
	update_history(rq, p, p->ravg.sum, 1, event);
	if (nr_full_windows) {
		/* Each fully elapsed window counts as one whole scaled window. */
		u64 scaled_window = scale_exec_time(window_size, rq);

		update_history(rq, p, scaled_window, nr_full_windows, event);
		runtime += nr_full_windows * scaled_window;
	}

	/*
	 * Roll window_start back to current to process any remainder
	 * in current window.
	 */
	window_start += (u64)nr_full_windows * (u64)window_size;

	/* Process (wallclock - window_start) next */
	mark_start = window_start;
	runtime += add_to_task_demand(rq, p, wallclock - mark_start);

	return runtime;
}
823
824 static u32 empty_windows[NR_CPUS];
825
rollover_task_window(struct task_struct * p,bool full_window)826 static void rollover_task_window(struct task_struct *p, bool full_window)
827 {
828 u32 *curr_cpu_windows = empty_windows;
829 u32 curr_window;
830 int i;
831
832 /* Rollover the sum */
833 curr_window = 0;
834
835 if (!full_window) {
836 curr_window = p->ravg.curr_window;
837 curr_cpu_windows = p->ravg.curr_window_cpu;
838 }
839
840 p->ravg.prev_window = curr_window;
841 p->ravg.curr_window = 0;
842
843 /* Roll over individual CPU contributions */
844 for (i = 0; i < nr_cpu_ids; i++) {
845 p->ravg.prev_window_cpu[i] = curr_cpu_windows[i];
846 p->ravg.curr_window_cpu[i] = 0;
847 }
848 }
849
/*
 * Roll @rq's busy-time sums over at a window boundary.
 *
 * The current sums (overall and new-task) become the previous sums and the
 * current sums restart at zero. If @full_window is set, the previous sums
 * are zeroed instead, since a whole window passed without accounting.
 */
static void rollover_cpu_window(struct rq *rq, bool full_window)
{
	if (unlikely(full_window)) {
		rq->prev_runnable_sum = 0;
		rq->nt_prev_runnable_sum = 0;
	} else {
		rq->prev_runnable_sum = rq->curr_runnable_sum;
		rq->nt_prev_runnable_sum = rq->nt_curr_runnable_sum;
	}

	rq->curr_runnable_sum = 0;
	rq->nt_curr_runnable_sum = 0;
}
866
cpu_is_waiting_on_io(struct rq * rq)867 static inline int cpu_is_waiting_on_io(struct rq *rq)
868 {
869 if (!sched_io_is_busy)
870 return 0;
871
872 return atomic_read(&rq->nr_iowait);
873 }
874
/*
 * Decide whether the time since @p's mark_start counts as CPU busy time
 * on @rq for the given @event.
 *
 * @irqtime: interrupt time being reported; only relevant for the idle task.
 *
 * Returns non-zero when the time should be accounted, 0 otherwise.
 */
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
				     u64 irqtime, int event)
{
	if (is_idle_task(p)) {
		/* TASK_WAKE and TASK_MIGRATE cannot happen on the idle task. */
		if (event == PICK_NEXT_TASK)
			return 0;

		/*
		 * PUT_PREV_TASK, TASK_UPDATE and IRQ_UPDATE remain: idle
		 * time is busy only if it was interrupt time or the CPU
		 * is waiting on I/O.
		 */
		return irqtime || cpu_is_waiting_on_io(rq);
	}

	switch (event) {
	case TASK_WAKE:
		return 0;

	case PUT_PREV_TASK:
	case IRQ_UPDATE:
		return 1;

	case TASK_UPDATE:
		/*
		 * TASK_UPDATE may target a sleeping task (e.g. when it moves
		 * between related groups): running counts, queued counts as
		 * wait time, sleeping does not count.
		 */
		if (rq->curr == p)
			return 1;
		return p->on_rq ? SCHED_FREQ_ACCOUNT_WAIT_TIME : 0;

	default:
		/* TASK_MIGRATE and PICK_NEXT_TASK: the task was waiting. */
		return SCHED_FREQ_ACCOUNT_WAIT_TIME;
	}
}
907
908 /*
909 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
910 */
update_cpu_busy_time(struct task_struct * p,struct rq * rq,int event,u64 wallclock,u64 irqtime)911 static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
912 int event, u64 wallclock, u64 irqtime)
913 {
914 int new_window, full_window = 0;
915 int p_is_curr_task = (p == rq->curr);
916 u64 mark_start = p->ravg.mark_start;
917 u64 window_start = rq->window_start;
918 u32 window_size = sched_ravg_window;
919 u64 delta;
920 u64 *curr_runnable_sum = &rq->curr_runnable_sum;
921 u64 *prev_runnable_sum = &rq->prev_runnable_sum;
922 u64 *nt_curr_runnable_sum = &rq->nt_curr_runnable_sum;
923 u64 *nt_prev_runnable_sum = &rq->nt_prev_runnable_sum;
924 bool new_task;
925 int cpu = rq->cpu;
926 #ifdef CONFIG_SCHED_RTG
927 struct group_cpu_time *cpu_time;
928 struct related_thread_group *grp;
929 #endif
930
931 new_window = mark_start < window_start;
932 if (new_window) {
933 full_window = (window_start - mark_start) >= window_size;
934 if (p->ravg.active_windows < USHRT_MAX)
935 p->ravg.active_windows++;
936 }
937
938 new_task = is_new_task(p);
939
940 /*
941 * Handle per-task window rollover. We don't care about the idle
942 * task or exiting tasks.
943 */
944 if (!is_idle_task(p) && !exiting_task(p)) {
945 if (new_window)
946 rollover_task_window(p, full_window);
947 }
948
949 if (p_is_curr_task && new_window)
950 rollover_cpu_window(rq, full_window);
951
952 if (!account_busy_for_cpu_time(rq, p, irqtime, event))
953 goto done;
954
955 #ifdef CONFIG_SCHED_RTG
956 grp = task_related_thread_group(p);
957 if (grp) {
958 cpu_time = &rq->grp_time;
959
960 curr_runnable_sum = &cpu_time->curr_runnable_sum;
961 prev_runnable_sum = &cpu_time->prev_runnable_sum;
962
963 nt_curr_runnable_sum = &cpu_time->nt_curr_runnable_sum;
964 nt_prev_runnable_sum = &cpu_time->nt_prev_runnable_sum;
965 }
966 #endif
967
968 if (!new_window) {
969 /*
970 * account_busy_for_cpu_time() = 1 so busy time needs
971 * to be accounted to the current window. No rollover
972 * since we didn't start a new window. An example of this is
973 * when a task starts execution and then sleeps within the
974 * same window.
975 */
976
977 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
978 delta = wallclock - mark_start;
979 else
980 delta = irqtime;
981 delta = scale_exec_time(delta, rq);
982 *curr_runnable_sum += delta;
983 if (new_task)
984 *nt_curr_runnable_sum += delta;
985
986 if (!is_idle_task(p) && !exiting_task(p)) {
987 p->ravg.curr_window += delta;
988 p->ravg.curr_window_cpu[cpu] += delta;
989 }
990
991 goto done;
992 }
993
994 if (!p_is_curr_task) {
995 /*
996 * account_busy_for_cpu_time() = 1 so busy time needs
997 * to be accounted to the current window. A new window
998 * has also started, but p is not the current task, so the
999 * window is not rolled over - just split up and account
1000 * as necessary into curr and prev. The window is only
1001 * rolled over when a new window is processed for the current
1002 * task.
1003 *
1004 * Irqtime can't be accounted by a task that isn't the
1005 * currently running task.
1006 */
1007
1008 if (!full_window) {
1009 /*
1010 * A full window hasn't elapsed, account partial
1011 * contribution to previous completed window.
1012 */
1013 delta = scale_exec_time(window_start - mark_start, rq);
1014 if (!exiting_task(p)) {
1015 p->ravg.prev_window += delta;
1016 p->ravg.prev_window_cpu[cpu] += delta;
1017 }
1018 } else {
1019 /*
1020 * Since at least one full window has elapsed,
1021 * the contribution to the previous window is the
1022 * full window (window_size).
1023 */
1024 delta = scale_exec_time(window_size, rq);
1025 if (!exiting_task(p)) {
1026 p->ravg.prev_window = delta;
1027 p->ravg.prev_window_cpu[cpu] = delta;
1028 }
1029 }
1030
1031 *prev_runnable_sum += delta;
1032 if (new_task)
1033 *nt_prev_runnable_sum += delta;
1034
1035 /* Account piece of busy time in the current window. */
1036 delta = scale_exec_time(wallclock - window_start, rq);
1037 *curr_runnable_sum += delta;
1038 if (new_task)
1039 *nt_curr_runnable_sum += delta;
1040
1041 if (!exiting_task(p)) {
1042 p->ravg.curr_window = delta;
1043 p->ravg.curr_window_cpu[cpu] = delta;
1044 }
1045
1046 goto done;
1047 }
1048
1049 if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
1050 /*
1051 * account_busy_for_cpu_time() = 1 so busy time needs
1052 * to be accounted to the current window. A new window
1053 * has started and p is the current task so rollover is
1054 * needed. If any of these three above conditions are true
1055 * then this busy time can't be accounted as irqtime.
1056 *
1057 * Busy time for the idle task or exiting tasks need not
1058 * be accounted.
1059 *
1060 * An example of this would be a task that starts execution
1061 * and then sleeps once a new window has begun.
1062 */
1063
1064 if (!full_window) {
1065 /*
1066 * A full window hasn't elapsed, account partial
1067 * contribution to previous completed window.
1068 */
1069 delta = scale_exec_time(window_start - mark_start, rq);
1070 if (!is_idle_task(p) && !exiting_task(p)) {
1071 p->ravg.prev_window += delta;
1072 p->ravg.prev_window_cpu[cpu] += delta;
1073 }
1074 } else {
1075 /*
1076 * Since at least one full window has elapsed,
1077 * the contribution to the previous window is the
1078 * full window (window_size).
1079 */
1080 delta = scale_exec_time(window_size, rq);
1081 if (!is_idle_task(p) && !exiting_task(p)) {
1082 p->ravg.prev_window = delta;
1083 p->ravg.prev_window_cpu[cpu] = delta;
1084 }
1085 }
1086
1087 /*
1088 * Rollover is done here by overwriting the values in
1089 * prev_runnable_sum and curr_runnable_sum.
1090 */
1091 *prev_runnable_sum += delta;
1092 if (new_task)
1093 *nt_prev_runnable_sum += delta;
1094
1095 /* Account piece of busy time in the current window. */
1096 delta = scale_exec_time(wallclock - window_start, rq);
1097 *curr_runnable_sum += delta;
1098 if (new_task)
1099 *nt_curr_runnable_sum += delta;
1100
1101 if (!is_idle_task(p) && !exiting_task(p)) {
1102 p->ravg.curr_window = delta;
1103 p->ravg.curr_window_cpu[cpu] = delta;
1104 }
1105
1106 goto done;
1107 }
1108
1109 if (irqtime) {
1110 /*
1111 * account_busy_for_cpu_time() = 1 so busy time needs
1112 * to be accounted to the current window. A new window
1113 * has started and p is the current task so rollover is
1114 * needed. The current task must be the idle task because
1115 * irqtime is not accounted for any other task.
1116 *
1117 * Irqtime will be accounted each time we process IRQ activity
1118 * after a period of idleness, so we know the IRQ busy time
1119 * started at wallclock - irqtime.
1120 */
1121
1122 BUG_ON(!is_idle_task(p));
1123 mark_start = wallclock - irqtime;
1124
1125 /*
1126 * Roll window over. If IRQ busy time was just in the current
1127 * window then that is all that need be accounted.
1128 */
1129 if (mark_start > window_start) {
1130 *curr_runnable_sum = scale_exec_time(irqtime, rq);
1131 return;
1132 }
1133
1134 /*
1135 * The IRQ busy time spanned multiple windows. Process the
1136 * window then that is all that need be accounted.
1137 */
1138 delta = window_start - mark_start;
1139 if (delta > window_size)
1140 delta = window_size;
1141 delta = scale_exec_time(delta, rq);
1142 *prev_runnable_sum += delta;
1143
1144 /* Process the remaining IRQ busy time in the current window. */
1145 delta = wallclock - window_start;
1146 rq->curr_runnable_sum = scale_exec_time(delta, rq);
1147
1148 return;
1149 }
1150
1151 done:
1152 return;
1153 }
1154
/*
 * Queue the cpufreq irq work if @rq's window start differs from
 * @old_window_start.
 *
 * The cmpxchg on walt_irq_work_lastq_ws only succeeds for the caller that
 * still observes @old_window_start there, so the work is queued only by
 * the caller whose exchange succeeds.
 */
static inline void run_walt_irq_work(u64 old_window_start, struct rq *rq)
{
	u64 new_window_start = rq->window_start;
	u64 seen;

	/* Window did not advance: nothing to report. */
	if (new_window_start == old_window_start)
		return;

	seen = atomic64_cmpxchg(&walt_irq_work_lastq_ws, old_window_start,
				new_window_start);
	if (seen != old_window_start)
		return;

	irq_work_queue(&walt_cpufreq_irq_work);
}
1167
1168 /* Reflect task activity on its demand and cpu's busy time statistics */
update_task_ravg(struct task_struct * p,struct rq * rq,int event,u64 wallclock,u64 irqtime)1169 void update_task_ravg(struct task_struct *p, struct rq *rq, int event,
1170 u64 wallclock, u64 irqtime)
1171 {
1172 u64 old_window_start;
1173
1174 if (!rq->window_start || sched_disable_window_stats ||
1175 p->ravg.mark_start == wallclock)
1176 return;
1177
1178 lockdep_assert_held(&rq->lock);
1179
1180 old_window_start = update_window_start(rq, wallclock, event);
1181
1182 #ifdef CONFIG_SCHED_RTG
1183 update_group_nr_running(p, event, wallclock);
1184 #endif
1185 if (!p->ravg.mark_start)
1186 goto done;
1187
1188 update_task_demand(p, rq, event, wallclock);
1189 update_cpu_busy_time(p, rq, event, wallclock, irqtime);
1190
1191 if (exiting_task(p))
1192 goto done;
1193
1194 trace_sched_update_task_ravg(p, rq, event, wallclock, irqtime);
1195 done:
1196 p->ravg.mark_start = wallclock;
1197
1198 run_walt_irq_work(old_window_start, rq);
1199 }
1200
/*
 * sysctl handler: let proc_dointvec() process the request and, on success,
 * mirror sysctl_sched_walt_init_task_load_pct into
 * sysctl_sched_init_task_load_pct.
 *
 * Returns 0 on success or the error reported by proc_dointvec().
 */
int sysctl_sched_walt_init_task_load_pct_sysctl_handler(struct ctl_table *table,
		int write, void __user *buffer, size_t *length, loff_t *ppos)
{
	int err = proc_dointvec(table, write, buffer, length, ppos);

	if (!err)
		sysctl_sched_init_task_load_pct =
			sysctl_sched_walt_init_task_load_pct;

	return err;
}
1214
sched_get_init_task_load(struct task_struct * p)1215 u32 sched_get_init_task_load(struct task_struct *p)
1216 {
1217 return p->init_load_pct;
1218 }
1219
sched_set_init_task_load(struct task_struct * p,int init_load_pct)1220 int sched_set_init_task_load(struct task_struct *p, int init_load_pct)
1221 {
1222 if (init_load_pct < 0 || init_load_pct > 100)
1223 return -EINVAL;
1224
1225 p->init_load_pct = init_load_pct;
1226
1227 return 0;
1228 }
1229
init_new_task_load(struct task_struct * p)1230 void init_new_task_load(struct task_struct *p)
1231 {
1232 int i;
1233 u32 init_load_windows = sched_init_task_load_windows;
1234 u32 init_load_windows_scaled = sched_init_task_load_windows_scaled;
1235 u32 init_load_pct = current->init_load_pct;
1236
1237 #ifdef CONFIG_SCHED_RTG
1238 init_task_rtg(p);
1239 #endif
1240
1241 p->last_sleep_ts = 0;
1242 p->init_load_pct = 0;
1243 memset(&p->ravg, 0, sizeof(struct ravg));
1244
1245 p->ravg.curr_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32),
1246 GFP_KERNEL | __GFP_NOFAIL);
1247 p->ravg.prev_window_cpu = kcalloc(nr_cpu_ids, sizeof(u32),
1248 GFP_KERNEL | __GFP_NOFAIL);
1249
1250 if (init_load_pct) {
1251 init_load_windows = div64_u64((u64)init_load_pct *
1252 (u64)sched_ravg_window, 100);
1253 init_load_windows_scaled = scale_demand(init_load_windows);
1254 }
1255
1256 p->ravg.demand = init_load_windows;
1257 p->ravg.demand_scaled = init_load_windows_scaled;
1258 for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
1259 p->ravg.sum_history[i] = init_load_windows;
1260 }
1261
free_task_load_ptrs(struct task_struct * p)1262 void free_task_load_ptrs(struct task_struct *p)
1263 {
1264 kfree(p->ravg.curr_window_cpu);
1265 kfree(p->ravg.prev_window_cpu);
1266
1267 /*
1268 * update_task_ravg() can be called for exiting tasks. While the
1269 * function itself ensures correct behavior, the corresponding
1270 * trace event requires that these pointers be NULL.
1271 */
1272 p->ravg.curr_window_cpu = NULL;
1273 p->ravg.prev_window_cpu = NULL;
1274 }
1275
reset_task_stats(struct task_struct * p)1276 void reset_task_stats(struct task_struct *p)
1277 {
1278 u32 sum = 0;
1279 u32 *curr_window_ptr = NULL;
1280 u32 *prev_window_ptr = NULL;
1281
1282 if (exiting_task(p)) {
1283 sum = EXITING_TASK_MARKER;
1284 } else {
1285 curr_window_ptr = p->ravg.curr_window_cpu;
1286 prev_window_ptr = p->ravg.prev_window_cpu;
1287 memset(curr_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
1288 memset(prev_window_ptr, 0, sizeof(u32) * nr_cpu_ids);
1289 }
1290
1291 memset(&p->ravg, 0, sizeof(struct ravg));
1292
1293 p->ravg.curr_window_cpu = curr_window_ptr;
1294 p->ravg.prev_window_cpu = prev_window_ptr;
1295
1296 /* Retain EXITING_TASK marker */
1297 p->ravg.sum_history[0] = sum;
1298 }
1299
mark_task_starting(struct task_struct * p)1300 void mark_task_starting(struct task_struct *p)
1301 {
1302 u64 wallclock;
1303 struct rq *rq = task_rq(p);
1304
1305 if (!rq->window_start || sched_disable_window_stats) {
1306 reset_task_stats(p);
1307 return;
1308 }
1309
1310 wallclock = sched_ktime_clock();
1311 p->ravg.mark_start = wallclock;
1312 }
1313
1314 unsigned int max_possible_efficiency = 1;
1315 unsigned int min_possible_efficiency = UINT_MAX;
1316 unsigned int max_power_cost = 1;
1317
1318 static cpumask_t all_cluster_cpus = CPU_MASK_NONE;
1319 DECLARE_BITMAP(all_cluster_ids, NR_CPUS);
1320 struct sched_cluster *sched_cluster[NR_CPUS];
1321 int num_clusters;
1322
1323 struct list_head cluster_head;
1324
1325 static void
insert_cluster(struct sched_cluster * cluster,struct list_head * head)1326 insert_cluster(struct sched_cluster *cluster, struct list_head *head)
1327 {
1328 struct sched_cluster *tmp;
1329 struct list_head *iter = head;
1330
1331 list_for_each_entry(tmp, head, list) {
1332 if (cluster->max_power_cost < tmp->max_power_cost)
1333 break;
1334 iter = &tmp->list;
1335 }
1336
1337 list_add(&cluster->list, iter);
1338 }
1339
alloc_new_cluster(const struct cpumask * cpus)1340 static struct sched_cluster *alloc_new_cluster(const struct cpumask *cpus)
1341 {
1342 struct sched_cluster *cluster = NULL;
1343
1344 cluster = kzalloc(sizeof(struct sched_cluster), GFP_ATOMIC);
1345 if (!cluster) {
1346 pr_warn("Cluster allocation failed. Possible bad scheduling\n");
1347 return NULL;
1348 }
1349
1350 INIT_LIST_HEAD(&cluster->list);
1351 cluster->max_power_cost = 1;
1352 cluster->min_power_cost = 1;
1353 cluster->capacity = 1024;
1354 cluster->max_possible_capacity = 1024;
1355 cluster->efficiency = 1;
1356 cluster->load_scale_factor = 1024;
1357 cluster->cur_freq = 1;
1358 cluster->max_freq = 1;
1359 cluster->min_freq = 1;
1360 cluster->max_possible_freq = 1;
1361 cluster->freq_init_done = false;
1362
1363 raw_spin_lock_init(&cluster->load_lock);
1364 cluster->cpus = *cpus;
1365 cluster->efficiency = topology_get_cpu_scale(cpumask_first(cpus));
1366
1367 if (cluster->efficiency > max_possible_efficiency)
1368 max_possible_efficiency = cluster->efficiency;
1369 if (cluster->efficiency < min_possible_efficiency)
1370 min_possible_efficiency = cluster->efficiency;
1371
1372 return cluster;
1373 }
1374
add_cluster(const struct cpumask * cpus,struct list_head * head)1375 static void add_cluster(const struct cpumask *cpus, struct list_head *head)
1376 {
1377 struct sched_cluster *cluster = alloc_new_cluster(cpus);
1378 int i;
1379
1380 if (!cluster)
1381 return;
1382
1383 for_each_cpu(i, cpus)
1384 cpu_rq(i)->cluster = cluster;
1385
1386 insert_cluster(cluster, head);
1387 set_bit(num_clusters, all_cluster_ids);
1388 num_clusters++;
1389 }
1390
compute_max_possible_capacity(struct sched_cluster * cluster)1391 static int compute_max_possible_capacity(struct sched_cluster *cluster)
1392 {
1393 int capacity = 1024;
1394
1395 capacity *= capacity_scale_cpu_efficiency(cluster);
1396 capacity >>= 10;
1397
1398 capacity *= (1024 * cluster->max_possible_freq) / min_max_freq;
1399 capacity >>= 10;
1400
1401 return capacity;
1402 }
1403
walt_update_min_max_capacity(void)1404 void walt_update_min_max_capacity(void)
1405 {
1406 unsigned long flags;
1407
1408 acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
1409 __update_min_max_capacity();
1410 release_rq_locks_irqrestore(cpu_possible_mask, &flags);
1411 }
1412
1413 static int
compare_clusters(void * priv,const struct list_head * a,const struct list_head * b)1414 compare_clusters(void *priv, const struct list_head *a, const struct list_head *b)
1415 {
1416 struct sched_cluster *cluster1, *cluster2;
1417 int ret;
1418
1419 cluster1 = container_of(a, struct sched_cluster, list);
1420 cluster2 = container_of(b, struct sched_cluster, list);
1421
1422 /*
1423 * Don't assume higher capacity means higher power. If the
1424 * power cost is same, sort the higher capacity cluster before
1425 * the lower capacity cluster to start placing the tasks
1426 * on the higher capacity cluster.
1427 */
1428 ret = cluster1->max_power_cost > cluster2->max_power_cost ||
1429 (cluster1->max_power_cost == cluster2->max_power_cost &&
1430 cluster1->max_possible_capacity <
1431 cluster2->max_possible_capacity);
1432
1433 return ret;
1434 }
1435
sort_clusters(void)1436 void sort_clusters(void)
1437 {
1438 struct sched_cluster *cluster;
1439 struct list_head new_head;
1440 unsigned int tmp_max = 1;
1441
1442 INIT_LIST_HEAD(&new_head);
1443
1444 for_each_sched_cluster(cluster) {
1445 cluster->max_power_cost = power_cost(cluster_first_cpu(cluster),
1446 max_task_load());
1447 cluster->min_power_cost = power_cost(cluster_first_cpu(cluster),
1448 0);
1449
1450 if (cluster->max_power_cost > tmp_max)
1451 tmp_max = cluster->max_power_cost;
1452 }
1453 max_power_cost = tmp_max;
1454
1455 move_list(&new_head, &cluster_head, true);
1456
1457 list_sort(NULL, &new_head, compare_clusters);
1458 assign_cluster_ids(&new_head);
1459
1460 /*
1461 * Ensure cluster ids are visible to all CPUs before making
1462 * cluster_head visible.
1463 */
1464 move_list(&cluster_head, &new_head, false);
1465 }
1466
update_all_clusters_stats(void)1467 static void update_all_clusters_stats(void)
1468 {
1469 struct sched_cluster *cluster;
1470 u64 highest_mpc = 0, lowest_mpc = U64_MAX;
1471 unsigned long flags;
1472
1473 acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
1474
1475 for_each_sched_cluster(cluster) {
1476 u64 mpc;
1477
1478 cluster->capacity = compute_capacity(cluster);
1479 mpc = cluster->max_possible_capacity =
1480 compute_max_possible_capacity(cluster);
1481 cluster->load_scale_factor = compute_load_scale_factor(cluster);
1482
1483 cluster->exec_scale_factor =
1484 DIV_ROUND_UP(cluster->efficiency * 1024,
1485 max_possible_efficiency);
1486
1487 if (mpc > highest_mpc)
1488 highest_mpc = mpc;
1489
1490 if (mpc < lowest_mpc)
1491 lowest_mpc = mpc;
1492 }
1493
1494 max_possible_capacity = highest_mpc;
1495 min_max_possible_capacity = lowest_mpc;
1496
1497 __update_min_max_capacity();
1498 release_rq_locks_irqrestore(cpu_possible_mask, &flags);
1499 }
1500
update_cluster_topology(void)1501 void update_cluster_topology(void)
1502 {
1503 struct cpumask cpus = *cpu_possible_mask;
1504 const struct cpumask *cluster_cpus;
1505 struct list_head new_head;
1506 int i;
1507
1508 INIT_LIST_HEAD(&new_head);
1509
1510 for_each_cpu(i, &cpus) {
1511 cluster_cpus = cpu_coregroup_mask(i);
1512 cpumask_or(&all_cluster_cpus, &all_cluster_cpus, cluster_cpus);
1513 cpumask_andnot(&cpus, &cpus, cluster_cpus);
1514 add_cluster(cluster_cpus, &new_head);
1515 }
1516
1517 assign_cluster_ids(&new_head);
1518
1519 /*
1520 * Ensure cluster ids are visible to all CPUs before making
1521 * cluster_head visible.
1522 */
1523 move_list(&cluster_head, &new_head, false);
1524 update_all_clusters_stats();
1525 }
1526
1527 struct sched_cluster init_cluster = {
1528 .list = LIST_HEAD_INIT(init_cluster.list),
1529 .id = 0,
1530 .max_power_cost = 1,
1531 .min_power_cost = 1,
1532 .capacity = 1024,
1533 .max_possible_capacity = 1024,
1534 .efficiency = 1,
1535 .load_scale_factor = 1024,
1536 .cur_freq = 1,
1537 .max_freq = 1,
1538 .min_freq = 1,
1539 .max_possible_freq = 1,
1540 .exec_scale_factor = 1024,
1541 };
1542
init_clusters(void)1543 void init_clusters(void)
1544 {
1545 bitmap_clear(all_cluster_ids, 0, NR_CPUS);
1546 init_cluster.cpus = *cpu_possible_mask;
1547 raw_spin_lock_init(&init_cluster.load_lock);
1548 INIT_LIST_HEAD(&cluster_head);
1549 }
1550
1551 static unsigned long cpu_max_table_freq[NR_CPUS];
1552
update_cpu_cluster_capacity(const cpumask_t * cpus)1553 void update_cpu_cluster_capacity(const cpumask_t *cpus)
1554 {
1555 int i;
1556 struct sched_cluster *cluster;
1557 struct cpumask cpumask;
1558 unsigned long flags;
1559
1560 cpumask_copy(&cpumask, cpus);
1561 acquire_rq_locks_irqsave(cpu_possible_mask, &flags);
1562
1563 for_each_cpu(i, &cpumask) {
1564 cluster = cpu_rq(i)->cluster;
1565 cpumask_andnot(&cpumask, &cpumask, &cluster->cpus);
1566
1567 cluster->capacity = compute_capacity(cluster);
1568 cluster->load_scale_factor = compute_load_scale_factor(cluster);
1569 }
1570
1571 __update_min_max_capacity();
1572
1573 release_rq_locks_irqrestore(cpu_possible_mask, &flags);
1574 }
1575
cpufreq_notifier_policy(struct notifier_block * nb,unsigned long val,void * data)1576 static int cpufreq_notifier_policy(struct notifier_block *nb,
1577 unsigned long val, void *data)
1578 {
1579 struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
1580 struct sched_cluster *cluster = NULL;
1581 struct cpumask policy_cluster = *policy->related_cpus;
1582 unsigned int orig_max_freq = 0;
1583 int i, j, update_capacity = 0;
1584
1585 if (val != CPUFREQ_CREATE_POLICY)
1586 return 0;
1587
1588 walt_update_min_max_capacity();
1589
1590 max_possible_freq = max(max_possible_freq, policy->cpuinfo.max_freq);
1591 if (min_max_freq == 1)
1592 min_max_freq = UINT_MAX;
1593 min_max_freq = min(min_max_freq, policy->cpuinfo.max_freq);
1594 BUG_ON(!min_max_freq);
1595 BUG_ON(!policy->max);
1596
1597 for_each_cpu(i, &policy_cluster)
1598 cpu_max_table_freq[i] = policy->cpuinfo.max_freq;
1599
1600 for_each_cpu(i, &policy_cluster) {
1601 cluster = cpu_rq(i)->cluster;
1602 cpumask_andnot(&policy_cluster, &policy_cluster,
1603 &cluster->cpus);
1604
1605 orig_max_freq = cluster->max_freq;
1606 cluster->min_freq = policy->min;
1607 cluster->max_freq = policy->max;
1608 cluster->cur_freq = policy->cur;
1609
1610 if (!cluster->freq_init_done) {
1611 mutex_lock(&cluster_lock);
1612 for_each_cpu(j, &cluster->cpus)
1613 cpumask_copy(&cpu_rq(j)->freq_domain_cpumask,
1614 policy->related_cpus);
1615 cluster->max_possible_freq = policy->cpuinfo.max_freq;
1616 cluster->max_possible_capacity =
1617 compute_max_possible_capacity(cluster);
1618 cluster->freq_init_done = true;
1619
1620 sort_clusters();
1621 update_all_clusters_stats();
1622 mutex_unlock(&cluster_lock);
1623 continue;
1624 }
1625
1626 update_capacity += (orig_max_freq != cluster->max_freq);
1627 }
1628
1629 if (update_capacity)
1630 update_cpu_cluster_capacity(policy->related_cpus);
1631
1632 return 0;
1633 }
1634
1635 static struct notifier_block notifier_policy_block = {
1636 .notifier_call = cpufreq_notifier_policy
1637 };
1638
cpufreq_notifier_trans(struct notifier_block * nb,unsigned long val,void * data)1639 static int cpufreq_notifier_trans(struct notifier_block *nb,
1640 unsigned long val, void *data)
1641 {
1642 struct cpufreq_freqs *freq = (struct cpufreq_freqs *)data;
1643 unsigned int cpu = freq->policy->cpu, new_freq = freq->new;
1644 unsigned long flags;
1645 struct sched_cluster *cluster;
1646 struct cpumask policy_cpus = cpu_rq(cpu)->freq_domain_cpumask;
1647 int i, j;
1648
1649 if (val != CPUFREQ_POSTCHANGE)
1650 return NOTIFY_DONE;
1651
1652 if (cpu_cur_freq(cpu) == new_freq)
1653 return NOTIFY_OK;
1654
1655 for_each_cpu(i, &policy_cpus) {
1656 cluster = cpu_rq(i)->cluster;
1657
1658 for_each_cpu(j, &cluster->cpus) {
1659 struct rq *rq = cpu_rq(j);
1660
1661 raw_spin_lock_irqsave(&rq->lock, flags);
1662 update_task_ravg(rq->curr, rq, TASK_UPDATE,
1663 sched_ktime_clock(), 0);
1664 raw_spin_unlock_irqrestore(&rq->lock, flags);
1665 }
1666
1667 cluster->cur_freq = new_freq;
1668 cpumask_andnot(&policy_cpus, &policy_cpus, &cluster->cpus);
1669 }
1670
1671 return NOTIFY_OK;
1672 }
1673
1674 static struct notifier_block notifier_trans_block = {
1675 .notifier_call = cpufreq_notifier_trans
1676 };
1677
register_walt_callback(void)1678 static int register_walt_callback(void)
1679 {
1680 int ret;
1681
1682 ret = cpufreq_register_notifier(¬ifier_policy_block,
1683 CPUFREQ_POLICY_NOTIFIER);
1684 if (!ret)
1685 ret = cpufreq_register_notifier(¬ifier_trans_block,
1686 CPUFREQ_TRANSITION_NOTIFIER);
1687
1688 return ret;
1689 }
1690 /*
1691 * cpufreq callbacks can be registered at core_initcall or later time.
1692 * Any registration done prior to that is "forgotten" by cpufreq. See
1693 * initialization of variable init_cpufreq_transition_notifier_list_called
1694 * for further information.
1695 */
1696 core_initcall(register_walt_callback);
1697
1698 /*
1699 * Runs in hard-irq context. This should ideally run just after the latest
1700 * window roll-over.
1701 */
walt_irq_work(struct irq_work * irq_work)1702 void walt_irq_work(struct irq_work *irq_work)
1703 {
1704 struct sched_cluster *cluster;
1705 struct rq *rq;
1706 int cpu;
1707 u64 wc;
1708 bool is_migration = false;
1709 int level = 0;
1710
1711 /* Am I the window rollover work or the migration work? */
1712 if (irq_work == &walt_migration_irq_work)
1713 is_migration = true;
1714
1715 for_each_cpu(cpu, cpu_possible_mask) {
1716 if (level == 0)
1717 raw_spin_lock(&cpu_rq(cpu)->lock);
1718 else
1719 raw_spin_lock_nested(&cpu_rq(cpu)->lock, level);
1720 level++;
1721 }
1722
1723 wc = sched_ktime_clock();
1724 walt_load_reported_window = atomic64_read(&walt_irq_work_lastq_ws);
1725 for_each_sched_cluster(cluster) {
1726 raw_spin_lock(&cluster->load_lock);
1727
1728 for_each_cpu(cpu, &cluster->cpus) {
1729 rq = cpu_rq(cpu);
1730 if (rq->curr) {
1731 update_task_ravg(rq->curr, rq,
1732 TASK_UPDATE, wc, 0);
1733 account_load_subtractions(rq);
1734 }
1735 }
1736
1737 raw_spin_unlock(&cluster->load_lock);
1738 }
1739
1740 for_each_sched_cluster(cluster) {
1741 cpumask_t cluster_online_cpus;
1742 unsigned int num_cpus, i = 1;
1743
1744 cpumask_and(&cluster_online_cpus, &cluster->cpus,
1745 cpu_online_mask);
1746 num_cpus = cpumask_weight(&cluster_online_cpus);
1747 for_each_cpu(cpu, &cluster_online_cpus) {
1748 int flag = SCHED_CPUFREQ_WALT;
1749
1750 rq = cpu_rq(cpu);
1751
1752 if (i == num_cpus)
1753 cpufreq_update_util(cpu_rq(cpu), flag);
1754 else
1755 cpufreq_update_util(cpu_rq(cpu), flag |
1756 SCHED_CPUFREQ_CONTINUE);
1757 i++;
1758 }
1759 }
1760
1761 for_each_cpu(cpu, cpu_possible_mask)
1762 raw_spin_unlock(&cpu_rq(cpu)->lock);
1763
1764 if (!is_migration)
1765 core_ctl_check(this_rq()->window_start);
1766 }
1767
walt_init_once(void)1768 static void walt_init_once(void)
1769 {
1770 init_irq_work(&walt_migration_irq_work, walt_irq_work);
1771 init_irq_work(&walt_cpufreq_irq_work, walt_irq_work);
1772
1773 walt_cpu_util_freq_divisor =
1774 (sched_ravg_window >> SCHED_CAPACITY_SHIFT) * 100;
1775 walt_scale_demand_divisor = sched_ravg_window >> SCHED_CAPACITY_SHIFT;
1776
1777 sched_init_task_load_windows =
1778 div64_u64((u64)sysctl_sched_init_task_load_pct *
1779 (u64)sched_ravg_window, 100);
1780 sched_init_task_load_windows_scaled =
1781 scale_demand(sched_init_task_load_windows);
1782 }
1783
walt_sched_init_rq(struct rq * rq)1784 void walt_sched_init_rq(struct rq *rq)
1785 {
1786 static bool init;
1787 int j;
1788
1789 if (!init) {
1790 walt_init_once();
1791 init = true;
1792 }
1793
1794 cpumask_set_cpu(cpu_of(rq), &rq->freq_domain_cpumask);
1795
1796 rq->walt_stats.cumulative_runnable_avg_scaled = 0;
1797 rq->window_start = 0;
1798 rq->walt_flags = 0;
1799 rq->cur_irqload = 0;
1800 rq->avg_irqload = 0;
1801 rq->irqload_ts = 0;
1802
1803 /*
1804 * All cpus part of same cluster by default. This avoids the
1805 * need to check for rq->cluster being non-NULL in hot-paths
1806 * like select_best_cpu()
1807 */
1808 rq->cluster = &init_cluster;
1809 rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
1810 rq->nt_curr_runnable_sum = rq->nt_prev_runnable_sum = 0;
1811 rq->cum_window_demand_scaled = 0;
1812
1813 for (j = 0; j < NUM_TRACKED_WINDOWS; j++)
1814 memset(&rq->load_subs[j], 0, sizeof(struct load_subtractions));
1815 }
1816
1817 #define min_cap_cluster() \
1818 list_first_entry(&cluster_head, struct sched_cluster, list)
1819 #define max_cap_cluster() \
1820 list_last_entry(&cluster_head, struct sched_cluster, list)
sched_cluster_debug_show(struct seq_file * file,void * param)1821 static int sched_cluster_debug_show(struct seq_file *file, void *param)
1822 {
1823 struct sched_cluster *cluster = NULL;
1824
1825 seq_printf(file, "min_id:%d, max_id:%d\n",
1826 min_cap_cluster()->id,
1827 max_cap_cluster()->id);
1828
1829 for_each_sched_cluster(cluster) {
1830 seq_printf(file, "id:%d, cpumask:%d(%*pbl)\n",
1831 cluster->id,
1832 cpumask_first(&cluster->cpus),
1833 cpumask_pr_args(&cluster->cpus));
1834 }
1835
1836 return 0;
1837 }
1838
sched_cluster_debug_open(struct inode * inode,struct file * filp)1839 static int sched_cluster_debug_open(struct inode *inode, struct file *filp)
1840 {
1841 return single_open(filp, sched_cluster_debug_show, NULL);
1842 }
1843
1844 static const struct proc_ops sched_cluster_fops = {
1845 .proc_open = sched_cluster_debug_open,
1846 .proc_read = seq_read,
1847 .proc_lseek = seq_lseek,
1848 .proc_release = seq_release,
1849 };
1850
init_sched_cluster_debug_procfs(void)1851 static int __init init_sched_cluster_debug_procfs(void)
1852 {
1853 struct proc_dir_entry *pe = NULL;
1854
1855 pe = proc_create("sched_cluster",
1856 0444, NULL, &sched_cluster_fops);
1857 if (!pe)
1858 return -ENOMEM;
1859 return 0;
1860 }
1861 late_initcall(init_sched_cluster_debug_procfs);
1862