/*
 * Copyright (c) 2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *
 * Window Assisted Load Tracking (WALT) implementation credits:
 * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 * Pavan Kumar Kondeti, Olav Haugan
 *
 * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
 *             and Todd Kjos
 */

#include <linux/syscore_ops.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"

#define WINDOW_STATS_RECENT		0
#define WINDOW_STATS_MAX		1
#define WINDOW_STATS_MAX_RECENT_AVG	2
#define WINDOW_STATS_AVG		3
#define WINDOW_STATS_INVALID_POLICY	4

#define EXITING_TASK_MARKER	0xdeaddead

static __read_mostly unsigned int walt_ravg_hist_size = 5;
static __read_mostly unsigned int walt_window_stats_policy =
        WINDOW_STATS_MAX_RECENT_AVG;
static __read_mostly unsigned int walt_account_wait_time = 1;
static __read_mostly unsigned int walt_freq_account_wait_time = 0;
static __read_mostly unsigned int walt_io_is_busy = 0;

unsigned int sysctl_sched_walt_init_task_load_pct = 15;

/* true -> use PELT based load stats, false -> use window-based load stats */
bool __read_mostly walt_disabled = false;

/*
 * Window size (in ns). Adjust for the tick size so that the window
 * rollover occurs just before the tick boundary.
 */
__read_mostly unsigned int walt_ravg_window =
        (20000000 / TICK_NSEC) * TICK_NSEC;
#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
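
/*
 * For example (assuming CONFIG_HZ=300, so TICK_NSEC is 3333333 ns), the
 * default 20 ms request above rounds down to (20000000 / 3333333) * 3333333
 * = 19999998 ns; with CONFIG_HZ=250 (TICK_NSEC = 4000000 ns) the window
 * stays at exactly 20000000 ns.
 */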

static unsigned int sync_cpu;
static ktime_t ktime_last;
static bool walt_ktime_suspended;

static unsigned int task_load(struct task_struct *p)
{
        return p->ravg.demand;
}

static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
{
        rq->cum_window_demand += delta;
        if (unlikely((s64)rq->cum_window_demand < 0))
                rq->cum_window_demand = 0;
}

void
walt_inc_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg += p->ravg.demand;

        /*
         * Add a task's contribution to the cumulative window demand when
         *
         * (1) the task is enqueued with on_rq = 1, i.e. a migration or a
         *     prio/cgroup/class change;
         * (2) the task is waking for the first time in this window.
         */
        if (p->on_rq || (p->last_sleep_ts < rq->window_start))
                fixup_cum_window_demand(rq, p->ravg.demand);
}

void
walt_dec_cumulative_runnable_avg(struct rq *rq,
                                 struct task_struct *p)
{
        rq->cumulative_runnable_avg -= p->ravg.demand;
        BUG_ON((s64)rq->cumulative_runnable_avg < 0);

        /*
         * on_rq will be 1 for sleeping tasks. So check if the task
         * is migrating or dequeuing in RUNNING state to change the
         * prio/cgroup/class.
         */
        if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
                fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
}

static void
fixup_cumulative_runnable_avg(struct rq *rq,
                              struct task_struct *p, u64 new_task_load)
{
        s64 task_load_delta = (s64)new_task_load - task_load(p);

        rq->cumulative_runnable_avg += task_load_delta;
        if ((s64)rq->cumulative_runnable_avg < 0)
                panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
                      task_load_delta, task_load(p));

        fixup_cum_window_demand(rq, task_load_delta);
}

u64 walt_ktime_clock(void)
{
        if (unlikely(walt_ktime_suspended))
                return ktime_to_ns(ktime_last);
        return ktime_get_ns();
}

static void walt_resume(void)
{
        walt_ktime_suspended = false;
}

static int walt_suspend(void)
{
        ktime_last = ktime_get();
        walt_ktime_suspended = true;
        return 0;
}

static struct syscore_ops walt_syscore_ops = {
        .resume = walt_resume,
        .suspend = walt_suspend
};

static int __init walt_init_ops(void)
{
        register_syscore_ops(&walt_syscore_ops);
        return 0;
}
late_initcall(walt_init_ops);

void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                                          struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg += p->ravg.demand;
}

void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
                                          struct task_struct *p)
{
        cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
}

static int exiting_task(struct task_struct *p)
{
        if (p->flags & PF_EXITING) {
                if (p->ravg.sum_history[0] != EXITING_TASK_MARKER)
                        p->ravg.sum_history[0] = EXITING_TASK_MARKER;
                return 1;
        }
        return 0;
}

static int __init set_walt_ravg_window(char *str)
{
        unsigned int adj_window;
        bool no_walt = walt_disabled;

        get_option(&str, &walt_ravg_window);

        /* Adjust for CONFIG_HZ */
        adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;

        /* Warn if we're a bit too far away from the expected window size */
        WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
             "tick-adjusted window size %u, original was %u\n", adj_window,
             walt_ravg_window);

        walt_ravg_window = adj_window;

        walt_disabled = walt_disabled ||
                        (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
                         walt_ravg_window > MAX_SCHED_RAVG_WINDOW);

        WARN(!no_walt && walt_disabled,
             "invalid window size, disabling WALT\n");

        return 0;
}

early_param("walt_ravg_window", set_walt_ravg_window);
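
/*
 * Usage example: booting with "walt_ravg_window=40000000" on the kernel
 * command line requests a 40 ms window, which is then tick-adjusted and
 * range-checked above (and WALT is disabled if the result falls outside the
 * allowed range).
 */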

static void
update_window_start(struct rq *rq, u64 wallclock)
{
        s64 delta;
        int nr_windows;

        delta = wallclock - rq->window_start;
        /* If the MPM global timer is cleared, set delta to 0 to avoid a kernel BUG */
        if (delta < 0) {
                delta = 0;
                WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
        }

        if (delta < walt_ravg_window)
                return;

        nr_windows = div64_u64(delta, walt_ravg_window);
        rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;

        rq->cum_window_demand = rq->cumulative_runnable_avg;
}
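
/*
 * For example, with a 20 ms window, if 45 ms have elapsed since window_start
 * then nr_windows = 2 and window_start advances by 40 ms; the remaining 5 ms
 * is the part of the newly-current window that has already elapsed.
 */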

/*
 * Translate absolute delta time accounted on a CPU
 * to a scale where 1024 is the capacity of the most
 * capable CPU running at FMAX
 */
static u64 scale_exec_time(u64 delta, struct rq *rq)
{
        unsigned long capcurr = capacity_curr_of(cpu_of(rq));

        return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
}
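
/*
 * For example (assumed values): on a CPU whose current capacity is 512, i.e.
 * half of SCHED_CAPACITY_SCALE, a raw delta of 2000000 ns scales to
 * (2000000 * 512) >> 10 = 1000000 ns, so busy time is credited at half rate.
 */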

static int cpu_is_waiting_on_io(struct rq *rq)
{
        if (!walt_io_is_busy)
                return 0;

        return atomic_read(&rq->nr_iowait);
}

void walt_account_irqtime(int cpu, struct task_struct *curr,
                          u64 delta, u64 wallclock)
{
        struct rq *rq = cpu_rq(cpu);
        unsigned long flags, nr_windows;
        u64 cur_jiffies_ts;

        raw_spin_lock_irqsave(&rq->lock, flags);

        /*
         * cputime (wallclock) uses sched_clock so use the same here for
         * consistency.
         */
        delta += sched_clock() - wallclock;
        cur_jiffies_ts = get_jiffies_64();

        if (is_idle_task(curr))
                walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
                                      delta);

        nr_windows = cur_jiffies_ts - rq->irqload_ts;

        if (nr_windows) {
                if (nr_windows < 10) {
                        /* Decay CPU's irqload by 3/4 for each window. */
                        rq->avg_irqload *= (3 * nr_windows);
                        rq->avg_irqload = div64_u64(rq->avg_irqload,
                                                    4 * nr_windows);
                } else {
                        rq->avg_irqload = 0;
                }
                rq->avg_irqload += rq->cur_irqload;
                rq->cur_irqload = 0;
        }

        rq->cur_irqload += delta;
        rq->irqload_ts = cur_jiffies_ts;
        raw_spin_unlock_irqrestore(&rq->lock, flags);
}
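
/*
 * Note on the decay above: avg_irqload is multiplied by 3 * nr_windows and
 * then divided by 4 * nr_windows, so the nr_windows factors cancel. For
 * example, after 2 elapsed jiffies the average becomes avg * 6 / 8, i.e. the
 * same single 3/4 decay as after 1 jiffy (until the 10-jiffy reset).
 */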

#define WALT_HIGH_IRQ_TIMEOUT 3

u64 walt_irqload(int cpu)
{
        struct rq *rq = cpu_rq(cpu);
        s64 delta;

        delta = get_jiffies_64() - rq->irqload_ts;

        /*
         * The current context can be preempted by an irq, and rq->irqload_ts
         * can be updated from irq context, so delta can be negative. This is
         * okay: we can safely return, as it means there was a recent irq
         * occurrence.
         */
        if (delta < WALT_HIGH_IRQ_TIMEOUT)
                return rq->avg_irqload;
        else
                return 0;
}

int walt_cpu_high_irqload(int cpu)
{
        return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
}

static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
                                     u64 irqtime, int event)
{
        if (is_idle_task(p)) {
                /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
                if (event == PICK_NEXT_TASK)
                        return 0;

                /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
                return irqtime || cpu_is_waiting_on_io(rq);
        }

        if (event == TASK_WAKE)
                return 0;

        if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
            event == TASK_UPDATE)
                return 1;

        /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
        return walt_freq_account_wait_time;
}
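
/*
 * Summary of the decisions above: for the idle task, time counts as CPU busy
 * time only when there is irqtime or the CPU is waiting on IO; for other
 * tasks, TASK_WAKE contributes nothing, PUT_PREV_TASK, TASK_UPDATE and
 * IRQ_UPDATE always contribute, and TASK_MIGRATE / PICK_NEXT_TASK (i.e. wait
 * time) contribute only when walt_freq_account_wait_time is set.
 */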

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
                                 int event, u64 wallclock, u64 irqtime)
{
        int new_window, nr_full_windows = 0;
        int p_is_curr_task = (p == rq->curr);
        u64 mark_start = p->ravg.mark_start;
        u64 window_start = rq->window_start;
        u32 window_size = walt_ravg_window;
        u64 delta;

        new_window = mark_start < window_start;
        if (new_window) {
                nr_full_windows = div64_u64((window_start - mark_start),
                                            window_size);
                if (p->ravg.active_windows < USHRT_MAX)
                        p->ravg.active_windows++;
        }

        /* Handle per-task window rollover. We don't care about the idle
         * task or exiting tasks. */
        if (new_window && !is_idle_task(p) && !exiting_task(p)) {
                u32 curr_window = 0;

                if (!nr_full_windows)
                        curr_window = p->ravg.curr_window;

                p->ravg.prev_window = curr_window;
                p->ravg.curr_window = 0;
        }

        if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
                /* account_busy_for_cpu_time() = 0, so no update to the
                 * task's current window needs to be made. This could be
                 * for example
                 *
                 *   - a wakeup event on a task within the current
                 *     window (!new_window below, no action required),
                 *   - switching to a new task from idle (PICK_NEXT_TASK)
                 *     in a new window where irqtime is 0 and we aren't
                 *     waiting on IO */

                if (!new_window)
                        return;

                /* A new window has started. The RQ demand must be rolled
                 * over if p is the current task. */
                if (p_is_curr_task) {
                        u64 prev_sum = 0;

                        /* p is either idle task or an exiting task */
                        if (!nr_full_windows)
                                prev_sum = rq->curr_runnable_sum;

                        rq->prev_runnable_sum = prev_sum;
                        rq->curr_runnable_sum = 0;
                }

                return;
        }

        if (!new_window) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. No rollover
                 * since we didn't start a new window. An example of this is
                 * when a task starts execution and then sleeps within the
                 * same window. */

                if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
                        delta = wallclock - mark_start;
                else
                        delta = irqtime;
                delta = scale_exec_time(delta, rq);
                rq->curr_runnable_sum += delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window += delta;

                return;
        }

        if (!p_is_curr_task) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has also started, but p is not the current task, so the
                 * window is not rolled over - just split up and account
                 * as necessary into curr and prev. The window is only
                 * rolled over when a new window is processed for the current
                 * task.
                 *
                 * Irqtime can't be accounted by a task that isn't the
                 * currently running task. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window += delta;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!exiting_task(p))
                                p->ravg.prev_window = delta;
                }
                rq->prev_runnable_sum += delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum += delta;
                if (!exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. If any of these three above conditions are true
                 * then this busy time can't be accounted as irqtime.
                 *
                 * Busy time for the idle task or exiting tasks need not
                 * be accounted.
                 *
                 * An example of this would be a task that starts execution
                 * and then sleeps once a new window has begun. */

                if (!nr_full_windows) {
                        /* A full window hasn't elapsed, account partial
                         * contribution to previous completed window. */
                        delta = scale_exec_time(window_start - mark_start, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window += delta;

                        delta += rq->curr_runnable_sum;
                } else {
                        /* Since at least one full window has elapsed,
                         * the contribution to the previous window is the
                         * full window (window_size). */
                        delta = scale_exec_time(window_size, rq);
                        if (!is_idle_task(p) && !exiting_task(p))
                                p->ravg.prev_window = delta;
                }
                /*
                 * Rollover for normal runnable sum is done here by overwriting
                 * the values in prev_runnable_sum and curr_runnable_sum.
                 * Rollover for new task runnable sum has completed by previous
                 * if-else statement.
                 */
                rq->prev_runnable_sum = delta;

                /* Account piece of busy time in the current window. */
                delta = scale_exec_time(wallclock - window_start, rq);
                rq->curr_runnable_sum = delta;
                if (!is_idle_task(p) && !exiting_task(p))
                        p->ravg.curr_window = delta;

                return;
        }

        if (irqtime) {
                /* account_busy_for_cpu_time() = 1 so busy time needs
                 * to be accounted to the current window. A new window
                 * has started and p is the current task so rollover is
                 * needed. The current task must be the idle task because
                 * irqtime is not accounted for any other task.
                 *
                 * Irqtime will be accounted each time we process IRQ activity
                 * after a period of idleness, so we know the IRQ busy time
                 * started at wallclock - irqtime. */

                BUG_ON(!is_idle_task(p));
                mark_start = wallclock - irqtime;

                /* Roll window over. If IRQ busy time was just in the current
                 * window then that is all that need be accounted. */
                rq->prev_runnable_sum = rq->curr_runnable_sum;
                if (mark_start > window_start) {
                        rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
                        return;
                }

                /* The IRQ busy time spanned multiple windows. Process the
                 * busy time preceding the current window start first. */
                delta = window_start - mark_start;
                if (delta > window_size)
                        delta = window_size;
                delta = scale_exec_time(delta, rq);
                rq->prev_runnable_sum += delta;

                /* Process the remaining IRQ busy time in the current window. */
                delta = wallclock - window_start;
                rq->curr_runnable_sum = scale_exec_time(delta, rq);

                return;
        }

        BUG();
}
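
/*
 * In short, the four accounted cases above are: (1) no new window - add the
 * delta to the current window; (2) a new window but p is not rq->curr - split
 * the delta between prev_window and curr_window without rolling the rq
 * counters over; (3) a new window with p == rq->curr - roll curr_runnable_sum
 * into prev_runnable_sum while splitting the delta; (4) irqtime on the idle
 * task - roll over and attribute the irq busy time to the window(s) it
 * actually spanned.
 */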

static int account_busy_for_task_demand(struct task_struct *p, int event)
{
        /* No need to bother updating task demand for exiting tasks
         * or the idle task. */
        if (exiting_task(p) || is_idle_task(p))
                return 0;

        /* When a task is waking up it is completing a segment of non-busy
         * time. Likewise, if wait time is not treated as busy time, then
         * when a task begins to run or is migrated, it is not running and
         * is completing a segment of non-busy time. */
        if (event == TASK_WAKE || (!walt_account_wait_time &&
            (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
                return 0;

        return 1;
}

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
                           u32 runtime, int samples, int event)
{
        u32 *hist = &p->ravg.sum_history[0];
        int ridx, widx;
        u32 max = 0, avg, demand;
        u64 sum = 0;

        /* Ignore windows where task had no activity */
        if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
                goto done;

        /* Push new 'runtime' value onto stack */
        widx = walt_ravg_hist_size - 1;
        ridx = widx - samples;
        for (; ridx >= 0; --widx, --ridx) {
                hist[widx] = hist[ridx];
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
                hist[widx] = runtime;
                sum += hist[widx];
                if (hist[widx] > max)
                        max = hist[widx];
        }

        p->ravg.sum = 0;

        if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
                demand = runtime;
        } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
                demand = max;
        } else {
                avg = div64_u64(sum, walt_ravg_hist_size);
                if (walt_window_stats_policy == WINDOW_STATS_AVG)
                        demand = avg;
                else
                        demand = max(avg, runtime);
        }

        /*
         * A throttled deadline sched class task gets dequeued without
         * changing p->on_rq. Since the dequeue decrements hmp stats,
         * avoid decrementing it here again.
         *
         * When the window is rolled over, the cumulative window demand
         * is reset to the cumulative runnable average (contribution from
         * the tasks on the runqueue). If the current task is dequeued
         * already, its demand is not included in the cumulative runnable
         * average. So add the task demand separately to cumulative window
         * demand.
         */
        if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
                if (task_on_rq_queued(p))
                        fixup_cumulative_runnable_avg(rq, p, demand);
                else if (rq->curr == p)
                        fixup_cum_window_demand(rq, demand);
        }

        p->ravg.demand = demand;

done:
        trace_walt_update_history(rq, p, runtime, samples, event);
        return;
}
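
/*
 * Worked example (assumed numbers): with walt_ravg_hist_size = 5, a history
 * of {7, 6, 5, 4, 3} (most recent first, in ms for readability) and one new
 * sample of 2 ms, the history becomes {2, 7, 6, 5, 4}. Under the default
 * WINDOW_STATS_MAX_RECENT_AVG policy the demand is max(avg, runtime) =
 * max(24 / 5, 2) = 4 ms (integer division), while WINDOW_STATS_MAX would
 * pick 7 ms.
 */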

static void add_to_task_demand(struct rq *rq, struct task_struct *p,
                               u64 delta)
{
        delta = scale_exec_time(delta, rq);
        p->ravg.sum += delta;
        if (unlikely(p->ravg.sum > walt_ravg_window))
                p->ravg.sum = walt_ravg_window;
}

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 * a) Task event is contained within one window.
 *              window_start < mark_start < wallclock
 *
 *              ws   ms  wc
 *              |    |   |
 *              V    V   V
 *              |---------------|
 *
 *      In this case, p->ravg.sum is updated *iff* event is appropriate
 *      (ex: event == PUT_PREV_TASK)
 *
 * b) Task event spans two windows.
 *              mark_start < window_start < wallclock
 *
 *              ms   ws   wc
 *              |    |    |
 *              V    V    V
 *              -----|-------------------
 *
 *      In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *      is appropriate, then a new window sample is recorded followed
 *      by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 * c) Task event spans more than two windows.
 *
 *              ms ws_tmp                          ws  wc
 *              |  |                               |   |
 *              V  V                               V   V
 *              ---|-------|-------|-------|-------|------
 *                 |                               |
 *                 |<------ nr_full_windows ------>|
 *
 *      In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *      event is appropriate, a window sample of p->ravg.sum is recorded,
 *      'nr_full_windows' samples of window_size are also recorded *iff*
 *      event is appropriate, and finally p->ravg.sum is set to (wc - ws)
 *      *iff* event is appropriate.
 *
 * IMPORTANT: Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static void update_task_demand(struct task_struct *p, struct rq *rq,
                               int event, u64 wallclock)
{
        u64 mark_start = p->ravg.mark_start;
        u64 delta, window_start = rq->window_start;
        int new_window, nr_full_windows;
        u32 window_size = walt_ravg_window;

        new_window = mark_start < window_start;
        if (!account_busy_for_task_demand(p, event)) {
                if (new_window)
                        /* If the time accounted isn't being accounted as
                         * busy time, and a new window started, only the
                         * previous window need be closed out with the
                         * pre-existing demand. Multiple windows may have
                         * elapsed, but since empty windows are dropped,
                         * it is not necessary to account those. */
                        update_history(rq, p, p->ravg.sum, 1, event);
                return;
        }

        if (!new_window) {
                /* The simple case - busy time contained within the existing
                 * window. */
                add_to_task_demand(rq, p, wallclock - mark_start);
                return;
        }

        /* Busy time spans at least two windows. Temporarily rewind
         * window_start to first window boundary after mark_start. */
        delta = window_start - mark_start;
        nr_full_windows = div64_u64(delta, window_size);
        window_start -= (u64)nr_full_windows * (u64)window_size;

        /* Process (window_start - mark_start) first */
        add_to_task_demand(rq, p, window_start - mark_start);

        /* Push new sample(s) into task's demand history */
        update_history(rq, p, p->ravg.sum, 1, event);
        if (nr_full_windows)
                update_history(rq, p, scale_exec_time(window_size, rq),
                               nr_full_windows, event);

        /* Roll window_start back to current to process any remainder
         * in current window. */
        window_start += (u64)nr_full_windows * (u64)window_size;

        /* Process (wallclock - window_start) next */
        mark_start = window_start;
        add_to_task_demand(rq, p, wallclock - mark_start);
}
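
/*
 * Worked example (assumed numbers, 20 ms windows): suppose a task's
 * mark_start lies 5 ms into window N and wallclock falls 3 ms into window
 * N+2. window_start is temporarily rewound to the start of window N+1, the
 * remaining 15 ms of window N are added to p->ravg.sum and recorded as that
 * window's sample, one full 20 ms sample is recorded for window N+1, and
 * p->ravg.sum is left holding the scaled 3 ms already spent in window N+2.
 */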

/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
                           int event, u64 wallclock, u64 irqtime)
{
        if (walt_disabled || !rq->window_start)
                return;

        lockdep_assert_held(&rq->lock);

        update_window_start(rq, wallclock);

        if (!p->ravg.mark_start)
                goto done;

        update_task_demand(p, rq, event, wallclock);
        update_cpu_busy_time(p, rq, event, wallclock, irqtime);

done:
        trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);

        p->ravg.mark_start = wallclock;
}

static void reset_task_stats(struct task_struct *p)
{
        u32 sum = 0;

        if (exiting_task(p))
                sum = EXITING_TASK_MARKER;

        memset(&p->ravg, 0, sizeof(struct ravg));
        /* Retain EXITING_TASK marker */
        p->ravg.sum_history[0] = sum;
}

void walt_mark_task_starting(struct task_struct *p)
{
        u64 wallclock;
        struct rq *rq = task_rq(p);

        if (!rq->window_start) {
                reset_task_stats(p);
                return;
        }

        wallclock = walt_ktime_clock();
        p->ravg.mark_start = wallclock;
}

void walt_set_window_start(struct rq *rq)
{
        int cpu = cpu_of(rq);
        struct rq *sync_rq = cpu_rq(sync_cpu);

        if (likely(rq->window_start))
                return;

        if (cpu == sync_cpu) {
                rq->window_start = 1;
        } else {
                raw_spin_unlock(&rq->lock);
                double_rq_lock(rq, sync_rq);
                rq->window_start = cpu_rq(sync_cpu)->window_start;
                rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
                raw_spin_unlock(&sync_rq->lock);
        }

        rq->curr->ravg.mark_start = rq->window_start;
}

void walt_migrate_sync_cpu(int cpu)
{
        if (cpu == sync_cpu)
                sync_cpu = smp_processor_id();
}

void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
{
        struct rq *src_rq = task_rq(p);
        struct rq *dest_rq = cpu_rq(new_cpu);
        u64 wallclock;

        if (!p->on_rq && p->state != TASK_WAKING)
                return;

        if (exiting_task(p))
                return;

        if (p->state == TASK_WAKING)
                double_rq_lock(src_rq, dest_rq);

        wallclock = walt_ktime_clock();

        walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
                              TASK_UPDATE, wallclock, 0);
        walt_update_task_ravg(dest_rq->curr, dest_rq,
                              TASK_UPDATE, wallclock, 0);

        walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);

        /*
         * When a task is migrating during the wakeup, adjust
         * the task's contribution towards cumulative window
         * demand.
         */
        if (p->state == TASK_WAKING &&
            p->last_sleep_ts >= src_rq->window_start) {
                fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
                fixup_cum_window_demand(dest_rq, p->ravg.demand);
        }

        if (p->ravg.curr_window) {
                src_rq->curr_runnable_sum -= p->ravg.curr_window;
                dest_rq->curr_runnable_sum += p->ravg.curr_window;
        }

        if (p->ravg.prev_window) {
                src_rq->prev_runnable_sum -= p->ravg.prev_window;
                dest_rq->prev_runnable_sum += p->ravg.prev_window;
        }

        if ((s64)src_rq->prev_runnable_sum < 0) {
                src_rq->prev_runnable_sum = 0;
                WARN_ON(1);
        }
        if ((s64)src_rq->curr_runnable_sum < 0) {
                src_rq->curr_runnable_sum = 0;
                WARN_ON(1);
        }

        trace_walt_migration_update_sum(src_rq, p);
        trace_walt_migration_update_sum(dest_rq, p);

        if (p->state == TASK_WAKING)
                double_rq_unlock(src_rq, dest_rq);
}

void walt_init_new_task_load(struct task_struct *p)
{
        int i;
        u32 init_load_windows =
                div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
                          (u64)walt_ravg_window, 100);
        u32 init_load_pct = current->init_load_pct;

        p->init_load_pct = 0;
        memset(&p->ravg, 0, sizeof(struct ravg));

        if (init_load_pct) {
                init_load_windows = div64_u64((u64)init_load_pct *
                                              (u64)walt_ravg_window, 100);
        }

        p->ravg.demand = init_load_windows;
        for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
                p->ravg.sum_history[i] = init_load_windows;
}
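
/*
 * For example, with the default 15% initial task load and a 20 ms window, a
 * newly forked task starts with ravg.demand (and every history slot) set to
 * 20000000 * 15 / 100 = 3000000 ns.
 */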