/*
 * Copyright (c) 2016, The Linux Foundation. All rights reserved.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 and
 * only version 2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 *
 * Window Assisted Load Tracking (WALT) implementation credits:
 * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
 * Pavan Kumar Kondeti, Olav Haugan
 *
 * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
 *             and Todd Kjos
 */

#include <linux/syscore_ops.h>
#include <trace/events/sched.h>
#include "sched.h"
#include "walt.h"

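/*
 * Policies for deriving a task's demand from its sum history (see
 * update_history()): the most recent window, the maximum windowed sum,
 * the larger of the average and the most recent window, or the plain
 * average. WINDOW_STATS_INVALID_POLICY marks the first out-of-range value.
 */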
#define WINDOW_STATS_RECENT		0
#define WINDOW_STATS_MAX		1
#define WINDOW_STATS_MAX_RECENT_AVG	2
#define WINDOW_STATS_AVG		3
#define WINDOW_STATS_INVALID_POLICY	4

#define EXITING_TASK_MARKER	0xdeaddead

static __read_mostly unsigned int walt_ravg_hist_size = 5;
static __read_mostly unsigned int walt_window_stats_policy =
	WINDOW_STATS_MAX_RECENT_AVG;
static __read_mostly unsigned int walt_account_wait_time = 1;
static __read_mostly unsigned int walt_freq_account_wait_time = 0;
static __read_mostly unsigned int walt_io_is_busy = 0;

unsigned int sysctl_sched_walt_init_task_load_pct = 15;

/* true -> use PELT based load stats, false -> use window-based load stats */
bool __read_mostly walt_disabled = false;

/*
 * Window size (in ns). Adjust for the tick size so that the window
 * rollover occurs just before the tick boundary; e.g. with CONFIG_HZ=300
 * (TICK_NSEC = 3333333) the nominal 20ms window becomes 19999998 ns.
 */
__read_mostly unsigned int walt_ravg_window =
					    (20000000 / TICK_NSEC) * TICK_NSEC;
#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)

static unsigned int sync_cpu;
static ktime_t ktime_last;
static bool walt_ktime_suspended;

static unsigned int task_load(struct task_struct *p)
{
	return p->ravg.demand;
}

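/*
 * Adjust the CPU's cumulative window demand by 'delta' and clamp the
 * result at zero so it never goes negative.
 */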
static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
{
	rq->cum_window_demand += delta;
	if (unlikely((s64)rq->cum_window_demand < 0))
		rq->cum_window_demand = 0;
}

void
walt_inc_cumulative_runnable_avg(struct rq *rq,
				 struct task_struct *p)
{
	rq->cumulative_runnable_avg += p->ravg.demand;

	/*
	 * Add a task's contribution to the cumulative window demand when
	 *
	 * (1) task is enqueued with on_rq = 1 i.e. migration,
	 *     prio/cgroup/class change.
	 * (2) task is waking for the first time in this window.
	 */
	if (p->on_rq || (p->last_sleep_ts < rq->window_start))
		fixup_cum_window_demand(rq, p->ravg.demand);
}

void
walt_dec_cumulative_runnable_avg(struct rq *rq,
				 struct task_struct *p)
{
	rq->cumulative_runnable_avg -= p->ravg.demand;
	BUG_ON((s64)rq->cumulative_runnable_avg < 0);

	/*
	 * on_rq will be 1 for sleeping tasks. So check if the task
	 * is migrating or dequeuing in RUNNING state to change the
	 * prio/cgroup/class.
	 */
	if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
		fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
}

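/*
 * Replace a task's old demand with 'new_task_load' in the CPU's
 * cumulative_runnable_avg and cumulative window demand.
 */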
static void
fixup_cumulative_runnable_avg(struct rq *rq,
			      struct task_struct *p, u64 new_task_load)
{
	s64 task_load_delta = (s64)new_task_load - task_load(p);

	rq->cumulative_runnable_avg += task_load_delta;
	if ((s64)rq->cumulative_runnable_avg < 0)
		panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
			task_load_delta, task_load(p));

	fixup_cum_window_demand(rq, task_load_delta);
}

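/*
 * WALT timebase: ktime_get_ns() while the system is running, frozen at
 * the last pre-suspend reading (ktime_last) across suspend so that
 * suspended time is not accounted as window time.
 */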
u64 walt_ktime_clock(void)
{
	if (unlikely(walt_ktime_suspended))
		return ktime_to_ns(ktime_last);
	return ktime_get_ns();
}

static void walt_resume(void)
{
	walt_ktime_suspended = false;
}

static int walt_suspend(void)
{
	ktime_last = ktime_get();
	walt_ktime_suspended = true;
	return 0;
}

static struct syscore_ops walt_syscore_ops = {
	.resume	= walt_resume,
	.suspend = walt_suspend
};

static int __init walt_init_ops(void)
{
	register_syscore_ops(&walt_syscore_ops);
	return 0;
}
late_initcall(walt_init_ops);

void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
		struct task_struct *p)
{
	cfs_rq->cumulative_runnable_avg += p->ravg.demand;
}

void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
		struct task_struct *p)
{
	cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
}

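/*
 * Exiting tasks are flagged by writing EXITING_TASK_MARKER into slot 0 of
 * their sum history; their windowed stats are no longer maintained.
 */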
static int exiting_task(struct task_struct *p)
{
	if (p->flags & PF_EXITING) {
		if (p->ravg.sum_history[0] != EXITING_TASK_MARKER)
			p->ravg.sum_history[0] = EXITING_TASK_MARKER;
		return 1;
	}
	return 0;
}

static int __init set_walt_ravg_window(char *str)
{
	unsigned int adj_window;
	bool no_walt = walt_disabled;

	get_option(&str, &walt_ravg_window);

	/* Adjust for CONFIG_HZ */
	adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;

	/* Warn if we're a bit too far away from the expected window size */
	WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
	     "tick-adjusted window size %u, original was %u\n", adj_window,
	     walt_ravg_window);

	walt_ravg_window = adj_window;

	walt_disabled = walt_disabled ||
			(walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
			 walt_ravg_window > MAX_SCHED_RAVG_WINDOW);

	WARN(!no_walt && walt_disabled,
	     "invalid window size, disabling WALT\n");

	return 0;
}

early_param("walt_ravg_window", set_walt_ravg_window);

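/*
 * Advance rq->window_start to the start of the window containing
 * 'wallclock', stepping in whole window multiples, and reset the
 * cumulative window demand to the demand of the currently runnable tasks.
 */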
static void
update_window_start(struct rq *rq, u64 wallclock)
{
	s64 delta;
	int nr_windows;

	delta = wallclock - rq->window_start;
	/*
	 * If the wallclock appears to have gone backwards (e.g. the MPM
	 * global timer was cleared), treat delta as 0 to avoid a kernel BUG.
	 */
	if (delta < 0) {
		delta = 0;
		WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
	}

	if (delta < walt_ravg_window)
		return;

	nr_windows = div64_u64(delta, walt_ravg_window);
	rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;

	rq->cum_window_demand = rq->cumulative_runnable_avg;
}

/*
 * Translate absolute delta time accounted on a CPU
 * to a scale where 1024 is the capacity of the most
 * capable CPU running at FMAX
 */
static u64 scale_exec_time(u64 delta, struct rq *rq)
{
	unsigned long capcurr = capacity_curr_of(cpu_of(rq));

	return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
}

static int cpu_is_waiting_on_io(struct rq *rq)
{
	if (!walt_io_is_busy)
		return 0;

	return atomic_read(&rq->nr_iowait);
}

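/*
 * Account 'delta' ns of IRQ/softirq time on 'cpu'. The CPU's average
 * irqload is decayed whenever one or more jiffies have elapsed since the
 * last update, and idle-time IRQ activity is also fed into the busy-time
 * windows via IRQ_UPDATE.
 */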
void walt_account_irqtime(int cpu, struct task_struct *curr,
				 u64 delta, u64 wallclock)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags, nr_windows;
	u64 cur_jiffies_ts;

	raw_spin_lock_irqsave(&rq->lock, flags);

	/*
	 * cputime (wallclock) uses sched_clock so use the same here for
	 * consistency.
	 */
	delta += sched_clock() - wallclock;
	cur_jiffies_ts = get_jiffies_64();

	if (is_idle_task(curr))
		walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
				 delta);

	nr_windows = cur_jiffies_ts - rq->irqload_ts;

	if (nr_windows) {
		if (nr_windows < 10) {
			/* Decay CPU's irqload by 3/4 for each window. */
			rq->avg_irqload *= (3 * nr_windows);
			rq->avg_irqload = div64_u64(rq->avg_irqload,
						    4 * nr_windows);
		} else {
			rq->avg_irqload = 0;
		}
		rq->avg_irqload += rq->cur_irqload;
		rq->cur_irqload = 0;
	}

	rq->cur_irqload += delta;
	rq->irqload_ts = cur_jiffies_ts;
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}


#define WALT_HIGH_IRQ_TIMEOUT 3

u64 walt_irqload(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	s64 delta;
	delta = get_jiffies_64() - rq->irqload_ts;

	/*
	 * The current context can be preempted by an irq and rq->irqload_ts
	 * can be updated from irq context, so delta can be negative. That is
	 * okay and we can safely return, as it means there was a recent irq
	 * occurrence.
	 */
	if (delta < WALT_HIGH_IRQ_TIMEOUT)
		return rq->avg_irqload;
	else
		return 0;
}

int walt_cpu_high_irqload(int cpu)
{
	return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
}

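/*
 * Decide whether the time since the last event should contribute to the
 * CPU's busy-time counters for the given event type. Wait time is only
 * charged when walt_freq_account_wait_time is set, and idle time only
 * counts when it was spent handling IRQs or waiting on IO.
 */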
static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
				     u64 irqtime, int event)
{
	if (is_idle_task(p)) {
		/* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
		if (event == PICK_NEXT_TASK)
			return 0;

		/* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
		return irqtime || cpu_is_waiting_on_io(rq);
	}

	if (event == TASK_WAKE)
		return 0;

	if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
					 event == TASK_UPDATE)
		return 1;

	/* Only TASK_MIGRATE && PICK_NEXT_TASK left */
	return walt_freq_account_wait_time;
}

/*
 * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
 */
static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock, u64 irqtime)
{
	int new_window, nr_full_windows = 0;
	int p_is_curr_task = (p == rq->curr);
	u64 mark_start = p->ravg.mark_start;
	u64 window_start = rq->window_start;
	u32 window_size = walt_ravg_window;
	u64 delta;

	new_window = mark_start < window_start;
	if (new_window) {
		nr_full_windows = div64_u64((window_start - mark_start),
						window_size);
		if (p->ravg.active_windows < USHRT_MAX)
			p->ravg.active_windows++;
	}

	/* Handle per-task window rollover. We don't care about the idle
	 * task or exiting tasks. */
	if (new_window && !is_idle_task(p) && !exiting_task(p)) {
		u32 curr_window = 0;

		if (!nr_full_windows)
			curr_window = p->ravg.curr_window;

		p->ravg.prev_window = curr_window;
		p->ravg.curr_window = 0;
	}

	if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
		/* account_busy_for_cpu_time() = 0, so no update to the
		 * task's current window needs to be made. This could be
		 * for example
		 *
		 *   - a wakeup event on a task within the current
		 *     window (!new_window below, no action required),
		 *   - switching to a new task from idle (PICK_NEXT_TASK)
		 *     in a new window where irqtime is 0 and we aren't
		 *     waiting on IO */

		if (!new_window)
			return;

		/* A new window has started. The RQ demand must be rolled
		 * over if p is the current task. */
		if (p_is_curr_task) {
			u64 prev_sum = 0;

			/* p is either idle task or an exiting task */
			if (!nr_full_windows) {
				prev_sum = rq->curr_runnable_sum;
			}

			rq->prev_runnable_sum = prev_sum;
			rq->curr_runnable_sum = 0;
		}

		return;
	}

	if (!new_window) {
		/* account_busy_for_cpu_time() = 1 so busy time needs
		 * to be accounted to the current window. No rollover
		 * since we didn't start a new window. An example of this is
		 * when a task starts execution and then sleeps within the
		 * same window. */

		if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
			delta = wallclock - mark_start;
		else
			delta = irqtime;
		delta = scale_exec_time(delta, rq);
		rq->curr_runnable_sum += delta;
		if (!is_idle_task(p) && !exiting_task(p))
			p->ravg.curr_window += delta;

		return;
	}

	if (!p_is_curr_task) {
		/* account_busy_for_cpu_time() = 1 so busy time needs
		 * to be accounted to the current window. A new window
		 * has also started, but p is not the current task, so the
		 * window is not rolled over - just split up and account
		 * as necessary into curr and prev. The window is only
		 * rolled over when a new window is processed for the current
		 * task.
		 *
		 * Irqtime can't be accounted by a task that isn't the
		 * currently running task. */

		if (!nr_full_windows) {
			/* A full window hasn't elapsed, account partial
			 * contribution to previous completed window. */
			delta = scale_exec_time(window_start - mark_start, rq);
			if (!exiting_task(p))
				p->ravg.prev_window += delta;
		} else {
			/* Since at least one full window has elapsed,
			 * the contribution to the previous window is the
			 * full window (window_size). */
			delta = scale_exec_time(window_size, rq);
			if (!exiting_task(p))
				p->ravg.prev_window = delta;
		}
		rq->prev_runnable_sum += delta;

		/* Account piece of busy time in the current window. */
		delta = scale_exec_time(wallclock - window_start, rq);
		rq->curr_runnable_sum += delta;
		if (!exiting_task(p))
			p->ravg.curr_window = delta;

		return;
	}

	if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
		/* account_busy_for_cpu_time() = 1 so busy time needs
		 * to be accounted to the current window. A new window
		 * has started and p is the current task so rollover is
		 * needed. If any of these three above conditions are true
		 * then this busy time can't be accounted as irqtime.
		 *
		 * Busy time for the idle task or exiting tasks need not
		 * be accounted.
		 *
		 * An example of this would be a task that starts execution
		 * and then sleeps once a new window has begun. */

		if (!nr_full_windows) {
			/* A full window hasn't elapsed, account partial
			 * contribution to previous completed window. */
			delta = scale_exec_time(window_start - mark_start, rq);
			if (!is_idle_task(p) && !exiting_task(p))
				p->ravg.prev_window += delta;

			delta += rq->curr_runnable_sum;
		} else {
			/* Since at least one full window has elapsed,
			 * the contribution to the previous window is the
			 * full window (window_size). */
			delta = scale_exec_time(window_size, rq);
			if (!is_idle_task(p) && !exiting_task(p))
				p->ravg.prev_window = delta;
		}
		/*
		 * Rollover for normal runnable sum is done here by overwriting
		 * the values in prev_runnable_sum and curr_runnable_sum.
		 * Rollover for new task runnable sum has completed by previous
		 * if-else statement.
		 */
		rq->prev_runnable_sum = delta;

		/* Account piece of busy time in the current window. */
		delta = scale_exec_time(wallclock - window_start, rq);
		rq->curr_runnable_sum = delta;
		if (!is_idle_task(p) && !exiting_task(p))
			p->ravg.curr_window = delta;

		return;
	}

	if (irqtime) {
		/* account_busy_for_cpu_time() = 1 so busy time needs
		 * to be accounted to the current window. A new window
		 * has started and p is the current task so rollover is
		 * needed. The current task must be the idle task because
		 * irqtime is not accounted for any other task.
		 *
		 * Irqtime will be accounted each time we process IRQ activity
		 * after a period of idleness, so we know the IRQ busy time
		 * started at wallclock - irqtime. */

		BUG_ON(!is_idle_task(p));
		mark_start = wallclock - irqtime;

		/* Roll window over. If IRQ busy time was just in the current
		 * window then that is all that need be accounted. */
		rq->prev_runnable_sum = rq->curr_runnable_sum;
		if (mark_start > window_start) {
			rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
			return;
		}

		/* The IRQ busy time spanned multiple windows. Process the
		 * busy time preceding the current window start first. */
		delta = window_start - mark_start;
		if (delta > window_size)
			delta = window_size;
		delta = scale_exec_time(delta, rq);
		rq->prev_runnable_sum += delta;

		/* Process the remaining IRQ busy time in the current window. */
		delta = wallclock - window_start;
		rq->curr_runnable_sum = scale_exec_time(delta, rq);

		return;
	}

	BUG();
}

static int account_busy_for_task_demand(struct task_struct *p, int event)
{
	/* No need to bother updating task demand for exiting tasks
	 * or the idle task. */
	if (exiting_task(p) || is_idle_task(p))
		return 0;

	/* When a task is waking up it is completing a segment of non-busy
	 * time. Likewise, if wait time is not treated as busy time, then
	 * when a task begins to run or is migrated, it is not running and
	 * is completing a segment of non-busy time. */
	if (event == TASK_WAKE || (!walt_account_wait_time &&
			 (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
		return 0;

	return 1;
}

/*
 * Called when new window is starting for a task, to record cpu usage over
 * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
 * when, say, a real-time task runs without preemption for several windows at a
 * stretch.
 */
static void update_history(struct rq *rq, struct task_struct *p,
			 u32 runtime, int samples, int event)
{
	u32 *hist = &p->ravg.sum_history[0];
	int ridx, widx;
	u32 max = 0, avg, demand;
	u64 sum = 0;

	/* Ignore windows where task had no activity */
	if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
		goto done;

	/* Push new 'runtime' value onto stack */
	widx = walt_ravg_hist_size - 1;
	ridx = widx - samples;
	for (; ridx >= 0; --widx, --ridx) {
		hist[widx] = hist[ridx];
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
		hist[widx] = runtime;
		sum += hist[widx];
		if (hist[widx] > max)
			max = hist[widx];
	}

	p->ravg.sum = 0;

	if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
		demand = runtime;
	} else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
		demand = max;
	} else {
		avg = div64_u64(sum, walt_ravg_hist_size);
		if (walt_window_stats_policy == WINDOW_STATS_AVG)
			demand = avg;
		else
			demand = max(avg, runtime);
	}

	/*
	 * A throttled deadline sched class task gets dequeued without
	 * changing p->on_rq. Since the dequeue decrements hmp stats
	 * avoid decrementing it here again.
	 *
	 * When the window is rolled over, the cumulative window demand
	 * is reset to the cumulative runnable average (contribution from
	 * the tasks on the runqueue). If the current task is dequeued
	 * already, its demand is not included in the cumulative runnable
	 * average. So add the task demand separately to cumulative window
	 * demand.
	 */
	if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
		if (task_on_rq_queued(p))
			fixup_cumulative_runnable_avg(rq, p, demand);
		else if (rq->curr == p)
			fixup_cum_window_demand(rq, demand);
	}

	p->ravg.demand = demand;

done:
	trace_walt_update_history(rq, p, runtime, samples, event);
	return;
}

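/*
 * Add 'delta' ns of execution, scaled to the reference capacity, to the
 * task's running window sum, capped at one full window.
 */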
static void add_to_task_demand(struct rq *rq, struct task_struct *p,
				u64 delta)
{
	delta = scale_exec_time(delta, rq);
	p->ravg.sum += delta;
	if (unlikely(p->ravg.sum > walt_ravg_window))
		p->ravg.sum = walt_ravg_window;
}

/*
 * Account cpu demand of task and/or update task's cpu demand history
 *
 * ms = p->ravg.mark_start;
 * wc = wallclock
 * ws = rq->window_start
 *
 * Three possibilities:
 *
 *	a) Task event is contained within one window.
 *		window_start < mark_start < wallclock
 *
 *		ws   ms  wc
 *		|    |   |
 *		V    V   V
 *		|---------------|
 *
 *	In this case, p->ravg.sum is updated *iff* event is appropriate
 *	(ex: event == PUT_PREV_TASK)
 *
 *	b) Task event spans two windows.
 *		mark_start < window_start < wallclock
 *
 *		ms   ws   wc
 *		|    |    |
 *		V    V    V
 *		-----|-------------------
 *
 *	In this case, p->ravg.sum is updated with (ws - ms) *iff* event
 *	is appropriate, then a new window sample is recorded followed
 *	by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
 *
 *	c) Task event spans more than two windows.
 *
 *		ms ws_tmp			   ws  wc
 *		|  |				   |   |
 *		V  V				   V   V
 *		---|-------|-------|-------|-------|------
 *		   |				   |
 *		   |<------ nr_full_windows ------>|
 *
 *	In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
 *	event is appropriate, window sample of p->ravg.sum is recorded,
 *	'nr_full_window' samples of window_size is also recorded *iff*
 *	event is appropriate and finally p->ravg.sum is set to (wc - ws)
 *	*iff* event is appropriate.
 *
 * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
 * depends on it!
 */
static void update_task_demand(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock)
{
	u64 mark_start = p->ravg.mark_start;
	u64 delta, window_start = rq->window_start;
	int new_window, nr_full_windows;
	u32 window_size = walt_ravg_window;

	new_window = mark_start < window_start;
	if (!account_busy_for_task_demand(p, event)) {
		if (new_window)
			/* If the time accounted isn't being accounted as
			 * busy time, and a new window started, only the
			 * previous window need be closed out with the
			 * pre-existing demand. Multiple windows may have
			 * elapsed, but since empty windows are dropped,
			 * it is not necessary to account those. */
			update_history(rq, p, p->ravg.sum, 1, event);
		return;
	}

	if (!new_window) {
		/* The simple case - busy time contained within the existing
		 * window. */
		add_to_task_demand(rq, p, wallclock - mark_start);
		return;
	}

	/* Busy time spans at least two windows. Temporarily rewind
	 * window_start to first window boundary after mark_start. */
	delta = window_start - mark_start;
	nr_full_windows = div64_u64(delta, window_size);
	window_start -= (u64)nr_full_windows * (u64)window_size;

	/* Process (window_start - mark_start) first */
	add_to_task_demand(rq, p, window_start - mark_start);

	/* Push new sample(s) into task's demand history */
	update_history(rq, p, p->ravg.sum, 1, event);
	if (nr_full_windows)
		update_history(rq, p, scale_exec_time(window_size, rq),
			       nr_full_windows, event);

	/* Roll window_start back to current to process any remainder
	 * in current window. */
	window_start += (u64)nr_full_windows * (u64)window_size;

	/* Process (wallclock - window_start) next */
	mark_start = window_start;
	add_to_task_demand(rq, p, wallclock - mark_start);
}

/* Reflect task activity on its demand and cpu's busy time statistics */
void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
	     int event, u64 wallclock, u64 irqtime)
{
	if (walt_disabled || !rq->window_start)
		return;

	lockdep_assert_held(&rq->lock);

	update_window_start(rq, wallclock);

	if (!p->ravg.mark_start)
		goto done;

	update_task_demand(p, rq, event, wallclock);
	update_cpu_busy_time(p, rq, event, wallclock, irqtime);

done:
	trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);

	p->ravg.mark_start = wallclock;
}

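/*
 * Clear a task's WALT statistics, preserving only the EXITING_TASK_MARKER
 * for tasks that are already exiting.
 */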
static void reset_task_stats(struct task_struct *p)
{
	u32 sum = 0;

	if (exiting_task(p))
		sum = EXITING_TASK_MARKER;

	memset(&p->ravg, 0, sizeof(struct ravg));
	/* Retain EXITING_TASK marker */
	p->ravg.sum_history[0] = sum;
}

void walt_mark_task_starting(struct task_struct *p)
{
	u64 wallclock;
	struct rq *rq = task_rq(p);

	if (!rq->window_start) {
		reset_task_stats(p);
		return;
	}

	wallclock = walt_ktime_clock();
	p->ravg.mark_start = wallclock;
}

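/*
 * Initialize this runqueue's window_start. The sync CPU seeds its own
 * window_start with 1; every other CPU copies the sync CPU's value so
 * that all CPUs share the same window boundaries.
 */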
void walt_set_window_start(struct rq *rq)
{
	int cpu = cpu_of(rq);
	struct rq *sync_rq = cpu_rq(sync_cpu);

	if (likely(rq->window_start))
		return;

	if (cpu == sync_cpu) {
		rq->window_start = 1;
	} else {
		raw_spin_unlock(&rq->lock);
		double_rq_lock(rq, sync_rq);
		rq->window_start = cpu_rq(sync_cpu)->window_start;
		rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
		raw_spin_unlock(&sync_rq->lock);
	}

	rq->curr->ravg.mark_start = rq->window_start;
}

void walt_migrate_sync_cpu(int cpu)
{
	if (cpu == sync_cpu)
		sync_cpu = smp_processor_id();
}

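/*
 * A task is moving to 'new_cpu': bring both runqueues up to date, then
 * transfer the task's current and previous window contributions (and,
 * for a wakeup migration, its cumulative window demand) from the source
 * runqueue to the destination runqueue.
 */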
void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
{
	struct rq *src_rq = task_rq(p);
	struct rq *dest_rq = cpu_rq(new_cpu);
	u64 wallclock;

	if (!p->on_rq && p->state != TASK_WAKING)
		return;

	if (exiting_task(p))
		return;

	if (p->state == TASK_WAKING)
		double_rq_lock(src_rq, dest_rq);

	wallclock = walt_ktime_clock();

	walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
			TASK_UPDATE, wallclock, 0);
	walt_update_task_ravg(dest_rq->curr, dest_rq,
			TASK_UPDATE, wallclock, 0);

	walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);

	/*
	 * When a task is migrating during the wakeup, adjust
	 * the task's contribution towards cumulative window
	 * demand.
	 */
	if (p->state == TASK_WAKING &&
	    p->last_sleep_ts >= src_rq->window_start) {
		fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
		fixup_cum_window_demand(dest_rq, p->ravg.demand);
	}

	if (p->ravg.curr_window) {
		src_rq->curr_runnable_sum -= p->ravg.curr_window;
		dest_rq->curr_runnable_sum += p->ravg.curr_window;
	}

	if (p->ravg.prev_window) {
		src_rq->prev_runnable_sum -= p->ravg.prev_window;
		dest_rq->prev_runnable_sum += p->ravg.prev_window;
	}

	if ((s64)src_rq->prev_runnable_sum < 0) {
		src_rq->prev_runnable_sum = 0;
		WARN_ON(1);
	}
	if ((s64)src_rq->curr_runnable_sum < 0) {
		src_rq->curr_runnable_sum = 0;
		WARN_ON(1);
	}

	trace_walt_migration_update_sum(src_rq, p);
	trace_walt_migration_update_sum(dest_rq, p);

	if (p->state == TASK_WAKING)
		double_rq_unlock(src_rq, dest_rq);
}

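/*
 * Seed a new task's demand and sum history from the init task load
 * percentage (inherited from the parent when set, otherwise the
 * sysctl_sched_walt_init_task_load_pct default) applied to the window size.
 */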
void walt_init_new_task_load(struct task_struct *p)
{
	int i;
	u32 init_load_windows =
			div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
				  (u64)walt_ravg_window, 100);
	u32 init_load_pct = current->init_load_pct;

	p->init_load_pct = 0;
	memset(&p->ravg, 0, sizeof(struct ravg));

	if (init_load_pct) {
		init_load_windows = div64_u64((u64)init_load_pct *
			  (u64)walt_ravg_window, 100);
	}

	p->ravg.demand = init_load_windows;
	for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
		p->ravg.sum_history[i] = init_load_windows;
}