// SPDX-License-Identifier: GPL-2.0
/*
 * Timer events oriented CPU idle governor
 *
 * Copyright (C) 2018 - 2021 Intel Corporation
 * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
 */

/**
 * DOC: teo-description
 *
 * The idea of this governor is based on the observation that on many systems
 * timer interrupts are two or more orders of magnitude more frequent than any
 * other interrupt types, so they are likely to dominate CPU wakeup patterns.
 * Moreover, in principle, the time when the next timer event is going to occur
 * can be determined at the idle state selection time, although doing that may
 * be costly, so it can be regarded as the most reliable source of information
 * for idle state selection.
 *
 * Of course, non-timer wakeup sources are more important in some use cases,
 * but even then it is generally unnecessary to consider idle duration values
 * greater than the time till the next timer event, referred to as the sleep
 * length in what follows, because the closest timer will ultimately wake up
 * the CPU anyway unless it is woken up earlier.
 *
 * However, since obtaining the sleep length may be costly, the governor first
 * checks if it can select a shallow idle state using wakeup pattern information
 * from recent times, in which case it can do without knowing the sleep length
 * at all. For this purpose, it counts CPU wakeup events and looks for an idle
 * state whose target residency has not exceeded the idle duration (measured
 * after wakeup) in the majority of relevant recent cases. If the target
 * residency of that state is small enough, it may be used right away and the
 * sleep length need not be determined.
 *
 * The computations carried out by this governor are based on using bins whose
 * boundaries are aligned with the target residency parameter values of the CPU
 * idle states provided by the %CPUIdle driver in the ascending order. That is,
 * the first bin spans from 0 up to, but not including, the target residency of
 * the second idle state (idle state 1), the second bin spans from the target
 * residency of idle state 1 up to, but not including, the target residency of
 * idle state 2, the third bin spans from the target residency of idle state 2
 * up to, but not including, the target residency of idle state 3 and so on.
 * The last bin spans from the target residency of the deepest idle state
 * supplied by the driver to the scheduler tick period length or to infinity if
 * the tick period length is less than the target residency of that state. In
 * the latter case, the governor also counts events with the measured idle
 * duration between the tick period length and the target residency of the
 * deepest idle state.
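 *
 * As a purely hypothetical illustration (not taken from any particular
 * driver): with three idle states whose target residencies are 0, 100 us and
 * 1000 us, bin 0 covers idle durations in [0, 100) us, bin 1 covers
 * [100, 1000) us, and the last bin covers durations of 1000 us and above,
 * subject to the tick period handling described above.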
 *
 * Two metrics called "hits" and "intercepts" are associated with each bin.
 * They are updated every time before selecting an idle state for the given CPU
 * in accordance with what happened last time.
 *
 * The "hits" metric reflects the relative frequency of situations in which the
 * sleep length and the idle duration measured after CPU wakeup fall into the
 * same bin (that is, the CPU appears to wake up "on time" relative to the sleep
 * length). In turn, the "intercepts" metric reflects the relative frequency of
 * non-timer wakeup events for which the measured idle duration falls into a bin
 * that corresponds to an idle state shallower than the one whose bin is fallen
 * into by the sleep length (these events are also referred to as "intercepts"
 * below).
 *
 * In order to select an idle state for a CPU, the governor takes the following
 * steps (modulo the possible latency constraint that must be taken into account
 * too):
 *
 * 1. Find the deepest enabled CPU idle state (the candidate idle state) and
 *    compute 2 sums as follows:
 *
 *    - The sum of the "hits" metric for all of the idle states shallower than
 *      the candidate one (it represents the cases in which the CPU was likely
 *      woken up by a timer).
 *
 *    - The sum of the "intercepts" metric for all of the idle states shallower
 *      than the candidate one (it represents the cases in which the CPU was
 *      likely woken up by a non-timer wakeup source).
 *
 * 2. If the second sum computed in step 1 is greater than a half of the sum of
 *    both metrics for the candidate state bin and all subsequent bins (if any),
 *    a shallower idle state is likely to be more suitable, so look for it.
 *
 *    - Traverse the enabled idle states shallower than the candidate one in the
 *      descending order.
 *
 *    - For each of them compute the sum of the "intercepts" metrics over all
 *      of the idle states between it and the candidate one (including the
 *      former and excluding the latter).
 *
 *    - If this sum is greater than a half of the second sum computed in step 1,
 *      use the given idle state as the new candidate one.
 *
 * 3. If the current candidate state is state 0 or its target residency is short
 *    enough, return it and prevent the scheduler tick from being stopped.
 *
 * 4. Obtain the sleep length value and check if it is below the target
 *    residency of the current candidate state, in which case a new shallower
 *    candidate state needs to be found, so look for it.
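 *
 * As a hypothetical numeric illustration of the check in step 2: if the
 * "intercepts" sum for the states shallower than the candidate one were 600,
 * and the "hits" and "intercepts" metrics for the candidate state bin and all
 * subsequent bins added up to 1000, then 600 > 1000 / 2 and a shallower state
 * would be looked for.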
 */

#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/sched/clock.h>
#include <linux/tick.h>

#include "gov.h"

/*
 * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
 * is used for decreasing metrics on a regular basis.
 */
#define PULSE		1024
#define DECAY_SHIFT	3
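
/*
 * For illustration: a metric that received PULSE on every update would settle
 * at PULSE << DECAY_SHIFT (8192), because at that value the decay term
 * (8192 >> DECAY_SHIFT == 1024) exactly cancels the added PULSE. Since only
 * one metric is bumped per update while all of them decay, individual bins
 * normally stay well below that bound.
 */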

/**
 * struct teo_bin - Metrics used by the TEO cpuidle governor.
 * @intercepts: The "intercepts" metric.
 * @hits: The "hits" metric.
 */
struct teo_bin {
	unsigned int intercepts;
	unsigned int hits;
};

/**
 * struct teo_cpu - CPU data used by the TEO cpuidle governor.
 * @time_span_ns: Time between idle state selection and post-wakeup update.
 * @sleep_length_ns: Time till the closest timer event (at the selection time).
 * @state_bins: Idle state data bins for this CPU.
 * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
 * @tick_hits: Number of "hits" after TICK_NSEC.
 */
struct teo_cpu {
	s64 time_span_ns;
	s64 sleep_length_ns;
	struct teo_bin state_bins[CPUIDLE_STATE_MAX];
	unsigned int total;
	unsigned int tick_hits;
};

static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);

/**
 * teo_update - Update CPU metrics after wakeup.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 */
static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	int i, idx_timer = 0, idx_duration = 0;
	s64 target_residency_ns;
	u64 measured_ns;

	if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) {
		/*
		 * One of the safety nets has triggered or the wakeup was close
		 * enough to the closest timer event expected at the idle state
		 * selection time to be discarded.
		 */
		measured_ns = U64_MAX;
	} else {
		u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns;

		/*
		 * The computations below are to determine whether or not the
		 * (saved) time till the next timer event and the measured idle
		 * duration fall into the same "bin", so use last_residency_ns
		 * for that instead of time_span_ns which includes the cpuidle
		 * overhead.
		 */
		measured_ns = dev->last_residency_ns;
		/*
		 * The delay between the wakeup and the first instruction
		 * executed by the CPU is not likely to be worst-case every
		 * time, so take 1/2 of the exit latency as a very rough
		 * approximation of the average of it.
		 */
		if (measured_ns >= lat_ns)
			measured_ns -= lat_ns / 2;
		else
			measured_ns /= 2;
	}

	cpu_data->total = 0;

	/*
	 * Decay the "hits" and "intercepts" metrics for all of the bins and
	 * find the bins that the sleep length and the measured idle duration
	 * fall into.
	 */
	for (i = 0; i < drv->state_count; i++) {
		struct teo_bin *bin = &cpu_data->state_bins[i];

		bin->hits -= bin->hits >> DECAY_SHIFT;
		bin->intercepts -= bin->intercepts >> DECAY_SHIFT;

		cpu_data->total += bin->hits + bin->intercepts;

		target_residency_ns = drv->states[i].target_residency_ns;

		if (target_residency_ns <= cpu_data->sleep_length_ns) {
			idx_timer = i;
			if (target_residency_ns <= measured_ns)
				idx_duration = i;
		}
	}

	/*
	 * If the deepest state's target residency is below the tick length,
	 * make a record of it to help teo_select() decide whether or not
	 * to stop the tick. This effectively adds an extra hits-only bin
	 * beyond the last state-related one.
	 */
	if (target_residency_ns < TICK_NSEC) {
		cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT;

		cpu_data->total += cpu_data->tick_hits;

		if (TICK_NSEC <= cpu_data->sleep_length_ns) {
			idx_timer = drv->state_count;
			if (TICK_NSEC <= measured_ns) {
				cpu_data->tick_hits += PULSE;
				goto end;
			}
		}
	}

	/*
	 * If the measured idle duration falls into the same bin as the sleep
	 * length, this is a "hit", so update the "hits" metric for that bin.
	 * Otherwise, update the "intercepts" metric for the bin fallen into by
	 * the measured idle duration.
	 */
	if (idx_timer == idx_duration)
		cpu_data->state_bins[idx_timer].hits += PULSE;
	else
		cpu_data->state_bins[idx_duration].intercepts += PULSE;

end:
	cpu_data->total += PULSE;
}

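/*
 * teo_state_ok - Check if a state is usable given the current tick status.
 *
 * A state is considered suitable if the tick is still running, or if its
 * target residency is at least the tick period length (so it is not too
 * shallow for a CPU whose tick has been stopped).
 */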
static bool teo_state_ok(int i, struct cpuidle_driver *drv)
{
	return !tick_nohz_tick_stopped() ||
	       drv->states[i].target_residency_ns >= TICK_NSEC;
}

/**
 * teo_find_shallower_state - Find shallower idle state matching given duration.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @state_idx: Index of the capping idle state.
 * @duration_ns: Idle duration value to match.
 * @no_poll: Don't consider polling states.
 */
static int teo_find_shallower_state(struct cpuidle_driver *drv,
				    struct cpuidle_device *dev, int state_idx,
				    s64 duration_ns, bool no_poll)
{
	int i;

	for (i = state_idx - 1; i >= 0; i--) {
		if (dev->states_usage[i].disable ||
		    (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
			continue;

		state_idx = i;
		if (drv->states[i].target_residency_ns <= duration_ns)
			break;
	}
	return state_idx;
}

/**
 * teo_select - Selects the next idle state to enter.
 * @drv: cpuidle driver containing state data.
 * @dev: Target CPU.
 * @stop_tick: Indication on whether or not to stop the scheduler tick.
 */
static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      bool *stop_tick)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	ktime_t delta_tick = TICK_NSEC / 2;
	unsigned int tick_intercept_sum = 0;
	unsigned int idx_intercept_sum = 0;
	unsigned int intercept_sum = 0;
	unsigned int idx_hit_sum = 0;
	unsigned int hit_sum = 0;
	int constraint_idx = 0;
	int idx0 = 0, idx = -1;
	int prev_intercept_idx;
	s64 duration_ns;
	int i;

	if (dev->last_state_idx >= 0) {
		teo_update(drv, dev);
		dev->last_state_idx = -1;
	}

	cpu_data->time_span_ns = local_clock();
	/*
	 * Set the expected sleep length to infinity in case of an early
	 * return.
	 */
	cpu_data->sleep_length_ns = KTIME_MAX;

	/* Check if there is any choice in the first place. */
	if (drv->state_count < 2) {
		idx = 0;
		goto out_tick;
	}

	if (!dev->states_usage[0].disable)
		idx = 0;

	/* Compute the sums of metrics for early wakeup pattern detection. */
	for (i = 1; i < drv->state_count; i++) {
		struct teo_bin *prev_bin = &cpu_data->state_bins[i-1];
		struct cpuidle_state *s = &drv->states[i];

		/*
		 * Update the sums of idle state metrics for all of the states
		 * shallower than the current one.
		 */
		intercept_sum += prev_bin->intercepts;
		hit_sum += prev_bin->hits;

		if (dev->states_usage[i].disable)
			continue;

		if (idx < 0)
			idx0 = i; /* first enabled state */

		idx = i;

		if (s->exit_latency_ns <= latency_req)
			constraint_idx = i;

		/* Save the sums for the current state. */
		idx_intercept_sum = intercept_sum;
		idx_hit_sum = hit_sum;
	}

	/* Avoid unnecessary overhead. */
	if (idx < 0) {
		idx = 0; /* No states enabled, must use 0. */
		goto out_tick;
	}

	if (idx == idx0) {
		/*
		 * Only one idle state is enabled, so use it, but do not
		 * allow the tick to be stopped if it is shallow enough.
		 */
		duration_ns = drv->states[idx].target_residency_ns;
		goto end;
	}

	tick_intercept_sum = intercept_sum +
			cpu_data->state_bins[drv->state_count-1].intercepts;

	/*
	 * If the sum of the intercepts metric for all of the idle states
	 * shallower than the current candidate one (idx) is greater than the
	 * sum of the intercepts and hits metrics for the candidate state and
	 * all of the deeper states, a shallower idle state is likely to be a
	 * better choice.
	 */
	prev_intercept_idx = idx;
	if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) {
		int first_suitable_idx = idx;

		/*
		 * Look for the deepest idle state whose target residency had
		 * not exceeded the idle duration in over a half of the relevant
		 * cases in the past.
		 *
		 * Take into account the possible duration limitation present
		 * if the tick has already been stopped.
		 */
		intercept_sum = 0;

		for (i = idx - 1; i >= 0; i--) {
			struct teo_bin *bin = &cpu_data->state_bins[i];

			intercept_sum += bin->intercepts;

			if (2 * intercept_sum > idx_intercept_sum) {
				/*
				 * Use the current state unless it is too
				 * shallow or disabled, in which case take the
				 * first enabled state that is deep enough.
				 */
				if (teo_state_ok(i, drv) &&
				    !dev->states_usage[i].disable)
					idx = i;
				else
					idx = first_suitable_idx;

				break;
			}

			if (dev->states_usage[i].disable)
				continue;

			if (!teo_state_ok(i, drv)) {
				/*
				 * The current state is too shallow, but if an
				 * alternative candidate state has been found,
				 * it may still turn out to be a better choice.
				 */
				if (first_suitable_idx != idx)
					continue;

				break;
			}

			first_suitable_idx = i;
		}
	}
	if (!idx && prev_intercept_idx) {
		/*
		 * We have to query the sleep length here otherwise we don't
		 * know after wakeup if our guess was correct.
		 */
		duration_ns = tick_nohz_get_sleep_length(&delta_tick);
		cpu_data->sleep_length_ns = duration_ns;
		goto out_tick;
	}

	/*
	 * If there is a latency constraint, it may be necessary to select an
	 * idle state shallower than the current candidate one.
	 */
	if (idx > constraint_idx)
		idx = constraint_idx;

	/*
	 * Skip the timers check if state 0 is the current candidate one,
	 * because an immediate non-timer wakeup is expected in that case.
	 */
	if (!idx)
		goto out_tick;

	/*
	 * If state 0 is a polling one, check if the target residency of
	 * the current candidate state is low enough and skip the timers
	 * check in that case too.
	 */
	if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) &&
	    drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS)
		goto out_tick;

	duration_ns = tick_nohz_get_sleep_length(&delta_tick);
	cpu_data->sleep_length_ns = duration_ns;

	/*
	 * If the closest expected timer is before the target residency of the
	 * candidate state, a shallower one needs to be found.
	 */
	if (drv->states[idx].target_residency_ns > duration_ns) {
		i = teo_find_shallower_state(drv, dev, idx, duration_ns, false);
		if (teo_state_ok(i, drv))
			idx = i;
	}

	/*
	 * If the selected state's target residency is below the tick length
	 * and intercepts occurring before the tick length are the majority of
	 * total wakeup events, do not stop the tick.
	 */
	if (drv->states[idx].target_residency_ns < TICK_NSEC &&
	    tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8)
		duration_ns = TICK_NSEC / 2;

end:
	/*
	 * Allow the tick to be stopped unless the selected state is a polling
	 * one or the expected idle duration is shorter than the tick period
	 * length.
	 */
	if ((!(drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
	    duration_ns >= TICK_NSEC) || tick_nohz_tick_stopped())
		return idx;

	/*
	 * The tick is not going to be stopped, so if the target residency of
	 * the state to be returned is not within the time till the closest
	 * timer including the tick, try to correct that.
	 */
	if (idx > idx0 &&
	    drv->states[idx].target_residency_ns > delta_tick)
		idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);

out_tick:
	*stop_tick = false;
	return idx;
}

/**
 * teo_reflect - Note that governor data for the CPU need to be updated.
 * @dev: Target CPU.
 * @state: Entered state.
 */
static void teo_reflect(struct cpuidle_device *dev, int state)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	dev->last_state_idx = state;
	/*
	 * If the wakeup was not "natural", but triggered by one of the safety
	 * nets, assume that the CPU might have been idle for the entire sleep
	 * length time.
	 */
	if (dev->poll_time_limit ||
	    (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) {
		dev->poll_time_limit = false;
		cpu_data->time_span_ns = cpu_data->sleep_length_ns;
	} else {
		cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns;
	}
}

/**
 * teo_enable_device - Initialize the governor's data for the target CPU.
 * @drv: cpuidle driver (not used).
 * @dev: Target CPU.
 */
static int teo_enable_device(struct cpuidle_driver *drv,
			     struct cpuidle_device *dev)
{
	struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);

	memset(cpu_data, 0, sizeof(*cpu_data));

	return 0;
}

static struct cpuidle_governor teo_governor = {
	.name =		"teo",
	.rating =	19,
	.enable =	teo_enable_device,
	.select =	teo_select,
	.reflect =	teo_reflect,
};

static int __init teo_governor_init(void)
{
	return cpuidle_register_governor(&teo_governor);
}

postcore_initcall(teo_governor_init);