// SPDX-License-Identifier: GPL-2.0-only
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

#include "gov.h"

#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING (50000 * NSEC_PER_USEC)

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 2 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Latency tolerance (from pmqos infrastructure)
 * These two factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time.
 * (As a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this.)
 *
 * For this reason we keep an array of 6 independent factors, indexed by the
 * order of magnitude of the expected duration.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where the "next timer" is a completely unusable
 * predictor: those where the wakeup interval is fixed, for example due to
 * hardware interrupt mitigation, but also due to fixed transfer rate devices
 * such as mice.
 * For this, we use a different predictor: we track the duration of the last 8
 * intervals and, if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as the prediction.
 */

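/*
 * For illustration, the fixed point used by the correction factors below:
 * a factor is stored in units of 1/(RESOLUTION * DECAY) = 1/8192, so 8192
 * means "the measured idle time matches the next timer distance" and 4096
 * means "expect roughly 50% of it". menu_update() maintains each factor as
 * a decaying running average of measured_ns relative to next_timer_ns, and
 * menu_select() applies it to scale down the time till the next timer.
 */
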
struct menu_device {
	int             needs_update;	/* set by menu_reflect() */
	int             tick_wakeup;	/* last wakeup came from the tick */

	u64		next_timer_ns;	/* time till next timer event */
	unsigned int	bucket;		/* which_bucket() index in use */
	unsigned int	correction_factor[BUCKETS];	/* per-bucket correction */
	unsigned int	intervals[INTERVALS];	/* last 8 intervals, in us */
	int		interval_ptr;	/* next slot in intervals[] */
};

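/*
 * Map an expected idle duration onto one of the BUCKETS correction-factor
 * slots by order of magnitude: < 10 us -> 0, < 100 us -> 1, < 1 ms -> 2,
 * < 10 ms -> 3, < 100 ms -> 4, anything longer -> 5. For example, an
 * expected sleep of 250 us falls into bucket 2.
 */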
static inline int which_bucket(u64 duration_ns)
{
	int bucket = 0;

	if (duration_ns < 10ULL * NSEC_PER_USEC)
		return bucket;
	if (duration_ns < 100ULL * NSEC_PER_USEC)
		return bucket + 1;
	if (duration_ns < 1000ULL * NSEC_PER_USEC)
		return bucket + 2;
	if (duration_ns < 10000ULL * NSEC_PER_USEC)
		return bucket + 3;
	if (duration_ns < 100000ULL * NSEC_PER_USEC)
		return bucket + 4;
	return bucket + 5;
}

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update_intervals(struct menu_device *data, unsigned int interval_us)
{
	/* Update the repeating-pattern data. */
	data->intervals[data->interval_ptr++] = interval_us;
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
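/*
 * Illustrative numbers for the acceptance check below: the candidate
 * average is used when the spread is small either in absolute terms
 * (variance <= 400 us^2, i.e. stddev <= 20 us) or relative to the average
 * (avg^2 > 36 * variance, i.e. avg > 6 * stddev). Eight samples hovering
 * around 1000 us with a stddev of 100 us pass the relative test, whereas
 * the same stddev around a 300 us average fails both tests, in which case
 * the largest samples are discarded as outliers and the check is retried.
 */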
static unsigned int get_typical_interval(struct menu_device *data)
{
	int i, divisor;
	unsigned int min, max, thresh, avg;
	uint64_t sum, variance;

	thresh = INT_MAX; /* Discard outliers above this value */

again:

	/* First calculate the average of past intervals */
	min = UINT_MAX;
	max = 0;
	sum = 0;
	divisor = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			sum += value;
			divisor++;
			if (value > max)
				max = value;

			if (value < min)
				min = value;
		}
	}

	if (!max)
		return UINT_MAX;

	if (divisor == INTERVALS)
		avg = sum >> INTERVAL_SHIFT;
	else
		avg = div_u64(sum, divisor);

	/* Then try to determine variance */
	variance = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			int64_t diff = (int64_t)value - avg;
			variance += diff * diff;
		}
	}
	if (divisor == INTERVALS)
		variance >>= INTERVAL_SHIFT;
	else
		do_div(variance, divisor);

	/*
	 * The typical interval is obtained when standard deviation is
	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
	 * deviation is small compared to the average interval (avg >
	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
	 * UINT_MAX aka U32_MAX, so computing its square does not
	 * overflow a u64. We simply reject this candidate average if
	 * the standard deviation is greater than 715 s (which is
	 * rather unlikely).
	 *
	 * Use this result only if there is no timer to wake us up sooner.
	 */
	if (likely(variance <= U64_MAX/36)) {
		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
							|| variance <= 400) {
			return avg;
		}
	}

	/*
	 * If we have outliers to the upside in our distribution, discard
	 * those by setting the threshold to exclude these outliers, then
	 * calculate the average and standard deviation again. Once we get
	 * down to the bottom 3/4 of our samples, stop excluding samples.
	 *
	 * This can deal with workloads that have long pauses interspersed
	 * with sporadic activity with a bunch of short pauses.
	 */
	if (divisor * 4 <= INTERVALS * 3) {
		/*
		 * If there are sufficiently many data points still under
		 * consideration after the outliers have been eliminated,
		 * returning without a prediction would be a mistake because it
		 * is likely that the next interval will not exceed the current
		 * maximum, so return the latter in that case.
		 */
		if (divisor >= INTERVALS / 2)
			return max;

		return UINT_MAX;
	}

	thresh = max - 1;
	goto again;
}

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		       bool *stop_tick)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	u64 predicted_ns;
	ktime_t delta, delta_tick;
	int i, idx;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	} else if (!dev->last_residency_ns) {
		/*
		 * This happens when the driver rejects the previously selected
		 * idle state and returns an error, so update the recent
		 * intervals table to prevent invalid information from being
		 * used going forward.
		 */
		menu_update_intervals(data, UINT_MAX);
	}

	/* Find the shortest expected idle interval. */
	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
		unsigned int timer_us;

		/* Determine the time till the closest timer. */
		delta = tick_nohz_get_sleep_length(&delta_tick);
		if (unlikely(delta < 0)) {
			delta = 0;
			delta_tick = 0;
		}

		data->next_timer_ns = delta;
		data->bucket = which_bucket(data->next_timer_ns);

		/* Round up the result for half microseconds. */
		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
					data->next_timer_ns *
						data->correction_factor[data->bucket],
				   RESOLUTION * DECAY * NSEC_PER_USEC);
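		/*
		 * For example, with the next timer 1 ms away and a bucket
		 * correction factor of 4096 (roughly 50%), timer_us works out
		 * to about 500, so the timer-based estimate is halved before
		 * being compared with the pattern-based prediction.
		 */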
		/* Use the lowest expected idle interval to pick the idle state. */
		predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
	} else {
		/*
		 * Because the next timer event is not going to be determined
		 * in this case, assume that without the tick the closest timer
		 * will be in the distant future and that the closest tick will
		 * occur after 1/2 of the tick period.
		 */
		data->next_timer_ns = KTIME_MAX;
		delta_tick = TICK_NSEC / 2;
		data->bucket = which_bucket(KTIME_MAX);
	}

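	/*
	 * Fast path: with a single state, a zero PM QoS latency limit, or a
	 * next timer / latency limit too small to justify even state 1 (while
	 * state 0 is enabled), state 0 is the only sensible choice.
	 */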
	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
	      latency_req < drv->states[1].exit_latency_ns) &&
	     !dev->states_usage[0].disable)) {
		/*
		 * In this case state[0] will be used no matter what, so return
		 * it right away and keep the tick running if state[0] is a
		 * polling one.
		 */
		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
		return 0;
	}

	/*
	 * If the tick is already stopped, the cost of possible short idle
	 * duration misprediction is much higher, because the CPU may be stuck
	 * in a shallow idle state for a long time as a result of it.  In that
	 * case, assume the prediction may be wrong and use the known time till
	 * the closest timer event for the idle state selection.
	 */
	if (tick_nohz_tick_stopped() && predicted_ns < TICK_NSEC)
		predicted_ns = data->next_timer_ns;

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	idx = -1;
	for (i = 0; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (dev->states_usage[i].disable)
			continue;

		if (idx == -1)
			idx = i; /* first enabled state */

		if (s->exit_latency_ns > latency_req)
			break;

		if (s->target_residency_ns > predicted_ns) {
			/*
			 * Use a physical idle state, not busy polling, unless
			 * a timer is going to trigger soon enough.
			 */
			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
			    s->target_residency_ns <= data->next_timer_ns) {
				predicted_ns = s->target_residency_ns;
				idx = i;
				break;
			}
			if (predicted_ns < TICK_NSEC)
				break;

			if (!tick_nohz_tick_stopped()) {
				/*
				 * If the state selected so far is shallow,
				 * waking up early won't hurt, so retain the
				 * tick in that case and let the governor run
				 * again in the next iteration of the loop.
				 */
				predicted_ns = drv->states[idx].target_residency_ns;
				break;
			}

			/*
			 * If the state selected so far is shallow and this
			 * state's target residency matches the time till the
			 * closest timer event, select this one to avoid getting
			 * stuck in the shallow one for too long.
			 */
			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
			    s->target_residency_ns <= delta_tick)
				idx = i;

			return idx;
		}

		idx = i;
	}

	if (idx == -1)
		idx = 0; /* No states enabled. Must use 0. */

	/*
	 * Don't stop the tick if the selected state is a polling one or if the
	 * expected idle duration is shorter than the tick period length.
	 */
	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
		*stop_tick = false;

		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
			/*
			 * The tick is not going to be stopped and the target
			 * residency of the state to be returned is not within
			 * the time until the next timer event including the
			 * tick, so try to correct that.
			 */
			for (i = idx - 1; i >= 0; i--) {
				if (dev->states_usage[i].disable)
					continue;

				idx = i;
				if (drv->states[i].target_residency_ns <= delta_tick)
					break;
			}
		}
	}

	return idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 *       the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);

	dev->last_state_idx = index;
	data->needs_update = 1;
	data->tick_wakeup = tick_nohz_idle_got_tick();
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int last_idx = dev->last_state_idx;
	struct cpuidle_state *target = &drv->states[last_idx];
	u64 measured_ns;
	unsigned int new_factor;

	/*
	 * Try to figure out how much time passed between entry to low
	 * power state and occurrence of the wakeup event.
	 *
	 * If the entered idle state didn't support residency measurements,
	 * we use them anyway if they are short, and if long,
	 * truncate to the whole expected time.
	 *
	 * Any measured amount of time will include the exit latency.
	 * Since we are interested in when the wakeup began, not when it
	 * was completed, we must subtract the exit latency. However, if
	 * the measured amount of time is less than the exit latency,
	 * assume the state was never reached and the exit latency is 0.
	 */
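	/*
	 * For example, when the measured residency is used directly and the
	 * exit latency is 20 us: a measured 100 us is credited as 80 us,
	 * while a measured 30 us (less than twice the exit latency) is
	 * simply halved to 15 us.
	 */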

	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
		/*
		 * The nohz code said that there wouldn't be any events within
		 * the tick boundary (if the tick was stopped), but the idle
		 * duration predictor had a differing opinion.  Since the CPU
		 * was woken up by a tick (that wasn't stopped after all), the
		 * predictor was not quite right, so assume that the CPU could
		 * have been idle long (but not forever) to help the idle
		 * duration predictor do a better job next time.
		 */
		measured_ns = 9 * MAX_INTERESTING / 10;
	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
		   dev->poll_time_limit) {
		/*
		 * The CPU exited the "polling" state due to a time limit, so
		 * the idle duration prediction leading to the selection of that
		 * state was inaccurate.  If a better prediction had been made,
		 * the CPU might have been woken up from idle by the next timer.
		 * Assume that to be the case.
		 */
		measured_ns = data->next_timer_ns;
	} else {
		/* measured value */
		measured_ns = dev->last_residency_ns;

		/* Deduct exit latency */
		if (measured_ns > 2 * target->exit_latency_ns)
			measured_ns -= target->exit_latency_ns;
		else
			measured_ns /= 2;
	}

	/* Make sure our coefficients do not exceed unity */
	if (measured_ns > data->next_timer_ns)
		measured_ns = data->next_timer_ns;

	/* Update our correction ratio */
	new_factor = data->correction_factor[data->bucket];
	new_factor -= new_factor / DECAY;

	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
		new_factor += div64_u64(RESOLUTION * measured_ns,
					data->next_timer_ns);
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time. Fortunately, due to rounding,
	 * new_factor will stay nonzero regardless of measured_ns values
	 * and the compiler can eliminate this test as long as DECAY > 1.
	 */
	if (DECAY == 1 && unlikely(new_factor == 0))
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;
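	/*
	 * Example: starting from the unity factor of 8192, a wakeup measured
	 * at half the expected timer distance yields 8192 - 8192/8 + 1024/2 =
	 * 7680; repeated such wakeups make the stored factor converge towards
	 * 4096, i.e. "expect 50%".
	 */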

	menu_update_intervals(data, ktime_to_us(measured_ns));
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
				struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
	int i;

	memset(data, 0, sizeof(struct menu_device));

	/*
	 * if the correction factor is 0 (e.g. first time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	for (i = 0; i < BUCKETS; i++)
		data->correction_factor[i] = RESOLUTION * DECAY;

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);