// SPDX-License-Identifier: GPL-2.0-only
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 *	TODO:
 *           1. Better handle wakeups from external interrupts. Currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. The
 *              reason is that for external interrupts that need no ack,
 *              clamping down a cpu in non-irq context does not reduce the irq
 *              rate. For the majority of cases, clamping down a cpu does help
 *              reduce irqs as well; we should be able to differentiate the two
 *              cases and give a quantitative solution for the irqs that we
 *              can control, perhaps based on get_cpu_iowait_time_us().
 *
 *	     2. Synchronization with other hw blocks.
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>
#include <uapi/linux/sched/types.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration. The driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
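
/*
 * Note: the "duration" module parameter below is in milliseconds, while the
 * default above is in jiffies; powerclamp_init() converts it with
 * jiffies_to_msecs(). For example, with CONFIG_HZ=250 one jiffy is 4 ms, so
 * the default of 6 jiffies yields a 24 ms injection period.
 */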

static unsigned int target_mwait;
static struct dentry *debug_dir;
static bool poll_pkg_cstate_enable;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stat and update
				  * control parameters. default to BSP but BSP
				  * can be offlined.
				  */
static bool clamping;

static const struct sched_param sparam = {
	.sched_priority = MAX_USER_RT_PRIO / 2,
};
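
/*
 * MAX_USER_RT_PRIO is 100, so the per-cpu injection kthreads run at SCHED_FIFO
 * priority 50: high enough to preempt normal tasks when an injection period
 * starts, while leaving headroom above for more critical RT work.
 */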
struct powerclamp_worker_data {
	struct kthread_worker *worker;
	struct kthread_work balancing_work;
	struct kthread_delayed_work idle_injection_work;
	unsigned int cpu;
	unsigned int count;
	unsigned int guard;
	unsigned int window_size_now;
	unsigned int target_ratio;
	unsigned int duration_jiffies;
	bool clamping;
};

static struct powerclamp_worker_data __percpu *worker_data;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping kthread worker
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Duration %lu outside of recommended range 6-25 ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};

module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
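
/*
 * With permissions 0644 the parameter is also writable at runtime, e.g.:
 *
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * duration_set() above rejects values outside the 6-25 ms range.
 */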

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that
				    * gets incremented each time a clamping
				    * period completes without extra wakeups.
				    * Once the counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensation for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Window size %lu outside of recommended range 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls idle ratio within this window. larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");
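
/*
 * The window is counted in balancing cycles, not wall time: every
 * window_size-th pass of clamp_idle_injection_func() on the controlling cpu
 * re-evaluates the achieved idle ratio (see the
 * "w_data->count % w_data->window_size_now" check below).
 */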

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);
}
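
/*
 * MWAIT hints encode the C-state in the upper nibble and the sub-state in the
 * lower one (MWAIT_SUBSTATE_SIZE is 4). For example, if CPUID leaf 5 reports
 * the deepest supported state as C6 sub-state 2, the hint computed above is
 * (6 << 4) | (2 - 1) = 0x61.
 */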

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	if (!poll_pkg_cstate_enable)
		return 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
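
/*
 * Worked example: for a target ratio of 30, if cal_data[29..31].confidence
 * have all reached CONFIDENCE_OK with steady_comp values of 2, 3 and 4, the
 * averaged compensation is (3 + 2 + 4) / 3 = 3, so 33% idle is injected to
 * achieve a measured 30% package C-state residency.
 */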

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Skip the adjustment if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period: in that case the data cannot be trusted for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1 + target_ratio / 10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta + d->steady_comp, 2) / 2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}
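
/*
 * steady_comp is a rounded running average of the shortfall. For example,
 * with steady_comp = 3 and a new delta of 2, roundup(2 + 3, 2) / 2 =
 * 6 / 2 = 3, i.e. the average of the old value and the new delta, rounded up.
 */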

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now - tsc_last) {
		val64 = 100 * (msr_now - msr_last);
		do_div(val64, (tsc_now - tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * Too many external interrupts: set a flag so that we can take
	 * measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}
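
/*
 * The package C-state residency counters tick at TSC frequency, so the
 * achieved idle ratio over the window is simply
 * 100 * delta(residency) / delta(tsc). E.g. a residency delta of 3e9 against
 * a TSC delta of 1e10 gives current_ratio = 30 (percent).
 */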

static void clamp_balancing_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;
	int sleeptime;
	unsigned long target_jiffies;
	unsigned int compensated_ratio;
	int interval; /* jiffies to sleep for each attempt */

	w_data = container_of(work, struct powerclamp_worker_data,
			      balancing_work);

	/*
	 * Make sure a user-selected ratio does not take effect until
	 * the next round; adjust target_ratio when the user has changed
	 * the target so that we converge quickly.
	 */
	w_data->target_ratio = READ_ONCE(set_target_ratio);
	w_data->guard = 1 + w_data->target_ratio / 20;
	w_data->window_size_now = window_size;
	w_data->duration_jiffies = msecs_to_jiffies(duration);
	w_data->count++;

	/*
	 * Systems differ in their ability to enter package level
	 * c-states, thus we need to compensate the injected idle ratio
	 * to achieve the actual target reported by the HW.
	 */
	compensated_ratio = w_data->target_ratio +
		get_compensation(w_data->target_ratio);
	if (compensated_ratio <= 0)
		compensated_ratio = 1;
	interval = w_data->duration_jiffies * 100 / compensated_ratio;

	/* align idle time */
	target_jiffies = roundup(jiffies, interval);
	sleeptime = target_jiffies - jiffies;
	if (sleeptime <= 0)
		sleeptime = 1;

	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_delayed_work(w_data->worker,
					   &w_data->idle_injection_work,
					   sleeptime);
}
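
/*
 * Example of the interval math: with duration_jiffies = 6 and a compensated
 * ratio of 25, interval = 6 * 100 / 25 = 24, so 6 idle jiffies are injected
 * in every 24-jiffy period, i.e. 25% forced idle. Rounding the start up to a
 * multiple of the interval keeps all cpus' injection periods aligned, which
 * is what allows the whole package to enter a C-state.
 */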

static void clamp_idle_injection_func(struct kthread_work *work)
{
	struct powerclamp_worker_data *w_data;

	w_data = container_of(work, struct powerclamp_worker_data,
			      idle_injection_work.work);

	/*
	 * only elected controlling cpu can collect stats and update
	 * control parameters.
	 */
	if (w_data->cpu == control_cpu &&
	    !(w_data->count % w_data->window_size_now)) {
		should_skip =
			powerclamp_adjust_controls(w_data->target_ratio,
						   w_data->guard,
						   w_data->window_size_now);
		smp_mb();
	}

	if (should_skip)
		goto balance;

	play_idle(jiffies_to_usecs(w_data->duration_jiffies));

balance:
	if (clamping && w_data->clamping && cpu_online(w_data->cpu))
		kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;

	u64 msr_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static void start_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);
	struct kthread_worker *worker;

	worker = kthread_create_worker_on_cpu(cpu, 0, "kidle_inj/%ld", cpu);
	if (IS_ERR(worker))
		return;

	w_data->worker = worker;
	w_data->count = 0;
	w_data->cpu = cpu;
	w_data->clamping = true;
	set_bit(cpu, cpu_clamping_mask);
	sched_setscheduler(worker->task, SCHED_FIFO, &sparam);
	kthread_init_work(&w_data->balancing_work, clamp_balancing_func);
	kthread_init_delayed_work(&w_data->idle_injection_work,
				  clamp_idle_injection_func);
	kthread_queue_work(w_data->worker, &w_data->balancing_work);
}

static void stop_power_clamp_worker(unsigned long cpu)
{
	struct powerclamp_worker_data *w_data = per_cpu_ptr(worker_data, cpu);

	if (!w_data->worker)
		return;

	w_data->clamping = false;
	/*
	 * Make sure that all works that get queued after this point see
	 * the clamping disabled. The counterpart barrier is not needed
	 * because there is an implicit memory barrier when the queued
	 * work is processed.
	 */
	smp_wmb();
	kthread_cancel_work_sync(&w_data->balancing_work);
	kthread_cancel_delayed_work_sync(&w_data->idle_injection_work);
	/*
	 * The balancing work still might be queued here because
	 * the handling of the "clamping" variable, cancel, and queue
	 * operations are not synchronized via a lock. But it is not
	 * a big deal. The balancing work is fast and destroying the
	 * kthread will wait for it.
	 */
	clear_bit(w_data->cpu, cpu_clamping_mask);
	kthread_destroy_worker(w_data->worker);

	w_data->worker = NULL;
}

static int start_power_clamp(void)
{
	unsigned long cpu;

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = cpumask_first(cpu_online_mask);

	clamping = true;
	if (poll_pkg_cstate_enable)
		schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one kthread worker per online cpu */
	for_each_online_cpu(cpu) {
		start_power_clamp_worker(cpu);
	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;

	/*
	 * Block requeuing in all the kthread workers. They will flush and
	 * stop faster.
	 */
	clamping = false;
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping worker for cpu %d alive, destroy\n",
				 i);
			stop_power_clamp_worker(i);
		}
	}
}

static int powerclamp_cpu_online(unsigned int cpu)
{
	if (!clamping)
		return 0;
	start_power_clamp_worker(cpu);
	/* prefer BSP as controlling CPU */
	if (cpu == 0) {
		control_cpu = 0;
		smp_mb();
	}
	return 0;
}

static int powerclamp_cpu_predown(unsigned int cpu)
{
	if (!clamping)
		return 0;

	stop_power_clamp_worker(cpu);
	if (cpu != control_cpu)
		return 0;

	control_cpu = cpumask_first(cpu_online_mask);
	if (control_cpu == cpu)
		control_cpu = cpumask_next(cpu, cpu_online_mask);
	smp_mb();
	return 0;
}

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping) {
		if (poll_pkg_cstate_enable)
			*state = pkg_cstate_ratio_cur;
		else
			*state = set_target_ratio;
	} else {
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */
	}

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO - 1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		end_power_clamp();
		set_target_ratio = 0;
	} else { /* adjust currently running */
		set_target_ratio = new_target_ratio;
		/* make new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to generic thermal layer as cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
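
/*
 * Once registered, the thermal core exposes the device under sysfs (the
 * instance number N varies):
 *
 *   /sys/class/thermal/cooling_deviceN/type       -> "intel_powerclamp"
 *   /sys/class/thermal/cooling_deviceN/max_state  -> 50
 *   /sys/class/thermal/cooling_deviceN/cur_state  -> target/actual idle %
 *
 * Writing cur_state ends up in powerclamp_set_cur_state() above, so
 * "echo 25 > cur_state" starts injecting roughly 25% idle time.
 */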

static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_MWAIT },
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("CPU does not support MWAIT\n");
		return -ENODEV;
	}

	/* Idle time alignment is only useful if package C-states can be
	 * reached and measured.
	 */
	if (!has_pkg_state_counter()) {
		pr_info("No package C-state available\n");
		return -ENODEV;
	}

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_puts(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

DEFINE_SHOW_ATTRIBUTE(powerclamp_debug);

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);

	debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir, cal_data,
			    &powerclamp_debug_fops);
}
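
/*
 * With debugfs mounted at the usual location, the calibration table can be
 * inspected via:
 *
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 */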

static enum cpuhp_state hp_state;

static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, may be adjusted at runtime based on feedback */
	window_size = 2;
	retval = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN,
					   "thermal/intel_powerclamp:online",
					   powerclamp_cpu_online,
					   powerclamp_cpu_predown);
	if (retval < 0)
		goto exit_free;

	hp_state = retval;

	worker_data = alloc_percpu(struct powerclamp_worker_data);
	if (!worker_data) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	if (topology_max_packages() == 1 && topology_max_die_per_package() == 1)
		poll_pkg_cstate_enable = true;

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(worker_data);
exit_unregister:
	cpuhp_remove_state_nocalls(hp_state);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	end_power_clamp();
	cpuhp_remove_state_nocalls(hp_state);
	free_percpu(worker_data);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");