/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *           1. better handle wakeups from external interrupts. currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that for external interrupts that need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases clamping down the cpu does
 *              help reduce irqs as well; we should be able to differentiate
 *              the two cases and give a quantitative solution for the irqs
 *              that we can control, perhaps based on get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio, similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping threads
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
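/*
 * Example (assuming sysfs is mounted in the usual place): set a 10 ms
 * injection duration at runtime with
 *
 *	echo 10 > /sys/module/intel_powerclamp/parameters/duration
 *
 * Note that duration_set() above clamps out-of-range values into 6-25 ms
 * but still fails the write with -EINVAL.
 */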

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration, basically a counter
				    * that gets incremented each time a clamping
				    * period is completed without extra wakeups.
				    * once that counter reaches a given level,
				    * compensation is deemed usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates excessive wakeups from idle,
				     * mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls the idle ratio within this window. a larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. defaults to 2.");
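/*
 * Example (hypothetical value): widen the sliding window to 5 clamping
 * cycles, trading response time for smoother idle ratio control:
 *
 *	echo 5 > /sys/module/intel_powerclamp/parameters/window_size
 */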

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}

static bool has_pkg_state_counter(void)
{
	u64 tmp;
	return !rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &tmp) ||
	       !rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &tmp);
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;

	static bool skip_c2;
	static bool skip_c3;
	static bool skip_c6;
	static bool skip_c7;

	if (!skip_c2) {
		if (!rdmsrl_safe(MSR_PKG_C2_RESIDENCY, &val))
			count += val;
		else
			skip_c2 = true;
	}

	if (!skip_c3) {
		if (!rdmsrl_safe(MSR_PKG_C3_RESIDENCY, &val))
			count += val;
		else
			skip_c3 = true;
	}

	if (!skip_c6) {
		if (!rdmsrl_safe(MSR_PKG_C6_RESIDENCY, &val))
			count += val;
		else
			skip_c6 = true;
	}

	if (!skip_c7) {
		if (!rdmsrl_safe(MSR_PKG_C7_RESIDENCY, &val))
			count += val;
		else
			skip_c7 = true;
	}

	return count;
}

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
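/*
 * Worked example for get_compensation() above (hypothetical calibration
 * data): for ratio = 20 with cal_data[19..21] all at confidence >=
 * CONFIDENCE_OK and steady_comp values of 2, 3 and 4, the last branch
 * averages the three neighbors, so comp = (3 + 2 + 4) / 3 = 3 and the
 * caller injects idle as if the target ratio were 23.
 */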

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * skip the adjustment if the confidence level has already been
	 * reached, or if there were too many wakeups during the last idle
	 * injection period, in which case we cannot trust the data for
	 * compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	rdtscll(tsc_now);

	/* calculate pkg cstate vs tsc ratio */
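	/* e.g. if the residency counters advanced by 25M cycles while the
	 * TSC advanced by 100M, current_ratio below becomes 25 (percent).
	 */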
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts, set the flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}

static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure the user selected ratio does not take effect
		 * until the next round. adjust target_ratio if the user has
		 * changed the target, so that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may differ in their ability to enter package level
		 * c-states, thus we need to compensate the injected idle
		 * ratio to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);
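		/* e.g. duration_jiffies = 6 and an effective target of 25%
		 * (target_ratio + compensation) give interval = 24, i.e.
		 * 6 jiffies of injected idle out of every 24 jiffies.
		 */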

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only the elected controlling cpu can collect stats and
		 * update control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop the tick sched during idle time; interrupts are still
		 * allowed, thus jiffies are updated properly.
		 */
		preempt_disable();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}
/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor the actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	rdtscll(tsc_now);
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* abort if the package cstate counters are not functional */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give the per cpu clamping
	 * threads some time to exit, or they get killed later.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (!clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll the idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make the new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to the generic thermal layer as a cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
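/*
 * Usage sketch (the cooling device index X is assigned at registration time
 * and varies per system): request a 30% idle injection target through the
 * generic thermal sysfs interface,
 *
 *	grep -l intel_powerclamp /sys/class/thermal/cooling_device* /type
 *	echo 30 > /sys/class/thermal/cooling_deviceX/cur_state
 *
 * which ends up in powerclamp_set_cur_state() above; writing 0 stops the
 * injection.
 */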

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}
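/*
 * With debugfs mounted at the usual location, the calibration table created
 * above can be inspected with:
 *
 *	cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 */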

static int powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");