/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc.,
 * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
 *
 *
 *	TODO:
 *           1. better handle wakeups from external interrupts: currently a
 *              fixed compensation is added to the clamping duration when an
 *              excessive number of wakeups is observed during idle time. the
 *              reason is that for external interrupts that need no ack,
 *              clamping down the cpu in non-irq context does not reduce the
 *              irq rate. for the majority of cases, clamping down the cpu
 *              does help reduce irqs as well; we should be able to
 *              differentiate the two cases and give a quantitative solution
 *              for the irqs that we can control, perhaps based on
 *              get_cpu_iowait_time_us()
 *
 *	     2. synchronization with other hw blocks
 *
 *
 */

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/thermal.h>
#include <linux/slab.h>
#include <linux/tick.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/sched/rt.h>

#include <asm/nmi.h>
#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
#include <asm/idle.h>
#include <asm/hardirq.h>

#define MAX_TARGET_RATIO (50U)
/* For each undisturbed clamping period (no extra wakeups during idle time),
 * we increment the confidence counter for the given target ratio.
 * CONFIDENCE_OK defines the level at which runtime calibration results are
 * considered valid.
 */
#define CONFIDENCE_OK (3)
/* Default idle injection duration; the driver adjusts the sleep time to meet
 * the target idle ratio. Similar to frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)

static unsigned int target_mwait;
static struct dentry *debug_dir;

/* user selected target */
static unsigned int set_target_ratio;
static unsigned int current_ratio;
static bool should_skip;
static bool reduce_irq;
static atomic_t idle_wakeup_counter;
static unsigned int control_cpu; /* The cpu assigned to collect stats and
				  * update control parameters. Defaults to the
				  * BSP, but the BSP can be offlined.
				  */
static bool clamping;


static struct task_struct * __percpu *powerclamp_thread;
static struct thermal_cooling_device *cooling_dev;
static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
					   * clamping threads
					   */

static unsigned int duration;
static unsigned int pkg_cstate_ratio_cur;
static unsigned int window_size;

static int duration_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_duration;

	ret = kstrtoul(arg, 10, &new_duration);
	if (ret)
		goto exit;
	if (new_duration > 25 || new_duration < 6) {
		pr_err("Out of recommended range %lu, between 6-25ms\n",
			new_duration);
		ret = -EINVAL;
	}

	duration = clamp(new_duration, 6ul, 25ul);
	smp_mb();

exit:

	return ret;
}

static const struct kernel_param_ops duration_ops = {
	.set = duration_set,
	.get = param_get_int,
};


module_param_cb(duration, &duration_ops, &duration, 0644);
MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec.");
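
/*
 * Illustrative usage: with the 0644 mode above, duration is writable at
 * runtime through sysfs, e.g.:
 *   echo 10 > /sys/module/intel_powerclamp/parameters/duration
 * Out-of-range writes return -EINVAL, though the stored value is still
 * clamped to the 6-25 msec range by duration_set() above.
 */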

struct powerclamp_calibration_data {
	unsigned long confidence;  /* used for calibration: a counter that
				    * gets incremented each time a clamping
				    * period is completed without extra
				    * wakeups; once the counter reaches a
				    * given level, compensation is deemed
				    * usable.
				    */
	unsigned long steady_comp; /* steady state compensation used when
				    * no extra wakeups occurred.
				    */
	unsigned long dynamic_comp; /* compensates for excessive wakeups from
				     * idle, mostly from external interrupts.
				     */
};

static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO];

static int window_size_set(const char *arg, const struct kernel_param *kp)
{
	int ret = 0;
	unsigned long new_window_size;

	ret = kstrtoul(arg, 10, &new_window_size);
	if (ret)
		goto exit_win;
	if (new_window_size > 10 || new_window_size < 2) {
		pr_err("Out of recommended window size %lu, between 2-10\n",
			new_window_size);
		ret = -EINVAL;
	}

	window_size = clamp(new_window_size, 2ul, 10ul);
	smp_mb();

exit_win:

	return ret;
}

static const struct kernel_param_ops window_size_ops = {
	.set = window_size_set,
	.get = param_get_int,
};

module_param_cb(window_size, &window_size_ops, &window_size, 0644);
MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n"
	"\tpowerclamp controls the idle ratio within this window. A larger\n"
	"\twindow size results in slower response time but smoother\n"
	"\tclamping results. Defaults to 2.");
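
/*
 * Like duration, window_size can be tuned at runtime, e.g.:
 *   echo 4 > /sys/module/intel_powerclamp/parameters/window_size
 */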

static void find_target_mwait(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

}
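
/*
 * Illustrative note: the MWAIT hint built above packs the C-state index
 * into bits 7:4 and the deepest sub-state into bits 3:0 (MWAIT_SUBSTATE_SIZE
 * is 4). For example, a CPU whose deepest state sits at index 5 with two
 * sub-states yields target_mwait = (5 << 4) | (2 - 1) = 0x51.
 */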

struct pkg_cstate_info {
	bool skip;
	int msr_index;
	int cstate_id;
};

#define PKG_CSTATE_INIT(id) {				\
		.msr_index = MSR_PKG_C##id##_RESIDENCY, \
		.cstate_id = id				\
			}

static struct pkg_cstate_info pkg_cstates[] = {
	PKG_CSTATE_INIT(2),
	PKG_CSTATE_INIT(3),
	PKG_CSTATE_INIT(6),
	PKG_CSTATE_INIT(7),
	PKG_CSTATE_INIT(8),
	PKG_CSTATE_INIT(9),
	PKG_CSTATE_INIT(10),
	{NULL},
};

static bool has_pkg_state_counter(void)
{
	u64 val;
	struct pkg_cstate_info *info = pkg_cstates;

	/* check if any one of the counter msrs exists */
	while (info->msr_index) {
		if (!rdmsrl_safe(info->msr_index, &val))
			return true;
		info++;
	}

	return false;
}

static u64 pkg_state_counter(void)
{
	u64 val;
	u64 count = 0;
	struct pkg_cstate_info *info = pkg_cstates;

	while (info->msr_index) {
		if (!info->skip) {
			if (!rdmsrl_safe(info->msr_index, &val))
				count += val;
			else
				info->skip = true;
		}
		info++;
	}

	return count;
}
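
/*
 * Note: the ratio math below assumes these package residency counters tick
 * at (or near) the TSC rate, which holds on the CPU models this driver
 * supports; the summed residency delta is compared directly against a TSC
 * delta to obtain a percentage.
 */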

static void noop_timer(unsigned long foo)
{
	/* empty... just the fact that we get the interrupt wakes us up */
}

static unsigned int get_compensation(int ratio)
{
	unsigned int comp = 0;

	/* we only use compensation if all adjacent ones are good */
	if (ratio == 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio + 1].steady_comp +
			cal_data[ratio + 2].steady_comp) / 3;
	} else if (ratio == MAX_TARGET_RATIO - 1 &&
		cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 2].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio - 2].steady_comp) / 3;
	} else if (cal_data[ratio].confidence >= CONFIDENCE_OK &&
		cal_data[ratio - 1].confidence >= CONFIDENCE_OK &&
		cal_data[ratio + 1].confidence >= CONFIDENCE_OK) {
		comp = (cal_data[ratio].steady_comp +
			cal_data[ratio - 1].steady_comp +
			cal_data[ratio + 1].steady_comp) / 3;
	}

	/* REVISIT: simple penalty of double idle injection */
	if (reduce_irq)
		comp = ratio;
	/* do not exceed limit */
	if (comp + ratio >= MAX_TARGET_RATIO)
		comp = MAX_TARGET_RATIO - ratio - 1;

	return comp;
}
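
/*
 * Worked example (illustrative numbers): for ratio 20 with all three
 * neighboring entries calibrated and steady_comp values of 2, 3 and 4,
 * comp = (3 + 2 + 4) / 3 = 3, so the effective injected ratio used by the
 * clamping threads becomes 20 + 3 = 23.
 */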

static void adjust_compensation(int target_ratio, unsigned int win)
{
	int delta;
	struct powerclamp_calibration_data *d = &cal_data[target_ratio];

	/*
	 * Skip adjustment once the confidence level has been reached, or
	 * when there were too many wakeups during the last idle injection
	 * period: in that case we cannot trust the data for compensation.
	 */
	if (d->confidence >= CONFIDENCE_OK ||
		atomic_read(&idle_wakeup_counter) >
		win * num_online_cpus())
		return;

	delta = set_target_ratio - current_ratio;
	/* filter out bad data */
	if (delta >= 0 && delta <= (1+target_ratio/10)) {
		if (d->steady_comp)
			d->steady_comp =
				roundup(delta+d->steady_comp, 2)/2;
		else
			d->steady_comp = delta;
		d->confidence++;
	}
}

static bool powerclamp_adjust_controls(unsigned int target_ratio,
				unsigned int guard, unsigned int win)
{
	static u64 msr_last, tsc_last;
	u64 msr_now, tsc_now;
	u64 val64;

	/* check result for the last window */
	msr_now = pkg_state_counter();
	tsc_now = rdtsc();

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		current_ratio = 1;
	else if (tsc_now-tsc_last) {
		val64 = 100*(msr_now-msr_last);
		do_div(val64, (tsc_now-tsc_last));
		current_ratio = val64;
	}

	/* update record */
	msr_last = msr_now;
	tsc_last = tsc_now;

	adjust_compensation(target_ratio, win);
	/*
	 * too many external interrupts: set a flag such
	 * that we can take measures later.
	 */
	reduce_irq = atomic_read(&idle_wakeup_counter) >=
		2 * win * num_online_cpus();

	atomic_set(&idle_wakeup_counter, 0);
	/* if we are above target+guard, skip */
	return set_target_ratio + guard <= current_ratio;
}
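
/*
 * Illustrative numbers: current_ratio = 100 * (msr_now - msr_last) /
 * (tsc_now - tsc_last), so a package residency delta of 1,000,000 counts
 * over a TSC delta of 4,000,000 cycles reports a 25% package idle ratio.
 */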

static int clamp_thread(void *arg)
{
	int cpunr = (unsigned long)arg;
	DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
	static const struct sched_param param = {
		.sched_priority = MAX_USER_RT_PRIO/2,
	};
	unsigned int count = 0;
	unsigned int target_ratio;

	set_bit(cpunr, cpu_clamping_mask);
	set_freezable();
	init_timer_on_stack(&wakeup_timer);
	sched_setscheduler(current, SCHED_FIFO, &param);

	while (true == clamping && !kthread_should_stop() &&
		cpu_online(cpunr)) {
		int sleeptime;
		unsigned long target_jiffies;
		unsigned int guard;
		unsigned int compensation = 0;
		int interval; /* jiffies to sleep for each attempt */
		unsigned int duration_jiffies = msecs_to_jiffies(duration);
		unsigned int window_size_now;

		try_to_freeze();
		/*
		 * make sure the user selected ratio does not take effect until
		 * the next round. adjust target_ratio if the user has changed
		 * the target, such that we can converge quickly.
		 */
		target_ratio = set_target_ratio;
		guard = 1 + target_ratio/20;
		window_size_now = window_size;
		count++;

		/*
		 * systems may have different abilities to enter package level
		 * c-states, thus we need to compensate the injected idle ratio
		 * to achieve the actual target reported by the HW.
		 */
		compensation = get_compensation(target_ratio);
		interval = duration_jiffies*100/(target_ratio+compensation);
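		/*
		 * Example: with duration_jiffies = 6 and an effective target
		 * of 25% (target_ratio + compensation), interval is
		 * 6 * 100 / 25 = 24 jiffies: 6 jiffies idle out of every 24.
		 */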

		/* align idle time */
		target_jiffies = roundup(jiffies, interval);
		sleeptime = target_jiffies - jiffies;
		if (sleeptime <= 0)
			sleeptime = 1;
		schedule_timeout_interruptible(sleeptime);
		/*
		 * only the elected controlling cpu can collect stats and
		 * update control parameters.
		 */
		if (cpunr == control_cpu && !(count%window_size_now)) {
			should_skip =
				powerclamp_adjust_controls(target_ratio,
							guard, window_size_now);
			smp_mb();
		}

		if (should_skip)
			continue;

		target_jiffies = jiffies + duration_jiffies;
		mod_timer(&wakeup_timer, target_jiffies);
		if (unlikely(local_softirq_pending()))
			continue;
		/*
		 * stop the tick sched during idle time; interrupts are still
		 * allowed, thus jiffies are updated properly.
		 */
		preempt_disable();
		/* mwait until target jiffies is reached */
		while (time_before(jiffies, target_jiffies)) {
			unsigned long ecx = 1;
			unsigned long eax = target_mwait;

			/*
			 * REVISIT: may call enter_idle() to notify drivers who
			 * can save power during cpu idle. same for exit_idle()
			 */
			local_touch_nmi();
			stop_critical_timings();
			mwait_idle_with_hints(eax, ecx);
			start_critical_timings();
			atomic_inc(&idle_wakeup_counter);
		}
		preempt_enable();
	}
	del_timer_sync(&wakeup_timer);
	clear_bit(cpunr, cpu_clamping_mask);

	return 0;
}

/*
 * 1 HZ polling while clamping is active, useful for userspace
 * to monitor actual idle ratio.
 */
static void poll_pkg_cstate(struct work_struct *dummy);
static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate);
static void poll_pkg_cstate(struct work_struct *dummy)
{
	static u64 msr_last;
	static u64 tsc_last;
	static unsigned long jiffies_last;

	u64 msr_now;
	unsigned long jiffies_now;
	u64 tsc_now;
	u64 val64;

	msr_now = pkg_state_counter();
	tsc_now = rdtsc();
	jiffies_now = jiffies;

	/* calculate pkg cstate vs tsc ratio */
	if (!msr_last || !tsc_last)
		pkg_cstate_ratio_cur = 1;
	else {
		if (tsc_now - tsc_last) {
			val64 = 100 * (msr_now - msr_last);
			do_div(val64, (tsc_now - tsc_last));
			pkg_cstate_ratio_cur = val64;
		}
	}

	/* update record */
	msr_last = msr_now;
	jiffies_last = jiffies_now;
	tsc_last = tsc_now;

	if (true == clamping)
		schedule_delayed_work(&poll_pkg_cstate_work, HZ);
}

static int start_power_clamp(void)
{
	unsigned long cpu;
	struct task_struct *thread;

	/* abort if no pkg cstate counter MSR is accessible at all */
	if (!has_pkg_state_counter()) {
		pr_err("pkg cstate counter not functional, abort\n");
		return -EINVAL;
	}

	set_target_ratio = clamp(set_target_ratio, 0U, MAX_TARGET_RATIO - 1);
	/* prevent cpu hotplug */
	get_online_cpus();

	/* prefer BSP */
	control_cpu = 0;
	if (!cpu_online(control_cpu))
		control_cpu = smp_processor_id();

	clamping = true;
	schedule_delayed_work(&poll_pkg_cstate_work, 0);

	/* start one thread per online cpu */
	for_each_online_cpu(cpu) {
		struct task_struct **p =
			per_cpu_ptr(powerclamp_thread, cpu);

		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%ld", cpu);
		/* bind to cpu here */
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*p = thread;
		}

	}
	put_online_cpus();

	return 0;
}

static void end_power_clamp(void)
{
	int i;
	struct task_struct *thread;

	clamping = false;
	/*
	 * make clamping visible to other cpus and give the per cpu clamping
	 * threads some time to exit; any stragglers get killed below.
	 */
	smp_mb();
	msleep(20);
	if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
		for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
			pr_debug("clamping thread for cpu %d alive, kill\n", i);
			thread = *per_cpu_ptr(powerclamp_thread, i);
			kthread_stop(thread);
		}
	}
}

static int powerclamp_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	unsigned long cpu = (unsigned long)hcpu;
	struct task_struct *thread;
	struct task_struct **percpu_thread =
		per_cpu_ptr(powerclamp_thread, cpu);

	if (false == clamping)
		goto exit_ok;

	switch (action) {
	case CPU_ONLINE:
		thread = kthread_create_on_node(clamp_thread,
						(void *) cpu,
						cpu_to_node(cpu),
						"kidle_inject/%lu", cpu);
		if (likely(!IS_ERR(thread))) {
			kthread_bind(thread, cpu);
			wake_up_process(thread);
			*percpu_thread = thread;
		}
		/* prefer BSP as controlling CPU */
		if (cpu == 0) {
			control_cpu = 0;
			smp_mb();
		}
		break;
	case CPU_DEAD:
		if (test_bit(cpu, cpu_clamping_mask)) {
			pr_err("cpu %lu dead but powerclamping thread is not\n",
				cpu);
			kthread_stop(*percpu_thread);
		}
		if (cpu == control_cpu) {
			control_cpu = smp_processor_id();
			smp_mb();
		}
	}

exit_ok:
	return NOTIFY_OK;
}

static struct notifier_block powerclamp_cpu_notifier = {
	.notifier_call = powerclamp_cpu_callback,
};

static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	*state = MAX_TARGET_RATIO;

	return 0;
}

static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	if (true == clamping)
		*state = pkg_cstate_ratio_cur;
	else
		/* to save power, do not poll idle ratio while not clamping */
		*state = -1; /* indicates invalid state */

	return 0;
}

static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long new_target_ratio)
{
	int ret = 0;

	new_target_ratio = clamp(new_target_ratio, 0UL,
				(unsigned long) (MAX_TARGET_RATIO-1));
	if (set_target_ratio == 0 && new_target_ratio > 0) {
		pr_info("Start idle injection to reduce power\n");
		set_target_ratio = new_target_ratio;
		ret = start_power_clamp();
		goto exit_set;
	} else	if (set_target_ratio > 0 && new_target_ratio == 0) {
		pr_info("Stop forced idle injection\n");
		set_target_ratio = 0;
		end_power_clamp();
	} else	/* adjust currently running */ {
		set_target_ratio = new_target_ratio;
		/* make the new set_target_ratio visible to other cpus */
		smp_mb();
	}

exit_set:
	return ret;
}

/* bind to the generic thermal layer as a cooling device */
static struct thermal_cooling_device_ops powerclamp_cooling_ops = {
	.get_max_state = powerclamp_get_max_state,
	.get_cur_state = powerclamp_get_cur_state,
	.set_cur_state = powerclamp_set_cur_state,
};
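
/*
 * Illustrative usage: once registered, the thermal core exposes this driver
 * as /sys/class/thermal/cooling_deviceX (X varies per system). For example:
 *   echo 25 > /sys/class/thermal/cooling_deviceX/cur_state
 * starts injecting idle time targeting a 25% package idle ratio, and
 *   echo 0 > /sys/class/thermal/cooling_deviceX/cur_state
 * stops it.
 */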

/* runs on Nehalem and later */
static const struct x86_cpu_id intel_powerclamp_ids[] __initconst = {
	{ X86_VENDOR_INTEL, 6, 0x1a},
	{ X86_VENDOR_INTEL, 6, 0x1c},
	{ X86_VENDOR_INTEL, 6, 0x1e},
	{ X86_VENDOR_INTEL, 6, 0x1f},
	{ X86_VENDOR_INTEL, 6, 0x25},
	{ X86_VENDOR_INTEL, 6, 0x26},
	{ X86_VENDOR_INTEL, 6, 0x2a},
	{ X86_VENDOR_INTEL, 6, 0x2c},
	{ X86_VENDOR_INTEL, 6, 0x2d},
	{ X86_VENDOR_INTEL, 6, 0x2e},
	{ X86_VENDOR_INTEL, 6, 0x2f},
	{ X86_VENDOR_INTEL, 6, 0x37},
	{ X86_VENDOR_INTEL, 6, 0x3a},
	{ X86_VENDOR_INTEL, 6, 0x3c},
	{ X86_VENDOR_INTEL, 6, 0x3d},
	{ X86_VENDOR_INTEL, 6, 0x3e},
	{ X86_VENDOR_INTEL, 6, 0x3f},
	{ X86_VENDOR_INTEL, 6, 0x45},
	{ X86_VENDOR_INTEL, 6, 0x46},
	{ X86_VENDOR_INTEL, 6, 0x47},
	{ X86_VENDOR_INTEL, 6, 0x4c},
	{ X86_VENDOR_INTEL, 6, 0x4d},
	{ X86_VENDOR_INTEL, 6, 0x4e},
	{ X86_VENDOR_INTEL, 6, 0x4f},
	{ X86_VENDOR_INTEL, 6, 0x56},
	{ X86_VENDOR_INTEL, 6, 0x57},
	{ X86_VENDOR_INTEL, 6, 0x5e},
	{}
};
MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids);

static int __init powerclamp_probe(void)
{
	if (!x86_match_cpu(intel_powerclamp_ids)) {
		pr_err("Intel powerclamp does not run on family %d model %d\n",
				boot_cpu_data.x86, boot_cpu_data.x86_model);
		return -ENODEV;
	}
	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC) ||
		!boot_cpu_has(X86_FEATURE_CONSTANT_TSC) ||
		!boot_cpu_has(X86_FEATURE_MWAIT) ||
		!boot_cpu_has(X86_FEATURE_ARAT))
		return -ENODEV;
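
	/*
	 * Why these features matter: a constant, non-stop TSC keeps the
	 * residency/TSC ratio math valid across deep C-states, and ARAT
	 * (always-running APIC timer) lets the per-cpu wakeup timer fire
	 * while the CPU sits in mwait.
	 */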

	/* find the deepest mwait value */
	find_target_mwait();

	return 0;
}

static int powerclamp_debug_show(struct seq_file *m, void *unused)
{
	int i = 0;

	seq_printf(m, "controlling cpu: %d\n", control_cpu);
	seq_printf(m, "pct confidence steady dynamic (compensation)\n");
	for (i = 0; i < MAX_TARGET_RATIO; i++) {
		seq_printf(m, "%d\t%lu\t%lu\t%lu\n",
			i,
			cal_data[i].confidence,
			cal_data[i].steady_comp,
			cal_data[i].dynamic_comp);
	}

	return 0;
}

static int powerclamp_debug_open(struct inode *inode,
			struct file *file)
{
	return single_open(file, powerclamp_debug_show, inode->i_private);
}

static const struct file_operations powerclamp_debug_fops = {
	.open		= powerclamp_debug_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
	.owner		= THIS_MODULE,
};

static inline void powerclamp_create_debug_files(void)
{
	debug_dir = debugfs_create_dir("intel_powerclamp", NULL);
	if (!debug_dir)
		return;

	if (!debugfs_create_file("powerclamp_calib", S_IRUGO, debug_dir,
					cal_data, &powerclamp_debug_fops))
		goto file_error;

	return;

file_error:
	debugfs_remove_recursive(debug_dir);
}
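
/*
 * The calibration table can then be read, e.g. (assuming debugfs is mounted
 * at the usual location):
 *   cat /sys/kernel/debug/intel_powerclamp/powerclamp_calib
 */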

static int __init powerclamp_init(void)
{
	int retval;
	int bitmap_size;

	bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);
	cpu_clamping_mask = kzalloc(bitmap_size, GFP_KERNEL);
	if (!cpu_clamping_mask)
		return -ENOMEM;

	/* probe cpu features and ids here */
	retval = powerclamp_probe();
	if (retval)
		goto exit_free;

	/* set default limit, maybe adjusted during runtime based on feedback */
	window_size = 2;
	register_hotcpu_notifier(&powerclamp_cpu_notifier);

	powerclamp_thread = alloc_percpu(struct task_struct *);
	if (!powerclamp_thread) {
		retval = -ENOMEM;
		goto exit_unregister;
	}

	cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
						&powerclamp_cooling_ops);
	if (IS_ERR(cooling_dev)) {
		retval = -ENODEV;
		goto exit_free_thread;
	}

	if (!duration)
		duration = jiffies_to_msecs(DEFAULT_DURATION_JIFFIES);

	powerclamp_create_debug_files();

	return 0;

exit_free_thread:
	free_percpu(powerclamp_thread);
exit_unregister:
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
exit_free:
	kfree(cpu_clamping_mask);
	return retval;
}
module_init(powerclamp_init);

static void __exit powerclamp_exit(void)
{
	unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
	end_power_clamp();
	free_percpu(powerclamp_thread);
	thermal_cooling_device_unregister(cooling_dev);
	kfree(cpu_clamping_mask);

	cancel_delayed_work_sync(&poll_pkg_cstate_work);
	debugfs_remove_recursive(debug_dir);
}
module_exit(powerclamp_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>");
MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs");