1 /*
2  * perf_event_intel_rapl.c: support Intel RAPL energy consumption counters
3  * Copyright (C) 2013 Google, Inc., Stephane Eranian
4  *
5  * Intel RAPL interface is specified in the IA-32 Manual Vol3b
6  * section 14.7.1 (September 2013)
7  *
8  * RAPL provides more controls than just reporting energy consumption;
9  * however, here we only expose the energy consumption free running
10  * counters (pp0, pkg, ram, pp1).
11  *
12  * Each of those counters increments in a power unit defined by the
13  * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
14  * but it can vary.
15  *
16  * Counter to rapl events mappings:
17  *
18  *  pp0 counter: consumption of all physical cores (power plane 0)
19  * 	  event: rapl_energy_cores
20  *    perf code: 0x1
21  *
22  *  pkg counter: consumption of the whole processor package
23  *	  event: rapl_energy_pkg
24  *    perf code: 0x2
25  *
26  * dram counter: consumption of the dram domain (servers only)
27  *	  event: rapl_energy_dram
28  *    perf code: 0x3
29  *
30  *  gpu counter: consumption of the builtin-gpu domain (client only)
31  *	  event: rapl_energy_gpu
32  *    perf code: 0x4
33  *
34  * We manage those counters as free running (read-only). They may be
35  * used simultaneously by other tools, such as turbostat.
36  *
37  * The events only support system-wide mode counting. There is no
38  * sampling support because it does not make sense and is not
39  * supported by the RAPL hardware.
40  *
41  * Because we want to avoid floating-point operations in the kernel,
42  * the events are all reported in fixed point arithmetic (32.32).
43  * Tools convert the counts to Joules with a function such as
44  * ldexp(raw_count, -32), and divide by the duration of the
45  * measurement to obtain Watts.
46  */
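/*
 * Illustrative only (not part of this driver): a hypothetical user-space
 * consumer of these counts, having read the raw 64-bit value from the
 * perf file descriptor, would convert the 32.32 fixed point number
 * roughly as follows ('elapsed' being the measurement duration in
 * seconds):
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed;
 */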
47 #include <linux/module.h>
48 #include <linux/slab.h>
49 #include <linux/perf_event.h>
50 #include <asm/cpu_device_id.h>
51 #include "perf_event.h"
52 
53 /*
54  * RAPL energy status counters
55  */
56 #define RAPL_IDX_PP0_NRG_STAT	0	/* all cores */
57 #define INTEL_RAPL_PP0		0x1	/* pseudo-encoding */
58 #define RAPL_IDX_PKG_NRG_STAT	1	/* entire package */
59 #define INTEL_RAPL_PKG		0x2	/* pseudo-encoding */
60 #define RAPL_IDX_RAM_NRG_STAT	2	/* DRAM */
61 #define INTEL_RAPL_RAM		0x3	/* pseudo-encoding */
62 #define RAPL_IDX_PP1_NRG_STAT	3	/* gpu */
63 #define INTEL_RAPL_PP1		0x4	/* pseudo-encoding */
64 
65 /* Clients have PP0, PKG, PP1 */
66 #define RAPL_IDX_CLN	(1<<RAPL_IDX_PP0_NRG_STAT|\
67 			 1<<RAPL_IDX_PKG_NRG_STAT|\
68 			 1<<RAPL_IDX_PP1_NRG_STAT)
69 
70 /* Servers have PP0, PKG, RAM */
71 #define RAPL_IDX_SRV	(1<<RAPL_IDX_PP0_NRG_STAT|\
72 			 1<<RAPL_IDX_PKG_NRG_STAT|\
73 			 1<<RAPL_IDX_RAM_NRG_STAT)
74 
75 /* Haswell clients have PP0, PKG, RAM, PP1 */
76 #define RAPL_IDX_HSW	(1<<RAPL_IDX_PP0_NRG_STAT|\
77 			 1<<RAPL_IDX_PKG_NRG_STAT|\
78 			 1<<RAPL_IDX_RAM_NRG_STAT|\
79 			 1<<RAPL_IDX_PP1_NRG_STAT)
80 
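/*
 * With the index assignments above the masks are plain bit vectors over
 * the four counters:
 *	RAPL_IDX_CLN = 0x1 | 0x2 | 0x8 = 0xb		(pp0, pkg, pp1)
 *	RAPL_IDX_SRV = 0x1 | 0x2 | 0x4 = 0x7		(pp0, pkg, ram)
 *	RAPL_IDX_HSW = 0x1 | 0x2 | 0x4 | 0x8 = 0xf	(all four)
 */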
81 /*
82  * event code: LSB 8 bits, passed in attr->config
83  * any other bit is reserved
84  */
85 #define RAPL_EVENT_MASK	0xFFULL
86 
87 #define DEFINE_RAPL_FORMAT_ATTR(_var, _name, _format)		\
88 static ssize_t __rapl_##_var##_show(struct kobject *kobj,	\
89 				struct kobj_attribute *attr,	\
90 				char *page)			\
91 {								\
92 	BUILD_BUG_ON(sizeof(_format) >= PAGE_SIZE);		\
93 	return sprintf(page, _format "\n");			\
94 }								\
95 static struct kobj_attribute format_attr_##_var =		\
96 	__ATTR(_name, 0444, __rapl_##_var##_show, NULL)
97 
98 #define RAPL_EVENT_DESC(_name, _config)				\
99 {								\
100 	.attr	= __ATTR(_name, 0444, rapl_event_show, NULL),	\
101 	.config	= _config,					\
102 }
103 
104 #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */
105 
106 struct rapl_pmu {
107 	spinlock_t	 lock;
108 	int		 hw_unit;  /* 1/2^hw_unit Joule */
109 	int		 n_active; /* number of active events */
110 	struct list_head active_list;
111 	struct pmu	 *pmu; /* pointer to rapl_pmu_class */
112 	ktime_t		 timer_interval; /* in ktime_t unit */
113 	struct hrtimer   hrtimer;
114 };
115 
116 static struct pmu rapl_pmu_class;
117 static cpumask_t rapl_cpu_mask;
118 static int rapl_cntr_mask;
119 
120 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu);
121 static DEFINE_PER_CPU(struct rapl_pmu *, rapl_pmu_to_free);
122 
123 static inline u64 rapl_read_counter(struct perf_event *event)
124 {
125 	u64 raw;
126 	rdmsrl(event->hw.event_base, raw);
127 	return raw;
128 }
129 
130 static inline u64 rapl_scale(u64 v)
131 {
132 	/*
133 	 * scale delta to smallest unit (1/2^32)
134 	 * users must then scale back: count * 1/2^32 to get Joules
135 	 * or use ldexp(count, -32).
136 	 * Watts = Joules/Time delta
137 	 */
138 	return v << (32 - __this_cpu_read(rapl_pmu)->hw_unit);
139 }
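/*
 * Worked example of the scaling above: on SandyBridge hw_unit is
 * typically 16 (each tick is 1/2^16 Joule), so a raw delta of 0x10000
 * ticks becomes 0x10000 << (32 - 16) = 2^32, i.e. exactly one Joule
 * once user space applies the 2^-32 scale.
 */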
140 
141 static u64 rapl_event_update(struct perf_event *event)
142 {
143 	struct hw_perf_event *hwc = &event->hw;
144 	u64 prev_raw_count, new_raw_count;
145 	s64 delta, sdelta;
146 	int shift = RAPL_CNTR_WIDTH;
147 
148 again:
149 	prev_raw_count = local64_read(&hwc->prev_count);
150 	rdmsrl(event->hw.event_base, new_raw_count);
151 
152 	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
153 			    new_raw_count) != prev_raw_count) {
154 		cpu_relax();
155 		goto again;
156 	}
157 
158 	/*
159 	 * Now we have the new raw value and have updated the prev
160 	 * timestamp already. We can now calculate the elapsed delta
161 	 * (event-)time and add that to the generic event.
162 	 *
163 	 * Careful, not all hw sign-extends above the physical width
164 	 * of the count.
165 	 */
166 	delta = (new_raw_count << shift) - (prev_raw_count << shift);
167 	delta >>= shift;
168 
169 	sdelta = rapl_scale(delta);
170 
171 	local64_add(sdelta, &event->count);
172 
173 	return new_raw_count;
174 }
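/*
 * Shifting both raw values up by RAPL_CNTR_WIDTH before subtracting
 * makes the 32-bit wraparound come out right: e.g. prev = 0xfffffff0
 * and new = 0x00000010 yield delta = 0x20 after shifting back down,
 * which is exactly the 32 ticks that elapsed across the wrap.
 */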
175 
176 static void rapl_start_hrtimer(struct rapl_pmu *pmu)
177 {
178 	__hrtimer_start_range_ns(&pmu->hrtimer,
179 			pmu->timer_interval, 0,
180 			HRTIMER_MODE_REL_PINNED, 0);
181 }
182 
183 static void rapl_stop_hrtimer(struct rapl_pmu *pmu)
184 {
185 	hrtimer_cancel(&pmu->hrtimer);
186 }
187 
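/*
 * The RAPL energy status MSRs are 32-bit free-running counters with no
 * overflow interrupt, so a periodic hrtimer folds the pending delta of
 * every active event into its 64-bit perf count before the hardware
 * counter can wrap (see the timer_interval sizing in rapl_cpu_prepare()).
 */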
188 static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
189 {
190 	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
191 	struct perf_event *event;
192 	unsigned long flags;
193 
194 	if (!pmu->n_active)
195 		return HRTIMER_NORESTART;
196 
197 	spin_lock_irqsave(&pmu->lock, flags);
198 
199 	list_for_each_entry(event, &pmu->active_list, active_entry) {
200 		rapl_event_update(event);
201 	}
202 
203 	spin_unlock_irqrestore(&pmu->lock, flags);
204 
205 	hrtimer_forward_now(hrtimer, pmu->timer_interval);
206 
207 	return HRTIMER_RESTART;
208 }
209 
210 static void rapl_hrtimer_init(struct rapl_pmu *pmu)
211 {
212 	struct hrtimer *hr = &pmu->hrtimer;
213 
214 	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
215 	hr->function = rapl_hrtimer_handle;
216 }
217 
218 static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
219 				   struct perf_event *event)
220 {
221 	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
222 		return;
223 
224 	event->hw.state = 0;
225 
226 	list_add_tail(&event->active_entry, &pmu->active_list);
227 
228 	local64_set(&event->hw.prev_count, rapl_read_counter(event));
229 
230 	pmu->n_active++;
231 	if (pmu->n_active == 1)
232 		rapl_start_hrtimer(pmu);
233 }
234 
235 static void rapl_pmu_event_start(struct perf_event *event, int mode)
236 {
237 	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
238 	unsigned long flags;
239 
240 	spin_lock_irqsave(&pmu->lock, flags);
241 	__rapl_pmu_event_start(pmu, event);
242 	spin_unlock_irqrestore(&pmu->lock, flags);
243 }
244 
245 static void rapl_pmu_event_stop(struct perf_event *event, int mode)
246 {
247 	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
248 	struct hw_perf_event *hwc = &event->hw;
249 	unsigned long flags;
250 
251 	spin_lock_irqsave(&pmu->lock, flags);
252 
253 	/* mark event as deactivated and stopped */
254 	if (!(hwc->state & PERF_HES_STOPPED)) {
255 		WARN_ON_ONCE(pmu->n_active <= 0);
256 		pmu->n_active--;
257 		if (pmu->n_active == 0)
258 			rapl_stop_hrtimer(pmu);
259 
260 		list_del(&event->active_entry);
261 
262 		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
263 		hwc->state |= PERF_HES_STOPPED;
264 	}
265 
266 	/* check if update of sw counter is necessary */
267 	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
268 		/*
269 		 * Drain the remaining delta count out of an event
270 		 * that we are disabling:
271 		 */
272 		rapl_event_update(event);
273 		hwc->state |= PERF_HES_UPTODATE;
274 	}
275 
276 	spin_unlock_irqrestore(&pmu->lock, flags);
277 }
278 
279 static int rapl_pmu_event_add(struct perf_event *event, int mode)
280 {
281 	struct rapl_pmu *pmu = __this_cpu_read(rapl_pmu);
282 	struct hw_perf_event *hwc = &event->hw;
283 	unsigned long flags;
284 
285 	spin_lock_irqsave(&pmu->lock, flags);
286 
287 	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
288 
289 	if (mode & PERF_EF_START)
290 		__rapl_pmu_event_start(pmu, event);
291 
292 	spin_unlock_irqrestore(&pmu->lock, flags);
293 
294 	return 0;
295 }
296 
297 static void rapl_pmu_event_del(struct perf_event *event, int flags)
298 {
299 	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
300 }
301 
302 static int rapl_pmu_event_init(struct perf_event *event)
303 {
304 	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
305 	int bit, msr, ret = 0;
306 
307 	/* only look at RAPL events */
308 	if (event->attr.type != rapl_pmu_class.type)
309 		return -ENOENT;
310 
311 	/* check only supported bits are set */
312 	if (event->attr.config & ~RAPL_EVENT_MASK)
313 		return -EINVAL;
314 
315 	/*
316 	 * check event is known (determines counter)
317 	 */
318 	switch (cfg) {
319 	case INTEL_RAPL_PP0:
320 		bit = RAPL_IDX_PP0_NRG_STAT;
321 		msr = MSR_PP0_ENERGY_STATUS;
322 		break;
323 	case INTEL_RAPL_PKG:
324 		bit = RAPL_IDX_PKG_NRG_STAT;
325 		msr = MSR_PKG_ENERGY_STATUS;
326 		break;
327 	case INTEL_RAPL_RAM:
328 		bit = RAPL_IDX_RAM_NRG_STAT;
329 		msr = MSR_DRAM_ENERGY_STATUS;
330 		break;
331 	case INTEL_RAPL_PP1:
332 		bit = RAPL_IDX_PP1_NRG_STAT;
333 		msr = MSR_PP1_ENERGY_STATUS;
334 		break;
335 	default:
336 		return -EINVAL;
337 	}
338 	/* check event supported */
339 	if (!(rapl_cntr_mask & (1 << bit)))
340 		return -EINVAL;
341 
342 	/* unsupported modes and filters */
343 	if (event->attr.exclude_user   ||
344 	    event->attr.exclude_kernel ||
345 	    event->attr.exclude_hv     ||
346 	    event->attr.exclude_idle   ||
347 	    event->attr.exclude_host   ||
348 	    event->attr.exclude_guest  ||
349 	    event->attr.sample_period) /* no sampling */
350 		return -EINVAL;
351 
352 	/* must be done before validate_group */
353 	event->hw.event_base = msr;
354 	event->hw.config = cfg;
355 	event->hw.idx = bit;
356 
357 	return ret;
358 }
359 
360 static void rapl_pmu_event_read(struct perf_event *event)
361 {
362 	rapl_event_update(event);
363 }
364 
365 static ssize_t rapl_get_attr_cpumask(struct device *dev,
366 				struct device_attribute *attr, char *buf)
367 {
368 	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &rapl_cpu_mask);
369 
370 	buf[n++] = '\n';
371 	buf[n] = '\0';
372 	return n;
373 }
374 
375 static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);
376 
377 static struct attribute *rapl_pmu_attrs[] = {
378 	&dev_attr_cpumask.attr,
379 	NULL,
380 };
381 
382 static struct attribute_group rapl_pmu_attr_group = {
383 	.attrs = rapl_pmu_attrs,
384 };
385 
386 EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
387 EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
388 EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
389 EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
390 
391 EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
392 EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
393 EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
394 EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
395 
396 /*
397  * we compute in 2^-32 Joule (~0.23 nJ) increments regardless of the MSR unit
398  */
399 EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
400 EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
401 EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
402 EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
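/*
 * Note: the scale string above is exactly 2^-32, matching the
 * ldexp(raw_count, -32) conversion described in the header; tools such
 * as perf are expected to multiply the raw count by this scale and
 * report the result in the unit advertised above (Joules).
 */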
403 
404 static struct attribute *rapl_events_srv_attr[] = {
405 	EVENT_PTR(rapl_cores),
406 	EVENT_PTR(rapl_pkg),
407 	EVENT_PTR(rapl_ram),
408 
409 	EVENT_PTR(rapl_cores_unit),
410 	EVENT_PTR(rapl_pkg_unit),
411 	EVENT_PTR(rapl_ram_unit),
412 
413 	EVENT_PTR(rapl_cores_scale),
414 	EVENT_PTR(rapl_pkg_scale),
415 	EVENT_PTR(rapl_ram_scale),
416 	NULL,
417 };
418 
419 static struct attribute *rapl_events_cln_attr[] = {
420 	EVENT_PTR(rapl_cores),
421 	EVENT_PTR(rapl_pkg),
422 	EVENT_PTR(rapl_gpu),
423 
424 	EVENT_PTR(rapl_cores_unit),
425 	EVENT_PTR(rapl_pkg_unit),
426 	EVENT_PTR(rapl_gpu_unit),
427 
428 	EVENT_PTR(rapl_cores_scale),
429 	EVENT_PTR(rapl_pkg_scale),
430 	EVENT_PTR(rapl_gpu_scale),
431 	NULL,
432 };
433 
434 static struct attribute *rapl_events_hsw_attr[] = {
435 	EVENT_PTR(rapl_cores),
436 	EVENT_PTR(rapl_pkg),
437 	EVENT_PTR(rapl_gpu),
438 	EVENT_PTR(rapl_ram),
439 
440 	EVENT_PTR(rapl_cores_unit),
441 	EVENT_PTR(rapl_pkg_unit),
442 	EVENT_PTR(rapl_gpu_unit),
443 	EVENT_PTR(rapl_ram_unit),
444 
445 	EVENT_PTR(rapl_cores_scale),
446 	EVENT_PTR(rapl_pkg_scale),
447 	EVENT_PTR(rapl_gpu_scale),
448 	EVENT_PTR(rapl_ram_scale),
449 	NULL,
450 };
451 
452 static struct attribute_group rapl_pmu_events_group = {
453 	.name = "events",
454 	.attrs = NULL, /* patched at runtime */
455 };
456 
457 DEFINE_RAPL_FORMAT_ATTR(event, event, "config:0-7");
458 static struct attribute *rapl_formats_attr[] = {
459 	&format_attr_event.attr,
460 	NULL,
461 };
462 
463 static struct attribute_group rapl_pmu_format_group = {
464 	.name = "format",
465 	.attrs = rapl_formats_attr,
466 };
467 
468 const struct attribute_group *rapl_attr_groups[] = {
469 	&rapl_pmu_attr_group,
470 	&rapl_pmu_format_group,
471 	&rapl_pmu_events_group,
472 	NULL,
473 };
474 
475 static struct pmu rapl_pmu_class = {
476 	.attr_groups	= rapl_attr_groups,
477 	.task_ctx_nr	= perf_invalid_context, /* system-wide only */
478 	.event_init	= rapl_pmu_event_init,
479 	.add		= rapl_pmu_event_add, /* must have */
480 	.del		= rapl_pmu_event_del, /* must have */
481 	.start		= rapl_pmu_event_start,
482 	.stop		= rapl_pmu_event_stop,
483 	.read		= rapl_pmu_event_read,
484 };
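/*
 * Hypothetical usage sketch (not part of this file): once the PMU is
 * registered below under the name "power", the events can be counted
 * system-wide, e.g.:
 *
 *	# perf stat -a -e power/energy-pkg/ -- sleep 1
 *
 * or via perf_event_open() with the type read from
 * /sys/bus/event_source/devices/power/type and config = 0x2
 * (INTEL_RAPL_PKG).
 */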
485 
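/*
 * RAPL MSRs are package-scoped, so only one CPU per physical package is
 * kept in rapl_cpu_mask and reads the counters on behalf of the whole
 * package; the hotplug helpers below hand that role (and any active
 * events) over to a sibling CPU when the designated CPU goes offline.
 */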
486 static void rapl_cpu_exit(int cpu)
487 {
488 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
489 	int i, phys_id = topology_physical_package_id(cpu);
490 	int target = -1;
491 
492 	/* find a new cpu on same package */
493 	for_each_online_cpu(i) {
494 		if (i == cpu)
495 			continue;
496 		if (phys_id == topology_physical_package_id(i)) {
497 			target = i;
498 			break;
499 		}
500 	}
501 	/*
502 	 * clear cpu from the cpumask: if it was the designated reader
503 	 * and another cpu remains on the package, hand the role over
504 	 * to that cpu
505 	 */
506 	if (cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask) && target >= 0)
507 		cpumask_set_cpu(target, &rapl_cpu_mask);
508 
509 	WARN_ON(cpumask_empty(&rapl_cpu_mask));
510 	/*
511 	 * migrate events and context to new cpu
512 	 */
513 	if (target >= 0)
514 		perf_pmu_migrate_context(pmu->pmu, cpu, target);
515 
516 	/* cancel overflow polling timer for CPU */
517 	rapl_stop_hrtimer(pmu);
518 }
519 
520 static void rapl_cpu_init(int cpu)
521 {
522 	int i, phys_id = topology_physical_package_id(cpu);
523 
524 	/* check if phys_id is already covered */
525 	for_each_cpu(i, &rapl_cpu_mask) {
526 		if (phys_id == topology_physical_package_id(i))
527 			return;
528 	}
529 	/* was not found, so add it */
530 	cpumask_set_cpu(cpu, &rapl_cpu_mask);
531 }
532 
533 static int rapl_cpu_prepare(int cpu)
534 {
535 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
536 	int phys_id = topology_physical_package_id(cpu);
537 	u64 ms;
538 	u64 msr_rapl_power_unit_bits;
539 
540 	if (pmu)
541 		return 0;
542 
543 	if (phys_id < 0)
544 		return -1;
545 
546 	/* protect rdmsrl() to handle virtualization */
547 	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
548 		return -1;
549 
550 	pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
551 	if (!pmu)
552 		return -1;
553 
554 	spin_lock_init(&pmu->lock);
555 
556 	INIT_LIST_HEAD(&pmu->active_list);
557 
558 	/*
559 	 * grab power unit as: 1/2^unit Joules
560 	 *
561 	 * we cache it in the local PMU instance
562 	 */
563 	pmu->hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;
564 	pmu->pmu = &rapl_pmu_class;
565 
566 	/*
567 	 * use reference of 200W for scaling the timeout
568 	 * to avoid missing counter overflows.
569 	 * 200W = 200 Joules/sec
570 	 * divide interval by 2 to avoid lockstep (2 * 100)
571 	 * if the hw unit is 32, then we use 2 ms (1/200/2)
572 	 */
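	/*
	 * Worked example: with hw_unit = 16 (1/2^16 Joule per tick) the
	 * formula below gives ms = (1000 / (2 * 100)) * 2^(32 - 16 - 1)
	 * = 5 * 32768 = 163840 ms, i.e. about half of the ~327 seconds a
	 * 32-bit counter would take to wrap at a sustained 200 W.
	 */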
573 	if (pmu->hw_unit < 32)
574 		ms = (1000 / (2 * 100)) * (1ULL << (32 - pmu->hw_unit - 1));
575 	else
576 		ms = 2;
577 
578 	pmu->timer_interval = ms_to_ktime(ms);
579 
580 	rapl_hrtimer_init(pmu);
581 
582 	/* set RAPL pmu for this cpu for now */
583 	per_cpu(rapl_pmu, cpu) = pmu;
584 	per_cpu(rapl_pmu_to_free, cpu) = NULL;
585 
586 	return 0;
587 }
588 
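/*
 * Teardown is split in two steps: CPU_UP_CANCELED/CPU_DYING only detach
 * the rapl_pmu instance from the per-cpu pointer (rapl_cpu_dying), and
 * the memory itself is released later, on CPU_ONLINE/CPU_DEAD, through
 * the rapl_pmu_to_free slot (rapl_cpu_kfree).
 */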
589 static void rapl_cpu_kfree(int cpu)
590 {
591 	struct rapl_pmu *pmu = per_cpu(rapl_pmu_to_free, cpu);
592 
593 	kfree(pmu);
594 
595 	per_cpu(rapl_pmu_to_free, cpu) = NULL;
596 }
597 
598 static int rapl_cpu_dying(int cpu)
599 {
600 	struct rapl_pmu *pmu = per_cpu(rapl_pmu, cpu);
601 
602 	if (!pmu)
603 		return 0;
604 
605 	per_cpu(rapl_pmu, cpu) = NULL;
606 
607 	per_cpu(rapl_pmu_to_free, cpu) = pmu;
608 
609 	return 0;
610 }
611 
612 static int rapl_cpu_notifier(struct notifier_block *self,
613 			     unsigned long action, void *hcpu)
614 {
615 	unsigned int cpu = (long)hcpu;
616 
617 	switch (action & ~CPU_TASKS_FROZEN) {
618 	case CPU_UP_PREPARE:
619 		rapl_cpu_prepare(cpu);
620 		break;
621 	case CPU_STARTING:
622 		rapl_cpu_init(cpu);
623 		break;
624 	case CPU_UP_CANCELED:
625 	case CPU_DYING:
626 		rapl_cpu_dying(cpu);
627 		break;
628 	case CPU_ONLINE:
629 	case CPU_DEAD:
630 		rapl_cpu_kfree(cpu);
631 		break;
632 	case CPU_DOWN_PREPARE:
633 		rapl_cpu_exit(cpu);
634 		break;
635 	default:
636 		break;
637 	}
638 
639 	return NOTIFY_OK;
640 }
641 
642 static const struct x86_cpu_id rapl_cpu_match[] = {
643 	[0] = { .vendor = X86_VENDOR_INTEL, .family = 6 },
644 	[1] = {},
645 };
646 
647 static int __init rapl_pmu_init(void)
648 {
649 	struct rapl_pmu *pmu;
650 	int cpu, ret;
651 
652 	/*
653 	 * check for Intel processor family 6
654 	 */
655 	if (!x86_match_cpu(rapl_cpu_match))
656 		return 0;
657 
658 	/* check supported CPU */
659 	switch (boot_cpu_data.x86_model) {
660 	case 42: /* Sandy Bridge */
661 	case 58: /* Ivy Bridge */
662 		rapl_cntr_mask = RAPL_IDX_CLN;
663 		rapl_pmu_events_group.attrs = rapl_events_cln_attr;
664 		break;
665 	case 60: /* Haswell */
666 	case 69: /* Haswell-ULT */
667 	case 61: /* Broadwell */
668 		rapl_cntr_mask = RAPL_IDX_HSW;
669 		rapl_pmu_events_group.attrs = rapl_events_hsw_attr;
670 		break;
671 	case 45: /* Sandy Bridge-EP */
672 	case 62: /* IvyTown */
673 		rapl_cntr_mask = RAPL_IDX_SRV;
674 		rapl_pmu_events_group.attrs = rapl_events_srv_attr;
675 		break;
676 
677 	default:
678 		/* unsupported */
679 		return 0;
680 	}
681 
682 	cpu_notifier_register_begin();
683 
684 	for_each_online_cpu(cpu) {
685 		ret = rapl_cpu_prepare(cpu);
686 		if (ret)
687 			goto out;
688 		rapl_cpu_init(cpu);
689 	}
690 
691 	__perf_cpu_notifier(rapl_cpu_notifier);
692 
693 	ret = perf_pmu_register(&rapl_pmu_class, "power", -1);
694 	if (WARN_ON(ret)) {
695 		pr_info("RAPL PMU detected, registration failed (%d), RAPL PMU disabled\n", ret);
696 		cpu_notifier_register_done();
697 		return -1;
698 	}
699 
700 	pmu = __this_cpu_read(rapl_pmu);
701 
702 	pr_info("RAPL PMU detected, hw unit 2^-%d Joules,"
703 		" API unit is 2^-32 Joules,"
704 		" %d fixed counters"
705 		" %llu ms ovfl timer\n",
706 		pmu->hw_unit,
707 		hweight32(rapl_cntr_mask),
708 		ktime_to_ms(pmu->timer_interval));
709 
710 out:
711 	cpu_notifier_register_done();
712 
713 	return 0;
714 }
715 device_initcall(rapl_pmu_init);
716