// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the energy consumption free running
 * counters (pp0, pkg, dram, gpu and psys).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the built-in gpu domain (clients only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the built-in psys domain (clients only)
 *	  event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
 */
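
/*
 * Illustrative userspace sketch (not part of this driver): turning a
 * raw 32.32 fixed-point count delivered by this PMU into Joules and
 * average Watts. The perf_event_open() plumbing is elided, and
 * 'raw_count' and 'elapsed_sec' are hypothetical names:
 *
 *	double joules = ldexp((double)raw_count, -32);
 *	double watts  = joules / elapsed_sec;
 */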

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)					\
static struct perf_pmu_events_attr event_attr_##v = {				\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL),	\
	.id		= 0,							\
	.event_str	= str,							\
};

struct rapl_pmu {
	raw_spinlock_t		lock;
	int			n_active;
	int			cpu;
	struct list_head	active_list;
	struct pmu		*pmu;
	ktime_t			timer_interval;
	struct hrtimer		hrtimer;
};

struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;
	struct rapl_pmu		*pmus[];
};

struct rapl_model {
	unsigned long	events;
	bool		apply_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr rapl_msrs[];

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
	unsigned int dieid = topology_logical_die_id(cpu);

	/*
	 * The unsigned check also catches the '-1' return value for non
	 * existent mappings in the topology map.
	 */
	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;
	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/2^32 to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}
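
/*
 * Worked example: on SandyBridge the hardware unit is 1/2^16 Joules
 * (rapl_hw_unit[] == 16), so rapl_scale() shifts a raw delta left by
 * 32 - 16 = 16 bits. One hardware increment thus becomes 2^16 units
 * of 2^-32 J, and ldexp(count, -32) in userspace recovers Joules.
 */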

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}
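
/*
 * Note on the shift pair above: with RAPL_CNTR_WIDTH == 32, shifting
 * both snapshots left by 32 and the signed difference back right by
 * 32 discards any bits above the physical counter width, so a counter
 * that wrapped (new_raw_count < prev_raw_count) still yields a small
 * positive delta rather than a huge negative one.
 */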

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg  ,   rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram  ,   rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu  ,   rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys,   rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit  ,   rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit  ,   rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit  ,   rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit,   rapl_psys_unit, "Joules");

/*
 * All events are exported in increments of 2^-32 Joules (~0.23 nJ)
 * regardless of the hardware unit reported by the MSR.
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale,     rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale,     rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale,     rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale,   rapl_psys_scale, "2.3283064365386962890625e-10");
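
/*
 * These attributes surface under /sys/bus/event_source/devices/power/
 * once the PMU is registered (see rapl_pmu_init() below), so the
 * counters can be read system-wide with, for example:
 *
 *	perf stat -a -e power/energy-pkg/ -- sleep 1
 *
 * (illustrative invocation; the events actually present depend on the
 * counters probed on the running model)
 */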

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name  = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name  = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name  = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name  = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name  = "events",
	.attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
	return test_bit(idx, (unsigned long *) data);
}

static struct perf_msr rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
};
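
/*
 * rapl_msrs[] is consumed by perf_msr_probe() at init time: for each
 * domain, test_msr() checks the corresponding bit in the rapl_model
 * events mask handed in as 'data', and the availability mask that
 * perf_msr_probe() returns becomes rapl_cntr_mask (see
 * rapl_pmu_init() below).
 */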

static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}

static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the package which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}

static int rapl_check_hw_unit(bool apply_quirk)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	/*
	 * The DRAM domain on HSW server and KNL has a fixed energy unit
	 * which can differ from the unit in the power unit MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
	 * of 2. Datasheet, September 2014, Reference Number: 330784-001"
	 */
	if (apply_quirk)
		rapl_hw_unit[PERF_RAPL_RAM] = 16;

	/*
	 * Calculate the timer rate:
	 * Use a reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec
	 * Divide the interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}
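
/*
 * Worked example for the timer rate above: with a 2^-16 J hardware
 * unit, the 32-bit counter overflows after 2^32 * 2^-16 J = 65536 J,
 * i.e. after ~328 s at the 200 W reference. rapl_timer_ms then comes
 * out as 5 * 2^15 = 163840 ms, about half the overflow period.
 */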

static void __init rapl_advertise(void)
{
	int i;

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask), rapl_timer_ms);

	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
}

static void cleanup_rapl_pmus(void)
{
	int i;

	for (i = 0; i < rapl_pmus->maxdie; i++)
		kfree(rapl_pmus->pmus[i]);
	kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};

static int __init init_rapl_pmus(void)
{
	int maxdie = topology_max_packages() * topology_max_die_per_package();
	size_t size;

	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
	rapl_pmus = kzalloc(size, GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->maxdie		= maxdie;
	rapl_pmus->pmu.attr_groups	= rapl_attr_groups;
	rapl_pmus->pmu.attr_update	= rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr	= perf_invalid_context;
	rapl_pmus->pmu.event_init	= rapl_pmu_event_init;
	rapl_pmus->pmu.add		= rapl_pmu_event_add;
	rapl_pmus->pmu.del		= rapl_pmu_event_del;
	rapl_pmus->pmu.start		= rapl_pmu_event_start;
	rapl_pmus->pmu.stop		= rapl_pmu_event_stop;
	rapl_pmus->pmu.read		= rapl_pmu_event_read;
	rapl_pmus->pmu.module		= THIS_MODULE;
	rapl_pmus->pmu.capabilities	= PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}

#define X86_RAPL_MODEL_MATCH(model, init)	\
	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }

static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.apply_quirk	= false,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= false,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.apply_quirk	= false,
};

static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= true,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= true,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.apply_quirk	= false,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE,		model_snb),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X,		model_snbep),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE,		model_snb),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X,		model_snbep),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X,		model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X,		model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D,		model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL,		model_knl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM,		model_knl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X,		model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT,		model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D,	model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS,	model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L,		model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE,		model_skl),
	{},
};

MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;
	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm->apply_quirk);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);
794