// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the free running energy consumption
 * counters (pp0, pkg, dram and, depending on the model, gpu and psys).
 *
 * Each of those counters increments in an energy unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *	  event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *	  event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *	  event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *	  event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the builtin-psys domain (client only)
 *	  event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must adjust the counts to convert them to Watts using
 * the duration of the measurement. Tools may use a function such as
 * ldexp(raw_count, -32);
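 *
 * For example (illustrative figures only): a fixed point delta of
 * 0x200000000 accumulated over 4 seconds is ldexp(0x200000000, -32) =
 * 2 Joules, i.e. 0.5 Watts of average power. A tool post-processing the
 * raw count might do (sketch only):
 *
 *	double joules = ldexp((double)count, -32);
 *	double watts  = joules / elapsed_seconds;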
 */

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
	PERF_RAPL_PP0 = 0,		/* all cores */
	PERF_RAPL_PKG,			/* entire package */
	PERF_RAPL_RAM,			/* DRAM */
	PERF_RAPL_PP1,			/* gpu */
	PERF_RAPL_PSYS,			/* psys */

	PERF_RAPL_MAX,
	NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
	"pp0-core",
	"package",
	"dram",
	"pp1-gpu",
	"psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK	0xFFULL
#define RAPL_CNTR_WIDTH	32

#define RAPL_EVENT_ATTR_STR(_name, v, str)				\
static struct perf_pmu_events_attr event_attr_##v = {			\
	.attr		= __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
	.id		= 0,						\
	.event_str	= str,						\
};

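/*
 * Per-die PMU state. The raw spinlock serializes event start/stop against
 * the hrtimer, which periodically folds the free running MSR counts of all
 * active events into their 64-bit perf counts (see rapl_hrtimer_handle()).
 */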
struct rapl_pmu {
	raw_spinlock_t		lock;
	int			n_active;
	int			cpu;
	struct list_head	active_list;
	struct pmu		*pmu;
	ktime_t			timer_interval;
	struct hrtimer		hrtimer;
};

struct rapl_pmus {
	struct pmu		pmu;
	unsigned int		maxdie;
	struct rapl_pmu		*pmus[];
};

struct rapl_model {
	unsigned long	events;
	bool		apply_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr rapl_msrs[];

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
	unsigned int dieid = topology_logical_die_id(cpu);

	/*
	 * The unsigned check also catches the '-1' return value for
	 * non-existent mappings in the topology map.
	 */
	return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
	u64 raw;
	rdmsrl(event->hw.event_base, raw);
	return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
	if (cfg > NR_RAPL_DOMAINS) {
		pr_warn("Invalid domain %d, failed to scale data\n", cfg);
		return v;
	}
	/*
	 * scale delta to smallest unit (1/2^32)
	 * users must then scale back: count * 1/2^32 to get Joules
	 * or use ldexp(count, -32).
	 * Watts = Joules/Time delta
	 */
	return v << (32 - rapl_hw_unit[cfg - 1]);
}

static u64 rapl_event_update(struct perf_event *event)
{
	struct hw_perf_event *hwc = &event->hw;
	u64 prev_raw_count, new_raw_count;
	s64 delta, sdelta;
	int shift = RAPL_CNTR_WIDTH;

again:
	prev_raw_count = local64_read(&hwc->prev_count);
	rdmsrl(event->hw.event_base, new_raw_count);

	if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
			    new_raw_count) != prev_raw_count) {
		cpu_relax();
		goto again;
	}

	/*
	 * Now we have the new raw value and have updated the prev
	 * timestamp already. We can now calculate the elapsed delta
	 * (event-)time and add that to the generic event.
	 *
	 * Careful, not all hw sign-extends above the physical width
	 * of the count.
	 */
	delta = (new_raw_count << shift) - (prev_raw_count << shift);
	delta >>= shift;

	sdelta = rapl_scale(delta, event->hw.config);

	local64_add(sdelta, &event->count);

	return new_raw_count;
}

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
	hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
		      HRTIMER_MODE_REL_PINNED);
}

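/*
 * The energy status MSRs are only 32 bits wide and free running, so at a
 * sustained high power draw they can wrap between two user reads. The
 * per-die hrtimer (period derived in rapl_check_hw_unit()) updates every
 * active event often enough that no wrap-around goes unnoticed.
 */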
static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
	struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
	struct perf_event *event;
	unsigned long flags;

	if (!pmu->n_active)
		return HRTIMER_NORESTART;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	list_for_each_entry(event, &pmu->active_list, active_entry)
		rapl_event_update(event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	hrtimer_forward_now(hrtimer, pmu->timer_interval);

	return HRTIMER_RESTART;
}

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
	struct hrtimer *hr = &pmu->hrtimer;

	hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
				   struct perf_event *event)
{
	if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
		return;

	event->hw.state = 0;

	list_add_tail(&event->active_entry, &pmu->active_list);

	local64_set(&event->hw.prev_count, rapl_read_counter(event));

	pmu->n_active++;
	if (pmu->n_active == 1)
		rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);
	__rapl_pmu_event_start(pmu, event);
	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	/* mark event as deactivated and stopped */
	if (!(hwc->state & PERF_HES_STOPPED)) {
		WARN_ON_ONCE(pmu->n_active <= 0);
		pmu->n_active--;
		if (pmu->n_active == 0)
			hrtimer_cancel(&pmu->hrtimer);

		list_del(&event->active_entry);

		WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
		hwc->state |= PERF_HES_STOPPED;
	}

	/* check if update of sw counter is necessary */
	if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
		/*
		 * Drain the remaining delta count out of an event
		 * that we are disabling:
		 */
		rapl_event_update(event);
		hwc->state |= PERF_HES_UPTODATE;
	}

	raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
	struct rapl_pmu *pmu = event->pmu_private;
	struct hw_perf_event *hwc = &event->hw;
	unsigned long flags;

	raw_spin_lock_irqsave(&pmu->lock, flags);

	hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

	if (mode & PERF_EF_START)
		__rapl_pmu_event_start(pmu, event);

	raw_spin_unlock_irqrestore(&pmu->lock, flags);

	return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
	rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

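/*
 * Validate a new event: the config must name one of the domains that
 * perf_msr_probe() found on this system, sampling is rejected, and the
 * event is bound to the CPU that currently reads RAPL for its die.
 */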
static int rapl_pmu_event_init(struct perf_event *event)
{
	u64 cfg = event->attr.config & RAPL_EVENT_MASK;
	int bit, ret = 0;
	struct rapl_pmu *pmu;

	/* only look at RAPL events */
	if (event->attr.type != rapl_pmus->pmu.type)
		return -ENOENT;

	/* check only supported bits are set */
	if (event->attr.config & ~RAPL_EVENT_MASK)
		return -EINVAL;

	if (event->cpu < 0)
		return -EINVAL;

	event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

	if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
		return -EINVAL;

	cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
	bit = cfg - 1;

	/* check event supported */
	if (!(rapl_cntr_mask & (1 << bit)))
		return -EINVAL;

	/* unsupported modes and filters */
	if (event->attr.sample_period) /* no sampling */
		return -EINVAL;

	/* must be done before validate_group */
	pmu = cpu_to_rapl_pmu(event->cpu);
	if (!pmu)
		return -EINVAL;
	event->cpu = pmu->cpu;
	event->pmu_private = pmu;
	event->hw.event_base = rapl_msrs[bit].msr;
	event->hw.config = cfg;
	event->hw.idx = bit;

	return ret;
}

static void rapl_pmu_event_read(struct perf_event *event)
{
	rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
				     struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
	&dev_attr_cpumask.attr,
	NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
	.attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");

/*
 * All events are exposed in 2^-32 Joule (~0.23 nJ) increments regardless
 * of the MSR's native unit; rapl_scale() normalizes the raw counts.
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
	NULL,
};

static struct attribute_group rapl_pmu_events_group = {
	.name = "events",
	.attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
	&format_attr_event.attr,
	NULL,
};

static struct attribute_group rapl_pmu_format_group = {
	.name = "format",
	.attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
	&rapl_pmu_attr_group,
	&rapl_pmu_format_group,
	&rapl_pmu_events_group,
	NULL,
};

static struct attribute *rapl_events_cores[] = {
	EVENT_PTR(rapl_cores),
	EVENT_PTR(rapl_cores_unit),
	EVENT_PTR(rapl_cores_scale),
	NULL,
};

static struct attribute_group rapl_events_cores_group = {
	.name = "events",
	.attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
	EVENT_PTR(rapl_pkg),
	EVENT_PTR(rapl_pkg_unit),
	EVENT_PTR(rapl_pkg_scale),
	NULL,
};

static struct attribute_group rapl_events_pkg_group = {
	.name = "events",
	.attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
	EVENT_PTR(rapl_ram),
	EVENT_PTR(rapl_ram_unit),
	EVENT_PTR(rapl_ram_scale),
	NULL,
};

static struct attribute_group rapl_events_ram_group = {
	.name = "events",
	.attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
	EVENT_PTR(rapl_gpu),
	EVENT_PTR(rapl_gpu_unit),
	EVENT_PTR(rapl_gpu_scale),
	NULL,
};

static struct attribute_group rapl_events_gpu_group = {
	.name = "events",
	.attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
	EVENT_PTR(rapl_psys),
	EVENT_PTR(rapl_psys_unit),
	EVENT_PTR(rapl_psys_scale),
	NULL,
};

static struct attribute_group rapl_events_psys_group = {
	.name = "events",
	.attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
	return test_bit(idx, (unsigned long *) data);
}

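/*
 * Each entry pairs a domain's energy status MSR with the sysfs "events"
 * group that advertises it and the probe callback above; at init time
 * perf_msr_probe() enables only the domains listed in the matched
 * rapl_model whose MSR can actually be read.
 */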
static struct perf_msr rapl_msrs[] = {
	[PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr },
	[PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr },
	[PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr },
	[PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr },
	[PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr },
};

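/*
 * RAPL counters are per die. A single CPU per die, tracked in
 * rapl_cpu_mask, reads them on behalf of perf; the hotplug callbacks
 * below hand that role (and any active events) over to another online
 * CPU of the same die when the current reader goes away.
 */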
static int rapl_cpu_offline(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	/* Check if exiting cpu is used for collecting rapl events */
	if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
		return 0;

	pmu->cpu = -1;
	/* Find a new cpu to collect rapl events */
	target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

	/* Migrate rapl events to the new target */
	if (target < nr_cpu_ids) {
		cpumask_set_cpu(target, &rapl_cpu_mask);
		pmu->cpu = target;
		perf_pmu_migrate_context(pmu->pmu, cpu, target);
	}
	return 0;
}

static int rapl_cpu_online(unsigned int cpu)
{
	struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
	int target;

	if (!pmu) {
		pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
		if (!pmu)
			return -ENOMEM;

		raw_spin_lock_init(&pmu->lock);
		INIT_LIST_HEAD(&pmu->active_list);
		pmu->pmu = &rapl_pmus->pmu;
		pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
		rapl_hrtimer_init(pmu);

		rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
	}

	/*
	 * Check if there is an online cpu in the die which collects rapl
	 * events already.
	 */
	target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
	if (target < nr_cpu_ids)
		return 0;

	cpumask_set_cpu(cpu, &rapl_cpu_mask);
	pmu->cpu = cpu;
	return 0;
}

static int rapl_check_hw_unit(bool apply_quirk)
{
	u64 msr_rapl_power_unit_bits;
	int i;

	/* protect rdmsrl() to handle virtualization */
	if (rdmsrl_safe(MSR_RAPL_POWER_UNIT, &msr_rapl_power_unit_bits))
		return -1;
	for (i = 0; i < NR_RAPL_DOMAINS; i++)
		rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

	/*
	 * The DRAM domain on HSW server and KNL has a fixed energy unit which
	 * can differ from the unit reported by the RAPL_POWER_UNIT MSR. See
	 * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families,
	 * Volume 2 of 2, Datasheet, September 2014, Reference Number: 330784-001"
	 */
	if (apply_quirk)
		rapl_hw_unit[PERF_RAPL_RAM] = 16;

	/*
	 * Calculate the timer rate:
	 * Use reference of 200W for scaling the timeout to avoid counter
	 * overflows. 200W = 200 Joules/sec
	 * Divide interval by 2 to avoid lockstep (2 * 100)
	 * if hw unit is 32, then we use 2 ms 1/200/2
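	 * e.g. with the common 2^-16 Joules unit this yields
	 * 5 ms * 2^15 ~= 164 seconds between timer updates.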
	 */
	rapl_timer_ms = 2;
	if (rapl_hw_unit[0] < 32) {
		rapl_timer_ms = (1000 / (2 * 100));
		rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
	}
	return 0;
}

static void __init rapl_advertise(void)
{
	int i;

	pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
		hweight32(rapl_cntr_mask), rapl_timer_ms);

	for (i = 0; i < NR_RAPL_DOMAINS; i++) {
		if (rapl_cntr_mask & (1 << i)) {
			pr_info("hw unit of domain %s 2^-%d Joules\n",
				rapl_domain_names[i], rapl_hw_unit[i]);
		}
	}
}

static void cleanup_rapl_pmus(void)
{
	int i;

	for (i = 0; i < rapl_pmus->maxdie; i++)
		kfree(rapl_pmus->pmus[i]);
	kfree(rapl_pmus);
}

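/*
 * Per-domain "events" groups hooked up via attr_update: perf_msr_probe()
 * marks the groups of unavailable domains as not visible, so only the
 * detected events appear under the PMU's sysfs events directory.
 */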
static const struct attribute_group *rapl_attr_update[] = {
	&rapl_events_cores_group,
	&rapl_events_pkg_group,
	&rapl_events_ram_group,
	&rapl_events_gpu_group,
	&rapl_events_psys_group,
	NULL,
};

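/*
 * Allocate one rapl_pmu slot per possible die and describe the "power" PMU.
 * The per-die rapl_pmu structures themselves are allocated lazily from
 * rapl_cpu_online() when the first CPU of a die comes up.
 */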
static int __init init_rapl_pmus(void)
{
	int maxdie = topology_max_packages() * topology_max_die_per_package();
	size_t size;

	size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
	rapl_pmus = kzalloc(size, GFP_KERNEL);
	if (!rapl_pmus)
		return -ENOMEM;

	rapl_pmus->maxdie = maxdie;
	rapl_pmus->pmu.attr_groups = rapl_attr_groups;
	rapl_pmus->pmu.attr_update = rapl_attr_update;
	rapl_pmus->pmu.task_ctx_nr = perf_invalid_context;
	rapl_pmus->pmu.event_init = rapl_pmu_event_init;
	rapl_pmus->pmu.add = rapl_pmu_event_add;
	rapl_pmus->pmu.del = rapl_pmu_event_del;
	rapl_pmus->pmu.start = rapl_pmu_event_start;
	rapl_pmus->pmu.stop = rapl_pmu_event_stop;
	rapl_pmus->pmu.read = rapl_pmu_event_read;
	rapl_pmus->pmu.module = THIS_MODULE;
	rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE;
	return 0;
}

#define X86_RAPL_MODEL_MATCH(model, init)	\
	{ X86_VENDOR_INTEL, 6, model, X86_FEATURE_ANY, (unsigned long)&init }

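/*
 * Each rapl_model below lists the RAPL domains a given CPU model exposes
 * and whether the fixed DRAM energy unit quirk handled in
 * rapl_check_hw_unit() applies to it.
 */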
static struct rapl_model model_snb = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_PP1),
	.apply_quirk	= false,
};

static struct rapl_model model_snbep = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= false,
};

static struct rapl_model model_hsw = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1),
	.apply_quirk	= false,
};

static struct rapl_model model_hsx = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= true,
};

static struct rapl_model model_knl = {
	.events		= BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM),
	.apply_quirk	= true,
};

static struct rapl_model model_skl = {
	.events		= BIT(PERF_RAPL_PP0) |
			  BIT(PERF_RAPL_PKG) |
			  BIT(PERF_RAPL_RAM) |
			  BIT(PERF_RAPL_PP1) |
			  BIT(PERF_RAPL_PSYS),
	.apply_quirk	= false,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE, model_snb),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SANDYBRIDGE_X, model_snbep),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE, model_snb),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_IVYBRIDGE_X, model_snbep),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_X, model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_L, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_HASWELL_G, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_G, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_D, model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, model_knl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNM, model_knl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_L, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, model_hsx),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE_L, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_KABYLAKE, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_CANNONLAKE_L, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_D, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT_PLUS, model_hsw),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE_L, model_skl),
	X86_RAPL_MODEL_MATCH(INTEL_FAM6_ICELAKE, model_skl),
	{},
};

MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
	const struct x86_cpu_id *id;
	struct rapl_model *rm;
	int ret;

	id = x86_match_cpu(rapl_model_match);
	if (!id)
		return -ENODEV;

	rm = (struct rapl_model *) id->driver_data;
	rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
					false, (void *) &rm->events);

	ret = rapl_check_hw_unit(rm->apply_quirk);
	if (ret)
		return ret;

	ret = init_rapl_pmus();
	if (ret)
		return ret;

	/*
	 * Install callbacks. Core will call them for each online cpu.
	 */
	ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
				"perf/x86/rapl:online",
				rapl_cpu_online, rapl_cpu_offline);
	if (ret)
		goto out;

	ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
	if (ret)
		goto out1;

	rapl_advertise();
	return 0;

out1:
	cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
	pr_warn("Initialization failed (%d), disabled\n", ret);
	cleanup_rapl_pmus();
	return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
	cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
	perf_pmu_unregister(&rapl_pmus->pmu);
	cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);