• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Intel(R) Processor Trace PMU driver for perf
3  * Copyright (c) 2013-2014, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  *
14  * Intel PT is specified in the Intel Architecture Instruction Set Extensions
15  * Programming Reference:
16  * http://software.intel.com/en-us/intel-isa-extensions
17  */
18 
19 #undef DEBUG
20 
21 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
22 
23 #include <linux/types.h>
24 #include <linux/slab.h>
25 #include <linux/device.h>
26 
27 #include <asm/perf_event.h>
28 #include <asm/insn.h>
29 #include <asm/io.h>
30 #include <asm/intel_pt.h>
31 
32 #include "../perf_event.h"
33 #include "pt.h"
34 
35 static DEFINE_PER_CPU(struct pt, pt_ctx);
36 
37 static struct pt_pmu pt_pmu;
38 
39 enum cpuid_regs {
40 	CR_EAX = 0,
41 	CR_ECX,
42 	CR_EDX,
43 	CR_EBX
44 };
45 
46 /*
47  * Capabilities of Intel PT hardware, such as number of address bits or
48  * supported output schemes, are cached and exported to userspace as "caps"
49  * attribute group of pt pmu device
50  * (/sys/bus/event_source/devices/intel_pt/caps/) so that userspace can store
51  * relevant bits together with intel_pt traces.
52  *
53  * These are necessary for both trace decoding (payloads_lip, contains address
54  * width encoded in IP-related packets), and event configuration (bitmasks with
55  * permitted values for certain bit fields).
56  */
57 #define PT_CAP(_n, _l, _r, _m)						\
58 	[PT_CAP_ ## _n] = { .name = __stringify(_n), .leaf = _l,	\
59 			    .reg = _r, .mask = _m }
60 
61 static struct pt_cap_desc {
62 	const char	*name;
63 	u32		leaf;
64 	u8		reg;
65 	u32		mask;
66 } pt_caps[] = {
67 	PT_CAP(max_subleaf,		0, CR_EAX, 0xffffffff),
68 	PT_CAP(cr3_filtering,		0, CR_EBX, BIT(0)),
69 	PT_CAP(psb_cyc,			0, CR_EBX, BIT(1)),
70 	PT_CAP(ip_filtering,		0, CR_EBX, BIT(2)),
71 	PT_CAP(mtc,			0, CR_EBX, BIT(3)),
72 	PT_CAP(ptwrite,			0, CR_EBX, BIT(4)),
73 	PT_CAP(power_event_trace,	0, CR_EBX, BIT(5)),
74 	PT_CAP(topa_output,		0, CR_ECX, BIT(0)),
75 	PT_CAP(topa_multiple_entries,	0, CR_ECX, BIT(1)),
76 	PT_CAP(single_range_output,	0, CR_ECX, BIT(2)),
77 	PT_CAP(payloads_lip,		0, CR_ECX, BIT(31)),
78 	PT_CAP(num_address_ranges,	1, CR_EAX, 0x3),
79 	PT_CAP(mtc_periods,		1, CR_EAX, 0xffff0000),
80 	PT_CAP(cycle_thresholds,	1, CR_EBX, 0xffff),
81 	PT_CAP(psb_periods,		1, CR_EBX, 0xffff0000),
82 };
83 
pt_cap_get(enum pt_capabilities cap)84 static u32 pt_cap_get(enum pt_capabilities cap)
85 {
86 	struct pt_cap_desc *cd = &pt_caps[cap];
87 	u32 c = pt_pmu.caps[cd->leaf * PT_CPUID_REGS_NUM + cd->reg];
88 	unsigned int shift = __ffs(cd->mask);
89 
90 	return (c & cd->mask) >> shift;
91 }
92 
pt_cap_show(struct device * cdev,struct device_attribute * attr,char * buf)93 static ssize_t pt_cap_show(struct device *cdev,
94 			   struct device_attribute *attr,
95 			   char *buf)
96 {
97 	struct dev_ext_attribute *ea =
98 		container_of(attr, struct dev_ext_attribute, attr);
99 	enum pt_capabilities cap = (long)ea->var;
100 
101 	return snprintf(buf, PAGE_SIZE, "%x\n", pt_cap_get(cap));
102 }
103 
104 static struct attribute_group pt_cap_group = {
105 	.name	= "caps",
106 };
107 
108 PMU_FORMAT_ATTR(cyc,		"config:1"	);
109 PMU_FORMAT_ATTR(pwr_evt,	"config:4"	);
110 PMU_FORMAT_ATTR(fup_on_ptw,	"config:5"	);
111 PMU_FORMAT_ATTR(mtc,		"config:9"	);
112 PMU_FORMAT_ATTR(tsc,		"config:10"	);
113 PMU_FORMAT_ATTR(noretcomp,	"config:11"	);
114 PMU_FORMAT_ATTR(ptw,		"config:12"	);
115 PMU_FORMAT_ATTR(mtc_period,	"config:14-17"	);
116 PMU_FORMAT_ATTR(cyc_thresh,	"config:19-22"	);
117 PMU_FORMAT_ATTR(psb_period,	"config:24-27"	);
118 
119 static struct attribute *pt_formats_attr[] = {
120 	&format_attr_cyc.attr,
121 	&format_attr_pwr_evt.attr,
122 	&format_attr_fup_on_ptw.attr,
123 	&format_attr_mtc.attr,
124 	&format_attr_tsc.attr,
125 	&format_attr_noretcomp.attr,
126 	&format_attr_ptw.attr,
127 	&format_attr_mtc_period.attr,
128 	&format_attr_cyc_thresh.attr,
129 	&format_attr_psb_period.attr,
130 	NULL,
131 };
132 
133 static struct attribute_group pt_format_group = {
134 	.name	= "format",
135 	.attrs	= pt_formats_attr,
136 };
137 
138 static ssize_t
pt_timing_attr_show(struct device * dev,struct device_attribute * attr,char * page)139 pt_timing_attr_show(struct device *dev, struct device_attribute *attr,
140 		    char *page)
141 {
142 	struct perf_pmu_events_attr *pmu_attr =
143 		container_of(attr, struct perf_pmu_events_attr, attr);
144 
145 	switch (pmu_attr->id) {
146 	case 0:
147 		return sprintf(page, "%lu\n", pt_pmu.max_nonturbo_ratio);
148 	case 1:
149 		return sprintf(page, "%u:%u\n",
150 			       pt_pmu.tsc_art_num,
151 			       pt_pmu.tsc_art_den);
152 	default:
153 		break;
154 	}
155 
156 	return -EINVAL;
157 }
158 
159 PMU_EVENT_ATTR(max_nonturbo_ratio, timing_attr_max_nonturbo_ratio, 0,
160 	       pt_timing_attr_show);
161 PMU_EVENT_ATTR(tsc_art_ratio, timing_attr_tsc_art_ratio, 1,
162 	       pt_timing_attr_show);
163 
164 static struct attribute *pt_timing_attr[] = {
165 	&timing_attr_max_nonturbo_ratio.attr.attr,
166 	&timing_attr_tsc_art_ratio.attr.attr,
167 	NULL,
168 };
169 
170 static struct attribute_group pt_timing_group = {
171 	.attrs	= pt_timing_attr,
172 };
173 
174 static const struct attribute_group *pt_attr_groups[] = {
175 	&pt_cap_group,
176 	&pt_format_group,
177 	&pt_timing_group,
178 	NULL,
179 };
180 
pt_pmu_hw_init(void)181 static int __init pt_pmu_hw_init(void)
182 {
183 	struct dev_ext_attribute *de_attrs;
184 	struct attribute **attrs;
185 	size_t size;
186 	u64 reg;
187 	int ret;
188 	long i;
189 
190 	rdmsrl(MSR_PLATFORM_INFO, reg);
191 	pt_pmu.max_nonturbo_ratio = (reg & 0xff00) >> 8;
192 
193 	/*
194 	 * if available, read in TSC to core crystal clock ratio,
195 	 * otherwise, zero for numerator stands for "not enumerated"
196 	 * as per SDM
197 	 */
198 	if (boot_cpu_data.cpuid_level >= CPUID_TSC_LEAF) {
199 		u32 eax, ebx, ecx, edx;
200 
201 		cpuid(CPUID_TSC_LEAF, &eax, &ebx, &ecx, &edx);
202 
203 		pt_pmu.tsc_art_num = ebx;
204 		pt_pmu.tsc_art_den = eax;
205 	}
206 
207 	if (boot_cpu_has(X86_FEATURE_VMX)) {
208 		/*
209 		 * Intel SDM, 36.5 "Tracing post-VMXON" says that
210 		 * "IA32_VMX_MISC[bit 14]" being 1 means PT can trace
211 		 * post-VMXON.
212 		 */
213 		rdmsrl(MSR_IA32_VMX_MISC, reg);
214 		if (reg & BIT(14))
215 			pt_pmu.vmx = true;
216 	}
217 
218 	attrs = NULL;
219 
220 	for (i = 0; i < PT_CPUID_LEAVES; i++) {
221 		cpuid_count(20, i,
222 			    &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM],
223 			    &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM],
224 			    &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM],
225 			    &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]);
226 	}
227 
228 	ret = -ENOMEM;
229 	size = sizeof(struct attribute *) * (ARRAY_SIZE(pt_caps)+1);
230 	attrs = kzalloc(size, GFP_KERNEL);
231 	if (!attrs)
232 		goto fail;
233 
234 	size = sizeof(struct dev_ext_attribute) * (ARRAY_SIZE(pt_caps)+1);
235 	de_attrs = kzalloc(size, GFP_KERNEL);
236 	if (!de_attrs)
237 		goto fail;
238 
239 	for (i = 0; i < ARRAY_SIZE(pt_caps); i++) {
240 		struct dev_ext_attribute *de_attr = de_attrs + i;
241 
242 		de_attr->attr.attr.name = pt_caps[i].name;
243 
244 		sysfs_attr_init(&de_attr->attr.attr);
245 
246 		de_attr->attr.attr.mode		= S_IRUGO;
247 		de_attr->attr.show		= pt_cap_show;
248 		de_attr->var			= (void *)i;
249 
250 		attrs[i] = &de_attr->attr.attr;
251 	}
252 
253 	pt_cap_group.attrs = attrs;
254 
255 	return 0;
256 
257 fail:
258 	kfree(attrs);
259 
260 	return ret;
261 }
262 
/* Cycle-accurate mode and PSB frequency controls, validated together */
#define RTIT_CTL_CYC_PSB \
	(RTIT_CTL_CYCLEACC | RTIT_CTL_CYC_THRESH | RTIT_CTL_PSB_FREQ)

/* MTC packet enable plus its period field */
#define RTIT_CTL_MTC \
	(RTIT_CTL_MTC_EN | RTIT_CTL_MTC_RANGE)

/* PTWRITE enable plus FUP-on-PTWRITE */
#define RTIT_CTL_PTW \
	(RTIT_CTL_PTW_EN | RTIT_CTL_FUP_ON_PTW)

/* Every RTIT_CTL bit that userspace may request through attr.config */
#define PT_CONFIG_MASK \
	(RTIT_CTL_TSC_EN | RTIT_CTL_DISRETC | RTIT_CTL_CYC_PSB | \
	 RTIT_CTL_MTC | RTIT_CTL_PWR_EVT_EN | RTIT_CTL_PTW)
280 
pt_event_valid(struct perf_event * event)281 static bool pt_event_valid(struct perf_event *event)
282 {
283 	u64 config = event->attr.config;
284 	u64 allowed, requested;
285 
286 	if ((config & PT_CONFIG_MASK) != config)
287 		return false;
288 
289 	if (config & RTIT_CTL_CYC_PSB) {
290 		if (!pt_cap_get(PT_CAP_psb_cyc))
291 			return false;
292 
293 		allowed = pt_cap_get(PT_CAP_psb_periods);
294 		requested = (config & RTIT_CTL_PSB_FREQ) >>
295 			RTIT_CTL_PSB_FREQ_OFFSET;
296 		if (requested && (!(allowed & BIT(requested))))
297 			return false;
298 
299 		allowed = pt_cap_get(PT_CAP_cycle_thresholds);
300 		requested = (config & RTIT_CTL_CYC_THRESH) >>
301 			RTIT_CTL_CYC_THRESH_OFFSET;
302 		if (requested && (!(allowed & BIT(requested))))
303 			return false;
304 	}
305 
306 	if (config & RTIT_CTL_MTC) {
307 		/*
308 		 * In the unlikely case that CPUID lists valid mtc periods,
309 		 * but not the mtc capability, drop out here.
310 		 *
311 		 * Spec says that setting mtc period bits while mtc bit in
312 		 * CPUID is 0 will #GP, so better safe than sorry.
313 		 */
314 		if (!pt_cap_get(PT_CAP_mtc))
315 			return false;
316 
317 		allowed = pt_cap_get(PT_CAP_mtc_periods);
318 		if (!allowed)
319 			return false;
320 
321 		requested = (config & RTIT_CTL_MTC_RANGE) >>
322 			RTIT_CTL_MTC_RANGE_OFFSET;
323 
324 		if (!(allowed & BIT(requested)))
325 			return false;
326 	}
327 
328 	if (config & RTIT_CTL_PWR_EVT_EN &&
329 	    !pt_cap_get(PT_CAP_power_event_trace))
330 		return false;
331 
332 	if (config & RTIT_CTL_PTW) {
333 		if (!pt_cap_get(PT_CAP_ptwrite))
334 			return false;
335 
336 		/* FUPonPTW without PTW doesn't make sense */
337 		if ((config & RTIT_CTL_FUP_ON_PTW) &&
338 		    !(config & RTIT_CTL_PTW_EN))
339 			return false;
340 	}
341 
342 	return true;
343 }
344 
/*
 * PT configuration helpers
 * These are all CPU-affine and operate on the local CPU's PT
 */
349 
350 /* Address ranges and their corresponding msr configuration registers */
351 static const struct pt_address_range {
352 	unsigned long	msr_a;
353 	unsigned long	msr_b;
354 	unsigned int	reg_off;
355 } pt_address_ranges[] = {
356 	{
357 		.msr_a	 = MSR_IA32_RTIT_ADDR0_A,
358 		.msr_b	 = MSR_IA32_RTIT_ADDR0_B,
359 		.reg_off = RTIT_CTL_ADDR0_OFFSET,
360 	},
361 	{
362 		.msr_a	 = MSR_IA32_RTIT_ADDR1_A,
363 		.msr_b	 = MSR_IA32_RTIT_ADDR1_B,
364 		.reg_off = RTIT_CTL_ADDR1_OFFSET,
365 	},
366 	{
367 		.msr_a	 = MSR_IA32_RTIT_ADDR2_A,
368 		.msr_b	 = MSR_IA32_RTIT_ADDR2_B,
369 		.reg_off = RTIT_CTL_ADDR2_OFFSET,
370 	},
371 	{
372 		.msr_a	 = MSR_IA32_RTIT_ADDR3_A,
373 		.msr_b	 = MSR_IA32_RTIT_ADDR3_B,
374 		.reg_off = RTIT_CTL_ADDR3_OFFSET,
375 	}
376 };
377 
pt_config_filters(struct perf_event * event)378 static u64 pt_config_filters(struct perf_event *event)
379 {
380 	struct pt_filters *filters = event->hw.addr_filters;
381 	struct pt *pt = this_cpu_ptr(&pt_ctx);
382 	unsigned int range = 0;
383 	u64 rtit_ctl = 0;
384 
385 	if (!filters)
386 		return 0;
387 
388 	perf_event_addr_filters_sync(event);
389 
390 	for (range = 0; range < filters->nr_filters; range++) {
391 		struct pt_filter *filter = &filters->filter[range];
392 
393 		/*
394 		 * Note, if the range has zero start/end addresses due
395 		 * to its dynamic object not being loaded yet, we just
396 		 * go ahead and program zeroed range, which will simply
397 		 * produce no data. Note^2: if executable code at 0x0
398 		 * is a concern, we can set up an "invalid" configuration
399 		 * such as msr_b < msr_a.
400 		 */
401 
402 		/* avoid redundant msr writes */
403 		if (pt->filters.filter[range].msr_a != filter->msr_a) {
404 			wrmsrl(pt_address_ranges[range].msr_a, filter->msr_a);
405 			pt->filters.filter[range].msr_a = filter->msr_a;
406 		}
407 
408 		if (pt->filters.filter[range].msr_b != filter->msr_b) {
409 			wrmsrl(pt_address_ranges[range].msr_b, filter->msr_b);
410 			pt->filters.filter[range].msr_b = filter->msr_b;
411 		}
412 
413 		rtit_ctl |= filter->config << pt_address_ranges[range].reg_off;
414 	}
415 
416 	return rtit_ctl;
417 }
418 
pt_config(struct perf_event * event)419 static void pt_config(struct perf_event *event)
420 {
421 	u64 reg;
422 
423 	if (!event->hw.itrace_started) {
424 		event->hw.itrace_started = 1;
425 		wrmsrl(MSR_IA32_RTIT_STATUS, 0);
426 	}
427 
428 	reg = pt_config_filters(event);
429 	reg |= RTIT_CTL_TOPA | RTIT_CTL_BRANCH_EN | RTIT_CTL_TRACEEN;
430 
431 	if (!event->attr.exclude_kernel)
432 		reg |= RTIT_CTL_OS;
433 	if (!event->attr.exclude_user)
434 		reg |= RTIT_CTL_USR;
435 
436 	reg |= (event->attr.config & PT_CONFIG_MASK);
437 
438 	event->hw.config = reg;
439 	wrmsrl(MSR_IA32_RTIT_CTL, reg);
440 }
441 
pt_config_stop(struct perf_event * event)442 static void pt_config_stop(struct perf_event *event)
443 {
444 	u64 ctl = READ_ONCE(event->hw.config);
445 
446 	/* may be already stopped by a PMI */
447 	if (!(ctl & RTIT_CTL_TRACEEN))
448 		return;
449 
450 	ctl &= ~RTIT_CTL_TRACEEN;
451 	wrmsrl(MSR_IA32_RTIT_CTL, ctl);
452 
453 	WRITE_ONCE(event->hw.config, ctl);
454 
455 	/*
456 	 * A wrmsr that disables trace generation serializes other PT
457 	 * registers and causes all data packets to be written to memory,
458 	 * but a fence is required for the data to become globally visible.
459 	 *
460 	 * The below WMB, separating data store and aux_head store matches
461 	 * the consumer's RMB that separates aux_head load and data load.
462 	 */
463 	wmb();
464 }
465 
pt_config_buffer(void * buf,unsigned int topa_idx,unsigned int output_off)466 static void pt_config_buffer(void *buf, unsigned int topa_idx,
467 			     unsigned int output_off)
468 {
469 	u64 reg;
470 
471 	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, virt_to_phys(buf));
472 
473 	reg = 0x7f | ((u64)topa_idx << 7) | ((u64)output_off << 32);
474 
475 	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, reg);
476 }
477 
478 /*
479  * Keep ToPA table-related metadata on the same page as the actual table,
480  * taking up a few words from the top
481  */
482 
483 #define TENTS_PER_PAGE (((PAGE_SIZE - 40) / sizeof(struct topa_entry)) - 1)
484 
485 /**
486  * struct topa - page-sized ToPA table with metadata at the top
487  * @table:	actual ToPA table entries, as understood by PT hardware
488  * @list:	linkage to struct pt_buffer's list of tables
489  * @phys:	physical address of this page
490  * @offset:	offset of the first entry in this table in the buffer
491  * @size:	total size of all entries in this table
492  * @last:	index of the last initialized entry in this table
493  */
494 struct topa {
495 	struct topa_entry	table[TENTS_PER_PAGE];
496 	struct list_head	list;
497 	u64			phys;
498 	u64			offset;
499 	size_t			size;
500 	int			last;
501 };
502 
503 /* make -1 stand for the last table entry */
504 #define TOPA_ENTRY(t, i) ((i) == -1 ? &(t)->table[(t)->last] : &(t)->table[(i)])
505 
506 /**
507  * topa_alloc() - allocate page-sized ToPA table
508  * @cpu:	CPU on which to allocate.
509  * @gfp:	Allocation flags.
510  *
511  * Return:	On success, return the pointer to ToPA table page.
512  */
topa_alloc(int cpu,gfp_t gfp)513 static struct topa *topa_alloc(int cpu, gfp_t gfp)
514 {
515 	int node = cpu_to_node(cpu);
516 	struct topa *topa;
517 	struct page *p;
518 
519 	p = alloc_pages_node(node, gfp | __GFP_ZERO, 0);
520 	if (!p)
521 		return NULL;
522 
523 	topa = page_address(p);
524 	topa->last = 0;
525 	topa->phys = page_to_phys(p);
526 
527 	/*
528 	 * In case of singe-entry ToPA, always put the self-referencing END
529 	 * link as the 2nd entry in the table
530 	 */
531 	if (!pt_cap_get(PT_CAP_topa_multiple_entries)) {
532 		TOPA_ENTRY(topa, 1)->base = topa->phys >> TOPA_SHIFT;
533 		TOPA_ENTRY(topa, 1)->end = 1;
534 	}
535 
536 	return topa;
537 }
538 
/**
 * topa_free() - release a ToPA table page
 * @topa:	Table to deallocate; its address is handed to free_page().
 */
static void topa_free(struct topa *topa)
{
	unsigned long addr = (unsigned long)topa;

	free_page(addr);
}
547 
548 /**
549  * topa_insert_table() - insert a ToPA table into a buffer
550  * @buf:	 PT buffer that's being extended.
551  * @topa:	 New topa table to be inserted.
552  *
553  * If it's the first table in this buffer, set up buffer's pointers
554  * accordingly; otherwise, add a END=1 link entry to @topa to the current
555  * "last" table and adjust the last table pointer to @topa.
556  */
topa_insert_table(struct pt_buffer * buf,struct topa * topa)557 static void topa_insert_table(struct pt_buffer *buf, struct topa *topa)
558 {
559 	struct topa *last = buf->last;
560 
561 	list_add_tail(&topa->list, &buf->tables);
562 
563 	if (!buf->first) {
564 		buf->first = buf->last = buf->cur = topa;
565 		return;
566 	}
567 
568 	topa->offset = last->offset + last->size;
569 	buf->last = topa;
570 
571 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
572 		return;
573 
574 	BUG_ON(last->last != TENTS_PER_PAGE - 1);
575 
576 	TOPA_ENTRY(last, -1)->base = topa->phys >> TOPA_SHIFT;
577 	TOPA_ENTRY(last, -1)->end = 1;
578 }
579 
580 /**
581  * topa_table_full() - check if a ToPA table is filled up
582  * @topa:	ToPA table.
583  */
topa_table_full(struct topa * topa)584 static bool topa_table_full(struct topa *topa)
585 {
586 	/* single-entry ToPA is a special case */
587 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
588 		return !!topa->last;
589 
590 	return topa->last == TENTS_PER_PAGE - 1;
591 }
592 
593 /**
594  * topa_insert_pages() - create a list of ToPA tables
595  * @buf:	PT buffer being initialized.
596  * @gfp:	Allocation flags.
597  *
598  * This initializes a list of ToPA tables with entries from
599  * the data_pages provided by rb_alloc_aux().
600  *
601  * Return:	0 on success or error code.
602  */
topa_insert_pages(struct pt_buffer * buf,gfp_t gfp)603 static int topa_insert_pages(struct pt_buffer *buf, gfp_t gfp)
604 {
605 	struct topa *topa = buf->last;
606 	int order = 0;
607 	struct page *p;
608 
609 	p = virt_to_page(buf->data_pages[buf->nr_pages]);
610 	if (PagePrivate(p))
611 		order = page_private(p);
612 
613 	if (topa_table_full(topa)) {
614 		topa = topa_alloc(buf->cpu, gfp);
615 		if (!topa)
616 			return -ENOMEM;
617 
618 		topa_insert_table(buf, topa);
619 	}
620 
621 	TOPA_ENTRY(topa, -1)->base = page_to_phys(p) >> TOPA_SHIFT;
622 	TOPA_ENTRY(topa, -1)->size = order;
623 	if (!buf->snapshot && !pt_cap_get(PT_CAP_topa_multiple_entries)) {
624 		TOPA_ENTRY(topa, -1)->intr = 1;
625 		TOPA_ENTRY(topa, -1)->stop = 1;
626 	}
627 
628 	topa->last++;
629 	topa->size += sizes(order);
630 
631 	buf->nr_pages += 1ul << order;
632 
633 	return 0;
634 }
635 
636 /**
637  * pt_topa_dump() - print ToPA tables and their entries
638  * @buf:	PT buffer.
639  */
pt_topa_dump(struct pt_buffer * buf)640 static void pt_topa_dump(struct pt_buffer *buf)
641 {
642 	struct topa *topa;
643 
644 	list_for_each_entry(topa, &buf->tables, list) {
645 		int i;
646 
647 		pr_debug("# table @%p (%016Lx), off %llx size %zx\n", topa->table,
648 			 topa->phys, topa->offset, topa->size);
649 		for (i = 0; i < TENTS_PER_PAGE; i++) {
650 			pr_debug("# entry @%p (%lx sz %u %c%c%c) raw=%16llx\n",
651 				 &topa->table[i],
652 				 (unsigned long)topa->table[i].base << TOPA_SHIFT,
653 				 sizes(topa->table[i].size),
654 				 topa->table[i].end ?  'E' : ' ',
655 				 topa->table[i].intr ? 'I' : ' ',
656 				 topa->table[i].stop ? 'S' : ' ',
657 				 *(u64 *)&topa->table[i]);
658 			if ((pt_cap_get(PT_CAP_topa_multiple_entries) &&
659 			     topa->table[i].stop) ||
660 			    topa->table[i].end)
661 				break;
662 		}
663 	}
664 }
665 
666 /**
667  * pt_buffer_advance() - advance to the next output region
668  * @buf:	PT buffer.
669  *
670  * Advance the current pointers in the buffer to the next ToPA entry.
671  */
pt_buffer_advance(struct pt_buffer * buf)672 static void pt_buffer_advance(struct pt_buffer *buf)
673 {
674 	buf->output_off = 0;
675 	buf->cur_idx++;
676 
677 	if (buf->cur_idx == buf->cur->last) {
678 		if (buf->cur == buf->last)
679 			buf->cur = buf->first;
680 		else
681 			buf->cur = list_entry(buf->cur->list.next, struct topa,
682 					      list);
683 		buf->cur_idx = 0;
684 	}
685 }
686 
687 /**
688  * pt_update_head() - calculate current offsets and sizes
689  * @pt:		Per-cpu pt context.
690  *
691  * Update buffer's current write pointer position and data size.
692  */
pt_update_head(struct pt * pt)693 static void pt_update_head(struct pt *pt)
694 {
695 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
696 	u64 topa_idx, base, old;
697 
698 	/* offset of the first region in this table from the beginning of buf */
699 	base = buf->cur->offset + buf->output_off;
700 
701 	/* offset of the current output region within this table */
702 	for (topa_idx = 0; topa_idx < buf->cur_idx; topa_idx++)
703 		base += sizes(buf->cur->table[topa_idx].size);
704 
705 	if (buf->snapshot) {
706 		local_set(&buf->data_size, base);
707 	} else {
708 		old = (local64_xchg(&buf->head, base) &
709 		       ((buf->nr_pages << PAGE_SHIFT) - 1));
710 		if (base < old)
711 			base += buf->nr_pages << PAGE_SHIFT;
712 
713 		local_add(base - old, &buf->data_size);
714 	}
715 }
716 
717 /**
718  * pt_buffer_region() - obtain current output region's address
719  * @buf:	PT buffer.
720  */
pt_buffer_region(struct pt_buffer * buf)721 static void *pt_buffer_region(struct pt_buffer *buf)
722 {
723 	return phys_to_virt(buf->cur->table[buf->cur_idx].base << TOPA_SHIFT);
724 }
725 
726 /**
727  * pt_buffer_region_size() - obtain current output region's size
728  * @buf:	PT buffer.
729  */
pt_buffer_region_size(struct pt_buffer * buf)730 static size_t pt_buffer_region_size(struct pt_buffer *buf)
731 {
732 	return sizes(buf->cur->table[buf->cur_idx].size);
733 }
734 
735 /**
736  * pt_handle_status() - take care of possible status conditions
737  * @pt:		Per-cpu pt context.
738  */
pt_handle_status(struct pt * pt)739 static void pt_handle_status(struct pt *pt)
740 {
741 	struct pt_buffer *buf = perf_get_aux(&pt->handle);
742 	int advance = 0;
743 	u64 status;
744 
745 	rdmsrl(MSR_IA32_RTIT_STATUS, status);
746 
747 	if (status & RTIT_STATUS_ERROR) {
748 		pr_err_ratelimited("ToPA ERROR encountered, trying to recover\n");
749 		pt_topa_dump(buf);
750 		status &= ~RTIT_STATUS_ERROR;
751 	}
752 
753 	if (status & RTIT_STATUS_STOPPED) {
754 		status &= ~RTIT_STATUS_STOPPED;
755 
756 		/*
757 		 * On systems that only do single-entry ToPA, hitting STOP
758 		 * means we are already losing data; need to let the decoder
759 		 * know.
760 		 */
761 		if (!pt_cap_get(PT_CAP_topa_multiple_entries) ||
762 		    buf->output_off == sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) {
763 			local_inc(&buf->lost);
764 			advance++;
765 		}
766 	}
767 
768 	/*
769 	 * Also on single-entry ToPA implementations, interrupt will come
770 	 * before the output reaches its output region's boundary.
771 	 */
772 	if (!pt_cap_get(PT_CAP_topa_multiple_entries) && !buf->snapshot &&
773 	    pt_buffer_region_size(buf) - buf->output_off <= TOPA_PMI_MARGIN) {
774 		void *head = pt_buffer_region(buf);
775 
776 		/* everything within this margin needs to be zeroed out */
777 		memset(head + buf->output_off, 0,
778 		       pt_buffer_region_size(buf) -
779 		       buf->output_off);
780 		advance++;
781 	}
782 
783 	if (advance)
784 		pt_buffer_advance(buf);
785 
786 	wrmsrl(MSR_IA32_RTIT_STATUS, status);
787 }
788 
789 /**
790  * pt_read_offset() - translate registers into buffer pointers
791  * @buf:	PT buffer.
792  *
793  * Set buffer's output pointers from MSR values.
794  */
pt_read_offset(struct pt_buffer * buf)795 static void pt_read_offset(struct pt_buffer *buf)
796 {
797 	u64 offset, base_topa;
798 
799 	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, base_topa);
800 	buf->cur = phys_to_virt(base_topa);
801 
802 	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, offset);
803 	/* offset within current output region */
804 	buf->output_off = offset >> 32;
805 	/* index of current output region within this table */
806 	buf->cur_idx = (offset & 0xffffff80) >> 7;
807 }
808 
809 /**
810  * pt_topa_next_entry() - obtain index of the first page in the next ToPA entry
811  * @buf:	PT buffer.
812  * @pg:		Page offset in the buffer.
813  *
814  * When advancing to the next output region (ToPA entry), given a page offset
815  * into the buffer, we need to find the offset of the first page in the next
816  * region.
817  */
pt_topa_next_entry(struct pt_buffer * buf,unsigned int pg)818 static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg)
819 {
820 	struct topa_entry *te = buf->topa_index[pg];
821 
822 	/* one region */
823 	if (buf->first == buf->last && buf->first->last == 1)
824 		return pg;
825 
826 	do {
827 		pg++;
828 		pg &= buf->nr_pages - 1;
829 	} while (buf->topa_index[pg] == te);
830 
831 	return pg;
832 }
833 
834 /**
835  * pt_buffer_reset_markers() - place interrupt and stop bits in the buffer
836  * @buf:	PT buffer.
837  * @handle:	Current output handle.
838  *
839  * Place INT and STOP marks to prevent overwriting old data that the consumer
840  * hasn't yet collected and waking up the consumer after a certain fraction of
841  * the buffer has filled up. Only needed and sensible for non-snapshot counters.
842  *
843  * This obviously relies on buf::head to figure out buffer markers, so it has
844  * to be called after pt_buffer_reset_offsets() and before the hardware tracing
845  * is enabled.
846  */
pt_buffer_reset_markers(struct pt_buffer * buf,struct perf_output_handle * handle)847 static int pt_buffer_reset_markers(struct pt_buffer *buf,
848 				   struct perf_output_handle *handle)
849 
850 {
851 	unsigned long head = local64_read(&buf->head);
852 	unsigned long idx, npages, wakeup;
853 
854 	/* can't stop in the middle of an output region */
855 	if (buf->output_off + handle->size + 1 <
856 	    sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size))
857 		return -EINVAL;
858 
859 
860 	/* single entry ToPA is handled by marking all regions STOP=1 INT=1 */
861 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
862 		return 0;
863 
864 	/* clear STOP and INT from current entry */
865 	buf->topa_index[buf->stop_pos]->stop = 0;
866 	buf->topa_index[buf->stop_pos]->intr = 0;
867 	buf->topa_index[buf->intr_pos]->intr = 0;
868 
869 	/* how many pages till the STOP marker */
870 	npages = handle->size >> PAGE_SHIFT;
871 
872 	/* if it's on a page boundary, fill up one more page */
873 	if (!offset_in_page(head + handle->size + 1))
874 		npages++;
875 
876 	idx = (head >> PAGE_SHIFT) + npages;
877 	idx &= buf->nr_pages - 1;
878 	buf->stop_pos = idx;
879 
880 	wakeup = handle->wakeup >> PAGE_SHIFT;
881 
882 	/* in the worst case, wake up the consumer one page before hard stop */
883 	idx = (head >> PAGE_SHIFT) + npages - 1;
884 	if (idx > wakeup)
885 		idx = wakeup;
886 
887 	idx &= buf->nr_pages - 1;
888 	buf->intr_pos = idx;
889 
890 	buf->topa_index[buf->stop_pos]->stop = 1;
891 	buf->topa_index[buf->stop_pos]->intr = 1;
892 	buf->topa_index[buf->intr_pos]->intr = 1;
893 
894 	return 0;
895 }
896 
897 /**
898  * pt_buffer_setup_topa_index() - build topa_index[] table of regions
899  * @buf:	PT buffer.
900  *
901  * topa_index[] references output regions indexed by offset into the
902  * buffer for purposes of quick reverse lookup.
903  */
pt_buffer_setup_topa_index(struct pt_buffer * buf)904 static void pt_buffer_setup_topa_index(struct pt_buffer *buf)
905 {
906 	struct topa *cur = buf->first, *prev = buf->last;
907 	struct topa_entry *te_cur = TOPA_ENTRY(cur, 0),
908 		*te_prev = TOPA_ENTRY(prev, prev->last - 1);
909 	int pg = 0, idx = 0;
910 
911 	while (pg < buf->nr_pages) {
912 		int tidx;
913 
914 		/* pages within one topa entry */
915 		for (tidx = 0; tidx < 1 << te_cur->size; tidx++, pg++)
916 			buf->topa_index[pg] = te_prev;
917 
918 		te_prev = te_cur;
919 
920 		if (idx == cur->last - 1) {
921 			/* advance to next topa table */
922 			idx = 0;
923 			cur = list_entry(cur->list.next, struct topa, list);
924 		} else {
925 			idx++;
926 		}
927 		te_cur = TOPA_ENTRY(cur, idx);
928 	}
929 
930 }
931 
932 /**
933  * pt_buffer_reset_offsets() - adjust buffer's write pointers from aux_head
934  * @buf:	PT buffer.
935  * @head:	Write pointer (aux_head) from AUX buffer.
936  *
937  * Find the ToPA table and entry corresponding to given @head and set buffer's
938  * "current" pointers accordingly. This is done after we have obtained the
939  * current aux_head position from a successful call to perf_aux_output_begin()
940  * to make sure the hardware is writing to the right place.
941  *
942  * This function modifies buf::{cur,cur_idx,output_off} that will be programmed
943  * into PT msrs when the tracing is enabled and buf::head and buf::data_size,
944  * which are used to determine INT and STOP markers' locations by a subsequent
945  * call to pt_buffer_reset_markers().
946  */
pt_buffer_reset_offsets(struct pt_buffer * buf,unsigned long head)947 static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head)
948 {
949 	int pg;
950 
951 	if (buf->snapshot)
952 		head &= (buf->nr_pages << PAGE_SHIFT) - 1;
953 
954 	pg = (head >> PAGE_SHIFT) & (buf->nr_pages - 1);
955 	pg = pt_topa_next_entry(buf, pg);
956 
957 	buf->cur = (struct topa *)((unsigned long)buf->topa_index[pg] & PAGE_MASK);
958 	buf->cur_idx = ((unsigned long)buf->topa_index[pg] -
959 			(unsigned long)buf->cur) / sizeof(struct topa_entry);
960 	buf->output_off = head & (sizes(buf->cur->table[buf->cur_idx].size) - 1);
961 
962 	local64_set(&buf->head, head);
963 	local_set(&buf->data_size, 0);
964 }
965 
966 /**
967  * pt_buffer_fini_topa() - deallocate ToPA structure of a buffer
968  * @buf:	PT buffer.
969  */
pt_buffer_fini_topa(struct pt_buffer * buf)970 static void pt_buffer_fini_topa(struct pt_buffer *buf)
971 {
972 	struct topa *topa, *iter;
973 
974 	list_for_each_entry_safe(topa, iter, &buf->tables, list) {
975 		/*
976 		 * right now, this is in free_aux() path only, so
977 		 * no need to unlink this table from the list
978 		 */
979 		topa_free(topa);
980 	}
981 }
982 
983 /**
984  * pt_buffer_init_topa() - initialize ToPA table for pt buffer
985  * @buf:	PT buffer.
986  * @size:	Total size of all regions within this ToPA.
987  * @gfp:	Allocation flags.
988  */
pt_buffer_init_topa(struct pt_buffer * buf,unsigned long nr_pages,gfp_t gfp)989 static int pt_buffer_init_topa(struct pt_buffer *buf, unsigned long nr_pages,
990 			       gfp_t gfp)
991 {
992 	struct topa *topa;
993 	int err;
994 
995 	topa = topa_alloc(buf->cpu, gfp);
996 	if (!topa)
997 		return -ENOMEM;
998 
999 	topa_insert_table(buf, topa);
1000 
1001 	while (buf->nr_pages < nr_pages) {
1002 		err = topa_insert_pages(buf, gfp);
1003 		if (err) {
1004 			pt_buffer_fini_topa(buf);
1005 			return -ENOMEM;
1006 		}
1007 	}
1008 
1009 	pt_buffer_setup_topa_index(buf);
1010 
1011 	/* link last table to the first one, unless we're double buffering */
1012 	if (pt_cap_get(PT_CAP_topa_multiple_entries)) {
1013 		TOPA_ENTRY(buf->last, -1)->base = buf->first->phys >> TOPA_SHIFT;
1014 		TOPA_ENTRY(buf->last, -1)->end = 1;
1015 	}
1016 
1017 	pt_topa_dump(buf);
1018 	return 0;
1019 }
1020 
1021 /**
1022  * pt_buffer_setup_aux() - set up topa tables for a PT buffer
1023  * @cpu:	Cpu on which to allocate, -1 means current.
1024  * @pages:	Array of pointers to buffer pages passed from perf core.
1025  * @nr_pages:	Number of pages in the buffer.
1026  * @snapshot:	If this is a snapshot/overwrite counter.
1027  *
1028  * This is a pmu::setup_aux callback that sets up ToPA tables and all the
1029  * bookkeeping for an AUX buffer.
1030  *
1031  * Return:	Our private PT buffer structure.
1032  */
1033 static void *
pt_buffer_setup_aux(int cpu,void ** pages,int nr_pages,bool snapshot)1034 pt_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool snapshot)
1035 {
1036 	struct pt_buffer *buf;
1037 	int node, ret;
1038 
1039 	if (!nr_pages)
1040 		return NULL;
1041 
1042 	if (cpu == -1)
1043 		cpu = raw_smp_processor_id();
1044 	node = cpu_to_node(cpu);
1045 
1046 	buf = kzalloc_node(offsetof(struct pt_buffer, topa_index[nr_pages]),
1047 			   GFP_KERNEL, node);
1048 	if (!buf)
1049 		return NULL;
1050 
1051 	buf->cpu = cpu;
1052 	buf->snapshot = snapshot;
1053 	buf->data_pages = pages;
1054 
1055 	INIT_LIST_HEAD(&buf->tables);
1056 
1057 	ret = pt_buffer_init_topa(buf, nr_pages, GFP_KERNEL);
1058 	if (ret) {
1059 		kfree(buf);
1060 		return NULL;
1061 	}
1062 
1063 	return buf;
1064 }
1065 
/**
 * pt_buffer_free_aux() - perf AUX deallocation path callback
 * @data:	PT buffer, as returned by pt_buffer_setup_aux().
 *
 * Releases the ToPA tables first and the buffer structure itself last.
 */
static void pt_buffer_free_aux(void *data)
{
	pt_buffer_fini_topa(data);
	kfree(data);
}
1077 
pt_addr_filters_init(struct perf_event * event)1078 static int pt_addr_filters_init(struct perf_event *event)
1079 {
1080 	struct pt_filters *filters;
1081 	int node = event->cpu == -1 ? -1 : cpu_to_node(event->cpu);
1082 
1083 	if (!pt_cap_get(PT_CAP_num_address_ranges))
1084 		return 0;
1085 
1086 	filters = kzalloc_node(sizeof(struct pt_filters), GFP_KERNEL, node);
1087 	if (!filters)
1088 		return -ENOMEM;
1089 
1090 	if (event->parent)
1091 		memcpy(filters, event->parent->hw.addr_filters,
1092 		       sizeof(*filters));
1093 
1094 	event->hw.addr_filters = filters;
1095 
1096 	return 0;
1097 }
1098 
pt_addr_filters_fini(struct perf_event * event)1099 static void pt_addr_filters_fini(struct perf_event *event)
1100 {
1101 	kfree(event->hw.addr_filters);
1102 	event->hw.addr_filters = NULL;
1103 }
1104 
/*
 * An address is accepted only if it passes both virt_addr_valid() and
 * kernel_ip(), checked in that order.
 */
static inline bool valid_kernel_ip(unsigned long ip)
{
	if (!virt_addr_valid(ip))
		return false;

	return kernel_ip(ip);
}
1109 
pt_event_addr_filters_validate(struct list_head * filters)1110 static int pt_event_addr_filters_validate(struct list_head *filters)
1111 {
1112 	struct perf_addr_filter *filter;
1113 	int range = 0;
1114 
1115 	list_for_each_entry(filter, filters, entry) {
1116 		/* PT doesn't support single address triggers */
1117 		if (!filter->range || !filter->size)
1118 			return -EOPNOTSUPP;
1119 
1120 		if (!filter->inode) {
1121 			if (!valid_kernel_ip(filter->offset))
1122 				return -EINVAL;
1123 
1124 			if (!valid_kernel_ip(filter->offset + filter->size))
1125 				return -EINVAL;
1126 		}
1127 
1128 		if (++range > pt_cap_get(PT_CAP_num_address_ranges))
1129 			return -EOPNOTSUPP;
1130 	}
1131 
1132 	return 0;
1133 }
1134 
pt_event_addr_filters_sync(struct perf_event * event)1135 static void pt_event_addr_filters_sync(struct perf_event *event)
1136 {
1137 	struct perf_addr_filters_head *head = perf_event_addr_filters(event);
1138 	unsigned long msr_a, msr_b, *offs = event->addr_filters_offs;
1139 	struct pt_filters *filters = event->hw.addr_filters;
1140 	struct perf_addr_filter *filter;
1141 	int range = 0;
1142 
1143 	if (!filters)
1144 		return;
1145 
1146 	list_for_each_entry(filter, &head->list, entry) {
1147 		if (filter->inode && !offs[range]) {
1148 			msr_a = msr_b = 0;
1149 		} else {
1150 			/* apply the offset */
1151 			msr_a = filter->offset + offs[range];
1152 			msr_b = filter->size + msr_a - 1;
1153 		}
1154 
1155 		filters->filter[range].msr_a  = msr_a;
1156 		filters->filter[range].msr_b  = msr_b;
1157 		filters->filter[range].config = filter->filter ? 1 : 2;
1158 		range++;
1159 	}
1160 
1161 	filters->nr_filters = range;
1162 }
1163 
1164 /**
1165  * intel_pt_interrupt() - PT PMI handler
1166  */
intel_pt_interrupt(void)1167 void intel_pt_interrupt(void)
1168 {
1169 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1170 	struct pt_buffer *buf;
1171 	struct perf_event *event = pt->handle.event;
1172 
1173 	/*
1174 	 * There may be a dangling PT bit in the interrupt status register
1175 	 * after PT has been disabled by pt_event_stop(). Make sure we don't
1176 	 * do anything (particularly, re-enable) for this event here.
1177 	 */
1178 	if (!READ_ONCE(pt->handle_nmi))
1179 		return;
1180 
1181 	/*
1182 	 * If VMX is on and PT does not support it, don't touch anything.
1183 	 */
1184 	if (READ_ONCE(pt->vmx_on))
1185 		return;
1186 
1187 	if (!event)
1188 		return;
1189 
1190 	pt_config_stop(event);
1191 
1192 	buf = perf_get_aux(&pt->handle);
1193 	if (!buf)
1194 		return;
1195 
1196 	pt_read_offset(buf);
1197 
1198 	pt_handle_status(pt);
1199 
1200 	pt_update_head(pt);
1201 
1202 	perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
1203 			    local_xchg(&buf->lost, 0));
1204 
1205 	if (!event->hw.state) {
1206 		int ret;
1207 
1208 		buf = perf_aux_output_begin(&pt->handle, event);
1209 		if (!buf) {
1210 			event->hw.state = PERF_HES_STOPPED;
1211 			return;
1212 		}
1213 
1214 		pt_buffer_reset_offsets(buf, pt->handle.head);
1215 		/* snapshot counters don't use PMI, so it's safe */
1216 		ret = pt_buffer_reset_markers(buf, &pt->handle);
1217 		if (ret) {
1218 			perf_aux_output_end(&pt->handle, 0, true);
1219 			return;
1220 		}
1221 
1222 		pt_config_buffer(buf->cur->table, buf->cur_idx,
1223 				 buf->output_off);
1224 		pt_config(event);
1225 	}
1226 }
1227 
intel_pt_handle_vmx(int on)1228 void intel_pt_handle_vmx(int on)
1229 {
1230 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1231 	struct perf_event *event;
1232 	unsigned long flags;
1233 
1234 	/* PT plays nice with VMX, do nothing */
1235 	if (pt_pmu.vmx)
1236 		return;
1237 
1238 	/*
1239 	 * VMXON will clear RTIT_CTL.TraceEn; we need to make
1240 	 * sure to not try to set it while VMX is on. Disable
1241 	 * interrupts to avoid racing with pmu callbacks;
1242 	 * concurrent PMI should be handled fine.
1243 	 */
1244 	local_irq_save(flags);
1245 	WRITE_ONCE(pt->vmx_on, on);
1246 
1247 	if (on) {
1248 		/* prevent pt_config_stop() from writing RTIT_CTL */
1249 		event = pt->handle.event;
1250 		if (event)
1251 			event->hw.config = 0;
1252 	}
1253 	local_irq_restore(flags);
1254 }
1255 EXPORT_SYMBOL_GPL(intel_pt_handle_vmx);
1256 
1257 /*
1258  * PMU callbacks
1259  */
1260 
pt_event_start(struct perf_event * event,int mode)1261 static void pt_event_start(struct perf_event *event, int mode)
1262 {
1263 	struct hw_perf_event *hwc = &event->hw;
1264 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1265 	struct pt_buffer *buf;
1266 
1267 	if (READ_ONCE(pt->vmx_on))
1268 		return;
1269 
1270 	buf = perf_aux_output_begin(&pt->handle, event);
1271 	if (!buf)
1272 		goto fail_stop;
1273 
1274 	pt_buffer_reset_offsets(buf, pt->handle.head);
1275 	if (!buf->snapshot) {
1276 		if (pt_buffer_reset_markers(buf, &pt->handle))
1277 			goto fail_end_stop;
1278 	}
1279 
1280 	WRITE_ONCE(pt->handle_nmi, 1);
1281 	hwc->state = 0;
1282 
1283 	pt_config_buffer(buf->cur->table, buf->cur_idx,
1284 			 buf->output_off);
1285 	pt_config(event);
1286 
1287 	return;
1288 
1289 fail_end_stop:
1290 	perf_aux_output_end(&pt->handle, 0, true);
1291 fail_stop:
1292 	hwc->state = PERF_HES_STOPPED;
1293 }
1294 
pt_event_stop(struct perf_event * event,int mode)1295 static void pt_event_stop(struct perf_event *event, int mode)
1296 {
1297 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1298 
1299 	/*
1300 	 * Protect against the PMI racing with disabling wrmsr,
1301 	 * see comment in intel_pt_interrupt().
1302 	 */
1303 	WRITE_ONCE(pt->handle_nmi, 0);
1304 
1305 	pt_config_stop(event);
1306 
1307 	if (event->hw.state == PERF_HES_STOPPED)
1308 		return;
1309 
1310 	event->hw.state = PERF_HES_STOPPED;
1311 
1312 	if (mode & PERF_EF_UPDATE) {
1313 		struct pt_buffer *buf = perf_get_aux(&pt->handle);
1314 
1315 		if (!buf)
1316 			return;
1317 
1318 		if (WARN_ON_ONCE(pt->handle.event != event))
1319 			return;
1320 
1321 		pt_read_offset(buf);
1322 
1323 		pt_handle_status(pt);
1324 
1325 		pt_update_head(pt);
1326 
1327 		if (buf->snapshot)
1328 			pt->handle.head =
1329 				local_xchg(&buf->data_size,
1330 					   buf->nr_pages << PAGE_SHIFT);
1331 		perf_aux_output_end(&pt->handle, local_xchg(&buf->data_size, 0),
1332 				    local_xchg(&buf->lost, 0));
1333 	}
1334 }
1335 
pt_event_del(struct perf_event * event,int mode)1336 static void pt_event_del(struct perf_event *event, int mode)
1337 {
1338 	pt_event_stop(event, PERF_EF_UPDATE);
1339 }
1340 
pt_event_add(struct perf_event * event,int mode)1341 static int pt_event_add(struct perf_event *event, int mode)
1342 {
1343 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1344 	struct hw_perf_event *hwc = &event->hw;
1345 	int ret = -EBUSY;
1346 
1347 	if (pt->handle.event)
1348 		goto fail;
1349 
1350 	if (mode & PERF_EF_START) {
1351 		pt_event_start(event, 0);
1352 		ret = -EINVAL;
1353 		if (hwc->state == PERF_HES_STOPPED)
1354 			goto fail;
1355 	} else {
1356 		hwc->state = PERF_HES_STOPPED;
1357 	}
1358 
1359 	ret = 0;
1360 fail:
1361 
1362 	return ret;
1363 }
1364 
/* pmu::read callback: intentionally empty, there is nothing to read here. */
static void pt_event_read(struct perf_event *event)
{
}
1368 
pt_event_destroy(struct perf_event * event)1369 static void pt_event_destroy(struct perf_event *event)
1370 {
1371 	pt_addr_filters_fini(event);
1372 	x86_del_exclusive(x86_lbr_exclusive_pt);
1373 }
1374 
pt_event_init(struct perf_event * event)1375 static int pt_event_init(struct perf_event *event)
1376 {
1377 	if (event->attr.type != pt_pmu.pmu.type)
1378 		return -ENOENT;
1379 
1380 	if (!pt_event_valid(event))
1381 		return -EINVAL;
1382 
1383 	if (x86_add_exclusive(x86_lbr_exclusive_pt))
1384 		return -EBUSY;
1385 
1386 	if (pt_addr_filters_init(event)) {
1387 		x86_del_exclusive(x86_lbr_exclusive_pt);
1388 		return -ENOMEM;
1389 	}
1390 
1391 	event->destroy = pt_event_destroy;
1392 
1393 	return 0;
1394 }
1395 
cpu_emergency_stop_pt(void)1396 void cpu_emergency_stop_pt(void)
1397 {
1398 	struct pt *pt = this_cpu_ptr(&pt_ctx);
1399 
1400 	if (pt->handle.event)
1401 		pt_event_stop(pt->handle.event, PERF_EF_UPDATE);
1402 }
1403 
pt_init(void)1404 static __init int pt_init(void)
1405 {
1406 	int ret, cpu, prior_warn = 0;
1407 
1408 	BUILD_BUG_ON(sizeof(struct topa) > PAGE_SIZE);
1409 
1410 	if (!boot_cpu_has(X86_FEATURE_INTEL_PT))
1411 		return -ENODEV;
1412 
1413 	get_online_cpus();
1414 	for_each_online_cpu(cpu) {
1415 		u64 ctl;
1416 
1417 		ret = rdmsrl_safe_on_cpu(cpu, MSR_IA32_RTIT_CTL, &ctl);
1418 		if (!ret && (ctl & RTIT_CTL_TRACEEN))
1419 			prior_warn++;
1420 	}
1421 	put_online_cpus();
1422 
1423 	if (prior_warn) {
1424 		x86_add_exclusive(x86_lbr_exclusive_pt);
1425 		pr_warn("PT is enabled at boot time, doing nothing\n");
1426 
1427 		return -EBUSY;
1428 	}
1429 
1430 	ret = pt_pmu_hw_init();
1431 	if (ret)
1432 		return ret;
1433 
1434 	if (!pt_cap_get(PT_CAP_topa_output)) {
1435 		pr_warn("ToPA output is not supported on this CPU\n");
1436 		return -ENODEV;
1437 	}
1438 
1439 	if (!pt_cap_get(PT_CAP_topa_multiple_entries))
1440 		pt_pmu.pmu.capabilities =
1441 			PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_AUX_SW_DOUBLEBUF;
1442 
1443 	pt_pmu.pmu.capabilities	|= PERF_PMU_CAP_EXCLUSIVE | PERF_PMU_CAP_ITRACE;
1444 	pt_pmu.pmu.attr_groups		 = pt_attr_groups;
1445 	pt_pmu.pmu.task_ctx_nr		 = perf_sw_context;
1446 	pt_pmu.pmu.event_init		 = pt_event_init;
1447 	pt_pmu.pmu.add			 = pt_event_add;
1448 	pt_pmu.pmu.del			 = pt_event_del;
1449 	pt_pmu.pmu.start		 = pt_event_start;
1450 	pt_pmu.pmu.stop			 = pt_event_stop;
1451 	pt_pmu.pmu.read			 = pt_event_read;
1452 	pt_pmu.pmu.setup_aux		 = pt_buffer_setup_aux;
1453 	pt_pmu.pmu.free_aux		 = pt_buffer_free_aux;
1454 	pt_pmu.pmu.addr_filters_sync     = pt_event_addr_filters_sync;
1455 	pt_pmu.pmu.addr_filters_validate = pt_event_addr_filters_validate;
1456 	pt_pmu.pmu.nr_addr_filters       =
1457 		pt_cap_get(PT_CAP_num_address_ranges);
1458 
1459 	ret = perf_pmu_register(&pt_pmu.pmu, "intel_pt", -1);
1460 
1461 	return ret;
1462 }
1463 arch_initcall(pt_init);
1464