/*
 * BTS PMU driver for perf
 * Copyright (c) 2013-2014, Intel Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */

#undef DEBUG

#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt

#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
#include <linux/device.h>
#include <linux/coredump.h>
#include <linux/kaiser.h>

#include <asm-generic/sizes.h>
#include <asm/perf_event.h>

#include "perf_event.h"
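/*
 * Per-CPU BTS tracing state: the perf AUX output handle for the event
 * currently owning BTS on this CPU, a backup of the debug store fields
 * we overwrite while tracing, and a flag telling the PMI handler whether
 * the counter should be (re)started.
 */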
struct bts_ctx {
	struct perf_output_handle	handle;
	struct debug_store		ds_back;
	int				started;
};

static DEFINE_PER_CPU(struct bts_ctx, bts_ctx);
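/*
 * A 64-bit BTS record is three 8-byte fields (branch-from, branch-to,
 * flags), hence 24 bytes.  The safety margin is how far below the end of
 * the current chunk the interrupt threshold is placed, so the PMI fires
 * before the buffer actually overflows.
 */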
#define BTS_RECORD_SIZE		24
#define BTS_SAFETY_MARGIN	4080
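/*
 * One physically contiguous chunk of the AUX buffer: the (possibly
 * high-order) backing page, the usable size in bytes, the chunk's offset
 * within the AUX buffer, and the displacement of the first
 * BTS_RECORD_SIZE-aligned byte inside the chunk.
 */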
struct bts_phys {
	struct page	*page;
	unsigned long	size;
	unsigned long	offset;
	unsigned long	displacement;
};
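/*
 * Software view of the AUX buffer handed to us by the perf core:
 * real_size is the total size rounded down to a multiple of
 * BTS_RECORD_SIZE, buf[] describes the physical chunks, and the local_t
 * counters track how much data has been produced since the last flush.
 */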
struct bts_buffer {
	size_t		real_size;	/* multiple of BTS_RECORD_SIZE */
	unsigned int	nr_pages;
	unsigned int	nr_bufs;
	unsigned int	cur_buf;
	bool		snapshot;
	local_t		data_size;
	local_t		lost;
	local_t		head;
	unsigned long	end;
	void		**data_pages;
	struct bts_phys	buf[0];
};

struct pmu bts_pmu;
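/*
 * Size in bytes of a (possibly high-order) AUX page; the allocation
 * order is stashed in page_private() by the perf AUX allocator.
 */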
static size_t buf_size(struct page *page)
{
	return 1 << (PAGE_SHIFT + page_private(page));
}
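/*
 * With page table isolation, the buffer pages were given KAISER shadow
 * mappings at setup time (presumably so the hardware can store records
 * while the user copy of the page tables is active); tear those mappings
 * down before the perf core frees the pages.
 */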
static void bts_buffer_free_aux(void *data)
{
#ifdef CONFIG_PAGE_TABLE_ISOLATION
	struct bts_buffer *buf = data;
	int nbuf;

	for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) {
		struct page *page = buf->buf[nbuf].page;
		void *kaddr = page_address(page);
		size_t page_size = buf_size(page);

		kaiser_remove_mapping((unsigned long)kaddr, page_size);
	}
#endif
	kfree(data);
}
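/*
 * Set up the software descriptor for the AUX buffer: walk the pages to
 * find the physically contiguous high-order chunks, add KAISER shadow
 * mappings for them, and record each chunk's offset/size/displacement so
 * records never straddle a physical boundary.
 */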
static void *
bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite)
{
	struct bts_buffer *buf;
	struct page *page;
	int node = (cpu == -1) ? cpu : cpu_to_node(cpu);
	unsigned long offset;
	size_t size = nr_pages << PAGE_SHIFT;
	int pg, nbuf, pad;

	/* count all the high order buffers */
	for (pg = 0, nbuf = 0; pg < nr_pages;) {
		page = virt_to_page(pages[pg]);
		if (WARN_ON_ONCE(!PagePrivate(page) && nr_pages > 1))
			return NULL;
		pg += 1 << page_private(page);
		nbuf++;
	}
	/*
	 * To avoid interrupts in overwrite mode, only allow one physical
	 * buffer.
	 */
	if (overwrite && nbuf > 1)
		return NULL;

	buf = kzalloc_node(offsetof(struct bts_buffer, buf[nbuf]), GFP_KERNEL, node);
	if (!buf)
		return NULL;

	buf->nr_pages = nr_pages;
	buf->nr_bufs = nbuf;
	buf->snapshot = overwrite;
	buf->data_pages = pages;
	buf->real_size = size - size % BTS_RECORD_SIZE;

	for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) {
		void *kaddr = pages[pg];
		size_t page_size;

		page = virt_to_page(kaddr);
		page_size = buf_size(page);

		if (kaiser_add_mapping((unsigned long)kaddr,
				       page_size, __PAGE_KERNEL) < 0) {
			buf->nr_bufs = nbuf;
			bts_buffer_free_aux(buf);
			return NULL;
		}

		buf->buf[nbuf].page = page;
		buf->buf[nbuf].offset = offset;
		buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0);
		buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement;
		pad = buf->buf[nbuf].size % BTS_RECORD_SIZE;
		buf->buf[nbuf].size -= pad;

		pg += page_size >> PAGE_SHIFT;
		offset += page_size;
	}

	return buf;
}
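/* Offset of the first usable byte of chunk @idx within the AUX buffer. */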
static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx)
{
	return buf->buf[idx].offset + buf->buf[idx].displacement;
}
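/*
 * Program this CPU's debug store for the current physical chunk: base,
 * write index and absolute maximum, plus an interrupt threshold placed
 * BTS_SAFETY_MARGIN (or at least one record) before the end.  In snapshot
 * mode the threshold is set past the maximum so no PMI is raised.
 */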
static void
bts_config_buffer(struct bts_buffer *buf)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_phys *phys = &buf->buf[buf->cur_buf];
	unsigned long index, thresh = 0, end = phys->size;
	struct page *page = phys->page;

	index = local_read(&buf->head);

	if (!buf->snapshot) {
		if (buf->end < phys->offset + buf_size(page))
			end = buf->end - phys->offset - phys->displacement;

		index -= phys->offset + phys->displacement;

		if (end - index > BTS_SAFETY_MARGIN)
			thresh = end - BTS_SAFETY_MARGIN;
		else if (end - index > BTS_RECORD_SIZE)
			thresh = end - BTS_RECORD_SIZE;
		else
			thresh = end;
	}

	ds->bts_buffer_base = (u64)(long)page_address(page) + phys->displacement;
	ds->bts_index = ds->bts_buffer_base + index;
	ds->bts_absolute_maximum = ds->bts_buffer_base + end;
	ds->bts_interrupt_threshold = !buf->snapshot
		? ds->bts_buffer_base + thresh
		: ds->bts_absolute_maximum + BTS_RECORD_SIZE;
}
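/*
 * Zero out the unused tail of the current chunk (from @head to its end)
 * so the consumer never sees stale records when we skip ahead to the
 * next chunk.
 */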
static void bts_buffer_pad_out(struct bts_phys *phys, unsigned long head)
{
	unsigned long index = head - phys->offset;

	memset(page_address(phys->page) + index, 0, phys->size - index);
}
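/*
 * True if the data already pending does not leave room for even one more
 * record within the space granted by the current AUX transaction.
 * Snapshot buffers simply wrap and are never considered full.
 */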
static bool bts_buffer_is_full(struct bts_buffer *buf, struct bts_ctx *bts)
{
	if (buf->snapshot)
		return false;

	if (local_read(&buf->data_size) >= bts->handle.size ||
	    bts->handle.size - local_read(&buf->data_size) < BTS_RECORD_SIZE)
		return true;

	return false;
}
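/*
 * Fold the hardware write pointer (ds->bts_index) back into the software
 * head and data_size counters; if the index has reached the absolute
 * maximum, bump the lost counter so the consumer is told about the
 * overflow.
 */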
static void bts_update(struct bts_ctx *bts)
{
	int cpu = raw_smp_processor_id();
	struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	unsigned long index = ds->bts_index - ds->bts_buffer_base, old, head;

	if (!buf)
		return;

	head = index + bts_buffer_offset(buf, buf->cur_buf);
	old = local_xchg(&buf->head, head);

	if (!buf->snapshot) {
		if (old == head)
			return;

		if (ds->bts_index >= ds->bts_absolute_maximum)
			local_inc(&buf->lost);

		/*
		 * old and head are always in the same physical buffer, so we
		 * can subtract them to get the data size.
		 */
		local_add(head - old, &buf->data_size);
	} else {
		local_set(&buf->data_size, head);
	}
}
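/*
 * Program the debug store for the current buffer state and enable BTS,
 * with ring (CPL) filtering and the PMI bit taken from the event's
 * attributes.  Returns silently if the buffer has no room left, leaving
 * the event in the stopped state.
 */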
static void __bts_event_start(struct perf_event *event)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = perf_get_aux(&bts->handle);
	u64 config = 0;

	if (!buf || bts_buffer_is_full(buf, bts))
		return;

	event->hw.itrace_started = 1;
	event->hw.state = 0;

	if (!buf->snapshot)
		config |= ARCH_PERFMON_EVENTSEL_INT;
	if (!event->attr.exclude_kernel)
		config |= ARCH_PERFMON_EVENTSEL_OS;
	if (!event->attr.exclude_user)
		config |= ARCH_PERFMON_EVENTSEL_USR;

	bts_config_buffer(buf);

	/*
	 * local barrier to make sure that ds configuration made it
	 * before we enable BTS
	 */
	wmb();

	intel_pmu_enable_bts(config);
}

static void bts_event_start(struct perf_event *event, int flags)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	__bts_event_start(event);

	/* PMI handler: this counter is running and likely generating PMIs */
	ACCESS_ONCE(bts->started) = 1;
}

static void __bts_event_stop(struct perf_event *event)
{
	/*
	 * No extra synchronization is mandated by the documentation to have
	 * BTS data stores globally visible.
	 */
	intel_pmu_disable_bts();

	if (event->hw.state & PERF_HES_STOPPED)
		return;

	ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED;
}

static void bts_event_stop(struct perf_event *event, int flags)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	/* PMI handler: don't restart this counter */
	ACCESS_ONCE(bts->started) = 0;

	__bts_event_stop(event);

	if (flags & PERF_EF_UPDATE)
		bts_update(bts);
}

void intel_bts_enable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	if (bts->handle.event && bts->started)
		__bts_event_start(bts->handle.event);
}

void intel_bts_disable_local(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);

	if (bts->handle.event)
		__bts_event_stop(bts->handle.event);
}
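/*
 * Resynchronize the software buffer with the AUX handle after a new
 * transaction has been opened: pick the physical chunk with enough room
 * (padding out and skipping the current one if necessary), clamp the
 * usable space to the wakeup watermark, and set buf->end accordingly.
 */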
static int
bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle)
{
	unsigned long head, space, next_space, pad, gap, skip, wakeup;
	unsigned int next_buf;
	struct bts_phys *phys, *next_phys;
	int ret;

	if (buf->snapshot)
		return 0;

	head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1);
	if (WARN_ON_ONCE(head != local_read(&buf->head)))
		return -EINVAL;

	phys = &buf->buf[buf->cur_buf];
	space = phys->offset + phys->displacement + phys->size - head;
	pad = space;
	if (space > handle->size) {
		space = handle->size;
		space -= space % BTS_RECORD_SIZE;
	}
	if (space <= BTS_SAFETY_MARGIN) {
		/* See if next phys buffer has more space */
		next_buf = buf->cur_buf + 1;
		if (next_buf >= buf->nr_bufs)
			next_buf = 0;
		next_phys = &buf->buf[next_buf];
		gap = buf_size(phys->page) - phys->displacement - phys->size +
			next_phys->displacement;
		skip = pad + gap;
		if (handle->size >= skip) {
			next_space = next_phys->size;
			if (next_space + skip > handle->size) {
				next_space = handle->size - skip;
				next_space -= next_space % BTS_RECORD_SIZE;
			}
			if (next_space > space || !space) {
				if (pad)
					bts_buffer_pad_out(phys, head);
				ret = perf_aux_output_skip(handle, skip);
				if (ret)
					return ret;
				/* Advance to next phys buffer */
				phys = next_phys;
				space = next_space;
				head = phys->offset + phys->displacement;
				/*
				 * After this, cur_buf and head won't match ds
				 * anymore, so we must not be racing with
				 * bts_update().
				 */
				buf->cur_buf = next_buf;
				local_set(&buf->head, head);
			}
		}
	}

	/* Don't go far beyond wakeup watermark */
	wakeup = BTS_SAFETY_MARGIN + BTS_RECORD_SIZE + handle->wakeup -
		 handle->head;
	if (space > wakeup) {
		space = wakeup;
		space -= space % BTS_RECORD_SIZE;
	}

	buf->end = head + space;

	/*
	 * If we have no space, the lost notification would have been sent when
	 * we hit absolute_maximum - see bts_update()
	 */
	if (!space)
		return -ENOSPC;

	return 0;
}
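/*
 * PMI entry point, called from the x86 PMU interrupt handler: flush the
 * data accumulated since the last flush into the AUX ring buffer, close
 * the transaction and open a new one so tracing can continue.  Returns 1
 * if BTS work was handled, 0 otherwise.
 */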
int intel_bts_interrupt(void)
{
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct perf_event *event = bts->handle.event;
	struct bts_buffer *buf;
	s64 old_head;
	int err;

	if (!event || !bts->started)
		return 0;

	buf = perf_get_aux(&bts->handle);
	/*
	 * Skip snapshot counters: they don't use the interrupt, but
	 * there's no other way of telling, because the pointer will
	 * keep moving
	 */
	if (!buf || buf->snapshot)
		return 0;

	old_head = local_read(&buf->head);
	bts_update(bts);

	/* no new data */
	if (old_head == local_read(&buf->head))
		return 0;

	perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
			    !!local_xchg(&buf->lost, 0));

	buf = perf_aux_output_begin(&bts->handle, event);
	if (!buf)
		return 1;

	err = bts_buffer_reset(buf, &bts->handle);
	if (err)
		perf_aux_output_end(&bts->handle, 0, false);

	return 1;
}
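/*
 * pmu::del: stop tracing, hand the collected data back to the perf core
 * by closing the AUX transaction, and restore the debug store fields
 * that were saved in bts_event_add().
 */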
static void bts_event_del(struct perf_event *event, int mode)
{
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct bts_buffer *buf = perf_get_aux(&bts->handle);

	bts_event_stop(event, PERF_EF_UPDATE);

	if (buf) {
		if (buf->snapshot)
			bts->handle.head =
				local_xchg(&buf->data_size,
					   buf->nr_pages << PAGE_SHIFT);
		perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0),
				    !!local_xchg(&buf->lost, 0));
	}

	cpuc->ds->bts_index = bts->ds_back.bts_buffer_base;
	cpuc->ds->bts_buffer_base = bts->ds_back.bts_buffer_base;
	cpuc->ds->bts_absolute_maximum = bts->ds_back.bts_absolute_maximum;
	cpuc->ds->bts_interrupt_threshold = bts->ds_back.bts_interrupt_threshold;
}
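/*
 * pmu::add: claim BTS on this CPU, open an AUX transaction, save the
 * current debug store fields for restoration in bts_event_del(), and
 * optionally start tracing right away.
 */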
static int bts_event_add(struct perf_event *event, int mode)
{
	struct bts_buffer *buf;
	struct bts_ctx *bts = this_cpu_ptr(&bts_ctx);
	struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
	struct hw_perf_event *hwc = &event->hw;
	int ret = -EBUSY;

	event->hw.state = PERF_HES_STOPPED;

	if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask))
		return -EBUSY;

	if (bts->handle.event)
		return -EBUSY;

	buf = perf_aux_output_begin(&bts->handle, event);
	if (!buf)
		return -EINVAL;

	ret = bts_buffer_reset(buf, &bts->handle);
	if (ret) {
		perf_aux_output_end(&bts->handle, 0, false);
		return ret;
	}

	bts->ds_back.bts_buffer_base = cpuc->ds->bts_buffer_base;
	bts->ds_back.bts_absolute_maximum = cpuc->ds->bts_absolute_maximum;
	bts->ds_back.bts_interrupt_threshold = cpuc->ds->bts_interrupt_threshold;

	if (mode & PERF_EF_START) {
		bts_event_start(event, 0);
		if (hwc->state & PERF_HES_STOPPED) {
			bts_event_del(event, 0);
			return -EBUSY;
		}
	}

	return 0;
}

static void bts_event_destroy(struct perf_event *event)
{
	x86_release_hardware();
	x86_del_exclusive(x86_lbr_exclusive_bts);
}
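/*
 * pmu::event_init: BTS shares hardware with LBR, so take the exclusive
 * reservation, apply the paranoia check described below, and grab the
 * PMU hardware; everything is undone by bts_event_destroy().
 */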
static int bts_event_init(struct perf_event *event)
{
	int ret;

	if (event->attr.type != bts_pmu.type)
		return -ENOENT;

	if (x86_add_exclusive(x86_lbr_exclusive_bts))
		return -EBUSY;

	/*
	 * BTS leaks kernel addresses even when CPL0 tracing is
	 * disabled, so disallow intel_bts driver for unprivileged
	 * users on paranoid systems since it provides trace data
	 * to the user in a zero-copy fashion.
	 *
	 * Note that the default paranoia setting permits unprivileged
	 * users to profile the kernel.
	 */
	if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
	    !capable(CAP_SYS_ADMIN))
		return -EACCES;

	ret = x86_reserve_hardware();
	if (ret) {
		x86_del_exclusive(x86_lbr_exclusive_bts);
		return ret;
	}

	event->destroy = bts_event_destroy;

	return 0;
}

static void bts_event_read(struct perf_event *event)
{
}
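/*
 * Register the intel_bts PMU if the CPU advertises 64-bit debug store
 * records (DTES64) and the core PMU driver detected BTS support.
 */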
static __init int bts_init(void)
{
	if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts)
		return -ENODEV;

	bts_pmu.capabilities	= PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE;
	bts_pmu.task_ctx_nr	= perf_sw_context;
	bts_pmu.event_init	= bts_event_init;
	bts_pmu.add		= bts_event_add;
	bts_pmu.del		= bts_event_del;
	bts_pmu.start		= bts_event_start;
	bts_pmu.stop		= bts_event_stop;
	bts_pmu.read		= bts_event_read;
	bts_pmu.setup_aux	= bts_buffer_setup_aux;
	bts_pmu.free_aux	= bts_buffer_free_aux;

	return perf_pmu_register(&bts_pmu, "intel_bts", -1);
}
arch_initcall(bts_init);