1 /*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/cgroup.h>
38 #include <linux/perf_event.h>
39 #include <linux/trace_events.h>
40 #include <linux/hw_breakpoint.h>
41 #include <linux/mm_types.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45 #include <linux/bpf.h>
46 #include <linux/filter.h>
47 #include <linux/namei.h>
48 #include <linux/parser.h>
49
50 #include "internal.h"
51
52 #include <asm/irq_regs.h>
53
54 typedef int (*remote_function_f)(void *);
55
56 struct remote_function_call {
57 struct task_struct *p;
58 remote_function_f func;
59 void *info;
60 int ret;
61 };
62
63 static void remote_function(void *data)
64 {
65 struct remote_function_call *tfc = data;
66 struct task_struct *p = tfc->p;
67
68 if (p) {
69 /* -EAGAIN */
70 if (task_cpu(p) != smp_processor_id())
71 return;
72
73 /*
74 * Now that we're on the right CPU with IRQs disabled, we can test
75 * if we hit the right task without races.
76 */
77
78 tfc->ret = -ESRCH; /* No such (running) process */
79 if (p != current)
80 return;
81 }
82
83 tfc->ret = tfc->func(tfc->info);
84 }
85
86 /**
87 * task_function_call - call a function on the cpu on which a task runs
88 * @p: the task to evaluate
89 * @func: the function to be called
90 * @info: the function call argument
91 *
92 * Calls the function @func when the task is currently running. This might
93 * be on the current CPU, which just calls the function directly.
94 *
95 * returns: @func return value, or
96 * -ESRCH - when the process isn't running
97 * -EAGAIN - when the process moved away
98 */
99 static int
100 task_function_call(struct task_struct *p, remote_function_f func, void *info)
101 {
102 struct remote_function_call data = {
103 .p = p,
104 .func = func,
105 .info = info,
106 .ret = -EAGAIN,
107 };
108 int ret;
109
110 do {
111 ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
112 if (!ret)
113 ret = data.ret;
114 } while (ret == -EAGAIN);
115
116 return ret;
117 }
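/*
 * Illustrative usage only (my_remote_op and my_arg are hypothetical, not
 * part of this file): the callback runs on @p's CPU with IRQs disabled,
 * and the helper retries internally while the task migrates (-EAGAIN),
 * so callers only need to handle -ESRCH:
 *
 *	static int my_remote_op(void *info)
 *	{
 *		struct my_arg *arg = info;
 *
 *		... operate on the task's per-CPU state ...
 *		return 0;
 *	}
 *
 *	ret = task_function_call(p, my_remote_op, &arg);
 *	if (ret == -ESRCH)
 *		... @p was not running; update its saved state instead ...
 */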
118
119 /**
120 * cpu_function_call - call a function on the cpu
 * @cpu: the target cpu on which to call @func
121 * @func: the function to be called
122 * @info: the function call argument
123 *
124 * Calls the function @func on the remote cpu.
125 *
126 * returns: @func return value or -ENXIO when the cpu is offline
127 */
128 static int cpu_function_call(int cpu, remote_function_f func, void *info)
129 {
130 struct remote_function_call data = {
131 .p = NULL,
132 .func = func,
133 .info = info,
134 .ret = -ENXIO, /* No such CPU */
135 };
136
137 smp_call_function_single(cpu, remote_function, &data, 1);
138
139 return data.ret;
140 }
141
142 static inline struct perf_cpu_context *
143 __get_cpu_context(struct perf_event_context *ctx)
144 {
145 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
146 }
147
148 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
149 struct perf_event_context *ctx)
150 {
151 raw_spin_lock(&cpuctx->ctx.lock);
152 if (ctx)
153 raw_spin_lock(&ctx->lock);
154 }
155
156 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
157 struct perf_event_context *ctx)
158 {
159 if (ctx)
160 raw_spin_unlock(&ctx->lock);
161 raw_spin_unlock(&cpuctx->ctx.lock);
162 }
163
164 #define TASK_TOMBSTONE ((void *)-1L)
165
166 static bool is_kernel_event(struct perf_event *event)
167 {
168 return READ_ONCE(event->owner) == TASK_TOMBSTONE;
169 }
170
171 /*
172 * On task ctx scheduling...
173 *
174 * When !ctx->nr_events a task context will not be scheduled. This means
175 * we can disable the scheduler hooks (for performance) without leaving
176 * pending task ctx state.
177 *
178 * This however results in two special cases:
179 *
180 * - removing the last event from a task ctx; this is relatively
181 * straightforward and is done in __perf_remove_from_context.
182 *
183 * - adding the first event to a task ctx; this is tricky because we cannot
184 * rely on ctx->is_active and therefore cannot use event_function_call().
185 * See perf_install_in_context().
186 *
187 * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
188 */
189
190 typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
191 struct perf_event_context *, void *);
192
193 struct event_function_struct {
194 struct perf_event *event;
195 event_f func;
196 void *data;
197 };
198
199 static int event_function(void *info)
200 {
201 struct event_function_struct *efs = info;
202 struct perf_event *event = efs->event;
203 struct perf_event_context *ctx = event->ctx;
204 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
205 struct perf_event_context *task_ctx = cpuctx->task_ctx;
206 int ret = 0;
207
208 WARN_ON_ONCE(!irqs_disabled());
209
210 perf_ctx_lock(cpuctx, task_ctx);
211 /*
212 * Since we do the IPI call without holding ctx->lock things can have
213 * changed, double check we hit the task we set out to hit.
214 */
215 if (ctx->task) {
216 if (ctx->task != current) {
217 ret = -ESRCH;
218 goto unlock;
219 }
220
221 /*
222 * We only use event_function_call() on established contexts,
223 * and event_function() is only ever called when active (or
224 * rather, we'll have bailed in task_function_call() or the
225 * above ctx->task != current test), therefore we must have
226 * ctx->is_active here.
227 */
228 WARN_ON_ONCE(!ctx->is_active);
229 /*
230 * And since we have ctx->is_active, cpuctx->task_ctx must
231 * match.
232 */
233 WARN_ON_ONCE(task_ctx != ctx);
234 } else {
235 WARN_ON_ONCE(&cpuctx->ctx != ctx);
236 }
237
238 efs->func(event, cpuctx, ctx, efs->data);
239 unlock:
240 perf_ctx_unlock(cpuctx, task_ctx);
241
242 return ret;
243 }
244
245 static void event_function_call(struct perf_event *event, event_f func, void *data)
246 {
247 struct perf_event_context *ctx = event->ctx;
248 struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
249 struct event_function_struct efs = {
250 .event = event,
251 .func = func,
252 .data = data,
253 };
254
255 if (!event->parent) {
256 /*
257 * If this is a !child event, we must hold ctx::mutex to
258 * stabilize the event->ctx relation. See
259 * perf_event_ctx_lock().
260 */
261 lockdep_assert_held(&ctx->mutex);
262 }
263
264 if (!task) {
265 cpu_function_call(event->cpu, event_function, &efs);
266 return;
267 }
268
269 if (task == TASK_TOMBSTONE)
270 return;
271
272 again:
273 if (!task_function_call(task, event_function, &efs))
274 return;
275
276 raw_spin_lock_irq(&ctx->lock);
277 /*
278 * Reload the task pointer, it might have been changed by
279 * a concurrent perf_event_context_sched_out().
280 */
281 task = ctx->task;
282 if (task == TASK_TOMBSTONE) {
283 raw_spin_unlock_irq(&ctx->lock);
284 return;
285 }
286 if (ctx->is_active) {
287 raw_spin_unlock_irq(&ctx->lock);
288 goto again;
289 }
290 func(event, NULL, ctx, data);
291 raw_spin_unlock_irq(&ctx->lock);
292 }
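/*
 * Sketch of the usual pattern (__my_event_op is hypothetical; the real
 * users are the __perf_event_* helpers further down in this file). The
 * modifier always runs with the relevant ctx->lock held: either on the
 * event's CPU via the IPI above, or locally with cpuctx == NULL when
 * the context is inactive:
 *
 *	static void __my_event_op(struct perf_event *event,
 *				  struct perf_cpu_context *cpuctx,
 *				  struct perf_event_context *ctx,
 *				  void *data)
 *	{
 *		... modify event state under ctx->lock ...
 *	}
 *
 *	event_function_call(event, __my_event_op, NULL);
 */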
293
294 /*
295 * Similar to event_function_call() + event_function(), but hard assumes IRQs
296 * are already disabled and we're on the right CPU.
297 */
298 static void event_function_local(struct perf_event *event, event_f func, void *data)
299 {
300 struct perf_event_context *ctx = event->ctx;
301 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
302 struct task_struct *task = READ_ONCE(ctx->task);
303 struct perf_event_context *task_ctx = NULL;
304
305 WARN_ON_ONCE(!irqs_disabled());
306
307 if (task) {
308 if (task == TASK_TOMBSTONE)
309 return;
310
311 task_ctx = ctx;
312 }
313
314 perf_ctx_lock(cpuctx, task_ctx);
315
316 task = ctx->task;
317 if (task == TASK_TOMBSTONE)
318 goto unlock;
319
320 if (task) {
321 /*
322 * We must be either inactive or active and the right task,
323 * otherwise we're screwed, since we cannot IPI to somewhere
324 * else.
325 */
326 if (ctx->is_active) {
327 if (WARN_ON_ONCE(task != current))
328 goto unlock;
329
330 if (WARN_ON_ONCE(cpuctx->task_ctx != ctx))
331 goto unlock;
332 }
333 } else {
334 WARN_ON_ONCE(&cpuctx->ctx != ctx);
335 }
336
337 func(event, cpuctx, ctx, data);
338 unlock:
339 perf_ctx_unlock(cpuctx, task_ctx);
340 }
341
342 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
343 PERF_FLAG_FD_OUTPUT |\
344 PERF_FLAG_PID_CGROUP |\
345 PERF_FLAG_FD_CLOEXEC)
346
347 /*
348 * branch priv levels that need permission checks
349 */
350 #define PERF_SAMPLE_BRANCH_PERM_PLM \
351 (PERF_SAMPLE_BRANCH_KERNEL |\
352 PERF_SAMPLE_BRANCH_HV)
353
354 enum event_type_t {
355 EVENT_FLEXIBLE = 0x1,
356 EVENT_PINNED = 0x2,
357 EVENT_TIME = 0x4,
358 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
359 };
360
361 /*
362 * perf_sched_events : >0 events exist
363 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
364 */
365
366 static void perf_sched_delayed(struct work_struct *work);
367 DEFINE_STATIC_KEY_FALSE(perf_sched_events);
368 static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
369 static DEFINE_MUTEX(perf_sched_mutex);
370 static atomic_t perf_sched_count;
371
372 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
373 static DEFINE_PER_CPU(int, perf_sched_cb_usages);
374 static DEFINE_PER_CPU(struct pmu_event_list, pmu_sb_events);
375
376 static atomic_t nr_mmap_events __read_mostly;
377 static atomic_t nr_comm_events __read_mostly;
378 static atomic_t nr_task_events __read_mostly;
379 static atomic_t nr_freq_events __read_mostly;
380 static atomic_t nr_switch_events __read_mostly;
381
382 static LIST_HEAD(pmus);
383 static DEFINE_MUTEX(pmus_lock);
384 static struct srcu_struct pmus_srcu;
385
386 /*
387 * perf event paranoia level:
388 * -1 - not paranoid at all
389 * 0 - disallow raw tracepoint access for unpriv
390 * 1 - disallow cpu events for unpriv
391 * 2 - disallow kernel profiling for unpriv
392 * 3 - disallow all unpriv perf event use
393 */
394 #ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
395 int sysctl_perf_event_paranoid __read_mostly = 3;
396 #else
397 int sysctl_perf_event_paranoid __read_mostly = 2;
398 #endif
399
400 /* Minimum for 512 kiB + 1 user control page */
401 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
402
403 /*
404 * max perf event sample rate
405 */
406 #define DEFAULT_MAX_SAMPLE_RATE 100000
407 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
408 #define DEFAULT_CPU_TIME_MAX_PERCENT 25
409
410 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
411
412 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
413 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
414
415 static int perf_sample_allowed_ns __read_mostly =
416 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
417
418 static void update_perf_cpu_limits(void)
419 {
420 u64 tmp = perf_sample_period_ns;
421
422 tmp *= sysctl_perf_cpu_time_max_percent;
423 tmp = div_u64(tmp, 100);
424 if (!tmp)
425 tmp = 1;
426
427 WRITE_ONCE(perf_sample_allowed_ns, tmp);
428 }
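/*
 * Worked example with the defaults above: perf_sample_period_ns is
 * NSEC_PER_SEC / 100000 = 10000ns and sysctl_perf_cpu_time_max_percent
 * is 25, so perf_sample_allowed_ns becomes 10000 * 25 / 100 = 2500ns of
 * NMI time per sample before the throttling below kicks in.
 */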
429
430 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
431
432 int perf_proc_update_handler(struct ctl_table *table, int write,
433 void __user *buffer, size_t *lenp,
434 loff_t *ppos)
435 {
436 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
437
438 if (ret || !write)
439 return ret;
440
441 /*
442 * If throttling is disabled don't allow the write:
443 */
444 if (sysctl_perf_cpu_time_max_percent == 100 ||
445 sysctl_perf_cpu_time_max_percent == 0)
446 return -EINVAL;
447
448 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
449 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
450 update_perf_cpu_limits();
451
452 return 0;
453 }
454
455 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
456
457 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
458 void __user *buffer, size_t *lenp,
459 loff_t *ppos)
460 {
461 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
462
463 if (ret || !write)
464 return ret;
465
466 if (sysctl_perf_cpu_time_max_percent == 100 ||
467 sysctl_perf_cpu_time_max_percent == 0) {
468 printk(KERN_WARNING
469 "perf: Dynamic interrupt throttling disabled, can hang your system!\n");
470 WRITE_ONCE(perf_sample_allowed_ns, 0);
471 } else {
472 update_perf_cpu_limits();
473 }
474
475 return 0;
476 }
477
478 /*
479 * perf samples are done in some very critical code paths (NMIs).
480 * If they take too much CPU time, the system can lock up and not
481 * get any real work done. This will drop the sample rate when
482 * we detect that events are taking too long.
483 */
484 #define NR_ACCUMULATED_SAMPLES 128
485 static DEFINE_PER_CPU(u64, running_sample_length);
486
487 static u64 __report_avg;
488 static u64 __report_allowed;
489
490 static void perf_duration_warn(struct irq_work *w)
491 {
492 printk_ratelimited(KERN_INFO
493 "perf: interrupt took too long (%lld > %lld), lowering "
494 "kernel.perf_event_max_sample_rate to %d\n",
495 __report_avg, __report_allowed,
496 sysctl_perf_event_sample_rate);
497 }
498
499 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
500
501 void perf_sample_event_took(u64 sample_len_ns)
502 {
503 u64 max_len = READ_ONCE(perf_sample_allowed_ns);
504 u64 running_len;
505 u64 avg_len;
506 u32 max;
507
508 if (max_len == 0)
509 return;
510
511 /* Decay the counter by 1 average sample. */
512 running_len = __this_cpu_read(running_sample_length);
513 running_len -= running_len/NR_ACCUMULATED_SAMPLES;
514 running_len += sample_len_ns;
515 __this_cpu_write(running_sample_length, running_len);
516
517 /*
518 * Note: this will be biased artificially low until we have
519 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
520 * from having to maintain a count.
521 */
522 avg_len = running_len/NR_ACCUMULATED_SAMPLES;
523 if (avg_len <= max_len)
524 return;
525
526 __report_avg = avg_len;
527 __report_allowed = max_len;
528
529 /*
530 * Compute a throttle threshold 25% below the current duration.
531 */
532 avg_len += avg_len / 4;
533 max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent;
534 if (avg_len < max)
535 max /= (u32)avg_len;
536 else
537 max = 1;
538
539 WRITE_ONCE(perf_sample_allowed_ns, avg_len);
540 WRITE_ONCE(max_samples_per_tick, max);
541
542 sysctl_perf_event_sample_rate = max * HZ;
543 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
544
545 if (!irq_work_queue(&perf_duration_work)) {
546 early_printk("perf: interrupt took too long (%lld > %lld), lowering "
547 "kernel.perf_event_max_sample_rate to %d\n",
548 __report_avg, __report_allowed,
549 sysctl_perf_event_sample_rate);
550 }
551 }
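/*
 * Worked example (assuming HZ=1000 and the default 25% CPU limit): if
 * perf_sample_allowed_ns is 2500 but the decayed average sample length
 * reaches 4000ns, it is inflated to 5000ns, max = (TICK_NSEC / 100) * 25
 * = 250000, and we end up with 250000 / 5000 = 50 samples per tick,
 * i.e. a new kernel.perf_event_max_sample_rate of 50 * HZ = 50000.
 */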
552
553 static atomic64_t perf_event_id;
554
555 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
556 enum event_type_t event_type);
557
558 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
559 enum event_type_t event_type,
560 struct task_struct *task);
561
562 static void update_context_time(struct perf_event_context *ctx);
563 static u64 perf_event_time(struct perf_event *event);
564
565 void __weak perf_event_print_debug(void) { }
566
567 extern __weak const char *perf_pmu_name(void)
568 {
569 return "pmu";
570 }
571
572 static inline u64 perf_clock(void)
573 {
574 return local_clock();
575 }
576
577 static inline u64 perf_event_clock(struct perf_event *event)
578 {
579 return event->clock();
580 }
581
582 #ifdef CONFIG_CGROUP_PERF
583
584 static inline bool
585 perf_cgroup_match(struct perf_event *event)
586 {
587 struct perf_event_context *ctx = event->ctx;
588 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
589
590 /* @event doesn't care about cgroup */
591 if (!event->cgrp)
592 return true;
593
594 /* wants specific cgroup scope but @cpuctx isn't associated with any */
595 if (!cpuctx->cgrp)
596 return false;
597
598 /*
599 * Cgroup scoping is recursive. An event enabled for a cgroup is
600 * also enabled for all its descendant cgroups. If @cpuctx's
601 * cgroup is a descendant of @event's (the test covers identity
602 * case), it's a match.
603 */
604 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
605 event->cgrp->css.cgroup);
606 }
607
608 static inline void perf_detach_cgroup(struct perf_event *event)
609 {
610 css_put(&event->cgrp->css);
611 event->cgrp = NULL;
612 }
613
614 static inline int is_cgroup_event(struct perf_event *event)
615 {
616 return event->cgrp != NULL;
617 }
618
619 static inline u64 perf_cgroup_event_time(struct perf_event *event)
620 {
621 struct perf_cgroup_info *t;
622
623 t = per_cpu_ptr(event->cgrp->info, event->cpu);
624 return t->time;
625 }
626
627 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
628 {
629 struct perf_cgroup_info *info;
630 u64 now;
631
632 now = perf_clock();
633
634 info = this_cpu_ptr(cgrp->info);
635
636 info->time += now - info->timestamp;
637 info->timestamp = now;
638 }
639
640 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
641 {
642 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
643 if (cgrp_out)
644 __update_cgrp_time(cgrp_out);
645 }
646
647 static inline void update_cgrp_time_from_event(struct perf_event *event)
648 {
649 struct perf_cgroup *cgrp;
650
651 /*
652 * ensure we access cgroup data only when needed and
653 * when we know the cgroup is pinned (css_get)
654 */
655 if (!is_cgroup_event(event))
656 return;
657
658 cgrp = perf_cgroup_from_task(current, event->ctx);
659 /*
660 * Do not update time when cgroup is not active
661 */
662 if (cgrp == event->cgrp)
663 __update_cgrp_time(event->cgrp);
664 }
665
666 static inline void
667 perf_cgroup_set_timestamp(struct task_struct *task,
668 struct perf_event_context *ctx)
669 {
670 struct perf_cgroup *cgrp;
671 struct perf_cgroup_info *info;
672
673 /*
674 * ctx->lock held by caller
675 * ensure we do not access cgroup data
676 * unless we have the cgroup pinned (css_get)
677 */
678 if (!task || !ctx->nr_cgroups)
679 return;
680
681 cgrp = perf_cgroup_from_task(task, ctx);
682 info = this_cpu_ptr(cgrp->info);
683 info->timestamp = ctx->timestamp;
684 }
685
686 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
687 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
688
689 /*
690 * reschedule events based on the cgroup constraint of task.
691 *
692 * mode SWOUT : schedule out everything
693 * mode SWIN : schedule in based on cgroup for next
694 */
695 static void perf_cgroup_switch(struct task_struct *task, int mode)
696 {
697 struct perf_cpu_context *cpuctx;
698 struct pmu *pmu;
699 unsigned long flags;
700
701 /*
702 * disable interrupts to avoid getting nr_cgroup
703 * changes via __perf_event_disable(). Also
704 * avoids preemption.
705 */
706 local_irq_save(flags);
707
708 /*
709 * we reschedule only in the presence of cgroup
710 * constrained events.
711 */
712
713 list_for_each_entry_rcu(pmu, &pmus, entry) {
714 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
715 if (cpuctx->unique_pmu != pmu)
716 continue; /* ensure we process each cpuctx once */
717
718 /*
719 * perf_cgroup_events says at least one
720 * context on this CPU has cgroup events.
721 *
722 * ctx->nr_cgroups reports the number of cgroup
723 * events for a context.
724 */
725 if (cpuctx->ctx.nr_cgroups > 0) {
726 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
727 perf_pmu_disable(cpuctx->ctx.pmu);
728
729 if (mode & PERF_CGROUP_SWOUT) {
730 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
731 /*
732 * must not be done before ctxswout due
733 * to event_filter_match() in event_sched_out()
734 */
735 cpuctx->cgrp = NULL;
736 }
737
738 if (mode & PERF_CGROUP_SWIN) {
739 WARN_ON_ONCE(cpuctx->cgrp);
740 /*
741 * set cgrp before ctxsw in to allow
742 * event_filter_match() to not have to pass
743 * task around
744 * we pass the cpuctx->ctx to perf_cgroup_from_task()
745 * because cgroup events are only per-cpu
746 */
747 cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx);
748 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
749 }
750 perf_pmu_enable(cpuctx->ctx.pmu);
751 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
752 }
753 }
754
755 local_irq_restore(flags);
756 }
757
758 static inline void perf_cgroup_sched_out(struct task_struct *task,
759 struct task_struct *next)
760 {
761 struct perf_cgroup *cgrp1;
762 struct perf_cgroup *cgrp2 = NULL;
763
764 rcu_read_lock();
765 /*
766 * we come here when we know perf_cgroup_events > 0
767 * we do not need to pass the ctx here because we know
768 * we are holding the rcu lock
769 */
770 cgrp1 = perf_cgroup_from_task(task, NULL);
771 cgrp2 = perf_cgroup_from_task(next, NULL);
772
773 /*
774 * only schedule out current cgroup events if we know
775 * that we are switching to a different cgroup. Otherwise,
776 * do not touch the cgroup events.
777 */
778 if (cgrp1 != cgrp2)
779 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
780
781 rcu_read_unlock();
782 }
783
784 static inline void perf_cgroup_sched_in(struct task_struct *prev,
785 struct task_struct *task)
786 {
787 struct perf_cgroup *cgrp1;
788 struct perf_cgroup *cgrp2 = NULL;
789
790 rcu_read_lock();
791 /*
792 * we come here when we know perf_cgroup_events > 0
793 * we do not need to pass the ctx here because we know
794 * we are holding the rcu lock
795 */
796 cgrp1 = perf_cgroup_from_task(task, NULL);
797 cgrp2 = perf_cgroup_from_task(prev, NULL);
798
799 /*
800 * only need to schedule in cgroup events if we are changing
801 * cgroups during the ctxsw. Cgroup events were not scheduled
802 * out during the ctxsw if that was not the case.
803 */
804 if (cgrp1 != cgrp2)
805 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
806
807 rcu_read_unlock();
808 }
809
810 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
811 struct perf_event_attr *attr,
812 struct perf_event *group_leader)
813 {
814 struct perf_cgroup *cgrp;
815 struct cgroup_subsys_state *css;
816 struct fd f = fdget(fd);
817 int ret = 0;
818
819 if (!f.file)
820 return -EBADF;
821
822 css = css_tryget_online_from_dir(f.file->f_path.dentry,
823 &perf_event_cgrp_subsys);
824 if (IS_ERR(css)) {
825 ret = PTR_ERR(css);
826 goto out;
827 }
828
829 cgrp = container_of(css, struct perf_cgroup, css);
830 event->cgrp = cgrp;
831
832 /*
833 * all events in a group must monitor
834 * the same cgroup because a task belongs
835 * to only one perf cgroup at a time
836 */
837 if (group_leader && group_leader->cgrp != cgrp) {
838 perf_detach_cgroup(event);
839 ret = -EINVAL;
840 }
841 out:
842 fdput(f);
843 return ret;
844 }
845
846 static inline void
847 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
848 {
849 struct perf_cgroup_info *t;
850 t = per_cpu_ptr(event->cgrp->info, event->cpu);
851 event->shadow_ctx_time = now - t->timestamp;
852 }
853
854 static inline void
855 perf_cgroup_defer_enabled(struct perf_event *event)
856 {
857 /*
858 * when the current task's perf cgroup does not match
859 * the event's, we need to remember to call the
860 * perf_cgroup_mark_enabled() function the first time a task with
861 * a matching perf cgroup is scheduled in.
862 */
863 if (is_cgroup_event(event) && !perf_cgroup_match(event))
864 event->cgrp_defer_enabled = 1;
865 }
866
867 static inline void
868 perf_cgroup_mark_enabled(struct perf_event *event,
869 struct perf_event_context *ctx)
870 {
871 struct perf_event *sub;
872 u64 tstamp = perf_event_time(event);
873
874 if (!event->cgrp_defer_enabled)
875 return;
876
877 event->cgrp_defer_enabled = 0;
878
879 event->tstamp_enabled = tstamp - event->total_time_enabled;
880 list_for_each_entry(sub, &event->sibling_list, group_entry) {
881 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
882 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
883 sub->cgrp_defer_enabled = 0;
884 }
885 }
886 }
887
888 /*
889 * Update cpuctx->cgrp so that it is set when first cgroup event is added and
890 * cleared when last cgroup event is removed.
891 */
892 static inline void
893 list_update_cgroup_event(struct perf_event *event,
894 struct perf_event_context *ctx, bool add)
895 {
896 struct perf_cpu_context *cpuctx;
897
898 if (!is_cgroup_event(event))
899 return;
900
901 if (add && ctx->nr_cgroups++)
902 return;
903 else if (!add && --ctx->nr_cgroups)
904 return;
905 /*
906 * Because cgroup events are always per-cpu events,
907 * this will always be called from the right CPU.
908 */
909 cpuctx = __get_cpu_context(ctx);
910
911 /*
912 * cpuctx->cgrp is NULL until a cgroup event is sched in or
913 * ctx->nr_cgroups == 0.
914 */
915 if (add && perf_cgroup_from_task(current, ctx) == event->cgrp)
916 cpuctx->cgrp = event->cgrp;
917 else if (!add)
918 cpuctx->cgrp = NULL;
919 }
920
921 #else /* !CONFIG_CGROUP_PERF */
922
923 static inline bool
924 perf_cgroup_match(struct perf_event *event)
925 {
926 return true;
927 }
928
929 static inline void perf_detach_cgroup(struct perf_event *event)
930 {}
931
932 static inline int is_cgroup_event(struct perf_event *event)
933 {
934 return 0;
935 }
936
937 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
938 {
939 return 0;
940 }
941
942 static inline void update_cgrp_time_from_event(struct perf_event *event)
943 {
944 }
945
946 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
947 {
948 }
949
950 static inline void perf_cgroup_sched_out(struct task_struct *task,
951 struct task_struct *next)
952 {
953 }
954
955 static inline void perf_cgroup_sched_in(struct task_struct *prev,
956 struct task_struct *task)
957 {
958 }
959
960 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
961 struct perf_event_attr *attr,
962 struct perf_event *group_leader)
963 {
964 return -EINVAL;
965 }
966
967 static inline void
968 perf_cgroup_set_timestamp(struct task_struct *task,
969 struct perf_event_context *ctx)
970 {
971 }
972
973 void
974 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
975 {
976 }
977
978 static inline void
979 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
980 {
981 }
982
983 static inline u64 perf_cgroup_event_time(struct perf_event *event)
984 {
985 return 0;
986 }
987
988 static inline void
989 perf_cgroup_defer_enabled(struct perf_event *event)
990 {
991 }
992
993 static inline void
994 perf_cgroup_mark_enabled(struct perf_event *event,
995 struct perf_event_context *ctx)
996 {
997 }
998
999 static inline void
1000 list_update_cgroup_event(struct perf_event *event,
1001 struct perf_event_context *ctx, bool add)
1002 {
1003 }
1004
1005 #endif
1006
1007 /*
1008 * set default to be dependent on timer tick just
1009 * like original code
1010 */
1011 #define PERF_CPU_HRTIMER (1000 / HZ)
1012 /*
1013 * function must be called with interrupts disabled
1014 */
1015 static enum hrtimer_restart perf_mux_hrtimer_handler(struct hrtimer *hr)
1016 {
1017 struct perf_cpu_context *cpuctx;
1018 int rotations = 0;
1019
1020 WARN_ON(!irqs_disabled());
1021
1022 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
1023 rotations = perf_rotate_context(cpuctx);
1024
1025 raw_spin_lock(&cpuctx->hrtimer_lock);
1026 if (rotations)
1027 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
1028 else
1029 cpuctx->hrtimer_active = 0;
1030 raw_spin_unlock(&cpuctx->hrtimer_lock);
1031
1032 return rotations ? HRTIMER_RESTART : HRTIMER_NORESTART;
1033 }
1034
1035 static void __perf_mux_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
1036 {
1037 struct hrtimer *timer = &cpuctx->hrtimer;
1038 struct pmu *pmu = cpuctx->ctx.pmu;
1039 u64 interval;
1040
1041 /* no multiplexing needed for SW PMU */
1042 if (pmu->task_ctx_nr == perf_sw_context)
1043 return;
1044
1045 /*
1046 * check default is sane, if not set then force to
1047 * default interval (1/tick)
1048 */
1049 interval = pmu->hrtimer_interval_ms;
1050 if (interval < 1)
1051 interval = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
1052
1053 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * interval);
1054
1055 raw_spin_lock_init(&cpuctx->hrtimer_lock);
1056 hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1057 timer->function = perf_mux_hrtimer_handler;
1058 }
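/*
 * For illustration: with HZ=250 the default PERF_CPU_HRTIMER interval is
 * 1000 / 250 = 4ms, so cpuctx->hrtimer_interval is programmed to
 * 4 * NSEC_PER_MSEC and the multiplexing rotation runs roughly once per
 * tick unless the PMU provides its own hrtimer_interval_ms.
 */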
1059
1060 static int perf_mux_hrtimer_restart(struct perf_cpu_context *cpuctx)
1061 {
1062 struct hrtimer *timer = &cpuctx->hrtimer;
1063 struct pmu *pmu = cpuctx->ctx.pmu;
1064 unsigned long flags;
1065
1066 /* not for SW PMU */
1067 if (pmu->task_ctx_nr == perf_sw_context)
1068 return 0;
1069
1070 raw_spin_lock_irqsave(&cpuctx->hrtimer_lock, flags);
1071 if (!cpuctx->hrtimer_active) {
1072 cpuctx->hrtimer_active = 1;
1073 hrtimer_forward_now(timer, cpuctx->hrtimer_interval);
1074 hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
1075 }
1076 raw_spin_unlock_irqrestore(&cpuctx->hrtimer_lock, flags);
1077
1078 return 0;
1079 }
1080
1081 void perf_pmu_disable(struct pmu *pmu)
1082 {
1083 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1084 if (!(*count)++)
1085 pmu->pmu_disable(pmu);
1086 }
1087
1088 void perf_pmu_enable(struct pmu *pmu)
1089 {
1090 int *count = this_cpu_ptr(pmu->pmu_disable_count);
1091 if (!--(*count))
1092 pmu->pmu_enable(pmu);
1093 }
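/*
 * Typical pairing, sketched for clarity (this is how the scheduling
 * paths below use it):
 *
 *	perf_pmu_disable(event->pmu);
 *	... add/remove/reprogram events on this PMU ...
 *	perf_pmu_enable(event->pmu);
 *
 * The per-CPU pmu_disable_count makes the pair nest, so only the
 * outermost disable/enable reaches the pmu_disable()/pmu_enable()
 * callbacks.
 */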
1094
1095 static DEFINE_PER_CPU(struct list_head, active_ctx_list);
1096
1097 /*
1098 * perf_event_ctx_activate(), perf_event_ctx_deactivate(), and
1099 * perf_event_task_tick() are fully serialized because they're strictly cpu
1100 * affine and perf_event_ctx{activate,deactivate} are called with IRQs
1101 * disabled, while perf_event_task_tick is called from IRQ context.
1102 */
1103 static void perf_event_ctx_activate(struct perf_event_context *ctx)
1104 {
1105 struct list_head *head = this_cpu_ptr(&active_ctx_list);
1106
1107 WARN_ON(!irqs_disabled());
1108
1109 WARN_ON(!list_empty(&ctx->active_ctx_list));
1110
1111 list_add(&ctx->active_ctx_list, head);
1112 }
1113
1114 static void perf_event_ctx_deactivate(struct perf_event_context *ctx)
1115 {
1116 WARN_ON(!irqs_disabled());
1117
1118 WARN_ON(list_empty(&ctx->active_ctx_list));
1119
1120 list_del_init(&ctx->active_ctx_list);
1121 }
1122
1123 static void get_ctx(struct perf_event_context *ctx)
1124 {
1125 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
1126 }
1127
1128 static void free_ctx(struct rcu_head *head)
1129 {
1130 struct perf_event_context *ctx;
1131
1132 ctx = container_of(head, struct perf_event_context, rcu_head);
1133 kfree(ctx->task_ctx_data);
1134 kfree(ctx);
1135 }
1136
1137 static void put_ctx(struct perf_event_context *ctx)
1138 {
1139 if (atomic_dec_and_test(&ctx->refcount)) {
1140 if (ctx->parent_ctx)
1141 put_ctx(ctx->parent_ctx);
1142 if (ctx->task && ctx->task != TASK_TOMBSTONE)
1143 put_task_struct(ctx->task);
1144 call_rcu(&ctx->rcu_head, free_ctx);
1145 }
1146 }
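/*
 * Refcounting sketch: every get_ctx() (or successful
 * atomic_inc_not_zero(), as in perf_event_ctx_lock_nested() below) must
 * be balanced by a put_ctx(); the final put_ctx() also drops the parent
 * context and the task reference and frees the context via RCU.
 */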
1147
1148 /*
1149 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
1150 * perf_pmu_migrate_context() we need some magic.
1151 *
1152 * Those places that change perf_event::ctx will hold both
1153 * perf_event_context::mutex of the 'old' and 'new' ctx value.
1154 *
1155 * Lock ordering is by mutex address. There are two other sites where
1156 * perf_event_context::mutex nests and those are:
1157 *
1158 * - perf_event_exit_task_context() [ child , 0 ]
1159 * perf_event_exit_event()
1160 * put_event() [ parent, 1 ]
1161 *
1162 * - perf_event_init_context() [ parent, 0 ]
1163 * inherit_task_group()
1164 * inherit_group()
1165 * inherit_event()
1166 * perf_event_alloc()
1167 * perf_init_event()
1168 * perf_try_init_event() [ child , 1 ]
1169 *
1170 * While it appears there is an obvious deadlock here -- the parent and child
1171 * nesting levels are inverted between the two -- this is in fact safe because
1172 * life-time rules separate them. That is, an exiting task cannot fork, and a
1173 * spawning task cannot (yet) exit.
1174 *
1175 * But remember that these are parent<->child context relations, and
1176 * migration does not affect children, therefore these two orderings should not
1177 * interact.
1178 *
1179 * The change in perf_event::ctx does not affect children (as claimed above)
1180 * because the sys_perf_event_open() case will install a new event and break
1181 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
1182 * concerned with cpuctx and that doesn't have children.
1183 *
1184 * The places that change perf_event::ctx will issue:
1185 *
1186 * perf_remove_from_context();
1187 * synchronize_rcu();
1188 * perf_install_in_context();
1189 *
1190 * to effect the change. The remove_from_context() + synchronize_rcu() should
1191 * quiesce the event, after which we can install it in the new location. This
1192 * means that only external vectors (perf_fops, prctl) can perturb the event
1193 * while in transit. Therefore all such accessors should also acquire
1194 * perf_event_context::mutex to serialize against this.
1195 *
1196 * However; because event->ctx can change while we're waiting to acquire
1197 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
1198 * function.
1199 *
1200 * Lock order:
1201 * cred_guard_mutex
1202 * task_struct::perf_event_mutex
1203 * perf_event_context::mutex
1204 * perf_event::child_mutex;
1205 * perf_event_context::lock
1206 * perf_event::mmap_mutex
1207 * mmap_sem
1208 */
1209 static struct perf_event_context *
1210 perf_event_ctx_lock_nested(struct perf_event *event, int nesting)
1211 {
1212 struct perf_event_context *ctx;
1213
1214 again:
1215 rcu_read_lock();
1216 ctx = ACCESS_ONCE(event->ctx);
1217 if (!atomic_inc_not_zero(&ctx->refcount)) {
1218 rcu_read_unlock();
1219 goto again;
1220 }
1221 rcu_read_unlock();
1222
1223 mutex_lock_nested(&ctx->mutex, nesting);
1224 if (event->ctx != ctx) {
1225 mutex_unlock(&ctx->mutex);
1226 put_ctx(ctx);
1227 goto again;
1228 }
1229
1230 return ctx;
1231 }
1232
1233 static inline struct perf_event_context *
1234 perf_event_ctx_lock(struct perf_event *event)
1235 {
1236 return perf_event_ctx_lock_nested(event, 0);
1237 }
1238
1239 static void perf_event_ctx_unlock(struct perf_event *event,
1240 struct perf_event_context *ctx)
1241 {
1242 mutex_unlock(&ctx->mutex);
1243 put_ctx(ctx);
1244 }
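/*
 * Illustrative use of the pair above (sketch only): external accessors
 * such as the perf_fops/prctl paths do
 *
 *	ctx = perf_event_ctx_lock(event);
 *	... event->ctx is now stable and ctx->mutex is held ...
 *	perf_event_ctx_unlock(event, ctx);
 *
 * which is what serializes them against a concurrent perf_event::ctx
 * change, as described in the big comment above.
 */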
1245
1246 /*
1247 * This must be done under the ctx->lock, such as to serialize against
1248 * context_equiv(), therefore we cannot call put_ctx() since that might end up
1249 * calling scheduler related locks and ctx->lock nests inside those.
1250 */
1251 static __must_check struct perf_event_context *
1252 unclone_ctx(struct perf_event_context *ctx)
1253 {
1254 struct perf_event_context *parent_ctx = ctx->parent_ctx;
1255
1256 lockdep_assert_held(&ctx->lock);
1257
1258 if (parent_ctx)
1259 ctx->parent_ctx = NULL;
1260 ctx->generation++;
1261
1262 return parent_ctx;
1263 }
1264
1265 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1266 {
1267 /*
1268 * only top level events have the pid namespace they were created in
1269 */
1270 if (event->parent)
1271 event = event->parent;
1272
1273 return task_tgid_nr_ns(p, event->ns);
1274 }
1275
1276 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1277 {
1278 /*
1279 * only top level events have the pid namespace they were created in
1280 */
1281 if (event->parent)
1282 event = event->parent;
1283
1284 return task_pid_nr_ns(p, event->ns);
1285 }
1286
1287 /*
1288 * If we inherit events we want to return the parent event id
1289 * to userspace.
1290 */
1291 static u64 primary_event_id(struct perf_event *event)
1292 {
1293 u64 id = event->id;
1294
1295 if (event->parent)
1296 id = event->parent->id;
1297
1298 return id;
1299 }
1300
1301 /*
1302 * Get the perf_event_context for a task and lock it.
1303 *
1304 * This has to cope with the fact that until it is locked,
1305 * the context could get moved to another task.
1306 */
1307 static struct perf_event_context *
1308 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1309 {
1310 struct perf_event_context *ctx;
1311
1312 retry:
1313 /*
1314 * One of the few rules of preemptible RCU is that one cannot do
1315 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1316 * part of the read side critical section was irqs-enabled -- see
1317 * rcu_read_unlock_special().
1318 *
1319 * Since ctx->lock nests under rq->lock we must ensure the entire read
1320 * side critical section has interrupts disabled.
1321 */
1322 local_irq_save(*flags);
1323 rcu_read_lock();
1324 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1325 if (ctx) {
1326 /*
1327 * If this context is a clone of another, it might
1328 * get swapped for another underneath us by
1329 * perf_event_task_sched_out, though the
1330 * rcu_read_lock() protects us from any context
1331 * getting freed. Lock the context and check if it
1332 * got swapped before we could get the lock, and retry
1333 * if so. If we locked the right context, then it
1334 * can't get swapped on us any more.
1335 */
1336 raw_spin_lock(&ctx->lock);
1337 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1338 raw_spin_unlock(&ctx->lock);
1339 rcu_read_unlock();
1340 local_irq_restore(*flags);
1341 goto retry;
1342 }
1343
1344 if (ctx->task == TASK_TOMBSTONE ||
1345 !atomic_inc_not_zero(&ctx->refcount)) {
1346 raw_spin_unlock(&ctx->lock);
1347 ctx = NULL;
1348 } else {
1349 WARN_ON_ONCE(ctx->task != task);
1350 }
1351 }
1352 rcu_read_unlock();
1353 if (!ctx)
1354 local_irq_restore(*flags);
1355 return ctx;
1356 }
1357
1358 /*
1359 * Get the context for a task and increment its pin_count so it
1360 * can't get swapped to another task. This also increments its
1361 * reference count so that the context can't get freed.
1362 */
1363 static struct perf_event_context *
1364 perf_pin_task_context(struct task_struct *task, int ctxn)
1365 {
1366 struct perf_event_context *ctx;
1367 unsigned long flags;
1368
1369 ctx = perf_lock_task_context(task, ctxn, &flags);
1370 if (ctx) {
1371 ++ctx->pin_count;
1372 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1373 }
1374 return ctx;
1375 }
1376
1377 static void perf_unpin_context(struct perf_event_context *ctx)
1378 {
1379 unsigned long flags;
1380
1381 raw_spin_lock_irqsave(&ctx->lock, flags);
1382 --ctx->pin_count;
1383 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1384 }
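/*
 * Illustrative pairing (sketch only); callers such as the inherit/exit
 * paths typically do:
 *
 *	ctx = perf_pin_task_context(task, ctxn);
 *	if (ctx) {
 *		... ctx can neither be swapped nor freed here ...
 *		perf_unpin_context(ctx);
 *		put_ctx(ctx);
 *	}
 */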
1385
1386 /*
1387 * Update the record of the current time in a context.
1388 */
1389 static void update_context_time(struct perf_event_context *ctx)
1390 {
1391 u64 now = perf_clock();
1392
1393 ctx->time += now - ctx->timestamp;
1394 ctx->timestamp = now;
1395 }
1396
1397 static u64 perf_event_time(struct perf_event *event)
1398 {
1399 struct perf_event_context *ctx = event->ctx;
1400
1401 if (is_cgroup_event(event))
1402 return perf_cgroup_event_time(event);
1403
1404 return ctx ? ctx->time : 0;
1405 }
1406
1407 /*
1408 * Update the total_time_enabled and total_time_running fields for an event.
1409 */
1410 static void update_event_times(struct perf_event *event)
1411 {
1412 struct perf_event_context *ctx = event->ctx;
1413 u64 run_end;
1414
1415 lockdep_assert_held(&ctx->lock);
1416
1417 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1418 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1419 return;
1420
1421 /*
1422 * in cgroup mode, time_enabled represents
1423 * the time the event was enabled AND active
1424 * tasks were in the monitored cgroup. This is
1425 * independent of the activity of the context as
1426 * there may be a mix of cgroup and non-cgroup events.
1427 *
1428 * That is why we treat cgroup events differently
1429 * here.
1430 */
1431 if (is_cgroup_event(event))
1432 run_end = perf_cgroup_event_time(event);
1433 else if (ctx->is_active)
1434 run_end = ctx->time;
1435 else
1436 run_end = event->tstamp_stopped;
1437
1438 event->total_time_enabled = run_end - event->tstamp_enabled;
1439
1440 if (event->state == PERF_EVENT_STATE_INACTIVE)
1441 run_end = event->tstamp_stopped;
1442 else
1443 run_end = perf_event_time(event);
1444
1445 event->total_time_running = run_end - event->tstamp_running;
1446
1447 }
1448
1449 /*
1450 * Update total_time_enabled and total_time_running for all events in a group.
1451 */
1452 static void update_group_times(struct perf_event *leader)
1453 {
1454 struct perf_event *event;
1455
1456 update_event_times(leader);
1457 list_for_each_entry(event, &leader->sibling_list, group_entry)
1458 update_event_times(event);
1459 }
1460
1461 static struct list_head *
1462 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1463 {
1464 if (event->attr.pinned)
1465 return &ctx->pinned_groups;
1466 else
1467 return &ctx->flexible_groups;
1468 }
1469
1470 /*
1471 * Add an event to the lists for its context.
1472 * Must be called with ctx->mutex and ctx->lock held.
1473 */
1474 static void
1475 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1476 {
1477 lockdep_assert_held(&ctx->lock);
1478
1479 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1480 event->attach_state |= PERF_ATTACH_CONTEXT;
1481
1482 /*
1483 * If we're a stand alone event or group leader, we go to the context
1484 * list, group events are kept attached to the group so that
1485 * perf_group_detach can, at all times, locate all siblings.
1486 */
1487 if (event->group_leader == event) {
1488 struct list_head *list;
1489
1490 event->group_caps = event->event_caps;
1491
1492 list = ctx_group_list(event, ctx);
1493 list_add_tail(&event->group_entry, list);
1494 }
1495
1496 list_update_cgroup_event(event, ctx, true);
1497
1498 list_add_rcu(&event->event_entry, &ctx->event_list);
1499 ctx->nr_events++;
1500 if (event->attr.inherit_stat)
1501 ctx->nr_stat++;
1502
1503 ctx->generation++;
1504 }
1505
1506 /*
1507 * Initialize event state based on the perf_event_attr::disabled.
1508 */
1509 static inline void perf_event__state_init(struct perf_event *event)
1510 {
1511 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1512 PERF_EVENT_STATE_INACTIVE;
1513 }
1514
1515 static void __perf_event_read_size(struct perf_event *event, int nr_siblings)
1516 {
1517 int entry = sizeof(u64); /* value */
1518 int size = 0;
1519 int nr = 1;
1520
1521 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1522 size += sizeof(u64);
1523
1524 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1525 size += sizeof(u64);
1526
1527 if (event->attr.read_format & PERF_FORMAT_ID)
1528 entry += sizeof(u64);
1529
1530 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1531 nr += nr_siblings;
1532 size += sizeof(u64);
1533 }
1534
1535 size += entry * nr;
1536 event->read_size = size;
1537 }
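/*
 * Worked example: with read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
 * PERF_FORMAT_ID | PERF_FORMAT_GROUP and two siblings, entry = 16
 * (value + id), size starts at 16 (time_enabled + nr) and nr = 3,
 * giving a read_size of 16 + 16 * 3 = 64 bytes.
 */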
1538
1539 static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
1540 {
1541 struct perf_sample_data *data;
1542 u16 size = 0;
1543
1544 if (sample_type & PERF_SAMPLE_IP)
1545 size += sizeof(data->ip);
1546
1547 if (sample_type & PERF_SAMPLE_ADDR)
1548 size += sizeof(data->addr);
1549
1550 if (sample_type & PERF_SAMPLE_PERIOD)
1551 size += sizeof(data->period);
1552
1553 if (sample_type & PERF_SAMPLE_WEIGHT)
1554 size += sizeof(data->weight);
1555
1556 if (sample_type & PERF_SAMPLE_READ)
1557 size += event->read_size;
1558
1559 if (sample_type & PERF_SAMPLE_DATA_SRC)
1560 size += sizeof(data->data_src.val);
1561
1562 if (sample_type & PERF_SAMPLE_TRANSACTION)
1563 size += sizeof(data->txn);
1564
1565 event->header_size = size;
1566 }
1567
1568 /*
1569 * Called at perf_event creation and when events are attached/detached from a
1570 * group.
1571 */
1572 static void perf_event__header_size(struct perf_event *event)
1573 {
1574 __perf_event_read_size(event,
1575 event->group_leader->nr_siblings);
1576 __perf_event_header_size(event, event->attr.sample_type);
1577 }
1578
1579 static void perf_event__id_header_size(struct perf_event *event)
1580 {
1581 struct perf_sample_data *data;
1582 u64 sample_type = event->attr.sample_type;
1583 u16 size = 0;
1584
1585 if (sample_type & PERF_SAMPLE_TID)
1586 size += sizeof(data->tid_entry);
1587
1588 if (sample_type & PERF_SAMPLE_TIME)
1589 size += sizeof(data->time);
1590
1591 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1592 size += sizeof(data->id);
1593
1594 if (sample_type & PERF_SAMPLE_ID)
1595 size += sizeof(data->id);
1596
1597 if (sample_type & PERF_SAMPLE_STREAM_ID)
1598 size += sizeof(data->stream_id);
1599
1600 if (sample_type & PERF_SAMPLE_CPU)
1601 size += sizeof(data->cpu_entry);
1602
1603 event->id_header_size = size;
1604 }
1605
1606 static bool perf_event_validate_size(struct perf_event *event)
1607 {
1608 /*
1609 * The values computed here will be over-written when we actually
1610 * attach the event.
1611 */
1612 __perf_event_read_size(event, event->group_leader->nr_siblings + 1);
1613 __perf_event_header_size(event, event->attr.sample_type & ~PERF_SAMPLE_READ);
1614 perf_event__id_header_size(event);
1615
1616 /*
1617 * Sum the lot; should not exceed the 64k limit we have on records.
1618 * Conservative limit to allow for callchains and other variable fields.
1619 */
1620 if (event->read_size + event->header_size +
1621 event->id_header_size + sizeof(struct perf_event_header) >= 16*1024)
1622 return false;
1623
1624 return true;
1625 }
1626
1627 static void perf_group_attach(struct perf_event *event)
1628 {
1629 struct perf_event *group_leader = event->group_leader, *pos;
1630
1631 lockdep_assert_held(&event->ctx->lock);
1632
1633 /*
1634 * We can have double attach due to group movement in perf_event_open.
1635 */
1636 if (event->attach_state & PERF_ATTACH_GROUP)
1637 return;
1638
1639 event->attach_state |= PERF_ATTACH_GROUP;
1640
1641 if (group_leader == event)
1642 return;
1643
1644 WARN_ON_ONCE(group_leader->ctx != event->ctx);
1645
1646 group_leader->group_caps &= event->event_caps;
1647
1648 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1649 group_leader->nr_siblings++;
1650
1651 perf_event__header_size(group_leader);
1652
1653 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1654 perf_event__header_size(pos);
1655 }
1656
1657 /*
1658 * Remove an event from the lists for its context.
1659 * Must be called with ctx->mutex and ctx->lock held.
1660 */
1661 static void
1662 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1663 {
1664 WARN_ON_ONCE(event->ctx != ctx);
1665 lockdep_assert_held(&ctx->lock);
1666
1667 /*
1668 * We can have double detach due to exit/hot-unplug + close.
1669 */
1670 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1671 return;
1672
1673 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1674
1675 list_update_cgroup_event(event, ctx, false);
1676
1677 ctx->nr_events--;
1678 if (event->attr.inherit_stat)
1679 ctx->nr_stat--;
1680
1681 list_del_rcu(&event->event_entry);
1682
1683 if (event->group_leader == event)
1684 list_del_init(&event->group_entry);
1685
1686 update_group_times(event);
1687
1688 /*
1689 * If event was in error state, then keep it
1690 * that way, otherwise bogus counts will be
1691 * returned on read(). The only way to get out
1692 * of error state is by explicit re-enabling
1693 * of the event
1694 */
1695 if (event->state > PERF_EVENT_STATE_OFF)
1696 event->state = PERF_EVENT_STATE_OFF;
1697
1698 ctx->generation++;
1699 }
1700
1701 static void perf_group_detach(struct perf_event *event)
1702 {
1703 struct perf_event *sibling, *tmp;
1704 struct list_head *list = NULL;
1705
1706 lockdep_assert_held(&event->ctx->lock);
1707
1708 /*
1709 * We can have double detach due to exit/hot-unplug + close.
1710 */
1711 if (!(event->attach_state & PERF_ATTACH_GROUP))
1712 return;
1713
1714 event->attach_state &= ~PERF_ATTACH_GROUP;
1715
1716 /*
1717 * If this is a sibling, remove it from its group.
1718 */
1719 if (event->group_leader != event) {
1720 list_del_init(&event->group_entry);
1721 event->group_leader->nr_siblings--;
1722 goto out;
1723 }
1724
1725 if (!list_empty(&event->group_entry))
1726 list = &event->group_entry;
1727
1728 /*
1729 * If this was a group event with sibling events then
1730 * upgrade the siblings to singleton events by adding them
1731 * to whatever list we are on.
1732 */
1733 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1734 if (list)
1735 list_move_tail(&sibling->group_entry, list);
1736 sibling->group_leader = sibling;
1737
1738 /* Inherit group flags from the previous leader */
1739 sibling->group_caps = event->group_caps;
1740
1741 WARN_ON_ONCE(sibling->ctx != event->ctx);
1742 }
1743
1744 out:
1745 perf_event__header_size(event->group_leader);
1746
1747 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1748 perf_event__header_size(tmp);
1749 }
1750
1751 static bool is_orphaned_event(struct perf_event *event)
1752 {
1753 return event->state == PERF_EVENT_STATE_DEAD;
1754 }
1755
1756 static inline int __pmu_filter_match(struct perf_event *event)
1757 {
1758 struct pmu *pmu = event->pmu;
1759 return pmu->filter_match ? pmu->filter_match(event) : 1;
1760 }
1761
1762 /*
1763 * Check whether we should attempt to schedule an event group based on
1764 * PMU-specific filtering. An event group can consist of HW and SW events,
1765 * potentially with a SW leader, so we must check all the filters, to
1766 * determine whether a group is schedulable:
1767 */
1768 static inline int pmu_filter_match(struct perf_event *event)
1769 {
1770 struct perf_event *child;
1771
1772 if (!__pmu_filter_match(event))
1773 return 0;
1774
1775 list_for_each_entry(child, &event->sibling_list, group_entry) {
1776 if (!__pmu_filter_match(child))
1777 return 0;
1778 }
1779
1780 return 1;
1781 }
1782
1783 static inline int
1784 event_filter_match(struct perf_event *event)
1785 {
1786 return (event->cpu == -1 || event->cpu == smp_processor_id()) &&
1787 perf_cgroup_match(event) && pmu_filter_match(event);
1788 }
1789
1790 static void
1791 event_sched_out(struct perf_event *event,
1792 struct perf_cpu_context *cpuctx,
1793 struct perf_event_context *ctx)
1794 {
1795 u64 tstamp = perf_event_time(event);
1796 u64 delta;
1797
1798 WARN_ON_ONCE(event->ctx != ctx);
1799 lockdep_assert_held(&ctx->lock);
1800
1801 /*
1802 * An event which could not be activated because of
1803 * filter mismatch still needs to have its timings
1804 * maintained, otherwise bogus information is returned
1805 * via read() for time_enabled, time_running:
1806 */
1807 if (event->state == PERF_EVENT_STATE_INACTIVE &&
1808 !event_filter_match(event)) {
1809 delta = tstamp - event->tstamp_stopped;
1810 event->tstamp_running += delta;
1811 event->tstamp_stopped = tstamp;
1812 }
1813
1814 if (event->state != PERF_EVENT_STATE_ACTIVE)
1815 return;
1816
1817 perf_pmu_disable(event->pmu);
1818
1819 event->tstamp_stopped = tstamp;
1820 event->pmu->del(event, 0);
1821 event->oncpu = -1;
1822 event->state = PERF_EVENT_STATE_INACTIVE;
1823 if (event->pending_disable) {
1824 event->pending_disable = 0;
1825 event->state = PERF_EVENT_STATE_OFF;
1826 }
1827
1828 if (!is_software_event(event))
1829 cpuctx->active_oncpu--;
1830 if (!--ctx->nr_active)
1831 perf_event_ctx_deactivate(ctx);
1832 if (event->attr.freq && event->attr.sample_freq)
1833 ctx->nr_freq--;
1834 if (event->attr.exclusive || !cpuctx->active_oncpu)
1835 cpuctx->exclusive = 0;
1836
1837 perf_pmu_enable(event->pmu);
1838 }
1839
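/*
 * Schedule out an entire group: the leader first, then every sibling,
 * with the PMU disabled around the whole operation so the group goes
 * off atomically from the PMU's point of view.
 */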
1840 static void
1841 group_sched_out(struct perf_event *group_event,
1842 struct perf_cpu_context *cpuctx,
1843 struct perf_event_context *ctx)
1844 {
1845 struct perf_event *event;
1846 int state = group_event->state;
1847
1848 perf_pmu_disable(ctx->pmu);
1849
1850 event_sched_out(group_event, cpuctx, ctx);
1851
1852 /*
1853 * Schedule out siblings (if any):
1854 */
1855 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1856 event_sched_out(event, cpuctx, ctx);
1857
1858 perf_pmu_enable(ctx->pmu);
1859
1860 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1861 cpuctx->exclusive = 0;
1862 }
1863
1864 #define DETACH_GROUP 0x01UL
1865
1866 /*
1867 * Cross CPU call to remove a performance event
1868 *
1869 * We disable the event on the hardware level first. After that we
1870 * remove it from the context list.
1871 */
1872 static void
1873 __perf_remove_from_context(struct perf_event *event,
1874 struct perf_cpu_context *cpuctx,
1875 struct perf_event_context *ctx,
1876 void *info)
1877 {
1878 unsigned long flags = (unsigned long)info;
1879
1880 event_sched_out(event, cpuctx, ctx);
1881 if (flags & DETACH_GROUP)
1882 perf_group_detach(event);
1883 list_del_event(event, ctx);
1884
1885 if (!ctx->nr_events && ctx->is_active) {
1886 ctx->is_active = 0;
1887 if (ctx->task) {
1888 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
1889 cpuctx->task_ctx = NULL;
1890 }
1891 }
1892 }
1893
1894 /*
1895 * Remove the event from a task's (or a CPU's) list of events.
1896 *
1897 * If event->ctx is a cloned context, callers must make sure that
1898 * every task struct that event->ctx->task could possibly point to
1899 * remains valid. This is OK when called from perf_release since
1900 * that only calls us on the top-level context, which can't be a clone.
1901 * When called from perf_event_exit_task, it's OK because the
1902 * context has been detached from its task.
1903 */
1904 static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
1905 {
1906 struct perf_event_context *ctx = event->ctx;
1907
1908 lockdep_assert_held(&ctx->mutex);
1909
1910 event_function_call(event, __perf_remove_from_context, (void *)flags);
1911
1912 /*
1913 * The above event_function_call() can NO-OP when it hits
1914 * TASK_TOMBSTONE. In that case we must already have been detached
1915 * from the context (by perf_event_exit_event()) but the grouping
1916 * might still be intact.
1917 */
1918 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1919 if ((flags & DETACH_GROUP) &&
1920 (event->attach_state & PERF_ATTACH_GROUP)) {
1921 /*
1922 * Since in that case we cannot possibly be scheduled, simply
1923 * detach now.
1924 */
1925 raw_spin_lock_irq(&ctx->lock);
1926 perf_group_detach(event);
1927 raw_spin_unlock_irq(&ctx->lock);
1928 }
1929 }
1930
1931 /*
1932 * Cross CPU call to disable a performance event
1933 */
1934 static void __perf_event_disable(struct perf_event *event,
1935 struct perf_cpu_context *cpuctx,
1936 struct perf_event_context *ctx,
1937 void *info)
1938 {
1939 if (event->state < PERF_EVENT_STATE_INACTIVE)
1940 return;
1941
1942 update_context_time(ctx);
1943 update_cgrp_time_from_event(event);
1944 update_group_times(event);
1945 if (event == event->group_leader)
1946 group_sched_out(event, cpuctx, ctx);
1947 else
1948 event_sched_out(event, cpuctx, ctx);
1949 event->state = PERF_EVENT_STATE_OFF;
1950 }
1951
1952 /*
1953 * Disable an event.
1954 *
1955 * If event->ctx is a cloned context, callers must make sure that
1956 * every task struct that event->ctx->task could possibly point to
1957 * remains valid. This condition is satisfied when called through
1958 * perf_event_for_each_child or perf_event_for_each because they
1959 * hold the top-level event's child_mutex, so any descendant that
1960 * goes to exit will block in perf_event_exit_event().
1961 *
1962 * When called from perf_pending_event it's OK because event->ctx
1963 * is the current context on this CPU and preemption is disabled,
1964 * hence we can't get into perf_event_task_sched_out for this context.
1965 */
1966 static void _perf_event_disable(struct perf_event *event)
1967 {
1968 struct perf_event_context *ctx = event->ctx;
1969
1970 raw_spin_lock_irq(&ctx->lock);
1971 if (event->state <= PERF_EVENT_STATE_OFF) {
1972 raw_spin_unlock_irq(&ctx->lock);
1973 return;
1974 }
1975 raw_spin_unlock_irq(&ctx->lock);
1976
1977 event_function_call(event, __perf_event_disable, NULL);
1978 }
1979
1980 void perf_event_disable_local(struct perf_event *event)
1981 {
1982 event_function_local(event, __perf_event_disable, NULL);
1983 }
1984
1985 /*
1986 * Strictly speaking kernel users cannot create groups and therefore this
1987 * interface does not need the perf_event_ctx_lock() magic.
1988 */
1989 void perf_event_disable(struct perf_event *event)
1990 {
1991 struct perf_event_context *ctx;
1992
1993 ctx = perf_event_ctx_lock(event);
1994 _perf_event_disable(event);
1995 perf_event_ctx_unlock(event, ctx);
1996 }
1997 EXPORT_SYMBOL_GPL(perf_event_disable);
1998
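/*
 * Request a disable from NMI/atomic context: flag the event as
 * pending_disable and let the queued irq_work complete the disable once
 * we are back in a context where that is safe.
 */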
1999 void perf_event_disable_inatomic(struct perf_event *event)
2000 {
2001 event->pending_disable = 1;
2002 irq_work_queue(&event->pending);
2003 }
2004
2005 static void perf_set_shadow_time(struct perf_event *event,
2006 struct perf_event_context *ctx,
2007 u64 tstamp)
2008 {
2009 /*
2010 * use the correct time source for the time snapshot
2011 *
2012 * We could get by without this by leveraging the
2013 * fact that to get to this function, the caller
2014 * has most likely already called update_context_time()
2015 * and update_cgrp_time_xx() and thus both timestamps
2016 * are identical (or very close). Given that tstamp is
2017 * already adjusted for cgroup, we could say that:
2018 * tstamp - ctx->timestamp
2019 * is equivalent to
2020 * tstamp - cgrp->timestamp.
2021 *
2022 * Then, in perf_output_read(), the calculation would
2023 * work with no changes because:
2024 * - event is guaranteed scheduled in
2025 * - no scheduled out in between
2026 * - thus the timestamp would be the same
2027 *
2028 * But this is a bit hairy.
2029 *
2030 * So instead, we have an explicit cgroup call to remain
2031 * within the right time source all along. We believe it
2032 * is cleaner and simpler to understand.
2033 */
2034 if (is_cgroup_event(event))
2035 perf_cgroup_set_shadow_time(event, tstamp);
2036 else
2037 event->shadow_ctx_time = tstamp - ctx->timestamp;
2038 }
2039
2040 #define MAX_INTERRUPTS (~0ULL)
2041
2042 static void perf_log_throttle(struct perf_event *event, int enable);
2043 static void perf_log_itrace_start(struct perf_event *event);
2044
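/*
 * Put a single event onto the PMU: mark it ACTIVE (ordering the
 * ->oncpu write before the state change), clear any throttling, and
 * call pmu->add(). On failure the event goes back to INACTIVE and
 * -EAGAIN is returned.
 */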
2045 static int
2046 event_sched_in(struct perf_event *event,
2047 struct perf_cpu_context *cpuctx,
2048 struct perf_event_context *ctx)
2049 {
2050 u64 tstamp = perf_event_time(event);
2051 int ret = 0;
2052
2053 lockdep_assert_held(&ctx->lock);
2054
2055 if (event->state <= PERF_EVENT_STATE_OFF)
2056 return 0;
2057
2058 WRITE_ONCE(event->oncpu, smp_processor_id());
2059 /*
2060 * Order event::oncpu write to happen before the ACTIVE state
2061 * is visible.
2062 */
2063 smp_wmb();
2064 WRITE_ONCE(event->state, PERF_EVENT_STATE_ACTIVE);
2065
2066 /*
2067 * Unthrottle events: since we just scheduled we might have missed several
2068 * ticks already; also, for a heavily scheduling task there is little
2069 * guarantee it'll get a tick in a timely manner.
2070 */
2071 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
2072 perf_log_throttle(event, 1);
2073 event->hw.interrupts = 0;
2074 }
2075
2076 /*
2077 * The new state must be visible before we turn it on in the hardware:
2078 */
2079 smp_wmb();
2080
2081 perf_pmu_disable(event->pmu);
2082
2083 perf_set_shadow_time(event, ctx, tstamp);
2084
2085 perf_log_itrace_start(event);
2086
2087 if (event->pmu->add(event, PERF_EF_START)) {
2088 event->state = PERF_EVENT_STATE_INACTIVE;
2089 event->oncpu = -1;
2090 ret = -EAGAIN;
2091 goto out;
2092 }
2093
2094 event->tstamp_running += tstamp - event->tstamp_stopped;
2095
2096 if (!is_software_event(event))
2097 cpuctx->active_oncpu++;
2098 if (!ctx->nr_active++)
2099 perf_event_ctx_activate(ctx);
2100 if (event->attr.freq && event->attr.sample_freq)
2101 ctx->nr_freq++;
2102
2103 if (event->attr.exclusive)
2104 cpuctx->exclusive = 1;
2105
2106 out:
2107 perf_pmu_enable(event->pmu);
2108
2109 return ret;
2110 }
2111
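/*
 * Schedule in a whole group as one transaction: the leader and every
 * sibling must go on the PMU together, otherwise the partial group is
 * rolled back (see the group_error path below) and -EAGAIN is returned.
 */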
2112 static int
2113 group_sched_in(struct perf_event *group_event,
2114 struct perf_cpu_context *cpuctx,
2115 struct perf_event_context *ctx)
2116 {
2117 struct perf_event *event, *partial_group = NULL;
2118 struct pmu *pmu = ctx->pmu;
2119 u64 now = ctx->time;
2120 bool simulate = false;
2121
2122 if (group_event->state == PERF_EVENT_STATE_OFF)
2123 return 0;
2124
2125 pmu->start_txn(pmu, PERF_PMU_TXN_ADD);
2126
2127 if (event_sched_in(group_event, cpuctx, ctx)) {
2128 pmu->cancel_txn(pmu);
2129 perf_mux_hrtimer_restart(cpuctx);
2130 return -EAGAIN;
2131 }
2132
2133 /*
2134 * Schedule in siblings as one group (if any):
2135 */
2136 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2137 if (event_sched_in(event, cpuctx, ctx)) {
2138 partial_group = event;
2139 goto group_error;
2140 }
2141 }
2142
2143 if (!pmu->commit_txn(pmu))
2144 return 0;
2145
2146 group_error:
2147 /*
2148 * Groups can be scheduled in as one unit only, so undo any
2149 * partial group before returning:
2150 * The events up to the failed event are scheduled out normally,
2151 * tstamp_stopped will be updated.
2152 *
2153 * The failed events and the remaining siblings need to have
2154 * their timings updated as if they had gone through event_sched_in()
2155 * and event_sched_out(). This is required to get consistent timings
2156 * across the group. This also takes care of the case where the group
2157 * could never be scheduled by ensuring tstamp_stopped is set to mark
2158 * the time the event was actually stopped, such that time delta
2159 * calculation in update_event_times() is correct.
2160 */
2161 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
2162 if (event == partial_group)
2163 simulate = true;
2164
2165 if (simulate) {
2166 event->tstamp_running += now - event->tstamp_stopped;
2167 event->tstamp_stopped = now;
2168 } else {
2169 event_sched_out(event, cpuctx, ctx);
2170 }
2171 }
2172 event_sched_out(group_event, cpuctx, ctx);
2173
2174 pmu->cancel_txn(pmu);
2175
2176 perf_mux_hrtimer_restart(cpuctx);
2177
2178 return -EAGAIN;
2179 }
2180
2181 /*
2182 * Work out whether we can put this event group on the CPU now.
2183 */
2184 static int group_can_go_on(struct perf_event *event,
2185 struct perf_cpu_context *cpuctx,
2186 int can_add_hw)
2187 {
2188 /*
2189 * Groups consisting entirely of software events can always go on.
2190 */
2191 if (event->group_caps & PERF_EV_CAP_SOFTWARE)
2192 return 1;
2193 /*
2194 * If an exclusive group is already on, no other hardware
2195 * events can go on.
2196 */
2197 if (cpuctx->exclusive)
2198 return 0;
2199 /*
2200 * If this group is exclusive and there are already
2201 * events on the CPU, it can't go on.
2202 */
2203 if (event->attr.exclusive && cpuctx->active_oncpu)
2204 return 0;
2205 /*
2206 * Otherwise, try to add it if all previous groups were able
2207 * to go on.
2208 */
2209 return can_add_hw;
2210 }
2211
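/*
 * Add an event to its context's lists, attach it to its group, and
 * initialize its enabled/running/stopped timestamps to "now".
 */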
2212 static void add_event_to_ctx(struct perf_event *event,
2213 struct perf_event_context *ctx)
2214 {
2215 u64 tstamp = perf_event_time(event);
2216
2217 list_add_event(event, ctx);
2218 perf_group_attach(event);
2219 event->tstamp_enabled = tstamp;
2220 event->tstamp_running = tstamp;
2221 event->tstamp_stopped = tstamp;
2222 }
2223
2224 static void ctx_sched_out(struct perf_event_context *ctx,
2225 struct perf_cpu_context *cpuctx,
2226 enum event_type_t event_type);
2227 static void
2228 ctx_sched_in(struct perf_event_context *ctx,
2229 struct perf_cpu_context *cpuctx,
2230 enum event_type_t event_type,
2231 struct task_struct *task);
2232
2233 static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
2234 struct perf_event_context *ctx)
2235 {
2236 if (!cpuctx->task_ctx)
2237 return;
2238
2239 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2240 return;
2241
2242 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2243 }
2244
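/*
 * Schedule events in with pinned groups before flexible ones, for both
 * the CPU context and (if present) the task context, so pinned events
 * get the best chance of going on the PMU.
 */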
2245 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2246 struct perf_event_context *ctx,
2247 struct task_struct *task)
2248 {
2249 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2250 if (ctx)
2251 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2252 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2253 if (ctx)
2254 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2255 }
2256
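/*
 * Reschedule everything on this CPU: schedule out the task context (if
 * any) and the CPU context, then schedule both back in so the
 * cpu pinned / task pinned / cpu flexible / task flexible priority
 * order is re-established.
 */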
2257 static void ctx_resched(struct perf_cpu_context *cpuctx,
2258 struct perf_event_context *task_ctx)
2259 {
2260 perf_pmu_disable(cpuctx->ctx.pmu);
2261 if (task_ctx)
2262 task_ctx_sched_out(cpuctx, task_ctx);
2263 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2264 perf_event_sched_in(cpuctx, task_ctx, current);
2265 perf_pmu_enable(cpuctx->ctx.pmu);
2266 }
2267
2268 /*
2269 * Cross CPU call to install and enable a performance event
2270 *
2271 * Very similar to remote_function() + event_function() but cannot assume that
2272 * things like ctx->is_active and cpuctx->task_ctx are set.
2273 */
2274 static int __perf_install_in_context(void *info)
2275 {
2276 struct perf_event *event = info;
2277 struct perf_event_context *ctx = event->ctx;
2278 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2279 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2280 bool reprogram = true;
2281 int ret = 0;
2282
2283 raw_spin_lock(&cpuctx->ctx.lock);
2284 if (ctx->task) {
2285 raw_spin_lock(&ctx->lock);
2286 task_ctx = ctx;
2287
2288 reprogram = (ctx->task == current);
2289
2290 /*
2291 * If the task is running, it must be running on this CPU,
2292 * otherwise we cannot reprogram things.
2293 *
2294 * If it's not running, we don't care; ctx->lock will
2295 * serialize against it becoming runnable.
2296 */
2297 if (task_curr(ctx->task) && !reprogram) {
2298 ret = -ESRCH;
2299 goto unlock;
2300 }
2301
2302 WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
2303 } else if (task_ctx) {
2304 raw_spin_lock(&task_ctx->lock);
2305 }
2306
2307 if (reprogram) {
2308 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2309 add_event_to_ctx(event, ctx);
2310 ctx_resched(cpuctx, task_ctx);
2311 } else {
2312 add_event_to_ctx(event, ctx);
2313 }
2314
2315 unlock:
2316 perf_ctx_unlock(cpuctx, task_ctx);
2317
2318 return ret;
2319 }
2320
2321 /*
2322 * Attach a performance event to a context.
2323 *
2324 * Very similar to event_function_call, see comment there.
2325 */
2326 static void
2327 perf_install_in_context(struct perf_event_context *ctx,
2328 struct perf_event *event,
2329 int cpu)
2330 {
2331 struct task_struct *task = READ_ONCE(ctx->task);
2332
2333 lockdep_assert_held(&ctx->mutex);
2334
2335 if (event->cpu != -1)
2336 event->cpu = cpu;
2337
2338 /*
2339 * Ensures that if we can observe event->ctx, both the event and ctx
2340 * will be 'complete'. See perf_iterate_sb_cpu().
2341 */
2342 smp_store_release(&event->ctx, ctx);
2343
2344 if (!task) {
2345 cpu_function_call(cpu, __perf_install_in_context, event);
2346 return;
2347 }
2348
2349 /*
2350 * Should not happen, we validate the ctx is still alive before calling.
2351 */
2352 if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
2353 return;
2354
2355 /*
2356 * Installing events is tricky because we cannot rely on ctx->is_active
2357 * to be set in case this is the nr_events 0 -> 1 transition.
2358 *
2359 * Instead we use task_curr(), which tells us if the task is running.
2360 * However, since we use task_curr() outside of rq::lock, we can race
2361 * against the actual state. This means the result can be wrong.
2362 *
2363 * If we get a false positive, we retry, this is harmless.
2364 *
2365 * If we get a false negative, things are complicated. If we are after
2366 * perf_event_context_sched_in() ctx::lock will serialize us, and the
2367 * value must be correct. If we're before, it doesn't matter since
2368 * perf_event_context_sched_in() will program the counter.
2369 *
2370 * However, this hinges on the remote context switch having observed
2371 * our task->perf_event_ctxp[] store, such that it will in fact take
2372 * ctx::lock in perf_event_context_sched_in().
2373 *
2374 * We do this by task_function_call(), if the IPI fails to hit the task
2375 * we know any future context switch of task must see the
2376 * perf_event_ctxp[] store.
2377 */
2378
2379 /*
2380 * This smp_mb() orders the task->perf_event_ctxp[] store with the
2381 * task_cpu() load, such that if the IPI then does not find the task
2382 * running, a future context switch of that task must observe the
2383 * store.
2384 */
2385 smp_mb();
2386 again:
2387 if (!task_function_call(task, __perf_install_in_context, event))
2388 return;
2389
2390 raw_spin_lock_irq(&ctx->lock);
2391 task = ctx->task;
2392 if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
2393 /*
2394 * Cannot happen because we already checked above (which also
2395 * cannot happen), and we hold ctx->mutex, which serializes us
2396 * against perf_event_exit_task_context().
2397 */
2398 raw_spin_unlock_irq(&ctx->lock);
2399 return;
2400 }
2401 /*
2402 * If the task is not running, ctx->lock will avoid it becoming so,
2403 * thus we can safely install the event.
2404 */
2405 if (task_curr(task)) {
2406 raw_spin_unlock_irq(&ctx->lock);
2407 goto again;
2408 }
2409 add_event_to_ctx(event, ctx);
2410 raw_spin_unlock_irq(&ctx->lock);
2411 }
2412
2413 /*
2414 * Put an event into inactive state and update time fields.
2415 * Enabling the leader of a group effectively enables all
2416 * the group members that aren't explicitly disabled, so we
2417 * have to update their ->tstamp_enabled also.
2418 * Note: this works for group members as well as group leaders
2419 * since the non-leader members' sibling_lists will be empty.
2420 */
2421 static void __perf_event_mark_enabled(struct perf_event *event)
2422 {
2423 struct perf_event *sub;
2424 u64 tstamp = perf_event_time(event);
2425
2426 event->state = PERF_EVENT_STATE_INACTIVE;
2427 event->tstamp_enabled = tstamp - event->total_time_enabled;
2428 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2429 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2430 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2431 }
2432 }
2433
2434 /*
2435 * Cross CPU call to enable a performance event
2436 */
2437 static void __perf_event_enable(struct perf_event *event,
2438 struct perf_cpu_context *cpuctx,
2439 struct perf_event_context *ctx,
2440 void *info)
2441 {
2442 struct perf_event *leader = event->group_leader;
2443 struct perf_event_context *task_ctx;
2444
2445 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2446 event->state <= PERF_EVENT_STATE_ERROR)
2447 return;
2448
2449 if (ctx->is_active)
2450 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
2451
2452 __perf_event_mark_enabled(event);
2453
2454 if (!ctx->is_active)
2455 return;
2456
2457 if (!event_filter_match(event)) {
2458 if (is_cgroup_event(event))
2459 perf_cgroup_defer_enabled(event);
2460 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2461 return;
2462 }
2463
2464 /*
2465 * If the event is in a group and isn't the group leader,
2466 * then don't put it on unless the group is on.
2467 */
2468 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
2469 ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
2470 return;
2471 }
2472
2473 task_ctx = cpuctx->task_ctx;
2474 if (ctx->task)
2475 WARN_ON_ONCE(task_ctx != ctx);
2476
2477 ctx_resched(cpuctx, task_ctx);
2478 }
2479
2480 /*
2481 * Enable an event.
2482 *
2483 * If event->ctx is a cloned context, callers must make sure that
2484 * every task struct that event->ctx->task could possibly point to
2485 * remains valid. This condition is satisfied when called through
2486 * perf_event_for_each_child or perf_event_for_each as described
2487 * for perf_event_disable.
2488 */
2489 static void _perf_event_enable(struct perf_event *event)
2490 {
2491 struct perf_event_context *ctx = event->ctx;
2492
2493 raw_spin_lock_irq(&ctx->lock);
2494 if (event->state >= PERF_EVENT_STATE_INACTIVE ||
2495 event->state < PERF_EVENT_STATE_ERROR) {
2496 raw_spin_unlock_irq(&ctx->lock);
2497 return;
2498 }
2499
2500 /*
2501 * If the event is in error state, clear that first.
2502 *
2503 * That way, if we see the event in error state below, we know that it
2504 * has gone back into error state, as distinct from the task having
2505 * been scheduled away before the cross-call arrived.
2506 */
2507 if (event->state == PERF_EVENT_STATE_ERROR)
2508 event->state = PERF_EVENT_STATE_OFF;
2509 raw_spin_unlock_irq(&ctx->lock);
2510
2511 event_function_call(event, __perf_event_enable, NULL);
2512 }
2513
2514 /*
2515 * See perf_event_disable();
2516 */
2517 void perf_event_enable(struct perf_event *event)
2518 {
2519 struct perf_event_context *ctx;
2520
2521 ctx = perf_event_ctx_lock(event);
2522 _perf_event_enable(event);
2523 perf_event_ctx_unlock(event, ctx);
2524 }
2525 EXPORT_SYMBOL_GPL(perf_event_enable);
2526
2527 struct stop_event_data {
2528 struct perf_event *event;
2529 unsigned int restart;
2530 };
2531
2532 static int __perf_event_stop(void *info)
2533 {
2534 struct stop_event_data *sd = info;
2535 struct perf_event *event = sd->event;
2536
2537 /* if it's already INACTIVE, do nothing */
2538 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2539 return 0;
2540
2541 /* matches smp_wmb() in event_sched_in() */
2542 smp_rmb();
2543
2544 /*
2545 * There is a window with interrupts enabled before we get here,
2546 * so we need to check again lest we try to stop another CPU's event.
2547 */
2548 if (READ_ONCE(event->oncpu) != smp_processor_id())
2549 return -EAGAIN;
2550
2551 event->pmu->stop(event, PERF_EF_UPDATE);
2552
2553 /*
2554 * May race with the actual stop (through perf_pmu_output_stop()),
2555 * but it is only used for events with AUX ring buffer, and such
2556 * events will refuse to restart because of rb::aux_mmap_count==0,
2557 * see comments in perf_aux_output_begin().
2558 *
2559 * Since this is happening on an event-local CPU, no trace is lost
2560 * while restarting.
2561 */
2562 if (sd->restart)
2563 event->pmu->start(event, 0);
2564
2565 return 0;
2566 }
2567
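/*
 * Stop (and optionally restart) an ACTIVE event on whatever CPU it is
 * currently running on; retry if the event migrates between reading
 * event->oncpu and the cross-call landing (-EAGAIN from
 * __perf_event_stop()).
 */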
2568 static int perf_event_stop(struct perf_event *event, int restart)
2569 {
2570 struct stop_event_data sd = {
2571 .event = event,
2572 .restart = restart,
2573 };
2574 int ret = 0;
2575
2576 do {
2577 if (READ_ONCE(event->state) != PERF_EVENT_STATE_ACTIVE)
2578 return 0;
2579
2580 /* matches smp_wmb() in event_sched_in() */
2581 smp_rmb();
2582
2583 /*
2584 * We only want to restart ACTIVE events, so if the event goes
2585 * inactive here (event->oncpu==-1), there's nothing more to do;
2586 * fall through with ret==-ENXIO.
2587 */
2588 ret = cpu_function_call(READ_ONCE(event->oncpu),
2589 __perf_event_stop, &sd);
2590 } while (ret == -EAGAIN);
2591
2592 return ret;
2593 }
2594
2595 /*
2596 * In order to contain the amount of racy and tricky code in the address filter
2597 * configuration management, it is a two-part process:
2598 *
2599 * (p1) when userspace mappings change as a result of (1) or (2) or (3) below,
2600 * we update the addresses of corresponding vmas in
2601 * event::addr_filters_offs array and bump the event::addr_filters_gen;
2602 * (p2) when an event is scheduled in (pmu::add), it calls
2603 * perf_event_addr_filters_sync() which calls pmu::addr_filters_sync()
2604 * if the generation has changed since the previous call.
2605 *
2606 * If (p1) happens while the event is active, we restart it to force (p2).
2607 *
2608 * (1) perf_addr_filters_apply(): adjusting filters' offsets based on
2609 * pre-existing mappings, called once when new filters arrive via SET_FILTER
2610 * ioctl;
2611 * (2) perf_addr_filters_adjust(): adjusting filters' offsets based on newly
2612 * registered mapping, called for every new mmap(), with mm::mmap_sem down
2613 * for reading;
2614 * (3) perf_event_addr_filters_exec(): clearing filters' offsets in the process
2615 * of exec.
2616 */
2617 void perf_event_addr_filters_sync(struct perf_event *event)
2618 {
2619 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
2620
2621 if (!has_addr_filter(event))
2622 return;
2623
2624 raw_spin_lock(&ifh->lock);
2625 if (event->addr_filters_gen != event->hw.addr_filters_gen) {
2626 event->pmu->addr_filters_sync(event);
2627 event->hw.addr_filters_gen = event->addr_filters_gen;
2628 }
2629 raw_spin_unlock(&ifh->lock);
2630 }
2631 EXPORT_SYMBOL_GPL(perf_event_addr_filters_sync);
2632
2633 static int _perf_event_refresh(struct perf_event *event, int refresh)
2634 {
2635 /*
2636 * not supported on inherited events
2637 */
2638 if (event->attr.inherit || !is_sampling_event(event))
2639 return -EINVAL;
2640
2641 atomic_add(refresh, &event->event_limit);
2642 _perf_event_enable(event);
2643
2644 return 0;
2645 }
2646
2647 /*
2648 * See perf_event_disable()
2649 */
2650 int perf_event_refresh(struct perf_event *event, int refresh)
2651 {
2652 struct perf_event_context *ctx;
2653 int ret;
2654
2655 ctx = perf_event_ctx_lock(event);
2656 ret = _perf_event_refresh(event, refresh);
2657 perf_event_ctx_unlock(event, ctx);
2658
2659 return ret;
2660 }
2661 EXPORT_SYMBOL_GPL(perf_event_refresh);
2662
2663 static void ctx_sched_out(struct perf_event_context *ctx,
2664 struct perf_cpu_context *cpuctx,
2665 enum event_type_t event_type)
2666 {
2667 int is_active = ctx->is_active;
2668 struct perf_event *event;
2669
2670 lockdep_assert_held(&ctx->lock);
2671
2672 if (likely(!ctx->nr_events)) {
2673 /*
2674 * See __perf_remove_from_context().
2675 */
2676 WARN_ON_ONCE(ctx->is_active);
2677 if (ctx->task)
2678 WARN_ON_ONCE(cpuctx->task_ctx);
2679 return;
2680 }
2681
2682 ctx->is_active &= ~event_type;
2683 if (!(ctx->is_active & EVENT_ALL))
2684 ctx->is_active = 0;
2685
2686 if (ctx->task) {
2687 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
2688 if (!ctx->is_active)
2689 cpuctx->task_ctx = NULL;
2690 }
2691
2692 /*
2693 * Always update time if it was set, not only when it changes.
2694 * Otherwise we can 'forget' to update time for any but the last
2695 * context we sched out. For example:
2696 *
2697 * ctx_sched_out(.event_type = EVENT_FLEXIBLE)
2698 * ctx_sched_out(.event_type = EVENT_PINNED)
2699 *
2700 * would only update time for the pinned events.
2701 */
2702 if (is_active & EVENT_TIME) {
2703 /* update (and stop) ctx time */
2704 update_context_time(ctx);
2705 update_cgrp_time_from_cpuctx(cpuctx);
2706 }
2707
2708 is_active ^= ctx->is_active; /* changed bits */
2709
2710 if (!ctx->nr_active || !(is_active & EVENT_ALL))
2711 return;
2712
2713 perf_pmu_disable(ctx->pmu);
2714 if (is_active & EVENT_PINNED) {
2715 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2716 group_sched_out(event, cpuctx, ctx);
2717 }
2718
2719 if (is_active & EVENT_FLEXIBLE) {
2720 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2721 group_sched_out(event, cpuctx, ctx);
2722 }
2723 perf_pmu_enable(ctx->pmu);
2724 }
2725
2726 /*
2727 * Test whether two contexts are equivalent, i.e. whether they have both been
2728 * cloned from the same version of the same context.
2729 *
2730 * Equivalence is measured using a generation number in the context that is
2731 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2732 * and list_del_event().
2733 */
2734 static int context_equiv(struct perf_event_context *ctx1,
2735 struct perf_event_context *ctx2)
2736 {
2737 lockdep_assert_held(&ctx1->lock);
2738 lockdep_assert_held(&ctx2->lock);
2739
2740 /* Pinning disables the swap optimization */
2741 if (ctx1->pin_count || ctx2->pin_count)
2742 return 0;
2743
2744 /* If ctx1 is the parent of ctx2 */
2745 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2746 return 1;
2747
2748 /* If ctx2 is the parent of ctx1 */
2749 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2750 return 1;
2751
2752 /*
2753 * If ctx1 and ctx2 have the same parent; we flatten the parent
2754 * hierarchy, see perf_event_init_context().
2755 */
2756 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2757 ctx1->parent_gen == ctx2->parent_gen)
2758 return 1;
2759
2760 /* Unmatched */
2761 return 0;
2762 }
2763
2764 static void __perf_event_sync_stat(struct perf_event *event,
2765 struct perf_event *next_event)
2766 {
2767 u64 value;
2768
2769 if (!event->attr.inherit_stat)
2770 return;
2771
2772 /*
2773 * Update the event value, we cannot use perf_event_read()
2774 * because we're in the middle of a context switch and have IRQs
2775 * disabled, which upsets smp_call_function_single(), however
2776 * we know the event must be on the current CPU, therefore we
2777 * don't need to use it.
2778 */
2779 switch (event->state) {
2780 case PERF_EVENT_STATE_ACTIVE:
2781 event->pmu->read(event);
2782 /* fall-through */
2783
2784 case PERF_EVENT_STATE_INACTIVE:
2785 update_event_times(event);
2786 break;
2787
2788 default:
2789 break;
2790 }
2791
2792 /*
2793 * In order to keep per-task stats reliable we need to flip the event
2794 * values when we flip the contexts.
2795 */
2796 value = local64_read(&next_event->count);
2797 value = local64_xchg(&event->count, value);
2798 local64_set(&next_event->count, value);
2799
2800 swap(event->total_time_enabled, next_event->total_time_enabled);
2801 swap(event->total_time_running, next_event->total_time_running);
2802
2803 /*
2804 * Since we swizzled the values, update the user visible data too.
2805 */
2806 perf_event_update_userpage(event);
2807 perf_event_update_userpage(next_event);
2808 }
2809
2810 static void perf_event_sync_stat(struct perf_event_context *ctx,
2811 struct perf_event_context *next_ctx)
2812 {
2813 struct perf_event *event, *next_event;
2814
2815 if (!ctx->nr_stat)
2816 return;
2817
2818 update_context_time(ctx);
2819
2820 event = list_first_entry(&ctx->event_list,
2821 struct perf_event, event_entry);
2822
2823 next_event = list_first_entry(&next_ctx->event_list,
2824 struct perf_event, event_entry);
2825
2826 while (&event->event_entry != &ctx->event_list &&
2827 &next_event->event_entry != &next_ctx->event_list) {
2828
2829 __perf_event_sync_stat(event, next_event);
2830
2831 event = list_next_entry(event, event_entry);
2832 next_event = list_next_entry(next_event, event_entry);
2833 }
2834 }
2835
2836 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2837 struct task_struct *next)
2838 {
2839 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2840 struct perf_event_context *next_ctx;
2841 struct perf_event_context *parent, *next_parent;
2842 struct perf_cpu_context *cpuctx;
2843 int do_switch = 1;
2844
2845 if (likely(!ctx))
2846 return;
2847
2848 cpuctx = __get_cpu_context(ctx);
2849 if (!cpuctx->task_ctx)
2850 return;
2851
2852 rcu_read_lock();
2853 next_ctx = next->perf_event_ctxp[ctxn];
2854 if (!next_ctx)
2855 goto unlock;
2856
2857 parent = rcu_dereference(ctx->parent_ctx);
2858 next_parent = rcu_dereference(next_ctx->parent_ctx);
2859
2860 /* If neither context has a parent context, they cannot be clones. */
2861 if (!parent && !next_parent)
2862 goto unlock;
2863
2864 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2865 /*
2866 * Looks like the two contexts are clones, so we might be
2867 * able to optimize the context switch. We lock both
2868 * contexts and check that they are clones under the
2869 * lock (including re-checking that neither has been
2870 * uncloned in the meantime). It doesn't matter which
2871 * order we take the locks because no other cpu could
2872 * be trying to lock both of these tasks.
2873 */
2874 raw_spin_lock(&ctx->lock);
2875 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2876 if (context_equiv(ctx, next_ctx)) {
2877 WRITE_ONCE(ctx->task, next);
2878 WRITE_ONCE(next_ctx->task, task);
2879
2880 swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
2881
2882 /*
2883 * RCU_INIT_POINTER here is safe because we've not
2884 * modified the ctx and the above modification of
2885 * ctx->task and ctx->task_ctx_data are immaterial
2886 * since those values are always verified under
2887 * ctx->lock which we're now holding.
2888 */
2889 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
2890 RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
2891
2892 do_switch = 0;
2893
2894 perf_event_sync_stat(ctx, next_ctx);
2895 }
2896 raw_spin_unlock(&next_ctx->lock);
2897 raw_spin_unlock(&ctx->lock);
2898 }
2899 unlock:
2900 rcu_read_unlock();
2901
2902 if (do_switch) {
2903 raw_spin_lock(&ctx->lock);
2904 task_ctx_sched_out(cpuctx, ctx);
2905 raw_spin_unlock(&ctx->lock);
2906 }
2907 }
2908
2909 static DEFINE_PER_CPU(struct list_head, sched_cb_list);
2910
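/*
 * Bookkeeping for PMUs that want the context-switch callback
 * (pmu::sched_task): a per-CPU usage count plus a per-CPU list of
 * cpuctx's that perf_pmu_sched_task() walks.
 */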
2911 void perf_sched_cb_dec(struct pmu *pmu)
2912 {
2913 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2914
2915 this_cpu_dec(perf_sched_cb_usages);
2916
2917 if (!--cpuctx->sched_cb_usage)
2918 list_del(&cpuctx->sched_cb_entry);
2919 }
2920
2921
2922 void perf_sched_cb_inc(struct pmu *pmu)
2923 {
2924 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2925
2926 if (!cpuctx->sched_cb_usage++)
2927 list_add(&cpuctx->sched_cb_entry, this_cpu_ptr(&sched_cb_list));
2928
2929 this_cpu_inc(perf_sched_cb_usages);
2930 }
2931
2932 /*
2933 * This function provides the context switch callback to the lower code
2934 * layer. It is invoked ONLY when the context switch callback is enabled.
2935 *
2936 * This callback is relevant even to per-cpu events; for example multi event
2937 * PEBS requires this to provide PID/TID information. This requires we flush
2938 * all queued PEBS records before we context switch to a new task.
2939 */
2940 static void perf_pmu_sched_task(struct task_struct *prev,
2941 struct task_struct *next,
2942 bool sched_in)
2943 {
2944 struct perf_cpu_context *cpuctx;
2945 struct pmu *pmu;
2946
2947 if (prev == next)
2948 return;
2949
2950 list_for_each_entry(cpuctx, this_cpu_ptr(&sched_cb_list), sched_cb_entry) {
2951 pmu = cpuctx->unique_pmu; /* software PMUs will not have sched_task */
2952
2953 if (WARN_ON_ONCE(!pmu->sched_task))
2954 continue;
2955
2956 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2957 perf_pmu_disable(pmu);
2958
2959 pmu->sched_task(cpuctx->task_ctx, sched_in);
2960
2961 perf_pmu_enable(pmu);
2962 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2963 }
2964 }
2965
2966 static void perf_event_switch(struct task_struct *task,
2967 struct task_struct *next_prev, bool sched_in);
2968
2969 #define for_each_task_context_nr(ctxn) \
2970 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2971
2972 /*
2973 * Called from scheduler to remove the events of the current task,
2974 * with interrupts disabled.
2975 *
2976 * We stop each event and update the event value in event->count.
2977 *
2978 * This does not protect us against NMI, but disable()
2979 * sets the disabled bit in the control field of event _before_
2980 * accessing the event control register. If an NMI hits, then it will
2981 * not restart the event.
2982 */
2983 void __perf_event_task_sched_out(struct task_struct *task,
2984 struct task_struct *next)
2985 {
2986 int ctxn;
2987
2988 if (__this_cpu_read(perf_sched_cb_usages))
2989 perf_pmu_sched_task(task, next, false);
2990
2991 if (atomic_read(&nr_switch_events))
2992 perf_event_switch(task, next, false);
2993
2994 for_each_task_context_nr(ctxn)
2995 perf_event_context_sched_out(task, ctxn, next);
2996
2997 /*
2998 * if cgroup events exist on this CPU, then we need
2999 * to check if we have to switch out PMU state.
3000 * cgroup events are system-wide mode only
3001 */
3002 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3003 perf_cgroup_sched_out(task, next);
3004 }
3005
3006 /*
3007 * Called with IRQs disabled
3008 */
3009 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
3010 enum event_type_t event_type)
3011 {
3012 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
3013 }
3014
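/*
 * Schedule in all pinned groups of a context; a pinned group that
 * cannot be put on the PMU is moved to ERROR state and will not be
 * tried again.
 */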
3015 static void
3016 ctx_pinned_sched_in(struct perf_event_context *ctx,
3017 struct perf_cpu_context *cpuctx)
3018 {
3019 struct perf_event *event;
3020
3021 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
3022 if (event->state <= PERF_EVENT_STATE_OFF)
3023 continue;
3024 if (!event_filter_match(event))
3025 continue;
3026
3027 /* may need to reset tstamp_enabled */
3028 if (is_cgroup_event(event))
3029 perf_cgroup_mark_enabled(event, ctx);
3030
3031 if (group_can_go_on(event, cpuctx, 1))
3032 group_sched_in(event, cpuctx, ctx);
3033
3034 /*
3035 * If this pinned group hasn't been scheduled,
3036 * put it in error state.
3037 */
3038 if (event->state == PERF_EVENT_STATE_INACTIVE) {
3039 update_group_times(event);
3040 event->state = PERF_EVENT_STATE_ERROR;
3041 }
3042 }
3043 }
3044
3045 static void
3046 ctx_flexible_sched_in(struct perf_event_context *ctx,
3047 struct perf_cpu_context *cpuctx)
3048 {
3049 struct perf_event *event;
3050 int can_add_hw = 1;
3051
3052 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
3053 /* Ignore events in OFF or ERROR state */
3054 if (event->state <= PERF_EVENT_STATE_OFF)
3055 continue;
3056 /*
3057 * Listen to the 'cpu' scheduling filter constraint
3058 * of events:
3059 */
3060 if (!event_filter_match(event))
3061 continue;
3062
3063 /* may need to reset tstamp_enabled */
3064 if (is_cgroup_event(event))
3065 perf_cgroup_mark_enabled(event, ctx);
3066
3067 if (group_can_go_on(event, cpuctx, can_add_hw)) {
3068 if (group_sched_in(event, cpuctx, ctx))
3069 can_add_hw = 0;
3070 }
3071 }
3072 }
3073
3074 static void
3075 ctx_sched_in(struct perf_event_context *ctx,
3076 struct perf_cpu_context *cpuctx,
3077 enum event_type_t event_type,
3078 struct task_struct *task)
3079 {
3080 int is_active = ctx->is_active;
3081 u64 now;
3082
3083 lockdep_assert_held(&ctx->lock);
3084
3085 if (likely(!ctx->nr_events))
3086 return;
3087
3088 ctx->is_active |= (event_type | EVENT_TIME);
3089 if (ctx->task) {
3090 if (!is_active)
3091 cpuctx->task_ctx = ctx;
3092 else
3093 WARN_ON_ONCE(cpuctx->task_ctx != ctx);
3094 }
3095
3096 is_active ^= ctx->is_active; /* changed bits */
3097
3098 if (is_active & EVENT_TIME) {
3099 /* start ctx time */
3100 now = perf_clock();
3101 ctx->timestamp = now;
3102 perf_cgroup_set_timestamp(task, ctx);
3103 }
3104
3105 /*
3106 * First go through the list and put on any pinned groups
3107 * in order to give them the best chance of going on.
3108 */
3109 if (is_active & EVENT_PINNED)
3110 ctx_pinned_sched_in(ctx, cpuctx);
3111
3112 /* Then walk through the lower prio flexible groups */
3113 if (is_active & EVENT_FLEXIBLE)
3114 ctx_flexible_sched_in(ctx, cpuctx);
3115 }
3116
3117 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
3118 enum event_type_t event_type,
3119 struct task_struct *task)
3120 {
3121 struct perf_event_context *ctx = &cpuctx->ctx;
3122
3123 ctx_sched_in(ctx, cpuctx, event_type, task);
3124 }
3125
3126 static void perf_event_context_sched_in(struct perf_event_context *ctx,
3127 struct task_struct *task)
3128 {
3129 struct perf_cpu_context *cpuctx;
3130
3131 cpuctx = __get_cpu_context(ctx);
3132 if (cpuctx->task_ctx == ctx)
3133 return;
3134
3135 perf_ctx_lock(cpuctx, ctx);
3136 perf_pmu_disable(ctx->pmu);
3137 /*
3138 * We want to keep the following priority order:
3139 * cpu pinned (that don't need to move), task pinned,
3140 * cpu flexible, task flexible.
3141 */
3142 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3143 perf_event_sched_in(cpuctx, ctx, task);
3144 perf_pmu_enable(ctx->pmu);
3145 perf_ctx_unlock(cpuctx, ctx);
3146 }
3147
3148 /*
3149 * Called from scheduler to add the events of the current task
3150 * with interrupts disabled.
3151 *
3152 * We restore the event value and then enable it.
3153 *
3154 * This does not protect us against NMI, but enable()
3155 * sets the enabled bit in the control field of event _before_
3156 * accessing the event control register. If an NMI hits, then it will
3157 * keep the event running.
3158 */
3159 void __perf_event_task_sched_in(struct task_struct *prev,
3160 struct task_struct *task)
3161 {
3162 struct perf_event_context *ctx;
3163 int ctxn;
3164
3165 /*
3166 * If cgroup events exist on this CPU, then we need to check if we have
3167 * to switch in PMU state; cgroup events are system-wide mode only.
3168 *
3169 * Since cgroup events are CPU events, we must schedule these in before
3170 * we schedule in the task events.
3171 */
3172 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
3173 perf_cgroup_sched_in(prev, task);
3174
3175 for_each_task_context_nr(ctxn) {
3176 ctx = task->perf_event_ctxp[ctxn];
3177 if (likely(!ctx))
3178 continue;
3179
3180 perf_event_context_sched_in(ctx, task);
3181 }
3182
3183 if (atomic_read(&nr_switch_events))
3184 perf_event_switch(task, prev, true);
3185
3186 if (__this_cpu_read(perf_sched_cb_usages))
3187 perf_pmu_sched_task(prev, task, true);
3188 }
3189
3190 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
3191 {
3192 u64 frequency = event->attr.sample_freq;
3193 u64 sec = NSEC_PER_SEC;
3194 u64 divisor, dividend;
3195
3196 int count_fls, nsec_fls, frequency_fls, sec_fls;
3197
3198 count_fls = fls64(count);
3199 nsec_fls = fls64(nsec);
3200 frequency_fls = fls64(frequency);
3201 sec_fls = 30;
3202
3203 /*
3204 * We got @count in @nsec, with a target of sample_freq HZ
3205 * the target period becomes:
3206 *
3207 * @count * 10^9
3208 * period = -------------------
3209 * @nsec * sample_freq
3210 *
3211 */
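/*
 * Worked example (illustrative): @count = 1,000,000 events observed
 * over @nsec = 10^9 ns with sample_freq = 1000 Hz gives
 * period = 10^6 * 10^9 / (10^9 * 1000) = 1000 events per sample.
 */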
3212
3213 /*
3214 * Reduce accuracy by one bit such that @a and @b converge
3215 * to a similar magnitude.
3216 */
3217 #define REDUCE_FLS(a, b) \
3218 do { \
3219 if (a##_fls > b##_fls) { \
3220 a >>= 1; \
3221 a##_fls--; \
3222 } else { \
3223 b >>= 1; \
3224 b##_fls--; \
3225 } \
3226 } while (0)
3227
3228 /*
3229 * Reduce accuracy until either term fits in a u64, then proceed with
3230 * the other, so that finally we can do a u64/u64 division.
3231 */
3232 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
3233 REDUCE_FLS(nsec, frequency);
3234 REDUCE_FLS(sec, count);
3235 }
3236
3237 if (count_fls + sec_fls > 64) {
3238 divisor = nsec * frequency;
3239
3240 while (count_fls + sec_fls > 64) {
3241 REDUCE_FLS(count, sec);
3242 divisor >>= 1;
3243 }
3244
3245 dividend = count * sec;
3246 } else {
3247 dividend = count * sec;
3248
3249 while (nsec_fls + frequency_fls > 64) {
3250 REDUCE_FLS(nsec, frequency);
3251 dividend >>= 1;
3252 }
3253
3254 divisor = nsec * frequency;
3255 }
3256
3257 if (!divisor)
3258 return dividend;
3259
3260 return div64_u64(dividend, divisor);
3261 }
3262
3263 static DEFINE_PER_CPU(int, perf_throttled_count);
3264 static DEFINE_PER_CPU(u64, perf_throttled_seq);
3265
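/*
 * Move the sampling period toward the value perf_calculate_period()
 * computed, but only by 1/8th of the difference per invocation (a
 * simple low-pass filter), and reset period_left when it is wildly
 * (more than 8 periods) out of range.
 */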
3266 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
3267 {
3268 struct hw_perf_event *hwc = &event->hw;
3269 s64 period, sample_period;
3270 s64 delta;
3271
3272 period = perf_calculate_period(event, nsec, count);
3273
3274 delta = (s64)(period - hwc->sample_period);
3275 delta = (delta + 7) / 8; /* low pass filter */
3276
3277 sample_period = hwc->sample_period + delta;
3278
3279 if (!sample_period)
3280 sample_period = 1;
3281
3282 hwc->sample_period = sample_period;
3283
3284 if (local64_read(&hwc->period_left) > 8*sample_period) {
3285 if (disable)
3286 event->pmu->stop(event, PERF_EF_UPDATE);
3287
3288 local64_set(&hwc->period_left, 0);
3289
3290 if (disable)
3291 event->pmu->start(event, PERF_EF_RELOAD);
3292 }
3293 }
3294
3295 /*
3296 * combine freq adjustment with unthrottling to avoid two passes over the
3297 * events. At the same time, make sure that having freq events does not change
3298 * the rate of unthrottling, as that would introduce bias.
3299 */
3300 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
3301 int needs_unthr)
3302 {
3303 struct perf_event *event;
3304 struct hw_perf_event *hwc;
3305 u64 now, period = TICK_NSEC;
3306 s64 delta;
3307
3308 /*
3309 * only need to iterate over all events iff:
3310 * - the context has events in frequency mode (needs freq adjust)
3311 * - there are events to unthrottle on this cpu
3312 */
3313 if (!(ctx->nr_freq || needs_unthr))
3314 return;
3315
3316 raw_spin_lock(&ctx->lock);
3317 perf_pmu_disable(ctx->pmu);
3318
3319 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
3320 if (event->state != PERF_EVENT_STATE_ACTIVE)
3321 continue;
3322
3323 if (!event_filter_match(event))
3324 continue;
3325
3326 perf_pmu_disable(event->pmu);
3327
3328 hwc = &event->hw;
3329
3330 if (hwc->interrupts == MAX_INTERRUPTS) {
3331 hwc->interrupts = 0;
3332 perf_log_throttle(event, 1);
3333 event->pmu->start(event, 0);
3334 }
3335
3336 if (!event->attr.freq || !event->attr.sample_freq)
3337 goto next;
3338
3339 /*
3340 * stop the event and update event->count
3341 */
3342 event->pmu->stop(event, PERF_EF_UPDATE);
3343
3344 now = local64_read(&event->count);
3345 delta = now - hwc->freq_count_stamp;
3346 hwc->freq_count_stamp = now;
3347
3348 /*
3349 * restart the event
3350 * reload only if value has changed
3351 * we have stopped the event so tell that
3352 * to perf_adjust_period() to avoid stopping it
3353 * twice.
3354 */
3355 if (delta > 0)
3356 perf_adjust_period(event, period, delta, false);
3357
3358 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3359 next:
3360 perf_pmu_enable(event->pmu);
3361 }
3362
3363 perf_pmu_enable(ctx->pmu);
3364 raw_spin_unlock(&ctx->lock);
3365 }
3366
3367 /*
3368 * Round-robin a context's events:
3369 */
3370 static void rotate_ctx(struct perf_event_context *ctx)
3371 {
3372 /*
3373 * Rotate the first entry of the non-pinned groups to the end of the list.
3374 * Rotation might be disabled by the inheritance code.
3375 */
3376 if (!ctx->rotate_disable)
3377 list_rotate_left(&ctx->flexible_groups);
3378 }
3379
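/*
 * Rotate flexible groups when not all events in a context could be made
 * active (nr_events != nr_active), so that multiplexed events each get
 * a turn on the PMU. Returns whether a rotation was performed.
 */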
3380 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3381 {
3382 struct perf_event_context *ctx = NULL;
3383 int rotate = 0;
3384
3385 if (cpuctx->ctx.nr_events) {
3386 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3387 rotate = 1;
3388 }
3389
3390 ctx = cpuctx->task_ctx;
3391 if (ctx && ctx->nr_events) {
3392 if (ctx->nr_events != ctx->nr_active)
3393 rotate = 1;
3394 }
3395
3396 if (!rotate)
3397 goto done;
3398
3399 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3400 perf_pmu_disable(cpuctx->ctx.pmu);
3401
3402 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3403 if (ctx)
3404 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3405
3406 rotate_ctx(&cpuctx->ctx);
3407 if (ctx)
3408 rotate_ctx(ctx);
3409
3410 perf_event_sched_in(cpuctx, ctx, current);
3411
3412 perf_pmu_enable(cpuctx->ctx.pmu);
3413 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3414 done:
3415
3416 return rotate;
3417 }
3418
3419 void perf_event_task_tick(void)
3420 {
3421 struct list_head *head = this_cpu_ptr(&active_ctx_list);
3422 struct perf_event_context *ctx, *tmp;
3423 int throttled;
3424
3425 WARN_ON(!irqs_disabled());
3426
3427 __this_cpu_inc(perf_throttled_seq);
3428 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3429 tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
3430
3431 list_for_each_entry_safe(ctx, tmp, head, active_ctx_list)
3432 perf_adjust_freq_unthr_context(ctx, throttled);
3433 }
3434
3435 static int event_enable_on_exec(struct perf_event *event,
3436 struct perf_event_context *ctx)
3437 {
3438 if (!event->attr.enable_on_exec)
3439 return 0;
3440
3441 event->attr.enable_on_exec = 0;
3442 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3443 return 0;
3444
3445 __perf_event_mark_enabled(event);
3446
3447 return 1;
3448 }
3449
3450 /*
3451 * Enable all of a task's events that have been marked enable-on-exec.
3452 * This expects task == current.
3453 */
3454 static void perf_event_enable_on_exec(int ctxn)
3455 {
3456 struct perf_event_context *ctx, *clone_ctx = NULL;
3457 struct perf_cpu_context *cpuctx;
3458 struct perf_event *event;
3459 unsigned long flags;
3460 int enabled = 0;
3461
3462 local_irq_save(flags);
3463 ctx = current->perf_event_ctxp[ctxn];
3464 if (!ctx || !ctx->nr_events)
3465 goto out;
3466
3467 cpuctx = __get_cpu_context(ctx);
3468 perf_ctx_lock(cpuctx, ctx);
3469 ctx_sched_out(ctx, cpuctx, EVENT_TIME);
3470 list_for_each_entry(event, &ctx->event_list, event_entry)
3471 enabled |= event_enable_on_exec(event, ctx);
3472
3473 /*
3474 * Unclone and reschedule this context if we enabled any event.
3475 */
3476 if (enabled) {
3477 clone_ctx = unclone_ctx(ctx);
3478 ctx_resched(cpuctx, ctx);
3479 }
3480 perf_ctx_unlock(cpuctx, ctx);
3481
3482 out:
3483 local_irq_restore(flags);
3484
3485 if (clone_ctx)
3486 put_ctx(clone_ctx);
3487 }
3488
3489 struct perf_read_data {
3490 struct perf_event *event;
3491 bool group;
3492 int ret;
3493 };
3494
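/*
 * For events that can be read from any CPU in the same package
 * (PERF_EV_CAP_READ_ACTIVE_PKG), prefer the local CPU to avoid an IPI;
 * otherwise read on the CPU the event is active on.
 */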
3495 static int __perf_event_read_cpu(struct perf_event *event, int event_cpu)
3496 {
3497 u16 local_pkg, event_pkg;
3498
3499 if (event->group_caps & PERF_EV_CAP_READ_ACTIVE_PKG) {
3500 int local_cpu = smp_processor_id();
3501
3502 event_pkg = topology_physical_package_id(event_cpu);
3503 local_pkg = topology_physical_package_id(local_cpu);
3504
3505 if (event_pkg == local_pkg)
3506 return local_cpu;
3507 }
3508
3509 return event_cpu;
3510 }
3511
3512 /*
3513 * Cross CPU call to read the hardware event
3514 */
3515 static void __perf_event_read(void *info)
3516 {
3517 struct perf_read_data *data = info;
3518 struct perf_event *sub, *event = data->event;
3519 struct perf_event_context *ctx = event->ctx;
3520 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3521 struct pmu *pmu = event->pmu;
3522
3523 /*
3524 * If this is a task context, we need to check whether it is
3525 * the current task context of this cpu. If not, it has been
3526 * scheduled out before the smp call arrived. In that case
3527 * event->count would have been updated to a recent sample
3528 * when the event was scheduled out.
3529 */
3530 if (ctx->task && cpuctx->task_ctx != ctx)
3531 return;
3532
3533 raw_spin_lock(&ctx->lock);
3534 if (ctx->is_active) {
3535 update_context_time(ctx);
3536 update_cgrp_time_from_event(event);
3537 }
3538
3539 update_event_times(event);
3540 if (event->state != PERF_EVENT_STATE_ACTIVE)
3541 goto unlock;
3542
3543 if (!data->group) {
3544 pmu->read(event);
3545 data->ret = 0;
3546 goto unlock;
3547 }
3548
3549 pmu->start_txn(pmu, PERF_PMU_TXN_READ);
3550
3551 pmu->read(event);
3552
3553 list_for_each_entry(sub, &event->sibling_list, group_entry) {
3554 update_event_times(sub);
3555 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
3556 /*
3557 * Use sibling's PMU rather than @event's since
3558 * sibling could be on different (eg: software) PMU.
3559 */
3560 sub->pmu->read(sub);
3561 }
3562 }
3563
3564 data->ret = pmu->commit_txn(pmu);
3565
3566 unlock:
3567 raw_spin_unlock(&ctx->lock);
3568 }
3569
3570 static inline u64 perf_event_count(struct perf_event *event)
3571 {
3572 if (event->pmu->count)
3573 return event->pmu->count(event);
3574
3575 return __perf_event_count(event);
3576 }
3577
3578 /*
3579 * NMI-safe method to read a local event, that is an event that
3580 * is:
3581 * - either for the current task, or for this CPU
3582 * - does not have inherit set, for inherited task events
3583 * will not be local and we cannot read them atomically
3584 * - must not have a pmu::count method
3585 */
3586 u64 perf_event_read_local(struct perf_event *event)
3587 {
3588 unsigned long flags;
3589 u64 val;
3590
3591 /*
3592 * Disabling interrupts avoids all counter scheduling (context
3593 * switches, timer based rotation and IPIs).
3594 */
3595 local_irq_save(flags);
3596
3597 /* If this is a per-task event, it must be for current */
3598 WARN_ON_ONCE((event->attach_state & PERF_ATTACH_TASK) &&
3599 event->hw.target != current);
3600
3601 /* If this is a per-CPU event, it must be for this CPU */
3602 WARN_ON_ONCE(!(event->attach_state & PERF_ATTACH_TASK) &&
3603 event->cpu != smp_processor_id());
3604
3605 /*
3606 * It must not be an event with inherit set, we cannot read
3607 * all child counters from atomic context.
3608 */
3609 WARN_ON_ONCE(event->attr.inherit);
3610
3611 /*
3612 * It must not have a pmu::count method, those are not
3613 * NMI safe.
3614 */
3615 WARN_ON_ONCE(event->pmu->count);
3616
3617 /*
3618 * If the event is currently on this CPU, it's either a per-task event,
3619 * or local to this CPU. Furthermore it means it's ACTIVE (otherwise
3620 * oncpu == -1).
3621 */
3622 if (event->oncpu == smp_processor_id())
3623 event->pmu->read(event);
3624
3625 val = local64_read(&event->count);
3626 local_irq_restore(flags);
3627
3628 return val;
3629 }
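/*
 * Illustrative sketch, not part of the original source: a minimal caller of
 * perf_event_read_local() from a restricted (e.g. tracing/NMI) context. The
 * helper name read_local_counter() is hypothetical; the constraints simply
 * mirror the WARN_ON_ONCE() checks above.
 *
 *	static u64 read_local_counter(struct perf_event *event)
 *	{
 *		// Assumes: event is bound to the current task or this CPU,
 *		// attr.inherit is clear, and the PMU has no ->count() method.
 *		return perf_event_read_local(event);
 *	}
 */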
3630
3631 static int perf_event_read(struct perf_event *event, bool group)
3632 {
3633 int event_cpu, ret = 0;
3634
3635 /*
3636 * If event is enabled and currently active on a CPU, update the
3637 * value in the event structure:
3638 */
3639 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3640 struct perf_read_data data = {
3641 .event = event,
3642 .group = group,
3643 .ret = 0,
3644 };
3645
3646 event_cpu = READ_ONCE(event->oncpu);
3647 if ((unsigned)event_cpu >= nr_cpu_ids)
3648 return 0;
3649
3650 preempt_disable();
3651 event_cpu = __perf_event_read_cpu(event, event_cpu);
3652
3653 /*
3654 * Purposely ignore the smp_call_function_single() return
3655 * value.
3656 *
3657 * If event_cpu isn't a valid CPU it means the event got
3658 * scheduled out and that will have updated the event count.
3659 *
3660 * Therefore, either way, we'll have an up-to-date event count
3661 * after this.
3662 */
3663 (void)smp_call_function_single(event_cpu, __perf_event_read, &data, 1);
3664 preempt_enable();
3665 ret = data.ret;
3666 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3667 struct perf_event_context *ctx = event->ctx;
3668 unsigned long flags;
3669
3670 raw_spin_lock_irqsave(&ctx->lock, flags);
3671 /*
3672 * may read while context is not active
3673 * (e.g., thread is blocked), in that case
3674 * we cannot update context time
3675 */
3676 if (ctx->is_active) {
3677 update_context_time(ctx);
3678 update_cgrp_time_from_event(event);
3679 }
3680 if (group)
3681 update_group_times(event);
3682 else
3683 update_event_times(event);
3684 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3685 }
3686
3687 return ret;
3688 }
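/*
 * Illustrative sketch, not part of the original source: how the two branches
 * above are typically exercised. An ACTIVE event is read via IPI on its
 * owning CPU, while an INACTIVE one is handled locally under ctx->lock.
 *
 *	int err;
 *
 *	err = perf_event_read(event, false);	// single counter
 *	err = perf_event_read(leader, true);	// leader + siblings via txn read
 */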
3689
3690 /*
3691 * Initialize the perf_event context in a task_struct:
3692 */
3693 static void __perf_event_init_context(struct perf_event_context *ctx)
3694 {
3695 raw_spin_lock_init(&ctx->lock);
3696 mutex_init(&ctx->mutex);
3697 INIT_LIST_HEAD(&ctx->active_ctx_list);
3698 INIT_LIST_HEAD(&ctx->pinned_groups);
3699 INIT_LIST_HEAD(&ctx->flexible_groups);
3700 INIT_LIST_HEAD(&ctx->event_list);
3701 atomic_set(&ctx->refcount, 1);
3702 }
3703
3704 static struct perf_event_context *
3705 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3706 {
3707 struct perf_event_context *ctx;
3708
3709 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3710 if (!ctx)
3711 return NULL;
3712
3713 __perf_event_init_context(ctx);
3714 if (task) {
3715 ctx->task = task;
3716 get_task_struct(task);
3717 }
3718 ctx->pmu = pmu;
3719
3720 return ctx;
3721 }
3722
3723 static struct task_struct *
3724 find_lively_task_by_vpid(pid_t vpid)
3725 {
3726 struct task_struct *task;
3727
3728 rcu_read_lock();
3729 if (!vpid)
3730 task = current;
3731 else
3732 task = find_task_by_vpid(vpid);
3733 if (task)
3734 get_task_struct(task);
3735 rcu_read_unlock();
3736
3737 if (!task)
3738 return ERR_PTR(-ESRCH);
3739
3740 return task;
3741 }
3742
3743 /*
3744 * Returns a matching context with refcount and pincount.
3745 */
3746 static struct perf_event_context *
3747 find_get_context(struct pmu *pmu, struct task_struct *task,
3748 struct perf_event *event)
3749 {
3750 struct perf_event_context *ctx, *clone_ctx = NULL;
3751 struct perf_cpu_context *cpuctx;
3752 void *task_ctx_data = NULL;
3753 unsigned long flags;
3754 int ctxn, err;
3755 int cpu = event->cpu;
3756
3757 if (!task) {
3758 /* Must be root to operate on a CPU event: */
3759 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3760 return ERR_PTR(-EACCES);
3761
3762 /*
3763 * We could be clever and allow attaching an event to an
3764 * offline CPU and activate it when the CPU comes up, but
3765 * that's for later.
3766 */
3767 if (!cpu_online(cpu))
3768 return ERR_PTR(-ENODEV);
3769
3770 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3771 ctx = &cpuctx->ctx;
3772 get_ctx(ctx);
3773 ++ctx->pin_count;
3774
3775 return ctx;
3776 }
3777
3778 err = -EINVAL;
3779 ctxn = pmu->task_ctx_nr;
3780 if (ctxn < 0)
3781 goto errout;
3782
3783 if (event->attach_state & PERF_ATTACH_TASK_DATA) {
3784 task_ctx_data = kzalloc(pmu->task_ctx_size, GFP_KERNEL);
3785 if (!task_ctx_data) {
3786 err = -ENOMEM;
3787 goto errout;
3788 }
3789 }
3790
3791 retry:
3792 ctx = perf_lock_task_context(task, ctxn, &flags);
3793 if (ctx) {
3794 clone_ctx = unclone_ctx(ctx);
3795 ++ctx->pin_count;
3796
3797 if (task_ctx_data && !ctx->task_ctx_data) {
3798 ctx->task_ctx_data = task_ctx_data;
3799 task_ctx_data = NULL;
3800 }
3801 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3802
3803 if (clone_ctx)
3804 put_ctx(clone_ctx);
3805 } else {
3806 ctx = alloc_perf_context(pmu, task);
3807 err = -ENOMEM;
3808 if (!ctx)
3809 goto errout;
3810
3811 if (task_ctx_data) {
3812 ctx->task_ctx_data = task_ctx_data;
3813 task_ctx_data = NULL;
3814 }
3815
3816 err = 0;
3817 mutex_lock(&task->perf_event_mutex);
3818 /*
3819 * If it has already passed perf_event_exit_task(),
3820 * we must see PF_EXITING; it takes this mutex too.
3821 */
3822 if (task->flags & PF_EXITING)
3823 err = -ESRCH;
3824 else if (task->perf_event_ctxp[ctxn])
3825 err = -EAGAIN;
3826 else {
3827 get_ctx(ctx);
3828 ++ctx->pin_count;
3829 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3830 }
3831 mutex_unlock(&task->perf_event_mutex);
3832
3833 if (unlikely(err)) {
3834 put_ctx(ctx);
3835
3836 if (err == -EAGAIN)
3837 goto retry;
3838 goto errout;
3839 }
3840 }
3841
3842 kfree(task_ctx_data);
3843 return ctx;
3844
3845 errout:
3846 kfree(task_ctx_data);
3847 return ERR_PTR(err);
3848 }
3849
3850 static void perf_event_free_filter(struct perf_event *event);
3851 static void perf_event_free_bpf_prog(struct perf_event *event);
3852
3853 static void free_event_rcu(struct rcu_head *head)
3854 {
3855 struct perf_event *event;
3856
3857 event = container_of(head, struct perf_event, rcu_head);
3858 if (event->ns)
3859 put_pid_ns(event->ns);
3860 perf_event_free_filter(event);
3861 kfree(event);
3862 }
3863
3864 static void ring_buffer_attach(struct perf_event *event,
3865 struct ring_buffer *rb);
3866
3867 static void detach_sb_event(struct perf_event *event)
3868 {
3869 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
3870
3871 raw_spin_lock(&pel->lock);
3872 list_del_rcu(&event->sb_list);
3873 raw_spin_unlock(&pel->lock);
3874 }
3875
3876 static bool is_sb_event(struct perf_event *event)
3877 {
3878 struct perf_event_attr *attr = &event->attr;
3879
3880 if (event->parent)
3881 return false;
3882
3883 if (event->attach_state & PERF_ATTACH_TASK)
3884 return false;
3885
3886 if (attr->mmap || attr->mmap_data || attr->mmap2 ||
3887 attr->comm || attr->comm_exec ||
3888 attr->task ||
3889 attr->context_switch)
3890 return true;
3891 return false;
3892 }
3893
3894 static void unaccount_pmu_sb_event(struct perf_event *event)
3895 {
3896 if (is_sb_event(event))
3897 detach_sb_event(event);
3898 }
3899
3900 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3901 {
3902 if (event->parent)
3903 return;
3904
3905 if (is_cgroup_event(event))
3906 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3907 }
3908
3909 #ifdef CONFIG_NO_HZ_FULL
3910 static DEFINE_SPINLOCK(nr_freq_lock);
3911 #endif
3912
3913 static void unaccount_freq_event_nohz(void)
3914 {
3915 #ifdef CONFIG_NO_HZ_FULL
3916 spin_lock(&nr_freq_lock);
3917 if (atomic_dec_and_test(&nr_freq_events))
3918 tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS);
3919 spin_unlock(&nr_freq_lock);
3920 #endif
3921 }
3922
3923 static void unaccount_freq_event(void)
3924 {
3925 if (tick_nohz_full_enabled())
3926 unaccount_freq_event_nohz();
3927 else
3928 atomic_dec(&nr_freq_events);
3929 }
3930
3931 static void unaccount_event(struct perf_event *event)
3932 {
3933 bool dec = false;
3934
3935 if (event->parent)
3936 return;
3937
3938 if (event->attach_state & PERF_ATTACH_TASK)
3939 dec = true;
3940 if (event->attr.mmap || event->attr.mmap_data)
3941 atomic_dec(&nr_mmap_events);
3942 if (event->attr.comm)
3943 atomic_dec(&nr_comm_events);
3944 if (event->attr.task)
3945 atomic_dec(&nr_task_events);
3946 if (event->attr.freq)
3947 unaccount_freq_event();
3948 if (event->attr.context_switch) {
3949 dec = true;
3950 atomic_dec(&nr_switch_events);
3951 }
3952 if (is_cgroup_event(event))
3953 dec = true;
3954 if (has_branch_stack(event))
3955 dec = true;
3956
3957 if (dec) {
3958 if (!atomic_add_unless(&perf_sched_count, -1, 1))
3959 schedule_delayed_work(&perf_sched_work, HZ);
3960 }
3961
3962 unaccount_event_cpu(event, event->cpu);
3963
3964 unaccount_pmu_sb_event(event);
3965 }
3966
3967 static void perf_sched_delayed(struct work_struct *work)
3968 {
3969 mutex_lock(&perf_sched_mutex);
3970 if (atomic_dec_and_test(&perf_sched_count))
3971 static_branch_disable(&perf_sched_events);
3972 mutex_unlock(&perf_sched_mutex);
3973 }
3974
3975 /*
3976 * The following implement mutual exclusion of events on "exclusive" pmus
3977 * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
3978 * at a time, so we disallow creating events that might conflict, namely:
3979 *
3980 * 1) cpu-wide events in the presence of per-task events,
3981 * 2) per-task events in the presence of cpu-wide events,
3982 * 3) two matching events on the same context.
3983 *
3984 * The former two cases are handled in the allocation path (perf_event_alloc(),
3985 * _free_event()), the latter -- before the first perf_install_in_context().
3986 */
3987 static int exclusive_event_init(struct perf_event *event)
3988 {
3989 struct pmu *pmu = event->pmu;
3990
3991 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
3992 return 0;
3993
3994 /*
3995 * Prevent co-existence of per-task and cpu-wide events on the
3996 * same exclusive pmu.
3997 *
3998 * Negative pmu::exclusive_cnt means there are cpu-wide
3999 * events on this "exclusive" pmu, positive means there are
4000 * per-task events.
4001 *
4002 * Since this is called in perf_event_alloc() path, event::ctx
4003 * doesn't exist yet; it is, however, safe to use PERF_ATTACH_TASK
4004 * to mean "per-task event", because unlike other attach states it
4005 * never gets cleared.
4006 */
4007 if (event->attach_state & PERF_ATTACH_TASK) {
4008 if (!atomic_inc_unless_negative(&pmu->exclusive_cnt))
4009 return -EBUSY;
4010 } else {
4011 if (!atomic_dec_unless_positive(&pmu->exclusive_cnt))
4012 return -EBUSY;
4013 }
4014
4015 return 0;
4016 }
4017
4018 static void exclusive_event_destroy(struct perf_event *event)
4019 {
4020 struct pmu *pmu = event->pmu;
4021
4022 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4023 return;
4024
4025 /* see comment in exclusive_event_init() */
4026 if (event->attach_state & PERF_ATTACH_TASK)
4027 atomic_dec(&pmu->exclusive_cnt);
4028 else
4029 atomic_inc(&pmu->exclusive_cnt);
4030 }
4031
4032 static bool exclusive_event_match(struct perf_event *e1, struct perf_event *e2)
4033 {
4034 if ((e1->pmu == e2->pmu) &&
4035 (e1->cpu == e2->cpu ||
4036 e1->cpu == -1 ||
4037 e2->cpu == -1))
4038 return true;
4039 return false;
4040 }
4041
4042 /* Called under the same ctx::mutex as perf_install_in_context() */
4043 static bool exclusive_event_installable(struct perf_event *event,
4044 struct perf_event_context *ctx)
4045 {
4046 struct perf_event *iter_event;
4047 struct pmu *pmu = event->pmu;
4048
4049 if (!(pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE))
4050 return true;
4051
4052 list_for_each_entry(iter_event, &ctx->event_list, event_entry) {
4053 if (exclusive_event_match(iter_event, event))
4054 return false;
4055 }
4056
4057 return true;
4058 }
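/*
 * Illustrative worked example, not part of the original source, of the
 * pmu->exclusive_cnt sign convention used above on a freshly registered
 * exclusive PMU:
 *
 *	exclusive_cnt == 0	no events yet
 *	per-task event		0 -> +1  (atomic_inc_unless_negative)
 *	another per-task event	+1 -> +2
 *	cpu-wide event		-EBUSY   (atomic_dec_unless_positive fails)
 *
 * Symmetrically, cpu-wide events drive the counter negative and keep
 * per-task events out until exclusive_event_destroy() brings it back to 0.
 */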
4059
4060 static void perf_addr_filters_splice(struct perf_event *event,
4061 struct list_head *head);
4062
4063 static void _free_event(struct perf_event *event)
4064 {
4065 irq_work_sync(&event->pending);
4066
4067 unaccount_event(event);
4068
4069 if (event->rb) {
4070 /*
4071 * Can happen when we close an event with re-directed output.
4072 *
4073 * Since we have a 0 refcount, perf_mmap_close() will skip
4074 * over us; possibly making our ring_buffer_put() the last.
4075 */
4076 mutex_lock(&event->mmap_mutex);
4077 ring_buffer_attach(event, NULL);
4078 mutex_unlock(&event->mmap_mutex);
4079 }
4080
4081 if (is_cgroup_event(event))
4082 perf_detach_cgroup(event);
4083
4084 if (!event->parent) {
4085 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
4086 put_callchain_buffers();
4087 }
4088
4089 perf_event_free_bpf_prog(event);
4090 perf_addr_filters_splice(event, NULL);
4091 kfree(event->addr_filters_offs);
4092
4093 if (event->destroy)
4094 event->destroy(event);
4095
4096 if (event->ctx)
4097 put_ctx(event->ctx);
4098
4099 if (event->hw.target)
4100 put_task_struct(event->hw.target);
4101
4102 exclusive_event_destroy(event);
4103 module_put(event->pmu->module);
4104
4105 call_rcu(&event->rcu_head, free_event_rcu);
4106 }
4107
4108 /*
4109 * Used to free events which have a known refcount of 1, such as in error paths
4110 * where the event isn't exposed yet, and for inherited events.
4111 */
4112 static void free_event(struct perf_event *event)
4113 {
4114 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
4115 "unexpected event refcount: %ld; ptr=%p\n",
4116 atomic_long_read(&event->refcount), event)) {
4117 /* leak to avoid use-after-free */
4118 return;
4119 }
4120
4121 _free_event(event);
4122 }
4123
4124 /*
4125 * Remove user event from the owner task.
4126 */
4127 static void perf_remove_from_owner(struct perf_event *event)
4128 {
4129 struct task_struct *owner;
4130
4131 rcu_read_lock();
4132 /*
4133 * Matches the smp_store_release() in perf_event_exit_task(). If we
4134 * observe !owner it means the list deletion is complete and we can
4135 * indeed free this event, otherwise we need to serialize on
4136 * owner->perf_event_mutex.
4137 */
4138 owner = lockless_dereference(event->owner);
4139 if (owner) {
4140 /*
4141 * Since delayed_put_task_struct() also drops the last
4142 * task reference we can safely take a new reference
4143 * while holding the rcu_read_lock().
4144 */
4145 get_task_struct(owner);
4146 }
4147 rcu_read_unlock();
4148
4149 if (owner) {
4150 /*
4151 * If we're here through perf_event_exit_task() we're already
4152 * holding ctx->mutex which would be an inversion wrt. the
4153 * normal lock order.
4154 *
4155 * However, we can safely take this lock because it's the child
4156 * ctx->mutex.
4157 */
4158 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
4159
4160 /*
4161 * We have to re-check the event->owner field, if it is cleared
4162 * we raced with perf_event_exit_task(), acquiring the mutex
4163 * ensured they're done, and we can proceed with freeing the
4164 * event.
4165 */
4166 if (event->owner) {
4167 list_del_init(&event->owner_entry);
4168 smp_store_release(&event->owner, NULL);
4169 }
4170 mutex_unlock(&owner->perf_event_mutex);
4171 put_task_struct(owner);
4172 }
4173 }
4174
4175 static void put_event(struct perf_event *event)
4176 {
4177 if (!atomic_long_dec_and_test(&event->refcount))
4178 return;
4179
4180 _free_event(event);
4181 }
4182
4183 /*
4184 * Kill an event dead; while event:refcount will preserve the event
4185 * object, it will not preserve its functionality. Once the last 'user'
4186 * gives up the object, we'll destroy the thing.
4187 */
4188 int perf_event_release_kernel(struct perf_event *event)
4189 {
4190 struct perf_event_context *ctx = event->ctx;
4191 struct perf_event *child, *tmp;
4192
4193 /*
4194 * If we got here through err_file: fput(event_file); we will not have
4195 * attached to a context yet.
4196 */
4197 if (!ctx) {
4198 WARN_ON_ONCE(event->attach_state &
4199 (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
4200 goto no_ctx;
4201 }
4202
4203 if (!is_kernel_event(event))
4204 perf_remove_from_owner(event);
4205
4206 ctx = perf_event_ctx_lock(event);
4207 WARN_ON_ONCE(ctx->parent_ctx);
4208 perf_remove_from_context(event, DETACH_GROUP);
4209
4210 raw_spin_lock_irq(&ctx->lock);
4211 /*
4212 * Mark this event as STATE_DEAD, there is no external reference to it
4213 * anymore.
4214 *
4215 * Anybody acquiring event->child_mutex after the below loop _must_
4216 * also see this, most importantly inherit_event() which will avoid
4217 * placing more children on the list.
4218 *
4219 * Thus this guarantees that we will in fact observe and kill _ALL_
4220 * child events.
4221 */
4222 event->state = PERF_EVENT_STATE_DEAD;
4223 raw_spin_unlock_irq(&ctx->lock);
4224
4225 perf_event_ctx_unlock(event, ctx);
4226
4227 again:
4228 mutex_lock(&event->child_mutex);
4229 list_for_each_entry(child, &event->child_list, child_list) {
4230
4231 /*
4232 * Cannot change, child events are not migrated, see the
4233 * comment with perf_event_ctx_lock_nested().
4234 */
4235 ctx = lockless_dereference(child->ctx);
4236 /*
4237 * Since child_mutex nests inside ctx::mutex, we must jump
4238 * through hoops. We start by grabbing a reference on the ctx.
4239 *
4240 * Since the event cannot get freed while we hold the
4241 * child_mutex, the context must also exist and have a !0
4242 * reference count.
4243 */
4244 get_ctx(ctx);
4245
4246 /*
4247 * Now that we have a ctx ref, we can drop child_mutex, and
4248 * acquire ctx::mutex without fear of it going away. Then we
4249 * can re-acquire child_mutex.
4250 */
4251 mutex_unlock(&event->child_mutex);
4252 mutex_lock(&ctx->mutex);
4253 mutex_lock(&event->child_mutex);
4254
4255 /*
4256 * Now that we hold ctx::mutex and child_mutex, revalidate our
4257 * state, if child is still the first entry, it didn't get freed
4258 * and we can continue doing so.
4259 */
4260 tmp = list_first_entry_or_null(&event->child_list,
4261 struct perf_event, child_list);
4262 if (tmp == child) {
4263 perf_remove_from_context(child, DETACH_GROUP);
4264 list_del(&child->child_list);
4265 free_event(child);
4266 /*
4267 * This matches the refcount bump in inherit_event();
4268 * this can't be the last reference.
4269 */
4270 put_event(event);
4271 }
4272
4273 mutex_unlock(&event->child_mutex);
4274 mutex_unlock(&ctx->mutex);
4275 put_ctx(ctx);
4276 goto again;
4277 }
4278 mutex_unlock(&event->child_mutex);
4279
4280 no_ctx:
4281 put_event(event); /* Must be the 'last' reference */
4282 return 0;
4283 }
4284 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
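/*
 * Illustrative sketch, not part of the original source: in-kernel users pair
 * this export with perf_event_create_kernel_counter() rather than going
 * through the fd lifetime, roughly:
 *
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		...
 *		perf_event_release_kernel(ev);
 *	}
 */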
4285
4286 /*
4287 * Called when the last reference to the file is gone.
4288 */
4289 static int perf_release(struct inode *inode, struct file *file)
4290 {
4291 perf_event_release_kernel(file->private_data);
4292 return 0;
4293 }
4294
4295 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
4296 {
4297 struct perf_event *child;
4298 u64 total = 0;
4299
4300 *enabled = 0;
4301 *running = 0;
4302
4303 mutex_lock(&event->child_mutex);
4304
4305 (void)perf_event_read(event, false);
4306 total += perf_event_count(event);
4307
4308 *enabled += event->total_time_enabled +
4309 atomic64_read(&event->child_total_time_enabled);
4310 *running += event->total_time_running +
4311 atomic64_read(&event->child_total_time_running);
4312
4313 list_for_each_entry(child, &event->child_list, child_list) {
4314 (void)perf_event_read(child, false);
4315 total += perf_event_count(child);
4316 *enabled += child->total_time_enabled;
4317 *running += child->total_time_running;
4318 }
4319 mutex_unlock(&event->child_mutex);
4320
4321 return total;
4322 }
4323 EXPORT_SYMBOL_GPL(perf_event_read_value);
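/*
 * Illustrative sketch, not part of the original source: a kernel-side reader
 * can use the exported helper to get a scaled estimate for a possibly
 * multiplexed counter:
 *
 *	u64 enabled, running;
 *	u64 count = perf_event_read_value(event, &enabled, &running);
 *
 *	if (running && running < enabled)
 *		count = div64_u64(count * enabled, running);	// scale estimate
 */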
4324
4325 static int __perf_read_group_add(struct perf_event *leader,
4326 u64 read_format, u64 *values)
4327 {
4328 struct perf_event *sub;
4329 int n = 1; /* skip @nr */
4330 int ret;
4331
4332 ret = perf_event_read(leader, true);
4333 if (ret)
4334 return ret;
4335
4336 /*
4337 * Since we co-schedule groups, {enabled,running} times of siblings
4338 * will be identical to those of the leader, so we only publish one
4339 * set.
4340 */
4341 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4342 values[n++] += leader->total_time_enabled +
4343 atomic64_read(&leader->child_total_time_enabled);
4344 }
4345
4346 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4347 values[n++] += leader->total_time_running +
4348 atomic64_read(&leader->child_total_time_running);
4349 }
4350
4351 /*
4352 * Write {count,id} tuples for every sibling.
4353 */
4354 values[n++] += perf_event_count(leader);
4355 if (read_format & PERF_FORMAT_ID)
4356 values[n++] = primary_event_id(leader);
4357
4358 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4359 values[n++] += perf_event_count(sub);
4360 if (read_format & PERF_FORMAT_ID)
4361 values[n++] = primary_event_id(sub);
4362 }
4363
4364 return 0;
4365 }
4366
4367 static int perf_read_group(struct perf_event *event,
4368 u64 read_format, char __user *buf)
4369 {
4370 struct perf_event *leader = event->group_leader, *child;
4371 struct perf_event_context *ctx = leader->ctx;
4372 int ret;
4373 u64 *values;
4374
4375 lockdep_assert_held(&ctx->mutex);
4376
4377 values = kzalloc(event->read_size, GFP_KERNEL);
4378 if (!values)
4379 return -ENOMEM;
4380
4381 values[0] = 1 + leader->nr_siblings;
4382
4383 /*
4384 * By locking the child_mutex of the leader we effectively
4385 * lock the child list of all siblings.. XXX explain how.
4386 */
4387 mutex_lock(&leader->child_mutex);
4388
4389 ret = __perf_read_group_add(leader, read_format, values);
4390 if (ret)
4391 goto unlock;
4392
4393 list_for_each_entry(child, &leader->child_list, child_list) {
4394 ret = __perf_read_group_add(child, read_format, values);
4395 if (ret)
4396 goto unlock;
4397 }
4398
4399 mutex_unlock(&leader->child_mutex);
4400
4401 ret = event->read_size;
4402 if (copy_to_user(buf, values, event->read_size))
4403 ret = -EFAULT;
4404 goto out;
4405
4406 unlock:
4407 mutex_unlock(&leader->child_mutex);
4408 out:
4409 kfree(values);
4410 return ret;
4411 }
4412
4413 static int perf_read_one(struct perf_event *event,
4414 u64 read_format, char __user *buf)
4415 {
4416 u64 enabled, running;
4417 u64 values[4];
4418 int n = 0;
4419
4420 values[n++] = perf_event_read_value(event, &enabled, &running);
4421 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4422 values[n++] = enabled;
4423 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4424 values[n++] = running;
4425 if (read_format & PERF_FORMAT_ID)
4426 values[n++] = primary_event_id(event);
4427
4428 if (copy_to_user(buf, values, n * sizeof(u64)))
4429 return -EFAULT;
4430
4431 return n * sizeof(u64);
4432 }
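/*
 * Illustrative sketch, not part of the original source: the buffer written
 * above matches what userspace sees from read(2) on a non-grouped event,
 * roughly this layout (fields present only when the corresponding
 * read_format bit is set):
 *
 *	struct {
 *		__u64 value;
 *		__u64 time_enabled;	// PERF_FORMAT_TOTAL_TIME_ENABLED
 *		__u64 time_running;	// PERF_FORMAT_TOTAL_TIME_RUNNING
 *		__u64 id;		// PERF_FORMAT_ID
 *	};
 */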
4433
4434 static bool is_event_hup(struct perf_event *event)
4435 {
4436 bool no_children;
4437
4438 if (event->state > PERF_EVENT_STATE_EXIT)
4439 return false;
4440
4441 mutex_lock(&event->child_mutex);
4442 no_children = list_empty(&event->child_list);
4443 mutex_unlock(&event->child_mutex);
4444 return no_children;
4445 }
4446
4447 /*
4448 * Read the performance event - simple non blocking version for now
4449 */
4450 static ssize_t
4451 __perf_read(struct perf_event *event, char __user *buf, size_t count)
4452 {
4453 u64 read_format = event->attr.read_format;
4454 int ret;
4455
4456 /*
4457 * Return end-of-file for a read on an event that is in
4458 * error state (i.e. because it was pinned but it couldn't be
4459 * scheduled on to the CPU at some point).
4460 */
4461 if (event->state == PERF_EVENT_STATE_ERROR)
4462 return 0;
4463
4464 if (count < event->read_size)
4465 return -ENOSPC;
4466
4467 WARN_ON_ONCE(event->ctx->parent_ctx);
4468 if (read_format & PERF_FORMAT_GROUP)
4469 ret = perf_read_group(event, read_format, buf);
4470 else
4471 ret = perf_read_one(event, read_format, buf);
4472
4473 return ret;
4474 }
4475
4476 static ssize_t
4477 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
4478 {
4479 struct perf_event *event = file->private_data;
4480 struct perf_event_context *ctx;
4481 int ret;
4482
4483 ctx = perf_event_ctx_lock(event);
4484 ret = __perf_read(event, buf, count);
4485 perf_event_ctx_unlock(event, ctx);
4486
4487 return ret;
4488 }
4489
4490 static unsigned int perf_poll(struct file *file, poll_table *wait)
4491 {
4492 struct perf_event *event = file->private_data;
4493 struct ring_buffer *rb;
4494 unsigned int events = POLLHUP;
4495
4496 poll_wait(file, &event->waitq, wait);
4497
4498 if (is_event_hup(event))
4499 return events;
4500
4501 /*
4502 * Pin the event->rb by taking event->mmap_mutex; otherwise
4503 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
4504 */
4505 mutex_lock(&event->mmap_mutex);
4506 rb = event->rb;
4507 if (rb)
4508 events = atomic_xchg(&rb->poll, 0);
4509 mutex_unlock(&event->mmap_mutex);
4510 return events;
4511 }
4512
4513 static void _perf_event_reset(struct perf_event *event)
4514 {
4515 (void)perf_event_read(event, false);
4516 local64_set(&event->count, 0);
4517 perf_event_update_userpage(event);
4518 }
4519
4520 /*
4521 * Holding the top-level event's child_mutex means that any
4522 * descendant process that has inherited this event will block
4523 * in perf_event_exit_event() if it goes to exit, thus satisfying the
4524 * task existence requirements of perf_event_enable/disable.
4525 */
4526 static void perf_event_for_each_child(struct perf_event *event,
4527 void (*func)(struct perf_event *))
4528 {
4529 struct perf_event *child;
4530
4531 WARN_ON_ONCE(event->ctx->parent_ctx);
4532
4533 mutex_lock(&event->child_mutex);
4534 func(event);
4535 list_for_each_entry(child, &event->child_list, child_list)
4536 func(child);
4537 mutex_unlock(&event->child_mutex);
4538 }
4539
4540 static void perf_event_for_each(struct perf_event *event,
4541 void (*func)(struct perf_event *))
4542 {
4543 struct perf_event_context *ctx = event->ctx;
4544 struct perf_event *sibling;
4545
4546 lockdep_assert_held(&ctx->mutex);
4547
4548 event = event->group_leader;
4549
4550 perf_event_for_each_child(event, func);
4551 list_for_each_entry(sibling, &event->sibling_list, group_entry)
4552 perf_event_for_each_child(sibling, func);
4553 }
4554
4555 static void __perf_event_period(struct perf_event *event,
4556 struct perf_cpu_context *cpuctx,
4557 struct perf_event_context *ctx,
4558 void *info)
4559 {
4560 u64 value = *((u64 *)info);
4561 bool active;
4562
4563 if (event->attr.freq) {
4564 event->attr.sample_freq = value;
4565 } else {
4566 event->attr.sample_period = value;
4567 event->hw.sample_period = value;
4568 }
4569
4570 active = (event->state == PERF_EVENT_STATE_ACTIVE);
4571 if (active) {
4572 perf_pmu_disable(ctx->pmu);
4573 /*
4574 * We could be throttled; unthrottle now to avoid the tick
4575 * trying to unthrottle while we already re-started the event.
4576 */
4577 if (event->hw.interrupts == MAX_INTERRUPTS) {
4578 event->hw.interrupts = 0;
4579 perf_log_throttle(event, 1);
4580 }
4581 event->pmu->stop(event, PERF_EF_UPDATE);
4582 }
4583
4584 local64_set(&event->hw.period_left, 0);
4585
4586 if (active) {
4587 event->pmu->start(event, PERF_EF_RELOAD);
4588 perf_pmu_enable(ctx->pmu);
4589 }
4590 }
4591
4592 static int perf_event_period(struct perf_event *event, u64 __user *arg)
4593 {
4594 u64 value;
4595
4596 if (!is_sampling_event(event))
4597 return -EINVAL;
4598
4599 if (copy_from_user(&value, arg, sizeof(value)))
4600 return -EFAULT;
4601
4602 if (!value)
4603 return -EINVAL;
4604
4605 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
4606 return -EINVAL;
4607
4608 event_function_call(event, __perf_event_period, &value);
4609
4610 return 0;
4611 }
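/*
 * Illustrative sketch, not part of the original source: userspace reaches
 * perf_event_period() through the PERF_EVENT_IOC_PERIOD ioctl, passing a
 * pointer to the new period (or frequency, for attr.freq events):
 *
 *	__u64 new_period = 100000;
 *
 *	ioctl(perf_fd, PERF_EVENT_IOC_PERIOD, &new_period);
 */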
4612
4613 static const struct file_operations perf_fops;
4614
4615 static inline int perf_fget_light(int fd, struct fd *p)
4616 {
4617 struct fd f = fdget(fd);
4618 if (!f.file)
4619 return -EBADF;
4620
4621 if (f.file->f_op != &perf_fops) {
4622 fdput(f);
4623 return -EBADF;
4624 }
4625 *p = f;
4626 return 0;
4627 }
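/*
 * Illustrative sketch, not part of the original source: callers resolve a
 * second perf fd (e.g. the PERF_EVENT_IOC_SET_OUTPUT target below) like:
 *
 *	struct fd output;
 *
 *	if (!perf_fget_light(fd, &output)) {
 *		// output.file->private_data is a struct perf_event
 *		...
 *		fdput(output);
 *	}
 */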
4628
4629 static int perf_event_set_output(struct perf_event *event,
4630 struct perf_event *output_event);
4631 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
4632 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd);
4633
4634 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
4635 {
4636 void (*func)(struct perf_event *);
4637 u32 flags = arg;
4638
4639 switch (cmd) {
4640 case PERF_EVENT_IOC_ENABLE:
4641 func = _perf_event_enable;
4642 break;
4643 case PERF_EVENT_IOC_DISABLE:
4644 func = _perf_event_disable;
4645 break;
4646 case PERF_EVENT_IOC_RESET:
4647 func = _perf_event_reset;
4648 break;
4649
4650 case PERF_EVENT_IOC_REFRESH:
4651 return _perf_event_refresh(event, arg);
4652
4653 case PERF_EVENT_IOC_PERIOD:
4654 return perf_event_period(event, (u64 __user *)arg);
4655
4656 case PERF_EVENT_IOC_ID:
4657 {
4658 u64 id = primary_event_id(event);
4659
4660 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
4661 return -EFAULT;
4662 return 0;
4663 }
4664
4665 case PERF_EVENT_IOC_SET_OUTPUT:
4666 {
4667 int ret;
4668 if (arg != -1) {
4669 struct perf_event *output_event;
4670 struct fd output;
4671 ret = perf_fget_light(arg, &output);
4672 if (ret)
4673 return ret;
4674 output_event = output.file->private_data;
4675 ret = perf_event_set_output(event, output_event);
4676 fdput(output);
4677 } else {
4678 ret = perf_event_set_output(event, NULL);
4679 }
4680 return ret;
4681 }
4682
4683 case PERF_EVENT_IOC_SET_FILTER:
4684 return perf_event_set_filter(event, (void __user *)arg);
4685
4686 case PERF_EVENT_IOC_SET_BPF:
4687 return perf_event_set_bpf_prog(event, arg);
4688
4689 case PERF_EVENT_IOC_PAUSE_OUTPUT: {
4690 struct ring_buffer *rb;
4691
4692 rcu_read_lock();
4693 rb = rcu_dereference(event->rb);
4694 if (!rb || !rb->nr_pages) {
4695 rcu_read_unlock();
4696 return -EINVAL;
4697 }
4698 rb_toggle_paused(rb, !!arg);
4699 rcu_read_unlock();
4700 return 0;
4701 }
4702 default:
4703 return -ENOTTY;
4704 }
4705
4706 if (flags & PERF_IOC_FLAG_GROUP)
4707 perf_event_for_each(event, func);
4708 else
4709 perf_event_for_each_child(event, func);
4710
4711 return 0;
4712 }
4713
4714 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4715 {
4716 struct perf_event *event = file->private_data;
4717 struct perf_event_context *ctx;
4718 long ret;
4719
4720 ctx = perf_event_ctx_lock(event);
4721 ret = _perf_ioctl(event, cmd, arg);
4722 perf_event_ctx_unlock(event, ctx);
4723
4724 return ret;
4725 }
4726
4727 #ifdef CONFIG_COMPAT
4728 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4729 unsigned long arg)
4730 {
4731 switch (_IOC_NR(cmd)) {
4732 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4733 case _IOC_NR(PERF_EVENT_IOC_ID):
4734 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4735 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4736 cmd &= ~IOCSIZE_MASK;
4737 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4738 }
4739 break;
4740 }
4741 return perf_ioctl(file, cmd, arg);
4742 }
4743 #else
4744 # define perf_compat_ioctl NULL
4745 #endif
4746
4747 int perf_event_task_enable(void)
4748 {
4749 struct perf_event_context *ctx;
4750 struct perf_event *event;
4751
4752 mutex_lock(&current->perf_event_mutex);
4753 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4754 ctx = perf_event_ctx_lock(event);
4755 perf_event_for_each_child(event, _perf_event_enable);
4756 perf_event_ctx_unlock(event, ctx);
4757 }
4758 mutex_unlock(&current->perf_event_mutex);
4759
4760 return 0;
4761 }
4762
4763 int perf_event_task_disable(void)
4764 {
4765 struct perf_event_context *ctx;
4766 struct perf_event *event;
4767
4768 mutex_lock(&current->perf_event_mutex);
4769 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4770 ctx = perf_event_ctx_lock(event);
4771 perf_event_for_each_child(event, _perf_event_disable);
4772 perf_event_ctx_unlock(event, ctx);
4773 }
4774 mutex_unlock(&current->perf_event_mutex);
4775
4776 return 0;
4777 }
4778
4779 static int perf_event_index(struct perf_event *event)
4780 {
4781 if (event->hw.state & PERF_HES_STOPPED)
4782 return 0;
4783
4784 if (event->state != PERF_EVENT_STATE_ACTIVE)
4785 return 0;
4786
4787 return event->pmu->event_idx(event);
4788 }
4789
4790 static void calc_timer_values(struct perf_event *event,
4791 u64 *now,
4792 u64 *enabled,
4793 u64 *running)
4794 {
4795 u64 ctx_time;
4796
4797 *now = perf_clock();
4798 ctx_time = event->shadow_ctx_time + *now;
4799 *enabled = ctx_time - event->tstamp_enabled;
4800 *running = ctx_time - event->tstamp_running;
4801 }
4802
4803 static void perf_event_init_userpage(struct perf_event *event)
4804 {
4805 struct perf_event_mmap_page *userpg;
4806 struct ring_buffer *rb;
4807
4808 rcu_read_lock();
4809 rb = rcu_dereference(event->rb);
4810 if (!rb)
4811 goto unlock;
4812
4813 userpg = rb->user_page;
4814
4815 /* Allow new userspace to detect that bit 0 is deprecated */
4816 userpg->cap_bit0_is_deprecated = 1;
4817 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4818 userpg->data_offset = PAGE_SIZE;
4819 userpg->data_size = perf_data_size(rb);
4820
4821 unlock:
4822 rcu_read_unlock();
4823 }
4824
4825 void __weak arch_perf_update_userpage(
4826 struct perf_event *event, struct perf_event_mmap_page *userpg, u64 now)
4827 {
4828 }
4829
4830 /*
4831 * Callers need to ensure there can be no nesting of this function, otherwise
4832 * the seqlock logic goes bad. We cannot serialize this because the arch
4833 * code calls this from NMI context.
4834 */
4835 void perf_event_update_userpage(struct perf_event *event)
4836 {
4837 struct perf_event_mmap_page *userpg;
4838 struct ring_buffer *rb;
4839 u64 enabled, running, now;
4840
4841 rcu_read_lock();
4842 rb = rcu_dereference(event->rb);
4843 if (!rb)
4844 goto unlock;
4845
4846 /*
4847 * compute total_time_enabled, total_time_running
4848 * based on snapshot values taken when the event
4849 * was last scheduled in.
4850 *
4851 * we cannot simply call update_context_time()
4852 * because of locking issues, as we can be called in
4853 * NMI context
4854 */
4855 calc_timer_values(event, &now, &enabled, &running);
4856
4857 userpg = rb->user_page;
4858 /*
4859 * Disable preemption so as to not let the corresponding user-space
4860 * spin too long if we get preempted.
4861 */
4862 preempt_disable();
4863 ++userpg->lock;
4864 barrier();
4865 userpg->index = perf_event_index(event);
4866 userpg->offset = perf_event_count(event);
4867 if (userpg->index)
4868 userpg->offset -= local64_read(&event->hw.prev_count);
4869
4870 userpg->time_enabled = enabled +
4871 atomic64_read(&event->child_total_time_enabled);
4872
4873 userpg->time_running = running +
4874 atomic64_read(&event->child_total_time_running);
4875
4876 arch_perf_update_userpage(event, userpg, now);
4877
4878 barrier();
4879 ++userpg->lock;
4880 preempt_enable();
4881 unlock:
4882 rcu_read_unlock();
4883 }
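/*
 * Illustrative sketch, not part of the original source: userspace pairs with
 * the ++lock/barrier() protocol above by re-reading until the sequence count
 * is stable, roughly:
 *
 *	struct perf_event_mmap_page *pc = mapped_page;	// first mmap()ed page
 *	__u32 seq;
 *	__u64 offset, index;
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		index = pc->index;
 *		offset = pc->offset;
 *		barrier();
 *	} while (pc->lock != seq);
 */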
4884
4885 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4886 {
4887 struct perf_event *event = vma->vm_file->private_data;
4888 struct ring_buffer *rb;
4889 int ret = VM_FAULT_SIGBUS;
4890
4891 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4892 if (vmf->pgoff == 0)
4893 ret = 0;
4894 return ret;
4895 }
4896
4897 rcu_read_lock();
4898 rb = rcu_dereference(event->rb);
4899 if (!rb)
4900 goto unlock;
4901
4902 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4903 goto unlock;
4904
4905 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4906 if (!vmf->page)
4907 goto unlock;
4908
4909 get_page(vmf->page);
4910 vmf->page->mapping = vma->vm_file->f_mapping;
4911 vmf->page->index = vmf->pgoff;
4912
4913 ret = 0;
4914 unlock:
4915 rcu_read_unlock();
4916
4917 return ret;
4918 }
4919
4920 static void ring_buffer_attach(struct perf_event *event,
4921 struct ring_buffer *rb)
4922 {
4923 struct ring_buffer *old_rb = NULL;
4924 unsigned long flags;
4925
4926 if (event->rb) {
4927 /*
4928 * Should be impossible, we set this when removing
4929 * event->rb_entry and wait/clear when adding event->rb_entry.
4930 */
4931 WARN_ON_ONCE(event->rcu_pending);
4932
4933 old_rb = event->rb;
4934 spin_lock_irqsave(&old_rb->event_lock, flags);
4935 list_del_rcu(&event->rb_entry);
4936 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4937
4938 event->rcu_batches = get_state_synchronize_rcu();
4939 event->rcu_pending = 1;
4940 }
4941
4942 if (rb) {
4943 if (event->rcu_pending) {
4944 cond_synchronize_rcu(event->rcu_batches);
4945 event->rcu_pending = 0;
4946 }
4947
4948 spin_lock_irqsave(&rb->event_lock, flags);
4949 list_add_rcu(&event->rb_entry, &rb->event_list);
4950 spin_unlock_irqrestore(&rb->event_lock, flags);
4951 }
4952
4953 /*
4954 * Avoid racing with perf_mmap_close(AUX): stop the event
4955 * before swizzling the event::rb pointer; if it's getting
4956 * unmapped, its aux_mmap_count will be 0 and it won't
4957 * restart. See the comment in __perf_pmu_output_stop().
4958 *
4959 * Data will inevitably be lost when set_output is done in
4960 * mid-air, but then again, whoever does it like this is
4961 * not in for the data anyway.
4962 */
4963 if (has_aux(event))
4964 perf_event_stop(event, 0);
4965
4966 rcu_assign_pointer(event->rb, rb);
4967
4968 if (old_rb) {
4969 ring_buffer_put(old_rb);
4970 /*
4971 * Since we detached before setting the new rb (so that we
4972 * could attach the new rb), we could have missed a wakeup.
4973 * Provide it now.
4974 */
4975 wake_up_all(&event->waitq);
4976 }
4977 }
4978
4979 static void ring_buffer_wakeup(struct perf_event *event)
4980 {
4981 struct ring_buffer *rb;
4982
4983 rcu_read_lock();
4984 rb = rcu_dereference(event->rb);
4985 if (rb) {
4986 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4987 wake_up_all(&event->waitq);
4988 }
4989 rcu_read_unlock();
4990 }
4991
4992 struct ring_buffer *ring_buffer_get(struct perf_event *event)
4993 {
4994 struct ring_buffer *rb;
4995
4996 rcu_read_lock();
4997 rb = rcu_dereference(event->rb);
4998 if (rb) {
4999 if (!atomic_inc_not_zero(&rb->refcount))
5000 rb = NULL;
5001 }
5002 rcu_read_unlock();
5003
5004 return rb;
5005 }
5006
5007 void ring_buffer_put(struct ring_buffer *rb)
5008 {
5009 if (!atomic_dec_and_test(&rb->refcount))
5010 return;
5011
5012 WARN_ON_ONCE(!list_empty(&rb->event_list));
5013
5014 call_rcu(&rb->rcu_head, rb_free_rcu);
5015 }
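/*
 * Illustrative sketch, not part of the original source: short-lived users of
 * the buffer pair the two helpers above so the rb cannot be freed from under
 * them:
 *
 *	struct ring_buffer *rb = ring_buffer_get(event);
 *
 *	if (rb) {
 *		...				// use rb under the extra reference
 *		ring_buffer_put(rb);
 *	}
 */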
5016
5017 static void perf_mmap_open(struct vm_area_struct *vma)
5018 {
5019 struct perf_event *event = vma->vm_file->private_data;
5020
5021 atomic_inc(&event->mmap_count);
5022 atomic_inc(&event->rb->mmap_count);
5023
5024 if (vma->vm_pgoff)
5025 atomic_inc(&event->rb->aux_mmap_count);
5026
5027 if (event->pmu->event_mapped)
5028 event->pmu->event_mapped(event);
5029 }
5030
5031 static void perf_pmu_output_stop(struct perf_event *event);
5032
5033 /*
5034 * A buffer can be mmap()ed multiple times; either directly through the same
5035 * event, or through other events by use of perf_event_set_output().
5036 *
5037 * In order to undo the VM accounting done by perf_mmap() we need to destroy
5038 * the buffer here, where we still have a VM context. This means we need
5039 * to detach all events redirecting to us.
5040 */
5041 static void perf_mmap_close(struct vm_area_struct *vma)
5042 {
5043 struct perf_event *event = vma->vm_file->private_data;
5044
5045 struct ring_buffer *rb = ring_buffer_get(event);
5046 struct user_struct *mmap_user = rb->mmap_user;
5047 int mmap_locked = rb->mmap_locked;
5048 unsigned long size = perf_data_size(rb);
5049
5050 if (event->pmu->event_unmapped)
5051 event->pmu->event_unmapped(event);
5052
5053 /*
5054 * rb->aux_mmap_count will always drop before rb->mmap_count and
5055 * event->mmap_count, so it is ok to use event->mmap_mutex to
5056 * serialize with perf_mmap here.
5057 */
5058 if (rb_has_aux(rb) && vma->vm_pgoff == rb->aux_pgoff &&
5059 atomic_dec_and_mutex_lock(&rb->aux_mmap_count, &event->mmap_mutex)) {
5060 /*
5061 * Stop all AUX events that are writing to this buffer,
5062 * so that we can free its AUX pages and corresponding PMU
5063 * data. Note that after rb::aux_mmap_count dropped to zero,
5064 * they won't start any more (see perf_aux_output_begin()).
5065 */
5066 perf_pmu_output_stop(event);
5067
5068 /* now it's safe to free the pages */
5069 atomic_long_sub(rb->aux_nr_pages, &mmap_user->locked_vm);
5070 vma->vm_mm->pinned_vm -= rb->aux_mmap_locked;
5071
5072 /* this has to be the last one */
5073 rb_free_aux(rb);
5074 WARN_ON_ONCE(atomic_read(&rb->aux_refcount));
5075
5076 mutex_unlock(&event->mmap_mutex);
5077 }
5078
5079 atomic_dec(&rb->mmap_count);
5080
5081 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
5082 goto out_put;
5083
5084 ring_buffer_attach(event, NULL);
5085 mutex_unlock(&event->mmap_mutex);
5086
5087 /* If there's still other mmap()s of this buffer, we're done. */
5088 if (atomic_read(&rb->mmap_count))
5089 goto out_put;
5090
5091 /*
5092 * No other mmap()s, detach from all other events that might redirect
5093 * into the now unreachable buffer. Somewhat complicated by the
5094 * fact that rb::event_lock otherwise nests inside mmap_mutex.
5095 */
5096 again:
5097 rcu_read_lock();
5098 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
5099 if (!atomic_long_inc_not_zero(&event->refcount)) {
5100 /*
5101 * This event is en-route to free_event() which will
5102 * detach it and remove it from the list.
5103 */
5104 continue;
5105 }
5106 rcu_read_unlock();
5107
5108 mutex_lock(&event->mmap_mutex);
5109 /*
5110 * Check we didn't race with perf_event_set_output() which can
5111 * swizzle the rb from under us while we were waiting to
5112 * acquire mmap_mutex.
5113 *
5114 * If we find a different rb, ignore this event; a later
5115 * iteration will no longer find it on the list. We have to
5116 * still restart the iteration to make sure we're not now
5117 * iterating the wrong list.
5118 */
5119 if (event->rb == rb)
5120 ring_buffer_attach(event, NULL);
5121
5122 mutex_unlock(&event->mmap_mutex);
5123 put_event(event);
5124
5125 /*
5126 * Restart the iteration; either we're on the wrong list or
5127 * destroyed its integrity by doing a deletion.
5128 */
5129 goto again;
5130 }
5131 rcu_read_unlock();
5132
5133 /*
5134 * It could be there's still a few 0-ref events on the list; they'll
5135 * get cleaned up by free_event() -- they'll also still have their
5136 * ref on the rb and will free it whenever they are done with it.
5137 *
5138 * Aside from that, this buffer is 'fully' detached and unmapped,
5139 * undo the VM accounting.
5140 */
5141
5142 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
5143 vma->vm_mm->pinned_vm -= mmap_locked;
5144 free_uid(mmap_user);
5145
5146 out_put:
5147 ring_buffer_put(rb); /* could be last */
5148 }
5149
5150 static const struct vm_operations_struct perf_mmap_vmops = {
5151 .open = perf_mmap_open,
5152 .close = perf_mmap_close, /* non-mergeable */
5153 .fault = perf_mmap_fault,
5154 .page_mkwrite = perf_mmap_fault,
5155 };
5156
5157 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
5158 {
5159 struct perf_event *event = file->private_data;
5160 unsigned long user_locked, user_lock_limit;
5161 struct user_struct *user = current_user();
5162 unsigned long locked, lock_limit;
5163 struct ring_buffer *rb = NULL;
5164 unsigned long vma_size;
5165 unsigned long nr_pages;
5166 long user_extra = 0, extra = 0;
5167 int ret = 0, flags = 0;
5168
5169 /*
5170 * Don't allow mmap() of inherited per-task counters. This would
5171 * create a performance issue due to all children writing to the
5172 * same rb.
5173 */
5174 if (event->cpu == -1 && event->attr.inherit)
5175 return -EINVAL;
5176
5177 if (!(vma->vm_flags & VM_SHARED))
5178 return -EINVAL;
5179
5180 vma_size = vma->vm_end - vma->vm_start;
5181
5182 if (vma->vm_pgoff == 0) {
5183 nr_pages = (vma_size / PAGE_SIZE) - 1;
5184 } else {
5185 /*
5186 * AUX area mapping: if rb->aux_nr_pages != 0, it's already
5187 * mapped, all subsequent mappings should have the same size
5188 * and offset. Must be above the normal perf buffer.
5189 */
5190 u64 aux_offset, aux_size;
5191
5192 if (!event->rb)
5193 return -EINVAL;
5194
5195 nr_pages = vma_size / PAGE_SIZE;
5196
5197 mutex_lock(&event->mmap_mutex);
5198 ret = -EINVAL;
5199
5200 rb = event->rb;
5201 if (!rb)
5202 goto aux_unlock;
5203
5204 aux_offset = ACCESS_ONCE(rb->user_page->aux_offset);
5205 aux_size = ACCESS_ONCE(rb->user_page->aux_size);
5206
5207 if (aux_offset < perf_data_size(rb) + PAGE_SIZE)
5208 goto aux_unlock;
5209
5210 if (aux_offset != vma->vm_pgoff << PAGE_SHIFT)
5211 goto aux_unlock;
5212
5213 /* already mapped with a different offset */
5214 if (rb_has_aux(rb) && rb->aux_pgoff != vma->vm_pgoff)
5215 goto aux_unlock;
5216
5217 if (aux_size != vma_size || aux_size != nr_pages * PAGE_SIZE)
5218 goto aux_unlock;
5219
5220 /* already mapped with a different size */
5221 if (rb_has_aux(rb) && rb->aux_nr_pages != nr_pages)
5222 goto aux_unlock;
5223
5224 if (!is_power_of_2(nr_pages))
5225 goto aux_unlock;
5226
5227 if (!atomic_inc_not_zero(&rb->mmap_count))
5228 goto aux_unlock;
5229
5230 if (rb_has_aux(rb)) {
5231 atomic_inc(&rb->aux_mmap_count);
5232 ret = 0;
5233 goto unlock;
5234 }
5235
5236 atomic_set(&rb->aux_mmap_count, 1);
5237 user_extra = nr_pages;
5238
5239 goto accounting;
5240 }
5241
5242 /*
5243 * If we have rb pages ensure they're a power-of-two number, so we
5244 * can do bitmasks instead of modulo.
5245 */
5246 if (nr_pages != 0 && !is_power_of_2(nr_pages))
5247 return -EINVAL;
5248
5249 if (vma_size != PAGE_SIZE * (1 + nr_pages))
5250 return -EINVAL;
5251
5252 WARN_ON_ONCE(event->ctx->parent_ctx);
5253 again:
5254 mutex_lock(&event->mmap_mutex);
5255 if (event->rb) {
5256 if (event->rb->nr_pages != nr_pages) {
5257 ret = -EINVAL;
5258 goto unlock;
5259 }
5260
5261 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
5262 /*
5263 * Raced against perf_mmap_close() through
5264 * perf_event_set_output(). Try again, hope for better
5265 * luck.
5266 */
5267 mutex_unlock(&event->mmap_mutex);
5268 goto again;
5269 }
5270
5271 goto unlock;
5272 }
5273
5274 user_extra = nr_pages + 1;
5275
5276 accounting:
5277 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
5278
5279 /*
5280 * Increase the limit linearly with more CPUs:
5281 */
5282 user_lock_limit *= num_online_cpus();
5283
5284 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
5285
5286 if (user_locked > user_lock_limit)
5287 extra = user_locked - user_lock_limit;
5288
5289 lock_limit = rlimit(RLIMIT_MEMLOCK);
5290 lock_limit >>= PAGE_SHIFT;
5291 locked = vma->vm_mm->pinned_vm + extra;
5292
5293 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
5294 !capable(CAP_IPC_LOCK)) {
5295 ret = -EPERM;
5296 goto unlock;
5297 }
5298
5299 WARN_ON(!rb && event->rb);
5300
5301 if (vma->vm_flags & VM_WRITE)
5302 flags |= RING_BUFFER_WRITABLE;
5303
5304 if (!rb) {
5305 rb = rb_alloc(nr_pages,
5306 event->attr.watermark ? event->attr.wakeup_watermark : 0,
5307 event->cpu, flags);
5308
5309 if (!rb) {
5310 ret = -ENOMEM;
5311 goto unlock;
5312 }
5313
5314 atomic_set(&rb->mmap_count, 1);
5315 rb->mmap_user = get_current_user();
5316 rb->mmap_locked = extra;
5317
5318 ring_buffer_attach(event, rb);
5319
5320 perf_event_init_userpage(event);
5321 perf_event_update_userpage(event);
5322 } else {
5323 ret = rb_alloc_aux(rb, event, vma->vm_pgoff, nr_pages,
5324 event->attr.aux_watermark, flags);
5325 if (!ret)
5326 rb->aux_mmap_locked = extra;
5327 }
5328
5329 unlock:
5330 if (!ret) {
5331 atomic_long_add(user_extra, &user->locked_vm);
5332 vma->vm_mm->pinned_vm += extra;
5333
5334 atomic_inc(&event->mmap_count);
5335 } else if (rb) {
5336 atomic_dec(&rb->mmap_count);
5337 }
5338 aux_unlock:
5339 mutex_unlock(&event->mmap_mutex);
5340
5341 /*
5342 * Since pinned accounting is per vm we cannot allow fork() to copy our
5343 * vma.
5344 */
5345 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
5346 vma->vm_ops = &perf_mmap_vmops;
5347
5348 if (event->pmu->event_mapped)
5349 event->pmu->event_mapped(event);
5350
5351 return ret;
5352 }
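/*
 * Illustrative sketch, not part of the original source: userspace maps one
 * control page plus a power-of-two number of data pages at offset 0; a
 * second mmap() at the advertised aux_offset takes the AUX branch above.
 *
 *	size_t len = (1 + 8) * page_size;	// 8 data pages, page_size assumed
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, perf_fd, 0);
 */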
5353
5354 static int perf_fasync(int fd, struct file *filp, int on)
5355 {
5356 struct inode *inode = file_inode(filp);
5357 struct perf_event *event = filp->private_data;
5358 int retval;
5359
5360 inode_lock(inode);
5361 retval = fasync_helper(fd, filp, on, &event->fasync);
5362 inode_unlock(inode);
5363
5364 if (retval < 0)
5365 return retval;
5366
5367 return 0;
5368 }
5369
5370 static const struct file_operations perf_fops = {
5371 .llseek = no_llseek,
5372 .release = perf_release,
5373 .read = perf_read,
5374 .poll = perf_poll,
5375 .unlocked_ioctl = perf_ioctl,
5376 .compat_ioctl = perf_compat_ioctl,
5377 .mmap = perf_mmap,
5378 .fasync = perf_fasync,
5379 };
5380
5381 /*
5382 * Perf event wakeup
5383 *
5384 * If there's data, ensure we set the poll() state and publish everything
5385 * to user-space before waking everybody up.
5386 */
5387
5388 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
5389 {
5390 /* only the parent has fasync state */
5391 if (event->parent)
5392 event = event->parent;
5393 return &event->fasync;
5394 }
5395
5396 void perf_event_wakeup(struct perf_event *event)
5397 {
5398 ring_buffer_wakeup(event);
5399
5400 if (event->pending_kill) {
5401 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
5402 event->pending_kill = 0;
5403 }
5404 }
5405
5406 static void perf_pending_event(struct irq_work *entry)
5407 {
5408 struct perf_event *event = container_of(entry,
5409 struct perf_event, pending);
5410 int rctx;
5411
5412 rctx = perf_swevent_get_recursion_context();
5413 /*
5414 * If we 'fail' here, that's OK, it means recursion is already disabled
5415 * and we won't recurse 'further'.
5416 */
5417
5418 if (event->pending_disable) {
5419 event->pending_disable = 0;
5420 perf_event_disable_local(event);
5421 }
5422
5423 if (event->pending_wakeup) {
5424 event->pending_wakeup = 0;
5425 perf_event_wakeup(event);
5426 }
5427
5428 if (rctx >= 0)
5429 perf_swevent_put_recursion_context(rctx);
5430 }
5431
5432 /*
5433 * We assume there is only KVM supporting the callbacks.
5434 * Later on, we might change it to a list if there is
5435 * another virtualization implementation supporting the callbacks.
5436 */
5437 struct perf_guest_info_callbacks *perf_guest_cbs;
5438
5439 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5440 {
5441 perf_guest_cbs = cbs;
5442 return 0;
5443 }
5444 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
5445
5446 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
5447 {
5448 perf_guest_cbs = NULL;
5449 return 0;
5450 }
5451 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
5452
5453 static void
5454 perf_output_sample_regs(struct perf_output_handle *handle,
5455 struct pt_regs *regs, u64 mask)
5456 {
5457 int bit;
5458 DECLARE_BITMAP(_mask, 64);
5459
5460 bitmap_from_u64(_mask, mask);
5461 for_each_set_bit(bit, _mask, sizeof(mask) * BITS_PER_BYTE) {
5462 u64 val;
5463
5464 val = perf_reg_value(regs, bit);
5465 perf_output_put(handle, val);
5466 }
5467 }
5468
5469 static void perf_sample_regs_user(struct perf_regs *regs_user,
5470 struct pt_regs *regs,
5471 struct pt_regs *regs_user_copy)
5472 {
5473 if (user_mode(regs)) {
5474 regs_user->abi = perf_reg_abi(current);
5475 regs_user->regs = regs;
5476 } else if (current->mm) {
5477 perf_get_regs_user(regs_user, regs, regs_user_copy);
5478 } else {
5479 regs_user->abi = PERF_SAMPLE_REGS_ABI_NONE;
5480 regs_user->regs = NULL;
5481 }
5482 }
5483
5484 static void perf_sample_regs_intr(struct perf_regs *regs_intr,
5485 struct pt_regs *regs)
5486 {
5487 regs_intr->regs = regs;
5488 regs_intr->abi = perf_reg_abi(current);
5489 }
5490
5491
5492 /*
5493 * Get remaining task size from user stack pointer.
5494 *
5495 * It'd be better to take the stack vma into account and limit this more
5496 * precisely, but there's no way to get it safely under interrupt,
5497 * so we use TASK_SIZE as the limit.
5498 */
5499 static u64 perf_ustack_task_size(struct pt_regs *regs)
5500 {
5501 unsigned long addr = perf_user_stack_pointer(regs);
5502
5503 if (!addr || addr >= TASK_SIZE)
5504 return 0;
5505
5506 return TASK_SIZE - addr;
5507 }
5508
5509 static u16
5510 perf_sample_ustack_size(u16 stack_size, u16 header_size,
5511 struct pt_regs *regs)
5512 {
5513 u64 task_size;
5514
5515 /* No regs, no stack pointer, no dump. */
5516 if (!regs)
5517 return 0;
5518
5519 /*
5520 * Check whether the requested stack size fits within:
5521 * - TASK_SIZE
5522 *   If it doesn't, we limit the size to TASK_SIZE.
5523 *
5524 * - the remaining sample size
5525 *   If it doesn't, we shrink the stack size to
5526 *   fit into the remaining sample size.
5527 */
5528
5529 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
5530 stack_size = min(stack_size, (u16) task_size);
5531
5532 /* Current header size plus static size and dynamic size. */
5533 header_size += 2 * sizeof(u64);
5534
5535 /* Do we fit in with the current stack dump size? */
5536 if ((u16) (header_size + stack_size) < header_size) {
5537 /*
5538 * If we overflow the maximum size for the sample,
5539 * we customize the stack dump size to fit in.
5540 */
5541 stack_size = USHRT_MAX - header_size - sizeof(u64);
5542 stack_size = round_up(stack_size, sizeof(u64));
5543 }
5544
5545 return stack_size;
5546 }
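/*
 * Illustrative worked example, not part of the original source: with
 * header_size already at 8000 (8016 after the two size words), a requested
 * 60000-byte dump overflows the u16 sum, so the dump is clipped to
 * 65535 - 8016 - 8 = 57511 and rounded up to 57512, keeping the total
 * sample size within USHRT_MAX.
 */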
5547
5548 static void
5549 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
5550 struct pt_regs *regs)
5551 {
5552 /* Case of a kernel thread, nothing to dump */
5553 if (!regs) {
5554 u64 size = 0;
5555 perf_output_put(handle, size);
5556 } else {
5557 unsigned long sp;
5558 unsigned int rem;
5559 u64 dyn_size;
5560
5561 /*
5562 * We dump:
5563 * static size
5564 * - the size requested by the user, or the best one we can
5565 * fit into the sample max size
5566 * data
5567 * - user stack dump data
5568 * dynamic size
5569 * - the actual dumped size
5570 */
5571
5572 /* Static size. */
5573 perf_output_put(handle, dump_size);
5574
5575 /* Data. */
5576 sp = perf_user_stack_pointer(regs);
5577 rem = __output_copy_user(handle, (void *) sp, dump_size);
5578 dyn_size = dump_size - rem;
5579
5580 perf_output_skip(handle, rem);
5581
5582 /* Dynamic size. */
5583 perf_output_put(handle, dyn_size);
5584 }
5585 }
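/*
 * Resulting ring-buffer layout for PERF_SAMPLE_STACK_USER (sketch):
 * a u64 'size' (the static dump size), 'size' bytes reserved for the
 * user stack copy (the tail is skipped rather than copied if
 * __output_copy_user() faults), then a u64 'dyn_size' recording how
 * many bytes were actually copied. A kernel thread emits a single
 * zero u64 instead.
 */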
5586
5587 static void __perf_event_header__init_id(struct perf_event_header *header,
5588 struct perf_sample_data *data,
5589 struct perf_event *event)
5590 {
5591 u64 sample_type = event->attr.sample_type;
5592
5593 data->type = sample_type;
5594 header->size += event->id_header_size;
5595
5596 if (sample_type & PERF_SAMPLE_TID) {
5597 /* namespace issues */
5598 data->tid_entry.pid = perf_event_pid(event, current);
5599 data->tid_entry.tid = perf_event_tid(event, current);
5600 }
5601
5602 if (sample_type & PERF_SAMPLE_TIME)
5603 data->time = perf_event_clock(event);
5604
5605 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
5606 data->id = primary_event_id(event);
5607
5608 if (sample_type & PERF_SAMPLE_STREAM_ID)
5609 data->stream_id = event->id;
5610
5611 if (sample_type & PERF_SAMPLE_CPU) {
5612 data->cpu_entry.cpu = raw_smp_processor_id();
5613 data->cpu_entry.reserved = 0;
5614 }
5615 }
5616
5617 void perf_event_header__init_id(struct perf_event_header *header,
5618 struct perf_sample_data *data,
5619 struct perf_event *event)
5620 {
5621 if (event->attr.sample_id_all)
5622 __perf_event_header__init_id(header, data, event);
5623 }
5624
5625 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
5626 struct perf_sample_data *data)
5627 {
5628 u64 sample_type = data->type;
5629
5630 if (sample_type & PERF_SAMPLE_TID)
5631 perf_output_put(handle, data->tid_entry);
5632
5633 if (sample_type & PERF_SAMPLE_TIME)
5634 perf_output_put(handle, data->time);
5635
5636 if (sample_type & PERF_SAMPLE_ID)
5637 perf_output_put(handle, data->id);
5638
5639 if (sample_type & PERF_SAMPLE_STREAM_ID)
5640 perf_output_put(handle, data->stream_id);
5641
5642 if (sample_type & PERF_SAMPLE_CPU)
5643 perf_output_put(handle, data->cpu_entry);
5644
5645 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5646 perf_output_put(handle, data->id);
5647 }
5648
5649 void perf_event__output_id_sample(struct perf_event *event,
5650 struct perf_output_handle *handle,
5651 struct perf_sample_data *sample)
5652 {
5653 if (event->attr.sample_id_all)
5654 __perf_event__output_id_sample(handle, sample);
5655 }
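/*
 * Sketch of the sample_id trailer appended to non-sample records by
 * the helper above, in this order and only for the bits selected in
 * sample_type: TID { u32 pid, tid; }, TIME (u64), ID (u64),
 * STREAM_ID (u64), CPU { u32 cpu, reserved; }, IDENTIFIER (u64).
 */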
5656
5657 static void perf_output_read_one(struct perf_output_handle *handle,
5658 struct perf_event *event,
5659 u64 enabled, u64 running)
5660 {
5661 u64 read_format = event->attr.read_format;
5662 u64 values[4];
5663 int n = 0;
5664
5665 values[n++] = perf_event_count(event);
5666 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
5667 values[n++] = enabled +
5668 atomic64_read(&event->child_total_time_enabled);
5669 }
5670 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
5671 values[n++] = running +
5672 atomic64_read(&event->child_total_time_running);
5673 }
5674 if (read_format & PERF_FORMAT_ID)
5675 values[n++] = primary_event_id(event);
5676
5677 __output_copy(handle, values, n * sizeof(u64));
5678 }
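/*
 * For a non-group read, values[] above is laid out as: counter value,
 * then (if requested) TOTAL_TIME_ENABLED, TOTAL_TIME_RUNNING and ID --
 * at most four u64s, which should match the event->read_size computed
 * when the event was set up.
 */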
5679
5680 static void perf_output_read_group(struct perf_output_handle *handle,
5681 struct perf_event *event,
5682 u64 enabled, u64 running)
5683 {
5684 struct perf_event *leader = event->group_leader, *sub;
5685 u64 read_format = event->attr.read_format;
5686 u64 values[5];
5687 int n = 0;
5688
5689 values[n++] = 1 + leader->nr_siblings;
5690
5691 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
5692 values[n++] = enabled;
5693
5694 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
5695 values[n++] = running;
5696
5697 if (leader != event)
5698 leader->pmu->read(leader);
5699
5700 values[n++] = perf_event_count(leader);
5701 if (read_format & PERF_FORMAT_ID)
5702 values[n++] = primary_event_id(leader);
5703
5704 __output_copy(handle, values, n * sizeof(u64));
5705
5706 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
5707 n = 0;
5708
5709 if ((sub != event) &&
5710 (sub->state == PERF_EVENT_STATE_ACTIVE))
5711 sub->pmu->read(sub);
5712
5713 values[n++] = perf_event_count(sub);
5714 if (read_format & PERF_FORMAT_ID)
5715 values[n++] = primary_event_id(sub);
5716
5717 __output_copy(handle, values, n * sizeof(u64));
5718 }
5719 }
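/*
 * Group layout sketch: u64 nr (leader plus siblings), the optional
 * TOTAL_TIME_ENABLED / TOTAL_TIME_RUNNING once for the whole group,
 * then for the leader and each sibling its value followed, if
 * PERF_FORMAT_ID is set, by its primary id.
 */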
5720
5721 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
5722 PERF_FORMAT_TOTAL_TIME_RUNNING)
5723
5724 /*
5725 * XXX PERF_SAMPLE_READ vs inherited events seems difficult.
5726 *
5727 * The problem is that it's both hard and excessively expensive to iterate the
5728 * child list, not to mention that it's impossible to IPI the children running
5729 * on another CPU, from interrupt/NMI context.
5730 */
5731 static void perf_output_read(struct perf_output_handle *handle,
5732 struct perf_event *event)
5733 {
5734 u64 enabled = 0, running = 0, now;
5735 u64 read_format = event->attr.read_format;
5736
5737 /*
5738 * compute total_time_enabled, total_time_running
5739 * based on snapshot values taken when the event
5740 * was last scheduled in.
5741 *
5742 * we cannot simply call update_context_time()
5743 * because of locking issues, as we are called in
5744 * NMI context
5745 */
5746 if (read_format & PERF_FORMAT_TOTAL_TIMES)
5747 calc_timer_values(event, &now, &enabled, &running);
5748
5749 if (event->attr.read_format & PERF_FORMAT_GROUP)
5750 perf_output_read_group(handle, event, enabled, running);
5751 else
5752 perf_output_read_one(handle, event, enabled, running);
5753 }
5754
5755 void perf_output_sample(struct perf_output_handle *handle,
5756 struct perf_event_header *header,
5757 struct perf_sample_data *data,
5758 struct perf_event *event)
5759 {
5760 u64 sample_type = data->type;
5761
5762 perf_output_put(handle, *header);
5763
5764 if (sample_type & PERF_SAMPLE_IDENTIFIER)
5765 perf_output_put(handle, data->id);
5766
5767 if (sample_type & PERF_SAMPLE_IP)
5768 perf_output_put(handle, data->ip);
5769
5770 if (sample_type & PERF_SAMPLE_TID)
5771 perf_output_put(handle, data->tid_entry);
5772
5773 if (sample_type & PERF_SAMPLE_TIME)
5774 perf_output_put(handle, data->time);
5775
5776 if (sample_type & PERF_SAMPLE_ADDR)
5777 perf_output_put(handle, data->addr);
5778
5779 if (sample_type & PERF_SAMPLE_ID)
5780 perf_output_put(handle, data->id);
5781
5782 if (sample_type & PERF_SAMPLE_STREAM_ID)
5783 perf_output_put(handle, data->stream_id);
5784
5785 if (sample_type & PERF_SAMPLE_CPU)
5786 perf_output_put(handle, data->cpu_entry);
5787
5788 if (sample_type & PERF_SAMPLE_PERIOD)
5789 perf_output_put(handle, data->period);
5790
5791 if (sample_type & PERF_SAMPLE_READ)
5792 perf_output_read(handle, event);
5793
5794 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5795 if (data->callchain) {
5796 int size = 1;
5797
5798 if (data->callchain)
5799 size += data->callchain->nr;
5800
5801 size *= sizeof(u64);
5802
5803 __output_copy(handle, data->callchain, size);
5804 } else {
5805 u64 nr = 0;
5806 perf_output_put(handle, nr);
5807 }
5808 }
5809
5810 if (sample_type & PERF_SAMPLE_RAW) {
5811 struct perf_raw_record *raw = data->raw;
5812
5813 if (raw) {
5814 struct perf_raw_frag *frag = &raw->frag;
5815
5816 perf_output_put(handle, raw->size);
5817 do {
5818 if (frag->copy) {
5819 __output_custom(handle, frag->copy,
5820 frag->data, frag->size);
5821 } else {
5822 __output_copy(handle, frag->data,
5823 frag->size);
5824 }
5825 if (perf_raw_frag_last(frag))
5826 break;
5827 frag = frag->next;
5828 } while (1);
5829 if (frag->pad)
5830 __output_skip(handle, NULL, frag->pad);
5831 } else {
5832 struct {
5833 u32 size;
5834 u32 data;
5835 } raw = {
5836 .size = sizeof(u32),
5837 .data = 0,
5838 };
5839 perf_output_put(handle, raw);
5840 }
5841 }
5842
5843 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5844 if (data->br_stack) {
5845 size_t size;
5846
5847 size = data->br_stack->nr
5848 * sizeof(struct perf_branch_entry);
5849
5850 perf_output_put(handle, data->br_stack->nr);
5851 perf_output_copy(handle, data->br_stack->entries, size);
5852 } else {
5853 /*
5854 * we always store at least the value of nr
5855 */
5856 u64 nr = 0;
5857 perf_output_put(handle, nr);
5858 }
5859 }
5860
5861 if (sample_type & PERF_SAMPLE_REGS_USER) {
5862 u64 abi = data->regs_user.abi;
5863
5864 /*
5865 * If there are no regs to dump, signal it by making the
5866 * first u64 zero (PERF_SAMPLE_REGS_ABI_NONE).
5867 */
5868 perf_output_put(handle, abi);
5869
5870 if (abi) {
5871 u64 mask = event->attr.sample_regs_user;
5872 perf_output_sample_regs(handle,
5873 data->regs_user.regs,
5874 mask);
5875 }
5876 }
5877
5878 if (sample_type & PERF_SAMPLE_STACK_USER) {
5879 perf_output_sample_ustack(handle,
5880 data->stack_user_size,
5881 data->regs_user.regs);
5882 }
5883
5884 if (sample_type & PERF_SAMPLE_WEIGHT)
5885 perf_output_put(handle, data->weight);
5886
5887 if (sample_type & PERF_SAMPLE_DATA_SRC)
5888 perf_output_put(handle, data->data_src.val);
5889
5890 if (sample_type & PERF_SAMPLE_TRANSACTION)
5891 perf_output_put(handle, data->txn);
5892
5893 if (sample_type & PERF_SAMPLE_REGS_INTR) {
5894 u64 abi = data->regs_intr.abi;
5895 /*
5896 * If there are no regs to dump, signal it by making the
5897 * first u64 zero (PERF_SAMPLE_REGS_ABI_NONE).
5898 */
5899 perf_output_put(handle, abi);
5900
5901 if (abi) {
5902 u64 mask = event->attr.sample_regs_intr;
5903
5904 perf_output_sample_regs(handle,
5905 data->regs_intr.regs,
5906 mask);
5907 }
5908 }
5909
5910 if (!event->attr.watermark) {
5911 int wakeup_events = event->attr.wakeup_events;
5912
5913 if (wakeup_events) {
5914 struct ring_buffer *rb = handle->rb;
5915 int events = local_inc_return(&rb->events);
5916
5917 if (events >= wakeup_events) {
5918 local_sub(wakeup_events, &rb->events);
5919 local_inc(&rb->wakeup);
5920 }
5921 }
5922 }
5923 }
5924
5925 void perf_prepare_sample(struct perf_event_header *header,
5926 struct perf_sample_data *data,
5927 struct perf_event *event,
5928 struct pt_regs *regs)
5929 {
5930 u64 sample_type = event->attr.sample_type;
5931
5932 header->type = PERF_RECORD_SAMPLE;
5933 header->size = sizeof(*header) + event->header_size;
5934
5935 header->misc = 0;
5936 header->misc |= perf_misc_flags(regs);
5937
5938 __perf_event_header__init_id(header, data, event);
5939
5940 if (sample_type & PERF_SAMPLE_IP)
5941 data->ip = perf_instruction_pointer(regs);
5942
5943 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5944 int size = 1;
5945
5946 data->callchain = perf_callchain(event, regs);
5947
5948 if (data->callchain)
5949 size += data->callchain->nr;
5950
5951 header->size += size * sizeof(u64);
5952 }
5953
5954 if (sample_type & PERF_SAMPLE_RAW) {
5955 struct perf_raw_record *raw = data->raw;
5956 int size;
5957
5958 if (raw) {
5959 struct perf_raw_frag *frag = &raw->frag;
5960 u32 sum = 0;
5961
5962 do {
5963 sum += frag->size;
5964 if (perf_raw_frag_last(frag))
5965 break;
5966 frag = frag->next;
5967 } while (1);
5968
5969 size = round_up(sum + sizeof(u32), sizeof(u64));
5970 raw->size = size - sizeof(u32);
5971 frag->pad = raw->size - sum;
5972 } else {
5973 size = sizeof(u64);
5974 }
5975
5976 header->size += size;
5977 }
5978
5979 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5980 int size = sizeof(u64); /* nr */
5981 if (data->br_stack) {
5982 size += data->br_stack->nr
5983 * sizeof(struct perf_branch_entry);
5984 }
5985 header->size += size;
5986 }
5987
5988 if (sample_type & (PERF_SAMPLE_REGS_USER | PERF_SAMPLE_STACK_USER))
5989 perf_sample_regs_user(&data->regs_user, regs,
5990 &data->regs_user_copy);
5991
5992 if (sample_type & PERF_SAMPLE_REGS_USER) {
5993 /* regs dump ABI info */
5994 int size = sizeof(u64);
5995
5996 if (data->regs_user.regs) {
5997 u64 mask = event->attr.sample_regs_user;
5998 size += hweight64(mask) * sizeof(u64);
5999 }
6000
6001 header->size += size;
6002 }
6003
6004 if (sample_type & PERF_SAMPLE_STACK_USER) {
6005 /*
6006 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
6007 * processed as the last one, or an additional check must be
6008 * added whenever a new sample type is introduced, because we
6009 * could otherwise eat up the rest of the sample size.
6010 */
6011 u16 stack_size = event->attr.sample_stack_user;
6012 u16 size = sizeof(u64);
6013
6014 stack_size = perf_sample_ustack_size(stack_size, header->size,
6015 data->regs_user.regs);
6016
6017 /*
6018 * If there is something to dump, add space for the dump
6019 * itself and for the field that tells the dynamic size,
6020 * which is how many have been actually dumped.
6021 */
6022 if (stack_size)
6023 size += sizeof(u64) + stack_size;
6024
6025 data->stack_user_size = stack_size;
6026 header->size += size;
6027 }
6028
6029 if (sample_type & PERF_SAMPLE_REGS_INTR) {
6030 /* regs dump ABI info */
6031 int size = sizeof(u64);
6032
6033 perf_sample_regs_intr(&data->regs_intr, regs);
6034
6035 if (data->regs_intr.regs) {
6036 u64 mask = event->attr.sample_regs_intr;
6037
6038 size += hweight64(mask) * sizeof(u64);
6039 }
6040
6041 header->size += size;
6042 }
6043 }
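/*
 * Note: every byte added to header->size here must be emitted by
 * perf_output_sample() above, and vice versa. For example, a
 * callchain with nr entries accounts (nr + 1) * sizeof(u64): the nr
 * field plus the entries themselves.
 */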
6044
6045 static void __always_inline
6046 __perf_event_output(struct perf_event *event,
6047 struct perf_sample_data *data,
6048 struct pt_regs *regs,
6049 int (*output_begin)(struct perf_output_handle *,
6050 struct perf_event *,
6051 unsigned int))
6052 {
6053 struct perf_output_handle handle;
6054 struct perf_event_header header;
6055
6056 /* protect the callchain buffers */
6057 rcu_read_lock();
6058
6059 perf_prepare_sample(&header, data, event, regs);
6060
6061 if (output_begin(&handle, event, header.size))
6062 goto exit;
6063
6064 perf_output_sample(&handle, &header, data, event);
6065
6066 perf_output_end(&handle);
6067
6068 exit:
6069 rcu_read_unlock();
6070 }
6071
6072 void
6073 perf_event_output_forward(struct perf_event *event,
6074 struct perf_sample_data *data,
6075 struct pt_regs *regs)
6076 {
6077 __perf_event_output(event, data, regs, perf_output_begin_forward);
6078 }
6079
6080 void
6081 perf_event_output_backward(struct perf_event *event,
6082 struct perf_sample_data *data,
6083 struct pt_regs *regs)
6084 {
6085 __perf_event_output(event, data, regs, perf_output_begin_backward);
6086 }
6087
6088 void
6089 perf_event_output(struct perf_event *event,
6090 struct perf_sample_data *data,
6091 struct pt_regs *regs)
6092 {
6093 __perf_event_output(event, data, regs, perf_output_begin);
6094 }
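/*
 * Descriptive note: the _forward() variant is the common case; the
 * _backward() variant is picked (via event->overflow_handler) for
 * events that write their ring buffer back to front
 * (attr.write_backward), typically paired with an overwrite-mode
 * mapping so userspace can always read the most recent records.
 */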
6095
6096 /*
6097 * read event_id
6098 */
6099
6100 struct perf_read_event {
6101 struct perf_event_header header;
6102
6103 u32 pid;
6104 u32 tid;
6105 };
6106
6107 static void
6108 perf_event_read_event(struct perf_event *event,
6109 struct task_struct *task)
6110 {
6111 struct perf_output_handle handle;
6112 struct perf_sample_data sample;
6113 struct perf_read_event read_event = {
6114 .header = {
6115 .type = PERF_RECORD_READ,
6116 .misc = 0,
6117 .size = sizeof(read_event) + event->read_size,
6118 },
6119 .pid = perf_event_pid(event, task),
6120 .tid = perf_event_tid(event, task),
6121 };
6122 int ret;
6123
6124 perf_event_header__init_id(&read_event.header, &sample, event);
6125 ret = perf_output_begin(&handle, event, read_event.header.size);
6126 if (ret)
6127 return;
6128
6129 perf_output_put(&handle, read_event);
6130 perf_output_read(&handle, event);
6131 perf_event__output_id_sample(event, &handle, &sample);
6132
6133 perf_output_end(&handle);
6134 }
6135
6136 typedef void (perf_iterate_f)(struct perf_event *event, void *data);
6137
6138 static void
6139 perf_iterate_ctx(struct perf_event_context *ctx,
6140 perf_iterate_f output,
6141 void *data, bool all)
6142 {
6143 struct perf_event *event;
6144
6145 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6146 if (!all) {
6147 if (event->state < PERF_EVENT_STATE_INACTIVE)
6148 continue;
6149 if (!event_filter_match(event))
6150 continue;
6151 }
6152
6153 output(event, data);
6154 }
6155 }
6156
6157 static void perf_iterate_sb_cpu(perf_iterate_f output, void *data)
6158 {
6159 struct pmu_event_list *pel = this_cpu_ptr(&pmu_sb_events);
6160 struct perf_event *event;
6161
6162 list_for_each_entry_rcu(event, &pel->list, sb_list) {
6163 /*
6164 * Skip events that are not fully formed yet; ensure that
6165 * if we observe event->ctx, both event and ctx will be
6166 * complete enough. See perf_install_in_context().
6167 */
6168 if (!smp_load_acquire(&event->ctx))
6169 continue;
6170
6171 if (event->state < PERF_EVENT_STATE_INACTIVE)
6172 continue;
6173 if (!event_filter_match(event))
6174 continue;
6175 output(event, data);
6176 }
6177 }
6178
6179 /*
6180 * Iterate all events that need to receive side-band events.
6181 *
6182 * For new callers: ensure that account_pmu_sb_event() includes
6183 * your event, otherwise it might not get delivered.
6184 */
6185 static void
6186 perf_iterate_sb(perf_iterate_f output, void *data,
6187 struct perf_event_context *task_ctx)
6188 {
6189 struct perf_event_context *ctx;
6190 int ctxn;
6191
6192 rcu_read_lock();
6193 preempt_disable();
6194
6195 /*
6196 * If we have task_ctx != NULL we only notify the task context itself.
6197 * The task_ctx is set only for EXIT events before releasing task
6198 * context.
6199 */
6200 if (task_ctx) {
6201 perf_iterate_ctx(task_ctx, output, data, false);
6202 goto done;
6203 }
6204
6205 perf_iterate_sb_cpu(output, data);
6206
6207 for_each_task_context_nr(ctxn) {
6208 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6209 if (ctx)
6210 perf_iterate_ctx(ctx, output, data, false);
6211 }
6212 done:
6213 preempt_enable();
6214 rcu_read_unlock();
6215 }
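/*
 * Usage sketch: the side-band emitters below call this as, e.g.,
 *
 *	perf_iterate_sb(perf_event_task_output, &task_event, task_ctx);
 *
 * passing a non-NULL task context only for EXIT-style events that
 * must not look beyond the dying task.
 */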
6216
6217 /*
6218 * Clear all file-based filters at exec, they'll have to be
6219 * re-instated when/if these objects are mmapped again.
6220 */
6221 static void perf_event_addr_filters_exec(struct perf_event *event, void *data)
6222 {
6223 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6224 struct perf_addr_filter *filter;
6225 unsigned int restart = 0, count = 0;
6226 unsigned long flags;
6227
6228 if (!has_addr_filter(event))
6229 return;
6230
6231 raw_spin_lock_irqsave(&ifh->lock, flags);
6232 list_for_each_entry(filter, &ifh->list, entry) {
6233 if (filter->inode) {
6234 event->addr_filters_offs[count] = 0;
6235 restart++;
6236 }
6237
6238 count++;
6239 }
6240
6241 if (restart)
6242 event->addr_filters_gen++;
6243 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6244
6245 if (restart)
6246 perf_event_stop(event, 1);
6247 }
6248
6249 void perf_event_exec(void)
6250 {
6251 struct perf_event_context *ctx;
6252 int ctxn;
6253
6254 rcu_read_lock();
6255 for_each_task_context_nr(ctxn) {
6256 ctx = current->perf_event_ctxp[ctxn];
6257 if (!ctx)
6258 continue;
6259
6260 perf_event_enable_on_exec(ctxn);
6261
6262 perf_iterate_ctx(ctx, perf_event_addr_filters_exec, NULL,
6263 true);
6264 }
6265 rcu_read_unlock();
6266 }
6267
6268 struct remote_output {
6269 struct ring_buffer *rb;
6270 int err;
6271 };
6272
6273 static void __perf_event_output_stop(struct perf_event *event, void *data)
6274 {
6275 struct perf_event *parent = event->parent;
6276 struct remote_output *ro = data;
6277 struct ring_buffer *rb = ro->rb;
6278 struct stop_event_data sd = {
6279 .event = event,
6280 };
6281
6282 if (!has_aux(event))
6283 return;
6284
6285 if (!parent)
6286 parent = event;
6287
6288 /*
6289 * In case of inheritance, it will be the parent that links to the
6290 * ring-buffer, but it will be the child that's actually using it.
6291 *
6292 * We are using event::rb to determine if the event should be stopped,
6293 * however this may race with ring_buffer_attach() (through set_output),
6294 * which will make us skip the event that actually needs to be stopped.
6295 * So ring_buffer_attach() has to stop an aux event before re-assigning
6296 * its rb pointer.
6297 */
6298 if (rcu_dereference(parent->rb) == rb)
6299 ro->err = __perf_event_stop(&sd);
6300 }
6301
6302 static int __perf_pmu_output_stop(void *info)
6303 {
6304 struct perf_event *event = info;
6305 struct pmu *pmu = event->pmu;
6306 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
6307 struct remote_output ro = {
6308 .rb = event->rb,
6309 };
6310
6311 rcu_read_lock();
6312 perf_iterate_ctx(&cpuctx->ctx, __perf_event_output_stop, &ro, false);
6313 if (cpuctx->task_ctx)
6314 perf_iterate_ctx(cpuctx->task_ctx, __perf_event_output_stop,
6315 &ro, false);
6316 rcu_read_unlock();
6317
6318 return ro.err;
6319 }
6320
6321 static void perf_pmu_output_stop(struct perf_event *event)
6322 {
6323 struct perf_event *iter;
6324 int err, cpu;
6325
6326 restart:
6327 rcu_read_lock();
6328 list_for_each_entry_rcu(iter, &event->rb->event_list, rb_entry) {
6329 /*
6330 * For per-CPU events, we need to make sure that neither they
6331 * nor their children are running; for cpu==-1 events it's
6332 * sufficient to stop the event itself if it's active, since
6333 * it can't have children.
6334 */
6335 cpu = iter->cpu;
6336 if (cpu == -1)
6337 cpu = READ_ONCE(iter->oncpu);
6338
6339 if (cpu == -1)
6340 continue;
6341
6342 err = cpu_function_call(cpu, __perf_pmu_output_stop, event);
6343 if (err == -EAGAIN) {
6344 rcu_read_unlock();
6345 goto restart;
6346 }
6347 }
6348 rcu_read_unlock();
6349 }
6350
6351 /*
6352 * task tracking -- fork/exit
6353 *
6354 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
6355 */
6356
6357 struct perf_task_event {
6358 struct task_struct *task;
6359 struct perf_event_context *task_ctx;
6360
6361 struct {
6362 struct perf_event_header header;
6363
6364 u32 pid;
6365 u32 ppid;
6366 u32 tid;
6367 u32 ptid;
6368 u64 time;
6369 } event_id;
6370 };
6371
6372 static int perf_event_task_match(struct perf_event *event)
6373 {
6374 return event->attr.comm || event->attr.mmap ||
6375 event->attr.mmap2 || event->attr.mmap_data ||
6376 event->attr.task;
6377 }
6378
6379 static void perf_event_task_output(struct perf_event *event,
6380 void *data)
6381 {
6382 struct perf_task_event *task_event = data;
6383 struct perf_output_handle handle;
6384 struct perf_sample_data sample;
6385 struct task_struct *task = task_event->task;
6386 int ret, size = task_event->event_id.header.size;
6387
6388 if (!perf_event_task_match(event))
6389 return;
6390
6391 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
6392
6393 ret = perf_output_begin(&handle, event,
6394 task_event->event_id.header.size);
6395 if (ret)
6396 goto out;
6397
6398 task_event->event_id.pid = perf_event_pid(event, task);
6399 task_event->event_id.ppid = perf_event_pid(event, current);
6400
6401 task_event->event_id.tid = perf_event_tid(event, task);
6402 task_event->event_id.ptid = perf_event_tid(event, current);
6403
6404 task_event->event_id.time = perf_event_clock(event);
6405
6406 perf_output_put(&handle, task_event->event_id);
6407
6408 perf_event__output_id_sample(event, &handle, &sample);
6409
6410 perf_output_end(&handle);
6411 out:
6412 task_event->event_id.header.size = size;
6413 }
6414
6415 static void perf_event_task(struct task_struct *task,
6416 struct perf_event_context *task_ctx,
6417 int new)
6418 {
6419 struct perf_task_event task_event;
6420
6421 if (!atomic_read(&nr_comm_events) &&
6422 !atomic_read(&nr_mmap_events) &&
6423 !atomic_read(&nr_task_events))
6424 return;
6425
6426 task_event = (struct perf_task_event){
6427 .task = task,
6428 .task_ctx = task_ctx,
6429 .event_id = {
6430 .header = {
6431 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
6432 .misc = 0,
6433 .size = sizeof(task_event.event_id),
6434 },
6435 /* .pid */
6436 /* .ppid */
6437 /* .tid */
6438 /* .ptid */
6439 /* .time */
6440 },
6441 };
6442
6443 perf_iterate_sb(perf_event_task_output,
6444 &task_event,
6445 task_ctx);
6446 }
6447
6448 void perf_event_fork(struct task_struct *task)
6449 {
6450 perf_event_task(task, NULL, 1);
6451 }
6452
6453 /*
6454 * comm tracking
6455 */
6456
6457 struct perf_comm_event {
6458 struct task_struct *task;
6459 char *comm;
6460 int comm_size;
6461
6462 struct {
6463 struct perf_event_header header;
6464
6465 u32 pid;
6466 u32 tid;
6467 } event_id;
6468 };
6469
6470 static int perf_event_comm_match(struct perf_event *event)
6471 {
6472 return event->attr.comm;
6473 }
6474
6475 static void perf_event_comm_output(struct perf_event *event,
6476 void *data)
6477 {
6478 struct perf_comm_event *comm_event = data;
6479 struct perf_output_handle handle;
6480 struct perf_sample_data sample;
6481 int size = comm_event->event_id.header.size;
6482 int ret;
6483
6484 if (!perf_event_comm_match(event))
6485 return;
6486
6487 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
6488 ret = perf_output_begin(&handle, event,
6489 comm_event->event_id.header.size);
6490
6491 if (ret)
6492 goto out;
6493
6494 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
6495 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
6496
6497 perf_output_put(&handle, comm_event->event_id);
6498 __output_copy(&handle, comm_event->comm,
6499 comm_event->comm_size);
6500
6501 perf_event__output_id_sample(event, &handle, &sample);
6502
6503 perf_output_end(&handle);
6504 out:
6505 comm_event->event_id.header.size = size;
6506 }
6507
6508 static void perf_event_comm_event(struct perf_comm_event *comm_event)
6509 {
6510 char comm[TASK_COMM_LEN];
6511 unsigned int size;
6512
6513 memset(comm, 0, sizeof(comm));
6514 strlcpy(comm, comm_event->task->comm, sizeof(comm));
6515 size = ALIGN(strlen(comm)+1, sizeof(u64));
6516
6517 comm_event->comm = comm;
6518 comm_event->comm_size = size;
6519
6520 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
6521
6522 perf_iterate_sb(perf_event_comm_output,
6523 comm_event,
6524 NULL);
6525 }
6526
6527 void perf_event_comm(struct task_struct *task, bool exec)
6528 {
6529 struct perf_comm_event comm_event;
6530
6531 if (!atomic_read(&nr_comm_events))
6532 return;
6533
6534 comm_event = (struct perf_comm_event){
6535 .task = task,
6536 /* .comm */
6537 /* .comm_size */
6538 .event_id = {
6539 .header = {
6540 .type = PERF_RECORD_COMM,
6541 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
6542 /* .size */
6543 },
6544 /* .pid */
6545 /* .tid */
6546 },
6547 };
6548
6549 perf_event_comm_event(&comm_event);
6550 }
6551
6552 /*
6553 * mmap tracking
6554 */
6555
6556 struct perf_mmap_event {
6557 struct vm_area_struct *vma;
6558
6559 const char *file_name;
6560 int file_size;
6561 int maj, min;
6562 u64 ino;
6563 u64 ino_generation;
6564 u32 prot, flags;
6565
6566 struct {
6567 struct perf_event_header header;
6568
6569 u32 pid;
6570 u32 tid;
6571 u64 start;
6572 u64 len;
6573 u64 pgoff;
6574 } event_id;
6575 };
6576
6577 static int perf_event_mmap_match(struct perf_event *event,
6578 void *data)
6579 {
6580 struct perf_mmap_event *mmap_event = data;
6581 struct vm_area_struct *vma = mmap_event->vma;
6582 int executable = vma->vm_flags & VM_EXEC;
6583
6584 return (!executable && event->attr.mmap_data) ||
6585 (executable && (event->attr.mmap || event->attr.mmap2));
6586 }
6587
6588 static void perf_event_mmap_output(struct perf_event *event,
6589 void *data)
6590 {
6591 struct perf_mmap_event *mmap_event = data;
6592 struct perf_output_handle handle;
6593 struct perf_sample_data sample;
6594 int size = mmap_event->event_id.header.size;
6595 int ret;
6596
6597 if (!perf_event_mmap_match(event, data))
6598 return;
6599
6600 if (event->attr.mmap2) {
6601 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
6602 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
6603 mmap_event->event_id.header.size += sizeof(mmap_event->min);
6604 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
6605 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
6606 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
6607 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
6608 }
6609
6610 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
6611 ret = perf_output_begin(&handle, event,
6612 mmap_event->event_id.header.size);
6613 if (ret)
6614 goto out;
6615
6616 mmap_event->event_id.pid = perf_event_pid(event, current);
6617 mmap_event->event_id.tid = perf_event_tid(event, current);
6618
6619 perf_output_put(&handle, mmap_event->event_id);
6620
6621 if (event->attr.mmap2) {
6622 perf_output_put(&handle, mmap_event->maj);
6623 perf_output_put(&handle, mmap_event->min);
6624 perf_output_put(&handle, mmap_event->ino);
6625 perf_output_put(&handle, mmap_event->ino_generation);
6626 perf_output_put(&handle, mmap_event->prot);
6627 perf_output_put(&handle, mmap_event->flags);
6628 }
6629
6630 __output_copy(&handle, mmap_event->file_name,
6631 mmap_event->file_size);
6632
6633 perf_event__output_id_sample(event, &handle, &sample);
6634
6635 perf_output_end(&handle);
6636 out:
6637 mmap_event->event_id.header.size = size;
6638 }
6639
6640 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
6641 {
6642 struct vm_area_struct *vma = mmap_event->vma;
6643 struct file *file = vma->vm_file;
6644 int maj = 0, min = 0;
6645 u64 ino = 0, gen = 0;
6646 u32 prot = 0, flags = 0;
6647 unsigned int size;
6648 char tmp[16];
6649 char *buf = NULL;
6650 char *name;
6651
6652 if (vma->vm_flags & VM_READ)
6653 prot |= PROT_READ;
6654 if (vma->vm_flags & VM_WRITE)
6655 prot |= PROT_WRITE;
6656 if (vma->vm_flags & VM_EXEC)
6657 prot |= PROT_EXEC;
6658
6659 if (vma->vm_flags & VM_MAYSHARE)
6660 flags = MAP_SHARED;
6661 else
6662 flags = MAP_PRIVATE;
6663
6664 if (vma->vm_flags & VM_DENYWRITE)
6665 flags |= MAP_DENYWRITE;
6666 if (vma->vm_flags & VM_MAYEXEC)
6667 flags |= MAP_EXECUTABLE;
6668 if (vma->vm_flags & VM_LOCKED)
6669 flags |= MAP_LOCKED;
6670 if (vma->vm_flags & VM_HUGETLB)
6671 flags |= MAP_HUGETLB;
6672
6673 if (file) {
6674 struct inode *inode;
6675 dev_t dev;
6676
6677 buf = kmalloc(PATH_MAX, GFP_KERNEL);
6678 if (!buf) {
6679 name = "//enomem";
6680 goto cpy_name;
6681 }
6682 /*
6683 * d_path() works from the end of the rb backwards, so we
6684 * need to add enough zero bytes after the string to handle
6685 * the 64bit alignment we do later.
6686 */
6687 name = file_path(file, buf, PATH_MAX - sizeof(u64));
6688 if (IS_ERR(name)) {
6689 name = "//toolong";
6690 goto cpy_name;
6691 }
6692 inode = file_inode(vma->vm_file);
6693 dev = inode->i_sb->s_dev;
6694 ino = inode->i_ino;
6695 gen = inode->i_generation;
6696 maj = MAJOR(dev);
6697 min = MINOR(dev);
6698
6699 goto got_name;
6700 } else {
6701 if (vma->vm_ops && vma->vm_ops->name) {
6702 name = (char *) vma->vm_ops->name(vma);
6703 if (name)
6704 goto cpy_name;
6705 }
6706
6707 name = (char *)arch_vma_name(vma);
6708 if (name)
6709 goto cpy_name;
6710
6711 if (vma->vm_start <= vma->vm_mm->start_brk &&
6712 vma->vm_end >= vma->vm_mm->brk) {
6713 name = "[heap]";
6714 goto cpy_name;
6715 }
6716 if (vma->vm_start <= vma->vm_mm->start_stack &&
6717 vma->vm_end >= vma->vm_mm->start_stack) {
6718 name = "[stack]";
6719 goto cpy_name;
6720 }
6721
6722 name = "//anon";
6723 goto cpy_name;
6724 }
6725
6726 cpy_name:
6727 strlcpy(tmp, name, sizeof(tmp));
6728 name = tmp;
6729 got_name:
6730 /*
6731 * Since our buffer works in 8 byte units we need to align our string
6732 * size to a multiple of 8. However, we must guarantee the tail end is
6733 * zero'd out to avoid leaking random bits to userspace.
6734 */
6735 size = strlen(name)+1;
6736 while (!IS_ALIGNED(size, sizeof(u64)))
6737 name[size++] = '\0';
6738
6739 mmap_event->file_name = name;
6740 mmap_event->file_size = size;
6741 mmap_event->maj = maj;
6742 mmap_event->min = min;
6743 mmap_event->ino = ino;
6744 mmap_event->ino_generation = gen;
6745 mmap_event->prot = prot;
6746 mmap_event->flags = flags;
6747
6748 if (!(vma->vm_flags & VM_EXEC))
6749 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
6750
6751 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
6752
6753 perf_iterate_sb(perf_event_mmap_output,
6754 mmap_event,
6755 NULL);
6756
6757 kfree(buf);
6758 }
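/*
 * Alignment example (the path is hypothetical): for "/lib/libfoo.so"
 * strlen()+1 is 15, so one extra NUL is appended and file_size becomes
 * 16, keeping the fields that follow u64-aligned without leaking
 * uninitialized bytes to userspace.
 */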
6759
6760 /*
6761 * Check whether inode and address range match filter criteria.
6762 */
6763 static bool perf_addr_filter_match(struct perf_addr_filter *filter,
6764 struct file *file, unsigned long offset,
6765 unsigned long size)
6766 {
6767 if (filter->inode != file->f_inode)
6768 return false;
6769
6770 if (filter->offset > offset + size)
6771 return false;
6772
6773 if (filter->offset + filter->size < offset)
6774 return false;
6775
6776 return true;
6777 }
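/*
 * In other words, the filter's file range [filter->offset,
 * filter->offset + filter->size) must overlap the mapped file range
 * [offset, offset + size). For example, a filter at offset 0x1000
 * with size 0x100 matches a mapping of file offset 0 that spans
 * 0x2000 bytes.
 */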
6778
6779 static void __perf_addr_filters_adjust(struct perf_event *event, void *data)
6780 {
6781 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
6782 struct vm_area_struct *vma = data;
6783 unsigned long off = vma->vm_pgoff << PAGE_SHIFT, flags;
6784 struct file *file = vma->vm_file;
6785 struct perf_addr_filter *filter;
6786 unsigned int restart = 0, count = 0;
6787
6788 if (!has_addr_filter(event))
6789 return;
6790
6791 if (!file)
6792 return;
6793
6794 raw_spin_lock_irqsave(&ifh->lock, flags);
6795 list_for_each_entry(filter, &ifh->list, entry) {
6796 if (perf_addr_filter_match(filter, file, off,
6797 vma->vm_end - vma->vm_start)) {
6798 event->addr_filters_offs[count] = vma->vm_start;
6799 restart++;
6800 }
6801
6802 count++;
6803 }
6804
6805 if (restart)
6806 event->addr_filters_gen++;
6807 raw_spin_unlock_irqrestore(&ifh->lock, flags);
6808
6809 if (restart)
6810 perf_event_stop(event, 1);
6811 }
6812
6813 /*
6814 * Adjust all task's events' filters to the new vma
6815 */
6816 static void perf_addr_filters_adjust(struct vm_area_struct *vma)
6817 {
6818 struct perf_event_context *ctx;
6819 int ctxn;
6820
6821 /*
6822 * Data tracing isn't supported yet and as such there is no need
6823 * to keep track of anything that isn't related to executable code:
6824 */
6825 if (!(vma->vm_flags & VM_EXEC))
6826 return;
6827
6828 rcu_read_lock();
6829 for_each_task_context_nr(ctxn) {
6830 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
6831 if (!ctx)
6832 continue;
6833
6834 perf_iterate_ctx(ctx, __perf_addr_filters_adjust, vma, true);
6835 }
6836 rcu_read_unlock();
6837 }
6838
6839 void perf_event_mmap(struct vm_area_struct *vma)
6840 {
6841 struct perf_mmap_event mmap_event;
6842
6843 if (!atomic_read(&nr_mmap_events))
6844 return;
6845
6846 mmap_event = (struct perf_mmap_event){
6847 .vma = vma,
6848 /* .file_name */
6849 /* .file_size */
6850 .event_id = {
6851 .header = {
6852 .type = PERF_RECORD_MMAP,
6853 .misc = PERF_RECORD_MISC_USER,
6854 /* .size */
6855 },
6856 /* .pid */
6857 /* .tid */
6858 .start = vma->vm_start,
6859 .len = vma->vm_end - vma->vm_start,
6860 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
6861 },
6862 /* .maj (attr_mmap2 only) */
6863 /* .min (attr_mmap2 only) */
6864 /* .ino (attr_mmap2 only) */
6865 /* .ino_generation (attr_mmap2 only) */
6866 /* .prot (attr_mmap2 only) */
6867 /* .flags (attr_mmap2 only) */
6868 };
6869
6870 perf_addr_filters_adjust(vma);
6871 perf_event_mmap_event(&mmap_event);
6872 }
6873
6874 void perf_event_aux_event(struct perf_event *event, unsigned long head,
6875 unsigned long size, u64 flags)
6876 {
6877 struct perf_output_handle handle;
6878 struct perf_sample_data sample;
6879 struct perf_aux_event {
6880 struct perf_event_header header;
6881 u64 offset;
6882 u64 size;
6883 u64 flags;
6884 } rec = {
6885 .header = {
6886 .type = PERF_RECORD_AUX,
6887 .misc = 0,
6888 .size = sizeof(rec),
6889 },
6890 .offset = head,
6891 .size = size,
6892 .flags = flags,
6893 };
6894 int ret;
6895
6896 perf_event_header__init_id(&rec.header, &sample, event);
6897 ret = perf_output_begin(&handle, event, rec.header.size);
6898
6899 if (ret)
6900 return;
6901
6902 perf_output_put(&handle, rec);
6903 perf_event__output_id_sample(event, &handle, &sample);
6904
6905 perf_output_end(&handle);
6906 }
6907
6908 /*
6909 * Lost/dropped samples logging
6910 */
6911 void perf_log_lost_samples(struct perf_event *event, u64 lost)
6912 {
6913 struct perf_output_handle handle;
6914 struct perf_sample_data sample;
6915 int ret;
6916
6917 struct {
6918 struct perf_event_header header;
6919 u64 lost;
6920 } lost_samples_event = {
6921 .header = {
6922 .type = PERF_RECORD_LOST_SAMPLES,
6923 .misc = 0,
6924 .size = sizeof(lost_samples_event),
6925 },
6926 .lost = lost,
6927 };
6928
6929 perf_event_header__init_id(&lost_samples_event.header, &sample, event);
6930
6931 ret = perf_output_begin(&handle, event,
6932 lost_samples_event.header.size);
6933 if (ret)
6934 return;
6935
6936 perf_output_put(&handle, lost_samples_event);
6937 perf_event__output_id_sample(event, &handle, &sample);
6938 perf_output_end(&handle);
6939 }
6940
6941 /*
6942 * context_switch tracking
6943 */
6944
6945 struct perf_switch_event {
6946 struct task_struct *task;
6947 struct task_struct *next_prev;
6948
6949 struct {
6950 struct perf_event_header header;
6951 u32 next_prev_pid;
6952 u32 next_prev_tid;
6953 } event_id;
6954 };
6955
6956 static int perf_event_switch_match(struct perf_event *event)
6957 {
6958 return event->attr.context_switch;
6959 }
6960
6961 static void perf_event_switch_output(struct perf_event *event, void *data)
6962 {
6963 struct perf_switch_event *se = data;
6964 struct perf_output_handle handle;
6965 struct perf_sample_data sample;
6966 int ret;
6967
6968 if (!perf_event_switch_match(event))
6969 return;
6970
6971 /* Only CPU-wide events are allowed to see next/prev pid/tid */
6972 if (event->ctx->task) {
6973 se->event_id.header.type = PERF_RECORD_SWITCH;
6974 se->event_id.header.size = sizeof(se->event_id.header);
6975 } else {
6976 se->event_id.header.type = PERF_RECORD_SWITCH_CPU_WIDE;
6977 se->event_id.header.size = sizeof(se->event_id);
6978 se->event_id.next_prev_pid =
6979 perf_event_pid(event, se->next_prev);
6980 se->event_id.next_prev_tid =
6981 perf_event_tid(event, se->next_prev);
6982 }
6983
6984 perf_event_header__init_id(&se->event_id.header, &sample, event);
6985
6986 ret = perf_output_begin(&handle, event, se->event_id.header.size);
6987 if (ret)
6988 return;
6989
6990 if (event->ctx->task)
6991 perf_output_put(&handle, se->event_id.header);
6992 else
6993 perf_output_put(&handle, se->event_id);
6994
6995 perf_event__output_id_sample(event, &handle, &sample);
6996
6997 perf_output_end(&handle);
6998 }
6999
7000 static void perf_event_switch(struct task_struct *task,
7001 struct task_struct *next_prev, bool sched_in)
7002 {
7003 struct perf_switch_event switch_event;
7004
7005 /* N.B. caller checks nr_switch_events != 0 */
7006
7007 switch_event = (struct perf_switch_event){
7008 .task = task,
7009 .next_prev = next_prev,
7010 .event_id = {
7011 .header = {
7012 /* .type */
7013 .misc = sched_in ? 0 : PERF_RECORD_MISC_SWITCH_OUT,
7014 /* .size */
7015 },
7016 /* .next_prev_pid */
7017 /* .next_prev_tid */
7018 },
7019 };
7020
7021 perf_iterate_sb(perf_event_switch_output,
7022 &switch_event,
7023 NULL);
7024 }
7025
7026 /*
7027 * IRQ throttle logging
7028 */
7029
7030 static void perf_log_throttle(struct perf_event *event, int enable)
7031 {
7032 struct perf_output_handle handle;
7033 struct perf_sample_data sample;
7034 int ret;
7035
7036 struct {
7037 struct perf_event_header header;
7038 u64 time;
7039 u64 id;
7040 u64 stream_id;
7041 } throttle_event = {
7042 .header = {
7043 .type = PERF_RECORD_THROTTLE,
7044 .misc = 0,
7045 .size = sizeof(throttle_event),
7046 },
7047 .time = perf_event_clock(event),
7048 .id = primary_event_id(event),
7049 .stream_id = event->id,
7050 };
7051
7052 if (enable)
7053 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
7054
7055 perf_event_header__init_id(&throttle_event.header, &sample, event);
7056
7057 ret = perf_output_begin(&handle, event,
7058 throttle_event.header.size);
7059 if (ret)
7060 return;
7061
7062 perf_output_put(&handle, throttle_event);
7063 perf_event__output_id_sample(event, &handle, &sample);
7064 perf_output_end(&handle);
7065 }
7066
7067 static void perf_log_itrace_start(struct perf_event *event)
7068 {
7069 struct perf_output_handle handle;
7070 struct perf_sample_data sample;
7071 struct perf_aux_event {
7072 struct perf_event_header header;
7073 u32 pid;
7074 u32 tid;
7075 } rec;
7076 int ret;
7077
7078 if (event->parent)
7079 event = event->parent;
7080
7081 if (!(event->pmu->capabilities & PERF_PMU_CAP_ITRACE) ||
7082 event->hw.itrace_started)
7083 return;
7084
7085 rec.header.type = PERF_RECORD_ITRACE_START;
7086 rec.header.misc = 0;
7087 rec.header.size = sizeof(rec);
7088 rec.pid = perf_event_pid(event, current);
7089 rec.tid = perf_event_tid(event, current);
7090
7091 perf_event_header__init_id(&rec.header, &sample, event);
7092 ret = perf_output_begin(&handle, event, rec.header.size);
7093
7094 if (ret)
7095 return;
7096
7097 perf_output_put(&handle, rec);
7098 perf_event__output_id_sample(event, &handle, &sample);
7099
7100 perf_output_end(&handle);
7101 }
7102
7103 static int
7104 __perf_event_account_interrupt(struct perf_event *event, int throttle)
7105 {
7106 struct hw_perf_event *hwc = &event->hw;
7107 int ret = 0;
7108 u64 seq;
7109
7110 seq = __this_cpu_read(perf_throttled_seq);
7111 if (seq != hwc->interrupts_seq) {
7112 hwc->interrupts_seq = seq;
7113 hwc->interrupts = 1;
7114 } else {
7115 hwc->interrupts++;
7116 if (unlikely(throttle
7117 && hwc->interrupts >= max_samples_per_tick)) {
7118 __this_cpu_inc(perf_throttled_count);
7119 tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS);
7120 hwc->interrupts = MAX_INTERRUPTS;
7121 perf_log_throttle(event, 0);
7122 ret = 1;
7123 }
7124 }
7125
7126 if (event->attr.freq) {
7127 u64 now = perf_clock();
7128 s64 delta = now - hwc->freq_time_stamp;
7129
7130 hwc->freq_time_stamp = now;
7131
7132 if (delta > 0 && delta < 2*TICK_NSEC)
7133 perf_adjust_period(event, delta, hwc->last_period, true);
7134 }
7135
7136 return ret;
7137 }
7138
7139 int perf_event_account_interrupt(struct perf_event *event)
7140 {
7141 return __perf_event_account_interrupt(event, 1);
7142 }
7143
7144 /*
7145 * Generic event overflow handling, sampling.
7146 */
7147
7148 static int __perf_event_overflow(struct perf_event *event,
7149 int throttle, struct perf_sample_data *data,
7150 struct pt_regs *regs)
7151 {
7152 int events = atomic_read(&event->event_limit);
7153 int ret = 0;
7154
7155 /*
7156 * Non-sampling counters might still use the PMI to fold short
7157 * hardware counters, ignore those.
7158 */
7159 if (unlikely(!is_sampling_event(event)))
7160 return 0;
7161
7162 ret = __perf_event_account_interrupt(event, throttle);
7163
7164 /*
7165 * XXX event_limit might not quite work as expected on inherited
7166 * events
7167 */
7168
7169 event->pending_kill = POLL_IN;
7170 if (events && atomic_dec_and_test(&event->event_limit)) {
7171 ret = 1;
7172 event->pending_kill = POLL_HUP;
7173
7174 perf_event_disable_inatomic(event);
7175 }
7176
7177 READ_ONCE(event->overflow_handler)(event, data, regs);
7178
7179 if (*perf_event_fasync(event) && event->pending_kill) {
7180 event->pending_wakeup = 1;
7181 irq_work_queue(&event->pending);
7182 }
7183
7184 return ret;
7185 }
7186
7187 int perf_event_overflow(struct perf_event *event,
7188 struct perf_sample_data *data,
7189 struct pt_regs *regs)
7190 {
7191 return __perf_event_overflow(event, 1, data, regs);
7192 }
7193
7194 /*
7195 * Generic software event infrastructure
7196 */
7197
7198 struct swevent_htable {
7199 struct swevent_hlist *swevent_hlist;
7200 struct mutex hlist_mutex;
7201 int hlist_refcount;
7202
7203 /* Recursion avoidance in each contexts */
7204 int recursion[PERF_NR_CONTEXTS];
7205 };
7206
7207 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
7208
7209 /*
7210 * We directly increment event->count and keep a second value in
7211 * event->hw.period_left to count intervals. This period event
7212 * is kept in the range [-sample_period, 0] so that we can use the
7213 * sign as trigger.
7214 */
7215
7216 u64 perf_swevent_set_period(struct perf_event *event)
7217 {
7218 struct hw_perf_event *hwc = &event->hw;
7219 u64 period = hwc->last_period;
7220 u64 nr, offset;
7221 s64 old, val;
7222
7223 hwc->last_period = hwc->sample_period;
7224
7225 again:
7226 old = val = local64_read(&hwc->period_left);
7227 if (val < 0)
7228 return 0;
7229
7230 nr = div64_u64(period + val, period);
7231 offset = nr * period;
7232 val -= offset;
7233 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
7234 goto again;
7235
7236 return nr;
7237 }
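/*
 * Worked example: with a sample period of 100 and period_left at 250,
 * nr = (100 + 250) / 100 = 3 and period_left is rewound by 300 to
 * -50, so the caller reports three overflows and roughly 50 more
 * events accumulate before the next overflow pass.
 */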
7238
7239 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
7240 struct perf_sample_data *data,
7241 struct pt_regs *regs)
7242 {
7243 struct hw_perf_event *hwc = &event->hw;
7244 int throttle = 0;
7245
7246 if (!overflow)
7247 overflow = perf_swevent_set_period(event);
7248
7249 if (hwc->interrupts == MAX_INTERRUPTS)
7250 return;
7251
7252 for (; overflow; overflow--) {
7253 if (__perf_event_overflow(event, throttle,
7254 data, regs)) {
7255 /*
7256 * We inhibit the overflow from happening when
7257 * hwc->interrupts == MAX_INTERRUPTS.
7258 */
7259 break;
7260 }
7261 throttle = 1;
7262 }
7263 }
7264
7265 static void perf_swevent_event(struct perf_event *event, u64 nr,
7266 struct perf_sample_data *data,
7267 struct pt_regs *regs)
7268 {
7269 struct hw_perf_event *hwc = &event->hw;
7270
7271 local64_add(nr, &event->count);
7272
7273 if (!regs)
7274 return;
7275
7276 if (!is_sampling_event(event))
7277 return;
7278
7279 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
7280 data->period = nr;
7281 return perf_swevent_overflow(event, 1, data, regs);
7282 } else
7283 data->period = event->hw.last_period;
7284
7285 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
7286 return perf_swevent_overflow(event, 1, data, regs);
7287
7288 if (local64_add_negative(nr, &hwc->period_left))
7289 return;
7290
7291 perf_swevent_overflow(event, 0, data, regs);
7292 }
7293
7294 static int perf_exclude_event(struct perf_event *event,
7295 struct pt_regs *regs)
7296 {
7297 if (event->hw.state & PERF_HES_STOPPED)
7298 return 1;
7299
7300 if (regs) {
7301 if (event->attr.exclude_user && user_mode(regs))
7302 return 1;
7303
7304 if (event->attr.exclude_kernel && !user_mode(regs))
7305 return 1;
7306 }
7307
7308 return 0;
7309 }
7310
7311 static int perf_swevent_match(struct perf_event *event,
7312 enum perf_type_id type,
7313 u32 event_id,
7314 struct perf_sample_data *data,
7315 struct pt_regs *regs)
7316 {
7317 if (event->attr.type != type)
7318 return 0;
7319
7320 if (event->attr.config != event_id)
7321 return 0;
7322
7323 if (perf_exclude_event(event, regs))
7324 return 0;
7325
7326 return 1;
7327 }
7328
7329 static inline u64 swevent_hash(u64 type, u32 event_id)
7330 {
7331 u64 val = event_id | (type << 32);
7332
7333 return hash_64(val, SWEVENT_HLIST_BITS);
7334 }
7335
7336 static inline struct hlist_head *
7337 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
7338 {
7339 u64 hash = swevent_hash(type, event_id);
7340
7341 return &hlist->heads[hash];
7342 }
7343
7344 /* For the read side: events when they trigger */
7345 static inline struct hlist_head *
7346 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
7347 {
7348 struct swevent_hlist *hlist;
7349
7350 hlist = rcu_dereference(swhash->swevent_hlist);
7351 if (!hlist)
7352 return NULL;
7353
7354 return __find_swevent_head(hlist, type, event_id);
7355 }
7356
7357 /* For the event head insertion and removal in the hlist */
7358 static inline struct hlist_head *
7359 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
7360 {
7361 struct swevent_hlist *hlist;
7362 u32 event_id = event->attr.config;
7363 u64 type = event->attr.type;
7364
7365 /*
7366 * Event scheduling is always serialized against hlist allocation
7367 * and release, which makes the protected version suitable here;
7368 * the context lock guarantees that.
7369 */
7370 hlist = rcu_dereference_protected(swhash->swevent_hlist,
7371 lockdep_is_held(&event->ctx->lock));
7372 if (!hlist)
7373 return NULL;
7374
7375 return __find_swevent_head(hlist, type, event_id);
7376 }
7377
7378 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
7379 u64 nr,
7380 struct perf_sample_data *data,
7381 struct pt_regs *regs)
7382 {
7383 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7384 struct perf_event *event;
7385 struct hlist_head *head;
7386
7387 rcu_read_lock();
7388 head = find_swevent_head_rcu(swhash, type, event_id);
7389 if (!head)
7390 goto end;
7391
7392 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7393 if (perf_swevent_match(event, type, event_id, data, regs))
7394 perf_swevent_event(event, nr, data, regs);
7395 }
7396 end:
7397 rcu_read_unlock();
7398 }
7399
7400 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
7401
7402 int perf_swevent_get_recursion_context(void)
7403 {
7404 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7405
7406 return get_recursion_context(swhash->recursion);
7407 }
7408 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
7409
7410 void perf_swevent_put_recursion_context(int rctx)
7411 {
7412 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7413
7414 put_recursion_context(swhash->recursion, rctx);
7415 }
7416
7417 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7418 {
7419 struct perf_sample_data data;
7420
7421 if (WARN_ON_ONCE(!regs))
7422 return;
7423
7424 perf_sample_data_init(&data, addr, 0);
7425 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
7426 }
7427
7428 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
7429 {
7430 int rctx;
7431
7432 preempt_disable_notrace();
7433 rctx = perf_swevent_get_recursion_context();
7434 if (unlikely(rctx < 0))
7435 goto fail;
7436
7437 ___perf_sw_event(event_id, nr, regs, addr);
7438
7439 perf_swevent_put_recursion_context(rctx);
7440 fail:
7441 preempt_enable_notrace();
7442 }
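/*
 * Usage sketch: most callers go through the perf_sw_event() inline
 * wrapper, e.g. the page-fault path does something like
 *
 *	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 *
 * which only reaches this function when the matching static key in
 * perf_swevent_enabled[] has been enabled.
 */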
7443
7444 static void perf_swevent_read(struct perf_event *event)
7445 {
7446 }
7447
7448 static int perf_swevent_add(struct perf_event *event, int flags)
7449 {
7450 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
7451 struct hw_perf_event *hwc = &event->hw;
7452 struct hlist_head *head;
7453
7454 if (is_sampling_event(event)) {
7455 hwc->last_period = hwc->sample_period;
7456 perf_swevent_set_period(event);
7457 }
7458
7459 hwc->state = !(flags & PERF_EF_START);
7460
7461 head = find_swevent_head(swhash, event);
7462 if (WARN_ON_ONCE(!head))
7463 return -EINVAL;
7464
7465 hlist_add_head_rcu(&event->hlist_entry, head);
7466 perf_event_update_userpage(event);
7467
7468 return 0;
7469 }
7470
7471 static void perf_swevent_del(struct perf_event *event, int flags)
7472 {
7473 hlist_del_rcu(&event->hlist_entry);
7474 }
7475
7476 static void perf_swevent_start(struct perf_event *event, int flags)
7477 {
7478 event->hw.state = 0;
7479 }
7480
7481 static void perf_swevent_stop(struct perf_event *event, int flags)
7482 {
7483 event->hw.state = PERF_HES_STOPPED;
7484 }
7485
7486 /* Deref the hlist from the update side */
7487 static inline struct swevent_hlist *
7488 swevent_hlist_deref(struct swevent_htable *swhash)
7489 {
7490 return rcu_dereference_protected(swhash->swevent_hlist,
7491 lockdep_is_held(&swhash->hlist_mutex));
7492 }
7493
7494 static void swevent_hlist_release(struct swevent_htable *swhash)
7495 {
7496 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
7497
7498 if (!hlist)
7499 return;
7500
7501 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
7502 kfree_rcu(hlist, rcu_head);
7503 }
7504
7505 static void swevent_hlist_put_cpu(int cpu)
7506 {
7507 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7508
7509 mutex_lock(&swhash->hlist_mutex);
7510
7511 if (!--swhash->hlist_refcount)
7512 swevent_hlist_release(swhash);
7513
7514 mutex_unlock(&swhash->hlist_mutex);
7515 }
7516
7517 static void swevent_hlist_put(void)
7518 {
7519 int cpu;
7520
7521 for_each_possible_cpu(cpu)
7522 swevent_hlist_put_cpu(cpu);
7523 }
7524
7525 static int swevent_hlist_get_cpu(int cpu)
7526 {
7527 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7528 int err = 0;
7529
7530 mutex_lock(&swhash->hlist_mutex);
7531 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
7532 struct swevent_hlist *hlist;
7533
7534 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
7535 if (!hlist) {
7536 err = -ENOMEM;
7537 goto exit;
7538 }
7539 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7540 }
7541 swhash->hlist_refcount++;
7542 exit:
7543 mutex_unlock(&swhash->hlist_mutex);
7544
7545 return err;
7546 }
7547
7548 static int swevent_hlist_get(void)
7549 {
7550 int err, cpu, failed_cpu;
7551
7552 get_online_cpus();
7553 for_each_possible_cpu(cpu) {
7554 err = swevent_hlist_get_cpu(cpu);
7555 if (err) {
7556 failed_cpu = cpu;
7557 goto fail;
7558 }
7559 }
7560 put_online_cpus();
7561
7562 return 0;
7563 fail:
7564 for_each_possible_cpu(cpu) {
7565 if (cpu == failed_cpu)
7566 break;
7567 swevent_hlist_put_cpu(cpu);
7568 }
7569
7570 put_online_cpus();
7571 return err;
7572 }
7573
7574 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
7575
7576 static void sw_perf_event_destroy(struct perf_event *event)
7577 {
7578 u64 event_id = event->attr.config;
7579
7580 WARN_ON(event->parent);
7581
7582 static_key_slow_dec(&perf_swevent_enabled[event_id]);
7583 swevent_hlist_put();
7584 }
7585
7586 static int perf_swevent_init(struct perf_event *event)
7587 {
7588 u64 event_id = event->attr.config;
7589
7590 if (event->attr.type != PERF_TYPE_SOFTWARE)
7591 return -ENOENT;
7592
7593 /*
7594 * no branch sampling for software events
7595 */
7596 if (has_branch_stack(event))
7597 return -EOPNOTSUPP;
7598
7599 switch (event_id) {
7600 case PERF_COUNT_SW_CPU_CLOCK:
7601 case PERF_COUNT_SW_TASK_CLOCK:
7602 return -ENOENT;
7603
7604 default:
7605 break;
7606 }
7607
7608 if (event_id >= PERF_COUNT_SW_MAX)
7609 return -ENOENT;
7610
7611 if (!event->parent) {
7612 int err;
7613
7614 err = swevent_hlist_get();
7615 if (err)
7616 return err;
7617
7618 static_key_slow_inc(&perf_swevent_enabled[event_id]);
7619 event->destroy = sw_perf_event_destroy;
7620 }
7621
7622 return 0;
7623 }
7624
7625 static struct pmu perf_swevent = {
7626 .task_ctx_nr = perf_sw_context,
7627
7628 .capabilities = PERF_PMU_CAP_NO_NMI,
7629
7630 .event_init = perf_swevent_init,
7631 .add = perf_swevent_add,
7632 .del = perf_swevent_del,
7633 .start = perf_swevent_start,
7634 .stop = perf_swevent_stop,
7635 .read = perf_swevent_read,
7636 };
7637
7638 #ifdef CONFIG_EVENT_TRACING
7639
7640 static int perf_tp_filter_match(struct perf_event *event,
7641 struct perf_sample_data *data)
7642 {
7643 void *record = data->raw->frag.data;
7644
7645 /* only top level events have filters set */
7646 if (event->parent)
7647 event = event->parent;
7648
7649 if (likely(!event->filter) || filter_match_preds(event->filter, record))
7650 return 1;
7651 return 0;
7652 }
7653
7654 static int perf_tp_event_match(struct perf_event *event,
7655 struct perf_sample_data *data,
7656 struct pt_regs *regs)
7657 {
7658 if (event->hw.state & PERF_HES_STOPPED)
7659 return 0;
7660 /*
7661 * All tracepoints are from kernel-space.
7662 */
7663 if (event->attr.exclude_kernel)
7664 return 0;
7665
7666 if (!perf_tp_filter_match(event, data))
7667 return 0;
7668
7669 return 1;
7670 }
7671
7672 void perf_trace_run_bpf_submit(void *raw_data, int size, int rctx,
7673 struct trace_event_call *call, u64 count,
7674 struct pt_regs *regs, struct hlist_head *head,
7675 struct task_struct *task)
7676 {
7677 struct bpf_prog *prog = call->prog;
7678
7679 if (prog) {
7680 *(struct pt_regs **)raw_data = regs;
7681 if (!trace_call_bpf(prog, raw_data) || hlist_empty(head)) {
7682 perf_swevent_put_recursion_context(rctx);
7683 return;
7684 }
7685 }
7686 perf_tp_event(call->event.type, count, raw_data, size, regs, head,
7687 rctx, task);
7688 }
7689 EXPORT_SYMBOL_GPL(perf_trace_run_bpf_submit);
7690
7691 void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size,
7692 struct pt_regs *regs, struct hlist_head *head, int rctx,
7693 struct task_struct *task)
7694 {
7695 struct perf_sample_data data;
7696 struct perf_event *event;
7697
7698 struct perf_raw_record raw = {
7699 .frag = {
7700 .size = entry_size,
7701 .data = record,
7702 },
7703 };
7704
7705 perf_sample_data_init(&data, 0, 0);
7706 data.raw = &raw;
7707
7708 perf_trace_buf_update(record, event_type);
7709
7710 hlist_for_each_entry_rcu(event, head, hlist_entry) {
7711 if (perf_tp_event_match(event, &data, regs))
7712 perf_swevent_event(event, count, &data, regs);
7713 }
7714
7715 /*
7716 * If we were given a target task, also iterate its context and
7717 * deliver this event there too.
7718 */
7719 if (task && task != current) {
7720 struct perf_event_context *ctx;
7721 struct trace_entry *entry = record;
7722
7723 rcu_read_lock();
7724 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
7725 if (!ctx)
7726 goto unlock;
7727
7728 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
7729 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7730 continue;
7731 if (event->attr.config != entry->type)
7732 continue;
7733 if (perf_tp_event_match(event, &data, regs))
7734 perf_swevent_event(event, count, &data, regs);
7735 }
7736 unlock:
7737 rcu_read_unlock();
7738 }
7739
7740 perf_swevent_put_recursion_context(rctx);
7741 }
7742 EXPORT_SYMBOL_GPL(perf_tp_event);
7743
7744 static void tp_perf_event_destroy(struct perf_event *event)
7745 {
7746 perf_trace_destroy(event);
7747 }
7748
7749 static int perf_tp_event_init(struct perf_event *event)
7750 {
7751 int err;
7752
7753 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7754 return -ENOENT;
7755
7756 /*
7757 * no branch sampling for tracepoint events
7758 */
7759 if (has_branch_stack(event))
7760 return -EOPNOTSUPP;
7761
7762 err = perf_trace_init(event);
7763 if (err)
7764 return err;
7765
7766 event->destroy = tp_perf_event_destroy;
7767
7768 return 0;
7769 }
7770
7771 static struct pmu perf_tracepoint = {
7772 .task_ctx_nr = perf_sw_context,
7773
7774 .event_init = perf_tp_event_init,
7775 .add = perf_trace_add,
7776 .del = perf_trace_del,
7777 .start = perf_swevent_start,
7778 .stop = perf_swevent_stop,
7779 .read = perf_swevent_read,
7780 };
7781
7782 static inline void perf_tp_register(void)
7783 {
7784 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
7785 }
7786
7787 static void perf_event_free_filter(struct perf_event *event)
7788 {
7789 ftrace_profile_free_filter(event);
7790 }
7791
7792 #ifdef CONFIG_BPF_SYSCALL
7793 static void bpf_overflow_handler(struct perf_event *event,
7794 struct perf_sample_data *data,
7795 struct pt_regs *regs)
7796 {
7797 struct bpf_perf_event_data_kern ctx = {
7798 .data = data,
7799 .regs = regs,
7800 };
7801 int ret = 0;
7802
7803 preempt_disable();
7804 if (unlikely(__this_cpu_inc_return(bpf_prog_active) != 1))
7805 goto out;
7806 rcu_read_lock();
7807 ret = BPF_PROG_RUN(event->prog, (void *)&ctx);
7808 rcu_read_unlock();
7809 out:
7810 __this_cpu_dec(bpf_prog_active);
7811 preempt_enable();
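	/* Only run the original overflow handler if the program returned non-zero. */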
7812 if (!ret)
7813 return;
7814
7815 event->orig_overflow_handler(event, data, regs);
7816 }
7817
7818 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7819 {
7820 struct bpf_prog *prog;
7821
7822 if (event->overflow_handler_context)
7823 /* hw breakpoint or kernel counter */
7824 return -EINVAL;
7825
7826 if (event->prog)
7827 return -EEXIST;
7828
7829 prog = bpf_prog_get_type(prog_fd, BPF_PROG_TYPE_PERF_EVENT);
7830 if (IS_ERR(prog))
7831 return PTR_ERR(prog);
7832
7833 event->prog = prog;
7834 event->orig_overflow_handler = READ_ONCE(event->overflow_handler);
7835 WRITE_ONCE(event->overflow_handler, bpf_overflow_handler);
7836 return 0;
7837 }
7838
7839 static void perf_event_free_bpf_handler(struct perf_event *event)
7840 {
7841 struct bpf_prog *prog = event->prog;
7842
7843 if (!prog)
7844 return;
7845
7846 WRITE_ONCE(event->overflow_handler, event->orig_overflow_handler);
7847 event->prog = NULL;
7848 bpf_prog_put(prog);
7849 }
7850 #else
7851 static int perf_event_set_bpf_handler(struct perf_event *event, u32 prog_fd)
7852 {
7853 return -EOPNOTSUPP;
7854 }
7855 static void perf_event_free_bpf_handler(struct perf_event *event)
7856 {
7857 }
7858 #endif
7859
7860 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7861 {
7862 bool is_kprobe, is_tracepoint;
7863 struct bpf_prog *prog;
7864
7865 if (event->attr.type == PERF_TYPE_HARDWARE ||
7866 event->attr.type == PERF_TYPE_SOFTWARE)
7867 return perf_event_set_bpf_handler(event, prog_fd);
7868
7869 if (event->attr.type != PERF_TYPE_TRACEPOINT)
7870 return -EINVAL;
7871
7872 if (event->tp_event->prog)
7873 return -EEXIST;
7874
7875 is_kprobe = event->tp_event->flags & TRACE_EVENT_FL_UKPROBE;
7876 is_tracepoint = event->tp_event->flags & TRACE_EVENT_FL_TRACEPOINT;
7877 if (!is_kprobe && !is_tracepoint)
7878 /* bpf programs can only be attached to u/kprobe or tracepoint */
7879 return -EINVAL;
7880
7881 prog = bpf_prog_get(prog_fd);
7882 if (IS_ERR(prog))
7883 return PTR_ERR(prog);
7884
7885 if ((is_kprobe && prog->type != BPF_PROG_TYPE_KPROBE) ||
7886 (is_tracepoint && prog->type != BPF_PROG_TYPE_TRACEPOINT)) {
7887 /* valid fd, but invalid bpf program type */
7888 bpf_prog_put(prog);
7889 return -EINVAL;
7890 }
7891
7892 if (is_tracepoint) {
7893 int off = trace_event_get_offsets(event->tp_event);
7894
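		/*
		 * Reject programs that read tracepoint data beyond the size
		 * this particular tracepoint actually provides.
		 */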
7895 if (prog->aux->max_ctx_offset > off) {
7896 bpf_prog_put(prog);
7897 return -EACCES;
7898 }
7899 }
7900 event->tp_event->prog = prog;
7901 event->tp_event->bpf_prog_owner = event;
7902
7903 return 0;
7904 }
7905
7906 static void perf_event_free_bpf_prog(struct perf_event *event)
7907 {
7908 struct bpf_prog *prog;
7909
7910 perf_event_free_bpf_handler(event);
7911
7912 if (!event->tp_event)
7913 return;
7914
7915 prog = event->tp_event->prog;
7916 if (prog && event->tp_event->bpf_prog_owner == event) {
7917 event->tp_event->prog = NULL;
7918 bpf_prog_put(prog);
7919 }
7920 }
7921
7922 #else
7923
7924 static inline void perf_tp_register(void)
7925 {
7926 }
7927
7928 static void perf_event_free_filter(struct perf_event *event)
7929 {
7930 }
7931
7932 static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
7933 {
7934 return -ENOENT;
7935 }
7936
7937 static void perf_event_free_bpf_prog(struct perf_event *event)
7938 {
7939 }
7940 #endif /* CONFIG_EVENT_TRACING */
7941
7942 #ifdef CONFIG_HAVE_HW_BREAKPOINT
7943 void perf_bp_event(struct perf_event *bp, void *data)
7944 {
7945 struct perf_sample_data sample;
7946 struct pt_regs *regs = data;
7947
7948 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
7949
7950 if (!bp->hw.state && !perf_exclude_event(bp, regs))
7951 perf_swevent_event(bp, 1, &sample, regs);
7952 }
7953 #endif
7954
7955 /*
7956 * Allocate a new address filter
7957 */
7958 static struct perf_addr_filter *
7959 perf_addr_filter_new(struct perf_event *event, struct list_head *filters)
7960 {
7961 int node = cpu_to_node(event->cpu == -1 ? 0 : event->cpu);
7962 struct perf_addr_filter *filter;
7963
7964 filter = kzalloc_node(sizeof(*filter), GFP_KERNEL, node);
7965 if (!filter)
7966 return NULL;
7967
7968 INIT_LIST_HEAD(&filter->entry);
7969 list_add_tail(&filter->entry, filters);
7970
7971 return filter;
7972 }
7973
7974 static void free_filters_list(struct list_head *filters)
7975 {
7976 struct perf_addr_filter *filter, *iter;
7977
7978 list_for_each_entry_safe(filter, iter, filters, entry) {
7979 if (filter->inode)
7980 iput(filter->inode);
7981 list_del(&filter->entry);
7982 kfree(filter);
7983 }
7984 }
7985
7986 /*
7987 * Free existing address filters and optionally install new ones
7988 */
7989 static void perf_addr_filters_splice(struct perf_event *event,
7990 struct list_head *head)
7991 {
7992 unsigned long flags;
7993 LIST_HEAD(list);
7994
7995 if (!has_addr_filter(event))
7996 return;
7997
7998 /* don't bother with children, they don't have their own filters */
7999 if (event->parent)
8000 return;
8001
8002 raw_spin_lock_irqsave(&event->addr_filters.lock, flags);
8003
8004 list_splice_init(&event->addr_filters.list, &list);
8005 if (head)
8006 list_splice(head, &event->addr_filters.list);
8007
8008 raw_spin_unlock_irqrestore(&event->addr_filters.lock, flags);
8009
8010 free_filters_list(&list);
8011 }
8012
8013 /*
8014 * Scan through mm's vmas and see if one of them matches the
8015 * @filter; if so, adjust filter's address range.
8016 * Called with mm::mmap_sem down for reading.
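 * Returns the start address of the first matching vma, or 0 if none match.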
8017 */
8018 static unsigned long perf_addr_filter_apply(struct perf_addr_filter *filter,
8019 struct mm_struct *mm)
8020 {
8021 struct vm_area_struct *vma;
8022
8023 for (vma = mm->mmap; vma; vma = vma->vm_next) {
8024 struct file *file = vma->vm_file;
8025 unsigned long off = vma->vm_pgoff << PAGE_SHIFT;
8026 unsigned long vma_size = vma->vm_end - vma->vm_start;
8027
8028 if (!file)
8029 continue;
8030
8031 if (!perf_addr_filter_match(filter, file, off, vma_size))
8032 continue;
8033
8034 return vma->vm_start;
8035 }
8036
8037 return 0;
8038 }
8039
8040 /*
8041 * Update event's address range filters based on the
8042 * task's existing mappings, if any.
8043 */
8044 static void perf_event_addr_filters_apply(struct perf_event *event)
8045 {
8046 struct perf_addr_filters_head *ifh = perf_event_addr_filters(event);
8047 struct task_struct *task = READ_ONCE(event->ctx->task);
8048 struct perf_addr_filter *filter;
8049 struct mm_struct *mm = NULL;
8050 unsigned int count = 0;
8051 unsigned long flags;
8052
8053 /*
8054 * We may observe TASK_TOMBSTONE, which means that the event tear-down
8055 * will block on the parent's child_mutex, which our caller is also holding
8056 */
8057 if (task == TASK_TOMBSTONE)
8058 return;
8059
8060 mm = get_task_mm(event->ctx->task);
8061 if (!mm)
8062 goto restart;
8063
8064 down_read(&mm->mmap_sem);
8065
8066 raw_spin_lock_irqsave(&ifh->lock, flags);
8067 list_for_each_entry(filter, &ifh->list, entry) {
8068 event->addr_filters_offs[count] = 0;
8069
8070 /*
8071 * Adjust base offset if the filter is associated with a binary
8072 * that needs to be mapped:
8073 */
8074 if (filter->inode)
8075 event->addr_filters_offs[count] =
8076 perf_addr_filter_apply(filter, mm);
8077
8078 count++;
8079 }
8080
8081 event->addr_filters_gen++;
8082 raw_spin_unlock_irqrestore(&ifh->lock, flags);
8083
8084 up_read(&mm->mmap_sem);
8085
8086 mmput(mm);
8087
8088 restart:
8089 perf_event_stop(event, 1);
8090 }
8091
8092 /*
8093 * Address range filtering: limiting the data to certain
8094 * instruction address ranges. Filters are ioctl()ed to us from
8095 * userspace as ascii strings.
8096 *
8097 * Filter string format:
8098 *
8099 * ACTION RANGE_SPEC
8100 * where ACTION is one of the
8101 * * "filter": limit the trace to this region
8102 * * "start": start tracing from this address
8103 * * "stop": stop tracing at this address/region;
8104 * RANGE_SPEC is
8105 * * for kernel addresses: <start address>[/<size>]
8106 * * for object files: <start address>[/<size>]@</path/to/object/file>
8107 *
8108 * if <size> is not specified, the range is treated as a single address.
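 *
 * Example filter strings (addresses and path are illustrative only):
 *
 *	filter 0x1000/0x2000@/usr/lib/libfoo.so
 *	start 0xffffffff81000000
 *	stop 0xffffffff81000000/0x100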
8109 */
8110 enum {
8111 IF_ACT_NONE = -1,
8112 IF_ACT_FILTER,
8113 IF_ACT_START,
8114 IF_ACT_STOP,
8115 IF_SRC_FILE,
8116 IF_SRC_KERNEL,
8117 IF_SRC_FILEADDR,
8118 IF_SRC_KERNELADDR,
8119 };
8120
8121 enum {
8122 IF_STATE_ACTION = 0,
8123 IF_STATE_SOURCE,
8124 IF_STATE_END,
8125 };
8126
8127 static const match_table_t if_tokens = {
8128 { IF_ACT_FILTER, "filter" },
8129 { IF_ACT_START, "start" },
8130 { IF_ACT_STOP, "stop" },
8131 { IF_SRC_FILE, "%u/%u@%s" },
8132 { IF_SRC_KERNEL, "%u/%u" },
8133 { IF_SRC_FILEADDR, "%u@%s" },
8134 { IF_SRC_KERNELADDR, "%u" },
8135 { IF_ACT_NONE, NULL },
8136 };
8137
8138 /*
8139 * Address filter string parser
8140 */
8141 static int
8142 perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
8143 struct list_head *filters)
8144 {
8145 struct perf_addr_filter *filter = NULL;
8146 char *start, *orig, *filename = NULL;
8147 struct path path;
8148 substring_t args[MAX_OPT_ARGS];
8149 int state = IF_STATE_ACTION, token;
8150 unsigned int kernel = 0;
8151 int ret = -EINVAL;
8152
8153 orig = fstr = kstrdup(fstr, GFP_KERNEL);
8154 if (!fstr)
8155 return -ENOMEM;
8156
8157 while ((start = strsep(&fstr, " ,\n")) != NULL) {
8158 ret = -EINVAL;
8159
8160 if (!*start)
8161 continue;
8162
8163 /* filter definition begins */
8164 if (state == IF_STATE_ACTION) {
8165 filter = perf_addr_filter_new(event, filters);
8166 if (!filter)
8167 goto fail;
8168 }
8169
8170 token = match_token(start, if_tokens, args);
8171 switch (token) {
8172 case IF_ACT_FILTER:
8173 case IF_ACT_START:
8174 filter->filter = 1;
8175
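			/* fall through */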
8176 case IF_ACT_STOP:
8177 if (state != IF_STATE_ACTION)
8178 goto fail;
8179
8180 state = IF_STATE_SOURCE;
8181 break;
8182
8183 case IF_SRC_KERNELADDR:
8184 case IF_SRC_KERNEL:
8185 kernel = 1;
8186
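			/* fall through */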
8187 case IF_SRC_FILEADDR:
8188 case IF_SRC_FILE:
8189 if (state != IF_STATE_SOURCE)
8190 goto fail;
8191
8192 if (token == IF_SRC_FILE || token == IF_SRC_KERNEL)
8193 filter->range = 1;
8194
8195 *args[0].to = 0;
8196 ret = kstrtoul(args[0].from, 0, &filter->offset);
8197 if (ret)
8198 goto fail;
8199
8200 if (filter->range) {
8201 *args[1].to = 0;
8202 ret = kstrtoul(args[1].from, 0, &filter->size);
8203 if (ret)
8204 goto fail;
8205 }
8206
8207 if (token == IF_SRC_FILE || token == IF_SRC_FILEADDR) {
8208 int fpos = filter->range ? 2 : 1;
8209
8210 filename = match_strdup(&args[fpos]);
8211 if (!filename) {
8212 ret = -ENOMEM;
8213 goto fail;
8214 }
8215 }
8216
8217 state = IF_STATE_END;
8218 break;
8219
8220 default:
8221 goto fail;
8222 }
8223
8224 /*
8225 * Filter definition is fully parsed; validate and install it.
8226 * Make sure that it doesn't contradict itself or the event's
8227 * attribute.
8228 */
8229 if (state == IF_STATE_END) {
8230 if (kernel && event->attr.exclude_kernel)
8231 goto fail;
8232
8233 if (!kernel) {
8234 if (!filename)
8235 goto fail;
8236
8237 /* look up the path and grab its inode */
8238 ret = kern_path(filename, LOOKUP_FOLLOW, &path);
8239 if (ret)
8240 goto fail_free_name;
8241
8242 filter->inode = igrab(d_inode(path.dentry));
8243 path_put(&path);
8244 kfree(filename);
8245 filename = NULL;
8246
8247 ret = -EINVAL;
8248 if (!filter->inode ||
8249 !S_ISREG(filter->inode->i_mode))
8250 /* free_filters_list() will iput() */
8251 goto fail;
8252 }
8253
8254 /* ready to consume more filters */
8255 state = IF_STATE_ACTION;
8256 filter = NULL;
8257 }
8258 }
8259
8260 if (state != IF_STATE_ACTION)
8261 goto fail;
8262
8263 kfree(orig);
8264
8265 return 0;
8266
8267 fail_free_name:
8268 kfree(filename);
8269 fail:
8270 free_filters_list(filters);
8271 kfree(orig);
8272
8273 return ret;
8274 }
8275
8276 static int
8277 perf_event_set_addr_filter(struct perf_event *event, char *filter_str)
8278 {
8279 LIST_HEAD(filters);
8280 int ret;
8281
8282 /*
8283 * Since this is called in the perf_ioctl() path, we're already holding
8284 * ctx::mutex.
8285 */
8286 lockdep_assert_held(&event->ctx->mutex);
8287
8288 if (WARN_ON_ONCE(event->parent))
8289 return -EINVAL;
8290
8291 /*
8292 * For now, we only support filtering in per-task events; doing so
8293 * for CPU-wide events requires additional context switching trickery,
8294 * since the same object code will be mapped at different virtual
8295 * addresses in different processes.
8296 */
8297 if (!event->ctx->task)
8298 return -EOPNOTSUPP;
8299
8300 ret = perf_event_parse_addr_filter(event, filter_str, &filters);
8301 if (ret)
8302 return ret;
8303
8304 ret = event->pmu->addr_filters_validate(&filters);
8305 if (ret) {
8306 free_filters_list(&filters);
8307 return ret;
8308 }
8309
8310 /* remove existing filters, if any */
8311 perf_addr_filters_splice(event, &filters);
8312
8313 /* install new filters */
8314 perf_event_for_each_child(event, perf_event_addr_filters_apply);
8315
8316 return ret;
8317 }
8318
8319 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
8320 {
8321 char *filter_str;
8322 int ret = -EINVAL;
8323
8324 if ((event->attr.type != PERF_TYPE_TRACEPOINT ||
8325 !IS_ENABLED(CONFIG_EVENT_TRACING)) &&
8326 !has_addr_filter(event))
8327 return -EINVAL;
8328
8329 filter_str = strndup_user(arg, PAGE_SIZE);
8330 if (IS_ERR(filter_str))
8331 return PTR_ERR(filter_str);
8332
8333 if (IS_ENABLED(CONFIG_EVENT_TRACING) &&
8334 event->attr.type == PERF_TYPE_TRACEPOINT)
8335 ret = ftrace_profile_set_filter(event, event->attr.config,
8336 filter_str);
8337 else if (has_addr_filter(event))
8338 ret = perf_event_set_addr_filter(event, filter_str);
8339
8340 kfree(filter_str);
8341 return ret;
8342 }
8343
8344 /*
8345 * hrtimer based swevent callback
8346 */
8347
8348 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
8349 {
8350 enum hrtimer_restart ret = HRTIMER_RESTART;
8351 struct perf_sample_data data;
8352 struct pt_regs *regs;
8353 struct perf_event *event;
8354 u64 period;
8355
8356 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
8357
8358 if (event->state != PERF_EVENT_STATE_ACTIVE)
8359 return HRTIMER_NORESTART;
8360
8361 event->pmu->read(event);
8362
8363 perf_sample_data_init(&data, 0, event->hw.last_period);
8364 regs = get_irq_regs();
8365
8366 if (regs && !perf_exclude_event(event, regs)) {
8367 if (!(event->attr.exclude_idle && is_idle_task(current)))
8368 if (__perf_event_overflow(event, 1, &data, regs))
8369 ret = HRTIMER_NORESTART;
8370 }
8371
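	/* Re-arm, clamping the period to at least 10000 ns (10 us). */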
8372 period = max_t(u64, 10000, event->hw.sample_period);
8373 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
8374
8375 return ret;
8376 }
8377
8378 static void perf_swevent_start_hrtimer(struct perf_event *event)
8379 {
8380 struct hw_perf_event *hwc = &event->hw;
8381 s64 period;
8382
8383 if (!is_sampling_event(event))
8384 return;
8385
8386 period = local64_read(&hwc->period_left);
8387 if (period) {
8388 if (period < 0)
8389 period = 10000;
8390
8391 local64_set(&hwc->period_left, 0);
8392 } else {
8393 period = max_t(u64, 10000, hwc->sample_period);
8394 }
8395 hrtimer_start(&hwc->hrtimer, ns_to_ktime(period),
8396 HRTIMER_MODE_REL_PINNED);
8397 }
8398
8399 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
8400 {
8401 struct hw_perf_event *hwc = &event->hw;
8402
8403 if (is_sampling_event(event)) {
8404 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
8405 local64_set(&hwc->period_left, ktime_to_ns(remaining));
8406
8407 hrtimer_cancel(&hwc->hrtimer);
8408 }
8409 }
8410
8411 static void perf_swevent_init_hrtimer(struct perf_event *event)
8412 {
8413 struct hw_perf_event *hwc = &event->hw;
8414
8415 if (!is_sampling_event(event))
8416 return;
8417
8418 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
8419 hwc->hrtimer.function = perf_swevent_hrtimer;
8420
8421 /*
8422 * Since hrtimers have a fixed rate, we can do a static freq->period
8423 * mapping and avoid the whole period adjust feedback stuff.
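 * For example, sample_freq = 4000 becomes a fixed sample_period of
 * NSEC_PER_SEC / 4000 = 250000 ns.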
8424 */
8425 if (event->attr.freq) {
8426 long freq = event->attr.sample_freq;
8427
8428 event->attr.sample_period = NSEC_PER_SEC / freq;
8429 hwc->sample_period = event->attr.sample_period;
8430 local64_set(&hwc->period_left, hwc->sample_period);
8431 hwc->last_period = hwc->sample_period;
8432 event->attr.freq = 0;
8433 }
8434 }
8435
8436 /*
8437 * Software event: cpu wall time clock
8438 */
8439
8440 static void cpu_clock_event_update(struct perf_event *event)
8441 {
8442 s64 prev;
8443 u64 now;
8444
8445 now = local_clock();
8446 prev = local64_xchg(&event->hw.prev_count, now);
8447 local64_add(now - prev, &event->count);
8448 }
8449
8450 static void cpu_clock_event_start(struct perf_event *event, int flags)
8451 {
8452 local64_set(&event->hw.prev_count, local_clock());
8453 perf_swevent_start_hrtimer(event);
8454 }
8455
8456 static void cpu_clock_event_stop(struct perf_event *event, int flags)
8457 {
8458 perf_swevent_cancel_hrtimer(event);
8459 cpu_clock_event_update(event);
8460 }
8461
8462 static int cpu_clock_event_add(struct perf_event *event, int flags)
8463 {
8464 if (flags & PERF_EF_START)
8465 cpu_clock_event_start(event, flags);
8466 perf_event_update_userpage(event);
8467
8468 return 0;
8469 }
8470
8471 static void cpu_clock_event_del(struct perf_event *event, int flags)
8472 {
8473 cpu_clock_event_stop(event, flags);
8474 }
8475
8476 static void cpu_clock_event_read(struct perf_event *event)
8477 {
8478 cpu_clock_event_update(event);
8479 }
8480
8481 static int cpu_clock_event_init(struct perf_event *event)
8482 {
8483 if (event->attr.type != PERF_TYPE_SOFTWARE)
8484 return -ENOENT;
8485
8486 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
8487 return -ENOENT;
8488
8489 /*
8490 * no branch sampling for software events
8491 */
8492 if (has_branch_stack(event))
8493 return -EOPNOTSUPP;
8494
8495 perf_swevent_init_hrtimer(event);
8496
8497 return 0;
8498 }
8499
8500 static struct pmu perf_cpu_clock = {
8501 .task_ctx_nr = perf_sw_context,
8502
8503 .capabilities = PERF_PMU_CAP_NO_NMI,
8504
8505 .event_init = cpu_clock_event_init,
8506 .add = cpu_clock_event_add,
8507 .del = cpu_clock_event_del,
8508 .start = cpu_clock_event_start,
8509 .stop = cpu_clock_event_stop,
8510 .read = cpu_clock_event_read,
8511 };
8512
8513 /*
8514 * Software event: task time clock
8515 */
8516
8517 static void task_clock_event_update(struct perf_event *event, u64 now)
8518 {
8519 u64 prev;
8520 s64 delta;
8521
8522 prev = local64_xchg(&event->hw.prev_count, now);
8523 delta = now - prev;
8524 local64_add(delta, &event->count);
8525 }
8526
8527 static void task_clock_event_start(struct perf_event *event, int flags)
8528 {
8529 local64_set(&event->hw.prev_count, event->ctx->time);
8530 perf_swevent_start_hrtimer(event);
8531 }
8532
8533 static void task_clock_event_stop(struct perf_event *event, int flags)
8534 {
8535 perf_swevent_cancel_hrtimer(event);
8536 task_clock_event_update(event, event->ctx->time);
8537 }
8538
8539 static int task_clock_event_add(struct perf_event *event, int flags)
8540 {
8541 if (flags & PERF_EF_START)
8542 task_clock_event_start(event, flags);
8543 perf_event_update_userpage(event);
8544
8545 return 0;
8546 }
8547
8548 static void task_clock_event_del(struct perf_event *event, int flags)
8549 {
8550 task_clock_event_stop(event, PERF_EF_UPDATE);
8551 }
8552
8553 static void task_clock_event_read(struct perf_event *event)
8554 {
8555 u64 now = perf_clock();
8556 u64 delta = now - event->ctx->timestamp;
8557 u64 time = event->ctx->time + delta;
8558
8559 task_clock_event_update(event, time);
8560 }
8561
8562 static int task_clock_event_init(struct perf_event *event)
8563 {
8564 if (event->attr.type != PERF_TYPE_SOFTWARE)
8565 return -ENOENT;
8566
8567 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
8568 return -ENOENT;
8569
8570 /*
8571 * no branch sampling for software events
8572 */
8573 if (has_branch_stack(event))
8574 return -EOPNOTSUPP;
8575
8576 perf_swevent_init_hrtimer(event);
8577
8578 return 0;
8579 }
8580
8581 static struct pmu perf_task_clock = {
8582 .task_ctx_nr = perf_sw_context,
8583
8584 .capabilities = PERF_PMU_CAP_NO_NMI,
8585
8586 .event_init = task_clock_event_init,
8587 .add = task_clock_event_add,
8588 .del = task_clock_event_del,
8589 .start = task_clock_event_start,
8590 .stop = task_clock_event_stop,
8591 .read = task_clock_event_read,
8592 };
8593
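/*
 * Default no-op callbacks, installed by perf_pmu_register() for PMUs that
 * do not provide their own.
 */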
8594 static void perf_pmu_nop_void(struct pmu *pmu)
8595 {
8596 }
8597
8598 static void perf_pmu_nop_txn(struct pmu *pmu, unsigned int flags)
8599 {
8600 }
8601
8602 static int perf_pmu_nop_int(struct pmu *pmu)
8603 {
8604 return 0;
8605 }
8606
8607 static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
8608
8609 static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
8610 {
8611 __this_cpu_write(nop_txn_flags, flags);
8612
8613 if (flags & ~PERF_PMU_TXN_ADD)
8614 return;
8615
8616 perf_pmu_disable(pmu);
8617 }
8618
8619 static int perf_pmu_commit_txn(struct pmu *pmu)
8620 {
8621 unsigned int flags = __this_cpu_read(nop_txn_flags);
8622
8623 __this_cpu_write(nop_txn_flags, 0);
8624
8625 if (flags & ~PERF_PMU_TXN_ADD)
8626 return 0;
8627
8628 perf_pmu_enable(pmu);
8629 return 0;
8630 }
8631
8632 static void perf_pmu_cancel_txn(struct pmu *pmu)
8633 {
8634 unsigned int flags = __this_cpu_read(nop_txn_flags);
8635
8636 __this_cpu_write(nop_txn_flags, 0);
8637
8638 if (flags & ~PERF_PMU_TXN_ADD)
8639 return;
8640
8641 perf_pmu_enable(pmu);
8642 }
8643
8644 static int perf_event_idx_default(struct perf_event *event)
8645 {
8646 return 0;
8647 }
8648
8649 /*
8650 * Ensures all contexts with the same task_ctx_nr have the same
8651 * pmu_cpu_context too.
8652 */
8653 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
8654 {
8655 struct pmu *pmu;
8656
8657 if (ctxn < 0)
8658 return NULL;
8659
8660 list_for_each_entry(pmu, &pmus, entry) {
8661 if (pmu->task_ctx_nr == ctxn)
8662 return pmu->pmu_cpu_context;
8663 }
8664
8665 return NULL;
8666 }
8667
8668 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
8669 {
8670 int cpu;
8671
8672 for_each_possible_cpu(cpu) {
8673 struct perf_cpu_context *cpuctx;
8674
8675 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8676
8677 if (cpuctx->unique_pmu == old_pmu)
8678 cpuctx->unique_pmu = pmu;
8679 }
8680 }
8681
8682 static void free_pmu_context(struct pmu *pmu)
8683 {
8684 struct pmu *i;
8685
8686 mutex_lock(&pmus_lock);
8687 /*
8688 * Like a real lame refcount: only free the cpu context once no other pmu still shares it.
8689 */
8690 list_for_each_entry(i, &pmus, entry) {
8691 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
8692 update_pmu_context(i, pmu);
8693 goto out;
8694 }
8695 }
8696
8697 free_percpu(pmu->pmu_cpu_context);
8698 out:
8699 mutex_unlock(&pmus_lock);
8700 }
8701
8702 /*
8703 * Let userspace know that this PMU supports address range filtering:
8704 */
8705 static ssize_t nr_addr_filters_show(struct device *dev,
8706 struct device_attribute *attr,
8707 char *page)
8708 {
8709 struct pmu *pmu = dev_get_drvdata(dev);
8710
8711 return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->nr_addr_filters);
8712 }
8713 DEVICE_ATTR_RO(nr_addr_filters);
8714
8715 static struct idr pmu_idr;
8716
8717 static ssize_t
8718 type_show(struct device *dev, struct device_attribute *attr, char *page)
8719 {
8720 struct pmu *pmu = dev_get_drvdata(dev);
8721
8722 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
8723 }
8724 static DEVICE_ATTR_RO(type);
8725
8726 static ssize_t
8727 perf_event_mux_interval_ms_show(struct device *dev,
8728 struct device_attribute *attr,
8729 char *page)
8730 {
8731 struct pmu *pmu = dev_get_drvdata(dev);
8732
8733 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
8734 }
8735
8736 static DEFINE_MUTEX(mux_interval_mutex);
8737
8738 static ssize_t
8739 perf_event_mux_interval_ms_store(struct device *dev,
8740 struct device_attribute *attr,
8741 const char *buf, size_t count)
8742 {
8743 struct pmu *pmu = dev_get_drvdata(dev);
8744 int timer, cpu, ret;
8745
8746 ret = kstrtoint(buf, 0, &timer);
8747 if (ret)
8748 return ret;
8749
8750 if (timer < 1)
8751 return -EINVAL;
8752
8753 /* same value, nothing to do */
8754 if (timer == pmu->hrtimer_interval_ms)
8755 return count;
8756
8757 mutex_lock(&mux_interval_mutex);
8758 pmu->hrtimer_interval_ms = timer;
8759
8760 /* update all cpuctx for this PMU */
8761 get_online_cpus();
8762 for_each_online_cpu(cpu) {
8763 struct perf_cpu_context *cpuctx;
8764 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8765 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
8766
8767 cpu_function_call(cpu,
8768 (remote_function_f)perf_mux_hrtimer_restart, cpuctx);
8769 }
8770 put_online_cpus();
8771 mutex_unlock(&mux_interval_mutex);
8772
8773 return count;
8774 }
8775 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
8776
8777 static struct attribute *pmu_dev_attrs[] = {
8778 &dev_attr_type.attr,
8779 &dev_attr_perf_event_mux_interval_ms.attr,
8780 NULL,
8781 };
8782 ATTRIBUTE_GROUPS(pmu_dev);
8783
8784 static int pmu_bus_running;
8785 static struct bus_type pmu_bus = {
8786 .name = "event_source",
8787 .dev_groups = pmu_dev_groups,
8788 };
8789
8790 static void pmu_dev_release(struct device *dev)
8791 {
8792 kfree(dev);
8793 }
8794
8795 static int pmu_dev_alloc(struct pmu *pmu)
8796 {
8797 int ret = -ENOMEM;
8798
8799 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
8800 if (!pmu->dev)
8801 goto out;
8802
8803 pmu->dev->groups = pmu->attr_groups;
8804 device_initialize(pmu->dev);
8805 ret = dev_set_name(pmu->dev, "%s", pmu->name);
8806 if (ret)
8807 goto free_dev;
8808
8809 dev_set_drvdata(pmu->dev, pmu);
8810 pmu->dev->bus = &pmu_bus;
8811 pmu->dev->release = pmu_dev_release;
8812 ret = device_add(pmu->dev);
8813 if (ret)
8814 goto free_dev;
8815
8816 /* For PMUs with address filters, throw in an extra attribute: */
8817 if (pmu->nr_addr_filters)
8818 ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
8819
8820 if (ret)
8821 goto del_dev;
8822
8823 out:
8824 return ret;
8825
8826 del_dev:
8827 device_del(pmu->dev);
8828
8829 free_dev:
8830 put_device(pmu->dev);
8831 goto out;
8832 }
8833
8834 static struct lock_class_key cpuctx_mutex;
8835 static struct lock_class_key cpuctx_lock;
8836
8837 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
8838 {
8839 int cpu, ret;
8840
8841 mutex_lock(&pmus_lock);
8842 ret = -ENOMEM;
8843 pmu->pmu_disable_count = alloc_percpu(int);
8844 if (!pmu->pmu_disable_count)
8845 goto unlock;
8846
8847 pmu->type = -1;
8848 if (!name)
8849 goto skip_type;
8850 pmu->name = name;
8851
8852 if (type < 0) {
8853 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
8854 if (type < 0) {
8855 ret = type;
8856 goto free_pdc;
8857 }
8858 }
8859 pmu->type = type;
8860
8861 if (pmu_bus_running) {
8862 ret = pmu_dev_alloc(pmu);
8863 if (ret)
8864 goto free_idr;
8865 }
8866
8867 skip_type:
8868 if (pmu->task_ctx_nr == perf_hw_context) {
8869 static int hw_context_taken = 0;
8870
8871 /*
8872 * Other than systems with heterogeneous CPUs, it never makes
8873 * sense for two PMUs to share perf_hw_context. PMUs which are
8874 * uncore must use perf_invalid_context.
8875 */
8876 if (WARN_ON_ONCE(hw_context_taken &&
8877 !(pmu->capabilities & PERF_PMU_CAP_HETEROGENEOUS_CPUS)))
8878 pmu->task_ctx_nr = perf_invalid_context;
8879
8880 hw_context_taken = 1;
8881 }
8882
8883 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
8884 if (pmu->pmu_cpu_context)
8885 goto got_cpu_context;
8886
8887 ret = -ENOMEM;
8888 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
8889 if (!pmu->pmu_cpu_context)
8890 goto free_dev;
8891
8892 for_each_possible_cpu(cpu) {
8893 struct perf_cpu_context *cpuctx;
8894
8895 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
8896 __perf_event_init_context(&cpuctx->ctx);
8897 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
8898 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
8899 cpuctx->ctx.pmu = pmu;
8900
8901 __perf_mux_hrtimer_init(cpuctx, cpu);
8902
8903 cpuctx->unique_pmu = pmu;
8904 }
8905
8906 got_cpu_context:
8907 if (!pmu->start_txn) {
8908 if (pmu->pmu_enable) {
8909 /*
8910 * If we have pmu_enable/pmu_disable calls, install
8911 * transaction stubs that use them to try to batch
8912 * hardware accesses.
8913 */
8914 pmu->start_txn = perf_pmu_start_txn;
8915 pmu->commit_txn = perf_pmu_commit_txn;
8916 pmu->cancel_txn = perf_pmu_cancel_txn;
8917 } else {
8918 pmu->start_txn = perf_pmu_nop_txn;
8919 pmu->commit_txn = perf_pmu_nop_int;
8920 pmu->cancel_txn = perf_pmu_nop_void;
8921 }
8922 }
8923
8924 if (!pmu->pmu_enable) {
8925 pmu->pmu_enable = perf_pmu_nop_void;
8926 pmu->pmu_disable = perf_pmu_nop_void;
8927 }
8928
8929 if (!pmu->event_idx)
8930 pmu->event_idx = perf_event_idx_default;
8931
8932 list_add_rcu(&pmu->entry, &pmus);
8933 atomic_set(&pmu->exclusive_cnt, 0);
8934 ret = 0;
8935 unlock:
8936 mutex_unlock(&pmus_lock);
8937
8938 return ret;
8939
8940 free_dev:
8941 device_del(pmu->dev);
8942 put_device(pmu->dev);
8943
8944 free_idr:
8945 if (pmu->type >= PERF_TYPE_MAX)
8946 idr_remove(&pmu_idr, pmu->type);
8947
8948 free_pdc:
8949 free_percpu(pmu->pmu_disable_count);
8950 goto unlock;
8951 }
8952 EXPORT_SYMBOL_GPL(perf_pmu_register);
8953
8954 void perf_pmu_unregister(struct pmu *pmu)
8955 {
8956 int remove_device;
8957
8958 mutex_lock(&pmus_lock);
8959 remove_device = pmu_bus_running;
8960 list_del_rcu(&pmu->entry);
8961 mutex_unlock(&pmus_lock);
8962
8963 /*
8964 * We dereference the pmu list under both SRCU and regular RCU, so
8965 * synchronize against both of those.
8966 */
8967 synchronize_srcu(&pmus_srcu);
8968 synchronize_rcu();
8969
8970 free_percpu(pmu->pmu_disable_count);
8971 if (pmu->type >= PERF_TYPE_MAX)
8972 idr_remove(&pmu_idr, pmu->type);
8973 if (remove_device) {
8974 if (pmu->nr_addr_filters)
8975 device_remove_file(pmu->dev, &dev_attr_nr_addr_filters);
8976 device_del(pmu->dev);
8977 put_device(pmu->dev);
8978 }
8979 free_pmu_context(pmu);
8980 }
8981 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
8982
8983 static int perf_try_init_event(struct pmu *pmu, struct perf_event *event)
8984 {
8985 struct perf_event_context *ctx = NULL;
8986 int ret;
8987
8988 if (!try_module_get(pmu->module))
8989 return -ENODEV;
8990
8991 if (event->group_leader != event) {
8992 /*
8993 * This ctx->mutex can nest when we're called through
8994 * inheritance. See the perf_event_ctx_lock_nested() comment.
8995 */
8996 ctx = perf_event_ctx_lock_nested(event->group_leader,
8997 SINGLE_DEPTH_NESTING);
8998 BUG_ON(!ctx);
8999 }
9000
9001 event->pmu = pmu;
9002 ret = pmu->event_init(event);
9003
9004 if (ctx)
9005 perf_event_ctx_unlock(event->group_leader, ctx);
9006
9007 if (ret)
9008 module_put(pmu->module);
9009
9010 return ret;
9011 }
9012
9013 static struct pmu *perf_init_event(struct perf_event *event)
9014 {
9015 struct pmu *pmu = NULL;
9016 int idx;
9017 int ret;
9018
9019 idx = srcu_read_lock(&pmus_srcu);
9020
9021 rcu_read_lock();
9022 pmu = idr_find(&pmu_idr, event->attr.type);
9023 rcu_read_unlock();
9024 if (pmu) {
9025 ret = perf_try_init_event(pmu, event);
9026 if (ret)
9027 pmu = ERR_PTR(ret);
9028 goto unlock;
9029 }
9030
9031 list_for_each_entry_rcu(pmu, &pmus, entry) {
9032 ret = perf_try_init_event(pmu, event);
9033 if (!ret)
9034 goto unlock;
9035
9036 if (ret != -ENOENT) {
9037 pmu = ERR_PTR(ret);
9038 goto unlock;
9039 }
9040 }
9041 pmu = ERR_PTR(-ENOENT);
9042 unlock:
9043 srcu_read_unlock(&pmus_srcu, idx);
9044
9045 return pmu;
9046 }
9047
9048 static void attach_sb_event(struct perf_event *event)
9049 {
9050 struct pmu_event_list *pel = per_cpu_ptr(&pmu_sb_events, event->cpu);
9051
9052 raw_spin_lock(&pel->lock);
9053 list_add_rcu(&event->sb_list, &pel->list);
9054 raw_spin_unlock(&pel->lock);
9055 }
9056
9057 /*
9058 * We keep a list of all !task (and therefore per-cpu) events
9059 * that need to receive side-band records.
9060 *
9061 * This avoids having to scan all the various PMU per-cpu contexts
9062 * looking for them.
9063 */
9064 static void account_pmu_sb_event(struct perf_event *event)
9065 {
9066 if (is_sb_event(event))
9067 attach_sb_event(event);
9068 }
9069
9070 static void account_event_cpu(struct perf_event *event, int cpu)
9071 {
9072 if (event->parent)
9073 return;
9074
9075 if (is_cgroup_event(event))
9076 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
9077 }
9078
9079 /* Freq events need the tick to stay alive (see perf_event_task_tick). */
9080 static void account_freq_event_nohz(void)
9081 {
9082 #ifdef CONFIG_NO_HZ_FULL
9083 /* Lock so we don't race with concurrent unaccount */
9084 spin_lock(&nr_freq_lock);
9085 if (atomic_inc_return(&nr_freq_events) == 1)
9086 tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS);
9087 spin_unlock(&nr_freq_lock);
9088 #endif
9089 }
9090
9091 static void account_freq_event(void)
9092 {
9093 if (tick_nohz_full_enabled())
9094 account_freq_event_nohz();
9095 else
9096 atomic_inc(&nr_freq_events);
9097 }
9098
9099
9100 static void account_event(struct perf_event *event)
9101 {
9102 bool inc = false;
9103
9104 if (event->parent)
9105 return;
9106
9107 if (event->attach_state & PERF_ATTACH_TASK)
9108 inc = true;
9109 if (event->attr.mmap || event->attr.mmap_data)
9110 atomic_inc(&nr_mmap_events);
9111 if (event->attr.comm)
9112 atomic_inc(&nr_comm_events);
9113 if (event->attr.task)
9114 atomic_inc(&nr_task_events);
9115 if (event->attr.freq)
9116 account_freq_event();
9117 if (event->attr.context_switch) {
9118 atomic_inc(&nr_switch_events);
9119 inc = true;
9120 }
9121 if (has_branch_stack(event))
9122 inc = true;
9123 if (is_cgroup_event(event))
9124 inc = true;
9125
9126 if (inc) {
9127 if (atomic_inc_not_zero(&perf_sched_count))
9128 goto enabled;
9129
9130 mutex_lock(&perf_sched_mutex);
9131 if (!atomic_read(&perf_sched_count)) {
9132 static_branch_enable(&perf_sched_events);
9133 /*
9134 * Guarantee that all CPUs observe the key change and
9135 * call the perf scheduling hooks before proceeding to
9136 * install events that need them.
9137 */
9138 synchronize_sched();
9139 }
9140 /*
9141 * Now that we have waited for the sync_sched(), allow further
9142 * increments to by-pass the mutex.
9143 */
9144 atomic_inc(&perf_sched_count);
9145 mutex_unlock(&perf_sched_mutex);
9146 }
9147 enabled:
9148
9149 account_event_cpu(event, event->cpu);
9150
9151 account_pmu_sb_event(event);
9152 }
9153
9154 /*
9155 * Allocate and initialize an event structure
9156 */
9157 static struct perf_event *
9158 perf_event_alloc(struct perf_event_attr *attr, int cpu,
9159 struct task_struct *task,
9160 struct perf_event *group_leader,
9161 struct perf_event *parent_event,
9162 perf_overflow_handler_t overflow_handler,
9163 void *context, int cgroup_fd)
9164 {
9165 struct pmu *pmu;
9166 struct perf_event *event;
9167 struct hw_perf_event *hwc;
9168 long err = -EINVAL;
9169
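	/*
	 * cpu == -1 ("any CPU") is only valid for task-bound events;
	 * otherwise cpu must be a valid CPU number (< nr_cpu_ids).
	 */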
9170 if ((unsigned)cpu >= nr_cpu_ids) {
9171 if (!task || cpu != -1)
9172 return ERR_PTR(-EINVAL);
9173 }
9174
9175 event = kzalloc(sizeof(*event), GFP_KERNEL);
9176 if (!event)
9177 return ERR_PTR(-ENOMEM);
9178
9179 /*
9180 * Single events are their own group leaders, with an
9181 * empty sibling list:
9182 */
9183 if (!group_leader)
9184 group_leader = event;
9185
9186 mutex_init(&event->child_mutex);
9187 INIT_LIST_HEAD(&event->child_list);
9188
9189 INIT_LIST_HEAD(&event->group_entry);
9190 INIT_LIST_HEAD(&event->event_entry);
9191 INIT_LIST_HEAD(&event->sibling_list);
9192 INIT_LIST_HEAD(&event->rb_entry);
9193 INIT_LIST_HEAD(&event->active_entry);
9194 INIT_LIST_HEAD(&event->addr_filters.list);
9195 INIT_HLIST_NODE(&event->hlist_entry);
9196
9197
9198 init_waitqueue_head(&event->waitq);
9199 init_irq_work(&event->pending, perf_pending_event);
9200
9201 mutex_init(&event->mmap_mutex);
9202 raw_spin_lock_init(&event->addr_filters.lock);
9203
9204 atomic_long_set(&event->refcount, 1);
9205 event->cpu = cpu;
9206 event->attr = *attr;
9207 event->group_leader = group_leader;
9208 event->pmu = NULL;
9209 event->oncpu = -1;
9210
9211 event->parent = parent_event;
9212
9213 event->ns = get_pid_ns(task_active_pid_ns(current));
9214 event->id = atomic64_inc_return(&perf_event_id);
9215
9216 event->state = PERF_EVENT_STATE_INACTIVE;
9217
9218 if (task) {
9219 event->attach_state = PERF_ATTACH_TASK;
9220 /*
9221 * XXX pmu::event_init needs to know what task to account to
9222 * and we cannot use the ctx information because we need the
9223 * pmu before we get a ctx.
9224 */
9225 get_task_struct(task);
9226 event->hw.target = task;
9227 }
9228
9229 event->clock = &local_clock;
9230 if (parent_event)
9231 event->clock = parent_event->clock;
9232
9233 if (!overflow_handler && parent_event) {
9234 overflow_handler = parent_event->overflow_handler;
9235 context = parent_event->overflow_handler_context;
9236 #if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_EVENT_TRACING)
9237 if (overflow_handler == bpf_overflow_handler) {
9238 struct bpf_prog *prog = bpf_prog_inc(parent_event->prog);
9239
9240 if (IS_ERR(prog)) {
9241 err = PTR_ERR(prog);
9242 goto err_ns;
9243 }
9244 event->prog = prog;
9245 event->orig_overflow_handler =
9246 parent_event->orig_overflow_handler;
9247 }
9248 #endif
9249 }
9250
9251 if (overflow_handler) {
9252 event->overflow_handler = overflow_handler;
9253 event->overflow_handler_context = context;
9254 } else if (is_write_backward(event)){
9255 event->overflow_handler = perf_event_output_backward;
9256 event->overflow_handler_context = NULL;
9257 } else {
9258 event->overflow_handler = perf_event_output_forward;
9259 event->overflow_handler_context = NULL;
9260 }
9261
9262 perf_event__state_init(event);
9263
9264 pmu = NULL;
9265
9266 hwc = &event->hw;
9267 hwc->sample_period = attr->sample_period;
9268 if (attr->freq && attr->sample_freq)
9269 hwc->sample_period = 1;
9270 hwc->last_period = hwc->sample_period;
9271
9272 local64_set(&hwc->period_left, hwc->sample_period);
9273
9274 /*
9275 * We currently do not support PERF_SAMPLE_READ on inherited events.
9276 * See perf_output_read().
9277 */
9278 if (attr->inherit && (attr->sample_type & PERF_SAMPLE_READ))
9279 goto err_ns;
9280
9281 if (!has_branch_stack(event))
9282 event->attr.branch_sample_type = 0;
9283
9284 if (cgroup_fd != -1) {
9285 err = perf_cgroup_connect(cgroup_fd, event, attr, group_leader);
9286 if (err)
9287 goto err_ns;
9288 }
9289
9290 pmu = perf_init_event(event);
9291 if (!pmu)
9292 goto err_ns;
9293 else if (IS_ERR(pmu)) {
9294 err = PTR_ERR(pmu);
9295 goto err_ns;
9296 }
9297
9298 err = exclusive_event_init(event);
9299 if (err)
9300 goto err_pmu;
9301
9302 if (has_addr_filter(event)) {
9303 event->addr_filters_offs = kcalloc(pmu->nr_addr_filters,
9304 sizeof(unsigned long),
9305 GFP_KERNEL);
9306 if (!event->addr_filters_offs) {
9307 err = -ENOMEM;
9308 goto err_per_task;
9309 }
9310
9311 /* force hw sync on the address filters */
9312 event->addr_filters_gen = 1;
9313 }
9314
9315 if (!event->parent) {
9316 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
9317 err = get_callchain_buffers(attr->sample_max_stack);
9318 if (err)
9319 goto err_addr_filters;
9320 }
9321 }
9322
9323 /* symmetric to unaccount_event() in _free_event() */
9324 account_event(event);
9325
9326 return event;
9327
9328 err_addr_filters:
9329 kfree(event->addr_filters_offs);
9330
9331 err_per_task:
9332 exclusive_event_destroy(event);
9333
9334 err_pmu:
9335 if (event->destroy)
9336 event->destroy(event);
9337 module_put(pmu->module);
9338 err_ns:
9339 if (is_cgroup_event(event))
9340 perf_detach_cgroup(event);
9341 if (event->ns)
9342 put_pid_ns(event->ns);
9343 if (event->hw.target)
9344 put_task_struct(event->hw.target);
9345 kfree(event);
9346
9347 return ERR_PTR(err);
9348 }
9349
9350 static int perf_copy_attr(struct perf_event_attr __user *uattr,
9351 struct perf_event_attr *attr)
9352 {
9353 u32 size;
9354 int ret;
9355
9356 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
9357 return -EFAULT;
9358
9359 /*
9360 * zero the full structure, so that a short copy leaves the remaining fields zeroed.
9361 */
9362 memset(attr, 0, sizeof(*attr));
9363
9364 ret = get_user(size, &uattr->size);
9365 if (ret)
9366 return ret;
9367
9368 if (size > PAGE_SIZE) /* silly large */
9369 goto err_size;
9370
9371 if (!size) /* abi compat */
9372 size = PERF_ATTR_SIZE_VER0;
9373
9374 if (size < PERF_ATTR_SIZE_VER0)
9375 goto err_size;
9376
9377 /*
9378 * If we're handed a bigger struct than we know of,
9379 * ensure all the unknown bits are 0 - i.e. new
9380 * user-space does not rely on any kernel feature
9381 * extensions we don't know about yet.
9382 */
9383 if (size > sizeof(*attr)) {
9384 unsigned char __user *addr;
9385 unsigned char __user *end;
9386 unsigned char val;
9387
9388 addr = (void __user *)uattr + sizeof(*attr);
9389 end = (void __user *)uattr + size;
9390
9391 for (; addr < end; addr++) {
9392 ret = get_user(val, addr);
9393 if (ret)
9394 return ret;
9395 if (val)
9396 goto err_size;
9397 }
9398 size = sizeof(*attr);
9399 }
9400
9401 ret = copy_from_user(attr, uattr, size);
9402 if (ret)
9403 return -EFAULT;
9404
9405 if (attr->__reserved_1)
9406 return -EINVAL;
9407
9408 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
9409 return -EINVAL;
9410
9411 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
9412 return -EINVAL;
9413
9414 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
9415 u64 mask = attr->branch_sample_type;
9416
9417 /* only using defined bits */
9418 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
9419 return -EINVAL;
9420
9421 /* at least one branch bit must be set */
9422 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
9423 return -EINVAL;
9424
9425 /* propagate priv level, when not set for branch */
9426 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
9427
9428 /* exclude_kernel checked on syscall entry */
9429 if (!attr->exclude_kernel)
9430 mask |= PERF_SAMPLE_BRANCH_KERNEL;
9431
9432 if (!attr->exclude_user)
9433 mask |= PERF_SAMPLE_BRANCH_USER;
9434
9435 if (!attr->exclude_hv)
9436 mask |= PERF_SAMPLE_BRANCH_HV;
9437 /*
9438 * adjust user setting (for HW filter setup)
9439 */
9440 attr->branch_sample_type = mask;
9441 }
9442 /* privileged levels capture (kernel, hv): check permissions */
9443 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
9444 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9445 return -EACCES;
9446 }
9447
9448 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
9449 ret = perf_reg_validate(attr->sample_regs_user);
9450 if (ret)
9451 return ret;
9452 }
9453
9454 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
9455 if (!arch_perf_have_user_stack_dump())
9456 return -ENOSYS;
9457
9458 /*
9459 * We have __u32 type for the size, but so far
9460 * we can only use __u16 as maximum due to the
9461 * __u16 sample size limit.
9462 */
9463 if (attr->sample_stack_user >= USHRT_MAX)
9464 return -EINVAL;
9465 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
9466 return -EINVAL;
9467 }
9468
9469 if (attr->sample_type & PERF_SAMPLE_REGS_INTR)
9470 ret = perf_reg_validate(attr->sample_regs_intr);
9471 out:
9472 return ret;
9473
9474 err_size:
9475 put_user(sizeof(*attr), &uattr->size);
9476 ret = -E2BIG;
9477 goto out;
9478 }
9479
9480 static int
9481 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
9482 {
9483 struct ring_buffer *rb = NULL;
9484 int ret = -EINVAL;
9485
9486 if (!output_event)
9487 goto set;
9488
9489 /* don't allow circular references */
9490 if (event == output_event)
9491 goto out;
9492
9493 /*
9494 * Don't allow cross-cpu buffers
9495 */
9496 if (output_event->cpu != event->cpu)
9497 goto out;
9498
9499 /*
9500 * If it's not a per-cpu rb, it must be the same task.
9501 */
9502 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
9503 goto out;
9504
9505 /*
9506 * Mixing clocks in the same buffer is trouble you don't need.
9507 */
9508 if (output_event->clock != event->clock)
9509 goto out;
9510
9511 /*
9512 * The ring buffer is written either from the beginning or from the end;
9513 * mixing the two is not allowed.
9514 */
9515 if (is_write_backward(output_event) != is_write_backward(event))
9516 goto out;
9517
9518 /*
9519 * If both events generate aux data, they must be on the same PMU
9520 */
9521 if (has_aux(event) && has_aux(output_event) &&
9522 event->pmu != output_event->pmu)
9523 goto out;
9524
9525 set:
9526 mutex_lock(&event->mmap_mutex);
9527 /* Can't redirect output if we've got an active mmap() */
9528 if (atomic_read(&event->mmap_count))
9529 goto unlock;
9530
9531 if (output_event) {
9532 /* get the rb we want to redirect to */
9533 rb = ring_buffer_get(output_event);
9534 if (!rb)
9535 goto unlock;
9536 }
9537
9538 ring_buffer_attach(event, rb);
9539
9540 ret = 0;
9541 unlock:
9542 mutex_unlock(&event->mmap_mutex);
9543
9544 out:
9545 return ret;
9546 }
9547
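/*
 * Take both mutexes in a fixed (address) order so that concurrent callers
 * cannot deadlock against each other.
 */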
9548 static void mutex_lock_double(struct mutex *a, struct mutex *b)
9549 {
9550 if (b < a)
9551 swap(a, b);
9552
9553 mutex_lock(a);
9554 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
9555 }
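/*
 * Locking the two mutexes in address order gives every caller the same global
 * order and thus avoids ABBA deadlocks. A hypothetical illustration:
 *
 *	mutex_lock_double(&ctx1->mutex, &ctx2->mutex);	// caller on CPU 0
 *	mutex_lock_double(&ctx2->mutex, &ctx1->mutex);	// caller on CPU 1
 *
 * Both callers take min(&ctx1->mutex, &ctx2->mutex) first, so the two
 * acquisitions can never cross-wait on each other.
 */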
9556
9557 static int perf_event_set_clock(struct perf_event *event, clockid_t clk_id)
9558 {
9559 bool nmi_safe = false;
9560
9561 switch (clk_id) {
9562 case CLOCK_MONOTONIC:
9563 event->clock = &ktime_get_mono_fast_ns;
9564 nmi_safe = true;
9565 break;
9566
9567 case CLOCK_MONOTONIC_RAW:
9568 event->clock = &ktime_get_raw_fast_ns;
9569 nmi_safe = true;
9570 break;
9571
9572 case CLOCK_REALTIME:
9573 event->clock = &ktime_get_real_ns;
9574 break;
9575
9576 case CLOCK_BOOTTIME:
9577 event->clock = &ktime_get_boot_ns;
9578 break;
9579
9580 case CLOCK_TAI:
9581 event->clock = &ktime_get_tai_ns;
9582 break;
9583
9584 default:
9585 return -EINVAL;
9586 }
9587
9588 if (!nmi_safe && !(event->pmu->capabilities & PERF_PMU_CAP_NO_NMI))
9589 return -EINVAL;
9590
9591 return 0;
9592 }
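/*
 * Example (illustrative userspace sketch): asking for samples to be
 * timestamped with a specific clock, which reaches perf_event_set_clock()
 * above through attr.use_clockid/attr.clockid.
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_SOFTWARE,
 *		.config		= PERF_COUNT_SW_TASK_CLOCK,
 *		.size		= sizeof(attr),
 *		.sample_type	= PERF_SAMPLE_TIME,
 *		.sample_period	= 1000000,
 *		.use_clockid	= 1,
 *		.clockid	= CLOCK_MONOTONIC_RAW,
 *	};
 *
 * Sample timestamps can then be correlated directly with
 * clock_gettime(CLOCK_MONOTONIC_RAW, ...) readings in userspace.
 */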
9593
9594 /*
9595 * Variation on perf_event_ctx_lock_nested(), except we take two context
9596 * mutexes.
9597 */
9598 static struct perf_event_context *
9599 __perf_event_ctx_lock_double(struct perf_event *group_leader,
9600 struct perf_event_context *ctx)
9601 {
9602 struct perf_event_context *gctx;
9603
9604 again:
9605 rcu_read_lock();
9606 gctx = READ_ONCE(group_leader->ctx);
9607 if (!atomic_inc_not_zero(&gctx->refcount)) {
9608 rcu_read_unlock();
9609 goto again;
9610 }
9611 rcu_read_unlock();
9612
9613 mutex_lock_double(&gctx->mutex, &ctx->mutex);
9614
9615 if (group_leader->ctx != gctx) {
9616 mutex_unlock(&ctx->mutex);
9617 mutex_unlock(&gctx->mutex);
9618 put_ctx(gctx);
9619 goto again;
9620 }
9621
9622 return gctx;
9623 }
9624
9625 /**
9626 * sys_perf_event_open - open a performance event, associate it to a task/cpu
9627 *
9628 * @attr_uptr: event_id type attributes for monitoring/sampling
9629 * @pid: target pid
9630 * @cpu: target cpu
9631  * @group_fd: group leader event fd
 * @flags: perf event open flags (PERF_FLAG_*)
9632 */
9633 SYSCALL_DEFINE5(perf_event_open,
9634 struct perf_event_attr __user *, attr_uptr,
9635 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
9636 {
9637 struct perf_event *group_leader = NULL, *output_event = NULL;
9638 struct perf_event *event, *sibling;
9639 struct perf_event_attr attr;
9640 struct perf_event_context *ctx, *uninitialized_var(gctx);
9641 struct file *event_file = NULL;
9642 struct fd group = {NULL, 0};
9643 struct task_struct *task = NULL;
9644 struct pmu *pmu;
9645 int event_fd;
9646 int move_group = 0;
9647 int err;
9648 int f_flags = O_RDWR;
9649 int cgroup_fd = -1;
9650
9651 /* for future expandability... */
9652 if (flags & ~PERF_FLAG_ALL)
9653 return -EINVAL;
9654
9655 if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
9656 return -EACCES;
9657
9658 err = perf_copy_attr(attr_uptr, &attr);
9659 if (err)
9660 return err;
9661
9662 if (!attr.exclude_kernel) {
9663 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
9664 return -EACCES;
9665 }
9666
9667 if (attr.freq) {
9668 if (attr.sample_freq > sysctl_perf_event_sample_rate)
9669 return -EINVAL;
9670 } else {
9671 if (attr.sample_period & (1ULL << 63))
9672 return -EINVAL;
9673 }
9674
9675 if (!attr.sample_max_stack)
9676 attr.sample_max_stack = sysctl_perf_event_max_stack;
9677
9678 /*
9679 * In cgroup mode, the pid argument is used to pass the fd
9680 * opened to the cgroup directory in cgroupfs. The cpu argument
9681 * designates the cpu on which to monitor threads from that
9682 * cgroup.
9683 */
9684 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
9685 return -EINVAL;
9686
9687 if (flags & PERF_FLAG_FD_CLOEXEC)
9688 f_flags |= O_CLOEXEC;
9689
9690 event_fd = get_unused_fd_flags(f_flags);
9691 if (event_fd < 0)
9692 return event_fd;
9693
9694 if (group_fd != -1) {
9695 err = perf_fget_light(group_fd, &group);
9696 if (err)
9697 goto err_fd;
9698 group_leader = group.file->private_data;
9699 if (flags & PERF_FLAG_FD_OUTPUT)
9700 output_event = group_leader;
9701 if (flags & PERF_FLAG_FD_NO_GROUP)
9702 group_leader = NULL;
9703 }
9704
9705 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
9706 task = find_lively_task_by_vpid(pid);
9707 if (IS_ERR(task)) {
9708 err = PTR_ERR(task);
9709 goto err_group_fd;
9710 }
9711 }
9712
9713 if (task && group_leader &&
9714 group_leader->attr.inherit != attr.inherit) {
9715 err = -EINVAL;
9716 goto err_task;
9717 }
9718
9719 get_online_cpus();
9720
9721 if (task) {
9722 err = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
9723 if (err)
9724 goto err_cpus;
9725
9726 /*
9727 * Reuse ptrace permission checks for now.
9728 *
9729 * We must hold cred_guard_mutex across this and any potential
9730 * perf_install_in_context() call for this new event to
9731 * serialize against exec() altering our credentials (and the
9732 * perf_event_exit_task() that could imply).
9733 */
9734 err = -EACCES;
9735 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
9736 goto err_cred;
9737 }
9738
9739 if (flags & PERF_FLAG_PID_CGROUP)
9740 cgroup_fd = pid;
9741
9742 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
9743 NULL, NULL, cgroup_fd);
9744 if (IS_ERR(event)) {
9745 err = PTR_ERR(event);
9746 goto err_cred;
9747 }
9748
9749 if (is_sampling_event(event)) {
9750 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
9751 err = -EOPNOTSUPP;
9752 goto err_alloc;
9753 }
9754 }
9755
9756 /*
9757 * Special case software events and allow them to be part of
9758 * any hardware group.
9759 */
9760 pmu = event->pmu;
9761
9762 if (attr.use_clockid) {
9763 err = perf_event_set_clock(event, attr.clockid);
9764 if (err)
9765 goto err_alloc;
9766 }
9767
9768 if (pmu->task_ctx_nr == perf_sw_context)
9769 event->event_caps |= PERF_EV_CAP_SOFTWARE;
9770
9771 if (group_leader &&
9772 (is_software_event(event) != is_software_event(group_leader))) {
9773 if (is_software_event(event)) {
9774 /*
9775 			 * If event and group_leader are not both software
9776 			 * events, and event is, then the group leader is not.
9777 *
9778 * Allow the addition of software events to !software
9779 * groups, this is safe because software events never
9780 * fail to schedule.
9781 */
9782 pmu = group_leader->pmu;
9783 } else if (is_software_event(group_leader) &&
9784 (group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
9785 /*
9786 * In case the group is a pure software group, and we
9787 * try to add a hardware event, move the whole group to
9788 * the hardware context.
9789 */
9790 move_group = 1;
9791 }
9792 }
9793
9794 /*
9795 * Get the target context (task or percpu):
9796 */
9797 ctx = find_get_context(pmu, task, event);
9798 if (IS_ERR(ctx)) {
9799 err = PTR_ERR(ctx);
9800 goto err_alloc;
9801 }
9802
9803 if ((pmu->capabilities & PERF_PMU_CAP_EXCLUSIVE) && group_leader) {
9804 err = -EBUSY;
9805 goto err_context;
9806 }
9807
9808 /*
9809 * Look up the group leader (we will attach this event to it):
9810 */
9811 if (group_leader) {
9812 err = -EINVAL;
9813
9814 /*
9815 * Do not allow a recursive hierarchy (this new sibling
9816 * becoming part of another group-sibling):
9817 */
9818 if (group_leader->group_leader != group_leader)
9819 goto err_context;
9820
9821 /* All events in a group should have the same clock */
9822 if (group_leader->clock != event->clock)
9823 goto err_context;
9824
9825 /*
9826 		 * Make sure both events are for the same CPU;
9827 		 * grouping events for different CPUs is broken, since
9828 		 * you can never concurrently schedule them anyhow.
9829 */
9830 if (group_leader->cpu != event->cpu)
9831 goto err_context;
9832
9833 /*
9834 		 * Make sure both events are on the same task, or both
9835 		 * are per-CPU events.
9836 */
9837 if (group_leader->ctx->task != ctx->task)
9838 goto err_context;
9839
9840 /*
9841 		 * Do not allow attaching to a group in a different task
9842 * or CPU context. If we're moving SW events, we'll fix
9843 * this up later, so allow that.
9844 */
9845 if (!move_group && group_leader->ctx != ctx)
9846 goto err_context;
9847
9848 /*
9849 * Only a group leader can be exclusive or pinned
9850 */
9851 if (attr.exclusive || attr.pinned)
9852 goto err_context;
9853 }
9854
9855 if (output_event) {
9856 err = perf_event_set_output(event, output_event);
9857 if (err)
9858 goto err_context;
9859 }
9860
9861 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
9862 f_flags);
9863 if (IS_ERR(event_file)) {
9864 err = PTR_ERR(event_file);
9865 event_file = NULL;
9866 goto err_context;
9867 }
9868
9869 if (move_group) {
9870 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
9871
9872 if (gctx->task == TASK_TOMBSTONE) {
9873 err = -ESRCH;
9874 goto err_locked;
9875 }
9876
9877 /*
9878 * Check if we raced against another sys_perf_event_open() call
9879 * moving the software group underneath us.
9880 */
9881 if (!(group_leader->group_caps & PERF_EV_CAP_SOFTWARE)) {
9882 /*
9883 * If someone moved the group out from under us, check
9884 			 * whether this new event wound up on the same ctx; if so
9885 			 * it's the regular !move_group case, otherwise fail.
9886 */
9887 if (gctx != ctx) {
9888 err = -EINVAL;
9889 goto err_locked;
9890 } else {
9891 perf_event_ctx_unlock(group_leader, gctx);
9892 move_group = 0;
9893 }
9894 }
9895 } else {
9896 mutex_lock(&ctx->mutex);
9897 }
9898
9899 if (ctx->task == TASK_TOMBSTONE) {
9900 err = -ESRCH;
9901 goto err_locked;
9902 }
9903
9904 if (!perf_event_validate_size(event)) {
9905 err = -E2BIG;
9906 goto err_locked;
9907 }
9908
9909 /*
9910 * Must be under the same ctx::mutex as perf_install_in_context(),
9911 * because we need to serialize with concurrent event creation.
9912 */
9913 if (!exclusive_event_installable(event, ctx)) {
9914 /* exclusive and group stuff are assumed mutually exclusive */
9915 WARN_ON_ONCE(move_group);
9916
9917 err = -EBUSY;
9918 goto err_locked;
9919 }
9920
9921 WARN_ON_ONCE(ctx->parent_ctx);
9922
9923 /*
9924 	 * This is the point of no return; we cannot fail hereafter. This is
9925 * where we start modifying current state.
9926 */
9927
9928 if (move_group) {
9929 /*
9930 * See perf_event_ctx_lock() for comments on the details
9931 * of swizzling perf_event::ctx.
9932 */
9933 perf_remove_from_context(group_leader, 0);
9934
9935 list_for_each_entry(sibling, &group_leader->sibling_list,
9936 group_entry) {
9937 perf_remove_from_context(sibling, 0);
9938 put_ctx(gctx);
9939 }
9940
9941 /*
9942 * Wait for everybody to stop referencing the events through
9943 		 * the old lists, before installing them on the new lists.
9944 */
9945 synchronize_rcu();
9946
9947 /*
9948 * Install the group siblings before the group leader.
9949 *
9950 		 * Because a group leader will try to install the entire group
9951 		 * (through the sibling list, which is still intact), we can
9952 * end up with siblings installed in the wrong context.
9953 *
9954 * By installing siblings first we NO-OP because they're not
9955 * reachable through the group lists.
9956 */
9957 list_for_each_entry(sibling, &group_leader->sibling_list,
9958 group_entry) {
9959 perf_event__state_init(sibling);
9960 perf_install_in_context(ctx, sibling, sibling->cpu);
9961 get_ctx(ctx);
9962 }
9963
9964 /*
9965 		 * Removing from the context ends up with a disabled
9966 		 * event. What we want here is the event in its initial
9967 		 * startup state, ready to be added into the new context.
9968 */
9969 perf_event__state_init(group_leader);
9970 perf_install_in_context(ctx, group_leader, group_leader->cpu);
9971 get_ctx(ctx);
9972
9973 /*
9974 * Now that all events are installed in @ctx, nothing
9975 * references @gctx anymore, so drop the last reference we have
9976 * on it.
9977 */
9978 put_ctx(gctx);
9979 }
9980
9981 /*
9982 	 * Precalculate sample_data sizes; do this while holding ctx::mutex such
9983 	 * that we're serialized against further additions and before
9984 	 * perf_install_in_context(), which is the point the event becomes active
9985 	 * and can use these values.
9986 */
9987 perf_event__header_size(event);
9988 perf_event__id_header_size(event);
9989
9990 event->owner = current;
9991
9992 perf_install_in_context(ctx, event, event->cpu);
9993 perf_unpin_context(ctx);
9994
9995 if (move_group)
9996 perf_event_ctx_unlock(group_leader, gctx);
9997 mutex_unlock(&ctx->mutex);
9998
9999 if (task) {
10000 mutex_unlock(&task->signal->cred_guard_mutex);
10001 put_task_struct(task);
10002 }
10003
10004 put_online_cpus();
10005
10006 	mutex_lock(&current->perf_event_mutex);
10007 	list_add_tail(&event->owner_entry, &current->perf_event_list);
10008 	mutex_unlock(&current->perf_event_mutex);
10009
10010 /*
10011 * Drop the reference on the group_event after placing the
10012 * new event on the sibling_list. This ensures destruction
10013 * of the group leader will find the pointer to itself in
10014 * perf_group_detach().
10015 */
10016 fdput(group);
10017 fd_install(event_fd, event_file);
10018 return event_fd;
10019
10020 err_locked:
10021 if (move_group)
10022 perf_event_ctx_unlock(group_leader, gctx);
10023 mutex_unlock(&ctx->mutex);
10024 /* err_file: */
10025 fput(event_file);
10026 err_context:
10027 perf_unpin_context(ctx);
10028 put_ctx(ctx);
10029 err_alloc:
10030 /*
10031 * If event_file is set, the fput() above will have called ->release()
10032 * and that will take care of freeing the event.
10033 */
10034 if (!event_file)
10035 free_event(event);
10036 err_cred:
10037 if (task)
10038 mutex_unlock(&task->signal->cred_guard_mutex);
10039 err_cpus:
10040 put_online_cpus();
10041 err_task:
10042 if (task)
10043 put_task_struct(task);
10044 err_group_fd:
10045 fdput(group);
10046 err_fd:
10047 put_unused_fd(event_fd);
10048 return err;
10049 }
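/*
 * Example (illustrative userspace sketch, not part of this file): the common
 * calling sequence for the syscall above. There is no glibc wrapper, so the
 * raw syscall number is used; error handling is trimmed for brevity.
 *
 *	#include <linux/perf_event.h>
 *	#include <sys/syscall.h>
 *	#include <sys/ioctl.h>
 *	#include <unistd.h>
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_INSTRUCTIONS,
 *		.size		= sizeof(attr),
 *		.disabled	= 1,
 *		.exclude_kernel	= 1,
 *	};
 *	long long count;
 *
 *	// current task, any CPU, no group leader, close-on-exec fd
 *	int fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1,
 *			 PERF_FLAG_FD_CLOEXEC);
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	// ... workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *	close(fd);
 */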
10050
10051 /**
10052 * perf_event_create_kernel_counter
10053 *
10054 * @attr: attributes of the counter to create
10055  * @cpu: cpu on which the counter is bound
10056  * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context data, available to the @overflow_handler callback
10057 */
10058 struct perf_event *
10059 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
10060 struct task_struct *task,
10061 perf_overflow_handler_t overflow_handler,
10062 void *context)
10063 {
10064 struct perf_event_context *ctx;
10065 struct perf_event *event;
10066 int err;
10067
10068 /*
10069 * Get the target context (task or percpu):
10070 */
10071
10072 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
10073 overflow_handler, context, -1);
10074 if (IS_ERR(event)) {
10075 err = PTR_ERR(event);
10076 goto err;
10077 }
10078
10079 	/* Mark owner so we can distinguish it from user events. */
10080 event->owner = TASK_TOMBSTONE;
10081
10082 ctx = find_get_context(event->pmu, task, event);
10083 if (IS_ERR(ctx)) {
10084 err = PTR_ERR(ctx);
10085 goto err_free;
10086 }
10087
10088 WARN_ON_ONCE(ctx->parent_ctx);
10089 mutex_lock(&ctx->mutex);
10090 if (ctx->task == TASK_TOMBSTONE) {
10091 err = -ESRCH;
10092 goto err_unlock;
10093 }
10094
10095 if (!exclusive_event_installable(event, ctx)) {
10096 err = -EBUSY;
10097 goto err_unlock;
10098 }
10099
10100 perf_install_in_context(ctx, event, cpu);
10101 perf_unpin_context(ctx);
10102 mutex_unlock(&ctx->mutex);
10103
10104 return event;
10105
10106 err_unlock:
10107 mutex_unlock(&ctx->mutex);
10108 perf_unpin_context(ctx);
10109 put_ctx(ctx);
10110 err_free:
10111 free_event(event);
10112 err:
10113 return ERR_PTR(err);
10114 }
10115 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
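/*
 * Example (rough in-kernel sketch, not from this file): a typical caller of
 * perf_event_create_kernel_counter(), e.g. a per-CPU watchdog-style counter
 * with an overflow callback. The handler signature follows
 * perf_overflow_handler_t from <linux/perf_event.h>; names and values are
 * illustrative only.
 *
 *	static void my_overflow(struct perf_event *event,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		// runs from the PMU overflow path (often NMI context)
 *	}
 *
 *	static struct perf_event_attr my_attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(struct perf_event_attr),
 *		.sample_period	= 1000000,
 *		.pinned		= 1,
 *		.disabled	= 1,
 *	};
 *
 *	event = perf_event_create_kernel_counter(&my_attr, cpu, NULL,
 *						 my_overflow, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *	perf_event_enable(event);
 *	...
 *	perf_event_release_kernel(event);
 */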
10116
10117 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
10118 {
10119 struct perf_event_context *src_ctx;
10120 struct perf_event_context *dst_ctx;
10121 struct perf_event *event, *tmp;
10122 LIST_HEAD(events);
10123
10124 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
10125 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
10126
10127 /*
10128 * See perf_event_ctx_lock() for comments on the details
10129 * of swizzling perf_event::ctx.
10130 */
10131 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
10132 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
10133 event_entry) {
10134 perf_remove_from_context(event, 0);
10135 unaccount_event_cpu(event, src_cpu);
10136 put_ctx(src_ctx);
10137 list_add(&event->migrate_entry, &events);
10138 }
10139
10140 /*
10141 * Wait for the events to quiesce before re-instating them.
10142 */
10143 synchronize_rcu();
10144
10145 /*
10146 * Re-instate events in 2 passes.
10147 *
10148 * Skip over group leaders and only install siblings on this first
10149 	 * pass; siblings will not get enabled without a leader, but a
10150 * leader will enable its siblings, even if those are still on the old
10151 * context.
10152 */
10153 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10154 if (event->group_leader == event)
10155 continue;
10156
10157 list_del(&event->migrate_entry);
10158 if (event->state >= PERF_EVENT_STATE_OFF)
10159 event->state = PERF_EVENT_STATE_INACTIVE;
10160 account_event_cpu(event, dst_cpu);
10161 perf_install_in_context(dst_ctx, event, dst_cpu);
10162 get_ctx(dst_ctx);
10163 }
10164
10165 /*
10166 	 * Once all the siblings are set up properly, install the group leaders
10167 * to make it go.
10168 */
10169 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
10170 list_del(&event->migrate_entry);
10171 if (event->state >= PERF_EVENT_STATE_OFF)
10172 event->state = PERF_EVENT_STATE_INACTIVE;
10173 account_event_cpu(event, dst_cpu);
10174 perf_install_in_context(dst_ctx, event, dst_cpu);
10175 get_ctx(dst_ctx);
10176 }
10177 mutex_unlock(&dst_ctx->mutex);
10178 mutex_unlock(&src_ctx->mutex);
10179 }
10180 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
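/*
 * Example (rough sketch): PMU drivers whose counters are shared between CPUs
 * (uncore-style PMUs, for instance) call this from their CPU hotplug paths to
 * move the per-CPU context when the CPU that owned it goes away; my_pmu and
 * target below are assumptions chosen by such a driver.
 *
 *	perf_pmu_migrate_context(&my_pmu, dying_cpu, target);
 */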
10181
10182 static void sync_child_event(struct perf_event *child_event,
10183 struct task_struct *child)
10184 {
10185 struct perf_event *parent_event = child_event->parent;
10186 u64 child_val;
10187
10188 if (child_event->attr.inherit_stat)
10189 perf_event_read_event(child_event, child);
10190
10191 child_val = perf_event_count(child_event);
10192
10193 /*
10194 * Add back the child's count to the parent's count:
10195 */
10196 atomic64_add(child_val, &parent_event->child_count);
10197 atomic64_add(child_event->total_time_enabled,
10198 &parent_event->child_total_time_enabled);
10199 atomic64_add(child_event->total_time_running,
10200 &parent_event->child_total_time_running);
10201 }
10202
10203 static void
10204 perf_event_exit_event(struct perf_event *child_event,
10205 struct perf_event_context *child_ctx,
10206 struct task_struct *child)
10207 {
10208 struct perf_event *parent_event = child_event->parent;
10209
10210 /*
10211 * Do not destroy the 'original' grouping; because of the context
10212 * switch optimization the original events could've ended up in a
10213 * random child task.
10214 *
10215 * If we were to destroy the original group, all group related
10216 * operations would cease to function properly after this random
10217 * child dies.
10218 *
10219 	 * Do destroy all inherited groups; we don't care about those,
10220 * and being thorough is better.
10221 */
10222 raw_spin_lock_irq(&child_ctx->lock);
10223 WARN_ON_ONCE(child_ctx->is_active);
10224
10225 if (parent_event)
10226 perf_group_detach(child_event);
10227 list_del_event(child_event, child_ctx);
10228 child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
10229 raw_spin_unlock_irq(&child_ctx->lock);
10230
10231 /*
10232 * Parent events are governed by their filedesc, retain them.
10233 */
10234 if (!parent_event) {
10235 perf_event_wakeup(child_event);
10236 return;
10237 }
10238 /*
10239 * Child events can be cleaned up.
10240 */
10241
10242 sync_child_event(child_event, child);
10243
10244 /*
10245 * Remove this event from the parent's list
10246 */
10247 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
10248 mutex_lock(&parent_event->child_mutex);
10249 list_del_init(&child_event->child_list);
10250 mutex_unlock(&parent_event->child_mutex);
10251
10252 /*
10253 * Kick perf_poll() for is_event_hup().
10254 */
10255 perf_event_wakeup(parent_event);
10256 free_event(child_event);
10257 put_event(parent_event);
10258 }
10259
10260 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
10261 {
10262 struct perf_event_context *child_ctx, *clone_ctx = NULL;
10263 struct perf_event *child_event, *next;
10264
10265 WARN_ON_ONCE(child != current);
10266
10267 child_ctx = perf_pin_task_context(child, ctxn);
10268 if (!child_ctx)
10269 return;
10270
10271 /*
10272 	 * In order to reduce the amount of trickiness in ctx tear-down, we hold
10273 * ctx::mutex over the entire thing. This serializes against almost
10274 * everything that wants to access the ctx.
10275 *
10276 * The exception is sys_perf_event_open() /
10277 	 * perf_event_create_kernel_counter() which does find_get_context()
10278 * without ctx::mutex (it cannot because of the move_group double mutex
10279 * lock thing). See the comments in perf_install_in_context().
10280 */
10281 mutex_lock(&child_ctx->mutex);
10282
10283 /*
10284 * In a single ctx::lock section, de-schedule the events and detach the
10285 * context from the task such that we cannot ever get it scheduled back
10286 * in.
10287 */
10288 raw_spin_lock_irq(&child_ctx->lock);
10289 task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
10290
10291 /*
10292 * Now that the context is inactive, destroy the task <-> ctx relation
10293 * and mark the context dead.
10294 */
10295 RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
10296 put_ctx(child_ctx); /* cannot be last */
10297 WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
10298 put_task_struct(current); /* cannot be last */
10299
10300 clone_ctx = unclone_ctx(child_ctx);
10301 raw_spin_unlock_irq(&child_ctx->lock);
10302
10303 if (clone_ctx)
10304 put_ctx(clone_ctx);
10305
10306 /*
10307 * Report the task dead after unscheduling the events so that we
10308 * won't get any samples after PERF_RECORD_EXIT. We can however still
10309 * get a few PERF_RECORD_READ events.
10310 */
10311 perf_event_task(child, child_ctx, 0);
10312
10313 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
10314 perf_event_exit_event(child_event, child_ctx, child);
10315
10316 mutex_unlock(&child_ctx->mutex);
10317
10318 put_ctx(child_ctx);
10319 }
10320
10321 /*
10322 * When a child task exits, feed back event values to parent events.
10323 *
10324 * Can be called with cred_guard_mutex held when called from
10325 * install_exec_creds().
10326 */
10327 void perf_event_exit_task(struct task_struct *child)
10328 {
10329 struct perf_event *event, *tmp;
10330 int ctxn;
10331
10332 mutex_lock(&child->perf_event_mutex);
10333 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
10334 owner_entry) {
10335 list_del_init(&event->owner_entry);
10336
10337 /*
10338 * Ensure the list deletion is visible before we clear
10339 		 * the owner; this closes a race against perf_release() where
10340 * we need to serialize on the owner->perf_event_mutex.
10341 */
10342 smp_store_release(&event->owner, NULL);
10343 }
10344 mutex_unlock(&child->perf_event_mutex);
10345
10346 for_each_task_context_nr(ctxn)
10347 perf_event_exit_task_context(child, ctxn);
10348
10349 /*
10350 * The perf_event_exit_task_context calls perf_event_task
10351 * with child's task_ctx, which generates EXIT events for
10352 * child contexts and sets child->perf_event_ctxp[] to NULL.
10353 * At this point we need to send EXIT events to cpu contexts.
10354 */
10355 perf_event_task(child, NULL, 0);
10356 }
10357
10358 static void perf_free_event(struct perf_event *event,
10359 struct perf_event_context *ctx)
10360 {
10361 struct perf_event *parent = event->parent;
10362
10363 if (WARN_ON_ONCE(!parent))
10364 return;
10365
10366 mutex_lock(&parent->child_mutex);
10367 list_del_init(&event->child_list);
10368 mutex_unlock(&parent->child_mutex);
10369
10370 put_event(parent);
10371
10372 raw_spin_lock_irq(&ctx->lock);
10373 perf_group_detach(event);
10374 list_del_event(event, ctx);
10375 raw_spin_unlock_irq(&ctx->lock);
10376 free_event(event);
10377 }
10378
10379 /*
10380 * Free an unexposed, unused context as created by inheritance by
10381  * perf_event_init_task() below; used by fork() in case of failure.
10382 *
10383 * Not all locks are strictly required, but take them anyway to be nice and
10384 * help out with the lockdep assertions.
10385 */
10386 void perf_event_free_task(struct task_struct *task)
10387 {
10388 struct perf_event_context *ctx;
10389 struct perf_event *event, *tmp;
10390 int ctxn;
10391
10392 for_each_task_context_nr(ctxn) {
10393 ctx = task->perf_event_ctxp[ctxn];
10394 if (!ctx)
10395 continue;
10396
10397 mutex_lock(&ctx->mutex);
10398 raw_spin_lock_irq(&ctx->lock);
10399 /*
10400 * Destroy the task <-> ctx relation and mark the context dead.
10401 *
10402 * This is important because even though the task hasn't been
10403 * exposed yet the context has been (through child_list).
10404 */
10405 RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
10406 WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
10407 put_task_struct(task); /* cannot be last */
10408 raw_spin_unlock_irq(&ctx->lock);
10409 again:
10410 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
10411 group_entry)
10412 perf_free_event(event, ctx);
10413
10414 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
10415 group_entry)
10416 perf_free_event(event, ctx);
10417
10418 if (!list_empty(&ctx->pinned_groups) ||
10419 !list_empty(&ctx->flexible_groups))
10420 goto again;
10421
10422 mutex_unlock(&ctx->mutex);
10423
10424 put_ctx(ctx);
10425 }
10426 }
10427
10428 void perf_event_delayed_put(struct task_struct *task)
10429 {
10430 int ctxn;
10431
10432 for_each_task_context_nr(ctxn)
10433 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
10434 }
10435
10436 struct file *perf_event_get(unsigned int fd)
10437 {
10438 struct file *file;
10439
10440 file = fget_raw(fd);
10441 if (!file)
10442 return ERR_PTR(-EBADF);
10443
10444 if (file->f_op != &perf_fops) {
10445 fput(file);
10446 return ERR_PTR(-EBADF);
10447 }
10448
10449 return file;
10450 }
10451
10452 const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
10453 {
10454 if (!event)
10455 return ERR_PTR(-EINVAL);
10456
10457 return &event->attr;
10458 }
10459
10460 /*
10461  * inherit an event from parent task to child task:
10462 */
10463 static struct perf_event *
10464 inherit_event(struct perf_event *parent_event,
10465 struct task_struct *parent,
10466 struct perf_event_context *parent_ctx,
10467 struct task_struct *child,
10468 struct perf_event *group_leader,
10469 struct perf_event_context *child_ctx)
10470 {
10471 enum perf_event_active_state parent_state = parent_event->state;
10472 struct perf_event *child_event;
10473 unsigned long flags;
10474
10475 /*
10476 * Instead of creating recursive hierarchies of events,
10477 * we link inherited events back to the original parent,
10478 * which has a filp for sure, which we use as the reference
10479 * count:
10480 */
10481 if (parent_event->parent)
10482 parent_event = parent_event->parent;
10483
10484 child_event = perf_event_alloc(&parent_event->attr,
10485 parent_event->cpu,
10486 child,
10487 group_leader, parent_event,
10488 NULL, NULL, -1);
10489 if (IS_ERR(child_event))
10490 return child_event;
10491
10492 /*
10493 * is_orphaned_event() and list_add_tail(&parent_event->child_list)
10494 * must be under the same lock in order to serialize against
10495 * perf_event_release_kernel(), such that either we must observe
10496 * is_orphaned_event() or they will observe us on the child_list.
10497 */
10498 mutex_lock(&parent_event->child_mutex);
10499 if (is_orphaned_event(parent_event) ||
10500 !atomic_long_inc_not_zero(&parent_event->refcount)) {
10501 mutex_unlock(&parent_event->child_mutex);
10502 free_event(child_event);
10503 return NULL;
10504 }
10505
10506 get_ctx(child_ctx);
10507
10508 /*
10509 * Make the child state follow the state of the parent event,
10510 * not its attr.disabled bit. We hold the parent's mutex,
10511 * so we won't race with perf_event_{en, dis}able_family.
10512 */
10513 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
10514 child_event->state = PERF_EVENT_STATE_INACTIVE;
10515 else
10516 child_event->state = PERF_EVENT_STATE_OFF;
10517
10518 if (parent_event->attr.freq) {
10519 u64 sample_period = parent_event->hw.sample_period;
10520 struct hw_perf_event *hwc = &child_event->hw;
10521
10522 hwc->sample_period = sample_period;
10523 hwc->last_period = sample_period;
10524
10525 local64_set(&hwc->period_left, sample_period);
10526 }
10527
10528 child_event->ctx = child_ctx;
10529 child_event->overflow_handler = parent_event->overflow_handler;
10530 child_event->overflow_handler_context
10531 = parent_event->overflow_handler_context;
10532
10533 /*
10534 * Precalculate sample_data sizes
10535 */
10536 perf_event__header_size(child_event);
10537 perf_event__id_header_size(child_event);
10538
10539 /*
10540 * Link it up in the child's context:
10541 */
10542 raw_spin_lock_irqsave(&child_ctx->lock, flags);
10543 add_event_to_ctx(child_event, child_ctx);
10544 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
10545
10546 /*
10547 * Link this into the parent event's child list
10548 */
10549 list_add_tail(&child_event->child_list, &parent_event->child_list);
10550 mutex_unlock(&parent_event->child_mutex);
10551
10552 return child_event;
10553 }
10554
10555 static int inherit_group(struct perf_event *parent_event,
10556 struct task_struct *parent,
10557 struct perf_event_context *parent_ctx,
10558 struct task_struct *child,
10559 struct perf_event_context *child_ctx)
10560 {
10561 struct perf_event *leader;
10562 struct perf_event *sub;
10563 struct perf_event *child_ctr;
10564
10565 leader = inherit_event(parent_event, parent, parent_ctx,
10566 child, NULL, child_ctx);
10567 if (IS_ERR(leader))
10568 return PTR_ERR(leader);
10569 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
10570 child_ctr = inherit_event(sub, parent, parent_ctx,
10571 child, leader, child_ctx);
10572 if (IS_ERR(child_ctr))
10573 return PTR_ERR(child_ctr);
10574 }
10575 return 0;
10576 }
10577
10578 static int
10579 inherit_task_group(struct perf_event *event, struct task_struct *parent,
10580 struct perf_event_context *parent_ctx,
10581 struct task_struct *child, int ctxn,
10582 int *inherited_all)
10583 {
10584 int ret;
10585 struct perf_event_context *child_ctx;
10586
10587 if (!event->attr.inherit) {
10588 *inherited_all = 0;
10589 return 0;
10590 }
10591
10592 child_ctx = child->perf_event_ctxp[ctxn];
10593 if (!child_ctx) {
10594 /*
10595 * This is executed from the parent task context, so
10596 * inherit events that have been marked for cloning.
10597 * First allocate and initialize a context for the
10598 * child.
10599 */
10600
10601 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
10602 if (!child_ctx)
10603 return -ENOMEM;
10604
10605 child->perf_event_ctxp[ctxn] = child_ctx;
10606 }
10607
10608 ret = inherit_group(event, parent, parent_ctx,
10609 child, child_ctx);
10610
10611 if (ret)
10612 *inherited_all = 0;
10613
10614 return ret;
10615 }
10616
10617 /*
10618 * Initialize the perf_event context in task_struct
10619 */
10620 static int perf_event_init_context(struct task_struct *child, int ctxn)
10621 {
10622 struct perf_event_context *child_ctx, *parent_ctx;
10623 struct perf_event_context *cloned_ctx;
10624 struct perf_event *event;
10625 struct task_struct *parent = current;
10626 int inherited_all = 1;
10627 unsigned long flags;
10628 int ret = 0;
10629
10630 if (likely(!parent->perf_event_ctxp[ctxn]))
10631 return 0;
10632
10633 /*
10634 * If the parent's context is a clone, pin it so it won't get
10635 * swapped under us.
10636 */
10637 parent_ctx = perf_pin_task_context(parent, ctxn);
10638 if (!parent_ctx)
10639 return 0;
10640
10641 /*
10642 * No need to check if parent_ctx != NULL here; since we saw
10643 * it non-NULL earlier, the only reason for it to become NULL
10644 * is if we exit, and since we're currently in the middle of
10645 * a fork we can't be exiting at the same time.
10646 */
10647
10648 /*
10649 * Lock the parent list. No need to lock the child - not PID
10650 * hashed yet and not running, so nobody can access it.
10651 */
10652 mutex_lock(&parent_ctx->mutex);
10653
10654 /*
10655 	 * We don't have to disable NMIs - we are only looking at
10656 * the list, not manipulating it:
10657 */
10658 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
10659 ret = inherit_task_group(event, parent, parent_ctx,
10660 child, ctxn, &inherited_all);
10661 if (ret)
10662 goto out_unlock;
10663 }
10664
10665 /*
10666 	 * We can't hold ctx->lock when iterating the ->flexible_groups list due
10667 * to allocations, but we need to prevent rotation because
10668 * rotate_ctx() will change the list from interrupt context.
10669 */
10670 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10671 parent_ctx->rotate_disable = 1;
10672 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10673
10674 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
10675 ret = inherit_task_group(event, parent, parent_ctx,
10676 child, ctxn, &inherited_all);
10677 if (ret)
10678 goto out_unlock;
10679 }
10680
10681 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
10682 parent_ctx->rotate_disable = 0;
10683
10684 child_ctx = child->perf_event_ctxp[ctxn];
10685
10686 if (child_ctx && inherited_all) {
10687 /*
10688 * Mark the child context as a clone of the parent
10689 * context, or of whatever the parent is a clone of.
10690 *
10691 * Note that if the parent is a clone, the holding of
10692 		 * parent_ctx->lock keeps it from being uncloned.
10693 */
10694 cloned_ctx = parent_ctx->parent_ctx;
10695 if (cloned_ctx) {
10696 child_ctx->parent_ctx = cloned_ctx;
10697 child_ctx->parent_gen = parent_ctx->parent_gen;
10698 } else {
10699 child_ctx->parent_ctx = parent_ctx;
10700 child_ctx->parent_gen = parent_ctx->generation;
10701 }
10702 get_ctx(child_ctx->parent_ctx);
10703 }
10704
10705 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
10706 out_unlock:
10707 mutex_unlock(&parent_ctx->mutex);
10708
10709 perf_unpin_context(parent_ctx);
10710 put_ctx(parent_ctx);
10711
10712 return ret;
10713 }
10714
10715 /*
10716 * Initialize the perf_event context in task_struct
10717 */
10718 int perf_event_init_task(struct task_struct *child)
10719 {
10720 int ctxn, ret;
10721
10722 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
10723 mutex_init(&child->perf_event_mutex);
10724 INIT_LIST_HEAD(&child->perf_event_list);
10725
10726 for_each_task_context_nr(ctxn) {
10727 ret = perf_event_init_context(child, ctxn);
10728 if (ret) {
10729 perf_event_free_task(child);
10730 return ret;
10731 }
10732 }
10733
10734 return 0;
10735 }
10736
10737 static void __init perf_event_init_all_cpus(void)
10738 {
10739 struct swevent_htable *swhash;
10740 int cpu;
10741
10742 for_each_possible_cpu(cpu) {
10743 swhash = &per_cpu(swevent_htable, cpu);
10744 mutex_init(&swhash->hlist_mutex);
10745 INIT_LIST_HEAD(&per_cpu(active_ctx_list, cpu));
10746
10747 INIT_LIST_HEAD(&per_cpu(pmu_sb_events.list, cpu));
10748 raw_spin_lock_init(&per_cpu(pmu_sb_events.lock, cpu));
10749
10750 INIT_LIST_HEAD(&per_cpu(sched_cb_list, cpu));
10751 }
10752 }
10753
10754 int perf_event_init_cpu(unsigned int cpu)
10755 {
10756 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
10757
10758 mutex_lock(&swhash->hlist_mutex);
10759 if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
10760 struct swevent_hlist *hlist;
10761
10762 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
10763 WARN_ON(!hlist);
10764 rcu_assign_pointer(swhash->swevent_hlist, hlist);
10765 }
10766 mutex_unlock(&swhash->hlist_mutex);
10767 return 0;
10768 }
10769
10770 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
10771 static void __perf_event_exit_context(void *__info)
10772 {
10773 struct perf_event_context *ctx = __info;
10774 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
10775 struct perf_event *event;
10776
10777 raw_spin_lock(&ctx->lock);
10778 list_for_each_entry(event, &ctx->event_list, event_entry)
10779 __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
10780 raw_spin_unlock(&ctx->lock);
10781 }
10782
10783 static void perf_event_exit_cpu_context(int cpu)
10784 {
10785 struct perf_event_context *ctx;
10786 struct pmu *pmu;
10787 int idx;
10788
10789 idx = srcu_read_lock(&pmus_srcu);
10790 list_for_each_entry_rcu(pmu, &pmus, entry) {
10791 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
10792
10793 mutex_lock(&ctx->mutex);
10794 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
10795 mutex_unlock(&ctx->mutex);
10796 }
10797 srcu_read_unlock(&pmus_srcu, idx);
10798 }
10799 #else
10800
10801 static void perf_event_exit_cpu_context(int cpu) { }
10802
10803 #endif
10804
10805 int perf_event_exit_cpu(unsigned int cpu)
10806 {
10807 perf_event_exit_cpu_context(cpu);
10808 return 0;
10809 }
10810
10811 static int
10812 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
10813 {
10814 int cpu;
10815
10816 for_each_online_cpu(cpu)
10817 perf_event_exit_cpu(cpu);
10818
10819 return NOTIFY_OK;
10820 }
10821
10822 /*
10823 * Run the perf reboot notifier at the very last possible moment so that
10824 * the generic watchdog code runs as long as possible.
10825 */
10826 static struct notifier_block perf_reboot_notifier = {
10827 .notifier_call = perf_reboot,
10828 .priority = INT_MIN,
10829 };
10830
10831 void __init perf_event_init(void)
10832 {
10833 int ret;
10834
10835 idr_init(&pmu_idr);
10836
10837 perf_event_init_all_cpus();
10838 init_srcu_struct(&pmus_srcu);
10839 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
10840 perf_pmu_register(&perf_cpu_clock, NULL, -1);
10841 perf_pmu_register(&perf_task_clock, NULL, -1);
10842 perf_tp_register();
10843 perf_event_init_cpu(smp_processor_id());
10844 register_reboot_notifier(&perf_reboot_notifier);
10845
10846 ret = init_hw_breakpoint();
10847 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
10848
10849 /*
10850 	 * Build-time assertion that we keep the data_head at the intended
10851 	 * location. IOW, validation that we got the __reserved[] size right.
10852 */
10853 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
10854 != 1024);
10855 }
10856
10857 ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr,
10858 char *page)
10859 {
10860 struct perf_pmu_events_attr *pmu_attr =
10861 container_of(attr, struct perf_pmu_events_attr, attr);
10862
10863 if (pmu_attr->event_str)
10864 return sprintf(page, "%s\n", pmu_attr->event_str);
10865
10866 return 0;
10867 }
10868 EXPORT_SYMBOL_GPL(perf_event_sysfs_show);
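/*
 * Example (rough sketch): perf_event_sysfs_show() is wired up by PMU drivers
 * through struct perf_pmu_events_attr, typically via the
 * PMU_EVENT_ATTR_STRING() helper from <linux/perf_event.h>; the event name
 * and encoding below are made up for illustration.
 *
 *	PMU_EVENT_ATTR_STRING(my_event, my_event_attr, "event=0x3c");
 *
 *	static struct attribute *my_events_attrs[] = {
 *		&my_event_attr.attr.attr,
 *		NULL,
 *	};
 *
 * The string then shows up under the PMU's events/ directory in sysfs.
 */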
10869
10870 static int __init perf_event_sysfs_init(void)
10871 {
10872 struct pmu *pmu;
10873 int ret;
10874
10875 mutex_lock(&pmus_lock);
10876
10877 ret = bus_register(&pmu_bus);
10878 if (ret)
10879 goto unlock;
10880
10881 list_for_each_entry(pmu, &pmus, entry) {
10882 if (!pmu->name || pmu->type < 0)
10883 continue;
10884
10885 ret = pmu_dev_alloc(pmu);
10886 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
10887 }
10888 pmu_bus_running = 1;
10889 ret = 0;
10890
10891 unlock:
10892 mutex_unlock(&pmus_lock);
10893
10894 return ret;
10895 }
10896 device_initcall(perf_event_sysfs_init);
10897
10898 #ifdef CONFIG_CGROUP_PERF
10899 static struct cgroup_subsys_state *
10900 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
10901 {
10902 struct perf_cgroup *jc;
10903
10904 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
10905 if (!jc)
10906 return ERR_PTR(-ENOMEM);
10907
10908 jc->info = alloc_percpu(struct perf_cgroup_info);
10909 if (!jc->info) {
10910 kfree(jc);
10911 return ERR_PTR(-ENOMEM);
10912 }
10913
10914 return &jc->css;
10915 }
10916
10917 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
10918 {
10919 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
10920
10921 free_percpu(jc->info);
10922 kfree(jc);
10923 }
10924
10925 static int __perf_cgroup_move(void *info)
10926 {
10927 struct task_struct *task = info;
10928 rcu_read_lock();
10929 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
10930 rcu_read_unlock();
10931 return 0;
10932 }
10933
10934 static void perf_cgroup_attach(struct cgroup_taskset *tset)
10935 {
10936 struct task_struct *task;
10937 struct cgroup_subsys_state *css;
10938
10939 cgroup_taskset_for_each(task, css, tset)
10940 task_function_call(task, __perf_cgroup_move, task);
10941 }
10942
10943 struct cgroup_subsys perf_event_cgrp_subsys = {
10944 .css_alloc = perf_cgroup_css_alloc,
10945 .css_free = perf_cgroup_css_free,
10946 .attach = perf_cgroup_attach,
10947 };
10948 #endif /* CONFIG_CGROUP_PERF */
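/*
 * Example (illustrative userspace sketch): cgroup-mode monitoring, which is
 * what PERF_FLAG_PID_CGROUP in sys_perf_event_open() above enables. The pid
 * argument carries an fd for the cgroup directory and cpu must name a real
 * CPU; the mount path below is an assumption about where the perf_event
 * cgroup hierarchy lives, and attr is set up as in the earlier examples.
 *
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
 *	int fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, 0, -1,
 *			 PERF_FLAG_PID_CGROUP);
 *	// counts only while tasks of "mygroup" run on CPU 0
 */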
10949