1 /*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/perf_event.h>
38 #include <linux/ftrace_event.h>
39 #include <linux/hw_breakpoint.h>
40 #include <linux/mm_types.h>
41 #include <linux/cgroup.h>
42 #include <linux/module.h>
43 #include <linux/mman.h>
44 #include <linux/compat.h>
45
46 #include "internal.h"
47
48 #include <asm/irq_regs.h>
49
50 static struct workqueue_struct *perf_wq;
51
52 struct remote_function_call {
53 struct task_struct *p;
54 int (*func)(void *info);
55 void *info;
56 int ret;
57 };
58
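/*
 * Helper run via smp_call_function_single() on the target CPU: if a task
 * was specified, bail out with -EAGAIN (via tfc->ret) unless that task is
 * still current on this CPU; otherwise invoke the requested function and
 * store its return value in tfc->ret.
 */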
59 static void remote_function(void *data)
60 {
61 struct remote_function_call *tfc = data;
62 struct task_struct *p = tfc->p;
63
64 if (p) {
65 tfc->ret = -EAGAIN;
66 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
67 return;
68 }
69
70 tfc->ret = tfc->func(tfc->info);
71 }
72
73 /**
74 * task_function_call - call a function on the cpu on which a task runs
75 * @p: the task to evaluate
76 * @func: the function to be called
77 * @info: the function call argument
78 *
79 * Calls the function @func when the task is currently running. This might
80 * be on the current CPU, in which case the function is called directly.
81 *
82 * returns: @func return value, or
83 * -ESRCH - when the process isn't running
84 * -EAGAIN - when the process moved away
85 */
86 static int
87 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
88 {
89 struct remote_function_call data = {
90 .p = p,
91 .func = func,
92 .info = info,
93 .ret = -ESRCH, /* No such (running) process */
94 };
95
96 if (task_curr(p))
97 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
98
99 return data.ret;
100 }
101
102 /**
103 * cpu_function_call - call a function on the cpu
104 * @func: the function to be called
105 * @info: the function call argument
106 *
107 * Calls the function @func on the remote cpu.
108 *
109 * returns: @func return value or -ENXIO when the cpu is offline
110 */
111 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
112 {
113 struct remote_function_call data = {
114 .p = NULL,
115 .func = func,
116 .info = info,
117 .ret = -ENXIO, /* No such CPU */
118 };
119
120 smp_call_function_single(cpu, remote_function, &data, 1);
121
122 return data.ret;
123 }
124
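/*
 * Sentinel owner value used to mark events owned by the kernel itself
 * rather than by a user task; is_kernel_event() tests for it.
 */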
125 #define EVENT_OWNER_KERNEL ((void *) -1)
126
127 static bool is_kernel_event(struct perf_event *event)
128 {
129 return event->owner == EVENT_OWNER_KERNEL;
130 }
131
132 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
133 PERF_FLAG_FD_OUTPUT |\
134 PERF_FLAG_PID_CGROUP |\
135 PERF_FLAG_FD_CLOEXEC)
136
137 /*
138 * branch priv levels that need permission checks
139 */
140 #define PERF_SAMPLE_BRANCH_PERM_PLM \
141 (PERF_SAMPLE_BRANCH_KERNEL |\
142 PERF_SAMPLE_BRANCH_HV)
143
144 enum event_type_t {
145 EVENT_FLEXIBLE = 0x1,
146 EVENT_PINNED = 0x2,
147 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
148 };
149
150 /*
151 * perf_sched_events : >0 events exist
152 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
153 */
154 struct static_key_deferred perf_sched_events __read_mostly;
155 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
156 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
157
158 static atomic_t nr_mmap_events __read_mostly;
159 static atomic_t nr_comm_events __read_mostly;
160 static atomic_t nr_task_events __read_mostly;
161 static atomic_t nr_freq_events __read_mostly;
162
163 static LIST_HEAD(pmus);
164 static DEFINE_MUTEX(pmus_lock);
165 static struct srcu_struct pmus_srcu;
166
167 /*
168 * perf event paranoia level:
169 * -1 - not paranoid at all
170 * 0 - disallow raw tracepoint access for unpriv
171 * 1 - disallow cpu events for unpriv
172 * 2 - disallow kernel profiling for unpriv
173 * 3 - disallow all unpriv perf event use
174 */
175 #ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
176 int sysctl_perf_event_paranoid __read_mostly = 3;
177 #else
178 int sysctl_perf_event_paranoid __read_mostly = 1;
179 #endif
180
181 /* Minimum for 512 kiB + 1 user control page */
182 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
183
184 /*
185 * max perf event sample rate
186 */
187 #define DEFAULT_MAX_SAMPLE_RATE 100000
188 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
189 #define DEFAULT_CPU_TIME_MAX_PERCENT 25
190
191 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
192
193 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
194 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
195
196 static int perf_sample_allowed_ns __read_mostly =
197 DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100;
198
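/*
 * Recompute the sampling-time budget: perf_sample_allowed_ns is
 * sysctl_perf_cpu_time_max_percent percent of the current sample period.
 */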
199 void update_perf_cpu_limits(void)
200 {
201 u64 tmp = perf_sample_period_ns;
202
203 tmp *= sysctl_perf_cpu_time_max_percent;
204 do_div(tmp, 100);
205 ACCESS_ONCE(perf_sample_allowed_ns) = tmp;
206 }
207
208 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
209
210 int perf_proc_update_handler(struct ctl_table *table, int write,
211 void __user *buffer, size_t *lenp,
212 loff_t *ppos)
213 {
214 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
215
216 if (ret || !write)
217 return ret;
218
219 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
220 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
221 update_perf_cpu_limits();
222
223 return 0;
224 }
225
226 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
227
228 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
229 void __user *buffer, size_t *lenp,
230 loff_t *ppos)
231 {
232 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
233
234 if (ret || !write)
235 return ret;
236
237 update_perf_cpu_limits();
238
239 return 0;
240 }
241
242 /*
243 * perf samples are done in some very critical code paths (NMIs).
244 * If they take too much CPU time, the system can lock up and not
245 * get any real work done. This will drop the sample rate when
246 * we detect that events are taking too long.
247 */
248 #define NR_ACCUMULATED_SAMPLES 128
249 static DEFINE_PER_CPU(u64, running_sample_length);
250
251 static void perf_duration_warn(struct irq_work *w)
252 {
253 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
254 u64 avg_local_sample_len;
255 u64 local_samples_len;
256
257 local_samples_len = __this_cpu_read(running_sample_length);
258 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
259
260 printk_ratelimited(KERN_WARNING
261 "perf interrupt took too long (%lld > %lld), lowering "
262 "kernel.perf_event_max_sample_rate to %d\n",
263 avg_local_sample_len, allowed_ns >> 1,
264 sysctl_perf_event_sample_rate);
265 }
266
267 static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn);
268
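/*
 * Called with the duration of the last sample. Maintains a decaying running
 * sum (roughly an average over NR_ACCUMULATED_SAMPLES samples) and, when the
 * average exceeds the allowed budget, halves max_samples_per_tick and lowers
 * the sysctl sample rate, warning via irq_work since we cannot printk here.
 */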
269 void perf_sample_event_took(u64 sample_len_ns)
270 {
271 u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns);
272 u64 avg_local_sample_len;
273 u64 local_samples_len;
274
275 if (allowed_ns == 0)
276 return;
277
278 /* decay the counter by 1 average sample */
279 local_samples_len = __this_cpu_read(running_sample_length);
280 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
281 local_samples_len += sample_len_ns;
282 __this_cpu_write(running_sample_length, local_samples_len);
283
284 /*
285 * note: this will be biased artificially low until we have
286 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
287 * from having to maintain a count.
288 */
289 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
290
291 if (avg_local_sample_len <= allowed_ns)
292 return;
293
294 if (max_samples_per_tick <= 1)
295 return;
296
297 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
298 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
299 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
300
301 update_perf_cpu_limits();
302
303 if (!irq_work_queue(&perf_duration_work)) {
304 early_printk("perf interrupt took too long (%lld > %lld), lowering "
305 "kernel.perf_event_max_sample_rate to %d\n",
306 avg_local_sample_len, allowed_ns >> 1,
307 sysctl_perf_event_sample_rate);
308 }
309 }
310
311 static atomic64_t perf_event_id;
312
313 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
314 enum event_type_t event_type);
315
316 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
317 enum event_type_t event_type,
318 struct task_struct *task);
319
320 static void update_context_time(struct perf_event_context *ctx);
321 static u64 perf_event_time(struct perf_event *event);
322
323 void __weak perf_event_print_debug(void) { }
324
325 extern __weak const char *perf_pmu_name(void)
326 {
327 return "pmu";
328 }
329
330 static inline u64 perf_clock(void)
331 {
332 return local_clock();
333 }
334
335 static inline struct perf_cpu_context *
336 __get_cpu_context(struct perf_event_context *ctx)
337 {
338 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
339 }
340
341 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
342 struct perf_event_context *ctx)
343 {
344 raw_spin_lock(&cpuctx->ctx.lock);
345 if (ctx)
346 raw_spin_lock(&ctx->lock);
347 }
348
349 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
350 struct perf_event_context *ctx)
351 {
352 if (ctx)
353 raw_spin_unlock(&ctx->lock);
354 raw_spin_unlock(&cpuctx->ctx.lock);
355 }
356
357 #ifdef CONFIG_CGROUP_PERF
358
359 /*
360 * perf_cgroup_info keeps track of time_enabled for a cgroup.
361 * This is a per-cpu dynamically allocated data structure.
362 */
363 struct perf_cgroup_info {
364 u64 time;
365 u64 timestamp;
366 };
367
368 struct perf_cgroup {
369 struct cgroup_subsys_state css;
370 struct perf_cgroup_info __percpu *info;
371 };
372
373 /*
374 * Must ensure cgroup is pinned (css_get) before calling
375 * this function. In other words, we cannot call this function
376 * if there is no cgroup event for the current CPU context.
377 */
378 static inline struct perf_cgroup *
379 perf_cgroup_from_task(struct task_struct *task)
380 {
381 return container_of(task_css(task, perf_event_cgrp_id),
382 struct perf_cgroup, css);
383 }
384
385 static inline bool
386 perf_cgroup_match(struct perf_event *event)
387 {
388 struct perf_event_context *ctx = event->ctx;
389 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
390
391 /* @event doesn't care about cgroup */
392 if (!event->cgrp)
393 return true;
394
395 /* wants specific cgroup scope but @cpuctx isn't associated with any */
396 if (!cpuctx->cgrp)
397 return false;
398
399 /*
400 * Cgroup scoping is recursive. An event enabled for a cgroup is
401 * also enabled for all its descendant cgroups. If @cpuctx's
402 * cgroup is a descendant of @event's (the test covers identity
403 * case), it's a match.
404 */
405 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
406 event->cgrp->css.cgroup);
407 }
408
409 static inline void perf_detach_cgroup(struct perf_event *event)
410 {
411 css_put(&event->cgrp->css);
412 event->cgrp = NULL;
413 }
414
415 static inline int is_cgroup_event(struct perf_event *event)
416 {
417 return event->cgrp != NULL;
418 }
419
420 static inline u64 perf_cgroup_event_time(struct perf_event *event)
421 {
422 struct perf_cgroup_info *t;
423
424 t = per_cpu_ptr(event->cgrp->info, event->cpu);
425 return t->time;
426 }
427
428 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
429 {
430 struct perf_cgroup_info *info;
431 u64 now;
432
433 now = perf_clock();
434
435 info = this_cpu_ptr(cgrp->info);
436
437 info->time += now - info->timestamp;
438 info->timestamp = now;
439 }
440
441 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
442 {
443 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
444 if (cgrp_out)
445 __update_cgrp_time(cgrp_out);
446 }
447
448 static inline void update_cgrp_time_from_event(struct perf_event *event)
449 {
450 struct perf_cgroup *cgrp;
451
452 /*
453 * ensure we access cgroup data only when needed and
454 * when we know the cgroup is pinned (css_get)
455 */
456 if (!is_cgroup_event(event))
457 return;
458
459 cgrp = perf_cgroup_from_task(current);
460 /*
461 * Do not update time when cgroup is not active
462 */
463 if (cgrp == event->cgrp)
464 __update_cgrp_time(event->cgrp);
465 }
466
467 static inline void
468 perf_cgroup_set_timestamp(struct task_struct *task,
469 struct perf_event_context *ctx)
470 {
471 struct perf_cgroup *cgrp;
472 struct perf_cgroup_info *info;
473
474 /*
475 * ctx->lock held by caller
476 * ensure we do not access cgroup data
477 * unless we have the cgroup pinned (css_get)
478 */
479 if (!task || !ctx->nr_cgroups)
480 return;
481
482 cgrp = perf_cgroup_from_task(task);
483 info = this_cpu_ptr(cgrp->info);
484 info->timestamp = ctx->timestamp;
485 }
486
487 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
488 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
489
490 /*
491 * reschedule events based on the cgroup constraint of task.
492 *
493 * mode SWOUT : schedule out everything
494 * mode SWIN : schedule in based on cgroup for next
495 */
496 void perf_cgroup_switch(struct task_struct *task, int mode)
497 {
498 struct perf_cpu_context *cpuctx;
499 struct pmu *pmu;
500 unsigned long flags;
501
502 /*
503 * disable interrupts to avoid getting nr_cgroups
504 * changes via __perf_event_disable(). Also
505 * avoids preemption.
506 */
507 local_irq_save(flags);
508
509 /*
510 * we reschedule only in the presence of cgroup
511 * constrained events.
512 */
513 rcu_read_lock();
514
515 list_for_each_entry_rcu(pmu, &pmus, entry) {
516 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
517 if (cpuctx->unique_pmu != pmu)
518 continue; /* ensure we process each cpuctx once */
519
520 /*
521 * perf_cgroup_events says at least one
522 * context on this CPU has cgroup events.
523 *
524 * ctx->nr_cgroups reports the number of cgroup
525 * events for a context.
526 */
527 if (cpuctx->ctx.nr_cgroups > 0) {
528 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
529 perf_pmu_disable(cpuctx->ctx.pmu);
530
531 if (mode & PERF_CGROUP_SWOUT) {
532 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
533 /*
534 * must not be done before ctxswout due
535 * to event_filter_match() in event_sched_out()
536 */
537 cpuctx->cgrp = NULL;
538 }
539
540 if (mode & PERF_CGROUP_SWIN) {
541 WARN_ON_ONCE(cpuctx->cgrp);
542 /*
543 * set cgrp before ctxsw in to allow
544 * event_filter_match() to not have to pass
545 * task around
546 */
547 cpuctx->cgrp = perf_cgroup_from_task(task);
548 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
549 }
550 perf_pmu_enable(cpuctx->ctx.pmu);
551 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
552 }
553 }
554
555 rcu_read_unlock();
556
557 local_irq_restore(flags);
558 }
559
560 static inline void perf_cgroup_sched_out(struct task_struct *task,
561 struct task_struct *next)
562 {
563 struct perf_cgroup *cgrp1;
564 struct perf_cgroup *cgrp2 = NULL;
565
566 /*
567 * we come here when we know perf_cgroup_events > 0
568 */
569 cgrp1 = perf_cgroup_from_task(task);
570
571 /*
572 * next is NULL when called from perf_event_enable_on_exec()
573 * that will systematically cause a cgroup_switch()
574 */
575 if (next)
576 cgrp2 = perf_cgroup_from_task(next);
577
578 /*
579 * only schedule out current cgroup events if we know
580 * that we are switching to a different cgroup. Otherwise,
581 * do not touch the cgroup events.
582 */
583 if (cgrp1 != cgrp2)
584 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
585 }
586
587 static inline void perf_cgroup_sched_in(struct task_struct *prev,
588 struct task_struct *task)
589 {
590 struct perf_cgroup *cgrp1;
591 struct perf_cgroup *cgrp2 = NULL;
592
593 /*
594 * we come here when we know perf_cgroup_events > 0
595 */
596 cgrp1 = perf_cgroup_from_task(task);
597
598 /* prev can never be NULL */
599 cgrp2 = perf_cgroup_from_task(prev);
600
601 /*
602 * only need to schedule in cgroup events if we are changing
603 * cgroup during ctxsw. Cgroup events were not scheduled
604 * out at ctxsw-out time if that was not the case.
605 */
606 if (cgrp1 != cgrp2)
607 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
608 }
609
610 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
611 struct perf_event_attr *attr,
612 struct perf_event *group_leader)
613 {
614 struct perf_cgroup *cgrp;
615 struct cgroup_subsys_state *css;
616 struct fd f = fdget(fd);
617 int ret = 0;
618
619 if (!f.file)
620 return -EBADF;
621
622 css = css_tryget_online_from_dir(f.file->f_dentry,
623 &perf_event_cgrp_subsys);
624 if (IS_ERR(css)) {
625 ret = PTR_ERR(css);
626 goto out;
627 }
628
629 cgrp = container_of(css, struct perf_cgroup, css);
630 event->cgrp = cgrp;
631
632 /*
633 * all events in a group must monitor
634 * the same cgroup because a task belongs
635 * to only one perf cgroup at a time
636 */
637 if (group_leader && group_leader->cgrp != cgrp) {
638 perf_detach_cgroup(event);
639 ret = -EINVAL;
640 }
641 out:
642 fdput(f);
643 return ret;
644 }
645
646 static inline void
647 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
648 {
649 struct perf_cgroup_info *t;
650 t = per_cpu_ptr(event->cgrp->info, event->cpu);
651 event->shadow_ctx_time = now - t->timestamp;
652 }
653
654 static inline void
655 perf_cgroup_defer_enabled(struct perf_event *event)
656 {
657 /*
658 * when the current task's perf cgroup does not match
659 * the event's, we need to remember to call the
660 * perf_mark_enable() function the first time a task with
661 * a matching perf cgroup is scheduled in.
662 */
663 if (is_cgroup_event(event) && !perf_cgroup_match(event))
664 event->cgrp_defer_enabled = 1;
665 }
666
667 static inline void
668 perf_cgroup_mark_enabled(struct perf_event *event,
669 struct perf_event_context *ctx)
670 {
671 struct perf_event *sub;
672 u64 tstamp = perf_event_time(event);
673
674 if (!event->cgrp_defer_enabled)
675 return;
676
677 event->cgrp_defer_enabled = 0;
678
679 event->tstamp_enabled = tstamp - event->total_time_enabled;
680 list_for_each_entry(sub, &event->sibling_list, group_entry) {
681 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
682 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
683 sub->cgrp_defer_enabled = 0;
684 }
685 }
686 }
687 #else /* !CONFIG_CGROUP_PERF */
688
689 static inline bool
690 perf_cgroup_match(struct perf_event *event)
691 {
692 return true;
693 }
694
695 static inline void perf_detach_cgroup(struct perf_event *event)
696 {}
697
698 static inline int is_cgroup_event(struct perf_event *event)
699 {
700 return 0;
701 }
702
703 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
704 {
705 return 0;
706 }
707
708 static inline void update_cgrp_time_from_event(struct perf_event *event)
709 {
710 }
711
712 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
713 {
714 }
715
716 static inline void perf_cgroup_sched_out(struct task_struct *task,
717 struct task_struct *next)
718 {
719 }
720
721 static inline void perf_cgroup_sched_in(struct task_struct *prev,
722 struct task_struct *task)
723 {
724 }
725
726 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
727 struct perf_event_attr *attr,
728 struct perf_event *group_leader)
729 {
730 return -EINVAL;
731 }
732
733 static inline void
734 perf_cgroup_set_timestamp(struct task_struct *task,
735 struct perf_event_context *ctx)
736 {
737 }
738
739 void
740 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
741 {
742 }
743
744 static inline void
745 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
746 {
747 }
748
749 static inline u64 perf_cgroup_event_time(struct perf_event *event)
750 {
751 return 0;
752 }
753
754 static inline void
755 perf_cgroup_defer_enabled(struct perf_event *event)
756 {
757 }
758
759 static inline void
760 perf_cgroup_mark_enabled(struct perf_event *event,
761 struct perf_event_context *ctx)
762 {
763 }
764 #endif
765
766 /*
767 * set default to be dependent on timer tick just
768 * like original code
769 */
770 #define PERF_CPU_HRTIMER (1000 / HZ)
771 /*
772 * function must be called with interrupts disabled
773 */
774 static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
775 {
776 struct perf_cpu_context *cpuctx;
777 enum hrtimer_restart ret = HRTIMER_NORESTART;
778 int rotations = 0;
779
780 WARN_ON(!irqs_disabled());
781
782 cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
783
784 rotations = perf_rotate_context(cpuctx);
785
786 /*
787 * arm timer if needed
788 */
789 if (rotations) {
790 hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
791 ret = HRTIMER_RESTART;
792 }
793
794 return ret;
795 }
796
797 /* CPU is going down */
798 void perf_cpu_hrtimer_cancel(int cpu)
799 {
800 struct perf_cpu_context *cpuctx;
801 struct pmu *pmu;
802 unsigned long flags;
803
804 if (WARN_ON(cpu != smp_processor_id()))
805 return;
806
807 local_irq_save(flags);
808
809 rcu_read_lock();
810
811 list_for_each_entry_rcu(pmu, &pmus, entry) {
812 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
813
814 if (pmu->task_ctx_nr == perf_sw_context)
815 continue;
816
817 hrtimer_cancel(&cpuctx->hrtimer);
818 }
819
820 rcu_read_unlock();
821
822 local_irq_restore(flags);
823 }
824
825 static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
826 {
827 struct hrtimer *hr = &cpuctx->hrtimer;
828 struct pmu *pmu = cpuctx->ctx.pmu;
829 int timer;
830
831 /* no multiplexing needed for SW PMU */
832 if (pmu->task_ctx_nr == perf_sw_context)
833 return;
834
835 /*
836 * check default is sane, if not set then force to
837 * default interval (1/tick)
838 */
839 timer = pmu->hrtimer_interval_ms;
840 if (timer < 1)
841 timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;
842
843 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
844
845 hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
846 hr->function = perf_cpu_hrtimer_handler;
847 }
848
849 static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
850 {
851 struct hrtimer *hr = &cpuctx->hrtimer;
852 struct pmu *pmu = cpuctx->ctx.pmu;
853
854 /* not for SW PMU */
855 if (pmu->task_ctx_nr == perf_sw_context)
856 return;
857
858 if (hrtimer_active(hr))
859 return;
860
861 if (!hrtimer_callback_running(hr))
862 __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
863 0, HRTIMER_MODE_REL_PINNED, 0);
864 }
865
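/*
 * pmu_disable_count is a per-cpu nesting counter: only the outermost
 * perf_pmu_disable() actually stops the PMU, and only the matching
 * outermost perf_pmu_enable() restarts it.
 */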
866 void perf_pmu_disable(struct pmu *pmu)
867 {
868 int *count = this_cpu_ptr(pmu->pmu_disable_count);
869 if (!(*count)++)
870 pmu->pmu_disable(pmu);
871 }
872
873 void perf_pmu_enable(struct pmu *pmu)
874 {
875 int *count = this_cpu_ptr(pmu->pmu_disable_count);
876 if (!--(*count))
877 pmu->pmu_enable(pmu);
878 }
879
880 static DEFINE_PER_CPU(struct list_head, rotation_list);
881
882 /*
883 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
884 * because they're strictly cpu affine and rotate_start is called with IRQs
885 * disabled, while rotate_context is called from IRQ context.
886 */
887 static void perf_pmu_rotate_start(struct pmu *pmu)
888 {
889 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
890 struct list_head *head = this_cpu_ptr(&rotation_list);
891
892 WARN_ON(!irqs_disabled());
893
894 if (list_empty(&cpuctx->rotation_list))
895 list_add(&cpuctx->rotation_list, head);
896 }
897
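/*
 * Context reference counting: get_ctx() expects the context to already be
 * alive (it warns if the refcount was zero), while put_ctx() drops the
 * parent-context and task references and frees the context via RCU once
 * the last reference goes away.
 */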
898 static void get_ctx(struct perf_event_context *ctx)
899 {
900 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
901 }
902
903 static void put_ctx(struct perf_event_context *ctx)
904 {
905 if (atomic_dec_and_test(&ctx->refcount)) {
906 if (ctx->parent_ctx)
907 put_ctx(ctx->parent_ctx);
908 if (ctx->task)
909 put_task_struct(ctx->task);
910 kfree_rcu(ctx, rcu_head);
911 }
912 }
913
914 /*
915 * Because of perf_event::ctx migration in sys_perf_event_open::move_group and
916 * perf_pmu_migrate_context() we need some magic.
917 *
918 * Those places that change perf_event::ctx will hold both
919 * perf_event_ctx::mutex of the 'old' and 'new' ctx value.
920 *
921 * Lock ordering is by mutex address. There is one other site where
922 * perf_event_context::mutex nests and that is put_event(). But remember that
923 * that is a parent<->child context relation, and migration does not affect
924 * children, therefore these two orderings should not interact.
925 *
926 * The change in perf_event::ctx does not affect children (as claimed above)
927 * because the sys_perf_event_open() case will install a new event and break
928 * the ctx parent<->child relation, and perf_pmu_migrate_context() is only
929 * concerned with cpuctx and that doesn't have children.
930 *
931 * The places that change perf_event::ctx will issue:
932 *
933 * perf_remove_from_context();
934 * synchronize_rcu();
935 * perf_install_in_context();
936 *
937 * to effect the change. The remove_from_context() + synchronize_rcu() should
938 * quiesce the event, after which we can install it in the new location. This
939 * means that only external vectors (perf_fops, prctl) can perturb the event
940 * while in transit. Therefore all such accessors should also acquire
941 * perf_event_context::mutex to serialize against this.
942 *
943 * However; because event->ctx can change while we're waiting to acquire
944 * ctx->mutex we must be careful and use the below perf_event_ctx_lock()
945 * function.
946 *
947 * Lock order:
948 * task_struct::perf_event_mutex
949 * perf_event_context::mutex
950 * perf_event_context::lock
951 * perf_event::child_mutex;
952 * perf_event::mmap_mutex
953 * mmap_sem
954 */
955 static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event)
956 {
957 struct perf_event_context *ctx;
958
959 again:
960 rcu_read_lock();
961 ctx = ACCESS_ONCE(event->ctx);
962 if (!atomic_inc_not_zero(&ctx->refcount)) {
963 rcu_read_unlock();
964 goto again;
965 }
966 rcu_read_unlock();
967
968 mutex_lock(&ctx->mutex);
969 if (event->ctx != ctx) {
970 mutex_unlock(&ctx->mutex);
971 put_ctx(ctx);
972 goto again;
973 }
974
975 return ctx;
976 }
977
978 static void perf_event_ctx_unlock(struct perf_event *event,
979 struct perf_event_context *ctx)
980 {
981 mutex_unlock(&ctx->mutex);
982 put_ctx(ctx);
983 }
984
985 /*
986 * This must be done under the ctx->lock, such as to serialize against
987 * context_equiv(), therefore we cannot call put_ctx() since that might end up
988 * calling scheduler related locks and ctx->lock nests inside those.
989 */
990 static __must_check struct perf_event_context *
991 unclone_ctx(struct perf_event_context *ctx)
992 {
993 struct perf_event_context *parent_ctx = ctx->parent_ctx;
994
995 lockdep_assert_held(&ctx->lock);
996
997 if (parent_ctx)
998 ctx->parent_ctx = NULL;
999 ctx->generation++;
1000
1001 return parent_ctx;
1002 }
1003
1004 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
1005 {
1006 /*
1007 * only top level events have the pid namespace they were created in
1008 */
1009 if (event->parent)
1010 event = event->parent;
1011
1012 return task_tgid_nr_ns(p, event->ns);
1013 }
1014
1015 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
1016 {
1017 /*
1018 * only top level events have the pid namespace they were created in
1019 */
1020 if (event->parent)
1021 event = event->parent;
1022
1023 return task_pid_nr_ns(p, event->ns);
1024 }
1025
1026 /*
1027 * If we inherit events we want to return the parent event id
1028 * to userspace.
1029 */
1030 static u64 primary_event_id(struct perf_event *event)
1031 {
1032 u64 id = event->id;
1033
1034 if (event->parent)
1035 id = event->parent->id;
1036
1037 return id;
1038 }
1039
1040 /*
1041 * Get the perf_event_context for a task and lock it.
1042 * This has to cope with the fact that until it is locked,
1043 * the context could get moved to another task.
1044 */
1045 static struct perf_event_context *
1046 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
1047 {
1048 struct perf_event_context *ctx;
1049
1050 retry:
1051 /*
1052 * One of the few rules of preemptible RCU is that one cannot do
1053 * rcu_read_unlock() while holding a scheduler (or nested) lock when
1054 * part of the read side critical section was preemptible -- see
1055 * rcu_read_unlock_special().
1056 *
1057 * Since ctx->lock nests under rq->lock we must ensure the entire read
1058 * side critical section is non-preemptible.
1059 */
1060 preempt_disable();
1061 rcu_read_lock();
1062 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
1063 if (ctx) {
1064 /*
1065 * If this context is a clone of another, it might
1066 * get swapped for another underneath us by
1067 * perf_event_task_sched_out, though the
1068 * rcu_read_lock() protects us from any context
1069 * getting freed. Lock the context and check if it
1070 * got swapped before we could get the lock, and retry
1071 * if so. If we locked the right context, then it
1072 * can't get swapped on us any more.
1073 */
1074 raw_spin_lock_irqsave(&ctx->lock, *flags);
1075 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
1076 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1077 rcu_read_unlock();
1078 preempt_enable();
1079 goto retry;
1080 }
1081
1082 if (!atomic_inc_not_zero(&ctx->refcount)) {
1083 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
1084 ctx = NULL;
1085 }
1086 }
1087 rcu_read_unlock();
1088 preempt_enable();
1089 return ctx;
1090 }
1091
1092 /*
1093 * Get the context for a task and increment its pin_count so it
1094 * can't get swapped to another task. This also increments its
1095 * reference count so that the context can't get freed.
1096 */
1097 static struct perf_event_context *
1098 perf_pin_task_context(struct task_struct *task, int ctxn)
1099 {
1100 struct perf_event_context *ctx;
1101 unsigned long flags;
1102
1103 ctx = perf_lock_task_context(task, ctxn, &flags);
1104 if (ctx) {
1105 ++ctx->pin_count;
1106 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1107 }
1108 return ctx;
1109 }
1110
1111 static void perf_unpin_context(struct perf_event_context *ctx)
1112 {
1113 unsigned long flags;
1114
1115 raw_spin_lock_irqsave(&ctx->lock, flags);
1116 --ctx->pin_count;
1117 raw_spin_unlock_irqrestore(&ctx->lock, flags);
1118 }
1119
1120 /*
1121 * Update the record of the current time in a context.
1122 */
1123 static void update_context_time(struct perf_event_context *ctx)
1124 {
1125 u64 now = perf_clock();
1126
1127 ctx->time += now - ctx->timestamp;
1128 ctx->timestamp = now;
1129 }
1130
1131 static u64 perf_event_time(struct perf_event *event)
1132 {
1133 struct perf_event_context *ctx = event->ctx;
1134
1135 if (is_cgroup_event(event))
1136 return perf_cgroup_event_time(event);
1137
1138 return ctx ? ctx->time : 0;
1139 }
1140
1141 /*
1142 * Update the total_time_enabled and total_time_running fields for an event.
1143 * The caller of this function needs to hold the ctx->lock.
1144 */
1145 static void update_event_times(struct perf_event *event)
1146 {
1147 struct perf_event_context *ctx = event->ctx;
1148 u64 run_end;
1149
1150 if (event->state < PERF_EVENT_STATE_INACTIVE ||
1151 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
1152 return;
1153 /*
1154 * in cgroup mode, time_enabled represents
1155 * the time the event was enabled AND active
1156 * tasks were in the monitored cgroup. This is
1157 * independent of the activity of the context as
1158 * there may be a mix of cgroup and non-cgroup events.
1159 *
1160 * That is why we treat cgroup events differently
1161 * here.
1162 */
1163 if (is_cgroup_event(event))
1164 run_end = perf_cgroup_event_time(event);
1165 else if (ctx->is_active)
1166 run_end = ctx->time;
1167 else
1168 run_end = event->tstamp_stopped;
1169
1170 event->total_time_enabled = run_end - event->tstamp_enabled;
1171
1172 if (event->state == PERF_EVENT_STATE_INACTIVE)
1173 run_end = event->tstamp_stopped;
1174 else
1175 run_end = perf_event_time(event);
1176
1177 event->total_time_running = run_end - event->tstamp_running;
1178
1179 }
1180
1181 /*
1182 * Update total_time_enabled and total_time_running for all events in a group.
1183 */
1184 static void update_group_times(struct perf_event *leader)
1185 {
1186 struct perf_event *event;
1187
1188 update_event_times(leader);
1189 list_for_each_entry(event, &leader->sibling_list, group_entry)
1190 update_event_times(event);
1191 }
1192
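/*
 * Group leaders live on one of two lists: pinned groups must always be
 * scheduled, while flexible groups are the ones that may be rotated
 * (multiplexed) when the PMU is over-committed.
 */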
1193 static struct list_head *
1194 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
1195 {
1196 if (event->attr.pinned)
1197 return &ctx->pinned_groups;
1198 else
1199 return &ctx->flexible_groups;
1200 }
1201
1202 /*
1203 * Add an event to the lists for its context.
1204 * Must be called with ctx->mutex and ctx->lock held.
1205 */
1206 static void
1207 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1208 {
1209 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1210 event->attach_state |= PERF_ATTACH_CONTEXT;
1211
1212 /*
1213 * If we're a stand alone event or group leader, we go to the context
1214 * list, group events are kept attached to the group so that
1215 * perf_group_detach can, at all times, locate all siblings.
1216 */
1217 if (event->group_leader == event) {
1218 struct list_head *list;
1219
1220 if (is_software_event(event))
1221 event->group_flags |= PERF_GROUP_SOFTWARE;
1222
1223 list = ctx_group_list(event, ctx);
1224 list_add_tail(&event->group_entry, list);
1225 }
1226
1227 if (is_cgroup_event(event))
1228 ctx->nr_cgroups++;
1229
1230 if (has_branch_stack(event))
1231 ctx->nr_branch_stack++;
1232
1233 list_add_rcu(&event->event_entry, &ctx->event_list);
1234 if (!ctx->nr_events)
1235 perf_pmu_rotate_start(ctx->pmu);
1236 ctx->nr_events++;
1237 if (event->attr.inherit_stat)
1238 ctx->nr_stat++;
1239
1240 ctx->generation++;
1241 }
1242
1243 /*
1244 * Initialize event state based on the perf_event_attr::disabled.
1245 */
1246 static inline void perf_event__state_init(struct perf_event *event)
1247 {
1248 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1249 PERF_EVENT_STATE_INACTIVE;
1250 }
1251
1252 /*
1253 * Called at perf_event creation and when events are attached/detached from a
1254 * group.
1255 */
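/*
 * The resulting read_size matches the perf read format, e.g. for
 * PERF_FORMAT_GROUP the buffer layout is:
 *
 *   { u64 nr;
 *     { u64 time_enabled; }   if PERF_FORMAT_TOTAL_TIME_ENABLED
 *     { u64 time_running; }   if PERF_FORMAT_TOTAL_TIME_RUNNING
 *     { u64 value;
 *       { u64 id; }           if PERF_FORMAT_ID
 *     } cntr[nr];
 *   }
 */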
1256 static void perf_event__read_size(struct perf_event *event)
1257 {
1258 int entry = sizeof(u64); /* value */
1259 int size = 0;
1260 int nr = 1;
1261
1262 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1263 size += sizeof(u64);
1264
1265 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1266 size += sizeof(u64);
1267
1268 if (event->attr.read_format & PERF_FORMAT_ID)
1269 entry += sizeof(u64);
1270
1271 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1272 nr += event->group_leader->nr_siblings;
1273 size += sizeof(u64);
1274 }
1275
1276 size += entry * nr;
1277 event->read_size = size;
1278 }
1279
1280 static void perf_event__header_size(struct perf_event *event)
1281 {
1282 struct perf_sample_data *data;
1283 u64 sample_type = event->attr.sample_type;
1284 u16 size = 0;
1285
1286 perf_event__read_size(event);
1287
1288 if (sample_type & PERF_SAMPLE_IP)
1289 size += sizeof(data->ip);
1290
1291 if (sample_type & PERF_SAMPLE_ADDR)
1292 size += sizeof(data->addr);
1293
1294 if (sample_type & PERF_SAMPLE_PERIOD)
1295 size += sizeof(data->period);
1296
1297 if (sample_type & PERF_SAMPLE_WEIGHT)
1298 size += sizeof(data->weight);
1299
1300 if (sample_type & PERF_SAMPLE_READ)
1301 size += event->read_size;
1302
1303 if (sample_type & PERF_SAMPLE_DATA_SRC)
1304 size += sizeof(data->data_src.val);
1305
1306 if (sample_type & PERF_SAMPLE_TRANSACTION)
1307 size += sizeof(data->txn);
1308
1309 event->header_size = size;
1310 }
1311
1312 static void perf_event__id_header_size(struct perf_event *event)
1313 {
1314 struct perf_sample_data *data;
1315 u64 sample_type = event->attr.sample_type;
1316 u16 size = 0;
1317
1318 if (sample_type & PERF_SAMPLE_TID)
1319 size += sizeof(data->tid_entry);
1320
1321 if (sample_type & PERF_SAMPLE_TIME)
1322 size += sizeof(data->time);
1323
1324 if (sample_type & PERF_SAMPLE_IDENTIFIER)
1325 size += sizeof(data->id);
1326
1327 if (sample_type & PERF_SAMPLE_ID)
1328 size += sizeof(data->id);
1329
1330 if (sample_type & PERF_SAMPLE_STREAM_ID)
1331 size += sizeof(data->stream_id);
1332
1333 if (sample_type & PERF_SAMPLE_CPU)
1334 size += sizeof(data->cpu_entry);
1335
1336 event->id_header_size = size;
1337 }
1338
1339 static void perf_group_attach(struct perf_event *event)
1340 {
1341 struct perf_event *group_leader = event->group_leader, *pos;
1342
1343 /*
1344 * We can have double attach due to group movement in perf_event_open.
1345 */
1346 if (event->attach_state & PERF_ATTACH_GROUP)
1347 return;
1348
1349 event->attach_state |= PERF_ATTACH_GROUP;
1350
1351 if (group_leader == event)
1352 return;
1353
1354 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1355 !is_software_event(event))
1356 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1357
1358 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1359 group_leader->nr_siblings++;
1360
1361 perf_event__header_size(group_leader);
1362
1363 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1364 perf_event__header_size(pos);
1365 }
1366
1367 /*
1368 * Remove an event from the lists for its context.
1369 * Must be called with ctx->mutex and ctx->lock held.
1370 */
1371 static void
1372 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1373 {
1374 struct perf_cpu_context *cpuctx;
1375 /*
1376 * We can have double detach due to exit/hot-unplug + close.
1377 */
1378 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1379 return;
1380
1381 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1382
1383 if (is_cgroup_event(event)) {
1384 ctx->nr_cgroups--;
1385 cpuctx = __get_cpu_context(ctx);
1386 /*
1387 * if there are no more cgroup events
1388 * then clear cgrp to avoid a stale pointer
1389 * in update_cgrp_time_from_cpuctx()
1390 */
1391 if (!ctx->nr_cgroups)
1392 cpuctx->cgrp = NULL;
1393 }
1394
1395 if (has_branch_stack(event))
1396 ctx->nr_branch_stack--;
1397
1398 ctx->nr_events--;
1399 if (event->attr.inherit_stat)
1400 ctx->nr_stat--;
1401
1402 list_del_rcu(&event->event_entry);
1403
1404 if (event->group_leader == event)
1405 list_del_init(&event->group_entry);
1406
1407 update_group_times(event);
1408
1409 /*
1410 * If event was in error state, then keep it
1411 * that way, otherwise bogus counts will be
1412 * returned on read(). The only way to get out
1413 * of error state is by explicit re-enabling
1414 * of the event
1415 */
1416 if (event->state > PERF_EVENT_STATE_OFF)
1417 event->state = PERF_EVENT_STATE_OFF;
1418
1419 ctx->generation++;
1420 }
1421
1422 static void perf_group_detach(struct perf_event *event)
1423 {
1424 struct perf_event *sibling, *tmp;
1425 struct list_head *list = NULL;
1426
1427 /*
1428 * We can have double detach due to exit/hot-unplug + close.
1429 */
1430 if (!(event->attach_state & PERF_ATTACH_GROUP))
1431 return;
1432
1433 event->attach_state &= ~PERF_ATTACH_GROUP;
1434
1435 /*
1436 * If this is a sibling, remove it from its group.
1437 */
1438 if (event->group_leader != event) {
1439 list_del_init(&event->group_entry);
1440 event->group_leader->nr_siblings--;
1441 goto out;
1442 }
1443
1444 if (!list_empty(&event->group_entry))
1445 list = &event->group_entry;
1446
1447 /*
1448 * If this was a group event with sibling events then
1449 * upgrade the siblings to singleton events by adding them
1450 * to whatever list we are on.
1451 */
1452 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1453 if (list)
1454 list_move_tail(&sibling->group_entry, list);
1455 sibling->group_leader = sibling;
1456
1457 /* Inherit group flags from the previous leader */
1458 sibling->group_flags = event->group_flags;
1459 }
1460
1461 out:
1462 perf_event__header_size(event->group_leader);
1463
1464 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1465 perf_event__header_size(tmp);
1466 }
1467
1468 /*
1469 * User event whose owner task has gone away.
1470 */
1471 static bool is_orphaned_event(struct perf_event *event)
1472 {
1473 return event && !is_kernel_event(event) && !event->owner;
1474 }
1475
1476 /*
1477 * Event has a parent, but the parent's task has finished and the event is
1478 * alive only because its children hold a reference.
1479 */
1480 static bool is_orphaned_child(struct perf_event *event)
1481 {
1482 return is_orphaned_event(event->parent);
1483 }
1484
1485 static void orphans_remove_work(struct work_struct *work);
1486
1487 static void schedule_orphans_remove(struct perf_event_context *ctx)
1488 {
1489 if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
1490 return;
1491
1492 if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
1493 get_ctx(ctx);
1494 ctx->orphans_remove_sched = true;
1495 }
1496 }
1497
1498 static int __init perf_workqueue_init(void)
1499 {
1500 perf_wq = create_singlethread_workqueue("perf");
1501 WARN(!perf_wq, "failed to create perf workqueue\n");
1502 return perf_wq ? 0 : -1;
1503 }
1504
1505 core_initcall(perf_workqueue_init);
1506
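/*
 * An event is only eligible to run here if it is not bound to some other
 * CPU and its cgroup (if any) matches the current cgroup context.
 */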
1507 static inline int
1508 event_filter_match(struct perf_event *event)
1509 {
1510 return (event->cpu == -1 || event->cpu == smp_processor_id())
1511 && perf_cgroup_match(event);
1512 }
1513
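/*
 * Stop an individual event: update its timestamps, remove it from the PMU
 * via ->del(), and drop the context/cpuctx accounting (active, freq and
 * exclusive counts).
 */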
1514 static void
1515 event_sched_out(struct perf_event *event,
1516 struct perf_cpu_context *cpuctx,
1517 struct perf_event_context *ctx)
1518 {
1519 u64 tstamp = perf_event_time(event);
1520 u64 delta;
1521 /*
1522 * An event which could not be activated because of
1523 * filter mismatch still needs to have its timings
1524 * maintained, otherwise bogus information is returned
1525 * via read() for time_enabled, time_running:
1526 */
1527 if (event->state == PERF_EVENT_STATE_INACTIVE
1528 && !event_filter_match(event)) {
1529 delta = tstamp - event->tstamp_stopped;
1530 event->tstamp_running += delta;
1531 event->tstamp_stopped = tstamp;
1532 }
1533
1534 if (event->state != PERF_EVENT_STATE_ACTIVE)
1535 return;
1536
1537 perf_pmu_disable(event->pmu);
1538
1539 event->state = PERF_EVENT_STATE_INACTIVE;
1540 if (event->pending_disable) {
1541 event->pending_disable = 0;
1542 event->state = PERF_EVENT_STATE_OFF;
1543 }
1544 event->tstamp_stopped = tstamp;
1545 event->pmu->del(event, 0);
1546 event->oncpu = -1;
1547
1548 if (!is_software_event(event))
1549 cpuctx->active_oncpu--;
1550 ctx->nr_active--;
1551 if (event->attr.freq && event->attr.sample_freq)
1552 ctx->nr_freq--;
1553 if (event->attr.exclusive || !cpuctx->active_oncpu)
1554 cpuctx->exclusive = 0;
1555
1556 if (is_orphaned_child(event))
1557 schedule_orphans_remove(ctx);
1558
1559 perf_pmu_enable(event->pmu);
1560 }
1561
1562 static void
1563 group_sched_out(struct perf_event *group_event,
1564 struct perf_cpu_context *cpuctx,
1565 struct perf_event_context *ctx)
1566 {
1567 struct perf_event *event;
1568 int state = group_event->state;
1569
1570 event_sched_out(group_event, cpuctx, ctx);
1571
1572 /*
1573 * Schedule out siblings (if any):
1574 */
1575 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1576 event_sched_out(event, cpuctx, ctx);
1577
1578 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1579 cpuctx->exclusive = 0;
1580 }
1581
1582 struct remove_event {
1583 struct perf_event *event;
1584 bool detach_group;
1585 };
1586
1587 /*
1588 * Cross CPU call to remove a performance event
1589 *
1590 * We disable the event on the hardware level first. After that we
1591 * remove it from the context list.
1592 */
1593 static int __perf_remove_from_context(void *info)
1594 {
1595 struct remove_event *re = info;
1596 struct perf_event *event = re->event;
1597 struct perf_event_context *ctx = event->ctx;
1598 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1599
1600 raw_spin_lock(&ctx->lock);
1601 event_sched_out(event, cpuctx, ctx);
1602 if (re->detach_group)
1603 perf_group_detach(event);
1604 list_del_event(event, ctx);
1605 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1606 ctx->is_active = 0;
1607 cpuctx->task_ctx = NULL;
1608 }
1609 raw_spin_unlock(&ctx->lock);
1610
1611 return 0;
1612 }
1613
1614
1615 /*
1616 * Remove the event from a task's (or a CPU's) list of events.
1617 *
1618 * CPU events are removed with a smp call. For task events we only
1619 * call when the task is on a CPU.
1620 *
1621 * If event->ctx is a cloned context, callers must make sure that
1622 * every task struct that event->ctx->task could possibly point to
1623 * remains valid. This is OK when called from perf_release since
1624 * that only calls us on the top-level context, which can't be a clone.
1625 * When called from perf_event_exit_task, it's OK because the
1626 * context has been detached from its task.
1627 */
1628 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1629 {
1630 struct perf_event_context *ctx = event->ctx;
1631 struct task_struct *task = ctx->task;
1632 struct remove_event re = {
1633 .event = event,
1634 .detach_group = detach_group,
1635 };
1636
1637 lockdep_assert_held(&ctx->mutex);
1638
1639 if (!task) {
1640 /*
1641 * Per cpu events are removed via an smp call. The removal can
1642 * fail if the CPU is currently offline, but in that case we
1643 * already called __perf_remove_from_context from
1644 * perf_event_exit_cpu.
1645 */
1646 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1647 return;
1648 }
1649
1650 retry:
1651 if (!task_function_call(task, __perf_remove_from_context, &re))
1652 return;
1653
1654 raw_spin_lock_irq(&ctx->lock);
1655 /*
1656 * If we failed to find a running task, but find the context active now
1657 * that we've acquired the ctx->lock, retry.
1658 */
1659 if (ctx->is_active) {
1660 raw_spin_unlock_irq(&ctx->lock);
1661 /*
1662 * Reload the task pointer, it might have been changed by
1663 * a concurrent perf_event_context_sched_out().
1664 */
1665 task = ctx->task;
1666 goto retry;
1667 }
1668
1669 /*
1670 * Since the task isn't running, it's safe to remove the event; us
1671 * holding the ctx->lock ensures the task won't get scheduled in.
1672 */
1673 if (detach_group)
1674 perf_group_detach(event);
1675 list_del_event(event, ctx);
1676 raw_spin_unlock_irq(&ctx->lock);
1677 }
1678
1679 /*
1680 * Cross CPU call to disable a performance event
1681 */
1682 int __perf_event_disable(void *info)
1683 {
1684 struct perf_event *event = info;
1685 struct perf_event_context *ctx = event->ctx;
1686 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1687
1688 /*
1689 * If this is a per-task event, need to check whether this
1690 * event's task is the current task on this cpu.
1691 *
1692 * Can trigger due to concurrent perf_event_context_sched_out()
1693 * flipping contexts around.
1694 */
1695 if (ctx->task && cpuctx->task_ctx != ctx)
1696 return -EINVAL;
1697
1698 raw_spin_lock(&ctx->lock);
1699
1700 /*
1701 * If the event is on, turn it off.
1702 * If it is in error state, leave it in error state.
1703 */
1704 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1705 update_context_time(ctx);
1706 update_cgrp_time_from_event(event);
1707 update_group_times(event);
1708 if (event == event->group_leader)
1709 group_sched_out(event, cpuctx, ctx);
1710 else
1711 event_sched_out(event, cpuctx, ctx);
1712 event->state = PERF_EVENT_STATE_OFF;
1713 }
1714
1715 raw_spin_unlock(&ctx->lock);
1716
1717 return 0;
1718 }
1719
1720 /*
1721 * Disable an event.
1722 *
1723 * If event->ctx is a cloned context, callers must make sure that
1724 * every task struct that event->ctx->task could possibly point to
1725 * remains valid. This condition is satisfied when called through
1726 * perf_event_for_each_child or perf_event_for_each because they
1727 * hold the top-level event's child_mutex, so any descendant that
1728 * goes to exit will block in sync_child_event.
1729 * When called from perf_pending_event it's OK because event->ctx
1730 * is the current context on this CPU and preemption is disabled,
1731 * hence we can't get into perf_event_task_sched_out for this context.
1732 */
1733 static void _perf_event_disable(struct perf_event *event)
1734 {
1735 struct perf_event_context *ctx = event->ctx;
1736 struct task_struct *task = ctx->task;
1737
1738 if (!task) {
1739 /*
1740 * Disable the event on the cpu that it's on
1741 */
1742 cpu_function_call(event->cpu, __perf_event_disable, event);
1743 return;
1744 }
1745
1746 retry:
1747 if (!task_function_call(task, __perf_event_disable, event))
1748 return;
1749
1750 raw_spin_lock_irq(&ctx->lock);
1751 /*
1752 * If the event is still active, we need to retry the cross-call.
1753 */
1754 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1755 raw_spin_unlock_irq(&ctx->lock);
1756 /*
1757 * Reload the task pointer, it might have been changed by
1758 * a concurrent perf_event_context_sched_out().
1759 */
1760 task = ctx->task;
1761 goto retry;
1762 }
1763
1764 /*
1765 * Since we have the lock this context can't be scheduled
1766 * in, so we can change the state safely.
1767 */
1768 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1769 update_group_times(event);
1770 event->state = PERF_EVENT_STATE_OFF;
1771 }
1772 raw_spin_unlock_irq(&ctx->lock);
1773 }
1774
1775 /*
1776 * Strictly speaking kernel users cannot create groups and therefore this
1777 * interface does not need the perf_event_ctx_lock() magic.
1778 */
1779 void perf_event_disable(struct perf_event *event)
1780 {
1781 struct perf_event_context *ctx;
1782
1783 ctx = perf_event_ctx_lock(event);
1784 _perf_event_disable(event);
1785 perf_event_ctx_unlock(event, ctx);
1786 }
1787 EXPORT_SYMBOL_GPL(perf_event_disable);
1788
1789 static void perf_set_shadow_time(struct perf_event *event,
1790 struct perf_event_context *ctx,
1791 u64 tstamp)
1792 {
1793 /*
1794 * use the correct time source for the time snapshot
1795 *
1796 * We could get by without this by leveraging the
1797 * fact that to get to this function, the caller
1798 * has most likely already called update_context_time()
1799 * and update_cgrp_time_xx() and thus both timestamps
1800 * are identical (or very close). Given that tstamp is
1801 * already adjusted for cgroup, we could say that:
1802 * tstamp - ctx->timestamp
1803 * is equivalent to
1804 * tstamp - cgrp->timestamp.
1805 *
1806 * Then, in perf_output_read(), the calculation would
1807 * work with no changes because:
1808 * - event is guaranteed scheduled in
1809 * - no scheduled out in between
1810 * - thus the timestamp would be the same
1811 *
1812 * But this is a bit hairy.
1813 *
1814 * So instead, we have an explicit cgroup call to remain
1815 * within the time source all along. We believe it
1816 * is cleaner and simpler to understand.
1817 */
1818 if (is_cgroup_event(event))
1819 perf_cgroup_set_shadow_time(event, tstamp);
1820 else
1821 event->shadow_ctx_time = tstamp - ctx->timestamp;
1822 }
1823
1824 #define MAX_INTERRUPTS (~0ULL)
1825
1826 static void perf_log_throttle(struct perf_event *event, int enable);
1827
1828 static int
1829 event_sched_in(struct perf_event *event,
1830 struct perf_cpu_context *cpuctx,
1831 struct perf_event_context *ctx)
1832 {
1833 u64 tstamp = perf_event_time(event);
1834 int ret = 0;
1835
1836 lockdep_assert_held(&ctx->lock);
1837
1838 if (event->state <= PERF_EVENT_STATE_OFF)
1839 return 0;
1840
1841 event->state = PERF_EVENT_STATE_ACTIVE;
1842 event->oncpu = smp_processor_id();
1843
1844 /*
1845 * Unthrottle events, since we scheduled we might have missed several
1846 * ticks already, also for a heavily scheduling task there is little
1847 * guarantee it'll get a tick in a timely manner.
1848 */
1849 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1850 perf_log_throttle(event, 1);
1851 event->hw.interrupts = 0;
1852 }
1853
1854 /*
1855 * The new state must be visible before we turn it on in the hardware:
1856 */
1857 smp_wmb();
1858
1859 perf_pmu_disable(event->pmu);
1860
1861 if (event->pmu->add(event, PERF_EF_START)) {
1862 event->state = PERF_EVENT_STATE_INACTIVE;
1863 event->oncpu = -1;
1864 ret = -EAGAIN;
1865 goto out;
1866 }
1867
1868 event->tstamp_running += tstamp - event->tstamp_stopped;
1869
1870 perf_set_shadow_time(event, ctx, tstamp);
1871
1872 if (!is_software_event(event))
1873 cpuctx->active_oncpu++;
1874 ctx->nr_active++;
1875 if (event->attr.freq && event->attr.sample_freq)
1876 ctx->nr_freq++;
1877
1878 if (event->attr.exclusive)
1879 cpuctx->exclusive = 1;
1880
1881 if (is_orphaned_child(event))
1882 schedule_orphans_remove(ctx);
1883
1884 out:
1885 perf_pmu_enable(event->pmu);
1886
1887 return ret;
1888 }
1889
1890 static int
1891 group_sched_in(struct perf_event *group_event,
1892 struct perf_cpu_context *cpuctx,
1893 struct perf_event_context *ctx)
1894 {
1895 struct perf_event *event, *partial_group = NULL;
1896 struct pmu *pmu = ctx->pmu;
1897 u64 now = ctx->time;
1898 bool simulate = false;
1899
1900 if (group_event->state == PERF_EVENT_STATE_OFF)
1901 return 0;
1902
1903 pmu->start_txn(pmu);
1904
1905 if (event_sched_in(group_event, cpuctx, ctx)) {
1906 pmu->cancel_txn(pmu);
1907 perf_cpu_hrtimer_restart(cpuctx);
1908 return -EAGAIN;
1909 }
1910
1911 /*
1912 * Schedule in siblings as one group (if any):
1913 */
1914 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1915 if (event_sched_in(event, cpuctx, ctx)) {
1916 partial_group = event;
1917 goto group_error;
1918 }
1919 }
1920
1921 if (!pmu->commit_txn(pmu))
1922 return 0;
1923
1924 group_error:
1925 /*
1926 * Groups can be scheduled in as one unit only, so undo any
1927 * partial group before returning:
1928 * The events up to the failed event are scheduled out normally,
1929 * tstamp_stopped will be updated.
1930 *
1931 * The failed events and the remaining siblings need to have
1932 * their timings updated as if they had gone thru event_sched_in()
1933 * and event_sched_out(). This is required to get consistent timings
1934 * across the group. This also takes care of the case where the group
1935 * could never be scheduled by ensuring tstamp_stopped is set to mark
1936 * the time the event was actually stopped, such that time delta
1937 * calculation in update_event_times() is correct.
1938 */
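/*
 * Concretely (sketch): for siblings {A, B, C} where B is the event that
 * failed, A is scheduled out normally below, B and C only have their
 * tstamp_running/tstamp_stopped advanced to @now, and the group leader
 * is scheduled out last.
 */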
1939 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1940 if (event == partial_group)
1941 simulate = true;
1942
1943 if (simulate) {
1944 event->tstamp_running += now - event->tstamp_stopped;
1945 event->tstamp_stopped = now;
1946 } else {
1947 event_sched_out(event, cpuctx, ctx);
1948 }
1949 }
1950 event_sched_out(group_event, cpuctx, ctx);
1951
1952 pmu->cancel_txn(pmu);
1953
1954 perf_cpu_hrtimer_restart(cpuctx);
1955
1956 return -EAGAIN;
1957 }
1958
1959 /*
1960 * Work out whether we can put this event group on the CPU now.
1961 */
1962 static int group_can_go_on(struct perf_event *event,
1963 struct perf_cpu_context *cpuctx,
1964 int can_add_hw)
1965 {
1966 /*
1967 * Groups consisting entirely of software events can always go on.
1968 */
1969 if (event->group_flags & PERF_GROUP_SOFTWARE)
1970 return 1;
1971 /*
1972 * If an exclusive group is already on, no other hardware
1973 * events can go on.
1974 */
1975 if (cpuctx->exclusive)
1976 return 0;
1977 /*
1978 * If this group is exclusive and there are already
1979 * events on the CPU, it can't go on.
1980 */
1981 if (event->attr.exclusive && cpuctx->active_oncpu)
1982 return 0;
1983 /*
1984 * Otherwise, try to add it if all previous groups were able
1985 * to go on.
1986 */
1987 return can_add_hw;
1988 }
1989
1990 static void add_event_to_ctx(struct perf_event *event,
1991 struct perf_event_context *ctx)
1992 {
1993 u64 tstamp = perf_event_time(event);
1994
1995 list_add_event(event, ctx);
1996 perf_group_attach(event);
1997 event->tstamp_enabled = tstamp;
1998 event->tstamp_running = tstamp;
1999 event->tstamp_stopped = tstamp;
2000 }
2001
2002 static void task_ctx_sched_out(struct perf_event_context *ctx);
2003 static void
2004 ctx_sched_in(struct perf_event_context *ctx,
2005 struct perf_cpu_context *cpuctx,
2006 enum event_type_t event_type,
2007 struct task_struct *task);
2008
2009 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
2010 struct perf_event_context *ctx,
2011 struct task_struct *task)
2012 {
2013 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
2014 if (ctx)
2015 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
2016 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
2017 if (ctx)
2018 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
2019 }
2020
2021 /*
2022 * Cross CPU call to install and enable a performance event
2023 *
2024 * Must be called with ctx->mutex held
2025 */
2026 static int __perf_install_in_context(void *info)
2027 {
2028 struct perf_event *event = info;
2029 struct perf_event_context *ctx = event->ctx;
2030 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2031 struct perf_event_context *task_ctx = cpuctx->task_ctx;
2032 struct task_struct *task = current;
2033
2034 perf_ctx_lock(cpuctx, task_ctx);
2035 perf_pmu_disable(cpuctx->ctx.pmu);
2036
2037 /*
2038 * If there was an active task_ctx schedule it out.
2039 */
2040 if (task_ctx)
2041 task_ctx_sched_out(task_ctx);
2042
2043 /*
2044 * If the context we're installing events in is not the
2045 * active task_ctx, flip them.
2046 */
2047 if (ctx->task && task_ctx != ctx) {
2048 if (task_ctx)
2049 raw_spin_unlock(&task_ctx->lock);
2050 raw_spin_lock(&ctx->lock);
2051 task_ctx = ctx;
2052 }
2053
2054 if (task_ctx) {
2055 cpuctx->task_ctx = task_ctx;
2056 task = task_ctx->task;
2057 }
2058
2059 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
2060
2061 update_context_time(ctx);
2062 /*
2063 * update cgrp time only if current cgrp
2064 * matches event->cgrp. Must be done before
2065 * calling add_event_to_ctx()
2066 */
2067 update_cgrp_time_from_event(event);
2068
2069 add_event_to_ctx(event, ctx);
2070
2071 /*
2072 * Schedule everything back in
2073 */
2074 perf_event_sched_in(cpuctx, task_ctx, task);
2075
2076 perf_pmu_enable(cpuctx->ctx.pmu);
2077 perf_ctx_unlock(cpuctx, task_ctx);
2078
2079 return 0;
2080 }
2081
2082 /*
2083 * Attach a performance event to a context
2084 *
2085 * First we add the event to the list with the hardware enable bit
2086 * in event->hw_config cleared.
2087 *
2088 * If the event is attached to a task which is on a CPU we use a smp
2089 * call to enable it in the task context. The task might have been
2090 * scheduled away, but we check this in the smp call again.
2091 */
2092 static void
2093 perf_install_in_context(struct perf_event_context *ctx,
2094 struct perf_event *event,
2095 int cpu)
2096 {
2097 struct task_struct *task = ctx->task;
2098
2099 lockdep_assert_held(&ctx->mutex);
2100
2101 event->ctx = ctx;
2102 if (event->cpu != -1)
2103 event->cpu = cpu;
2104
2105 if (!task) {
2106 /*
2107 * Per cpu events are installed via an smp call and
2108 * the install is always successful.
2109 */
2110 cpu_function_call(cpu, __perf_install_in_context, event);
2111 return;
2112 }
2113
2114 retry:
2115 if (!task_function_call(task, __perf_install_in_context, event))
2116 return;
2117
2118 raw_spin_lock_irq(&ctx->lock);
2119 /*
2120 * If we failed to find a running task, but find the context active now
2121 * that we've acquired the ctx->lock, retry.
2122 */
2123 if (ctx->is_active) {
2124 raw_spin_unlock_irq(&ctx->lock);
2125 /*
2126 * Reload the task pointer, it might have been changed by
2127 * a concurrent perf_event_context_sched_out().
2128 */
2129 task = ctx->task;
2130 goto retry;
2131 }
2132
2133 /*
2134 * Since the task isn't running, it's safe to add the event; holding
2135 * the ctx->lock ensures the task won't get scheduled in.
2136 */
2137 add_event_to_ctx(event, ctx);
2138 raw_spin_unlock_irq(&ctx->lock);
2139 }
2140
2141 /*
2142 * Put an event into inactive state and update time fields.
2143 * Enabling the leader of a group effectively enables all
2144 * the group members that aren't explicitly disabled, so we
2145 * have to update their ->tstamp_enabled also.
2146 * Note: this works for group members as well as group leaders
2147 * since the non-leader members' sibling_lists will be empty.
2148 */
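/*
 * Worked example (sketch): update_event_times() roughly computes
 * total_time_enabled as "now - tstamp_enabled" while the event remains
 * enabled. If an event had accumulated 5ms of enabled time before being
 * turned off and is re-enabled at tstamp = 100ms, tstamp_enabled becomes
 * 100ms - 5ms = 95ms, so a read at 110ms reports 15ms enabled, i.e. the
 * previously accumulated time is preserved.
 */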
2149 static void __perf_event_mark_enabled(struct perf_event *event)
2150 {
2151 struct perf_event *sub;
2152 u64 tstamp = perf_event_time(event);
2153
2154 event->state = PERF_EVENT_STATE_INACTIVE;
2155 event->tstamp_enabled = tstamp - event->total_time_enabled;
2156 list_for_each_entry(sub, &event->sibling_list, group_entry) {
2157 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
2158 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
2159 }
2160 }
2161
2162 /*
2163 * Cross CPU call to enable a performance event
2164 */
2165 static int __perf_event_enable(void *info)
2166 {
2167 struct perf_event *event = info;
2168 struct perf_event_context *ctx = event->ctx;
2169 struct perf_event *leader = event->group_leader;
2170 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2171 int err;
2172
2173 /*
2174 * There's a time window between 'ctx->is_active' check
2175 * in perf_event_enable function and this place having:
2176 * - IRQs on
2177 * - ctx->lock unlocked
2178 *
2179 * where the task could be killed and 'ctx' deactivated
2180 * by perf_event_exit_task.
2181 */
2182 if (!ctx->is_active)
2183 return -EINVAL;
2184
2185 raw_spin_lock(&ctx->lock);
2186 update_context_time(ctx);
2187
2188 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2189 goto unlock;
2190
2191 /*
2192 * set current task's cgroup time reference point
2193 */
2194 perf_cgroup_set_timestamp(current, ctx);
2195
2196 __perf_event_mark_enabled(event);
2197
2198 if (!event_filter_match(event)) {
2199 if (is_cgroup_event(event))
2200 perf_cgroup_defer_enabled(event);
2201 goto unlock;
2202 }
2203
2204 /*
2205 * If the event is in a group and isn't the group leader,
2206 * then don't put it on unless the group is on.
2207 */
2208 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
2209 goto unlock;
2210
2211 if (!group_can_go_on(event, cpuctx, 1)) {
2212 err = -EEXIST;
2213 } else {
2214 if (event == leader)
2215 err = group_sched_in(event, cpuctx, ctx);
2216 else
2217 err = event_sched_in(event, cpuctx, ctx);
2218 }
2219
2220 if (err) {
2221 /*
2222 * If this event can't go on and it's part of a
2223 * group, then the whole group has to come off.
2224 */
2225 if (leader != event) {
2226 group_sched_out(leader, cpuctx, ctx);
2227 perf_cpu_hrtimer_restart(cpuctx);
2228 }
2229 if (leader->attr.pinned) {
2230 update_group_times(leader);
2231 leader->state = PERF_EVENT_STATE_ERROR;
2232 }
2233 }
2234
2235 unlock:
2236 raw_spin_unlock(&ctx->lock);
2237
2238 return 0;
2239 }
2240
2241 /*
2242 * Enable an event.
2243 *
2244 * If event->ctx is a cloned context, callers must make sure that
2245 * every task struct that event->ctx->task could possibly point to
2246 * remains valid. This condition is satisfied when called through
2247 * perf_event_for_each_child or perf_event_for_each as described
2248 * for perf_event_disable.
2249 */
2250 static void _perf_event_enable(struct perf_event *event)
2251 {
2252 struct perf_event_context *ctx = event->ctx;
2253 struct task_struct *task = ctx->task;
2254
2255 if (!task) {
2256 /*
2257 * Enable the event on the cpu that it's on
2258 */
2259 cpu_function_call(event->cpu, __perf_event_enable, event);
2260 return;
2261 }
2262
2263 raw_spin_lock_irq(&ctx->lock);
2264 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2265 goto out;
2266
2267 /*
2268 * If the event is in error state, clear that first.
2269 * That way, if we see the event in error state below, we
2270 * know that it has gone back into error state, as distinct
2271 * from the task having been scheduled away before the
2272 * cross-call arrived.
2273 */
2274 if (event->state == PERF_EVENT_STATE_ERROR)
2275 event->state = PERF_EVENT_STATE_OFF;
2276
2277 retry:
2278 if (!ctx->is_active) {
2279 __perf_event_mark_enabled(event);
2280 goto out;
2281 }
2282
2283 raw_spin_unlock_irq(&ctx->lock);
2284
2285 if (!task_function_call(task, __perf_event_enable, event))
2286 return;
2287
2288 raw_spin_lock_irq(&ctx->lock);
2289
2290 /*
2291 * If the context is active and the event is still off,
2292 * we need to retry the cross-call.
2293 */
2294 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2295 /*
2296 * task could have been flipped by a concurrent
2297 * perf_event_context_sched_out()
2298 */
2299 task = ctx->task;
2300 goto retry;
2301 }
2302
2303 out:
2304 raw_spin_unlock_irq(&ctx->lock);
2305 }
2306
2307 /*
2308 * See perf_event_disable();
2309 */
2310 void perf_event_enable(struct perf_event *event)
2311 {
2312 struct perf_event_context *ctx;
2313
2314 ctx = perf_event_ctx_lock(event);
2315 _perf_event_enable(event);
2316 perf_event_ctx_unlock(event, ctx);
2317 }
2318 EXPORT_SYMBOL_GPL(perf_event_enable);
2319
2320 static int _perf_event_refresh(struct perf_event *event, int refresh)
2321 {
2322 /*
2323 * not supported on inherited events
2324 */
2325 if (event->attr.inherit || !is_sampling_event(event))
2326 return -EINVAL;
2327
2328 atomic_add(refresh, &event->event_limit);
2329 _perf_event_enable(event);
2330
2331 return 0;
2332 }
2333
2334 /*
2335 * See perf_event_disable()
2336 */
2337 int perf_event_refresh(struct perf_event *event, int refresh)
2338 {
2339 struct perf_event_context *ctx;
2340 int ret;
2341
2342 ctx = perf_event_ctx_lock(event);
2343 ret = _perf_event_refresh(event, refresh);
2344 perf_event_ctx_unlock(event, ctx);
2345
2346 return ret;
2347 }
2348 EXPORT_SYMBOL_GPL(perf_event_refresh);
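/*
 * Usage sketch (hypothetical caller): perf_event_refresh(event, 1) adds one
 * to event->event_limit and re-enables the event, re-arming a sampling
 * event for at least one more overflow before the overflow path disables
 * it again. User space normally reaches this via the
 * PERF_EVENT_IOC_REFRESH ioctl, which ends up in _perf_event_refresh()
 * with the ctx lock held.
 */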
2349
2350 static void ctx_sched_out(struct perf_event_context *ctx,
2351 struct perf_cpu_context *cpuctx,
2352 enum event_type_t event_type)
2353 {
2354 struct perf_event *event;
2355 int is_active = ctx->is_active;
2356
2357 ctx->is_active &= ~event_type;
2358 if (likely(!ctx->nr_events))
2359 return;
2360
2361 update_context_time(ctx);
2362 update_cgrp_time_from_cpuctx(cpuctx);
2363 if (!ctx->nr_active)
2364 return;
2365
2366 perf_pmu_disable(ctx->pmu);
2367 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2368 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2369 group_sched_out(event, cpuctx, ctx);
2370 }
2371
2372 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2373 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2374 group_sched_out(event, cpuctx, ctx);
2375 }
2376 perf_pmu_enable(ctx->pmu);
2377 }
2378
2379 /*
2380 * Test whether two contexts are equivalent, i.e. whether they have both been
2381 * cloned from the same version of the same context.
2382 *
2383 * Equivalence is measured using a generation number in the context that is
2384 * incremented on each modification to it; see unclone_ctx(), list_add_event()
2385 * and list_del_event().
2386 */
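/*
 * For example (sketch): a context cloned from @parent records
 * parent_gen = parent->generation at clone time. Any later
 * list_add_event()/list_del_event() on the parent bumps
 * parent->generation, so the comparisons below fail and the cheap
 * context-swap path in perf_event_context_sched_out() is skipped.
 */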
2387 static int context_equiv(struct perf_event_context *ctx1,
2388 struct perf_event_context *ctx2)
2389 {
2390 lockdep_assert_held(&ctx1->lock);
2391 lockdep_assert_held(&ctx2->lock);
2392
2393 /* Pinning disables the swap optimization */
2394 if (ctx1->pin_count || ctx2->pin_count)
2395 return 0;
2396
2397 /* If ctx1 is the parent of ctx2 */
2398 if (ctx1 == ctx2->parent_ctx && ctx1->generation == ctx2->parent_gen)
2399 return 1;
2400
2401 /* If ctx2 is the parent of ctx1 */
2402 if (ctx1->parent_ctx == ctx2 && ctx1->parent_gen == ctx2->generation)
2403 return 1;
2404
2405 /*
2406 * If ctx1 and ctx2 have the same parent, we flatten the parent
2407 * hierarchy, see perf_event_init_context().
2408 */
2409 if (ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx &&
2410 ctx1->parent_gen == ctx2->parent_gen)
2411 return 1;
2412
2413 /* Unmatched */
2414 return 0;
2415 }
2416
2417 static void __perf_event_sync_stat(struct perf_event *event,
2418 struct perf_event *next_event)
2419 {
2420 u64 value;
2421
2422 if (!event->attr.inherit_stat)
2423 return;
2424
2425 /*
2426 * Update the event value, we cannot use perf_event_read()
2427 * because we're in the middle of a context switch and have IRQs
2428 * disabled, which upsets smp_call_function_single(), however
2429 * we know the event must be on the current CPU, therefore we
2430 * don't need to use it.
2431 */
2432 switch (event->state) {
2433 case PERF_EVENT_STATE_ACTIVE:
2434 event->pmu->read(event);
2435 /* fall-through */
2436
2437 case PERF_EVENT_STATE_INACTIVE:
2438 update_event_times(event);
2439 break;
2440
2441 default:
2442 break;
2443 }
2444
2445 /*
2446 * In order to keep per-task stats reliable we need to flip the event
2447 * values when we flip the contexts.
2448 */
2449 value = local64_read(&next_event->count);
2450 value = local64_xchg(&event->count, value);
2451 local64_set(&next_event->count, value);
2452
2453 swap(event->total_time_enabled, next_event->total_time_enabled);
2454 swap(event->total_time_running, next_event->total_time_running);
2455
2456 /*
2457 * Since we swizzled the values, update the user visible data too.
2458 */
2459 perf_event_update_userpage(event);
2460 perf_event_update_userpage(next_event);
2461 }
2462
2463 static void perf_event_sync_stat(struct perf_event_context *ctx,
2464 struct perf_event_context *next_ctx)
2465 {
2466 struct perf_event *event, *next_event;
2467
2468 if (!ctx->nr_stat)
2469 return;
2470
2471 update_context_time(ctx);
2472
2473 event = list_first_entry(&ctx->event_list,
2474 struct perf_event, event_entry);
2475
2476 next_event = list_first_entry(&next_ctx->event_list,
2477 struct perf_event, event_entry);
2478
2479 while (&event->event_entry != &ctx->event_list &&
2480 &next_event->event_entry != &next_ctx->event_list) {
2481
2482 __perf_event_sync_stat(event, next_event);
2483
2484 event = list_next_entry(event, event_entry);
2485 next_event = list_next_entry(next_event, event_entry);
2486 }
2487 }
2488
2489 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2490 struct task_struct *next)
2491 {
2492 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2493 struct perf_event_context *next_ctx;
2494 struct perf_event_context *parent, *next_parent;
2495 struct perf_cpu_context *cpuctx;
2496 int do_switch = 1;
2497
2498 if (likely(!ctx))
2499 return;
2500
2501 cpuctx = __get_cpu_context(ctx);
2502 if (!cpuctx->task_ctx)
2503 return;
2504
2505 rcu_read_lock();
2506 next_ctx = next->perf_event_ctxp[ctxn];
2507 if (!next_ctx)
2508 goto unlock;
2509
2510 parent = rcu_dereference(ctx->parent_ctx);
2511 next_parent = rcu_dereference(next_ctx->parent_ctx);
2512
2513 /* If neither context has a parent context, they cannot be clones. */
2514 if (!parent && !next_parent)
2515 goto unlock;
2516
2517 if (next_parent == ctx || next_ctx == parent || next_parent == parent) {
2518 /*
2519 * Looks like the two contexts are clones, so we might be
2520 * able to optimize the context switch. We lock both
2521 * contexts and check that they are clones under the
2522 * lock (including re-checking that neither has been
2523 * uncloned in the meantime). It doesn't matter which
2524 * order we take the locks because no other cpu could
2525 * be trying to lock both of these tasks.
2526 */
2527 raw_spin_lock(&ctx->lock);
2528 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2529 if (context_equiv(ctx, next_ctx)) {
2530 /*
2531 * XXX do we need a memory barrier of sorts
2532 * wrt to rcu_dereference() of perf_event_ctxp
2533 */
2534 task->perf_event_ctxp[ctxn] = next_ctx;
2535 next->perf_event_ctxp[ctxn] = ctx;
2536 ctx->task = next;
2537 next_ctx->task = task;
2538 do_switch = 0;
2539
2540 perf_event_sync_stat(ctx, next_ctx);
2541 }
2542 raw_spin_unlock(&next_ctx->lock);
2543 raw_spin_unlock(&ctx->lock);
2544 }
2545 unlock:
2546 rcu_read_unlock();
2547
2548 if (do_switch) {
2549 raw_spin_lock(&ctx->lock);
2550 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2551 cpuctx->task_ctx = NULL;
2552 raw_spin_unlock(&ctx->lock);
2553 }
2554 }
2555
2556 #define for_each_task_context_nr(ctxn) \
2557 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2558
2559 /*
2560 * Called from scheduler to remove the events of the current task,
2561 * with interrupts disabled.
2562 *
2563 * We stop each event and update the event value in event->count.
2564 *
2565 * This does not protect us against NMI, but disable()
2566 * sets the disabled bit in the control field of event _before_
2567 * accessing the event control register. If an NMI hits, then it will
2568 * not restart the event.
2569 */
2570 void __perf_event_task_sched_out(struct task_struct *task,
2571 struct task_struct *next)
2572 {
2573 int ctxn;
2574
2575 for_each_task_context_nr(ctxn)
2576 perf_event_context_sched_out(task, ctxn, next);
2577
2578 /*
2579 * if cgroup events exist on this CPU, then we need
2580 * to check if we have to switch out PMU state.
2581 * cgroup events are system-wide mode only
2582 */
2583 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2584 perf_cgroup_sched_out(task, next);
2585 }
2586
2587 static void task_ctx_sched_out(struct perf_event_context *ctx)
2588 {
2589 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2590
2591 if (!cpuctx->task_ctx)
2592 return;
2593
2594 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2595 return;
2596
2597 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2598 cpuctx->task_ctx = NULL;
2599 }
2600
2601 /*
2602 * Called with IRQs disabled
2603 */
2604 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2605 enum event_type_t event_type)
2606 {
2607 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2608 }
2609
2610 static void
2611 ctx_pinned_sched_in(struct perf_event_context *ctx,
2612 struct perf_cpu_context *cpuctx)
2613 {
2614 struct perf_event *event;
2615
2616 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2617 if (event->state <= PERF_EVENT_STATE_OFF)
2618 continue;
2619 if (!event_filter_match(event))
2620 continue;
2621
2622 /* may need to reset tstamp_enabled */
2623 if (is_cgroup_event(event))
2624 perf_cgroup_mark_enabled(event, ctx);
2625
2626 if (group_can_go_on(event, cpuctx, 1))
2627 group_sched_in(event, cpuctx, ctx);
2628
2629 /*
2630 * If this pinned group hasn't been scheduled,
2631 * put it in error state.
2632 */
2633 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2634 update_group_times(event);
2635 event->state = PERF_EVENT_STATE_ERROR;
2636 }
2637 }
2638 }
2639
2640 static void
2641 ctx_flexible_sched_in(struct perf_event_context *ctx,
2642 struct perf_cpu_context *cpuctx)
2643 {
2644 struct perf_event *event;
2645 int can_add_hw = 1;
2646
2647 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2648 /* Ignore events in OFF or ERROR state */
2649 if (event->state <= PERF_EVENT_STATE_OFF)
2650 continue;
2651 /*
2652 * Listen to the 'cpu' scheduling filter constraint
2653 * of events:
2654 */
2655 if (!event_filter_match(event))
2656 continue;
2657
2658 /* may need to reset tstamp_enabled */
2659 if (is_cgroup_event(event))
2660 perf_cgroup_mark_enabled(event, ctx);
2661
2662 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2663 if (group_sched_in(event, cpuctx, ctx))
2664 can_add_hw = 0;
2665 }
2666 }
2667 }
2668
2669 static void
2670 ctx_sched_in(struct perf_event_context *ctx,
2671 struct perf_cpu_context *cpuctx,
2672 enum event_type_t event_type,
2673 struct task_struct *task)
2674 {
2675 u64 now;
2676 int is_active = ctx->is_active;
2677
2678 ctx->is_active |= event_type;
2679 if (likely(!ctx->nr_events))
2680 return;
2681
2682 now = perf_clock();
2683 ctx->timestamp = now;
2684 perf_cgroup_set_timestamp(task, ctx);
2685 /*
2686 * First go through the list and put on any pinned groups
2687 * in order to give them the best chance of going on.
2688 */
2689 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2690 ctx_pinned_sched_in(ctx, cpuctx);
2691
2692 /* Then walk through the lower prio flexible groups */
2693 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2694 ctx_flexible_sched_in(ctx, cpuctx);
2695 }
2696
2697 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2698 enum event_type_t event_type,
2699 struct task_struct *task)
2700 {
2701 struct perf_event_context *ctx = &cpuctx->ctx;
2702
2703 ctx_sched_in(ctx, cpuctx, event_type, task);
2704 }
2705
2706 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2707 struct task_struct *task)
2708 {
2709 struct perf_cpu_context *cpuctx;
2710
2711 cpuctx = __get_cpu_context(ctx);
2712 if (cpuctx->task_ctx == ctx)
2713 return;
2714
2715 perf_ctx_lock(cpuctx, ctx);
2716 perf_pmu_disable(ctx->pmu);
2717 /*
2718 * We want to keep the following priority order:
2719 * cpu pinned (that don't need to move), task pinned,
2720 * cpu flexible, task flexible.
2721 */
2722 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2723
2724 if (ctx->nr_events)
2725 cpuctx->task_ctx = ctx;
2726
2727 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2728
2729 perf_pmu_enable(ctx->pmu);
2730 perf_ctx_unlock(cpuctx, ctx);
2731
2732 /*
2733 * Since these rotations are per-cpu, we need to ensure the
2734 * cpu-context we got scheduled on is actually rotating.
2735 */
2736 perf_pmu_rotate_start(ctx->pmu);
2737 }
2738
2739 /*
2740 * When sampling the branch stack in system-wide mode, it may be necessary
2741 * to flush the stack on context switch. This happens when the branch
2742 * stack does not tag its entries with the pid of the current task.
2743 * Otherwise it becomes impossible to associate a branch entry with a
2744 * task. This ambiguity is more likely to appear when the branch stack
2745 * supports priv level filtering and the user sets it to monitor only
2746 * at the user level (which could be a useful measurement in system-wide
2747 * mode). In that case, the risk is high of having a branch stack with
2748 * branch from multiple tasks. Flushing may mean dropping the existing
2749 * entries or stashing them somewhere in the PMU specific code layer.
2750 *
2751 * This function provides the context switch callback to the lower code
2752 * layer. It is invoked ONLY when there is at least one system-wide context
2753 * with at least one active event using taken branch sampling.
2754 */
2755 static void perf_branch_stack_sched_in(struct task_struct *prev,
2756 struct task_struct *task)
2757 {
2758 struct perf_cpu_context *cpuctx;
2759 struct pmu *pmu;
2760 unsigned long flags;
2761
2762 /* no need to flush branch stack if not changing task */
2763 if (prev == task)
2764 return;
2765
2766 local_irq_save(flags);
2767
2768 rcu_read_lock();
2769
2770 list_for_each_entry_rcu(pmu, &pmus, entry) {
2771 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2772
2773 /*
2774 * check if the context has at least one
2775 * event using PERF_SAMPLE_BRANCH_STACK
2776 */
2777 if (cpuctx->ctx.nr_branch_stack > 0
2778 && pmu->flush_branch_stack) {
2779
2780 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2781
2782 perf_pmu_disable(pmu);
2783
2784 pmu->flush_branch_stack();
2785
2786 perf_pmu_enable(pmu);
2787
2788 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2789 }
2790 }
2791
2792 rcu_read_unlock();
2793
2794 local_irq_restore(flags);
2795 }
2796
2797 /*
2798 * Called from scheduler to add the events of the current task
2799 * with interrupts disabled.
2800 *
2801 * We restore the event value and then enable it.
2802 *
2803 * This does not protect us against NMI, but enable()
2804 * sets the enabled bit in the control field of event _before_
2805 * accessing the event control register. If an NMI hits, then it will
2806 * keep the event running.
2807 */
2808 void __perf_event_task_sched_in(struct task_struct *prev,
2809 struct task_struct *task)
2810 {
2811 struct perf_event_context *ctx;
2812 int ctxn;
2813
2814 for_each_task_context_nr(ctxn) {
2815 ctx = task->perf_event_ctxp[ctxn];
2816 if (likely(!ctx))
2817 continue;
2818
2819 perf_event_context_sched_in(ctx, task);
2820 }
2821 /*
2822 * if cgroup events exist on this CPU, then we need
2823 * to check if we have to switch in PMU state.
2824 * cgroup events are system-wide mode only
2825 */
2826 if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
2827 perf_cgroup_sched_in(prev, task);
2828
2829 /* check for system-wide branch_stack events */
2830 if (atomic_read(this_cpu_ptr(&perf_branch_stack_events)))
2831 perf_branch_stack_sched_in(prev, task);
2832 }
2833
2834 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2835 {
2836 u64 frequency = event->attr.sample_freq;
2837 u64 sec = NSEC_PER_SEC;
2838 u64 divisor, dividend;
2839
2840 int count_fls, nsec_fls, frequency_fls, sec_fls;
2841
2842 count_fls = fls64(count);
2843 nsec_fls = fls64(nsec);
2844 frequency_fls = fls64(frequency);
2845 sec_fls = 30;
2846
2847 /*
2848 * We got @count in @nsec, with a target of sample_freq HZ
2849 * the target period becomes:
2850 *
2851 * @count * 10^9
2852 * period = -------------------
2853 * @nsec * sample_freq
2854 *
2855 */
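/*
 * Worked example (sketch): @count = 1,000,000 events observed over
 * @nsec = 10,000,000 ns (10ms) with sample_freq = 1000 Hz gives
 *
 *	period = 1e6 * 1e9 / (1e7 * 1e3) = 100,000
 *
 * i.e. sample once every 100,000 events to get roughly 1000 samples/sec.
 */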
2856
2857 /*
2858 * Reduce accuracy by one bit such that @a and @b converge
2859 * to a similar magnitude.
2860 */
2861 #define REDUCE_FLS(a, b) \
2862 do { \
2863 if (a##_fls > b##_fls) { \
2864 a >>= 1; \
2865 a##_fls--; \
2866 } else { \
2867 b >>= 1; \
2868 b##_fls--; \
2869 } \
2870 } while (0)
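/*
 * E.g. (sketch): REDUCE_FLS(nsec, frequency) with nsec_fls = 34 and
 * frequency_fls = 10 halves nsec (nsec_fls becomes 33) because nsec is
 * currently the wider term; repeated application below shrinks the
 * products until they fit in 64 bits.
 */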
2871
2872 /*
2873 * Reduce accuracy until either term fits in a u64, then proceed with
2874 * the other, so that finally we can do a u64/u64 division.
2875 */
2876 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2877 REDUCE_FLS(nsec, frequency);
2878 REDUCE_FLS(sec, count);
2879 }
2880
2881 if (count_fls + sec_fls > 64) {
2882 divisor = nsec * frequency;
2883
2884 while (count_fls + sec_fls > 64) {
2885 REDUCE_FLS(count, sec);
2886 divisor >>= 1;
2887 }
2888
2889 dividend = count * sec;
2890 } else {
2891 dividend = count * sec;
2892
2893 while (nsec_fls + frequency_fls > 64) {
2894 REDUCE_FLS(nsec, frequency);
2895 dividend >>= 1;
2896 }
2897
2898 divisor = nsec * frequency;
2899 }
2900
2901 if (!divisor)
2902 return dividend;
2903
2904 return div64_u64(dividend, divisor);
2905 }
2906
2907 static DEFINE_PER_CPU(int, perf_throttled_count);
2908 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2909
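/*
 * perf_adjust_period() low-pass filters the newly computed period: only
 * 1/8th of the difference is applied per invocation. E.g. (sketch) with
 * hwc->sample_period = 1000 and a computed period of 9000,
 * delta = (8000 + 7) / 8 = 1000, so the period steps to 2000 and converges
 * towards 9000 over successive ticks instead of jumping there at once.
 */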
2910 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2911 {
2912 struct hw_perf_event *hwc = &event->hw;
2913 s64 period, sample_period;
2914 s64 delta;
2915
2916 period = perf_calculate_period(event, nsec, count);
2917
2918 delta = (s64)(period - hwc->sample_period);
2919 delta = (delta + 7) / 8; /* low pass filter */
2920
2921 sample_period = hwc->sample_period + delta;
2922
2923 if (!sample_period)
2924 sample_period = 1;
2925
2926 hwc->sample_period = sample_period;
2927
2928 if (local64_read(&hwc->period_left) > 8*sample_period) {
2929 if (disable)
2930 event->pmu->stop(event, PERF_EF_UPDATE);
2931
2932 local64_set(&hwc->period_left, 0);
2933
2934 if (disable)
2935 event->pmu->start(event, PERF_EF_RELOAD);
2936 }
2937 }
2938
2939 /*
2940 * combine freq adjustment with unthrottling to avoid two passes over the
2941 * events. At the same time, make sure, having freq events does not change
2942 * the rate of unthrottling as that would introduce bias.
2943 */
2944 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2945 int needs_unthr)
2946 {
2947 struct perf_event *event;
2948 struct hw_perf_event *hwc;
2949 u64 now, period = TICK_NSEC;
2950 s64 delta;
2951
2952 /*
2953 * only need to iterate over all events iff:
2954 * - the context has events in frequency mode (needs freq adjust)
2955 * - there are events to unthrottle on this cpu
2956 */
2957 if (!(ctx->nr_freq || needs_unthr))
2958 return;
2959
2960 raw_spin_lock(&ctx->lock);
2961 perf_pmu_disable(ctx->pmu);
2962
2963 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2964 if (event->state != PERF_EVENT_STATE_ACTIVE)
2965 continue;
2966
2967 if (!event_filter_match(event))
2968 continue;
2969
2970 perf_pmu_disable(event->pmu);
2971
2972 hwc = &event->hw;
2973
2974 if (hwc->interrupts == MAX_INTERRUPTS) {
2975 hwc->interrupts = 0;
2976 perf_log_throttle(event, 1);
2977 event->pmu->start(event, 0);
2978 }
2979
2980 if (!event->attr.freq || !event->attr.sample_freq)
2981 goto next;
2982
2983 /*
2984 * stop the event and update event->count
2985 */
2986 event->pmu->stop(event, PERF_EF_UPDATE);
2987
2988 now = local64_read(&event->count);
2989 delta = now - hwc->freq_count_stamp;
2990 hwc->freq_count_stamp = now;
2991
2992 /*
2993 * restart the event
2994 * reload only if value has changed
2995 * we have stopped the event so tell that
2996 * to perf_adjust_period() to avoid stopping it
2997 * twice.
2998 */
2999 if (delta > 0)
3000 perf_adjust_period(event, period, delta, false);
3001
3002 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
3003 next:
3004 perf_pmu_enable(event->pmu);
3005 }
3006
3007 perf_pmu_enable(ctx->pmu);
3008 raw_spin_unlock(&ctx->lock);
3009 }
3010
3011 /*
3012 * Round-robin a context's events:
3013 */
3014 static void rotate_ctx(struct perf_event_context *ctx)
3015 {
3016 /*
3017 * Rotate the first entry of the non-pinned groups to the tail. Rotation might be
3018 * disabled by the inheritance code.
3019 */
3020 if (!ctx->rotate_disable)
3021 list_rotate_left(&ctx->flexible_groups);
3022 }
3023
3024 /*
3025 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
3026 * because they're strictly cpu affine and rotate_start is called with IRQs
3027 * disabled, while rotate_context is called from IRQ context.
3028 */
3029 static int perf_rotate_context(struct perf_cpu_context *cpuctx)
3030 {
3031 struct perf_event_context *ctx = NULL;
3032 int rotate = 0, remove = 1;
3033
3034 if (cpuctx->ctx.nr_events) {
3035 remove = 0;
3036 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
3037 rotate = 1;
3038 }
3039
3040 ctx = cpuctx->task_ctx;
3041 if (ctx && ctx->nr_events) {
3042 remove = 0;
3043 if (ctx->nr_events != ctx->nr_active)
3044 rotate = 1;
3045 }
3046
3047 if (!rotate)
3048 goto done;
3049
3050 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
3051 perf_pmu_disable(cpuctx->ctx.pmu);
3052
3053 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
3054 if (ctx)
3055 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
3056
3057 rotate_ctx(&cpuctx->ctx);
3058 if (ctx)
3059 rotate_ctx(ctx);
3060
3061 perf_event_sched_in(cpuctx, ctx, current);
3062
3063 perf_pmu_enable(cpuctx->ctx.pmu);
3064 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
3065 done:
3066 if (remove)
3067 list_del_init(&cpuctx->rotation_list);
3068
3069 return rotate;
3070 }
3071
3072 #ifdef CONFIG_NO_HZ_FULL
3073 bool perf_event_can_stop_tick(void)
3074 {
3075 if (atomic_read(&nr_freq_events) ||
3076 __this_cpu_read(perf_throttled_count))
3077 return false;
3078 else
3079 return true;
3080 }
3081 #endif
3082
3083 void perf_event_task_tick(void)
3084 {
3085 struct list_head *head = this_cpu_ptr(&rotation_list);
3086 struct perf_cpu_context *cpuctx, *tmp;
3087 struct perf_event_context *ctx;
3088 int throttled;
3089
3090 WARN_ON(!irqs_disabled());
3091
3092 __this_cpu_inc(perf_throttled_seq);
3093 throttled = __this_cpu_xchg(perf_throttled_count, 0);
3094
3095 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
3096 ctx = &cpuctx->ctx;
3097 perf_adjust_freq_unthr_context(ctx, throttled);
3098
3099 ctx = cpuctx->task_ctx;
3100 if (ctx)
3101 perf_adjust_freq_unthr_context(ctx, throttled);
3102 }
3103 }
3104
3105 static int event_enable_on_exec(struct perf_event *event,
3106 struct perf_event_context *ctx)
3107 {
3108 if (!event->attr.enable_on_exec)
3109 return 0;
3110
3111 event->attr.enable_on_exec = 0;
3112 if (event->state >= PERF_EVENT_STATE_INACTIVE)
3113 return 0;
3114
3115 __perf_event_mark_enabled(event);
3116
3117 return 1;
3118 }
3119
3120 /*
3121 * Enable all of a task's events that have been marked enable-on-exec.
3122 * This expects task == current.
3123 */
3124 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
3125 {
3126 struct perf_event_context *clone_ctx = NULL;
3127 struct perf_event *event;
3128 unsigned long flags;
3129 int enabled = 0;
3130 int ret;
3131
3132 local_irq_save(flags);
3133 if (!ctx || !ctx->nr_events)
3134 goto out;
3135
3136 /*
3137 * We must ctxsw out cgroup events to avoid conflict
3138 * when invoking perf_task_event_sched_in() later on
3139 * in this function. Otherwise we end up trying to
3140 * ctxswin cgroup events which are already scheduled
3141 * in.
3142 */
3143 perf_cgroup_sched_out(current, NULL);
3144
3145 raw_spin_lock(&ctx->lock);
3146 task_ctx_sched_out(ctx);
3147
3148 list_for_each_entry(event, &ctx->event_list, event_entry) {
3149 ret = event_enable_on_exec(event, ctx);
3150 if (ret)
3151 enabled = 1;
3152 }
3153
3154 /*
3155 * Unclone this context if we enabled any event.
3156 */
3157 if (enabled)
3158 clone_ctx = unclone_ctx(ctx);
3159
3160 raw_spin_unlock(&ctx->lock);
3161
3162 /*
3163 * Also calls ctxswin for cgroup events, if any:
3164 */
3165 perf_event_context_sched_in(ctx, ctx->task);
3166 out:
3167 local_irq_restore(flags);
3168
3169 if (clone_ctx)
3170 put_ctx(clone_ctx);
3171 }
3172
3173 void perf_event_exec(void)
3174 {
3175 struct perf_event_context *ctx;
3176 int ctxn;
3177
3178 rcu_read_lock();
3179 for_each_task_context_nr(ctxn) {
3180 ctx = current->perf_event_ctxp[ctxn];
3181 if (!ctx)
3182 continue;
3183
3184 perf_event_enable_on_exec(ctx);
3185 }
3186 rcu_read_unlock();
3187 }
3188
3189 /*
3190 * Cross CPU call to read the hardware event
3191 */
3192 static void __perf_event_read(void *info)
3193 {
3194 struct perf_event *event = info;
3195 struct perf_event_context *ctx = event->ctx;
3196 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
3197
3198 /*
3199 * If this is a task context, we need to check whether it is
3200 * the current task context of this cpu. If not it has been
3201 * scheduled out before the smp call arrived. In that case
3202 * event->count would have been updated to a recent sample
3203 * when the event was scheduled out.
3204 */
3205 if (ctx->task && cpuctx->task_ctx != ctx)
3206 return;
3207
3208 raw_spin_lock(&ctx->lock);
3209 if (ctx->is_active) {
3210 update_context_time(ctx);
3211 update_cgrp_time_from_event(event);
3212 }
3213 update_event_times(event);
3214 if (event->state == PERF_EVENT_STATE_ACTIVE)
3215 event->pmu->read(event);
3216 raw_spin_unlock(&ctx->lock);
3217 }
3218
3219 static inline u64 perf_event_count(struct perf_event *event)
3220 {
3221 return local64_read(&event->count) + atomic64_read(&event->child_count);
3222 }
3223
3224 static u64 perf_event_read(struct perf_event *event)
3225 {
3226 /*
3227 * If event is enabled and currently active on a CPU, update the
3228 * value in the event structure:
3229 */
3230 if (event->state == PERF_EVENT_STATE_ACTIVE) {
3231 smp_call_function_single(event->oncpu,
3232 __perf_event_read, event, 1);
3233 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
3234 struct perf_event_context *ctx = event->ctx;
3235 unsigned long flags;
3236
3237 raw_spin_lock_irqsave(&ctx->lock, flags);
3238 /*
3239 * may read while context is not active
3240 * (e.g., thread is blocked), in that case
3241 * we cannot update context time
3242 */
3243 if (ctx->is_active) {
3244 update_context_time(ctx);
3245 update_cgrp_time_from_event(event);
3246 }
3247 update_event_times(event);
3248 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3249 }
3250
3251 return perf_event_count(event);
3252 }
3253
3254 /*
3255 * Initialize the perf_event context in a task_struct:
3256 */
3257 static void __perf_event_init_context(struct perf_event_context *ctx)
3258 {
3259 raw_spin_lock_init(&ctx->lock);
3260 mutex_init(&ctx->mutex);
3261 INIT_LIST_HEAD(&ctx->pinned_groups);
3262 INIT_LIST_HEAD(&ctx->flexible_groups);
3263 INIT_LIST_HEAD(&ctx->event_list);
3264 atomic_set(&ctx->refcount, 1);
3265 INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
3266 }
3267
3268 static struct perf_event_context *
3269 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
3270 {
3271 struct perf_event_context *ctx;
3272
3273 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
3274 if (!ctx)
3275 return NULL;
3276
3277 __perf_event_init_context(ctx);
3278 if (task) {
3279 ctx->task = task;
3280 get_task_struct(task);
3281 }
3282 ctx->pmu = pmu;
3283
3284 return ctx;
3285 }
3286
3287 static struct task_struct *
3288 find_lively_task_by_vpid(pid_t vpid)
3289 {
3290 struct task_struct *task;
3291 int err;
3292
3293 rcu_read_lock();
3294 if (!vpid)
3295 task = current;
3296 else
3297 task = find_task_by_vpid(vpid);
3298 if (task)
3299 get_task_struct(task);
3300 rcu_read_unlock();
3301
3302 if (!task)
3303 return ERR_PTR(-ESRCH);
3304
3305 /* Reuse ptrace permission checks for now. */
3306 err = -EACCES;
3307 if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS))
3308 goto errout;
3309
3310 return task;
3311 errout:
3312 put_task_struct(task);
3313 return ERR_PTR(err);
3314
3315 }
3316
3317 /*
3318 * Returns a matching context with refcount and pincount.
3319 */
3320 static struct perf_event_context *
3321 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
3322 {
3323 struct perf_event_context *ctx, *clone_ctx = NULL;
3324 struct perf_cpu_context *cpuctx;
3325 unsigned long flags;
3326 int ctxn, err;
3327
3328 if (!task) {
3329 /* Must be root to operate on a CPU event: */
3330 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
3331 return ERR_PTR(-EACCES);
3332
3333 /*
3334 * We could be clever and allow attaching an event to an
3335 * offline CPU and activate it when the CPU comes up, but
3336 * that's for later.
3337 */
3338 if (!cpu_online(cpu))
3339 return ERR_PTR(-ENODEV);
3340
3341 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
3342 ctx = &cpuctx->ctx;
3343 get_ctx(ctx);
3344 ++ctx->pin_count;
3345
3346 return ctx;
3347 }
3348
3349 err = -EINVAL;
3350 ctxn = pmu->task_ctx_nr;
3351 if (ctxn < 0)
3352 goto errout;
3353
3354 retry:
3355 ctx = perf_lock_task_context(task, ctxn, &flags);
3356 if (ctx) {
3357 clone_ctx = unclone_ctx(ctx);
3358 ++ctx->pin_count;
3359 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3360
3361 if (clone_ctx)
3362 put_ctx(clone_ctx);
3363 } else {
3364 ctx = alloc_perf_context(pmu, task);
3365 err = -ENOMEM;
3366 if (!ctx)
3367 goto errout;
3368
3369 err = 0;
3370 mutex_lock(&task->perf_event_mutex);
3371 /*
3372 * If it has already passed perf_event_exit_task(),
3373 * we must see PF_EXITING; it takes this mutex too.
3374 */
3375 if (task->flags & PF_EXITING)
3376 err = -ESRCH;
3377 else if (task->perf_event_ctxp[ctxn])
3378 err = -EAGAIN;
3379 else {
3380 get_ctx(ctx);
3381 ++ctx->pin_count;
3382 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3383 }
3384 mutex_unlock(&task->perf_event_mutex);
3385
3386 if (unlikely(err)) {
3387 put_ctx(ctx);
3388
3389 if (err == -EAGAIN)
3390 goto retry;
3391 goto errout;
3392 }
3393 }
3394
3395 return ctx;
3396
3397 errout:
3398 return ERR_PTR(err);
3399 }
3400
3401 static void perf_event_free_filter(struct perf_event *event);
3402
3403 static void free_event_rcu(struct rcu_head *head)
3404 {
3405 struct perf_event *event;
3406
3407 event = container_of(head, struct perf_event, rcu_head);
3408 if (event->ns)
3409 put_pid_ns(event->ns);
3410 perf_event_free_filter(event);
3411 kfree(event);
3412 }
3413
3414 static void ring_buffer_put(struct ring_buffer *rb);
3415 static void ring_buffer_attach(struct perf_event *event,
3416 struct ring_buffer *rb);
3417
3418 static void unaccount_event_cpu(struct perf_event *event, int cpu)
3419 {
3420 if (event->parent)
3421 return;
3422
3423 if (has_branch_stack(event)) {
3424 if (!(event->attach_state & PERF_ATTACH_TASK))
3425 atomic_dec(&per_cpu(perf_branch_stack_events, cpu));
3426 }
3427 if (is_cgroup_event(event))
3428 atomic_dec(&per_cpu(perf_cgroup_events, cpu));
3429 }
3430
3431 static void unaccount_event(struct perf_event *event)
3432 {
3433 if (event->parent)
3434 return;
3435
3436 if (event->attach_state & PERF_ATTACH_TASK)
3437 static_key_slow_dec_deferred(&perf_sched_events);
3438 if (event->attr.mmap || event->attr.mmap_data)
3439 atomic_dec(&nr_mmap_events);
3440 if (event->attr.comm)
3441 atomic_dec(&nr_comm_events);
3442 if (event->attr.task)
3443 atomic_dec(&nr_task_events);
3444 if (event->attr.freq)
3445 atomic_dec(&nr_freq_events);
3446 if (is_cgroup_event(event))
3447 static_key_slow_dec_deferred(&perf_sched_events);
3448 if (has_branch_stack(event))
3449 static_key_slow_dec_deferred(&perf_sched_events);
3450
3451 unaccount_event_cpu(event, event->cpu);
3452 }
3453
3454 static void __free_event(struct perf_event *event)
3455 {
3456 if (!event->parent) {
3457 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3458 put_callchain_buffers();
3459 }
3460
3461 if (event->destroy)
3462 event->destroy(event);
3463
3464 if (event->ctx)
3465 put_ctx(event->ctx);
3466
3467 if (event->pmu)
3468 module_put(event->pmu->module);
3469
3470 call_rcu(&event->rcu_head, free_event_rcu);
3471 }
3472
3473 static void _free_event(struct perf_event *event)
3474 {
3475 irq_work_sync(&event->pending);
3476
3477 unaccount_event(event);
3478
3479 if (event->rb) {
3480 /*
3481 * Can happen when we close an event with re-directed output.
3482 *
3483 * Since we have a 0 refcount, perf_mmap_close() will skip
3484 * over us; possibly making our ring_buffer_put() the last.
3485 */
3486 mutex_lock(&event->mmap_mutex);
3487 ring_buffer_attach(event, NULL);
3488 mutex_unlock(&event->mmap_mutex);
3489 }
3490
3491 if (is_cgroup_event(event))
3492 perf_detach_cgroup(event);
3493
3494 __free_event(event);
3495 }
3496
3497 /*
3498 * Used to free events which have a known refcount of 1, such as in error paths
3499 * where the event isn't exposed yet, and for inherited events.
3500 */
3501 static void free_event(struct perf_event *event)
3502 {
3503 if (WARN(atomic_long_cmpxchg(&event->refcount, 1, 0) != 1,
3504 "unexpected event refcount: %ld; ptr=%p\n",
3505 atomic_long_read(&event->refcount), event)) {
3506 /* leak to avoid use-after-free */
3507 return;
3508 }
3509
3510 _free_event(event);
3511 }
3512
3513 /*
3514 * Remove user event from the owner task.
3515 */
3516 static void perf_remove_from_owner(struct perf_event *event)
3517 {
3518 struct task_struct *owner;
3519
3520 rcu_read_lock();
3521 owner = ACCESS_ONCE(event->owner);
3522 /*
3523 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3524 * !owner it means the list deletion is complete and we can indeed
3525 * free this event, otherwise we need to serialize on
3526 * owner->perf_event_mutex.
3527 */
3528 smp_read_barrier_depends();
3529 if (owner) {
3530 /*
3531 * Since delayed_put_task_struct() also drops the last
3532 * task reference we can safely take a new reference
3533 * while holding the rcu_read_lock().
3534 */
3535 get_task_struct(owner);
3536 }
3537 rcu_read_unlock();
3538
3539 if (owner) {
3540 /*
3541 * If we're here through perf_event_exit_task() we're already
3542 * holding ctx->mutex which would be an inversion wrt. the
3543 * normal lock order.
3544 *
3545 * However we can safely take this lock because it's the child
3546 * ctx->mutex.
3547 */
3548 mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING);
3549
3550 /*
3551 * We have to re-check the event->owner field, if it is cleared
3552 * we raced with perf_event_exit_task(), acquiring the mutex
3553 * ensured they're done, and we can proceed with freeing the
3554 * event.
3555 */
3556 if (event->owner)
3557 list_del_init(&event->owner_entry);
3558 mutex_unlock(&owner->perf_event_mutex);
3559 put_task_struct(owner);
3560 }
3561 }
3562
3563 /*
3564 * Called when the last reference to the file is gone.
3565 */
3566 static void put_event(struct perf_event *event)
3567 {
3568 struct perf_event_context *ctx = event->ctx;
3569
3570 if (!atomic_long_dec_and_test(&event->refcount))
3571 return;
3572
3573 if (!is_kernel_event(event))
3574 perf_remove_from_owner(event);
3575
3576 WARN_ON_ONCE(ctx->parent_ctx);
3577 /*
3578 * There are two ways this annotation is useful:
3579 *
3580 * 1) there is a lock recursion from perf_event_exit_task
3581 * see the comment there.
3582 *
3583 * 2) there is a lock-inversion with mmap_sem through
3584 * perf_event_read_group(), which takes faults while
3585 * holding ctx->mutex, however this is called after
3586 * the last filedesc died, so there is no possibility
3587 * to trigger the AB-BA case.
3588 */
3589 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3590 perf_remove_from_context(event, true);
3591 mutex_unlock(&ctx->mutex);
3592
3593 _free_event(event);
3594 }
3595
3596 int perf_event_release_kernel(struct perf_event *event)
3597 {
3598 put_event(event);
3599 return 0;
3600 }
3601 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
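/*
 * Lifecycle sketch for in-kernel users (illustrative only, error handling
 * trimmed; perf_event_create_kernel_counter() is declared in
 * <linux/perf_event.h>, "attr" and "cpu" are hypothetical):
 *
 *	struct perf_event *ev;
 *
 *	ev = perf_event_create_kernel_counter(&attr, cpu, NULL, NULL, NULL);
 *	if (!IS_ERR(ev)) {
 *		... use the counter, e.g. via perf_event_read_value() ...
 *		perf_event_release_kernel(ev);
 *	}
 */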
3602
3603 static int perf_release(struct inode *inode, struct file *file)
3604 {
3605 put_event(file->private_data);
3606 return 0;
3607 }
3608
3609 /*
3610 * Remove all orphaned events from the context.
3611 */
3612 static void orphans_remove_work(struct work_struct *work)
3613 {
3614 struct perf_event_context *ctx;
3615 struct perf_event *event, *tmp;
3616
3617 ctx = container_of(work, struct perf_event_context,
3618 orphans_remove.work);
3619
3620 mutex_lock(&ctx->mutex);
3621 list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
3622 struct perf_event *parent_event = event->parent;
3623
3624 if (!is_orphaned_child(event))
3625 continue;
3626
3627 perf_remove_from_context(event, true);
3628
3629 mutex_lock(&parent_event->child_mutex);
3630 list_del_init(&event->child_list);
3631 mutex_unlock(&parent_event->child_mutex);
3632
3633 free_event(event);
3634 put_event(parent_event);
3635 }
3636
3637 raw_spin_lock_irq(&ctx->lock);
3638 ctx->orphans_remove_sched = false;
3639 raw_spin_unlock_irq(&ctx->lock);
3640 mutex_unlock(&ctx->mutex);
3641
3642 put_ctx(ctx);
3643 }
3644
3645 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3646 {
3647 struct perf_event *child;
3648 u64 total = 0;
3649
3650 *enabled = 0;
3651 *running = 0;
3652
3653 mutex_lock(&event->child_mutex);
3654 total += perf_event_read(event);
3655 *enabled += event->total_time_enabled +
3656 atomic64_read(&event->child_total_time_enabled);
3657 *running += event->total_time_running +
3658 atomic64_read(&event->child_total_time_running);
3659
3660 list_for_each_entry(child, &event->child_list, child_list) {
3661 total += perf_event_read(child);
3662 *enabled += child->total_time_enabled;
3663 *running += child->total_time_running;
3664 }
3665 mutex_unlock(&event->child_mutex);
3666
3667 return total;
3668 }
3669 EXPORT_SYMBOL_GPL(perf_event_read_value);
3670
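/*
 * Layout sketch of what the read() path below copies to user space for a
 * group leader with PERF_FORMAT_GROUP (bracketed fields only present when
 * the corresponding read_format bit is set); this mirrors the values[]
 * array built in perf_event_read_group() and is a description, not a
 * struct defined anywhere:
 *
 *	u64 nr;			1 + nr_siblings
 *	[u64 time_enabled;]	PERF_FORMAT_TOTAL_TIME_ENABLED
 *	[u64 time_running;]	PERF_FORMAT_TOTAL_TIME_RUNNING
 *	u64 value;		leader count
 *	[u64 id;]		PERF_FORMAT_ID
 *	{ u64 value; [u64 id;] }	repeated once per sibling
 */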
3671 static int perf_event_read_group(struct perf_event *event,
3672 u64 read_format, char __user *buf)
3673 {
3674 struct perf_event *leader = event->group_leader, *sub;
3675 struct perf_event_context *ctx = leader->ctx;
3676 int n = 0, size = 0, ret;
3677 u64 count, enabled, running;
3678 u64 values[5];
3679
3680 lockdep_assert_held(&ctx->mutex);
3681
3682 count = perf_event_read_value(leader, &enabled, &running);
3683
3684 values[n++] = 1 + leader->nr_siblings;
3685 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3686 values[n++] = enabled;
3687 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3688 values[n++] = running;
3689 values[n++] = count;
3690 if (read_format & PERF_FORMAT_ID)
3691 values[n++] = primary_event_id(leader);
3692
3693 size = n * sizeof(u64);
3694
3695 if (copy_to_user(buf, values, size))
3696 return -EFAULT;
3697
3698 ret = size;
3699
3700 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3701 n = 0;
3702
3703 values[n++] = perf_event_read_value(sub, &enabled, &running);
3704 if (read_format & PERF_FORMAT_ID)
3705 values[n++] = primary_event_id(sub);
3706
3707 size = n * sizeof(u64);
3708
3709 if (copy_to_user(buf + ret, values, size)) {
3710 return -EFAULT;
3711 }
3712
3713 ret += size;
3714 }
3715
3716 return ret;
3717 }
3718
3719 static int perf_event_read_one(struct perf_event *event,
3720 u64 read_format, char __user *buf)
3721 {
3722 u64 enabled, running;
3723 u64 values[4];
3724 int n = 0;
3725
3726 values[n++] = perf_event_read_value(event, &enabled, &running);
3727 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3728 values[n++] = enabled;
3729 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3730 values[n++] = running;
3731 if (read_format & PERF_FORMAT_ID)
3732 values[n++] = primary_event_id(event);
3733
3734 if (copy_to_user(buf, values, n * sizeof(u64)))
3735 return -EFAULT;
3736
3737 return n * sizeof(u64);
3738 }
3739
3740 static bool is_event_hup(struct perf_event *event)
3741 {
3742 bool no_children;
3743
3744 if (event->state != PERF_EVENT_STATE_EXIT)
3745 return false;
3746
3747 mutex_lock(&event->child_mutex);
3748 no_children = list_empty(&event->child_list);
3749 mutex_unlock(&event->child_mutex);
3750 return no_children;
3751 }
3752
3753 /*
3754 * Read the performance event - simple non-blocking version for now
3755 */
3756 static ssize_t
3757 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3758 {
3759 u64 read_format = event->attr.read_format;
3760 int ret;
3761
3762 /*
3763 * Return end-of-file for a read on an event that is in
3764 * error state (i.e. because it was pinned but it couldn't be
3765 * scheduled on to the CPU at some point).
3766 */
3767 if (event->state == PERF_EVENT_STATE_ERROR)
3768 return 0;
3769
3770 if (count < event->read_size)
3771 return -ENOSPC;
3772
3773 WARN_ON_ONCE(event->ctx->parent_ctx);
3774 if (read_format & PERF_FORMAT_GROUP)
3775 ret = perf_event_read_group(event, read_format, buf);
3776 else
3777 ret = perf_event_read_one(event, read_format, buf);
3778
3779 return ret;
3780 }
3781
3782 static ssize_t
3783 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3784 {
3785 struct perf_event *event = file->private_data;
3786 struct perf_event_context *ctx;
3787 int ret;
3788
3789 ctx = perf_event_ctx_lock(event);
3790 ret = perf_read_hw(event, buf, count);
3791 perf_event_ctx_unlock(event, ctx);
3792
3793 return ret;
3794 }
3795
3796 static unsigned int perf_poll(struct file *file, poll_table *wait)
3797 {
3798 struct perf_event *event = file->private_data;
3799 struct ring_buffer *rb;
3800 unsigned int events = POLLHUP;
3801
3802 poll_wait(file, &event->waitq, wait);
3803
3804 if (is_event_hup(event))
3805 return events;
3806
3807 /*
3808 * Pin the event->rb by taking event->mmap_mutex; otherwise
3809 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3810 */
3811 mutex_lock(&event->mmap_mutex);
3812 rb = event->rb;
3813 if (rb)
3814 events = atomic_xchg(&rb->poll, 0);
3815 mutex_unlock(&event->mmap_mutex);
3816 return events;
3817 }
3818
3819 static void _perf_event_reset(struct perf_event *event)
3820 {
3821 (void)perf_event_read(event);
3822 local64_set(&event->count, 0);
3823 perf_event_update_userpage(event);
3824 }
3825
3826 /*
3827 * Holding the top-level event's child_mutex means that any
3828 * descendant process that has inherited this event will block
3829 * in sync_child_event if it goes to exit, thus satisfying the
3830 * task existence requirements of perf_event_enable/disable.
3831 */
3832 static void perf_event_for_each_child(struct perf_event *event,
3833 void (*func)(struct perf_event *))
3834 {
3835 struct perf_event *child;
3836
3837 WARN_ON_ONCE(event->ctx->parent_ctx);
3838
3839 mutex_lock(&event->child_mutex);
3840 func(event);
3841 list_for_each_entry(child, &event->child_list, child_list)
3842 func(child);
3843 mutex_unlock(&event->child_mutex);
3844 }
3845
3846 static void perf_event_for_each(struct perf_event *event,
3847 void (*func)(struct perf_event *))
3848 {
3849 struct perf_event_context *ctx = event->ctx;
3850 struct perf_event *sibling;
3851
3852 lockdep_assert_held(&ctx->mutex);
3853
3854 event = event->group_leader;
3855
3856 perf_event_for_each_child(event, func);
3857 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3858 perf_event_for_each_child(sibling, func);
3859 }
3860
3861 struct period_event {
3862 struct perf_event *event;
3863 u64 value;
3864 };
3865
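/*
 * Cross-call target for PERF_EVENT_IOC_PERIOD: runs on the event's CPU
 * and updates the sample period (or frequency). An active event is
 * stopped and restarted so the new period takes effect immediately.
 */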
3866 static int __perf_event_period(void *info)
3867 {
3868 struct period_event *pe = info;
3869 struct perf_event *event = pe->event;
3870 struct perf_event_context *ctx = event->ctx;
3871 u64 value = pe->value;
3872 bool active;
3873
3874 raw_spin_lock(&ctx->lock);
3875 if (event->attr.freq) {
3876 event->attr.sample_freq = value;
3877 } else {
3878 event->attr.sample_period = value;
3879 event->hw.sample_period = value;
3880 }
3881
3882 active = (event->state == PERF_EVENT_STATE_ACTIVE);
3883 if (active) {
3884 perf_pmu_disable(ctx->pmu);
3885 event->pmu->stop(event, PERF_EF_UPDATE);
3886 }
3887
3888 local64_set(&event->hw.period_left, 0);
3889
3890 if (active) {
3891 event->pmu->start(event, PERF_EF_RELOAD);
3892 perf_pmu_enable(ctx->pmu);
3893 }
3894 raw_spin_unlock(&ctx->lock);
3895
3896 return 0;
3897 }
3898
3899 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3900 {
3901 struct period_event pe = { .event = event, };
3902 struct perf_event_context *ctx = event->ctx;
3903 struct task_struct *task;
3904 u64 value;
3905
3906 if (!is_sampling_event(event))
3907 return -EINVAL;
3908
3909 if (copy_from_user(&value, arg, sizeof(value)))
3910 return -EFAULT;
3911
3912 if (!value)
3913 return -EINVAL;
3914
3915 if (event->attr.freq && value > sysctl_perf_event_sample_rate)
3916 return -EINVAL;
3917
3918 task = ctx->task;
3919 pe.value = value;
3920
3921 if (!task) {
3922 cpu_function_call(event->cpu, __perf_event_period, &pe);
3923 return 0;
3924 }
3925
3926 retry:
3927 if (!task_function_call(task, __perf_event_period, &pe))
3928 return 0;
3929
3930 raw_spin_lock_irq(&ctx->lock);
3931 if (ctx->is_active) {
3932 raw_spin_unlock_irq(&ctx->lock);
3933 task = ctx->task;
3934 goto retry;
3935 }
3936
/*
 * The context is inactive and ctx->lock is already held here, so
 * update the period directly rather than calling
 * __perf_event_period(), which takes ctx->lock itself.
 */
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
event->attr.sample_period = value;
event->hw.sample_period = value;
}
local64_set(&event->hw.period_left, 0);
raw_spin_unlock_irq(&ctx->lock);
3939
3940 return 0;
3941 }
3942
3943 static const struct file_operations perf_fops;
3944
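/*
 * Resolve a file descriptor and verify it actually refers to a perf
 * event file before handing it back to the caller.
 */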
3945 static inline int perf_fget_light(int fd, struct fd *p)
3946 {
3947 struct fd f = fdget(fd);
3948 if (!f.file)
3949 return -EBADF;
3950
3951 if (f.file->f_op != &perf_fops) {
3952 fdput(f);
3953 return -EBADF;
3954 }
3955 *p = f;
3956 return 0;
3957 }
3958
3959 static int perf_event_set_output(struct perf_event *event,
3960 struct perf_event *output_event);
3961 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3962
3963 static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg)
3964 {
3965 void (*func)(struct perf_event *);
3966 u32 flags = arg;
3967
3968 switch (cmd) {
3969 case PERF_EVENT_IOC_ENABLE:
3970 func = _perf_event_enable;
3971 break;
3972 case PERF_EVENT_IOC_DISABLE:
3973 func = _perf_event_disable;
3974 break;
3975 case PERF_EVENT_IOC_RESET:
3976 func = _perf_event_reset;
3977 break;
3978
3979 case PERF_EVENT_IOC_REFRESH:
3980 return _perf_event_refresh(event, arg);
3981
3982 case PERF_EVENT_IOC_PERIOD:
3983 return perf_event_period(event, (u64 __user *)arg);
3984
3985 case PERF_EVENT_IOC_ID:
3986 {
3987 u64 id = primary_event_id(event);
3988
3989 if (copy_to_user((void __user *)arg, &id, sizeof(id)))
3990 return -EFAULT;
3991 return 0;
3992 }
3993
3994 case PERF_EVENT_IOC_SET_OUTPUT:
3995 {
3996 int ret;
3997 if (arg != -1) {
3998 struct perf_event *output_event;
3999 struct fd output;
4000 ret = perf_fget_light(arg, &output);
4001 if (ret)
4002 return ret;
4003 output_event = output.file->private_data;
4004 ret = perf_event_set_output(event, output_event);
4005 fdput(output);
4006 } else {
4007 ret = perf_event_set_output(event, NULL);
4008 }
4009 return ret;
4010 }
4011
4012 case PERF_EVENT_IOC_SET_FILTER:
4013 return perf_event_set_filter(event, (void __user *)arg);
4014
4015 default:
4016 return -ENOTTY;
4017 }
4018
4019 if (flags & PERF_IOC_FLAG_GROUP)
4020 perf_event_for_each(event, func);
4021 else
4022 perf_event_for_each_child(event, func);
4023
4024 return 0;
4025 }
4026
4027 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
4028 {
4029 struct perf_event *event = file->private_data;
4030 struct perf_event_context *ctx;
4031 long ret;
4032
4033 ctx = perf_event_ctx_lock(event);
4034 ret = _perf_ioctl(event, cmd, arg);
4035 perf_event_ctx_unlock(event, ctx);
4036
4037 return ret;
4038 }
4039
4040 #ifdef CONFIG_COMPAT
4041 static long perf_compat_ioctl(struct file *file, unsigned int cmd,
4042 unsigned long arg)
4043 {
4044 switch (_IOC_NR(cmd)) {
4045 case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
4046 case _IOC_NR(PERF_EVENT_IOC_ID):
4047 /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
4048 if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
4049 cmd &= ~IOCSIZE_MASK;
4050 cmd |= sizeof(void *) << IOCSIZE_SHIFT;
4051 }
4052 break;
4053 }
4054 return perf_ioctl(file, cmd, arg);
4055 }
4056 #else
4057 # define perf_compat_ioctl NULL
4058 #endif
4059
4060 int perf_event_task_enable(void)
4061 {
4062 struct perf_event_context *ctx;
4063 struct perf_event *event;
4064
4065 mutex_lock(&current->perf_event_mutex);
4066 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4067 ctx = perf_event_ctx_lock(event);
4068 perf_event_for_each_child(event, _perf_event_enable);
4069 perf_event_ctx_unlock(event, ctx);
4070 }
4071 mutex_unlock(&current->perf_event_mutex);
4072
4073 return 0;
4074 }
4075
4076 int perf_event_task_disable(void)
4077 {
4078 struct perf_event_context *ctx;
4079 struct perf_event *event;
4080
4081 mutex_lock(&current->perf_event_mutex);
4082 list_for_each_entry(event, &current->perf_event_list, owner_entry) {
4083 ctx = perf_event_ctx_lock(event);
4084 perf_event_for_each_child(event, _perf_event_disable);
4085 perf_event_ctx_unlock(event, ctx);
4086 }
4087 mutex_unlock(&current->perf_event_mutex);
4088
4089 return 0;
4090 }
4091
4092 static int perf_event_index(struct perf_event *event)
4093 {
4094 if (event->hw.state & PERF_HES_STOPPED)
4095 return 0;
4096
4097 if (event->state != PERF_EVENT_STATE_ACTIVE)
4098 return 0;
4099
4100 return event->pmu->event_idx(event);
4101 }
4102
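/*
 * Compute the current time plus the enabled/running times from the
 * snapshot timestamps, without taking any locks; this is what allows
 * callers such as perf_event_update_userpage() to run in NMI context.
 */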
4103 static void calc_timer_values(struct perf_event *event,
4104 u64 *now,
4105 u64 *enabled,
4106 u64 *running)
4107 {
4108 u64 ctx_time;
4109
4110 *now = perf_clock();
4111 ctx_time = event->shadow_ctx_time + *now;
4112 *enabled = ctx_time - event->tstamp_enabled;
4113 *running = ctx_time - event->tstamp_running;
4114 }
4115
4116 static void perf_event_init_userpage(struct perf_event *event)
4117 {
4118 struct perf_event_mmap_page *userpg;
4119 struct ring_buffer *rb;
4120
4121 rcu_read_lock();
4122 rb = rcu_dereference(event->rb);
4123 if (!rb)
4124 goto unlock;
4125
4126 userpg = rb->user_page;
4127
4128 /* Allow new userspace to detect that bit 0 is deprecated */
4129 userpg->cap_bit0_is_deprecated = 1;
4130 userpg->size = offsetof(struct perf_event_mmap_page, __reserved);
4131
4132 unlock:
4133 rcu_read_unlock();
4134 }
4135
4136 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
4137 {
4138 }
4139
4140 /*
4141 * Callers need to ensure there can be no nesting of this function, otherwise
4142 * the seqlock logic goes bad. We cannot serialize this because the arch
4143 * code calls this from NMI context.
4144 */
4145 void perf_event_update_userpage(struct perf_event *event)
4146 {
4147 struct perf_event_mmap_page *userpg;
4148 struct ring_buffer *rb;
4149 u64 enabled, running, now;
4150
4151 rcu_read_lock();
4152 rb = rcu_dereference(event->rb);
4153 if (!rb)
4154 goto unlock;
4155
4156 /*
4157 * compute total_time_enabled, total_time_running
4158 * based on snapshot values taken when the event
4159 * was last scheduled in.
4160 *
4161 * we cannot simply call update_context_time()
4162 * because of locking issues, as we can be called in
4163 * NMI context
4164 */
4165 calc_timer_values(event, &now, &enabled, &running);
4166
4167 userpg = rb->user_page;
4168 /*
4169 * Disable preemption so as to not let the corresponding user-space
4170 * spin too long if we get preempted.
4171 */
4172 preempt_disable();
4173 ++userpg->lock;
4174 barrier();
4175 userpg->index = perf_event_index(event);
4176 userpg->offset = perf_event_count(event);
4177 if (userpg->index)
4178 userpg->offset -= local64_read(&event->hw.prev_count);
4179
4180 userpg->time_enabled = enabled +
4181 atomic64_read(&event->child_total_time_enabled);
4182
4183 userpg->time_running = running +
4184 atomic64_read(&event->child_total_time_running);
4185
4186 arch_perf_update_userpage(userpg, now);
4187
4188 barrier();
4189 ++userpg->lock;
4190 preempt_enable();
4191 unlock:
4192 rcu_read_unlock();
4193 }
4194
4195 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
4196 {
4197 struct perf_event *event = vma->vm_file->private_data;
4198 struct ring_buffer *rb;
4199 int ret = VM_FAULT_SIGBUS;
4200
4201 if (vmf->flags & FAULT_FLAG_MKWRITE) {
4202 if (vmf->pgoff == 0)
4203 ret = 0;
4204 return ret;
4205 }
4206
4207 rcu_read_lock();
4208 rb = rcu_dereference(event->rb);
4209 if (!rb)
4210 goto unlock;
4211
4212 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
4213 goto unlock;
4214
4215 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
4216 if (!vmf->page)
4217 goto unlock;
4218
4219 get_page(vmf->page);
4220 vmf->page->mapping = vma->vm_file->f_mapping;
4221 vmf->page->index = vmf->pgoff;
4222
4223 ret = 0;
4224 unlock:
4225 rcu_read_unlock();
4226
4227 return ret;
4228 }
4229
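/*
 * Swap the ring buffer an event redirects into: unhook the event from
 * its old buffer's event_list, wait for concurrent RCU readers when
 * needed, then hook it onto the new buffer and publish event->rb.
 */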
4230 static void ring_buffer_attach(struct perf_event *event,
4231 struct ring_buffer *rb)
4232 {
4233 struct ring_buffer *old_rb = NULL;
4234 unsigned long flags;
4235
4236 if (event->rb) {
4237 /*
4238 * Should be impossible, we set this when removing
4239 * event->rb_entry and wait/clear when adding event->rb_entry.
4240 */
4241 WARN_ON_ONCE(event->rcu_pending);
4242
4243 old_rb = event->rb;
4244 spin_lock_irqsave(&old_rb->event_lock, flags);
4245 list_del_rcu(&event->rb_entry);
4246 spin_unlock_irqrestore(&old_rb->event_lock, flags);
4247
4248 event->rcu_batches = get_state_synchronize_rcu();
4249 event->rcu_pending = 1;
4250 }
4251
4252 if (rb) {
4253 if (event->rcu_pending) {
4254 cond_synchronize_rcu(event->rcu_batches);
4255 event->rcu_pending = 0;
4256 }
4257
4258 spin_lock_irqsave(&rb->event_lock, flags);
4259 list_add_rcu(&event->rb_entry, &rb->event_list);
4260 spin_unlock_irqrestore(&rb->event_lock, flags);
4261 }
4262
4263 rcu_assign_pointer(event->rb, rb);
4264
4265 if (old_rb) {
4266 ring_buffer_put(old_rb);
4267 /*
4268 * Since we detached before setting the new rb (so that we
4269 * could attach the new rb), we could have missed a wakeup.
4270 * Provide it now.
4271 */
4272 wake_up_all(&event->waitq);
4273 }
4274 }
4275
4276 static void ring_buffer_wakeup(struct perf_event *event)
4277 {
4278 struct ring_buffer *rb;
4279
4280 rcu_read_lock();
4281 rb = rcu_dereference(event->rb);
4282 if (rb) {
4283 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
4284 wake_up_all(&event->waitq);
4285 }
4286 rcu_read_unlock();
4287 }
4288
4289 static void rb_free_rcu(struct rcu_head *rcu_head)
4290 {
4291 struct ring_buffer *rb;
4292
4293 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
4294 rb_free(rb);
4295 }
4296
4297 static struct ring_buffer *ring_buffer_get(struct perf_event *event)
4298 {
4299 struct ring_buffer *rb;
4300
4301 rcu_read_lock();
4302 rb = rcu_dereference(event->rb);
4303 if (rb) {
4304 if (!atomic_inc_not_zero(&rb->refcount))
4305 rb = NULL;
4306 }
4307 rcu_read_unlock();
4308
4309 return rb;
4310 }
4311
4312 static void ring_buffer_put(struct ring_buffer *rb)
4313 {
4314 if (!atomic_dec_and_test(&rb->refcount))
4315 return;
4316
4317 WARN_ON_ONCE(!list_empty(&rb->event_list));
4318
4319 call_rcu(&rb->rcu_head, rb_free_rcu);
4320 }
4321
4322 static void perf_mmap_open(struct vm_area_struct *vma)
4323 {
4324 struct perf_event *event = vma->vm_file->private_data;
4325
4326 atomic_inc(&event->mmap_count);
4327 atomic_inc(&event->rb->mmap_count);
4328 }
4329
4330 /*
4331 * A buffer can be mmap()ed multiple times; either directly through the same
4332 * event, or through other events by use of perf_event_set_output().
4333 *
4334 * In order to undo the VM accounting done by perf_mmap() we need to destroy
4335 * the buffer here, where we still have a VM context. This means we need
4336 * to detach all events redirecting to us.
4337 */
4338 static void perf_mmap_close(struct vm_area_struct *vma)
4339 {
4340 struct perf_event *event = vma->vm_file->private_data;
4341
4342 struct ring_buffer *rb = ring_buffer_get(event);
4343 struct user_struct *mmap_user = rb->mmap_user;
4344 int mmap_locked = rb->mmap_locked;
4345 unsigned long size = perf_data_size(rb);
4346
4347 atomic_dec(&rb->mmap_count);
4348
4349 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
4350 goto out_put;
4351
4352 ring_buffer_attach(event, NULL);
4353 mutex_unlock(&event->mmap_mutex);
4354
4355 /* If there's still other mmap()s of this buffer, we're done. */
4356 if (atomic_read(&rb->mmap_count))
4357 goto out_put;
4358
4359 /*
4360 * No other mmap()s, detach from all other events that might redirect
4361 * into the now unreachable buffer. Somewhat complicated by the
4362 * fact that rb::event_lock otherwise nests inside mmap_mutex.
4363 */
4364 again:
4365 rcu_read_lock();
4366 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
4367 if (!atomic_long_inc_not_zero(&event->refcount)) {
4368 /*
4369 * This event is en-route to free_event() which will
4370 * detach it and remove it from the list.
4371 */
4372 continue;
4373 }
4374 rcu_read_unlock();
4375
4376 mutex_lock(&event->mmap_mutex);
4377 /*
4378 * Check we didn't race with perf_event_set_output() which can
4379 * swizzle the rb from under us while we were waiting to
4380 * acquire mmap_mutex.
4381 *
4382 * If we find a different rb, ignore this event; a later
4383 * iteration will no longer find it on the list. We still have
4384 * to restart the iteration to make sure we're not now
4385 * iterating the wrong list.
4386 */
4387 if (event->rb == rb)
4388 ring_buffer_attach(event, NULL);
4389
4390 mutex_unlock(&event->mmap_mutex);
4391 put_event(event);
4392
4393 /*
4394 * Restart the iteration; either we're on the wrong list or
4395 * destroyed its integrity by doing a deletion.
4396 */
4397 goto again;
4398 }
4399 rcu_read_unlock();
4400
4401 /*
4402 * There could still be a few 0-ref events on the list; they'll
4403 * get cleaned up by free_event() -- they'll also still have their
4404 * ref on the rb and will free it whenever they are done with it.
4405 *
4406 * Aside from that, this buffer is 'fully' detached and unmapped,
4407 * undo the VM accounting.
4408 */
4409
4410 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
4411 vma->vm_mm->pinned_vm -= mmap_locked;
4412 free_uid(mmap_user);
4413
4414 out_put:
4415 ring_buffer_put(rb); /* could be last */
4416 }
4417
4418 static const struct vm_operations_struct perf_mmap_vmops = {
4419 .open = perf_mmap_open,
4420 .close = perf_mmap_close,
4421 .fault = perf_mmap_fault,
4422 .page_mkwrite = perf_mmap_fault,
4423 };
4424
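/*
 * mmap() the ring buffer: page 0 is the user control page, the
 * remaining nr_pages (a power of two) hold the sample data. The pinned
 * memory is charged against sysctl_perf_event_mlock and, beyond that,
 * against RLIMIT_MEMLOCK.
 */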
4425 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
4426 {
4427 struct perf_event *event = file->private_data;
4428 unsigned long user_locked, user_lock_limit;
4429 struct user_struct *user = current_user();
4430 unsigned long locked, lock_limit;
4431 struct ring_buffer *rb;
4432 unsigned long vma_size;
4433 unsigned long nr_pages;
4434 long user_extra, extra;
4435 int ret = 0, flags = 0;
4436
4437 /*
4438 * Don't allow mmap() of inherited per-task counters. This would
4439 * create a performance issue due to all children writing to the
4440 * same rb.
4441 */
4442 if (event->cpu == -1 && event->attr.inherit)
4443 return -EINVAL;
4444
4445 if (!(vma->vm_flags & VM_SHARED))
4446 return -EINVAL;
4447
4448 vma_size = vma->vm_end - vma->vm_start;
4449 nr_pages = (vma_size / PAGE_SIZE) - 1;
4450
4451 /*
4452 * If we have rb pages ensure they're a power-of-two number, so we
4453 * can do bitmasks instead of modulo.
4454 */
4455 if (nr_pages != 0 && !is_power_of_2(nr_pages))
4456 return -EINVAL;
4457
4458 if (vma_size != PAGE_SIZE * (1 + nr_pages))
4459 return -EINVAL;
4460
4461 if (vma->vm_pgoff != 0)
4462 return -EINVAL;
4463
4464 WARN_ON_ONCE(event->ctx->parent_ctx);
4465 again:
4466 mutex_lock(&event->mmap_mutex);
4467 if (event->rb) {
4468 if (event->rb->nr_pages != nr_pages) {
4469 ret = -EINVAL;
4470 goto unlock;
4471 }
4472
4473 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
4474 /*
4475 * Raced against perf_mmap_close() through
4476 * perf_event_set_output(). Try again, hope for better
4477 * luck.
4478 */
4479 mutex_unlock(&event->mmap_mutex);
4480 goto again;
4481 }
4482
4483 goto unlock;
4484 }
4485
4486 user_extra = nr_pages + 1;
4487 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
4488
4489 /*
4490 * Increase the limit linearly with more CPUs:
4491 */
4492 user_lock_limit *= num_online_cpus();
4493
4494 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
4495
4496 extra = 0;
4497 if (user_locked > user_lock_limit)
4498 extra = user_locked - user_lock_limit;
4499
4500 lock_limit = rlimit(RLIMIT_MEMLOCK);
4501 lock_limit >>= PAGE_SHIFT;
4502 locked = vma->vm_mm->pinned_vm + extra;
4503
4504 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
4505 !capable(CAP_IPC_LOCK)) {
4506 ret = -EPERM;
4507 goto unlock;
4508 }
4509
4510 WARN_ON(event->rb);
4511
4512 if (vma->vm_flags & VM_WRITE)
4513 flags |= RING_BUFFER_WRITABLE;
4514
4515 rb = rb_alloc(nr_pages,
4516 event->attr.watermark ? event->attr.wakeup_watermark : 0,
4517 event->cpu, flags);
4518
4519 if (!rb) {
4520 ret = -ENOMEM;
4521 goto unlock;
4522 }
4523
4524 atomic_set(&rb->mmap_count, 1);
4525 rb->mmap_locked = extra;
4526 rb->mmap_user = get_current_user();
4527
4528 atomic_long_add(user_extra, &user->locked_vm);
4529 vma->vm_mm->pinned_vm += extra;
4530
4531 ring_buffer_attach(event, rb);
4532
4533 perf_event_init_userpage(event);
4534 perf_event_update_userpage(event);
4535
4536 unlock:
4537 if (!ret)
4538 atomic_inc(&event->mmap_count);
4539 mutex_unlock(&event->mmap_mutex);
4540
4541 /*
4542 * Since pinned accounting is per vm we cannot allow fork() to copy our
4543 * vma.
4544 */
4545 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
4546 vma->vm_ops = &perf_mmap_vmops;
4547
4548 return ret;
4549 }
4550
4551 static int perf_fasync(int fd, struct file *filp, int on)
4552 {
4553 struct inode *inode = file_inode(filp);
4554 struct perf_event *event = filp->private_data;
4555 int retval;
4556
4557 mutex_lock(&inode->i_mutex);
4558 retval = fasync_helper(fd, filp, on, &event->fasync);
4559 mutex_unlock(&inode->i_mutex);
4560
4561 if (retval < 0)
4562 return retval;
4563
4564 return 0;
4565 }
4566
4567 static const struct file_operations perf_fops = {
4568 .llseek = no_llseek,
4569 .release = perf_release,
4570 .read = perf_read,
4571 .poll = perf_poll,
4572 .unlocked_ioctl = perf_ioctl,
4573 .compat_ioctl = perf_compat_ioctl,
4574 .mmap = perf_mmap,
4575 .fasync = perf_fasync,
4576 };
4577
4578 /*
4579 * Perf event wakeup
4580 *
4581 * If there's data, ensure we set the poll() state and publish everything
4582 * to user-space before waking everybody up.
4583 */
4584
4585 static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
4586 {
4587 /* only the parent has fasync state */
4588 if (event->parent)
4589 event = event->parent;
4590 return &event->fasync;
4591 }
4592
4593 void perf_event_wakeup(struct perf_event *event)
4594 {
4595 ring_buffer_wakeup(event);
4596
4597 if (event->pending_kill) {
4598 kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
4599 event->pending_kill = 0;
4600 }
4601 }
4602
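/*
 * irq_work handler: performs the disable and wakeup work that was
 * deferred from contexts (such as NMI) where it could not be done
 * directly.
 */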
4603 static void perf_pending_event(struct irq_work *entry)
4604 {
4605 struct perf_event *event = container_of(entry,
4606 struct perf_event, pending);
4607 int rctx;
4608
4609 rctx = perf_swevent_get_recursion_context();
4610 /*
4611 * If we 'fail' here, that's OK, it means recursion is already disabled
4612 * and we won't recurse 'further'.
4613 */
4614
4615 if (event->pending_disable) {
4616 event->pending_disable = 0;
4617 __perf_event_disable(event);
4618 }
4619
4620 if (event->pending_wakeup) {
4621 event->pending_wakeup = 0;
4622 perf_event_wakeup(event);
4623 }
4624
4625 if (rctx >= 0)
4626 perf_swevent_put_recursion_context(rctx);
4627 }
4628
4629 /*
4630 * We assume there is only KVM supporting the callbacks.
4631 * Later on, we might change it to a list if there is
4632 * another virtualization implementation supporting the callbacks.
4633 */
4634 struct perf_guest_info_callbacks *perf_guest_cbs;
4635
4636 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4637 {
4638 perf_guest_cbs = cbs;
4639 return 0;
4640 }
4641 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4642
4643 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4644 {
4645 perf_guest_cbs = NULL;
4646 return 0;
4647 }
4648 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4649
4650 static void
4651 perf_output_sample_regs(struct perf_output_handle *handle,
4652 struct pt_regs *regs, u64 mask)
4653 {
4654 int bit;
4655
4656 for_each_set_bit(bit, (const unsigned long *) &mask,
4657 sizeof(mask) * BITS_PER_BYTE) {
4658 u64 val;
4659
4660 val = perf_reg_value(regs, bit);
4661 perf_output_put(handle, val);
4662 }
4663 }
4664
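/*
 * Pick the user-level registers for a sample: if the interrupt hit
 * kernel code, fall back to the task's saved user registers, or to
 * none at all for a pure kernel thread (no mm).
 */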
4665 static void perf_sample_regs_user(struct perf_regs_user *regs_user,
4666 struct pt_regs *regs)
4667 {
4668 if (!user_mode(regs)) {
4669 if (current->mm)
4670 regs = task_pt_regs(current);
4671 else
4672 regs = NULL;
4673 }
4674
4675 if (regs) {
4676 regs_user->regs = regs;
4677 regs_user->abi = perf_reg_abi(current);
4678 }
4679 }
4680
4681 /*
4682 * Get remaining task size from user stack pointer.
4683 *
4684 * It'd be better to take the stack vma map and limit this more
4685 * precisely, but there's no way to get it safely under interrupt,
4686 * so use TASK_SIZE as the limit.
4687 */
4688 static u64 perf_ustack_task_size(struct pt_regs *regs)
4689 {
4690 unsigned long addr = perf_user_stack_pointer(regs);
4691
4692 if (!addr || addr >= TASK_SIZE)
4693 return 0;
4694
4695 return TASK_SIZE - addr;
4696 }
4697
4698 static u16
4699 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4700 struct pt_regs *regs)
4701 {
4702 u64 task_size;
4703
4704 /* No regs, no stack pointer, no dump. */
4705 if (!regs)
4706 return 0;
4707
4708 /*
4709 * Check that the requested stack size fits within:
4710 * - TASK_SIZE
4711 * If it doesn't, limit the size to TASK_SIZE.
4712 *
4713 * - the remaining sample size
4714 * If it doesn't, shrink the stack size to fit into
4715 * the remaining sample size.
4716 */
4717
4718 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4719 stack_size = min(stack_size, (u16) task_size);
4720
4721 /* Current header size plus static size and dynamic size. */
4722 header_size += 2 * sizeof(u64);
4723
4724 /* Do we fit in with the current stack dump size? */
4725 if ((u16) (header_size + stack_size) < header_size) {
4726 /*
4727 * If we overflow the maximum size for the sample,
4728 * we customize the stack dump size to fit in.
4729 */
4730 stack_size = USHRT_MAX - header_size - sizeof(u64);
4731 stack_size = round_up(stack_size, sizeof(u64));
4732 }
4733
4734 return stack_size;
4735 }
4736
4737 static void
4738 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4739 struct pt_regs *regs)
4740 {
4741 /* Case of a kernel thread, nothing to dump */
4742 if (!regs) {
4743 u64 size = 0;
4744 perf_output_put(handle, size);
4745 } else {
4746 unsigned long sp;
4747 unsigned int rem;
4748 u64 dyn_size;
4749
4750 /*
4751 * We dump:
4752 * static size
4753 * - the size requested by user or the best one we can fit
4754 * in to the sample max size
4755 * data
4756 * - user stack dump data
4757 * dynamic size
4758 * - the actual dumped size
4759 */
4760
4761 /* Static size. */
4762 perf_output_put(handle, dump_size);
4763
4764 /* Data. */
4765 sp = perf_user_stack_pointer(regs);
4766 rem = __output_copy_user(handle, (void *) sp, dump_size);
4767 dyn_size = dump_size - rem;
4768
4769 perf_output_skip(handle, rem);
4770
4771 /* Dynamic size. */
4772 perf_output_put(handle, dyn_size);
4773 }
4774 }
4775
4776 static void __perf_event_header__init_id(struct perf_event_header *header,
4777 struct perf_sample_data *data,
4778 struct perf_event *event)
4779 {
4780 u64 sample_type = event->attr.sample_type;
4781
4782 data->type = sample_type;
4783 header->size += event->id_header_size;
4784
4785 if (sample_type & PERF_SAMPLE_TID) {
4786 /* namespace issues */
4787 data->tid_entry.pid = perf_event_pid(event, current);
4788 data->tid_entry.tid = perf_event_tid(event, current);
4789 }
4790
4791 if (sample_type & PERF_SAMPLE_TIME)
4792 data->time = perf_clock();
4793
4794 if (sample_type & (PERF_SAMPLE_ID | PERF_SAMPLE_IDENTIFIER))
4795 data->id = primary_event_id(event);
4796
4797 if (sample_type & PERF_SAMPLE_STREAM_ID)
4798 data->stream_id = event->id;
4799
4800 if (sample_type & PERF_SAMPLE_CPU) {
4801 data->cpu_entry.cpu = raw_smp_processor_id();
4802 data->cpu_entry.reserved = 0;
4803 }
4804 }
4805
4806 void perf_event_header__init_id(struct perf_event_header *header,
4807 struct perf_sample_data *data,
4808 struct perf_event *event)
4809 {
4810 if (event->attr.sample_id_all)
4811 __perf_event_header__init_id(header, data, event);
4812 }
4813
4814 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4815 struct perf_sample_data *data)
4816 {
4817 u64 sample_type = data->type;
4818
4819 if (sample_type & PERF_SAMPLE_TID)
4820 perf_output_put(handle, data->tid_entry);
4821
4822 if (sample_type & PERF_SAMPLE_TIME)
4823 perf_output_put(handle, data->time);
4824
4825 if (sample_type & PERF_SAMPLE_ID)
4826 perf_output_put(handle, data->id);
4827
4828 if (sample_type & PERF_SAMPLE_STREAM_ID)
4829 perf_output_put(handle, data->stream_id);
4830
4831 if (sample_type & PERF_SAMPLE_CPU)
4832 perf_output_put(handle, data->cpu_entry);
4833
4834 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4835 perf_output_put(handle, data->id);
4836 }
4837
4838 void perf_event__output_id_sample(struct perf_event *event,
4839 struct perf_output_handle *handle,
4840 struct perf_sample_data *sample)
4841 {
4842 if (event->attr.sample_id_all)
4843 __perf_event__output_id_sample(handle, sample);
4844 }
4845
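/*
 * Emit the read_format data for a single event: the count, optionally
 * the aggregate enabled/running times (including children), and
 * optionally the event ID.
 */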
4846 static void perf_output_read_one(struct perf_output_handle *handle,
4847 struct perf_event *event,
4848 u64 enabled, u64 running)
4849 {
4850 u64 read_format = event->attr.read_format;
4851 u64 values[4];
4852 int n = 0;
4853
4854 values[n++] = perf_event_count(event);
4855 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4856 values[n++] = enabled +
4857 atomic64_read(&event->child_total_time_enabled);
4858 }
4859 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4860 values[n++] = running +
4861 atomic64_read(&event->child_total_time_running);
4862 }
4863 if (read_format & PERF_FORMAT_ID)
4864 values[n++] = primary_event_id(event);
4865
4866 __output_copy(handle, values, n * sizeof(u64));
4867 }
4868
4869 /*
4870 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4871 */
4872 static void perf_output_read_group(struct perf_output_handle *handle,
4873 struct perf_event *event,
4874 u64 enabled, u64 running)
4875 {
4876 struct perf_event *leader = event->group_leader, *sub;
4877 u64 read_format = event->attr.read_format;
4878 u64 values[5];
4879 int n = 0;
4880
4881 values[n++] = 1 + leader->nr_siblings;
4882
4883 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4884 values[n++] = enabled;
4885
4886 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4887 values[n++] = running;
4888
4889 if (leader != event)
4890 leader->pmu->read(leader);
4891
4892 values[n++] = perf_event_count(leader);
4893 if (read_format & PERF_FORMAT_ID)
4894 values[n++] = primary_event_id(leader);
4895
4896 __output_copy(handle, values, n * sizeof(u64));
4897
4898 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4899 n = 0;
4900
4901 if ((sub != event) &&
4902 (sub->state == PERF_EVENT_STATE_ACTIVE))
4903 sub->pmu->read(sub);
4904
4905 values[n++] = perf_event_count(sub);
4906 if (read_format & PERF_FORMAT_ID)
4907 values[n++] = primary_event_id(sub);
4908
4909 __output_copy(handle, values, n * sizeof(u64));
4910 }
4911 }
4912
4913 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4914 PERF_FORMAT_TOTAL_TIME_RUNNING)
4915
4916 static void perf_output_read(struct perf_output_handle *handle,
4917 struct perf_event *event)
4918 {
4919 u64 enabled = 0, running = 0, now;
4920 u64 read_format = event->attr.read_format;
4921
4922 /*
4923 * compute total_time_enabled, total_time_running
4924 * based on snapshot values taken when the event
4925 * was last scheduled in.
4926 *
4927 * we cannot simply call update_context_time()
4928 * because of locking issues, as we are called in
4929 * NMI context
4930 */
4931 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4932 calc_timer_values(event, &now, &enabled, &running);
4933
4934 if (event->attr.read_format & PERF_FORMAT_GROUP)
4935 perf_output_read_group(handle, event, enabled, running);
4936 else
4937 perf_output_read_one(handle, event, enabled, running);
4938 }
4939
4940 void perf_output_sample(struct perf_output_handle *handle,
4941 struct perf_event_header *header,
4942 struct perf_sample_data *data,
4943 struct perf_event *event)
4944 {
4945 u64 sample_type = data->type;
4946
4947 perf_output_put(handle, *header);
4948
4949 if (sample_type & PERF_SAMPLE_IDENTIFIER)
4950 perf_output_put(handle, data->id);
4951
4952 if (sample_type & PERF_SAMPLE_IP)
4953 perf_output_put(handle, data->ip);
4954
4955 if (sample_type & PERF_SAMPLE_TID)
4956 perf_output_put(handle, data->tid_entry);
4957
4958 if (sample_type & PERF_SAMPLE_TIME)
4959 perf_output_put(handle, data->time);
4960
4961 if (sample_type & PERF_SAMPLE_ADDR)
4962 perf_output_put(handle, data->addr);
4963
4964 if (sample_type & PERF_SAMPLE_ID)
4965 perf_output_put(handle, data->id);
4966
4967 if (sample_type & PERF_SAMPLE_STREAM_ID)
4968 perf_output_put(handle, data->stream_id);
4969
4970 if (sample_type & PERF_SAMPLE_CPU)
4971 perf_output_put(handle, data->cpu_entry);
4972
4973 if (sample_type & PERF_SAMPLE_PERIOD)
4974 perf_output_put(handle, data->period);
4975
4976 if (sample_type & PERF_SAMPLE_READ)
4977 perf_output_read(handle, event);
4978
4979 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4980 if (data->callchain) {
4981 int size = 1;
4982
4983 if (data->callchain)
4984 size += data->callchain->nr;
4985
4986 size *= sizeof(u64);
4987
4988 __output_copy(handle, data->callchain, size);
4989 } else {
4990 u64 nr = 0;
4991 perf_output_put(handle, nr);
4992 }
4993 }
4994
4995 if (sample_type & PERF_SAMPLE_RAW) {
4996 if (data->raw) {
4997 perf_output_put(handle, data->raw->size);
4998 __output_copy(handle, data->raw->data,
4999 data->raw->size);
5000 } else {
5001 struct {
5002 u32 size;
5003 u32 data;
5004 } raw = {
5005 .size = sizeof(u32),
5006 .data = 0,
5007 };
5008 perf_output_put(handle, raw);
5009 }
5010 }
5011
5012 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5013 if (data->br_stack) {
5014 size_t size;
5015
5016 size = data->br_stack->nr
5017 * sizeof(struct perf_branch_entry);
5018
5019 perf_output_put(handle, data->br_stack->nr);
5020 perf_output_copy(handle, data->br_stack->entries, size);
5021 } else {
5022 /*
5023 * we always store at least the value of nr
5024 */
5025 u64 nr = 0;
5026 perf_output_put(handle, nr);
5027 }
5028 }
5029
5030 if (sample_type & PERF_SAMPLE_REGS_USER) {
5031 u64 abi = data->regs_user.abi;
5032
5033 /*
5034 * If there are no regs to dump, notice it through the
5035 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
5036 */
5037 perf_output_put(handle, abi);
5038
5039 if (abi) {
5040 u64 mask = event->attr.sample_regs_user;
5041 perf_output_sample_regs(handle,
5042 data->regs_user.regs,
5043 mask);
5044 }
5045 }
5046
5047 if (sample_type & PERF_SAMPLE_STACK_USER) {
5048 perf_output_sample_ustack(handle,
5049 data->stack_user_size,
5050 data->regs_user.regs);
5051 }
5052
5053 if (sample_type & PERF_SAMPLE_WEIGHT)
5054 perf_output_put(handle, data->weight);
5055
5056 if (sample_type & PERF_SAMPLE_DATA_SRC)
5057 perf_output_put(handle, data->data_src.val);
5058
5059 if (sample_type & PERF_SAMPLE_TRANSACTION)
5060 perf_output_put(handle, data->txn);
5061
5062 if (!event->attr.watermark) {
5063 int wakeup_events = event->attr.wakeup_events;
5064
5065 if (wakeup_events) {
5066 struct ring_buffer *rb = handle->rb;
5067 int events = local_inc_return(&rb->events);
5068
5069 if (events >= wakeup_events) {
5070 local_sub(wakeup_events, &rb->events);
5071 local_inc(&rb->wakeup);
5072 }
5073 }
5074 }
5075 }
5076
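/*
 * Fill in the sample header and gather the per-sample data (IP,
 * callchain, raw data, branch stack, user registers/stack), computing
 * the final record size so perf_output_begin() can reserve the right
 * amount of buffer space.
 */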
5077 void perf_prepare_sample(struct perf_event_header *header,
5078 struct perf_sample_data *data,
5079 struct perf_event *event,
5080 struct pt_regs *regs)
5081 {
5082 u64 sample_type = event->attr.sample_type;
5083
5084 header->type = PERF_RECORD_SAMPLE;
5085 header->size = sizeof(*header) + event->header_size;
5086
5087 header->misc = 0;
5088 header->misc |= perf_misc_flags(regs);
5089
5090 __perf_event_header__init_id(header, data, event);
5091
5092 if (sample_type & PERF_SAMPLE_IP)
5093 data->ip = perf_instruction_pointer(regs);
5094
5095 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
5096 int size = 1;
5097
5098 data->callchain = perf_callchain(event, regs);
5099
5100 if (data->callchain)
5101 size += data->callchain->nr;
5102
5103 header->size += size * sizeof(u64);
5104 }
5105
5106 if (sample_type & PERF_SAMPLE_RAW) {
5107 int size = sizeof(u32);
5108
5109 if (data->raw)
5110 size += data->raw->size;
5111 else
5112 size += sizeof(u32);
5113
5114 WARN_ON_ONCE(size & (sizeof(u64)-1));
5115 header->size += size;
5116 }
5117
5118 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
5119 int size = sizeof(u64); /* nr */
5120 if (data->br_stack) {
5121 size += data->br_stack->nr
5122 * sizeof(struct perf_branch_entry);
5123 }
5124 header->size += size;
5125 }
5126
5127 if (sample_type & PERF_SAMPLE_REGS_USER) {
5128 /* regs dump ABI info */
5129 int size = sizeof(u64);
5130
5131 perf_sample_regs_user(&data->regs_user, regs);
5132
5133 if (data->regs_user.regs) {
5134 u64 mask = event->attr.sample_regs_user;
5135 size += hweight64(mask) * sizeof(u64);
5136 }
5137
5138 header->size += size;
5139 }
5140
5141 if (sample_type & PERF_SAMPLE_STACK_USER) {
5142 /*
5143 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
5144 * processed as the last one, or an additional check must be
5145 * added when a new sample type is introduced, because we could
5146 * otherwise eat up the rest of the sample size.
5147 */
5148 struct perf_regs_user *uregs = &data->regs_user;
5149 u16 stack_size = event->attr.sample_stack_user;
5150 u16 size = sizeof(u64);
5151
5152 if (!uregs->abi)
5153 perf_sample_regs_user(uregs, regs);
5154
5155 stack_size = perf_sample_ustack_size(stack_size, header->size,
5156 uregs->regs);
5157
5158 /*
5159 * If there is something to dump, add space for the dump
5160 * itself and for the field that tells the dynamic size,
5161 * which is how many have been actually dumped.
5162 */
5163 if (stack_size)
5164 size += sizeof(u64) + stack_size;
5165
5166 data->stack_user_size = stack_size;
5167 header->size += size;
5168 }
5169 }
5170
5171 static void perf_event_output(struct perf_event *event,
5172 struct perf_sample_data *data,
5173 struct pt_regs *regs)
5174 {
5175 struct perf_output_handle handle;
5176 struct perf_event_header header;
5177
5178 /* protect the callchain buffers */
5179 rcu_read_lock();
5180
5181 perf_prepare_sample(&header, data, event, regs);
5182
5183 if (perf_output_begin(&handle, event, header.size))
5184 goto exit;
5185
5186 perf_output_sample(&handle, &header, data, event);
5187
5188 perf_output_end(&handle);
5189
5190 exit:
5191 rcu_read_unlock();
5192 }
5193
5194 /*
5195 * read event_id
5196 */
5197
5198 struct perf_read_event {
5199 struct perf_event_header header;
5200
5201 u32 pid;
5202 u32 tid;
5203 };
5204
5205 static void
5206 perf_event_read_event(struct perf_event *event,
5207 struct task_struct *task)
5208 {
5209 struct perf_output_handle handle;
5210 struct perf_sample_data sample;
5211 struct perf_read_event read_event = {
5212 .header = {
5213 .type = PERF_RECORD_READ,
5214 .misc = 0,
5215 .size = sizeof(read_event) + event->read_size,
5216 },
5217 .pid = perf_event_pid(event, task),
5218 .tid = perf_event_tid(event, task),
5219 };
5220 int ret;
5221
5222 perf_event_header__init_id(&read_event.header, &sample, event);
5223 ret = perf_output_begin(&handle, event, read_event.header.size);
5224 if (ret)
5225 return;
5226
5227 perf_output_put(&handle, read_event);
5228 perf_output_read(&handle, event);
5229 perf_event__output_id_sample(event, &handle, &sample);
5230
5231 perf_output_end(&handle);
5232 }
5233
5234 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
5235
5236 static void
5237 perf_event_aux_ctx(struct perf_event_context *ctx,
5238 perf_event_aux_output_cb output,
5239 void *data)
5240 {
5241 struct perf_event *event;
5242
5243 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5244 if (event->state < PERF_EVENT_STATE_INACTIVE)
5245 continue;
5246 if (!event_filter_match(event))
5247 continue;
5248 output(event, data);
5249 }
5250 }
5251
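/*
 * Deliver a side-band record (task, comm, mmap, ...) to every eligible
 * event: each PMU's CPU context, plus either the given @task_ctx or
 * the current task's contexts.
 */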
5252 static void
5253 perf_event_aux(perf_event_aux_output_cb output, void *data,
5254 struct perf_event_context *task_ctx)
5255 {
5256 struct perf_cpu_context *cpuctx;
5257 struct perf_event_context *ctx;
5258 struct pmu *pmu;
5259 int ctxn;
5260
5261 rcu_read_lock();
5262 list_for_each_entry_rcu(pmu, &pmus, entry) {
5263 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
5264 if (cpuctx->unique_pmu != pmu)
5265 goto next;
5266 perf_event_aux_ctx(&cpuctx->ctx, output, data);
5267 if (task_ctx)
5268 goto next;
5269 ctxn = pmu->task_ctx_nr;
5270 if (ctxn < 0)
5271 goto next;
5272 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
5273 if (ctx)
5274 perf_event_aux_ctx(ctx, output, data);
5275 next:
5276 put_cpu_ptr(pmu->pmu_cpu_context);
5277 }
5278
5279 if (task_ctx) {
5280 preempt_disable();
5281 perf_event_aux_ctx(task_ctx, output, data);
5282 preempt_enable();
5283 }
5284 rcu_read_unlock();
5285 }
5286
5287 /*
5288 * task tracking -- fork/exit
5289 *
5290 * enabled by: attr.comm | attr.mmap | attr.mmap2 | attr.mmap_data | attr.task
5291 */
5292
5293 struct perf_task_event {
5294 struct task_struct *task;
5295 struct perf_event_context *task_ctx;
5296
5297 struct {
5298 struct perf_event_header header;
5299
5300 u32 pid;
5301 u32 ppid;
5302 u32 tid;
5303 u32 ptid;
5304 u64 time;
5305 } event_id;
5306 };
5307
5308 static int perf_event_task_match(struct perf_event *event)
5309 {
5310 return event->attr.comm || event->attr.mmap ||
5311 event->attr.mmap2 || event->attr.mmap_data ||
5312 event->attr.task;
5313 }
5314
5315 static void perf_event_task_output(struct perf_event *event,
5316 void *data)
5317 {
5318 struct perf_task_event *task_event = data;
5319 struct perf_output_handle handle;
5320 struct perf_sample_data sample;
5321 struct task_struct *task = task_event->task;
5322 int ret, size = task_event->event_id.header.size;
5323
5324 if (!perf_event_task_match(event))
5325 return;
5326
5327 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
5328
5329 ret = perf_output_begin(&handle, event,
5330 task_event->event_id.header.size);
5331 if (ret)
5332 goto out;
5333
5334 task_event->event_id.pid = perf_event_pid(event, task);
5335 task_event->event_id.ppid = perf_event_pid(event, current);
5336
5337 task_event->event_id.tid = perf_event_tid(event, task);
5338 task_event->event_id.ptid = perf_event_tid(event, current);
5339
5340 perf_output_put(&handle, task_event->event_id);
5341
5342 perf_event__output_id_sample(event, &handle, &sample);
5343
5344 perf_output_end(&handle);
5345 out:
5346 task_event->event_id.header.size = size;
5347 }
5348
5349 static void perf_event_task(struct task_struct *task,
5350 struct perf_event_context *task_ctx,
5351 int new)
5352 {
5353 struct perf_task_event task_event;
5354
5355 if (!atomic_read(&nr_comm_events) &&
5356 !atomic_read(&nr_mmap_events) &&
5357 !atomic_read(&nr_task_events))
5358 return;
5359
5360 task_event = (struct perf_task_event){
5361 .task = task,
5362 .task_ctx = task_ctx,
5363 .event_id = {
5364 .header = {
5365 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
5366 .misc = 0,
5367 .size = sizeof(task_event.event_id),
5368 },
5369 /* .pid */
5370 /* .ppid */
5371 /* .tid */
5372 /* .ptid */
5373 .time = perf_clock(),
5374 },
5375 };
5376
5377 perf_event_aux(perf_event_task_output,
5378 &task_event,
5379 task_ctx);
5380 }
5381
5382 void perf_event_fork(struct task_struct *task)
5383 {
5384 perf_event_task(task, NULL, 1);
5385 }
5386
5387 /*
5388 * comm tracking
5389 */
5390
5391 struct perf_comm_event {
5392 struct task_struct *task;
5393 char *comm;
5394 int comm_size;
5395
5396 struct {
5397 struct perf_event_header header;
5398
5399 u32 pid;
5400 u32 tid;
5401 } event_id;
5402 };
5403
5404 static int perf_event_comm_match(struct perf_event *event)
5405 {
5406 return event->attr.comm;
5407 }
5408
5409 static void perf_event_comm_output(struct perf_event *event,
5410 void *data)
5411 {
5412 struct perf_comm_event *comm_event = data;
5413 struct perf_output_handle handle;
5414 struct perf_sample_data sample;
5415 int size = comm_event->event_id.header.size;
5416 int ret;
5417
5418 if (!perf_event_comm_match(event))
5419 return;
5420
5421 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
5422 ret = perf_output_begin(&handle, event,
5423 comm_event->event_id.header.size);
5424
5425 if (ret)
5426 goto out;
5427
5428 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
5429 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
5430
5431 perf_output_put(&handle, comm_event->event_id);
5432 __output_copy(&handle, comm_event->comm,
5433 comm_event->comm_size);
5434
5435 perf_event__output_id_sample(event, &handle, &sample);
5436
5437 perf_output_end(&handle);
5438 out:
5439 comm_event->event_id.header.size = size;
5440 }
5441
5442 static void perf_event_comm_event(struct perf_comm_event *comm_event)
5443 {
5444 char comm[TASK_COMM_LEN];
5445 unsigned int size;
5446
5447 memset(comm, 0, sizeof(comm));
5448 strlcpy(comm, comm_event->task->comm, sizeof(comm));
5449 size = ALIGN(strlen(comm)+1, sizeof(u64));
5450
5451 comm_event->comm = comm;
5452 comm_event->comm_size = size;
5453
5454 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
5455
5456 perf_event_aux(perf_event_comm_output,
5457 comm_event,
5458 NULL);
5459 }
5460
5461 void perf_event_comm(struct task_struct *task, bool exec)
5462 {
5463 struct perf_comm_event comm_event;
5464
5465 if (!atomic_read(&nr_comm_events))
5466 return;
5467
5468 comm_event = (struct perf_comm_event){
5469 .task = task,
5470 /* .comm */
5471 /* .comm_size */
5472 .event_id = {
5473 .header = {
5474 .type = PERF_RECORD_COMM,
5475 .misc = exec ? PERF_RECORD_MISC_COMM_EXEC : 0,
5476 /* .size */
5477 },
5478 /* .pid */
5479 /* .tid */
5480 },
5481 };
5482
5483 perf_event_comm_event(&comm_event);
5484 }
5485
5486 /*
5487 * mmap tracking
5488 */
5489
5490 struct perf_mmap_event {
5491 struct vm_area_struct *vma;
5492
5493 const char *file_name;
5494 int file_size;
5495 int maj, min;
5496 u64 ino;
5497 u64 ino_generation;
5498 u32 prot, flags;
5499
5500 struct {
5501 struct perf_event_header header;
5502
5503 u32 pid;
5504 u32 tid;
5505 u64 start;
5506 u64 len;
5507 u64 pgoff;
5508 } event_id;
5509 };
5510
5511 static int perf_event_mmap_match(struct perf_event *event,
5512 void *data)
5513 {
5514 struct perf_mmap_event *mmap_event = data;
5515 struct vm_area_struct *vma = mmap_event->vma;
5516 int executable = vma->vm_flags & VM_EXEC;
5517
5518 return (!executable && event->attr.mmap_data) ||
5519 (executable && (event->attr.mmap || event->attr.mmap2));
5520 }
5521
5522 static void perf_event_mmap_output(struct perf_event *event,
5523 void *data)
5524 {
5525 struct perf_mmap_event *mmap_event = data;
5526 struct perf_output_handle handle;
5527 struct perf_sample_data sample;
5528 int size = mmap_event->event_id.header.size;
5529 int ret;
5530
5531 if (!perf_event_mmap_match(event, data))
5532 return;
5533
5534 if (event->attr.mmap2) {
5535 mmap_event->event_id.header.type = PERF_RECORD_MMAP2;
5536 mmap_event->event_id.header.size += sizeof(mmap_event->maj);
5537 mmap_event->event_id.header.size += sizeof(mmap_event->min);
5538 mmap_event->event_id.header.size += sizeof(mmap_event->ino);
5539 mmap_event->event_id.header.size += sizeof(mmap_event->ino_generation);
5540 mmap_event->event_id.header.size += sizeof(mmap_event->prot);
5541 mmap_event->event_id.header.size += sizeof(mmap_event->flags);
5542 }
5543
5544 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
5545 ret = perf_output_begin(&handle, event,
5546 mmap_event->event_id.header.size);
5547 if (ret)
5548 goto out;
5549
5550 mmap_event->event_id.pid = perf_event_pid(event, current);
5551 mmap_event->event_id.tid = perf_event_tid(event, current);
5552
5553 perf_output_put(&handle, mmap_event->event_id);
5554
5555 if (event->attr.mmap2) {
5556 perf_output_put(&handle, mmap_event->maj);
5557 perf_output_put(&handle, mmap_event->min);
5558 perf_output_put(&handle, mmap_event->ino);
5559 perf_output_put(&handle, mmap_event->ino_generation);
5560 perf_output_put(&handle, mmap_event->prot);
5561 perf_output_put(&handle, mmap_event->flags);
5562 }
5563
5564 __output_copy(&handle, mmap_event->file_name,
5565 mmap_event->file_size);
5566
5567 perf_event__output_id_sample(event, &handle, &sample);
5568
5569 perf_output_end(&handle);
5570 out:
5571 mmap_event->event_id.header.size = size;
5572 }
5573
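/*
 * Build the MMAP record for a vma: resolve a name for it (d_path() for
 * file-backed mappings, otherwise vm_ops->name, arch_vma_name(),
 * "[heap]", "[stack]" or "//anon"), collect device/inode/protection
 * details for mmap2, and hand the record to all interested events.
 */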
5574 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
5575 {
5576 struct vm_area_struct *vma = mmap_event->vma;
5577 struct file *file = vma->vm_file;
5578 int maj = 0, min = 0;
5579 u64 ino = 0, gen = 0;
5580 u32 prot = 0, flags = 0;
5581 unsigned int size;
5582 char tmp[16];
5583 char *buf = NULL;
5584 char *name;
5585
5586 if (file) {
5587 struct inode *inode;
5588 dev_t dev;
5589
5590 buf = kmalloc(PATH_MAX, GFP_KERNEL);
5591 if (!buf) {
5592 name = "//enomem";
5593 goto cpy_name;
5594 }
5595 /*
5596 * d_path() works from the end of the rb backwards, so we
5597 * need to add enough zero bytes after the string to handle
5598 * the 64bit alignment we do later.
5599 */
5600 name = d_path(&file->f_path, buf, PATH_MAX - sizeof(u64));
5601 if (IS_ERR(name)) {
5602 name = "//toolong";
5603 goto cpy_name;
5604 }
5605 inode = file_inode(vma->vm_file);
5606 dev = inode->i_sb->s_dev;
5607 ino = inode->i_ino;
5608 gen = inode->i_generation;
5609 maj = MAJOR(dev);
5610 min = MINOR(dev);
5611
5612 if (vma->vm_flags & VM_READ)
5613 prot |= PROT_READ;
5614 if (vma->vm_flags & VM_WRITE)
5615 prot |= PROT_WRITE;
5616 if (vma->vm_flags & VM_EXEC)
5617 prot |= PROT_EXEC;
5618
5619 if (vma->vm_flags & VM_MAYSHARE)
5620 flags = MAP_SHARED;
5621 else
5622 flags = MAP_PRIVATE;
5623
5624 if (vma->vm_flags & VM_DENYWRITE)
5625 flags |= MAP_DENYWRITE;
5626 if (vma->vm_flags & VM_MAYEXEC)
5627 flags |= MAP_EXECUTABLE;
5628 if (vma->vm_flags & VM_LOCKED)
5629 flags |= MAP_LOCKED;
5630 if (vma->vm_flags & VM_HUGETLB)
5631 flags |= MAP_HUGETLB;
5632
5633 goto got_name;
5634 } else {
5635 if (vma->vm_ops && vma->vm_ops->name) {
5636 name = (char *) vma->vm_ops->name(vma);
5637 if (name)
5638 goto cpy_name;
5639 }
5640
5641 name = (char *)arch_vma_name(vma);
5642 if (name)
5643 goto cpy_name;
5644
5645 if (vma->vm_start <= vma->vm_mm->start_brk &&
5646 vma->vm_end >= vma->vm_mm->brk) {
5647 name = "[heap]";
5648 goto cpy_name;
5649 }
5650 if (vma->vm_start <= vma->vm_mm->start_stack &&
5651 vma->vm_end >= vma->vm_mm->start_stack) {
5652 name = "[stack]";
5653 goto cpy_name;
5654 }
5655
5656 name = "//anon";
5657 goto cpy_name;
5658 }
5659
5660 cpy_name:
5661 strlcpy(tmp, name, sizeof(tmp));
5662 name = tmp;
5663 got_name:
5664 /*
5665 * Since our buffer works in 8 byte units we need to align our string
5666 * size to a multiple of 8. However, we must guarantee the tail end is
5667 * zero'd out to avoid leaking random bits to userspace.
5668 */
5669 size = strlen(name)+1;
5670 while (!IS_ALIGNED(size, sizeof(u64)))
5671 name[size++] = '\0';
5672
5673 mmap_event->file_name = name;
5674 mmap_event->file_size = size;
5675 mmap_event->maj = maj;
5676 mmap_event->min = min;
5677 mmap_event->ino = ino;
5678 mmap_event->ino_generation = gen;
5679 mmap_event->prot = prot;
5680 mmap_event->flags = flags;
5681
5682 if (!(vma->vm_flags & VM_EXEC))
5683 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
5684
5685 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
5686
5687 perf_event_aux(perf_event_mmap_output,
5688 mmap_event,
5689 NULL);
5690
5691 kfree(buf);
5692 }
5693
5694 void perf_event_mmap(struct vm_area_struct *vma)
5695 {
5696 struct perf_mmap_event mmap_event;
5697
5698 if (!atomic_read(&nr_mmap_events))
5699 return;
5700
5701 mmap_event = (struct perf_mmap_event){
5702 .vma = vma,
5703 /* .file_name */
5704 /* .file_size */
5705 .event_id = {
5706 .header = {
5707 .type = PERF_RECORD_MMAP,
5708 .misc = PERF_RECORD_MISC_USER,
5709 /* .size */
5710 },
5711 /* .pid */
5712 /* .tid */
5713 .start = vma->vm_start,
5714 .len = vma->vm_end - vma->vm_start,
5715 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5716 },
5717 /* .maj (attr_mmap2 only) */
5718 /* .min (attr_mmap2 only) */
5719 /* .ino (attr_mmap2 only) */
5720 /* .ino_generation (attr_mmap2 only) */
5721 /* .prot (attr_mmap2 only) */
5722 /* .flags (attr_mmap2 only) */
5723 };
5724
5725 perf_event_mmap_event(&mmap_event);
5726 }
5727
5728 /*
5729 * IRQ throttle logging
5730 */
5731
5732 static void perf_log_throttle(struct perf_event *event, int enable)
5733 {
5734 struct perf_output_handle handle;
5735 struct perf_sample_data sample;
5736 int ret;
5737
5738 struct {
5739 struct perf_event_header header;
5740 u64 time;
5741 u64 id;
5742 u64 stream_id;
5743 } throttle_event = {
5744 .header = {
5745 .type = PERF_RECORD_THROTTLE,
5746 .misc = 0,
5747 .size = sizeof(throttle_event),
5748 },
5749 .time = perf_clock(),
5750 .id = primary_event_id(event),
5751 .stream_id = event->id,
5752 };
5753
5754 if (enable)
5755 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5756
5757 perf_event_header__init_id(&throttle_event.header, &sample, event);
5758
5759 ret = perf_output_begin(&handle, event,
5760 throttle_event.header.size);
5761 if (ret)
5762 return;
5763
5764 perf_output_put(&handle, throttle_event);
5765 perf_event__output_id_sample(event, &handle, &sample);
5766 perf_output_end(&handle);
5767 }
5768
5769 /*
5770 * Generic event overflow handling, sampling.
5771 */
5772
5773 static int __perf_event_overflow(struct perf_event *event,
5774 int throttle, struct perf_sample_data *data,
5775 struct pt_regs *regs)
5776 {
5777 int events = atomic_read(&event->event_limit);
5778 struct hw_perf_event *hwc = &event->hw;
5779 u64 seq;
5780 int ret = 0;
5781
5782 /*
5783 * Non-sampling counters might still use the PMI to fold short
5784 * hardware counters, ignore those.
5785 */
5786 if (unlikely(!is_sampling_event(event)))
5787 return 0;
5788
5789 seq = __this_cpu_read(perf_throttled_seq);
5790 if (seq != hwc->interrupts_seq) {
5791 hwc->interrupts_seq = seq;
5792 hwc->interrupts = 1;
5793 } else {
5794 hwc->interrupts++;
5795 if (unlikely(throttle
5796 && hwc->interrupts >= max_samples_per_tick)) {
5797 __this_cpu_inc(perf_throttled_count);
5798 hwc->interrupts = MAX_INTERRUPTS;
5799 perf_log_throttle(event, 0);
5800 tick_nohz_full_kick();
5801 ret = 1;
5802 }
5803 }
5804
5805 if (event->attr.freq) {
5806 u64 now = perf_clock();
5807 s64 delta = now - hwc->freq_time_stamp;
5808
5809 hwc->freq_time_stamp = now;
5810
5811 if (delta > 0 && delta < 2*TICK_NSEC)
5812 perf_adjust_period(event, delta, hwc->last_period, true);
5813 }
5814
5815 /*
5816 * XXX event_limit might not quite work as expected on inherited
5817 * events
5818 */
5819
5820 event->pending_kill = POLL_IN;
5821 if (events && atomic_dec_and_test(&event->event_limit)) {
5822 ret = 1;
5823 event->pending_kill = POLL_HUP;
5824 event->pending_disable = 1;
5825 irq_work_queue(&event->pending);
5826 }
5827
5828 if (event->overflow_handler)
5829 event->overflow_handler(event, data, regs);
5830 else
5831 perf_event_output(event, data, regs);
5832
5833 if (*perf_event_fasync(event) && event->pending_kill) {
5834 event->pending_wakeup = 1;
5835 irq_work_queue(&event->pending);
5836 }
5837
5838 return ret;
5839 }
5840
5841 int perf_event_overflow(struct perf_event *event,
5842 struct perf_sample_data *data,
5843 struct pt_regs *regs)
5844 {
5845 return __perf_event_overflow(event, 1, data, regs);
5846 }
5847
5848 /*
5849 * Generic software event infrastructure
5850 */
5851
5852 struct swevent_htable {
5853 struct swevent_hlist *swevent_hlist;
5854 struct mutex hlist_mutex;
5855 int hlist_refcount;
5856
5857 /* Recursion avoidance in each context */
5858 int recursion[PERF_NR_CONTEXTS];
5859 };
5860
5861 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5862
5863 /*
5864 * We directly increment event->count and keep a second value in
5865 * event->hw.period_left to count intervals. This period event
5866 * is kept in the range [-sample_period, 0] so that we can use the
5867 * sign as trigger.
5868 */
5869
5870 u64 perf_swevent_set_period(struct perf_event *event)
5871 {
5872 struct hw_perf_event *hwc = &event->hw;
5873 u64 period = hwc->last_period;
5874 u64 nr, offset;
5875 s64 old, val;
5876
5877 hwc->last_period = hwc->sample_period;
5878
5879 again:
5880 old = val = local64_read(&hwc->period_left);
5881 if (val < 0)
5882 return 0;
5883
5884 nr = div64_u64(period + val, period);
5885 offset = nr * period;
5886 val -= offset;
5887 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
5888 goto again;
5889
5890 return nr;
5891 }
5892
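/*
 * Deliver the pending overflows for a software event.  The first
 * iteration is exempt from throttling; if __perf_event_overflow()
 * asks us to stop (e.g. the event got throttled), break out early.
 */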
5893 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5894 struct perf_sample_data *data,
5895 struct pt_regs *regs)
5896 {
5897 struct hw_perf_event *hwc = &event->hw;
5898 int throttle = 0;
5899
5900 if (!overflow)
5901 overflow = perf_swevent_set_period(event);
5902
5903 if (hwc->interrupts == MAX_INTERRUPTS)
5904 return;
5905
5906 for (; overflow; overflow--) {
5907 if (__perf_event_overflow(event, throttle,
5908 data, regs)) {
5909 /*
5910 * We inhibit the overflow from happening when
5911 * hwc->interrupts == MAX_INTERRUPTS.
5912 */
5913 break;
5914 }
5915 throttle = 1;
5916 }
5917 }
5918
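/*
 * Count 'nr' occurrences of a software event.  For sampling events
 * this also drives the period accounting: either sample right away
 * (explicit period samples, or a sample_period of 1) or accumulate
 * into period_left and sample once it crosses zero.
 */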
5919 static void perf_swevent_event(struct perf_event *event, u64 nr,
5920 struct perf_sample_data *data,
5921 struct pt_regs *regs)
5922 {
5923 struct hw_perf_event *hwc = &event->hw;
5924
5925 local64_add(nr, &event->count);
5926
5927 if (!regs)
5928 return;
5929
5930 if (!is_sampling_event(event))
5931 return;
5932
5933 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5934 data->period = nr;
5935 return perf_swevent_overflow(event, 1, data, regs);
5936 } else
5937 data->period = event->hw.last_period;
5938
5939 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5940 return perf_swevent_overflow(event, 1, data, regs);
5941
5942 if (local64_add_negative(nr, &hwc->period_left))
5943 return;
5944
5945 perf_swevent_overflow(event, 0, data, regs);
5946 }
5947
5948 static int perf_exclude_event(struct perf_event *event,
5949 struct pt_regs *regs)
5950 {
5951 if (event->hw.state & PERF_HES_STOPPED)
5952 return 1;
5953
5954 if (regs) {
5955 if (event->attr.exclude_user && user_mode(regs))
5956 return 1;
5957
5958 if (event->attr.exclude_kernel && !user_mode(regs))
5959 return 1;
5960 }
5961
5962 return 0;
5963 }
5964
5965 static int perf_swevent_match(struct perf_event *event,
5966 enum perf_type_id type,
5967 u32 event_id,
5968 struct perf_sample_data *data,
5969 struct pt_regs *regs)
5970 {
5971 if (event->attr.type != type)
5972 return 0;
5973
5974 if (event->attr.config != event_id)
5975 return 0;
5976
5977 if (perf_exclude_event(event, regs))
5978 return 0;
5979
5980 return 1;
5981 }
5982
5983 static inline u64 swevent_hash(u64 type, u32 event_id)
5984 {
5985 u64 val = event_id | (type << 32);
5986
5987 return hash_64(val, SWEVENT_HLIST_BITS);
5988 }
5989
5990 static inline struct hlist_head *
5991 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
5992 {
5993 u64 hash = swevent_hash(type, event_id);
5994
5995 return &hlist->heads[hash];
5996 }
5997
5998 /* For the read side: events when they trigger */
5999 static inline struct hlist_head *
6000 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
6001 {
6002 struct swevent_hlist *hlist;
6003
6004 hlist = rcu_dereference(swhash->swevent_hlist);
6005 if (!hlist)
6006 return NULL;
6007
6008 return __find_swevent_head(hlist, type, event_id);
6009 }
6010
6011 /* For the event head insertion and removal in the hlist */
6012 static inline struct hlist_head *
6013 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
6014 {
6015 struct swevent_hlist *hlist;
6016 u32 event_id = event->attr.config;
6017 u64 type = event->attr.type;
6018
6019 /*
6020 * Event scheduling is always serialized against hlist allocation
6021 * and release; the context lock guarantees that, which makes the
6022 * protected version of rcu_dereference() suitable here.
6023 */
6024 hlist = rcu_dereference_protected(swhash->swevent_hlist,
6025 lockdep_is_held(&event->ctx->lock));
6026 if (!hlist)
6027 return NULL;
6028
6029 return __find_swevent_head(hlist, type, event_id);
6030 }
6031
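/*
 * Deliver one software event occurrence: look up the hash bucket for
 * (type, event_id) on this CPU and feed every matching event.
 */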
6032 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
6033 u64 nr,
6034 struct perf_sample_data *data,
6035 struct pt_regs *regs)
6036 {
6037 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6038 struct perf_event *event;
6039 struct hlist_head *head;
6040
6041 rcu_read_lock();
6042 head = find_swevent_head_rcu(swhash, type, event_id);
6043 if (!head)
6044 goto end;
6045
6046 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6047 if (perf_swevent_match(event, type, event_id, data, regs))
6048 perf_swevent_event(event, nr, data, regs);
6049 }
6050 end:
6051 rcu_read_unlock();
6052 }
6053
6054 DEFINE_PER_CPU(struct pt_regs, __perf_regs[4]);
6055
6056 int perf_swevent_get_recursion_context(void)
6057 {
6058 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6059
6060 return get_recursion_context(swhash->recursion);
6061 }
6062 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
6063
6064 inline void perf_swevent_put_recursion_context(int rctx)
6065 {
6066 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6067
6068 put_recursion_context(swhash->recursion, rctx);
6069 }
6070
6071 void ___perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6072 {
6073 struct perf_sample_data data;
6074
6075 if (WARN_ON_ONCE(!regs))
6076 return;
6077
6078 perf_sample_data_init(&data, addr, 0);
6079 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
6080 }
6081
6082 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
6083 {
6084 int rctx;
6085
6086 preempt_disable_notrace();
6087 rctx = perf_swevent_get_recursion_context();
6088 if (unlikely(rctx < 0))
6089 goto fail;
6090
6091 ___perf_sw_event(event_id, nr, regs, addr);
6092
6093 perf_swevent_put_recursion_context(rctx);
6094 fail:
6095 preempt_enable_notrace();
6096 }
6097
6098 static void perf_swevent_read(struct perf_event *event)
6099 {
6100 }
6101
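/*
 * pmu::add for software events: prime the sample period and hash the
 * event into this CPU's swevent table so do_perf_sw_event() can find it.
 */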
6102 static int perf_swevent_add(struct perf_event *event, int flags)
6103 {
6104 struct swevent_htable *swhash = this_cpu_ptr(&swevent_htable);
6105 struct hw_perf_event *hwc = &event->hw;
6106 struct hlist_head *head;
6107
6108 if (is_sampling_event(event)) {
6109 hwc->last_period = hwc->sample_period;
6110 perf_swevent_set_period(event);
6111 }
6112
6113 hwc->state = !(flags & PERF_EF_START);
6114
6115 head = find_swevent_head(swhash, event);
6116 if (WARN_ON_ONCE(!head))
6117 return -EINVAL;
6118
6119 hlist_add_head_rcu(&event->hlist_entry, head);
6120
6121 return 0;
6122 }
6123
6124 static void perf_swevent_del(struct perf_event *event, int flags)
6125 {
6126 hlist_del_rcu(&event->hlist_entry);
6127 }
6128
6129 static void perf_swevent_start(struct perf_event *event, int flags)
6130 {
6131 event->hw.state = 0;
6132 }
6133
6134 static void perf_swevent_stop(struct perf_event *event, int flags)
6135 {
6136 event->hw.state = PERF_HES_STOPPED;
6137 }
6138
6139 /* Deref the hlist from the update side */
6140 static inline struct swevent_hlist *
6141 swevent_hlist_deref(struct swevent_htable *swhash)
6142 {
6143 return rcu_dereference_protected(swhash->swevent_hlist,
6144 lockdep_is_held(&swhash->hlist_mutex));
6145 }
6146
6147 static void swevent_hlist_release(struct swevent_htable *swhash)
6148 {
6149 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
6150
6151 if (!hlist)
6152 return;
6153
6154 RCU_INIT_POINTER(swhash->swevent_hlist, NULL);
6155 kfree_rcu(hlist, rcu_head);
6156 }
6157
6158 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
6159 {
6160 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6161
6162 mutex_lock(&swhash->hlist_mutex);
6163
6164 if (!--swhash->hlist_refcount)
6165 swevent_hlist_release(swhash);
6166
6167 mutex_unlock(&swhash->hlist_mutex);
6168 }
6169
6170 static void swevent_hlist_put(struct perf_event *event)
6171 {
6172 int cpu;
6173
6174 for_each_possible_cpu(cpu)
6175 swevent_hlist_put_cpu(event, cpu);
6176 }
6177
6178 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
6179 {
6180 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
6181 int err = 0;
6182
6183 mutex_lock(&swhash->hlist_mutex);
6184 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
6185 struct swevent_hlist *hlist;
6186
6187 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
6188 if (!hlist) {
6189 err = -ENOMEM;
6190 goto exit;
6191 }
6192 rcu_assign_pointer(swhash->swevent_hlist, hlist);
6193 }
6194 swhash->hlist_refcount++;
6195 exit:
6196 mutex_unlock(&swhash->hlist_mutex);
6197
6198 return err;
6199 }
6200
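/*
 * Grab a hashtable reference on every possible CPU, allocating the
 * hlist where needed; on failure, drop the references taken so far.
 */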
6201 static int swevent_hlist_get(struct perf_event *event)
6202 {
6203 int err;
6204 int cpu, failed_cpu;
6205
6206 get_online_cpus();
6207 for_each_possible_cpu(cpu) {
6208 err = swevent_hlist_get_cpu(event, cpu);
6209 if (err) {
6210 failed_cpu = cpu;
6211 goto fail;
6212 }
6213 }
6214 put_online_cpus();
6215
6216 return 0;
6217 fail:
6218 for_each_possible_cpu(cpu) {
6219 if (cpu == failed_cpu)
6220 break;
6221 swevent_hlist_put_cpu(event, cpu);
6222 }
6223
6224 put_online_cpus();
6225 return err;
6226 }
6227
6228 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
6229
6230 static void sw_perf_event_destroy(struct perf_event *event)
6231 {
6232 u64 event_id = event->attr.config;
6233
6234 WARN_ON(event->parent);
6235
6236 static_key_slow_dec(&perf_swevent_enabled[event_id]);
6237 swevent_hlist_put(event);
6238 }
6239
6240 static int perf_swevent_init(struct perf_event *event)
6241 {
6242 u64 event_id = event->attr.config;
6243
6244 if (event->attr.type != PERF_TYPE_SOFTWARE)
6245 return -ENOENT;
6246
6247 /*
6248 * no branch sampling for software events
6249 */
6250 if (has_branch_stack(event))
6251 return -EOPNOTSUPP;
6252
6253 switch (event_id) {
6254 case PERF_COUNT_SW_CPU_CLOCK:
6255 case PERF_COUNT_SW_TASK_CLOCK:
6256 return -ENOENT;
6257
6258 default:
6259 break;
6260 }
6261
6262 if (event_id >= PERF_COUNT_SW_MAX)
6263 return -ENOENT;
6264
6265 if (!event->parent) {
6266 int err;
6267
6268 err = swevent_hlist_get(event);
6269 if (err)
6270 return err;
6271
6272 static_key_slow_inc(&perf_swevent_enabled[event_id]);
6273 event->destroy = sw_perf_event_destroy;
6274 }
6275
6276 return 0;
6277 }
6278
6279 static struct pmu perf_swevent = {
6280 .task_ctx_nr = perf_sw_context,
6281
6282 .event_init = perf_swevent_init,
6283 .add = perf_swevent_add,
6284 .del = perf_swevent_del,
6285 .start = perf_swevent_start,
6286 .stop = perf_swevent_stop,
6287 .read = perf_swevent_read,
6288 };
6289
6290 #ifdef CONFIG_EVENT_TRACING
6291
6292 static int perf_tp_filter_match(struct perf_event *event,
6293 struct perf_sample_data *data)
6294 {
6295 void *record = data->raw->data;
6296
6297 if (likely(!event->filter) || filter_match_preds(event->filter, record))
6298 return 1;
6299 return 0;
6300 }
6301
6302 static int perf_tp_event_match(struct perf_event *event,
6303 struct perf_sample_data *data,
6304 struct pt_regs *regs)
6305 {
6306 if (event->hw.state & PERF_HES_STOPPED)
6307 return 0;
6308 /*
6309 * All tracepoints are from kernel-space.
6310 */
6311 if (event->attr.exclude_kernel)
6312 return 0;
6313
6314 if (!perf_tp_filter_match(event, data))
6315 return 0;
6316
6317 return 1;
6318 }
6319
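/*
 * Tracepoint entry point: build a raw sample from the trace record and
 * deliver it to every event hashed on this tracepoint; if a target task
 * was given, also walk that task's software context for matching
 * tracepoint events.
 */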
6320 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
6321 struct pt_regs *regs, struct hlist_head *head, int rctx,
6322 struct task_struct *task)
6323 {
6324 struct perf_sample_data data;
6325 struct perf_event *event;
6326
6327 struct perf_raw_record raw = {
6328 .size = entry_size,
6329 .data = record,
6330 };
6331
6332 perf_sample_data_init(&data, addr, 0);
6333 data.raw = &raw;
6334
6335 hlist_for_each_entry_rcu(event, head, hlist_entry) {
6336 if (perf_tp_event_match(event, &data, regs))
6337 perf_swevent_event(event, count, &data, regs);
6338 }
6339
6340 /*
6341 * If we were given a target task, also iterate its context and
6342 * deliver this event there too.
6343 */
6344 if (task && task != current) {
6345 struct perf_event_context *ctx;
6346 struct trace_entry *entry = record;
6347
6348 rcu_read_lock();
6349 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
6350 if (!ctx)
6351 goto unlock;
6352
6353 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
6354 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6355 continue;
6356 if (event->attr.config != entry->type)
6357 continue;
6358 if (perf_tp_event_match(event, &data, regs))
6359 perf_swevent_event(event, count, &data, regs);
6360 }
6361 unlock:
6362 rcu_read_unlock();
6363 }
6364
6365 perf_swevent_put_recursion_context(rctx);
6366 }
6367 EXPORT_SYMBOL_GPL(perf_tp_event);
6368
6369 static void tp_perf_event_destroy(struct perf_event *event)
6370 {
6371 perf_trace_destroy(event);
6372 }
6373
6374 static int perf_tp_event_init(struct perf_event *event)
6375 {
6376 int err;
6377
6378 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6379 return -ENOENT;
6380
6381 /*
6382 * no branch sampling for tracepoint events
6383 */
6384 if (has_branch_stack(event))
6385 return -EOPNOTSUPP;
6386
6387 err = perf_trace_init(event);
6388 if (err)
6389 return err;
6390
6391 event->destroy = tp_perf_event_destroy;
6392
6393 return 0;
6394 }
6395
6396 static struct pmu perf_tracepoint = {
6397 .task_ctx_nr = perf_sw_context,
6398
6399 .event_init = perf_tp_event_init,
6400 .add = perf_trace_add,
6401 .del = perf_trace_del,
6402 .start = perf_swevent_start,
6403 .stop = perf_swevent_stop,
6404 .read = perf_swevent_read,
6405 };
6406
6407 static inline void perf_tp_register(void)
6408 {
6409 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
6410 }
6411
6412 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6413 {
6414 char *filter_str;
6415 int ret;
6416
6417 if (event->attr.type != PERF_TYPE_TRACEPOINT)
6418 return -EINVAL;
6419
6420 filter_str = strndup_user(arg, PAGE_SIZE);
6421 if (IS_ERR(filter_str))
6422 return PTR_ERR(filter_str);
6423
6424 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
6425
6426 kfree(filter_str);
6427 return ret;
6428 }
6429
6430 static void perf_event_free_filter(struct perf_event *event)
6431 {
6432 ftrace_profile_free_filter(event);
6433 }
6434
6435 #else
6436
6437 static inline void perf_tp_register(void)
6438 {
6439 }
6440
6441 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
6442 {
6443 return -ENOENT;
6444 }
6445
6446 static void perf_event_free_filter(struct perf_event *event)
6447 {
6448 }
6449
6450 #endif /* CONFIG_EVENT_TRACING */
6451
6452 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6453 void perf_bp_event(struct perf_event *bp, void *data)
6454 {
6455 struct perf_sample_data sample;
6456 struct pt_regs *regs = data;
6457
6458 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
6459
6460 if (!bp->hw.state && !perf_exclude_event(bp, regs))
6461 perf_swevent_event(bp, 1, &sample, regs);
6462 }
6463 #endif
6464
6465 /*
6466 * hrtimer based swevent callback
6467 */
6468
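/*
 * Timer callback that emulates sampling for the software clock events:
 * read the event, emit an overflow if nothing excludes it, and re-arm
 * the timer unless the overflow path asked us to stop.
 */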
6469 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
6470 {
6471 enum hrtimer_restart ret = HRTIMER_RESTART;
6472 struct perf_sample_data data;
6473 struct pt_regs *regs;
6474 struct perf_event *event;
6475 u64 period;
6476
6477 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
6478
6479 if (event->state != PERF_EVENT_STATE_ACTIVE)
6480 return HRTIMER_NORESTART;
6481
6482 event->pmu->read(event);
6483
6484 perf_sample_data_init(&data, 0, event->hw.last_period);
6485 regs = get_irq_regs();
6486
6487 if (regs && !perf_exclude_event(event, regs)) {
6488 if (!(event->attr.exclude_idle && is_idle_task(current)))
6489 if (__perf_event_overflow(event, 1, &data, regs))
6490 ret = HRTIMER_NORESTART;
6491 }
6492
6493 period = max_t(u64, 10000, event->hw.sample_period);
6494 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
6495
6496 return ret;
6497 }
6498
6499 static void perf_swevent_start_hrtimer(struct perf_event *event)
6500 {
6501 struct hw_perf_event *hwc = &event->hw;
6502 s64 period;
6503
6504 if (!is_sampling_event(event))
6505 return;
6506
6507 period = local64_read(&hwc->period_left);
6508 if (period) {
6509 if (period < 0)
6510 period = 10000;
6511
6512 local64_set(&hwc->period_left, 0);
6513 } else {
6514 period = max_t(u64, 10000, hwc->sample_period);
6515 }
6516 __hrtimer_start_range_ns(&hwc->hrtimer,
6517 ns_to_ktime(period), 0,
6518 HRTIMER_MODE_REL_PINNED, 0);
6519 }
6520
6521 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
6522 {
6523 struct hw_perf_event *hwc = &event->hw;
6524
6525 if (is_sampling_event(event)) {
6526 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
6527 local64_set(&hwc->period_left, ktime_to_ns(remaining));
6528
6529 hrtimer_cancel(&hwc->hrtimer);
6530 }
6531 }
6532
6533 static void perf_swevent_init_hrtimer(struct perf_event *event)
6534 {
6535 struct hw_perf_event *hwc = &event->hw;
6536
6537 if (!is_sampling_event(event))
6538 return;
6539
6540 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
6541 hwc->hrtimer.function = perf_swevent_hrtimer;
6542
6543 /*
6544 * Since hrtimers have a fixed rate, we can do a static freq->period
6545 * mapping and avoid the whole period adjust feedback stuff.
6546 */
6547 if (event->attr.freq) {
6548 long freq = event->attr.sample_freq;
6549
6550 event->attr.sample_period = NSEC_PER_SEC / freq;
6551 hwc->sample_period = event->attr.sample_period;
6552 local64_set(&hwc->period_left, hwc->sample_period);
6553 hwc->last_period = hwc->sample_period;
6554 event->attr.freq = 0;
6555 }
6556 }
6557
6558 /*
6559 * Software event: cpu wall time clock
6560 */
6561
6562 static void cpu_clock_event_update(struct perf_event *event)
6563 {
6564 s64 prev;
6565 u64 now;
6566
6567 now = local_clock();
6568 prev = local64_xchg(&event->hw.prev_count, now);
6569 local64_add(now - prev, &event->count);
6570 }
6571
6572 static void cpu_clock_event_start(struct perf_event *event, int flags)
6573 {
6574 local64_set(&event->hw.prev_count, local_clock());
6575 perf_swevent_start_hrtimer(event);
6576 }
6577
6578 static void cpu_clock_event_stop(struct perf_event *event, int flags)
6579 {
6580 perf_swevent_cancel_hrtimer(event);
6581 cpu_clock_event_update(event);
6582 }
6583
6584 static int cpu_clock_event_add(struct perf_event *event, int flags)
6585 {
6586 if (flags & PERF_EF_START)
6587 cpu_clock_event_start(event, flags);
6588
6589 return 0;
6590 }
6591
6592 static void cpu_clock_event_del(struct perf_event *event, int flags)
6593 {
6594 cpu_clock_event_stop(event, flags);
6595 }
6596
6597 static void cpu_clock_event_read(struct perf_event *event)
6598 {
6599 cpu_clock_event_update(event);
6600 }
6601
6602 static int cpu_clock_event_init(struct perf_event *event)
6603 {
6604 if (event->attr.type != PERF_TYPE_SOFTWARE)
6605 return -ENOENT;
6606
6607 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
6608 return -ENOENT;
6609
6610 /*
6611 * no branch sampling for software events
6612 */
6613 if (has_branch_stack(event))
6614 return -EOPNOTSUPP;
6615
6616 perf_swevent_init_hrtimer(event);
6617
6618 return 0;
6619 }
6620
6621 static struct pmu perf_cpu_clock = {
6622 .task_ctx_nr = perf_sw_context,
6623
6624 .event_init = cpu_clock_event_init,
6625 .add = cpu_clock_event_add,
6626 .del = cpu_clock_event_del,
6627 .start = cpu_clock_event_start,
6628 .stop = cpu_clock_event_stop,
6629 .read = cpu_clock_event_read,
6630 };
6631
6632 /*
6633 * Software event: task time clock
6634 */
6635
6636 static void task_clock_event_update(struct perf_event *event, u64 now)
6637 {
6638 u64 prev;
6639 s64 delta;
6640
6641 prev = local64_xchg(&event->hw.prev_count, now);
6642 delta = now - prev;
6643 local64_add(delta, &event->count);
6644 }
6645
6646 static void task_clock_event_start(struct perf_event *event, int flags)
6647 {
6648 local64_set(&event->hw.prev_count, event->ctx->time);
6649 perf_swevent_start_hrtimer(event);
6650 }
6651
6652 static void task_clock_event_stop(struct perf_event *event, int flags)
6653 {
6654 perf_swevent_cancel_hrtimer(event);
6655 task_clock_event_update(event, event->ctx->time);
6656 }
6657
6658 static int task_clock_event_add(struct perf_event *event, int flags)
6659 {
6660 if (flags & PERF_EF_START)
6661 task_clock_event_start(event, flags);
6662
6663 return 0;
6664 }
6665
6666 static void task_clock_event_del(struct perf_event *event, int flags)
6667 {
6668 task_clock_event_stop(event, PERF_EF_UPDATE);
6669 }
6670
6671 static void task_clock_event_read(struct perf_event *event)
6672 {
6673 u64 now = perf_clock();
6674 u64 delta = now - event->ctx->timestamp;
6675 u64 time = event->ctx->time + delta;
6676
6677 task_clock_event_update(event, time);
6678 }
6679
6680 static int task_clock_event_init(struct perf_event *event)
6681 {
6682 if (event->attr.type != PERF_TYPE_SOFTWARE)
6683 return -ENOENT;
6684
6685 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
6686 return -ENOENT;
6687
6688 /*
6689 * no branch sampling for software events
6690 */
6691 if (has_branch_stack(event))
6692 return -EOPNOTSUPP;
6693
6694 perf_swevent_init_hrtimer(event);
6695
6696 return 0;
6697 }
6698
6699 static struct pmu perf_task_clock = {
6700 .task_ctx_nr = perf_sw_context,
6701
6702 .event_init = task_clock_event_init,
6703 .add = task_clock_event_add,
6704 .del = task_clock_event_del,
6705 .start = task_clock_event_start,
6706 .stop = task_clock_event_stop,
6707 .read = task_clock_event_read,
6708 };
6709
6710 static void perf_pmu_nop_void(struct pmu *pmu)
6711 {
6712 }
6713
6714 static int perf_pmu_nop_int(struct pmu *pmu)
6715 {
6716 return 0;
6717 }
6718
6719 static void perf_pmu_start_txn(struct pmu *pmu)
6720 {
6721 perf_pmu_disable(pmu);
6722 }
6723
6724 static int perf_pmu_commit_txn(struct pmu *pmu)
6725 {
6726 perf_pmu_enable(pmu);
6727 return 0;
6728 }
6729
6730 static void perf_pmu_cancel_txn(struct pmu *pmu)
6731 {
6732 perf_pmu_enable(pmu);
6733 }
6734
6735 static int perf_event_idx_default(struct perf_event *event)
6736 {
6737 return 0;
6738 }
6739
6740 /*
6741 * Ensures all contexts with the same task_ctx_nr have the same
6742 * pmu_cpu_context too.
6743 */
6744 static struct perf_cpu_context __percpu *find_pmu_context(int ctxn)
6745 {
6746 struct pmu *pmu;
6747
6748 if (ctxn < 0)
6749 return NULL;
6750
6751 list_for_each_entry(pmu, &pmus, entry) {
6752 if (pmu->task_ctx_nr == ctxn)
6753 return pmu->pmu_cpu_context;
6754 }
6755
6756 return NULL;
6757 }
6758
6759 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
6760 {
6761 int cpu;
6762
6763 for_each_possible_cpu(cpu) {
6764 struct perf_cpu_context *cpuctx;
6765
6766 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6767
6768 if (cpuctx->unique_pmu == old_pmu)
6769 cpuctx->unique_pmu = pmu;
6770 }
6771 }
6772
6773 static void free_pmu_context(struct pmu *pmu)
6774 {
6775 struct pmu *i;
6776
6777 mutex_lock(&pmus_lock);
6778 /*
6779 * Like a real lame refcount.
6780 */
6781 list_for_each_entry(i, &pmus, entry) {
6782 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
6783 update_pmu_context(i, pmu);
6784 goto out;
6785 }
6786 }
6787
6788 free_percpu(pmu->pmu_cpu_context);
6789 out:
6790 mutex_unlock(&pmus_lock);
6791 }
6792 static struct idr pmu_idr;
6793
6794 static ssize_t
6795 type_show(struct device *dev, struct device_attribute *attr, char *page)
6796 {
6797 struct pmu *pmu = dev_get_drvdata(dev);
6798
6799 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6800 }
6801 static DEVICE_ATTR_RO(type);
6802
6803 static ssize_t
6804 perf_event_mux_interval_ms_show(struct device *dev,
6805 struct device_attribute *attr,
6806 char *page)
6807 {
6808 struct pmu *pmu = dev_get_drvdata(dev);
6809
6810 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->hrtimer_interval_ms);
6811 }
6812
6813 static ssize_t
6814 perf_event_mux_interval_ms_store(struct device *dev,
6815 struct device_attribute *attr,
6816 const char *buf, size_t count)
6817 {
6818 struct pmu *pmu = dev_get_drvdata(dev);
6819 int timer, cpu, ret;
6820
6821 ret = kstrtoint(buf, 0, &timer);
6822 if (ret)
6823 return ret;
6824
6825 if (timer < 1)
6826 return -EINVAL;
6827
6828 /* same value, nothing to do */
6829 if (timer == pmu->hrtimer_interval_ms)
6830 return count;
6831
6832 pmu->hrtimer_interval_ms = timer;
6833
6834 /* update all cpuctx for this PMU */
6835 for_each_possible_cpu(cpu) {
6836 struct perf_cpu_context *cpuctx;
6837 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6838 cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
6839
6840 if (hrtimer_active(&cpuctx->hrtimer))
6841 hrtimer_forward_now(&cpuctx->hrtimer, cpuctx->hrtimer_interval);
6842 }
6843
6844 return count;
6845 }
6846 static DEVICE_ATTR_RW(perf_event_mux_interval_ms);
6847
6848 static struct attribute *pmu_dev_attrs[] = {
6849 &dev_attr_type.attr,
6850 &dev_attr_perf_event_mux_interval_ms.attr,
6851 NULL,
6852 };
6853 ATTRIBUTE_GROUPS(pmu_dev);
6854
6855 static int pmu_bus_running;
6856 static struct bus_type pmu_bus = {
6857 .name = "event_source",
6858 .dev_groups = pmu_dev_groups,
6859 };
6860
6861 static void pmu_dev_release(struct device *dev)
6862 {
6863 kfree(dev);
6864 }
6865
6866 static int pmu_dev_alloc(struct pmu *pmu)
6867 {
6868 int ret = -ENOMEM;
6869
6870 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
6871 if (!pmu->dev)
6872 goto out;
6873
6874 pmu->dev->groups = pmu->attr_groups;
6875 device_initialize(pmu->dev);
6876 ret = dev_set_name(pmu->dev, "%s", pmu->name);
6877 if (ret)
6878 goto free_dev;
6879
6880 dev_set_drvdata(pmu->dev, pmu);
6881 pmu->dev->bus = &pmu_bus;
6882 pmu->dev->release = pmu_dev_release;
6883 ret = device_add(pmu->dev);
6884 if (ret)
6885 goto free_dev;
6886
6887 out:
6888 return ret;
6889
6890 free_dev:
6891 put_device(pmu->dev);
6892 goto out;
6893 }
6894
6895 static struct lock_class_key cpuctx_mutex;
6896 static struct lock_class_key cpuctx_lock;
6897
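/*
 * Register a new PMU: allocate its per-cpu state, assign a type (from
 * the idr when the caller passed a negative one), expose it on the
 * event_source bus if that is already up, and fall back to nop/
 * disable-based transaction helpers when the PMU provides none.
 */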
6898 int perf_pmu_register(struct pmu *pmu, const char *name, int type)
6899 {
6900 int cpu, ret;
6901
6902 mutex_lock(&pmus_lock);
6903 ret = -ENOMEM;
6904 pmu->pmu_disable_count = alloc_percpu(int);
6905 if (!pmu->pmu_disable_count)
6906 goto unlock;
6907
6908 pmu->type = -1;
6909 if (!name)
6910 goto skip_type;
6911 pmu->name = name;
6912
6913 if (type < 0) {
6914 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
6915 if (type < 0) {
6916 ret = type;
6917 goto free_pdc;
6918 }
6919 }
6920 pmu->type = type;
6921
6922 if (pmu_bus_running) {
6923 ret = pmu_dev_alloc(pmu);
6924 if (ret)
6925 goto free_idr;
6926 }
6927
6928 skip_type:
6929 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6930 if (pmu->pmu_cpu_context)
6931 goto got_cpu_context;
6932
6933 ret = -ENOMEM;
6934 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6935 if (!pmu->pmu_cpu_context)
6936 goto free_dev;
6937
6938 for_each_possible_cpu(cpu) {
6939 struct perf_cpu_context *cpuctx;
6940
6941 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6942 __perf_event_init_context(&cpuctx->ctx);
6943 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6944 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6945 cpuctx->ctx.pmu = pmu;
6946
6947 __perf_cpu_hrtimer_init(cpuctx, cpu);
6948
6949 INIT_LIST_HEAD(&cpuctx->rotation_list);
6950 cpuctx->unique_pmu = pmu;
6951 }
6952
6953 got_cpu_context:
6954 if (!pmu->start_txn) {
6955 if (pmu->pmu_enable) {
6956 /*
6957 * If we have pmu_enable/pmu_disable calls, install
6958 * transaction stubs that use that to try and batch
6959 * hardware accesses.
6960 */
6961 pmu->start_txn = perf_pmu_start_txn;
6962 pmu->commit_txn = perf_pmu_commit_txn;
6963 pmu->cancel_txn = perf_pmu_cancel_txn;
6964 } else {
6965 pmu->start_txn = perf_pmu_nop_void;
6966 pmu->commit_txn = perf_pmu_nop_int;
6967 pmu->cancel_txn = perf_pmu_nop_void;
6968 }
6969 }
6970
6971 if (!pmu->pmu_enable) {
6972 pmu->pmu_enable = perf_pmu_nop_void;
6973 pmu->pmu_disable = perf_pmu_nop_void;
6974 }
6975
6976 if (!pmu->event_idx)
6977 pmu->event_idx = perf_event_idx_default;
6978
6979 list_add_rcu(&pmu->entry, &pmus);
6980 ret = 0;
6981 unlock:
6982 mutex_unlock(&pmus_lock);
6983
6984 return ret;
6985
6986 free_dev:
6987 device_del(pmu->dev);
6988 put_device(pmu->dev);
6989
6990 free_idr:
6991 if (pmu->type >= PERF_TYPE_MAX)
6992 idr_remove(&pmu_idr, pmu->type);
6993
6994 free_pdc:
6995 free_percpu(pmu->pmu_disable_count);
6996 goto unlock;
6997 }
6998 EXPORT_SYMBOL_GPL(perf_pmu_register);
6999
7000 void perf_pmu_unregister(struct pmu *pmu)
7001 {
7002 mutex_lock(&pmus_lock);
7003 list_del_rcu(&pmu->entry);
7004 mutex_unlock(&pmus_lock);
7005
7006 /*
7007 * We dereference the pmu list under both SRCU and regular RCU, so
7008 * synchronize against both of those.
7009 */
7010 synchronize_srcu(&pmus_srcu);
7011 synchronize_rcu();
7012
7013 free_percpu(pmu->pmu_disable_count);
7014 if (pmu->type >= PERF_TYPE_MAX)
7015 idr_remove(&pmu_idr, pmu->type);
7016 device_del(pmu->dev);
7017 put_device(pmu->dev);
7018 free_pmu_context(pmu);
7019 }
7020 EXPORT_SYMBOL_GPL(perf_pmu_unregister);
7021
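/*
 * Find a PMU for the new event: try a direct idr lookup on attr.type
 * first, then offer the event to each registered PMU until one accepts
 * it (any error other than -ENOENT from event_init is final).  A module
 * reference is taken on the PMU that is tried.
 */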
7022 struct pmu *perf_init_event(struct perf_event *event)
7023 {
7024 struct pmu *pmu = NULL;
7025 int idx;
7026 int ret;
7027
7028 idx = srcu_read_lock(&pmus_srcu);
7029
7030 rcu_read_lock();
7031 pmu = idr_find(&pmu_idr, event->attr.type);
7032 rcu_read_unlock();
7033 if (pmu) {
7034 if (!try_module_get(pmu->module)) {
7035 pmu = ERR_PTR(-ENODEV);
7036 goto unlock;
7037 }
7038 event->pmu = pmu;
7039 ret = pmu->event_init(event);
7040 if (ret)
7041 pmu = ERR_PTR(ret);
7042 goto unlock;
7043 }
7044
7045 list_for_each_entry_rcu(pmu, &pmus, entry) {
7046 if (!try_module_get(pmu->module)) {
7047 pmu = ERR_PTR(-ENODEV);
7048 goto unlock;
7049 }
7050 event->pmu = pmu;
7051 ret = pmu->event_init(event);
7052 if (!ret)
7053 goto unlock;
7054
7055 if (ret != -ENOENT) {
7056 pmu = ERR_PTR(ret);
7057 goto unlock;
7058 }
7059 }
7060 pmu = ERR_PTR(-ENOENT);
7061 unlock:
7062 srcu_read_unlock(&pmus_srcu, idx);
7063
7064 return pmu;
7065 }
7066
7067 static void account_event_cpu(struct perf_event *event, int cpu)
7068 {
7069 if (event->parent)
7070 return;
7071
7072 if (has_branch_stack(event)) {
7073 if (!(event->attach_state & PERF_ATTACH_TASK))
7074 atomic_inc(&per_cpu(perf_branch_stack_events, cpu));
7075 }
7076 if (is_cgroup_event(event))
7077 atomic_inc(&per_cpu(perf_cgroup_events, cpu));
7078 }
7079
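/*
 * Account a freshly allocated (non-inherited) event: bump the counters
 * and static keys that gate the scheduler hooks, mmap/comm/task
 * tracking and the freq/nohz machinery, plus the per-cpu accounting.
 */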
7080 static void account_event(struct perf_event *event)
7081 {
7082 if (event->parent)
7083 return;
7084
7085 if (event->attach_state & PERF_ATTACH_TASK)
7086 static_key_slow_inc(&perf_sched_events.key);
7087 if (event->attr.mmap || event->attr.mmap_data)
7088 atomic_inc(&nr_mmap_events);
7089 if (event->attr.comm)
7090 atomic_inc(&nr_comm_events);
7091 if (event->attr.task)
7092 atomic_inc(&nr_task_events);
7093 if (event->attr.freq) {
7094 if (atomic_inc_return(&nr_freq_events) == 1)
7095 tick_nohz_full_kick_all();
7096 }
7097 if (has_branch_stack(event))
7098 static_key_slow_inc(&perf_sched_events.key);
7099 if (is_cgroup_event(event))
7100 static_key_slow_inc(&perf_sched_events.key);
7101
7102 account_event_cpu(event, event->cpu);
7103 }
7104
7105 /*
7106 * Allocate and initialize an event structure
7107 */
7108 static struct perf_event *
7109 perf_event_alloc(struct perf_event_attr *attr, int cpu,
7110 struct task_struct *task,
7111 struct perf_event *group_leader,
7112 struct perf_event *parent_event,
7113 perf_overflow_handler_t overflow_handler,
7114 void *context)
7115 {
7116 struct pmu *pmu;
7117 struct perf_event *event;
7118 struct hw_perf_event *hwc;
7119 long err = -EINVAL;
7120
7121 if ((unsigned)cpu >= nr_cpu_ids) {
7122 if (!task || cpu != -1)
7123 return ERR_PTR(-EINVAL);
7124 }
7125
7126 event = kzalloc(sizeof(*event), GFP_KERNEL);
7127 if (!event)
7128 return ERR_PTR(-ENOMEM);
7129
7130 /*
7131 * Single events are their own group leaders, with an
7132 * empty sibling list:
7133 */
7134 if (!group_leader)
7135 group_leader = event;
7136
7137 mutex_init(&event->child_mutex);
7138 INIT_LIST_HEAD(&event->child_list);
7139
7140 INIT_LIST_HEAD(&event->group_entry);
7141 INIT_LIST_HEAD(&event->event_entry);
7142 INIT_LIST_HEAD(&event->sibling_list);
7143 INIT_LIST_HEAD(&event->rb_entry);
7144 INIT_LIST_HEAD(&event->active_entry);
7145 INIT_HLIST_NODE(&event->hlist_entry);
7146
7147
7148 init_waitqueue_head(&event->waitq);
7149 init_irq_work(&event->pending, perf_pending_event);
7150
7151 mutex_init(&event->mmap_mutex);
7152
7153 atomic_long_set(&event->refcount, 1);
7154 event->cpu = cpu;
7155 event->attr = *attr;
7156 event->group_leader = group_leader;
7157 event->pmu = NULL;
7158 event->oncpu = -1;
7159
7160 event->parent = parent_event;
7161
7162 event->ns = get_pid_ns(task_active_pid_ns(current));
7163 event->id = atomic64_inc_return(&perf_event_id);
7164
7165 event->state = PERF_EVENT_STATE_INACTIVE;
7166
7167 if (task) {
7168 event->attach_state = PERF_ATTACH_TASK;
7169
7170 if (attr->type == PERF_TYPE_TRACEPOINT)
7171 event->hw.tp_target = task;
7172 #ifdef CONFIG_HAVE_HW_BREAKPOINT
7173 /*
7174 * hw_breakpoint is a bit difficult here..
7175 */
7176 else if (attr->type == PERF_TYPE_BREAKPOINT)
7177 event->hw.bp_target = task;
7178 #endif
7179 }
7180
7181 if (!overflow_handler && parent_event) {
7182 overflow_handler = parent_event->overflow_handler;
7183 context = parent_event->overflow_handler_context;
7184 }
7185
7186 event->overflow_handler = overflow_handler;
7187 event->overflow_handler_context = context;
7188
7189 perf_event__state_init(event);
7190
7191 pmu = NULL;
7192
7193 hwc = &event->hw;
7194 hwc->sample_period = attr->sample_period;
7195 if (attr->freq && attr->sample_freq)
7196 hwc->sample_period = 1;
7197 hwc->last_period = hwc->sample_period;
7198
7199 local64_set(&hwc->period_left, hwc->sample_period);
7200
7201 /*
7202 * we currently do not support PERF_FORMAT_GROUP on inherited events
7203 */
7204 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
7205 goto err_ns;
7206
7207 pmu = perf_init_event(event);
7208 if (!pmu)
7209 goto err_ns;
7210 else if (IS_ERR(pmu)) {
7211 err = PTR_ERR(pmu);
7212 goto err_ns;
7213 }
7214
7215 if (!event->parent) {
7216 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
7217 err = get_callchain_buffers();
7218 if (err)
7219 goto err_pmu;
7220 }
7221 }
7222
7223 return event;
7224
7225 err_pmu:
7226 if (event->destroy)
7227 event->destroy(event);
7228 module_put(pmu->module);
7229 err_ns:
7230 if (event->ns)
7231 put_pid_ns(event->ns);
7232 kfree(event);
7233
7234 return ERR_PTR(err);
7235 }
7236
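/*
 * Copy the attribute structure from user space, coping with older and
 * newer ABI sizes (any bytes beyond what we know must be zero), and
 * sanity-check the sampling, branch and user-stack related fields.
 */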
7237 static int perf_copy_attr(struct perf_event_attr __user *uattr,
7238 struct perf_event_attr *attr)
7239 {
7240 u32 size;
7241 int ret;
7242
7243 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
7244 return -EFAULT;
7245
7246 /*
7247 * Zero the full structure, so that a short copy leaves the rest zeroed.
7248 */
7249 memset(attr, 0, sizeof(*attr));
7250
7251 ret = get_user(size, &uattr->size);
7252 if (ret)
7253 return ret;
7254
7255 if (size > PAGE_SIZE) /* silly large */
7256 goto err_size;
7257
7258 if (!size) /* abi compat */
7259 size = PERF_ATTR_SIZE_VER0;
7260
7261 if (size < PERF_ATTR_SIZE_VER0)
7262 goto err_size;
7263
7264 /*
7265 * If we're handed a bigger struct than we know of,
7266 * ensure all the unknown bits are 0 - i.e. new
7267 * user-space does not rely on any kernel feature
7268 * extensions we don't know about yet.
7269 */
7270 if (size > sizeof(*attr)) {
7271 unsigned char __user *addr;
7272 unsigned char __user *end;
7273 unsigned char val;
7274
7275 addr = (void __user *)uattr + sizeof(*attr);
7276 end = (void __user *)uattr + size;
7277
7278 for (; addr < end; addr++) {
7279 ret = get_user(val, addr);
7280 if (ret)
7281 return ret;
7282 if (val)
7283 goto err_size;
7284 }
7285 size = sizeof(*attr);
7286 }
7287
7288 ret = copy_from_user(attr, uattr, size);
7289 if (ret)
7290 return -EFAULT;
7291
7292 if (attr->__reserved_1)
7293 return -EINVAL;
7294
7295 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
7296 return -EINVAL;
7297
7298 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
7299 return -EINVAL;
7300
7301 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
7302 u64 mask = attr->branch_sample_type;
7303
7304 /* only using defined bits */
7305 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
7306 return -EINVAL;
7307
7308 /* at least one branch bit must be set */
7309 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
7310 return -EINVAL;
7311
7312 /* propagate priv level, when not set for branch */
7313 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
7314
7315 /* exclude_kernel checked on syscall entry */
7316 if (!attr->exclude_kernel)
7317 mask |= PERF_SAMPLE_BRANCH_KERNEL;
7318
7319 if (!attr->exclude_user)
7320 mask |= PERF_SAMPLE_BRANCH_USER;
7321
7322 if (!attr->exclude_hv)
7323 mask |= PERF_SAMPLE_BRANCH_HV;
7324 /*
7325 * adjust user setting (for HW filter setup)
7326 */
7327 attr->branch_sample_type = mask;
7328 }
7329 /* privileged levels capture (kernel, hv): check permissions */
7330 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
7331 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7332 return -EACCES;
7333 }
7334
7335 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
7336 ret = perf_reg_validate(attr->sample_regs_user);
7337 if (ret)
7338 return ret;
7339 }
7340
7341 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
7342 if (!arch_perf_have_user_stack_dump())
7343 return -ENOSYS;
7344
7345 /*
7346 * We have __u32 type for the size, but so far
7347 * we can only use __u16 as maximum due to the
7348 * __u16 sample size limit.
7349 */
7350 if (attr->sample_stack_user >= USHRT_MAX)
7351 ret = -EINVAL;
7352 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
7353 ret = -EINVAL;
7354 }
7355
7356 out:
7357 return ret;
7358
7359 err_size:
7360 put_user(sizeof(*attr), &uattr->size);
7361 ret = -E2BIG;
7362 goto out;
7363 }
7364
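/*
 * Redirect this event's output into output_event's ring buffer, subject
 * to the usual restrictions: same cpu, or same task context for per-task
 * buffers, and no active mmap() on the event itself.
 */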
7365 static int
7366 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
7367 {
7368 struct ring_buffer *rb = NULL;
7369 int ret = -EINVAL;
7370
7371 if (!output_event)
7372 goto set;
7373
7374 /* don't allow circular references */
7375 if (event == output_event)
7376 goto out;
7377
7378 /*
7379 * Don't allow cross-cpu buffers
7380 */
7381 if (output_event->cpu != event->cpu)
7382 goto out;
7383
7384 /*
7385 * If it's not a per-cpu rb, it must be the same task.
7386 */
7387 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
7388 goto out;
7389
7390 set:
7391 mutex_lock(&event->mmap_mutex);
7392 /* Can't redirect output if we've got an active mmap() */
7393 if (atomic_read(&event->mmap_count))
7394 goto unlock;
7395
7396 if (output_event) {
7397 /* get the rb we want to redirect to */
7398 rb = ring_buffer_get(output_event);
7399 if (!rb)
7400 goto unlock;
7401 }
7402
7403 ring_buffer_attach(event, rb);
7404
7405 ret = 0;
7406 unlock:
7407 mutex_unlock(&event->mmap_mutex);
7408
7409 out:
7410 return ret;
7411 }
7412
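/*
 * Lock two mutexes in address order so that concurrent callers cannot
 * ABBA-deadlock on each other.
 */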
7413 static void mutex_lock_double(struct mutex *a, struct mutex *b)
7414 {
7415 if (b < a)
7416 swap(a, b);
7417
7418 mutex_lock(a);
7419 mutex_lock_nested(b, SINGLE_DEPTH_NESTING);
7420 }
7421
7422 /*
7423 * Variation on perf_event_ctx_lock_nested(), except we take two context
7424 * mutexes.
7425 */
7426 static struct perf_event_context *
7427 __perf_event_ctx_lock_double(struct perf_event *group_leader,
7428 struct perf_event_context *ctx)
7429 {
7430 struct perf_event_context *gctx;
7431
7432 again:
7433 rcu_read_lock();
7434 gctx = ACCESS_ONCE(group_leader->ctx);
7435 if (!atomic_inc_not_zero(&gctx->refcount)) {
7436 rcu_read_unlock();
7437 goto again;
7438 }
7439 rcu_read_unlock();
7440
7441 mutex_lock_double(&gctx->mutex, &ctx->mutex);
7442
7443 if (group_leader->ctx != gctx) {
7444 mutex_unlock(&ctx->mutex);
7445 mutex_unlock(&gctx->mutex);
7446 put_ctx(gctx);
7447 goto again;
7448 }
7449
7450 return gctx;
7451 }
7452
7453 /**
7454 * sys_perf_event_open - open a performance event, associate it to a task/cpu
7455 *
7456 * @attr_uptr: event_id type attributes for monitoring/sampling
7457 * @pid: target pid
7458 * @cpu: target cpu
7459 * @group_fd: group leader event fd
7460 */
7461 SYSCALL_DEFINE5(perf_event_open,
7462 struct perf_event_attr __user *, attr_uptr,
7463 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
7464 {
7465 struct perf_event *group_leader = NULL, *output_event = NULL;
7466 struct perf_event *event, *sibling;
7467 struct perf_event_attr attr;
7468 struct perf_event_context *ctx, *uninitialized_var(gctx);
7469 struct file *event_file = NULL;
7470 struct fd group = {NULL, 0};
7471 struct task_struct *task = NULL;
7472 struct pmu *pmu;
7473 int event_fd;
7474 int move_group = 0;
7475 int err;
7476 int f_flags = O_RDWR;
7477
7478 /* for future expandability... */
7479 if (flags & ~PERF_FLAG_ALL)
7480 return -EINVAL;
7481
7482 if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
7483 return -EACCES;
7484
7485 err = perf_copy_attr(attr_uptr, &attr);
7486 if (err)
7487 return err;
7488
7489 if (!attr.exclude_kernel) {
7490 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
7491 return -EACCES;
7492 }
7493
7494 if (attr.freq) {
7495 if (attr.sample_freq > sysctl_perf_event_sample_rate)
7496 return -EINVAL;
7497 } else {
7498 if (attr.sample_period & (1ULL << 63))
7499 return -EINVAL;
7500 }
7501
7502 /*
7503 * In cgroup mode, the pid argument is used to pass the fd
7504 * opened to the cgroup directory in cgroupfs. The cpu argument
7505 * designates the cpu on which to monitor threads from that
7506 * cgroup.
7507 */
7508 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
7509 return -EINVAL;
7510
7511 if (flags & PERF_FLAG_FD_CLOEXEC)
7512 f_flags |= O_CLOEXEC;
7513
7514 event_fd = get_unused_fd_flags(f_flags);
7515 if (event_fd < 0)
7516 return event_fd;
7517
7518 if (group_fd != -1) {
7519 err = perf_fget_light(group_fd, &group);
7520 if (err)
7521 goto err_fd;
7522 group_leader = group.file->private_data;
7523 if (flags & PERF_FLAG_FD_OUTPUT)
7524 output_event = group_leader;
7525 if (flags & PERF_FLAG_FD_NO_GROUP)
7526 group_leader = NULL;
7527 }
7528
7529 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
7530 task = find_lively_task_by_vpid(pid);
7531 if (IS_ERR(task)) {
7532 err = PTR_ERR(task);
7533 goto err_group_fd;
7534 }
7535 }
7536
7537 if (task && group_leader &&
7538 group_leader->attr.inherit != attr.inherit) {
7539 err = -EINVAL;
7540 goto err_task;
7541 }
7542
7543 get_online_cpus();
7544
7545 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
7546 NULL, NULL);
7547 if (IS_ERR(event)) {
7548 err = PTR_ERR(event);
7549 goto err_cpus;
7550 }
7551
7552 if (flags & PERF_FLAG_PID_CGROUP) {
7553 err = perf_cgroup_connect(pid, event, &attr, group_leader);
7554 if (err) {
7555 __free_event(event);
7556 goto err_cpus;
7557 }
7558 }
7559
7560 if (is_sampling_event(event)) {
7561 if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
7562 err = -ENOTSUPP;
7563 goto err_alloc;
7564 }
7565 }
7566
7567 account_event(event);
7568
7569 /*
7570 * Special case software events and allow them to be part of
7571 * any hardware group.
7572 */
7573 pmu = event->pmu;
7574
7575 if (group_leader &&
7576 (is_software_event(event) != is_software_event(group_leader))) {
7577 if (is_software_event(event)) {
7578 /*
7579 * If event and group_leader are not both software
7580 * events, and event is one, then the group leader is not.
7581 *
7582 * Allow the addition of software events to !software
7583 * groups, this is safe because software events never
7584 * fail to schedule.
7585 */
7586 pmu = group_leader->pmu;
7587 } else if (is_software_event(group_leader) &&
7588 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7589 /*
7590 * In case the group is a pure software group, and we
7591 * try to add a hardware event, move the whole group to
7592 * the hardware context.
7593 */
7594 move_group = 1;
7595 }
7596 }
7597
7598 /*
7599 * Get the target context (task or percpu):
7600 */
7601 ctx = find_get_context(pmu, task, event->cpu);
7602 if (IS_ERR(ctx)) {
7603 err = PTR_ERR(ctx);
7604 goto err_alloc;
7605 }
7606
7607 if (task) {
7608 put_task_struct(task);
7609 task = NULL;
7610 }
7611
7612 /*
7613 * Look up the group leader (we will attach this event to it):
7614 */
7615 if (group_leader) {
7616 err = -EINVAL;
7617
7618 /*
7619 * Do not allow a recursive hierarchy (this new sibling
7620 * becoming part of another group-sibling):
7621 */
7622 if (group_leader->group_leader != group_leader)
7623 goto err_context;
7624 /*
7625 * Make sure we're both events for the same CPU;
7626 * grouping events for different CPUs is broken, since
7627 * you can never concurrently schedule them anyhow.
7628 */
7629 if (group_leader->cpu != event->cpu)
7630 goto err_context;
7631
7632 /*
7633 * Make sure we're both on the same task, or both
7634 * per-CPU events.
7635 */
7636 if (group_leader->ctx->task != ctx->task)
7637 goto err_context;
7638
7639 /*
7640 * Do not allow to attach to a group in a different task
7641 * or CPU context. If we're moving SW events, we'll fix
7642 * this up later, so allow that.
7643 */
7644 if (!move_group && group_leader->ctx != ctx)
7645 goto err_context;
7646
7647 /*
7648 * Only a group leader can be exclusive or pinned
7649 */
7650 if (attr.exclusive || attr.pinned)
7651 goto err_context;
7652 }
7653
7654 if (output_event) {
7655 err = perf_event_set_output(event, output_event);
7656 if (err)
7657 goto err_context;
7658 }
7659
7660 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event,
7661 f_flags);
7662 if (IS_ERR(event_file)) {
7663 err = PTR_ERR(event_file);
7664 goto err_context;
7665 }
7666
7667 if (move_group) {
7668 gctx = __perf_event_ctx_lock_double(group_leader, ctx);
7669
7670 /*
7671 * Check if we raced against another sys_perf_event_open() call
7672 * moving the software group underneath us.
7673 */
7674 if (!(group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
7675 /*
7676 * If someone moved the group out from under us, check
7677 * if this new event wound up on the same ctx, if so
7678 * it's the regular !move_group case, otherwise fail.
7679 */
7680 if (gctx != ctx) {
7681 err = -EINVAL;
7682 goto err_locked;
7683 } else {
7684 perf_event_ctx_unlock(group_leader, gctx);
7685 move_group = 0;
7686 }
7687 }
7688
7689 /*
7690 * See perf_event_ctx_lock() for comments on the details
7691 * of swizzling perf_event::ctx.
7692 */
7693 perf_remove_from_context(group_leader, false);
7694
7695 /*
7696 * Removing from the context ends up with a disabled
7697 * event. What we want here is the event in its initial
7698 * startup state, ready to be added into the new context.
7699 */
7700 perf_event__state_init(group_leader);
7701 list_for_each_entry(sibling, &group_leader->sibling_list,
7702 group_entry) {
7703 perf_remove_from_context(sibling, false);
7704 perf_event__state_init(sibling);
7705 put_ctx(gctx);
7706 }
7707 } else {
7708 mutex_lock(&ctx->mutex);
7709 }
7710
7711 WARN_ON_ONCE(ctx->parent_ctx);
7712
7713 if (move_group) {
7714 /*
7715 * Wait for everybody to stop referencing the events through
7716 * the old lists, before installing them on the new lists.
7717 */
7718 synchronize_rcu();
7719
7720 perf_install_in_context(ctx, group_leader, group_leader->cpu);
7721 get_ctx(ctx);
7722 list_for_each_entry(sibling, &group_leader->sibling_list,
7723 group_entry) {
7724 perf_install_in_context(ctx, sibling, sibling->cpu);
7725 get_ctx(ctx);
7726 }
7727 }
7728
7729 perf_install_in_context(ctx, event, event->cpu);
7730 perf_unpin_context(ctx);
7731
7732 if (move_group) {
7733 perf_event_ctx_unlock(group_leader, gctx);
7734 put_ctx(gctx);
7735 }
7736 mutex_unlock(&ctx->mutex);
7737
7738 put_online_cpus();
7739
7740 event->owner = current;
7741
7742 mutex_lock(&current->perf_event_mutex);
7743 list_add_tail(&event->owner_entry, &current->perf_event_list);
7744 mutex_unlock(&current->perf_event_mutex);
7745
7746 /*
7747 * Precalculate sample_data sizes
7748 */
7749 perf_event__header_size(event);
7750 perf_event__id_header_size(event);
7751
7752 /*
7753 * Drop the reference on the group_event after placing the
7754 * new event on the sibling_list. This ensures destruction
7755 * of the group leader will find the pointer to itself in
7756 * perf_group_detach().
7757 */
7758 fdput(group);
7759 fd_install(event_fd, event_file);
7760 return event_fd;
7761
7762 err_locked:
7763 if (move_group)
7764 perf_event_ctx_unlock(group_leader, gctx);
7765 mutex_unlock(&ctx->mutex);
7766 fput(event_file);
7767 err_context:
7768 perf_unpin_context(ctx);
7769 put_ctx(ctx);
7770 err_alloc:
7771 free_event(event);
7772 err_cpus:
7773 put_online_cpus();
7774 err_task:
7775 if (task)
7776 put_task_struct(task);
7777 err_group_fd:
7778 fdput(group);
7779 err_fd:
7780 put_unused_fd(event_fd);
7781 return err;
7782 }
7783
7784 /**
7785 * perf_event_create_kernel_counter - create and attach a kernel counter
7786 *
7787 * @attr: attributes of the counter to create
7788 * @cpu: cpu on which the counter is bound
7789 * @task: task to profile (NULL for percpu)
7790 */
7791 struct perf_event *
7792 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
7793 struct task_struct *task,
7794 perf_overflow_handler_t overflow_handler,
7795 void *context)
7796 {
7797 struct perf_event_context *ctx;
7798 struct perf_event *event;
7799 int err;
7800
7801 /*
7802 * Get the target context (task or percpu):
7803 */
7804
7805 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
7806 overflow_handler, context);
7807 if (IS_ERR(event)) {
7808 err = PTR_ERR(event);
7809 goto err;
7810 }
7811
7812 /* Mark owner so we can distinguish it from user events. */
7813 event->owner = EVENT_OWNER_KERNEL;
7814
7815 account_event(event);
7816
7817 ctx = find_get_context(event->pmu, task, cpu);
7818 if (IS_ERR(ctx)) {
7819 err = PTR_ERR(ctx);
7820 goto err_free;
7821 }
7822
7823 WARN_ON_ONCE(ctx->parent_ctx);
7824 mutex_lock(&ctx->mutex);
7825 perf_install_in_context(ctx, event, cpu);
7826 perf_unpin_context(ctx);
7827 mutex_unlock(&ctx->mutex);
7828
7829 return event;
7830
7831 err_free:
7832 free_event(event);
7833 err:
7834 return ERR_PTR(err);
7835 }
7836 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
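
/*
 * Illustrative usage sketch (not part of this file): an in-kernel user such
 * as a watchdog or hw_breakpoint client typically creates a pinned per-CPU
 * counter along these lines, and tears it down again with
 * perf_event_release_kernel(). The attr values and the my_overflow() name
 * below are assumptions chosen for illustration only.
 *
 *	static void my_overflow(struct perf_event *event,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		pr_info("counter crossed its sample_period\n");
 *	}
 *
 *	struct perf_event_attr attr = {
 *		.type		= PERF_TYPE_HARDWARE,
 *		.config		= PERF_COUNT_HW_CPU_CYCLES,
 *		.size		= sizeof(attr),
 *		.pinned		= 1,
 *		.disabled	= 1,
 *		.sample_period	= 1000000,
 *	};
 *	struct perf_event *event;
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *						  my_overflow, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *	perf_event_enable(event);
 */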
7837
7838 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
7839 {
7840 struct perf_event_context *src_ctx;
7841 struct perf_event_context *dst_ctx;
7842 struct perf_event *event, *tmp;
7843 LIST_HEAD(events);
7844
7845 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
7846 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
7847
7848 /*
7849 * See perf_event_ctx_lock() for comments on the details
7850 * of swizzling perf_event::ctx.
7851 */
7852 mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
7853 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
7854 event_entry) {
7855 perf_remove_from_context(event, false);
7856 unaccount_event_cpu(event, src_cpu);
7857 put_ctx(src_ctx);
7858 list_add(&event->migrate_entry, &events);
7859 }
7860
7861 synchronize_rcu();
7862
7863 list_for_each_entry_safe(event, tmp, &events, migrate_entry) {
7864 list_del(&event->migrate_entry);
7865 if (event->state >= PERF_EVENT_STATE_OFF)
7866 event->state = PERF_EVENT_STATE_INACTIVE;
7867 account_event_cpu(event, dst_cpu);
7868 perf_install_in_context(dst_ctx, event, dst_cpu);
7869 get_ctx(dst_ctx);
7870 }
7871 mutex_unlock(&dst_ctx->mutex);
7872 mutex_unlock(&src_ctx->mutex);
7873 }
7874 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
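
/*
 * Illustrative usage sketch (not part of this file): system-wide "uncore"
 * style PMU drivers typically call perf_pmu_migrate_context() from their
 * CPU hotplug handling, moving events off a CPU that is going away onto
 * another CPU that can reach the same hardware. The my_pmu/box names below
 * are assumptions for illustration only.
 *
 *	static void my_pmu_cpu_offline(struct my_pmu *box, int dying_cpu)
 *	{
 *		int target;
 *
 *		target = cpumask_any_but(topology_core_cpumask(dying_cpu),
 *					 dying_cpu);
 *		if (target >= nr_cpu_ids)
 *			return;
 *
 *		perf_pmu_migrate_context(&box->pmu, dying_cpu, target);
 *	}
 */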
7875
7876 static void sync_child_event(struct perf_event *child_event,
7877 struct task_struct *child)
7878 {
7879 struct perf_event *parent_event = child_event->parent;
7880 u64 child_val;
7881
7882 if (child_event->attr.inherit_stat)
7883 perf_event_read_event(child_event, child);
7884
7885 child_val = perf_event_count(child_event);
7886
7887 /*
7888 * Add back the child's count to the parent's count:
7889 */
7890 atomic64_add(child_val, &parent_event->child_count);
7891 atomic64_add(child_event->total_time_enabled,
7892 &parent_event->child_total_time_enabled);
7893 atomic64_add(child_event->total_time_running,
7894 &parent_event->child_total_time_running);
7895
7896 /*
7897 * Remove this event from the parent's list
7898 */
7899 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7900 mutex_lock(&parent_event->child_mutex);
7901 list_del_init(&child_event->child_list);
7902 mutex_unlock(&parent_event->child_mutex);
7903
7904 /*
7905 * Make sure the user/parent gets notified that we just
7906 * lost one event.
7907 */
7908 perf_event_wakeup(parent_event);
7909
7910 /*
7911 * Release the parent event, if this was the last
7912 * reference to it.
7913 */
7914 put_event(parent_event);
7915 }
7916
7917 static void
7918 __perf_event_exit_task(struct perf_event *child_event,
7919 struct perf_event_context *child_ctx,
7920 struct task_struct *child)
7921 {
7922 /*
7923 * Do not destroy the 'original' grouping; because of the context
7924 * switch optimization the original events could've ended up in a
7925 * random child task.
7926 *
7927 * If we were to destroy the original group, all group related
7928 * operations would cease to function properly after this random
7929 * child dies.
7930 *
7931 * Do destroy all inherited groups; we don't care about those,
7932 * and being thorough is better.
7933 */
7934 perf_remove_from_context(child_event, !!child_event->parent);
7935
7936 /*
7937 * It can happen that the parent exits first, and has events
7938 * that are still around due to the child reference. These
7939 * events need to be zapped.
7940 */
7941 if (child_event->parent) {
7942 sync_child_event(child_event, child);
7943 free_event(child_event);
7944 } else {
7945 child_event->state = PERF_EVENT_STATE_EXIT;
7946 perf_event_wakeup(child_event);
7947 }
7948 }
7949
7950 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7951 {
7952 struct perf_event *child_event, *next;
7953 struct perf_event_context *child_ctx, *clone_ctx = NULL;
7954 unsigned long flags;
7955
7956 if (likely(!child->perf_event_ctxp[ctxn])) {
7957 perf_event_task(child, NULL, 0);
7958 return;
7959 }
7960
7961 local_irq_save(flags);
7962 /*
7963 * We can't reschedule here because interrupts are disabled,
7964 * and either the child is current or it is a task that can't be
7965 * scheduled, so we are now safe from a reschedule changing
7966 * our context.
7967 */
7968 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
7969
7970 /*
7971 * Take the context lock here so that if find_get_context is
7972 * reading child->perf_event_ctxp, we wait until it has
7973 * incremented the context's refcount before we do put_ctx below.
7974 */
7975 raw_spin_lock(&child_ctx->lock);
7976 task_ctx_sched_out(child_ctx);
7977 child->perf_event_ctxp[ctxn] = NULL;
7978
7979 /*
7980 * If this context is a clone; unclone it so it can't get
7981 * swapped to another process while we're removing all
7982 * the events from it.
7983 */
7984 clone_ctx = unclone_ctx(child_ctx);
7985 update_context_time(child_ctx);
7986 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7987
7988 if (clone_ctx)
7989 put_ctx(clone_ctx);
7990
7991 /*
7992 * Report the task dead after unscheduling the events so that we
7993 * won't get any samples after PERF_RECORD_EXIT. We can however still
7994 * get a few PERF_RECORD_READ events.
7995 */
7996 perf_event_task(child, child_ctx, 0);
7997
7998 /*
7999 * We can recurse on the same lock type through:
8000 *
8001 * __perf_event_exit_task()
8002 * sync_child_event()
8003 * put_event()
8004 * mutex_lock(&ctx->mutex)
8005 *
8006 * But since it's the parent context it won't be the same instance.
8007 */
8008 mutex_lock(&child_ctx->mutex);
8009
8010 list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
8011 __perf_event_exit_task(child_event, child_ctx, child);
8012
8013 mutex_unlock(&child_ctx->mutex);
8014
8015 put_ctx(child_ctx);
8016 }
8017
8018 /*
8019 * When a child task exits, feed back event values to parent events.
8020 */
8021 void perf_event_exit_task(struct task_struct *child)
8022 {
8023 struct perf_event *event, *tmp;
8024 int ctxn;
8025
8026 mutex_lock(&child->perf_event_mutex);
8027 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
8028 owner_entry) {
8029 list_del_init(&event->owner_entry);
8030
8031 /*
8032 * Ensure the list deletion is visible before we clear
8033 * the owner; this closes a race against perf_release() where
8034 * we need to serialize on the owner->perf_event_mutex.
8035 */
8036 smp_wmb();
8037 event->owner = NULL;
8038 }
8039 mutex_unlock(&child->perf_event_mutex);
8040
8041 for_each_task_context_nr(ctxn)
8042 perf_event_exit_task_context(child, ctxn);
8043 }
8044
8045 static void perf_free_event(struct perf_event *event,
8046 struct perf_event_context *ctx)
8047 {
8048 struct perf_event *parent = event->parent;
8049
8050 if (WARN_ON_ONCE(!parent))
8051 return;
8052
8053 mutex_lock(&parent->child_mutex);
8054 list_del_init(&event->child_list);
8055 mutex_unlock(&parent->child_mutex);
8056
8057 put_event(parent);
8058
8059 perf_group_detach(event);
8060 list_del_event(event, ctx);
8061 free_event(event);
8062 }
8063
8064 /*
8065 * Free an unexposed, unused context created by inheritance in
8066 * perf_event_init_task() below; used by fork() in case of failure.
8067 */
8068 void perf_event_free_task(struct task_struct *task)
8069 {
8070 struct perf_event_context *ctx;
8071 struct perf_event *event, *tmp;
8072 int ctxn;
8073
8074 for_each_task_context_nr(ctxn) {
8075 ctx = task->perf_event_ctxp[ctxn];
8076 if (!ctx)
8077 continue;
8078
8079 mutex_lock(&ctx->mutex);
8080 again:
8081 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
8082 group_entry)
8083 perf_free_event(event, ctx);
8084
8085 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
8086 group_entry)
8087 perf_free_event(event, ctx);
8088
8089 if (!list_empty(&ctx->pinned_groups) ||
8090 !list_empty(&ctx->flexible_groups))
8091 goto again;
8092
8093 mutex_unlock(&ctx->mutex);
8094
8095 put_ctx(ctx);
8096 }
8097 }
8098
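/*
 * Final sanity check, run when the task_struct itself is about to be freed
 * (typically from __put_task_struct()): by that point perf_event_exit_task()
 * and/or perf_event_free_task() must already have torn down every per-task
 * context, so all that is left to do is assert that they are gone.
 */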
8099 void perf_event_delayed_put(struct task_struct *task)
8100 {
8101 int ctxn;
8102
8103 for_each_task_context_nr(ctxn)
8104 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
8105 }
8106
8107 /*
8108 * Inherit an event from the parent task to the child task:
8109 */
8110 static struct perf_event *
8111 inherit_event(struct perf_event *parent_event,
8112 struct task_struct *parent,
8113 struct perf_event_context *parent_ctx,
8114 struct task_struct *child,
8115 struct perf_event *group_leader,
8116 struct perf_event_context *child_ctx)
8117 {
8118 enum perf_event_active_state parent_state = parent_event->state;
8119 struct perf_event *child_event;
8120 unsigned long flags;
8121
8122 /*
8123 * Instead of creating recursive hierarchies of events,
8124 * we link inherited events back to the original parent,
8125 * which is guaranteed to have a filp that we use as the
8126 * reference count:
8127 */
8128 if (parent_event->parent)
8129 parent_event = parent_event->parent;
8130
8131 child_event = perf_event_alloc(&parent_event->attr,
8132 parent_event->cpu,
8133 child,
8134 group_leader, parent_event,
8135 NULL, NULL);
8136 if (IS_ERR(child_event))
8137 return child_event;
8138
8139 if (is_orphaned_event(parent_event) ||
8140 !atomic_long_inc_not_zero(&parent_event->refcount)) {
8141 free_event(child_event);
8142 return NULL;
8143 }
8144
8145 get_ctx(child_ctx);
8146
8147 /*
8148 * Make the child state follow the state of the parent event,
8149 * not its attr.disabled bit. We hold the parent's mutex,
8150 * so we won't race with perf_event_{en, dis}able_family.
8151 */
8152 if (parent_state >= PERF_EVENT_STATE_INACTIVE)
8153 child_event->state = PERF_EVENT_STATE_INACTIVE;
8154 else
8155 child_event->state = PERF_EVENT_STATE_OFF;
8156
8157 if (parent_event->attr.freq) {
8158 u64 sample_period = parent_event->hw.sample_period;
8159 struct hw_perf_event *hwc = &child_event->hw;
8160
8161 hwc->sample_period = sample_period;
8162 hwc->last_period = sample_period;
8163
8164 local64_set(&hwc->period_left, sample_period);
8165 }
8166
8167 child_event->ctx = child_ctx;
8168 child_event->overflow_handler = parent_event->overflow_handler;
8169 child_event->overflow_handler_context
8170 = parent_event->overflow_handler_context;
8171
8172 /*
8173 * Precalculate sample_data sizes
8174 */
8175 perf_event__header_size(child_event);
8176 perf_event__id_header_size(child_event);
8177
8178 /*
8179 * Link it up in the child's context:
8180 */
8181 raw_spin_lock_irqsave(&child_ctx->lock, flags);
8182 add_event_to_ctx(child_event, child_ctx);
8183 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
8184
8185 /*
8186 * Link this into the parent event's child list
8187 */
8188 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
8189 mutex_lock(&parent_event->child_mutex);
8190 list_add_tail(&child_event->child_list, &parent_event->child_list);
8191 mutex_unlock(&parent_event->child_mutex);
8192
8193 return child_event;
8194 }
8195
8196 static int inherit_group(struct perf_event *parent_event,
8197 struct task_struct *parent,
8198 struct perf_event_context *parent_ctx,
8199 struct task_struct *child,
8200 struct perf_event_context *child_ctx)
8201 {
8202 struct perf_event *leader;
8203 struct perf_event *sub;
8204 struct perf_event *child_ctr;
8205
8206 leader = inherit_event(parent_event, parent, parent_ctx,
8207 child, NULL, child_ctx);
8208 if (IS_ERR(leader))
8209 return PTR_ERR(leader);
8210 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
8211 child_ctr = inherit_event(sub, parent, parent_ctx,
8212 child, leader, child_ctx);
8213 if (IS_ERR(child_ctr))
8214 return PTR_ERR(child_ctr);
8215 }
8216 return 0;
8217 }
8218
8219 static int
8220 inherit_task_group(struct perf_event *event, struct task_struct *parent,
8221 struct perf_event_context *parent_ctx,
8222 struct task_struct *child, int ctxn,
8223 int *inherited_all)
8224 {
8225 int ret;
8226 struct perf_event_context *child_ctx;
8227
8228 if (!event->attr.inherit) {
8229 *inherited_all = 0;
8230 return 0;
8231 }
8232
8233 child_ctx = child->perf_event_ctxp[ctxn];
8234 if (!child_ctx) {
8235 /*
8236 * This is executed from the parent task context, so
8237 * inherit events that have been marked for cloning.
8238 * First allocate and initialize a context for the
8239 * child.
8240 */
8241
8242 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
8243 if (!child_ctx)
8244 return -ENOMEM;
8245
8246 child->perf_event_ctxp[ctxn] = child_ctx;
8247 }
8248
8249 ret = inherit_group(event, parent, parent_ctx,
8250 child, child_ctx);
8251
8252 if (ret)
8253 *inherited_all = 0;
8254
8255 return ret;
8256 }
8257
8258 /*
8259 * Initialize the perf_event context in task_struct
8260 */
8261 static int perf_event_init_context(struct task_struct *child, int ctxn)
8262 {
8263 struct perf_event_context *child_ctx, *parent_ctx;
8264 struct perf_event_context *cloned_ctx;
8265 struct perf_event *event;
8266 struct task_struct *parent = current;
8267 int inherited_all = 1;
8268 unsigned long flags;
8269 int ret = 0;
8270
8271 if (likely(!parent->perf_event_ctxp[ctxn]))
8272 return 0;
8273
8274 /*
8275 * If the parent's context is a clone, pin it so it won't get
8276 * swapped under us.
8277 */
8278 parent_ctx = perf_pin_task_context(parent, ctxn);
8279 if (!parent_ctx)
8280 return 0;
8281
8282 /*
8283 * No need to check if parent_ctx != NULL here; since we saw
8284 * it non-NULL earlier, the only reason for it to become NULL
8285 * is if we exit, and since we're currently in the middle of
8286 * a fork we can't be exiting at the same time.
8287 */
8288
8289 /*
8290 * Lock the parent list. No need to lock the child - not PID
8291 * hashed yet and not running, so nobody can access it.
8292 */
8293 mutex_lock(&parent_ctx->mutex);
8294
8295 /*
8296 * We don't have to disable NMIs - we are only looking at
8297 * the list, not manipulating it:
8298 */
8299 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
8300 ret = inherit_task_group(event, parent, parent_ctx,
8301 child, ctxn, &inherited_all);
8302 if (ret)
8303 goto out_unlock;
8304 }
8305
8306 /*
8307 * We can't hold ctx->lock when iterating the ->flexible_groups list due
8308 * to allocations, but we need to prevent rotation because
8309 * rotate_ctx() will change the list from interrupt context.
8310 */
8311 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8312 parent_ctx->rotate_disable = 1;
8313 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8314
8315 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
8316 ret = inherit_task_group(event, parent, parent_ctx,
8317 child, ctxn, &inherited_all);
8318 if (ret)
8319 goto out_unlock;
8320 }
8321
8322 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
8323 parent_ctx->rotate_disable = 0;
8324
8325 child_ctx = child->perf_event_ctxp[ctxn];
8326
8327 if (child_ctx && inherited_all) {
8328 /*
8329 * Mark the child context as a clone of the parent
8330 * context, or of whatever the parent is a clone of.
8331 *
8332 * Note that if the parent is a clone, holding
8333 * parent_ctx->lock prevents it from being uncloned.
8334 */
8335 cloned_ctx = parent_ctx->parent_ctx;
8336 if (cloned_ctx) {
8337 child_ctx->parent_ctx = cloned_ctx;
8338 child_ctx->parent_gen = parent_ctx->parent_gen;
8339 } else {
8340 child_ctx->parent_ctx = parent_ctx;
8341 child_ctx->parent_gen = parent_ctx->generation;
8342 }
8343 get_ctx(child_ctx->parent_ctx);
8344 }
8345
8346 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
8347 out_unlock:
8348 mutex_unlock(&parent_ctx->mutex);
8349
8350 perf_unpin_context(parent_ctx);
8351 put_ctx(parent_ctx);
8352
8353 return ret;
8354 }
8355
8356 /*
8357 * Initialize the perf_event context in task_struct
8358 */
8359 int perf_event_init_task(struct task_struct *child)
8360 {
8361 int ctxn, ret;
8362
8363 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
8364 mutex_init(&child->perf_event_mutex);
8365 INIT_LIST_HEAD(&child->perf_event_list);
8366
8367 for_each_task_context_nr(ctxn) {
8368 ret = perf_event_init_context(child, ctxn);
8369 if (ret) {
8370 perf_event_free_task(child);
8371 return ret;
8372 }
8373 }
8374
8375 return 0;
8376 }
8377
8378 static void __init perf_event_init_all_cpus(void)
8379 {
8380 struct swevent_htable *swhash;
8381 int cpu;
8382
8383 for_each_possible_cpu(cpu) {
8384 swhash = &per_cpu(swevent_htable, cpu);
8385 mutex_init(&swhash->hlist_mutex);
8386 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
8387 }
8388 }
8389
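/*
 * A CPU is coming (back) online: if software events currently have users,
 * allocate the per-CPU swevent hash list for it up front.
 */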
8390 static void perf_event_init_cpu(int cpu)
8391 {
8392 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
8393
8394 mutex_lock(&swhash->hlist_mutex);
8395 if (swhash->hlist_refcount > 0) {
8396 struct swevent_hlist *hlist;
8397
8398 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
8399 WARN_ON(!hlist);
8400 rcu_assign_pointer(swhash->swevent_hlist, hlist);
8401 }
8402 mutex_unlock(&swhash->hlist_mutex);
8403 }
8404
8405 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
8406 static void perf_pmu_rotate_stop(struct pmu *pmu)
8407 {
8408 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
8409
8410 WARN_ON(!irqs_disabled());
8411
8412 list_del_init(&cpuctx->rotation_list);
8413 }
8414
8415 static void __perf_event_exit_context(void *__info)
8416 {
8417 struct remove_event re = { .detach_group = true };
8418 struct perf_event_context *ctx = __info;
8419
8420 perf_pmu_rotate_stop(ctx->pmu);
8421
8422 rcu_read_lock();
8423 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
8424 __perf_remove_from_context(&re);
8425 rcu_read_unlock();
8426 }
8427
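/*
 * A CPU is going down: for each registered PMU, detach and remove every
 * event from that CPU's context, using an IPI so the work runs on the
 * departing CPU itself.
 */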
8428 static void perf_event_exit_cpu_context(int cpu)
8429 {
8430 struct perf_event_context *ctx;
8431 struct pmu *pmu;
8432 int idx;
8433
8434 idx = srcu_read_lock(&pmus_srcu);
8435 list_for_each_entry_rcu(pmu, &pmus, entry) {
8436 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
8437
8438 mutex_lock(&ctx->mutex);
8439 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
8440 mutex_unlock(&ctx->mutex);
8441 }
8442 srcu_read_unlock(&pmus_srcu, idx);
8443 }
8444
8445 static void perf_event_exit_cpu(int cpu)
8446 {
8447 perf_event_exit_cpu_context(cpu);
8448 }
8449 #else
8450 static inline void perf_event_exit_cpu(int cpu) { }
8451 #endif
8452
8453 static int
8454 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
8455 {
8456 int cpu;
8457
8458 for_each_online_cpu(cpu)
8459 perf_event_exit_cpu(cpu);
8460
8461 return NOTIFY_OK;
8462 }
8463
8464 /*
8465 * Run the perf reboot notifier at the very last possible moment so that
8466 * the generic watchdog code runs as long as possible.
8467 */
8468 static struct notifier_block perf_reboot_notifier = {
8469 .notifier_call = perf_reboot,
8470 .priority = INT_MIN,
8471 };
8472
8473 static int
8474 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
8475 {
8476 unsigned int cpu = (long)hcpu;
8477
8478 switch (action & ~CPU_TASKS_FROZEN) {
8479
8480 case CPU_UP_PREPARE:
8481 case CPU_DOWN_FAILED:
8482 perf_event_init_cpu(cpu);
8483 break;
8484
8485 case CPU_UP_CANCELED:
8486 case CPU_DOWN_PREPARE:
8487 perf_event_exit_cpu(cpu);
8488 break;
8489 default:
8490 break;
8491 }
8492
8493 return NOTIFY_OK;
8494 }
8495
8496 void __init perf_event_init(void)
8497 {
8498 int ret;
8499
8500 idr_init(&pmu_idr);
8501
8502 perf_event_init_all_cpus();
8503 init_srcu_struct(&pmus_srcu);
8504 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
8505 perf_pmu_register(&perf_cpu_clock, NULL, -1);
8506 perf_pmu_register(&perf_task_clock, NULL, -1);
8507 perf_tp_register();
8508 perf_cpu_notifier(perf_cpu_notify);
8509 register_reboot_notifier(&perf_reboot_notifier);
8510
8511 ret = init_hw_breakpoint();
8512 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
8513
8514 /* do not patch jump label more than once per second */
8515 jump_label_rate_limit(&perf_sched_events, HZ);
8516
8517 /*
8518 * Build time assertion that we keep the data_head at the intended
8519 * location. IOW, validating that we got the __reserved[] size right.
8520 */
8521 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
8522 != 1024);
8523 }
8524
8525 static int __init perf_event_sysfs_init(void)
8526 {
8527 struct pmu *pmu;
8528 int ret;
8529
8530 mutex_lock(&pmus_lock);
8531
8532 ret = bus_register(&pmu_bus);
8533 if (ret)
8534 goto unlock;
8535
8536 list_for_each_entry(pmu, &pmus, entry) {
8537 if (!pmu->name || pmu->type < 0)
8538 continue;
8539
8540 ret = pmu_dev_alloc(pmu);
8541 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
8542 }
8543 pmu_bus_running = 1;
8544 ret = 0;
8545
8546 unlock:
8547 mutex_unlock(&pmus_lock);
8548
8549 return ret;
8550 }
8551 device_initcall(perf_event_sysfs_init);
8552
8553 #ifdef CONFIG_CGROUP_PERF
8554 static struct cgroup_subsys_state *
8555 perf_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
8556 {
8557 struct perf_cgroup *jc;
8558
8559 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
8560 if (!jc)
8561 return ERR_PTR(-ENOMEM);
8562
8563 jc->info = alloc_percpu(struct perf_cgroup_info);
8564 if (!jc->info) {
8565 kfree(jc);
8566 return ERR_PTR(-ENOMEM);
8567 }
8568
8569 return &jc->css;
8570 }
8571
8572 static void perf_cgroup_css_free(struct cgroup_subsys_state *css)
8573 {
8574 struct perf_cgroup *jc = container_of(css, struct perf_cgroup, css);
8575
8576 free_percpu(jc->info);
8577 kfree(jc);
8578 }
8579
8580 static int __perf_cgroup_move(void *info)
8581 {
8582 struct task_struct *task = info;
8583 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
8584 return 0;
8585 }
8586
8587 static void perf_cgroup_attach(struct cgroup_subsys_state *css,
8588 struct cgroup_taskset *tset)
8589 {
8590 struct task_struct *task;
8591
8592 cgroup_taskset_for_each(task, tset)
8593 task_function_call(task, __perf_cgroup_move, task);
8594 }
8595
8596 static void perf_cgroup_exit(struct cgroup_subsys_state *css,
8597 struct cgroup_subsys_state *old_css,
8598 struct task_struct *task)
8599 {
8600 /*
8601 * cgroup_exit() is called in the copy_process() failure path.
8602 * Ignore this case since the task hasn't run yet; this avoids
8603 * trying to poke at half-freed task state from generic code.
8604 */
8605 if (!(task->flags & PF_EXITING))
8606 return;
8607
8608 task_function_call(task, __perf_cgroup_move, task);
8609 }
8610
8611 struct cgroup_subsys perf_event_cgrp_subsys = {
8612 .css_alloc = perf_cgroup_css_alloc,
8613 .css_free = perf_cgroup_css_free,
8614 .exit = perf_cgroup_exit,
8615 .attach = perf_cgroup_attach,
8616 };
8617 #endif /* CONFIG_CGROUP_PERF */
8618