• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/evlist.h"
25 #include <subcmd/exec-cmd.h>
26 #include "util/machine.h"
27 #include "util/session.h"
28 #include "util/thread.h"
29 #include <subcmd/parse-options.h>
30 #include "util/strlist.h"
31 #include "util/intlist.h"
32 #include "util/thread_map.h"
33 #include "util/stat.h"
34 #include "trace-event.h"
35 #include "util/parse-events.h"
36 #include "util/bpf-loader.h"
37 #include "callchain.h"
38 #include "syscalltbl.h"
39 #include "rb_resort.h"
40 
41 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
42 #include <stdlib.h>
43 #include <linux/err.h>
44 #include <linux/filter.h>
45 #include <linux/audit.h>
46 #include <linux/random.h>
47 #include <linux/stringify.h>
48 #include <linux/time64.h>
49 
50 #ifndef O_CLOEXEC
51 # define O_CLOEXEC		02000000
52 #endif
53 
/*
 * Global state for one 'perf trace' session: the tool callbacks, syscall
 * tables, event lists, filters and all the knobs set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id in 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* last thread that hit an event */
	u64			base_time;	/* first sample time, for relative stamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* ev_qualifier resolved to syscall ids */
	}			ev_qualifier_ids;
	struct intlist		*tid_list;
	struct intlist		*pid_list;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids whose events are discarded */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames got via vfs_getname */
				proc_getname;	/* filenames got via /proc fallback */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated ("!...") */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* vfs_getname probe is in place */
	int			trace_pgfaults;
	int			open_id;	/* syscall id of "open", for vfs_getname */
};
107 
/*
 * Accessor for one tracepoint field: 'offset' locates it in the sample's
 * raw_data, and exactly one of the callbacks (set at init time) decodes it
 * either as an integer or as a pointer into the payload.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
115 
/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer of
 * the given width from the sample payload. memcpy() is used because the field
 * may not be naturally aligned inside raw_data.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
128 
/*
 * Same as TP_UINT_FIELD() but byte-swapping the value, for samples recorded
 * on a machine with the opposite endianness. No 8-bit variant is needed.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
140 
/*
 * Bind the fixed-width integer accessor matching this tracepoint field's
 * size, picking the byte-swapping variant when the recorded endianness
 * differs from the host's.
 *
 * Returns 0 on success, -1 for an unsupported field size.
 */
static int tp_field__init_uint(struct tp_field *field,
			       struct format_field *format_field,
			       bool needs_swap)
{
	field->offset = format_field->offset;

	if (format_field->size == 1) {
		field->integer = tp_field__u8;
	} else if (format_field->size == 2) {
		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
	} else if (format_field->size == 4) {
		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
	} else if (format_field->size == 8) {
		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
	} else {
		return -1;
	}

	return 0;
}
166 
/* Return a pointer into the sample's raw payload at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
171 
tp_field__init_ptr(struct tp_field * field,struct format_field * format_field)172 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
173 {
174 	field->offset = format_field->offset;
175 	field->pointer = tp_field__ptr;
176 	return 0;
177 }
178 
/*
 * Per-evsel private data for the raw_syscalls tracepoints: the syscall 'id'
 * field plus, depending on direction, either the 'args' array (sys_enter)
 * or the 'ret' value (sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
185 
perf_evsel__init_tp_uint_field(struct perf_evsel * evsel,struct tp_field * field,const char * name)186 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
187 					  struct tp_field *field,
188 					  const char *name)
189 {
190 	struct format_field *format_field = perf_evsel__field(evsel, name);
191 
192 	if (format_field == NULL)
193 		return -1;
194 
195 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
196 }
197 
/*
 * Convenience wrapper: initialize the struct syscall_tp member called 'name'
 * (in evsel->priv) from the tracepoint field of the same name.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
201 
perf_evsel__init_tp_ptr_field(struct perf_evsel * evsel,struct tp_field * field,const char * name)202 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
203 					 struct tp_field *field,
204 					 const char *name)
205 {
206 	struct format_field *format_field = perf_evsel__field(evsel, name);
207 
208 	if (format_field == NULL)
209 		return -1;
210 
211 	return tp_field__init_ptr(field, format_field);
212 }
213 
/*
 * Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field():
 * initialize the struct syscall_tp member 'name' as a payload pointer.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
217 
/*
 * Free the evsel's private syscall_tp data, then the evsel itself.
 * The priv must be released first: perf_evsel__delete() frees 'evsel'.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
223 
perf_evsel__init_syscall_tp(struct perf_evsel * evsel,void * handler)224 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
225 {
226 	evsel->priv = malloc(sizeof(struct syscall_tp));
227 	if (evsel->priv != NULL) {
228 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
229 			goto out_delete;
230 
231 		evsel->handler = handler;
232 		return 0;
233 	}
234 
235 	return -ENOMEM;
236 
237 out_delete:
238 	zfree(&evsel->priv);
239 	return -ENOENT;
240 }
241 
perf_evsel__syscall_newtp(const char * direction,void * handler)242 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
243 {
244 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
245 
246 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
247 	if (IS_ERR(evsel))
248 		evsel = perf_evsel__newtp("syscalls", direction);
249 
250 	if (IS_ERR(evsel))
251 		return NULL;
252 
253 	if (perf_evsel__init_syscall_tp(evsel, handler))
254 		goto out_delete;
255 
256 	return evsel;
257 
258 out_delete:
259 	perf_evsel__delete_priv(evsel);
260 	return NULL;
261 }
262 
/*
 * Read the syscall_tp member 'name' (e.g. id, args, ret) from a sample,
 * using the accessor bound by the init helpers above — as an integer or
 * as a pointer into the payload, respectively.
 */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
270 
/*
 * Context handed to the per-argument pretty printers ("beautifiers"):
 * the raw value, the thread/trace it came from, an optional parameter
 * (e.g. a strarray), the argument index and a mask of args already
 * consumed by a previous printer.
 */
struct syscall_arg {
	unsigned long val;
	struct thread *thread;
	struct trace  *trace;
	void	      *parm;
	u8	      idx;
	u8	      mask;
};
279 
/*
 * Value -> string lookup table: entry for value 'v' is entries[v - offset].
 * Used to print enum-like syscall arguments by name.
 */
struct strarray {
	int	    offset;
	int	    nr_entries;
	const char **entries;
};

/* Define strarray__<array> over an existing string array, first value 0. */
#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

/* Same, for tables whose first entry maps to value 'off' instead of 0. */
#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset	    = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
296 
/*
 * Print arg->val by name using the strarray in arg->parm, falling back to
 * 'intfmt' (an int conversion such as "%d" or "%#x") for out-of-range values.
 */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	struct strarray *sa = arg->parm;
	int idx = arg->val - sa->offset;

	if (idx < 0 || idx >= sa->nr_entries)
		/*
		 * intfmt expects an int: pass one instead of the raw
		 * unsigned long, which is a mismatched vararg on LP64.
		 */
		return scnprintf(bf, size, intfmt, (int)arg->val);

	return scnprintf(bf, size, "%s", sa->entries[idx]);
}
309 
/* strarray printer with a decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
317 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray printer with a hex fallback, used for ioctl cmd numbers. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
331 
/* Defined later: prints an fd, resolving it to a path when known. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
340 
/*
 * Print the dirfd argument of the *at() syscalls: the AT_FDCWD sentinel
 * becomes "CWD", anything else is printed like a regular fd.
 */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	return fd == AT_FDCWD ? scnprintf(bf, size, "CWD") :
				syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at
353 
/* Defined later: like SCA_FD, but also drops the fd's cached path. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
358 
/* Print an argument as hex ("%#lx"), e.g. addresses. */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex
366 
/* Print an argument as a signed decimal int. */
static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	/*
	 * arg->val is an unsigned long: pass a real int so the "%d"
	 * conversion matches its vararg (a mismatch is undefined
	 * behavior on LP64).
	 */
	return scnprintf(bf, size, "%d", (int)arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int
374 
/* bpf(2) 'cmd' values, BPF_MAP_CREATE onwards. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) 'op': EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which': ITIMER_REAL etc. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) 'option': KEYCTL_GET_KEYRING_ID etc. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) 'whence': SEEK_SET etc.; DATA/HOLE only where the libc has them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) 'cmd': F_DUPFD etc. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set,pr}limit 'resource': RLIMIT_CPU etc. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how': SIG_BLOCK etc. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) etc. 'clk_id': CLOCK_REALTIME etc. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) 'family': AF_UNSPEC etc. */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
440 
/*
 * Print an access(2) mode mask as "F" or a combination of R/W/X, with any
 * leftover unknown bits appended as "|%#x".
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
/* Append the letter for n##_OK if set, then clear it from 'mode'. */
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode
467 
/* Defined later: prints a filename captured via the vfs_getname probe. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
472 
/*
 * Print pipe2(2) flags as "CLOEXEC|NONBLOCK", appending any unknown
 * leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Append O_##n's name if set (with a '|' separator), then clear the bit. */
#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
495 
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * Terminal ioctl(2) request names, indexed from TCGETS (0x5401); the
 * designated initializers skip the gaps in the request number space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */
522 
/* Fallback definitions for libcs whose headers predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/*
 * Print getrandom(2) flags as "RANDOM|NONBLOCK", appending unknown
 * leftover bits in hex.
 */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

/* Append GRND_##n's name if set (with a '|' separator), then clear it. */
#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
552 
/*
 * Shorthand for a syscall_fmt entry: print argument 'arg' via the
 * SCA_STRARRAY beautifier parameterized with strarray__<array>.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
556 
557 #include "trace/beauty/eventfd.c"
558 #include "trace/beauty/flock.c"
559 #include "trace/beauty/futex_op.c"
560 #include "trace/beauty/mmap.c"
561 #include "trace/beauty/mode_t.c"
562 #include "trace/beauty/msg_flags.c"
563 #include "trace/beauty/open_flags.c"
564 #include "trace/beauty/perf_event_open.c"
565 #include "trace/beauty/pid.c"
566 #include "trace/beauty/sched_policy.c"
567 #include "trace/beauty/seccomp.c"
568 #include "trace/beauty/signum.c"
569 #include "trace/beauty/socket_type.c"
570 #include "trace/beauty/waitid_options.c"
571 
572 static struct syscall_fmt {
573 	const char *name;
574 	const char *alias;
575 	size_t	   (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
576 	void	   *arg_parm[6];
577 	bool	   errmsg;
578 	bool	   errpid;
579 	bool	   timeout;
580 	bool	   hexret;
581 } syscall_fmts[] = {
582 	{ .name	    = "access",	    .errmsg = true,
583 	  .arg_scnprintf = { [1] = SCA_ACCMODE,  /* mode */ }, },
584 	{ .name	    = "arch_prctl", .errmsg = true, .alias = "prctl", },
585 	{ .name	    = "bpf",	    .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
586 	{ .name	    = "brk",	    .hexret = true,
587 	  .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
588 	{ .name	    = "chdir",	    .errmsg = true, },
589 	{ .name	    = "chmod",	    .errmsg = true, },
590 	{ .name	    = "chroot",	    .errmsg = true, },
591 	{ .name     = "clock_gettime",  .errmsg = true, STRARRAY(0, clk_id, clockid), },
592 	{ .name	    = "clone",	    .errpid = true, },
593 	{ .name	    = "close",	    .errmsg = true,
594 	  .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
595 	{ .name	    = "connect",    .errmsg = true, },
596 	{ .name	    = "creat",	    .errmsg = true, },
597 	{ .name	    = "dup",	    .errmsg = true, },
598 	{ .name	    = "dup2",	    .errmsg = true, },
599 	{ .name	    = "dup3",	    .errmsg = true, },
600 	{ .name	    = "epoll_ctl",  .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
601 	{ .name	    = "eventfd2",   .errmsg = true,
602 	  .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
603 	{ .name	    = "faccessat",  .errmsg = true, },
604 	{ .name	    = "fadvise64",  .errmsg = true, },
605 	{ .name	    = "fallocate",  .errmsg = true, },
606 	{ .name	    = "fchdir",	    .errmsg = true, },
607 	{ .name	    = "fchmod",	    .errmsg = true, },
608 	{ .name	    = "fchmodat",   .errmsg = true,
609 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
610 	{ .name	    = "fchown",	    .errmsg = true, },
611 	{ .name	    = "fchownat",   .errmsg = true,
612 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
613 	{ .name	    = "fcntl",	    .errmsg = true,
614 	  .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
615 	  .arg_parm	 = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
616 	{ .name	    = "fdatasync",  .errmsg = true, },
617 	{ .name	    = "flock",	    .errmsg = true,
618 	  .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
619 	{ .name	    = "fsetxattr",  .errmsg = true, },
620 	{ .name	    = "fstat",	    .errmsg = true, .alias = "newfstat", },
621 	{ .name	    = "fstatat",    .errmsg = true, .alias = "newfstatat", },
622 	{ .name	    = "fstatfs",    .errmsg = true, },
623 	{ .name	    = "fsync",    .errmsg = true, },
624 	{ .name	    = "ftruncate", .errmsg = true, },
625 	{ .name	    = "futex",	    .errmsg = true,
626 	  .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
627 	{ .name	    = "futimesat", .errmsg = true,
628 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
629 	{ .name	    = "getdents",   .errmsg = true, },
630 	{ .name	    = "getdents64", .errmsg = true, },
631 	{ .name	    = "getitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
632 	{ .name	    = "getpid",	    .errpid = true, },
633 	{ .name	    = "getpgid",    .errpid = true, },
634 	{ .name	    = "getppid",    .errpid = true, },
635 	{ .name	    = "getrandom",  .errmsg = true,
636 	  .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
637 	{ .name	    = "getrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
638 	{ .name	    = "getxattr",   .errmsg = true, },
639 	{ .name	    = "inotify_add_watch",	    .errmsg = true, },
640 	{ .name	    = "ioctl",	    .errmsg = true,
641 	  .arg_scnprintf = {
642 #if defined(__i386__) || defined(__x86_64__)
643 /*
644  * FIXME: Make this available to all arches.
645  */
646 			     [1] = SCA_STRHEXARRAY, /* cmd */
647 			     [2] = SCA_HEX, /* arg */ },
648 	  .arg_parm	 = { [1] = &strarray__tioctls, /* cmd */ }, },
649 #else
650 			     [2] = SCA_HEX, /* arg */ }, },
651 #endif
652 	{ .name	    = "keyctl",	    .errmsg = true, STRARRAY(0, option, keyctl_options), },
653 	{ .name	    = "kill",	    .errmsg = true,
654 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
655 	{ .name	    = "lchown",    .errmsg = true, },
656 	{ .name	    = "lgetxattr",  .errmsg = true, },
657 	{ .name	    = "linkat",	    .errmsg = true,
658 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
659 	{ .name	    = "listxattr",  .errmsg = true, },
660 	{ .name	    = "llistxattr", .errmsg = true, },
661 	{ .name	    = "lremovexattr",  .errmsg = true, },
662 	{ .name	    = "lseek",	    .errmsg = true,
663 	  .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
664 	  .arg_parm	 = { [2] = &strarray__whences, /* whence */ }, },
665 	{ .name	    = "lsetxattr",  .errmsg = true, },
666 	{ .name	    = "lstat",	    .errmsg = true, .alias = "newlstat", },
667 	{ .name	    = "lsxattr",    .errmsg = true, },
668 	{ .name     = "madvise",    .errmsg = true,
669 	  .arg_scnprintf = { [0] = SCA_HEX,	 /* start */
670 			     [2] = SCA_MADV_BHV, /* behavior */ }, },
671 	{ .name	    = "mkdir",    .errmsg = true, },
672 	{ .name	    = "mkdirat",    .errmsg = true,
673 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
674 	{ .name	    = "mknod",      .errmsg = true, },
675 	{ .name	    = "mknodat",    .errmsg = true,
676 	  .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
677 	{ .name	    = "mlock",	    .errmsg = true,
678 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
679 	{ .name	    = "mlockall",   .errmsg = true,
680 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 	{ .name	    = "mmap",	    .hexret = true,
682 /* The standard mmap maps to old_mmap on s390x */
683 #if defined(__s390x__)
684 	.alias = "old_mmap",
685 #endif
686 	  .arg_scnprintf = { [0] = SCA_HEX,	  /* addr */
687 			     [2] = SCA_MMAP_PROT, /* prot */
688 			     [3] = SCA_MMAP_FLAGS, /* flags */ }, },
689 	{ .name	    = "mprotect",   .errmsg = true,
690 	  .arg_scnprintf = { [0] = SCA_HEX, /* start */
691 			     [2] = SCA_MMAP_PROT, /* prot */ }, },
692 	{ .name	    = "mq_unlink", .errmsg = true,
693 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
694 	{ .name	    = "mremap",	    .hexret = true,
695 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */
696 			     [3] = SCA_MREMAP_FLAGS, /* flags */
697 			     [4] = SCA_HEX, /* new_addr */ }, },
698 	{ .name	    = "munlock",    .errmsg = true,
699 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
700 	{ .name	    = "munmap",	    .errmsg = true,
701 	  .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
702 	{ .name	    = "name_to_handle_at", .errmsg = true,
703 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
704 	{ .name	    = "newfstatat", .errmsg = true,
705 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
706 	{ .name	    = "open",	    .errmsg = true,
707 	  .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
708 	{ .name	    = "open_by_handle_at", .errmsg = true,
709 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
710 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
711 	{ .name	    = "openat",	    .errmsg = true,
712 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
713 			     [2] = SCA_OPEN_FLAGS, /* flags */ }, },
714 	{ .name	    = "perf_event_open", .errmsg = true,
715 	  .arg_scnprintf = { [2] = SCA_INT, /* cpu */
716 			     [3] = SCA_FD,  /* group_fd */
717 			     [4] = SCA_PERF_FLAGS,  /* flags */ }, },
718 	{ .name	    = "pipe2",	    .errmsg = true,
719 	  .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
720 	{ .name	    = "poll",	    .errmsg = true, .timeout = true, },
721 	{ .name	    = "ppoll",	    .errmsg = true, .timeout = true, },
722 	{ .name	    = "pread",	    .errmsg = true, .alias = "pread64", },
723 	{ .name	    = "preadv",	    .errmsg = true, .alias = "pread", },
724 	{ .name	    = "prlimit64",  .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
725 	{ .name	    = "pwrite",	    .errmsg = true, .alias = "pwrite64", },
726 	{ .name	    = "pwritev",    .errmsg = true, },
727 	{ .name	    = "read",	    .errmsg = true, },
728 	{ .name	    = "readlink",   .errmsg = true, },
729 	{ .name	    = "readlinkat", .errmsg = true,
730 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
731 	{ .name	    = "readv",	    .errmsg = true, },
732 	{ .name	    = "recvfrom",   .errmsg = true,
733 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
734 	{ .name	    = "recvmmsg",   .errmsg = true,
735 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
736 	{ .name	    = "recvmsg",    .errmsg = true,
737 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
738 	{ .name	    = "removexattr", .errmsg = true, },
739 	{ .name	    = "renameat",   .errmsg = true,
740 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
741 	{ .name	    = "rmdir",    .errmsg = true, },
742 	{ .name	    = "rt_sigaction", .errmsg = true,
743 	  .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
744 	{ .name	    = "rt_sigprocmask",  .errmsg = true, STRARRAY(0, how, sighow), },
745 	{ .name	    = "rt_sigqueueinfo", .errmsg = true,
746 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
747 	{ .name	    = "rt_tgsigqueueinfo", .errmsg = true,
748 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
749 	{ .name	    = "sched_getattr",	      .errmsg = true, },
750 	{ .name	    = "sched_setattr",	      .errmsg = true, },
751 	{ .name	    = "sched_setscheduler",   .errmsg = true,
752 	  .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
753 	{ .name	    = "seccomp", .errmsg = true,
754 	  .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
755 			     [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
756 	{ .name	    = "select",	    .errmsg = true, .timeout = true, },
757 	{ .name	    = "sendmmsg",    .errmsg = true,
758 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
759 	{ .name	    = "sendmsg",    .errmsg = true,
760 	  .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
761 	{ .name	    = "sendto",	    .errmsg = true,
762 	  .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
763 	{ .name	    = "set_tid_address", .errpid = true, },
764 	{ .name	    = "setitimer",  .errmsg = true, STRARRAY(0, which, itimers), },
765 	{ .name	    = "setpgid",    .errmsg = true, },
766 	{ .name	    = "setrlimit",  .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
767 	{ .name	    = "setxattr",   .errmsg = true, },
768 	{ .name	    = "shutdown",   .errmsg = true, },
769 	{ .name	    = "socket",	    .errmsg = true,
770 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
771 			     [1] = SCA_SK_TYPE, /* type */ },
772 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
773 	{ .name	    = "socketpair", .errmsg = true,
774 	  .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
775 			     [1] = SCA_SK_TYPE, /* type */ },
776 	  .arg_parm	 = { [0] = &strarray__socket_families, /* family */ }, },
777 	{ .name	    = "stat",	    .errmsg = true, .alias = "newstat", },
778 	{ .name	    = "statfs",	    .errmsg = true, },
779 	{ .name	    = "swapoff",    .errmsg = true,
780 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
781 	{ .name	    = "swapon",	    .errmsg = true,
782 	  .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
783 	{ .name	    = "symlinkat",  .errmsg = true,
784 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
785 	{ .name	    = "tgkill",	    .errmsg = true,
786 	  .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
787 	{ .name	    = "tkill",	    .errmsg = true,
788 	  .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
789 	{ .name	    = "truncate",   .errmsg = true, },
790 	{ .name	    = "uname",	    .errmsg = true, .alias = "newuname", },
791 	{ .name	    = "unlinkat",   .errmsg = true,
792 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
793 	{ .name	    = "utime",  .errmsg = true, },
794 	{ .name	    = "utimensat",  .errmsg = true,
795 	  .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
796 	{ .name	    = "utimes",  .errmsg = true, },
797 	{ .name	    = "vmsplice",  .errmsg = true, },
798 	{ .name	    = "wait4",	    .errpid = true,
799 	  .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
800 	{ .name	    = "waitid",	    .errpid = true,
801 	  .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
802 	{ .name	    = "write",	    .errmsg = true, },
803 	{ .name	    = "writev",	    .errmsg = true, },
804 };
805 
syscall_fmt__cmp(const void * name,const void * fmtp)806 static int syscall_fmt__cmp(const void *name, const void *fmtp)
807 {
808 	const struct syscall_fmt *fmt = fmtp;
809 	return strcmp(name, fmt->name);
810 }
811 
syscall_fmt__find(const char * name)812 static struct syscall_fmt *syscall_fmt__find(const char *name)
813 {
814 	const int nmemb = ARRAY_SIZE(syscall_fmts);
815 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
816 }
817 
/*
 * Everything known about one syscall: its tracepoint format, argument
 * descriptors, and the optional formatting override from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* e.g. exit_group: no sys_exit event follows */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
828 
829 /*
830  * We need to have this 'calculated' boolean because in some cases we really
831  * don't know what is the duration of a syscall, for instance, when we start
832  * a session and some threads are waiting for a syscall to finish, say 'poll',
833  * in which case all we can do is to print "( ? ) for duration and for the
834  * start timestamp.
835  */
fprintf_duration(unsigned long t,bool calculated,FILE * fp)836 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
837 {
838 	double duration = (double)t / NSEC_PER_MSEC;
839 	size_t printed = fprintf(fp, "(");
840 
841 	if (!calculated)
842 		printed += fprintf(fp, "     ?   ");
843 	else if (duration >= 1.0)
844 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
845 	else if (duration >= 0.01)
846 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
847 	else
848 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
849 	return printed + fprintf(fp, "): ");
850 }
851 
852 /**
853  * filename.ptr: The filename char pointer that will be vfs_getname'd
854  * filename.entry_str_pos: Where to insert the string translated from
855  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
856  */
/* Per-thread state attached via thread__set_priv(). */
struct thread_trace {
	u64		  entry_time;	/* timestamp of last sys_enter */
	u64		  exit_time;	/* timestamp of last sys_exit */
	bool		  entry_pending; /* entry formatted, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* presumably major/minor page fault counts — confirm against fault handlers */
	char		  *entry_str;	/* buffer where the entry line is formatted */
	double		  runtime_ms;	/* accumulated sched_stat_runtime */
        struct {
		unsigned long ptr;		/* pointer arg awaiting vfs_getname */
		short int     entry_str_pos;	/* where in entry_str to splice the name */
		bool	      pending_open;	/* name captured, waiting for open's return fd */
		unsigned int  namelen;		/* capacity of 'name' minus the NUL */
		char	      *name;		/* last vfs_getname'd filename */
	} filename;
	struct {
		int	  max;		/* highest fd with a cached path, -1 = empty */
		char	  **table;	/* fd -> strdup()ed path, NULL when unknown */
	} paths;

	struct intlist *syscall_stats;	/* syscall id -> struct stats (durations) */
};
879 
thread_trace__new(void)880 static struct thread_trace *thread_trace__new(void)
881 {
882 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
883 
884 	if (ttrace)
885 		ttrace->paths.max = -1;
886 
887 	ttrace->syscall_stats = intlist__new(NULL);
888 
889 	return ttrace;
890 }
891 
/*
 * Return (lazily creating) the thread_trace private area of 'thread' and
 * bump its event count.  On any failure print a warning to 'fp' and return
 * NULL, telling the caller to drop the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	/* first event seen for this thread: allocate its private state */
	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	/* still NULL: thread_trace__new() failed */
	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}
914 
/* Selection bits, presumably for major/minor page fault tracing — confirm at usage sites. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer where the sys_enter line is formatted. */
static const size_t trace__entry_str_size = 2048;
919 
/*
 * Cache 'pathname' as the path for 'fd' in the thread's fd->path table,
 * growing the table on demand.  Returns 0 on success, -1 on allocation
 * failure (table growth or strdup).
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* zero only the newly appended slots */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* first allocation: zero the whole table */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/* NOTE(review): a previous string at table[fd] is overwritten without
	 * free — relies on the close() handler zfree'ing the slot; confirm. */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
945 
/*
 * Resolve 'fd' to a path by reading the /proc/<pid>/fd/<fd> symlink
 * (/proc/<pid>/task/<tid>/fd/<fd> for non-leader threads) and cache it via
 * trace__set_fd_pathname().  Returns 0 on success, -1 on failure.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* for a symlink, st_size is the length of its target */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* ret > st_size: the link changed between lstat() and readlink() */
	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
971 
/*
 * Return the cached path for 'fd', reading it from /proc on demand — but
 * only in live mode, since the /proc lookup is meaningless when replaying
 * a recorded session.  May return NULL.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	/* miss in the cache: try /proc, counting the lookups */
	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
993 
/* Format an fd argument as "N<path>" when the path is known, else just "N". */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg)
{
	int fd = arg->val;
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);
	size_t n = scnprintf(bf, size, "%d", fd);

	if (path != NULL)
		n += scnprintf(bf + n, size - n, "<%s>", path);

	return n;
}
1006 
/* Like the fd printer, but close() invalidates the fd: drop its cached path. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	struct thread_trace *ttrace = thread__priv(arg->thread);
	int fd = arg->val;
	size_t n = syscall_arg__scnprintf_fd(bf, size, arg);

	if (ttrace != NULL && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return n;
}
1019 
thread__set_filename_pos(struct thread * thread,const char * bf,unsigned long ptr)1020 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1021 				     unsigned long ptr)
1022 {
1023 	struct thread_trace *ttrace = thread__priv(thread);
1024 
1025 	ttrace->filename.ptr = ptr;
1026 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1027 }
1028 
/*
 * Format a filename argument: when the vfs_getname probe is available,
 * print nothing now and mark the position so trace__vfs_getname() can
 * splice in the resolved name; otherwise fall back to the raw pointer.
 *
 * Fix: 'ptr' is unsigned long but was printed with "%#x" — a printf
 * format/argument mismatch (undefined behavior, truncated output on
 * LP64); use "%#lx".
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}
1040 
trace__filter_duration(struct trace * trace,double t)1041 static bool trace__filter_duration(struct trace *trace, double t)
1042 {
1043 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1044 }
1045 
/* Print a timestamp as milliseconds relative to the session's base time. */
static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	double ms = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;

	return fprintf(fp, "%10.3f ", ms);
}
1052 
1053 /*
1054  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1055  * using ttrace->entry_time for a thread that receives a sys_exit without
1056  * first having received a sys_enter ("poll" issued before tracing session
1057  * starts, lost sys_enter exit due to ring buffer overflow).
1058  */
static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
{
	/* tstamp == 0 means "unknown", see comment above */
	if (tstamp == 0)
		return fprintf(fp, "         ? ");

	return __trace__fprintf_tstamp(trace, tstamp, fp);
}
1066 
static bool done = false;
static bool interrupted = false;

/* SIGINT/SIGTERM handler: request main-loop exit, remembering if it was ^C. */
static void sig_handler(int sig)
{
	interrupted = (sig == SIGINT);
	done = true;
}
1075 
/*
 * Print the common line prefix: timestamp, duration and — when tracing
 * more than one thread — comm/tid.  Returns characters printed.
 */
static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
{
	size_t printed;

	printed  = trace__fprintf_tstamp(trace, tstamp, fp);
	printed += fprintf_duration(duration, duration_calculated, fp);

	if (!trace->multiple_threads)
		return printed;

	if (trace->show_comm)
		printed += fprintf(fp, "%.14s/", thread__comm_str(thread));

	return printed + fprintf(fp, "%d ", thread->tid);
}
1090 
trace__process_event(struct trace * trace,struct machine * machine,union perf_event * event,struct perf_sample * sample)1091 static int trace__process_event(struct trace *trace, struct machine *machine,
1092 				union perf_event *event, struct perf_sample *sample)
1093 {
1094 	int ret = 0;
1095 
1096 	switch (event->header.type) {
1097 	case PERF_RECORD_LOST:
1098 		color_fprintf(trace->output, PERF_COLOR_RED,
1099 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1100 		ret = machine__process_lost_event(machine, event, sample);
1101 		break;
1102 	default:
1103 		ret = machine__process_event(machine, event, sample);
1104 		break;
1105 	}
1106 
1107 	return ret;
1108 }
1109 
trace__tool_process(struct perf_tool * tool,union perf_event * event,struct perf_sample * sample,struct machine * machine)1110 static int trace__tool_process(struct perf_tool *tool,
1111 			       union perf_event *event,
1112 			       struct perf_sample *sample,
1113 			       struct machine *machine)
1114 {
1115 	struct trace *trace = container_of(tool, struct trace, tool);
1116 	return trace__process_event(trace, machine, event, sample);
1117 }
1118 
/*
 * Kernel-address resolver wrapper: when kptr_restrict is in effect, warn
 * once per machine and return NULL instead of resolving.
 */
static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
{
	struct machine *machine = vmachine;

	/* already warned: stay silent, keep returning NULL */
	if (machine->kptr_restrict_warned)
		return NULL;

	if (symbol_conf.kptr_restrict) {
		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
			   "Kernel samples will not be resolved.\n");
		machine->kptr_restrict_warned = true;
		return NULL;
	}

	return machine__resolve_kernel_addr(vmachine, addrp, modp);
}
1136 
/*
 * Initialize symbol resolution, create the host machine representation and
 * synthesize events for already-running threads so their samples can be
 * attributed.  Returns 0 or a negative error code.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	/* undo symbol__init() on failure; trace->host is kept for later teardown */
	if (err)
		symbol__exit();

	return err;
}
1159 
/*
 * Choose a pretty-printer for each argument of 'sc': an explicit entry in
 * the static format table wins, then heuristics based on the tracepoint
 * field's type and name.  Returns 0 on success, -1 on allocation failure.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	/* NOTE(review): nr_args may be 0 (syscalls without arguments);
	 * calloc(0, ...) may legally return NULL, which would be mistaken
	 * for an allocation failure here — confirm against target libc. */
	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* per-syscall formatter from the static table takes precedence */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		/* const char * named filename/path/pathname: a user filename */
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		/* integer fields whose name ends in "fd" are file descriptors */
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1205 
/*
 * Populate trace->syscalls.table[id] with the syscall's name, its static
 * format entry and its sys_enter tracepoint format, growing the table on
 * demand.  Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			/* zero only the newly appended entries */
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* first allocation: zero the whole table */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* some syscalls have their tracepoint under an alias, e.g. uname -> newuname */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1264 
/*
 * Translate the syscall-name qualifier list into an array of syscall ids
 * in trace->ev_qualifier_ids.  On any invalid name, every offender is
 * printed, the ids array is freed and -EINVAL is returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* first offender opens the message, the rest are comma separated */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		/* invalid names store id == -1, but the array is discarded below on error */
		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1310 
1311 /*
1312  * args is to be interpreted as a series of longs but we need to handle
1313  * 8-byte unaligned accesses. args points to raw_data within the event
1314  * and raw_data is guaranteed to be 8-byte unaligned because it is
1315  * preceded by raw_size which is a u32. So we need to copy args to a temp
1316  * variable to read it. Most notably this avoids extended load instructions
1317  * on unaligned addresses
1318  */
1319 
/*
 * Format all arguments of syscall 'sc' from the raw 'args' blob into 'bf',
 * using the per-argument printers chosen by syscall__set_arg_fmts().
 * Returns the number of characters written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned char *p;
	unsigned long val;

	if (sc->args != NULL) {
		struct format_field *field;
		u8 bit = 1;
		struct syscall_arg arg = {
			.idx	= 0,
			.mask	= 0,
			.trace  = trace,
			.thread = thread,
		};

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			/* a printer may have already consumed this arg via arg.mask */
			if (arg.mask & bit)
				continue;

			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * arg.idx;
			memcpy(&val, p, sizeof(val));

			/*
			 * Suppress this argument if its value is zero and
			 * we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_scnprintf &&
			      sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
			      sc->arg_parm[arg.idx]))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			/* dedicated printer if we have one, plain %ld otherwise */
			if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
				arg.val = val;
				if (sc->arg_parm)
					arg.parm = sc->arg_parm[arg.idx];
				printed += sc->arg_scnprintf[arg.idx](bf + printed,
								      size - printed, &arg);
			} else {
				printed += scnprintf(bf + printed, size - printed,
						     "%ld", val);
			}
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		int i = 0;

		while (i < 6) {
			/* special care for unaligned accesses */
			p = args + sizeof(unsigned long) * i;
			memcpy(&val, p, sizeof(val));
			printed += scnprintf(bf + printed, size - printed,
					     "%sarg%d: %ld",
					     printed ? ", " : "", i, val);
			++i;
		}
	}

	return printed;
}
1392 
/* Signature shared by all tracepoint sample handlers in this tool. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1396 
/*
 * Map a raw syscall id to its struct syscall entry, reading its name and
 * tracepoint format on first use.  Returns NULL (with diagnostics under
 * increased verbosity) when the id is invalid or cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* table slot empty: lazily read this syscall's info */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* re-check: trace__read_syscall_info() may still leave the slot unset */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1439 
thread__update_stats(struct thread_trace * ttrace,int id,struct perf_sample * sample)1440 static void thread__update_stats(struct thread_trace *ttrace,
1441 				 int id, struct perf_sample *sample)
1442 {
1443 	struct int_node *inode;
1444 	struct stats *stats;
1445 	u64 duration = 0;
1446 
1447 	inode = intlist__findnew(ttrace->syscall_stats, id);
1448 	if (inode == NULL)
1449 		return;
1450 
1451 	stats = inode->priv;
1452 	if (stats == NULL) {
1453 		stats = malloc(sizeof(struct stats));
1454 		if (stats == NULL)
1455 			return;
1456 		init_stats(stats);
1457 		inode->priv = stats;
1458 	}
1459 
1460 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1461 		duration = sample->time - ttrace->entry_time;
1462 
1463 	update_stats(stats, duration);
1464 }
1465 
/*
 * Another thread's event arrived while the current thread's syscall entry
 * line was still pending: flush that entry with a trailing "...", so its
 * eventual sys_exit will be printed as "continued".  Returns the number of
 * characters printed (0 when nothing was pending).
 */
static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
{
	struct thread_trace *ttrace;
	u64 duration;
	size_t printed;

	if (trace->current == NULL)
		return 0;

	ttrace = thread__priv(trace->current);

	if (!ttrace->entry_pending)
		return 0;

	duration = sample->time - ttrace->entry_time;

	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
	ttrace->entry_pending = false;

	return printed;
}
1488 
/*
 * raw_syscalls:sys_enter handler: format the entry line into the thread's
 * entry_str.  Exit-like syscalls are printed immediately (there will be no
 * matching sys_exit); everything else is deferred to trace__sys_exit() so
 * duration and return value can be appended.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* lazily allocate the per-thread formatting buffer */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* flush a previous thread's still-pending entry, unless filtering */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* exit()/exit_group() never return: print right away */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* track the last thread seen for trace__printf_interrupted_entry() */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1547 
trace__resolve_callchain(struct trace * trace,struct perf_evsel * evsel,struct perf_sample * sample,struct callchain_cursor * cursor)1548 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1549 				    struct perf_sample *sample,
1550 				    struct callchain_cursor *cursor)
1551 {
1552 	struct addr_location al;
1553 
1554 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1555 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1556 		return -1;
1557 
1558 	return 0;
1559 }
1560 
trace__fprintf_callchain(struct trace * trace,struct perf_sample * sample)1561 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1562 {
1563 	/* TODO: user-configurable print_opts */
1564 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1565 				        EVSEL__PRINT_DSO |
1566 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1567 
1568 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1569 }
1570 
/*
 * raw_syscalls:sys_exit handler: complete the pending entry line with the
 * computed duration and a decoded return value, optionally followed by a
 * callchain.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open() succeeded: bind the vfs_getname'd path to the returned fd */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	ttrace->exit_time = sample->time;

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* too shallow for --min-stack: skip this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* sys_enter was never seen (overflow, pre-existing syscall) */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* decode the return value per the syscall's format flags */
	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* the return value is a pid: show the child's comm if known */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1672 
/*
 * probe:vfs_getname handler: capture the filename being resolved and, if a
 * syscall entry registered a pointer arg via thread__set_filename_pos(),
 * splice the name into the pending entry string in place of that pointer.
 *
 * Fix: the thread reference obtained from machine__findnew_thread() was
 * never released, leaking one thread refcount per vfs_getname event; every
 * other handler in this file pairs it with thread__put().  thread__put()
 * on NULL is safe (trace__sys_enter already calls it with a possibly-NULL
 * trace->current).
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out;

	filename_len = strlen(filename);

	/* grow the per-thread name buffer on demand */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* no pointer argument waiting for this name: we are done */
	if (!ttrace->filename.ptr)
		goto out;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out;

	/* keep the tail of the name when it doesn't fit whole */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* open a gap at entry_str_pos and splice the filename in */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out:
	thread__put(thread);
	return 0;
}
1729 
/*
 * sched:sched_stat_runtime handler: accumulate on-CPU time per thread and
 * for the whole session, used by the end-of-run summary.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
	thread__put(thread);
	return 0;

out_dump:
	/* no per-thread state: dump the raw event fields instead */
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
	       evsel->name,
	       perf_evsel__strval(evsel, sample, "comm"),
	       (pid_t)perf_evsel__intval(evsel, sample, "pid"),
	       runtime,
	       perf_evsel__intval(evsel, sample, "vruntime"));
	thread__put(thread);
	return 0;
}
1759 
bpf_output__printer(enum binary_printer_ops op,unsigned int val,void * extra)1760 static void bpf_output__printer(enum binary_printer_ops op,
1761 				unsigned int val, void *extra)
1762 {
1763 	FILE *output = extra;
1764 	unsigned char ch = (unsigned char)val;
1765 
1766 	switch (op) {
1767 	case BINARY_PRINT_CHAR_DATA:
1768 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1769 		break;
1770 	case BINARY_PRINT_DATA_BEGIN:
1771 	case BINARY_PRINT_LINE_BEGIN:
1772 	case BINARY_PRINT_ADDR:
1773 	case BINARY_PRINT_NUM_DATA:
1774 	case BINARY_PRINT_NUM_PAD:
1775 	case BINARY_PRINT_SEP:
1776 	case BINARY_PRINT_CHAR_PAD:
1777 	case BINARY_PRINT_LINE_END:
1778 	case BINARY_PRINT_DATA_END:
1779 	default:
1780 		break;
1781 	}
1782 }
1783 
/*
 * Dump the raw payload of a BPF output event to trace->output, 8 bytes
 * per line, rendering each byte via bpf_output__printer().
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1790 
/*
 * Handler for tracepoints other than the syscall enter/exit ones and for
 * bpf-output events: print a timestamped "event_name: payload)" line,
 * optionally followed by the resolved callchain.
 *
 * Always returns 0 so the event loop keeps going.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Callchain shallower than --min-stack: suppress the event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep columns aligned with the "( duration ):" field of syscall lines */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1831 
print_location(FILE * f,struct perf_sample * sample,struct addr_location * al,bool print_dso,bool print_sym)1832 static void print_location(FILE *f, struct perf_sample *sample,
1833 			   struct addr_location *al,
1834 			   bool print_dso, bool print_sym)
1835 {
1836 
1837 	if ((verbose || print_dso) && al->map)
1838 		fprintf(f, "%s@", al->map->dso->long_name);
1839 
1840 	if ((verbose || print_sym) && al->sym)
1841 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1842 			al->addr - al->sym->start);
1843 	else if (al->map)
1844 		fprintf(f, "0x%" PRIx64, al->addr);
1845 	else
1846 		fprintf(f, "0x%" PRIx64, sample->addr);
1847 }
1848 
/*
 * Handler for the page fault software events: account a major or minor
 * fault to the thread and, unless --summary-only is in effect, print a
 * "maj/minfault [ip_location] => addr_location (map_type level)" line,
 * optionally followed by the callchain.
 *
 * Returns 0 on success; -1 when the thread_trace area couldn't be set up
 * (and also, as coded, when the callchain is shallower than --min-stack).
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd'ata; becomes 'x' (executable) or '?' below */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Callchain shallower than --min-stack: skip this event */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* With --summary-only we only account, never print individual faults */
	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction pointer */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulted-on data address, first in the data maps... */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		/* ...then fall back to the function maps (executable fault) */
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1924 
skip_sample(struct trace * trace,struct perf_sample * sample)1925 static bool skip_sample(struct trace *trace, struct perf_sample *sample)
1926 {
1927 	if ((trace->pid_list && intlist__find(trace->pid_list, sample->pid)) ||
1928 	    (trace->tid_list && intlist__find(trace->tid_list, sample->tid)))
1929 		return false;
1930 
1931 	if (trace->pid_list || trace->tid_list)
1932 		return true;
1933 
1934 	return false;
1935 }
1936 
trace__set_base_time(struct trace * trace,struct perf_evsel * evsel,struct perf_sample * sample)1937 static void trace__set_base_time(struct trace *trace,
1938 				 struct perf_evsel *evsel,
1939 				 struct perf_sample *sample)
1940 {
1941 	/*
1942 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1943 	 * and don't use sample->time unconditionally, we may end up having
1944 	 * some other event in the future without PERF_SAMPLE_TIME for good
1945 	 * reason, i.e. we may not be interested in its timestamps, just in
1946 	 * it taking place, picking some piece of information when it
1947 	 * appears in our event stream (vfs_getname comes to mind).
1948 	 */
1949 	if (trace->base_time == 0 && !trace->full_time &&
1950 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1951 		trace->base_time = sample->time;
1952 }
1953 
/*
 * perf_tool sample callback used in replay mode: apply the -p/-t sample
 * filter, latch the base time, then dispatch to the per-evsel handler
 * (if any), counting the event.  Always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	tracepoint_handler handler = evsel->handler;

	if (skip_sample(trace, sample))
		return 0;

	trace__set_base_time(trace, evsel, sample);

	if (handler != NULL) {
		trace->nr_events++;
		handler(trace, evsel, event, sample);
	}

	return 0;
}
1977 
parse_target_str(struct trace * trace)1978 static int parse_target_str(struct trace *trace)
1979 {
1980 	if (trace->opts.target.pid) {
1981 		trace->pid_list = intlist__new(trace->opts.target.pid);
1982 		if (trace->pid_list == NULL) {
1983 			pr_err("Error parsing process id string\n");
1984 			return -EINVAL;
1985 		}
1986 	}
1987 
1988 	if (trace->opts.target.tid) {
1989 		trace->tid_list = intlist__new(trace->opts.target.tid);
1990 		if (trace->tid_list == NULL) {
1991 			pr_err("Error parsing thread id string\n");
1992 			return -EINVAL;
1993 		}
1994 	}
1995 
1996 	return 0;
1997 }
1998 
trace__record(struct trace * trace,int argc,const char ** argv)1999 static int trace__record(struct trace *trace, int argc, const char **argv)
2000 {
2001 	unsigned int rec_argc, i, j;
2002 	const char **rec_argv;
2003 	const char * const record_args[] = {
2004 		"record",
2005 		"-R",
2006 		"-m", "1024",
2007 		"-c", "1",
2008 	};
2009 
2010 	const char * const sc_args[] = { "-e", };
2011 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2012 	const char * const majpf_args[] = { "-e", "major-faults" };
2013 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2014 	const char * const minpf_args[] = { "-e", "minor-faults" };
2015 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2016 
2017 	/* +1 is for the event string below */
2018 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2019 		majpf_args_nr + minpf_args_nr + argc;
2020 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2021 
2022 	if (rec_argv == NULL)
2023 		return -ENOMEM;
2024 
2025 	j = 0;
2026 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2027 		rec_argv[j++] = record_args[i];
2028 
2029 	if (trace->trace_syscalls) {
2030 		for (i = 0; i < sc_args_nr; i++)
2031 			rec_argv[j++] = sc_args[i];
2032 
2033 		/* event string may be different for older kernels - e.g., RHEL6 */
2034 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2035 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2036 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2037 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2038 		else {
2039 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2040 			return -1;
2041 		}
2042 	}
2043 
2044 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2045 		for (i = 0; i < majpf_args_nr; i++)
2046 			rec_argv[j++] = majpf_args[i];
2047 
2048 	if (trace->trace_pgfaults & TRACE_PFMIN)
2049 		for (i = 0; i < minpf_args_nr; i++)
2050 			rec_argv[j++] = minpf_args[i];
2051 
2052 	for (i = 0; i < (unsigned int)argc; i++)
2053 		rec_argv[j++] = argv[i];
2054 
2055 	return cmd_record(j, rec_argv, NULL);
2056 }
2057 
2058 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2059 
perf_evlist__add_vfs_getname(struct perf_evlist * evlist)2060 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2061 {
2062 	struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2063 
2064 	if (IS_ERR(evsel))
2065 		return false;
2066 
2067 	if (perf_evsel__field(evsel, "pathname") == NULL) {
2068 		perf_evsel__delete(evsel);
2069 		return false;
2070 	}
2071 
2072 	evsel->handler = trace__vfs_getname;
2073 	perf_evlist__add(evlist, evsel);
2074 	return true;
2075 }
2076 
/*
 * Create an evsel for one of the page fault software events (@config is
 * PERF_COUNT_SW_PAGE_FAULTS_{MAJ,MIN}), sampling every fault and
 * capturing mmap data events, with trace__pgfault() as its handler.
 *
 * Returns the new evsel or NULL on allocation failure.
 */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type		= PERF_TYPE_SOFTWARE,
		.config		= config,
		.mmap_data	= 1,
		.sample_period	= 1,	/* every single fault */
	};

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel != NULL)
		evsel->handler = trace__pgfault;

	return evsel;
}
2096 
/*
 * Dispatch one mmap'ed event in live mode: non-sample records go to the
 * generic machine state updater, samples are mapped back to their evsel
 * and handed to that evsel's handler, with a sanity check for tracepoint
 * samples that arrived without a payload.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		/* MMAP, COMM, FORK, EXIT etc.: update machine/thread state */
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		/* Tracepoint sample without its raw payload: can't be parsed */
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2125 
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels, hook up
 * their handlers and payload field accessors, and add both to the
 * session evlist, stashing them in trace->syscalls.events.
 *
 * Returns 0 on success, -1 on failure (any partially created evsel is
 * deleted on the way out).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2171 
/*
 * Turn the -e syscall qualifier list (trace->ev_qualifier_ids) into an
 * "id" tracepoint filter expression (negated when the list was given as
 * an exclusion) and append it to both the sys_enter and sys_exit evsels.
 *
 * Returns 0 on success; on failure returns non-zero, with errno set to
 * ENOMEM when building the filter string itself failed.
 */
static int trace__set_ev_qualifier_filter(struct trace *trace)
{
	int err = -1;
	struct perf_evsel *sys_exit;
	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
						trace->ev_qualifier_ids.nr,
						trace->ev_qualifier_ids.entries);

	if (filter == NULL)
		goto out_enomem;

	/* Only bother with sys_exit if appending to sys_enter succeeded */
	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
					  filter)) {
		sys_exit = trace->syscalls.events.sys_exit;
		err = perf_evsel__append_tp_filter(sys_exit, filter);
	}

	free(filter);
out:
	return err;
out_enomem:
	errno = ENOMEM;
	goto out;
}
2196 
/*
 * The live-mode main loop: set up all requested events (syscalls,
 * vfs_getname, page faults, sched_stat_runtime), configure callchains,
 * open/mmap the evlist, optionally fork and start the workload, then
 * consume the ring buffers until interrupted or the workload exits,
 * finally printing the summary if requested.
 *
 * Returns 0 on success, a negative error otherwise; errors are reported
 * to trace->output through the out_error_* labels at the bottom.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target))
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* Drain every ring buffer, dispatching each record */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/* Workload exited: stop producing, keep draining */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* Nothing new this pass: poll (with a timeout once done) */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error labels below live in a nested scope so they can share one errbuf
 * without it occupying stack space on the non-error paths; this block is
 * only ever entered via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2455 
/*
 * Replay mode (-i): open the perf.data file, wire up the perf_tool
 * callbacks and the syscall/vfs_getname/page fault sample handlers,
 * then process the recorded event stream, optionally printing the
 * summary at the end.
 *
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page fault software events to trace__pgfault() */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	err = parse_target_str(trace);
	if (err != 0)
		goto out;

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2551 
/*
 * Print the banner preceding the per-thread summary table and return
 * the number of characters written.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2560 
/*
 * Resorted rb tree over a thread's syscall stats intlist (keyed by
 * syscall id, priv pointing to a struct stats), ordered by total msecs
 * spent in each syscall; the body below fills one sorted entry from an
 * intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total msecs = number of calls * average call duration */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2574 
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev%), sorted by total time spent, returning the number
 * of characters written (0 if the sort tree couldn't be built).
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* Raw stats are in nsecs; convert to msecs for display */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative stddev computed before avg is rescaled */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2617 
/*
 * Print one thread's summary line: comm, tid, event count and its share
 * of all events, major/minor fault counts when non-zero and, with
 * --sched, the accumulated runtime; followed by the per-syscall stats
 * table.  Returns the number of characters written (0 if the thread has
 * no thread_trace area).
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen in the session */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;	/* count the terminating newline we emitted */

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2645 
thread__nr_events(struct thread_trace * ttrace)2646 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2647 {
2648 	return ttrace ? ttrace->nr_events : 0;
2649 }
2650 
/*
 * Resorted rb tree over a machine's threads, ordered by each thread's
 * event count (via thread__nr_events() on its thread_trace priv area);
 * the body fills one sorted entry from a threads rb tree node.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2657 
/*
 * Print the -S/--summary report: the banner followed by one summary
 * block per thread of the traced machine, ordered by event count.
 * Returns the number of characters written, or 0 when the sorted thread
 * tree couldn't be built.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2676 
/*
 * --duration option callback: parse the minimum duration (in msecs) a
 * syscall must take to be printed.
 *
 * Returns 0 on success, -1 when @str is not a plain floating point
 * number, letting parse_options() report the usage error instead of
 * silently treating garbage as 0 (as the previous atof() call did).
 */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;
	char *end;
	double val;

	errno = 0;
	val = strtod(str, &end);
	/* Reject empty strings, trailing junk and out-of-range values */
	if (end == str || *end != '\0' || errno != 0)
		return -1;

	trace->duration_filter = val;
	return 0;
}
2685 
/*
 * --filter-pids option callback: parse the comma separated pid list in
 * @str into trace->filter_pids.entries, prepending our own pid so that
 * 'perf trace' never traces itself.
 *
 * Returns 0 on success, -1 on parse error or allocation failure.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL) {
		/* Don't leave nr describing a NULL entries array */
		trace->filter_pids.nr = 0;
		goto out;
	}

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	ret = 0;
out:
	intlist__delete(list);	/* was leaked when calloc() failed */
	return ret;
}
2717 
trace__open_output(struct trace * trace,const char * filename)2718 static int trace__open_output(struct trace *trace, const char *filename)
2719 {
2720 	struct stat st;
2721 
2722 	if (!stat(filename, &st) && st.st_size) {
2723 		char oldname[PATH_MAX];
2724 
2725 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2726 		unlink(oldname);
2727 		rename(filename, oldname);
2728 	}
2729 
2730 	trace->output = fopen(filename, "w");
2731 
2732 	return trace->output == NULL ? -errno : 0;
2733 }
2734 
/*
 * --pf option callback: map "all"/"maj"/"min" to the corresponding
 * TRACE_PF* bits and OR them into the flags word at opt->value.
 *
 * Returns 0 on success, -1 on an unrecognized string.
 */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *trace_pgfaults = opt->value;
	int bits;

	if (!strcmp(str, "all"))
		bits = TRACE_PFMAJ | TRACE_PFMIN;
	else if (!strcmp(str, "maj"))
		bits = TRACE_PFMAJ;
	else if (!strcmp(str, "min"))
		bits = TRACE_PFMIN;
	else
		return -1;

	*trace_pgfaults |= bits;
	return 0;
}
2751 
evlist__set_evsel_handler(struct perf_evlist * evlist,void * handler)2752 static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
2753 {
2754 	struct perf_evsel *evsel;
2755 
2756 	evlist__for_each_entry(evlist, evsel)
2757 		evsel->handler = handler;
2758 }
2759 
/*
 * Entry point for 'perf trace': parse options, set up the evlist and
 * syscall table, optionally enable callchains, then either replay a
 * recorded file (-i), delegate to 'perf trace record', or run/attach to
 * a live workload.  Returns 0 on success, negative/-1 on failure.
 */
int cmd_trace(int argc, const char **argv, const char *prefix __maybe_unused)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,	/* -1: syscalls.table not allocated yet */
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,	/* sentinel: no --uid given */
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,	/* sentinel: user didn't pass -m */
			.proc_map_timeout  = 500,
		},
		.output = stderr,	/* replaced below if -o/--output is used */
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,	/* sentinel: user didn't pass --max-stack */
	};
	const char *output_name = NULL;
	const char *ev_qualifier_str = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK(0, "event", &trace.evlist, "event",
		     "event selector. use 'perf list' to list available events",
		     parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_STRING('e', "expr", &ev_qualifier_str, "expr", "list of syscalls to trace"),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a stack trace if perf trace itself crashes. */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;	/* default error code for the checks below */

	if (trace.trace_pgfaults) {
		/* Pagefault events need the faulting address and a timestamp. */
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* Resolve the UINT_MAX sentinels left by option parsing. */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* --min/max-stack without an explicit --call-graph: use DWARF unwinding. */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/*
		 * Callchains need larger mmap buffers; as root, default to
		 * 4x the value from perf_event_mlock_kb_in_pages().
		 */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/*
	 * 'perf trace record ...' delegates everything to trace__record().
	 * NOTE(review): this returns without running the out/out_close
	 * cleanup; evlist/sctbl are left for process exit to reclaim.
	 */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && ev_qualifier_str) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Look up and stash the id of the "open" syscall. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	if (ev_qualifier_str != NULL) {
		const char *s = ev_qualifier_str;
		struct strlist_config slist_config = {
			.dirname = system_path(STRACE_GROUPS_DIR),
		};

		/* A leading '!' inverts the qualifier: trace all BUT these. */
		trace.not_ev_qualifier = *s == '!';
		if (trace.not_ev_qualifier)
			++s;
		trace.ev_qualifier = strlist__new(s, &slist_config);
		if (trace.ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier",
			      trace.output);
			err = -ENOMEM;
			goto out_close;
		}

		err = trace__validate_ev_qualifier(&trace);
		if (err)
			goto out_close;
	}

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload on the command line and no target: go system wide. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
2986