• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * builtin-trace.c
3  *
4  * Builtin 'trace' command:
5  *
6  * Display a continuously updated trace of any workload, CPU, specific PID,
7  * system wide, etc.  Default format is loosely strace like, but any other
8  * event may be specified using --event.
9  *
10  * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11  *
12  * Initially based on the 'trace' prototype by Thomas Gleixner:
13  *
14  * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15  *
16  * Released under the GPL v2. (and only v2, not any later version)
17  */
18 
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45 
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60 
61 #include "sane_ctype.h"
62 
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC		02000000
65 #endif
66 
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE	1024
69 #endif
70 
/*
 * Global state for one 'perf trace' session: the perf_tool callbacks, the
 * syscall id -> descriptor table, the event list being monitored, filtering
 * state and the various output/formatting knobs set from the command line.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;		/* syscall id <-> name table */
	struct {
		int		max;		/* highest syscall id seen, sizes 'table' */
		struct syscall  *table;		/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter */
					  *sys_exit;	/* raw_syscalls:sys_exit */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread whose entry is pending output */
	u64			base_time;	/* first sample time, for relative timestamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* -e syscall name list */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* PIDs to filter out (e.g. our own) */
	}			filter_pids;
	double			duration_filter;	/* --duration, in ms */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames resolved via vfs_getname */
				proc_getname;	/* filenames resolved via /proc readlink */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* -e was negated ("!name") */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* vfs_getname probe is in place */
	int			trace_pgfaults;	/* TRACE_PFMAJ | TRACE_PFMIN bits */
	int			open_id;	/* syscall id of "open", for filename handling */
};
122 
/*
 * Accessor for one tracepoint payload field: 'offset' into the raw sample
 * data plus a reader that is either an integer loader or a raw pointer
 * getter (union: exactly one of the two is ever installed per field).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
130 
/*
 * Generate tp_field__u{8,16,32,64}(): load a host-endian unsigned integer of
 * the given width from the raw sample payload.  memcpy() is used instead of a
 * direct cast — raw_data + offset is presumably not guaranteed to be aligned.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value;  \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);
143 
/*
 * Byte-swapping counterparts of TP_UINT_FIELD(), for samples recorded on a
 * machine of the opposite endianness (selected via evsel->needs_swap).
 * No 8-bit variant: a single byte needs no swapping.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155 
tp_field__init_uint(struct tp_field * field,struct format_field * format_field,bool needs_swap)156 static int tp_field__init_uint(struct tp_field *field,
157 			       struct format_field *format_field,
158 			       bool needs_swap)
159 {
160 	field->offset = format_field->offset;
161 
162 	switch (format_field->size) {
163 	case 1:
164 		field->integer = tp_field__u8;
165 		break;
166 	case 2:
167 		field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 		break;
169 	case 4:
170 		field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 		break;
172 	case 8:
173 		field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 		break;
175 	default:
176 		return -1;
177 	}
178 
179 	return 0;
180 }
181 
/* Return a pointer straight into the raw sample payload at this field's offset. */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}
186 
tp_field__init_ptr(struct tp_field * field,struct format_field * format_field)187 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
188 {
189 	field->offset = format_field->offset;
190 	field->pointer = tp_field__ptr;
191 	return 0;
192 }
193 
/*
 * Field accessors for the raw_syscalls tracepoints: the syscall 'id' plus,
 * depending on direction, the 'args' array (sys_enter) or the 'ret' value
 * (sys_exit) — hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200 
perf_evsel__init_tp_uint_field(struct perf_evsel * evsel,struct tp_field * field,const char * name)201 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
202 					  struct tp_field *field,
203 					  const char *name)
204 {
205 	struct format_field *format_field = perf_evsel__field(evsel, name);
206 
207 	if (format_field == NULL)
208 		return -1;
209 
210 	return tp_field__init_uint(field, format_field, evsel->needs_swap);
211 }
212 
/*
 * Convenience: bind the syscall_tp member called 'name' (id/args/ret) to the
 * tracepoint field of the same name, stringifying 'name' for the lookup.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
216 
/*
 * Look up a named field in the evsel's tracepoint format and bind the raw
 * pointer reader for it.  Returns -1 if the field does not exist.
 */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *fmt = perf_evsel__field(evsel, name);

	return fmt ? tp_field__init_ptr(field, fmt) : -1;
}
228 
/* Pointer-field counterpart of perf_evsel__init_sc_tp_uint_field(). */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232 
/* Free the evsel's private syscall_tp payload, then the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
238 
perf_evsel__init_syscall_tp(struct perf_evsel * evsel,void * handler)239 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
240 {
241 	evsel->priv = malloc(sizeof(struct syscall_tp));
242 	if (evsel->priv != NULL) {
243 		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
244 			goto out_delete;
245 
246 		evsel->handler = handler;
247 		return 0;
248 	}
249 
250 	return -ENOMEM;
251 
252 out_delete:
253 	zfree(&evsel->priv);
254 	return -ENOENT;
255 }
256 
perf_evsel__syscall_newtp(const char * direction,void * handler)257 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
258 {
259 	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
260 
261 	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
262 	if (IS_ERR(evsel))
263 		evsel = perf_evsel__newtp("syscalls", direction);
264 
265 	if (IS_ERR(evsel))
266 		return NULL;
267 
268 	if (perf_evsel__init_syscall_tp(evsel, handler))
269 		goto out_delete;
270 
271 	return evsel;
272 
273 out_delete:
274 	perf_evsel__delete_priv(evsel);
275 	return NULL;
276 }
277 
/* Read the syscall_tp member 'name' (id/args/ret) out of a sample as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Same, but returning a raw pointer into the sample payload. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285 
strarray__scnprintf(struct strarray * sa,char * bf,size_t size,const char * intfmt,int val)286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 	int idx = val - sa->offset;
289 
290 	if (idx < 0 || idx >= sa->nr_entries)
291 		return scnprintf(bf, size, intfmt, val);
292 
293 	return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295 
/* Adapt strarray__scnprintf() to the syscall-arg formatter signature,
 * taking the strarray from arg->parm and the value from arg->val. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}
302 
/* Default strarray formatter: out-of-range values print as decimal ("%d"). */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}
308 
309 #define SCA_STRARRAY syscall_arg__scnprintf_strarray
310 
/*
 * A set of strarrays covering disjoint value ranges (each with its own
 * offset), searched in order — e.g. the plain fcntl commands plus the
 * Linux-specific ones starting at F_LINUX_SPECIFIC_BASE.
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

/* Define a strarrays named strarrays__<array> wrapping 'array'. */
#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320 
syscall_arg__scnprintf_strarrays(char * bf,size_t size,struct syscall_arg * arg)321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 					struct syscall_arg *arg)
323 {
324 	struct strarrays *sas = arg->parm;
325 	int i;
326 
327 	for (i = 0; i < sas->nr_entries; ++i) {
328 		struct strarray *sa = sas->entries[i];
329 		int idx = arg->val - sa->offset;
330 
331 		if (idx >= 0 && idx < sa->nr_entries) {
332 			if (sa->entries[idx] == NULL)
333 				break;
334 			return scnprintf(bf, size, "%s", sa->entries[idx]);
335 		}
336 	}
337 
338 	return scnprintf(bf, size, "%d", arg->val);
339 }
340 
341 #ifndef AT_FDCWD
342 #define AT_FDCWD	-100
343 #endif
344 
syscall_arg__scnprintf_fd_at(char * bf,size_t size,struct syscall_arg * arg)345 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
346 					   struct syscall_arg *arg)
347 {
348 	int fd = arg->val;
349 
350 	if (fd == AT_FDCWD)
351 		return scnprintf(bf, size, "CWD");
352 
353 	return syscall_arg__scnprintf_fd(bf, size, arg);
354 }
355 
356 #define SCA_FDAT syscall_arg__scnprintf_fd_at
357 
358 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
359 					      struct syscall_arg *arg);
360 
361 #define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd
362 
/* Format an argument as hex with a "0x" prefix (addresses, opaque values). */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}
367 
/* Format an argument as a signed decimal int (cpu numbers, keys, ...). */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}
372 
/* Format an argument as a signed decimal long. */
size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377 
/*
 * Symbolic-name tables for enum-like syscall arguments, consumed via the
 * STRARRAY() arg formatter.  Entries are indexed by (value - offset); the
 * DEFINE_STRARRAY*() macros wrap each array in a struct strarray.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* EPOLL_CTL_ADD is 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* SEEK_DATA/SEEK_HOLE only exist on newer systems, so guard them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* F_CANCELLK is F_LINUX_SPECIFIC_BASE + 5, hence the [5] designator. */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

/* Both fcntl ranges, searched in order by syscall_arg__scnprintf_strarrays(). */
static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458 
syscall_arg__scnprintf_access_mode(char * bf,size_t size,struct syscall_arg * arg)459 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
460 						 struct syscall_arg *arg)
461 {
462 	size_t printed = 0;
463 	int mode = arg->val;
464 
465 	if (mode == F_OK) /* 0 */
466 		return scnprintf(bf, size, "F");
467 #define	P_MODE(n) \
468 	if (mode & n##_OK) { \
469 		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
470 		mode &= ~n##_OK; \
471 	}
472 
473 	P_MODE(R);
474 	P_MODE(W);
475 	P_MODE(X);
476 #undef P_MODE
477 
478 	if (mode)
479 		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
480 
481 	return printed;
482 }
483 
484 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
485 
486 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
487 					      struct syscall_arg *arg);
488 
489 #define SCA_FILENAME syscall_arg__scnprintf_filename
490 
syscall_arg__scnprintf_pipe_flags(char * bf,size_t size,struct syscall_arg * arg)491 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
492 						struct syscall_arg *arg)
493 {
494 	int printed = 0, flags = arg->val;
495 
496 #define	P_FLAG(n) \
497 	if (flags & O_##n) { \
498 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
499 		flags &= ~O_##n; \
500 	}
501 
502 	P_FLAG(CLOEXEC);
503 	P_FLAG(NONBLOCK);
504 #undef P_FLAG
505 
506 	if (flags)
507 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
508 
509 	return printed;
510 }
511 
512 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513 
514 #ifndef GRND_NONBLOCK
515 #define GRND_NONBLOCK	0x0001
516 #endif
517 #ifndef GRND_RANDOM
518 #define GRND_RANDOM	0x0002
519 #endif
520 
syscall_arg__scnprintf_getrandom_flags(char * bf,size_t size,struct syscall_arg * arg)521 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
522 						   struct syscall_arg *arg)
523 {
524 	int printed = 0, flags = arg->val;
525 
526 #define	P_FLAG(n) \
527 	if (flags & GRND_##n) { \
528 		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
529 		flags &= ~GRND_##n; \
530 	}
531 
532 	P_FLAG(RANDOM);
533 	P_FLAG(NONBLOCK);
534 #undef P_FLAG
535 
536 	if (flags)
537 		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
538 
539 	return printed;
540 }
541 
542 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543 
/* Initializer for a syscall_arg_fmt entry backed by a strarray table. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547 
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562 
/*
 * How to pretty-print one syscall argument: the formatter callback, its
 * optional parameter (e.g. a strarray), an optional name override, and
 * whether zero-valued args should still be shown.
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569 
570 static struct syscall_fmt {
571 	const char *name;
572 	const char *alias;
573 	struct syscall_arg_fmt arg[6];
574 	u8	   nr_args;
575 	bool	   errpid;
576 	bool	   timeout;
577 	bool	   hexret;
578 } syscall_fmts[] = {
579 	{ .name	    = "access",
580 	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
581 	{ .name	    = "arch_prctl", .alias = "prctl", },
582 	{ .name	    = "bpf",
583 	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
584 	{ .name	    = "brk",	    .hexret = true,
585 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
586 	{ .name     = "clock_gettime",
587 	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
588 	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
589 	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
590 		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
591 		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
592 		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
593 		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
594 	{ .name	    = "close",
595 	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
596 	{ .name	    = "epoll_ctl",
597 	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
598 	{ .name	    = "eventfd2",
599 	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
600 	{ .name	    = "fchmodat",
601 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
602 	{ .name	    = "fchownat",
603 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
604 	{ .name	    = "fcntl",
605 	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
606 			   .parm      = &strarrays__fcntl_cmds_arrays,
607 			   .show_zero = true, },
608 		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
609 	{ .name	    = "flock",
610 	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
611 	{ .name	    = "fstat", .alias = "newfstat", },
612 	{ .name	    = "fstatat", .alias = "newfstatat", },
613 	{ .name	    = "futex",
614 	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
615 	{ .name	    = "futimesat",
616 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
617 	{ .name	    = "getitimer",
618 	  .arg = { [0] = STRARRAY(which, itimers), }, },
619 	{ .name	    = "getpid",	    .errpid = true, },
620 	{ .name	    = "getpgid",    .errpid = true, },
621 	{ .name	    = "getppid",    .errpid = true, },
622 	{ .name	    = "getrandom",
623 	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
624 	{ .name	    = "getrlimit",
625 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
626 	{ .name	    = "ioctl",
627 	  .arg = {
628 #if defined(__i386__) || defined(__x86_64__)
629 /*
630  * FIXME: Make this available to all arches.
631  */
632 		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
633 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
634 #else
635 		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
636 #endif
637 	{ .name	    = "keyctl",
638 	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
639 	{ .name	    = "kill",
640 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
641 	{ .name	    = "linkat",
642 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
643 	{ .name	    = "lseek",
644 	  .arg = { [2] = STRARRAY(whence, whences), }, },
645 	{ .name	    = "lstat", .alias = "newlstat", },
646 	{ .name     = "madvise",
647 	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
648 		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
649 	{ .name	    = "mkdirat",
650 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
651 	{ .name	    = "mknodat",
652 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
653 	{ .name	    = "mlock",
654 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
655 	{ .name	    = "mlockall",
656 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
657 	{ .name	    = "mmap",	    .hexret = true,
658 /* The standard mmap maps to old_mmap on s390x */
659 #if defined(__s390x__)
660 	.alias = "old_mmap",
661 #endif
662 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
663 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
664 		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
665 	{ .name	    = "mprotect",
666 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
667 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
668 	{ .name	    = "mq_unlink",
669 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
670 	{ .name	    = "mremap",	    .hexret = true,
671 	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
672 		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
673 		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
674 	{ .name	    = "munlock",
675 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
676 	{ .name	    = "munmap",
677 	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
678 	{ .name	    = "name_to_handle_at",
679 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
680 	{ .name	    = "newfstatat",
681 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
682 	{ .name	    = "open",
683 	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
684 	{ .name	    = "open_by_handle_at",
685 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
686 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
687 	{ .name	    = "openat",
688 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
689 		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
690 	{ .name	    = "perf_event_open",
691 	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
692 		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
693 		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
694 	{ .name	    = "pipe2",
695 	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
696 	{ .name	    = "pkey_alloc",
697 	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
698 	{ .name	    = "pkey_free",
699 	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
700 	{ .name	    = "pkey_mprotect",
701 	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
702 		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
703 		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
704 	{ .name	    = "poll", .timeout = true, },
705 	{ .name	    = "ppoll", .timeout = true, },
706 	{ .name	    = "pread", .alias = "pread64", },
707 	{ .name	    = "preadv", .alias = "pread", },
708 	{ .name	    = "prlimit64",
709 	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
710 	{ .name	    = "pwrite", .alias = "pwrite64", },
711 	{ .name	    = "readlinkat",
712 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
713 	{ .name	    = "recvfrom",
714 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
715 	{ .name	    = "recvmmsg",
716 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
717 	{ .name	    = "recvmsg",
718 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
719 	{ .name	    = "renameat",
720 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
721 	{ .name	    = "rt_sigaction",
722 	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
723 	{ .name	    = "rt_sigprocmask",
724 	  .arg = { [0] = STRARRAY(how, sighow), }, },
725 	{ .name	    = "rt_sigqueueinfo",
726 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
727 	{ .name	    = "rt_tgsigqueueinfo",
728 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
729 	{ .name	    = "sched_setscheduler",
730 	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
731 	{ .name	    = "seccomp",
732 	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
733 		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
734 	{ .name	    = "select", .timeout = true, },
735 	{ .name	    = "sendmmsg",
736 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
737 	{ .name	    = "sendmsg",
738 	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
739 	{ .name	    = "sendto",
740 	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
741 	{ .name	    = "set_tid_address", .errpid = true, },
742 	{ .name	    = "setitimer",
743 	  .arg = { [0] = STRARRAY(which, itimers), }, },
744 	{ .name	    = "setrlimit",
745 	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
746 	{ .name	    = "socket",
747 	  .arg = { [0] = STRARRAY(family, socket_families),
748 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
749 	{ .name	    = "socketpair",
750 	  .arg = { [0] = STRARRAY(family, socket_families),
751 		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
752 	{ .name	    = "stat", .alias = "newstat", },
753 	{ .name	    = "statx",
754 	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
755 		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
756 		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
757 	{ .name	    = "swapoff",
758 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
759 	{ .name	    = "swapon",
760 	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
761 	{ .name	    = "symlinkat",
762 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
763 	{ .name	    = "tgkill",
764 	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
765 	{ .name	    = "tkill",
766 	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
767 	{ .name	    = "uname", .alias = "newuname", },
768 	{ .name	    = "unlinkat",
769 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
770 	{ .name	    = "utimensat",
771 	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
772 	{ .name	    = "wait4",	    .errpid = true,
773 	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
774 	{ .name	    = "waitid",	    .errpid = true,
775 	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
776 };
777 
syscall_fmt__cmp(const void * name,const void * fmtp)778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780 	const struct syscall_fmt *fmt = fmtp;
781 	return strcmp(name, fmt->name);
782 }
783 
syscall_fmt__find(const char * name)784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786 	const int nmemb = ARRAY_SIZE(syscall_fmts);
787 	return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789 
/*
 * Everything known about one syscall: its tracepoint format, argument list,
 * name, and the optional formatting overrides from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* exit/exit_group: never returns */
	struct syscall_fmt  *fmt;	/* NULL when no overrides exist */
	struct syscall_arg_fmt *arg_fmt;
};
799 
800 /*
801  * We need to have this 'calculated' boolean because in some cases we really
802  * don't know what is the duration of a syscall, for instance, when we start
803  * a session and some threads are waiting for a syscall to finish, say 'poll',
804  * in which case all we can do is to print "( ? ) for duration and for the
805  * start timestamp.
806  */
fprintf_duration(unsigned long t,bool calculated,FILE * fp)807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809 	double duration = (double)t / NSEC_PER_MSEC;
810 	size_t printed = fprintf(fp, "(");
811 
812 	if (!calculated)
813 		printed += fprintf(fp, "     ?   ");
814 	else if (duration >= 1.0)
815 		printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 	else if (duration >= 0.01)
817 		printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818 	else
819 		printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 	return printed + fprintf(fp, "): ");
821 }
822 
823 /**
824  * filename.ptr: The filename char pointer that will be vfs_getname'd
825  * filename.entry_str_pos: Where to insert the string translated from
826  *                         filename.ptr by the vfs_getname tracepoint/kprobe.
827  * ret_scnprintf: syscall args may set this to a different syscall return
828  *                formatter, for instance, fcntl may return fds, file flags, etc.
829  */
/* Per-thread tracing state, hung off struct thread's priv pointer. */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending;	/* sys_enter seen, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line being built */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	struct {
		unsigned long ptr;		/* filename pointer awaiting vfs_getname */
		short int     entry_str_pos;	/* where to splice the resolved name */
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;		/* last vfs_getname'd filename */
	} filename;
	struct {
		int	  max;		/* highest fd in 'table', -1 when empty */
		char	  **table;	/* fd -> pathname, lazily grown */
	} paths;

	struct intlist *syscall_stats;
};
852 
thread_trace__new(void)853 static struct thread_trace *thread_trace__new(void)
854 {
855 	struct thread_trace *ttrace =  zalloc(sizeof(struct thread_trace));
856 
857 	if (ttrace)
858 		ttrace->paths.max = -1;
859 
860 	ttrace->syscall_stats = intlist__new(NULL);
861 
862 	return ttrace;
863 }
864 
thread__trace(struct thread * thread,FILE * fp)865 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
866 {
867 	struct thread_trace *ttrace;
868 
869 	if (thread == NULL)
870 		goto fail;
871 
872 	if (thread__priv(thread) == NULL)
873 		thread__set_priv(thread, thread_trace__new());
874 
875 	if (thread__priv(thread) == NULL)
876 		goto fail;
877 
878 	ttrace = thread__priv(thread);
879 	++ttrace->nr_events;
880 
881 	return ttrace;
882 fail:
883 	color_fprintf(fp, PERF_COLOR_RED,
884 		      "WARNING: not enough memory, dropping samples!\n");
885 	return NULL;
886 }
887 
888 
/*
 * Let an argument beautifier override how this syscall's return value will
 * be formatted (e.g. fcntl(F_DUPFD) returns an fd, F_GETFL returns flags).
 * NOTE(review): assumes arg->thread already has its thread_trace allocated —
 * callers presumably guarantee this via thread__trace(); verify.
 */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
896 
/* Page-fault tracing selector bits — presumably driven by the pagefault
 * options; TODO confirm against the option parsing below. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Capacity of the per-thread buffer that stages a formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
901 
/*
 * Record 'pathname' as the cached path for 'fd' in the thread's fd->path
 * table, growing the table when fd exceeds the current capacity.
 *
 * Returns 0 on success, -1 on allocation failure; on realloc failure the
 * previous table is left intact.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		/* Zero only the newly-added slots so lookups see NULL. */
		if (ttrace->paths.max != -1) {
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First growth: the whole table is new. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	/* Note: an existing entry at 'fd' is overwritten without being freed —
	 * NOTE(review): possible leak if a fd is re-set; confirm callers. */
	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
927 
thread__read_fd_path(struct thread * thread,int fd)928 static int thread__read_fd_path(struct thread *thread, int fd)
929 {
930 	char linkname[PATH_MAX], pathname[PATH_MAX];
931 	struct stat st;
932 	int ret;
933 
934 	if (thread->pid_ == thread->tid) {
935 		scnprintf(linkname, sizeof(linkname),
936 			  "/proc/%d/fd/%d", thread->pid_, fd);
937 	} else {
938 		scnprintf(linkname, sizeof(linkname),
939 			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
940 	}
941 
942 	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
943 		return -1;
944 
945 	ret = readlink(linkname, pathname, sizeof(pathname));
946 
947 	if (ret < 0 || ret > st.st_size)
948 		return -1;
949 
950 	pathname[ret] = '\0';
951 	return trace__set_fd_pathname(thread, fd, pathname);
952 }
953 
thread__fd_path(struct thread * thread,int fd,struct trace * trace)954 static const char *thread__fd_path(struct thread *thread, int fd,
955 				   struct trace *trace)
956 {
957 	struct thread_trace *ttrace = thread__priv(thread);
958 
959 	if (ttrace == NULL)
960 		return NULL;
961 
962 	if (fd < 0)
963 		return NULL;
964 
965 	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
966 		if (!trace->live)
967 			return NULL;
968 		++trace->stats.proc_getname;
969 		if (thread__read_fd_path(thread, fd))
970 			return NULL;
971 	}
972 
973 	return ttrace->paths.table[fd];
974 }
975 
/* Format an fd argument as "N<path>" when the path is known, else just "N". */
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	const char *path;
	size_t n;

	n = scnprintf(bf, size, "%d", fd);
	path = thread__fd_path(arg->thread, fd, arg->trace);
	if (path != NULL)
		n += scnprintf(bf + n, size - n, "<%s>", path);

	return n;
}
987 
/*
 * Format the fd being close()d, then evict its cached pathname since the
 * number may be reused for a different file.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	struct thread_trace *ttrace = thread__priv(arg->thread);
	int fd = arg->val;
	size_t n = syscall_arg__scnprintf_fd(bf, size, arg);

	if (ttrace != NULL && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return n;
}
1000 
thread__set_filename_pos(struct thread * thread,const char * bf,unsigned long ptr)1001 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1002 				     unsigned long ptr)
1003 {
1004 	struct thread_trace *ttrace = thread__priv(thread);
1005 
1006 	ttrace->filename.ptr = ptr;
1007 	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1008 }
1009 
/*
 * Format a filename argument: when the vfs_getname tracepoint is armed we
 * emit nothing here and record where the resolved name should be spliced
 * in later; otherwise print the raw userspace pointer.
 *
 * Fix: 'ptr' is unsigned long, so the fallback must use %#lx — the
 * original "%#x" is a printf length-modifier mismatch (undefined behavior
 * on LP64, where unsigned long is 64-bit but %x consumes 32).
 */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	unsigned long ptr = arg->val;

	if (!arg->trace->vfs_getname)
		return scnprintf(bf, size, "%#lx", ptr);

	thread__set_filename_pos(arg->thread, bf, ptr);
	return 0;
}
1021 
trace__filter_duration(struct trace * trace,double t)1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 	return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026 
__trace__fprintf_tstamp(struct trace * trace,u64 tstamp,FILE * fp)1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 	double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030 
1031 	return fprintf(fp, "%10.3f ", ts);
1032 }
1033 
1034 /*
1035  * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036  * using ttrace->entry_time for a thread that receives a sys_exit without
1037  * first having received a sys_enter ("poll" issued before tracing session
1038  * starts, lost sys_enter exit due to ring buffer overflow).
1039  */
trace__fprintf_tstamp(struct trace * trace,u64 tstamp,FILE * fp)1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042 	if (tstamp > 0)
1043 		return __trace__fprintf_tstamp(trace, tstamp, fp);
1044 
1045 	return fprintf(fp, "         ? ");
1046 }
1047 
static bool done = false;		/* main loop termination flag */
static bool interrupted = false;	/* true when termination came from SIGINT */

/* Any handled signal stops the loop; only SIGINT counts as an interrupt. */
static void sig_handler(int sig)
{
	interrupted = (sig == SIGINT);
	done = true;
}
1056 
trace__fprintf_entry_head(struct trace * trace,struct thread * thread,u64 duration,bool duration_calculated,u64 tstamp,FILE * fp)1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 					u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060 	size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 	printed += fprintf_duration(duration, duration_calculated, fp);
1062 
1063 	if (trace->multiple_threads) {
1064 		if (trace->show_comm)
1065 			printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 		printed += fprintf(fp, "%d ", thread->tid);
1067 	}
1068 
1069 	return printed;
1070 }
1071 
trace__process_event(struct trace * trace,struct machine * machine,union perf_event * event,struct perf_sample * sample)1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 				union perf_event *event, struct perf_sample *sample)
1074 {
1075 	int ret = 0;
1076 
1077 	switch (event->header.type) {
1078 	case PERF_RECORD_LOST:
1079 		color_fprintf(trace->output, PERF_COLOR_RED,
1080 			      "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 		ret = machine__process_lost_event(machine, event, sample);
1082 		break;
1083 	default:
1084 		ret = machine__process_event(machine, event, sample);
1085 		break;
1086 	}
1087 
1088 	return ret;
1089 }
1090 
trace__tool_process(struct perf_tool * tool,union perf_event * event,struct perf_sample * sample,struct machine * machine)1091 static int trace__tool_process(struct perf_tool *tool,
1092 			       union perf_event *event,
1093 			       struct perf_sample *sample,
1094 			       struct machine *machine)
1095 {
1096 	struct trace *trace = container_of(tool, struct trace, tool);
1097 	return trace__process_event(trace, machine, event, sample);
1098 }
1099 
trace__machine__resolve_kernel_addr(void * vmachine,unsigned long long * addrp,char ** modp)1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1101 {
1102 	struct machine *machine = vmachine;
1103 
1104 	if (machine->kptr_restrict_warned)
1105 		return NULL;
1106 
1107 	if (symbol_conf.kptr_restrict) {
1108 		pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109 			   "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110 			   "Kernel samples will not be resolved.\n");
1111 		machine->kptr_restrict_warned = true;
1112 		return NULL;
1113 	}
1114 
1115 	return machine__resolve_kernel_addr(vmachine, addrp, modp);
1116 }
1117 
/*
 * Initialize symbol resolution and a host 'machine', then synthesize
 * events for already-running threads so their maps/comms are known before
 * samples arrive. Returns 0 or a negative error.
 *
 * NOTE(review): the -ENOMEM / -errno early returns do not undo
 * symbol__init() (only the synthesize-failure path calls symbol__exit());
 * confirm whether callers tolerate that asymmetry.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* Resolve kernel addresses through the kptr_restrict-aware wrapper. */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1140 
trace__symbols__exit(struct trace * trace)1141 static void trace__symbols__exit(struct trace *trace)
1142 {
1143 	machine__exit(trace->host);
1144 	trace->host = NULL;
1145 
1146 	symbol__exit();
1147 }
1148 
syscall__alloc_arg_fmts(struct syscall * sc,int nr_args)1149 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1150 {
1151 	int idx;
1152 
1153 	if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1154 		nr_args = sc->fmt->nr_args;
1155 
1156 	sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1157 	if (sc->arg_fmt == NULL)
1158 		return -1;
1159 
1160 	for (idx = 0; idx < nr_args; ++idx) {
1161 		if (sc->fmt)
1162 			sc->arg_fmt[idx] = sc->fmt->arg[idx];
1163 	}
1164 
1165 	sc->nr_args = nr_args;
1166 	return 0;
1167 }
1168 
/*
 * Pick a default formatter for each tracepoint argument that the
 * hand-written fmt table did not already cover, based on the field's type
 * and name from the tracepoint /format file. Always returns 0.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* An explicit formatter from the fmt table wins. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * Heuristic: integer fields whose name ends in "fd"
			 * are file descriptors. Survey of
			 * /sys/kernel/tracing/events/syscalls/sys_enter*:
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1207 
/*
 * Populate trace->syscalls.table[id]: resolve the syscall name, look up
 * its hand-written format entry, load the sys_enter tracepoint format
 * (falling back to the fmt alias when the primary name has no tracepoint)
 * and wire up per-argument formatters. Grows the table on demand.
 * Returns 0 on success, -1 on failure.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero only the newly-added entries. */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max   = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt  = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints are named after an alias (see fmt table). */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* Allocate arg formatters even without a tp_format (6 generic args). */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit/exit_group never return, so their entry line is flushed at once. */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1268 
/*
 * Translate the -e/--expr syscall name list (trace->ev_qualifier) into
 * syscall ids in trace->ev_qualifier_ids. Entries may be exact names or
 * globs; globs can match several syscalls, growing the id array. On any
 * invalid name every offender is reported, the ids are freed and a
 * negative error is returned.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: try it as a glob. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* First failure opens the error message, later ones append. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Glob matched: collect every remaining syscall it matches. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1344 
1345 /*
1346  * args is to be interpreted as a series of longs but we need to handle
1347  * 8-byte unaligned accesses. args points to raw_data within the event
1348  * and raw_data is guaranteed to be 8-byte unaligned because it is
1349  * preceded by raw_size which is a u32. So we need to copy args to a temp
1350  * variable to read it. Most notably this avoids extended load instructions
1351  * on unaligned addresses
1352  */
syscall_arg__val(struct syscall_arg * arg,u8 idx)1353 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1354 {
1355 	unsigned long val;
1356 	unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1357 
1358 	memcpy(&val, p, sizeof(val));
1359 	return val;
1360 }
1361 
/* Print "name: " for an argument, falling back to "argN: " when unnamed. */
static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
				      struct syscall_arg *arg)
{
	const char *name = sc->arg_fmt ? sc->arg_fmt[arg->idx].name : NULL;

	if (name != NULL)
		return scnprintf(bf, size, "%s: ", name);

	return scnprintf(bf, size, "arg%d: ", arg->idx);
}
1370 
/*
 * Format one argument value, preferring the argument's dedicated scnprintf
 * formatter (with its parm, if any); otherwise print as a signed long.
 */
static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
				     struct syscall_arg *arg, unsigned long val)
{
	if (!(sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf))
		return scnprintf(bf, size, "%ld", val);

	arg->val = val;
	if (sc->arg_fmt[arg->idx].parm)
		arg->parm = sc->arg_fmt[arg->idx].parm;

	return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
}
1382 
/*
 * Format all arguments of a syscall into 'bf'. When the tracepoint format
 * was read, iterate its fields (skipping zero-valued args with no strarray
 * entry); otherwise fall back to printing the raw 6 args. arg.mask lets a
 * formatter suppress args it already consumed. Returns chars written.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1455 
/* Signature shared by the per-tracepoint sample handlers in this file
 * (trace__sys_enter, trace__sys_exit, trace__vfs_getname, ...). */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1459 
/*
 * Return the struct syscall for 'id', lazily reading its info into the
 * table on first use. NULL on an invalid id or when the info can't be
 * read (with a diagnostic at sufficient verbosity).
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
 		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Lazily fill the table entry the first time this id shows up. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: the entry must really be populated now. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1502 
thread__update_stats(struct thread_trace * ttrace,int id,struct perf_sample * sample)1503 static void thread__update_stats(struct thread_trace *ttrace,
1504 				 int id, struct perf_sample *sample)
1505 {
1506 	struct int_node *inode;
1507 	struct stats *stats;
1508 	u64 duration = 0;
1509 
1510 	inode = intlist__findnew(ttrace->syscall_stats, id);
1511 	if (inode == NULL)
1512 		return;
1513 
1514 	stats = inode->priv;
1515 	if (stats == NULL) {
1516 		stats = malloc(sizeof(struct stats));
1517 		if (stats == NULL)
1518 			return;
1519 		init_stats(stats);
1520 		inode->priv = stats;
1521 	}
1522 
1523 	if (ttrace->entry_time && sample->time > ttrace->entry_time)
1524 		duration = sample->time - ttrace->entry_time;
1525 
1526 	update_stats(stats, duration);
1527 }
1528 
trace__printf_interrupted_entry(struct trace * trace,struct perf_sample * sample)1529 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1530 {
1531 	struct thread_trace *ttrace;
1532 	u64 duration;
1533 	size_t printed;
1534 
1535 	if (trace->current == NULL)
1536 		return 0;
1537 
1538 	ttrace = thread__priv(trace->current);
1539 
1540 	if (!ttrace->entry_pending)
1541 		return 0;
1542 
1543 	duration = sample->time - ttrace->entry_time;
1544 
1545 	printed  = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1546 	printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1547 	ttrace->entry_pending = false;
1548 
1549 	return printed;
1550 }
1551 
/*
 * raw_syscalls:sys_enter handler: format "name(args" into the thread's
 * entry_str. Exit-like syscalls (exit, exit_group) are flushed immediately
 * since no sys_exit will follow; everything else is held pending until
 * trace__sys_exit completes the line with ") = ret".
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the staging buffer for this thread. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	/* Flush a still-pending line from the previously-current thread. */
	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will come: print the whole line now. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the current thread so interleaved output can be detected. */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1610 
trace__resolve_callchain(struct trace * trace,struct perf_evsel * evsel,struct perf_sample * sample,struct callchain_cursor * cursor)1611 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1612 				    struct perf_sample *sample,
1613 				    struct callchain_cursor *cursor)
1614 {
1615 	struct addr_location al;
1616 
1617 	if (machine__resolve(trace->host, &al, sample) < 0 ||
1618 	    thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1619 		return -1;
1620 
1621 	return 0;
1622 }
1623 
trace__fprintf_callchain(struct trace * trace,struct perf_sample * sample)1624 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1625 {
1626 	/* TODO: user-configurable print_opts */
1627 	const unsigned int print_opts = EVSEL__PRINT_SYM |
1628 				        EVSEL__PRINT_DSO |
1629 				        EVSEL__PRINT_UNKNOWN_AS_ADDR;
1630 
1631 	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1632 }
1633 
/*
 * raw_syscalls:sys_exit handler: pairs with the sys_enter stashed in
 * ttrace->entry_str, computes the duration, applies --duration /
 * --min-stack / --summary-only filtering, completes the fd->path mapping
 * for open-like syscalls, and prints ") = ret" (formatted per-syscall)
 * plus an optional callchain.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/*
	 * A successful open-like return completes the fd->path mapping begun
	 * by the vfs_getname tracepoint (see trace__vfs_getname).
	 */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0 means the matching sys_enter was never seen. */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Too shallow for --min-stack: skip the whole line. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry line was already flushed (interrupted) or never seen. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	/* Format the return value; note the goto labels inside this chain. */
	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		/* One-shot formatter installed by an argument (e.g. fcntl cmd). */
		char bf[1024];
		struct syscall_arg arg = {
			.val	= ret,
			.thread	= thread,
			.trace	= trace,
		};
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* Return value is a pid: show the child's comm when known. */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1747 
/*
 * probe:vfs_getname handler: copy the just-resolved filename into the
 * thread's filename state and, when a sys_enter reserved a splice point
 * (filename.ptr / entry_str_pos), insert the name into the pending
 * entry_str in place of the raw pointer, truncating from the left when
 * the buffer is short on space.
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread name buffer to fit this filename. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	/* Consumed by trace__sys_exit to map the returned fd to this path. */
	ttrace->filename.pending_open = true;

	/* No sys_enter reserved a splice point: nothing to patch. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep the tail of the filename when it doesn't all fit. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Open a gap at the splice point and copy the filename in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1808 
/*
 * sched:sched_stat_runtime handler: accumulates on-CPU runtime, in
 * milliseconds, both per-thread and globally. When no per-thread state
 * can be obtained the raw event fields are dumped instead.
 * Always returns 0.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace != NULL) {
		ttrace->runtime_ms += runtime_ms;
		trace->runtime_ms += runtime_ms;
	} else {
		fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
			evsel->name,
			perf_evsel__strval(evsel, sample, "comm"),
			(pid_t)perf_evsel__intval(evsel, sample, "pid"),
			runtime,
			perf_evsel__intval(evsel, sample, "vruntime"));
	}

	thread__put(thread);
	return 0;
}
1838 
bpf_output__printer(enum binary_printer_ops op,unsigned int val,void * extra)1839 static void bpf_output__printer(enum binary_printer_ops op,
1840 				unsigned int val, void *extra)
1841 {
1842 	FILE *output = extra;
1843 	unsigned char ch = (unsigned char)val;
1844 
1845 	switch (op) {
1846 	case BINARY_PRINT_CHAR_DATA:
1847 		fprintf(output, "%c", isprint(ch) ? ch : '.');
1848 		break;
1849 	case BINARY_PRINT_DATA_BEGIN:
1850 	case BINARY_PRINT_LINE_BEGIN:
1851 	case BINARY_PRINT_ADDR:
1852 	case BINARY_PRINT_NUM_DATA:
1853 	case BINARY_PRINT_NUM_PAD:
1854 	case BINARY_PRINT_SEP:
1855 	case BINARY_PRINT_CHAR_PAD:
1856 	case BINARY_PRINT_LINE_END:
1857 	case BINARY_PRINT_DATA_END:
1858 	default:
1859 		break;
1860 	}
1861 }
1862 
/*
 * Prints a BPF output event's raw payload to trace->output via
 * print_binary(), 8 bytes per chunk, using bpf_output__printer above.
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1869 
/*
 * Default handler for non-syscall events (--event): prints a timestamped
 * line with the event name and its payload -- hexdumped for BPF output
 * events, pretty-printed via the tracepoint format otherwise -- plus an
 * optional callchain. Always returns 0.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Skip events with callchains shallower than --min-stack. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Empty parens keep columns aligned with the syscall duration field. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "(         ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1910 
print_location(FILE * f,struct perf_sample * sample,struct addr_location * al,bool print_dso,bool print_sym)1911 static void print_location(FILE *f, struct perf_sample *sample,
1912 			   struct addr_location *al,
1913 			   bool print_dso, bool print_sym)
1914 {
1915 
1916 	if ((verbose > 0 || print_dso) && al->map)
1917 		fprintf(f, "%s@", al->map->dso->long_name);
1918 
1919 	if ((verbose > 0 || print_sym) && al->sym)
1920 		fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1921 			al->addr - al->sym->start);
1922 	else if (al->map)
1923 		fprintf(f, "0x%" PRIx64, al->addr);
1924 	else
1925 		fprintf(f, "0x%" PRIx64, sample->addr);
1926 }
1927 
/*
 * Page fault software event handler: counts major/minor faults per thread
 * and, unless in summary-only mode, prints a line resolving both the
 * faulting IP and the faulted-on address to symbol/map when possible.
 * Returns 0 on success, -1 when per-thread state could not be obtained.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';	/* 'd' data, 'x' executable, '?' unknown */
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Skip events with callchains shallower than --min-stack. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	/* Resolve the faulting instruction pointer. */
	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
			      sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Resolve the faulted-on address: first as data, then as code. */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2003 
trace__set_base_time(struct trace * trace,struct perf_evsel * evsel,struct perf_sample * sample)2004 static void trace__set_base_time(struct trace *trace,
2005 				 struct perf_evsel *evsel,
2006 				 struct perf_sample *sample)
2007 {
2008 	/*
2009 	 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2010 	 * and don't use sample->time unconditionally, we may end up having
2011 	 * some other event in the future without PERF_SAMPLE_TIME for good
2012 	 * reason, i.e. we may not be interested in its timestamps, just in
2013 	 * it taking place, picking some piece of information when it
2014 	 * appears in our event stream (vfs_getname comes to mind).
2015 	 */
2016 	if (trace->base_time == 0 && !trace->full_time &&
2017 	    (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2018 		trace->base_time = sample->time;
2019 }
2020 
/*
 * perf_tool sample callback used in replay mode: dispatches each sample to
 * the handler hooked on its evsel, skipping threads excluded by the thread
 * filter. Always returns 0.
 */
static int trace__process_sample(struct perf_tool *tool,
				 union perf_event *event,
				 struct perf_sample *sample,
				 struct perf_evsel *evsel,
				 struct machine *machine __maybe_unused)
{
	struct trace *trace = container_of(tool, struct trace, tool);
	tracepoint_handler handler = evsel->handler;
	struct thread *thread;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (!(thread && thread__is_filtered(thread))) {
		trace__set_base_time(trace, evsel, sample);

		if (handler) {
			++trace->nr_events;
			handler(trace, evsel, event, sample);
		}
	}

	thread__put(thread);
	return 0;
}
2047 
trace__record(struct trace * trace,int argc,const char ** argv)2048 static int trace__record(struct trace *trace, int argc, const char **argv)
2049 {
2050 	unsigned int rec_argc, i, j;
2051 	const char **rec_argv;
2052 	const char * const record_args[] = {
2053 		"record",
2054 		"-R",
2055 		"-m", "1024",
2056 		"-c", "1",
2057 	};
2058 
2059 	const char * const sc_args[] = { "-e", };
2060 	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
2061 	const char * const majpf_args[] = { "-e", "major-faults" };
2062 	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
2063 	const char * const minpf_args[] = { "-e", "minor-faults" };
2064 	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
2065 
2066 	/* +1 is for the event string below */
2067 	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
2068 		majpf_args_nr + minpf_args_nr + argc;
2069 	rec_argv = calloc(rec_argc + 1, sizeof(char *));
2070 
2071 	if (rec_argv == NULL)
2072 		return -ENOMEM;
2073 
2074 	j = 0;
2075 	for (i = 0; i < ARRAY_SIZE(record_args); i++)
2076 		rec_argv[j++] = record_args[i];
2077 
2078 	if (trace->trace_syscalls) {
2079 		for (i = 0; i < sc_args_nr; i++)
2080 			rec_argv[j++] = sc_args[i];
2081 
2082 		/* event string may be different for older kernels - e.g., RHEL6 */
2083 		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2084 			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2085 		else if (is_valid_tracepoint("syscalls:sys_enter"))
2086 			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2087 		else {
2088 			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2089 			free(rec_argv);
2090 			return -1;
2091 		}
2092 	}
2093 
2094 	if (trace->trace_pgfaults & TRACE_PFMAJ)
2095 		for (i = 0; i < majpf_args_nr; i++)
2096 			rec_argv[j++] = majpf_args[i];
2097 
2098 	if (trace->trace_pgfaults & TRACE_PFMIN)
2099 		for (i = 0; i < minpf_args_nr; i++)
2100 			rec_argv[j++] = minpf_args[i];
2101 
2102 	for (i = 0; i < (unsigned int)argc; i++)
2103 		rec_argv[j++] = argv[i];
2104 
2105 	return cmd_record(j, rec_argv);
2106 }
2107 
2108 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2109 
/*
 * Tries to add the probe:vfs_getname* probe event(s) so pathnames can be
 * spliced into syscall entry lines. Probe variants that lack the
 * "pathname" field our handler reads are unlinked and discarded again.
 * Returns true when at least one usable vfs_getname event was hooked.
 */
static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
{
	bool found = false;
	struct perf_evsel *evsel, *tmp;
	struct parse_events_error err = { .idx = 0, };
	int ret = parse_events(evlist, "probe:vfs_getname*", &err);

	if (ret)
		return false;

	evlist__for_each_entry_safe(evlist, evsel, tmp) {
		if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
			continue;

		/* Usable variant: has the field trace__vfs_getname reads. */
		if (perf_evsel__field(evsel, "pathname")) {
			evsel->handler = trace__vfs_getname;
			found = true;
			continue;
		}

		/* No "pathname" field: remove this evsel from the list. */
		list_del_init(&evsel->node);
		evsel->evlist = NULL;
		perf_evsel__delete(evsel);
	}

	return found;
}
2137 
/*
 * Creates an evsel for one of the page fault software events (config is
 * PERF_COUNT_SW_PAGE_FAULTS_MAJ or _MIN), sampling every fault and
 * recording mmap data, with trace__pgfault as its handler.
 * Returns NULL on allocation failure.
 */
static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
{
	struct perf_evsel *evsel;
	struct perf_event_attr attr = {
		.type = PERF_TYPE_SOFTWARE,
		.mmap_data = 1,
	};

	attr.config = config;
	attr.sample_period = 1;

	event_attr_init(&attr);

	evsel = perf_evsel__new(&attr);
	if (evsel == NULL)
		return NULL;

	evsel->handler = trace__pgfault;
	return evsel;
}
2157 
/*
 * Top-level dispatcher for ring buffer records in live mode: side-band
 * (non-sample) records go to trace__process_event(), samples are routed
 * to the handler of the evsel they belong to, after sanity checks.
 */
static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
{
	const u32 type = event->header.type;
	struct perf_evsel *evsel;

	if (type != PERF_RECORD_SAMPLE) {
		trace__process_event(trace, trace->host, event, sample);
		return;
	}

	evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
	if (evsel == NULL) {
		fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
		return;
	}

	trace__set_base_time(trace, evsel, sample);

	/* Tracepoint samples must carry a raw payload to be decoded. */
	if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
	    sample->raw_data == NULL) {
		fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
		       perf_evsel__name(evsel), sample->tid,
		       sample->cpu, sample->raw_size);
	} else {
		tracepoint_handler handler = evsel->handler;
		handler(trace, evsel, event, sample);
	}
}
2186 
/*
 * Creates and adds the raw_syscalls:sys_enter/sys_exit tracepoint evsels
 * that drive the strace-like output, wiring their handlers and caching the
 * tracepoint fields (args, ret) used at sample time. Returns 0 on success,
 * -1 on failure, undoing whatever had been created.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit  = sys_exit;

	ret = 0;
out:
	return ret;

out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2232 
trace__set_ev_qualifier_filter(struct trace * trace)2233 static int trace__set_ev_qualifier_filter(struct trace *trace)
2234 {
2235 	int err = -1;
2236 	struct perf_evsel *sys_exit;
2237 	char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2238 						trace->ev_qualifier_ids.nr,
2239 						trace->ev_qualifier_ids.entries);
2240 
2241 	if (filter == NULL)
2242 		goto out_enomem;
2243 
2244 	if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2245 					  filter)) {
2246 		sys_exit = trace->syscalls.events.sys_exit;
2247 		err = perf_evsel__append_tp_filter(sys_exit, filter);
2248 	}
2249 
2250 	free(filter);
2251 out:
2252 	return err;
2253 out_enomem:
2254 	errno = ENOMEM;
2255 	goto out;
2256 }
2257 
/*
 * Builds the default pid filter list: perf's own pid plus, when an "sshd"
 * ancestor is found, that sshd's pid -- presumably to break the feedback
 * loop of tracing the ssh session that is displaying the trace
 * (NOTE(review): confirm intent).
 */
static int trace__set_filter_loop_pids(struct trace *trace)
{
	unsigned int nr = 1;
	pid_t pids[32] = {
		getpid(),
	};
	struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);

	/* Walk up the parent chain looking for an "sshd" ancestor. */
	while (thread && nr < ARRAY_SIZE(pids)) {
		struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);

		if (parent == NULL)
			break;

		if (!strcmp(thread__comm_str(parent), "sshd")) {
			pids[nr++] = parent->tid;
			break;
		}
		thread = parent;
	}

	return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
}
2281 
/*
 * Live tracing mode: sets up the requested events on trace->evlist
 * (syscall tracepoints, vfs_getname, page faults, sched_stat_runtime),
 * starts the workload and/or enables the events, then consumes the mmap
 * ring buffers until the workload finishes or the user interrupts,
 * finally printing the optional summaries. Returns 0 on success,
 * negative on error.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	/* Add the events implied by the command line options. */
	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	/* Configure callchains per-evsel, where requested. */
	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* Drain all per-CPU ring buffers, dispatching each record. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	/* Nothing new? Poll for more, draining leftovers once done. */
	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * The block below exists only to scope errbuf for the strerror-style
 * error labels; it is entered exclusively via goto.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2547 
/*
 * Replay mode (perf trace -i perf.data): opens a perf_session over the
 * recorded file, hooks the syscall, page fault and vfs_getname handlers
 * onto the recorded evsels and processes all events in timestamp order.
 * Returns 0 on success, negative on error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname",	     trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	/* Restrict symbol resolution to the pids/tids being replayed. */
	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route recorded page fault software events to our handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2646 
/* Emits the banner preceding the per-thread summary; returns bytes written. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2655 
/*
 * Resort helper (see rb_resort.h): turns the per-thread syscall stats
 * intlist into an rb-tree ordered by total time spent (msecs). The body
 * below fills one resort entry from an intlist node, whose priv carries
 * the struct stats accumulated at sys_exit time.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats 	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total = count * average; stats are in ns, report msecs */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2669 
/*
 * Prints one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), sorted by total time via the syscall_stats resort
 * helper above. Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* stats are kept in nanoseconds, report milliseconds */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* relative stddev, as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2712 
/*
 * Prints the summary entry for one thread: comm, tid, event count and its
 * share of all events, page fault counts, accumulated runtime (with
 * --sched), followed by the per-syscall stats table.
 * Returns characters printed, 0 when the thread has no trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* This thread's share of all events seen, in percent. */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2740 
thread__nr_events(struct thread_trace * ttrace)2741 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2742 {
2743 	return ttrace ? ttrace->nr_events : 0;
2744 }
2745 
/*
 * Resort helper (see rb_resort.h): resorts the machine's threads into an
 * rb-tree ordered by per-thread event count, for the end-of-run summary.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2752 
/*
 * Print the end-of-run per-thread summary: a header followed by one
 * trace__fprintf_thread() block per thread, ordered by event count via the
 * resort-rb helpers generated above.  Returns characters printed, or 0 if
 * the resort allocation failed (error message goes to fp).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	/* Frees only the resorted copy, not the threads themselves */
	resort_rb__delete(threads);

	return printed;
}
2771 
/*
 * parse_options() callback for --duration: set the minimum duration (in ms,
 * possibly fractional) a syscall must take to be printed.
 *
 * Returns 0 on success, -1 on a malformed number so parse_options() reports
 * the bad argument instead of silently filtering on 0.0 (as atof() did).
 */
static int trace__set_duration(const struct option *opt, const char *str,
			       int unset __maybe_unused)
{
	struct trace *trace = opt->value;
	char *end;
	double duration;

	errno = 0;
	duration = strtod(str, &end);

	/* Reject empty input, trailing junk and out-of-range values */
	if (end == str || *end != '\0' || errno == ERANGE)
		return -1;

	trace->duration_filter = duration;
	return 0;
}
2780 
/*
 * parse_options() callback for --filter-pids: parse a CSV list of pids into
 * trace->filter_pids, reserving entries[0] for our own pid so perf trace
 * does not trace itself.
 *
 * Returns 0 on success, -1 on parse or allocation failure.
 */
static int trace__set_filter_pids(const struct option *opt, const char *str,
				  int unset __maybe_unused)
{
	int ret = -1;
	size_t i;
	struct trace *trace = opt->value;
	/*
	 * FIXME: introduce a intarray class, plain parse csv and create a
	 * { int nr, int entries[] } struct...
	 */
	struct intlist *list = intlist__new(str);

	if (list == NULL)
		return -1;

	/* +1: extra slot for our own pid, stored at entries[0] */
	i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
	trace->filter_pids.entries = calloc(i, sizeof(pid_t));

	if (trace->filter_pids.entries == NULL) {
		/* Don't leave nr claiming entries that were never allocated */
		trace->filter_pids.nr = 0;
		goto out;
	}

	trace->filter_pids.entries[0] = getpid();

	for (i = 1; i < trace->filter_pids.nr; ++i)
		trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;

	ret = 0;
out:
	/* Was leaked on the calloc() failure path before */
	intlist__delete(list);
	return ret;
}
2812 
trace__open_output(struct trace * trace,const char * filename)2813 static int trace__open_output(struct trace *trace, const char *filename)
2814 {
2815 	struct stat st;
2816 
2817 	if (!stat(filename, &st) && st.st_size) {
2818 		char oldname[PATH_MAX];
2819 
2820 		scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2821 		unlink(oldname);
2822 		rename(filename, oldname);
2823 	}
2824 
2825 	trace->output = fopen(filename, "w");
2826 
2827 	return trace->output == NULL ? -errno : 0;
2828 }
2829 
/*
 * parse_options() callback for -F/--pf: translate "all", "maj" or "min"
 * into TRACE_PFMAJ/TRACE_PFMIN bits OR'ed into the caller's flag word.
 *
 * Returns 0 on success, -1 on an unrecognized keyword.
 */
static int parse_pagefaults(const struct option *opt, const char *str,
			    int unset __maybe_unused)
{
	int *pgfaults = opt->value;
	int mask;

	if (!strcmp(str, "all"))
		mask = TRACE_PFMAJ | TRACE_PFMIN;
	else if (!strcmp(str, "maj"))
		mask = TRACE_PFMAJ;
	else if (!strcmp(str, "min"))
		mask = TRACE_PFMIN;
	else
		return -1;

	*pgfaults |= mask;
	return 0;
}
2846 
/* Install the same sample handler on every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2854 
2855 /*
2856  * XXX: Hackish, just splitting the combined -e+--event (syscalls
2857  * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2858  * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2859  *
2860  * It'd be better to introduce a parse_options() variant that would return a
2861  * list with the terms it didn't match to an event...
2862  */
trace__parse_events_option(const struct option * opt,const char * str,int unset __maybe_unused)2863 static int trace__parse_events_option(const struct option *opt, const char *str,
2864 				      int unset __maybe_unused)
2865 {
2866 	struct trace *trace = (struct trace *)opt->value;
2867 	const char *s = str;
2868 	char *sep = NULL, *lists[2] = { NULL, NULL, };
2869 	int len = strlen(str) + 1, err = -1, list, idx;
2870 	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2871 	char group_name[PATH_MAX];
2872 
2873 	if (strace_groups_dir == NULL)
2874 		return -1;
2875 
2876 	if (*s == '!') {
2877 		++s;
2878 		trace->not_ev_qualifier = true;
2879 	}
2880 
2881 	while (1) {
2882 		if ((sep = strchr(s, ',')) != NULL)
2883 			*sep = '\0';
2884 
2885 		list = 0;
2886 		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
2887 		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
2888 			list = 1;
2889 		} else {
2890 			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2891 			if (access(group_name, R_OK) == 0)
2892 				list = 1;
2893 		}
2894 
2895 		if (lists[list]) {
2896 			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2897 		} else {
2898 			lists[list] = malloc(len);
2899 			if (lists[list] == NULL)
2900 				goto out;
2901 			strcpy(lists[list], s);
2902 		}
2903 
2904 		if (!sep)
2905 			break;
2906 
2907 		*sep = ',';
2908 		s = sep + 1;
2909 	}
2910 
2911 	if (lists[1] != NULL) {
2912 		struct strlist_config slist_config = {
2913 			.dirname = strace_groups_dir,
2914 		};
2915 
2916 		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2917 		if (trace->ev_qualifier == NULL) {
2918 			fputs("Not enough memory to parse event qualifier", trace->output);
2919 			goto out;
2920 		}
2921 
2922 		if (trace__validate_ev_qualifier(trace))
2923 			goto out;
2924 	}
2925 
2926 	err = 0;
2927 
2928 	if (lists[0]) {
2929 		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2930 					       "event selector. use 'perf list' to list available events",
2931 					       parse_events_option);
2932 		err = parse_events_option(&o, lists[0], 0);
2933 	}
2934 out:
2935 	if (sep)
2936 		*sep = ',';
2937 
2938 	return err;
2939 }
2940 
/*
 * Entry point for 'perf trace': parse options, set up the evlist and syscall
 * table, then either delegate to 'perf trace record', replay a perf.data
 * file (-i), or run a live strace-like session on a workload/target.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	/* Defaults; UINT_MAX/ULLONG_MAX mean "not set by the user" below */
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	/* Dump a backtrace instead of dying silently on crashes */
	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* Page fault tracing needs address and timestamp in each sample */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels: detect whether the user set these explicitly */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* Stack-depth options imply callchains; default to dwarf unwinding */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* Callchains need larger buffers: 4x the mlock limit for root */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	/* 'perf trace record ...' delegates everything to trace__record() */
	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cache the id of the "open" syscall for later comparisons */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target given: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}
3148