1 // SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2 /* Copyright (c) 2021 Google LLC.
3 *
4 * Based on klockstat from BCC by Jiri Olsa and others
5 * 2021-10-26 Barret Rhoden Created this.
6 */
7 /* Differences from BCC python tool:
8 * - can specify a lock by ksym name, using '-L'
9 * - tracks whichever task had the max time for acquire and hold, outputted
10 * when '-s' > 1 (otherwise it's cluttered).
11 * - does not reset stats each interval by default. Can request with -R.
12 */
13 #include <argp.h>
14 #include <errno.h>
15 #include <signal.h>
16 #include <stdio.h>
17 #ifndef _GNU_SOURCE
18 #define _GNU_SOURCE
19 #endif
20 #include <stdlib.h>
21 #include <string.h>
22 #include <time.h>
23 #include <unistd.h>
24 #include <sys/param.h>
25
26 #include <bpf/libbpf.h>
27 #include <bpf/bpf.h>
28 #include "klockstat.h"
29 #include "klockstat.skel.h"
30 #include "trace_helpers.h"
31
32 #define warn(...) fprintf(stderr, __VA_ARGS__)
33
34 enum {
35 SORT_ACQ_MAX,
36 SORT_ACQ_COUNT,
37 SORT_ACQ_TOTAL,
38 SORT_HLD_MAX,
39 SORT_HLD_COUNT,
40 SORT_HLD_TOTAL,
41 };
42
43 static struct prog_env {
44 pid_t pid;
45 pid_t tid;
46 char *caller;
47 char *lock_name;
48 unsigned int nr_locks;
49 unsigned int nr_stack_entries;
50 unsigned int sort_acq;
51 unsigned int sort_hld;
52 unsigned int duration;
53 unsigned int interval;
54 unsigned int iterations;
55 bool reset;
56 bool timestamp;
57 bool verbose;
58 } env = {
59 .nr_locks = 99999999,
60 .nr_stack_entries = 1,
61 .sort_acq = SORT_ACQ_MAX,
62 .sort_hld = SORT_HLD_MAX,
63 .interval = 99999999,
64 .iterations = 99999999,
65 };
66
67 const char *argp_program_version = "klockstat 0.1";
68 const char *argp_program_bug_address =
69 "https://github.com/iovisor/bcc/tree/master/libbpf-tools";
70 static const char args_doc[] = "FUNCTION";
71 static const char program_doc[] =
72 "Trace mutex lock acquisition and hold times, in nsec\n"
73 "\n"
74 "Usage: klockstat [-hRTv] [-p PID] [-t TID] [-c FUNC] [-L LOCK] [-n NR_LOCKS]\n"
75 " [-s NR_STACKS] [-S SORT] [-d DURATION] [-i INTERVAL]\n"
76 "\v"
77 "Examples:\n"
78 " klockstat # trace system wide until ctrl-c\n"
79 " klockstat -d 5 # trace for 5 seconds\n"
80 " klockstat -i 5 # print stats every 5 seconds\n"
81 " klockstat -p 181 # trace process 181 only\n"
82 " klockstat -t 181 # trace thread 181 only\n"
83 " klockstat -c pipe_ # print only for lock callers with 'pipe_'\n"
84 " # prefix\n"
85 " klockstat -L cgroup_mutex # trace the cgroup_mutex lock only\n"
86 " klockstat -S acq_count # sort lock acquired results by acquire count\n"
87 " klockstat -S hld_total # sort lock held results by total held time\n"
88 " klockstat -S acq_count,hld_total # combination of above\n"
89 " klockstat -n 3 # display top 3 locks\n"
90 " klockstat -s 6 # display 6 stack entries per lock\n"
91 ;
92
93 static const struct argp_option opts[] = {
94 { "pid", 'p', "PID", 0, "Filter by process ID" },
95 { "tid", 't', "TID", 0, "Filter by thread ID" },
96 { 0, 0, 0, 0, "" },
97 { "caller", 'c', "FUNC", 0, "Filter by caller string prefix" },
98 { "lock", 'L', "LOCK", 0, "Filter by specific ksym lock name" },
99 { 0, 0, 0, 0, "" },
100 { "locks", 'n', "NR_LOCKS", 0, "Number of locks to print" },
101 { "stacks", 's', "NR_STACKS", 0, "Number of stack entries to print per lock" },
102 { "sort", 'S', "SORT", 0, "Sort by field:\n acq_[max|total|count]\n hld_[max|total|count]" },
103 { 0, 0, 0, 0, "" },
104 { "duration", 'd', "SECONDS", 0, "Duration to trace" },
105 { "interval", 'i', "SECONDS", 0, "Print interval" },
106 { "reset", 'R', NULL, 0, "Reset stats each interval" },
107 { "timestamp", 'T', NULL, 0, "Print timestamp" },
108 { "verbose", 'v', NULL, 0, "Verbose debug output" },
109
110 { NULL, 'h', NULL, OPTION_HIDDEN, "Show the full help" },
111 {},
112 };
113
parse_one_sort(struct prog_env * env,const char * sort)114 static bool parse_one_sort(struct prog_env *env, const char *sort)
115 {
116 const char *field = sort + 4;
117
118 if (!strncmp(sort, "acq_", 4)) {
119 if (!strcmp(field, "max")) {
120 env->sort_acq = SORT_ACQ_MAX;
121 return true;
122 } else if (!strcmp(field, "total")) {
123 env->sort_acq = SORT_ACQ_TOTAL;
124 return true;
125 } else if (!strcmp(field, "count")) {
126 env->sort_acq = SORT_ACQ_COUNT;
127 return true;
128 }
129 } else if (!strncmp(sort, "hld_", 4)) {
130 if (!strcmp(field, "max")) {
131 env->sort_hld = SORT_HLD_MAX;
132 return true;
133 } else if (!strcmp(field, "total")) {
134 env->sort_hld = SORT_HLD_TOTAL;
135 return true;
136 } else if (!strcmp(field, "count")) {
137 env->sort_hld = SORT_HLD_COUNT;
138 return true;
139 }
140 }
141
142 return false;
143 }
144
parse_sorts(struct prog_env * env,char * arg)145 static bool parse_sorts(struct prog_env *env, char *arg)
146 {
147 char *comma = strchr(arg, ',');
148
149 if (comma) {
150 *comma = '\0';
151 comma++;
152 if (!parse_one_sort(env, comma))
153 return false;
154 }
155 return parse_one_sort(env, arg);
156 }
157
parse_arg(int key,char * arg,struct argp_state * state)158 static error_t parse_arg(int key, char *arg, struct argp_state *state)
159 {
160 struct prog_env *env = state->input;
161 long duration, interval;
162
163 switch (key) {
164 case 'p':
165 errno = 0;
166 env->pid = strtol(arg, NULL, 10);
167 if (errno || env->pid <= 0) {
168 warn("Invalid PID: %s\n", arg);
169 argp_usage(state);
170 }
171 break;
172 case 't':
173 errno = 0;
174 env->tid = strtol(arg, NULL, 10);
175 if (errno || env->tid <= 0) {
176 warn("Invalid TID: %s\n", arg);
177 argp_usage(state);
178 }
179 break;
180 case 'c':
181 env->caller = arg;
182 break;
183 case 'L':
184 env->lock_name = arg;
185 break;
186 case 'n':
187 errno = 0;
188 env->nr_locks = strtol(arg, NULL, 10);
189 if (errno || env->nr_locks <= 0) {
190 warn("Invalid NR_LOCKS: %s\n", arg);
191 argp_usage(state);
192 }
193 break;
194 case 's':
195 errno = 0;
196 env->nr_stack_entries = strtol(arg, NULL, 10);
197 if (errno || env->nr_stack_entries <= 0) {
198 warn("Invalid NR_STACKS: %s\n", arg);
199 argp_usage(state);
200 }
201 break;
202 case 'S':
203 if (!parse_sorts(env, arg)) {
204 warn("Bad sort string: %s\n", arg);
205 argp_usage(state);
206 }
207 break;
208 case 'd':
209 errno = 0;
210 duration = strtol(arg, NULL, 10);
211 if (errno || duration <= 0) {
212 warn("Invalid duration: %s\n", arg);
213 argp_usage(state);
214 }
215 env->duration = duration;
216 break;
217 case 'i':
218 errno = 0;
219 interval = strtol(arg, NULL, 10);
220 if (errno || interval <= 0) {
221 warn("Invalid interval: %s\n", arg);
222 argp_usage(state);
223 }
224 env->interval = interval;
225 break;
226 case 'R':
227 env->reset = true;
228 break;
229 case 'T':
230 env->timestamp = true;
231 break;
232 case 'h':
233 argp_state_help(state, stderr, ARGP_HELP_STD_HELP);
234 break;
235 case 'v':
236 env->verbose = true;
237 break;
238 case ARGP_KEY_END:
239 if (env->duration) {
240 if (env->interval > env->duration)
241 env->interval = env->duration;
242 env->iterations = env->duration / env->interval;
243 }
244 break;
245 default:
246 return ARGP_ERR_UNKNOWN;
247 }
248 return 0;
249 }
250
251 struct stack_stat {
252 uint32_t stack_id;
253 struct lock_stat ls;
254 uint64_t bt[PERF_MAX_STACK_DEPTH];
255 };
256
caller_is_traced(struct ksyms * ksyms,uint64_t caller_pc)257 static bool caller_is_traced(struct ksyms *ksyms, uint64_t caller_pc)
258 {
259 const struct ksym *ksym;
260
261 if (!env.caller)
262 return true;
263 ksym = ksyms__map_addr(ksyms, caller_pc);
264 if (!ksym)
265 return true;
266 return strncmp(env.caller, ksym->name, strlen(env.caller)) == 0;
267 }
268
larger_first(uint64_t x,uint64_t y)269 static int larger_first(uint64_t x, uint64_t y)
270 {
271 if (x > y)
272 return -1;
273 if (x == y)
274 return 0;
275 return 1;
276 }
277
sort_by_acq(const void * x,const void * y)278 static int sort_by_acq(const void *x, const void *y)
279 {
280 struct stack_stat *ss_x = *(struct stack_stat**)x;
281 struct stack_stat *ss_y = *(struct stack_stat**)y;
282
283 switch (env.sort_acq) {
284 case SORT_ACQ_MAX:
285 return larger_first(ss_x->ls.acq_max_time,
286 ss_y->ls.acq_max_time);
287 case SORT_ACQ_COUNT:
288 return larger_first(ss_x->ls.acq_count,
289 ss_y->ls.acq_count);
290 case SORT_ACQ_TOTAL:
291 return larger_first(ss_x->ls.acq_total_time,
292 ss_y->ls.acq_total_time);
293 }
294
295 warn("bad sort_acq %d\n", env.sort_acq);
296 return -1;
297 }
298
sort_by_hld(const void * x,const void * y)299 static int sort_by_hld(const void *x, const void *y)
300 {
301 struct stack_stat *ss_x = *(struct stack_stat**)x;
302 struct stack_stat *ss_y = *(struct stack_stat**)y;
303
304 switch (env.sort_hld) {
305 case SORT_HLD_MAX:
306 return larger_first(ss_x->ls.hld_max_time,
307 ss_y->ls.hld_max_time);
308 case SORT_HLD_COUNT:
309 return larger_first(ss_x->ls.hld_count,
310 ss_y->ls.hld_count);
311 case SORT_HLD_TOTAL:
312 return larger_first(ss_x->ls.hld_total_time,
313 ss_y->ls.hld_total_time);
314 }
315
316 warn("bad sort_hld %d\n", env.sort_hld);
317 return -1;
318 }
319
symname(struct ksyms * ksyms,uint64_t pc,char * buf,size_t n)320 static char *symname(struct ksyms *ksyms, uint64_t pc, char *buf, size_t n)
321 {
322 const struct ksym *ksym = ksyms__map_addr(ksyms, pc);
323
324 if (!ksym)
325 return "Unknown";
326 snprintf(buf, n, "%s+0x%lx", ksym->name, pc - ksym->addr);
327 return buf;
328 }
329
print_acq_header(void)330 static void print_acq_header(void)
331 {
332 printf("\n Caller Avg Wait Count Max Wait Total Wait\n");
333 }
334
print_acq_stat(struct ksyms * ksyms,struct stack_stat * ss,int nr_stack_entries)335 static void print_acq_stat(struct ksyms *ksyms, struct stack_stat *ss,
336 int nr_stack_entries)
337 {
338 char buf[40];
339 int i;
340
341 printf("%37s %9llu %8llu %10llu %12llu\n",
342 symname(ksyms, ss->bt[0], buf, sizeof(buf)),
343 ss->ls.acq_total_time / ss->ls.acq_count,
344 ss->ls.acq_count,
345 ss->ls.acq_max_time,
346 ss->ls.acq_total_time);
347 for (i = 1; i < nr_stack_entries; i++) {
348 if (!ss->bt[i])
349 break;
350 printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf)));
351 }
352 if (nr_stack_entries > 1)
353 printf(" Max PID %llu, COMM %s\n",
354 ss->ls.acq_max_id >> 32,
355 ss->ls.acq_max_comm);
356 }
357
print_hld_header(void)358 static void print_hld_header(void)
359 {
360 printf("\n Caller Avg Hold Count Max Hold Total Hold\n");
361 }
362
print_hld_stat(struct ksyms * ksyms,struct stack_stat * ss,int nr_stack_entries)363 static void print_hld_stat(struct ksyms *ksyms, struct stack_stat *ss,
364 int nr_stack_entries)
365 {
366 char buf[40];
367 int i;
368
369 printf("%37s %9llu %8llu %10llu %12llu\n",
370 symname(ksyms, ss->bt[0], buf, sizeof(buf)),
371 ss->ls.hld_total_time / ss->ls.hld_count,
372 ss->ls.hld_count,
373 ss->ls.hld_max_time,
374 ss->ls.hld_total_time);
375 for (i = 1; i < nr_stack_entries; i++) {
376 if (!ss->bt[i])
377 break;
378 printf("%37s\n", symname(ksyms, ss->bt[i], buf, sizeof(buf)));
379 }
380 if (nr_stack_entries > 1)
381 printf(" Max PID %llu, COMM %s\n",
382 ss->ls.hld_max_id >> 32,
383 ss->ls.hld_max_comm);
384 }
385
print_stats(struct ksyms * ksyms,int stack_map,int stat_map)386 static int print_stats(struct ksyms *ksyms, int stack_map, int stat_map)
387 {
388 struct stack_stat **stats, *ss;
389 size_t stat_idx = 0;
390 size_t stats_sz = 1;
391 uint32_t lookup_key = 0;
392 uint32_t stack_id;
393 int ret, i;
394
395 stats = calloc(stats_sz, sizeof(void *));
396 if (!stats) {
397 warn("Out of memory\n");
398 return -1;
399 }
400
401 while (bpf_map_get_next_key(stat_map, &lookup_key, &stack_id) == 0) {
402 if (stat_idx == stats_sz) {
403 stats_sz *= 2;
404 stats = reallocarray(stats, stats_sz, sizeof(void *));
405 if (!stats) {
406 warn("Out of memory\n");
407 return -1;
408 }
409 }
410 ss = malloc(sizeof(struct stack_stat));
411 if (!ss) {
412 warn("Out of memory\n");
413 return -1;
414 }
415 ss->stack_id = stack_id;
416 if (env.reset) {
417 ret = bpf_map_lookup_and_delete_elem(stat_map,
418 &stack_id,
419 &ss->ls);
420 lookup_key = 0;
421 } else {
422 ret = bpf_map_lookup_elem(stat_map, &stack_id, &ss->ls);
423 lookup_key = stack_id;
424 }
425 if (ret) {
426 free(ss);
427 continue;
428 }
429 if (bpf_map_lookup_elem(stack_map, &stack_id, &ss->bt)) {
430 /* Can still report the results without a backtrace. */
431 warn("failed to lookup stack_id %u\n", stack_id);
432 }
433 if (!caller_is_traced(ksyms, ss->bt[0])) {
434 free(ss);
435 continue;
436 }
437 stats[stat_idx++] = ss;
438 }
439
440 qsort(stats, stat_idx, sizeof(void*), sort_by_acq);
441 for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) {
442 if (i == 0 || env.nr_stack_entries > 1)
443 print_acq_header();
444 print_acq_stat(ksyms, stats[i],
445 MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH));
446 }
447
448 qsort(stats, stat_idx, sizeof(void*), sort_by_hld);
449 for (i = 0; i < MIN(env.nr_locks, stat_idx); i++) {
450 if (i == 0 || env.nr_stack_entries > 1)
451 print_hld_header();
452 print_hld_stat(ksyms, stats[i],
453 MIN(env.nr_stack_entries, PERF_MAX_STACK_DEPTH));
454 }
455
456 for (i = 0; i < stat_idx; i++)
457 free(stats[i]);
458 free(stats);
459
460 return 0;
461 }
462
get_lock_addr(struct ksyms * ksyms,const char * lock_name)463 static void *get_lock_addr(struct ksyms *ksyms, const char *lock_name)
464 {
465 const struct ksym *ksym = ksyms__get_symbol(ksyms, lock_name);
466
467 return ksym ? (void*)ksym->addr : NULL;
468 }
469
470 static volatile bool exiting;
471
sig_hand(int signr)472 static void sig_hand(int signr)
473 {
474 exiting = true;
475 }
476
477 static struct sigaction sigact = {.sa_handler = sig_hand};
478
libbpf_print_fn(enum libbpf_print_level level,const char * format,va_list args)479 static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args)
480 {
481 if (level == LIBBPF_DEBUG && !env.verbose)
482 return 0;
483 return vfprintf(stderr, format, args);
484 }
485
main(int argc,char ** argv)486 int main(int argc, char **argv)
487 {
488 static const struct argp argp = {
489 .options = opts,
490 .parser = parse_arg,
491 .args_doc = args_doc,
492 .doc = program_doc,
493 };
494 struct klockstat_bpf *obj = NULL;
495 struct ksyms *ksyms = NULL;
496 int i, err;
497 struct tm *tm;
498 char ts[32];
499 time_t t;
500 void *lock_addr = NULL;
501
502 err = argp_parse(&argp, argc, argv, 0, NULL, &env);
503 if (err)
504 return err;
505
506 sigaction(SIGINT, &sigact, 0);
507
508 libbpf_set_strict_mode(LIBBPF_STRICT_ALL);
509 libbpf_set_print(libbpf_print_fn);
510
511 ksyms = ksyms__load();
512 if (!ksyms) {
513 warn("failed to load kallsyms\n");
514 err = 1;
515 goto cleanup;
516 }
517 if (env.lock_name) {
518 lock_addr = get_lock_addr(ksyms, env.lock_name);
519 if (!lock_addr) {
520 warn("failed to find lock %s\n", env.lock_name);
521 err = 1;
522 goto cleanup;
523 }
524 }
525
526 obj = klockstat_bpf__open();
527 if (!obj) {
528 warn("failed to open BPF object\n");
529 err = 1;
530 goto cleanup;
531 }
532
533 obj->rodata->targ_tgid = env.pid;
534 obj->rodata->targ_pid = env.tid;
535 obj->rodata->targ_lock = lock_addr;
536
537 if (fentry_can_attach("mutex_lock_nested", NULL)) {
538 bpf_program__set_attach_target(obj->progs.mutex_lock, 0,
539 "mutex_lock_nested");
540 bpf_program__set_attach_target(obj->progs.mutex_lock_exit, 0,
541 "mutex_lock_nested");
542 bpf_program__set_attach_target(obj->progs.mutex_lock_interruptible, 0,
543 "mutex_lock_interruptible_nested");
544 bpf_program__set_attach_target(obj->progs.mutex_lock_interruptible_exit, 0,
545 "mutex_lock_interruptible_nested");
546 bpf_program__set_attach_target(obj->progs.mutex_lock_killable, 0,
547 "mutex_lock_killable_nested");
548 bpf_program__set_attach_target(obj->progs.mutex_lock_killable_exit, 0,
549 "mutex_lock_killable_nested");
550 }
551
552 err = klockstat_bpf__load(obj);
553 if (err) {
554 warn("failed to load BPF object\n");
555 return 1;
556 }
557 err = klockstat_bpf__attach(obj);
558 if (err) {
559 warn("failed to attach BPF object\n");
560 goto cleanup;
561 }
562
563 printf("Tracing mutex lock events... Hit Ctrl-C to end\n");
564
565 for (i = 0; i < env.iterations && !exiting; i++) {
566 sleep(env.interval);
567
568 printf("\n");
569 if (env.timestamp) {
570 time(&t);
571 tm = localtime(&t);
572 strftime(ts, sizeof(ts), "%H:%M:%S", tm);
573 printf("%-8s\n", ts);
574 }
575
576 if (print_stats(ksyms, bpf_map__fd(obj->maps.stack_map),
577 bpf_map__fd(obj->maps.stat_map))) {
578 warn("print_stats error, aborting.\n");
579 break;
580 }
581 }
582
583 printf("Exiting trace of mutex locks\n");
584
585 cleanup:
586 klockstat_bpf__destroy(obj);
587 ksyms__free(ksyms);
588
589 return err != 0;
590 }
591