1#!/usr/bin/python 2# 3# offcputime Summarize off-CPU time by stack trace 4# For Linux, uses BCC, eBPF. 5# 6# USAGE: offcputime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration] 7# 8# Copyright 2016 Netflix, Inc. 9# Licensed under the Apache License, Version 2.0 (the "License") 10# 11# 13-Jan-2016 Brendan Gregg Created this. 12 13from __future__ import print_function 14from bcc import BPF 15from sys import stderr 16from time import strftime 17import argparse 18import errno 19import signal 20 21# arg validation 22def positive_int(val): 23 try: 24 ival = int(val) 25 except ValueError: 26 raise argparse.ArgumentTypeError("must be an integer") 27 28 if ival < 0: 29 raise argparse.ArgumentTypeError("must be positive") 30 return ival 31 32def positive_nonzero_int(val): 33 ival = positive_int(val) 34 if ival == 0: 35 raise argparse.ArgumentTypeError("must be nonzero") 36 return ival 37 38def stack_id_err(stack_id): 39 # -EFAULT in get_stackid normally means the stack-trace is not available, 40 # Such as getting kernel stack trace in userspace code 41 return (stack_id < 0) and (stack_id != -errno.EFAULT) 42 43# arguments 44examples = """examples: 45 ./offcputime # trace off-CPU stack time until Ctrl-C 46 ./offcputime 5 # trace for 5 seconds only 47 ./offcputime -f 5 # 5 seconds, and output in folded format 48 ./offcputime -m 1000 # trace only events that last more than 1000 usec 49 ./offcputime -M 10000 # trace only events that last less than 10000 usec 50 ./offcputime -p 185 # only trace threads for PID 185 51 ./offcputime -t 188 # only trace thread 188 52 ./offcputime -u # only trace user threads (no kernel) 53 ./offcputime -k # only trace kernel threads (no user) 54 ./offcputime -U # only show user space stacks (no kernel) 55 ./offcputime -K # only show kernel space stacks (no user) 56""" 57parser = argparse.ArgumentParser( 58 description="Summarize off-CPU time by stack trace", 59 formatter_class=argparse.RawDescriptionHelpFormatter, 60 epilog=examples) 61thread_group = parser.add_mutually_exclusive_group() 62# Note: this script provides --pid and --tid flags but their arguments are 63# referred to internally using kernel nomenclature: TGID and PID. 64thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid", 65 help="trace this PID only", type=positive_int) 66thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid", 67 help="trace this TID only", type=positive_int) 68thread_group.add_argument("-u", "--user-threads-only", action="store_true", 69 help="user threads only (no kernel threads)") 70thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", 71 help="kernel threads only (no user threads)") 72stack_group = parser.add_mutually_exclusive_group() 73stack_group.add_argument("-U", "--user-stacks-only", action="store_true", 74 help="show stacks from user space only (no kernel space stacks)") 75stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", 76 help="show stacks from kernel space only (no user space stacks)") 77parser.add_argument("-d", "--delimited", action="store_true", 78 help="insert delimiter between kernel/user stacks") 79parser.add_argument("-f", "--folded", action="store_true", 80 help="output folded format") 81parser.add_argument("--stack-storage-size", default=1024, 82 type=positive_nonzero_int, 83 help="the number of unique stack traces that can be stored and " 84 "displayed (default 1024)") 85parser.add_argument("duration", nargs="?", default=99999999, 86 type=positive_nonzero_int, 87 help="duration of trace, in seconds") 88parser.add_argument("-m", "--min-block-time", default=1, 89 type=positive_nonzero_int, 90 help="the amount of time in microseconds over which we " + 91 "store traces (default 1)") 92parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1, 93 type=positive_nonzero_int, 94 help="the amount of time in microseconds under which we " + 95 "store traces (default U64_MAX)") 96parser.add_argument("--state", type=positive_int, 97 help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" + 98 ") see include/linux/sched.h") 99parser.add_argument("--ebpf", action="store_true", 100 help=argparse.SUPPRESS) 101args = parser.parse_args() 102folded = args.folded 103duration = int(args.duration) 104debug = 0 105 106# signal handler 107def signal_ignore(signal, frame): 108 print() 109 110# define BPF program 111bpf_text = """ 112#include <uapi/linux/ptrace.h> 113#include <linux/sched.h> 114 115#define MINBLOCK_US MINBLOCK_US_VALUEULL 116#define MAXBLOCK_US MAXBLOCK_US_VALUEULL 117 118struct key_t { 119 u64 pid; 120 u64 tgid; 121 int user_stack_id; 122 int kernel_stack_id; 123 char name[TASK_COMM_LEN]; 124}; 125BPF_HASH(counts, struct key_t); 126BPF_HASH(start, u32); 127BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); 128 129struct warn_event_t { 130 u32 pid; 131 u32 tgid; 132 u32 t_start; 133 u32 t_end; 134}; 135BPF_PERF_OUTPUT(warn_events); 136 137int oncpu(struct pt_regs *ctx, struct task_struct *prev) { 138 u32 pid = prev->pid; 139 u32 tgid = prev->tgid; 140 u64 ts, *tsp; 141 142 // record previous thread sleep time 143 if ((THREAD_FILTER) && (STATE_FILTER)) { 144 ts = bpf_ktime_get_ns(); 145 start.update(&pid, &ts); 146 } 147 148 // get the current thread's start time 149 pid = bpf_get_current_pid_tgid(); 150 tgid = bpf_get_current_pid_tgid() >> 32; 151 tsp = start.lookup(&pid); 152 if (tsp == 0) { 153 return 0; // missed start or filtered 154 } 155 156 // calculate current thread's delta time 157 u64 t_start = *tsp; 158 u64 t_end = bpf_ktime_get_ns(); 159 start.delete(&pid); 160 if (t_start > t_end) { 161 struct warn_event_t event = { 162 .pid = pid, 163 .tgid = tgid, 164 .t_start = t_start, 165 .t_end = t_end, 166 }; 167 warn_events.perf_submit(ctx, &event, sizeof(event)); 168 return 0; 169 } 170 u64 delta = t_end - t_start; 171 delta = delta / 1000; 172 if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) { 173 return 0; 174 } 175 176 // create map key 177 struct key_t key = {}; 178 179 key.pid = pid; 180 key.tgid = tgid; 181 key.user_stack_id = USER_STACK_GET; 182 key.kernel_stack_id = KERNEL_STACK_GET; 183 bpf_get_current_comm(&key.name, sizeof(key.name)); 184 185 counts.increment(key, delta); 186 return 0; 187} 188""" 189 190# set thread filter 191thread_context = "" 192if args.tgid is not None: 193 thread_context = "PID %d" % args.tgid 194 thread_filter = 'tgid == %d' % args.tgid 195elif args.pid is not None: 196 thread_context = "TID %d" % args.pid 197 thread_filter = 'pid == %d' % args.pid 198elif args.user_threads_only: 199 thread_context = "user threads" 200 thread_filter = '!(prev->flags & PF_KTHREAD)' 201elif args.kernel_threads_only: 202 thread_context = "kernel threads" 203 thread_filter = 'prev->flags & PF_KTHREAD' 204else: 205 thread_context = "all threads" 206 thread_filter = '1' 207if args.state == 0: 208 state_filter = 'prev->STATE_FIELD == 0' 209elif args.state: 210 # these states are sometimes bitmask checked 211 state_filter = 'prev->STATE_FIELD & %d' % args.state 212else: 213 state_filter = '1' 214bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) 215bpf_text = bpf_text.replace('STATE_FILTER', state_filter) 216if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1: 217 bpf_text = bpf_text.replace('STATE_FIELD', '__state') 218else: 219 bpf_text = bpf_text.replace('STATE_FIELD', 'state') 220 221# set stack storage size 222bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) 223bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time)) 224bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time)) 225 226# handle stack args 227kernel_stack_get = "stack_traces.get_stackid(ctx, 0)" 228user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)" 229stack_context = "" 230if args.user_stacks_only: 231 stack_context = "user" 232 kernel_stack_get = "-1" 233elif args.kernel_stacks_only: 234 stack_context = "kernel" 235 user_stack_get = "-1" 236else: 237 stack_context = "user + kernel" 238bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get) 239bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get) 240 241need_delimiter = args.delimited and not (args.kernel_stacks_only or 242 args.user_stacks_only) 243 244# check for an edge case; the code below will handle this case correctly 245# but ultimately nothing will be displayed 246if args.kernel_threads_only and args.user_stacks_only: 247 print("ERROR: Displaying user stacks for kernel threads " + 248 "doesn't make sense.", file=stderr) 249 exit(1) 250 251if debug or args.ebpf: 252 print(bpf_text) 253 if args.ebpf: 254 exit() 255 256# initialize BPF 257b = BPF(text=bpf_text) 258b.attach_kprobe(event_re="^finish_task_switch$|^finish_task_switch\.isra\.\d$", 259 fn_name="oncpu") 260matched = b.num_open_kprobes() 261if matched == 0: 262 print("error: 0 functions traced. Exiting.", file=stderr) 263 exit(1) 264 265# header 266if not folded: 267 print("Tracing off-CPU time (us) of %s by %s stack" % 268 (thread_context, stack_context), end="") 269 if duration < 99999999: 270 print(" for %d secs." % duration) 271 else: 272 print("... Hit Ctrl-C to end.") 273 274 275def print_warn_event(cpu, data, size): 276 event = b["warn_events"].event(data) 277 # See https://github.com/iovisor/bcc/pull/3227 for those wondering how can this happen. 278 print("WARN: Skipped an event with negative duration: pid:%d, tgid:%d, off-cpu:%d, on-cpu:%d" 279 % (event.pid, event.tgid, event.t_start, event.t_end), 280 file=stderr) 281 282b["warn_events"].open_perf_buffer(print_warn_event) 283try: 284 duration_ms = duration * 1000 285 start_time_ms = int(BPF.monotonic_time() / 1000000) 286 while True: 287 elapsed_ms = int(BPF.monotonic_time() / 1000000) - start_time_ms 288 if elapsed_ms >= duration_ms: 289 break 290 b.perf_buffer_poll(timeout=duration_ms - elapsed_ms) 291except KeyboardInterrupt: 292 # as cleanup can take many seconds, trap Ctrl-C: 293 signal.signal(signal.SIGINT, signal_ignore) 294 295if not folded: 296 print() 297 298missing_stacks = 0 299has_enomem = False 300counts = b.get_table("counts") 301stack_traces = b.get_table("stack_traces") 302for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): 303 # handle get_stackid errors 304 if not args.user_stacks_only and stack_id_err(k.kernel_stack_id): 305 missing_stacks += 1 306 has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM 307 if not args.kernel_stacks_only and stack_id_err(k.user_stack_id): 308 missing_stacks += 1 309 has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM 310 311 # user stacks will be symbolized by tgid, not pid, to avoid the overhead 312 # of one symbol resolver per thread 313 user_stack = [] if k.user_stack_id < 0 else \ 314 stack_traces.walk(k.user_stack_id) 315 kernel_stack = [] if k.kernel_stack_id < 0 else \ 316 stack_traces.walk(k.kernel_stack_id) 317 318 if folded: 319 # print folded stack output 320 user_stack = list(user_stack) 321 kernel_stack = list(kernel_stack) 322 line = [k.name.decode('utf-8', 'replace')] 323 # if we failed to get the stack is, such as due to no space (-ENOMEM) or 324 # hash collision (-EEXIST), we still print a placeholder for consistency 325 if not args.kernel_stacks_only: 326 if stack_id_err(k.user_stack_id): 327 line.append("[Missed User Stack]") 328 else: 329 line.extend([b.sym(addr, k.tgid).decode('utf-8', 'replace') 330 for addr in reversed(user_stack)]) 331 if not args.user_stacks_only: 332 line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else []) 333 if stack_id_err(k.kernel_stack_id): 334 line.append("[Missed Kernel Stack]") 335 else: 336 line.extend([b.ksym(addr).decode('utf-8', 'replace') 337 for addr in reversed(kernel_stack)]) 338 print("%s %d" % (";".join(line), v.value)) 339 else: 340 # print default multi-line stack output 341 if not args.user_stacks_only: 342 if stack_id_err(k.kernel_stack_id): 343 print(" [Missed Kernel Stack]") 344 else: 345 for addr in kernel_stack: 346 print(" %s" % b.ksym(addr).decode('utf-8', 'replace')) 347 if not args.kernel_stacks_only: 348 if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0: 349 print(" --") 350 if stack_id_err(k.user_stack_id): 351 print(" [Missed User Stack]") 352 else: 353 for addr in user_stack: 354 print(" %s" % b.sym(addr, k.tgid).decode('utf-8', 'replace')) 355 print(" %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid)) 356 print(" %d\n" % v.value) 357 358if missing_stacks > 0: 359 enomem_str = "" if not has_enomem else \ 360 " Consider increasing --stack-storage-size." 361 print("WARNING: %d stack traces lost and could not be displayed.%s" % 362 (missing_stacks, enomem_str), 363 file=stderr) 364