1#!/usr/bin/python 2# 3# offwaketime Summarize blocked time by kernel off-CPU stack + waker stack 4# For Linux, uses BCC, eBPF. 5# 6# USAGE: offwaketime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration] 7# 8# Copyright 2016 Netflix, Inc. 9# Licensed under the Apache License, Version 2.0 (the "License") 10# 11# 20-Jan-2016 Brendan Gregg Created this. 12 13from __future__ import print_function 14from bcc import BPF 15from time import sleep 16import argparse 17import signal 18import errno 19from sys import stderr 20 21# arg validation 22def positive_int(val): 23 dest = [] 24 # Filter up to 5 pids, arbitrary 25 args_list = val.split(",", 5) 26 pids_to_add = min(len(args_list), 5) 27 for i in range(pids_to_add): 28 dest.append(_positive_int(args_list[i])) 29 30 return dest 31 32def _positive_int(val): 33 try: 34 ival = int(val) 35 except ValueError: 36 raise argparse.ArgumentTypeError("must be an integer") 37 38 if ival < 0: 39 raise argparse.ArgumentTypeError("must be positive") 40 return ival 41 42def positive_nonzero_int(val): 43 ival = _positive_int(val) 44 if ival == 0: 45 raise argparse.ArgumentTypeError("must be nonzero") 46 return ival 47 48def build_filter(filter_name, values): 49 filter_string = "((%s == %d)" % (filter_name, values[0]) 50 51 for val in values[1:]: 52 filter_string += " || (%s == %d )" % (filter_name , val) 53 54 filter_string += ")" 55 56 return filter_string 57 58def stack_id_err(stack_id): 59 # -EFAULT in get_stackid normally means the stack-trace is not available, 60 # Such as getting kernel stack trace in userspace code 61 return (stack_id < 0) and (stack_id != -errno.EFAULT) 62 63# arguments 64examples = """examples: 65 ./offwaketime # trace off-CPU + waker stack time until Ctrl-C 66 ./offwaketime 5 # trace for 5 seconds only 67 ./offwaketime -f 5 # 5 seconds, and output in folded format 68 ./offwaketime -m 1000 # trace only events that last more than 1000 usec 69 ./offwaketime -M 9000 # trace only events that last less than 9000 usec 70 ./offwaketime -p 185 # only trace threads for PID 185 71 ./offwaketime -t 188 # only trace thread 188 72 ./offwaketime -u # only trace user threads (no kernel) 73 ./offwaketime -k # only trace kernel threads (no user) 74 ./offwaketime -U # only show user space stacks (no kernel) 75 ./offwaketime -K # only show kernel space stacks (no user) 76""" 77parser = argparse.ArgumentParser( 78 description="Summarize blocked time by kernel stack trace + waker stack", 79 formatter_class=argparse.RawDescriptionHelpFormatter, 80 epilog=examples) 81thread_group = parser.add_mutually_exclusive_group() 82# Note: this script provides --pid and --tid flags but their arguments are 83# referred to internally using kernel nomenclature: TGID and PID. 84thread_group.add_argument("-p", "--pid", metavar="PIDS", dest="tgid", 85 type=positive_int, 86 help="trace these PIDS only. Can be a comma separated list of PIDS.") 87thread_group.add_argument("-t", "--tid", metavar="TIDS", dest="pid", 88 type=positive_int, 89 help="trace these TIDS only. Can be a comma separated list of TIDS.") 90thread_group.add_argument("-u", "--user-threads-only", action="store_true", 91 help="user threads only (no kernel threads)") 92thread_group.add_argument("-k", "--kernel-threads-only", action="store_true", 93 help="kernel threads only (no user threads)") 94stack_group = parser.add_mutually_exclusive_group() 95stack_group.add_argument("-U", "--user-stacks-only", action="store_true", 96 help="show stacks from user space only (no kernel space stacks)") 97stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true", 98 help="show stacks from kernel space only (no user space stacks)") 99parser.add_argument("-d", "--delimited", action="store_true", 100 help="insert delimiter between kernel/user stacks") 101parser.add_argument("-f", "--folded", action="store_true", 102 help="output folded format") 103parser.add_argument("--stack-storage-size", default=1024, 104 type=positive_nonzero_int, 105 help="the number of unique stack traces that can be stored and " 106 "displayed (default 1024)") 107parser.add_argument("duration", nargs="?", default=99999999, 108 type=positive_nonzero_int, 109 help="duration of trace, in seconds") 110parser.add_argument("-m", "--min-block-time", default=1, 111 type=positive_nonzero_int, 112 help="the amount of time in microseconds over which we " + 113 "store traces (default 1)") 114parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1, 115 type=positive_nonzero_int, 116 help="the amount of time in microseconds under which we " + 117 "store traces (default U64_MAX)") 118parser.add_argument("--state", type=_positive_int, 119 help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" + 120 ") see include/linux/sched.h") 121parser.add_argument("--ebpf", action="store_true", 122 help=argparse.SUPPRESS) 123args = parser.parse_args() 124folded = args.folded 125duration = int(args.duration) 126 127# signal handler 128def signal_ignore(signal, frame): 129 print() 130 131# define BPF program 132bpf_text = """ 133#include <uapi/linux/ptrace.h> 134#include <linux/sched.h> 135 136#define MINBLOCK_US MINBLOCK_US_VALUEULL 137#define MAXBLOCK_US MAXBLOCK_US_VALUEULL 138 139struct key_t { 140 char waker[TASK_COMM_LEN]; 141 char target[TASK_COMM_LEN]; 142 s64 w_k_stack_id; 143 s64 w_u_stack_id; 144 s64 t_k_stack_id; 145 s64 t_u_stack_id; 146 u64 t_pid; 147 u64 t_tgid; 148 u32 w_pid; 149 u32 w_tgid; 150}; 151BPF_HASH(counts, struct key_t); 152 153// Key of this hash is PID of waiting Process, 154// value is timestamp when it went into waiting 155BPF_HASH(start, u32); 156 157struct wokeby_t { 158 char name[TASK_COMM_LEN]; 159 int k_stack_id; 160 int u_stack_id; 161 int w_pid; 162 int w_tgid; 163}; 164// Key of the hash is PID of the Process to be waken, value is information 165// of the Process who wakes it 166BPF_HASH(wokeby, u32, struct wokeby_t); 167 168BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); 169 170int waker(struct pt_regs *ctx, struct task_struct *p) { 171 // PID and TGID of the target Process to be waken 172 u32 pid = p->pid; 173 u32 tgid = p->tgid; 174 175 if (!((THREAD_FILTER) && (STATE_FILTER))) { 176 return 0; 177 } 178 179 // Construct information about current (the waker) Process 180 struct wokeby_t woke = {}; 181 bpf_get_current_comm(&woke.name, sizeof(woke.name)); 182 woke.k_stack_id = KERNEL_STACK_GET; 183 woke.u_stack_id = USER_STACK_GET; 184 woke.w_pid = bpf_get_current_pid_tgid(); 185 woke.w_tgid = bpf_get_current_pid_tgid() >> 32; 186 187 wokeby.update(&pid, &woke); 188 return 0; 189} 190 191int oncpu(struct pt_regs *ctx, struct task_struct *p) { 192 // PID and TGID of the previous Process (Process going into waiting) 193 u32 pid = p->pid; 194 u32 tgid = p->tgid; 195 u64 *tsp; 196 u64 ts = bpf_ktime_get_ns(); 197 198 // Record timestamp for the previous Process (Process going into waiting) 199 if ((THREAD_FILTER) && (STATE_FILTER)) { 200 start.update(&pid, &ts); 201 } 202 203 // Calculate current Process's wait time by finding the timestamp of when 204 // it went into waiting. 205 // pid and tgid are now the PID and TGID of the current (waking) Process. 206 pid = bpf_get_current_pid_tgid(); 207 tgid = bpf_get_current_pid_tgid() >> 32; 208 tsp = start.lookup(&pid); 209 if (tsp == 0) { 210 // Missed or filtered when the Process went into waiting 211 return 0; 212 } 213 u64 delta = ts - *tsp; 214 start.delete(&pid); 215 delta = delta / 1000; 216 if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) { 217 return 0; 218 } 219 220 // create map key 221 struct key_t key = {}; 222 struct wokeby_t *woke; 223 224 bpf_get_current_comm(&key.target, sizeof(key.target)); 225 key.t_pid = pid; 226 key.t_tgid = tgid; 227 key.t_k_stack_id = KERNEL_STACK_GET; 228 key.t_u_stack_id = USER_STACK_GET; 229 230 woke = wokeby.lookup(&pid); 231 if (woke) { 232 key.w_k_stack_id = woke->k_stack_id; 233 key.w_u_stack_id = woke->u_stack_id; 234 key.w_pid = woke->w_pid; 235 key.w_tgid = woke->w_tgid; 236 __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN); 237 wokeby.delete(&pid); 238 } 239 240 counts.increment(key, delta); 241 return 0; 242} 243""" 244 245# set thread filter 246if args.tgid is not None: 247 thread_filter = build_filter("tgid", args.tgid) 248elif args.pid is not None: 249 thread_filter = build_filter("pid", args.pid) 250elif args.user_threads_only: 251 thread_filter = '!(p->flags & PF_KTHREAD)' 252elif args.kernel_threads_only: 253 thread_filter = 'p->flags & PF_KTHREAD' 254else: 255 thread_filter = '1' 256if args.state == 0: 257 state_filter = 'p->STATE_FIELD == 0' 258elif args.state: 259 # these states are sometimes bitmask checked 260 state_filter = 'p->STATE_FIELD & %d' % args.state 261else: 262 state_filter = '1' 263bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter) 264bpf_text = bpf_text.replace('STATE_FILTER', state_filter) 265if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1: 266 bpf_text = bpf_text.replace('STATE_FIELD', '__state') 267else: 268 bpf_text = bpf_text.replace('STATE_FIELD', 'state') 269 270# set stack storage size 271bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) 272bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time)) 273bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time)) 274 275# handle stack args 276kernel_stack_get = "stack_traces.get_stackid(ctx, 0)" 277user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)" 278stack_context = "" 279if args.user_stacks_only: 280 stack_context = "user" 281 kernel_stack_get = "-1" 282elif args.kernel_stacks_only: 283 stack_context = "kernel" 284 user_stack_get = "-1" 285else: 286 stack_context = "user + kernel" 287bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get) 288bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get) 289if args.ebpf: 290 print(bpf_text) 291 exit() 292 293# initialize BPF 294b = BPF(text=bpf_text) 295b.attach_kprobe(event_re="^finish_task_switch$|^finish_task_switch\.isra\.\d$", 296 fn_name="oncpu") 297b.attach_kprobe(event="try_to_wake_up", fn_name="waker") 298matched = b.num_open_kprobes() 299if matched == 0: 300 print("0 functions traced. Exiting.") 301 exit() 302 303# header 304if not folded: 305 print("Tracing blocked time (us) by %s off-CPU and waker stack" % 306 stack_context, end="") 307 if duration < 99999999: 308 print(" for %d secs." % duration) 309 else: 310 print("... Hit Ctrl-C to end.") 311 312try: 313 sleep(duration) 314except KeyboardInterrupt: 315 # as cleanup can take many seconds, trap Ctrl-C: 316 # print a newline for folded output on Ctrl-C 317 signal.signal(signal.SIGINT, signal_ignore) 318 319 320if not folded: 321 print() 322 323missing_stacks = 0 324has_enomem = False 325counts = b.get_table("counts") 326stack_traces = b.get_table("stack_traces") 327need_delimiter = args.delimited and not (args.kernel_stacks_only or 328 args.user_stacks_only) 329for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): 330 # handle get_stackid errors 331 if not args.user_stacks_only: 332 missing_stacks += int(stack_id_err(k.w_k_stack_id)) 333 missing_stacks += int(stack_id_err(k.t_k_stack_id)) 334 has_enomem = has_enomem or (k.w_k_stack_id == -errno.ENOMEM) or \ 335 (k.t_k_stack_id == -errno.ENOMEM) 336 if not args.kernel_stacks_only: 337 missing_stacks += int(stack_id_err(k.w_u_stack_id)) 338 missing_stacks += int(stack_id_err(k.t_u_stack_id)) 339 has_enomem = has_enomem or (k.w_u_stack_id == -errno.ENOMEM) or \ 340 (k.t_u_stack_id == -errno.ENOMEM) 341 342 waker_user_stack = [] if k.w_u_stack_id < 1 else \ 343 reversed(list(stack_traces.walk(k.w_u_stack_id))[1:]) 344 waker_kernel_stack = [] if k.w_k_stack_id < 1 else \ 345 reversed(list(stack_traces.walk(k.w_k_stack_id))[1:]) 346 target_user_stack = [] if k.t_u_stack_id < 1 else \ 347 stack_traces.walk(k.t_u_stack_id) 348 target_kernel_stack = [] if k.t_k_stack_id < 1 else \ 349 stack_traces.walk(k.t_k_stack_id) 350 351 if folded: 352 # print folded stack output 353 line = [k.target.decode('utf-8', 'replace')] 354 if not args.kernel_stacks_only: 355 if stack_id_err(k.t_u_stack_id): 356 line.append("[Missed User Stack] %d" % k.t_u_stack_id) 357 else: 358 line.extend([b.sym(addr, k.t_tgid).decode('utf-8', 'replace') 359 for addr in reversed(list(target_user_stack)[1:])]) 360 if not args.user_stacks_only: 361 line.extend(["-"] if (need_delimiter and k.t_k_stack_id > 0 and k.t_u_stack_id > 0) else []) 362 if stack_id_err(k.t_k_stack_id): 363 line.append("[Missed Kernel Stack]") 364 else: 365 line.extend([b.ksym(addr).decode('utf-8', 'replace') 366 for addr in reversed(list(target_kernel_stack)[1:])]) 367 line.append("--") 368 if not args.user_stacks_only: 369 if stack_id_err(k.w_k_stack_id): 370 line.append("[Missed Kernel Stack]") 371 else: 372 line.extend([b.ksym(addr).decode('utf-8', 'replace') 373 for addr in reversed(list(waker_kernel_stack))]) 374 if not args.kernel_stacks_only: 375 line.extend(["-"] if (need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0) else []) 376 if stack_id_err(k.w_u_stack_id): 377 line.append("[Missed User Stack]") 378 else: 379 line.extend([b.sym(addr, k.w_tgid).decode('utf-8', 'replace') 380 for addr in reversed(list(waker_user_stack))]) 381 line.append(k.waker.decode('utf-8', 'replace')) 382 print("%s %d" % (";".join(line), v.value)) 383 else: 384 # print wakeup name then stack in reverse order 385 print(" %-16s %s %s" % ("waker:", k.waker.decode('utf-8', 'replace'), k.w_pid)) 386 if not args.kernel_stacks_only: 387 if stack_id_err(k.w_u_stack_id): 388 print(" [Missed User Stack] %d" % k.w_u_stack_id) 389 else: 390 for addr in waker_user_stack: 391 print(" %s" % b.sym(addr, k.w_tgid)) 392 if not args.user_stacks_only: 393 if need_delimiter and k.w_u_stack_id > 0 and k.w_k_stack_id > 0: 394 print(" -") 395 if stack_id_err(k.w_k_stack_id): 396 print(" [Missed Kernel Stack]") 397 else: 398 for addr in waker_kernel_stack: 399 print(" %s" % b.ksym(addr)) 400 401 # print waker/wakee delimiter 402 print(" %-16s %s" % ("--", "--")) 403 404 if not args.user_stacks_only: 405 if stack_id_err(k.t_k_stack_id): 406 print(" [Missed Kernel Stack]") 407 else: 408 for addr in target_kernel_stack: 409 print(" %s" % b.ksym(addr)) 410 if not args.kernel_stacks_only: 411 if need_delimiter and k.t_u_stack_id > 0 and k.t_k_stack_id > 0: 412 print(" -") 413 if stack_id_err(k.t_u_stack_id): 414 print(" [Missed User Stack]") 415 else: 416 for addr in target_user_stack: 417 print(" %s" % b.sym(addr, k.t_tgid)) 418 print(" %-16s %s %s" % ("target:", k.target.decode('utf-8', 'replace'), k.t_pid)) 419 print(" %d\n" % v.value) 420 421if missing_stacks > 0: 422 enomem_str = " Consider increasing --stack-storage-size." 423 print("WARNING: %d stack traces lost and could not be displayed.%s" % 424 (missing_stacks, (enomem_str if has_enomem else "")), 425 file=stderr) 426