1#!/usr/bin/env python 2# 3# offwaketime Summarize blocked time by kernel off-CPU stack + waker stack 4# For Linux, uses BCC, eBPF. 5# 6# USAGE: offwaketime [-h] [-u] [-p PID] [-T] [duration] 7# 8# The current implementation uses an unrolled loop for x86_64, and was written 9# as a proof of concept. This implementation should be replaced in the future 10# with an appropriate bpf_ call, when available. 11# 12# The Off-CPU stack is currently limited to a stack trace depth of 20 13# (maxtdepth), and the waker stack limited to 10 (maxwdepth). This is also 14# limited to kernel stacks, and x86_64 only. Check for future versions, where 15# these limitations should be removed. 16# 17# Copyright 2016 Netflix, Inc. 18# Licensed under the Apache License, Version 2.0 (the "License") 19# 20# 20-Jan-2016 Brendan Gregg Created this. 21 22from __future__ import print_function 23from bcc import BPF 24from time import sleep 25import argparse 26import signal 27 28# arguments 29examples = """examples: 30 ./offwaketime # trace off-CPU + waker stack time until Ctrl-C 31 ./offwaketime 5 # trace for 5 seconds only 32 ./offwaketime -f 5 # 5 seconds, and output in folded format 33 ./offwaketime -u # don't include kernel threads (user only) 34 ./offwaketime -p 185 # trace fo PID 185 only 35""" 36parser = argparse.ArgumentParser( 37 description="Summarize blocked time by kernel stack trace + waker stack", 38 formatter_class=argparse.RawDescriptionHelpFormatter, 39 epilog=examples) 40parser.add_argument("-u", "--useronly", action="store_true", 41 help="user threads only (no kernel threads)") 42parser.add_argument("-p", "--pid", 43 help="trace this PID only") 44parser.add_argument("-v", "--verbose", action="store_true", 45 help="show raw addresses") 46parser.add_argument("-f", "--folded", action="store_true", 47 help="output folded format") 48parser.add_argument("duration", nargs="?", default=99999999, 49 help="duration of trace, in seconds") 50args = parser.parse_args() 51folded = args.folded 52duration = int(args.duration) 53debug = 0 54maxwdepth = 10 # and MAXWDEPTH 55maxtdepth = 20 # and MAXTDEPTH 56if args.pid and args.useronly: 57 print("ERROR: use either -p or -u.") 58 exit() 59 60# signal handler 61def signal_ignore(signal, frame): 62 print() 63 64# define BPF program 65bpf_text = """ 66#include <uapi/linux/ptrace.h> 67#include <linux/sched.h> 68 69#define MAXWDEPTH 10 70#define MAXTDEPTH 20 71#define MINBLOCK_US 1 72 73struct key_t { 74 char waker[TASK_COMM_LEN]; 75 char target[TASK_COMM_LEN]; 76 u64 wret[MAXWDEPTH]; 77 u64 tret[MAXTDEPTH]; 78}; 79BPF_HASH(counts, struct key_t); 80BPF_HASH(start, u32); 81struct wokeby_t { 82 char name[TASK_COMM_LEN]; 83 u64 ret[MAXWDEPTH]; 84}; 85BPF_HASH(wokeby, u32, struct wokeby_t); 86 87static u64 get_frame(u64 *bp) { 88 if (*bp) { 89 // The following stack walker is x86_64 specific 90 u64 ret = 0; 91 if (bpf_probe_read(&ret, sizeof(ret), (void *)(*bp+8))) 92 return 0; 93 if (bpf_probe_read(bp, sizeof(*bp), (void *)*bp)) 94 *bp = 0; 95 if (ret < __START_KERNEL_map) 96 return 0; 97 return ret; 98 } 99 return 0; 100} 101 102int waker(struct pt_regs *ctx, struct task_struct *p) { 103 u32 pid = p->pid; 104 105 if (!(FILTER)) 106 return 0; 107 108 u64 bp = 0; 109 struct wokeby_t woke = {}; 110 int depth = 0; 111 bpf_get_current_comm(&woke.name, sizeof(woke.name)); 112 bp = ctx->bp; 113 114 // unrolled loop (MAXWDEPTH): 115 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 116 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 117 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 118 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 119 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 120 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 121 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 122 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 123 if (!(woke.ret[depth++] = get_frame(&bp))) goto out; 124 woke.ret[depth] = get_frame(&bp); 125 126out: 127 wokeby.update(&pid, &woke); 128 return 0; 129} 130 131int oncpu(struct pt_regs *ctx, struct task_struct *p) { 132 u32 pid = p->pid; 133 u64 ts, *tsp; 134 135 // record previous thread sleep time 136 if (FILTER) { 137 ts = bpf_ktime_get_ns(); 138 start.update(&pid, &ts); 139 } 140 141 // calculate current thread's delta time 142 pid = bpf_get_current_pid_tgid(); 143 tsp = start.lookup(&pid); 144 if (tsp == 0) 145 return 0; // missed start or filtered 146 u64 delta = bpf_ktime_get_ns() - *tsp; 147 start.delete(&pid); 148 delta = delta / 1000; 149 if (delta < MINBLOCK_US) 150 return 0; 151 152 // create map key 153 u64 zero = 0, *val, bp = 0; 154 int depth = 0; 155 struct key_t key = {}; 156 struct wokeby_t *woke; 157 bpf_get_current_comm(&key.target, sizeof(key.target)); 158 bp = ctx->bp; 159 160 // unrolled loop (MAXTDEPTH): 161 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 162 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 163 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 164 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 165 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 166 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 167 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 168 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 169 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 170 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 171 172 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 173 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 174 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 175 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 176 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 177 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 178 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 179 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 180 if (!(key.tret[depth++] = get_frame(&bp))) goto out; 181 key.tret[depth] = get_frame(&bp); 182 183out: 184 woke = wokeby.lookup(&pid); 185 if (woke) { 186 __builtin_memcpy(&key.wret, woke->ret, sizeof(key.wret)); 187 __builtin_memcpy(&key.waker, woke->name, TASK_COMM_LEN); 188 wokeby.delete(&pid); 189 } 190 191 val = counts.lookup_or_init(&key, &zero); 192 (*val) += delta; 193 return 0; 194} 195""" 196if args.pid: 197 filter = 'pid == %s' % args.pid 198elif args.useronly: 199 filter = '!(p->flags & PF_KTHREAD)' 200else: 201 filter = '1' 202bpf_text = bpf_text.replace('FILTER', filter) 203if debug: 204 print(bpf_text) 205 206# initialize BPF 207b = BPF(text=bpf_text) 208b.attach_kprobe(event="finish_task_switch", fn_name="oncpu") 209b.attach_kprobe(event="try_to_wake_up", fn_name="waker") 210matched = b.num_open_kprobes() 211if matched == 0: 212 print("0 functions traced. Exiting.") 213 exit() 214 215# header 216if not folded: 217 print("Tracing blocked time (us) by kernel off-CPU and waker stack", 218 end="") 219 if duration < 99999999: 220 print(" for %d secs." % duration) 221 else: 222 print("... Hit Ctrl-C to end.") 223 224# output 225while (1): 226 try: 227 sleep(duration) 228 except KeyboardInterrupt: 229 # as cleanup can take many seconds, trap Ctrl-C: 230 signal.signal(signal.SIGINT, signal_ignore) 231 232 if not folded: 233 print() 234 counts = b.get_table("counts") 235 for k, v in sorted(counts.items(), key=lambda counts: counts[1].value): 236 if folded: 237 # fold target stack 238 line = k.target + ";" 239 for i in reversed(range(0, maxtdepth)): 240 if k.tret[i] == 0: 241 continue 242 line = line + b.ksym(k.tret[i]) 243 if i != 0: 244 line = line + ";" 245 246 # add delimiter 247 line = line + ";-" 248 249 # fold waker stack 250 for i in range(0, maxwdepth): 251 line = line + ";" 252 if k.wret[i] == 0: 253 break 254 line = line + b.ksym(k.wret[i]) 255 if i != 0: 256 line = line + ";" + k.waker 257 258 # print as a line 259 print("%s %d" % (line, v.value)) 260 else: 261 # print wakeup name then stack in reverse order 262 print(" %-16s %s" % ("waker:", k.waker)) 263 for i in reversed(range(0, maxwdepth)): 264 if k.wret[i] == 0: 265 continue 266 print(" %-16x %s" % (k.wret[i], 267 b.ksym(k.wret[i]))) 268 269 # print delimiter 270 print(" %-16s %s" % ("-", "-")) 271 272 # print default multi-line stack output 273 for i in range(0, maxtdepth): 274 if k.tret[i] == 0: 275 break 276 print(" %-16x %s" % (k.tret[i], 277 b.ksym(k.tret[i]))) 278 print(" %-16s %s" % ("target:", k.target)) 279 print(" %d\n" % v.value) 280 counts.clear() 281 282 if not folded: 283 print("Detaching...") 284 exit() 285