1#!/usr/bin/python 2# 3# klockstat traces lock events and display locks statistics. 4# 5# USAGE: klockstat 6# 7 8from __future__ import print_function 9from bcc import BPF, USDT 10import argparse 11import subprocess 12import ctypes as ct 13from time import sleep, strftime 14from datetime import datetime, timedelta 15import errno 16from sys import stderr 17 18examples = """ 19 klockstat # trace system wide 20 klockstat -d 5 # trace for 5 seconds only 21 klockstat -i 5 # display stats every 5 seconds 22 klockstat -p 123 # trace locks for PID 123 23 klockstat -t 321 # trace locks for PID 321 24 klockstat -c pipe_ # display stats only for lock callers with 'pipe_' substring 25 klockstat -S acq_count # sort lock acquired results on acquired count 26 klockstat -S hld_total # sort lock held results on total held time 27 klockstat -S acq_count,hld_total # combination of above 28 klockstat -n 3 # display 3 locks 29 klockstat -s 3 # display 3 levels of stack 30""" 31 32# arg validation 33def positive_int(val): 34 try: 35 ival = int(val) 36 except ValueError: 37 raise argparse.ArgumentTypeError("must be an integer") 38 39 if ival < 0: 40 raise argparse.ArgumentTypeError("must be positive") 41 return ival 42 43def positive_nonzero_int(val): 44 ival = positive_int(val) 45 if ival == 0: 46 raise argparse.ArgumentTypeError("must be nonzero") 47 return ival 48 49def stack_id_err(stack_id): 50 # -EFAULT in get_stackid normally means the stack-trace is not available, 51 # Such as getting kernel stack trace in userspace code 52 return (stack_id < 0) and (stack_id != -errno.EFAULT) 53 54parser = argparse.ArgumentParser( 55 description="", 56 formatter_class=argparse.RawDescriptionHelpFormatter, 57 epilog=examples) 58 59time_group = parser.add_mutually_exclusive_group() 60time_group.add_argument("-d", "--duration", type=int, 61 help="total duration of trace in seconds") 62time_group.add_argument("-i", "--interval", type=int, 63 help="print summary at this interval (seconds)") 64parser.add_argument("-n", "--locks", type=int, default=99999999, 65 help="print given number of locks") 66parser.add_argument("-s", "--stacks", type=int, default=1, 67 help="print given number of stack entries") 68parser.add_argument("-c", "--caller", 69 help="print locks taken by given caller") 70parser.add_argument("-S", "--sort", 71 help="sort data on <aq_field,hd_field>, fields: acq_[max|total|count] hld_[max|total|count]") 72parser.add_argument("-p", "--pid", 73 help="trace this PID only") 74parser.add_argument("-t", "--tid", 75 help="trace this TID only") 76parser.add_argument("--stack-storage-size", default=16384, 77 type=positive_nonzero_int, 78 help="the number of unique stack traces that can be stored and " 79 "displayed (default 16384)") 80 81args = parser.parse_args() 82 83program = """ 84#include <uapi/linux/ptrace.h> 85 86struct depth_id { 87 u64 id; 88 u64 depth; 89}; 90 91BPF_ARRAY(enabled, u64, 1); 92BPF_HASH(track, u64, u64); 93BPF_HASH(time_aq, u64, u64); 94BPF_HASH(lock_depth, u64, u64); 95BPF_HASH(time_held, struct depth_id, u64); 96BPF_HASH(stack, struct depth_id, int); 97 98BPF_HASH(aq_report_count, int, u64); 99BPF_HASH(aq_report_max, int, u64); 100BPF_HASH(aq_report_total, int, u64); 101 102BPF_HASH(hl_report_count, int, u64); 103BPF_HASH(hl_report_max, int, u64); 104BPF_HASH(hl_report_total, int, u64); 105 106BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE); 107 108static bool is_enabled(void) 109{ 110 int key = 0; 111 u64 *ret; 112 113 ret = enabled.lookup(&key); 114 return ret && *ret == 1; 115} 116 117static bool allow_pid(u64 id) 118{ 119 u32 pid = id >> 32; // PID is higher part 120 u32 tid = id; // Cast and get the lower part 121 122 FILTER 123 124 return 1; 125} 126 127static int do_mutex_lock_enter(void *ctx, int skip) 128{ 129 if (!is_enabled()) 130 return 0; 131 132 u64 id = bpf_get_current_pid_tgid(); 133 134 if (!allow_pid(id)) 135 return 0; 136 137 u64 one = 1, zero = 0; 138 139 track.update(&id, &one); 140 141 u64 *depth = lock_depth.lookup(&id); 142 143 if (!depth) { 144 lock_depth.update(&id, &zero); 145 146 depth = lock_depth.lookup(&id); 147 /* something is wrong.. */ 148 if (!depth) 149 return 0; 150 } 151 152 int stackid = stack_traces.get_stackid(ctx, skip); 153 struct depth_id did = { 154 .id = id, 155 .depth = *depth, 156 }; 157 stack.update(&did, &stackid); 158 159 u64 ts = bpf_ktime_get_ns(); 160 time_aq.update(&id, &ts); 161 162 *depth += 1; 163 return 0; 164} 165 166static void update_aq_report_count(int *stackid) 167{ 168 u64 *count, one = 1; 169 170 count = aq_report_count.lookup(stackid); 171 if (!count) { 172 aq_report_count.update(stackid, &one); 173 } else { 174 *count += 1; 175 } 176} 177 178static void update_hl_report_count(int *stackid) 179{ 180 u64 *count, one = 1; 181 182 count = hl_report_count.lookup(stackid); 183 if (!count) { 184 hl_report_count.update(stackid, &one); 185 } else { 186 *count += 1; 187 } 188} 189 190static void update_aq_report_max(int *stackid, u64 time) 191{ 192 u64 *max; 193 194 max = aq_report_max.lookup(stackid); 195 if (!max || *max < time) 196 aq_report_max.update(stackid, &time); 197} 198 199static void update_hl_report_max(int *stackid, u64 time) 200{ 201 u64 *max; 202 203 max = hl_report_max.lookup(stackid); 204 if (!max || *max < time) 205 hl_report_max.update(stackid, &time); 206} 207 208static void update_aq_report_total(int *stackid, u64 delta) 209{ 210 u64 *count, *time; 211 212 count = aq_report_count.lookup(stackid); 213 if (!count) 214 return; 215 216 time = aq_report_total.lookup(stackid); 217 if (!time) { 218 aq_report_total.update(stackid, &delta); 219 } else { 220 *time = *time + delta; 221 } 222} 223 224static void update_hl_report_total(int *stackid, u64 delta) 225{ 226 u64 *count, *time; 227 228 count = hl_report_count.lookup(stackid); 229 if (!count) 230 return; 231 232 time = hl_report_total.lookup(stackid); 233 if (!time) { 234 hl_report_total.update(stackid, &delta); 235 } else { 236 *time = *time + delta; 237 } 238} 239 240static int do_mutex_lock_return(void) 241{ 242 if (!is_enabled()) 243 return 0; 244 245 u64 id = bpf_get_current_pid_tgid(); 246 247 if (!allow_pid(id)) 248 return 0; 249 250 u64 *one = track.lookup(&id); 251 252 if (!one) 253 return 0; 254 255 track.delete(&id); 256 257 u64 *depth = lock_depth.lookup(&id); 258 if (!depth) 259 return 0; 260 261 struct depth_id did = { 262 .id = id, 263 .depth = *depth - 1, 264 }; 265 266 u64 *aq = time_aq.lookup(&id); 267 if (!aq) 268 return 0; 269 270 int *stackid = stack.lookup(&did); 271 if (!stackid) 272 return 0; 273 274 int stackid_ = *stackid; 275 u64 cur = bpf_ktime_get_ns(); 276 277 if (cur > *aq) { 278 int val = cur - *aq; 279 update_aq_report_count(&stackid_); 280 update_aq_report_max(&stackid_, val); 281 update_aq_report_total(&stackid_, val); 282 } 283 284 time_held.update(&did, &cur); 285 return 0; 286} 287 288static int do_mutex_unlock_enter(void) 289{ 290 if (!is_enabled()) 291 return 0; 292 293 u64 id = bpf_get_current_pid_tgid(); 294 295 if (!allow_pid(id)) 296 return 0; 297 298 u64 *depth = lock_depth.lookup(&id); 299 300 if (!depth || *depth == 0) 301 return 0; 302 303 *depth -= 1; 304 305 struct depth_id did = { 306 .id = id, 307 .depth = *depth, 308 }; 309 310 u64 *held = time_held.lookup(&did); 311 if (!held) 312 return 0; 313 314 int *stackid = stack.lookup(&did); 315 if (!stackid) 316 return 0; 317 318 319 int stackid_ = *stackid; 320 u64 cur = bpf_ktime_get_ns(); 321 322 if (cur > *held) { 323 u64 val = cur - *held; 324 update_hl_report_count(&stackid_); 325 update_hl_report_max(&stackid_, val); 326 update_hl_report_total(&stackid_, val); 327 } 328 329 stack.delete(&did); 330 time_held.delete(&did); 331 return 0; 332} 333""" 334 335program_kprobe = """ 336int mutex_unlock_enter(struct pt_regs *ctx) 337{ 338 return do_mutex_unlock_enter(); 339} 340 341int mutex_lock_return(struct pt_regs *ctx) 342{ 343 return do_mutex_lock_return(); 344} 345 346int mutex_lock_enter(struct pt_regs *ctx) 347{ 348 return do_mutex_lock_enter(ctx, 0); 349} 350""" 351 352program_kfunc = """ 353KFUNC_PROBE(mutex_unlock, void *lock) 354{ 355 return do_mutex_unlock_enter(); 356} 357 358KRETFUNC_PROBE(mutex_lock, void *lock, int ret) 359{ 360 return do_mutex_lock_return(); 361} 362 363KFUNC_PROBE(mutex_lock, void *lock) 364{ 365 return do_mutex_lock_enter(ctx, 3); 366} 367 368""" 369 370program_kfunc_nested = """ 371KFUNC_PROBE(mutex_unlock, void *lock) 372{ 373 return do_mutex_unlock_enter(); 374} 375 376KRETFUNC_PROBE(mutex_lock_nested, void *lock, int ret) 377{ 378 return do_mutex_lock_return(); 379} 380 381KFUNC_PROBE(mutex_lock_nested, void *lock) 382{ 383 return do_mutex_lock_enter(ctx, 3); 384} 385 386""" 387 388is_support_kfunc = BPF.support_kfunc() 389if is_support_kfunc: 390 if BPF.get_kprobe_functions(b"mutex_lock_nested"): 391 program += program_kfunc_nested 392 else: 393 program += program_kfunc 394else: 395 program += program_kprobe 396 397def sort_list(maxs, totals, counts): 398 if (not args.sort): 399 return maxs; 400 401 for field in args.sort.split(','): 402 if (field == "acq_max" or field == "hld_max"): 403 return maxs 404 if (field == "acq_total" or field == "hld_total"): 405 return totals 406 if (field == "acq_count" or field == "hld_count"): 407 return counts 408 409 print("Wrong sort argument: %s", args.sort) 410 exit(-1) 411 412def display(sort, maxs, totals, counts): 413 global missing_stacks 414 global has_enomem 415 416 for k, v in sorted(sort.items(), key=lambda sort: sort[1].value, reverse=True)[:args.locks]: 417 missing_stacks += int(stack_id_err(k.value)) 418 has_enomem = has_enomem or (k.value == -errno.ENOMEM) 419 420 caller = "[Missed Kernel Stack]" 421 stack = [] 422 423 if (k.value >= 0): 424 stack = list(stack_traces.walk(k.value)) 425 caller = b.ksym(stack[1], show_offset=True) 426 427 if (args.caller and caller.find(args.caller.encode())): 428 continue 429 430 avg = totals[k].value / counts[k].value 431 432 print("%40s %10lu %6lu %10lu %10lu" % (caller, avg, counts[k].value, maxs[k].value, totals[k].value)) 433 434 for addr in stack[2:args.stacks]: 435 print("%40s" % b.ksym(addr, show_offset=True)) 436 437 438if args.tid: # TID trumps PID 439 program = program.replace('FILTER', 440 'if (tid != %s) { return 0; }' % args.tid) 441elif args.pid: 442 program = program.replace('FILTER', 443 'if (pid != %s) { return 0; }' % args.pid) 444else: 445 program = program.replace('FILTER', '') 446 447program = program.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size)) 448 449b = BPF(text=program) 450 451if not is_support_kfunc: 452 b.attach_kprobe(event="mutex_unlock", fn_name="mutex_unlock_enter") 453 # Depending on whether DEBUG_LOCK_ALLOC is set, the proper kprobe may be either mutex_lock or mutex_lock_nested 454 if BPF.get_kprobe_functions(b"mutex_lock_nested"): 455 b.attach_kretprobe(event="mutex_lock_nested", fn_name="mutex_lock_return") 456 b.attach_kprobe(event="mutex_lock_nested", fn_name="mutex_lock_enter") 457 else: 458 b.attach_kretprobe(event="mutex_lock", fn_name="mutex_lock_return") 459 b.attach_kprobe(event="mutex_lock", fn_name="mutex_lock_enter") 460 461enabled = b.get_table("enabled"); 462 463stack_traces = b.get_table("stack_traces") 464aq_counts = b.get_table("aq_report_count") 465aq_maxs = b.get_table("aq_report_max") 466aq_totals = b.get_table("aq_report_total") 467 468hl_counts = b.get_table("hl_report_count") 469hl_maxs = b.get_table("hl_report_max") 470hl_totals = b.get_table("hl_report_total") 471 472aq_sort = sort_list(aq_maxs, aq_totals, aq_counts) 473hl_sort = sort_list(hl_maxs, hl_totals, hl_counts) 474 475print("Tracing lock events... Hit Ctrl-C to end.") 476 477# duration and interval are mutualy exclusive 478exiting = 0 if args.interval else 1 479exiting = 1 if args.duration else 0 480 481seconds = 99999999 482if args.interval: 483 seconds = args.interval 484if args.duration: 485 seconds = args.duration 486 487missing_stacks = 0 488has_enomem = False 489 490while (1): 491 enabled[ct.c_int(0)] = ct.c_int(1) 492 493 try: 494 sleep(seconds) 495 except KeyboardInterrupt: 496 exiting = 1 497 498 enabled[ct.c_int(0)] = ct.c_int(0) 499 500 print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Spin", "Count", "Max spin", "Total spin")) 501 display(aq_sort, aq_maxs, aq_totals, aq_counts) 502 503 504 print("\n%40s %10s %6s %10s %10s" % ("Caller", "Avg Hold", "Count", "Max hold", "Total hold")) 505 display(hl_sort, hl_maxs, hl_totals, hl_counts) 506 507 if exiting: 508 break; 509 510 stack_traces.clear() 511 aq_counts.clear() 512 aq_maxs.clear() 513 aq_totals.clear() 514 hl_counts.clear() 515 hl_maxs.clear() 516 hl_totals.clear() 517 518if missing_stacks > 0: 519 enomem_str = " Consider increasing --stack-storage-size." 520 print("WARNING: %d stack traces lost and could not be displayed.%s" % 521 (missing_stacks, (enomem_str if has_enomem else "")), 522 file=stderr) 523