#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# criticalstat  Trace long critical sections (IRQs or preemption disabled)
#               For Linux, uses BCC, eBPF. Requires kernel built with
#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
#
# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
#
# Copyright (c) 2018, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# By Joel Fernandes <joel@joelfernandes.org>

from __future__ import print_function
from bcc import BPF
import argparse
import sys
import subprocess
import os.path

examples = ""

parser = argparse.ArgumentParser(
    description="Trace long critical sections",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-p", "--preemptoff", action="store_true",
                    help="Find long sections where preemption was off")

parser.add_argument("-i", "--irqoff", action="store_true",
                    help="Find long sections where IRQs were off")

parser.add_argument("-d", "--duration", default=100,
                    help="Duration in us (microseconds) below which we filter")

args = parser.parse_args()

preemptoff = False
irqoff = False

if args.irqoff:
    preemptoff = False
    irqoff = True
elif args.preemptoff:
    preemptoff = True
    irqoff = False

debugfs_path = subprocess.Popen("cat /proc/mounts | grep -w debugfs" +
    " | awk '{print $2}'",
    shell=True,
    stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]

if debugfs_path == b"":
    print("ERROR: Unable to find debugfs mount point")
    sys.exit(0)

trace_path = debugfs_path + b"/tracing/events/preemptirq/"

if (not os.path.exists(trace_path + b"irq_disable") or
        not os.path.exists(trace_path + b"irq_enable") or
        not os.path.exists(trace_path + b"preempt_disable") or
        not os.path.exists(trace_path + b"preempt_enable")):
    print("ERROR: required tracing events are not available\n" +
          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT, " +
          "CONFIG_PREEMPT_TRACER " +
          "and CONFIG_PREEMPTIRQ_EVENTS (CONFIG_PREEMPTIRQ_TRACEPOINTS in " +
          "kernel 4.19 and later) enabled. Also please disable " +
          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
    sys.exit(0)

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum addr_offs {
    START_CALLER_OFF,
    START_PARENT_OFF,
    END_CALLER_OFF,
    END_PARENT_OFF
};

struct start_data {
    u32 addr_offs[2];
    u64 ts;
    int idle_skip;
    int active;
};

struct data_t {
    u64 time;
    s64 stack_id;
    u32 cpu;
    u64 id;
    u32 addrs[4];   /* indexed by addr_offs */
    char comm[TASK_COMM_LEN];
};

BPF_STACK_TRACE(stack_traces, 16384);
BPF_PERCPU_ARRAY(sts, struct start_data, 1);
BPF_PERCPU_ARRAY(isidle, u64, 1);
BPF_PERF_OUTPUT(events);

/*
 * In the code below we install tracepoint probes on preempt or
 * IRQ disable/enable critical sections and on idle events; the cases
 * are combinations of 4 different states.
 * The states are defined as:
 * CSenter: A critical section has been entered, either due to
 *          preempt disable or irq disable.
 * CSexit: A critical section has been exited, either due to
 *         preempt enable or irq enable.
 * Ienter: The CPU has entered an idle state.
 * Iexit: The CPU has exited an idle state.
 *
 * The scenario we are trying to detect is whether there is an overlap
 * between critical sections and idle entry/exit. If there are any
 * such cases, we avoid recording those critical sections since they
 * are not worthwhile to record and just add noise.
 */
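/*
 * For example, in the interleaving CSenter, Ienter, Iexit, CSexit the
 * measured duration would mostly be idle time: the cpu_idle probe below
 * marks the in-flight section with idle_skip, and the enable probe then
 * discards it rather than reporting a misleadingly long critical section.
 */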
TRACEPOINT_PROBE(power, cpu_idle)
{
    int idx = 0;
    u64 val;
    struct start_data *stdp, std;

    // Mark any active section so that it is skipped when it ends.

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case CSenter, Ienter, Iexit, CSexit
    stdp = sts.lookup(&idx);
    if (stdp && stdp->active) {
        /*
         * Due to verifier issues, we have to copy contents
         * of stdp onto the stack before the update.
         * Fix it to directly update once kernel patch d71962f
         * becomes more widespread.
         */
        std = *stdp;
        std.idle_skip = 1;
        sts.update(&idx, &std);
    }

    // Mark whether the CPU is currently within idle or not.
    // A small state value means idle entry; idle exit is reported with
    // PWR_EVENT_EXIT ((u32)-1), which is far larger than 100.
    if (args->state < 100) {
        val = 1;
        isidle.update(&idx, &val);
    } else {
        val = 0;
        isidle.update(&idx, &val);
    }
    return 0;
}

static int in_idle(void)
{
    u64 *idlep;
    int idx = 0;

    // Skip event if we're in the idle loop
    idlep = isidle.lookup(&idx);
    if (idlep && *idlep)
        return 1;
    return 0;
}

static void reset_state(void)
{
    int idx = 0;
    struct start_data s = {};

    sts.update(&idx, &s);
}

TRACEPOINT_PROBE(preemptirq, TYPE_disable)
{
    int idx = 0;
    struct start_data s;

    // Handle the case Ienter, CSenter, CSexit, Iexit
    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    u64 ts = bpf_ktime_get_ns();

    s.idle_skip = 0;
    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
    s.ts = ts;
    s.active = 1;

    sts.update(&idx, &s);
    return 0;
}

TRACEPOINT_PROBE(preemptirq, TYPE_enable)
{
    int idx = 0;
    u64 start_ts, end_ts, diff;
    struct start_data *stdp;

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case Ienter, CSenter, CSexit, Iexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    stdp = sts.lookup(&idx);
    if (!stdp) {
        reset_state();
        return 0;
    }

    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (!stdp->active) {
        reset_state();
        return 0;
    }

    // Handle the case CSenter, Ienter, Iexit, CSexit
    if (stdp->idle_skip) {
        reset_state();
        return 0;
    }

    end_ts = bpf_ktime_get_ns();
    start_ts = stdp->ts;

    if (start_ts > end_ts) {
        reset_state();
        return 0;
    }

    diff = end_ts - start_ts;

    if (diff < DURATION) {
        reset_state();
        return 0;
    }

    u64 id = bpf_get_current_pid_tgid();
    struct data_t data = {};

    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
        data.addrs[END_CALLER_OFF] = args->caller_offs;
        data.addrs[END_PARENT_OFF] = args->parent_offs;

        data.id = id;
        data.stack_id = stack_traces.get_stackid(args, 0);
        data.time = diff;
        data.cpu = bpf_get_smp_processor_id();
        events.perf_submit(args, &data, sizeof(data));
    }

    reset_state();
    return 0;
}
"""
bpf_text = bpf_text.replace('DURATION', '{}'.format(int(args.duration) * 1000))

if preemptoff:
    bpf_text = bpf_text.replace('TYPE', 'preempt')
else:
    bpf_text = bpf_text.replace('TYPE', 'irq')
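# DURATION above is substituted in nanoseconds (args.duration is given in
# microseconds). With -p the probes attach to preempt_disable/preempt_enable;
# otherwise (the default) to irq_disable/irq_enable. BCC attaches the
# TRACEPOINT_PROBEs automatically when the BPF object below is constructed.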
b = BPF(text=bpf_text)

def get_syms(kstack):
    syms = []

    for addr in kstack:
        s = b.ksym(addr, show_offset=True)
        syms.append(s)

    return syms

# process event
def print_event(cpu, data, size):
    try:
        global b
        event = b["events"].event(data)
        stack_traces = b['stack_traces']
        stext = b.ksymname('_stext')

        print("===================================")
        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (event.comm,
            (event.id >> 32), (event.id & 0xffffffff),
            float(event.time) / 1000), end="")
        print("Section start: {} -> {}".format(
            b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
        print("Section end:   {} -> {}".format(
            b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))

        if event.stack_id >= 0:
            kstack = stack_traces.walk(event.stack_id)
            syms = get_syms(kstack)
            if not syms:
                return

            for s in syms:
                print("  ", end="")
                print("%s" % s)
        else:
            print("NO STACK FOUND DUE TO COLLISION")
        print("===================================")
        print("")
    except Exception:
        sys.exit(0)

b["events"].open_perf_buffer(print_event, page_cnt=256)

print("Finding critical section with {} disabled for > {}us".format(
    ('preempt' if preemptoff else 'IRQ'), args.duration))

while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()