#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# criticalstat  Trace long critical sections (IRQs or preemption disabled)
#               For Linux, uses BCC, eBPF. Requires a kernel built with
#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS.
#
# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
#
# Copyright (c) 2018, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# By Joel Fernandes <joel@joelfernandes.org>

from __future__ import print_function
from bcc import BPF
import argparse
import sys
import os.path

examples = """examples:
    ./criticalstat            # trace IRQ-disabled sections longer than 100 us
    ./criticalstat -p         # trace preemption-disabled sections instead
    ./criticalstat -i -d 500  # trace IRQ-disabled sections longer than 500 us
"""

parser = argparse.ArgumentParser(
    description="Trace long critical sections",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-p", "--preemptoff", action="store_true",
                    help="Find long sections where preemption was off")

parser.add_argument("-i", "--irqoff", action="store_true",
                    help="Find long sections where IRQs were off")

parser.add_argument("-d", "--duration", type=int, default=100,
                    help="Duration in microseconds below which sections are filtered out")

args = parser.parse_args()

preemptoff = False
irqoff = False

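# IRQ-off tracing is the default; -p selects preemption-disabled
# sections instead, and -i wins if both flags are given.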
if args.irqoff:
    preemptoff = False
    irqoff = True
elif args.preemptoff:
    preemptoff = True
    irqoff = False

# Find the debugfs mount point by scanning /proc/mounts.
debugfs_path = ""
with open("/proc/mounts") as mounts:
    for line in mounts:
        fields = line.split()
        if len(fields) >= 3 and fields[2] == "debugfs":
            debugfs_path = fields[1]
            break

if debugfs_path == "":
    print("ERROR: Unable to find debugfs mount point")
    sys.exit(1)

trace_path = debugfs_path + "/tracing/events/preemptirq/"

if (not os.path.exists(trace_path + "irq_disable") or
        not os.path.exists(trace_path + "irq_enable") or
        not os.path.exists(trace_path + "preempt_disable") or
        not os.path.exists(trace_path + "preempt_enable")):
    print("ERROR: required tracing events are not available\n"
          "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT, "
          "CONFIG_PREEMPT_TRACER, "
          "and CONFIG_PREEMPTIRQ_EVENTS (CONFIG_PREEMPTIRQ_TRACEPOINTS in "
          "kernel 4.19 and later) enabled. Also please disable "
          "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
    sys.exit(1)

bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum addr_offs {
    START_CALLER_OFF,
    START_PARENT_OFF,
    END_CALLER_OFF,
    END_PARENT_OFF
};

struct start_data {
    u32 addr_offs[2];   /* entry caller/parent, as offsets from _stext */
    u64 ts;             /* timestamp at section entry (ns) */
    int idle_skip;      /* an idle event overlapped this section */
    int active;         /* a critical section is currently open */
};

struct data_t {
    u64 time;           /* section length (ns) */
    s64 stack_id;
    u32 cpu;
    u64 id;             /* pid/tgid of the task ending the section */
    u32 addrs[4];       /* indexed by addr_offs */
    char comm[TASK_COMM_LEN];
};

BPF_STACK_TRACE(stack_traces, 16384);
BPF_PERCPU_ARRAY(sts, struct start_data, 1);    /* per-CPU section state */
BPF_PERCPU_ARRAY(isidle, u64, 1);               /* per-CPU idle flag */
BPF_PERF_OUTPUT(events);

/*
 * The code below installs tracepoint probes on the preempt or IRQ
 * disable/enable critical sections and on idle events; the cases to
 * handle are combinations of four different events.
 * The events are defined as:
 * CSenter: A critical section has been entered, either due to
 *          preempt disable or IRQ disable.
 * CSexit:  A critical section has been exited, either due to
 *          preempt enable or IRQ enable.
 * Ienter:  The CPU has entered an idle state.
 * Iexit:   The CPU has exited an idle state.
 *
 * The scenario we are trying to detect is an overlap between a
 * critical section and idle entry/exit. Any such section is skipped,
 * since recording it would not be worthwhile and would only add noise.
 */
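/*
 * For example, on a single CPU the sequence
 *     CSenter, Ienter, Iexit, CSexit
 * spans an idle period, so the measured length says nothing about a
 * genuinely long preempt-off or IRQ-off section and is dropped.
 */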
TRACEPOINT_PROBE(power, cpu_idle)
{
    int idx = 0;
    u64 val;
    struct start_data *stdp, std;

    // Mark any active section so that it will be skipped.

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case CSenter, Ienter, Iexit, CSexit
    stdp = sts.lookup(&idx);
    if (stdp && stdp->active) {
        /*
         * Due to verifier issues, we have to copy the contents
         * of stdp onto the stack before the update.
         * Switch to updating stdp directly once kernel patch
         * d71962f becomes more widespread.
         */
        std = *stdp;
        std.idle_skip = 1;
        sts.update(&idx, &std);
    }

    // Record whether the CPU is entering or leaving idle.
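    /*
     * cpu_idle reports a small C-state index on idle entry and
     * PWR_EVENT_EXIT ((u32)-1) on idle exit, so any large state
     * value means the CPU is leaving idle.
     */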
    if (args->state < 100) {
        val = 1;
        isidle.update(&idx, &val);
    } else {
        val = 0;
        isidle.update(&idx, &val);
    }
    return 0;
}

static int in_idle(void)
{
    u64 *idlep;
    int idx = 0;

    // Skip the event if we're in the idle loop.
    idlep = isidle.lookup(&idx);
    if (idlep && *idlep)
        return 1;
    return 0;
}

static void reset_state(void)
{
    int idx = 0;
    struct start_data s = {};

    sts.update(&idx, &s);
}

TRACEPOINT_PROBE(preemptirq, TYPE_disable)
{
    int idx = 0;
    struct start_data s;

    // Handle the case Ienter, CSenter, CSexit, Iexit
    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    u64 ts = bpf_ktime_get_ns();

    s.idle_skip = 0;
    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
    s.ts = ts;
    s.active = 1;

    sts.update(&idx, &s);
    return 0;
}

TRACEPOINT_PROBE(preemptirq, TYPE_enable)
{
    int idx = 0;
    u64 start_ts, end_ts, diff;
    struct start_data *stdp;

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case Ienter, CSenter, CSexit, Iexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    stdp = sts.lookup(&idx);
    if (!stdp) {
        reset_state();
        return 0;
    }

    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (!stdp->active) {
        reset_state();
        return 0;
    }

    // Handle the case CSenter, Ienter, Iexit, CSexit
    if (stdp->idle_skip) {
        reset_state();
        return 0;
    }

    end_ts = bpf_ktime_get_ns();
    start_ts = stdp->ts;

    // Discard bogus intervals, e.g. from a stale start record.
    if (start_ts > end_ts) {
        reset_state();
        return 0;
    }

    diff = end_ts - start_ts;

    // Filter out sections shorter than the requested duration.
    if (diff < DURATION) {
        reset_state();
        return 0;
    }

    u64 id = bpf_get_current_pid_tgid();
    struct data_t data = {};

    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
        data.addrs[END_CALLER_OFF] = args->caller_offs;
        data.addrs[END_PARENT_OFF] = args->parent_offs;

        data.id = id;
        data.stack_id = stack_traces.get_stackid(args, 0);
        data.time = diff;
        data.cpu = bpf_get_smp_processor_id();
        events.perf_submit(args, &data, sizeof(data));
    }

    reset_state();
    return 0;
}
"""
bpf_text = bpf_text.replace('DURATION', str(args.duration * 1000))

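# Pick which pair of preemptirq tracepoints to probe:
# preempt_disable/preempt_enable or irq_disable/irq_enable.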
if preemptoff:
    bpf_text = bpf_text.replace('TYPE', 'preempt')
else:
    bpf_text = bpf_text.replace('TYPE', 'irq')

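# Loading the program auto-attaches each TRACEPOINT_PROBE defined above.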
b = BPF(text=bpf_text)

def get_syms(kstack):
    # Resolve each address in a kernel stack to "symbol+offset".
    syms = []

    for addr in kstack:
        s = b.ksym(addr, show_offset=True)
        syms.append(s.decode('utf-8', 'replace'))

    return syms

# process event
def print_event(cpu, data, size):
    try:
        event = b["events"].event(data)
        stack_traces = b['stack_traces']
        stext = b.ksymname('_stext')

        print("===================================")
        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" %
              (event.comm.decode('utf-8', 'replace'), (event.id >> 32),
               (event.id & 0xffffffff), float(event.time) / 1000), end="")
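        # The tracepoint exports call sites as offsets from _stext, so
        # rebuild absolute addresses before resolving them to symbols.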
        print("Section start: {} -> {}".format(
            b.ksym(stext + event.addrs[0]).decode('utf-8', 'replace'),
            b.ksym(stext + event.addrs[1]).decode('utf-8', 'replace')))
        print("Section end:   {} -> {}".format(
            b.ksym(stext + event.addrs[2]).decode('utf-8', 'replace'),
            b.ksym(stext + event.addrs[3]).decode('utf-8', 'replace')))

        if event.stack_id >= 0:
            kstack = stack_traces.walk(event.stack_id)
            syms = get_syms(kstack)
            if not syms:
                return

            for s in syms:
                print("  %s" % s)
        else:
            print("NO STACK FOUND DUE TO COLLISION")
        print("===================================")
        print("")
    except Exception:
        sys.exit(0)

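# Use a larger-than-default perf buffer (256 pages per CPU) to reduce
# the chance of losing events during bursts.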
b["events"].open_perf_buffer(print_event, page_cnt=256)

print("Finding critical sections with {} disabled for > {}us".format(
    ('preempt' if preemptoff else 'IRQ'), args.duration))

while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        exit()