#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# criticalstat  Trace long critical sections (IRQs or preemption disabled)
#               For Linux, uses BCC, eBPF. Requires kernel built with
#               CONFIG_DEBUG_PREEMPT and CONFIG_PREEMPTIRQ_EVENTS
#
# USAGE: criticalstat [-h] [-p] [-i] [-d DURATION]
#
# Copyright (c) 2018, Google, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# By Joel Fernandes <joel@joelfernandes.org>

from __future__ import print_function
from bcc import BPF
import argparse
import ctypes as ct
import sys
import subprocess
import os.path

examples = """examples:
    ./criticalstat           # trace IRQ-disabled sections longer than 100 us
    ./criticalstat -p        # trace preempt-disabled sections instead
    ./criticalstat -d 500    # report only sections longer than 500 us
"""

parser = argparse.ArgumentParser(
    description="Trace long critical sections",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)

parser.add_argument("-p", "--preemptoff", action="store_true",
                    help="Find long sections where preemption was off")

parser.add_argument("-i", "--irqoff", action="store_true",
                    help="Find long sections where IRQs were off")

parser.add_argument("-d", "--duration", type=int, default=100,
                    help="Duration in microseconds below which sections are filtered out")

args = parser.parse_args()

# Default to tracing IRQ-disabled sections; -i takes precedence over -p
# when both are given.
preemptoff = False
irqoff = False

if args.irqoff:
    preemptoff = False
    irqoff = True
elif args.preemptoff:
    preemptoff = True
    irqoff = False

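# Locate the debugfs mount point by parsing /proc/mounts; the preemptirq
# tracepoint directories are then checked under <debugfs>/tracing/events/.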
debugfs_path = subprocess.Popen("cat /proc/mounts | grep -w debugfs" +
    " | awk '{print $2}'",
    shell=True,
    stdout=subprocess.PIPE).stdout.read().split(b"\n")[0]

if debugfs_path == b"":
    print("ERROR: Unable to find debugfs mount point")
    sys.exit(1)

trace_path = debugfs_path + b"/tracing/events/preemptirq/"

if (not os.path.exists(trace_path + b"irq_disable") or
   not os.path.exists(trace_path + b"irq_enable") or
   not os.path.exists(trace_path + b"preempt_disable") or
   not os.path.exists(trace_path + b"preempt_enable")):
    print("ERROR: required tracing events are not available\n" +
        "Make sure the kernel is built with CONFIG_DEBUG_PREEMPT " +
        "and CONFIG_PREEMPTIRQ_EVENTS enabled. Also please disable " +
        "CONFIG_PROVE_LOCKING and CONFIG_LOCKDEP on older kernels.")
    sys.exit(1)

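# The embedded BPF program below attaches to the preemptirq tracepoints
# (TYPE is substituted with 'preempt' or 'irq' before loading) and to
# power:cpu_idle, which is used to discard sections overlapping the idle loop.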
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

enum addr_offs {
    START_CALLER_OFF,
    START_PARENT_OFF,
    END_CALLER_OFF,
    END_PARENT_OFF
};

struct start_data {
    u32 addr_offs[2];
    u64 ts;
    int idle_skip;
    int active;
};

struct data_t {
    u64 time;
    s64 stack_id;
    u32 cpu;
    u64 id;
    u32 addrs[4];   /* indexed by addr_offs */
    char comm[TASK_COMM_LEN];
};

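/*
 * stack_traces holds kernel stacks of long sections, sts holds the per-CPU
 * state of the section currently in progress, isidle flags whether the CPU
 * is in the idle loop, and events streams completed sections to user space.
 */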
BPF_STACK_TRACE(stack_traces, 16384);
BPF_PERCPU_ARRAY(sts, struct start_data, 1);
BPF_PERCPU_ARRAY(isidle, u64, 1);
BPF_PERF_OUTPUT(events);

/*
 * In the code below we install tracepoint probes on preempt or
 * IRQ disable/enable critical sections and on idle events; the cases
 * are combinations of 4 different states.
 * The states are defined as:
 * CSenter: A critical section has been entered, either due to
 *          preempt disable or IRQ disable.
 * CSexit: A critical section has been exited, either due to
 *         preempt enable or IRQ enable.
 * Ienter: The CPU has entered an idle state.
 * Iexit: The CPU has exited an idle state.
 *
 * The scenario we are trying to detect is an overlap between
 * critical sections and idle entry/exit. If there are any such
 * cases, we avoid recording those critical sections since they
 * are not worthwhile to record and just add noise.
 */
TRACEPOINT_PROBE(power, cpu_idle)
{
    int idx = 0;
    u64 val;
    struct start_data *stdp, std;

    // Mark any active section so that it will be skipped

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case CSenter, Ienter, Iexit, CSexit
    stdp = sts.lookup(&idx);
    if (stdp && stdp->active) {
        /*
         * Due to verifier issues, we have to copy contents
         * of stdp onto the stack before the update.
         * Fix it to directly update once kernel patch d71962f
         * becomes more widespread.
         */
        std = *stdp;
        std.idle_skip = 1;
        sts.update(&idx, &std);
    }

    // Mark the CPU as being inside the idle loop or not.
    if (args->state < 100) {
        val = 1;
        isidle.update(&idx, &val);
    } else {
        val = 0;
        isidle.update(&idx, &val);
    }
    return 0;
}

static int in_idle(void)
{
    u64 *idlep;
    int idx = 0;

    // Skip event if we're in idle loop
    idlep = isidle.lookup(&idx);
    if (idlep && *idlep)
        return 1;
    return 0;
}

static void reset_state(void)
{
    int idx = 0;
    struct start_data s = {};

    sts.update(&idx, &s);
}

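/*
 * caller_offs and parent_offs reported by these tracepoints are offsets
 * from _stext; user space adds them back to the _stext address before
 * symbolizing them.
 */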
TRACEPOINT_PROBE(preemptirq, TYPE_disable)
{
    int idx = 0;
    struct start_data s;

    // Handle the case Ienter, CSenter, CSexit, Iexit
    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    u64 ts = bpf_ktime_get_ns();

    s.idle_skip = 0;
    s.addr_offs[START_CALLER_OFF] = args->caller_offs;
    s.addr_offs[START_PARENT_OFF] = args->parent_offs;
    s.ts = ts;
    s.active = 1;

    sts.update(&idx, &s);
    return 0;
}

TRACEPOINT_PROBE(preemptirq, TYPE_enable)
{
    int idx = 0;
    u64 start_ts, end_ts, diff;
    struct start_data *stdp;

    // Handle the case CSenter, Ienter, CSexit, Iexit
    // Handle the case Ienter, CSenter, CSexit, Iexit
    if (in_idle()) {
        reset_state();
        return 0;
    }

    stdp = sts.lookup(&idx);
    if (!stdp) {
        reset_state();
        return 0;
    }

    // Handle the case Ienter, CSenter, Iexit, CSexit
    if (!stdp->active) {
        reset_state();
        return 0;
    }

    // Handle the case CSenter, Ienter, Iexit, CSexit
    if (stdp->idle_skip) {
        reset_state();
        return 0;
    }

    end_ts = bpf_ktime_get_ns();
    start_ts = stdp->ts;

    if (start_ts > end_ts) {
        reset_state();
        return 0;
    }

    diff = end_ts - start_ts;

    if (diff < DURATION) {
        reset_state();
        return 0;
    }

    u64 id = bpf_get_current_pid_tgid();
    struct data_t data = {};

    if (bpf_get_current_comm(&data.comm, sizeof(data.comm)) == 0) {
        data.addrs[START_CALLER_OFF] = stdp->addr_offs[START_CALLER_OFF];
        data.addrs[START_PARENT_OFF] = stdp->addr_offs[START_PARENT_OFF];
        data.addrs[END_CALLER_OFF] = args->caller_offs;
        data.addrs[END_PARENT_OFF] = args->parent_offs;

        data.id = id;
        data.stack_id = stack_traces.get_stackid(args, 0);
        data.time = diff;
        data.cpu = bpf_get_smp_processor_id();
        events.perf_submit(args, &data, sizeof(data));
    }

    reset_state();
    return 0;
}
"""
bpf_text = bpf_text.replace('DURATION', '{}'.format(args.duration * 1000))

if preemptoff:
    bpf_text = bpf_text.replace('TYPE', 'preempt')
else:
    bpf_text = bpf_text.replace('TYPE', 'irq')

b = BPF(text=bpf_text)

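# The ctypes structure below must mirror 'struct data_t' in the BPF program;
# it is used to decode the raw bytes delivered through the perf buffer.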
TASK_COMM_LEN = 16    # linux/sched.h

class Data(ct.Structure):
    _fields_ = [
        ("time", ct.c_ulonglong),
        ("stack_id", ct.c_longlong),
        ("cpu", ct.c_int),
        ("id", ct.c_ulonglong),
        ("addrs", ct.c_int * 4),
        ("comm", ct.c_char * TASK_COMM_LEN),
    ]

def get_syms(kstack):
    syms = []

    for addr in kstack:
        s = b.ksym(addr, show_offset=True)
        syms.append(s)

    return syms

# process event
def print_event(cpu, data, size):
    try:
        global b
        event = ct.cast(data, ct.POINTER(Data)).contents
        stack_traces = b['stack_traces']
        stext = b.ksymname('_stext')

        print("===================================")
        print("TASK: %s (pid %5d tid %5d) Total Time: %-9.3fus\n\n" % (
            event.comm, (event.id >> 32), (event.id & 0xffffffff),
            float(event.time) / 1000), end="")
        print("Section start: {} -> {}".format(
            b.ksym(stext + event.addrs[0]), b.ksym(stext + event.addrs[1])))
        print("Section end:   {} -> {}".format(
            b.ksym(stext + event.addrs[2]), b.ksym(stext + event.addrs[3])))

        if event.stack_id >= 0:
            kstack = stack_traces.walk(event.stack_id)
            syms = get_syms(kstack)
            if not syms:
                return

            for s in syms:
                print("  %s" % s)
        else:
            print("NO STACK FOUND DUE TO COLLISION")
        print("===================================")
        print("")
    except:
        sys.exit(0)

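# A larger perf buffer (256 pages) reduces the chance of dropped events.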
b["events"].open_perf_buffer(print_event, page_cnt=256)

print("Finding critical sections with {} disabled for > {}us".format(
    ('preempt' if preemptoff else 'IRQ'), args.duration))

while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        sys.exit(0)