1#!/usr/bin/python
2#
3# offcputime    Summarize off-CPU time by stack trace
4#               For Linux, uses BCC, eBPF.
5#
6# USAGE: offcputime [-h] [-p PID | -u | -k] [-U | -K] [-f] [duration]
7#
8# Copyright 2016 Netflix, Inc.
9# Licensed under the Apache License, Version 2.0 (the "License")
10#
11# 13-Jan-2016	Brendan Gregg	Created this.
12
13from __future__ import print_function
14from bcc import BPF
15from sys import stderr
16from time import strftime
17import argparse
18import errno
19import signal
20
# arg validation
def positive_int(val):
    """Argparse type: parse val as a non-negative integer.

    Raises argparse.ArgumentTypeError when val is not an integer or is
    negative (zero is accepted; see positive_nonzero_int for strict > 0).
    """
    try:
        parsed = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")

    if parsed >= 0:
        return parsed
    raise argparse.ArgumentTypeError("must be positive")
31
def positive_nonzero_int(val):
    """Argparse type: parse val as a strictly positive (nonzero) integer.

    Raises argparse.ArgumentTypeError for non-integers, negatives, and zero.
    """
    try:
        parsed = int(val)
    except ValueError:
        raise argparse.ArgumentTypeError("must be an integer")
    if parsed < 0:
        raise argparse.ArgumentTypeError("must be positive")
    if parsed == 0:
        raise argparse.ArgumentTypeError("must be nonzero")
    return parsed
37
def stack_id_err(stack_id):
    """Return True when stack_id encodes a real get_stackid failure.

    -EFAULT from get_stackid normally just means the stack trace is not
    available (e.g. asking for a kernel stack while in userspace code),
    so it is deliberately not treated as an error.
    """
    if stack_id >= 0:
        return False
    return stack_id != -errno.EFAULT
42
# arguments
# Epilog text shown verbatim under --help (RawDescriptionHelpFormatter
# preserves the layout).
examples = """examples:
    ./offcputime             # trace off-CPU stack time until Ctrl-C
    ./offcputime 5           # trace for 5 seconds only
    ./offcputime -f 5        # 5 seconds, and output in folded format
    ./offcputime -m 1000     # trace only events that last more than 1000 usec
    ./offcputime -M 10000    # trace only events that last less than 10000 usec
    ./offcputime -p 185      # only trace threads for PID 185
    ./offcputime -t 188      # only trace thread 188
    ./offcputime -u          # only trace user threads (no kernel)
    ./offcputime -k          # only trace kernel threads (no user)
    ./offcputime -U          # only show user space stacks (no kernel)
    ./offcputime -K          # only show kernel space stacks (no user)
"""
parser = argparse.ArgumentParser(
    description="Summarize off-CPU time by stack trace",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
# -p/-t/-u/-k are mutually exclusive ways of selecting which threads to trace.
thread_group = parser.add_mutually_exclusive_group()
# Note: this script provides --pid and --tid flags but their arguments are
# referred to internally using kernel nomenclature: TGID and PID.
thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
    help="trace this PID only", type=positive_int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
    help="trace this TID only", type=positive_int)
thread_group.add_argument("-u", "--user-threads-only", action="store_true",
    help="user threads only (no kernel threads)")
thread_group.add_argument("-k", "--kernel-threads-only", action="store_true",
    help="kernel threads only (no user threads)")
# -U/-K are mutually exclusive ways of selecting which stacks to display.
stack_group = parser.add_mutually_exclusive_group()
stack_group.add_argument("-U", "--user-stacks-only", action="store_true",
    help="show stacks from user space only (no kernel space stacks)")
stack_group.add_argument("-K", "--kernel-stacks-only", action="store_true",
    help="show stacks from kernel space only (no user space stacks)")
parser.add_argument("-d", "--delimited", action="store_true",
    help="insert delimiter between kernel/user stacks")
parser.add_argument("-f", "--folded", action="store_true",
    help="output folded format")
parser.add_argument("--stack-storage-size", default=1024,
    type=positive_nonzero_int,
    help="the number of unique stack traces that can be stored and "
         "displayed (default 1024)")
# 99999999 seconds is the "effectively forever" sentinel; the header code
# below treats any duration below it as an explicit user-supplied limit.
parser.add_argument("duration", nargs="?", default=99999999,
    type=positive_nonzero_int,
    help="duration of trace, in seconds")
parser.add_argument("-m", "--min-block-time", default=1,
    type=positive_nonzero_int,
    help="the amount of time in microseconds over which we " +
         "store traces (default 1)")
# Default max is U64_MAX so no upper bound is applied unless requested.
parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,
    type=positive_nonzero_int,
    help="the amount of time in microseconds under which we " +
         "store traces (default U64_MAX)")
parser.add_argument("--state", type=positive_int,
    help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" +
         ") see include/linux/sched.h")
# Hidden flag: print the generated BPF C program and exit.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
folded = args.folded
duration = int(args.duration)
# Set to nonzero to dump the generated program without exiting (dev aid).
debug = 0
105
# signal handler
def signal_ignore(sig, frame):
    """SIGINT handler that swallows the signal during cleanup.

    Only prints a newline so the ^C echoed by the terminal does not run
    into the report output. The first parameter was renamed from
    `signal` to `sig` so it no longer shadows the imported `signal`
    module (handlers are invoked positionally, so callers are unaffected).
    """
    print()
109
# define BPF program
#
# The C source below contains placeholder tokens that are substituted
# before compilation: THREAD_FILTER, STATE_FILTER, STATE_FIELD,
# STACK_STORAGE_SIZE, USER_STACK_GET, KERNEL_STACK_GET, and the
# MINBLOCK_US_VALUE / MAXBLOCK_US_VALUE numeric bounds.
#
# oncpu() is attached to finish_task_switch(): it timestamps the thread
# being switched out (if it passes the filters) and, for the thread being
# switched in, computes how long it was off-CPU and accumulates that
# delta (in us) into `counts` keyed by pid/tgid/stacks/comm.
#
# NOTE(review): warn_event_t stores t_start/t_end as u32 although they
# are assigned u64 nanosecond timestamps, so the values printed by the
# negative-duration warning are truncated — confirm whether this is
# intentional (the warning only needs to show that t_start > t_end).
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

#define MINBLOCK_US    MINBLOCK_US_VALUEULL
#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL

struct key_t {
    u64 pid;
    u64 tgid;
    int user_stack_id;
    int kernel_stack_id;
    char name[TASK_COMM_LEN];
};
BPF_HASH(counts, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

struct warn_event_t {
    u32 pid;
    u32 tgid;
    u32 t_start;
    u32 t_end;
};
BPF_PERF_OUTPUT(warn_events);

int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
    u32 pid = prev->pid;
    u32 tgid = prev->tgid;
    u64 ts, *tsp;

    // record previous thread sleep time
    if ((THREAD_FILTER) && (STATE_FILTER)) {
        ts = bpf_ktime_get_ns();
        start.update(&pid, &ts);
    }

    // get the current thread's start time
    pid = bpf_get_current_pid_tgid();
    tgid = bpf_get_current_pid_tgid() >> 32;
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;        // missed start or filtered
    }

    // calculate current thread's delta time
    u64 t_start = *tsp;
    u64 t_end = bpf_ktime_get_ns();
    start.delete(&pid);
    if (t_start > t_end) {
        struct warn_event_t event = {
            .pid = pid,
            .tgid = tgid,
            .t_start = t_start,
            .t_end = t_end,
        };
        warn_events.perf_submit(ctx, &event, sizeof(event));
        return 0;
    }
    u64 delta = t_end - t_start;
    delta = delta / 1000;
    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
        return 0;
    }

    // create map key
    struct key_t key = {};

    key.pid = pid;
    key.tgid = tgid;
    key.user_stack_id = USER_STACK_GET;
    key.kernel_stack_id = KERNEL_STACK_GET;
    bpf_get_current_comm(&key.name, sizeof(key.name));

    counts.increment(key, delta);
    return 0;
}
"""
189
# Build the thread filter (C expression) plus a human-readable description
# of what is being traced, for the header line.
if args.tgid is not None:
    thread_context = "PID %d" % args.tgid
    thread_filter = 'tgid == %d' % args.tgid
elif args.pid is not None:
    thread_context = "TID %d" % args.pid
    thread_filter = 'pid == %d' % args.pid
elif args.user_threads_only:
    thread_context = "user threads"
    thread_filter = '!(prev->flags & PF_KTHREAD)'
elif args.kernel_threads_only:
    thread_context = "kernel threads"
    thread_filter = 'prev->flags & PF_KTHREAD'
else:
    thread_context = "all threads"
    thread_filter = '1'

# Thread-state filter: state 0 (TASK_RUNNING) needs an explicit equality
# test; any other value is AND-ed as a bitmask against the task state.
if args.state == 0:
    state_filter = 'prev->STATE_FIELD == 0'
elif args.state:
    # these states are sometimes bitmask checked
    state_filter = 'prev->STATE_FIELD & %d' % args.state
else:
    state_filter = '1'

# Splice the filters and tunables into the BPF program text. Order matters:
# STATE_FILTER must be expanded before STATE_FIELD is resolved.
substitutions = [
    ('THREAD_FILTER', thread_filter),
    ('STATE_FILTER', state_filter),
]
# task_struct's state member was renamed to __state in newer kernels;
# probe which name this kernel uses.
if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1:
    substitutions.append(('STATE_FIELD', '__state'))
else:
    substitutions.append(('STATE_FIELD', 'state'))
substitutions += [
    ('STACK_STORAGE_SIZE', str(args.stack_storage_size)),
    ('MINBLOCK_US_VALUE', str(args.min_block_time)),
    ('MAXBLOCK_US_VALUE', str(args.max_block_time)),
]
for placeholder, replacement in substitutions:
    bpf_text = bpf_text.replace(placeholder, replacement)
225
# Choose which stack(s) the BPF program captures; "-1" disables a side
# by storing a constant invalid stack id instead of calling get_stackid.
if args.user_stacks_only:
    stack_context = "user"
    user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)"
    kernel_stack_get = "-1"
elif args.kernel_stacks_only:
    stack_context = "kernel"
    user_stack_get = "-1"
    kernel_stack_get = "stack_traces.get_stackid(ctx, 0)"
else:
    stack_context = "user + kernel"
    user_stack_get = "stack_traces.get_stackid(ctx, BPF_F_USER_STACK)"
    kernel_stack_get = "stack_traces.get_stackid(ctx, 0)"
bpf_text = bpf_text.replace('USER_STACK_GET', user_stack_get)
bpf_text = bpf_text.replace('KERNEL_STACK_GET', kernel_stack_get)

# A delimiter between the two stacks only makes sense when both are shown.
need_delimiter = args.delimited and not (
    args.kernel_stacks_only or args.user_stacks_only)
243
# Reject a combination that the code below would handle without crashing
# but that can never display anything: kernel threads have no user stacks.
if args.kernel_threads_only and args.user_stacks_only:
    print("ERROR: Displaying user stacks for kernel threads "
          "doesn't make sense.", file=stderr)
    exit(1)

# Dump the generated C program when debugging; --ebpf prints it and exits.
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()
255
# initialize BPF
b = BPF(text=bpf_text)
# Attach to finish_task_switch, including compiler-suffixed variants
# (.isra.N). The pattern is a raw string: the original used "\." and "\d"
# in a plain string literal, which are invalid escape sequences
# (DeprecationWarning today, a SyntaxError in future CPython).
b.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
                fn_name="oncpu")
matched = b.num_open_kprobes()
# Bail out if the kprobe regex matched nothing (e.g. unexpected kernel).
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    exit(1)
264
# Print the run header (skipped in folded mode so the output stays
# machine-parsable for flame graph tooling).
if not folded:
    header = "Tracing off-CPU time (us) of %s by %s stack" % (
        thread_context, stack_context)
    # Any duration below the 99999999 sentinel was explicitly requested.
    if duration < 99999999:
        suffix = " for %d secs." % duration
    else:
        suffix = "... Hit Ctrl-C to end."
    print(header + suffix)
273
274
def print_warn_event(cpu, data, size):
    """Perf-buffer callback: report a sample whose end timestamp precedes
    its start.

    See https://github.com/iovisor/bcc/pull/3227 for those wondering how
    this can happen.
    """
    evt = b["warn_events"].event(data)
    msg = ("WARN: Skipped an event with negative duration: "
           "pid:%d, tgid:%d, off-cpu:%d, on-cpu:%d"
           % (evt.pid, evt.tgid, evt.t_start, evt.t_end))
    print(msg, file=stderr)
281
# Poll the warn-event perf buffer until the requested duration elapses.
b["warn_events"].open_perf_buffer(print_warn_event)
try:
    deadline_ms = duration * 1000
    t0_ms = int(BPF.monotonic_time() / 1000000)
    while True:
        now_ms = int(BPF.monotonic_time() / 1000000)
        remaining_ms = deadline_ms - (now_ms - t0_ms)
        if remaining_ms <= 0:
            break
        # Cap the poll timeout so we wake exactly at the deadline.
        b.perf_buffer_poll(timeout=remaining_ms)
except KeyboardInterrupt:
    # as cleanup can take many seconds, trap Ctrl-C:
    signal.signal(signal.SIGINT, signal_ignore)

# Separate the header from the report in interactive mode.
if not folded:
    print()
297
# Walk the counts map (ascending by accumulated off-CPU time) and print
# each entry either in folded (flame-graph) or multi-line format.
missing_stacks = 0
has_enomem = False
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
for k, v in sorted(counts.items(), key=lambda counts: counts[1].value):
    # handle get_stackid errors: count each failed stack id so a summary
    # warning can be printed at the end; remember if any failure was
    # -ENOMEM (stack storage exhausted) to suggest a larger table.
    if not args.user_stacks_only and stack_id_err(k.kernel_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.kernel_stack_id == -errno.ENOMEM
    if not args.kernel_stacks_only and stack_id_err(k.user_stack_id):
        missing_stacks += 1
        has_enomem = has_enomem or k.user_stack_id == -errno.ENOMEM

    # user stacks will be symbolized by tgid, not pid, to avoid the overhead
    # of one symbol resolver per thread
    # (negative ids mean no stack / error; use an empty iterable instead)
    user_stack = [] if k.user_stack_id < 0 else \
        stack_traces.walk(k.user_stack_id)
    kernel_stack = [] if k.kernel_stack_id < 0 else \
        stack_traces.walk(k.kernel_stack_id)

    if folded:
        # print folded stack output: one semicolon-joined line per entry,
        # comm first, user frames innermost-last, then kernel frames.
        user_stack = list(user_stack)
        kernel_stack = list(kernel_stack)
        line = [k.name.decode('utf-8', 'replace')]
        # if we failed to get the stack id, such as due to no space (-ENOMEM) or
        # hash collision (-EEXIST), we still print a placeholder for consistency
        if not args.kernel_stacks_only:
            if stack_id_err(k.user_stack_id):
                line.append("[Missed User Stack]")
            else:
                line.extend([b.sym(addr, k.tgid).decode('utf-8', 'replace')
                    for addr in reversed(user_stack)])
        if not args.user_stacks_only:
            # "-" delimiter only when both stacks are present and requested
            line.extend(["-"] if (need_delimiter and k.kernel_stack_id >= 0 and k.user_stack_id >= 0) else [])
            if stack_id_err(k.kernel_stack_id):
                line.append("[Missed Kernel Stack]")
            else:
                line.extend([b.ksym(addr).decode('utf-8', 'replace')
                    for addr in reversed(kernel_stack)])
        print("%s %d" % (";".join(line), v.value))
    else:
        # print default multi-line stack output: kernel frames first,
        # optional "--" delimiter, user frames, then comm/pid and the
        # accumulated off-CPU time in microseconds.
        if not args.user_stacks_only:
            if stack_id_err(k.kernel_stack_id):
                print("    [Missed Kernel Stack]")
            else:
                for addr in kernel_stack:
                    print("    %s" % b.ksym(addr).decode('utf-8', 'replace'))
        if not args.kernel_stacks_only:
            if need_delimiter and k.user_stack_id >= 0 and k.kernel_stack_id >= 0:
                print("    --")
            if stack_id_err(k.user_stack_id):
                print("    [Missed User Stack]")
            else:
                for addr in user_stack:
                    print("    %s" % b.sym(addr, k.tgid).decode('utf-8', 'replace'))
        print("    %-16s %s (%d)" % ("-", k.name.decode('utf-8', 'replace'), k.pid))
        print("        %d\n" % v.value)
357
# Final diagnostics: some map entries carried error codes instead of
# stack ids; tell the user, and suggest a bigger table if storage ran out.
if missing_stacks > 0:
    advice = " Consider increasing --stack-storage-size." if has_enomem else ""
    print("WARNING: %d stack traces lost and could not be displayed.%s" %
        (missing_stacks, advice),
        file=stderr)
364