#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# runqslower    Trace long process scheduling delays.
#               For Linux, uses BCC, eBPF.
#
# This script traces high scheduling delays between a task becoming
# ready to run and it actually running on CPU after that.
#
# USAGE: runqslower [-p PID] [-t TID] [-P] [min_us]
#
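# Output columns, one line per traced event (see print_event() below):
#   TIME     COMM             TID        LAT(us)
# With -P, PREV COMM and PREV TID are appended.
#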
# REQUIRES: Linux 4.9+ (BPF_PROG_TYPE_PERF_EVENT support).
#
# This measures the time a task spends waiting on a run queue for a turn
# on-CPU, and shows this time as individual events. This time should be small,
# but a task may need to wait its turn due to CPU load.
#
# This measures two types of run queue latency:
# 1. The time from a task being enqueued on a run queue to its context switch
#    and execution. This traces ttwu_do_wakeup(), wake_up_new_task() ->
#    finish_task_switch() with either raw tracepoints (if supported) or kprobes
#    and instruments the run queue latency after a voluntary context switch.
# 2. The time from when a task was involuntarily context switched and still
#    in the runnable state, to when it next executed. This is instrumented
#    from finish_task_switch() alone.
#
# Copyright 2016 Cloudflare, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 02-May-2018   Ivan Babrou   Created this.
# 18-Nov-2019   Gergely Bod   BUG fix: Use bpf_probe_read_kernel_str() to extract the
#                               process name from 'task_struct* next' in raw tp code.
#                               bpf_get_current_comm() operates on the current task,
#                               which might already be different from 'next'.

from __future__ import print_function
from bcc import BPF
import argparse
from time import strftime

# arguments
examples = """examples:
    ./runqslower         # trace run queue latency higher than 10000 us (default)
    ./runqslower 1000    # trace run queue latency higher than 1000 us
    ./runqslower -p 123  # trace pid 123
    ./runqslower -t 123  # trace tid 123 (use for threads only)
    ./runqslower -P      # also show previous task comm and TID
"""
parser = argparse.ArgumentParser(
    description="Trace high run queue latency",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("min_us", nargs="?", default='10000',
    help="minimum run queue latency to trace, in us (default 10000)")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)

thread_group = parser.add_mutually_exclusive_group()
thread_group.add_argument("-p", "--pid", metavar="PID", dest="pid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="tid",
    help="trace this TID only", type=int)
thread_group.add_argument("-P", "--previous", action="store_true",
    help="also show previous task name and TID")
args = parser.parse_args()

min_us = int(args.min_us)
debug = 0

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
#include <linux/nsproxy.h>
#include <linux/pid_namespace.h>

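// enqueue timestamps: thread id (pid) -> bpf_ktime_get_ns() at enqueue/wakeup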
BPF_HASH(start, u32);

struct rq;

struct data_t {
    u32 pid;
    u32 prev_pid;
    char task[TASK_COMM_LEN];
    char prev_task[TASK_COMM_LEN];
    u64 delta_us;
};

BPF_PERF_OUTPUT(events);

// record enqueue timestamp
static int trace_enqueue(u32 tgid, u32 pid)
{
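    // pid is the kernel thread id (tid); skip the idle task (pid 0) and
    // anything excluded by the PID/TGID filters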
    if (FILTER_PID || FILTER_TGID || pid == 0)
        return 0;
    u64 ts = bpf_ktime_get_ns();
    start.update(&pid, &ts);
    return 0;
}
"""

bpf_text_kprobe = """
int trace_wake_up_new_task(struct pt_regs *ctx, struct task_struct *p)
{
    return trace_enqueue(p->tgid, p->pid);
}

int trace_ttwu_do_wakeup(struct pt_regs *ctx, struct rq *rq, struct task_struct *p,
    int wake_flags)
{
    return trace_enqueue(p->tgid, p->pid);
}

// calculate latency
int trace_run(struct pt_regs *ctx, struct task_struct *prev)
{
    u32 pid, tgid, prev_pid;

    // ivcsw: treat like an enqueue event and store timestamp
    prev_pid = prev->pid;
    if (prev->STATE_FIELD == TASK_RUNNING) {
        tgid = prev->tgid;
        u64 ts = bpf_ktime_get_ns();
        if (prev_pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&prev_pid, &ts);
            }
        }
    }

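    // bpf_get_current_pid_tgid() returns the tgid in the upper 32 bits and the
    // thread id in the lower 32 bits; assigning to a u32 keeps only the tid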
    pid = bpf_get_current_pid_tgid();

    u64 *tsp, delta_us;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.prev_pid = prev_pid;
    data.delta_us = delta_us;
    bpf_get_current_comm(&data.task, sizeof(data.task));
    bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&pid);
    return 0;
}
"""

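# Raw tracepoint variant: raw tracepoints (Linux 4.17+) attach directly to the
# tracepoint handler and have lower overhead than kprobes; selected below when
# BPF.support_raw_tracepoint() reports support.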
bpf_text_raw_tp = """
RAW_TRACEPOINT_PROBE(sched_wakeup)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    return trace_enqueue(p->tgid, p->pid);
}

RAW_TRACEPOINT_PROBE(sched_wakeup_new)
{
    // TP_PROTO(struct task_struct *p)
    struct task_struct *p = (struct task_struct *)ctx->args[0];
    u32 tgid, pid;

    bpf_probe_read_kernel(&tgid, sizeof(tgid), &p->tgid);
    bpf_probe_read_kernel(&pid, sizeof(pid), &p->pid);
    return trace_enqueue(tgid, pid);
}

RAW_TRACEPOINT_PROBE(sched_switch)
{
    // TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next)
    struct task_struct *prev = (struct task_struct *)ctx->args[1];
    struct task_struct *next = (struct task_struct *)ctx->args[2];
    u32 tgid, pid, prev_pid;
    long state;

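    // the raw tracepoint arguments are plain kernel pointers, so prev's state
    // and pid fields are read here with explicit bpf_probe_read_kernel() calls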
    // ivcsw: treat like an enqueue event and store timestamp
    bpf_probe_read_kernel(&state, sizeof(long), (const void *)&prev->STATE_FIELD);
    bpf_probe_read_kernel(&prev_pid, sizeof(prev->pid), &prev->pid);
    if (state == TASK_RUNNING) {
        bpf_probe_read_kernel(&tgid, sizeof(prev->tgid), &prev->tgid);
        u64 ts = bpf_ktime_get_ns();
        if (prev_pid != 0) {
            if (!(FILTER_PID) && !(FILTER_TGID)) {
                start.update(&prev_pid, &ts);
            }
        }
    }

    bpf_probe_read_kernel(&pid, sizeof(next->pid), &next->pid);

    u64 *tsp, delta_us;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;   // missed enqueue
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;

    if (FILTER_US)
        return 0;

    struct data_t data = {};
    data.pid = pid;
    data.prev_pid = prev_pid;
    data.delta_us = delta_us;
    bpf_probe_read_kernel_str(&data.task, sizeof(data.task), next->comm);
    bpf_probe_read_kernel_str(&data.prev_task, sizeof(data.prev_task), prev->comm);

    // output
    events.perf_submit(ctx, &data, sizeof(data));

    start.delete(&pid);
    return 0;
}
"""

is_support_raw_tp = BPF.support_raw_tracepoint()
if is_support_raw_tp:
    bpf_text += bpf_text_raw_tp
else:
    bpf_text += bpf_text_kprobe

# code substitutions
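# STATE_FIELD: the task_struct run-state field was renamed from 'state' to
# '__state' around Linux 5.14, so use whichever this kernel provides.
# FILTER_US / FILTER_PID / FILTER_TGID expand to filter expressions derived
# from the command line arguments ('0' leaves a filter disabled).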
if BPF.kernel_struct_has_field(b'task_struct', b'__state') == 1:
    bpf_text = bpf_text.replace('STATE_FIELD', '__state')
else:
    bpf_text = bpf_text.replace('STATE_FIELD', 'state')
if min_us == 0:
    bpf_text = bpf_text.replace('FILTER_US', '0')
else:
    bpf_text = bpf_text.replace('FILTER_US', 'delta_us <= %s' % str(min_us))

if args.tid:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % args.tid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

if args.pid:
    bpf_text = bpf_text.replace('FILTER_TGID', 'tgid != %s' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_TGID', '0')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# process event
def print_event(cpu, data, size):
    event = b["events"].event(data)
    if args.previous:
        print("%-8s %-16s %-6s %14s %-16s %-6s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us, event.prev_task, event.prev_pid))
    else:
        print("%-8s %-16s %-6s %14s" % (strftime("%H:%M:%S"), event.task, event.pid, event.delta_us))

# load BPF program
b = BPF(text=bpf_text)
if not is_support_raw_tp:
    b.attach_kprobe(event="ttwu_do_wakeup", fn_name="trace_ttwu_do_wakeup")
    b.attach_kprobe(event="wake_up_new_task", fn_name="trace_wake_up_new_task")
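    # some kernels rename finish_task_switch to finish_task_switch.isra.N when
    # the compiler applies ISRA optimization, so match both symbol names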
    b.attach_kprobe(event_re=r"^finish_task_switch$|^finish_task_switch\.isra\.\d$",
                    fn_name="trace_run")

print("Tracing run queue latency higher than %d us" % min_us)
if args.previous:
    print("%-8s %-16s %-6s %14s %-16s %-6s" % ("TIME", "COMM", "TID", "LAT(us)", "PREV COMM", "PREV TID"))
else:
    print("%-8s %-16s %-6s %14s" % ("TIME", "COMM", "TID", "LAT(us)"))

# read events
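# page_cnt=64 enlarges the per-CPU perf ring buffers (in pages) to reduce the
# chance of lost events under load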
284b["events"].open_perf_buffer(print_event, page_cnt=64)
285while 1:
286    try:
287        b.perf_buffer_poll()
288    except KeyboardInterrupt:
289        exit()
290