#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop  block device (disk) I/O by process.
#         For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016   Brendan Gregg   Created this.
# 17-Mar-2022   Rocky Xing      Added PID filter support.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
from subprocess import call

# arguments
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop -p 181     # only trace PID 181
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
# type=int on the numeric options makes argparse reject non-numeric input
# with a usage error instead of an uncaught ValueError later in the script.
parser.add_argument("-r", "--maxrows", type=int, default=20,
    help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", type=int, metavar="PID",
    help="trace this PID only")
parser.add_argument("interval", nargs="?", type=int, default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", type=int, default=99999999,
    help="number of outputs")
# --ebpf is hidden from --help; it prints the BPF program text and exits.
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = args.interval        # seconds between summaries
countdown = args.count          # summaries left to print
maxrows = args.maxrows          # row cap per summary
clear = not args.noclear        # clear the screen before each summary

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"

# load BPF program
#
# The C text below carries two placeholders that are substituted with
# str.replace() before the program is compiled:
#   FILTER_PID   - a C expression; the probes skip the request when it is true
#   __RQ_DISK__  - the member path from struct request to its disk
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blk-mq.h>

// for saving the timestamp and __data_len of each request
struct start_req_t {
    u64 ts;
    u64 data_len;
};

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *, struct start_req_t);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};
    u32 pid;

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        pid = bpf_get_current_pid_tgid() >> 32;
        if (FILTER_PID)
            return 0;

        who.pid = pid;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        .data_len = req->__data_len
    };
    start.update(&req, &start_req);
    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&req);
    if (startp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    u32 pid;

    whop = whobyreq.lookup(&req);
    pid = whop != 0 ? whop->pid : 0;
    if (FILTER_PID) {
        start.delete(&req);
        if (whop != 0) {
            whobyreq.delete(&req);
        }
        return 0;
    }

    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->__RQ_DISK__->major;
    info.minor = req->__RQ_DISK__->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
"""

# --ebpf: print the program text as-is (placeholders not yet substituted)
# and stop before anything is compiled or attached.
if args.ebpf:
    print(bpf_text)
    exit()

# Use req->rq_disk when struct request has that field, else req->q->disk.
disk_member = ('rq_disk'
               if BPF.kernel_struct_has_field(b'request', b'rq_disk')
               else 'q->disk')
bpf_text = bpf_text.replace('__RQ_DISK__', disk_member)

# FILTER_PID becomes a C condition that is true for requests to skip;
# a literal 0 disables filtering.
pid_condition = '0' if args.pid is None else 'pid != %d' % args.pid
bpf_text = bpf_text.replace('FILTER_PID', pid_condition)

b = BPF(text=bpf_text)

# Prefer the double-underscore accounting symbols when they can be probed.
issue_event = ("__blk_account_io_start"
               if BPF.get_kprobe_functions(b'__blk_account_io_start')
               else "blk_account_io_start")
b.attach_kprobe(event=issue_event, fn_name="trace_pid_start")

if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")

done_event = ("__blk_account_io_done"
              if BPF.get_kprobe_functions(b'__blk_account_io_done')
              else "blk_account_io_done")
b.attach_kprobe(event=done_event, fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

# cache disk major,minor -> diskname
# Built once at startup from the first three whitespace-separated fields of
# each diskstats line: key is "<field0>,<field1>", value is field 2.
with open(diskstats) as stats:
    disklookup = {
        ",".join(fields[:2]): fields[2]
        for fields in (row.split() for row in stats)
    }

# output
# Each pass: wait one interval, print a header and up to `maxrows` rows from
# the BPF "counts" table (largest byte totals first), then reset the table.
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # Ctrl-C: still print what was gathered so far, then exit below.
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in reversed(sorted(counts.items(),
                                key=lambda item: item[1].bytes)):

        # lookup disk; "?" when major,minor was not in diskstats at startup
        diskname = disklookup.get(str(k.major) + "," + str(k.minor), "?")

        # Average latency in ms. The zero guard avoids a ZeroDivisionError
        # should an entry ever be read with no completed I/O counted.
        avg_ms = (float(v.us) / 1000) / v.io if v.io else 0.0

        # Floor division keeps Kbytes a whole number on Python 2 and 3;
        # true division would print a float such as "4.0" on Python 3.
        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes // 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()
