• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env python
2# @lint-avoid-python-3-compatibility-imports
3#
4# biosnoop  Trace block device I/O and print details including issuing PID.
5#       For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
8# request, as well as a starting timestamp for calculating I/O latency.
9#
10# Copyright (c) 2015 Brendan Gregg.
11# Licensed under the Apache License, Version 2.0 (the "License")
12#
13# 16-Sep-2015   Brendan Gregg   Created this.
14# 11-Feb-2016   Allan McAleavy  updated for BPF_PERF_OUTPUT
15
16from __future__ import print_function
17from bcc import BPF
18import ctypes as ct
19import re
20
21# load BPF program
22b = BPF(text="""
23#include <uapi/linux/ptrace.h>
24#include <linux/blkdev.h>
25
26struct val_t {
27    u32 pid;
28    char name[TASK_COMM_LEN];
29};
30
31struct data_t {
32    u32 pid;
33    u64 rwflag;
34    u64 delta;
35    u64 sector;
36    u64 len;
37    u64 ts;
38    char disk_name[DISK_NAME_LEN];
39    char name[TASK_COMM_LEN];
40};
41
42BPF_HASH(start, struct request *);
43BPF_HASH(infobyreq, struct request *, struct val_t);
44BPF_PERF_OUTPUT(events);
45
46// cache PID and comm by-req
47int trace_pid_start(struct pt_regs *ctx, struct request *req)
48{
49    struct val_t val = {};
50
51    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
52        val.pid = bpf_get_current_pid_tgid();
53        infobyreq.update(&req, &val);
54    }
55    return 0;
56}
57
58// time block I/O
59int trace_req_start(struct pt_regs *ctx, struct request *req)
60{
61    u64 ts;
62
63    ts = bpf_ktime_get_ns();
64    start.update(&req, &ts);
65
66    return 0;
67}
68
69// output
70int trace_req_completion(struct pt_regs *ctx, struct request *req)
71{
72    u64 *tsp, delta;
73    u32 *pidp = 0;
74    struct val_t *valp;
75    struct data_t data = {};
76    u64 ts;
77
78    // fetch timestamp and calculate delta
79    tsp = start.lookup(&req);
80    if (tsp == 0) {
81        // missed tracing issue
82        return 0;
83    }
84    ts = bpf_ktime_get_ns();
85    data.delta = ts - *tsp;
86    data.ts = ts / 1000;
87
88    valp = infobyreq.lookup(&req);
89    if (valp == 0) {
90        data.len = req->__data_len;
91        strcpy(data.name, "?");
92    } else {
93        data.pid = valp->pid;
94        data.len = req->__data_len;
95        data.sector = req->__sector;
96        bpf_probe_read(&data.name, sizeof(data.name), valp->name);
97        struct gendisk *rq_disk = req->rq_disk;
98        bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
99                       rq_disk->disk_name);
100    }
101
102/*
103 * The following deals with a kernel version change (in mainline 4.7, although
104 * it may be backported to earlier kernels) with how block request write flags
105 * are tested. We handle both pre- and post-change versions here. Please avoid
106 * kernel version tests like this as much as possible: they inflate the code,
107 * test, and maintenance burden.
108 */
109#ifdef REQ_WRITE
110    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
111#elif defined(REQ_OP_SHIFT)
112    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
113#else
114    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
115#endif
116
117    events.perf_submit(ctx, &data, sizeof(data));
118    start.delete(&req);
119    infobyreq.delete(&req);
120
121    return 0;
122}
123""", debug=0)
124b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
125if BPF.get_kprobe_functions(b'blk_start_request'):
126    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
127b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
128b.attach_kprobe(event="blk_account_io_completion",
129    fn_name="trace_req_completion")
130
131TASK_COMM_LEN = 16  # linux/sched.h
132DISK_NAME_LEN = 32  # linux/genhd.h
133
134class Data(ct.Structure):
135    _fields_ = [
136        ("pid", ct.c_ulonglong),
137        ("rwflag", ct.c_ulonglong),
138        ("delta", ct.c_ulonglong),
139        ("sector", ct.c_ulonglong),
140        ("len", ct.c_ulonglong),
141        ("ts", ct.c_ulonglong),
142        ("disk_name", ct.c_char * DISK_NAME_LEN),
143        ("name", ct.c_char * TASK_COMM_LEN)
144    ]
145
146# header
147print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
148    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
149
150rwflg = ""
151start_ts = 0
152prev_ts = 0
153delta = 0
154
155# process event
156def print_event(cpu, data, size):
157    event = ct.cast(data, ct.POINTER(Data)).contents
158
159    val = -1
160    global start_ts
161    global prev_ts
162    global delta
163
164    if event.rwflag == 1:
165        rwflg = "W"
166
167    if event.rwflag == 0:
168        rwflg = "R"
169
170    if not re.match(b'\?', event.name):
171        val = event.sector
172
173    if start_ts == 0:
174        prev_ts = start_ts
175
176    if start_ts == 1:
177        delta = float(delta) + (event.ts - prev_ts)
178
179    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
180        delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
181        event.disk_name.decode('utf-8', 'replace'), rwflg, val,
182        event.len, float(event.delta) / 1000000))
183
184    prev_ts = event.ts
185    start_ts = 1
186
187# loop with callback to print_event
188b["events"].open_perf_buffer(print_event, page_cnt=64)
189while 1:
190    try:
191        b.perf_buffer_poll()
192    except KeyboardInterrupt:
193        exit()
194