• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# biosnoop  Trace block device I/O and print details including issuing PID.
5#       For Linux, uses BCC, eBPF.
6#
7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
8# request, as well as a starting timestamp for calculating I/O latency.
9#
10# Copyright (c) 2015 Brendan Gregg.
11# Licensed under the Apache License, Version 2.0 (the "License")
12#
13# 16-Sep-2015   Brendan Gregg   Created this.
14# 11-Feb-2016   Allan McAleavy  updated for BPF_PERF_OUTPUT
15
16from __future__ import print_function
17from bcc import BPF
18import ctypes as ct
19import re
20
21# load BPF program
22b = BPF(text="""
23#include <uapi/linux/ptrace.h>
24#include <linux/blkdev.h>
25
26struct val_t {
27    u32 pid;
28    char name[TASK_COMM_LEN];
29};
30
31struct data_t {
32    u32 pid;
33    u64 rwflag;
34    u64 delta;
35    u64 sector;
36    u64 len;
37    u64 ts;
38    char disk_name[DISK_NAME_LEN];
39    char name[TASK_COMM_LEN];
40};
41
42BPF_HASH(start, struct request *);
43BPF_HASH(infobyreq, struct request *, struct val_t);
44BPF_PERF_OUTPUT(events);
45
46// cache PID and comm by-req
47int trace_pid_start(struct pt_regs *ctx, struct request *req)
48{
49    struct val_t val = {};
50
51    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
52        val.pid = bpf_get_current_pid_tgid();
53        infobyreq.update(&req, &val);
54    }
55    return 0;
56}
57
58// time block I/O
59int trace_req_start(struct pt_regs *ctx, struct request *req)
60{
61    u64 ts;
62
63    ts = bpf_ktime_get_ns();
64    start.update(&req, &ts);
65
66    return 0;
67}
68
69// output
70int trace_req_completion(struct pt_regs *ctx, struct request *req)
71{
72    u64 *tsp, delta;
73    u32 *pidp = 0;
74    struct val_t *valp;
75    struct data_t data = {};
76    u64 ts;
77
78    // fetch timestamp and calculate delta
79    tsp = start.lookup(&req);
80    if (tsp == 0) {
81        // missed tracing issue
82        return 0;
83    }
84    ts = bpf_ktime_get_ns();
85    data.delta = ts - *tsp;
86    data.ts = ts / 1000;
87
88    valp = infobyreq.lookup(&req);
89    if (valp == 0) {
90        data.len = req->__data_len;
91        strcpy(data.name, "?");
92    } else {
93        data.pid = valp->pid;
94        data.len = req->__data_len;
95        data.sector = req->__sector;
96        bpf_probe_read(&data.name, sizeof(data.name), valp->name);
97        struct gendisk *rq_disk = req->rq_disk;
98        bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
99                       rq_disk->disk_name);
100    }
101
102/*
103 * The following deals with a kernel version change (in mainline 4.7, although
104 * it may be backported to earlier kernels) with how block request write flags
105 * are tested. We handle both pre- and post-change versions here. Please avoid
106 * kernel version tests like this as much as possible: they inflate the code,
107 * test, and maintenance burden.
108 */
109#ifdef REQ_WRITE
110    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
111#elif defined(REQ_OP_SHIFT)
112    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
113#else
114    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
115#endif
116
117    events.perf_submit(ctx, &data, sizeof(data));
118    start.delete(&req);
119    infobyreq.delete(&req);
120
121    return 0;
122}
123""", debug=0)
124b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
125b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
126b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
127b.attach_kprobe(event="blk_account_io_completion",
128    fn_name="trace_req_completion")
129
130TASK_COMM_LEN = 16  # linux/sched.h
131DISK_NAME_LEN = 32  # linux/genhd.h
132
133class Data(ct.Structure):
134    _fields_ = [
135        ("pid", ct.c_ulonglong),
136        ("rwflag", ct.c_ulonglong),
137        ("delta", ct.c_ulonglong),
138        ("sector", ct.c_ulonglong),
139        ("len", ct.c_ulonglong),
140        ("ts", ct.c_ulonglong),
141        ("disk_name", ct.c_char * DISK_NAME_LEN),
142        ("name", ct.c_char * TASK_COMM_LEN)
143    ]
144
145# header
146print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID",
147    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"))
148
149rwflg = ""
150start_ts = 0
151prev_ts = 0
152delta = 0
153
154# process event
155def print_event(cpu, data, size):
156    event = ct.cast(data, ct.POINTER(Data)).contents
157
158    val = -1
159    global start_ts
160    global prev_ts
161    global delta
162
163    if event.rwflag == 1:
164        rwflg = "W"
165
166    if event.rwflag == 0:
167        rwflg = "R"
168
169    if not re.match(b'\?', event.name):
170        val = event.sector
171
172    if start_ts == 0:
173        prev_ts = start_ts
174
175    if start_ts == 1:
176        delta = float(delta) + (event.ts - prev_ts)
177
178    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % (
179        delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid,
180        event.disk_name.decode('utf-8', 'replace'), rwflg, val,
181        event.len, float(event.delta) / 1000000))
182
183    prev_ts = event.ts
184    start_ts = 1
185
186# loop with callback to print_event
187b["events"].open_perf_buffer(print_event, page_cnt=64)
188while 1:
189    b.perf_buffer_poll()
190