#!/usr/bin/env bcc-lua
--[[
Copyright 2016 GitHub, Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
--]]

-- BPF C source handed to BPF:new{text=...}. It defines three probe handlers:
--   trace_pid_start      caches the issuing task's id and comm per request
--   trace_req_start      records the time a request was started
--   trace_req_completion computes the latency and emits one event
local program = [[
#include <uapi/linux/ptrace.h>
#include <linux/blkdev.h>

// Per-request record of the task that was current when the I/O was accounted.
struct val_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// Event record pushed to user space through the "events" perf output.
// The user-space reader declares the same field order and sizes.
struct data_t {
    u32 pid;
    u64 rwflag;     // 1 for a write request, 0 otherwise
    u64 delta;      // start-to-completion time, nanoseconds
    u64 sector;
    u64 len;        // req->__data_len
    u64 ts;         // completion time, microseconds (ktime ns / 1000)
    char disk_name[DISK_NAME_LEN];
    char name[TASK_COMM_LEN];
};

BPF_HASH(start, struct request *);
BPF_HASH(infobyreq, struct request *, struct val_t);
BPF_PERF_OUTPUT(events);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct val_t val = {};

    if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) {
        // The u32 assignment keeps only the low 32 bits of the helper's
        // 64-bit return value.
        // NOTE(review): per the bcc reference guide the low half is the
        // kernel thread id and the high half the tgid -- confirm which one
        // the PID column is meant to show.
        val.pid = bpf_get_current_pid_tgid();
        infobyreq.update(&req, &val);
    }
    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    u64 ts;

    ts = bpf_ktime_get_ns();
    start.update(&req, &ts);

    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    u64 *tsp;
    struct val_t *valp;
    struct data_t data = {};
    u64 ts;

    // fetch timestamp and calculate delta
    tsp = start.lookup(&req);
    if (tsp == 0) {
        // missed tracing issue; drop any cached task info for this request
        // so it does not linger in the map
        infobyreq.delete(&req);
        return 0;
    }
    ts = bpf_ktime_get_ns();
    data.delta = ts - *tsp;
    data.ts = ts / 1000;
    data.len = req->__data_len;

    valp = infobyreq.lookup(&req);
    if (valp == 0) {
        // Unknown issuer: report the name "?". Written as direct stores
        // instead of strcpy() so nothing depends on the compiler lowering
        // a libc call; data was zero-initialised above.
        data.name[0] = '?';
        data.name[1] = 0;
    } else {
        data.pid = valp->pid;
        data.sector = req->__sector;
        bpf_probe_read(&data.name, sizeof(data.name), valp->name);
        bpf_probe_read(&data.disk_name, sizeof(data.disk_name),
                       req->rq_disk->disk_name);
    }

/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    data.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    events.perf_submit(ctx, &data, sizeof(data));
    start.delete(&req);
    infobyreq.delete(&req);

    return 0;
}
]]

local ffi = require("ffi")

--- Script entry point: trace block I/O and print one line per completion.
-- Builds a BPF object from `program`, attaches its handlers to the block
-- layer kprobes, prints a column header and then polls the "events" perf
-- buffer, printing each event as it arrives.
-- @param BPF   object providing `BPF:new{text=...}` (presumably supplied by
--              the bcc-lua runner -- confirm against the caller)
-- @param utils not used by this script
return function(BPF, utils)
  local bpf = BPF:new{text=program}

  bpf:attach_kprobe{event="blk_account_io_start", fn_name="trace_pid_start"}
  bpf:attach_kprobe{event="blk_start_request", fn_name="trace_req_start"}
  bpf:attach_kprobe{event="blk_mq_start_request", fn_name="trace_req_start"}
  bpf:attach_kprobe{event="blk_account_io_completion",
      fn_name="trace_req_completion"}

  -- The string `%` operator used for formatting is not stock Lua; it is
  -- presumably installed by the bcc-lua runtime.
  print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % {"TIME(s)", "COMM", "PID",
    "DISK", "T", "SECTOR", "BYTES", "LAT(ms)"})

  -- Timestamp (microseconds) of the first event received; nil until then.
  -- The TIME(s) column is reported relative to it.
  local first_ts = nil

  --- Perf-buffer callback: format and print a single event.
  -- @param cpu   passed by the perf buffer reader; not used
  -- @param event cdata record with the fields declared in the struct below
  local function print_event(cpu, event)
    local event_pid = event.pid
    local event_delta = tonumber(event.delta)    -- nanoseconds
    local event_sector = tonumber(event.sector)
    local event_len = tonumber(event.len)
    local event_ts = tonumber(event.ts)          -- microseconds
    local event_disk_name = ffi.string(event.disk_name)
    local event_name = ffi.string(event.name)

    -- rwflag is produced by a `!!` expression in the BPF program, so it is
    -- always 0 or 1.
    local rwflg = "R"
    if tonumber(event.rwflag) == 1 then
      rwflg = "W"
    end

    -- The BPF program reports the name "?" when it has no cached task info;
    -- in that case the sector was not filled in, so print -1 instead.
    -- Exact comparison: a real command name that merely contains '?' must
    -- not be mistaken for the placeholder.
    local sector = -1
    if event_name ~= "?" then
      sector = event_sector
    end

    if first_ts == nil then
      first_ts = event_ts
    end
    local elapsed = event_ts - first_ts

    print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % {
      elapsed / 1000000, event_name, event_pid, event_disk_name, rwflg, sector,
      event_len, event_delta / 1000000})
  end

  local TASK_COMM_LEN = 16 -- linux/sched.h
  local DISK_NAME_LEN = 32 -- linux/genhd.h

  -- Field order and types mirror struct data_t in the BPF program. The two
  -- `$` placeholders take DISK_NAME_LEN and TASK_COMM_LEN, in that order.
  -- NOTE(review): the trailing 64 is presumably the buffer size in pages --
  -- confirm against open_perf_buffer.
  bpf:get_table("events"):open_perf_buffer(print_event, [[
    struct {
      uint32_t pid;
      uint64_t rwflag;
      uint64_t delta;
      uint64_t sector;
      uint64_t len;
      uint64_t ts;
      char disk_name[$];
      char name[$];
    }
  ]], {DISK_NAME_LEN, TASK_COMM_LEN}, 64)
  bpf:perf_buffer_poll_loop()
end