1#!/usr/bin/env python 2# @lint-avoid-python-3-compatibility-imports 3# 4# biosnoop Trace block device I/O and print details including issuing PID. 5# For Linux, uses BCC, eBPF. 6# 7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O 8# request, as well as a starting timestamp for calculating I/O latency. 9# 10# Copyright (c) 2015 Brendan Gregg. 11# Licensed under the Apache License, Version 2.0 (the "License") 12# 13# 16-Sep-2015 Brendan Gregg Created this. 14# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT 15 16from __future__ import print_function 17from bcc import BPF 18import ctypes as ct 19import re 20 21# load BPF program 22b = BPF(text=""" 23#include <uapi/linux/ptrace.h> 24#include <linux/blkdev.h> 25 26struct val_t { 27 u32 pid; 28 char name[TASK_COMM_LEN]; 29}; 30 31struct data_t { 32 u32 pid; 33 u64 rwflag; 34 u64 delta; 35 u64 sector; 36 u64 len; 37 u64 ts; 38 char disk_name[DISK_NAME_LEN]; 39 char name[TASK_COMM_LEN]; 40}; 41 42BPF_HASH(start, struct request *); 43BPF_HASH(infobyreq, struct request *, struct val_t); 44BPF_PERF_OUTPUT(events); 45 46// cache PID and comm by-req 47int trace_pid_start(struct pt_regs *ctx, struct request *req) 48{ 49 struct val_t val = {}; 50 51 if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { 52 val.pid = bpf_get_current_pid_tgid(); 53 infobyreq.update(&req, &val); 54 } 55 return 0; 56} 57 58// time block I/O 59int trace_req_start(struct pt_regs *ctx, struct request *req) 60{ 61 u64 ts; 62 63 ts = bpf_ktime_get_ns(); 64 start.update(&req, &ts); 65 66 return 0; 67} 68 69// output 70int trace_req_completion(struct pt_regs *ctx, struct request *req) 71{ 72 u64 *tsp, delta; 73 u32 *pidp = 0; 74 struct val_t *valp; 75 struct data_t data = {}; 76 u64 ts; 77 78 // fetch timestamp and calculate delta 79 tsp = start.lookup(&req); 80 if (tsp == 0) { 81 // missed tracing issue 82 return 0; 83 } 84 ts = bpf_ktime_get_ns(); 85 data.delta = ts - *tsp; 86 data.ts = ts / 1000; 87 88 valp = infobyreq.lookup(&req); 89 if (valp == 0) { 90 data.len = req->__data_len; 91 strcpy(data.name, "?"); 92 } else { 93 data.pid = valp->pid; 94 data.len = req->__data_len; 95 data.sector = req->__sector; 96 bpf_probe_read(&data.name, sizeof(data.name), valp->name); 97 struct gendisk *rq_disk = req->rq_disk; 98 bpf_probe_read(&data.disk_name, sizeof(data.disk_name), 99 rq_disk->disk_name); 100 } 101 102/* 103 * The following deals with a kernel version change (in mainline 4.7, although 104 * it may be backported to earlier kernels) with how block request write flags 105 * are tested. We handle both pre- and post-change versions here. Please avoid 106 * kernel version tests like this as much as possible: they inflate the code, 107 * test, and maintenance burden. 108 */ 109#ifdef REQ_WRITE 110 data.rwflag = !!(req->cmd_flags & REQ_WRITE); 111#elif defined(REQ_OP_SHIFT) 112 data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 113#else 114 data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 115#endif 116 117 events.perf_submit(ctx, &data, sizeof(data)); 118 start.delete(&req); 119 infobyreq.delete(&req); 120 121 return 0; 122} 123""", debug=0) 124b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") 125if BPF.get_kprobe_functions(b'blk_start_request'): 126 b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 127b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 128b.attach_kprobe(event="blk_account_io_completion", 129 fn_name="trace_req_completion") 130 131TASK_COMM_LEN = 16 # linux/sched.h 132DISK_NAME_LEN = 32 # linux/genhd.h 133 134class Data(ct.Structure): 135 _fields_ = [ 136 ("pid", ct.c_ulonglong), 137 ("rwflag", ct.c_ulonglong), 138 ("delta", ct.c_ulonglong), 139 ("sector", ct.c_ulonglong), 140 ("len", ct.c_ulonglong), 141 ("ts", ct.c_ulonglong), 142 ("disk_name", ct.c_char * DISK_NAME_LEN), 143 ("name", ct.c_char * TASK_COMM_LEN) 144 ] 145 146# header 147print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID", 148 "DISK", "T", "SECTOR", "BYTES", "LAT(ms)")) 149 150rwflg = "" 151start_ts = 0 152prev_ts = 0 153delta = 0 154 155# process event 156def print_event(cpu, data, size): 157 event = ct.cast(data, ct.POINTER(Data)).contents 158 159 val = -1 160 global start_ts 161 global prev_ts 162 global delta 163 164 if event.rwflag == 1: 165 rwflg = "W" 166 167 if event.rwflag == 0: 168 rwflg = "R" 169 170 if not re.match(b'\?', event.name): 171 val = event.sector 172 173 if start_ts == 0: 174 prev_ts = start_ts 175 176 if start_ts == 1: 177 delta = float(delta) + (event.ts - prev_ts) 178 179 print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % ( 180 delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid, 181 event.disk_name.decode('utf-8', 'replace'), rwflg, val, 182 event.len, float(event.delta) / 1000000)) 183 184 prev_ts = event.ts 185 start_ts = 1 186 187# loop with callback to print_event 188b["events"].open_perf_buffer(print_event, page_cnt=64) 189while 1: 190 try: 191 b.perf_buffer_poll() 192 except KeyboardInterrupt: 193 exit() 194