1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# biosnoop Trace block device I/O and print details including issuing PID. 5# For Linux, uses BCC, eBPF. 6# 7# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O 8# request, as well as a starting timestamp for calculating I/O latency. 9# 10# Copyright (c) 2015 Brendan Gregg. 11# Licensed under the Apache License, Version 2.0 (the "License") 12# 13# 16-Sep-2015 Brendan Gregg Created this. 14# 11-Feb-2016 Allan McAleavy updated for BPF_PERF_OUTPUT 15 16from __future__ import print_function 17from bcc import BPF 18import ctypes as ct 19import re 20 21# load BPF program 22b = BPF(text=""" 23#include <uapi/linux/ptrace.h> 24#include <linux/blkdev.h> 25 26struct val_t { 27 u32 pid; 28 char name[TASK_COMM_LEN]; 29}; 30 31struct data_t { 32 u32 pid; 33 u64 rwflag; 34 u64 delta; 35 u64 sector; 36 u64 len; 37 u64 ts; 38 char disk_name[DISK_NAME_LEN]; 39 char name[TASK_COMM_LEN]; 40}; 41 42BPF_HASH(start, struct request *); 43BPF_HASH(infobyreq, struct request *, struct val_t); 44BPF_PERF_OUTPUT(events); 45 46// cache PID and comm by-req 47int trace_pid_start(struct pt_regs *ctx, struct request *req) 48{ 49 struct val_t val = {}; 50 51 if (bpf_get_current_comm(&val.name, sizeof(val.name)) == 0) { 52 val.pid = bpf_get_current_pid_tgid(); 53 infobyreq.update(&req, &val); 54 } 55 return 0; 56} 57 58// time block I/O 59int trace_req_start(struct pt_regs *ctx, struct request *req) 60{ 61 u64 ts; 62 63 ts = bpf_ktime_get_ns(); 64 start.update(&req, &ts); 65 66 return 0; 67} 68 69// output 70int trace_req_completion(struct pt_regs *ctx, struct request *req) 71{ 72 u64 *tsp, delta; 73 u32 *pidp = 0; 74 struct val_t *valp; 75 struct data_t data = {}; 76 u64 ts; 77 78 // fetch timestamp and calculate delta 79 tsp = start.lookup(&req); 80 if (tsp == 0) { 81 // missed tracing issue 82 return 0; 83 } 84 ts = bpf_ktime_get_ns(); 85 data.delta = ts - *tsp; 86 data.ts = ts / 1000; 87 88 valp = infobyreq.lookup(&req); 89 if (valp == 0) { 90 data.len = req->__data_len; 91 strcpy(data.name, "?"); 92 } else { 93 data.pid = valp->pid; 94 data.len = req->__data_len; 95 data.sector = req->__sector; 96 bpf_probe_read(&data.name, sizeof(data.name), valp->name); 97 struct gendisk *rq_disk = req->rq_disk; 98 bpf_probe_read(&data.disk_name, sizeof(data.disk_name), 99 rq_disk->disk_name); 100 } 101 102/* 103 * The following deals with a kernel version change (in mainline 4.7, although 104 * it may be backported to earlier kernels) with how block request write flags 105 * are tested. We handle both pre- and post-change versions here. Please avoid 106 * kernel version tests like this as much as possible: they inflate the code, 107 * test, and maintenance burden. 108 */ 109#ifdef REQ_WRITE 110 data.rwflag = !!(req->cmd_flags & REQ_WRITE); 111#elif defined(REQ_OP_SHIFT) 112 data.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE); 113#else 114 data.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE); 115#endif 116 117 events.perf_submit(ctx, &data, sizeof(data)); 118 start.delete(&req); 119 infobyreq.delete(&req); 120 121 return 0; 122} 123""", debug=0) 124b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start") 125b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start") 126b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start") 127b.attach_kprobe(event="blk_account_io_completion", 128 fn_name="trace_req_completion") 129 130TASK_COMM_LEN = 16 # linux/sched.h 131DISK_NAME_LEN = 32 # linux/genhd.h 132 133class Data(ct.Structure): 134 _fields_ = [ 135 ("pid", ct.c_ulonglong), 136 ("rwflag", ct.c_ulonglong), 137 ("delta", ct.c_ulonglong), 138 ("sector", ct.c_ulonglong), 139 ("len", ct.c_ulonglong), 140 ("ts", ct.c_ulonglong), 141 ("disk_name", ct.c_char * DISK_NAME_LEN), 142 ("name", ct.c_char * TASK_COMM_LEN) 143 ] 144 145# header 146print("%-14s %-14s %-6s %-7s %-2s %-9s %-7s %7s" % ("TIME(s)", "COMM", "PID", 147 "DISK", "T", "SECTOR", "BYTES", "LAT(ms)")) 148 149rwflg = "" 150start_ts = 0 151prev_ts = 0 152delta = 0 153 154# process event 155def print_event(cpu, data, size): 156 event = ct.cast(data, ct.POINTER(Data)).contents 157 158 val = -1 159 global start_ts 160 global prev_ts 161 global delta 162 163 if event.rwflag == 1: 164 rwflg = "W" 165 166 if event.rwflag == 0: 167 rwflg = "R" 168 169 if not re.match(b'\?', event.name): 170 val = event.sector 171 172 if start_ts == 0: 173 prev_ts = start_ts 174 175 if start_ts == 1: 176 delta = float(delta) + (event.ts - prev_ts) 177 178 print("%-14.9f %-14.14s %-6s %-7s %-2s %-9s %-7s %7.2f" % ( 179 delta / 1000000, event.name.decode('utf-8', 'replace'), event.pid, 180 event.disk_name.decode('utf-8', 'replace'), rwflg, val, 181 event.len, float(event.delta) / 1000000)) 182 183 prev_ts = event.ts 184 start_ts = 1 185 186# loop with callback to print_event 187b["events"].open_perf_buffer(print_event, page_cnt=64) 188while 1: 189 b.perf_buffer_poll() 190