#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# biotop  block device (disk) I/O by process.
#         For Linux, uses BCC, eBPF.
#
# USAGE: biotop.py [-h] [-C] [-r MAXROWS] [-p PID] [interval] [count]
#
# This uses in-kernel eBPF maps to cache process details (PID and comm) by I/O
# request, as well as a starting timestamp for calculating I/O latency.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 06-Feb-2016   Brendan Gregg   Created this.
# 17-Mar-2022   Rocky Xing      Added PID filter support.

from __future__ import print_function
from bcc import BPF
from time import sleep, strftime
import argparse
from subprocess import call

# arguments
examples = """examples:
    ./biotop            # block device I/O top, 1 second refresh
    ./biotop -C         # don't clear the screen
    ./biotop -p 181     # only trace PID 181
    ./biotop 5          # 5 second summaries
    ./biotop 5 10       # 5 second summaries, 10 times only
"""
parser = argparse.ArgumentParser(
    description="Block device (disk) I/O by process",
    formatter_class=argparse.RawDescriptionHelpFormatter,
    epilog=examples)
parser.add_argument("-C", "--noclear", action="store_true",
    help="don't clear the screen")
parser.add_argument("-r", "--maxrows", default=20,
    help="maximum rows to print, default 20")
parser.add_argument("-p", "--pid", type=int, metavar="PID",
    help="trace this PID only")
parser.add_argument("interval", nargs="?", default=1,
    help="output interval, in seconds")
parser.add_argument("count", nargs="?", default=99999999,
    help="number of outputs")
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
interval = int(args.interval)
countdown = int(args.count)
maxrows = int(args.maxrows)
clear = not int(args.noclear)

# linux stats
loadavg = "/proc/loadavg"
diskstats = "/proc/diskstats"

# load BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/blk-mq.h>

// for saving the timestamp and __data_len of each request
struct start_req_t {
    u64 ts;
    u64 data_len;
};

// for saving process info by request
struct who_t {
    u32 pid;
    char name[TASK_COMM_LEN];
};

// the key for the output summary
struct info_t {
    u32 pid;
    int rwflag;
    int major;
    int minor;
    char name[TASK_COMM_LEN];
};

// the value of the output summary
struct val_t {
    u64 bytes;
    u64 us;
    u32 io;
};

BPF_HASH(start, struct request *, struct start_req_t);
BPF_HASH(whobyreq, struct request *, struct who_t);
BPF_HASH(counts, struct info_t, struct val_t);

// cache PID and comm by-req
int trace_pid_start(struct pt_regs *ctx, struct request *req)
{
    struct who_t who = {};
    u32 pid;

    if (bpf_get_current_comm(&who.name, sizeof(who.name)) == 0) {
        pid = bpf_get_current_pid_tgid() >> 32;
        if (FILTER_PID)
            return 0;

        who.pid = pid;
        whobyreq.update(&req, &who);
    }

    return 0;
}

// time block I/O
int trace_req_start(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t start_req = {
        .ts = bpf_ktime_get_ns(),
        .data_len = req->__data_len
    };
    start.update(&req, &start_req);
    return 0;
}

// output
int trace_req_completion(struct pt_regs *ctx, struct request *req)
{
    struct start_req_t *startp;

    // fetch timestamp and calculate delta
    startp = start.lookup(&req);
    if (startp == 0) {
        return 0;    // missed tracing issue
    }

    struct who_t *whop;
    u32 pid;

    whop = whobyreq.lookup(&req);
    pid = whop != 0 ? whop->pid : 0;
    if (FILTER_PID) {
        start.delete(&req);
        if (whop != 0) {
            whobyreq.delete(&req);
        }
        return 0;
    }

    struct val_t *valp, zero = {};
    u64 delta_us = (bpf_ktime_get_ns() - startp->ts) / 1000;

    // setup info_t key
    struct info_t info = {};
    info.major = req->__RQ_DISK__->major;
    info.minor = req->__RQ_DISK__->first_minor;
/*
 * The following deals with a kernel version change (in mainline 4.7, although
 * it may be backported to earlier kernels) with how block request write flags
 * are tested. We handle both pre- and post-change versions here. Please avoid
 * kernel version tests like this as much as possible: they inflate the code,
 * test, and maintenance burden.
 */
#ifdef REQ_WRITE
    info.rwflag = !!(req->cmd_flags & REQ_WRITE);
#elif defined(REQ_OP_SHIFT)
    info.rwflag = !!((req->cmd_flags >> REQ_OP_SHIFT) == REQ_OP_WRITE);
#else
    info.rwflag = !!((req->cmd_flags & REQ_OP_MASK) == REQ_OP_WRITE);
#endif

    if (whop == 0) {
        // missed pid who, save stats as pid 0
        valp = counts.lookup_or_try_init(&info, &zero);
    } else {
        info.pid = whop->pid;
        __builtin_memcpy(&info.name, whop->name, sizeof(info.name));
        valp = counts.lookup_or_try_init(&info, &zero);
    }

    if (valp) {
        // save stats
        valp->us += delta_us;
        valp->bytes += startp->data_len;
        valp->io++;
    }

    start.delete(&req);
    whobyreq.delete(&req);

    return 0;
}
"""

if args.ebpf:
    print(bpf_text)
    exit()

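# The kernel interfaces probed below have changed over time: newer kernels
# dropped the request->rq_disk field (the disk is reached via request->q->disk
# instead) and renamed the block I/O accounting functions (e.g.
# blk_account_io_start -> __blk_account_io_start), so the struct field name and
# the kprobe targets are selected at runtime rather than hard-coded.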
if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
    bpf_text = bpf_text.replace('__RQ_DISK__', 'rq_disk')
else:
    bpf_text = bpf_text.replace('__RQ_DISK__', 'q->disk')

if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID', 'pid != %d' % args.pid)
else:
    bpf_text = bpf_text.replace('FILTER_PID', '0')

b = BPF(text=bpf_text)
if BPF.get_kprobe_functions(b'__blk_account_io_start'):
    b.attach_kprobe(event="__blk_account_io_start", fn_name="trace_pid_start")
else:
    b.attach_kprobe(event="blk_account_io_start", fn_name="trace_pid_start")
if BPF.get_kprobe_functions(b'blk_start_request'):
    b.attach_kprobe(event="blk_start_request", fn_name="trace_req_start")
b.attach_kprobe(event="blk_mq_start_request", fn_name="trace_req_start")
if BPF.get_kprobe_functions(b'__blk_account_io_done'):
    b.attach_kprobe(event="__blk_account_io_done", fn_name="trace_req_completion")
else:
    b.attach_kprobe(event="blk_account_io_done", fn_name="trace_req_completion")

print('Tracing... Output every %d secs. Hit Ctrl-C to end' % interval)

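# The first three whitespace-separated fields of each /proc/diskstats line are
# the device's major number, minor number, and name, which is all we need to
# translate the (major, minor) pair in each counts key back to a disk name.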
# cache disk major,minor -> diskname
disklookup = {}
with open(diskstats) as stats:
    for line in stats:
        a = line.split()
        disklookup[a[0] + "," + a[1]] = a[2]

# output
exiting = 0
while 1:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        exiting = 1

    # header
    if clear:
        call("clear")
    else:
        print()
    with open(loadavg) as stats:
        print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))
    print("%-7s %-16s %1s %-3s %-3s %-8s %5s %7s %6s" % ("PID", "COMM",
        "D", "MAJ", "MIN", "DISK", "I/O", "Kbytes", "AVGms"))

    # by-PID output
    counts = b.get_table("counts")
    line = 0
    for k, v in reversed(sorted(counts.items(),
                                key=lambda counts: counts[1].bytes)):

        # lookup disk
        disk = str(k.major) + "," + str(k.minor)
        if disk in disklookup:
            diskname = disklookup[disk]
        else:
            diskname = "?"

        # print line
        avg_ms = (float(v.us) / 1000) / v.io
        print("%-7d %-16s %1s %-3d %-3d %-8s %5s %7s %6.2f" % (k.pid,
            k.name.decode('utf-8', 'replace'), "W" if k.rwflag else "R",
            k.major, k.minor, diskname, v.io, v.bytes / 1024, avg_ms))

        line += 1
        if line >= maxrows:
            break
    counts.clear()

    countdown -= 1
    if exiting or countdown == 0:
        print("Detaching...")
        exit()