1#!/usr/bin/python 2# 3# biolatpcts.py Monitor IO latency distribution of a block device. 4# 5# $ ./biolatpcts.py /dev/nvme0n1 6# nvme0n1 p1 p5 p10 p16 p25 p50 p75 p84 p90 p95 p99 p100 7# read 95us 175us 305us 515us 895us 985us 995us 1.5ms 2.5ms 3.5ms 4.5ms 10ms 8# write 5us 5us 5us 15us 25us 135us 765us 855us 885us 895us 965us 1.5ms 9# discard 5us 5us 5us 5us 135us 145us 165us 205us 385us 875us 1.5ms 2.5ms 10# flush 5us 5us 5us 5us 5us 5us 5us 5us 5us 1.5ms 4.5ms 5.5ms 11# 12# Copyright (C) 2020 Tejun Heo <tj@kernel.org> 13# Copyright (C) 2020 Facebook 14 15from __future__ import print_function 16from bcc import BPF 17from time import sleep 18from threading import Event 19import argparse 20import json 21import sys 22import os 23import signal 24 25description = """ 26Monitor IO latency distribution of a block device 27""" 28 29epilog = """ 30When interval is infinite, biolatpcts will print out result once the 31initialization is complete to indicate readiness. After initialized, 32biolatpcts will output whenever it receives SIGUSR1/2 and before exiting on 33SIGINT, SIGTERM or SIGHUP. 34 35SIGUSR1 starts a new period after reporting. SIGUSR2 doesn't and can be used 36to monitor progress without affecting accumulation of data points. They can 37be used to obtain latency distribution between two arbitrary events and 38monitor progress inbetween. 39""" 40 41parser = argparse.ArgumentParser(description = description, epilog = epilog, 42 formatter_class = argparse.ArgumentDefaultsHelpFormatter) 43parser.add_argument('dev', metavar='DEV', type=str, 44 help='Target block device (/dev/DEVNAME, DEVNAME or MAJ:MIN)') 45parser.add_argument('-i', '--interval', type=int, default=3, 46 help='Report interval (0: exit after startup, -1: infinite)') 47parser.add_argument('-w', '--which', choices=['from-rq-alloc', 'after-rq-alloc', 'on-device'], 48 default='on-device', help='Which latency to measure') 49parser.add_argument('-p', '--pcts', metavar='PCT,...', type=str, 50 default='1,5,10,16,25,50,75,84,90,95,99,100', 51 help='Percentiles to calculate') 52parser.add_argument('-j', '--json', action='store_true', 53 help='Output in json') 54parser.add_argument('--verbose', '-v', action='count', default = 0) 55 56bpf_source = """ 57#include <linux/blk_types.h> 58#include <linux/blkdev.h> 59#include <linux/blk-mq.h> 60#include <linux/time64.h> 61 62BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400); 63BPF_PERCPU_ARRAY(rwdf_1ms, u64, 400); 64BPF_PERCPU_ARRAY(rwdf_10us, u64, 400); 65 66void kprobe_blk_account_io_done(struct pt_regs *ctx, struct request *rq, u64 now) 67{ 68 unsigned int cmd_flags; 69 u64 dur; 70 size_t base, slot; 71 72 if (!rq->__START_TIME_FIELD__) 73 return; 74 75 if (!rq->__RQ_DISK__ || 76 rq->__RQ_DISK__->major != __MAJOR__ || 77 rq->__RQ_DISK__->first_minor != __MINOR__) 78 return; 79 80 cmd_flags = rq->cmd_flags; 81 switch (cmd_flags & REQ_OP_MASK) { 82 case REQ_OP_READ: 83 base = 0; 84 break; 85 case REQ_OP_WRITE: 86 base = 100; 87 break; 88 case REQ_OP_DISCARD: 89 base = 200; 90 break; 91 case REQ_OP_FLUSH: 92 base = 300; 93 break; 94 default: 95 return; 96 } 97 98 dur = now - rq->__START_TIME_FIELD__; 99 100 slot = min_t(size_t, div_u64(dur, 100 * NSEC_PER_MSEC), 99); 101 rwdf_100ms.increment(base + slot); 102 if (slot) 103 return; 104 105 slot = min_t(size_t, div_u64(dur, NSEC_PER_MSEC), 99); 106 rwdf_1ms.increment(base + slot); 107 if (slot) 108 return; 109 110 slot = min_t(size_t, div_u64(dur, 10 * NSEC_PER_USEC), 99); 111 rwdf_10us.increment(base + slot); 112} 113""" 114 115args = parser.parse_args() 116args.pcts = args.pcts.split(',') 117args.pcts.sort(key=lambda x: float(x)) 118 119try: 120 major = int(args.dev.split(':')[0]) 121 minor = int(args.dev.split(':')[1]) 122except Exception: 123 if '/' in args.dev: 124 stat = os.stat(args.dev) 125 else: 126 stat = os.stat('/dev/' + args.dev) 127 128 major = os.major(stat.st_rdev) 129 minor = os.minor(stat.st_rdev) 130 131if args.which == 'from-rq-alloc': 132 start_time_field = 'alloc_time_ns' 133elif args.which == 'after-rq-alloc': 134 start_time_field = 'start_time_ns' 135elif args.which == 'on-device': 136 start_time_field = 'io_start_time_ns' 137else: 138 print("Invalid latency measurement {}".format(args.which)) 139 exit() 140 141bpf_source = bpf_source.replace('__START_TIME_FIELD__', start_time_field) 142bpf_source = bpf_source.replace('__MAJOR__', str(major)) 143bpf_source = bpf_source.replace('__MINOR__', str(minor)) 144 145if BPF.kernel_struct_has_field(b'request', b'rq_disk'): 146 bpf_source = bpf_source.replace('__RQ_DISK__', 'rq_disk') 147else: 148 bpf_source = bpf_source.replace('__RQ_DISK__', 'q->disk') 149 150bpf = BPF(text=bpf_source) 151if BPF.get_kprobe_functions(b'__blk_account_io_done'): 152 bpf.attach_kprobe(event="__blk_account_io_done", fn_name="kprobe_blk_account_io_done") 153else: 154 bpf.attach_kprobe(event="blk_account_io_done", fn_name="kprobe_blk_account_io_done") 155 156# times are in usecs 157MSEC = 1000 158SEC = 1000 * 1000 159 160cur_rwdf_100ms = bpf["rwdf_100ms"] 161cur_rwdf_1ms = bpf["rwdf_1ms"] 162cur_rwdf_10us = bpf["rwdf_10us"] 163 164last_rwdf_100ms = [0] * 400 165last_rwdf_1ms = [0] * 400 166last_rwdf_10us = [0] * 400 167 168rwdf_100ms = [0] * 400 169rwdf_1ms = [0] * 400 170rwdf_10us = [0] * 400 171 172io_type = ["read", "write", "discard", "flush"] 173 174def find_pct(req, total, slots, idx, counted): 175 while idx > 0: 176 idx -= 1 177 if slots[idx] > 0: 178 counted += slots[idx] 179 if args.verbose > 1: 180 print('idx={} counted={} pct={:.1f} req={}' 181 .format(idx, counted, counted / total, req)) 182 if (counted / total) * 100 >= 100 - req: 183 break 184 return (idx, counted) 185 186def calc_lat_pct(req_pcts, total, lat_100ms, lat_1ms, lat_10us): 187 pcts = [0] * len(req_pcts) 188 189 if total == 0: 190 return pcts 191 192 data = [(100 * MSEC, lat_100ms), (MSEC, lat_1ms), (10, lat_10us)] 193 data_sel = 0 194 idx = 100 195 counted = 0 196 197 for pct_idx in reversed(range(len(req_pcts))): 198 req = float(req_pcts[pct_idx]) 199 while True: 200 last_counted = counted 201 (gran, slots) = data[data_sel] 202 (idx, counted) = find_pct(req, total, slots, idx, counted) 203 if args.verbose > 1: 204 print('pct_idx={} req={} gran={} idx={} counted={} total={}' 205 .format(pct_idx, req, gran, idx, counted, total)) 206 if idx > 0 or data_sel == len(data) - 1: 207 break 208 counted = last_counted 209 data_sel += 1 210 idx = 100 211 212 pcts[pct_idx] = gran * idx + gran / 2 213 214 return pcts 215 216def format_usec(lat): 217 if lat > SEC: 218 return '{:.1f}s'.format(lat / SEC) 219 elif lat > 10 * MSEC: 220 return '{:.0f}ms'.format(lat / MSEC) 221 elif lat > MSEC: 222 return '{:.1f}ms'.format(lat / MSEC) 223 elif lat > 0: 224 return '{:.0f}us'.format(lat) 225 else: 226 return '-' 227 228# 0 interval can be used to test whether this script would run successfully. 229if args.interval == 0: 230 sys.exit(0) 231 232# Set up signal handling so that we print the result on USR1/2 and before 233# exiting on a signal. Combined with infinite interval, this can be used to 234# obtain overall latency distribution between two events. On USR2 the 235# accumulated counters are cleared too, which can be used to define 236# arbitrary intervals. 237force_update_last_rwdf = False 238keep_running = True 239result_req = Event() 240def sig_handler(sig, frame): 241 global keep_running, force_update_last_rwdf, result_req 242 if sig == signal.SIGUSR1: 243 force_update_last_rwdf = True 244 elif sig != signal.SIGUSR2: 245 keep_running = False 246 result_req.set() 247 248for sig in (signal.SIGUSR1, signal.SIGUSR2, signal.SIGINT, signal.SIGTERM, signal.SIGHUP): 249 signal.signal(sig, sig_handler) 250 251# If infinite interval, always trigger the first output so that the caller 252# can tell when initialization is complete. 253if args.interval < 0: 254 result_req.set(); 255 256while keep_running: 257 result_req.wait(args.interval if args.interval > 0 else None) 258 result_req.clear() 259 260 update_last_rwdf = args.interval > 0 or force_update_last_rwdf 261 force_update_last_rwdf = False 262 rwdf_total = [0] * 4; 263 264 for i in range(400): 265 v = cur_rwdf_100ms.sum(i).value 266 rwdf_100ms[i] = max(v - last_rwdf_100ms[i], 0) 267 if update_last_rwdf: 268 last_rwdf_100ms[i] = v 269 270 v = cur_rwdf_1ms.sum(i).value 271 rwdf_1ms[i] = max(v - last_rwdf_1ms[i], 0) 272 if update_last_rwdf: 273 last_rwdf_1ms[i] = v 274 275 v = cur_rwdf_10us.sum(i).value 276 rwdf_10us[i] = max(v - last_rwdf_10us[i], 0) 277 if update_last_rwdf: 278 last_rwdf_10us[i] = v 279 280 rwdf_total[int(i / 100)] += rwdf_100ms[i] 281 282 rwdf_lat = [] 283 for i in range(4): 284 left = i * 100 285 right = left + 100 286 rwdf_lat.append( 287 calc_lat_pct(args.pcts, rwdf_total[i], 288 rwdf_100ms[left:right], 289 rwdf_1ms[left:right], 290 rwdf_10us[left:right])) 291 292 if args.verbose: 293 print('{:7} 100ms {}'.format(io_type[i], rwdf_100ms[left:right])) 294 print('{:7} 1ms {}'.format(io_type[i], rwdf_1ms[left:right])) 295 print('{:7} 10us {}'.format(io_type[i], rwdf_10us[left:right])) 296 297 if args.json: 298 result = {} 299 for iot in range(4): 300 lats = {} 301 for pi in range(len(args.pcts)): 302 lats[args.pcts[pi]] = rwdf_lat[iot][pi] / SEC 303 result[io_type[iot]] = lats 304 print(json.dumps(result), flush=True) 305 else: 306 print('\n{:<7}'.format(os.path.basename(args.dev)), end='') 307 widths = [] 308 for pct in args.pcts: 309 widths.append(max(len(pct), 5)) 310 print(' {:>5}'.format(pct), end='') 311 print() 312 for iot in range(4): 313 print('{:7}'.format(io_type[iot]), end='') 314 for pi in range(len(rwdf_lat[iot])): 315 print(' {:>{}}'.format(format_usec(rwdf_lat[iot][pi]), widths[pi]), end='') 316 print() 317