• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2#
3# biolatpcts.py  Monitor IO latency distribution of a block device.
4#
5#  $ ./biolatpcts.py /dev/nvme0n1
6#  nvme0n1    p1    p5   p10   p16   p25   p50   p75   p84   p90   p95   p99  p100
7#  read     95us 175us 305us 515us 895us 985us 995us 1.5ms 2.5ms 3.5ms 4.5ms  10ms
8#  write     5us   5us   5us  15us  25us 135us 765us 855us 885us 895us 965us 1.5ms
9#  discard   5us   5us   5us   5us 135us 145us 165us 205us 385us 875us 1.5ms 2.5ms
10#  flush     5us   5us   5us   5us   5us   5us   5us   5us   5us 1.5ms 4.5ms 5.5ms
11#
12# Copyright (C) 2020 Tejun Heo <tj@kernel.org>
13# Copyright (C) 2020 Facebook
14
15from __future__ import print_function
16from bcc import BPF
17from time import sleep
18from threading import Event
19import argparse
20import json
21import sys
22import os
23import signal
24
25description = """
26Monitor IO latency distribution of a block device
27"""
28
29epilog = """
30When interval is infinite, biolatpcts will print out result once the
31initialization is complete to indicate readiness. After initialized,
32biolatpcts will output whenever it receives SIGUSR1/2 and before exiting on
33SIGINT, SIGTERM or SIGHUP.
34
35SIGUSR1 starts a new period after reporting. SIGUSR2 doesn't and can be used
36to monitor progress without affecting accumulation of data points. They can
37be used to obtain latency distribution between two arbitrary events and
38monitor progress inbetween.
39"""
40
41parser = argparse.ArgumentParser(description = description, epilog = epilog,
42                                 formatter_class = argparse.ArgumentDefaultsHelpFormatter)
43parser.add_argument('dev', metavar='DEV', type=str,
44                    help='Target block device (/dev/DEVNAME, DEVNAME or MAJ:MIN)')
45parser.add_argument('-i', '--interval', type=int, default=3,
46                    help='Report interval (0: exit after startup, -1: infinite)')
47parser.add_argument('-w', '--which', choices=['from-rq-alloc', 'after-rq-alloc', 'on-device'],
48                    default='on-device', help='Which latency to measure')
49parser.add_argument('-p', '--pcts', metavar='PCT,...', type=str,
50                    default='1,5,10,16,25,50,75,84,90,95,99,100',
51                    help='Percentiles to calculate')
52parser.add_argument('-j', '--json', action='store_true',
53                    help='Output in json')
54parser.add_argument('--verbose', '-v', action='count', default = 0)
55
56bpf_source = """
57#include <linux/blk_types.h>
58#include <linux/blkdev.h>
59#include <linux/blk-mq.h>
60#include <linux/time64.h>
61
62BPF_PERCPU_ARRAY(rwdf_100ms, u64, 400);
63BPF_PERCPU_ARRAY(rwdf_1ms, u64, 400);
64BPF_PERCPU_ARRAY(rwdf_10us, u64, 400);
65
66void kprobe_blk_account_io_done(struct pt_regs *ctx, struct request *rq, u64 now)
67{
68        unsigned int cmd_flags;
69        u64 dur;
70        size_t base, slot;
71
72        if (!rq->__START_TIME_FIELD__)
73                return;
74
75        if (!rq->__RQ_DISK__ ||
76            rq->__RQ_DISK__->major != __MAJOR__ ||
77            rq->__RQ_DISK__->first_minor != __MINOR__)
78                return;
79
80        cmd_flags = rq->cmd_flags;
81        switch (cmd_flags & REQ_OP_MASK) {
82        case REQ_OP_READ:
83                base = 0;
84                break;
85        case REQ_OP_WRITE:
86                base = 100;
87                break;
88        case REQ_OP_DISCARD:
89                base = 200;
90                break;
91        case REQ_OP_FLUSH:
92                base = 300;
93                break;
94        default:
95                return;
96        }
97
98        dur = now - rq->__START_TIME_FIELD__;
99
100        slot = min_t(size_t, div_u64(dur, 100 * NSEC_PER_MSEC), 99);
101        rwdf_100ms.increment(base + slot);
102        if (slot)
103                return;
104
105        slot = min_t(size_t, div_u64(dur, NSEC_PER_MSEC), 99);
106        rwdf_1ms.increment(base + slot);
107        if (slot)
108                return;
109
110        slot = min_t(size_t, div_u64(dur, 10 * NSEC_PER_USEC), 99);
111        rwdf_10us.increment(base + slot);
112}
113"""
114
115args = parser.parse_args()
116args.pcts = args.pcts.split(',')
117args.pcts.sort(key=lambda x: float(x))
118
119try:
120    major = int(args.dev.split(':')[0])
121    minor = int(args.dev.split(':')[1])
122except Exception:
123    if '/' in args.dev:
124        stat = os.stat(args.dev)
125    else:
126        stat = os.stat('/dev/' + args.dev)
127
128    major = os.major(stat.st_rdev)
129    minor = os.minor(stat.st_rdev)
130
131if args.which == 'from-rq-alloc':
132    start_time_field = 'alloc_time_ns'
133elif args.which == 'after-rq-alloc':
134    start_time_field = 'start_time_ns'
135elif args.which == 'on-device':
136    start_time_field = 'io_start_time_ns'
137else:
138    print("Invalid latency measurement {}".format(args.which))
139    exit()
140
141bpf_source = bpf_source.replace('__START_TIME_FIELD__', start_time_field)
142bpf_source = bpf_source.replace('__MAJOR__', str(major))
143bpf_source = bpf_source.replace('__MINOR__', str(minor))
144
145if BPF.kernel_struct_has_field(b'request', b'rq_disk'):
146    bpf_source = bpf_source.replace('__RQ_DISK__', 'rq_disk')
147else:
148    bpf_source = bpf_source.replace('__RQ_DISK__', 'q->disk')
149
150bpf = BPF(text=bpf_source)
151if BPF.get_kprobe_functions(b'__blk_account_io_done'):
152    bpf.attach_kprobe(event="__blk_account_io_done", fn_name="kprobe_blk_account_io_done")
153else:
154    bpf.attach_kprobe(event="blk_account_io_done", fn_name="kprobe_blk_account_io_done")
155
156# times are in usecs
157MSEC = 1000
158SEC = 1000 * 1000
159
160cur_rwdf_100ms = bpf["rwdf_100ms"]
161cur_rwdf_1ms = bpf["rwdf_1ms"]
162cur_rwdf_10us = bpf["rwdf_10us"]
163
164last_rwdf_100ms = [0] * 400
165last_rwdf_1ms = [0] * 400
166last_rwdf_10us = [0] * 400
167
168rwdf_100ms = [0] * 400
169rwdf_1ms = [0] * 400
170rwdf_10us = [0] * 400
171
172io_type = ["read", "write", "discard", "flush"]
173
174def find_pct(req, total, slots, idx, counted):
175    while idx > 0:
176        idx -= 1
177        if slots[idx] > 0:
178            counted += slots[idx]
179            if args.verbose > 1:
180                print('idx={} counted={} pct={:.1f} req={}'
181                      .format(idx, counted, counted / total, req))
182            if (counted / total) * 100 >= 100 - req:
183                break
184    return (idx, counted)
185
186def calc_lat_pct(req_pcts, total, lat_100ms, lat_1ms, lat_10us):
187    pcts = [0] * len(req_pcts)
188
189    if total == 0:
190        return pcts
191
192    data = [(100 * MSEC, lat_100ms), (MSEC, lat_1ms), (10, lat_10us)]
193    data_sel = 0
194    idx = 100
195    counted = 0
196
197    for pct_idx in reversed(range(len(req_pcts))):
198        req = float(req_pcts[pct_idx])
199        while True:
200            last_counted = counted
201            (gran, slots) = data[data_sel]
202            (idx, counted) = find_pct(req, total, slots, idx, counted)
203            if args.verbose > 1:
204                print('pct_idx={} req={} gran={} idx={} counted={} total={}'
205                      .format(pct_idx, req, gran, idx, counted, total))
206            if idx > 0 or data_sel == len(data) - 1:
207                break
208            counted = last_counted
209            data_sel += 1
210            idx = 100
211
212        pcts[pct_idx] = gran * idx + gran / 2
213
214    return pcts
215
216def format_usec(lat):
217    if lat > SEC:
218        return '{:.1f}s'.format(lat / SEC)
219    elif lat > 10 * MSEC:
220        return '{:.0f}ms'.format(lat / MSEC)
221    elif lat > MSEC:
222        return '{:.1f}ms'.format(lat / MSEC)
223    elif lat > 0:
224        return '{:.0f}us'.format(lat)
225    else:
226        return '-'
227
228# 0 interval can be used to test whether this script would run successfully.
229if args.interval == 0:
230    sys.exit(0)
231
232# Set up signal handling so that we print the result on USR1/2 and before
233# exiting on a signal. Combined with infinite interval, this can be used to
234# obtain overall latency distribution between two events. On USR2 the
235# accumulated counters are cleared too, which can be used to define
236# arbitrary intervals.
237force_update_last_rwdf = False
238keep_running = True
239result_req = Event()
240def sig_handler(sig, frame):
241    global keep_running, force_update_last_rwdf, result_req
242    if sig == signal.SIGUSR1:
243        force_update_last_rwdf = True
244    elif sig != signal.SIGUSR2:
245        keep_running = False
246    result_req.set()
247
248for sig in (signal.SIGUSR1, signal.SIGUSR2, signal.SIGINT, signal.SIGTERM, signal.SIGHUP):
249    signal.signal(sig, sig_handler)
250
251# If infinite interval, always trigger the first output so that the caller
252# can tell when initialization is complete.
253if args.interval < 0:
254    result_req.set();
255
256while keep_running:
257    result_req.wait(args.interval if args.interval > 0 else None)
258    result_req.clear()
259
260    update_last_rwdf = args.interval > 0 or force_update_last_rwdf
261    force_update_last_rwdf = False
262    rwdf_total = [0] * 4;
263
264    for i in range(400):
265        v = cur_rwdf_100ms.sum(i).value
266        rwdf_100ms[i] = max(v - last_rwdf_100ms[i], 0)
267        if update_last_rwdf:
268            last_rwdf_100ms[i] = v
269
270        v = cur_rwdf_1ms.sum(i).value
271        rwdf_1ms[i] = max(v - last_rwdf_1ms[i], 0)
272        if update_last_rwdf:
273            last_rwdf_1ms[i] = v
274
275        v = cur_rwdf_10us.sum(i).value
276        rwdf_10us[i] = max(v - last_rwdf_10us[i], 0)
277        if update_last_rwdf:
278            last_rwdf_10us[i] = v
279
280        rwdf_total[int(i / 100)] += rwdf_100ms[i]
281
282    rwdf_lat = []
283    for i in range(4):
284        left = i * 100
285        right = left + 100
286        rwdf_lat.append(
287            calc_lat_pct(args.pcts, rwdf_total[i],
288                         rwdf_100ms[left:right],
289                         rwdf_1ms[left:right],
290                         rwdf_10us[left:right]))
291
292        if args.verbose:
293            print('{:7} 100ms {}'.format(io_type[i], rwdf_100ms[left:right]))
294            print('{:7}   1ms {}'.format(io_type[i], rwdf_1ms[left:right]))
295            print('{:7}  10us {}'.format(io_type[i], rwdf_10us[left:right]))
296
297    if args.json:
298        result = {}
299        for iot in range(4):
300            lats = {}
301            for pi in range(len(args.pcts)):
302                lats[args.pcts[pi]] = rwdf_lat[iot][pi] / SEC
303            result[io_type[iot]] = lats
304        print(json.dumps(result), flush=True)
305    else:
306        print('\n{:<7}'.format(os.path.basename(args.dev)), end='')
307        widths = []
308        for pct in args.pcts:
309            widths.append(max(len(pct), 5))
310            print(' {:>5}'.format(pct), end='')
311        print()
312        for iot in range(4):
313            print('{:7}'.format(io_type[iot]), end='')
314            for pi in range(len(rwdf_lat[iot])):
315                print(' {:>{}}'.format(format_usec(rwdf_lat[iot][pi]), widths[pi]), end='')
316            print()
317