1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# btrfsslower Trace slow btrfs operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: btrfsslower [-h] [-j] [-p PID] [min_ms] [-d DURATION] 8# 9# This script traces common btrfs file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these btrfs operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 15-Feb-2016 Brendan Gregg Created this. 25# 16-Oct-2016 Dina Goldshtein -p to filter by process ID. 26 27from __future__ import print_function 28from bcc import BPF 29import argparse 30from datetime import datetime, timedelta 31from time import strftime 32 33# symbols 34kallsyms = "/proc/kallsyms" 35 36# arguments 37examples = """examples: 38 ./btrfsslower # trace operations slower than 10 ms (default) 39 ./btrfsslower 1 # trace operations slower than 1 ms 40 ./btrfsslower -j 1 # ... 1 ms, parsable output (csv) 41 ./btrfsslower 0 # trace all operations (warning: verbose) 42 ./btrfsslower -p 185 # trace PID 185 only 43 ./btrfsslower -d 10 # trace for 10 seconds only 44""" 45parser = argparse.ArgumentParser( 46 description="Trace common btrfs file operations slower than a threshold", 47 formatter_class=argparse.RawDescriptionHelpFormatter, 48 epilog=examples) 49parser.add_argument("-j", "--csv", action="store_true", 50 help="just print fields: comma-separated values") 51parser.add_argument("-p", "--pid", 52 help="trace this PID only") 53parser.add_argument("min_ms", nargs="?", default='10', 54 help="minimum I/O duration to trace, in ms (default 10)") 55parser.add_argument("--ebpf", action="store_true", 56 help=argparse.SUPPRESS) 57parser.add_argument("-d", "--duration", 58 help="total duration of trace in seconds") 59args = parser.parse_args() 60min_ms = int(args.min_ms) 61pid = args.pid 62csv = args.csv 63debug = 0 64if args.duration: 65 args.duration = timedelta(seconds=int(args.duration)) 66 67# define BPF program 68bpf_text = """ 69#include <uapi/linux/ptrace.h> 70#include <linux/fs.h> 71#include <linux/sched.h> 72#include <linux/dcache.h> 73 74// XXX: switch these to char's when supported 75#define TRACE_READ 0 76#define TRACE_WRITE 1 77#define TRACE_OPEN 2 78#define TRACE_FSYNC 3 79 80struct val_t { 81 u64 ts; 82 u64 offset; 83 struct file *fp; 84}; 85 86struct data_t { 87 // XXX: switch some to u32's when supported 88 u64 ts_us; 89 u64 type; 90 u64 size; 91 u64 offset; 92 u64 delta_us; 93 u64 pid; 94 char task[TASK_COMM_LEN]; 95 char file[DNAME_INLINE_LEN]; 96}; 97 98BPF_HASH(entryinfo, u64, struct val_t); 99BPF_PERF_OUTPUT(events); 100 101// 102// Store timestamp and size on entry 103// 104 105// The current btrfs (Linux 4.5) uses generic_file_read_iter() instead of it's 106// own read function. So we need to trace that and then filter on btrfs, which 107// I do by checking file->f_op. 108int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 109{ 110 u64 id = bpf_get_current_pid_tgid(); 111 u32 pid = id >> 32; // PID is higher part 112 113 if (FILTER_PID) 114 return 0; 115 116 // btrfs filter on file->f_op == btrfs_file_operations 117 struct file *fp = iocb->ki_filp; 118 if ((u64)fp->f_op != BTRFS_FILE_OPERATIONS) 119 return 0; 120 121 // store filep and timestamp by pid 122 struct val_t val = {}; 123 val.ts = bpf_ktime_get_ns(); 124 val.fp = fp; 125 val.offset = iocb->ki_pos; 126 if (val.fp) 127 entryinfo.update(&id, &val); 128 129 return 0; 130} 131 132// btrfs_file_write_iter(): 133int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) 134{ 135 u64 id = bpf_get_current_pid_tgid(); 136 u32 pid = id >> 32; // PID is higher part 137 138 if (FILTER_PID) 139 return 0; 140 141 // store filep and timestamp by id 142 struct val_t val = {}; 143 val.ts = bpf_ktime_get_ns(); 144 val.fp = iocb->ki_filp; 145 val.offset = iocb->ki_pos; 146 if (val.fp) 147 entryinfo.update(&id, &val); 148 149 return 0; 150} 151 152// The current btrfs (Linux 4.5) uses generic_file_open(), instead of it's own 153// function. Same as with reads. Trace the generic path and filter: 154int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 155 struct file *file) 156{ 157 u64 id = bpf_get_current_pid_tgid(); 158 u32 pid = id >> 32; // PID is higher part 159 160 if (FILTER_PID) 161 return 0; 162 163 // btrfs filter on file->f_op == btrfs_file_operations 164 if ((u64)file->f_op != BTRFS_FILE_OPERATIONS) 165 return 0; 166 167 // store filep and timestamp by id 168 struct val_t val = {}; 169 val.ts = bpf_ktime_get_ns(); 170 val.fp = file; 171 val.offset = 0; 172 if (val.fp) 173 entryinfo.update(&id, &val); 174 175 return 0; 176} 177 178// btrfs_sync_file(): 179int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 180{ 181 u64 id = bpf_get_current_pid_tgid(); 182 u32 pid = id >> 32; // PID is higher part 183 184 if (FILTER_PID) 185 return 0; 186 187 // store filep and timestamp by id 188 struct val_t val = {}; 189 val.ts = bpf_ktime_get_ns(); 190 val.fp = file; 191 val.offset = 0; 192 if (val.fp) 193 entryinfo.update(&id, &val); 194 195 return 0; 196} 197 198// 199// Output 200// 201 202static int trace_return(struct pt_regs *ctx, int type) 203{ 204 struct val_t *valp; 205 u64 id = bpf_get_current_pid_tgid(); 206 u32 pid = id >> 32; // PID is higher part 207 208 valp = entryinfo.lookup(&id); 209 if (valp == 0) { 210 // missed tracing issue or filtered 211 return 0; 212 } 213 214 // calculate delta 215 u64 ts = bpf_ktime_get_ns(); 216 u64 delta_us = (ts - valp->ts) / 1000; 217 entryinfo.delete(&id); 218 if (FILTER_US) 219 return 0; 220 221 // populate output struct 222 u32 size = PT_REGS_RC(ctx); 223 struct data_t data = {.type = type, .size = size, .delta_us = delta_us, 224 .pid = pid}; 225 data.ts_us = ts / 1000; 226 data.offset = valp->offset; 227 bpf_get_current_comm(&data.task, sizeof(data.task)); 228 229 // workaround (rewriter should handle file to d_name in one step): 230 struct dentry *de = NULL; 231 struct qstr qs = {}; 232 de = valp->fp->f_path.dentry; 233 qs = de->d_name; 234 if (qs.len == 0) 235 return 0; 236 bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name); 237 238 // output 239 events.perf_submit(ctx, &data, sizeof(data)); 240 241 return 0; 242} 243 244int trace_read_return(struct pt_regs *ctx) 245{ 246 return trace_return(ctx, TRACE_READ); 247} 248 249int trace_write_return(struct pt_regs *ctx) 250{ 251 return trace_return(ctx, TRACE_WRITE); 252} 253 254int trace_open_return(struct pt_regs *ctx) 255{ 256 return trace_return(ctx, TRACE_OPEN); 257} 258 259int trace_fsync_return(struct pt_regs *ctx) 260{ 261 return trace_return(ctx, TRACE_FSYNC); 262} 263 264""" 265 266# code replacements 267with open(kallsyms) as syms: 268 ops = '' 269 for line in syms: 270 a = line.rstrip().split() 271 (addr, name) = (a[0], a[2]) 272 name = name.split("\t")[0] 273 if name == "btrfs_file_operations": 274 ops = "0x" + addr 275 break 276 if ops == '': 277 print("ERROR: no btrfs_file_operations in /proc/kallsyms. Exiting.") 278 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 279 exit() 280 bpf_text = bpf_text.replace('BTRFS_FILE_OPERATIONS', ops) 281if min_ms == 0: 282 bpf_text = bpf_text.replace('FILTER_US', '0') 283else: 284 bpf_text = bpf_text.replace('FILTER_US', 285 'delta_us <= %s' % str(min_ms * 1000)) 286if args.pid: 287 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 288else: 289 bpf_text = bpf_text.replace('FILTER_PID', '0') 290if debug or args.ebpf: 291 print(bpf_text) 292 if args.ebpf: 293 exit() 294 295# process event 296def print_event(cpu, data, size): 297 event = b["events"].event(data) 298 299 type = 'R' 300 if event.type == 1: 301 type = 'W' 302 elif event.type == 2: 303 type = 'O' 304 elif event.type == 3: 305 type = 'S' 306 307 if (csv): 308 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 309 event.ts_us, event.task.decode('utf-8', 'replace'), event.pid, 310 type, event.size, event.offset, event.delta_us, 311 event.file.decode('utf-8', 'replace'))) 312 return 313 print("%-8s %-14.14s %-7d %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 314 event.task.decode('utf-8', 'replace'), event.pid, type, event.size, 315 event.offset / 1024, float(event.delta_us) / 1000, 316 event.file.decode('utf-8', 'replace'))) 317 318# initialize BPF 319b = BPF(text=bpf_text) 320 321# Common file functions. See earlier comment about generic_*(). 322b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 323b.attach_kprobe(event="btrfs_file_write_iter", fn_name="trace_write_entry") 324b.attach_kprobe(event="generic_file_open", fn_name="trace_open_entry") 325b.attach_kprobe(event="btrfs_sync_file", fn_name="trace_fsync_entry") 326b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 327b.attach_kretprobe(event="btrfs_file_write_iter", fn_name="trace_write_return") 328b.attach_kretprobe(event="generic_file_open", fn_name="trace_open_return") 329b.attach_kretprobe(event="btrfs_sync_file", fn_name="trace_fsync_return") 330 331# header 332if (csv): 333 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 334else: 335 if min_ms == 0: 336 print("Tracing btrfs operations") 337 else: 338 print("Tracing btrfs operations slower than %d ms" % min_ms) 339 print("%-8s %-14s %-7s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 340 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 341 342# read events 343b["events"].open_perf_buffer(print_event, page_cnt=64) 344start_time = datetime.now() 345while not args.duration or datetime.now() - start_time < args.duration: 346 try: 347 b.perf_buffer_poll(timeout=1000) 348 except KeyboardInterrupt: 349 exit() 350