1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# ext4slower Trace slow ext4 operations. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: ext4slower [-h] [-j] [-p PID] [min_ms] 8# 9# This script traces common ext4 file operations: reads, writes, opens, and 10# syncs. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these ext4 operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# By default, a minimum millisecond threshold of 10 is used. 20# 21# Copyright 2016 Netflix, Inc. 22# Licensed under the Apache License, Version 2.0 (the "License") 23# 24# 11-Feb-2016 Brendan Gregg Created this. 25# 15-Oct-2016 Dina Goldshtein -p to filter by process ID. 26# 13-Jun-2018 Joe Yin modify generic_file_read_iter to ext4_file_read_iter. 27 28from __future__ import print_function 29from bcc import BPF 30import argparse 31from time import strftime 32 33# symbols 34kallsyms = "/proc/kallsyms" 35 36# arguments 37examples = """examples: 38 ./ext4slower # trace operations slower than 10 ms (default) 39 ./ext4slower 1 # trace operations slower than 1 ms 40 ./ext4slower -j 1 # ... 1 ms, parsable output (csv) 41 ./ext4slower 0 # trace all operations (warning: verbose) 42 ./ext4slower -p 185 # trace PID 185 only 43""" 44parser = argparse.ArgumentParser( 45 description="Trace common ext4 file operations slower than a threshold", 46 formatter_class=argparse.RawDescriptionHelpFormatter, 47 epilog=examples) 48parser.add_argument("-j", "--csv", action="store_true", 49 help="just print fields: comma-separated values") 50parser.add_argument("-p", "--pid", 51 help="trace this PID only") 52parser.add_argument("min_ms", nargs="?", default='10', 53 help="minimum I/O duration to trace, in ms (default 10)") 54parser.add_argument("--ebpf", action="store_true", 55 help=argparse.SUPPRESS) 56args = parser.parse_args() 57min_ms = int(args.min_ms) 58pid = args.pid 59csv = args.csv 60debug = 0 61 62# define BPF program 63bpf_text = """ 64#include <uapi/linux/ptrace.h> 65#include <linux/fs.h> 66#include <linux/sched.h> 67#include <linux/dcache.h> 68 69// XXX: switch these to char's when supported 70#define TRACE_READ 0 71#define TRACE_WRITE 1 72#define TRACE_OPEN 2 73#define TRACE_FSYNC 3 74 75struct val_t { 76 u64 ts; 77 u64 offset; 78 struct file *fp; 79}; 80 81struct data_t { 82 // XXX: switch some to u32's when supported 83 u64 ts_us; 84 u64 type; 85 u64 size; 86 u64 offset; 87 u64 delta_us; 88 u64 pid; 89 char task[TASK_COMM_LEN]; 90 char file[DNAME_INLINE_LEN]; 91}; 92 93BPF_HASH(entryinfo, u64, struct val_t); 94BPF_PERF_OUTPUT(events); 95 96// 97// Store timestamp and size on entry 98// 99 100// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's 101// own function, for reads. So we need to trace that and then filter on ext4, 102// which I do by checking file->f_op. 103// The new Linux version (since form 4.10) uses ext4_file_read_iter(), And if the 'CONFIG_FS_DAX' 104// is not set, then ext4_file_read_iter() will call generic_file_read_iter(), else it will call 105// ext4_dax_read_iter(), and trace generic_file_read_iter() will fail. 106int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 107{ 108 u64 id = bpf_get_current_pid_tgid(); 109 u32 pid = id >> 32; // PID is higher part 110 111 if (FILTER_PID) 112 return 0; 113 114 // ext4 filter on file->f_op == ext4_file_operations 115 struct file *fp = iocb->ki_filp; 116 if ((u64)fp->f_op != EXT4_FILE_OPERATIONS) 117 return 0; 118 119 // store filep and timestamp by id 120 struct val_t val = {}; 121 val.ts = bpf_ktime_get_ns(); 122 val.fp = fp; 123 val.offset = iocb->ki_pos; 124 if (val.fp) 125 entryinfo.update(&id, &val); 126 127 return 0; 128} 129 130// ext4_file_write_iter(): 131int trace_write_entry(struct pt_regs *ctx, struct kiocb *iocb) 132{ 133 u64 id = bpf_get_current_pid_tgid(); 134 u32 pid = id >> 32; // PID is higher part 135 136 if (FILTER_PID) 137 return 0; 138 139 // store filep and timestamp by id 140 struct val_t val = {}; 141 val.ts = bpf_ktime_get_ns(); 142 val.fp = iocb->ki_filp; 143 val.offset = iocb->ki_pos; 144 if (val.fp) 145 entryinfo.update(&id, &val); 146 147 return 0; 148} 149 150// ext4_file_open(): 151int trace_open_entry(struct pt_regs *ctx, struct inode *inode, 152 struct file *file) 153{ 154 u64 id = bpf_get_current_pid_tgid(); 155 u32 pid = id >> 32; // PID is higher part 156 157 if (FILTER_PID) 158 return 0; 159 160 // store filep and timestamp by id 161 struct val_t val = {}; 162 val.ts = bpf_ktime_get_ns(); 163 val.fp = file; 164 val.offset = 0; 165 if (val.fp) 166 entryinfo.update(&id, &val); 167 168 return 0; 169} 170 171// ext4_sync_file(): 172int trace_fsync_entry(struct pt_regs *ctx, struct file *file) 173{ 174 u64 id = bpf_get_current_pid_tgid(); 175 u32 pid = id >> 32; // PID is higher part 176 177 if (FILTER_PID) 178 return 0; 179 180 // store filep and timestamp by id 181 struct val_t val = {}; 182 val.ts = bpf_ktime_get_ns(); 183 val.fp = file; 184 val.offset = 0; 185 if (val.fp) 186 entryinfo.update(&id, &val); 187 188 return 0; 189} 190 191// 192// Output 193// 194 195static int trace_return(struct pt_regs *ctx, int type) 196{ 197 struct val_t *valp; 198 u64 id = bpf_get_current_pid_tgid(); 199 u32 pid = id >> 32; // PID is higher part 200 201 valp = entryinfo.lookup(&id); 202 if (valp == 0) { 203 // missed tracing issue or filtered 204 return 0; 205 } 206 207 // calculate delta 208 u64 ts = bpf_ktime_get_ns(); 209 u64 delta_us = (ts - valp->ts) / 1000; 210 entryinfo.delete(&id); 211 if (FILTER_US) 212 return 0; 213 214 // populate output struct 215 u32 size = PT_REGS_RC(ctx); 216 struct data_t data = {.type = type, .size = size, .delta_us = delta_us, 217 .pid = pid}; 218 data.ts_us = ts / 1000; 219 data.offset = valp->offset; 220 bpf_get_current_comm(&data.task, sizeof(data.task)); 221 222 // workaround (rewriter should handle file to d_name in one step): 223 struct dentry *de = NULL; 224 struct qstr qs = {}; 225 de = valp->fp->f_path.dentry; 226 qs = de->d_name; 227 if (qs.len == 0) 228 return 0; 229 bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name); 230 231 // output 232 events.perf_submit(ctx, &data, sizeof(data)); 233 234 return 0; 235} 236 237int trace_read_return(struct pt_regs *ctx) 238{ 239 return trace_return(ctx, TRACE_READ); 240} 241 242int trace_write_return(struct pt_regs *ctx) 243{ 244 return trace_return(ctx, TRACE_WRITE); 245} 246 247int trace_open_return(struct pt_regs *ctx) 248{ 249 return trace_return(ctx, TRACE_OPEN); 250} 251 252int trace_fsync_return(struct pt_regs *ctx) 253{ 254 return trace_return(ctx, TRACE_FSYNC); 255} 256 257""" 258 259# code replacements 260with open(kallsyms) as syms: 261 ops = '' 262 for line in syms: 263 (addr, size, name) = line.rstrip().split(" ", 2) 264 name = name.split("\t")[0] 265 if name == "ext4_file_operations": 266 ops = "0x" + addr 267 break 268 if ops == '': 269 print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.") 270 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 271 exit() 272 bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops) 273if min_ms == 0: 274 bpf_text = bpf_text.replace('FILTER_US', '0') 275else: 276 bpf_text = bpf_text.replace('FILTER_US', 277 'delta_us <= %s' % str(min_ms * 1000)) 278if args.pid: 279 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 280else: 281 bpf_text = bpf_text.replace('FILTER_PID', '0') 282if debug or args.ebpf: 283 print(bpf_text) 284 if args.ebpf: 285 exit() 286 287# process event 288def print_event(cpu, data, size): 289 event = b["events"].event(data) 290 291 type = 'R' 292 if event.type == 1: 293 type = 'W' 294 elif event.type == 2: 295 type = 'O' 296 elif event.type == 3: 297 type = 'S' 298 299 if (csv): 300 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 301 event.ts_us, event.task.decode('utf-8', 'replace'), event.pid, 302 type, event.size, event.offset, event.delta_us, 303 event.file.decode('utf-8', 'replace'))) 304 return 305 print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % (strftime("%H:%M:%S"), 306 event.task.decode('utf-8', 'replace'), event.pid, type, event.size, 307 event.offset / 1024, float(event.delta_us) / 1000, 308 event.file.decode('utf-8', 'replace'))) 309 310# initialize BPF 311b = BPF(text=bpf_text) 312 313# Common file functions. See earlier comment about generic_file_read_iter(). 314if BPF.get_kprobe_functions(b'ext4_file_read_iter'): 315 b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_read_entry") 316else: 317 b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 318b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_write_entry") 319b.attach_kprobe(event="ext4_file_open", fn_name="trace_open_entry") 320b.attach_kprobe(event="ext4_sync_file", fn_name="trace_fsync_entry") 321if BPF.get_kprobe_functions(b'ext4_file_read_iter'): 322 b.attach_kretprobe(event="ext4_file_read_iter", fn_name="trace_read_return") 323else: 324 b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 325b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") 326b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") 327b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") 328 329# header 330if (csv): 331 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 332else: 333 if min_ms == 0: 334 print("Tracing ext4 operations") 335 else: 336 print("Tracing ext4 operations slower than %d ms" % min_ms) 337 print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", "COMM", "PID", "T", 338 "BYTES", "OFF_KB", "LAT(ms)", "FILENAME")) 339 340# read events 341b["events"].open_perf_buffer(print_event, page_cnt=64) 342while 1: 343 try: 344 b.perf_buffer_poll() 345 except KeyboardInterrupt: 346 exit() 347