1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# nfsslower Trace slow NFS operations 5# for Linux using BCC & eBPF 6# 7# Usage: nfsslower [-h] [-p PID] [min_ms] 8# 9# This script traces some common NFS operations: read, write, opens and 10# getattr. It measures the time spent in these operations, and prints details 11# for each that exceeded a threshold. 12# 13# WARNING: This adds low-overhead instrumentation to these NFS operations, 14# including reads and writes from the file system cache. Such reads and writes 15# can be very frequent (depending on the workload; eg, 1M/sec), at which 16# point the overhead of this tool (even if it prints no "slower" events) can 17# begin to become significant. 18# 19# Most of this code is copied from similar tools (ext4slower, zfsslower etc) 20# 21# By default, a minimum millisecond threshold of 10 is used. 22# 23# This tool uses kprobes to instrument the kernel for entry and exit 24# information, in the future a preferred way would be to use tracepoints. 25# Currently there aren't any tracepoints available for nfs_read_file, 26# nfs_write_file and nfs_open_file, nfs_getattr does have entry and exit 27# tracepoints but we chose to use kprobes for consistency 28# 29# 31-Aug-2017 Samuel Nair created this. Should work with NFSv{3,4} 30 31from __future__ import print_function 32from bcc import BPF 33import argparse 34from time import strftime 35 36examples = """ 37 ./nfsslower # trace operations slower than 10ms 38 ./nfsslower 1 # trace operations slower than 1ms 39 ./nfsslower -j 1 # ... 1 ms, parsable output (csv) 40 ./nfsslower 0 # trace all nfs operations 41 ./nfsslower -p 121 # trace pid 121 only 42""" 43parser = argparse.ArgumentParser( 44 description="""Trace READ, WRITE, OPEN \ 45and GETATTR NFS calls slower than a threshold,\ 46supports NFSv{3,4}""", 47 formatter_class=argparse.RawDescriptionHelpFormatter, 48 epilog=examples) 49 50parser.add_argument("-j", "--csv", action="store_true", 51 help="just print fields: comma-separated values") 52parser.add_argument("-p", "--pid", help="Trace this pid only") 53parser.add_argument("min_ms", nargs="?", default='10', 54 help="Minimum IO duration to trace in ms (default=10ms)") 55parser.add_argument("--ebpf", action="store_true", 56 help=argparse.SUPPRESS) 57args = parser.parse_args() 58min_ms = int(args.min_ms) 59pid = args.pid 60csv = args.csv 61debug = 0 62 63bpf_text = """ 64 65#include <uapi/linux/ptrace.h> 66#include <linux/fs.h> 67#include <linux/sched.h> 68#include <linux/dcache.h> 69 70#define TRACE_READ 0 71#define TRACE_WRITE 1 72#define TRACE_OPEN 2 73#define TRACE_GETATTR 3 74 75struct val_t { 76 u64 ts; 77 u64 offset; 78 struct file *fp; 79 struct dentry *d; 80}; 81 82struct data_t { 83 // XXX: switch some to u32's when supported 84 u64 ts_us; 85 u64 type; 86 u64 size; 87 u64 offset; 88 u64 delta_us; 89 u64 pid; 90 char task[TASK_COMM_LEN]; 91 char file[DNAME_INLINE_LEN]; 92}; 93 94BPF_HASH(entryinfo, u64, struct val_t); 95BPF_PERF_OUTPUT(events); 96 97int trace_rw_entry(struct pt_regs *ctx, struct kiocb *iocb, 98 struct iov_iter *data) 99{ 100 u64 id = bpf_get_current_pid_tgid(); 101 u32 pid = id >> 32; // PID is higher part 102 103 if(FILTER_PID) 104 return 0; 105 106 // store filep and timestamp by id 107 struct val_t val = {}; 108 val.ts = bpf_ktime_get_ns(); 109 val.fp = iocb->ki_filp; 110 val.d = NULL; 111 val.offset = iocb->ki_pos; 112 113 if (val.fp) 114 entryinfo.update(&id, &val); 115 116 return 0; 117} 118 119int trace_file_open_entry (struct pt_regs *ctx, struct inode *inode, 120 struct file *filp) 121{ 122 u64 id = bpf_get_current_pid_tgid(); 123 u32 pid = id >> 32; // PID is higher part 124 125 if(FILTER_PID) 126 return 0; 127 128 // store filep and timestamp by id 129 struct val_t val = {}; 130 val.ts = bpf_ktime_get_ns(); 131 val.fp = filp; 132 val.d = NULL; 133 val.offset = 0; 134 if (val.fp) 135 entryinfo.update(&id, &val); 136 137 return 0; 138} 139 140int trace_getattr_entry(struct pt_regs *ctx, struct vfsmount *mnt, 141 struct dentry *dentry, struct kstat *stat) 142{ 143 u64 id = bpf_get_current_pid_tgid(); 144 u32 pid = id >> 32; // PID is higher part 145 146 if(FILTER_PID) 147 return 0; 148 149 struct val_t val = {}; 150 val.ts = bpf_ktime_get_ns(); 151 val.fp = NULL; 152 val.d = dentry; 153 val.offset = 0; 154 if (val.d) 155 entryinfo.update(&id, &val); 156 157 return 0; 158} 159 160static int trace_exit(struct pt_regs *ctx, int type) 161{ 162 struct val_t *valp; 163 u64 id = bpf_get_current_pid_tgid(); 164 u32 pid = id >> 32; // PID is higher part 165 166 valp = entryinfo.lookup(&id); 167 if (valp == 0) { 168 // missed tracing issue or filtered 169 return 0; 170 } 171 172 // calculate delta 173 u64 ts = bpf_ktime_get_ns(); 174 u64 delta_us = (ts - valp->ts) / 1000; 175 entryinfo.delete(&id); 176 177 if (FILTER_US) 178 return 0; 179 180 // populate output struct 181 u32 size = PT_REGS_RC(ctx); 182 struct data_t data = {.type = type, .size = size, .delta_us = delta_us, 183 .pid = pid}; 184 data.ts_us = ts / 1000; 185 data.offset = valp->offset; 186 bpf_get_current_comm(&data.task, sizeof(data.task)); 187 188 // workaround (rewriter should handle file to d_name in one step): 189 struct dentry *de = NULL; 190 struct qstr qs = {}; 191 if(type == TRACE_GETATTR) 192 { 193 bpf_probe_read_kernel(&de,sizeof(de), &valp->d); 194 } 195 else 196 { 197 bpf_probe_read_kernel(&de, sizeof(de), &valp->fp->f_path.dentry); 198 } 199 200 bpf_probe_read_kernel(&qs, sizeof(qs), (void *)&de->d_name); 201 if (qs.len == 0) 202 return 0; 203 204 bpf_probe_read_kernel(&data.file, sizeof(data.file), (void *)qs.name); 205 // output 206 events.perf_submit(ctx, &data, sizeof(data)); 207 return 0; 208} 209 210int trace_file_open_return(struct pt_regs *ctx) 211{ 212 return trace_exit(ctx, TRACE_OPEN); 213} 214 215int trace_read_return(struct pt_regs *ctx) 216{ 217 return trace_exit(ctx, TRACE_READ); 218} 219 220int trace_write_return(struct pt_regs *ctx) 221{ 222 return trace_exit(ctx, TRACE_WRITE); 223} 224 225int trace_getattr_return(struct pt_regs *ctx) 226{ 227 return trace_exit(ctx, TRACE_GETATTR); 228} 229 230""" 231if min_ms == 0: 232 bpf_text = bpf_text.replace('FILTER_US', '0') 233else: 234 bpf_text = bpf_text.replace('FILTER_US', 235 'delta_us <= %s' % str(min_ms * 1000)) 236if args.pid: 237 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 238else: 239 bpf_text = bpf_text.replace('FILTER_PID', '0') 240if debug or args.ebpf: 241 print(bpf_text) 242 if args.ebpf: 243 exit() 244 245# process event 246def print_event(cpu, data, size): 247 event = b["events"].event(data) 248 249 type = 'R' 250 if event.type == 1: 251 type = 'W' 252 elif event.type == 2: 253 type = 'O' 254 elif event.type == 3: 255 type = 'G' 256 257 if(csv): 258 print("%d,%s,%d,%s,%d,%d,%d,%s" % ( 259 event.ts_us, event.task, event.pid, type, event.size, 260 event.offset, event.delta_us, event.file)) 261 return 262 print("%-8s %-14.14s %-6s %1s %-7s %-8d %7.2f %s" % 263 (strftime("%H:%M:%S"), 264 event.task.decode('utf-8', 'replace'), 265 event.pid, 266 type, 267 event.size, 268 event.offset / 1024, 269 float(event.delta_us) / 1000, 270 event.file.decode('utf-8', 'replace'))) 271 272 273# Currently specifically works for NFSv4, the other kprobes are generic 274# so it should work with earlier NFS versions 275 276b = BPF(text=bpf_text) 277b.attach_kprobe(event="nfs_file_read", fn_name="trace_rw_entry") 278b.attach_kprobe(event="nfs_file_write", fn_name="trace_rw_entry") 279b.attach_kprobe(event="nfs4_file_open", fn_name="trace_file_open_entry") 280b.attach_kprobe(event="nfs_file_open", fn_name="trace_file_open_entry") 281b.attach_kprobe(event="nfs_getattr", fn_name="trace_getattr_entry") 282 283b.attach_kretprobe(event="nfs_file_read", fn_name="trace_read_return") 284b.attach_kretprobe(event="nfs_file_write", fn_name="trace_write_return") 285b.attach_kretprobe(event="nfs4_file_open", fn_name="trace_file_open_return") 286b.attach_kretprobe(event="nfs_file_open", fn_name="trace_file_open_return") 287b.attach_kretprobe(event="nfs_getattr", fn_name="trace_getattr_return") 288 289if(csv): 290 print("ENDTIME_us,TASK,PID,TYPE,BYTES,OFFSET_b,LATENCY_us,FILE") 291else: 292 if min_ms == 0: 293 print("Tracing NFS operations... Ctrl-C to quit") 294 else: 295 print("""Tracing NFS operations that are slower than \ 296%d ms... Ctrl-C to quit""" 297 % min_ms) 298 print("%-8s %-14s %-6s %1s %-7s %-8s %7s %s" % ("TIME", 299 "COMM", 300 "PID", 301 "T", 302 "BYTES", 303 "OFF_KB", 304 "LAT(ms)", 305 "FILENAME")) 306 307b["events"].open_perf_buffer(print_event, page_cnt=64) 308while 1: 309 try: 310 b.perf_buffer_poll() 311 except KeyboardInterrupt: 312 exit() 313