1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# ext4dist Summarize ext4 operation latency. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: ext4dist [-h] [-T] [-m] [-p PID] [interval] [count] 8# 9# Copyright 2016 Netflix, Inc. 10# Licensed under the Apache License, Version 2.0 (the "License") 11# 12# 12-Feb-2016 Brendan Gregg Created this. 13 14from __future__ import print_function 15from bcc import BPF 16from time import sleep, strftime 17import argparse 18 19# symbols 20kallsyms = "/proc/kallsyms" 21 22# arguments 23examples = """examples: 24 ./ext4dist # show operation latency as a histogram 25 ./ext4dist -p 181 # trace PID 181 only 26 ./ext4dist 1 10 # print 1 second summaries, 10 times 27 ./ext4dist -m 5 # 5s summaries, milliseconds 28""" 29parser = argparse.ArgumentParser( 30 description="Summarize ext4 operation latency", 31 formatter_class=argparse.RawDescriptionHelpFormatter, 32 epilog=examples) 33parser.add_argument("-T", "--notimestamp", action="store_true", 34 help="don't include timestamp on interval output") 35parser.add_argument("-m", "--milliseconds", action="store_true", 36 help="output in milliseconds") 37parser.add_argument("-p", "--pid", 38 help="trace this PID only") 39parser.add_argument("interval", nargs="?", 40 help="output interval, in seconds") 41parser.add_argument("count", nargs="?", default=99999999, 42 help="number of outputs") 43parser.add_argument("--ebpf", action="store_true", 44 help=argparse.SUPPRESS) 45args = parser.parse_args() 46pid = args.pid 47countdown = int(args.count) 48if args.milliseconds: 49 factor = 1000000 50 label = "msecs" 51else: 52 factor = 1000 53 label = "usecs" 54if args.interval and int(args.interval) == 0: 55 print("ERROR: interval 0. Exiting.") 56 exit() 57debug = 0 58 59# define BPF program 60bpf_text = """ 61#include <uapi/linux/ptrace.h> 62#include <linux/fs.h> 63#include <linux/sched.h> 64 65#define OP_NAME_LEN 8 66typedef struct dist_key { 67 char op[OP_NAME_LEN]; 68 u64 slot; 69} dist_key_t; 70BPF_HASH(start, u32); 71BPF_HISTOGRAM(dist, dist_key_t); 72 73// time operation 74int trace_entry(struct pt_regs *ctx) 75{ 76 u32 pid = bpf_get_current_pid_tgid(); 77 if (FILTER_PID) 78 return 0; 79 u64 ts = bpf_ktime_get_ns(); 80 start.update(&pid, &ts); 81 return 0; 82} 83 84// The current ext4 (Linux 4.5) uses generic_file_read_iter(), instead of it's 85// own function, for reads. So we need to trace that and then filter on ext4, 86// which I do by checking file->f_op. 87int trace_read_entry(struct pt_regs *ctx, struct kiocb *iocb) 88{ 89 u32 pid = bpf_get_current_pid_tgid(); 90 if (FILTER_PID) 91 return 0; 92 93 // ext4 filter on file->f_op == ext4_file_operations 94 struct file *fp = iocb->ki_filp; 95 if ((u64)fp->f_op != EXT4_FILE_OPERATIONS) 96 return 0; 97 98 u64 ts = bpf_ktime_get_ns(); 99 start.update(&pid, &ts); 100 return 0; 101} 102 103static int trace_return(struct pt_regs *ctx, const char *op) 104{ 105 u64 *tsp; 106 u32 pid = bpf_get_current_pid_tgid(); 107 108 // fetch timestamp and calculate delta 109 tsp = start.lookup(&pid); 110 if (tsp == 0) { 111 return 0; // missed start or filtered 112 } 113 u64 delta = bpf_ktime_get_ns() - *tsp; 114 start.delete(&pid); 115 116 // Skip entries with backwards time: temp workaround for #728 117 if ((s64) delta < 0) 118 return 0; 119 120 delta /= FACTOR; 121 122 // store as histogram 123 dist_key_t key = {.slot = bpf_log2l(delta)}; 124 __builtin_memcpy(&key.op, op, sizeof(key.op)); 125 dist.increment(key); 126 127 return 0; 128} 129 130int trace_read_return(struct pt_regs *ctx) 131{ 132 char *op = "read"; 133 return trace_return(ctx, op); 134} 135 136int trace_write_return(struct pt_regs *ctx) 137{ 138 char *op = "write"; 139 return trace_return(ctx, op); 140} 141 142int trace_open_return(struct pt_regs *ctx) 143{ 144 char *op = "open"; 145 return trace_return(ctx, op); 146} 147 148int trace_fsync_return(struct pt_regs *ctx) 149{ 150 char *op = "fsync"; 151 return trace_return(ctx, op); 152} 153""" 154 155# code replacements 156with open(kallsyms) as syms: 157 ops = '' 158 for line in syms: 159 (addr, size, name) = line.rstrip().split(" ", 2) 160 name = name.split("\t")[0] 161 if name == "ext4_file_operations": 162 ops = "0x" + addr 163 break 164 if ops == '': 165 print("ERROR: no ext4_file_operations in /proc/kallsyms. Exiting.") 166 print("HINT: the kernel should be built with CONFIG_KALLSYMS_ALL.") 167 exit() 168 bpf_text = bpf_text.replace('EXT4_FILE_OPERATIONS', ops) 169bpf_text = bpf_text.replace('FACTOR', str(factor)) 170if args.pid: 171 bpf_text = bpf_text.replace('FILTER_PID', 'pid != %s' % pid) 172else: 173 bpf_text = bpf_text.replace('FILTER_PID', '0') 174if debug or args.ebpf: 175 print(bpf_text) 176 if args.ebpf: 177 exit() 178 179# load BPF program 180b = BPF(text=bpf_text) 181 182# Common file functions. See earlier comment about generic_file_read_iter(). 183# Comment by Joe Yin 184# From Linux 4.10, the function .read_iter at the ext4_file_operations has 185# changed to ext4_file_read_iter. 186# So, I add get_kprobe_functions(b'ext4_file_read_iter'),it will first to attach ext4_file_read_iter, 187# if fails and will attach the generic_file_read_iter which used to pre-4.10. 188 189if BPF.get_kprobe_functions(b'ext4_file_read_iter'): 190 b.attach_kprobe(event="ext4_file_read_iter", fn_name="trace_entry") 191else: 192 b.attach_kprobe(event="generic_file_read_iter", fn_name="trace_read_entry") 193b.attach_kprobe(event="ext4_file_write_iter", fn_name="trace_entry") 194b.attach_kprobe(event="ext4_file_open", fn_name="trace_entry") 195b.attach_kprobe(event="ext4_sync_file", fn_name="trace_entry") 196b.attach_kretprobe(event="generic_file_read_iter", fn_name="trace_read_return") 197b.attach_kretprobe(event="ext4_file_write_iter", fn_name="trace_write_return") 198b.attach_kretprobe(event="ext4_file_open", fn_name="trace_open_return") 199b.attach_kretprobe(event="ext4_sync_file", fn_name="trace_fsync_return") 200 201print("Tracing ext4 operation latency... Hit Ctrl-C to end.") 202 203# output 204exiting = 0 205dist = b.get_table("dist") 206while (1): 207 try: 208 if args.interval: 209 sleep(int(args.interval)) 210 else: 211 sleep(99999999) 212 except KeyboardInterrupt: 213 exiting = 1 214 215 print() 216 if args.interval and (not args.notimestamp): 217 print(strftime("%H:%M:%S:")) 218 219 dist.print_log2_hist(label, "operation", section_print_fn=bytes.decode) 220 dist.clear() 221 222 countdown -= 1 223 if exiting or countdown == 0: 224 exit() 225