1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# tcpconnlat Trace TCP active connection latency (connect). 5# For Linux, uses BCC, eBPF. Embedded C. 6# 7# USAGE: tcpconnlat [-h] [-t] [-p PID] 8# 9# This uses dynamic tracing of kernel functions, and will need to be updated 10# to match kernel changes. 11# 12# Copyright 2016 Netflix, Inc. 13# Licensed under the Apache License, Version 2.0 (the "License") 14# 15# 19-Feb-2016 Brendan Gregg Created this. 16 17from __future__ import print_function 18from bcc import BPF 19from socket import inet_ntop, AF_INET, AF_INET6 20from struct import pack 21import argparse 22import ctypes as ct 23 24# arg validation 25def positive_float(val): 26 try: 27 ival = float(val) 28 except ValueError: 29 raise argparse.ArgumentTypeError("must be a float") 30 31 if ival < 0: 32 raise argparse.ArgumentTypeError("must be positive") 33 return ival 34 35# arguments 36examples = """examples: 37 ./tcpconnlat # trace all TCP connect()s 38 ./tcpconnlat 1 # trace connection latency slower than 1 ms 39 ./tcpconnlat 0.1 # trace connection latency slower than 100 us 40 ./tcpconnlat -t # include timestamps 41 ./tcpconnlat -p 181 # only trace PID 181 42""" 43parser = argparse.ArgumentParser( 44 description="Trace TCP connects and show connection latency", 45 formatter_class=argparse.RawDescriptionHelpFormatter, 46 epilog=examples) 47parser.add_argument("-t", "--timestamp", action="store_true", 48 help="include timestamp on output") 49parser.add_argument("-p", "--pid", 50 help="trace this PID only") 51parser.add_argument("duration_ms", nargs="?", default=0, 52 type=positive_float, 53 help="minimum duration to trace (ms)") 54parser.add_argument("-v", "--verbose", action="store_true", 55 help="print the BPF program for debugging purposes") 56parser.add_argument("--ebpf", action="store_true", 57 help=argparse.SUPPRESS) 58args = parser.parse_args() 59 60if args.duration_ms: 61 # support fractions but round to nearest microsecond 62 duration_us = int(args.duration_ms * 1000) 63else: 64 duration_us = 0 # default is show all 65 66debug = 0 67 68# define BPF program 69bpf_text = """ 70#include <uapi/linux/ptrace.h> 71#include <net/sock.h> 72#include <net/tcp_states.h> 73#include <bcc/proto.h> 74 75struct info_t { 76 u64 ts; 77 u32 pid; 78 char task[TASK_COMM_LEN]; 79}; 80BPF_HASH(start, struct sock *, struct info_t); 81 82// separate data structs for ipv4 and ipv6 83struct ipv4_data_t { 84 u64 ts_us; 85 u32 pid; 86 u32 saddr; 87 u32 daddr; 88 u64 ip; 89 u16 dport; 90 u64 delta_us; 91 char task[TASK_COMM_LEN]; 92}; 93BPF_PERF_OUTPUT(ipv4_events); 94 95struct ipv6_data_t { 96 u64 ts_us; 97 u32 pid; 98 unsigned __int128 saddr; 99 unsigned __int128 daddr; 100 u64 ip; 101 u16 dport; 102 u64 delta_us; 103 char task[TASK_COMM_LEN]; 104}; 105BPF_PERF_OUTPUT(ipv6_events); 106 107int trace_connect(struct pt_regs *ctx, struct sock *sk) 108{ 109 u32 pid = bpf_get_current_pid_tgid(); 110 FILTER 111 struct info_t info = {.pid = pid}; 112 info.ts = bpf_ktime_get_ns(); 113 bpf_get_current_comm(&info.task, sizeof(info.task)); 114 start.update(&sk, &info); 115 return 0; 116}; 117 118// See tcp_v4_do_rcv() and tcp_v6_do_rcv(). So TCP_ESTBALISHED and TCP_LISTEN 119// are fast path and processed elsewhere, and leftovers are processed by 120// tcp_rcv_state_process(). We can trace this for handshake completion. 121// This should all be switched to static tracepoints when available. 122int trace_tcp_rcv_state_process(struct pt_regs *ctx, struct sock *skp) 123{ 124 // will be in TCP_SYN_SENT for handshake 125 if (skp->__sk_common.skc_state != TCP_SYN_SENT) 126 return 0; 127 128 // check start and calculate delta 129 struct info_t *infop = start.lookup(&skp); 130 if (infop == 0) { 131 return 0; // missed entry or filtered 132 } 133 134 u64 ts = infop->ts; 135 u64 now = bpf_ktime_get_ns(); 136 137 u64 delta_us = (now - ts) / 1000ul; 138 139#ifdef MIN_LATENCY 140 if ( delta_us < DURATION_US ) { 141 return 0; // connect latency is below latency filter minimum 142 } 143#endif 144 145 // pull in details 146 u16 family = 0, dport = 0; 147 family = skp->__sk_common.skc_family; 148 dport = skp->__sk_common.skc_dport; 149 150 // emit to appropriate data path 151 if (family == AF_INET) { 152 struct ipv4_data_t data4 = {.pid = infop->pid, .ip = 4}; 153 data4.ts_us = now / 1000; 154 data4.saddr = skp->__sk_common.skc_rcv_saddr; 155 data4.daddr = skp->__sk_common.skc_daddr; 156 data4.dport = ntohs(dport); 157 data4.delta_us = delta_us; 158 __builtin_memcpy(&data4.task, infop->task, sizeof(data4.task)); 159 ipv4_events.perf_submit(ctx, &data4, sizeof(data4)); 160 161 } else /* AF_INET6 */ { 162 struct ipv6_data_t data6 = {.pid = infop->pid, .ip = 6}; 163 data6.ts_us = now / 1000; 164 bpf_probe_read(&data6.saddr, sizeof(data6.saddr), 165 skp->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); 166 bpf_probe_read(&data6.daddr, sizeof(data6.daddr), 167 skp->__sk_common.skc_v6_daddr.in6_u.u6_addr32); 168 data6.dport = ntohs(dport); 169 data6.delta_us = delta_us; 170 __builtin_memcpy(&data6.task, infop->task, sizeof(data6.task)); 171 ipv6_events.perf_submit(ctx, &data6, sizeof(data6)); 172 } 173 174 start.delete(&skp); 175 176 return 0; 177} 178""" 179 180if duration_us > 0: 181 bpf_text = "#define MIN_LATENCY\n" + bpf_text 182 bpf_text = bpf_text.replace('DURATION_US', str(duration_us)) 183 184# code substitutions 185if args.pid: 186 bpf_text = bpf_text.replace('FILTER', 187 'if (pid != %s) { return 0; }' % args.pid) 188else: 189 bpf_text = bpf_text.replace('FILTER', '') 190if debug or args.verbose or args.ebpf: 191 print(bpf_text) 192 if args.ebpf: 193 exit() 194 195# initialize BPF 196b = BPF(text=bpf_text) 197b.attach_kprobe(event="tcp_v4_connect", fn_name="trace_connect") 198b.attach_kprobe(event="tcp_v6_connect", fn_name="trace_connect") 199b.attach_kprobe(event="tcp_rcv_state_process", 200 fn_name="trace_tcp_rcv_state_process") 201 202# event data 203TASK_COMM_LEN = 16 # linux/sched.h 204 205class Data_ipv4(ct.Structure): 206 _fields_ = [ 207 ("ts_us", ct.c_ulonglong), 208 ("pid", ct.c_uint), 209 ("saddr", ct.c_uint), 210 ("daddr", ct.c_uint), 211 ("ip", ct.c_ulonglong), 212 ("dport", ct.c_ushort), 213 ("delta_us", ct.c_ulonglong), 214 ("task", ct.c_char * TASK_COMM_LEN) 215 ] 216 217class Data_ipv6(ct.Structure): 218 _fields_ = [ 219 ("ts_us", ct.c_ulonglong), 220 ("pid", ct.c_uint), 221 ("saddr", (ct.c_ulonglong * 2)), 222 ("daddr", (ct.c_ulonglong * 2)), 223 ("ip", ct.c_ulonglong), 224 ("dport", ct.c_ushort), 225 ("delta_us", ct.c_ulonglong), 226 ("task", ct.c_char * TASK_COMM_LEN) 227 ] 228 229# process event 230start_ts = 0 231 232def print_ipv4_event(cpu, data, size): 233 event = ct.cast(data, ct.POINTER(Data_ipv4)).contents 234 global start_ts 235 if args.timestamp: 236 if start_ts == 0: 237 start_ts = event.ts_us 238 print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") 239 print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, 240 event.task.decode('utf-8', 'replace'), event.ip, 241 inet_ntop(AF_INET, pack("I", event.saddr)), 242 inet_ntop(AF_INET, pack("I", event.daddr)), event.dport, 243 float(event.delta_us) / 1000)) 244 245def print_ipv6_event(cpu, data, size): 246 event = ct.cast(data, ct.POINTER(Data_ipv6)).contents 247 global start_ts 248 if args.timestamp: 249 if start_ts == 0: 250 start_ts = event.ts_us 251 print("%-9.3f" % ((float(event.ts_us) - start_ts) / 1000000), end="") 252 print("%-6d %-12.12s %-2d %-16s %-16s %-5d %.2f" % (event.pid, 253 event.task.decode('utf-8', 'replace'), event.ip, 254 inet_ntop(AF_INET6, event.saddr), inet_ntop(AF_INET6, event.daddr), 255 event.dport, float(event.delta_us) / 1000)) 256 257# header 258if args.timestamp: 259 print("%-9s" % ("TIME(s)"), end="") 260print("%-6s %-12s %-2s %-16s %-16s %-5s %s" % ("PID", "COMM", "IP", "SADDR", 261 "DADDR", "DPORT", "LAT(ms)")) 262 263# read events 264b["ipv4_events"].open_perf_buffer(print_ipv4_event) 265b["ipv6_events"].open_perf_buffer(print_ipv6_event) 266while 1: 267 b.perf_buffer_poll() 268