1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# tcplife Trace the lifespan of TCP sessions and summarize. 5# For Linux, uses BCC, BPF. Embedded C. 6# 7# USAGE: tcplife [-h] [-C] [-S] [-p PID] [interval [count]] 8# 9# This uses the sock:inet_sock_set_state tracepoint if it exists (added to 10# Linux 4.16, and replacing the earlier tcp:tcp_set_state), else it uses 11# kernel dynamic tracing of tcp_set_state(). 12# 13# While throughput counters are emitted, they are fetched in a low-overhead 14# manner: reading members of the tcp_info struct on TCP close. ie, we do not 15# trace send/receive. 16# 17# Copyright 2016 Netflix, Inc. 18# Licensed under the Apache License, Version 2.0 (the "License") 19# 20# IDEA: Julia Evans 21# 22# 18-Oct-2016 Brendan Gregg Created this. 23# 29-Dec-2017 " " Added tracepoint support. 24 25from __future__ import print_function 26from bcc import BPF 27import argparse 28from socket import inet_ntop, ntohs, AF_INET, AF_INET6 29from struct import pack 30import ctypes as ct 31from time import strftime 32 33# arguments 34examples = """examples: 35 ./tcplife # trace all TCP connect()s 36 ./tcplife -t # include time column (HH:MM:SS) 37 ./tcplife -w # wider colums (fit IPv6) 38 ./tcplife -stT # csv output, with times & timestamps 39 ./tcplife -p 181 # only trace PID 181 40 ./tcplife -L 80 # only trace local port 80 41 ./tcplife -L 80,81 # only trace local ports 80 and 81 42 ./tcplife -D 80 # only trace remote port 80 43""" 44parser = argparse.ArgumentParser( 45 description="Trace the lifespan of TCP sessions and summarize", 46 formatter_class=argparse.RawDescriptionHelpFormatter, 47 epilog=examples) 48parser.add_argument("-T", "--time", action="store_true", 49 help="include time column on output (HH:MM:SS)") 50parser.add_argument("-t", "--timestamp", action="store_true", 51 help="include timestamp on output (seconds)") 52parser.add_argument("-w", "--wide", action="store_true", 53 help="wide column output (fits IPv6 
addresses)") 54parser.add_argument("-s", "--csv", action="store_true", 55 help="comma separated values output") 56parser.add_argument("-p", "--pid", 57 help="trace this PID only") 58parser.add_argument("-L", "--localport", 59 help="comma-separated list of local ports to trace.") 60parser.add_argument("-D", "--remoteport", 61 help="comma-separated list of remote ports to trace.") 62parser.add_argument("--ebpf", action="store_true", 63 help=argparse.SUPPRESS) 64args = parser.parse_args() 65debug = 0 66 67# define BPF program 68bpf_text = """ 69#include <uapi/linux/ptrace.h> 70#define KBUILD_MODNAME "foo" 71#include <linux/tcp.h> 72#include <net/sock.h> 73#include <bcc/proto.h> 74 75BPF_HASH(birth, struct sock *, u64); 76 77// separate data structs for ipv4 and ipv6 78struct ipv4_data_t { 79 u64 ts_us; 80 u32 pid; 81 u32 saddr; 82 u32 daddr; 83 u64 ports; 84 u64 rx_b; 85 u64 tx_b; 86 u64 span_us; 87 char task[TASK_COMM_LEN]; 88}; 89BPF_PERF_OUTPUT(ipv4_events); 90 91struct ipv6_data_t { 92 u64 ts_us; 93 u32 pid; 94 unsigned __int128 saddr; 95 unsigned __int128 daddr; 96 u64 ports; 97 u64 rx_b; 98 u64 tx_b; 99 u64 span_us; 100 char task[TASK_COMM_LEN]; 101}; 102BPF_PERF_OUTPUT(ipv6_events); 103 104struct id_t { 105 u32 pid; 106 char task[TASK_COMM_LEN]; 107}; 108BPF_HASH(whoami, struct sock *, struct id_t); 109""" 110 111# 112# XXX: The following is temporary code for older kernels, Linux 4.14 and 113# older. It uses kprobes to instrument tcp_set_state(). On Linux 4.16 and 114# later, the sock:inet_sock_set_state tracepoint should be used instead, as 115# is done by the code that follows this. In the distant future (2021?), this 116# kprobe code can be removed. This is why there is so much code 117# duplication: to make removal easier. 
#
bpf_text_kprobe = """
int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    dport = ntohs(dport);
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = dport + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

bpf_text_tracepoint = """
TRACEPOINT_PROBE(sock, inet_sock_set_state)
{
    if (args->protocol != IPPROTO_TCP)
        return 0;

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    // sk is mostly used as a UUID, and for two tcp stats:
    struct sock *sk = (struct sock *)args->skaddr;

    // lport is either used in a filter here, or later
    u16 lport = args->sport;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = args->dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (args->newstate < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT or LAST_ACK
    if (args->newstate == TCP_SYN_SENT || args->newstate == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (args->newstate != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    if (args->family == AF_INET) {
        struct ipv4_data_t data4 = {};
        data4.span_us = delta_us;
        data4.rx_b = rx_b;
        data4.tx_b = tx_b;
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
        // a workaround until data4 compiles with separate lport/dport
        data4.ports = dport + ((0ULL + lport) << 32);
        data4.pid = pid;

        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(args, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {};
        data6.span_us = delta_us;
        data6.rx_b = rx_b;
        data6.tx_b = tx_b;
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = dport + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(args, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
"""

# Prefer the tracepoint when the running kernel provides it; otherwise fall
# back to the tcp_set_state() kprobe.
if (BPF.tracepoint_exists("sock", "inet_sock_set_state")):
    bpf_text += bpf_text_tracepoint
else:
    bpf_text += bpf_text_kprobe

# code substitutions
# Each FILTER_* placeholder in the C text is replaced by an early-return
# test when the matching option was given, and by nothing otherwise.
if args.pid:
    # int() validates the value before it is spliced into C source, the same
    # way the port lists below are validated; non-numeric input raises
    # ValueError here instead of producing a broken BPF program.
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %d) { return 0; }' % int(args.pid))
if args.remoteport:
    dports = [int(dport) for dport in args.remoteport.split(',')]
    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
    bpf_text = bpf_text.replace('FILTER_DPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % dports_if)
if args.localport:
    lports = [int(lport) for lport in args.localport.split(',')]
    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
    bpf_text = bpf_text.replace('FILTER_LPORT',
        'if (%s) { birth.delete(&sk); return 0; }' % lports_if)
# drop any placeholders that were not substituted above
bpf_text = bpf_text.replace('FILTER_PID', '')
bpf_text = bpf_text.replace('FILTER_DPORT', '')
bpf_text = bpf_text.replace('FILTER_LPORT', '')

if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

# event data
# The ctypes structures below mirror struct ipv4_data_t / ipv6_data_t from
# the BPF program field for field, so perf buffer records can be cast to them.
TASK_COMM_LEN = 16      # linux/sched.h

class Data_ipv4(ct.Structure):
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_uint),
        ("saddr", ct.c_uint),
        ("daddr", ct.c_uint),
        ("ports", ct.c_ulonglong),
        ("rx_b", ct.c_ulonglong),
        ("tx_b", ct.c_ulonglong),
        ("span_us", ct.c_ulonglong),
        ("task", ct.c_char * TASK_COMM_LEN)
    ]

class Data_ipv6(ct.Structure):
    _fields_ = [
        ("ts_us", ct.c_ulonglong),
        ("pid", ct.c_uint),
        ("saddr", (ct.c_ulonglong * 2)),
        ("daddr", (ct.c_ulonglong * 2)),
        ("ports", ct.c_ulonglong),
        ("rx_b", ct.c_ulonglong),
        ("tx_b", ct.c_ulonglong),
        ("span_us", ct.c_ulonglong),
        ("task", ct.c_char * TASK_COMM_LEN)
    ]

#
# Setup output formats
#
# Don't change the default output (next 2 lines): this fits in 80 chars. I
# know it doesn't have NS or UIDs etc. I know. If you really, really, really
# need to add columns, columns that solve real actual problems, I'd start by
# adding an extended mode (-x) to include those columns.
#
header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
if args.wide:
    header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
if args.csv:
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"

# process event
def _print_time_columns(ts_us):
    """Print the optional TIME and TIME(s) prefix columns for one event.

    ts_us is the event's ts_us field. The first event seen becomes the zero
    point (start_ts) for the relative TIME(s) column.
    """
    global start_ts
    if args.time:
        if args.csv:
            print("%s," % strftime("%H:%M:%S"), end="")
        else:
            print("%-8s " % strftime("%H:%M:%S"), end="")
    if args.timestamp:
        if start_ts == 0:
            start_ts = ts_us
        delta_s = (float(ts_us) - start_ts) / 1000000
        if args.csv:
            print("%.6f," % delta_s, end="")
        else:
            print("%-9.6f " % delta_s, end="")

def _print_event(event, ip_version, laddr, raddr):
    """Print one output row for an already-decoded event.

    ip_version is the "4"/"6" marker shown only in wide and csv modes;
    laddr and raddr are the printable local and remote addresses.
    """
    _print_time_columns(event.ts_us)
    # ports packs lport in the upper 32 bits and dport in the lower 32 bits.
    # Floor division keeps the KB columns integral on Python 3 as well.
    print(format_string % (event.pid, event.task.decode('utf-8', 'replace'),
        ip_version if args.wide or args.csv else "",
        laddr, event.ports >> 32,
        raddr, event.ports & 0xffffffff,
        event.tx_b // 1024, event.rx_b // 1024, float(event.span_us) / 1000))

def print_ipv4_event(cpu, data, size):
    """Perf buffer callback for ipv4_events: decode and print one session.

    data is cast to a Data_ipv4 record; cpu and size are unused.
    """
    event = ct.cast(data, ct.POINTER(Data_ipv4)).contents
    _print_event(event, "4",
        inet_ntop(AF_INET, pack("I", event.saddr)),
        inet_ntop(AF_INET, pack("I", event.daddr)))

def print_ipv6_event(cpu, data, size):
    """Perf buffer callback for ipv6_events: decode and print one session.

    data is cast to a Data_ipv6 record; cpu and size are unused.
    """
    event = ct.cast(data, ct.POINTER(Data_ipv6)).contents
    _print_event(event, "6",
        inet_ntop(AF_INET6, event.saddr),
        inet_ntop(AF_INET6, event.daddr))

# initialize BPF
b = BPF(text=bpf_text)

# header
if args.time:
    if args.csv:
        print("%s," % ("TIME"), end="")
    else:
        print("%-8s " % ("TIME"), end="")
if args.timestamp:
    if args.csv:
        print("%s," % ("TIME(s)"), end="")
    else:
        print("%-9s " % ("TIME(s)"), end="")
print(header_string % ("PID", "COMM",
    "IP" if args.wide or args.csv else "", "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))

start_ts = 0

# read events
b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
while 1:
    try:
        b.perf_buffer_poll()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop tracing; exit without a traceback
        exit()