1#!/usr/bin/python 2# @lint-avoid-python-3-compatibility-imports 3# 4# tcpcong Measure tcp congestion control status duration. 5# For Linux, uses BCC, eBPF. 6# 7# USAGE: tcpcong [-h] [-T] [-L] [-R] [-m] [-d] [interval] [outputs] 8# 9# Copyright (c) Ping Gan. 10# 11# 27-Jan-2022 Ping Gan Created this. 12 13from __future__ import print_function 14from bcc import BPF 15from time import sleep, strftime 16from struct import pack 17from socket import inet_ntop, AF_INET, AF_INET6 18from struct import pack 19import argparse 20 21examples = """examples: 22 ./tcpcong # show tcp congestion status duration 23 ./tcpcong 1 10 # show 1 second summaries, 10 times 24 ./tcpcong -L 3000-3006 1 # 1s summaries, local port 3000-3006 25 ./tcpcong -R 5000-5005 1 # 1s summaries, remote port 5000-5005 26 ./tcpcong -uT 1 # 1s summaries, microseconds, and timestamps 27 ./tcpcong -d # show the duration as histograms 28""" 29 30parser = argparse.ArgumentParser( 31 description="Summarize tcp socket congestion control status duration", 32 formatter_class=argparse.RawDescriptionHelpFormatter, 33 epilog=examples) 34parser.add_argument("-L", "--localport", 35 help="trace local ports only") 36parser.add_argument("-R", "--remoteport", 37 help="trace the dest ports only") 38parser.add_argument("-T", "--timestamp", action="store_true", 39 help="include timestamp on output") 40parser.add_argument("-d", "--dist", action="store_true", 41 help="show distributions as histograms") 42parser.add_argument("-u", "--microseconds", action="store_true", 43 help="output in microseconds") 44parser.add_argument("interval", nargs="?", default=99999999, 45 help="output interval, in seconds") 46parser.add_argument("outputs", nargs="?", default=99999999, 47 help="number of outputs") 48parser.add_argument("--ebpf", action="store_true", 49 help=argparse.SUPPRESS) 50args = parser.parse_args() 51countdown = int(args.outputs) 52debug = 0 53 54start_rport = end_rport = -1 55if args.remoteport: 56 rports = args.remoteport.split("-") 57 if (len(rports) != 2) and (len(rports) != 1): 58 print("unrecognized remote port range") 59 exit(1) 60 if len(rports) == 2: 61 start_rport = int(rports[0]) 62 end_rport = int(rports[1]) 63 else: 64 start_rport = int(rports[0]) 65 end_rport = int(rports[0]) 66if start_rport > end_rport: 67 tmp = start_rport 68 start_rport = end_rport 69 end_rport = tmp 70 71start_lport = end_lport = -1 72if args.localport: 73 lports = args.localport.split("-") 74 if (len(lports) != 2) and (len(lports) != 1): 75 print("unrecognized local port range") 76 exit(1) 77 if len(lports) == 2: 78 start_lport = int(lports[0]) 79 end_lport = int(lports[1]) 80 else: 81 start_lport = int(lports[0]) 82 end_lport = int(lports[0]) 83if start_lport > end_lport: 84 tmp = start_lport 85 start_lport = end_lport 86 end_lport = tmp 87 88# define BPF program 89bpf_text = """ 90#include <uapi/linux/ptrace.h> 91#include <net/sock.h> 92#include <bcc/proto.h> 93#include <net/tcp.h> 94#include <net/inet_connection_sock.h> 95 96typedef struct ipv4_flow_key { 97 u32 saddr; 98 u32 daddr; 99 u16 lport; 100 u16 dport; 101} ipv4_flow_key_t; 102 103typedef struct ipv6_flow_key { 104 unsigned __int128 saddr; 105 unsigned __int128 daddr; 106 u16 lport; 107 u16 dport; 108} ipv6_flow_key_t; 109 110typedef struct process_key { 111 char comm[TASK_COMM_LEN]; 112 u32 tid; 113} process_key_t; 114 115typedef struct ipv4_flow_val { 116 ipv4_flow_key_t ipv4_key; 117 u16 cong_state; 118} ipv4_flow_val_t; 119 120typedef struct ipv6_flow_val { 121 ipv6_flow_key_t ipv6_key; 122 u16 cong_state; 123} ipv6_flow_val_t; 124 125BPF_HASH(start_ipv4, process_key_t, ipv4_flow_val_t); 126BPF_HASH(start_ipv6, process_key_t, ipv6_flow_val_t); 127SOCK_STORE_DEF 128 129typedef struct data_val { 130 DEF_TEXT 131 u64 last_ts; 132 u16 last_cong_stat; 133} data_val_t; 134 135typedef struct cong { 136 u8 cong_stat:5, 137 ca_inited:1, 138 ca_setsockopt:1, 139 ca_dstlocked:1; 140} cong_status_t; 141 142BPF_HASH(ipv4_stat, ipv4_flow_key_t, data_val_t); 143BPF_HASH(ipv6_stat, ipv6_flow_key_t, data_val_t); 144 145HIST_TABLE 146 147static int entry_state_update_func(struct sock *sk) 148{ 149 u16 dport = 0, lport = 0; 150 u32 tid = bpf_get_current_pid_tgid(); 151 process_key_t key = {0}; 152 bpf_get_current_comm(&key.comm, sizeof(key.comm)); 153 key.tid = tid; 154 155 u64 family = sk->__sk_common.skc_family; 156 struct inet_connection_sock *icsk = inet_csk(sk); 157 cong_status_t cong_status; 158 bpf_probe_read_kernel(&cong_status, sizeof(cong_status), 159 (void *)((long)&icsk->icsk_retransmits) - 1); 160 if (family == AF_INET) { 161 ipv4_flow_val_t ipv4_val = {0}; 162 ipv4_val.ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr; 163 ipv4_val.ipv4_key.daddr = sk->__sk_common.skc_daddr; 164 ipv4_val.ipv4_key.lport = sk->__sk_common.skc_num; 165 dport = sk->__sk_common.skc_dport; 166 dport = ntohs(dport); 167 lport = ipv4_val.ipv4_key.lport; 168 FILTER_LPORT 169 FILTER_DPORT 170 ipv4_val.ipv4_key.dport = dport; 171 ipv4_val.cong_state = cong_status.cong_stat + 1; 172 start_ipv4.update(&key, &ipv4_val); 173 } else if (family == AF_INET6) { 174 ipv6_flow_val_t ipv6_val = {0}; 175 bpf_probe_read_kernel(&ipv6_val.ipv6_key.saddr, 176 sizeof(ipv6_val.ipv6_key.saddr), 177 &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32); 178 bpf_probe_read_kernel(&ipv6_val.ipv6_key.daddr, 179 sizeof(ipv6_val.ipv6_key.daddr), 180 &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32); 181 ipv6_val.ipv6_key.lport = sk->__sk_common.skc_num; 182 dport = sk->__sk_common.skc_dport; 183 dport = ntohs(dport); 184 lport = ipv6_val.ipv6_key.lport; 185 FILTER_LPORT 186 FILTER_DPORT 187 ipv6_val.ipv6_key.dport = dport; 188 ipv6_val.cong_state = cong_status.cong_stat + 1; 189 start_ipv6.update(&key, &ipv6_val); 190 } 191 SOCK_STORE_ADD 192 return 0; 193} 194 195static int ret_state_update_func(struct sock *sk) 196{ 197 u64 ts, ts1; 198 u16 family, last_cong_state; 199 u16 dport = 0, lport = 0; 200 u32 tid = bpf_get_current_pid_tgid(); 201 process_key_t key = {0}; 202 bpf_get_current_comm(&key.comm, sizeof(key.comm)); 203 key.tid = tid; 204 205 struct inet_connection_sock *icsk = inet_csk(sk); 206 cong_status_t cong_status; 207 bpf_probe_read_kernel(&cong_status, sizeof(cong_status), 208 (void *)((long)&icsk->icsk_retransmits) - 1); 209 data_val_t *datap, data = {0}; 210 STATE_KEY 211 bpf_probe_read_kernel(&family, sizeof(family), 212 &sk->__sk_common.skc_family); 213 if (family == AF_INET) { 214 ipv4_flow_val_t *val4 = start_ipv4.lookup(&key); 215 if (val4 == 0) { 216 SOCK_STORE_DEL 217 return 0; //missed 218 } 219 ipv4_flow_key_t keyv4 = {0}; 220 bpf_probe_read_kernel(&keyv4, sizeof(ipv4_flow_key_t), 221 &(val4->ipv4_key)); 222 dport = keyv4.dport; 223 lport = keyv4.lport; 224 FILTER_LPORT 225 FILTER_DPORT 226 datap = ipv4_stat.lookup(&keyv4); 227 if (datap == 0) { 228 data.last_ts = bpf_ktime_get_ns(); 229 data.last_cong_stat = val4->cong_state; 230 ipv4_stat.update(&keyv4, &data); 231 } else { 232 last_cong_state = val4->cong_state; 233 if ((cong_status.cong_stat + 1) != last_cong_state) { 234 ts1 = bpf_ktime_get_ns(); 235 ts = ts1 - datap->last_ts; 236 datap->last_ts = ts1; 237 datap->last_cong_stat = cong_status.cong_stat + 1; 238 ts /= 1000; 239 STORE 240 } 241 } 242 start_ipv4.delete(&key); 243 } else if (family == AF_INET6) { 244 ipv6_flow_val_t *val6 = start_ipv6.lookup(&key); 245 if (val6 == 0) { 246 SOCK_STORE_DEL 247 return 0; //missed 248 } 249 ipv6_flow_key_t keyv6 = {0}; 250 bpf_probe_read_kernel(&keyv6, sizeof(ipv6_flow_key_t), 251 &(val6->ipv6_key)); 252 dport = keyv6.dport; 253 lport = keyv6.lport; 254 FILTER_LPORT 255 FILTER_DPORT 256 datap = ipv6_stat.lookup(&keyv6); 257 if (datap == 0) { 258 data.last_ts = bpf_ktime_get_ns(); 259 data.last_cong_stat = val6->cong_state; 260 ipv6_stat.update(&keyv6, &data); 261 } else { 262 last_cong_state = val6->cong_state; 263 if ((cong_status.cong_stat + 1) != last_cong_state) { 264 ts1 = bpf_ktime_get_ns(); 265 ts = ts1 - datap->last_ts; 266 datap->last_ts = ts1; 267 datap->last_cong_stat = (cong_status.cong_stat + 1); 268 ts /= 1000; 269 STORE 270 } 271 } 272 start_ipv6.delete(&key); 273 } 274 SOCK_STORE_DEL 275 return 0; 276} 277""" 278 279kprobe_program = """ 280int entry_func(struct pt_regs *ctx, struct sock *sk) 281{ 282 return entry_state_update_func(sk); 283} 284 285int ret_func(struct pt_regs *ctx) 286{ 287 u32 tid = bpf_get_current_pid_tgid(); 288 process_key_t key = {0}; 289 bpf_get_current_comm(&key.comm, sizeof(key.comm)); 290 key.tid = tid; 291 struct sock **sockpp; 292 sockpp = sock_store.lookup(&key); 293 if (sockpp == 0) { 294 return 0; //miss the entry 295 } 296 struct sock *sk = *sockpp; 297 return ret_state_update_func(sk); 298} 299""" 300 301kfunc_program = """ 302KFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk) 303{ 304 return entry_state_update_func(sk); 305} 306 307KRETFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk) 308{ 309 return ret_state_update_func(sk); 310} 311 312KFUNC_PROBE(tcp_enter_cwr, struct sock *sk) 313{ 314 return entry_state_update_func(sk); 315} 316 317KRETFUNC_PROBE(tcp_enter_cwr, struct sock *sk) 318{ 319 return ret_state_update_func(sk); 320} 321 322KFUNC_PROBE(tcp_enter_loss, struct sock *sk) 323{ 324 return entry_state_update_func(sk); 325} 326 327KRETFUNC_PROBE(tcp_enter_loss, struct sock *sk) 328{ 329 return ret_state_update_func(sk); 330} 331 332KFUNC_PROBE(tcp_enter_recovery, struct sock *sk) 333{ 334 return entry_state_update_func(sk); 335} 336 337KRETFUNC_PROBE(tcp_enter_recovery, struct sock *sk) 338{ 339 return ret_state_update_func(sk); 340} 341 342KFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk) 343{ 344 return entry_state_update_func(sk); 345} 346 347KRETFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk) 348{ 349 return ret_state_update_func(sk); 350} 351""" 352 353# code replace 354is_support_kfunc = BPF.support_kfunc() 355if is_support_kfunc: 356 bpf_text += kfunc_program 357 bpf_text = bpf_text.replace('SOCK_STORE_DEF', '') 358 bpf_text = bpf_text.replace('SOCK_STORE_ADD', '') 359 bpf_text = bpf_text.replace('SOCK_STORE_DEL', '') 360else: 361 bpf_text += kprobe_program 362 bpf_text = bpf_text.replace('SOCK_STORE_DEF', 363 'BPF_HASH(sock_store, process_key_t, struct sock *);') 364 bpf_text = bpf_text.replace('SOCK_STORE_ADD', 365 'sock_store.update(&key, &sk);') 366 bpf_text = bpf_text.replace('SOCK_STORE_DEL', 367 'sock_store.delete(&key);') 368 369if args.localport: 370 bpf_text = bpf_text.replace('FILTER_LPORT', 371 'if (lport < %d || lport > %d) { return 0; }' 372 % (start_lport, end_lport)) 373else: 374 bpf_text = bpf_text.replace('FILTER_LPORT', '') 375 376if args.remoteport: 377 bpf_text = bpf_text.replace('FILTER_DPORT', 378 'if (dport < %d || dport > %d) { return 0; }' 379 % (start_rport, end_rport)) 380else: 381 bpf_text = bpf_text.replace('FILTER_DPORT', '') 382 383table_def_text = """ 384 u64 open_dura; 385 u64 loss_dura; 386 u64 disorder_dura; 387 u64 recover_dura; 388 u64 cwr_dura; 389 u64 total_changes; 390""" 391 392store_text = """ 393 datap->total_changes += 1; 394 if (last_cong_state == (TCP_CA_Open + 1)) { 395 datap->open_dura += ts; 396 } else if (last_cong_state == (TCP_CA_Disorder + 1)) { 397 datap->disorder_dura += ts; 398 } else if (last_cong_state == (TCP_CA_CWR + 1)) { 399 datap->cwr_dura += ts; 400 } else if (last_cong_state == (TCP_CA_Recovery + 1)) { 401 datap->recover_dura += ts; 402 } else if (last_cong_state == (TCP_CA_Loss + 1)) { 403 datap->loss_dura += ts; 404 } 405""" 406 407store_dist_text = """ 408 if (last_cong_state == (TCP_CA_Open + 1)) { 409 key_s.state = TCP_CA_Open; 410 } else if (last_cong_state == (TCP_CA_Disorder + 1)) { 411 key_s.state = TCP_CA_Disorder; 412 } else if (last_cong_state == (TCP_CA_CWR + 1)) { 413 key_s.state = TCP_CA_CWR; 414 } else if (last_cong_state == (TCP_CA_Recovery + 1)) { 415 key_s.state = TCP_CA_Recovery; 416 } else if (last_cong_state == (TCP_CA_Loss + 1)) { 417 key_s.state = TCP_CA_Loss; 418 } 419 TIME_UNIT 420 key_s.slot = bpf_log2l(ts); 421 dist.atomic_increment(key_s); 422""" 423 424hist_table_text = """ 425typedef struct congest_state_key { 426 u32 state; 427 u64 slot; 428}congest_state_key_t; 429 430BPF_HISTOGRAM(dist, congest_state_key_t); 431""" 432 433if args.dist: 434 bpf_text = bpf_text.replace('DEF_TEXT', '') 435 bpf_text = bpf_text.replace('STORE', store_dist_text) 436 bpf_text = bpf_text.replace('STATE_KEY', 437 'congest_state_key_t key_s = {0};') 438 bpf_text = bpf_text.replace('HIST_TABLE', hist_table_text) 439 if args.microseconds: 440 bpf_text = bpf_text.replace('TIME_UNIT', '') 441 else: 442 bpf_text = bpf_text.replace('TIME_UNIT', 'ts /= 1000;') 443else: 444 bpf_text = bpf_text.replace('DEF_TEXT', table_def_text) 445 bpf_text = bpf_text.replace('STORE', store_text) 446 bpf_text = bpf_text.replace('STATE_KEY', '') 447 bpf_text = bpf_text.replace('HIST_TABLE', '') 448 449 450if debug or args.ebpf: 451 print(bpf_text) 452 if args.ebpf: 453 exit() 454 455# load BPF program 456b = BPF(text=bpf_text) 457 458if not is_support_kfunc: 459 # all the tcp congestion control status update functions 460 # are called by below 5 functions. 461 b.attach_kprobe(event="tcp_fastretrans_alert", fn_name="entry_func") 462 b.attach_kretprobe(event="tcp_fastretrans_alert", fn_name="ret_func") 463 b.attach_kprobe(event="tcp_enter_cwr", fn_name="entry_func") 464 b.attach_kretprobe(event="tcp_enter_cwr", fn_name="ret_func") 465 b.attach_kprobe(event="tcp_process_tlp_ack", fn_name="entry_func") 466 b.attach_kretprobe(event="tcp_process_tlp_ack", fn_name="ret_func") 467 b.attach_kprobe(event="tcp_enter_loss", fn_name="entry_func") 468 b.attach_kretprobe(event="tcp_enter_loss", fn_name="ret_func") 469 b.attach_kprobe(event="tcp_enter_recovery", fn_name="entry_func") 470 b.attach_kretprobe(event="tcp_enter_recovery", fn_name="ret_func") 471 472print("Tracing tcp congestion control status duration... Hit Ctrl-C to end.") 473 474 475def cong_state_to_name(state): 476 # this need to match with kernel state 477 state_name = ["open", "disorder", "cwr", "recovery", "loss"] 478 return state_name[state] 479 480# output 481exiting = 0 if args.interval else 1 482ipv6_stat = b.get_table("ipv6_stat") 483ipv4_stat = b.get_table("ipv4_stat") 484if args.dist: 485 dist = b.get_table("dist") 486label = "ms" 487if args.microseconds: 488 label = "us" 489while (1): 490 try: 491 sleep(int(args.interval)) 492 except KeyboardInterrupt: 493 exiting = 1 494 495 print() 496 if args.timestamp: 497 print("%-8s\n" % strftime("%H:%M:%S"), end="") 498 if args.dist: 499 if args.microseconds: 500 dist.print_log2_hist("usecs", "tcp_congest_state", 501 section_print_fn=cong_state_to_name) 502 else: 503 dist.print_log2_hist("msecs", "tcp_congest_state", 504 section_print_fn=cong_state_to_name) 505 dist.clear() 506 else: 507 if ipv4_stat: 508 print("%-21s% -21s %-7s %-6s %-7s %-7s %-6s %-5s" % ("LAddrPort", 509 "RAddrPort", "Open_" + label, "Dod_" + label, 510 "Rcov_" + label, "Cwr_" + label, "Los_" + label, "Chgs")) 511 laddr = "" 512 raddr = "" 513 for k, v in sorted(ipv4_stat.items(), key=lambda ipv4_stat: ipv4_stat[0].lport): 514 laddr = inet_ntop(AF_INET, pack("I", k.saddr)) 515 raddr = inet_ntop(AF_INET, pack("I", k.daddr)) 516 open_dura = v.open_dura 517 disorder_dura = v.disorder_dura 518 recover_dura = v.recover_dura 519 cwr_dura = v.cwr_dura 520 loss_dura = v.loss_dura 521 if not args.microseconds: 522 open_dura /= 1000 523 disorder_dura /= 1000 524 recover_dura /= 1000 525 cwr_dura /= 1000 526 loss_dura /= 1000 527 if v.total_changes != 0: 528 print("%-21s %-21s %-7d %-6d %-7d %-7d %-6d %-5d" % (laddr + 529 "/" + str(k.lport), raddr + "/" + str(k.dport), open_dura, 530 disorder_dura, recover_dura, cwr_dura, loss_dura, 531 v.total_changes)) 532 if ipv6_stat: 533 print("%-32s %-32s %-7s %-6s %-7s %-7s %-6s %-5s" % ("LAddrPort6", 534 "RAddrPort6", "Open_" + label, "Dod_" + label, "Rcov_" + label, 535 "Cwr_" + label, "Los_" + label, "Chgs")) 536 for k, v in sorted(ipv6_stat.items(), key=lambda ipv6_stat: ipv6_stat[0].lport): 537 laddr = inet_ntop(AF_INET6, bytes(k.saddr)) 538 raddr = inet_ntop(AF_INET6, bytes(k.daddr)) 539 open_dura = v.open_dura 540 disorder_dura = v.disorder_dura 541 recover_dura = v.recover_dura 542 cwr_dura = v.cwr_dura 543 loss_dura = v.loss_dura 544 if not args.microseconds: 545 open_dura /= 1000 546 disorder_dura /= 1000 547 recover_dura /= 1000 548 cwr_dura /= 1000 549 loss_dura /= 1000 550 if v.total_changes != 0: 551 print("%-32s %-32s %-7d %-7d %-7d %-6d %-6d %-5d" % (laddr + 552 "/" + str(k.lport), raddr + "/" + str(k.dport), open_dura, 553 disorder_dura, recover_dura, cwr_dura, loss_dura, 554 v.total_changes)) 555 ipv4_stat.clear() 556 ipv6_stat.clear() 557 countdown -= 1 558 if exiting or countdown == 0: 559 exit() 560