• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# @lint-avoid-python-3-compatibility-imports
3#
4# tcpcong  Measure tcp congestion control status duration.
5#           For Linux, uses BCC, eBPF.
6#
# USAGE: tcpcong [-h] [-T] [-L] [-R] [-u] [-d] [interval] [outputs]
8#
9# Copyright (c) Ping Gan.
10#
11# 27-Jan-2022   Ping Gan   Created this.
12
13from __future__ import print_function
14from bcc import BPF
15from time import sleep, strftime
16from struct import pack
17from socket import inet_ntop, AF_INET, AF_INET6
18from struct import pack
19import argparse
20
# Usage examples shown verbatim after the option list in --help output.
examples = """examples:
    ./tcpcong                 # show tcp congestion status duration
    ./tcpcong 1 10            # show 1 second summaries, 10 times
    ./tcpcong -L 3000-3006 1  # 1s summaries, local port 3000-3006
    ./tcpcong -R 5000-5005 1  # 1s summaries, remote port 5000-5005
    ./tcpcong -uT 1           # 1s summaries, microseconds, and timestamps
    ./tcpcong -d              # show the duration as histograms
"""

parser = argparse.ArgumentParser(
    epilog=examples,
    formatter_class=argparse.RawDescriptionHelpFormatter,
    description="Summarize tcp socket congestion control status duration")

# Port filters; each takes a single port or a "start-end" range.
for short_flag, long_flag, help_text in (
        ("-L", "--localport", "trace local ports only"),
        ("-R", "--remoteport", "trace the dest ports only")):
    parser.add_argument(short_flag, long_flag, help=help_text)

# Boolean switches controlling the output format.
for short_flag, long_flag, help_text in (
        ("-T", "--timestamp", "include timestamp on output"),
        ("-d", "--dist", "show distributions as histograms"),
        ("-u", "--microseconds", "output in microseconds")):
    parser.add_argument(short_flag, long_flag, action="store_true",
                        help=help_text)

# Optional positionals; both share the same very large default.
for positional, help_text in (
        ("interval", "output interval, in seconds"),
        ("outputs", "number of outputs")):
    parser.add_argument(positional, nargs="?", default=99999999,
                        help=help_text)

# Hidden from --help: print the generated BPF C text and exit.
parser.add_argument("--ebpf", help=argparse.SUPPRESS, action="store_true")

args = parser.parse_args()
# Number of reports still to print before the tool exits.
countdown = int(args.outputs)
# Set to a true value to always print the generated BPF C text.
debug = 0
53
def _parse_port_range(spec, side):
    """Turn a ``PORT`` or ``START-END`` option value into an ordered pair.

    spec -- the raw option string, or None/"" when the filter was not given
    side -- "local" or "remote"; only used in the error message

    Returns ``(start, end)`` with ``start <= end`` (a reversed range is
    swapped), or ``(-1, -1)`` when no filter was requested.  On a malformed
    value (wrong number of fields, non-numeric field, or a port outside
    0-65535) the "unrecognized ... port range" message is printed and the
    process exits with status 1 instead of dying with a traceback.
    """
    if not spec:
        return -1, -1
    fields = spec.split("-")
    try:
        if len(fields) not in (1, 2):
            raise ValueError(spec)
        ports = [int(field) for field in fields]
        # Ports are compared against 16-bit values in the BPF program, so
        # anything outside this range could never match.
        if not all(0 <= port <= 65535 for port in ports):
            raise ValueError(spec)
    except ValueError:
        print("unrecognized %s port range" % side)
        exit(1)
    return min(ports), max(ports)


# Inclusive port bounds later substituted into the BPF filter code.
start_rport, end_rport = _parse_port_range(args.remoteport, "remote")
start_lport, end_lport = _parse_port_range(args.localport, "local")
87
# define BPF program
#
# C source template for the in-kernel part of the tool.  The bare upper-case
# tokens (SOCK_STORE_DEF, SOCK_STORE_ADD, SOCK_STORE_DEL, DEF_TEXT,
# HIST_TABLE, STATE_KEY, STORE, FILTER_LPORT, FILTER_DPORT) are not C: they
# are placeholders that this script replaces with str.replace() before the
# text is handed to BPF().
#
# entry_state_update_func() runs when a traced function is entered.  It
#   records, keyed by (comm, tid), the flow's addresses/ports together with
#   the congestion state read at that moment (stored as cong_stat + 1) in
#   start_ipv4 / start_ipv6.
# ret_state_update_func() runs when the traced function returns.  It looks
#   the entry record up again; the first time a flow is seen it only creates
#   the ipv4_stat / ipv6_stat entry with the current timestamp.  Afterwards,
#   when the state read at return differs from the one recorded at entry, it
#   computes the time since last_ts, divides it by 1000 and runs the STORE
#   code with that value.
#
# Both functions fill cong_status by reading the single byte located
# immediately before icsk->icsk_retransmits.
# NOTE(review): presumably that byte is where the kernel keeps its
# congestion-state bitfield, mirrored by cong_status_t - confirm against
# struct inet_connection_sock for the target kernel.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>
#include <net/tcp.h>
#include <net/inet_connection_sock.h>

typedef struct ipv4_flow_key {
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
} ipv4_flow_key_t;

typedef struct ipv6_flow_key {
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u16 lport;
    u16 dport;
} ipv6_flow_key_t;

typedef struct process_key {
    char comm[TASK_COMM_LEN];
    u32  tid;
} process_key_t;

typedef struct ipv4_flow_val {
    ipv4_flow_key_t ipv4_key;
    u16  cong_state;
} ipv4_flow_val_t;

typedef struct ipv6_flow_val {
    ipv6_flow_key_t ipv6_key;
    u16  cong_state;
} ipv6_flow_val_t;

BPF_HASH(start_ipv4, process_key_t, ipv4_flow_val_t);
BPF_HASH(start_ipv6, process_key_t, ipv6_flow_val_t);
SOCK_STORE_DEF

typedef struct data_val {
    DEF_TEXT
    u64  last_ts;
    u16  last_cong_stat;
} data_val_t;

typedef struct cong {
    u8  cong_stat:5,
        ca_inited:1,
        ca_setsockopt:1,
        ca_dstlocked:1;
} cong_status_t;

BPF_HASH(ipv4_stat, ipv4_flow_key_t, data_val_t);
BPF_HASH(ipv6_stat, ipv6_flow_key_t, data_val_t);

HIST_TABLE

static int entry_state_update_func(struct sock *sk)
{
    u16 dport = 0, lport = 0;
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;

    u64 family = sk->__sk_common.skc_family;
    struct inet_connection_sock *icsk = inet_csk(sk);
    cong_status_t cong_status;
    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
        (void *)((long)&icsk->icsk_retransmits) - 1);
    if (family == AF_INET) {
        ipv4_flow_val_t ipv4_val = {0};
        ipv4_val.ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_val.ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_val.ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        dport = ntohs(dport);
        lport = ipv4_val.ipv4_key.lport;
        FILTER_LPORT
        FILTER_DPORT
        ipv4_val.ipv4_key.dport = dport;
        ipv4_val.cong_state = cong_status.cong_stat + 1;
        start_ipv4.update(&key, &ipv4_val);
    } else if (family == AF_INET6) {
        ipv6_flow_val_t ipv6_val = {0};
        bpf_probe_read_kernel(&ipv6_val.ipv6_key.saddr,
            sizeof(ipv6_val.ipv6_key.saddr),
            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&ipv6_val.ipv6_key.daddr,
            sizeof(ipv6_val.ipv6_key.daddr),
            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        ipv6_val.ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        dport = ntohs(dport);
        lport = ipv6_val.ipv6_key.lport;
        FILTER_LPORT
        FILTER_DPORT
        ipv6_val.ipv6_key.dport = dport;
        ipv6_val.cong_state = cong_status.cong_stat + 1;
        start_ipv6.update(&key, &ipv6_val);
    }
    SOCK_STORE_ADD
    return 0;
}

static int ret_state_update_func(struct sock *sk)
{
    u64 ts, ts1;
    u16 family, last_cong_state;
    u16 dport = 0, lport = 0;
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;

    struct inet_connection_sock *icsk = inet_csk(sk);
    cong_status_t cong_status;
    bpf_probe_read_kernel(&cong_status, sizeof(cong_status),
        (void *)((long)&icsk->icsk_retransmits) - 1);
    data_val_t *datap, data = {0};
    STATE_KEY
    bpf_probe_read_kernel(&family, sizeof(family),
        &sk->__sk_common.skc_family);
    if (family == AF_INET) {
        ipv4_flow_val_t *val4 = start_ipv4.lookup(&key);
        if (val4 == 0) {
            SOCK_STORE_DEL
            return 0; //missed
        }
        ipv4_flow_key_t keyv4 = {0};
        bpf_probe_read_kernel(&keyv4, sizeof(ipv4_flow_key_t),
            &(val4->ipv4_key));
        dport = keyv4.dport;
        lport = keyv4.lport;
        FILTER_LPORT
        FILTER_DPORT
        datap = ipv4_stat.lookup(&keyv4);
        if (datap == 0) {
            data.last_ts = bpf_ktime_get_ns();
            data.last_cong_stat = val4->cong_state;
            ipv4_stat.update(&keyv4, &data);
        } else {
            last_cong_state = val4->cong_state;
            if ((cong_status.cong_stat + 1) != last_cong_state) {
                ts1 = bpf_ktime_get_ns();
                ts = ts1 - datap->last_ts;
                datap->last_ts = ts1;
                datap->last_cong_stat = cong_status.cong_stat + 1;
                ts /= 1000;
                STORE
            }
        }
        start_ipv4.delete(&key);
    } else if (family == AF_INET6) {
        ipv6_flow_val_t *val6 = start_ipv6.lookup(&key);
        if (val6 == 0) {
            SOCK_STORE_DEL
            return 0; //missed
        }
        ipv6_flow_key_t keyv6 = {0};
        bpf_probe_read_kernel(&keyv6, sizeof(ipv6_flow_key_t),
            &(val6->ipv6_key));
        dport = keyv6.dport;
        lport = keyv6.lport;
        FILTER_LPORT
        FILTER_DPORT
        datap = ipv6_stat.lookup(&keyv6);
        if (datap == 0) {
            data.last_ts = bpf_ktime_get_ns();
            data.last_cong_stat = val6->cong_state;
            ipv6_stat.update(&keyv6, &data);
        } else {
            last_cong_state = val6->cong_state;
            if ((cong_status.cong_stat + 1) != last_cong_state) {
                ts1 = bpf_ktime_get_ns();
                ts = ts1 - datap->last_ts;
                datap->last_ts = ts1;
                datap->last_cong_stat = (cong_status.cong_stat + 1);
                ts /= 1000;
                STORE
            }
        }
        start_ipv6.delete(&key);
    }
    SOCK_STORE_DEL
    return 0;
}
"""
278
# C entry points appended to bpf_text when kfunc probes are not supported.
# entry_func() gets the socket as the traced function's first argument and
# forwards it to entry_state_update_func().  ret_func() has no socket
# argument, so it fetches the pointer from the sock_store hash, looked up
# under the same (comm, tid) key, and returns early when no entry is found.
kprobe_program = """
int entry_func(struct pt_regs *ctx, struct sock *sk)
{
    return entry_state_update_func(sk);
}

int ret_func(struct pt_regs *ctx)
{
    u32 tid = bpf_get_current_pid_tgid();
    process_key_t key = {0};
    bpf_get_current_comm(&key.comm, sizeof(key.comm));
    key.tid = tid;
    struct sock **sockpp;
    sockpp = sock_store.lookup(&key);
    if (sockpp == 0) {
        return 0; //miss the entry
    }
    struct sock *sk = *sockpp;
    return ret_state_update_func(sk);
}
"""
300
# C entry points appended to bpf_text when BPF.support_kfunc() is true.
# Each of the five traced kernel functions gets a KFUNC_PROBE that calls
# entry_state_update_func(sk) and a KRETFUNC_PROBE that calls
# ret_state_update_func(sk); both probe kinds are declared with the socket
# argument, so no separate socket hand-off table is used on this path.
kfunc_program = """
KFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(tcp_fastretrans_alert, struct sock *sk)
{
    return ret_state_update_func(sk);
}

KFUNC_PROBE(tcp_enter_cwr, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(tcp_enter_cwr, struct sock *sk)
{
    return ret_state_update_func(sk);
}

KFUNC_PROBE(tcp_enter_loss, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(tcp_enter_loss, struct sock *sk)
{
    return ret_state_update_func(sk);
}

KFUNC_PROBE(tcp_enter_recovery, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(tcp_enter_recovery, struct sock *sk)
{
    return ret_state_update_func(sk);
}

KFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk)
{
    return entry_state_update_func(sk);
}

KRETFUNC_PROBE(tcp_process_tlp_ack, struct sock *sk)
{
    return ret_state_update_func(sk);
}
"""
352
# Specialise the BPF template: pick the probe flavour, then fill in the
# socket hand-off and port-filter placeholders.
is_support_kfunc = BPF.support_kfunc()
if is_support_kfunc:
    # The kfunc return probes are declared with the socket argument, so the
    # socket hand-off table and its add/delete calls are compiled out.
    bpf_text += kfunc_program
    sock_store_code = {
        'SOCK_STORE_DEF': '',
        'SOCK_STORE_ADD': '',
        'SOCK_STORE_DEL': '',
    }
else:
    # The kretprobe handler looks the socket up in sock_store, so the entry
    # side has to save it there and the return side has to drop it again.
    bpf_text += kprobe_program
    sock_store_code = {
        'SOCK_STORE_DEF':
            'BPF_HASH(sock_store, process_key_t, struct sock *);',
        'SOCK_STORE_ADD': 'sock_store.update(&key, &sk);',
        'SOCK_STORE_DEL': 'sock_store.delete(&key);',
    }
for placeholder in ('SOCK_STORE_DEF', 'SOCK_STORE_ADD', 'SOCK_STORE_DEL'):
    bpf_text = bpf_text.replace(placeholder, sock_store_code[placeholder])

# An absent filter leaves the placeholder empty, i.e. no check at all.
lport_filter = ''
if args.localport:
    lport_filter = ('if (lport < %d || lport > %d) { return 0; }'
                    % (start_lport, end_lport))
bpf_text = bpf_text.replace('FILTER_LPORT', lport_filter)

dport_filter = ''
if args.remoteport:
    dport_filter = ('if (dport < %d || dport > %d) { return 0; }'
                    % (start_rport, end_rport))
bpf_text = bpf_text.replace('FILTER_DPORT', dport_filter)
382
# C fragments spliced into bpf_text depending on whether -d/--dist is given.

# Substituted for DEF_TEXT in summary (non -d) mode: the per-flow counters
# that the output loop later reads from ipv4_stat / ipv6_stat.
table_def_text = """
    u64  open_dura;
    u64  loss_dura;
    u64  disorder_dura;
    u64  recover_dura;
    u64  cwr_dura;
    u64  total_changes;
"""

# Substituted for STORE in summary mode: counts the state change and adds
# the elapsed time `ts` to the counter of the state recorded at entry.
# States are compared as TCP_CA_* + 1 because that is how they were stored.
store_text = """
                datap->total_changes += 1;
                if (last_cong_state == (TCP_CA_Open + 1)) {
                    datap->open_dura += ts;
                } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
                    datap->disorder_dura += ts;
                } else if (last_cong_state == (TCP_CA_CWR + 1)) {
                    datap->cwr_dura += ts;
                } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
                    datap->recover_dura += ts;
                } else if (last_cong_state == (TCP_CA_Loss + 1)) {
                    datap->loss_dura += ts;
                }
"""

# Substituted for STORE in -d mode: maps the recorded state back to its
# TCP_CA_* value and increments the log2 bucket of `ts` for that state.
# TIME_UNIT is itself a placeholder, replaced after this text is spliced in
# (empty with -u, otherwise a further division of `ts` by 1000).
store_dist_text = """
                if (last_cong_state == (TCP_CA_Open + 1)) {
                    key_s.state = TCP_CA_Open;
                } else if (last_cong_state == (TCP_CA_Disorder + 1)) {
                    key_s.state = TCP_CA_Disorder;
                } else if (last_cong_state == (TCP_CA_CWR + 1)) {
                    key_s.state = TCP_CA_CWR;
                } else if (last_cong_state == (TCP_CA_Recovery + 1)) {
                    key_s.state = TCP_CA_Recovery;
                } else if (last_cong_state == (TCP_CA_Loss + 1)) {
                    key_s.state = TCP_CA_Loss;
                }
                TIME_UNIT
                key_s.slot = bpf_log2l(ts);
                dist.atomic_increment(key_s);
"""

# Substituted for HIST_TABLE in -d mode: the histogram key type and the
# `dist` table that store_dist_text increments.
hist_table_text = """
typedef struct congest_state_key {
    u32  state;
    u64  slot;
}congest_state_key_t;

BPF_HISTOGRAM(dist, congest_state_key_t);
"""
432
# Fill in the mode-dependent placeholders.  The pairs are applied in order:
# TIME_UNIT only appears once store_dist_text has been spliced in for STORE.
if args.dist:
    time_unit_code = '' if args.microseconds else 'ts /= 1000;'
    substitutions = (
        ('DEF_TEXT', ''),
        ('STORE', store_dist_text),
        ('STATE_KEY', 'congest_state_key_t key_s = {0};'),
        ('HIST_TABLE', hist_table_text),
        ('TIME_UNIT', time_unit_code),
    )
else:
    substitutions = (
        ('DEF_TEXT', table_def_text),
        ('STORE', store_text),
        ('STATE_KEY', ''),
        ('HIST_TABLE', ''),
    )
for token, code in substitutions:
    bpf_text = bpf_text.replace(token, code)


# --ebpf prints the final C text and stops; debug prints it and carries on.
if args.ebpf:
    print(bpf_text)
    exit()
if debug:
    print(bpf_text)
454
# load BPF program
b = BPF(text=bpf_text)

if not is_support_kfunc:
    # All of the tcp congestion control status update functions are called
    # by the five functions below, so each gets an entry and a return probe.
    for traced_fn in ("tcp_fastretrans_alert",
                      "tcp_enter_cwr",
                      "tcp_process_tlp_ack",
                      "tcp_enter_loss",
                      "tcp_enter_recovery"):
        b.attach_kprobe(event=traced_fn, fn_name="entry_func")
        b.attach_kretprobe(event=traced_fn, fn_name="ret_func")

print("Tracing tcp congestion control status duration... Hit Ctrl-C to end.")
473
474
def cong_state_to_name(state):
    """Return the display name for the congestion state at index `state`."""
    # The order of this tuple needs to match the kernel's state numbering.
    names = ("open", "disorder", "cwr", "recovery", "loss")
    return names[state]
479
# output
def _print_flow_stats(stat, family, pack_addr, addr_width, title_suffix,
                      divisor, unit):
    """Print the per-flow congestion-state duration table for one family.

    stat         -- BPF table mapping a flow key to its accumulated durations
    family       -- AF_INET or AF_INET6, passed to inet_ntop()
    pack_addr    -- callable turning a key's address field into packed bytes
    addr_width   -- width of the two address/port columns
    title_suffix -- appended to the address column titles ("" or "6")
    divisor      -- the stored durations are integer-divided by this value
    unit         -- unit label appended to the duration column titles

    The header is printed whenever the table is non-empty; a row is printed
    only for flows that recorded at least one state change.  Header and rows
    share the same column widths so the columns line up.
    """
    if stat:
        print("%-*s %-*s %-7s %-6s %-7s %-7s %-6s %-5s" % (
            addr_width, "LAddrPort" + title_suffix,
            addr_width, "RAddrPort" + title_suffix,
            "Open_" + unit, "Dod_" + unit, "Rcov_" + unit,
            "Cwr_" + unit, "Los_" + unit, "Chgs"))
    for k, v in sorted(stat.items(), key=lambda item: item[0].lport):
        if v.total_changes == 0:
            continue
        laddr = inet_ntop(family, pack_addr(k.saddr))
        raddr = inet_ntop(family, pack_addr(k.daddr))
        print("%-*s %-*s %-7d %-6d %-7d %-7d %-6d %-5d" % (
            addr_width, laddr + "/" + str(k.lport),
            addr_width, raddr + "/" + str(k.dport),
            v.open_dura // divisor, v.disorder_dura // divisor,
            v.recover_dura // divisor, v.cwr_dura // divisor,
            v.loss_dura // divisor, v.total_changes))


exiting = 0 if args.interval else 1
ipv6_stat = b.get_table("ipv6_stat")
ipv4_stat = b.get_table("ipv4_stat")
if args.dist:
    dist = b.get_table("dist")
label = "us" if args.microseconds else "ms"
# The summary counters are read as-is with -u and divided by 1000 without.
divisor = 1 if args.microseconds else 1000
interval = int(args.interval)
while True:
    try:
        sleep(interval)
    except KeyboardInterrupt:
        # Still print what was collected so far, then leave the loop below.
        exiting = 1

    print()
    if args.timestamp:
        print("%-8s\n" % strftime("%H:%M:%S"), end="")
    if args.dist:
        dist.print_log2_hist("usecs" if args.microseconds else "msecs",
                             "tcp_congest_state",
                             section_print_fn=cong_state_to_name)
        dist.clear()
    else:
        _print_flow_stats(ipv4_stat, AF_INET,
                          lambda addr: pack("I", addr), 21, "",
                          divisor, label)
        _print_flow_stats(ipv6_stat, AF_INET6, bytes, 32, "6",
                          divisor, label)
    # Start every interval from empty per-flow tables.
    ipv4_stat.clear()
    ipv6_stat.clear()
    countdown -= 1
    if exiting or countdown == 0:
        exit()
560