• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/python
2# -*- coding: utf-8 -*-
3# @lint-avoid-python-3-compatibility-imports
4#
5# tcpstates   Trace the TCP session state changes with durations.
6#             For Linux, uses BCC, BPF. Embedded C.
7#
8# USAGE: tcpstates [-h] [-C] [-S] [interval [count]] [-4 | -6]
9#
10# This uses the sock:inet_sock_set_state tracepoint, added to Linux 4.16.
11# Linux 4.16 also adds more state transitions so that they can be traced.
12#
13# Copyright 2018 Netflix, Inc.
14# Licensed under the Apache License, Version 2.0 (the "License")
15#
16# 20-Mar-2018   Brendan Gregg   Created this.
17
18from __future__ import print_function
19from bcc import BPF
20import argparse
21from socket import inet_ntop, AF_INET, AF_INET6
22from struct import pack
23from time import strftime, time
24from os import getuid
25
26# arguments
27examples = """examples:
28    ./tcpstates           # trace all TCP state changes
29    ./tcpstates -t        # include timestamp column
30    ./tcpstates -T        # include time column (HH:MM:SS)
31    ./tcpstates -w        # wider columns (fit IPv6)
32    ./tcpstates -stT      # csv output, with times & timestamps
33    ./tcpstates -Y        # log events to the systemd journal
34    ./tcpstates -L 80     # only trace local port 80
35    ./tcpstates -L 80,81  # only trace local ports 80 and 81
36    ./tcpstates -D 80     # only trace remote port 80
37    ./tcpstates -4        # trace IPv4 family only
38    ./tcpstates -6        # trace IPv6 family only
39"""
40parser = argparse.ArgumentParser(
41    description="Trace TCP session state changes and durations",
42    formatter_class=argparse.RawDescriptionHelpFormatter,
43    epilog=examples)
44parser.add_argument("-T", "--time", action="store_true",
45    help="include time column on output (HH:MM:SS)")
46parser.add_argument("-t", "--timestamp", action="store_true",
47    help="include timestamp on output (seconds)")
48parser.add_argument("-w", "--wide", action="store_true",
49    help="wide column output (fits IPv6 addresses)")
50parser.add_argument("-s", "--csv", action="store_true",
51    help="comma separated values output")
52parser.add_argument("-L", "--localport",
53    help="comma-separated list of local ports to trace.")
54parser.add_argument("-D", "--remoteport",
55    help="comma-separated list of remote ports to trace.")
56parser.add_argument("--ebpf", action="store_true",
57    help=argparse.SUPPRESS)
58parser.add_argument("-Y", "--journal", action="store_true",
59    help="log session state changes to the systemd journal")
60group = parser.add_mutually_exclusive_group()
61group.add_argument("-4", "--ipv4", action="store_true",
62    help="trace IPv4 family only")
63group.add_argument("-6", "--ipv6", action="store_true",
64    help="trace IPv6 family only")
65args = parser.parse_args()
66debug = 0
67
68# define BPF program
69bpf_header = """
70#include <uapi/linux/ptrace.h>
71#include <linux/tcp.h>
72#include <net/sock.h>
73#include <bcc/proto.h>
74
75BPF_HASH(last, struct sock *, u64);
76
77// separate data structs for ipv4 and ipv6
78struct ipv4_data_t {
79    u64 ts_us;
80    u64 skaddr;
81    u32 saddr;
82    u32 daddr;
83    u64 span_us;
84    u32 pid;
85    u32 ports;
86    u32 oldstate;
87    u32 newstate;
88    char task[TASK_COMM_LEN];
89};
90BPF_PERF_OUTPUT(ipv4_events);
91
92struct ipv6_data_t {
93    u64 ts_us;
94    u64 skaddr;
95    unsigned __int128 saddr;
96    unsigned __int128 daddr;
97    u64 span_us;
98    u32 pid;
99    u32 ports;
100    u32 oldstate;
101    u32 newstate;
102    char task[TASK_COMM_LEN];
103};
104BPF_PERF_OUTPUT(ipv6_events);
105"""
106
107bpf_text_tracepoint = """
108TRACEPOINT_PROBE(sock, inet_sock_set_state)
109{
110    if (args->protocol != IPPROTO_TCP)
111        return 0;
112
113    u32 pid = bpf_get_current_pid_tgid() >> 32;
114    // sk is used as a UUID
115    struct sock *sk = (struct sock *)args->skaddr;
116
117    // lport is either used in a filter here, or later
118    u16 lport = args->sport;
119    FILTER_LPORT
120
121    // dport is either used in a filter here, or later
122    u16 dport = args->dport;
123    FILTER_DPORT
124
125    // calculate delta
126    u64 *tsp, delta_us;
127    tsp = last.lookup(&sk);
128    if (tsp == 0)
129        delta_us = 0;
130    else
131        delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
132    u16 family = args->family;
133    FILTER_FAMILY
134
135    if (args->family == AF_INET) {
136        struct ipv4_data_t data4 = {
137            .span_us = delta_us,
138            .oldstate = args->oldstate,
139            .newstate = args->newstate };
140        data4.skaddr = (u64)args->skaddr;
141        data4.ts_us = bpf_ktime_get_ns() / 1000;
142        __builtin_memcpy(&data4.saddr, args->saddr, sizeof(data4.saddr));
143        __builtin_memcpy(&data4.daddr, args->daddr, sizeof(data4.daddr));
144        // a workaround until data4 compiles with separate lport/dport
145        data4.ports = dport + ((0ULL + lport) << 16);
146        data4.pid = pid;
147
148        bpf_get_current_comm(&data4.task, sizeof(data4.task));
149        ipv4_events.perf_submit(args, &data4, sizeof(data4));
150
151    } else /* 6 */ {
152        struct ipv6_data_t data6 = {
153            .span_us = delta_us,
154            .oldstate = args->oldstate,
155            .newstate = args->newstate };
156        data6.skaddr = (u64)args->skaddr;
157        data6.ts_us = bpf_ktime_get_ns() / 1000;
158        __builtin_memcpy(&data6.saddr, args->saddr_v6, sizeof(data6.saddr));
159        __builtin_memcpy(&data6.daddr, args->daddr_v6, sizeof(data6.daddr));
160        // a workaround until data6 compiles with separate lport/dport
161        data6.ports = dport + ((0ULL + lport) << 16);
162        data6.pid = pid;
163        bpf_get_current_comm(&data6.task, sizeof(data6.task));
164        ipv6_events.perf_submit(args, &data6, sizeof(data6));
165    }
166
167    if (args->newstate == TCP_CLOSE) {
168        last.delete(&sk);
169    } else {
170        u64 ts = bpf_ktime_get_ns();
171        last.update(&sk, &ts);
172    }
173
174    return 0;
175}
176"""
177
178bpf_text_kprobe = """
179int kprobe__tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
180{
181    u32 pid = bpf_get_current_pid_tgid() >> 32;
182    // sk is used as a UUID
183
184    // lport is either used in a filter here, or later
185    u16 lport = sk->__sk_common.skc_num;
186    FILTER_LPORT
187
188    // dport is either used in a filter here, or later
189    u16 dport = sk->__sk_common.skc_dport;
190    dport = ntohs(dport);
191    FILTER_DPORT
192
193    // calculate delta
194    u64 *tsp, delta_us;
195    tsp = last.lookup(&sk);
196    if (tsp == 0)
197        delta_us = 0;
198    else
199        delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
200
201    u16 family = sk->__sk_common.skc_family;
202    FILTER_FAMILY
203
204    if (family == AF_INET) {
205        struct ipv4_data_t data4 = {
206            .span_us = delta_us,
207            .oldstate = sk->__sk_common.skc_state,
208            .newstate = state };
209        data4.skaddr = (u64)sk;
210        data4.ts_us = bpf_ktime_get_ns() / 1000;
211        data4.saddr = sk->__sk_common.skc_rcv_saddr;
212        data4.daddr = sk->__sk_common.skc_daddr;
213        // a workaround until data4 compiles with separate lport/dport
214        data4.ports = dport + ((0ULL + lport) << 16);
215        data4.pid = pid;
216
217        bpf_get_current_comm(&data4.task, sizeof(data4.task));
218        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));
219
220    } else /* 6 */ {
221        struct ipv6_data_t data6 = {
222            .span_us = delta_us,
223            .oldstate = sk->__sk_common.skc_state,
224            .newstate = state };
225        data6.skaddr = (u64)sk;
226        data6.ts_us = bpf_ktime_get_ns() / 1000;
227        bpf_probe_read_kernel(&data6.saddr, sizeof(data6.saddr),
228            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
229        bpf_probe_read_kernel(&data6.daddr, sizeof(data6.daddr),
230            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
231        // a workaround until data6 compiles with separate lport/dport
232        data6.ports = dport + ((0ULL + lport) << 16);
233        data6.pid = pid;
234        bpf_get_current_comm(&data6.task, sizeof(data6.task));
235        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
236    }
237
238    if (state == TCP_CLOSE) {
239        last.delete(&sk);
240    } else {
241        u64 ts = bpf_ktime_get_ns();
242        last.update(&sk, &ts);
243    }
244
245    return 0;
246
247};
248"""
249
250bpf_text = bpf_header
251if BPF.tracepoint_exists("sock", "inet_sock_set_state"):
252    bpf_text += bpf_text_tracepoint
253else:
254    bpf_text += bpf_text_kprobe
255
256# code substitutions
257if args.remoteport:
258    dports = [int(dport) for dport in args.remoteport.split(',')]
259    dports_if = ' && '.join(['dport != %d' % dport for dport in dports])
260    bpf_text = bpf_text.replace('FILTER_DPORT',
261        'if (%s) { last.delete(&sk); return 0; }' % dports_if)
262if args.localport:
263    lports = [int(lport) for lport in args.localport.split(',')]
264    lports_if = ' && '.join(['lport != %d' % lport for lport in lports])
265    bpf_text = bpf_text.replace('FILTER_LPORT',
266        'if (%s) { last.delete(&sk); return 0; }' % lports_if)
267if args.ipv4:
268    bpf_text = bpf_text.replace('FILTER_FAMILY',
269        'if (family != AF_INET) { return 0; }')
270elif args.ipv6:
271    bpf_text = bpf_text.replace('FILTER_FAMILY',
272        'if (family != AF_INET6) { return 0; }')
273bpf_text = bpf_text.replace('FILTER_FAMILY', '')
274bpf_text = bpf_text.replace('FILTER_DPORT', '')
275bpf_text = bpf_text.replace('FILTER_LPORT', '')
276
277if debug or args.ebpf:
278    print(bpf_text)
279    if args.ebpf:
280        exit()
281
282#
283# Setup output formats
284#
285# Don't change the default output (next 2 lines): this fits in 80 chars. I
286# know it doesn't have NS or UIDs etc. I know. If you really, really, really
287# need to add columns, columns that solve real actual problems, I'd start by
288# adding an extended mode (-x) to included those columns.
289#
290header_string = "%-16s %-5s %-10.10s %s%-15s %-5s %-15s %-5s %-11s -> %-11s %s"
291format_string = ("%-16x %-5d %-10.10s %s%-15s %-5d %-15s %-5d %-11s " +
292    "-> %-11s %.3f")
293if args.wide:
294    header_string = ("%-16s %-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %-11s " +
295        "-> %-11s %s")
296    format_string = ("%-16x %-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %-11s " +
297        "-> %-11s %.3f")
298if args.csv:
299    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
300    format_string = "%x,%d,%s,%s,%s,%s,%s,%d,%s,%s,%.3f"
301
302if args.journal:
303    try:
304        from systemd import journal
305    except ImportError:
306        print("ERROR: Journal logging requires the systemd.journal module")
307        exit(1)
308
309
310def tcpstate2str(state):
311    # from include/net/tcp_states.h:
312    tcpstate = {
313        1: "ESTABLISHED",
314        2: "SYN_SENT",
315        3: "SYN_RECV",
316        4: "FIN_WAIT1",
317        5: "FIN_WAIT2",
318        6: "TIME_WAIT",
319        7: "CLOSE",
320        8: "CLOSE_WAIT",
321        9: "LAST_ACK",
322        10: "LISTEN",
323        11: "CLOSING",
324        12: "NEW_SYN_RECV",
325    }
326
327    if state in tcpstate:
328        return tcpstate[state]
329    else:
330        return str(state)
331
332def journal_fields(event, addr_family):
333    addr_pfx = 'IPV4'
334    if addr_family == AF_INET6:
335        addr_pfx = 'IPV6'
336
337    fields = {
338        # Standard fields described in systemd.journal-fields(7). journal.send
339        # will fill in CODE_LINE, CODE_FILE, and CODE_FUNC for us. If we're
340        # root and specify OBJECT_PID, systemd-journald will add other OBJECT_*
341        # fields for us.
342        'SYSLOG_IDENTIFIER': 'tcpstates',
343        'PRIORITY': 5,
344        '_SOURCE_REALTIME_TIMESTAMP': time() * 1000000,
345        'OBJECT_PID': str(event.pid),
346        'OBJECT_COMM': event.task.decode('utf-8', 'replace'),
347        # Custom fields, aka "stuff we sort of made up".
348        'OBJECT_' + addr_pfx + '_SOURCE_ADDRESS': inet_ntop(addr_family, pack("I", event.saddr)),
349        'OBJECT_TCP_SOURCE_PORT': str(event.ports >> 16),
350        'OBJECT_' + addr_pfx + '_DESTINATION_ADDRESS': inet_ntop(addr_family, pack("I", event.daddr)),
351        'OBJECT_TCP_DESTINATION_PORT': str(event.ports & 0xffff),
352        'OBJECT_TCP_OLD_STATE': tcpstate2str(event.oldstate),
353        'OBJECT_TCP_NEW_STATE': tcpstate2str(event.newstate),
354        'OBJECT_TCP_SPAN_TIME': str(event.span_us)
355        }
356
357    msg_format_string = (u"%(OBJECT_COMM)s " +
358        u"%(OBJECT_" + addr_pfx + "_SOURCE_ADDRESS)s " +
359        u"%(OBJECT_TCP_SOURCE_PORT)s → " +
360        u"%(OBJECT_" + addr_pfx + "_DESTINATION_ADDRESS)s " +
361        u"%(OBJECT_TCP_DESTINATION_PORT)s " +
362        u"%(OBJECT_TCP_OLD_STATE)s → %(OBJECT_TCP_NEW_STATE)s")
363    fields['MESSAGE'] = msg_format_string % (fields)
364
365    if getuid() == 0:
366        del fields['OBJECT_COMM'] # Handled by systemd-journald
367
368    return fields
369
370# process event
371def print_ipv4_event(cpu, data, size):
372    event = b["ipv4_events"].event(data)
373    global start_ts
374    if args.time:
375        if args.csv:
376            print("%s," % strftime("%H:%M:%S"), end="")
377        else:
378            print("%-8s " % strftime("%H:%M:%S"), end="")
379    if args.timestamp:
380        if start_ts == 0:
381            start_ts = event.ts_us
382        delta_s = (float(event.ts_us) - start_ts) / 1000000
383        if args.csv:
384            print("%.6f," % delta_s, end="")
385        else:
386            print("%-9.6f " % delta_s, end="")
387    print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
388        "4" if args.wide or args.csv else "",
389        inet_ntop(AF_INET, pack("I", event.saddr)), event.ports >> 16,
390        inet_ntop(AF_INET, pack("I", event.daddr)), event.ports & 0xffff,
391        tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
392        float(event.span_us) / 1000))
393    if args.journal:
394        journal.send(**journal_fields(event, AF_INET))
395
396def print_ipv6_event(cpu, data, size):
397    event = b["ipv6_events"].event(data)
398    global start_ts
399    if args.time:
400        if args.csv:
401            print("%s," % strftime("%H:%M:%S"), end="")
402        else:
403            print("%-8s " % strftime("%H:%M:%S"), end="")
404    if args.timestamp:
405        if start_ts == 0:
406            start_ts = event.ts_us
407        delta_s = (float(event.ts_us) - start_ts) / 1000000
408        if args.csv:
409            print("%.6f," % delta_s, end="")
410        else:
411            print("%-9.6f " % delta_s, end="")
412    print(format_string % (event.skaddr, event.pid, event.task.decode('utf-8', 'replace'),
413        "6" if args.wide or args.csv else "",
414        inet_ntop(AF_INET6, event.saddr), event.ports >> 16,
415        inet_ntop(AF_INET6, event.daddr), event.ports & 0xffff,
416        tcpstate2str(event.oldstate), tcpstate2str(event.newstate),
417        float(event.span_us) / 1000))
418    if args.journal:
419        journal.send(**journal_fields(event, AF_INET6))
420
421# initialize BPF
422b = BPF(text=bpf_text)
423
424# header
425if args.time:
426    if args.csv:
427        print("%s," % ("TIME"), end="")
428    else:
429        print("%-8s " % ("TIME"), end="")
430if args.timestamp:
431    if args.csv:
432        print("%s," % ("TIME(s)"), end="")
433    else:
434        print("%-9s " % ("TIME(s)"), end="")
435print(header_string % ("SKADDR", "C-PID", "C-COMM",
436    "IP" if args.wide or args.csv else "",
437    "LADDR", "LPORT", "RADDR", "RPORT",
438    "OLDSTATE", "NEWSTATE", "MS"))
439
440start_ts = 0
441
442# read events
443b["ipv4_events"].open_perf_buffer(print_ipv4_event, page_cnt=64)
444b["ipv6_events"].open_perf_buffer(print_ipv6_event, page_cnt=64)
445while 1:
446    try:
447        b.perf_buffer_poll()
448    except KeyboardInterrupt:
449        exit()
450