• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1#!/usr/bin/env bcc-lua
2--[[
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14
1518-Mar-2017  Simon Liu Created this.
16--]]
17
18local ffi = require("ffi")
19local bit = require("bit")
20
21ffi.cdef[[
22const char *inet_ntop(int af, const void *src, char *dst, int size);
23uint16_t ntohs(uint16_t netshort);
24]]
25
-- BPF C source handed to BPF:new{text=program}. The FILTER_PID, FILTER_LPORT
-- and FILTER_DPORT tokens are placeholders: parse_arg() substitutes either a C
-- guard statement or an empty string for each of them before compilation.
-- The probe emits one ipv4_events / ipv6_events record per socket that reaches
-- TCP_CLOSE; the Lua-side struct declarations passed to open_perf_buffer must
-- keep the same field layout as ipv4_data_t / ipv6_data_t below.
local program = [[
#include <uapi/linux/ptrace.h>
#define KBUILD_MODNAME "foo"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 pid;
    u64 saddr;
    u64 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u64 pid;
    u64 saddr[2];
    u64 daddr[2];
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);

int trace_tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
]]
186
-- When true, the entry function prints the final BPF program text before
-- compiling it.
local debug = false
-- Timestamp (microseconds) of the first event printed with the timestamp
-- column enabled; 0 means "no event seen yet".
local start_ts = 0

-- Address-family numbers handed to inet_ntop().
local AF_INET, AF_INET6 = 2, 10
-- Lengths of the longest textual IPv4 / IPv6 addresses (no terminator).
local inet_addresslen = string.len("255.255.255.255")
local inet6_addresslen = string.len("ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255")

-- Row layouts for the header line and for each event line; parse_arg()
-- swaps them out for the wide and csv variants.
local header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
local format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
-- Header text for the IP-version column and whether that column is filled in.
local ip_string, ip_version = "", false
-- Output switches set by parse_arg() and read by the event printers.
local arg_timestamp, arg_csv, arg_time = false, false, false
202
-- Usage examples shown in the argparse help epilog. The flags listed here
-- must agree with the ones registered in parse_arg(): -T is the HH:MM:SS time
-- column, -t is the elapsed-seconds timestamp column.
local examples = [[examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
]]
213
--- Split a string into its non-empty fields.
-- @param str  string to split
-- @param sep  separator character(s); defaults to "," when nil or empty.
--             Every character of sep is treated as a separator on its own.
-- @return     array of the fields in order; empty fields are dropped, so an
--             empty input yields an empty table
local function split(str, sep)
   if sep == nil or sep == "" then
      sep = ","
   end
   -- Escape punctuation so characters such as "." or "-" are taken literally
   -- inside the character class. The extra parentheses drop gsub's count.
   local class = (sep:gsub("%p", "%%%0"))
   local t = {}
   for w in string.gmatch(str, "([^" .. class .. "]+)") do
      t[#t + 1] = w
   end
   return t
end
221
--- Convert a binary network address to its textual form via libc inet_ntop.
-- @param af    address family, AF_INET or AF_INET6 (any value other than
--              AF_INET is handled as a two-word IPv6 address)
-- @param addr  for AF_INET a single value, otherwise a two-element table;
--              either is used to initialise a uint64_t array whose leading
--              bytes are read as the address
-- @param len   maximum number of characters in the textual address, NOT
--              counting the terminating NUL
-- @return      the address text, or "" when the conversion fails
local function inet_ntop(af, addr, len)
   -- inet_ntop needs space for the NUL terminator as well; with only `len`
   -- bytes a maximum-length address cannot be written.
   local dst_size = len + 1
   local addr_dst = ffi.new("char[?]", dst_size)
   local addr_src
   if af == AF_INET then
      addr_src = ffi.new("uint64_t[1]", addr)
   else
      addr_src = ffi.new("uint64_t[2]", addr)
   end
   -- A NULL return compares equal to nil and signals failure.
   if ffi.C.inet_ntop(af, addr_src, addr_dst, dst_size) == nil then
      return ""
   end
   -- Stop at the terminator instead of returning the NUL padding too.
   return ffi.string(addr_dst)
end
233
--- Run a port value through libc ntohs.
-- @param port  port as a number or numeric string
-- @return      the value returned by ntohs for tonumber(port)
local function inet_ntohs(port)
   return ffi.C.ntohs(tonumber(port))
end
238
--- Perf-buffer callback: print one closed IPv4 TCP session as an output row.
-- Optional TIME and TIME(s) columns are written first (without a newline)
-- when enabled, followed by the row built from format_string.
-- @param cpu    first callback argument; not used here
-- @param event  record with the fields ts_us, pid, saddr, daddr, ports,
--               rx_b, tx_b, span_us and task
local function print_ipv4_event(cpu, event)
   local event_pid = tonumber(event.pid)
   local event_task = ffi.string(event.task)
   local event_ports = tonumber(event.ports)
   local event_tx_b = tonumber(event.tx_b)
   local event_rx_b = tonumber(event.rx_b)
   local event_span_us = tonumber(event.span_us)
   local event_ts_us = tonumber(event.ts_us)
   local event_saddr = inet_ntop(AF_INET, tonumber(event.saddr), inet_addresslen)
   local event_daddr = inet_ntop(AF_INET, tonumber(event.daddr), inet_addresslen)

   if arg_time then
      io.write(string.format(arg_csv and "%s," or "%-8s ", os.date("%H:%M:%S")))
   end
   if arg_timestamp then
      if start_ts == 0 then
         start_ts = event_ts_us
      end
      local delta_s = (event_ts_us - start_ts) / 1000000
      -- Use the module flag arg_csv; the global `arg` is not the parsed options.
      io.write(string.format(arg_csv and "%.6f," or "%-9.6f ", delta_s))
   end

   local iv = ""
   if ip_version then
      iv = "4"
   end

   -- ports carries the local port in its upper 32 bits and the remote port in
   -- the lower 32. Plain arithmetic is used because the bit library works on
   -- 32-bit values and masks shift counts to 5 bits, so a shift by 32 would
   -- return the lower half instead of the upper one.
   local lport = math.floor(event_ports / 2^32)
   local dport = event_ports % 2^32

   print(string.format(format_string, event_pid, event_task, iv,
                       event_saddr, lport,
                       event_daddr, dport,
                       math.floor(event_tx_b / 1024), math.floor(event_rx_b / 1024),
                       event_span_us / 1000))
end
277
278
--- Perf-buffer callback: print one closed IPv6 TCP session as an output row.
-- Optional TIME and TIME(s) columns are written first (without a newline)
-- when enabled, followed by the row built from format_string.
-- @param cpu    first callback argument; not used here
-- @param event  record with the fields ts_us, pid, saddr[2], daddr[2], ports,
--               rx_b, tx_b, span_us and task
local function print_ipv6_event(cpu, event)
   local event_pid = tonumber(event.pid)
   local event_task = ffi.string(event.task)
   local event_ports = tonumber(event.ports)
   local event_tx_b = tonumber(event.tx_b)
   local event_rx_b = tonumber(event.rx_b)
   local event_span_us = tonumber(event.span_us)
   local event_ts_us = tonumber(event.ts_us)
   -- Pass the two 64-bit address halves through untouched: tonumber() would
   -- turn them into doubles, which cannot hold all 64 bits, and the printed
   -- address would then be wrong.
   local event_saddr = inet_ntop(AF_INET6, {event.saddr[0], event.saddr[1]}, inet6_addresslen)
   local event_daddr = inet_ntop(AF_INET6, {event.daddr[0], event.daddr[1]}, inet6_addresslen)

   if arg_time then
      io.write(string.format(arg_csv and "%s," or "%-8s ", os.date("%H:%M:%S")))
   end
   if arg_timestamp then
      if start_ts == 0 then
         start_ts = event_ts_us
      end
      local delta_s = (event_ts_us - start_ts) / 1000000
      -- Use the module flag arg_csv; the global `arg` is not the parsed options.
      io.write(string.format(arg_csv and "%.6f," or "%-9.6f ", delta_s))
   end

   local iv = ""
   if ip_version then
      iv = "6"
   end

   -- ports carries the local port in its upper 32 bits and the remote port in
   -- the lower 32. Plain arithmetic is used because the bit library works on
   -- 32-bit values and masks shift counts to 5 bits, so a shift by 32 would
   -- return the lower half instead of the upper one.
   local lport = math.floor(event_ports / 2^32)
   local dport = event_ports % 2^32

   print(string.format(format_string, event_pid, event_task, iv,
                       event_saddr, lport,
                       event_daddr, dport,
                       math.floor(event_tx_b / 1024), math.floor(event_rx_b / 1024),
                       event_span_us / 1000))
end
316
-- Build the C guard that replaces a FILTER_LPORT / FILTER_DPORT placeholder.
-- c_var:     name of the C variable to compare ("lport" or "dport")
-- port_list: comma-separated port numbers as given on the command line
-- convert:   optional function applied to each port before it is embedded
-- Returns the guard text, or "" when port_list contains no ports.
-- Raises an error for an entry that is not a number.
local function port_filter(c_var, port_list, convert)
   local conds = {}
   for _, item in ipairs(split(port_list, ",")) do
      local port = tonumber(item)
      if not port then
         error("invalid port number: " .. tostring(item))
      end
      if convert then
         port = convert(port)
      end
      conds[#conds + 1] = string.format("%s != %d", c_var, port)
   end
   if #conds == 0 then
      return ""
   end
   return string.format("if (%s) { birth.delete(&sk); return 0; }",
                        table.concat(conds, " && "))
end

--- Parse the command line and configure the module accordingly.
-- Substitutes the FILTER_* placeholders in `program`, selects the header/row
-- formats, sets the arg_* output flags and writes the optional TIME /
-- TIME(s) header cells (without a newline) to stdout.
-- @param utils  helper table providing argparse(name, description, epilog)
local function parse_arg(utils)
   local parser = utils.argparse("tcplife",
                                 "Trace the lifespan of TCP sessions and summarize", examples)

   parser:flag("-T --time", "include time column on output (HH:MM:SS)")
   parser:flag("-t --timestamp", "include timestamp on output (seconds)")
   parser:flag("-w --wide", "wide column output (fits IPv6 addresses)")
   parser:flag("-s --csv", "comma separated values output")
   parser:option("-p --pid", "trace this PID only"):convert(tonumber)
   parser:option("-L --localport", "comma-separated list of local ports to trace.")
   parser:option("-D --remoteport", "comma-separated list of remote ports to trace.")

   local args = parser:parse()
   if args.pid then
      local filter = string.format('if (pid != %d) { return 0; }', args.pid)
      -- Method call (colon) so that `program` is the subject string.
      program = program:gsub('FILTER_PID', filter)
   end

   if args.remoteport then
      -- The BPF code reads dport and only applies ntohs() to it when building
      -- the output record, so the comparison value is byte-swapped to match.
      program = program:gsub('FILTER_DPORT',
                             port_filter("dport", args.remoteport, inet_ntohs))
   end
   if args.localport then
      -- The BPF code uses lport as-is (no ntohs), so compare against the
      -- plain port number.
      program = program:gsub('FILTER_LPORT',
                             port_filter("lport", args.localport))
   end
   -- Remove whichever placeholders were not replaced above.
   program = program:gsub('FILTER_PID', '')
   program = program:gsub('FILTER_DPORT', '')
   program = program:gsub('FILTER_LPORT', '')

   if args.wide then
      header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
      format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
      ip_string = "IP"
      ip_version = true
   end
   -- csv is applied after wide, so it wins when both flags are given.
   if args.csv then
      header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
      format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
      ip_string = "IP"
      ip_version = true
      arg_csv = true
   end

   if args.time then
      arg_time = true
      io.write(string.format(args.csv and "%s," or "%-8s ", "TIME"))
   end

   if args.timestamp then
      arg_timestamp = true
      io.write(string.format(args.csv and "%s," or "%-9s ", "TIME(s)"))
   end

end
398
--- Tool entry point returned to the loader.
-- Parses the command line, compiles the BPF program, hooks tcp_set_state,
-- prints the header row and then polls both perf buffers.
-- @param BPF    BPF class; BPF:new{text=...} compiles the program
-- @param utils  helper table forwarded to parse_arg()
return function(BPF, utils)
   parse_arg(utils)
   if debug then
      print(program)
   end

   -- Record layouts for the two perf buffers; "$" is filled in from the
   -- parameter list passed alongside each declaration.
   local ipv4_record = [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr;
      uint64_t daddr;
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
   ]]
   local ipv6_record = [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr[2];
      uint64_t daddr[2];
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
   ]]

   local bpf = BPF:new({text = program})
   bpf:attach_kprobe({event = "tcp_set_state", fn_name = "trace_tcp_set_state"})

   local columns = {"PID", "COMM", ip_string, "LADDR", "LPORT",
                    "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"}
   print(header_string % columns)

   local TASK_COMM_LEN = 16 -- linux/sched.h
   local ipv4_table = bpf:get_table("ipv4_events")
   ipv4_table:open_perf_buffer(print_ipv4_event, ipv4_record, {TASK_COMM_LEN}, 64)
   local ipv6_table = bpf:get_table("ipv6_events")
   ipv6_table:open_perf_buffer(print_ipv6_event, ipv6_record, {TASK_COMM_LEN}, 64)

   bpf:perf_buffer_poll_loop()
end
440