#!/usr/bin/env bcc-lua
--[[
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

18-Mar-2017	Simon Liu Created this.
--]]

local ffi = require("ffi")
-- NOTE: `bit` is no longer used below (its operations are 32-bit only and
-- cannot split the packed 64-bit ports value); the require is kept so the
-- module's dependency set is unchanged.
local bit = require("bit")

ffi.cdef[[
const char *inet_ntop(int af, const void *src, char *dst, int size);
uint16_t ntohs(uint16_t netshort);
]]

-- BPF program text. The FILTER_PID / FILTER_LPORT / FILTER_DPORT placeholders
-- are replaced by parse_arg() with C filter statements (or removed).
local program = [[
#include <uapi/linux/ptrace.h>
#define KBUILD_MODNAME "foo"
#include <linux/tcp.h>
#include <net/sock.h>
#include <bcc/proto.h>

BPF_HASH(birth, struct sock *, u64);

// separate data structs for ipv4 and ipv6
struct ipv4_data_t {
    // XXX: switch some to u32's when supported
    u64 ts_us;
    u64 pid;
    u64 saddr;
    u64 daddr;
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv4_events);

struct ipv6_data_t {
    u64 ts_us;
    u64 pid;
    u64 saddr[2];
    u64 daddr[2];
    u64 ports;
    u64 rx_b;
    u64 tx_b;
    u64 span_us;
    char task[TASK_COMM_LEN];
};
BPF_PERF_OUTPUT(ipv6_events);

struct id_t {
    u32 pid;
    char task[TASK_COMM_LEN];
};
BPF_HASH(whoami, struct sock *, struct id_t);

int trace_tcp_set_state(struct pt_regs *ctx, struct sock *sk, int state)
{
    u32 pid = bpf_get_current_pid_tgid() >> 32;

    // lport is either used in a filter here, or later
    u16 lport = sk->__sk_common.skc_num;
    FILTER_LPORT

    // dport is either used in a filter here, or later
    u16 dport = sk->__sk_common.skc_dport;
    FILTER_DPORT

    /*
     * This tool includes PID and comm context. It's best effort, and may
     * be wrong in some situations. It currently works like this:
     * - record timestamp on any state < TCP_FIN_WAIT1
     * - cache task context on:
     *       TCP_SYN_SENT: tracing from client
     *       TCP_LAST_ACK: client-closed from server
     * - do output on TCP_CLOSE:
     *       fetch task context if cached, or use current task
     */

    // capture birth time
    if (state < TCP_FIN_WAIT1) {
        /*
         * Matching just ESTABLISHED may be sufficient, provided no code-path
         * sets ESTABLISHED without a tcp_set_state() call. Until we know
         * that for sure, match all early states to increase chances a
         * timestamp is set.
         * Note that this needs to be set before the PID filter later on,
         * since the PID isn't reliable for these early stages, so we must
         * save all timestamps and do the PID filter later when we can.
         */
        u64 ts = bpf_ktime_get_ns();
        birth.update(&sk, &ts);
    }

    // record PID & comm on SYN_SENT
    if (state == TCP_SYN_SENT || state == TCP_LAST_ACK) {
        // now we can PID filter, both here and a little later on for CLOSE
        FILTER_PID
        struct id_t me = {.pid = pid};
        bpf_get_current_comm(&me.task, sizeof(me.task));
        whoami.update(&sk, &me);
    }

    if (state != TCP_CLOSE)
        return 0;

    // calculate lifespan
    u64 *tsp, delta_us;
    tsp = birth.lookup(&sk);
    if (tsp == 0) {
        whoami.delete(&sk);     // may not exist
        return 0;               // missed create
    }
    delta_us = (bpf_ktime_get_ns() - *tsp) / 1000;
    birth.delete(&sk);

    // fetch possible cached data, and filter
    struct id_t *mep;
    mep = whoami.lookup(&sk);
    if (mep != 0)
        pid = mep->pid;
    FILTER_PID

    // get throughput stats. see tcp_get_info().
    u64 rx_b = 0, tx_b = 0;
    struct tcp_sock *tp = (struct tcp_sock *)sk;
    rx_b = tp->bytes_received;
    tx_b = tp->bytes_acked;

    u16 family = sk->__sk_common.skc_family;

    if (family == AF_INET) {
        struct ipv4_data_t data4 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data4.ts_us = bpf_ktime_get_ns() / 1000;
        data4.saddr = sk->__sk_common.skc_rcv_saddr;
        data4.daddr = sk->__sk_common.skc_daddr;
        // a workaround until data4 compiles with separate lport/dport
        data4.pid = pid;
        data4.ports = ntohs(dport) + ((0ULL + lport) << 32);
        if (mep == 0) {
            bpf_get_current_comm(&data4.task, sizeof(data4.task));
        } else {
            bpf_probe_read(&data4.task, sizeof(data4.task), (void *)mep->task);
        }
        ipv4_events.perf_submit(ctx, &data4, sizeof(data4));

    } else /* 6 */ {
        struct ipv6_data_t data6 = {.span_us = delta_us,
            .rx_b = rx_b, .tx_b = tx_b};
        data6.ts_us = bpf_ktime_get_ns() / 1000;
        bpf_probe_read(&data6.saddr, sizeof(data6.saddr),
            sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read(&data6.daddr, sizeof(data6.daddr),
            sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        // a workaround until data6 compiles with separate lport/dport
        data6.ports = ntohs(dport) + ((0ULL + lport) << 32);
        data6.pid = pid;
        if (mep == 0) {
            bpf_get_current_comm(&data6.task, sizeof(data6.task));
        } else {
            bpf_probe_read(&data6.task, sizeof(data6.task), (void *)mep->task);
        }
        ipv6_events.perf_submit(ctx, &data6, sizeof(data6));
    }

    if (mep != 0)
        whoami.delete(&sk);

    return 0;
}
]]

-- Set to true to dump the final BPF program text before loading it.
-- (Deliberately not named `debug`, which would shadow the standard library.)
local debug_mode = false
-- Timestamp (us) of the first event seen; 0 until then.
local start_ts = 0

-- Longest textual forms, excluding the terminating NUL.
local inet_addresslen = #"255.255.255.255"
local inet6_addresslen = #"ffff:ffff:ffff:ffff:ffff:ffff:255.255.255.255"
local AF_INET = 2
local AF_INET6 = 10

-- Output layout; parse_arg() swaps these for the wide / csv variants.
local header_string = "%-5s %-10.10s %s%-15s %-5s %-15s %-5s %5s %5s %s"
local format_string = "%-5d %-10.10s %s%-15s %-5d %-15s %-5d %5d %5d %.2f"
local ip_string = ""
local ip_version = false
local arg_timestamp = false
local arg_csv = false
local arg_time = false

local examples = [[examples:
    ./tcplife           # trace all TCP connect()s
    ./tcplife -T        # include time column (HH:MM:SS)
    ./tcplife -w        # wider columns (fit IPv6)
    ./tcplife -stT      # csv output, with times & timestamps
    ./tcplife -p 181    # only trace PID 181
    ./tcplife -L 80     # only trace local port 80
    ./tcplife -L 80,81  # only trace local ports 80 and 81
    ./tcplife -D 80     # only trace remote port 80
]]

--- Split a string into its non-empty fields.
-- @param str string to split
-- @param sep set of separator characters (defaults to ",")
-- @return array of the fields, in order
local function split(str, sep)
  if sep == nil or sep == "" then
    sep = ","
  end
  -- Escape the characters that are magic inside a pattern character class.
  local class = sep:gsub("[%^%]%-%%]", "%%%0")
  local fields = {}
  for field in string.gmatch(str, "([^" .. class .. "]+)") do
    fields[#fields + 1] = field
  end
  return fields
end

--- Render a binary network address as text.
-- @param af AF_INET or AF_INET6
-- @param addr for AF_INET a number/uint64 holding the address in its low
--   bytes; for AF_INET6 a two-element table of uint64 halves
-- @param len maximum textual length, excluding the terminating NUL
-- @return the address string, or "" if the conversion fails
local function inet_ntop(af, addr, len)
  -- inet_ntop(3) also writes a terminating NUL, so reserve one extra byte;
  -- without it a maximum-length address does not fit.
  local size = len + 1
  local addr_dst = ffi.new("char[?]", size)
  local addr_src
  if af == AF_INET then
    addr_src = ffi.new("uint64_t[1]", addr)
  else
    addr_src = ffi.new("uint64_t[2]", addr)
  end
  if ffi.C.inet_ntop(af, addr_src, addr_dst, size) == nil then
    return ""
  end
  -- Stop at the NUL rather than returning the whole (padded) buffer.
  return ffi.string(addr_dst)
end

--- Byte-swap a 16-bit port between host and network order.
-- (ntohs and htons perform the same swap, so this works in both directions.)
-- @param port number or numeric string
-- @return the swapped port as a number
local function inet_ntohs(port)
  local p = tonumber(port)
  return ffi.C.ntohs(p)
end

--- Shared row printer for both address families.
-- @param event perf event record (ipv4 or ipv6 layout)
-- @param version "4" or "6", shown only when the IP column is enabled
-- @param saddr local address, already rendered as text
-- @param daddr remote address, already rendered as text
local function print_event(event, version, saddr, daddr)
  local ts_us = tonumber(event.ts_us)
  local ports = tonumber(event.ports)

  if arg_time then
    io.write(string.format(arg_csv and "%s," or "%-8s ", os.date("%H:%M:%S")))
  end
  if arg_timestamp then
    if start_ts == 0 then
      start_ts = ts_us
    end
    local delta_s = (ts_us - start_ts) / 1000000
    io.write(string.format(arg_csv and "%.6f," or "%-9.6f ", delta_s))
  end

  -- ports packs (lport << 32) + dport. The bit library only works on 32-bit
  -- values (shift counts are taken modulo 32), so unpack arithmetically.
  local lport = math.floor(ports / 2^32)
  local dport = ports % 2^32

  print(string.format(format_string,
    tonumber(event.pid), ffi.string(event.task),
    ip_version and version or "",
    saddr, lport, daddr, dport,
    tonumber(event.tx_b) / 1024, tonumber(event.rx_b) / 1024,
    tonumber(event.span_us) / 1000))
end

--- Perf-buffer callback for IPv4 session records.
local function print_ipv4_event(cpu, event)
  -- IPv4 addresses occupy only the low 32 bits, so tonumber() is lossless.
  local saddr = inet_ntop(AF_INET, tonumber(event.saddr), inet_addresslen)
  local daddr = inet_ntop(AF_INET, tonumber(event.daddr), inet_addresslen)
  print_event(event, "4", saddr, daddr)
end

--- Perf-buffer callback for IPv6 session records.
local function print_ipv6_event(cpu, event)
  -- Pass the uint64 cdata halves through untouched: converting them with
  -- tonumber() would round to a double and corrupt the address.
  local saddr = inet_ntop(AF_INET6,
    {event.saddr[0], event.saddr[1]}, inet6_addresslen)
  local daddr = inet_ntop(AF_INET6,
    {event.daddr[0], event.daddr[1]}, inet6_addresslen)
  print_event(event, "6", saddr, daddr)
end

--- Parse a comma-separated port list into an array of numbers.
-- Raises an error on an empty list or an entry that is not 0..65535.
local function parse_ports(list)
  local ports = {}
  for _, field in ipairs(split(list, ",")) do
    local port = tonumber(field)
    if not port or port < 0 or port > 65535 or port % 1 ~= 0 then
      error(string.format("invalid port number: '%s'", field), 0)
    end
    ports[#ports + 1] = port
  end
  if #ports == 0 then
    error(string.format("no ports given in '%s'", list), 0)
  end
  return ports
end

--- Build the C statement that drops sockets whose port matches none of `ports`.
-- @param var C variable name to test ("lport" or "dport")
-- @param ports array of port values, already in the byte order of `var`
local function port_filter(var, ports)
  local conds = {}
  for i, port in ipairs(ports) do
    conds[i] = string.format("%s != %d", var, port)
  end
  return string.format("if (%s) { birth.delete(&sk); return 0; }",
    table.concat(conds, " && "))
end

--- Parse the command line, specialise the BPF program text and select the
-- output formats. Also writes the optional leading header columns.
-- @param utils bcc-lua utilities table (provides `argparse`)
local function parse_arg(utils)
  local parser = utils.argparse("tcplife",
    "Trace the lifespan of TCP sessions and summarize", examples)

  parser:flag("-T --time", "include time column on output (HH:MM:SS)")
  parser:flag("-t --timestamp", "include timestamp on output (seconds)")
  parser:flag("-w --wide", "wide column output (fits IPv6 addresses)")
  parser:flag("-s --csv", "comma separated values output")
  parser:option("-p --pid", "trace this PID only"):convert(tonumber)
  parser:option("-L --localport", "comma-separated list of local ports to trace.")
  parser:option("-D --remoteport", "comma-separated list of remote ports to trace.")

  local args = parser:parse()
  if args.pid then
    local filter = string.format('if (pid != %d) { return 0; }', args.pid)
    program = program:gsub('FILTER_PID', filter)
  end

  if args.remoteport then
    -- skc_dport is kept in network byte order, so swap the requested ports.
    local dports = parse_ports(args.remoteport)
    for i, port in ipairs(dports) do
      dports[i] = inet_ntohs(port)
    end
    program = program:gsub('FILTER_DPORT', port_filter("dport", dports))
  end
  if args.localport then
    -- skc_num is already in host byte order: compare without swapping.
    local lports = parse_ports(args.localport)
    program = program:gsub('FILTER_LPORT', port_filter("lport", lports))
  end
  -- Remove whichever placeholders were not requested.
  program = program:gsub('FILTER_PID', '')
  program = program:gsub('FILTER_DPORT', '')
  program = program:gsub('FILTER_LPORT', '')

  if args.wide then
    header_string = "%-5s %-16.16s %-2s %-26s %-5s %-26s %-5s %6s %6s %s"
    format_string = "%-5d %-16.16s %-2s %-26s %-5s %-26s %-5d %6d %6d %.2f"
    ip_string = "IP"
    ip_version = true
  end
  if args.csv then
    header_string = "%s,%s,%s,%s,%s,%s,%s,%s,%s,%s"
    format_string = "%d,%s,%s,%s,%s,%s,%d,%d,%d,%.2f"
    ip_string = "IP"
    ip_version = true
    arg_csv = true
  end

  if args.time then
    arg_time = true
    io.write(string.format(args.csv and "%s," or "%-8s ", "TIME"))
  end

  if args.timestamp then
    arg_timestamp = true
    io.write(string.format(args.csv and "%s," or "%-9s ", "TIME(s)"))
  end
end

--- Tool entry point invoked by bcc-lua.
-- @param BPF bcc-lua BPF class
-- @param utils bcc-lua utilities table
return function(BPF, utils)
  parse_arg(utils)
  if debug_mode then
    print(program)
  end

  local bpf = BPF:new{text=program}
  bpf:attach_kprobe{event="tcp_set_state", fn_name="trace_tcp_set_state"}
  print(string.format(header_string, "PID", "COMM",
    ip_string, "LADDR",
    "LPORT", "RADDR", "RPORT", "TX_KB", "RX_KB", "MS"))
  local TASK_COMM_LEN = 16 -- linux/sched.h
  bpf:get_table("ipv4_events"):open_perf_buffer(print_ipv4_event, [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr;
      uint64_t daddr;
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
  ]], {TASK_COMM_LEN}, 64)
  bpf:get_table("ipv6_events"):open_perf_buffer(print_ipv6_event, [[
    struct {
      uint64_t ts_us;
      uint64_t pid;
      uint64_t saddr[2];
      uint64_t daddr[2];
      uint64_t ports;
      uint64_t rx_b;
      uint64_t tx_b;
      uint64_t span_us;
      char task[$];
    }
  ]], {TASK_COMM_LEN}, 64)

  bpf:perf_buffer_poll_loop()
end