#!/usr/bin/python
# @lint-avoid-python-3-compatibility-imports
#
# tcptop    Summarize TCP send/recv throughput by host.
#           For Linux, uses BCC, eBPF. Embedded C.
#
# USAGE: tcptop [-h] [-C] [-S] [-p PID] [--cgroupmap CGROUPMAP]
#               [--mntnsmap MNTNSMAP] [-4 | -6] [interval [count]]
#
# This uses dynamic tracing of kernel functions, and will need to be updated
# to match kernel changes.
#
# WARNING: This traces all send/receives at the TCP level, and while it
# summarizes data in-kernel to reduce overhead, there may still be some
# overhead at high TCP send/receive rates (e.g., ~13% of one CPU at 100k TCP
# events/sec. This is not the same as packet rate: funccount can be used to
# count the kprobes below to find out the TCP rate). Test in a lab environment
# first. If your send/receive rate is low (e.g., <1k/sec) then the overhead is
# expected to be negligible.
#
# ToDo: Fit output to screen size (top X only) in default (not -C) mode.
#
# Copyright 2016 Netflix, Inc.
# Licensed under the Apache License, Version 2.0 (the "License")
#
# 02-Sep-2016   Brendan Gregg   Created this.
26 27from __future__ import print_function 28from bcc import BPF 29from bcc.containers import filter_by_containers 30import argparse 31from socket import inet_ntop, AF_INET, AF_INET6 32from struct import pack 33from time import sleep, strftime 34from subprocess import call 35from collections import namedtuple, defaultdict 36 37# arguments 38def range_check(string): 39 value = int(string) 40 if value < 1: 41 msg = "value must be stricly positive, got %d" % (value,) 42 raise argparse.ArgumentTypeError(msg) 43 return value 44 45examples = """examples: 46 ./tcptop # trace TCP send/recv by host 47 ./tcptop -C # don't clear the screen 48 ./tcptop -p 181 # only trace PID 181 49 ./tcptop --cgroupmap mappath # only trace cgroups in this BPF map 50 ./tcptop --mntnsmap mappath # only trace mount namespaces in the map 51 ./tcptop -4 # trace IPv4 family only 52 ./tcptop -6 # trace IPv6 family only 53""" 54parser = argparse.ArgumentParser( 55 description="Summarize TCP send/recv throughput by host", 56 formatter_class=argparse.RawDescriptionHelpFormatter, 57 epilog=examples) 58parser.add_argument("-C", "--noclear", action="store_true", 59 help="don't clear the screen") 60parser.add_argument("-S", "--nosummary", action="store_true", 61 help="skip system summary line") 62parser.add_argument("-p", "--pid", 63 help="trace this PID only") 64parser.add_argument("interval", nargs="?", default=1, type=range_check, 65 help="output interval, in seconds (default 1)") 66parser.add_argument("count", nargs="?", default=-1, type=range_check, 67 help="number of outputs") 68parser.add_argument("--cgroupmap", 69 help="trace cgroups in this BPF map only") 70parser.add_argument("--mntnsmap", 71 help="trace mount namespaces in this BPF map only") 72group = parser.add_mutually_exclusive_group() 73group.add_argument("-4", "--ipv4", action="store_true", 74 help="trace IPv4 family only") 75group.add_argument("-6", "--ipv6", action="store_true", 76 help="trace IPv6 family only") 
parser.add_argument("--ebpf", action="store_true",
    help=argparse.SUPPRESS)
args = parser.parse_args()
debug = 0

# linux stats
loadavg = "/proc/loadavg"

# define BPF program
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <net/sock.h>
#include <bcc/proto.h>

struct ipv4_key_t {
    u32 pid;
    char name[TASK_COMM_LEN];
    u32 saddr;
    u32 daddr;
    u16 lport;
    u16 dport;
};
BPF_HASH(ipv4_send_bytes, struct ipv4_key_t);
BPF_HASH(ipv4_recv_bytes, struct ipv4_key_t);

struct ipv6_key_t {
    unsigned __int128 saddr;
    unsigned __int128 daddr;
    u32 pid;
    char name[TASK_COMM_LEN];
    u16 lport;
    u16 dport;
    u64 __pad__;
};
BPF_HASH(ipv6_send_bytes, struct ipv6_key_t);
BPF_HASH(ipv6_recv_bytes, struct ipv6_key_t);

int kprobe__tcp_sendmsg(struct pt_regs *ctx, struct sock *sk,
    struct msghdr *msg, size_t size)
{
    if (container_should_be_filtered()) {
        return 0;
    }

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    FILTER_PID

    u16 dport = 0, family = sk->__sk_common.skc_family;

    FILTER_FAMILY

    if (family == AF_INET) {
        struct ipv4_key_t ipv4_key = {.pid = pid};
        bpf_get_current_comm(&ipv4_key.name, sizeof(ipv4_key.name));
        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv4_key.dport = ntohs(dport);
        ipv4_send_bytes.increment(ipv4_key, size);

    } else if (family == AF_INET6) {
        struct ipv6_key_t ipv6_key = {.pid = pid};
        bpf_get_current_comm(&ipv6_key.name, sizeof(ipv6_key.name));
        bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv6_key.dport = ntohs(dport);
        ipv6_send_bytes.increment(ipv6_key, size);
    }
    // else drop

    return 0;
}

/*
 * tcp_recvmsg() would be obvious to trace, but is less suitable because:
 * - we'd need to trace both entry and return, to have both sock and size
 * - misses tcp_read_sock() traffic
 * we'd much prefer tracepoints once they are available.
 */
int kprobe__tcp_cleanup_rbuf(struct pt_regs *ctx, struct sock *sk, int copied)
{
    if (container_should_be_filtered()) {
        return 0;
    }

    u32 pid = bpf_get_current_pid_tgid() >> 32;
    FILTER_PID

    u16 dport = 0, family = sk->__sk_common.skc_family;

    if (copied <= 0)
        return 0;

    FILTER_FAMILY

    if (family == AF_INET) {
        struct ipv4_key_t ipv4_key = {.pid = pid};
        bpf_get_current_comm(&ipv4_key.name, sizeof(ipv4_key.name));
        ipv4_key.saddr = sk->__sk_common.skc_rcv_saddr;
        ipv4_key.daddr = sk->__sk_common.skc_daddr;
        ipv4_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv4_key.dport = ntohs(dport);
        ipv4_recv_bytes.increment(ipv4_key, copied);

    } else if (family == AF_INET6) {
        struct ipv6_key_t ipv6_key = {.pid = pid};
        bpf_get_current_comm(&ipv6_key.name, sizeof(ipv6_key.name));
        bpf_probe_read_kernel(&ipv6_key.saddr, sizeof(ipv6_key.saddr),
            &sk->__sk_common.skc_v6_rcv_saddr.in6_u.u6_addr32);
        bpf_probe_read_kernel(&ipv6_key.daddr, sizeof(ipv6_key.daddr),
            &sk->__sk_common.skc_v6_daddr.in6_u.u6_addr32);
        ipv6_key.lport = sk->__sk_common.skc_num;
        dport = sk->__sk_common.skc_dport;
        ipv6_key.dport = ntohs(dport);
        ipv6_recv_bytes.increment(ipv6_key, copied);
    }
    // else drop

    return 0;
}
"""

# code substitutions
# The PID is spliced into C source, so force it through int() to guarantee
# only a number ever reaches the compiler. "is not None" keeps "-p 0" working
# whether args.pid is a string or an int.
if args.pid is not None:
    bpf_text = bpf_text.replace('FILTER_PID',
        'if (pid != %d) { return 0; }' % int(args.pid))
else:
    bpf_text = bpf_text.replace('FILTER_PID', '')
if args.ipv4:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET) { return 0; }')
elif args.ipv6:
    bpf_text = bpf_text.replace('FILTER_FAMILY',
        'if (family != AF_INET6) { return 0; }')
# No family filter requested: drop the placeholder (no-op if replaced above).
bpf_text = bpf_text.replace('FILTER_FAMILY', '')
# Prepend the container-filter code generated by bcc; presumably this is what
# defines container_should_be_filtered() used by both probes.
bpf_text = filter_by_containers(args) + bpf_text
if debug or args.ebpf:
    print(bpf_text)
    if args.ebpf:
        exit()

TCPSessionKey = namedtuple('TCPSession', ['pid', 'name', 'laddr', 'lport', 'daddr', 'dport'])

def _comm_to_str(name):
    """Return the task name as text.

    On Python 3 the char[] field of the map key arrives as bytes, which
    "%s" would render as b'sshd'; decode it. On Python 2 the value is
    already a str and is returned untouched.
    """
    if not isinstance(name, str):
        name = name.decode('utf-8', 'replace')
    return name

def get_ipv4_session_key(k):
    """Convert an ipv4_key_t map key into a hashable TCPSessionKey.

    The u32 addresses are re-packed in native byte order to recover the raw
    4 address bytes that inet_ntop() expects.
    """
    return TCPSessionKey(pid=k.pid,
                         name=_comm_to_str(k.name),
                         laddr=inet_ntop(AF_INET, pack("I", k.saddr)),
                         lport=k.lport,
                         daddr=inet_ntop(AF_INET, pack("I", k.daddr)),
                         dport=k.dport)

def get_ipv6_session_key(k):
    """Convert an ipv6_key_t map key into a hashable TCPSessionKey."""
    return TCPSessionKey(pid=k.pid,
                         name=_comm_to_str(k.name),
                         laddr=inet_ntop(AF_INET6, k.saddr),
                         lport=k.lport,
                         daddr=inet_ntop(AF_INET6, k.daddr),
                         dport=k.dport)

def _collect_throughput(send_map, recv_map, make_key):
    """Drain both BPF maps into {session_key: [send_bytes, recv_bytes]}.

    Each map is cleared right after it is read so the next interval starts
    from zero.
    """
    throughput = defaultdict(lambda: [0, 0])
    for k, v in send_map.items():
        throughput[make_key(k)][0] = v.value
    send_map.clear()

    for k, v in recv_map.items():
        throughput[make_key(k)][1] = v.value
    recv_map.clear()
    return throughput

def _print_throughput(throughput, header, row_fmt):
    """Print header (only if there is data) and rows, busiest session first."""
    if throughput:
        print(header)

    ranked = sorted(throughput.items(),
                    key=lambda kv: sum(kv[1]),
                    reverse=True)
    for k, (send_bytes, recv_bytes) in ranked:
        print(row_fmt % (k.pid,
            k.name,
            k.laddr + ":" + str(k.lport),
            k.daddr + ":" + str(k.dport),
            int(recv_bytes / 1024), int(send_bytes / 1024)))

IPV4_HEADER = "%-7s %-12s %-21s %-21s %6s %6s" % ("PID", "COMM",
    "LADDR", "RADDR", "RX_KB", "TX_KB")
IPV4_ROW = "%-7d %-12.12s %-21s %-21s %6d %6d"
# more than 80 chars, sadly.
IPV6_HEADER = "\n%-7s %-12s %-32s %-32s %6s %6s" % ("PID", "COMM",
    "LADDR6", "RADDR6", "RX_KB", "TX_KB")
IPV6_ROW = "%-7d %-12.12s %-32s %-32s %6d %6d"

# initialize BPF
b = BPF(text=bpf_text)

ipv4_send_bytes = b["ipv4_send_bytes"]
ipv4_recv_bytes = b["ipv4_recv_bytes"]
ipv6_send_bytes = b["ipv6_send_bytes"]
ipv6_recv_bytes = b["ipv6_recv_bytes"]

print('Tracing... Output every %s secs. Hit Ctrl-C to end' % args.interval)

# output
i = 0
exiting = False
# count defaults to -1, which i never equals, so the loop runs until Ctrl-C.
while i != args.count and not exiting:
    try:
        sleep(args.interval)
    except KeyboardInterrupt:
        # still print one final summary before leaving the loop
        exiting = True

    # header
    if args.noclear:
        print()
    else:
        call("clear")
    if not args.nosummary:
        with open(loadavg) as stats:
            print("%-8s loadavg: %s" % (strftime("%H:%M:%S"), stats.read()))

    # IPv4
    _print_throughput(
        _collect_throughput(ipv4_send_bytes, ipv4_recv_bytes,
                            get_ipv4_session_key),
        IPV4_HEADER, IPV4_ROW)

    # IPv6
    _print_throughput(
        _collect_throughput(ipv6_send_bytes, ipv6_recv_bytes,
                            get_ipv6_session_key),
        IPV6_HEADER, IPV6_ROW)

    i += 1