--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
-- NOTE(review): 'struct bpf' is not declared in this file; it must already be
-- known to the FFI when this module is loaded.
local BPF = ffi.typeof('struct bpf')

ffi.cdef [[
struct sk_buff {
  uint32_t len;
  uint32_t pkt_type;
  uint32_t mark;
  uint32_t queue_mapping;
  uint32_t protocol;
  uint32_t vlan_present;
  uint32_t vlan_tci;
  uint32_t vlan_proto;
  uint32_t priority;
  uint32_t ingress_ifindex;
  uint32_t ifindex;
  uint32_t tc_index;
  uint32_t cb[5];
  uint32_t hash;
  uint32_t tc_classid;
  uint32_t data;
  uint32_t data_end;
  uint32_t napi_id;

  /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */
  uint32_t family;
  uint32_t remote_ip4;    /* Stored in network byte order */
  uint32_t local_ip4;     /* Stored in network byte order */
  uint32_t remote_ip6[4]; /* Stored in network byte order */
  uint32_t local_ip6[4];  /* Stored in network byte order */
  uint32_t remote_port;   /* Stored in network byte order */
  uint32_t local_port;    /* stored in host byte order */
  /* ... here. */

  uint32_t data_meta;
};

struct net_off_t {
  uint8_t  ver:4;
} __attribute__((packed));

struct eth_t {
  uint8_t  dst[6];
  uint8_t  src[6];
  uint16_t type;
} __attribute__((packed));

struct dot1q_t {
  uint16_t pri:3;
  uint16_t cfi:1;
  uint16_t vlanid:12;
  uint16_t type;
} __attribute__((packed));

struct arp_t {
  uint16_t htype;
  uint16_t ptype;
  uint8_t  hlen;
  uint8_t  plen;
  uint16_t oper;
  uint8_t  sha[6];
  uint32_t spa;
  uint8_t  tha[6];
  uint32_t tpa;
} __attribute__((packed));

struct ip_t {
  uint8_t  ver:4;
  uint8_t  hlen:4;
  uint8_t  tos;
  uint16_t tlen;
  uint16_t identification;
  uint16_t ffo_unused:1;
  uint16_t df:1;
  uint16_t mf:1;
  uint16_t foffset:13;
  uint8_t  ttl;
  uint8_t  proto;
  uint16_t hchecksum;
  uint32_t src;
  uint32_t dst;
} __attribute__((packed));

struct icmp_t {
  uint8_t  type;
  uint8_t  code;
  uint16_t checksum;
} __attribute__((packed));

struct ip6_t {
  uint32_t ver:4;
  uint32_t priority:8;
  uint32_t flow_label:20;
  uint16_t payload_len;
  uint8_t  next_header;
  uint8_t  hop_limit;
  uint64_t src_hi;
  uint64_t src_lo;
  uint64_t dst_hi;
  uint64_t dst_lo;
} __attribute__((packed));

struct ip6_opt_t {
  uint8_t  next_header;
  uint8_t  ext_len;
  uint8_t  pad[6];
} __attribute__((packed));

struct icmp6_t {
  uint8_t  type;
  uint8_t  code;
  uint16_t checksum;
} __attribute__((packed));

struct udp_t {
  uint16_t src_port;
  uint16_t dst_port;
  uint16_t length;
  uint16_t crc;
} __attribute__((packed));

struct tcp_t {
  uint16_t src_port;
  uint16_t dst_port;
  uint32_t seq_num;
  uint32_t ack_num;
  uint8_t  offset:4;
  uint8_t  reserved:4;
  uint8_t  flag_cwr:1;
  uint8_t  flag_ece:1;
  uint8_t  flag_urg:1;
  uint8_t  flag_ack:1;
  uint8_t  flag_psh:1;
  uint8_t  flag_rst:1;
  uint8_t  flag_syn:1;
  uint8_t  flag_fin:1;
  uint16_t rcv_wnd;
  uint16_t cksum;
  uint16_t urg_ptr;
} __attribute__((packed));

struct vxlan_t {
  uint32_t rsv1:4;
  uint32_t iflag:1;
  uint32_t rsv2:3;
  uint32_t rsv3:24;
  uint32_t key:24;
  uint32_t rsv4:8;
} __attribute__((packed));
]]


-- Architecture-specific ptrace register layout
local S = require('syscall')
local arch = S.abi.arch
-- Symbolic name (parm1..parm5, ret, fp) -> name of the pt_regs field holding it.
-- Stays empty on architectures without a layout below.
local parm_to_reg = {}
if arch == 'x64' then
  ffi.cdef [[
    struct pt_regs {
      unsigned long r15;
      unsigned long r14;
      unsigned long r13;
      unsigned long r12;
      unsigned long bp;
      unsigned long bx;
      unsigned long r11;
      unsigned long r10;
      unsigned long r9;
      unsigned long r8;
      unsigned long ax;
      unsigned long cx;
      unsigned long dx;
      unsigned long si;
      unsigned long di;
      unsigned long orig_ax;
      unsigned long ip;
      unsigned long cs;
      unsigned long flags;
      unsigned long sp;
      unsigned long ss;
    };]]
  parm_to_reg = {parm1='di', parm2='si', parm3='dx', parm4='cx', parm5='r8', ret='sp', fp='bp'}
else
  -- No known register layout: declare an empty struct so the type still exists
  ffi.cdef 'struct pt_regs {};'
end
-- Map symbolic registers to architecture ABI
-- The lookup returns the register field *name*; unknown names raise an error.
ffi.metatype('struct pt_regs', {
  __index = function (_ --[[t]],k)
    return assert(parm_to_reg[k], 'no such register: '..k)
  end,
})

local M = {}

--- Dissector interface: derive variable `dst` of protocol `type` from `src`.
-- @param type  ctype the new variable is tagged with once the offset is computed
-- @param e     code generation environment (provides `V`, `vcopy`, ...)
-- @param dst   destination variable
-- @param src   source (parent) variable; its `const` must carry `__dissector`
-- @param field name of the handler looked up on the parent's dissector ctype
local function dissector(type, e, dst, src, field)
  local parent = e.V[src].const
  -- Create new dissector variable
  e.vcopy(dst, src)
  -- Compute and materialize new dissector offset from parent
  -- (a fresh table, so advancing `dst` does not alter the parent's offset)
  e.V[dst].const = {off=e.V[src].const.off, __dissector=e.V[src].const.__dissector}
  parent.__dissector[field](e, dst)
  e.V[dst].const.__dissector = type
end
M.dissector = dissector

--- Advance a variable's offset by a length field read from the packet.
-- Get current effective offset, load field value at an offset relative to it and
-- add its value to compute next effective offset (e.g. udp_off = ip_off + pkt[ip_off].hlen)
-- @param e     code generation environment
-- @param var   variable whose offset is advanced
-- @param type  ctype of the loaded field; its size selects the load width
-- @param off   offset of the field relative to the current offset (defaults to 0)
-- @param mask  optional mask ANDed with the loaded value
-- @param shift optional shift; positive shifts left, negative shifts right
local function next_offset(e, var, type, off, mask, shift)
  local d = e.V[var].const
  -- Default the field offset once, up front. The previous `d.off + off or 0`
  -- parsed as `(d.off + off) or 0`, so a nil `off` raised an arithmetic error
  -- instead of falling back to 0.
  off = off or 0
  -- Materialize relative offset value in R0
  local dst_reg, tmp_reg
  if d.off then
    -- Base offset is a known constant: load from an absolute position
    dst_reg = e.vreg(var, 0, true)
    tmp_reg = dst_reg -- Use target register to avoid copy
    e.emit(BPF.LD + BPF.ABS + e.const_width[ffi.sizeof(type)], tmp_reg, 0, 0, d.off + off)
  else
    -- Base offset lives in a register: load indirectly through it
    tmp_reg = e.vreg(e.tmpvar, 0, true, type) -- Reserve R0 for temporary relative offset
    dst_reg = e.vreg(var) -- Must rematerialize (if it was spilled by tmp var)
    e.emit(BPF.LD + BPF.IND + e.const_width[ffi.sizeof(type)], tmp_reg, dst_reg, 0, off)
  end
  -- Finalize relative offset
  if mask then
    e.emit(BPF.ALU + BPF.AND + BPF.K, tmp_reg, 0, 0, mask)
  end
  if shift and shift ~= 0 then
    local op = BPF.LSH
    if shift < 0 then
      op = BPF.RSH
      shift = -shift
    end
    e.emit(BPF.ALU + op + BPF.K, tmp_reg, 0, 0, shift)
  end
  -- Add to base offset to turn it into effective address
  if dst_reg ~= tmp_reg then
    e.emit(BPF.ALU + BPF.ADD + BPF.X, dst_reg, tmp_reg, 0, 0)
  else
    e.emit(BPF.ALU + BPF.ADD + BPF.K, dst_reg, 0, 0, d.off)
  end
  -- Discard temporary allocations
  -- (the offset is now held in a register, so it is no longer a known constant)
  d.off = nil
  e.V[e.tmpvar].reg = nil
end

--- Advance a variable's offset by a fixed number of bytes.
-- While the offset is still a known constant only the bookkeeping changes;
-- otherwise an add instruction is emitted on the variable's register.
-- @param e   code generation environment
-- @param var variable whose offset is advanced
-- @param off number of bytes to skip
local function next_skip(e, var, off)
  local d = e.V[var].const
  if not d.off then
    local dst_reg = e.vreg(var)
    e.emit(BPF.ALU64 + BPF.ADD + BPF.K, dst_reg, 0, 0, off)
  else
    d.off = d.off + off
  end
end

--- Advance a variable's constant offset past the Ethernet header.
local function skip_eth(e, dst)
  -- IP starts right after ETH header (fixed size)
  local d = e.V[dst].const
  d.off = d.off + ffi.sizeof('struct eth_t')
end

-- Export types
--- Build a type descriptor: table `t` (or a new one) tagged with the ctype
-- named by `typestr` under the `__dissector` key.
M.type = function(typestr, t)
  t = t or {}
  t.__dissector=ffi.typeof(typestr)
  return t
end
M.skb     = M.type('struct sk_buff', {source='ptr_to_ctx'})
M.pt_regs = M.type('struct pt_regs', {source='ptr_to_probe'})
M.pkt     = M.type('struct eth_t',   {off=0, source='ptr_to_pkt'}) -- skb needs special accessors
-- M.eth   = function (...) return dissector(ffi.typeof('struct eth_t'), ...) end
M.dot1q   = function (...) return dissector(ffi.typeof('struct dot1q_t'), ...) end
M.arp     = function (...) return dissector(ffi.typeof('struct arp_t'), ...) end
M.icmp    = function (...) return dissector(ffi.typeof('struct icmp_t'), ...) end
M.ip      = function (...) return dissector(ffi.typeof('struct ip_t'), ...) end
M.icmp6   = function (...) return dissector(ffi.typeof('struct icmp6_t'), ...) end
M.ip6     = function (...) return dissector(ffi.typeof('struct ip6_t'), ...) end
M.ip6_opt = function (...) return dissector(ffi.typeof('struct ip6_opt_t'), ...) end
M.udp     = function (...) return dissector(ffi.typeof('struct udp_t'), ...) end
M.tcp     = function (...) return dissector(ffi.typeof('struct tcp_t'), ...) end
M.vxlan   = function (...) return dissector(ffi.typeof('struct vxlan_t'), ...) end
M.data    = function (...) return dissector(ffi.typeof('uint8_t'), ...) end
M.net_off = function (...) return dissector(ffi.typeof('struct net_off_t'), ...) end

-- Metatables
-- Each metatype lists the child protocols reachable from a header type and
-- how the offset is advanced to reach them.
ffi.metatype(ffi.typeof('struct eth_t'), {
  __index = {
    ip = skip_eth,
    ip6 = skip_eth,
    net_off = function (e, dst)
      next_skip(e, dst, BPF.NET_OFF)
    end,
  }
})

ffi.metatype(ffi.typeof('struct net_off_t'), {
  __index = {
    -- Nothing to skip: the offset is left unchanged
    ip = function () end,
    ip6 = function () end,
  }
})

-- Skip IP header length (stored as number of words)
-- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
-- Mask first nibble and shift by 2 (multiplication by 4)
local function skip_ip(e, dst)
  next_offset(e, dst, ffi.typeof('uint8_t'), 0, 0x0f, 2)
end

ffi.metatype(ffi.typeof('struct ip_t'), {
  __index = {
    icmp = skip_ip,
    udp = skip_ip,
    tcp = skip_ip,
  }
})

-- Skip fixed IPv6 header length (40 bytes)
-- The caller must check the value of `next_header` to skip any extension headers
local function skip_ip6(e, dst)
  next_skip(e, dst, ffi.sizeof('struct ip6_t'))
end

ffi.metatype(ffi.typeof('struct ip6_t'), {
  __index = {
    icmp6 = skip_ip6,
    udp = skip_ip6,
    tcp = skip_ip6,
    ip6_opt = skip_ip6,
  }
})

local ip6_opt_ext_len_off = ffi.offsetof('struct ip6_opt_t', 'ext_len')

-- Skip IPv6 extension header length (field `ext_len`)
-- The loaded value is added as-is (no mask or shift is applied).
local function skip_ip6_opt(e, dst)
  next_offset(e, dst, ffi.typeof('uint8_t'), ip6_opt_ext_len_off)
end

ffi.metatype(ffi.typeof('struct ip6_opt_t'), {
  __index = {
    icmp6 = skip_ip6_opt,
    udp = skip_ip6_opt,
    tcp = skip_ip6_opt,
    ip6_opt = skip_ip6_opt,
  }
})

ffi.metatype(ffi.typeof('struct tcp_t'), {
  __index = {
    -- Skip TCP header length (stored as number of words)
    -- e.g. hlen = 5, Header Length = 5 x sizeof(u32) = 20 octets
    -- The byte is masked with 0xf0 and shifted right by 2.
    data = function(e, dst)
      next_offset(e, dst, ffi.typeof('uint8_t'), ffi.offsetof('struct tcp_t', 'offset'), 0xf0, -2)
    end,
  }
})

ffi.metatype(ffi.typeof('struct udp_t'), {
  __index = {
    -- Skip UDP header length (8 octets)
    data = function(e, dst)
      next_skip(e, dst, ffi.sizeof('struct udp_t'))
    end,
  }
})

-- Constants
M.c = {
  eth = { -- Constants http://standards.ieee.org/regauth/ethertype
    ip     = 0x0800, -- IP (v4) protocol
    ip6    = 0x86dd, -- IP (v6) protocol
    arp    = 0x0806, -- Address resolution protocol
    revarp = 0x8035, -- Reverse addr resolution protocol
    vlan   = 0x8100, -- IEEE 802.1Q VLAN tagging
  },
  ip = {
    -- Reserved Addresses
    addr_any         = 0x00000000, -- 0.0.0.0
    addr_broadcast   = 0xffffffff, -- 255.255.255.255
    addr_loopback    = 0x7f000001, -- 127.0.0.1
    addr_mcast_all   = 0xe0000001, -- 224.0.0.1
    addr_mcast_local = 0xe00000ff, -- 224.0.0.255
    -- Type of service (ip_tos), RFC 1349 ("obsoleted by RFC 2474")
    tos_default      = 0x00, -- default
    tos_lowdelay     = 0x10, -- low delay
    tos_throughput   = 0x08, -- high throughput
    tos_reliability  = 0x04, -- high reliability
    tos_lowcost      = 0x02, -- low monetary cost - XXX
    tos_ect          = 0x02, -- ECN-capable transport
    tos_ce           = 0x01, -- congestion experienced
    -- Fragmentation flags (ip_off)
    rf               = 0x8000, -- reserved
    df               = 0x4000, -- don't fragment
    mf               = 0x2000, -- more fragments (not last frag)
    offmask          = 0x1fff, -- mask for fragment offset
    -- Time-to-live (ip_ttl), seconds
    ttl_default      = 64,  -- default ttl, RFC 1122, RFC 1340
    ttl_max          = 255, -- maximum ttl
    -- Protocol (ip_p) - http://www.iana.org/assignments/protocol-numbers
    proto_ip         = 0,   -- dummy for IP
    proto_hopopts    = 0,   -- IPv6 hop-by-hop options
    proto_icmp       = 1,   -- ICMP
    proto_igmp       = 2,   -- IGMP
    proto_ggp        = 3,   -- gateway-gateway protocol
    proto_ipip       = 4,   -- IP in IP
    proto_st         = 5,   -- ST datagram mode
    proto_tcp        = 6,   -- TCP
    proto_cbt        = 7,   -- CBT
    proto_egp        = 8,   -- exterior gateway protocol
    proto_igp        = 9,   -- interior gateway protocol
    proto_bbnrcc     = 10,  -- BBN RCC monitoring
    proto_nvp        = 11,  -- Network Voice Protocol
    proto_pup        = 12,  -- PARC universal packet
    proto_argus      = 13,  -- ARGUS
    proto_emcon      = 14,  -- EMCON
    proto_xnet       = 15,  -- Cross Net Debugger
    proto_chaos      = 16,  -- Chaos
    proto_udp        = 17,  -- UDP
    proto_mux        = 18,  -- multiplexing
    proto_dcnmeas    = 19,  -- DCN measurement
    proto_hmp        = 20,  -- Host Monitoring Protocol
    proto_prm        = 21,  -- Packet Radio Measurement
    proto_idp        = 22,  -- Xerox NS IDP
    proto_trunk1     = 23,  -- Trunk-1
    proto_trunk2     = 24,  -- Trunk-2
    proto_leaf1      = 25,  -- Leaf-1
    proto_leaf2      = 26,  -- Leaf-2
    proto_rdp        = 27,  -- "Reliable Datagram" proto
    proto_irtp       = 28,  -- Inet Reliable Transaction
    proto_tp         = 29,  -- ISO TP class 4
    proto_netblt     = 30,  -- Bulk Data Transfer
    proto_mfpnsp     = 31,  -- MFE Network Services
    proto_meritinp   = 32,  -- Merit Internodal Protocol
    proto_sep        = 33,  -- Sequential Exchange proto
    proto_3pc        = 34,  -- Third Party Connect proto
    proto_idpr       = 35,  -- Interdomain Policy Route
    proto_xtp        = 36,  -- Xpress Transfer Protocol
    proto_ddp        = 37,  -- Datagram Delivery Proto
    proto_cmtp       = 38,  -- IDPR Ctrl Message Trans
    proto_tppp       = 39,  -- TP++ Transport Protocol
    proto_il         = 40,  -- IL Transport Protocol
    proto_ip6        = 41,  -- IPv6
    proto_sdrp       = 42,  -- Source Demand Routing
    proto_routing    = 43,  -- IPv6 routing header
    proto_fragment   = 44,  -- IPv6 fragmentation header
    proto_rsvp       = 46,  -- Reservation protocol
    proto_gre        = 47,  -- General Routing Encap
    proto_mhrp       = 48,  -- Mobile Host Routing
    proto_ena        = 49,  -- ENA
    proto_esp        = 50,  -- Encap Security Payload
    proto_ah         = 51,  -- Authentication Header
    proto_inlsp      = 52,  -- Integrated Net Layer Sec
    proto_swipe      = 53,  -- SWIPE
    proto_narp       = 54,  -- NBMA Address Resolution
    proto_mobile     = 55,  -- Mobile IP, RFC 2004
    proto_tlsp       = 56,  -- Transport Layer Security
    proto_skip       = 57,  -- SKIP
    proto_icmp6      = 58,  -- ICMP for IPv6
    proto_none       = 59,  -- IPv6 no next header
    proto_dstopts    = 60,  -- IPv6 destination options
    proto_anyhost    = 61,  -- any host internal proto
    proto_cftp       = 62,  -- CFTP
    proto_anynet     = 63,  -- any local network
    proto_expak      = 64,  -- SATNET and Backroom EXPAK
    proto_kryptolan  = 65,  -- Kryptolan
    proto_rvd        = 66,  -- MIT Remote Virtual Disk
    proto_ippc       = 67,  -- Inet Pluribus Packet Core
    proto_distfs     = 68,  -- any distributed fs
    proto_satmon     = 69,  -- SATNET Monitoring
    proto_visa       = 70,  -- VISA Protocol
    proto_ipcv       = 71,  -- Inet Packet Core Utility
    proto_cpnx       = 72,  -- Comp Proto Net Executive
    proto_cphb       = 73,  -- Comp Protocol Heart Beat
    proto_wsn        = 74,  -- Wang Span Network
    proto_pvp        = 75,  -- Packet Video Protocol
    proto_brsatmon   = 76,  -- Backroom SATNET Monitor
    proto_sunnd      = 77,  -- SUN ND Protocol
    proto_wbmon      = 78,  -- WIDEBAND Monitoring
    proto_wbexpak    = 79,  -- WIDEBAND EXPAK
    proto_eon        = 80,  -- ISO CNLP
    proto_vmtp       = 81,  -- Versatile Msg Transport
    proto_svmtp      = 82,  -- Secure VMTP
    proto_vines      = 83,  -- VINES
    proto_ttp        = 84,  -- TTP
    proto_nsfigp     = 85,  -- NSFNET-IGP
    proto_dgp        = 86,  -- Dissimilar Gateway Proto
    proto_tcf        = 87,  -- TCF
    proto_eigrp      = 88,  -- EIGRP
    proto_ospf       = 89,  -- Open Shortest Path First
    proto_spriterpc  = 90,  -- Sprite RPC Protocol
    proto_larp       = 91,  -- Locus Address Resolution
    proto_mtp        = 92,  -- Multicast Transport Proto
    proto_ax25       = 93,  -- AX.25 Frames
    proto_ipipencap  = 94,  -- yet-another IP encap
    proto_micp       = 95,  -- Mobile Internet Ctrl
    proto_sccsp      = 96,  -- Semaphore Comm Sec Proto
    proto_etherip    = 97,  -- Ethernet in IPv4
    proto_encap      = 98,  -- encapsulation header
    proto_anyenc     = 99,  -- private encryption scheme
    proto_gmtp       = 100, -- GMTP
    proto_ifmp       = 101, -- Ipsilon Flow Mgmt Proto
    proto_pnni       = 102, -- PNNI over IP
    proto_pim        = 103, -- Protocol Indep Multicast
    proto_aris       = 104, -- ARIS
    proto_scps       = 105, -- SCPS
    proto_qnx        = 106, -- QNX
    proto_an         = 107, -- Active Networks
    proto_ipcomp     = 108, -- IP Payload Compression
    proto_snp        = 109, -- Sitara Networks Protocol
    proto_compaqpeer = 110, -- Compaq Peer Protocol
    proto_ipxip      = 111, -- IPX in IP
    proto_vrrp       = 112, -- Virtual Router Redundancy
    proto_pgm        = 113, -- PGM Reliable Transport
    proto_any0hop    = 114, -- 0-hop protocol
    proto_l2tp       = 115, -- Layer 2 Tunneling Proto
    proto_ddx        = 116, -- D-II Data Exchange (DDX)
    proto_iatp       = 117, -- Interactive Agent Xfer
    proto_stp        = 118, -- Schedule Transfer Proto
    proto_srp        = 119, -- SpectraLink Radio Proto
    proto_uti        = 120, -- UTI
    proto_smp        = 121, -- Simple Message Protocol
    proto_sm         = 122, -- SM
    proto_ptp        = 123, -- Performance Transparency
    proto_isis       = 124, -- ISIS over IPv4
    proto_fire       = 125, -- FIRE
    proto_crtp       = 126, -- Combat Radio Transport
    proto_crudp      = 127, -- Combat Radio UDP
    proto_sscopmce   = 128, -- SSCOPMCE
    proto_iplt       = 129, -- IPLT
    proto_sps        = 130, -- Secure Packet Shield
    proto_pipe       = 131, -- Private IP Encap in IP
    proto_sctp       = 132, -- Stream Ctrl Transmission
    proto_fc         = 133, -- Fibre Channel
    proto_rsvpign    = 134, -- RSVP-E2E-IGNORE
    proto_raw        = 255, -- Raw IP packets
    proto_reserved   = 255, -- Reserved
  },
}

return M