--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
local ffi = require('ffi')
local bit = require('bit')
-- ljsyscall is loaded in protected mode so that a missing module produces
-- the explicit error raised in the compatibility section below.
local has_syscall, S = pcall(require, 'syscall')
local M = {}

ffi.cdef [[
struct bpf {
  /* Instruction classes */
  static const int LD = 0x00;
  static const int LDX = 0x01;
  static const int ST = 0x02;
  static const int STX = 0x03;
  static const int ALU = 0x04;
  static const int JMP = 0x05;
  static const int ALU64 = 0x07;
  /* ld/ldx fields */
  static const int W = 0x00;
  static const int H = 0x08;
  static const int B = 0x10;
  static const int ABS = 0x20;
  static const int IND = 0x40;
  static const int MEM = 0x60;
  static const int LEN = 0x80;
  static const int MSH = 0xa0;
  /* alu/jmp fields */
  static const int ADD = 0x00;
  static const int SUB = 0x10;
  static const int MUL = 0x20;
  static const int DIV = 0x30;
  static const int OR = 0x40;
  static const int AND = 0x50;
  static const int LSH = 0x60;
  static const int RSH = 0x70;
  static const int NEG = 0x80;
  static const int MOD = 0x90;
  static const int XOR = 0xa0;
  static const int JA = 0x00;
  static const int JEQ = 0x10;
  static const int JGT = 0x20;
  static const int JGE = 0x30;
  static const int JSET = 0x40;
  static const int K = 0x00;
  static const int X = 0x08;
  static const int JNE = 0x50; /* jump != */
  static const int JSGT = 0x60; /* SGT is signed '>', GT in x86 */
  static const int JSGE = 0x70; /* SGE is signed '>=', GE in x86 */
  static const int CALL = 0x80; /* function call */
  static const int EXIT = 0x90; /* function return */
  /* ld/ldx fields */
  static const int DW = 0x18; /* double word */
  static const int XADD = 0xc0; /* exclusive add */
  /* alu/jmp fields */
  static const int MOV = 0xb0; /* mov reg to reg */
  static const int ARSH = 0xc0; /* sign extending arithmetic shift right */
  /* change endianness of a register */
  static const int END = 0xd0; /* flags for endianness conversion: */
  static const int TO_LE = 0x00; /* convert to little-endian */
  static const int TO_BE = 0x08; /* convert to big-endian */
  /* misc */
  static const int PSEUDO_MAP_FD = 0x01;
  /* helper functions */
  static const int F_CURRENT_CPU = 0xffffffff;
  static const int F_USER_STACK = 1 << 8;
  static const int F_FAST_STACK_CMP = 1 << 9;
  static const int F_REUSE_STACKID = 1 << 10;
  /* special offsets for ancillary data */
  static const int NET_OFF = -0x100000;
  static const int LL_OFF = -0x200000;
};
/* eBPF commands */
struct bpf_cmd {
  static const int MAP_CREATE = 0;
  static const int MAP_LOOKUP_ELEM = 1;
  static const int MAP_UPDATE_ELEM = 2;
  static const int MAP_DELETE_ELEM = 3;
  static const int MAP_GET_NEXT_KEY = 4;
  static const int PROG_LOAD = 5;
  static const int OBJ_PIN = 6;
  static const int OBJ_GET = 7;
};
/* eBPF helpers */
struct bpf_func_id {
  static const int unspec = 0;
  static const int map_lookup_elem = 1;
  static const int map_update_elem = 2;
  static const int map_delete_elem = 3;
  static const int probe_read = 4;
  static const int ktime_get_ns = 5;
  static const int trace_printk = 6;
  static const int get_prandom_u32 = 7;
  static const int get_smp_processor_id = 8;
  static const int skb_store_bytes = 9;
  static const int l3_csum_replace = 10;
  static const int l4_csum_replace = 11;
  static const int tail_call = 12;
  static const int clone_redirect = 13;
  static const int get_current_pid_tgid = 14;
  static const int get_current_uid_gid = 15;
  static const int get_current_comm = 16;
  static const int get_cgroup_classid = 17;
  static const int skb_vlan_push = 18;
  static const int skb_vlan_pop = 19;
  static const int skb_get_tunnel_key = 20;
  static const int skb_set_tunnel_key = 21;
  static const int perf_event_read = 22;
  static const int redirect = 23;
  static const int get_route_realm = 24;
  static const int perf_event_output = 25;
  static const int skb_load_bytes = 26;
  static const int get_stackid = 27;
};
/* BPF_MAP_STACK_TRACE structures and constants */
static const int BPF_MAX_STACK_DEPTH = 127;
struct bpf_stacktrace {
  uint64_t ip[BPF_MAX_STACK_DEPTH];
};
]]

-- Compatibility: ljsyscall doesn't have support for BPF syscall
if not has_syscall or not S.bpf then
  error("ljsyscall doesn't support bpf(), must be updated")
else
  local strflag = require('syscall.helpers').strflag
  -- Compatibility: ljsyscall<=0.12
  -- Replace the map type table when it does not know LRU_HASH yet
  if not S.c.BPF_MAP.LRU_HASH then
    S.c.BPF_MAP = strflag {
      UNSPEC = 0,
      HASH = 1,
      ARRAY = 2,
      PROG_ARRAY = 3,
      PERF_EVENT_ARRAY = 4,
      PERCPU_HASH = 5,
      PERCPU_ARRAY = 6,
      STACK_TRACE = 7,
      CGROUP_ARRAY = 8,
      LRU_HASH = 9,
      LRU_PERCPU_HASH = 10,
      LPM_TRIE = 11,
      ARRAY_OF_MAPS = 12,
      HASH_OF_MAPS = 13,
      DEVMAP = 14,
      SOCKMAP = 15,
      CPUMAP = 16,
    }
  end
  -- Replace the program type table when it does not know TRACEPOINT yet
  if not S.c.BPF_PROG.TRACEPOINT then
    S.c.BPF_PROG = strflag {
      UNSPEC = 0,
      SOCKET_FILTER = 1,
      KPROBE = 2,
      SCHED_CLS = 3,
      SCHED_ACT = 4,
      TRACEPOINT = 5,
      XDP = 6,
      PERF_EVENT = 7,
      CGROUP_SKB = 8,
      CGROUP_SOCK = 9,
      LWT_IN = 10,
      LWT_OUT = 11,
      LWT_XMIT = 12,
      SOCK_OPS = 13,
      SK_SKB = 14,
      CGROUP_DEVICE = 15,
      SK_MSG = 16,
      RAW_TRACEPOINT = 17,
      CGROUP_SOCK_ADDR = 18,
    }
  end
end

-- Compatibility: metatype for stacktrace
-- Stateless iterator over the `ip` array of a stacktrace; indices are 0-based
-- and iteration stops at the end of the array or at the first zero entry.
local function stacktrace_iter(t, i)
  i = i + 1
  if i < #t and t.ip[i] > 0 then
    return i, t.ip[i]
  end
end
ffi.metatype('struct bpf_stacktrace', {
  -- Number of slots in the `ip` array
  __len = function (t) return ffi.sizeof(t.ip) / ffi.sizeof(t.ip[0]) end,
  -- Start at -1 so that the first index produced by the iterator is 0
  __ipairs = function (t) return stacktrace_iter, t, -1 end,
})

-- Reflect cdata type
-- Returns the type name found between '<' and '>' in the printed ctype,
-- or nil when `v` is not cdata.
function M.typename(v)
  if not v or type(v) ~= 'cdata' then return nil end
  return string.match(tostring(ffi.typeof(v)), '<([^>]+)')
end

-- Reflect if cdata type can be pointer (accepts array or pointer)
-- Returns true/false for cdata (arrays count unless `noarray` is set),
-- and nil when `v` is not cdata.
function M.isptr(v, noarray)
  local ctname = M.typename(v)
  if ctname then
    -- Only the last character of the type name is inspected
    ctname = string.sub(ctname, -1)
    ctname = ctname == '*' or (not noarray and ctname == ']')
  end
  return ctname
end

-- Return true if variable is a non-nil constant that can be used as immediate value
-- e.g. result of KSHORT and KNUM
function M.isimmconst(v)
  return (type(v.const) == 'number' and not ffi.istype(v.type, ffi.typeof('void')))
    or type(v.const) == 'cdata' and ffi.istype(v.type, ffi.typeof('uint64_t')) -- Lua numbers are at most 52 bits
    or type(v.const) == 'cdata' and ffi.istype(v.type, ffi.typeof('int64_t'))
end

-- Return the running kernel version encoded as 0xXXYYZZ.
-- The version is parsed from the 'kernel.version' sysctl, falling back to
-- 'kernel.osrelease'. Raises an error when neither yields an 'X.Y.Z' string.
function M.osversion()
  -- We have no better way to extract current kernel hex-string other
  -- than parsing headers, compiling a helper function or reading /proc
  -- The dots are escaped: an unescaped '.' matches any character, which lets
  -- dates or plain digit runs in the freeform string pass for a version.
  local pattern = '%d+%.%d+%.%d+'
  local function scan(name)
    local value = S.sysctl(name)
    return value and value:match(pattern)
  end
  -- kernel.version is freeform, fallback to kernel.osrelease
  local ver_str = scan('kernel.version') or scan('kernel.osrelease')
  if not ver_str then
    error('cannot determine kernel version from kernel.version or kernel.osrelease')
  end
  local version, count = 0, 2
  for i in ver_str:gmatch('%d+') do -- Convert 'X.Y.Z' to 0xXXYYZZ
    -- Each component owns one byte; clamp to 255 so that a large sublevel
    -- cannot spill into the neighbouring component.
    local component = math.min(tonumber(i), 255)
    version = bit.bor(version, bit.lshift(component, 8*count))
    count = count - 1
  end
  return version
end

-- Wrap a perf event reader in an object with block/next/read methods.
-- `event_type` is an optional C type name (string); when given, each sample
-- returned by `next` is cast to a pointer to that type.
function M.event_reader(reader, event_type)
  -- Caller can specify event message binary format
  if event_type then
    assert(type(event_type) == 'string' and ffi.typeof(event_type), 'not a valid type for event reader')
    event_type = ffi.typeof(event_type .. '*') -- Convert type to pointer-to-type
  end
  -- Wrap reader in interface that can interpret read event messages
  return setmetatable({reader=reader,type=event_type}, {__index = {
    -- Wait on the reader's file descriptor using select()
    block = function(_ --[[self]])
      return S.select { readfds = {reader.fd} }
    end,
    -- Fetch the next sample event, continuing from position `k`
    next = function(_ --[[self]], k)
      local len, ev = reader:next(k)
      -- Filter out only sample frames
      while ev and ev.type ~= S.c.PERF_RECORD.SAMPLE do
        len, ev = reader:next(len)
      end
      if ev and event_type then
        -- The perf event reader returns framed data with header and variable length
        -- This is going skip the frame header and cast data to given type
        ev = ffi.cast(event_type, ffi.cast('char *', ev) + ffi.sizeof('struct perf_event_header') + ffi.sizeof('uint32_t'))
      end
      return len, ev
    end,
    -- Generic-for adapter: `for len, ev in r:read() do ... end`
    read = function(self)
      return self.next, self, nil
    end,
  }})
end

-- Build a C struct declaration string from the format description of
-- tracepoint `tp`, read from the tracing debugfs.
-- Raises an error when the format file cannot be opened.
function M.tracepoint_type(tp)
  -- Read tracepoint format string
  local fp = assert(io.open('/sys/kernel/debug/tracing/events/'..tp..'/format', 'r'))
  local fmt = fp:read '*a'
  fp:close()
  -- Parse struct fields
  local fields = {}
  for f in fmt:gmatch 'field:([^;]+;)' do
    table.insert(fields, f)
  end
  return string.format('struct { %s }', table.concat(fields))
end

return M