--[[
Copyright 2016 Marek Vavrusa <mvavrusa@cloudflare.com>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
]]
-- LuaJIT to BPF bytecode compiler.
--
-- The code generation phase is currently one-pass and produces:
-- * Compiled code in BPF bytecode format (https://www.kernel.org/doc/Documentation/networking/filter.txt)
-- * Variables with liveness analysis and other metadata (spill information, compile-time value)
--
-- The code generator optimises as much as possible in a single pass:
-- * Folding of compile-time expressions and constant propagation
-- * Basic control flow analysis with dead code elimination (based on compile-time expressions)
-- * Single-pass optimistic register allocation
--
-- The first pass doesn't have variable lifetime visibility yet, so it relies on a rewriter for further
-- optimisations such as:
-- * Dead store elimination (the first pass doesn't know if/when a variable is going to be used)
-- * Common sub-expression elimination (relies on DCE and liveness analysis)
-- * Orphan JMP elimination (removing these in the first pass would break previous JMP targets)
-- * Better register allocation (needs to be recomputed after optimisations)

local ffi = require('ffi')
local bit = require('bit')
local S = require('syscall')
local bytecode = require('bpf.ljbytecode')
local cdef = require('bpf.cdef')
local proto = require('bpf.proto')
local builtins = require('bpf.builtins')

-- Constants
local ALWAYS, NEVER = -1, -2
local BPF = ffi.typeof('struct bpf')
local HELPER = ffi.typeof('struct bpf_func_id')

-- Symbolic table of constant expressions over numbers
local const_expr = {
    ADD = function (a, b) return a + b end,
    SUB = function (a, b) return a - b end,
    DIV = function (a, b) return a / b end,
    MOD = function (a, b) return a % b end,
    JEQ = function (a, b) return a == b end,
    JNE = function (a, b) return a ~= b end,
    JGE = function (a, b) return a >= b end,
    JGT = function (a, b) return a > b end,
}

local const_width = {
    [1] = BPF.B, [2] = BPF.H, [4] = BPF.W, [8] = BPF.DW,
}

-- Built-ins that are strict only (never compile-time expandable)
local builtins_strict = {
    [ffi.new] = true,
    [print] = true,
}

-- Deep copy a table
local function table_copy(t)
    local copy = {}
    for n,v in pairs(t) do
        if type(v) == 'table' then
            v = table_copy(v)
        end
        copy[n] = v
    end
    return copy
end

-- Return true if the constant part is a proxy
local function is_proxy(x)
    return type(x) == 'table' and (x.__dissector or x.__map or x.__base)
end

-- Create compiler closure
local function create_emitter(env, stackslots, params, param_types)

local V = {} -- Variable tracking / register allocator
local code = { -- Generated code
    pc = 0, bc_pc = 0,
    insn = ffi.new('struct bpf_insn[4096]'),
    fixup = {},
    reachable = true,
    seen_cmp = nil,
}
local Vstate = {} -- Track variable layout at basic block exits
-- Anything below this stack offset is free to use by the caller
-- @note: There is no tracking memory allocator, so the caller may
-- lower it for persistent objects, but such memory will never
-- be reclaimed and the caller is responsible for resetting the stack
-- top whenever the memory below is free to be reused
local stack_top = (stackslots + 1) * ffi.sizeof('uint64_t')

local function emit(op, dst, src, off, imm)
    local ins = code.insn[code.pc]
    ins.code = op
    ins.dst_reg = dst
    ins.src_reg = src
    ins.off = off
    ins.imm = imm
    code.pc = code.pc + 1
end

local function reg_spill(var)
    local vinfo = V[var]
    assert(vinfo.reg, 'attempt to spill VAR that doesn\'t have an allocated register')
    vinfo.spill = (var + 1) * ffi.sizeof('uint64_t') -- Index by (variable number) * (register width)
    emit(BPF.MEM + BPF.STX + BPF.DW, 10, vinfo.reg, -vinfo.spill, 0)
    vinfo.reg = nil
end

local function reg_fill(var, reg)
    local vinfo = V[var]
    assert(reg, 'attempt to fill a variable into a register, but no register is allocated')
    assert(vinfo.spill, 'attempt to fill a register with a VAR that isn\'t spilled')
    emit(BPF.MEM + BPF.LDX + BPF.DW, reg, 10, -vinfo.spill, 0)
    vinfo.reg = reg
    vinfo.spill = nil
end

-- Allocate a register (lazy simple allocator)
local function reg_alloc(var, reg)
    -- Specific register requested, must spill/move existing variable
    if reg then
        for k,v in pairs(V) do -- Spill any variable that has this register
            if v.reg == reg and not v.shadow then
                reg_spill(k)
                break
            end
        end
        return reg
    end
    -- Find a free or least recently used slot
    local last, last_seen, used = nil, 0xffff, 0
    for k,v in pairs(V) do
        if v.reg then
            if not v.live_to or v.live_to < last_seen then
                last, last_seen = k, v.live_to or last_seen
            end
            used = bit.bor(used, bit.lshift(1, v.reg))
        end
    end
    -- Attempt to select a free register from R7-R9 (callee saved)
    local free = bit.bnot(used)
    if     bit.band(free, 0x80)  ~= 0 then reg = 7
    elseif bit.band(free, 0x100) ~= 0 then reg = 8
    elseif bit.band(free, 0x200) ~= 0 then reg = 9
    end
    -- Select another variable to be spilled
    if not reg then
        assert(last)
        reg = V[last].reg
        reg_spill(last)
    end
    assert(reg, 'VAR '..var..' fill/spill failed')
    return reg
end

-- Set new variable
local function vset(var, reg, const, vtype)
    -- Must materialise all variables shadowing this variable slot, as it will be overwritten
    if V[var] and V[var].reg then
        for _, vinfo in pairs(V) do
            -- A shadowing variable MUST share the same type and attributes,
            -- but the register assignment may have changed
            if vinfo.shadow == var then
                vinfo.reg = V[var].reg
                vinfo.shadow = nil
            end
        end
    end
    -- Get the precise type for CDATA, or attempt to narrow a numeric constant
    if not vtype and type(const) == 'cdata' then
        vtype = ffi.typeof(const)
    end
    V[var] = {reg=reg, const=const, type=vtype}
    -- Track variable source
    if V[var].const and type(const) == 'table' then
        V[var].source = V[var].const.source
    end
end
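-- A minimal sketch of the spill-slot layout that reg_spill()/reg_fill()
-- above assume (illustrative only; the slot numbers are hypothetical):
--
--   R10 (frame pointer)
--   [R10-8]    spill slot for VAR 0      -- (0 + 1) * sizeof(uint64_t)
--   [R10-16]   spill slot for VAR 1      -- (1 + 1) * sizeof(uint64_t)
--   ...
--   below stack_top: memory handed out by valloc()
--
-- so reg_spill(1) emits STXDW [R10-16], Rx and reg_fill(1, Ry) emits
-- LDXDW Ry, [R10-16].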
-- Materialize (or register) a variable in a register
-- If the register is nil, then a new register is assigned (if not already assigned)
local function vreg(var, reg, reserve, vtype)
    local vinfo = V[var]
    assert(vinfo, 'VAR '..var..' not registered')
    vinfo.live_to = code.pc-1
    if (vinfo.reg and not reg) and not vinfo.shadow then return vinfo.reg end
    reg = reg_alloc(var, reg)
    -- Materialize variable shadow copy
    local src = vinfo
    while src.shadow do src = V[src.shadow] end
    if reserve then -- luacheck: ignore
        -- No load to register occurs
    elseif src.reg then
        emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, src.reg, 0, 0)
    elseif src.spill then
        vinfo.spill = src.spill
        reg_fill(var, reg)
    elseif src.const then
        vtype = vtype or src.type
        if type(src.const) == 'table' and src.const.__base then
            -- Load pointer type
            emit(BPF.ALU64 + BPF.MOV + BPF.X, reg, 10, 0, 0)
            emit(BPF.ALU64 + BPF.ADD + BPF.K, reg, 0, 0, -src.const.__base)
        elseif type(src.const) == 'table' and src.const.__dissector then
            -- Load dissector offset (imm32), but keep the constant part (dissector proxy)
            emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const.off or 0)
        elseif vtype and ffi.sizeof(vtype) == 8 then
            -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
            emit(BPF.LD + BPF.DW, reg, 0, 0, ffi.cast('uint32_t', src.const))
            emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(src.const, 16), 16)))
            vinfo.const = nil -- The variable is live
        else
            emit(BPF.ALU64 + BPF.MOV + BPF.K, reg, 0, 0, src.const)
            vinfo.const = nil -- The variable is live
        end
    else assert(false, 'VAR '..var..' has neither register nor constant value') end
    vinfo.reg = reg
    vinfo.shadow = nil
    vinfo.live_from = code.pc-1
    vinfo.type = vtype or vinfo.type
    return reg
end

-- Copy variable
local function vcopy(dst, src)
    if dst == src then return end
    V[dst] = {reg=V[src].reg, const=V[src].const, shadow=src, source=V[src].source, type=V[src].type}
end

-- Dereference variable of pointer type
local function vderef(dst_reg, src_reg, vinfo)
    -- Dereference map pointers for primitive types
    -- BPF doesn't allow pointer arithmetic, so use the entry value
    assert(type(vinfo.const) == 'table' and vinfo.const.__dissector, 'cannot dereference a non-pointer variable')
    local vtype = vinfo.const.__dissector
    local w = ffi.sizeof(vtype)
    assert(const_width[w], 'NYI: sizeof('..tostring(vtype)..') not 1/2/4/8 bytes')
    if dst_reg ~= src_reg then
        emit(BPF.ALU64 + BPF.MOV + BPF.X, dst_reg, src_reg, 0, 0) -- dst = src
    end
    -- Skip the NULL check if the pointer is provably non-NULL
    if not vinfo.source or vinfo.source:find('_or_null', 1, true) then
        emit(BPF.JMP + BPF.JEQ + BPF.K, src_reg, 0, 1, 0) -- if (src != NULL)
    end
    emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, 0, 0) -- dst = *src
end
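-- How the IMM64 load in vreg() above splits a 64-bit constant, as a
-- standalone sketch (illustrative only; runnable outside the compiler):
--
--   local ffi, bit = require('ffi'), require('bit')
--   local imm = 0xAABBCCDD11223344ULL
--   local lo = ffi.cast('uint32_t', imm)                                  --> 0x11223344
--   local hi = ffi.cast('uint32_t', bit.rshift(bit.rshift(imm, 16), 16))  --> 0xAABBCCDD
--
-- eBPF encodes a 64-bit immediate load as BPF_LD|BPF_DW carrying the low
-- word, followed by a second, otherwise-zero instruction carrying the high
-- word; the two-step shift avoids relying on shift counts above 31.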
-- Allocate stack space for a variable
local function valloc(size, blank)
    local base = stack_top
    assert(stack_top + size < 512, 'exceeded maximum stack size of 512 bytes')
    stack_top = stack_top + size
    -- Align to 8 byte boundary
    stack_top = math.ceil(stack_top/8)*8
    -- The current kernel version doesn't support ARG_PTR_TO_RAW_STACK,
    -- so we always need to have the memory initialized; remove this when supported
    if blank then
        if type(blank) == 'string' then
            local sp = 0
            while sp < size do
                -- TODO: no BPF_ST + BPF_DW instruction yet
                local as_u32 = ffi.new('uint32_t [1]')
                local sub = blank:sub(sp+1, sp+ffi.sizeof(as_u32))
                ffi.copy(as_u32, sub, #sub)
                emit(BPF.MEM + BPF.ST + BPF.W, 10, 0, -(stack_top-sp), as_u32[0])
                sp = sp + ffi.sizeof(as_u32)
            end
        elseif type(blank) == 'boolean' then
            reg_alloc(stackslots, 0)
            emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
            for sp = base+8,stack_top,8 do
                emit(BPF.MEM + BPF.STX + BPF.DW, 10, 0, -sp, 0)
            end
        else error('NYI: fill with unknown type '..type(blank)) end
    end
    return stack_top
end

-- Turn a variable into a scalar in a register (or constant)
local function vscalar(a, w)
    assert(const_width[w], 'sizeof(scalar variable) must be 1/2/4/8')
    local src_reg
    -- If the source is a pointer, we must dereference it first
    if cdef.isptr(V[a].type) then
        src_reg = vreg(a)
        local tmp_reg = reg_alloc(stackslots, 1) -- Clone variable in tmp register
        emit(BPF.ALU64 + BPF.MOV + BPF.X, tmp_reg, src_reg, 0, 0)
        vderef(tmp_reg, tmp_reg, V[a])
        src_reg = tmp_reg -- Materialize and dereference it
    -- Source is a value on the stack, we must load it first
    elseif type(V[a].const) == 'table' and V[a].const.__base > 0 then
        src_reg = vreg(a)
        emit(BPF.MEM + BPF.LDX + const_width[w], src_reg, 10, -V[a].const.__base, 0)
        V[a].type = V[a].const.__dissector
        V[a].const = nil -- Value is dereferenced
    -- If the source is an imm32 number, avoid a register load
    elseif type(V[a].const) == 'number' and w < 8 then
        return nil, V[a].const
    -- Load variable from any other source
    else
        src_reg = vreg(a)
    end

    return src_reg, nil
end

-- Emit compensation code at the end of a basic block to unify variable set layout on all block exits
-- 1. we need to free registers by spilling
-- 2. fill registers to match other exits from this BB
local function bb_end(Vcomp)
    for i,v in pairs(V) do
        if Vcomp[i] and Vcomp[i].spill and not v.spill then
            -- Materialize constant or shadowing variable to be able to spill
            if not v.reg and (v.shadow or cdef.isimmconst(v)) then
                vreg(i)
            end
            reg_spill(i)
        end
    end
    for i,v in pairs(V) do
        if Vcomp[i] and Vcomp[i].reg and not v.reg then
            vreg(i, Vcomp[i].reg)
        end
        -- Compensate for variable metadata change
        if Vcomp[i] and Vcomp[i].source then
            V[i].source = Vcomp[i].source
        end
    end
end
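-- Worked example of the alignment in valloc() above (illustrative only):
-- with stackslots = 2, stack_top starts at (2+1)*8 = 24; valloc(6) moves it
-- to 30, and math.ceil(30/8)*8 rounds it up to 32, so the next allocation
-- again starts on an 8-byte boundary.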
local function CMP_STR(a, b, op)
    assert(op == 'JEQ' or op == 'JNE', 'NYI: stack/string comparison supports only == or ~=')
    -- I have no better idea how to implement it than an unrolled XOR loop, as we can fix up only one JMP
    -- So: X(a,b)  = a[0] ^ b[0] | a[1] ^ b[1] | ...
    --     EQ(a,b) <=> X == 0
    -- This could be optimised by the rewriter placing early exits in the second phase for long strings
    local base, size = V[a].const.__base, math.min(#b, ffi.sizeof(V[a].type))
    local acc, tmp = reg_alloc(stackslots, 0), reg_alloc(stackslots+1, 1)
    local sp = 0
    emit(BPF.ALU64 + BPF.MOV + BPF.K, acc, 0, 0, 0)
    while sp < size do
        -- Load string chunk as imm32
        local as_u32 = ffi.new('uint32_t [1]')
        local sub = b:sub(sp+1, sp+ffi.sizeof(as_u32))
        ffi.copy(as_u32, sub, #sub)
        -- TODO: make this faster by interleaved load/compare steps with DW length
        emit(BPF.MEM + BPF.LDX + BPF.W, tmp, 10, -(base-sp), 0)
        emit(BPF.ALU64 + BPF.XOR + BPF.K, tmp, 0, 0, as_u32[0])
        emit(BPF.ALU64 + BPF.OR + BPF.X, acc, tmp, 0, 0)
        sp = sp + ffi.sizeof(as_u32)
    end
    emit(BPF.JMP + BPF[op] + BPF.K, acc, 0, 0xffff, 0)
    code.seen_cmp = code.pc-1
end

local function CMP_REG(a, b, op)
    -- Fold compile-time expressions
    if V[a].const and V[b].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
        code.seen_cmp = const_expr[op](V[a].const, V[b].const) and ALWAYS or NEVER
    else
        -- Comparison against a compile-time string or stack memory
        if V[b].const and type(V[b].const) == 'string' then
            return CMP_STR(a, V[b].const, op)
        end
        -- The 0xFFFF target here has no significance, it's just a placeholder for
        -- the compiler to replace its absolute offset to the LJ bytecode insn with a relative
        -- offset in BPF program code; the verifier accepts only programs with valid JMP targets
        local a_reg, b_reg = vreg(a), vreg(b)
        emit(BPF.JMP + BPF[op] + BPF.X, a_reg, b_reg, 0xffff, 0)
        code.seen_cmp = code.pc-1
    end
end
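-- Sketch of what CMP_STR above unrolls to for a 4-byte constant string
-- (illustrative only; register names are symbolic):
--
--   comparing stack memory at [R10-base] against 'GET ':
--     MOV  acc, 0
--     LDXW tmp, [R10-base]          -- load a 4-byte chunk of the variable
--     XORK tmp, 0x20544547          -- 'GET ' packed as a little-endian u32
--     OR   acc, tmp
--     JEQ/JNE acc, 0, <target>      -- acc == 0 iff every chunk matched
--
-- longer strings repeat the LDX/XOR/OR triple once per 4-byte chunk.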
local function CMP_IMM(a, b, op)
    local c = V[a].const
    if c and not is_proxy(c) then -- Fold compile-time expressions
        code.seen_cmp = const_expr[op](c, b) and ALWAYS or NEVER
    else
        -- Convert imm32 to number
        if type(b) == 'string' then
            if #b == 1 then b = b:byte()
            elseif cdef.isptr(V[a].type) then
                -- String comparison between stack/constant string
                return CMP_STR(a, b, op)
            elseif #b <= 4 then
                -- Convert to u32 with network byte order
                local imm = ffi.new('uint32_t[1]')
                ffi.copy(imm, b, #b)
                b = builtins.hton(imm[0])
            else error('NYI: compare register with string, where #string > sizeof(u32)') end
        end
        -- The 0xFFFF target here has no significance, it's just a placeholder for
        -- the compiler to replace its absolute offset to the LJ bytecode insn with a relative
        -- offset in BPF program code; the verifier accepts only programs with valid JMP targets
        local reg = vreg(a)
        emit(BPF.JMP + BPF[op] + BPF.K, reg, 0, 0xffff, b)
        code.seen_cmp = code.pc-1
        -- Remember NULL pointer checks, as BPF prohibits pointer comparisons
        -- and repeated checks wouldn't pass the verifier; only comparisons
        -- against constants are checked.
        if op == 'JEQ' and tonumber(b) == 0 and V[a].source then
            local pos = V[a].source:find('_or_null', 1, true)
            if pos then
                code.seen_null_guard = a
            end
        -- Inverse NULL pointer check (if a ~= nil)
        elseif op == 'JNE' and tonumber(b) == 0 and V[a].source then
            local pos = V[a].source:find('_or_null', 1, true)
            if pos then
                code.seen_null_guard = a
                code.seen_null_guard_inverse = true
            end
        end
    end
end

local function ALU_IMM(dst, a, b, op)
    -- Fold compile-time expressions
    if V[a].const and not is_proxy(V[a].const) then
        assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
        vset(dst, nil, const_expr[op](V[a].const, b))
    -- Otherwise we need to materialize the dissected value at DST, and add to it
    else
        vcopy(dst, a)
        local dst_reg = vreg(dst)
        if cdef.isptr(V[a].type) then
            vderef(dst_reg, dst_reg, V[a])
            V[dst].type = V[a].const.__dissector
        else
            V[dst].type = V[a].type
        end
        emit(BPF.ALU64 + BPF[op] + BPF.K, dst_reg, 0, 0, b)
    end
end

local function ALU_REG(dst, a, b, op)
    -- Fold compile-time expressions
    if V[a].const and not (is_proxy(V[a].const) or is_proxy(V[b].const)) then
        assert(cdef.isimmconst(V[a]), 'VAR '..a..' must be numeric')
        assert(cdef.isimmconst(V[b]), 'VAR '..b..' must be numeric')
        if type(op) == 'string' then op = const_expr[op] end
        vcopy(dst, a)
        V[dst].const = op(V[a].const, V[b].const)
    else
        local src_reg = b and vreg(b) or 0 -- SRC is optional for unary operations
        if b and cdef.isptr(V[b].type) then
            -- We have to allocate a temporary register for dereferencing to preserve
            -- the pointer in the source variable, which MUST NOT be altered
            reg_alloc(stackslots, 2)
            vderef(2, src_reg, V[b])
            src_reg = 2
        end
        vcopy(dst, a) -- DST may alias B, so the copy must occur after we materialize B
        local dst_reg = vreg(dst)
        if cdef.isptr(V[a].type) then
            vderef(dst_reg, dst_reg, V[a])
            V[dst].type = V[a].const.__dissector
        end
        emit(BPF.ALU64 + BPF[op] + BPF.X, dst_reg, src_reg, 0, 0)
        V[stackslots].reg = nil -- Free temporary registers
    end
end

local function ALU_IMM_NV(dst, a, b, op)
    -- Do DST = IMM(a) op VAR(b), where we can't invert the operands because
    -- the registers are u64 but immediates are u32, so complement
    -- arithmetic wouldn't work
    vset(stackslots+1, nil, a)
    ALU_REG(dst, stackslots+1, b, op)
end
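-- Example of the ALU_IMM_NV lowering above (illustrative only): for the Lua
-- expression 'y = 10 - x', LuaJIT emits SUBNV with the constant on the left.
-- Since BPF immediates are u32 while registers are u64, the constant is
-- materialized in a temporary slot first, roughly:
--
--   MOV  tmp, 10        -- vset(stackslots+1, nil, 10)
--   SUB  tmp, Rx        -- ALU_REG(dst, stackslots+1, b, 'SUB')
--
-- and the result then lives in the destination variable's register.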
local function LD_ABS(dst, w, off)
    assert(off, 'LD_ABS called without offset')
    if w < 8 then
        local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
        emit(BPF.LD + BPF.ABS + const_width[w], dst_reg, 0, 0, off)
        if w > 1 and ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
            emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
        end
    elseif w == 8 then
        -- LD_ABS|IND prohibits DW, we need to do two W loads and combine them
        local tmp_reg = vreg(stackslots, 0, true, builtins.width_type(w)) -- Reserve R0
        emit(BPF.LD + BPF.ABS + const_width[4], tmp_reg, 0, 0, off + 4)
        if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
            emit(BPF.ALU + BPF.END + BPF.TO_BE, tmp_reg, 0, 0, 32)
        end
        ALU_IMM(stackslots, stackslots, 32, 'LSH')
        local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0, spill tmp variable
        emit(BPF.LD + BPF.ABS + const_width[4], dst_reg, 0, 0, off)
        if ffi.abi('le') then -- LD_ABS has htonl() semantics, reverse
            emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, 32)
        end
        ALU_REG(dst, dst, stackslots, 'OR')
        V[stackslots].reg = nil -- Free temporary registers
    else
        assert(w < 8, 'NYI: only LD_ABS of 1/2/4/8 bytes is supported')
    end
end

local function LD_IND(dst, src, w, off)
    local src_reg = vreg(src) -- Must materialize first in case dst == src
    local dst_reg = vreg(dst, 0, true, builtins.width_type(w)) -- Reserve R0
    emit(BPF.LD + BPF.IND + const_width[w], dst_reg, src_reg, 0, off or 0)
    if w > 1 and ffi.abi('le') then -- LD_IND has htonl() semantics, reverse
        emit(BPF.ALU + BPF.END + BPF.TO_BE, dst_reg, 0, 0, w * 8)
    end
end

local function LD_MEM(dst, src, w, off)
    local src_reg = vreg(src) -- Must materialize first in case dst == src
    local dst_reg = vreg(dst, nil, true, builtins.width_type(w)) -- Allocate a fresh register
    emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, src_reg, off or 0, 0)
end

-- @note: This is specific now as it expects registers reserved
local function LD_IMM_X(dst_reg, src_type, imm, w)
    if w == 8 then -- IMM64 must be done in two instructions with imm64 = (lo(imm32), hi(imm32))
        emit(BPF.LD + const_width[w], dst_reg, src_type, 0, ffi.cast('uint32_t', imm))
        -- Must shift in two steps as bit.rshift supports [0..31]
        emit(0, 0, 0, 0, ffi.cast('uint32_t', bit.rshift(bit.rshift(imm, 16), 16)))
    else
        emit(BPF.LD + const_width[w], dst_reg, src_type, 0, imm)
    end
end

local function BUILTIN(func, ...)
    local builtin_export = {
        -- Compiler primitives (work with variable slots, emit instructions)
        V=V, vreg=vreg, vset=vset, vcopy=vcopy, vderef=vderef, valloc=valloc, emit=emit,
        reg_alloc=reg_alloc, reg_spill=reg_spill, tmpvar=stackslots, const_width=const_width,
        -- Extensions and helpers (use with care)
        LD_IMM_X = LD_IMM_X,
    }
    func(builtin_export, ...)
end
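-- What LD_IMM_X above emits for a map file descriptor (illustrative only;
-- the fd value 3 is hypothetical):
--
--   insn[0]: BPF_LD|BPF_DW  dst=R1  src=BPF_PSEUDO_MAP_FD  imm=3   -- lo32
--   insn[1]: (all fields zero)                             imm=0   -- hi32
--
-- the kernel recognizes src_reg == BPF_PSEUDO_MAP_FD and rewrites the
-- immediate pair into the in-kernel map pointer at program load time.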
local function LOAD(dst, src, off, vtype)
    local base = V[src].const
    assert(base and base.__dissector, 'NYI: load() on variable that doesn\'t have a dissector')
    assert(V[src].source, 'NYI: load() on variable with unknown source')
    -- Cast to a different type if requested
    vtype = vtype or base.__dissector
    local w = ffi.sizeof(vtype)
    assert(const_width[w], 'NYI: load() supports 1/2/4/8 bytes at a time only, wanted ' .. tostring(w))
    -- Packet access with a dissector (use BPF_LD)
    if V[src].source:find('ptr_to_pkt', 1, true) then
        if base.off then -- Absolute address to payload
            LD_ABS(dst, w, off + base.off)
        else -- Indirect address to payload
            LD_IND(dst, src, w, off)
        end
    -- Direct access to first argument (skb fields, pt regs, ...)
    elseif V[src].source:find('ptr_to_ctx', 1, true) then
        LD_MEM(dst, src, w, off)
    -- Direct skb access with a dissector (use BPF_MEM)
    elseif V[src].source:find('ptr_to_skb', 1, true) then
        LD_MEM(dst, src, w, off)
    -- Pointer to map-backed memory (use BPF_MEM)
    elseif V[src].source:find('ptr_to_map_value', 1, true) then
        LD_MEM(dst, src, w, off)
    -- Indirect read using probe (uprobe or kprobe, uses helper)
    elseif V[src].source:find('ptr_to_probe', 1, true) then
        BUILTIN(builtins[builtins.probe_read], nil, dst, src, vtype, off)
        V[dst].source = V[src].source -- Builtin handles everything
    else
        error('NYI: load() on variable from ' .. V[src].source)
    end
    V[dst].type = vtype
    V[dst].const = nil -- Dissected value is not constant anymore
end

local function CALL(a, b, d)
    assert(b-1 <= 1, 'NYI: CALL with more than one return value')
    -- Perform either a compile-time, helper, or builtin call
    local func = V[a].const
    -- Gather all arguments and check if they're constant
    local args, const, nargs = {}, true, d - 1
    for i = a+1, a+d-1 do
        table.insert(args, V[i].const)
        if not V[i].const or is_proxy(V[i].const) then const = false end
    end
    local builtin = builtins[func]
    if not const or nargs == 0 then
        if builtin and type(builtin) == 'function' then
            args = {a}
            for i = a+1, a+nargs do table.insert(args, i) end
            BUILTIN(builtin, unpack(args))
        elseif V[a+2] and V[a+2].const then -- var OP imm
            ALU_IMM(a, a+1, V[a+2].const, builtin)
        elseif nargs <= 2 then -- var OP var
            ALU_REG(a, a+1, V[a+2] and a+2, builtin)
        else
            error('NYI: CALL non-builtin with 3 or more arguments')
        end
    -- Call on a dissector implies slice retrieval
    elseif type(func) == 'table' and func.__dissector then
        assert(nargs >= 2, 'NYI: <dissector>.slice(a, b) must have at least two arguments')
        assert(V[a+1].const and V[a+2].const, 'NYI: slice() arguments must be constant')
        local off = V[a+1].const
        local vtype = builtins.width_type(V[a+2].const - off)
        -- Access to packet via packet (use BPF_LD)
        if V[a].source and V[a].source:find('ptr_to_', 1, true) then
            LOAD(a, a, off, vtype)
        else
            error('NYI: <dissector>.slice(a, b) on non-pointer memory ' .. (V[a].source or 'unknown'))
        end
    -- Strict builtins cannot be expanded at compile time
    elseif builtins_strict[func] and builtin then
        args = {a}
        for i = a+1, a+nargs do table.insert(args, i) end
        BUILTIN(builtin, unpack(args))
    -- Attempt compile-time call expansion (expects all arguments to be compile-time known)
    else
        assert(const, 'NYI: CALL can only be expanded at compile time when all arguments are constant')
        V[a].const = func(unpack(args))
    end
end
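-- Two CALL shapes the dispatcher above distinguishes (illustrative only):
--
--   local a = string.byte('A')        -- all arguments are compile-time
--                                     -- constants and string.byte is not a
--                                     -- builtin: folded to a = 65 at compile time
--   local b = ffi.new('uint32_t[1]')  -- ffi.new is in builtins_strict, so it
--                                     -- always goes through BUILTIN() instead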
local function MAP_INIT(map_var, key, imm)
    local map = V[map_var].const
    vreg(map_var, 1, true, ffi.typeof('uint64_t'))
    -- Reserve R1 and load ptr for process-local map fd
    LD_IMM_X(1, BPF.PSEUDO_MAP_FD, map.fd, ffi.sizeof(V[map_var].type))
    V[map_var].reg = nil -- R1 will be invalidated after CALL, forget register allocation
    -- Reserve R2 and load R2 = key pointer
    local key_size = ffi.sizeof(map.key_type)
    local w = const_width[key_size] or BPF.DW
    local pod_type = const_width[key_size]
    local sp = stack_top + key_size -- Must use stack below spill slots
    -- Store immediate value on stack
    reg_alloc(stackslots, 2) -- Spill anything in R2 (unnamed tmp variable)
    local key_base = key and V[key].const
    imm = imm or key_base
    if imm and (not key or not is_proxy(key_base)) then
        assert(pod_type, 'NYI: map[const K], K width must be 1/2/4/8')
        emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, imm)
    -- Key is in a register, spill it
    elseif V[key].reg and pod_type then
        if cdef.isptr(V[key].type) then
            -- There is already a pointer in the register, dereference it before spilling
            emit(BPF.MEM + BPF.LDX + w, 2, V[key].reg, 0, 0)
            emit(BPF.MEM + BPF.STX + w, 10, 2, -sp, 0)
        else -- Variable in the register is POD, spill it on the stack
            emit(BPF.MEM + BPF.STX + w, 10, V[key].reg, -sp, 0)
        end
    -- Key is spilled from a register to the stack
    elseif V[key].spill then
        sp = V[key].spill
    -- Key is already on the stack, write to a base-relative address
    elseif key_base.__base then
        assert(key_size == ffi.sizeof(V[key].type), 'VAR '..key..' type incompatible with BPF map key type')
        sp = key_base.__base
    else
        error('VAR '..key..' is neither const-expr/register/stack/spilled')
    end
    -- If [FP+K] addressing, emit it
    if sp then
        emit(BPF.ALU64 + BPF.MOV + BPF.X, 2, 10, 0, 0)
        emit(BPF.ALU64 + BPF.ADD + BPF.K, 2, 0, 0, -sp)
    end
end

local function MAP_GET(dst, map_var, key, imm)
    local map = V[map_var].const
    MAP_INIT(map_var, key, imm)
    -- Flag as pointer type and associate dissector for map value type
    vreg(dst, 0, true, ffi.typeof('uint8_t *'))
    V[dst].const = {__dissector=map.val_type}
    V[dst].source = 'ptr_to_map_value_or_null'
    emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_lookup_elem)
    V[stackslots].reg = nil -- Free temporary registers
end

local function MAP_DEL(map_var, key, key_imm)
    -- Set R0, R1 (map fd, preempt R0)
    reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
    MAP_INIT(map_var, key, key_imm)
    emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_delete_elem)
    V[stackslots].reg = nil -- Free temporary registers
end
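-- Sketch of the key staging MAP_INIT above performs for a constant u32 key
-- (illustrative only; 'sp' depends on stack_top):
--
--   STW   [R10-sp], imm     -- store the key below the spill slots
--   MOV64 R2, R10
--   ADD64 R2, -sp           -- R2 = &key; map helpers take a key *pointer*
--
-- together with the pseudo map-fd load into R1, this satisfies the calling
-- convention of the bpf_map_{lookup,update,delete}_elem helpers.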
local function MAP_SET(map_var, key, key_imm, src)
    local map = V[map_var].const
    -- Delete when setting nil
    if V[src].type == ffi.typeof('void') then
        return MAP_DEL(map_var, key, key_imm)
    end
    -- Set R0, R1 (map fd, preempt R0)
    reg_alloc(stackslots, 0) -- Spill anything in R0 (unnamed tmp variable)
    MAP_INIT(map_var, key, key_imm)
    reg_alloc(stackslots, 4) -- Spill anything in R4 (unnamed tmp variable)
    emit(BPF.ALU64 + BPF.MOV + BPF.K, 4, 0, 0, 0) -- BPF_ANY, create new element or update existing
    -- Reserve R3 for value pointer
    reg_alloc(stackslots, 3) -- Spill anything in R3 (unnamed tmp variable)
    local val_size = ffi.sizeof(map.val_type)
    local w = const_width[val_size] or BPF.DW
    local pod_type = const_width[val_size]
    -- Stack pointer must be aligned to both key/value size and have enough headroom for (key, value)
    local sp = stack_top + ffi.sizeof(map.key_type) + val_size
    sp = sp + (sp % val_size)
    local base = V[src].const
    if base and not is_proxy(base) then
        assert(pod_type, 'NYI: MAP[K] = imm V; V width must be 1/2/4/8')
        emit(BPF.MEM + BPF.ST + w, 10, 0, -sp, base)
    -- Value is in a register, spill it
    elseif V[src].reg and pod_type then
        -- Value is a pointer, dereference it and spill it
        if cdef.isptr(V[src].type) then
            vderef(3, V[src].reg, V[src])
            emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
        else
            emit(BPF.MEM + BPF.STX + w, 10, V[src].reg, -sp, 0)
        end
    -- We get a pointer to the spilled register on the stack
    elseif V[src].spill then
        -- If the variable is a pointer, we can load it to R3 directly (save "LEA")
        if cdef.isptr(V[src].type) then
            reg_fill(src, 3)
            -- If the variable is a stack pointer, we don't have to check it
            if base.__base then
                emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
                return
            end
            vderef(3, V[src].reg, V[src])
            emit(BPF.MEM + BPF.STX + w, 10, 3, -sp, 0)
        else
            sp = V[src].spill
        end
    -- Value is already on the stack, write to a base-relative address
    elseif base.__base then
        if val_size ~= ffi.sizeof(V[src].type) then
            local err = string.format('VAR %d type (%s) incompatible with BPF map value type (%s): expected %d, got %d',
                src, V[src].type, map.val_type, val_size, ffi.sizeof(V[src].type))
            error(err)
        end
        sp = base.__base
    -- Value is constant, materialize it on the stack
    else
        error('VAR '.. src ..' is neither const-expr/register/stack/spilled')
    end
    emit(BPF.ALU64 + BPF.MOV + BPF.X, 3, 10, 0, 0)
    emit(BPF.ALU64 + BPF.ADD + BPF.K, 3, 0, 0, -sp)
    emit(BPF.JMP + BPF.CALL, 0, 0, 0, HELPER.map_update_elem)
    V[stackslots].reg = nil -- Free temporary registers
end

-- Finally - this table translates LuaJIT bytecode into code emitter actions.
local BC = {
    -- Constants
    KNUM = function(a, _, c, _) -- KNUM
        if c < 2147483648 then
            vset(a, nil, c, ffi.typeof('int32_t'))
        else
            vset(a, nil, c, ffi.typeof('uint64_t'))
        end
    end,
    KSHORT = function(a, _, _, d) -- KSHORT
        vset(a, nil, d, ffi.typeof('int16_t'))
    end,
    KCDATA = function(a, _, c, _) -- KCDATA
        -- Coerce numeric types if possible
        local ct = ffi.typeof(c)
        if ffi.istype(ct, ffi.typeof('uint64_t')) or ffi.istype(ct, ffi.typeof('int64_t')) then
            vset(a, nil, c, ct)
        elseif tonumber(c) ~= nil then
            -- TODO: this should not be possible
            vset(a, nil, tonumber(c), ct)
        else
            error('NYI: cannot use CDATA constant of type ' .. ct)
        end
    end,
    KPRI = function(a, _, _, d) -- KPRI
        -- KNIL is 0, must create a special type to identify it
        local vtype = (d < 1) and ffi.typeof('void') or ffi.typeof('uint8_t')
        vset(a, nil, (d < 2) and 0 or 1, vtype)
    end,
    KSTR = function(a, _, c, _) -- KSTR
        vset(a, nil, c, ffi.typeof('const char[?]'))
    end,
    MOV = function(a, _, _, d) -- MOV var, var
        vcopy(a, d)
    end,

    -- Comparison ops
    -- Note: comparisons are always followed by a JMP opcode, which
    -- will be fused with the comparison into a JMP+CMP instruction in BPF
    -- Note: we're narrowed to integers, so operand/operator inversion is legit
    ISLT = function(a, _, _, d) return CMP_REG(d, a, 'JGE') end, -- (a < d) (inverted)
    ISGE = function(a, _, _, d) return CMP_REG(a, d, 'JGE') end, -- (a >= d)
    ISGT = function(a, _, _, d) return CMP_REG(a, d, 'JGT') end, -- (a > d)
    ISEQV = function(a, _, _, d) return CMP_REG(a, d, 'JEQ') end, -- (a == d)
    ISNEV = function(a, _, _, d) return CMP_REG(a, d, 'JNE') end, -- (a ~= d)
    ISEQS = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == str(c))
    ISNES = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= str(c))
    ISEQN = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- (a == c)
    ISNEN = function(a, _, c, _) return CMP_IMM(a, c, 'JNE') end, -- (a ~= c)
    IST = function(_, _, _, d) return CMP_IMM(d, 0, 'JNE') end, -- (d)
    ISF = function(_, _, _, d) return CMP_IMM(d, 0, 'JEQ') end, -- (not d)
    ISEQP = function(a, _, c, _) return CMP_IMM(a, c, 'JEQ') end, -- ISEQP (a == c)
    -- Binary operations with RHS constants
    ADDVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end,
    SUBVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'SUB') end,
    MULVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end,
    DIVVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'DIV') end,
    MODVN = function(a, b, c, _) return ALU_IMM(a, b, c, 'MOD') end,
    -- Binary operations with LHS constants
    -- Cheat code: we're narrowed to integer arithmetic, so MUL+ADD are commutative
    ADDNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'ADD') end, -- ADDNV
    MULNV = function(a, b, c, _) return ALU_IMM(a, b, c, 'MUL') end, -- MULNV
    SUBNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'SUB') end, -- SUBNV
    DIVNV = function(a, b, c, _) return ALU_IMM_NV(a, c, b, 'DIV') end, -- DIVNV
    -- Binary operations between registers
    ADDVV = function(a, b, _, d) return ALU_REG(a, b, d, 'ADD') end,
    SUBVV = function(a, b, _, d) return ALU_REG(a, b, d, 'SUB') end,
    MULVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MUL') end,
    DIVVV = function(a, b, _, d) return ALU_REG(a, b, d, 'DIV') end,
    MODVV = function(a, b, _, d) return ALU_REG(a, b, d, 'MOD') end,
    -- Strings
    CAT = function(a, b, _, d) -- CAT A = B ~ D
        assert(V[b].const and V[d].const, 'NYI: CAT only works on compile-time expressions')
        assert(type(V[b].const) == 'string' and type(V[d].const) == 'string',
            'NYI: CAT only works on compile-time strings')
        vset(a, nil, V[b].const .. V[d].const)
    end,
    -- Tables
    GGET = function (a, _, c, _) -- GGET (A = GLOBAL[c])
        if env[c] ~= nil then
            vset(a, nil, env[c])
        else error(string.format("undefined global '%s'", c)) end
    end,
    UGET = function (a, _, c, _) -- UGET (A = UPVALUE[c])
        if env[c] ~= nil then
            vset(a, nil, env[c])
        else error(string.format("undefined upvalue '%s'", c)) end
    end,
    TSETB = function (a, b, _, d) -- TSETB (B[D] = A)
        assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
        local vinfo = V[b].const
        if vinfo.__map then -- BPF map write (constant key)
            return MAP_SET(b, nil, d, a) -- D is literal
        elseif vinfo.__dissector then
            assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
            local w = ffi.sizeof(vinfo.__dissector)
            -- TODO: support vectorized moves larger than register width
            assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
            local src_reg, const = vscalar(a, w)
            -- If changing a map value, write to absolute address + offset
            if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
                local dst_reg = vreg(b)
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' then
                    emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, d, const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, d, 0)
                end
            -- Table is already on the stack, write to a vinfo-relative address
            elseif vinfo.__base then
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' then
                    emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -vinfo.__base + (d * w), const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -vinfo.__base + (d * w), 0)
                end
            else
                error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
            end
        elseif vinfo and V[a].const then
            vinfo[V[d].const] = V[a].const
        else
            error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
        end
    end,
    TSETV = function (a, b, _, d) -- TSETV (B[D] = A)
        assert(V[b] and type(V[b].const) == 'table', 'NYI: B[D] where B is not Lua table, BPF map, or pointer')
        local vinfo = V[b].const
        if vinfo.__map then -- BPF map write (variable key)
            return MAP_SET(b, d, nil, a) -- D is variable
        elseif vinfo.__dissector then
            assert(vinfo.__dissector, 'NYI: B[D] where B does not have a known element size')
            local w = ffi.sizeof(vinfo.__dissector)
            -- TODO: support vectorized moves larger than register width
            assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
            local src_reg, const = vscalar(a, w)
            -- If changing a map value, write to absolute address + offset
            if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
                -- Calculate variable address from two registers
                local tmp_var = stackslots + 1
                vset(tmp_var, nil, d)
                ALU_REG(tmp_var, tmp_var, b, 'ADD')
                local dst_reg = vreg(tmp_var)
                V[tmp_var].reg = nil -- Only temporary allocation
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' and w < 8 then
                    emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, 0, const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, 0, 0)
                end
            -- Table is already on the stack, write to a vinfo-relative address
            elseif vinfo.__base then
                -- Calculate variable address from two registers
                local tmp_var = stackslots + 1
                vcopy(tmp_var, d) -- Element position
                if w > 1 then
                    ALU_IMM(tmp_var, tmp_var, w, 'MUL') -- Multiply by element size
                end
                local dst_reg = vreg(tmp_var) -- Add R10 (stack pointer)
                emit(BPF.ALU64 + BPF.ADD + BPF.X, dst_reg, 10, 0, 0)
                V[tmp_var].reg = nil -- Only temporary allocation
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' and w < 8 then
                    emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, -vinfo.__base, const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, -vinfo.__base, 0)
                end
            else
                error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
            end
        elseif vinfo and V[d].const and V[a].const then
            vinfo[V[d].const] = V[a].const
        else
            error('NYI: B[D] where B is not Lua table, BPF map, or pointer')
        end
    end,
    TSETS = function (a, b, c, _) -- TSETS (B[C] = A)
        assert(V[b] and V[b].const, 'NYI: B[C] where B is not Lua table, BPF map, or pointer')
        local base = V[b].const
        if base.__dissector then
            local ofs,bpos = ffi.offsetof(base.__dissector, c)
            assert(not bpos, 'NYI: B[C] = A, where C is a bitfield')
            local w = builtins.sizeofattr(base.__dissector, c)
            -- TODO: support vectorized moves larger than register width
            assert(const_width[w], 'B[C] = A, sizeof(A) must be 1/2/4/8')
            local src_reg, const = vscalar(a, w)
            -- If changing a map value, write to absolute address + offset
            if V[b].source and V[b].source:find('ptr_to_map_value', 1, true) then
                local dst_reg = vreg(b)
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' and w < 8 then
                    emit(BPF.MEM + BPF.ST + const_width[w], dst_reg, 0, ofs, const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], dst_reg, src_reg, ofs, 0)
                end
            -- Table is already on the stack, write to a base-relative address
            elseif base.__base then
                -- Optimization: immediate values (imm32) can be stored directly
                if type(const) == 'number' and w < 8 then
                    emit(BPF.MEM + BPF.ST + const_width[w], 10, 0, -base.__base + ofs, const)
                else
                    emit(BPF.MEM + BPF.STX + const_width[w], 10, src_reg, -base.__base + ofs, 0)
                end
            else
                error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
            end
        elseif V[a].const then
            base[c] = V[a].const
        else
            error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
        end
    end,
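    -- Address computation used by TSETV above for a stack-backed table
    -- (illustrative only): for B[D] = A with element width w, the store
    -- address is effectively R10 + (D * w) - __base; the element index is
    -- scaled in a temporary register, the frame pointer is added, and the
    -- store instruction carries -__base as its offset.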
    TGETB = function (a, b, _, d) -- TGETB (A = B[D])
        local base = V[b].const
        assert(type(base) == 'table', 'NYI: B[C] where C is string and B is not Lua table or BPF map')
        if a ~= b then vset(a) end
        if base.__map then -- BPF map read (constant key)
            MAP_GET(a, b, nil, d)
        -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
        elseif V[b].source and V[b].source:find('ptr_to_') then
            local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
            LOAD(a, b, d, vtype)
        -- Specialise PTR[0] as the dereference operator
        elseif cdef.isptr(V[b].type) and d == 0 then
            vcopy(a, b)
            local dst_reg = vreg(a)
            vderef(dst_reg, dst_reg, V[a])
            V[a].type = V[a].const.__dissector
        else
            error('NYI: A = B[D], where B is not Lua table, packet dissector, or pointer dereference')
        end
    end,
    TGETV = function (a, b, _, d) -- TGETV (A = B[D])
        local base = V[b].const
        assert(type(base) == 'table', 'NYI: B[C] where C is string and B is not Lua table or BPF map')
        if a ~= b then vset(a) end
        if base.__map then -- BPF map read (variable key)
            MAP_GET(a, b, d)
        -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
        elseif V[b].source and V[b].source:find('ptr_to_') then
            local vtype = base.__dissector and base.__dissector or ffi.typeof('uint8_t')
            LOAD(a, b, d, vtype)
        -- Constant dereference
        elseif type(V[d].const) == 'number' then
            V[a].const = base[V[d].const]
        else
            error('NYI: A = B[D], where B is not Lua table, packet dissector, or pointer dereference')
        end
    end,
    TGETS = function (a, b, c, _) -- TGETS (A = B[C])
        local base = V[b].const
        assert(type(base) == 'table', 'NYI: B[C] where C is string and B is not Lua table or BPF map')
        if a ~= b then vset(a) end
        if base.__dissector then
            local ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
            -- Resolve table key using metatable
            if not ofs and type(base.__dissector[c]) == 'string' then
                c = base.__dissector[c]
                ofs,bpos,bsize = ffi.offsetof(base.__dissector, c)
            end
            if not ofs and proto[c] then -- Load new dissector on given offset
                BUILTIN(proto[c], a, b, c)
            else
                -- Loading a register from an offset is a little bit tricky, as there are
                -- several data sources and value loading modes with different restrictions,
                -- such as checking pointer values for NULL compared to using the stack.
                assert(ofs, tostring(base.__dissector)..'.'..c..' attribute does not exist')
                if a ~= b then vset(a) end
                -- Dissected value is probably not constant anymore
                local new_const = nil
                local w, atype = builtins.sizeofattr(base.__dissector, c)
                -- [SP+K] addressing using R10 (stack pointer)
                -- Doesn't need to be checked for NULL
                if base.__base and base.__base > 0 then
                    if cdef.isptr(atype) then -- If the member is a pointer type, update the base pointer with the offset
                        new_const = {__base = base.__base-ofs}
                    else
                        local dst_reg = vreg(a, nil, true)
                        emit(BPF.MEM + BPF.LDX + const_width[w], dst_reg, 10, -base.__base+ofs, 0)
                    end
                -- Pointer access with a dissector (traditional uses BPF_LD, direct uses BPF_MEM)
                elseif V[b].source and V[b].source:find('ptr_to_') then
                    LOAD(a, b, ofs, atype)
                else
                    error('NYI: B[C] where B is not Lua table, BPF map, or pointer')
                end
                -- Bitfield, must be further narrowed with a bitmask/shift
                if bpos then
                    local mask = 0
                    for i=bpos+1,bpos+bsize do
                        mask = bit.bor(mask, bit.lshift(1, w*8-i))
                    end
                    emit(BPF.ALU64 + BPF.AND + BPF.K, vreg(a), 0, 0, mask)
                    -- Free optimization: single-bit values need just a boolean result
                    if bsize > 1 then
                        local shift = w*8-bsize-bpos
                        if shift > 0 then
                            emit(BPF.ALU64 + BPF.RSH + BPF.K, vreg(a), 0, 0, shift)
                        end
                    end
                end
                V[a].type = atype
                V[a].const = new_const
                V[a].source = V[b].source
                -- Track direct access to skb data,
                -- see https://www.kernel.org/doc/Documentation/networking/filter.txt "Direct packet access"
                if ffi.istype(base.__dissector, ffi.typeof('struct sk_buff')) then
                    -- Direct access to skb uses skb->data and skb->data_end,
                    -- which are encoded as u32 but are actually pointers
                    if c == 'data' or c == 'data_end' then
                        V[a].const = {__dissector = ffi.typeof('uint8_t')}
                        V[a].source = 'ptr_to_skb'
                    end
                end
            end
        else
            V[a].const = base[c]
        end
    end,
    -- Loops and branches
    CALLM = function (a, b, _, d) -- A = A(A+1, ..., A+D+MULTRES)
        -- NYI: Support single result only
        CALL(a, b, d+2)
    end,
    CALL = function (a, b, _, d) -- A = A(A+1, ..., A+D-1)
        CALL(a, b, d)
    end,
    JMP = function (a, _, c, _) -- JMP
        -- Discard unused slots after the jump
        for i, _ in pairs(V) do
            if i >= a and i < stackslots then
                V[i] = nil
            end
        end
        -- Cross the basic block boundary if the jump target isn't provably unreachable
        local val = code.fixup[c] or {}
        if code.seen_cmp and code.seen_cmp ~= ALWAYS then
            if code.seen_cmp ~= NEVER then -- Do not emit the jump or fixup
                -- Store the previous CMP insn for reemitting after compensation code
                local jmpi = ffi.new('struct bpf_insn', code.insn[code.pc-1])
                code.pc = code.pc - 1
                -- First branch point, emit compensation code
                local Vcomp = Vstate[c]
                if not Vcomp then
                    -- Select a scratch register (R0-5) that isn't used as an operand
                    -- in the CMP instruction, as the variable may not be live after
                    -- the JMP, but it may be used in the JMP+CMP instruction itself
                    local tmp_reg = 0
                    for reg = 0, 5 do
                        if reg ~= jmpi.dst_reg and reg ~= jmpi.src_reg then
                            tmp_reg = reg
                            break
                        end
                    end
                    -- Force materialization of constants at the end of the BB
                    for i, v in pairs(V) do
                        if not v.reg and cdef.isimmconst(v) then
                            vreg(i, tmp_reg) -- Load to TMP register (not saved)
                            reg_spill(i) -- Spill caller-saved registers
                        end
                    end
                    -- Record variable state
                    Vstate[c] = V
                    Vcomp = V
                    V = table_copy(V)
                -- Variable state already set, emit specific compensation code
                else
                    bb_end(Vcomp)
                end
                -- Record a pointer NULL check from the condition
                -- If the condition checks a pointer variable against NULL,
                -- we can assume it will not be NULL in the fall-through block
                if code.seen_null_guard then
                    local var = code.seen_null_guard
                    -- The null guard can have two forms:
                    --   if x == nil then goto
                    --   if x ~= nil then goto
                    -- The first form guarantees that the variable will be non-nil on the following instruction
                    -- The second form guarantees that the variable will be non-nil at the jump target
                    local vinfo = code.seen_null_guard_inverse and Vcomp[var] or V[var]
                    if vinfo.source then
                        local pos = vinfo.source:find('_or_null', 1, true)
                        if pos then
                            vinfo.source = vinfo.source:sub(1, pos - 1)
                        end
                    end
                end
                -- Reemit CMP insn
                emit(jmpi.code, jmpi.dst_reg, jmpi.src_reg, jmpi.off, jmpi.imm)
                -- Fuse JMP into the previous CMP opcode and mark the JMP target for fixup,
                -- as we don't know the relative offset in the generated code yet
                table.insert(val, code.pc-1)
                code.fixup[c] = val
            end
            code.seen_cmp = nil
            code.seen_null_guard = nil
            code.seen_null_guard_inverse = nil
        elseif c == code.bc_pc + 1 then -- luacheck: ignore 542
            -- Eliminate jumps to the next immediate instruction
            -- e.g. 0002 JMP 1 => 0003
        else
            -- We need to synthesise a condition that's always true, however
            -- BPF prohibits pointer arithmetic to prevent pointer leaks,
            -- so we have to clear out one register and use it for a cmp that's always true
            local dst_reg = reg_alloc(stackslots)
            V[stackslots].reg = nil -- Only temporary allocation
            -- First branch point, emit compensation code
            local Vcomp = Vstate[c]
            if not Vcomp then
                -- Force materialization of constants at the end of the BB
                for i, v in pairs(V) do
                    if not v.reg and cdef.isimmconst(v) then
                        vreg(i, dst_reg) -- Load to TMP register (not saved)
                        reg_spill(i) -- Spill caller-saved registers
                    end
                end
                -- Record variable state
                Vstate[c] = V
                V = table_copy(V)
            -- Variable state already set, emit specific compensation code
            else
                bb_end(Vcomp)
            end
            emit(BPF.ALU64 + BPF.MOV + BPF.K, dst_reg, 0, 0, 0)
            emit(BPF.JMP + BPF.JEQ + BPF.K, dst_reg, 0, 0xffff, 0)
            table.insert(val, code.pc-1) -- Fixup JMP target
            code.reachable = false -- Code following the JMP is not reachable
            code.fixup[c] = val
        end
    end,
    RET1 = function (a, _, _, _) -- RET1
        -- Free optimisation: a spilled variable will not be filled again
        for i, v in pairs(V) do
            if i ~= a then v.reg = nil end
        end
        if V[a].reg ~= 0 then vreg(a, 0) end
        -- Convenience: dereference pointer variables,
        -- e.g. 'return map[k]' will return the actual map value, not a pointer
        if cdef.isptr(V[a].type) then
            vderef(0, 0, V[a])
        end
        emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
        code.reachable = false
    end,
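    -- Note on the JMP handler above (illustrative only): every branch records
    -- its BPF pc in code.fixup[target] with a 0xffff placeholder offset; when
    -- emission later reaches the target bytecode line, the __call fixup pass
    -- below patches code.insn[pc].off = code.pc - 1 - pc, since a BPF jump
    -- offset is relative to the instruction following the jump.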
    RET0 = function (_, _, _, _) -- RET0
        emit(BPF.ALU64 + BPF.MOV + BPF.K, 0, 0, 0, 0)
        emit(BPF.JMP + BPF.EXIT, 0, 0, 0, 0)
        code.reachable = false
    end,
    compile = function ()
        return code
    end
}

-- Composite instructions
function BC.CALLT(a, _, _, d) -- Tailcall: return A(A+1, ..., A+D-1)
    CALL(a, 1, d)
    BC.RET1(a)
end

-- Always initialize R6 with the R1 context
emit(BPF.ALU64 + BPF.MOV + BPF.X, 6, 1, 0, 0)
-- Register R6 as the context variable (first argument)
if params and params > 0 then
    vset(0, 6, param_types[1] or proto.skb)
    assert(V[0].source == V[0].const.source) -- Propagate source annotation from typeinfo
end
-- Register tmpvars
vset(stackslots)
vset(stackslots+1)
return setmetatable(BC, {
    __index = function (_, k, _)
        if type(k) == 'number' then
            local op_str = string.sub(require('jit.vmdef').bcnames, 6*k+1, 6*k+6)
            error(string.format("NYI: opcode '0x%02x' (%-04s)", k, op_str))
        end
    end,
    __call = function (t, op, a, b, c, d)
        code.bc_pc = code.bc_pc + 1
        -- Exiting the BB straight through, emit compensation code
        if Vstate[code.bc_pc] then
            if code.reachable then
                -- The instruction is reachable from the previous line,
                -- so we must make the variable allocation consistent
                -- with the variable allocation at the jump source
                -- e.g. 0001 x:R0 = 5
                --      0002 if rand() then goto 0005
                --      0003 x:R0 -> x:stack
                --      0004 y:R0 = 5
                --      0005 x:? = 10 <-- x was in R0 before the jump, and on the stack after it
                bb_end(Vstate[code.bc_pc])
            else
                -- The instruction isn't reachable from the previous line, restore the variable layout
                -- e.g. RET or condition-less JMP on the previous line
                V = table_copy(Vstate[code.bc_pc])
            end
        end
        -- Perform fixup of jump targets
        -- We need to do this because the number of consumed and emitted
        -- bytecode instructions is different
        local fixup = code.fixup[code.bc_pc]
        if fixup ~= nil then
            -- Patch JMP source insn with relative offset
            for _,pc in ipairs(fixup) do
                code.insn[pc].off = code.pc - 1 - pc
            end
            code.fixup[code.bc_pc] = nil
            code.reachable = true
        end
        -- Execute
        if code.reachable then
            assert(t[op], string.format('NYI: instruction %s, parameters: %s,%s,%s,%s', op,a,b,c,d))
            return t[op](a, b, c, d)
        end
    end,
})
end
-- Emitted code dump
local function dump_mem(cls, ins, _, fuse)
    -- This is a very dense MEM instruction decoder without much explanation
    -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
    local mode = bit.band(ins.code, 0xe0)
    if mode == BPF.XADD then cls = 5 end -- The only mode
    local op_1 = {'LD', 'LDX', 'ST', 'STX', '', 'XADD'}
    local op_2 = {[0]='W', [8]='H', [16]='B', [24]='DW'}
    local name = op_1[cls+1] .. op_2[bit.band(ins.code, 0x18)]
    local off = tonumber(ffi.cast('int16_t', ins.off)) -- Reinterpret as signed
    local dst = cls < 2 and 'R'..ins.dst_reg or string.format('[R%d%+d]', ins.dst_reg, off)
    local src = cls % 2 == 0 and '#'..ins.imm or 'R'..ins.src_reg
    if cls == BPF.LDX then src = string.format('[R%d%+d]', ins.src_reg, off) end
    if mode == BPF.ABS then src = string.format('skb[%d]', ins.imm) end
    if mode == BPF.IND then src = string.format('skb[R%d%+d]', ins.src_reg, ins.imm) end
    return string.format('%s\t%s\t%s', fuse and '' or name, fuse and '' or dst, src)
end

local function dump_alu(cls, ins, pc)
    local alu = {'ADD', 'SUB', 'MUL', 'DIV', 'OR', 'AND', 'LSH', 'RSH', 'NEG', 'MOD', 'XOR', 'MOV', 'ARSH', 'END'}
    local jmp = {'JA', 'JEQ', 'JGT', 'JGE', 'JSET', 'JNE', 'JSGT', 'JSGE', 'CALL', 'EXIT'}
    local helper = {'unspec', 'map_lookup_elem', 'map_update_elem', 'map_delete_elem', 'probe_read', 'ktime_get_ns',
        'trace_printk', 'get_prandom_u32', 'get_smp_processor_id', 'skb_store_bytes',
        'l3_csum_replace', 'l4_csum_replace', 'tail_call', 'clone_redirect', 'get_current_pid_tgid',
        'get_current_uid_gid', 'get_current_comm', 'get_cgroup_classid', 'skb_vlan_push', 'skb_vlan_pop',
        'skb_get_tunnel_key', 'skb_set_tunnel_key', 'perf_event_read', 'redirect', 'get_route_realm',
        'perf_event_output', 'skb_load_bytes'}
    local op = 0
    -- This is a very dense ALU instruction decoder without much explanation
    -- Refer to https://www.kernel.org/doc/Documentation/networking/filter.txt for instruction format
    for i = 0,13 do if 0x10 * i == bit.band(ins.code, 0xf0) then op = i + 1 break end end
    local name = (cls == 5) and jmp[op] or alu[op]
    local src = (bit.band(ins.code, 0x08) == BPF.X) and 'R'..ins.src_reg or '#'..ins.imm
    local target = (cls == 5 and op < 9) and string.format('\t=> %04d', pc + ins.off + 1) or ''
    if cls == 5 and op == 9 then target = string.format('\t; %s', helper[ins.imm + 1] or tostring(ins.imm)) end
    return string.format('%s\t%s\t%s%s', name, 'R'..ins.dst_reg, src, target)
end

local function dump_string(code, off, hide_counter)
    if not code then return end
    local cls_map = {
        [0] = dump_mem, [1] = dump_mem, [2] = dump_mem, [3] = dump_mem,
        [4] = dump_alu, [5] = dump_alu, [7] = dump_alu,
    }
    local result = {}
    local fused = false
    for i = off or 0, code.pc - 1 do
        local ins = code.insn[i]
        local cls = bit.band(ins.code, 0x07)
        local line = cls_map[cls](cls, ins, i, fused)
        if hide_counter then
            table.insert(result, line)
        else
            table.insert(result, string.format('%04u\t%s', i, line))
        end
        fused = string.find(line, 'LDDW', 1)
    end
    return table.concat(result, '\n')
end

local function dump(code)
    if not code then return end
    print(string.format('-- BPF %s:0-%u', code.insn, code.pc))
    print(dump_string(code))
end
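-- Example of using the dump helpers (illustrative only; assumes this module
-- is loaded as 'bpf' via the table returned at the end of this file):
--
--   local bpf = require('bpf')
--   local code = compile(function (skb) return skb.len end, {proto.skb})
--   bpf.dump(code)                             -- numbered instruction listing
--   local s = bpf.dump_string(code, 0, true)   -- same listing, no counters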
local function compile(prog, params)
    -- Create the code emitter sandbox, including caller locals
    local env = { pkt=proto.pkt, eth=proto.pkt, BPF=BPF, ffi=ffi }
    -- Include upvalues up to 4 nested scopes back;
    -- the narrower scope overrides the broader scope
    for k = 5, 2, -1 do
        local i = 1
        while true do
            local ok, n, v = pcall(debug.getlocal, k, i)
            if not ok or not n then break end
            env[n] = v
            i = i + 1
        end
    end
    setmetatable(env, {
        __index = function (_, k)
            return proto[k] or builtins[k] or _G[k]
        end
    })
    -- Create the code emitter and compile LuaJIT bytecode
    if type(prog) == 'string' then prog = loadstring(prog) end
    -- Create an error handler to print a traceback
    local funci, pc = bytecode.funcinfo(prog), 0
    local E = create_emitter(env, funci.stackslots, funci.params, params or {})
    local on_err = function (e)
        funci = bytecode.funcinfo(prog, pc)
        local from, to = 0, 0
        for _ = 1, funci.currentline do
            from = to
            to = string.find(funci.source, '\n', from+1, true) or 0
        end
        print(funci.loc..':'..string.sub(funci.source, from+1, to-1))
        print('error: '..e)
        print(debug.traceback())
    end
    for _,op,a,b,c,d in bytecode.decoder(prog) do
        local ok, _, err = xpcall(E,on_err,op,a,b,c,d)
        if not ok then
            return nil, err
        end
    end
    return E:compile()
end

-- BPF map interface
local bpf_map_mt = {
    __gc = function (map) S.close(map.fd) end,
    __len = function (map) return map.max_entries end,
    __index = function (map, k)
        if type(k) == 'string' then
            -- Return iterator
            if k == 'pairs' then
                return function(t, key)
                    -- Get next key
                    local next_key = ffi.new(ffi.typeof(t.key))
                    local cur_key
                    if key then
                        cur_key = t.key
                        t.key[0] = key
                    else
                        cur_key = ffi.new(ffi.typeof(t.key))
                    end
                    local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_GET_NEXT_KEY, map.fd, cur_key, next_key)
                    if not ok then return nil, err end
                    -- Get next value
                    assert(S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, next_key, map.val))
                    return next_key[0], map.val[0]
                end, map, nil
            -- Reader for a perf event map
            elseif k == 'reader' then
                return function (pmap, pid, cpu, event_type)
                    -- Caller must specify either a PID or a CPU
                    if not pid or pid < 0 then
                        assert((cpu and cpu >= 0), 'NYI: creating composed reader for all CPUs')
                        pid = -1
                    end
                    -- Create the BPF output reader
                    local pe = S.t.perf_event_attr1()
                    pe[0].type = 'software'
                    pe[0].config = 'sw_bpf_output'
                    pe[0].sample_type = 'raw'
                    pe[0].sample_period = 1
                    pe[0].wakeup_events = 1
                    local reader, err = S.t.perf_reader(S.perf_event_open(pe, pid, cpu or -1))
                    if not reader then return nil, tostring(err) end
                    -- Register the event reader fd in the BPF map
                    assert(cpu < pmap.max_entries, string.format('BPF map smaller than read CPU %d', cpu))
                    pmap[cpu] = reader.fd
                    -- Open the memory map and start reading
                    local ok, err = reader:start()
                    assert(ok, tostring(err))
                    ok, err = reader:mmap()
                    assert(ok, tostring(err))
                    return cdef.event_reader(reader, event_type)
                end
            -- Signalise this is a map type
            end
            return k == '__map'
        end
        -- Retrieve key
        map.key[0] = k
        local ok, err = S.bpf_map_op(S.c.BPF_CMD.MAP_LOOKUP_ELEM, map.fd, map.key, map.val)
        if not ok then return nil, err end
        return ffi.new(map.val_type, map.val[0])
    end,
    __newindex = function (map, k, v)
        map.key[0] = k
        if v == nil then
            return S.bpf_map_op(S.c.BPF_CMD.MAP_DELETE_ELEM, map.fd, map.key, nil)
        end
        map.val[0] = v
        return S.bpf_map_op(S.c.BPF_CMD.MAP_UPDATE_ELEM, map.fd, map.key, map.val)
    end,
}
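-- Example of the map metatable in use (illustrative only; assumes the module
-- below is loaded as 'bpf'):
--
--   local counters = bpf.map('hash', 256)  -- u32 -> u32 by default
--   counters[1234] = 1                     -- MAP_UPDATE_ELEM
--   print(counters[1234])                  -- MAP_LOOKUP_ELEM
--   for k, v in counters.pairs, counters do print(k, v) end
--   counters[1234] = nil                   -- MAP_DELETE_ELEM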
-- Linux tracing interface
local function trace_check_enabled(path)
	path = path or '/sys/kernel/debug/tracing'
	if S.statfs(path) then return true end
	return nil, 'debugfs not accessible: try "mount -t debugfs nodev /sys/kernel/debug" (running as root may be required)'
end

-- Tracepoint interface
local tracepoint_mt = {
	__index = {
		bpf = function (t, prog)
			if type(prog) ~= 'table' then
				-- Create protocol parser with source probe
				prog = compile(prog, {proto.type(t.type, {source='ptr_to_probe'})})
			end
			-- Load the BPF program
			local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.TRACEPOINT, prog.insn, prog.pc)
			assert(prog_fd, tostring(err)..': '..tostring(log))
			-- Attach the program to the already-open tracepoint reader
			t.reader:setbpf(prog_fd:getfd())
			table.insert(t.progs, prog_fd)
			return prog_fd
		end,
	}
}
-- Open tracepoint
local function tracepoint_open(path, pid, cpu, group_fd)
	-- Open tracepoint and compile tracepoint type
	local tp = assert(S.perf_tracepoint('/sys/kernel/debug/tracing/events/'..path))
	local tp_type = assert(cdef.tracepoint_type(path))
	-- Open tracepoint reader and create interface
	local reader = assert(S.perf_attach_tracepoint(tp, pid, cpu, group_fd))
	return setmetatable({tp=tp, type=tp_type, reader=reader, progs={}}, tracepoint_mt)
end
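-- A minimal sketch of attaching a program to a kernel tracepoint through the
-- interface above (illustrative only; the tracepoint name and argument field
-- are hypothetical, and pid=-1/cpu=0 follow perf_event_open() conventions).
-- The program source is compiled with arg0 specialised to the tracepoint
-- type, loaded as a TRACEPOINT program, and attached to the perf reader:
--
--   local bpf = require('bpf')
--   local probe = assert(bpf.tracepoint('syscalls/sys_enter_write', nil, -1, 0))
--   probe:bpf(function (args)
--       print('write(fd=%d)', args.fd)
--   end)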
local function trace_bpf(ptype, pname, pdef, retprobe, prog, pid, cpu, group_fd)
	-- Load BPF program
	if type(prog) ~= 'table' then
		prog = compile(prog, {proto.pt_regs})
	end
	local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.KPROBE, prog.insn, prog.pc)
	assert(prog_fd, tostring(err)..': '..tostring(log))
	-- Create the probe and attach a perf reader to it
	local tp, err = S.perf_probe(ptype, pname, pdef, retprobe)
	if not tp then
		prog_fd:close()
		return nil, tostring(err)
	end
	local reader, err = S.perf_attach_tracepoint(tp, pid, cpu, group_fd, {sample_type='raw, callchain'})
	if not reader then
		prog_fd:close()
		S.perf_probe(ptype, pname, false)
		return nil, tostring(err)
	end
	local ok, err = reader:setbpf(prog_fd:getfd())
	if not ok then
		prog_fd:close()
		reader:close()
		S.perf_probe(ptype, pname, false)
		return nil, tostring(err)..' (kernel version must be at least 4.1)'
	end
	-- Create GC closure for reader to close BPF program
	-- and detach probe in the correct order
	ffi.gc(reader, function ()
		prog_fd:close()
		reader:close()
		S.perf_probe(ptype, pname, false)
	end)
	return {reader=reader, prog=prog_fd, probe=pname, probe_type=ptype}
end
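-- A minimal kprobe sketch routed through trace_bpf() above (illustrative
-- only; the probe alias 'myprobe' and traced symbol are hypothetical).
-- Cleanup order (program fd, perf reader, probe) is handled by the GC
-- closure installed above:
--
--   local bpf = require('bpf')
--   local probe = assert(bpf.kprobe('myprobe:sys_write', function (ptregs)
--       print('write entered')
--   end, false, -1, 0))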
-- Module interface
return setmetatable({
	new = create_emitter,
	dump = dump,
	dump_string = dump_string,
	maps = {},
	map = function (type, max_entries, key_ctype, val_ctype)
		if not key_ctype then key_ctype = ffi.typeof('uint32_t') end
		if not val_ctype then val_ctype = ffi.typeof('uint32_t') end
		if not max_entries then max_entries = 4096 end
		-- Special case for BPF_MAP_STACK_TRACE
		if S.c.BPF_MAP[type] == S.c.BPF_MAP.STACK_TRACE then
			key_ctype = ffi.typeof('int32_t')
			val_ctype = ffi.typeof('struct bpf_stacktrace')
		end
		local fd, err = S.bpf_map_create(S.c.BPF_MAP[type], ffi.sizeof(key_ctype), ffi.sizeof(val_ctype), max_entries)
		if not fd then return nil, tostring(err) end
		local map = setmetatable({
			max_entries = max_entries,
			key = ffi.new(ffi.typeof('$ [1]', key_ctype)),
			val = ffi.new(ffi.typeof('$ [1]', val_ctype)),
			map_type = S.c.BPF_MAP[type],
			key_type = key_ctype,
			val_type = val_ctype,
			fd = fd:nogc():getfd(),
		}, bpf_map_mt)
		return map
	end,
	socket = function (sock, prog)
		-- Expect a socket: if sock is a string, assume it is an interface
		-- name (e.g. 'lo'); if it is a number, treat it as a raw socket fd
		local ok, err
		if type(sock) == 'string' then
			local iface = assert(S.nl.getlink())[sock]
			assert(iface, sock..' is not an interface name')
			sock, err = S.socket('packet', 'raw')
			assert(sock, tostring(err))
			ok, err = sock:bind(S.t.sockaddr_ll({protocol='all', ifindex=iface.index}))
			assert(ok, tostring(err))
		elseif type(sock) == 'number' then
			sock = S.t.fd(sock):nogc()
		elseif ffi.istype(S.t.fd, sock) then -- luacheck: ignore
			-- No cast required
		else
			return nil, 'socket must either be an fd number, an interface name, or an ljsyscall socket'
		end
		-- Load program and attach it to socket
		if type(prog) ~= 'table' then
			prog = compile(prog, {proto.skb})
		end
		local prog_fd, err, log = S.bpf_prog_load(S.c.BPF_PROG.SOCKET_FILTER, prog.insn, prog.pc)
		assert(prog_fd, tostring(err)..': '..tostring(log))
		assert(sock:setsockopt('socket', 'attach_bpf', prog_fd:getfd()))
		return prog_fd, err
	end,
	tracepoint = function (tp, prog, pid, cpu, group_fd)
		assert(trace_check_enabled())
		-- Return tracepoint instance if no program is specified;
		-- this allows free specialisation of arg0 to the tracepoint type
		local probe = tracepoint_open(tp, pid, cpu, group_fd)
		-- Load the BPF program
		if prog then
			probe:bpf(prog)
		end
		return probe
	end,
	kprobe = function (tp, prog, retprobe, pid, cpu, group_fd)
		assert(trace_check_enabled())
		-- Parse the probe name and definition, then attach
		local pname, pdef = tp:match('([^:]+):(.+)')
		return trace_bpf('kprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
	end,
	uprobe = function (tp, prog, retprobe, pid, cpu, group_fd)
		assert(trace_check_enabled())
		-- Translate symbol to address
		local obj, sym_want = tp:match('([^:]+):(.+)')
		if not S.statfs(obj) then return nil, S.t.error(S.c.E.NOENT) end
		-- Resolve symbol in the ELF object (no support for anything else)
		local elf = require('bpf.elf').open(obj)
		local sym = elf:resolve(sym_want)
		if not sym then return nil, 'no such symbol' end
		sym = sym.st_value - elf:loadaddr()
		-- Print the 64-bit address as two halves, zero-padding the low
		-- 32 bits so the concatenation yields the exact offset
		local sym_addr = string.format('%x%08x', tonumber(bit.rshift(sym, 32)),
		                                         tonumber(ffi.cast('uint32_t', sym)))
		-- Convert it to the expected uprobe format
		local pname = string.format('%s_%s', obj:gsub('.*/', ''), sym_addr)
		local pdef = obj..':0x'..sym_addr
		return trace_bpf('uprobe', pname, pdef, retprobe, prog, pid, cpu, group_fd)
	end,
	tracelog = function (path)
		assert(trace_check_enabled())
		path = path or '/sys/kernel/debug/tracing/trace_pipe'
		return io.open(path, 'r')
	end,
	ntoh = builtins.ntoh, hton = builtins.hton,
}, {
	__call = function (_, prog) return compile(prog) end,
})
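-- A minimal end-to-end sketch of the module interface (illustrative only;
-- the interface name and dissector fields are assumptions). Calling the
-- module itself compiles a chunk; socket() loads it as a SOCKET_FILTER
-- program and attaches it with setsockopt(SO_ATTACH_BPF):
--
--   local bpf = require('bpf')
--   local map = assert(bpf.map('array', 256))
--   bpf.socket('lo', function (skb)
--       local proto = pkt.ip.proto -- packet dissector from bpf.proto
--       xadd(map[proto], 1)        -- atomic add builtin (assumed)
--   end)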