/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* Copyright (c) 2021, Oracle and/or its affiliates. */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "ksnoop.h"

/* For kretprobes, the instruction pointer in the struct pt_regs context
 * is the kretprobe_trampoline.  We derive the instruction pointer
 * by pushing it onto a function stack on entry and popping it on return.
 *
 * We could use bpf_get_func_ip(), but "stack mode" - where we
 * specify functions "a", "b" and "c" and only want to see a trace if "a"
 * calls "b" and "b" calls "c" - utilizes this stack to determine if trace
 * data should be collected.
 */
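/* Example (stack mode, functions "a b c"): on entry to "b" we push b's ip
 * at the current stack depth and note the previously pushed ip (presumed
 * caller "a") for the prev/next checks in get_trace(); on return from "b"
 * we pop the stack to recover b's ip, since the pt_regs ip only shows the
 * kretprobe trampoline.
 */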
#define FUNC_MAX_STACK_DEPTH	16
/* used to convince verifier we do not stray outside of array bounds */
#define FUNC_STACK_DEPTH_MASK	(FUNC_MAX_STACK_DEPTH - 1)
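/* Note: masking with FUNC_STACK_DEPTH_MASK only bounds the index correctly
 * because FUNC_MAX_STACK_DEPTH is a power of two.
 */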

#ifndef ENOSPC
#define ENOSPC		28
#endif

struct func_stack {
	__u64 task;
	__u64 ips[FUNC_MAX_STACK_DEPTH];
	__u8 stack_depth;
};

#define MAX_TASKS	2048

/* function call stack hashed on a per-task key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	/* function call stack for functions we are tracing */
	__uint(max_entries, MAX_TASKS);
	__type(key, __u64);
	__type(value, struct func_stack);
} ksnoop_func_stack SEC(".maps");

/* per-cpu trace info hashed on function address */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, MAX_FUNC_TRACES);
	__type(key, __u64);
	__type(value, struct trace);
} ksnoop_func_map SEC(".maps");

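/* perf event map used to stream completed trace records to user space,
 * which is expected to consume them via a perf buffer.
 */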
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(value_size, sizeof(int));
	__uint(key_size, sizeof(int));
} ksnoop_perf_map SEC(".maps");

static void clear_trace(struct trace *trace)
{
	__builtin_memset(&trace->trace_data, 0, sizeof(trace->trace_data));
	trace->data_flags = 0;
	trace->buf_len = 0;
}

static struct trace *get_trace(struct pt_regs *ctx, bool entry)
{
	__u8 stack_depth, last_stack_depth;
	struct func_stack *func_stack;
	__u64 ip, last_ip = 0, task;
	struct trace *trace;

	task = bpf_get_current_task();

	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack) {
		struct func_stack new_stack = { .task = task };

		bpf_map_update_elem(&ksnoop_func_stack, &task, &new_stack,
				    BPF_NOEXIST);
		func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
		if (!func_stack)
			return NULL;
	}

	stack_depth = func_stack->stack_depth;
	if (stack_depth > FUNC_MAX_STACK_DEPTH)
		return NULL;

	if (entry) {
		ip = KSNOOP_IP_FIX(PT_REGS_IP_CORE(ctx));
		if (stack_depth >= FUNC_MAX_STACK_DEPTH - 1)
			return NULL;
		/* verifier doesn't like using "stack_depth - 1" as array index
		 * directly.
		 */
		last_stack_depth = stack_depth - 1;
		/* get address of last function we called */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		/* push ip onto stack. return will pop it. */
		func_stack->ips[stack_depth] = ip;
		/* mask used in case bounds checks are optimized out */
		stack_depth = (stack_depth + 1) & FUNC_STACK_DEPTH_MASK;
		func_stack->stack_depth = stack_depth;
		/* rather than zero stack entries on popping, we zero the
		 * (stack_depth + 1)'th entry when pushing the current
		 * entry.  The reason we take this approach is that
		 * when tracking the set of functions we returned from,
		 * we want the history of functions we returned from to
		 * be preserved.
		 */
		if (stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->ips[stack_depth] = 0;
	} else {
		if (stack_depth == 0 || stack_depth >= FUNC_MAX_STACK_DEPTH)
			return NULL;
		last_stack_depth = stack_depth;
		/* get address of last function we returned from */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		if (stack_depth > 0) {
			/* masking with FUNC_STACK_DEPTH_MASK convinces the
			 * verifier that we don't end up with a < 0 value,
			 * translating to 0xff and an out-of-bounds map
			 * element access.
			 */
			stack_depth = (stack_depth - 1) & FUNC_STACK_DEPTH_MASK;
		}
		/* retrieve ip from stack as IP in pt_regs is
		 * bpf kretprobe trampoline address.
		 */
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			ip = func_stack->ips[stack_depth];
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->stack_depth = stack_depth;
	}

	trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
	if (!trace)
		return NULL;

	/* we may stash data on entry since predicates are a mix
	 * of entry/return; in such cases, trace->flags specifies
	 * KSNOOP_F_STASH, and we will output stashed data on return.
	 * If returning, make sure we don't clear our stashed data.
	 */
	if (!entry && (trace->flags & KSNOOP_F_STASH)) {
		/* skip clearing trace data */
		if (!(trace->data_flags & KSNOOP_F_STASHED)) {
			/* predicate must have failed */
			return NULL;
		}
		/* skip clearing trace data */
	} else {
		/* clear trace data before starting. */
		clear_trace(trace);
	}

	if (entry) {
		/* if in stack mode, check if previous fn matches */
		if (trace->prev_ip && trace->prev_ip != last_ip)
			return NULL;
		/* if tracing intermediate fn in stack of fns, stash data. */
		if (trace->next_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* we may stash data on entry since predicates are a mix
		 * of entry/return; in such cases, trace->flags specifies
		 * KSNOOP_F_STASH, and we will output stashed data on return.
		 */
		if (trace->flags & KSNOOP_F_STASH)
			trace->data_flags |= KSNOOP_F_STASH;
		/* otherwise the data is output (because we've reached
		 * the last fn in the set of fns specified).
		 */
	} else {
		/* In stack mode, check if next fn matches the last fn
		 * we returned from; i.e. "a" called "b", and now
		 * we're at "a", was the last fn we returned from "b"?
		 * If so, stash data for later display (when we reach the
		 * first fn in the set of stack fns).
		 */
		if (trace->next_ip && trace->next_ip != last_ip)
			return NULL;
		if (trace->prev_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* If there is no "prev" function, i.e. we are at the
		 * first function in a set of stack functions, the trace
		 * info is shown (along with any stashed info associated
		 * with callers).
		 */
	}
	trace->task = task;
	return trace;
}

static void output_trace(struct pt_regs *ctx, struct trace *trace)
{
	__u16 trace_len;

	if (trace->buf_len == 0)
		goto skip;

	/* we may be simply stashing values, and will report later */
	if (trace->data_flags & KSNOOP_F_STASH) {
		trace->data_flags &= ~KSNOOP_F_STASH;
		trace->data_flags |= KSNOOP_F_STASHED;
		return;
	}
	/* we may be outputting earlier stashed data */
	if (trace->data_flags & KSNOOP_F_STASHED)
		trace->data_flags &= ~KSNOOP_F_STASHED;

	/* trim perf event size to only contain data we've recorded. */
	trace_len = sizeof(*trace) + trace->buf_len - MAX_TRACE_BUF;

	if (trace_len <= sizeof(*trace))
		bpf_perf_event_output(ctx, &ksnoop_perf_map,
				      BPF_F_CURRENT_CPU,
				      trace, trace_len);
skip:
	clear_trace(trace);
}

static void output_stashed_traces(struct pt_regs *ctx,
				  struct trace *currtrace,
				  bool entry)
{
	struct func_stack *func_stack;
	struct trace *trace = NULL;
	__u8 i;
	__u64 task = 0;

	task = bpf_get_current_task();
	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack)
		return;

	if (entry) {
		/* iterate from bottom to top of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * we called before the current function.
		 */
		for (i = 0;
		     i < func_stack->stack_depth - 1 && i < FUNC_MAX_STACK_DEPTH;
		     i++) {
			trace = bpf_map_lookup_elem(&ksnoop_func_map,
						    &func_stack->ips[i]);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	} else {
		/* iterate from top to bottom of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * that returned prior to the current returning function.
		 */
		/* start below the top of the ips[] array to stay in bounds */
		for (i = FUNC_MAX_STACK_DEPTH - 1; i > 0; i--) {
			__u64 ip;

			ip = func_stack->ips[i];
			if (!ip)
				continue;
			trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	}
	/* finally output the current trace info */
	output_trace(ctx, currtrace);
}

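/* Map the requested argument number to the corresponding pt_regs register
 * using the CO-RE-based accessors, so the read works across architectures.
 * KSNOOP_RETURN reads the return value register and is only meaningful
 * from the kretprobe.
 */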
static __u64 get_arg(struct pt_regs *ctx, enum arg argnum)
{
	switch (argnum) {
	case KSNOOP_ARG1:
		return PT_REGS_PARM1_CORE(ctx);
	case KSNOOP_ARG2:
		return PT_REGS_PARM2_CORE(ctx);
	case KSNOOP_ARG3:
		return PT_REGS_PARM3_CORE(ctx);
	case KSNOOP_ARG4:
		return PT_REGS_PARM4_CORE(ctx);
	case KSNOOP_ARG5:
		return PT_REGS_PARM5_CORE(ctx);
	case KSNOOP_RETURN:
		return PT_REGS_RC_CORE(ctx);
	default:
		return 0;
	}
}

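/* Common entry/return handler: look up the trace metadata for the current
 * function, apply the pid filter, copy each requested value or buffer into
 * the trace, evaluate any predicates, and either emit the result via the
 * perf event map or stash it for stack mode.
 */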
static int ksnoop(struct pt_regs *ctx, bool entry)
{
	void *data_ptr = NULL;
	struct trace *trace;
	__u64 data;
	__u32 currpid;
	int ret;
	__u8 i;

	trace = get_trace(ctx, entry);
	if (!trace)
		return 0;

	/* make sure we want events from this pid */
	currpid = bpf_get_current_pid_tgid();
	if (trace->filter_pid && trace->filter_pid != currpid)
		return 0;
	trace->pid = currpid;

	trace->cpu = bpf_get_smp_processor_id();
	trace->time = bpf_ktime_get_ns();

	trace->data_flags &= ~(KSNOOP_F_ENTRY | KSNOOP_F_RETURN);
	if (entry)
		trace->data_flags |= KSNOOP_F_ENTRY;
	else
		trace->data_flags |= KSNOOP_F_RETURN;

	for (i = 0; i < MAX_TRACES; i++) {
		struct trace_data *currdata;
		struct value *currtrace;
		char *buf_offset = NULL;
		__u32 tracesize;

		currdata = &trace->trace_data[i];
		currtrace = &trace->traces[i];

		if ((entry && !base_arg_is_entry(currtrace->base_arg)) ||
		    (!entry && base_arg_is_entry(currtrace->base_arg)))
			continue;

		/* skip void (unused) trace arguments, ensuring not to
		 * skip "void *".
		 */
		if (currtrace->type_id == 0 &&
		    !(currtrace->flags & KSNOOP_F_PTR))
			continue;

		data = get_arg(ctx, currtrace->base_arg);

		/* look up member value and read into data field. */
		if (currtrace->flags & KSNOOP_F_MEMBER) {
			if (currtrace->offset)
				data += currtrace->offset;

			/* member is a pointer; read it in */
			if (currtrace->flags & KSNOOP_F_PTR) {
				void *dataptr = (void *)data;

				ret = bpf_probe_read(&data, sizeof(data),
						     dataptr);
				if (ret) {
					currdata->err_type_id =
						currtrace->type_id;
					currdata->err = ret;
					continue;
				}
				currdata->raw_value = data;
			} else if (currtrace->size <=
				   sizeof(currdata->raw_value)) {
				/* read member value for predicate comparison */
				bpf_probe_read(&currdata->raw_value,
					       currtrace->size,
					       (void *)data);
			}
		} else {
			currdata->raw_value = data;
		}

		/* simple predicate evaluation: if any predicate fails,
		 * skip all tracing for this function.
		 */
		if (currtrace->flags & KSNOOP_F_PREDICATE_MASK) {
			bool ok = false;

			if (currtrace->flags & KSNOOP_F_PREDICATE_EQ &&
			    currdata->raw_value == currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_NOTEQ &&
			    currdata->raw_value != currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_GT &&
			    currdata->raw_value > currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_LT &&
			    currdata->raw_value < currtrace->predicate_value)
				ok = true;

			if (!ok) {
				clear_trace(trace);
				return 0;
			}
		}

		if (currtrace->flags & (KSNOOP_F_PTR | KSNOOP_F_MEMBER))
			data_ptr = (void *)data;
		else
			data_ptr = &data;

		if (trace->buf_len + MAX_TRACE_DATA >= MAX_TRACE_BUF)
			break;

		buf_offset = &trace->buf[trace->buf_len];
		if (buf_offset > &trace->buf[MAX_TRACE_BUF]) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = -ENOSPC;
			continue;
		}
		currdata->buf_offset = trace->buf_len;

		tracesize = currtrace->size;
		if (tracesize > MAX_TRACE_DATA)
			tracesize = MAX_TRACE_DATA;
		ret = bpf_probe_read(buf_offset, tracesize, data_ptr);
		if (ret < 0) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = ret;
			continue;
		} else {
			currdata->buf_len = tracesize;
			trace->buf_len += tracesize;
		}
	}

	/* show accumulated stashed traces (if any) */
	if ((entry && trace->prev_ip && !trace->next_ip) ||
	    (!entry && trace->next_ip && !trace->prev_ip))
		output_stashed_traces(ctx, trace, entry);
	else
		output_trace(ctx, trace);

	return 0;
}

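/* "foo" in the section names below is a placeholder; the ksnoop loader is
 * expected to attach these programs to each requested kernel function at
 * run time rather than relying on the section name for the attach target.
 */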
SEC("kprobe/foo")
int kprobe_entry(struct pt_regs *ctx)
{
	return ksnoop(ctx, true);
}

SEC("kretprobe/foo")
int kprobe_return(struct pt_regs *ctx)
{
	return ksnoop(ctx, false);
}

char _license[] SEC("license") = "Dual BSD/GPL";