/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/* Copyright (c) 2021, Oracle and/or its affiliates. */

#include "vmlinux.h"

#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>
#include <bpf/bpf_core_read.h>

#include "ksnoop.h"

/* For kretprobes, the instruction pointer in the struct pt_regs context
 * is the kretprobe_trampoline.  We derive the instruction pointer
 * by pushing it onto a function stack on entry and popping it on return.
 *
 * We could use bpf_get_func_ip(), but "stack mode" - where we
 * specify functions "a", "b" and "c" and only want to see a trace if "a"
 * calls "b" and "b" calls "c" - utilizes this stack to determine if trace
 * data should be collected.
 */
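/* Example (stack mode, functions "a b c"): on entry to "b" we push b's ip
 * at the current stack depth and note the previously pushed ip (presumed
 * caller "a") for the prev/next checks in get_trace(); on return from "b"
 * we pop the stack to recover b's ip, since the pt_regs ip only shows the
 * kretprobe trampoline.
 */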
#define FUNC_MAX_STACK_DEPTH	16
/* used to convince verifier we do not stray outside of array bounds */
#define FUNC_STACK_DEPTH_MASK	(FUNC_MAX_STACK_DEPTH - 1)
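/* Note: masking with FUNC_STACK_DEPTH_MASK only bounds the index correctly
 * because FUNC_MAX_STACK_DEPTH is a power of two.
 */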

#ifndef ENOSPC
#define ENOSPC		28
#endif

struct func_stack {
	__u64 task;
	__u64 ips[FUNC_MAX_STACK_DEPTH];
	__u8 stack_depth;
};

#define MAX_TASKS	2048

/* function call stack hashed on a per-task key */
struct {
	__uint(type, BPF_MAP_TYPE_HASH);
	/* function call stack for functions we are tracing */
	__uint(max_entries, MAX_TASKS);
	__type(key, __u64);
	__type(value, struct func_stack);
} ksnoop_func_stack SEC(".maps");

/* per-cpu trace info hashed on function address */
struct {
	__uint(type, BPF_MAP_TYPE_PERCPU_HASH);
	__uint(max_entries, MAX_FUNC_TRACES);
	__type(key, __u64);
	__type(value, struct trace);
} ksnoop_func_map SEC(".maps");

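/* perf event map used to stream completed trace records to user space,
 * which is expected to consume them via a perf buffer.
 */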
struct {
	__uint(type, BPF_MAP_TYPE_PERF_EVENT_ARRAY);
	__uint(value_size, sizeof(int));
	__uint(key_size, sizeof(int));
} ksnoop_perf_map SEC(".maps");

static void clear_trace(struct trace *trace)
{
	__builtin_memset(&trace->trace_data, 0, sizeof(trace->trace_data));
	trace->data_flags = 0;
	trace->buf_len = 0;
}

static struct trace *get_trace(struct pt_regs *ctx, bool entry)
{
	__u8 stack_depth, last_stack_depth;
	struct func_stack *func_stack;
	__u64 ip, last_ip = 0, task;
	struct trace *trace;

	task = bpf_get_current_task();

	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack) {
		struct func_stack new_stack = { .task = task };

		bpf_map_update_elem(&ksnoop_func_stack, &task, &new_stack,
				    BPF_NOEXIST);
		func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
		if (!func_stack)
			return NULL;
	}

	stack_depth = func_stack->stack_depth;
	if (stack_depth > FUNC_MAX_STACK_DEPTH)
		return NULL;

	if (entry) {
		ip = KSNOOP_IP_FIX(PT_REGS_IP_CORE(ctx));
		if (stack_depth >= FUNC_MAX_STACK_DEPTH - 1)
			return NULL;
		/* verifier doesn't like using "stack_depth - 1" as array index
		 * directly.
		 */
		last_stack_depth = stack_depth - 1;
		/* get address of last function we called */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		/* push ip onto stack. return will pop it. */
		func_stack->ips[stack_depth] = ip;
		/* mask used in case bounds checks are optimized out */
		stack_depth = (stack_depth + 1) & FUNC_STACK_DEPTH_MASK;
		func_stack->stack_depth = stack_depth;
		/* rather than zero stack entries on popping, we zero the
		 * (stack_depth + 1)'th entry when pushing the current
		 * entry.  The reason we take this approach is that
		 * when tracking the set of functions we returned from,
		 * we want the history of functions we returned from to
		 * be preserved.
		 */
		if (stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->ips[stack_depth] = 0;
	} else {
		if (stack_depth == 0 || stack_depth >= FUNC_MAX_STACK_DEPTH)
			return NULL;
		last_stack_depth = stack_depth;
		/* get address of last function we returned from */
		if (last_stack_depth >= 0 &&
		    last_stack_depth < FUNC_MAX_STACK_DEPTH)
			last_ip = func_stack->ips[last_stack_depth];
		if (stack_depth > 0) {
			/* masking with FUNC_STACK_DEPTH_MASK convinces the
			 * verifier that we don't end up with a < 0 value,
			 * translating to 0xff and an out-of-bounds map
			 * element access.
			 */
			stack_depth = (stack_depth - 1) & FUNC_STACK_DEPTH_MASK;
		}
		/* retrieve ip from stack as IP in pt_regs is
		 * bpf kretprobe trampoline address.
		 */
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			ip = func_stack->ips[stack_depth];
		if (stack_depth >= 0 && stack_depth < FUNC_MAX_STACK_DEPTH)
			func_stack->stack_depth = stack_depth;
	}

	trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
	if (!trace)
		return NULL;

	/* we may stash data on entry since predicates are a mix
	 * of entry/return; in such cases, trace->flags specifies
	 * KSNOOP_F_STASH, and we will output stashed data on return.
	 * If returning, make sure we don't clear our stashed data.
	 */
	if (!entry && (trace->flags & KSNOOP_F_STASH)) {
		/* skip clearing trace data */
		if (!(trace->data_flags & KSNOOP_F_STASHED)) {
			/* predicate must have failed */
			return NULL;
		}
		/* skip clearing trace data */
	} else {
		/* clear trace data before starting. */
		clear_trace(trace);
	}

	if (entry) {
		/* if in stack mode, check if previous fn matches */
		if (trace->prev_ip && trace->prev_ip != last_ip)
			return NULL;
		/* if tracing intermediate fn in stack of fns, stash data. */
		if (trace->next_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* we may stash data on entry since predicates are a mix
		 * of entry/return; in such cases, trace->flags specifies
		 * KSNOOP_F_STASH, and we will output stashed data on return.
		 */
		if (trace->flags & KSNOOP_F_STASH)
			trace->data_flags |= KSNOOP_F_STASH;
		/* otherwise the data is output (because we've reached
		 * the last fn in the set of fns specified).
		 */
	} else {
		/* In stack mode, check if next fn matches the last fn
		 * we returned from; i.e. "a" called "b", and now
		 * we're at "a", was the last fn we returned from "b"?
		 * If so, stash data for later display (when we reach the
		 * first fn in the set of stack fns).
		 */
		if (trace->next_ip && trace->next_ip != last_ip)
			return NULL;
		if (trace->prev_ip)
			trace->data_flags |= KSNOOP_F_STASH;
		/* If there is no "prev" function, i.e. we are at the
		 * first function in a set of stack functions, the trace
		 * info is shown (along with any stashed info associated
		 * with callers).
		 */
	}
	trace->task = task;
	return trace;
}

static void output_trace(struct pt_regs *ctx, struct trace *trace)
{
	__u16 trace_len;

	if (trace->buf_len == 0)
		goto skip;

	/* we may be simply stashing values, and will report later */
	if (trace->data_flags & KSNOOP_F_STASH) {
		trace->data_flags &= ~KSNOOP_F_STASH;
		trace->data_flags |= KSNOOP_F_STASHED;
		return;
	}
	/* we may be outputting earlier stashed data */
	if (trace->data_flags & KSNOOP_F_STASHED)
		trace->data_flags &= ~KSNOOP_F_STASHED;

	/* trim perf event size to only contain data we've recorded. */
	trace_len = sizeof(*trace) + trace->buf_len - MAX_TRACE_BUF;

	if (trace_len <= sizeof(*trace))
		bpf_perf_event_output(ctx, &ksnoop_perf_map,
				      BPF_F_CURRENT_CPU,
				      trace, trace_len);
skip:
	clear_trace(trace);
}

static void output_stashed_traces(struct pt_regs *ctx,
				  struct trace *currtrace,
				  bool entry)
{
	struct func_stack *func_stack;
	struct trace *trace = NULL;
	__u8 i;
	__u64 task = 0;

	task = bpf_get_current_task();
	func_stack = bpf_map_lookup_elem(&ksnoop_func_stack, &task);
	if (!func_stack)
		return;

	if (entry) {
		/* iterate from bottom to top of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * we called before the current function.
		 */
		for (i = 0;
		     i < func_stack->stack_depth - 1 && i < FUNC_MAX_STACK_DEPTH;
		     i++) {
			trace = bpf_map_lookup_elem(&ksnoop_func_map,
						    &func_stack->ips[i]);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	} else {
		/* iterate from top to bottom of stack, outputting stashed
		 * data we find.  This corresponds to the set of functions
		 * that returned prior to the current returning function.
		 */
		/* start below the top of the ips[] array to stay in bounds */
		for (i = FUNC_MAX_STACK_DEPTH - 1; i > 0; i--) {
			__u64 ip;

			ip = func_stack->ips[i];
			if (!ip)
				continue;
			trace = bpf_map_lookup_elem(&ksnoop_func_map, &ip);
			if (!trace || !(trace->data_flags & KSNOOP_F_STASHED))
				break;
			if (trace->task != task)
				return;
			output_trace(ctx, trace);
		}
	}
	/* finally output the current trace info */
	output_trace(ctx, currtrace);
}

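/* Map the requested argument number to the corresponding pt_regs register
 * using the CO-RE-based accessors, so the read works across architectures.
 * KSNOOP_RETURN reads the return value register and is only meaningful
 * from the kretprobe.
 */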
static __u64 get_arg(struct pt_regs *ctx, enum arg argnum)
{
	switch (argnum) {
	case KSNOOP_ARG1:
		return PT_REGS_PARM1_CORE(ctx);
	case KSNOOP_ARG2:
		return PT_REGS_PARM2_CORE(ctx);
	case KSNOOP_ARG3:
		return PT_REGS_PARM3_CORE(ctx);
	case KSNOOP_ARG4:
		return PT_REGS_PARM4_CORE(ctx);
	case KSNOOP_ARG5:
		return PT_REGS_PARM5_CORE(ctx);
	case KSNOOP_RETURN:
		return PT_REGS_RC_CORE(ctx);
	default:
		return 0;
	}
}

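/* Common entry/return handler: look up the trace metadata for the current
 * function, apply the pid filter, copy each requested value or buffer into
 * the trace, evaluate any predicates, and either emit the result via the
 * perf event map or stash it for stack mode.
 */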
static int ksnoop(struct pt_regs *ctx, bool entry)
{
	void *data_ptr = NULL;
	struct trace *trace;
	__u64 data;
	__u32 currpid;
	int ret;
	__u8 i;

	trace = get_trace(ctx, entry);
	if (!trace)
		return 0;

	/* make sure we want events from this pid */
	currpid = bpf_get_current_pid_tgid();
	if (trace->filter_pid && trace->filter_pid != currpid)
		return 0;
	trace->pid = currpid;

	trace->cpu = bpf_get_smp_processor_id();
	trace->time = bpf_ktime_get_ns();

	trace->data_flags &= ~(KSNOOP_F_ENTRY | KSNOOP_F_RETURN);
	if (entry)
		trace->data_flags |= KSNOOP_F_ENTRY;
	else
		trace->data_flags |= KSNOOP_F_RETURN;

	for (i = 0; i < MAX_TRACES; i++) {
		struct trace_data *currdata;
		struct value *currtrace;
		char *buf_offset = NULL;
		__u32 tracesize;

		currdata = &trace->trace_data[i];
		currtrace = &trace->traces[i];

		if ((entry && !base_arg_is_entry(currtrace->base_arg)) ||
		    (!entry && base_arg_is_entry(currtrace->base_arg)))
			continue;

		/* skip void (unused) trace arguments, ensuring not to
		 * skip "void *".
		 */
		if (currtrace->type_id == 0 &&
		    !(currtrace->flags & KSNOOP_F_PTR))
			continue;

		data = get_arg(ctx, currtrace->base_arg);

		/* look up member value and read into data field. */
		if (currtrace->flags & KSNOOP_F_MEMBER) {
			if (currtrace->offset)
				data += currtrace->offset;

			/* member is a pointer; read it in */
			if (currtrace->flags & KSNOOP_F_PTR) {
				void *dataptr = (void *)data;

				ret = bpf_probe_read(&data, sizeof(data),
						     dataptr);
				if (ret) {
					currdata->err_type_id =
						currtrace->type_id;
					currdata->err = ret;
					continue;
				}
				currdata->raw_value = data;
			} else if (currtrace->size <=
				   sizeof(currdata->raw_value)) {
				/* read member value for predicate comparison */
				bpf_probe_read(&currdata->raw_value,
					       currtrace->size,
					       (void *)data);
			}
		} else {
			currdata->raw_value = data;
		}

		/* simple predicate evaluation: if any predicate fails,
		 * skip all tracing for this function.
		 */
		if (currtrace->flags & KSNOOP_F_PREDICATE_MASK) {
			bool ok = false;

			if (currtrace->flags & KSNOOP_F_PREDICATE_EQ &&
			    currdata->raw_value == currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_NOTEQ &&
			    currdata->raw_value != currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_GT &&
			    currdata->raw_value > currtrace->predicate_value)
				ok = true;

			if (currtrace->flags & KSNOOP_F_PREDICATE_LT &&
			    currdata->raw_value < currtrace->predicate_value)
				ok = true;

			if (!ok) {
				clear_trace(trace);
				return 0;
			}
		}

		if (currtrace->flags & (KSNOOP_F_PTR | KSNOOP_F_MEMBER))
			data_ptr = (void *)data;
		else
			data_ptr = &data;

		if (trace->buf_len + MAX_TRACE_DATA >= MAX_TRACE_BUF)
			break;

		buf_offset = &trace->buf[trace->buf_len];
		if (buf_offset > &trace->buf[MAX_TRACE_BUF]) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = -ENOSPC;
			continue;
		}
		currdata->buf_offset = trace->buf_len;

		tracesize = currtrace->size;
		if (tracesize > MAX_TRACE_DATA)
			tracesize = MAX_TRACE_DATA;
		ret = bpf_probe_read(buf_offset, tracesize, data_ptr);
		if (ret < 0) {
			currdata->err_type_id = currtrace->type_id;
			currdata->err = ret;
			continue;
		} else {
			currdata->buf_len = tracesize;
			trace->buf_len += tracesize;
		}
	}

	/* show accumulated stashed traces (if any) */
	if ((entry && trace->prev_ip && !trace->next_ip) ||
	    (!entry && trace->next_ip && !trace->prev_ip))
		output_stashed_traces(ctx, trace, entry);
	else
		output_trace(ctx, trace);

	return 0;
}

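/* "foo" in the section names below is a placeholder; the ksnoop loader is
 * expected to attach these programs to each requested kernel function at
 * run time rather than relying on the section name for the attach target.
 */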
SEC("kprobe/foo")
int kprobe_entry(struct pt_regs *ctx)
{
	return ksnoop(ctx, true);
}

SEC("kretprobe/foo")
int kprobe_return(struct pt_regs *ctx)
{
	return ksnoop(ctx, false);
}

char _license[] SEC("license") = "Dual BSD/GPL";