// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#include <trace/hooks/memory.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);

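/* Allocate one executable page for a trampoline image; it is freed with
 * bpf_jit_free_exec() once the image is retired.
 */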
void *bpf_jit_alloc_exec_page(void)
{
	void *image;

	image = bpf_jit_alloc_exec(PAGE_SIZE);
	if (!image)
		return NULL;

	set_vm_flush_reset_perms(image);
	/* Keep image as writable. The alternative is to keep flipping ro/rw
	 * every time a new program is attached or detached.
	 */
	set_memory_x((long)image, 1);
	trace_android_vh_set_memory_x((unsigned long)image, 1);
	return image;
}

void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
	ksym->start = (unsigned long) data;
	ksym->end = ksym->start + PAGE_SIZE;
	bpf_ksym_add(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, false, ksym->name);
}

void bpf_image_ksym_del(struct bpf_ksym *ksym)
{
	bpf_ksym_del(ksym);
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, true, ksym->name);
}

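/* Find the trampoline associated with @key, or allocate and hash a new one.
 * A reference is taken in both cases; drop it with bpf_trampoline_put().
 * Returns NULL if allocation fails.
 */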
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
	struct bpf_trampoline *tr;
	struct hlist_head *head;
	int i;

	mutex_lock(&trampoline_mutex);
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
	if (!tr)
		goto out;

	tr->key = key;
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
	mutex_init(&tr->mutex);
	for (i = 0; i < BPF_TRAMP_MAX; i++)
		INIT_HLIST_HEAD(&tr->progs_hlist[i]);
out:
	mutex_unlock(&trampoline_mutex);
	return tr;
}

static int is_ftrace_location(void *ip)
{
	long addr;

	addr = ftrace_location((long)ip);
	if (!addr)
		return 0;
	if (WARN_ON_ONCE(addr != (long)ip))
		return -EFAULT;
	return 1;
}

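/* Attaching the trampoline at the target's fentry site uses one of two
 * mechanisms: when the address is managed by ftrace, the trampoline is
 * installed as an ftrace direct call; otherwise the nop/call at the patch
 * site is rewritten directly via bpf_arch_text_poke().
 */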
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = unregister_ftrace_direct((long)ip, (long)old_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
	return ret;
}

static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	if (tr->func.ftrace_managed)
		ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
	return ret;
}

/* first time registering */
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
	void *ip = tr->func.addr;
	int ret;

	ret = is_ftrace_location(ip);
	if (ret < 0)
		return ret;
	tr->func.ftrace_managed = ret;

	if (tr->func.ftrace_managed)
		ret = register_ftrace_direct((long)ip, (long)new_addr);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
	return ret;
}

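/* Collect the programs currently attached to @tr into a freshly allocated
 * array of struct bpf_tramp_progs, one entry per BPF_TRAMP_* kind, and
 * return the overall count in *total. The caller must kfree() the result.
 */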
static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
{
	const struct bpf_prog_aux *aux;
	struct bpf_tramp_progs *tprogs;
	struct bpf_prog **progs;
	int kind;

	*total = 0;
	tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
	if (!tprogs)
		return ERR_PTR(-ENOMEM);

	for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
		tprogs[kind].nr_progs = tr->progs_cnt[kind];
		*total += tr->progs_cnt[kind];
		progs = tprogs[kind].progs;

		hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
			*progs++ = aux->prog;
	}
	return tprogs;
}

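/* Image teardown runs through a chain of callbacks. Roughly:
 *
 *   fexit/fmod_ret, CONFIG_PREEMPTION=y:
 *     call_rcu_tasks() -> __bpf_tramp_image_put_rcu_tasks -> percpu_ref_kill
 *     -> __bpf_tramp_image_release -> call_rcu_tasks()
 *     -> __bpf_tramp_image_put_rcu -> __bpf_tramp_image_put_deferred
 *
 *   fexit/fmod_ret, !CONFIG_PREEMPTION:
 *     same as above, minus the leading call_rcu_tasks() step
 *
 *   fentry only:
 *     call_rcu_tasks_trace() -> __bpf_tramp_image_put_rcu_tasks
 *     -> call_rcu_tasks() -> __bpf_tramp_image_put_rcu
 *     -> __bpf_tramp_image_put_deferred
 *
 * The "step" comments below refer to positions in these chains; see
 * bpf_tramp_image_put() for why each wait is needed.
 */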
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
{
	struct bpf_tramp_image *im;

	im = container_of(work, struct bpf_tramp_image, work);
	bpf_image_ksym_del(&im->ksym);
	trace_android_vh_set_memory_nx((unsigned long)im->image, 1);
	bpf_jit_free_exec(im->image);
	bpf_jit_uncharge_modmem(1);
	percpu_ref_exit(&im->pcref);
	kfree_rcu(im, rcu);
}

/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
	schedule_work(&im->work);
}

/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
static void __bpf_tramp_image_release(struct percpu_ref *pcref)
{
	struct bpf_tramp_image *im;

	im = container_of(pcref, struct bpf_tramp_image, pcref);
	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

/* callback, fexit or fentry step 1 */
static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
{
	struct bpf_tramp_image *im;

	im = container_of(rcu, struct bpf_tramp_image, rcu);
	if (im->ip_after_call)
		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
		percpu_ref_kill(&im->pcref);
	else
		/* the case of fentry trampoline */
		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

static void bpf_tramp_image_put(struct bpf_tramp_image *im)
{
	/* The trampoline image that calls the original function uses:
	 * rcu_read_lock_trace to protect sleepable bpf progs
	 * rcu_read_lock to protect normal bpf progs
	 * percpu_ref to protect the trampoline itself
	 * rcu tasks to protect trampoline asm not covered by percpu_ref
	 * (the few asm insns before __bpf_tramp_enter and
	 *  after __bpf_tramp_exit)
	 *
	 * The trampoline is unreachable before bpf_tramp_image_put().
	 *
	 * First, patch the trampoline to avoid calling into fexit progs.
	 * The progs will be freed even if the original function is still
	 * executing or sleeping.
	 * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait for the
	 * first few asm instructions to execute and call into
	 * __bpf_tramp_enter->percpu_ref_get.
	 * Then use percpu_ref_kill to wait for the trampoline and the original
	 * function to finish.
	 * Then use call_rcu_tasks() to make sure the few asm insns in
	 * the trampoline epilogue are done as well.
	 *
	 * In the !PREEMPT case the task that got interrupted in the first asm
	 * insns won't go through an RCU quiescent state which the
	 * percpu_ref_kill will be waiting for. Hence the first
	 * call_rcu_tasks() is not necessary.
	 */
	if (im->ip_after_call) {
		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
					     NULL, im->ip_epilogue);
		WARN_ON(err);
		if (IS_ENABLED(CONFIG_PREEMPTION))
			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
		else
			percpu_ref_kill(&im->pcref);
		return;
	}

	/* A trampoline without fexit and fmod_ret progs doesn't call the
	 * original function and doesn't use percpu_ref.
	 * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
	 * Then use call_rcu_tasks() to wait for the rest of trampoline asm
	 * and normal progs.
	 */
	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}

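/* Allocate a trampoline image: charge JIT memory, grab an executable page,
 * initialize the percpu ref that tracks in-flight callers, and publish the
 * image as a kallsyms symbol named bpf_trampoline_<key>_<idx>.
 */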
static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
{
	struct bpf_tramp_image *im;
	struct bpf_ksym *ksym;
	void *image;
	int err = -ENOMEM;

	im = kzalloc(sizeof(*im), GFP_KERNEL);
	if (!im)
		goto out;

	err = bpf_jit_charge_modmem(1);
	if (err)
		goto out_free_im;

	err = -ENOMEM;
	im->image = image = bpf_jit_alloc_exec_page();
	if (!image)
		goto out_uncharge;

	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
	if (err)
		goto out_free_image;

	ksym = &im->ksym;
	INIT_LIST_HEAD_RCU(&ksym->lnode);
	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
	bpf_image_ksym_add(image, ksym);
	return im;

out_free_image:
	bpf_jit_free_exec(im->image);
out_uncharge:
	bpf_jit_uncharge_modmem(1);
out_free_im:
	kfree(im);
out:
	return ERR_PTR(err);
}

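/* Regenerate the trampoline for the current set of attached progs and swap
 * it in at the patch site. With no progs left the fentry site is
 * unregistered and the current image is dropped. Called with tr->mutex held.
 */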
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
	struct bpf_tramp_image *im;
	struct bpf_tramp_progs *tprogs;
	u32 flags = BPF_TRAMP_F_RESTORE_REGS;
	int err, total;

	tprogs = bpf_trampoline_get_progs(tr, &total);
	if (IS_ERR(tprogs))
		return PTR_ERR(tprogs);

	if (total == 0) {
		err = unregister_fentry(tr, tr->cur_image->image);
		bpf_tramp_image_put(tr->cur_image);
		tr->cur_image = NULL;
		tr->selector = 0;
		goto out;
	}

	im = bpf_tramp_image_alloc(tr->key, tr->selector);
	if (IS_ERR(im)) {
		err = PTR_ERR(im);
		goto out;
	}

	if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
	    tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
		flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

	err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
					  &tr->func.model, flags, tprogs,
					  tr->func.addr);
	if (err < 0)
		goto out;

	WARN_ON(tr->cur_image && tr->selector == 0);
	WARN_ON(!tr->cur_image && tr->selector);
	if (tr->cur_image)
		/* progs already running at this address */
		err = modify_fentry(tr, tr->cur_image->image, im->image);
	else
		/* first time registering */
		err = register_fentry(tr, im->image);
	if (err)
		goto out;
	if (tr->cur_image)
		bpf_tramp_image_put(tr->cur_image);
	tr->cur_image = im;
	tr->selector++;
out:
	kfree(tprogs);
	return err;
}

static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
	switch (prog->expected_attach_type) {
	case BPF_TRACE_FENTRY:
		return BPF_TRAMP_FENTRY;
	case BPF_MODIFY_RETURN:
		return BPF_TRAMP_MODIFY_RETURN;
	case BPF_TRACE_FEXIT:
		return BPF_TRAMP_FEXIT;
	case BPF_LSM_MAC:
		if (!prog->aux->attach_func_proto->type)
			/* The function returns void, so we cannot modify its
			 * return value.
			 */
			return BPF_TRAMP_FEXIT;
		else
			return BPF_TRAMP_MODIFY_RETURN;
	default:
		return BPF_TRAMP_REPLACE;
	}
}

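/* Attach @prog to the trampoline @tr. Extension progs (BPF_TRAMP_REPLACE)
 * take over the whole patch site and are mutually exclusive with
 * fentry/fexit progs; all other kinds are added to the per-kind list and
 * the trampoline image is regenerated via bpf_trampoline_update().
 */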
int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err = 0;
	int cnt = 0, i;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (tr->extension_prog) {
		/* cannot attach fentry/fexit if extension prog is attached.
		 * cannot overwrite extension prog either.
		 */
		err = -EBUSY;
		goto out;
	}

	for (i = 0; i < BPF_TRAMP_MAX; i++)
		cnt += tr->progs_cnt[i];

	if (kind == BPF_TRAMP_REPLACE) {
		/* Cannot attach extension if fentry/fexit are in use. */
		if (cnt) {
			err = -EBUSY;
			goto out;
		}
		tr->extension_prog = prog;
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
					 prog->bpf_func);
		goto out;
	}
	if (cnt >= BPF_MAX_TRAMP_PROGS) {
		err = -E2BIG;
		goto out;
	}
	if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
		/* prog already linked */
		err = -EBUSY;
		goto out;
	}
	hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
	tr->progs_cnt[kind]++;
	err = bpf_trampoline_update(tr);
	if (err) {
		hlist_del(&prog->aux->tramp_hlist);
		tr->progs_cnt[kind]--;
	}
out:
	mutex_unlock(&tr->mutex);
	return err;
}

/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
	enum bpf_tramp_prog_type kind;
	int err;

	kind = bpf_attach_type_to_tramp(prog);
	mutex_lock(&tr->mutex);
	if (kind == BPF_TRAMP_REPLACE) {
		WARN_ON_ONCE(!tr->extension_prog);
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
					 tr->extension_prog->bpf_func, NULL);
		tr->extension_prog = NULL;
		goto out;
	}
	hlist_del(&prog->aux->tramp_hlist);
	tr->progs_cnt[kind]--;
	err = bpf_trampoline_update(tr);
out:
	mutex_unlock(&tr->mutex);
	return err;
}

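/* Look up (or create) the trampoline for @key and, on first use, record the
 * target function's model and address from @tgt_info. Pairs with
 * bpf_trampoline_put().
 */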
struct bpf_trampoline *bpf_trampoline_get(u64 key,
					  struct bpf_attach_target_info *tgt_info)
{
	struct bpf_trampoline *tr;

	tr = bpf_trampoline_lookup(key);
	if (!tr)
		return NULL;

	mutex_lock(&tr->mutex);
	if (tr->func.addr)
		goto out;

	memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
	tr->func.addr = (void *)tgt_info->tgt_addr;
out:
	mutex_unlock(&tr->mutex);
	return tr;
}

void bpf_trampoline_put(struct bpf_trampoline *tr)
{
	int i;

	if (!tr)
		return;
	mutex_lock(&trampoline_mutex);
	if (!refcount_dec_and_test(&tr->refcnt))
		goto out;
	WARN_ON_ONCE(mutex_is_locked(&tr->mutex));

	for (i = 0; i < BPF_TRAMP_MAX; i++)
		if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
			goto out;

	/* This code will be executed even when the last bpf_tramp_image
	 * is alive. All progs are detached from the trampoline and the
	 * trampoline image is patched with jmp into epilogue to skip
	 * fexit progs. The fentry-only trampoline will be freed via
	 * multiple rcu callbacks.
	 */
	hlist_del(&tr->hlist);
	kfree(tr);
out:
	mutex_unlock(&trampoline_mutex);
}

/* The logic is similar to BPF_PROG_RUN, but with an explicit
 * rcu_read_lock() and migrate_disable() which are required
 * for the trampoline. The macro is split into
 * call __bpf_prog_enter
 * call prog->bpf_func
 * call __bpf_prog_exit
 */
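/* As a rough, arch-independent sketch (not literal generated code), the
 * trampoline wraps each non-sleepable prog like:
 *
 *	start = __bpf_prog_enter();	// rcu_read_lock + migrate_disable
 *	prog->bpf_func(ctx);		// run the BPF program on the saved args
 *	__bpf_prog_exit(prog, start);	// stats, migrate_enable, rcu_read_unlock
 */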
u64 notrace __bpf_prog_enter(void)
	__acquires(RCU)
{
	u64 start = 0;

	rcu_read_lock();
	migrate_disable();
	if (static_branch_unlikely(&bpf_stats_enabled_key))
		start = sched_clock();
	return start;
}

void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
	__releases(RCU)
{
	struct bpf_prog_stats *stats;

	if (static_branch_unlikely(&bpf_stats_enabled_key) &&
	    /* static_key could be enabled in __bpf_prog_enter
	     * and disabled in __bpf_prog_exit.
	     * And vice versa.
	     * Hence check that 'start' is not zero.
	     */
	    start) {
		stats = this_cpu_ptr(prog->aux->stats);
		u64_stats_update_begin(&stats->syncp);
		stats->cnt++;
		stats->nsecs += sched_clock() - start;
		u64_stats_update_end(&stats->syncp);
	}
	migrate_enable();
	rcu_read_unlock();
}

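/* Sleepable progs may fault and sleep, so they are protected by
 * rcu_read_lock_trace() instead of plain RCU; might_fault() annotates that
 * taking a page fault (and thus sleeping) is legal at this point.
 */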
void notrace __bpf_prog_enter_sleepable(void)
{
	rcu_read_lock_trace();
	might_fault();
}

void notrace __bpf_prog_exit_sleepable(void)
{
	rcu_read_unlock_trace();
}

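/* Pin the trampoline image across the call into the original function.
 * The percpu ref taken here is what bpf_tramp_image_put() drains (via
 * __bpf_tramp_image_release) before the image can be freed.
 */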
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
	percpu_ref_get(&tr->pcref);
}

void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
{
	percpu_ref_put(&tr->pcref);
}

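/* Weak stub for architectures without trampoline support; attaching
 * fentry/fexit progs then fails with -ENOTSUPP. Each arch JIT provides its
 * own arch_prepare_bpf_trampoline() that emits the image into
 * [image, image_end).
 */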
int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
			    const struct btf_func_model *m, u32 flags,
			    struct bpf_tramp_progs *tprogs,
			    void *orig_call)
{
	return -ENOTSUPP;
}

static int __init init_trampolines(void)
{
	int i;

	for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
		INIT_HLIST_HEAD(&trampoline_table[i]);
	return 0;
}
late_initcall(init_trampolines);