// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (c) 2019 Facebook */
#include <linux/hash.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <linux/ftrace.h>
#include <linux/rbtree_latch.h>
#include <linux/perf_event.h>
#include <linux/btf.h>
#include <linux/rcupdate_trace.h>
#include <linux/rcupdate_wait.h>
#include <trace/hooks/memory.h>

/* dummy _ops. The verifier will operate on target program's ops. */
const struct bpf_verifier_ops bpf_extension_verifier_ops = {
};
const struct bpf_prog_ops bpf_extension_prog_ops = {
};

/* btf_vmlinux has ~22k attachable functions. 1k htab is enough. */
#define TRAMPOLINE_HASH_BITS 10
#define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS)

static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE];

/* serializes access to trampoline_table */
static DEFINE_MUTEX(trampoline_mutex);

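/* Allocate a single page of executable memory to hold a JITed image. */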
void *bpf_jit_alloc_exec_page(void)
{
        void *image;

        image = bpf_jit_alloc_exec(PAGE_SIZE);
        if (!image)
                return NULL;

        set_vm_flush_reset_perms(image);
        /* Keep image as writeable. The alternative is to keep flipping ro/rw
         * every time a new program is attached or detached.
         */
        set_memory_x((long)image, 1);
        trace_android_vh_set_memory_x((unsigned long)image, 1);
        return image;
}

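/* Publish the image as a kallsyms entry and emit a perf KSYMBOL record so
 * profilers can symbolize trampoline addresses; bpf_image_ksym_del() undoes
 * both.
 */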
void bpf_image_ksym_add(void *data, struct bpf_ksym *ksym)
{
        ksym->start = (unsigned long) data;
        ksym->end = ksym->start + PAGE_SIZE;
        bpf_ksym_add(ksym);
        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
                           PAGE_SIZE, false, ksym->name);
}

void bpf_image_ksym_del(struct bpf_ksym *ksym)
{
        bpf_ksym_del(ksym);
        perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
                           PAGE_SIZE, true, ksym->name);
}

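/* Find the trampoline for @key, or allocate and hash a new one if none
 * exists yet. Returns with a reference held; returns NULL only on
 * allocation failure. Serialized by trampoline_mutex.
 */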
static struct bpf_trampoline *bpf_trampoline_lookup(u64 key)
{
        struct bpf_trampoline *tr;
        struct hlist_head *head;
        int i;

        mutex_lock(&trampoline_mutex);
        head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
        hlist_for_each_entry(tr, head, hlist) {
                if (tr->key == key) {
                        refcount_inc(&tr->refcnt);
                        goto out;
                }
        }
        tr = kzalloc(sizeof(*tr), GFP_KERNEL);
        if (!tr)
                goto out;

        tr->key = key;
        INIT_HLIST_NODE(&tr->hlist);
        hlist_add_head(&tr->hlist, head);
        refcount_set(&tr->refcnt, 1);
        mutex_init(&tr->mutex);
        for (i = 0; i < BPF_TRAMP_MAX; i++)
                INIT_HLIST_HEAD(&tr->progs_hlist[i]);
out:
        mutex_unlock(&trampoline_mutex);
        return tr;
}

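/* Returns 1 if @ip is an ftrace-managed location, 0 if it is not, and
 * -EFAULT (with a warning) if ftrace reports an address that does not
 * match @ip exactly.
 */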
static int is_ftrace_location(void *ip)
{
        long addr;

        addr = ftrace_location((long)ip);
        if (!addr)
                return 0;
        if (WARN_ON_ONCE(addr != (long)ip))
                return -EFAULT;
        return 1;
}

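/* Detach the trampoline at @old_addr from the target's fentry site, via the
 * ftrace direct-call API when the site is ftrace-managed or via
 * bpf_arch_text_poke() otherwise. modify_fentry() below follows the same
 * split when switching to a new image.
 */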
static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr)
{
        void *ip = tr->func.addr;
        int ret;

        if (tr->func.ftrace_managed)
                ret = unregister_ftrace_direct((long)ip, (long)old_addr);
        else
                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
        return ret;
}

static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr)
{
        void *ip = tr->func.addr;
        int ret;

        if (tr->func.ftrace_managed)
                ret = modify_ftrace_direct((long)ip, (long)old_addr, (long)new_addr);
        else
                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
        return ret;
}

/* first time registering */
static int register_fentry(struct bpf_trampoline *tr, void *new_addr)
{
        void *ip = tr->func.addr;
        int ret;

        ret = is_ftrace_location(ip);
        if (ret < 0)
                return ret;
        tr->func.ftrace_managed = ret;

        if (tr->func.ftrace_managed)
                ret = register_ftrace_direct((long)ip, (long)new_addr);
        else
                ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
        return ret;
}

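/* Snapshot the programs currently attached to @tr into a bpf_tramp_progs
 * array, one slot per attach kind, and report the overall count via @total.
 * The caller must free the returned array with kfree().
 */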
static struct bpf_tramp_progs *
bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total)
{
        const struct bpf_prog_aux *aux;
        struct bpf_tramp_progs *tprogs;
        struct bpf_prog **progs;
        int kind;

        *total = 0;
        tprogs = kcalloc(BPF_TRAMP_MAX, sizeof(*tprogs), GFP_KERNEL);
        if (!tprogs)
                return ERR_PTR(-ENOMEM);

        for (kind = 0; kind < BPF_TRAMP_MAX; kind++) {
                tprogs[kind].nr_progs = tr->progs_cnt[kind];
                *total += tr->progs_cnt[kind];
                progs = tprogs[kind].progs;

                hlist_for_each_entry(aux, &tr->progs_hlist[kind], tramp_hlist)
                        *progs++ = aux->prog;
        }
        return tprogs;
}

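/* Last step of image teardown: runs in process context from a workqueue. */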
static void __bpf_tramp_image_put_deferred(struct work_struct *work)
{
        struct bpf_tramp_image *im;

        im = container_of(work, struct bpf_tramp_image, work);
        bpf_image_ksym_del(&im->ksym);
        trace_android_vh_set_memory_nx((unsigned long)im->image, 1);
        bpf_jit_free_exec(im->image);
        bpf_jit_uncharge_modmem(1);
        percpu_ref_exit(&im->pcref);
        kfree_rcu(im, rcu);
}

/* callback, fexit step 3 or fentry step 2 */
static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu)
{
        struct bpf_tramp_image *im;

        im = container_of(rcu, struct bpf_tramp_image, rcu);
        INIT_WORK(&im->work, __bpf_tramp_image_put_deferred);
        schedule_work(&im->work);
}

/* callback, fexit step 2. Called after percpu_ref_kill confirms. */
static void __bpf_tramp_image_release(struct percpu_ref *pcref)
{
        struct bpf_tramp_image *im;

        im = container_of(pcref, struct bpf_tramp_image, pcref);
        call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

/* callback, fexit or fentry step 1 */
static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu)
{
        struct bpf_tramp_image *im;

        im = container_of(rcu, struct bpf_tramp_image, rcu);
        if (im->ip_after_call)
                /* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
                percpu_ref_kill(&im->pcref);
        else
                /* the case of fentry trampoline */
                call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
}

static void bpf_tramp_image_put(struct bpf_tramp_image *im)
{
        /* The trampoline image that calls the original function is using:
         * rcu_read_lock_trace to protect sleepable bpf progs
         * rcu_read_lock to protect normal bpf progs
         * percpu_ref to protect trampoline itself
         * rcu tasks to protect trampoline asm not covered by percpu_ref
         * (which are a few asm insns before __bpf_tramp_enter and
         * after __bpf_tramp_exit)
         *
         * The trampoline is unreachable before bpf_tramp_image_put().
         *
         * First, patch the trampoline to avoid calling into fexit progs.
         * The progs will be freed even if the original function is still
         * executing or sleeping.
         * In case of CONFIG_PREEMPT=y use call_rcu_tasks() to wait for the
         * first few asm instructions to execute and call into
         * __bpf_tramp_enter->percpu_ref_get.
         * Then use percpu_ref_kill to wait for the trampoline and the original
         * function to finish.
         * Then use call_rcu_tasks() to make sure the few asm insns in
         * the trampoline epilogue are done as well.
         *
         * In the !PREEMPT case a task that got interrupted in the first asm
         * insns won't go through an RCU quiescent state which the
         * percpu_ref_kill will be waiting for. Hence the first
         * call_rcu_tasks() is not necessary.
         */
        if (im->ip_after_call) {
                int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
                                             NULL, im->ip_epilogue);
                WARN_ON(err);
                if (IS_ENABLED(CONFIG_PREEMPTION))
                        call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
                else
                        percpu_ref_kill(&im->pcref);
                return;
        }

        /* The trampoline without fexit and fmod_ret progs doesn't call the
         * original function and doesn't use percpu_ref.
         * Use call_rcu_tasks_trace() to wait for sleepable progs to finish.
         * Then use call_rcu_tasks() to wait for the rest of trampoline asm
         * and normal progs.
         */
        call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
}

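/* Allocate a trampoline image: charge JIT memory accounting, grab an
 * executable page, set up the percpu ref that tracks the image's liveness
 * and publish the image under the kallsyms name bpf_trampoline_<key>_<idx>.
 */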
static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, u32 idx)
{
        struct bpf_tramp_image *im;
        struct bpf_ksym *ksym;
        void *image;
        int err = -ENOMEM;

        im = kzalloc(sizeof(*im), GFP_KERNEL);
        if (!im)
                goto out;

        err = bpf_jit_charge_modmem(1);
        if (err)
                goto out_free_im;

        err = -ENOMEM;
        im->image = image = bpf_jit_alloc_exec_page();
        if (!image)
                goto out_uncharge;

        err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
        if (err)
                goto out_free_image;

        ksym = &im->ksym;
        INIT_LIST_HEAD_RCU(&ksym->lnode);
        snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu_%u", key, idx);
        bpf_image_ksym_add(image, ksym);
        return im;

out_free_image:
        bpf_jit_free_exec(im->image);
out_uncharge:
        bpf_jit_uncharge_modmem(1);
out_free_im:
        kfree(im);
out:
        return ERR_PTR(err);
}

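/* Regenerate the trampoline for the current set of attached programs and
 * switch the target's fentry site over to the new image. When no programs
 * are left, the fentry patch is removed and the old image released.
 */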
static int bpf_trampoline_update(struct bpf_trampoline *tr)
{
        struct bpf_tramp_image *im;
        struct bpf_tramp_progs *tprogs;
        u32 flags = BPF_TRAMP_F_RESTORE_REGS;
        int err, total;

        tprogs = bpf_trampoline_get_progs(tr, &total);
        if (IS_ERR(tprogs))
                return PTR_ERR(tprogs);

        if (total == 0) {
                err = unregister_fentry(tr, tr->cur_image->image);
                bpf_tramp_image_put(tr->cur_image);
                tr->cur_image = NULL;
                tr->selector = 0;
                goto out;
        }

        im = bpf_tramp_image_alloc(tr->key, tr->selector);
        if (IS_ERR(im)) {
                err = PTR_ERR(im);
                goto out;
        }

        if (tprogs[BPF_TRAMP_FEXIT].nr_progs ||
            tprogs[BPF_TRAMP_MODIFY_RETURN].nr_progs)
                flags = BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME;

        err = arch_prepare_bpf_trampoline(im, im->image, im->image + PAGE_SIZE,
                                          &tr->func.model, flags, tprogs,
                                          tr->func.addr);
        if (err < 0)
                goto out;

        WARN_ON(tr->cur_image && tr->selector == 0);
        WARN_ON(!tr->cur_image && tr->selector);
        if (tr->cur_image)
                /* progs already running at this address */
                err = modify_fentry(tr, tr->cur_image->image, im->image);
        else
                /* first time registering */
                err = register_fentry(tr, im->image);
        if (err)
                goto out;
        if (tr->cur_image)
                bpf_tramp_image_put(tr->cur_image);
        tr->cur_image = im;
        tr->selector++;
out:
        kfree(tprogs);
        return err;
}

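/* Map the program's expected attach type onto a trampoline slot. LSM hooks
 * whose target returns void can only be handled as fexit; any other attach
 * type falls back to BPF_TRAMP_REPLACE (extension/freplace programs).
 */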
static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog)
{
        switch (prog->expected_attach_type) {
        case BPF_TRACE_FENTRY:
                return BPF_TRAMP_FENTRY;
        case BPF_MODIFY_RETURN:
                return BPF_TRAMP_MODIFY_RETURN;
        case BPF_TRACE_FEXIT:
                return BPF_TRAMP_FEXIT;
        case BPF_LSM_MAC:
                if (!prog->aux->attach_func_proto->type)
                        /* The function returns void, we cannot modify its
                         * return value.
                         */
                        return BPF_TRAMP_FEXIT;
                else
                        return BPF_TRAMP_MODIFY_RETURN;
        default:
                return BPF_TRAMP_REPLACE;
        }
}

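/* Attach @prog to @tr under tr->mutex. An extension (freplace) program
 * replaces the target function with a direct jump to the program and is
 * mutually exclusive with fentry/fexit/fmod_ret attachments; other programs
 * are added to the per-kind list and the trampoline image is regenerated.
 */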
int bpf_trampoline_link_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
        enum bpf_tramp_prog_type kind;
        int err = 0;
        int cnt = 0, i;

        kind = bpf_attach_type_to_tramp(prog);
        mutex_lock(&tr->mutex);
        if (tr->extension_prog) {
                /* cannot attach fentry/fexit if extension prog is attached.
                 * cannot overwrite extension prog either.
                 */
                err = -EBUSY;
                goto out;
        }

        for (i = 0; i < BPF_TRAMP_MAX; i++)
                cnt += tr->progs_cnt[i];

        if (kind == BPF_TRAMP_REPLACE) {
                /* Cannot attach extension if fentry/fexit are in use. */
                if (cnt) {
                        err = -EBUSY;
                        goto out;
                }
                tr->extension_prog = prog;
                err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
                                         prog->bpf_func);
                goto out;
        }
        if (cnt >= BPF_MAX_TRAMP_PROGS) {
                err = -E2BIG;
                goto out;
        }
        if (!hlist_unhashed(&prog->aux->tramp_hlist)) {
                /* prog already linked */
                err = -EBUSY;
                goto out;
        }
        hlist_add_head(&prog->aux->tramp_hlist, &tr->progs_hlist[kind]);
        tr->progs_cnt[kind]++;
        err = bpf_trampoline_update(tr);
        if (err) {
                hlist_del(&prog->aux->tramp_hlist);
                tr->progs_cnt[kind]--;
        }
out:
        mutex_unlock(&tr->mutex);
        return err;
}

/* bpf_trampoline_unlink_prog() should never fail. */
int bpf_trampoline_unlink_prog(struct bpf_prog *prog, struct bpf_trampoline *tr)
{
        enum bpf_tramp_prog_type kind;
        int err;

        kind = bpf_attach_type_to_tramp(prog);
        mutex_lock(&tr->mutex);
        if (kind == BPF_TRAMP_REPLACE) {
                WARN_ON_ONCE(!tr->extension_prog);
                err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
                                         tr->extension_prog->bpf_func, NULL);
                tr->extension_prog = NULL;
                goto out;
        }
        hlist_del(&prog->aux->tramp_hlist);
        tr->progs_cnt[kind]--;
        err = bpf_trampoline_update(tr);
out:
        mutex_unlock(&tr->mutex);
        return err;
}

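/* Look up (or create) the trampoline for @key and, on first use, record the
 * target's function model and address from @tgt_info.
 */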
struct bpf_trampoline *bpf_trampoline_get(u64 key,
                                          struct bpf_attach_target_info *tgt_info)
{
        struct bpf_trampoline *tr;

        tr = bpf_trampoline_lookup(key);
        if (!tr)
                return NULL;

        mutex_lock(&tr->mutex);
        if (tr->func.addr)
                goto out;

        memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel));
        tr->func.addr = (void *)tgt_info->tgt_addr;
out:
        mutex_unlock(&tr->mutex);
        return tr;
}

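/* Drop a reference to @tr. On the final put the trampoline must have no
 * programs attached; it is then removed from the hash table and freed.
 */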
void bpf_trampoline_put(struct bpf_trampoline *tr)
{
        int i;

        if (!tr)
                return;
        mutex_lock(&trampoline_mutex);
        if (!refcount_dec_and_test(&tr->refcnt))
                goto out;
        WARN_ON_ONCE(mutex_is_locked(&tr->mutex));

        for (i = 0; i < BPF_TRAMP_MAX; i++)
                if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i])))
                        goto out;

        /* This code will be executed even when the last bpf_tramp_image
         * is alive. All progs are detached from the trampoline and the
         * trampoline image is patched with jmp into epilogue to skip
         * fexit progs. The fentry-only trampoline will be freed via
         * multiple rcu callbacks.
         */
        hlist_del(&tr->hlist);
        kfree(tr);
out:
        mutex_unlock(&trampoline_mutex);
}

/* The logic is similar to BPF_PROG_RUN, but with an explicit
 * rcu_read_lock() and migrate_disable() which are required
 * for the trampoline. The macro is split into
 * call __bpf_prog_enter
 * call prog->bpf_func
 * call __bpf_prog_exit
 */
u64 notrace __bpf_prog_enter(void)
        __acquires(RCU)
{
        u64 start = 0;

        rcu_read_lock();
        migrate_disable();
        if (static_branch_unlikely(&bpf_stats_enabled_key))
                start = sched_clock();
        return start;
}

void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start)
        __releases(RCU)
{
        struct bpf_prog_stats *stats;

        if (static_branch_unlikely(&bpf_stats_enabled_key) &&
            /* static_key could be enabled in __bpf_prog_enter
             * and disabled in __bpf_prog_exit.
             * And vice versa.
             * Hence check that 'start' is not zero.
             */
            start) {
                stats = this_cpu_ptr(prog->aux->stats);
                u64_stats_update_begin(&stats->syncp);
                stats->cnt++;
                stats->nsecs += sched_clock() - start;
                u64_stats_update_end(&stats->syncp);
        }
        migrate_enable();
        rcu_read_unlock();
}

void notrace __bpf_prog_enter_sleepable(void)
{
        rcu_read_lock_trace();
        might_fault();
}

void notrace __bpf_prog_exit_sleepable(void)
{
        rcu_read_unlock_trace();
}

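/* Called from the generated trampoline asm: pin the image across the call
 * into the original function. The matching __bpf_tramp_exit() drops the
 * reference taken here.
 */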
void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr)
{
        percpu_ref_get(&tr->pcref);
}

void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr)
{
        percpu_ref_put(&tr->pcref);
}

int __weak
arch_prepare_bpf_trampoline(struct bpf_tramp_image *tr, void *image, void *image_end,
                            const struct btf_func_model *m, u32 flags,
                            struct bpf_tramp_progs *tprogs,
                            void *orig_call)
{
        return -ENOTSUPP;
}

static int __init init_trampolines(void)
{
        int i;

        for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++)
                INIT_HLIST_HEAD(&trampoline_table[i]);
        return 0;
}
late_initcall(init_trampolines);