/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
#include <linux/perf_event.h>

#define ARRAY_CREATE_FLAG_MASK \
	(BPF_F_RDONLY | BPF_F_WRONLY)

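/* Free the per-cpu storage of every element; cond_resched() keeps huge
 * maps from monopolizing the CPU during teardown.
 */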
static void bpf_array_free_percpu(struct bpf_array *array)
{
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		free_percpu(array->pptrs[i]);
		cond_resched();
	}
}

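/* Allocate 8-byte aligned per-cpu storage for every element; on failure,
 * free whatever was already allocated and report -ENOMEM.
 */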
static int bpf_array_alloc_percpu(struct bpf_array *array)
{
	void __percpu *ptr;
	int i;

	for (i = 0; i < array->map.max_entries; i++) {
		ptr = __alloc_percpu_gfp(array->elem_size, 8,
					 GFP_USER | __GFP_NOWARN);
		if (!ptr) {
			bpf_array_free_percpu(array);
			return -ENOMEM;
		}
		array->pptrs[i] = ptr;
		cond_resched();
	}

	return 0;
}

/* Called from syscall */
static struct bpf_map *array_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY;
	u32 elem_size, index_mask, max_entries;
	bool unpriv = !capable(CAP_SYS_ADMIN);
	u64 cost, array_size, mask64;
	struct bpf_array *array;
	int ret;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size == 0 ||
	    attr->map_flags & ~ARRAY_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements.
		 */
		return ERR_PTR(-E2BIG);

	elem_size = round_up(attr->value_size, 8);

	max_entries = attr->max_entries;

	/* On 32 bit archs roundup_pow_of_two() with max_entries that has
	 * upper most bit set in u32 space is undefined behavior due to
	 * resulting 1U << 32, so do it manually here in u64 space.
	 */
	mask64 = fls_long(max_entries - 1);
	mask64 = 1ULL << mask64;
	mask64 -= 1;

	index_mask = mask64;
	if (unpriv) {
		/* round up array size to nearest power of 2,
		 * since cpu will speculate within index_mask limits
		 */
		max_entries = index_mask + 1;
		/* Check for overflows. */
		if (max_entries < attr->max_entries)
			return ERR_PTR(-E2BIG);
	}

	array_size = sizeof(*array);
	if (percpu)
		array_size += (u64) max_entries * sizeof(void *);
	else
		array_size += (u64) max_entries * elem_size;

	/* make sure there is no u32 overflow later in round_up() */
	cost = array_size;
	if (cost >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);
	if (percpu) {
		cost += (u64)attr->max_entries * elem_size * num_possible_cpus();
		if (cost >= U32_MAX - PAGE_SIZE)
			return ERR_PTR(-ENOMEM);
	}
	cost = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	ret = bpf_map_precharge_memlock(cost);
	if (ret < 0)
		return ERR_PTR(ret);

	/* allocate all map elements and zero-initialize them */
	array = bpf_map_area_alloc(array_size);
	if (!array)
		return ERR_PTR(-ENOMEM);
	array->index_mask = index_mask;
	array->map.unpriv_array = unpriv;

	/* copy mandatory map attributes */
	array->map.map_type = attr->map_type;
	array->map.key_size = attr->key_size;
	array->map.value_size = attr->value_size;
	array->map.max_entries = attr->max_entries;
	array->map.map_flags = attr->map_flags;
	array->map.pages = cost;
	array->elem_size = elem_size;

	if (percpu &&
	    (elem_size > PCPU_MIN_UNIT_SIZE ||
	     bpf_array_alloc_percpu(array))) {
		bpf_map_area_free(array);
		return ERR_PTR(-ENOMEM);
	}

	return &array->map;
}

/* Called from syscall or from eBPF program */
static void *array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return array->value + array->elem_size * (index & array->index_mask);
}

/* Called from eBPF program */
static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(index >= array->map.max_entries))
		return NULL;

	return this_cpu_ptr(array->pptrs[index & array->index_mask]);
}

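/* Called from syscall: copy the per-cpu values of one element into a
 * flat buffer, one round_up(value_size, 8) chunk per possible CPU.
 */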
int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(index >= array->map.max_entries))
		return -ENOENT;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall */
static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	u32 *next = (u32 *)next_key;

	if (index >= array->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (index == array->map.max_entries - 1)
		return -ENOENT;

	*next = index + 1;
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_update_elem(struct bpf_map *map, void *key, void *value,
				 u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		memcpy(this_cpu_ptr(array->pptrs[index & array->index_mask]),
		       value, map->value_size);
	else
		memcpy(array->value +
		       array->elem_size * (index & array->index_mask),
		       value, map->value_size);
	return 0;
}

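/* Called from syscall: update the per-cpu values of one element from a
 * flat buffer laid out as in bpf_percpu_array_copy() above.
 */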
int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value,
			    u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	u32 index = *(u32 *)key;
	void __percpu *pptr;
	int cpu, off = 0;
	u32 size;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	if (unlikely(index >= array->map.max_entries))
		/* all elements were pre-allocated, cannot insert a new one */
		return -E2BIG;

	if (unlikely(map_flags == BPF_NOEXIST))
		/* all elements already exist */
		return -EEXIST;

	/* the user space will provide round_up(value_size, 8) bytes that
	 * will be copied into per-cpu area. bpf programs can only access
	 * value_size of it. During lookup the same extra bytes will be
	 * returned or zeros which were zero-filled by percpu_alloc,
	 * so no kernel data leaks possible
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	pptr = array->pptrs[index & array->index_mask];
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size);
		off += size;
	}
	rcu_read_unlock();
	return 0;
}

/* Called from syscall or from eBPF program */
static int array_map_delete_elem(struct bpf_map *map, void *key)
{
	return -EINVAL;
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding programs to complete
	 * and free the array
	 */
	synchronize_rcu();

	if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
		bpf_array_free_percpu(array);

	bpf_map_area_free(array);
}

static const struct bpf_map_ops array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list array_type __read_mostly = {
	.ops = &array_ops,
	.type = BPF_MAP_TYPE_ARRAY,
};

static const struct bpf_map_ops percpu_array_ops = {
	.map_alloc = array_map_alloc,
	.map_free = array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = percpu_array_map_lookup_elem,
	.map_update_elem = array_map_update_elem,
	.map_delete_elem = array_map_delete_elem,
};

static struct bpf_map_type_list percpu_array_type __read_mostly = {
	.ops = &percpu_array_ops,
	.type = BPF_MAP_TYPE_PERCPU_ARRAY,
};

static int __init register_array_map(void)
{
	bpf_register_map_type(&array_type);
	bpf_register_map_type(&percpu_array_type);
	return 0;
}
late_initcall(register_array_map);

static struct bpf_map *fd_array_map_alloc(union bpf_attr *attr)
{
	/* only file descriptors can be stored in this type of map */
	if (attr->value_size != sizeof(u32))
		return ERR_PTR(-EINVAL);
	return array_map_alloc(attr);
}

static void fd_array_map_free(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	synchronize_rcu();

	/* make sure it's empty */
	for (i = 0; i < array->map.max_entries; i++)
		BUG_ON(array->ptrs[i] != NULL);

	bpf_map_area_free(array);
}

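/* Lookups are not supported on fd arrays; their contents can only be
 * used through the map-type specific helpers.
 */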
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
{
	return NULL;
}

/* only called from syscall */
int bpf_fd_array_map_update_elem(struct bpf_map *map, struct file *map_file,
				 void *key, void *value, u64 map_flags)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *new_ptr, *old_ptr;
	u32 index = *(u32 *)key, ufd;

	if (map_flags != BPF_ANY)
		return -EINVAL;

	if (index >= array->map.max_entries)
		return -E2BIG;

	ufd = *(u32 *)value;
	new_ptr = map->ops->map_fd_get_ptr(map, map_file, ufd);
	if (IS_ERR(new_ptr))
		return PTR_ERR(new_ptr);

	old_ptr = xchg(array->ptrs + index, new_ptr);
	if (old_ptr)
		map->ops->map_fd_put_ptr(old_ptr);

	return 0;
}

static int fd_array_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	void *old_ptr;
	u32 index = *(u32 *)key;

	if (index >= array->map.max_entries)
		return -E2BIG;

	old_ptr = xchg(array->ptrs + index, NULL);
	if (old_ptr) {
		map->ops->map_fd_put_ptr(old_ptr);
		return 0;
	} else {
		return -ENOENT;
	}
}

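/* Take a reference on the bpf_prog behind @fd and make sure it may be
 * used from this prog array (see bpf_prog_array_compatible()).
 */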
static void *prog_fd_array_get_ptr(struct bpf_map *map,
				   struct file *map_file, int fd)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_prog *prog = bpf_prog_get(fd);

	if (IS_ERR(prog))
		return prog;

	if (!bpf_prog_array_compatible(array, prog)) {
		bpf_prog_put(prog);
		return ERR_PTR(-EINVAL);
	}

	return prog;
}

static void prog_fd_array_put_ptr(void *ptr)
{
	bpf_prog_put(ptr);
}

/* decrement refcnt of all bpf_progs that are stored in this map */
void bpf_fd_array_map_clear(struct bpf_map *map)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	int i;

	for (i = 0; i < array->map.max_entries; i++)
		fd_array_map_delete_elem(map, &i);
}

static const struct bpf_map_ops prog_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = prog_fd_array_get_ptr,
	.map_fd_put_ptr = prog_fd_array_put_ptr,
};

static struct bpf_map_type_list prog_array_type __read_mostly = {
	.ops = &prog_array_ops,
	.type = BPF_MAP_TYPE_PROG_ARRAY,
};

static int __init register_prog_array_map(void)
{
	bpf_register_map_type(&prog_array_type);
	return 0;
}
late_initcall(register_prog_array_map);

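/* Wrap the perf event file and the owning map file in an entry; the
 * reference on perf_file is dropped later in __bpf_event_entry_free().
 */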
static struct bpf_event_entry *bpf_event_entry_gen(struct file *perf_file,
						   struct file *map_file)
{
	struct bpf_event_entry *ee;

	ee = kzalloc(sizeof(*ee), GFP_ATOMIC);
	if (ee) {
		ee->event = perf_file->private_data;
		ee->perf_file = perf_file;
		ee->map_file = map_file;
	}

	return ee;
}

static void __bpf_event_entry_free(struct rcu_head *rcu)
{
	struct bpf_event_entry *ee;

	ee = container_of(rcu, struct bpf_event_entry, rcu);
	fput(ee->perf_file);
	kfree(ee);
}

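/* Defer the fput()/kfree() until after an RCU grace period so that
 * readers still dereferencing the entry under rcu_read_lock() are done.
 */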
static void bpf_event_entry_free_rcu(struct bpf_event_entry *ee)
{
	call_rcu(&ee->rcu, __bpf_event_entry_free);
}

static void *perf_event_fd_array_get_ptr(struct bpf_map *map,
					 struct file *map_file, int fd)
{
	const struct perf_event_attr *attr;
	struct bpf_event_entry *ee;
	struct perf_event *event;
	struct file *perf_file;

	perf_file = perf_event_get(fd);
	if (IS_ERR(perf_file))
		return perf_file;

	event = perf_file->private_data;
	ee = ERR_PTR(-EINVAL);

	attr = perf_event_attrs(event);
	if (IS_ERR(attr) || attr->inherit)
		goto err_out;

	switch (attr->type) {
	case PERF_TYPE_SOFTWARE:
		if (attr->config != PERF_COUNT_SW_BPF_OUTPUT)
			goto err_out;
		/* fall-through */
	case PERF_TYPE_RAW:
	case PERF_TYPE_HARDWARE:
		ee = bpf_event_entry_gen(perf_file, map_file);
		if (ee)
			return ee;
		ee = ERR_PTR(-ENOMEM);
		/* fall-through */
	default:
		break;
	}

err_out:
	fput(perf_file);
	return ee;
}

static void perf_event_fd_array_put_ptr(void *ptr)
{
	bpf_event_entry_free_rcu(ptr);
}

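/* Called on release of the map file: drop only the entries that were
 * installed through this particular map file.
 */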
static void perf_event_fd_array_release(struct bpf_map *map,
					struct file *map_file)
{
	struct bpf_array *array = container_of(map, struct bpf_array, map);
	struct bpf_event_entry *ee;
	int i;

	rcu_read_lock();
	for (i = 0; i < array->map.max_entries; i++) {
		ee = READ_ONCE(array->ptrs[i]);
		if (ee && ee->map_file == map_file)
			fd_array_map_delete_elem(map, &i);
	}
	rcu_read_unlock();
}

static const struct bpf_map_ops perf_event_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = fd_array_map_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = perf_event_fd_array_get_ptr,
	.map_fd_put_ptr = perf_event_fd_array_put_ptr,
	.map_release = perf_event_fd_array_release,
};

static struct bpf_map_type_list perf_event_array_type __read_mostly = {
	.ops = &perf_event_array_ops,
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
};

static int __init register_perf_event_array_map(void)
{
	bpf_register_map_type(&perf_event_array_type);
	return 0;
}
late_initcall(register_perf_event_array_map);

#ifdef CONFIG_CGROUPS
static void *cgroup_fd_array_get_ptr(struct bpf_map *map,
				     struct file *map_file /* not used */,
				     int fd)
{
	return cgroup_get_from_fd(fd);
}

static void cgroup_fd_array_put_ptr(void *ptr)
{
	/* cgroup_put() frees cgrp after an RCU grace period */
	cgroup_put(ptr);
}

static void cgroup_fd_array_free(struct bpf_map *map)
{
	bpf_fd_array_map_clear(map);
	fd_array_map_free(map);
}

static const struct bpf_map_ops cgroup_array_ops = {
	.map_alloc = fd_array_map_alloc,
	.map_free = cgroup_fd_array_free,
	.map_get_next_key = array_map_get_next_key,
	.map_lookup_elem = fd_array_map_lookup_elem,
	.map_delete_elem = fd_array_map_delete_elem,
	.map_fd_get_ptr = cgroup_fd_array_get_ptr,
	.map_fd_put_ptr = cgroup_fd_array_put_ptr,
};

static struct bpf_map_type_list cgroup_array_type __read_mostly = {
	.ops = &cgroup_array_ops,
	.type = BPF_MAP_TYPE_CGROUP_ARRAY,
};

static int __init register_cgroup_array_map(void)
{
	bpf_register_map_type(&cgroup_array_type);
	return 0;
}
late_initcall(register_cgroup_array_map);
#endif