/*
 * eBPF kernel space program part
 *
 * Toy eBPF program for demonstration purposes, some parts derived from
 * kernel tree's samples/bpf/sockex2_kern.c example.
 *
 * More background on eBPF, kernel tree: Documentation/networking/filter.txt
 *
 * Note, this file is rather large; most real classifiers and actions are
 * likely much smaller, written to accomplish one specific use case and
 * tailored for high performance. For performance reasons, you might also
 * have the action logic already merged into the classifier itself.
 *
 * In order to show various features it serves as a bigger programming
 * example, which you should feel free to rip apart and experiment with.
 *
 * Compilation, configuration example:
 *
 *  Note: as long as the BPF backend in LLVM is still experimental,
 *  you need to build LLVM with --enable-experimental-targets=BPF.
 *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
 *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
 *
 *  In case you need to sync kernel headers, go to your kernel source tree:
 *  # make headers_install INSTALL_HDR_PATH=/usr/
 *
 *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
 *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
 *  $ objdump -h bpf.o
 *  [...]
 *  3 classifier   000007f8  0000000000000000  0000000000000000  00000040  2**3
 *                 CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  4 action-mark  00000088  0000000000000000  0000000000000000  00000838  2**3
 *                 CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  5 action-rand  00000098  0000000000000000  0000000000000000  000008c0  2**3
 *                 CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *  6 maps         00000030  0000000000000000  0000000000000000  00000958  2**2
 *                 CONTENTS, ALLOC, LOAD, DATA
 *  7 license      00000004  0000000000000000  0000000000000000  00000988  2**0
 *                 CONTENTS, ALLOC, LOAD, DATA
 *  [...]
 *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
 *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
 *  # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                          action bpf obj bpf.o sec action-mark \
 *                          action bpf obj bpf.o sec action-rand ok
 *  # tc filter show dev em1
 *  filter parent 1: protocol all pref 49152 bpf
 *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
 *	action order 1: bpf bpf.o:[action-mark] default-action pipe
 *	index 52 ref 1 bind 1
 *
 *	action order 2: bpf bpf.o:[action-rand] default-action pipe
 *	index 53 ref 1 bind 1
 *
 *	action order 3: gact action pass
 *	 random type none pass val 0
 *	 index 38 ref 1 bind 1
 *
 * The same program can also be installed on the ingress side (as opposed
 * to the above egress configuration), e.g.:
 *
 *  # tc qdisc add dev em1 handle ffff: ingress
 *  # tc filter add dev em1 parent ffff: bpf obj ...
 *
 * Notes on the BPF agent:
 *
 * In the above example, the bpf_agent creates the unix domain socket
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
 *
 *  # tc exec bpf imp /tmp/bpf-uds
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                          action bpf obj bpf.o sec action-mark \
 *                          action bpf obj bpf.o sec action-rand ok
 *  sh-4.2# (shell spawned from tc exec)
 *  sh-4.2# bpf_agent
 *  [...]
 *
 * This will read out the fds over the environment and produce the same
 * data dump as below. This has the advantage that the spawned shell owns
 * the fds, so if the agent is restarted, it can reattach to the same fds;
 * also, various programs can easily read/modify the data simultaneously
 * from the user space side.
 *
 * If the shell is unnecessary, the agent can also just be spawned directly
 * via tc exec:
 *
 *  # tc exec bpf imp /tmp/bpf-uds run bpf_agent
 *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *                          action bpf obj bpf.o sec action-mark \
 *                          action bpf obj bpf.o sec action-rand ok
 *
 * BPF agent example output:
 *
 * ver: 1
 * obj: bpf.o
 * dev: 64770
 * ino: 6045133
 * maps: 3
 * map0:
 *  `- fd: 4
 *   | serial: 1
 *   | type: 1
 *   | max elem: 256
 *   | size key: 1
 *   ` size val: 16
 * map1:
 *  `- fd: 5
 *   | serial: 2
 *   | type: 1
 *   | max elem: 1024
 *   | size key: 4
 *   ` size val: 16
 * map2:
 *  `- fd: 6
 *   | serial: 3
 *   | type: 2
 *   | max elem: 64
 *   | size key: 4
 *   ` size val: 8
 * data, period: 5sec
 *  `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
 * [...]
 *
 * This now means that the below classifier and action pipeline has been
 * loaded as eBPF bytecode into the kernel, the kernel has verified that
 * the execution of the bytecode is "safe", and it has JITed the programs
 * afterwards, so that upon invocation they run at native speed. tc has
 * transferred all map file descriptors to the bpf_agent via IPC, and
 * even after tc exits, the agent can read out or modify all map data.
 *
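 * The descriptor transfer itself is ordinary SCM_RIGHTS fd passing over
 * the unix domain socket. A minimal receive-side sketch (assumed names,
 * error handling omitted; bpf_agent.c is the real reference):
 *
 *	char ctl[CMSG_SPACE(sizeof(int) * 3)];
 *	struct msghdr msg = {
 *		.msg_control	= ctl,
 *		.msg_controllen	= sizeof(ctl),
 *	};
 *	struct cmsghdr *cmsg;
 *	int map_fds[3];
 *
 *	recvmsg(sock, &msg, 0);
 *	cmsg = CMSG_FIRSTHDR(&msg);
 *	if (cmsg && cmsg->cmsg_level == SOL_SOCKET &&
 *	    cmsg->cmsg_type == SCM_RIGHTS)
 *		memcpy(map_fds, CMSG_DATA(cmsg), sizeof(map_fds));
 *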
 * Note that the export to the uds is done only once in the classifier
 * and not in the action. It's enough to export the (here) shared
 * descriptors once.
 *
 * If you need to disassemble the generated JIT image (enable the JIT
 * debug output via `echo 2 > /proc/sys/net/core/bpf_jit_enable`), the
 * kernel tree has a small helper under tools/net/; you can invoke e.g.
 * `bpf_jit_disasm -o`.
 *
 * Please find further comments in the code below.
 *
 * -- Happy eBPF hacking! ;)
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <asm/types.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_tunnel.h>
#include <linux/filter.h>
#include <linux/bpf.h>

/* Common, shared definitions with ebpf_agent.c. */
#include "bpf_shared.h"
/* BPF helper functions for our example. */
#include "../../include/bpf_api.h"

/* Could be defined here as well, or included from the header. */
#define TC_ACT_UNSPEC		(-1)
#define TC_ACT_OK		0
#define TC_ACT_RECLASSIFY	1
#define TC_ACT_SHOT		2
#define TC_ACT_PIPE		3
#define TC_ACT_STOLEN		4
#define TC_ACT_QUEUED		5
#define TC_ACT_REPEAT		6

/* Other, misc stuff. */
#define IP_MF			0x2000
#define IP_OFFSET		0x1FFF

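/* The count_tuple and count_queue value types used by the maps below
 * come from bpf_shared.h. For reference, a sketch of their layout,
 * matching the value sizes from the agent dump above (16 bytes each,
 * i.e. two 8 byte counters); the shared header is authoritative:
 *
 *	struct count_tuple { long packets; long bytes;    };
 *	struct count_queue { long total;   long mismatch; };
 */
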
/* eBPF map definitions, all placed in section "maps". */
struct bpf_elf_map __section("maps") map_proto = {
	.type		= BPF_MAP_TYPE_HASH,
	.id		= BPF_MAP_ID_PROTO,
	.size_key	= sizeof(uint8_t),
	.size_value	= sizeof(struct count_tuple),
	.max_elem	= 256,
};

struct bpf_elf_map __section("maps") map_queue = {
	.type		= BPF_MAP_TYPE_HASH,
	.id		= BPF_MAP_ID_QUEUE,
	.size_key	= sizeof(uint32_t),
	.size_value	= sizeof(struct count_queue),
	.max_elem	= 1024,
};

struct bpf_elf_map __section("maps") map_drops = {
	.type		= BPF_MAP_TYPE_ARRAY,
	.id		= BPF_MAP_ID_DROPS,
	.size_key	= sizeof(uint32_t),
	.size_value	= sizeof(long),
	.max_elem	= 64,
};

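/* The .id members above show up as "serial" in the agent dump; they
 * presumably let user space tell the transferred map fds apart,
 * independent of the order in which they arrive.
 */
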
/* Helper functions and definitions for the flow dissector used by the
 * example classifier. This resembles the kernel's flow dissector to
 * some extent and is just used as an example to show what's possible
 * with eBPF.
 */
struct sockaddr;

struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

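/* Dissected flow state: src/dst hold the IPv4 addresses, or a folded
 * hash of the addresses in the IPv6 case; th_off is the transport
 * header offset, and ports the raw source/destination port word found
 * at that offset.
 */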
struct flow_keys {
	__u32 src;
	__u32 dst;
	union {
		__u32 ports;
		__u16 port16[2];
	};
	__s32 th_off;
	__u8 ip_proto;
};

static inline int flow_ports_offset(__u8 ip_proto)
{
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
	default:
		return 0;
	case IPPROTO_AH:
		return 4;
	}
}

static inline bool flow_is_frag(struct __sk_buff *skb, int nh_off)
{
	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
		  (IP_MF | IP_OFFSET));
}

static inline int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	__u8 ip_ver_len;

	if (unlikely(flow_is_frag(skb, nh_off)))
		*ip_proto = 0;
	else
		*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
							     protocol));
	if (*ip_proto != IPPROTO_GRE) {
		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
	if (likely(ip_ver_len == 0x45))
		nh_off += 20;
	else
		nh_off += (ip_ver_len & 0xF) << 2;

	return nh_off;
}

static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 w0 = load_word(skb, off);
	__u32 w1 = load_word(skb, off + sizeof(w0));
	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);

	return w0 ^ w1 ^ w2 ^ w3;
}

static inline int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
				  __u8 *ip_proto, struct flow_keys *flow)
{
	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));

	return nh_off + sizeof(struct ipv6hdr);
}

static inline bool flow_dissector(struct __sk_buff *skb,
				  struct flow_keys *flow)
{
	int nh_off = BPF_LL_OFF + ETH_HLEN;
	__be16 proto = skb->protocol;
	__u8 ip_proto;

	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
	if (proto == htons(ETH_P_8021AD)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}
	if (proto == htons(ETH_P_8021Q)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}

	if (likely(proto == htons(ETH_P_IP)))
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
	else if (proto == htons(ETH_P_IPV6))
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
	else
		return false;

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		};

		__u16 gre_flags = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, flags));
		__u16 gre_proto = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, proto));

		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
			break;

		nh_off += 4;
		if (gre_flags & GRE_CSUM)
			nh_off += 4;
		if (gre_flags & GRE_KEY)
			nh_off += 4;
		if (gre_flags & GRE_SEQ)
			nh_off += 4;

		if (gre_proto == ETH_P_8021Q) {
			gre_proto = load_half(skb, nh_off +
					      offsetof(struct vlan_hdr,
						       h_vlan_encapsulated_proto));
			nh_off += sizeof(struct vlan_hdr);
		}
		if (gre_proto == ETH_P_IP)
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		else if (gre_proto == ETH_P_IPV6)
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		else
			return false;
		break;
	}
	case IPPROTO_IPIP:
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		break;
	case IPPROTO_IPV6:
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		break;
	default:
		break;
	}

	nh_off += flow_ports_offset(ip_proto);

	flow->ports = load_word(skb, nh_off);
	flow->th_off = nh_off;
	flow->ip_proto = ip_proto;

	return true;
}

static inline void cls_update_proto_map(const struct __sk_buff *skb,
					const struct flow_keys *flow)
{
	uint8_t proto = flow->ip_proto;
	struct count_tuple *ct, _ct;

	ct = map_lookup_elem(&map_proto, &proto);
	if (likely(ct)) {
		lock_xadd(&ct->packets, 1);
		lock_xadd(&ct->bytes, skb->len);
		return;
	}

	/* No hit yet, we need to create a new entry. */
	_ct.packets = 1;
	_ct.bytes = skb->len;

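	/* Note: two CPUs may race on first-time creation here; with
	 * BPF_ANY the last writer wins, losing at most one packet's
	 * worth of counts, which is fine for a stats example.
	 */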
	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
}

static inline void cls_update_queue_map(const struct __sk_buff *skb)
{
	uint32_t queue = skb->queue_mapping;
	struct count_queue *cq, _cq;
	bool mismatch;

	mismatch = skb->queue_mapping != get_smp_processor_id();

	cq = map_lookup_elem(&map_queue, &queue);
	if (likely(cq)) {
		lock_xadd(&cq->total, 1);
		if (mismatch)
			lock_xadd(&cq->mismatch, 1);
		return;
	}

	/* No hit yet, we need to create a new entry. */
	_cq.total = 1;
	_cq.mismatch = mismatch ? 1 : 0;

	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
}

/* eBPF program definitions, placed in various sections, which can
 * have custom section names. If custom names are in use, it's
 * required to point tc to the correct section, e.g.
 *
 *	tc filter add [...] bpf obj cls.o sec cls-tos [...]
 *
 * in case the program resides in __section("cls-tos").
 *
 * Default section for cls_bpf is: "classifier", for act_bpf is:
 * "action". Naturally, if for example multiple actions are present
 * in the same file, they need to have distinct section names.
 *
 * It is however not required to have multiple programs sharing
 * a file.
 */
__section("classifier")
int cls_main(struct __sk_buff *skb)
{
	struct flow_keys flow;

	if (!flow_dissector(skb, &flow))
		return 0; /* No match in cls_bpf. */

	cls_update_proto_map(skb, &flow);
	cls_update_queue_map(skb);

	return flow.ip_proto;
}

static inline void act_update_drop_map(void)
{
	uint32_t cpu = get_smp_processor_id();
	long *count;

	count = map_lookup_elem(&map_drops, &cpu);
	if (count)
		/* Only this cpu is accessing this element. */
		(*count)++;
}

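/* For illustration, a sketch of the user-space counterpart reading the
 * drop counters through the bpf(2) syscall, given the array map fd that
 * was transferred over the unix domain socket (map_fds[] and nr_cpus
 * are assumed names, not the literal bpf_agent.c code):
 *
 *	__u32 cpu;
 *	long drops;
 *	union bpf_attr attr = {
 *		.map_fd	= map_fds[2],
 *		.key	= (__u64)(unsigned long)&cpu,
 *		.value	= (__u64)(unsigned long)&drops,
 *	};
 *
 *	for (cpu = 0; cpu < nr_cpus; cpu++)
 *		if (!syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)))
 *			printf("cpu%u: %ld\n", cpu, drops);
 */
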
__section("action-mark")
int act_mark_main(struct __sk_buff *skb)
{
	/* You could also mangle skb data here with the helper function
	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively, you could
	 * do that already in the classifier itself as a merged
	 * classifier'n'action combination.
	 */

	if (skb->mark == 0xcafe) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	/* Default configured tc opcode. */
	return TC_ACT_UNSPEC;
}

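/* One way to exercise the mark check above: have netfilter set the mark
 * before the egress qdisc sees the skb (just an example setup, any
 * source of skb->mark works):
 *
 *	# iptables -t mangle -A POSTROUTING -o em1 -j MARK --set-mark 0xcafe
 */
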
__section("action-rand")
int act_rand_main(struct __sk_buff *skb)
{
	/* Sorry, we're near event horizon ... */
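	/* (get_prandom_u32() & 3) == 0 holds for roughly 25% of packets. */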
	if ((get_prandom_u32() & 3) == 0) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	return TC_ACT_UNSPEC;
}

/* Last but not least, the file contains a license. Some future helper
 * functions may only be available with a GPL license.
 */
BPF_LICENSE("GPL");