1 /*
2 * nghttp2 - HTTP/2 C Library
3 *
4 * Copyright (c) 2021 Tatsuhiro Tsujikawa
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25 #include <linux/udp.h>
26 #include <linux/bpf.h>
27
28 #include <bpf/bpf_helpers.h>
29
30 /*
31 * How to compile:
32 *
33 * clang-12 -O2 -Wall -target bpf -g -c reuseport_kern.c -o reuseport_kern.o \
34 * -I/path/to/kernel/include
35 *
36 * See
37 * https://www.kernel.org/doc/Documentation/kbuild/headers_install.txt
38 * how to install kernel header files.
39 */
40
41 /* AES_CBC_decrypt_buffer: https://github.com/kokke/tiny-AES-c
42 License is Public Domain. Commit hash:
43 12e7744b4919e9d55de75b7ab566326a1c8e7a67 */
44
45 #define AES_keyExpSize 176
46
47 struct AES_ctx {
48 __u8 RoundKey[AES_keyExpSize];
49 };
50
51 /* The number of columns comprising a state in AES. This is a constant
52 in AES. Value=4 */
53 #define Nb 4
54
55 #define Nr 10 /* The number of rounds in AES Cipher. */
56
57 /* state - array holding the intermediate results during
58 decryption. */
59 typedef __u8 state_t[4][4];
60
61 /* The lookup-tables are marked const so they can be placed in
62 read-only storage instead of RAM The numbers below can be computed
63 dynamically trading ROM for RAM - This can be useful in (embedded)
64 bootloader applications, where ROM is often limited. */
65 static const __u8 rsbox[256] = {
66 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81,
67 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e,
68 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23,
69 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e, 0x08, 0x2e, 0xa1, 0x66,
70 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25, 0x72,
71 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65,
72 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46,
73 0x57, 0xa7, 0x8d, 0x9d, 0x84, 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a,
74 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca,
75 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91,
76 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6,
77 0x73, 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8,
78 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f,
79 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2,
80 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4, 0x1f, 0xdd, 0xa8,
81 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
82 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93,
83 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb,
84 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61, 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6,
85 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d};
86
87 /* This function adds the round key to state. The round key is added
88 to the state by an XOR function. */
AddRoundKey(__u8 round,state_t * state,const __u8 * RoundKey)89 static void AddRoundKey(__u8 round, state_t *state, const __u8 *RoundKey) {
90 __u8 i, j;
91 for (i = 0; i < 4; ++i) {
92 for (j = 0; j < 4; ++j) {
93 (*state)[i][j] ^= RoundKey[(round * Nb * 4) + (i * Nb) + j];
94 }
95 }
96 }
97
xtime(__u8 x)98 static __u8 xtime(__u8 x) { return ((x << 1) ^ (((x >> 7) & 1) * 0x1b)); }
99
100 #define Multiply(x, y) \
101 (((y & 1) * x) ^ ((y >> 1 & 1) * xtime(x)) ^ \
102 ((y >> 2 & 1) * xtime(xtime(x))) ^ \
103 ((y >> 3 & 1) * xtime(xtime(xtime(x)))) ^ \
104 ((y >> 4 & 1) * xtime(xtime(xtime(xtime(x))))))
105
106 #define getSBoxInvert(num) (rsbox[(num)])
107
108 /* MixColumns function mixes the columns of the state matrix. The
109 method used to multiply may be difficult to understand for the
110 inexperienced. Please use the references to gain more
111 information. */
InvMixColumns(state_t * state)112 static void InvMixColumns(state_t *state) {
113 int i;
114 __u8 a, b, c, d;
115 for (i = 0; i < 4; ++i) {
116 a = (*state)[i][0];
117 b = (*state)[i][1];
118 c = (*state)[i][2];
119 d = (*state)[i][3];
120
121 (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^
122 Multiply(d, 0x09);
123 (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^
124 Multiply(d, 0x0d);
125 (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^
126 Multiply(d, 0x0b);
127 (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^
128 Multiply(d, 0x0e);
129 }
130 }
131
132 extern __u32 LINUX_KERNEL_VERSION __kconfig;
133
134 /* The SubBytes Function Substitutes the values in the state matrix
135 with values in an S-box. */
InvSubBytes(state_t * state)136 static void InvSubBytes(state_t *state) {
137 __u8 i, j;
138 if (LINUX_KERNEL_VERSION < KERNEL_VERSION(5, 10, 0)) {
139 for (i = 0; i < 4; ++i) {
140 for (j = 0; j < 4; ++j) {
141 /* Ubuntu 20.04 LTS kernel 5.4.0 needs this workaround
142 otherwise "math between map_value pointer and register with
143 unbounded min value is not allowed". 5.10.0 is a kernel
144 version that works but it might not be the minimum
145 version. */
146 __u8 k = (*state)[j][i];
147 (*state)[j][i] = k ? getSBoxInvert(k) : getSBoxInvert(0);
148 }
149 }
150 } else {
151 for (i = 0; i < 4; ++i) {
152 for (j = 0; j < 4; ++j) {
153 (*state)[j][i] = getSBoxInvert((*state)[j][i]);
154 }
155 }
156 }
157 }
158
InvShiftRows(state_t * state)159 static void InvShiftRows(state_t *state) {
160 __u8 temp;
161
162 /* Rotate first row 1 columns to right */
163 temp = (*state)[3][1];
164 (*state)[3][1] = (*state)[2][1];
165 (*state)[2][1] = (*state)[1][1];
166 (*state)[1][1] = (*state)[0][1];
167 (*state)[0][1] = temp;
168
169 /* Rotate second row 2 columns to right */
170 temp = (*state)[0][2];
171 (*state)[0][2] = (*state)[2][2];
172 (*state)[2][2] = temp;
173
174 temp = (*state)[1][2];
175 (*state)[1][2] = (*state)[3][2];
176 (*state)[3][2] = temp;
177
178 /* Rotate third row 3 columns to right */
179 temp = (*state)[0][3];
180 (*state)[0][3] = (*state)[1][3];
181 (*state)[1][3] = (*state)[2][3];
182 (*state)[2][3] = (*state)[3][3];
183 (*state)[3][3] = temp;
184 }
185
InvCipher(state_t * state,const __u8 * RoundKey)186 static void InvCipher(state_t *state, const __u8 *RoundKey) {
187 /* Add the First round key to the state before starting the
188 rounds. */
189 AddRoundKey(Nr, state, RoundKey);
190
191 /* There will be Nr rounds. The first Nr-1 rounds are identical.
192 These Nr rounds are executed in the loop below. Last one without
193 InvMixColumn() */
194 InvShiftRows(state);
195 InvSubBytes(state);
196 AddRoundKey(Nr - 1, state, RoundKey);
197 InvMixColumns(state);
198
199 InvShiftRows(state);
200 InvSubBytes(state);
201 AddRoundKey(Nr - 2, state, RoundKey);
202 InvMixColumns(state);
203
204 InvShiftRows(state);
205 InvSubBytes(state);
206 AddRoundKey(Nr - 3, state, RoundKey);
207 InvMixColumns(state);
208
209 InvShiftRows(state);
210 InvSubBytes(state);
211 AddRoundKey(Nr - 4, state, RoundKey);
212 InvMixColumns(state);
213
214 InvShiftRows(state);
215 InvSubBytes(state);
216 AddRoundKey(Nr - 5, state, RoundKey);
217 InvMixColumns(state);
218
219 InvShiftRows(state);
220 InvSubBytes(state);
221 AddRoundKey(Nr - 6, state, RoundKey);
222 InvMixColumns(state);
223
224 InvShiftRows(state);
225 InvSubBytes(state);
226 AddRoundKey(Nr - 7, state, RoundKey);
227 InvMixColumns(state);
228
229 InvShiftRows(state);
230 InvSubBytes(state);
231 AddRoundKey(Nr - 8, state, RoundKey);
232 InvMixColumns(state);
233
234 InvShiftRows(state);
235 InvSubBytes(state);
236 AddRoundKey(Nr - 9, state, RoundKey);
237 InvMixColumns(state);
238
239 InvShiftRows(state);
240 InvSubBytes(state);
241 AddRoundKey(Nr - 10, state, RoundKey);
242 }
243
AES_ECB_decrypt(const struct AES_ctx * ctx,__u8 * buf)244 static void AES_ECB_decrypt(const struct AES_ctx *ctx, __u8 *buf) {
245 /* The next function call decrypts the PlainText with the Key using
246 AES algorithm. */
247 InvCipher((state_t *)buf, ctx->RoundKey);
248 }
249
250 /* rol32: From linux kernel source code */
251
252 /**
253 * rol32 - rotate a 32-bit value left
254 * @word: value to rotate
255 * @shift: bits to roll
256 */
rol32(__u32 word,unsigned int shift)257 static inline __u32 rol32(__u32 word, unsigned int shift) {
258 return (word << shift) | (word >> ((-shift) & 31));
259 }
260
261 /* jhash.h: Jenkins hash support.
262 *
263 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
264 *
265 * https://burtleburtle.net/bob/hash/
266 *
267 * These are the credits from Bob's sources:
268 *
269 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
270 *
271 * These are functions for producing 32-bit hashes for hash table lookup.
272 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
273 * are externally useful functions. Routines to test the hash are included
274 * if SELF_TEST is defined. You can use this free for any purpose. It's in
275 * the public domain. It has no warranty.
276 *
277 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
278 *
279 * I've modified Bob's hash to be useful in the Linux kernel, and
280 * any bugs present are my fault.
281 * Jozsef
282 */
283
284 /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
285 #define __jhash_final(a, b, c) \
286 { \
287 c ^= b; \
288 c -= rol32(b, 14); \
289 a ^= c; \
290 a -= rol32(c, 11); \
291 b ^= a; \
292 b -= rol32(a, 25); \
293 c ^= b; \
294 c -= rol32(b, 16); \
295 a ^= c; \
296 a -= rol32(c, 4); \
297 b ^= a; \
298 b -= rol32(a, 14); \
299 c ^= b; \
300 c -= rol32(b, 24); \
301 }
302
303 /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
__jhash_nwords(__u32 a,__u32 b,__u32 c,__u32 initval)304 static inline __u32 __jhash_nwords(__u32 a, __u32 b, __u32 c, __u32 initval) {
305 a += initval;
306 b += initval;
307 c += initval;
308
309 __jhash_final(a, b, c);
310
311 return c;
312 }
313
314 /* An arbitrary initial parameter */
315 #define JHASH_INITVAL 0xdeadbeef
316
jhash_2words(__u32 a,__u32 b,__u32 initval)317 static inline __u32 jhash_2words(__u32 a, __u32 b, __u32 initval) {
318 return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
319 }
320
321 struct {
322 __uint(type, BPF_MAP_TYPE_HASH);
323 __uint(max_entries, 255);
324 __type(key, __u64);
325 __type(value, __u32);
326 } worker_id_map SEC(".maps");
327
328 struct {
329 __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
330 __uint(max_entries, 255);
331 __type(key, __u32);
332 __type(value, __u32);
333 } reuseport_array SEC(".maps");
334
335 struct {
336 __uint(type, BPF_MAP_TYPE_ARRAY);
337 __uint(max_entries, 1);
338 __type(key, __u32);
339 __type(value, __u64);
340 } sk_info SEC(".maps");
341
342 struct {
343 __uint(type, BPF_MAP_TYPE_ARRAY);
344 __uint(max_entries, 1);
345 __type(key, __u32);
346 __type(value, struct AES_ctx);
347 } aes_key SEC(".maps");
348
349 typedef struct quic_hd {
350 __u8 *dcid;
351 __u32 dcidlen;
352 __u32 dcid_offset;
353 __u8 type;
354 } quic_hd;
355
356 #define SV_DCIDLEN 17
357 #define MAX_DCIDLEN 20
358 #define MIN_DCIDLEN 8
359 #define WORKER_IDLEN 8
360 #define WORKER_ID_OFFSET 1
361
362 enum {
363 NGTCP2_PKT_INITIAL = 0x0,
364 NGTCP2_PKT_0RTT = 0x1,
365 NGTCP2_PKT_HANDSHAKE = 0x2,
366 NGTCP2_PKT_SHORT = 0x40,
367 };
368
parse_quic(quic_hd * qhd,__u8 * data,__u8 * data_end)369 static inline int parse_quic(quic_hd *qhd, __u8 *data, __u8 *data_end) {
370 __u8 *p;
371 __u64 dcidlen;
372
373 if (*data & 0x80) {
374 p = data + 1 + 4;
375
376 /* Do not check the actual DCID length because we might not buffer
377 entire DCID here. */
378 dcidlen = *p;
379
380 if (dcidlen > MAX_DCIDLEN || dcidlen < MIN_DCIDLEN) {
381 return -1;
382 }
383
384 ++p;
385
386 qhd->type = (*data & 0x30) >> 4;
387 qhd->dcid = p;
388 qhd->dcidlen = dcidlen;
389 qhd->dcid_offset = 6;
390 } else {
391 qhd->type = NGTCP2_PKT_SHORT;
392 qhd->dcid = data + 1;
393 qhd->dcidlen = SV_DCIDLEN;
394 qhd->dcid_offset = 1;
395 }
396
397 return 0;
398 }
399
hash(const __u8 * data,__u32 datalen,__u32 initval)400 static __u32 hash(const __u8 *data, __u32 datalen, __u32 initval) {
401 __u32 a, b;
402
403 a = (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
404 b = (data[4] << 24) | (data[5] << 16) | (data[6] << 8) | data[7];
405
406 return jhash_2words(a, b, initval);
407 }
408
sk_index_from_dcid(const quic_hd * qhd,const struct sk_reuseport_md * reuse_md,__u64 num_socks)409 static __u32 sk_index_from_dcid(const quic_hd *qhd,
410 const struct sk_reuseport_md *reuse_md,
411 __u64 num_socks) {
412 __u32 len = qhd->dcidlen;
413 __u32 h = reuse_md->hash;
414 __u8 hbuf[8];
415
416 if (len > 16) {
417 __builtin_memset(hbuf, 0, sizeof(hbuf));
418
419 switch (len) {
420 case 20:
421 __builtin_memcpy(hbuf, qhd->dcid + 16, 4);
422 break;
423 case 19:
424 __builtin_memcpy(hbuf, qhd->dcid + 16, 3);
425 break;
426 case 18:
427 __builtin_memcpy(hbuf, qhd->dcid + 16, 2);
428 break;
429 case 17:
430 __builtin_memcpy(hbuf, qhd->dcid + 16, 1);
431 break;
432 }
433
434 h = hash(hbuf, sizeof(hbuf), h);
435 len = 16;
436 }
437
438 if (len > 8) {
439 __builtin_memset(hbuf, 0, sizeof(hbuf));
440
441 switch (len) {
442 case 16:
443 __builtin_memcpy(hbuf, qhd->dcid + 8, 8);
444 break;
445 case 15:
446 __builtin_memcpy(hbuf, qhd->dcid + 8, 7);
447 break;
448 case 14:
449 __builtin_memcpy(hbuf, qhd->dcid + 8, 6);
450 break;
451 case 13:
452 __builtin_memcpy(hbuf, qhd->dcid + 8, 5);
453 break;
454 case 12:
455 __builtin_memcpy(hbuf, qhd->dcid + 8, 4);
456 break;
457 case 11:
458 __builtin_memcpy(hbuf, qhd->dcid + 8, 3);
459 break;
460 case 10:
461 __builtin_memcpy(hbuf, qhd->dcid + 8, 2);
462 break;
463 case 9:
464 __builtin_memcpy(hbuf, qhd->dcid + 8, 1);
465 break;
466 }
467
468 h = hash(hbuf, sizeof(hbuf), h);
469 len = 8;
470 }
471
472 return hash(qhd->dcid, len, h) % num_socks;
473 }
474
475 SEC("sk_reuseport")
select_reuseport(struct sk_reuseport_md * reuse_md)476 int select_reuseport(struct sk_reuseport_md *reuse_md) {
477 __u32 sk_index, *psk_index;
478 __u64 *pnum_socks;
479 __u32 zero = 0;
480 int rv;
481 quic_hd qhd;
482 __u8 qpktbuf[6 + MAX_DCIDLEN];
483 struct AES_ctx *aes_ctx;
484 __u8 *worker_id;
485 __u16 remote_port;
486 __u8 *data = reuse_md->data;
487
488 /* Packets less than 22 bytes never be a valid QUIC packet. */
489 if (reuse_md->len < sizeof(struct udphdr) + 22) {
490 return SK_DROP;
491 }
492
493 if (reuse_md->data + sizeof(struct udphdr) > reuse_md->data_end) {
494 return SK_DROP;
495 }
496
497 remote_port = (data[0] << 8) + data[1];
498
499 switch (remote_port) {
500 case 1900:
501 case 5353:
502 case 11211:
503 case 20800:
504 case 27015:
505 return SK_DROP;
506 default:
507 if (remote_port < 1024) {
508 return SK_DROP;
509 }
510 }
511
512 if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr), qpktbuf,
513 sizeof(qpktbuf)) != 0) {
514 return SK_DROP;
515 }
516
517 pnum_socks = bpf_map_lookup_elem(&sk_info, &zero);
518 if (pnum_socks == NULL) {
519 return SK_DROP;
520 }
521
522 aes_ctx = bpf_map_lookup_elem(&aes_key, &zero);
523 if (aes_ctx == NULL) {
524 return SK_DROP;
525 }
526
527 rv = parse_quic(&qhd, qpktbuf, qpktbuf + sizeof(qpktbuf));
528 if (rv != 0) {
529 return SK_DROP;
530 }
531
532 switch (qhd.type) {
533 case NGTCP2_PKT_INITIAL:
534 case NGTCP2_PKT_0RTT:
535 if (qhd.dcidlen == SV_DCIDLEN) {
536 worker_id = qhd.dcid + WORKER_ID_OFFSET;
537 AES_ECB_decrypt(aes_ctx, worker_id);
538
539 psk_index = bpf_map_lookup_elem(&worker_id_map, worker_id);
540 if (psk_index != NULL) {
541 sk_index = *psk_index;
542
543 break;
544 }
545 }
546
547 sk_index = sk_index_from_dcid(&qhd, reuse_md, *pnum_socks);
548
549 break;
550 case NGTCP2_PKT_HANDSHAKE:
551 case NGTCP2_PKT_SHORT:
552 if (qhd.dcidlen != SV_DCIDLEN) {
553 return SK_DROP;
554 }
555
556 worker_id = qhd.dcid + WORKER_ID_OFFSET;
557 AES_ECB_decrypt(aes_ctx, worker_id);
558
559 psk_index = bpf_map_lookup_elem(&worker_id_map, worker_id);
560 if (psk_index == NULL) {
561 sk_index = sk_index_from_dcid(&qhd, reuse_md, *pnum_socks);
562
563 break;
564 }
565
566 sk_index = *psk_index;
567
568 break;
569 default:
570 return SK_DROP;
571 }
572
573 rv = bpf_sk_select_reuseport(reuse_md, &reuseport_array, &sk_index, 0);
574 if (rv != 0) {
575 return SK_DROP;
576 }
577
578 return SK_PASS;
579 }
580