1 /*
2 * nghttp2 - HTTP/2 C Library
3 *
4 * Copyright (c) 2021 Tatsuhiro Tsujikawa
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice shall be
15 * included in all copies or substantial portions of the Software.
16 *
17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
18 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
19 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
20 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
21 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
22 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
23 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
24 */
25 #include <linux/udp.h>
26 #include <linux/bpf.h>
27
28 #include <bpf/bpf_helpers.h>
29
30 /*
31 * How to compile:
32 *
33 * clang-12 -O2 -Wall -target bpf -g -c reuseport_kern.c -o reuseport_kern.o \
34 * -I/path/to/kernel/include
35 *
36 * See
37 * https://www.kernel.org/doc/Documentation/kbuild/headers_install.txt
38 * how to install kernel header files.
39 */
40
41 /* AES_CBC_decrypt_buffer: https://github.com/kokke/tiny-AES-c
42 License is Public Domain. Commit hash:
43 12e7744b4919e9d55de75b7ab566326a1c8e7a67 */
44
45 #define AES_keyExpSize 176
46
47 struct AES_ctx {
48 __u8 RoundKey[AES_keyExpSize];
49 };
50
51 /* The number of columns comprising a state in AES. This is a constant
52 in AES. Value=4 */
53 #define Nb 4
54
55 #define Nr 10 /* The number of rounds in AES Cipher. */
56
57 /* state - array holding the intermediate results during
58 decryption. */
59 typedef __u8 state_t[4][4];
60
61 /* The lookup-tables are marked const so they can be placed in
62 read-only storage instead of RAM The numbers below can be computed
63 dynamically trading ROM for RAM - This can be useful in (embedded)
64 bootloader applications, where ROM is often limited. */
65 static const __u8 rsbox[256] = {
66 0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e,
67 0x81, 0xf3, 0xd7, 0xfb, 0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87,
68 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb, 0x54, 0x7b, 0x94, 0x32,
69 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
70 0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49,
71 0x6d, 0x8b, 0xd1, 0x25, 0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16,
72 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92, 0x6c, 0x70, 0x48, 0x50,
73 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
74 0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05,
75 0xb8, 0xb3, 0x45, 0x06, 0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02,
76 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b, 0x3a, 0x91, 0x11, 0x41,
77 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
78 0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8,
79 0x1c, 0x75, 0xdf, 0x6e, 0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89,
80 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b, 0xfc, 0x56, 0x3e, 0x4b,
81 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
82 0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59,
83 0x27, 0x80, 0xec, 0x5f, 0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d,
84 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef, 0xa0, 0xe0, 0x3b, 0x4d,
85 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
86 0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63,
87 0x55, 0x21, 0x0c, 0x7d};
88
89 /* This function adds the round key to state. The round key is added
90 to the state by an XOR function. */
AddRoundKey(__u8 round,state_t * state,const __u8 * RoundKey)91 static void AddRoundKey(__u8 round, state_t *state, const __u8 *RoundKey) {
92 __u8 i, j;
93 for (i = 0; i < 4; ++i) {
94 for (j = 0; j < 4; ++j) {
95 (*state)[i][j] ^= RoundKey[(round * Nb * 4) + (i * Nb) + j];
96 }
97 }
98 }
99
xtime(__u8 x)100 static __u8 xtime(__u8 x) { return ((x << 1) ^ (((x >> 7) & 1) * 0x1b)); }
101
102 #define Multiply(x, y) \
103 (((y & 1) * x) ^ ((y >> 1 & 1) * xtime(x)) ^ \
104 ((y >> 2 & 1) * xtime(xtime(x))) ^ \
105 ((y >> 3 & 1) * xtime(xtime(xtime(x)))) ^ \
106 ((y >> 4 & 1) * xtime(xtime(xtime(xtime(x))))))
107
108 #define getSBoxInvert(num) (rsbox[(num)])
109
110 /* MixColumns function mixes the columns of the state matrix. The
111 method used to multiply may be difficult to understand for the
112 inexperienced. Please use the references to gain more
113 information. */
InvMixColumns(state_t * state)114 static void InvMixColumns(state_t *state) {
115 int i;
116 __u8 a, b, c, d;
117 for (i = 0; i < 4; ++i) {
118 a = (*state)[i][0];
119 b = (*state)[i][1];
120 c = (*state)[i][2];
121 d = (*state)[i][3];
122
123 (*state)[i][0] = Multiply(a, 0x0e) ^ Multiply(b, 0x0b) ^ Multiply(c, 0x0d) ^
124 Multiply(d, 0x09);
125 (*state)[i][1] = Multiply(a, 0x09) ^ Multiply(b, 0x0e) ^ Multiply(c, 0x0b) ^
126 Multiply(d, 0x0d);
127 (*state)[i][2] = Multiply(a, 0x0d) ^ Multiply(b, 0x09) ^ Multiply(c, 0x0e) ^
128 Multiply(d, 0x0b);
129 (*state)[i][3] = Multiply(a, 0x0b) ^ Multiply(b, 0x0d) ^ Multiply(c, 0x09) ^
130 Multiply(d, 0x0e);
131 }
132 }
133
134 extern __u32 LINUX_KERNEL_VERSION __kconfig;
135
136 /* The SubBytes Function Substitutes the values in the state matrix
137 with values in an S-box. */
InvSubBytes(state_t * state)138 static void InvSubBytes(state_t *state) {
139 __u8 i, j;
140 if (LINUX_KERNEL_VERSION < KERNEL_VERSION(5, 10, 0)) {
141 for (i = 0; i < 4; ++i) {
142 for (j = 0; j < 4; ++j) {
143 /* Ubuntu 20.04 LTS kernel 5.4.0 needs this workaround
144 otherwise "math between map_value pointer and register with
145 unbounded min value is not allowed". 5.10.0 is a kernel
146 version that works but it might not be the minimum
147 version. */
148 __u8 k = (*state)[j][i];
149 (*state)[j][i] = k ? getSBoxInvert(k) : getSBoxInvert(0);
150 }
151 }
152 } else {
153 for (i = 0; i < 4; ++i) {
154 for (j = 0; j < 4; ++j) {
155 (*state)[j][i] = getSBoxInvert((*state)[j][i]);
156 }
157 }
158 }
159 }
160
InvShiftRows(state_t * state)161 static void InvShiftRows(state_t *state) {
162 __u8 temp;
163
164 /* Rotate first row 1 columns to right */
165 temp = (*state)[3][1];
166 (*state)[3][1] = (*state)[2][1];
167 (*state)[2][1] = (*state)[1][1];
168 (*state)[1][1] = (*state)[0][1];
169 (*state)[0][1] = temp;
170
171 /* Rotate second row 2 columns to right */
172 temp = (*state)[0][2];
173 (*state)[0][2] = (*state)[2][2];
174 (*state)[2][2] = temp;
175
176 temp = (*state)[1][2];
177 (*state)[1][2] = (*state)[3][2];
178 (*state)[3][2] = temp;
179
180 /* Rotate third row 3 columns to right */
181 temp = (*state)[0][3];
182 (*state)[0][3] = (*state)[1][3];
183 (*state)[1][3] = (*state)[2][3];
184 (*state)[2][3] = (*state)[3][3];
185 (*state)[3][3] = temp;
186 }
187
InvCipher(state_t * state,const __u8 * RoundKey)188 static void InvCipher(state_t *state, const __u8 *RoundKey) {
189 /* Add the First round key to the state before starting the
190 rounds. */
191 AddRoundKey(Nr, state, RoundKey);
192
193 /* There will be Nr rounds. The first Nr-1 rounds are identical.
194 These Nr rounds are executed in the loop below. Last one without
195 InvMixColumn() */
196 InvShiftRows(state);
197 InvSubBytes(state);
198 AddRoundKey(Nr - 1, state, RoundKey);
199 InvMixColumns(state);
200
201 InvShiftRows(state);
202 InvSubBytes(state);
203 AddRoundKey(Nr - 2, state, RoundKey);
204 InvMixColumns(state);
205
206 InvShiftRows(state);
207 InvSubBytes(state);
208 AddRoundKey(Nr - 3, state, RoundKey);
209 InvMixColumns(state);
210
211 InvShiftRows(state);
212 InvSubBytes(state);
213 AddRoundKey(Nr - 4, state, RoundKey);
214 InvMixColumns(state);
215
216 InvShiftRows(state);
217 InvSubBytes(state);
218 AddRoundKey(Nr - 5, state, RoundKey);
219 InvMixColumns(state);
220
221 InvShiftRows(state);
222 InvSubBytes(state);
223 AddRoundKey(Nr - 6, state, RoundKey);
224 InvMixColumns(state);
225
226 InvShiftRows(state);
227 InvSubBytes(state);
228 AddRoundKey(Nr - 7, state, RoundKey);
229 InvMixColumns(state);
230
231 InvShiftRows(state);
232 InvSubBytes(state);
233 AddRoundKey(Nr - 8, state, RoundKey);
234 InvMixColumns(state);
235
236 InvShiftRows(state);
237 InvSubBytes(state);
238 AddRoundKey(Nr - 9, state, RoundKey);
239 InvMixColumns(state);
240
241 InvShiftRows(state);
242 InvSubBytes(state);
243 AddRoundKey(Nr - 10, state, RoundKey);
244 }
245
AES_ECB_decrypt(const struct AES_ctx * ctx,__u8 * buf)246 static void AES_ECB_decrypt(const struct AES_ctx *ctx, __u8 *buf) {
247 /* The next function call decrypts the PlainText with the Key using
248 AES algorithm. */
249 InvCipher((state_t *)buf, ctx->RoundKey);
250 }
251
252 /* rol32: From linux kernel source code */
253
254 /**
255 * rol32 - rotate a 32-bit value left
256 * @word: value to rotate
257 * @shift: bits to roll
258 */
rol32(__u32 word,unsigned int shift)259 static inline __u32 rol32(__u32 word, unsigned int shift) {
260 return (word << shift) | (word >> ((-shift) & 31));
261 }
262
263 /* jhash.h: Jenkins hash support.
264 *
265 * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net)
266 *
267 * https://burtleburtle.net/bob/hash/
268 *
269 * These are the credits from Bob's sources:
270 *
271 * lookup3.c, by Bob Jenkins, May 2006, Public Domain.
272 *
273 * These are functions for producing 32-bit hashes for hash table lookup.
274 * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final()
275 * are externally useful functions. Routines to test the hash are included
276 * if SELF_TEST is defined. You can use this free for any purpose. It's in
277 * the public domain. It has no warranty.
278 *
279 * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@blackhole.kfki.hu)
280 *
281 * I've modified Bob's hash to be useful in the Linux kernel, and
282 * any bugs present are my fault.
283 * Jozsef
284 */
285
286 /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */
287 #define __jhash_final(a, b, c) \
288 { \
289 c ^= b; \
290 c -= rol32(b, 14); \
291 a ^= c; \
292 a -= rol32(c, 11); \
293 b ^= a; \
294 b -= rol32(a, 25); \
295 c ^= b; \
296 c -= rol32(b, 16); \
297 a ^= c; \
298 a -= rol32(c, 4); \
299 b ^= a; \
300 b -= rol32(a, 14); \
301 c ^= b; \
302 c -= rol32(b, 24); \
303 }
304
305 /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */
__jhash_nwords(__u32 a,__u32 b,__u32 c,__u32 initval)306 static inline __u32 __jhash_nwords(__u32 a, __u32 b, __u32 c, __u32 initval) {
307 a += initval;
308 b += initval;
309 c += initval;
310
311 __jhash_final(a, b, c);
312
313 return c;
314 }
315
316 /* An arbitrary initial parameter */
317 #define JHASH_INITVAL 0xdeadbeef
318
jhash_2words(__u32 a,__u32 b,__u32 initval)319 static inline __u32 jhash_2words(__u32 a, __u32 b, __u32 initval) {
320 return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2));
321 }
322
323 struct {
324 __uint(type, BPF_MAP_TYPE_HASH);
325 __uint(max_entries, 255);
326 __type(key, __u64);
327 __type(value, __u32);
328 } worker_id_map SEC(".maps");
329
330 struct {
331 __uint(type, BPF_MAP_TYPE_REUSEPORT_SOCKARRAY);
332 __uint(max_entries, 255);
333 __type(key, __u32);
334 __type(value, __u32);
335 } reuseport_array SEC(".maps");
336
337 struct {
338 __uint(type, BPF_MAP_TYPE_ARRAY);
339 __uint(max_entries, 1);
340 __type(key, __u32);
341 __type(value, __u64);
342 } sk_info SEC(".maps");
343
344 struct {
345 __uint(type, BPF_MAP_TYPE_ARRAY);
346 __uint(max_entries, 1);
347 __type(key, __u32);
348 __type(value, struct AES_ctx);
349 } aes_key SEC(".maps");
350
351 typedef struct quic_hd {
352 __u8 *dcid;
353 __u32 dcidlen;
354 __u32 dcid_offset;
355 __u8 type;
356 } quic_hd;
357
358 #define SV_DCIDLEN 17
359 #define MAX_DCIDLEN 20
360 #define MIN_DCIDLEN 8
361 #define WORKER_IDLEN 8
362 #define WORKER_ID_OFFSET 1
363
364 enum {
365 NGTCP2_PKT_INITIAL = 0x0,
366 NGTCP2_PKT_0RTT = 0x1,
367 NGTCP2_PKT_HANDSHAKE = 0x2,
368 NGTCP2_PKT_SHORT = 0x40,
369 };
370
parse_quic(quic_hd * qhd,__u8 * data,__u8 * data_end)371 static inline int parse_quic(quic_hd *qhd, __u8 *data, __u8 *data_end) {
372 __u8 *p;
373 __u64 dcidlen;
374
375 if (*data & 0x80) {
376 p = data + 1 + 4;
377
378 /* Do not check the actual DCID length because we might not buffer
379 entire DCID here. */
380 dcidlen = *p;
381
382 if (dcidlen > MAX_DCIDLEN || dcidlen < MIN_DCIDLEN) {
383 return -1;
384 }
385
386 ++p;
387
388 qhd->type = (*data & 0x30) >> 4;
389 qhd->dcid = p;
390 qhd->dcidlen = dcidlen;
391 qhd->dcid_offset = 6;
392 } else {
393 qhd->type = NGTCP2_PKT_SHORT;
394 qhd->dcid = data + 1;
395 qhd->dcidlen = SV_DCIDLEN;
396 qhd->dcid_offset = 1;
397 }
398
399 return 0;
400 }
401
hash(const __u8 * data,__u32 datalen,__u32 initval)402 static __u32 hash(const __u8 *data, __u32 datalen, __u32 initval) {
403 __u32 a, b;
404
405 a = (data[0] << 24) | (data[1] << 16) | (data[2] << 8) | data[3];
406 b = (data[4] << 24) | (data[5] << 16) | (data[6] << 8) | data[7];
407
408 return jhash_2words(a, b, initval);
409 }
410
sk_index_from_dcid(const quic_hd * qhd,const struct sk_reuseport_md * reuse_md,__u64 num_socks)411 static __u32 sk_index_from_dcid(const quic_hd *qhd,
412 const struct sk_reuseport_md *reuse_md,
413 __u64 num_socks) {
414 __u32 len = qhd->dcidlen;
415 __u32 h = reuse_md->hash;
416 __u8 hbuf[8];
417
418 if (len > 16) {
419 __builtin_memset(hbuf, 0, sizeof(hbuf));
420
421 switch (len) {
422 case 20:
423 __builtin_memcpy(hbuf, qhd->dcid + 16, 4);
424 break;
425 case 19:
426 __builtin_memcpy(hbuf, qhd->dcid + 16, 3);
427 break;
428 case 18:
429 __builtin_memcpy(hbuf, qhd->dcid + 16, 2);
430 break;
431 case 17:
432 __builtin_memcpy(hbuf, qhd->dcid + 16, 1);
433 break;
434 }
435
436 h = hash(hbuf, sizeof(hbuf), h);
437 len = 16;
438 }
439
440 if (len > 8) {
441 __builtin_memset(hbuf, 0, sizeof(hbuf));
442
443 switch (len) {
444 case 16:
445 __builtin_memcpy(hbuf, qhd->dcid + 8, 8);
446 break;
447 case 15:
448 __builtin_memcpy(hbuf, qhd->dcid + 8, 7);
449 break;
450 case 14:
451 __builtin_memcpy(hbuf, qhd->dcid + 8, 6);
452 break;
453 case 13:
454 __builtin_memcpy(hbuf, qhd->dcid + 8, 5);
455 break;
456 case 12:
457 __builtin_memcpy(hbuf, qhd->dcid + 8, 4);
458 break;
459 case 11:
460 __builtin_memcpy(hbuf, qhd->dcid + 8, 3);
461 break;
462 case 10:
463 __builtin_memcpy(hbuf, qhd->dcid + 8, 2);
464 break;
465 case 9:
466 __builtin_memcpy(hbuf, qhd->dcid + 8, 1);
467 break;
468 }
469
470 h = hash(hbuf, sizeof(hbuf), h);
471 len = 8;
472 }
473
474 return hash(qhd->dcid, len, h) % num_socks;
475 }
476
477 SEC("sk_reuseport")
select_reuseport(struct sk_reuseport_md * reuse_md)478 int select_reuseport(struct sk_reuseport_md *reuse_md) {
479 __u32 sk_index, *psk_index;
480 __u64 *pnum_socks;
481 __u32 zero = 0;
482 int rv;
483 quic_hd qhd;
484 __u8 qpktbuf[6 + MAX_DCIDLEN];
485 struct AES_ctx *aes_ctx;
486 __u8 *worker_id;
487 __u16 remote_port;
488 __u8 *data = reuse_md->data;
489
490 /* Packets less than 22 bytes never be a valid QUIC packet. */
491 if (reuse_md->len < sizeof(struct udphdr) + 22) {
492 return SK_DROP;
493 }
494
495 if (reuse_md->data + sizeof(struct udphdr) > reuse_md->data_end) {
496 return SK_DROP;
497 }
498
499 remote_port = (data[0] << 8) + data[1];
500
501 switch (remote_port) {
502 case 1900:
503 case 5353:
504 case 11211:
505 case 20800:
506 case 27015:
507 return SK_DROP;
508 default:
509 if (remote_port < 1024) {
510 return SK_DROP;
511 }
512 }
513
514 if (bpf_skb_load_bytes(reuse_md, sizeof(struct udphdr), qpktbuf,
515 sizeof(qpktbuf)) != 0) {
516 return SK_DROP;
517 }
518
519 pnum_socks = bpf_map_lookup_elem(&sk_info, &zero);
520 if (pnum_socks == NULL) {
521 return SK_DROP;
522 }
523
524 aes_ctx = bpf_map_lookup_elem(&aes_key, &zero);
525 if (aes_ctx == NULL) {
526 return SK_DROP;
527 }
528
529 rv = parse_quic(&qhd, qpktbuf, qpktbuf + sizeof(qpktbuf));
530 if (rv != 0) {
531 return SK_DROP;
532 }
533
534 switch (qhd.type) {
535 case NGTCP2_PKT_INITIAL:
536 case NGTCP2_PKT_0RTT:
537 if (qhd.dcidlen == SV_DCIDLEN) {
538 worker_id = qhd.dcid + WORKER_ID_OFFSET;
539 AES_ECB_decrypt(aes_ctx, worker_id);
540
541 psk_index = bpf_map_lookup_elem(&worker_id_map, worker_id);
542 if (psk_index != NULL) {
543 sk_index = *psk_index;
544
545 break;
546 }
547 }
548
549 sk_index = sk_index_from_dcid(&qhd, reuse_md, *pnum_socks);
550
551 break;
552 case NGTCP2_PKT_HANDSHAKE:
553 case NGTCP2_PKT_SHORT:
554 if (qhd.dcidlen != SV_DCIDLEN) {
555 return SK_DROP;
556 }
557
558 worker_id = qhd.dcid + WORKER_ID_OFFSET;
559 AES_ECB_decrypt(aes_ctx, worker_id);
560
561 psk_index = bpf_map_lookup_elem(&worker_id_map, worker_id);
562 if (psk_index == NULL) {
563 sk_index = sk_index_from_dcid(&qhd, reuse_md, *pnum_socks);
564
565 break;
566 }
567
568 sk_index = *psk_index;
569
570 break;
571 default:
572 return SK_DROP;
573 }
574
575 rv = bpf_sk_select_reuseport(reuse_md, &reuseport_array, &sk_index, 0);
576 if (rv != 0) {
577 return SK_DROP;
578 }
579
580 return SK_PASS;
581 }
582