• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * minilex.c
3  *
4  * High efficiency lexical state parser
5  *
6  * Copyright (C)2011-2020 Andy Green <andy@warmcat.com>
7  *
8  * Licensed under MIT
9  *
10  * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
11  *
12  * Run it twice to test parsing on the generated table on stderr
13  *
14  * Whoo this got a bit complicated by lws-buildtime deselection of some
15  * headers optionally.  There are 3 x vars, UNCOMMON, WS, H2 so we make
16  * eight copies of the lextable selected by the appropriate #if defined()
17  */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 /* get all the strings */
24 
25 #define LWS_ROLE_WS 1
26 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
27 #define LWS_ROLE_H2 1
28 
29 #include "lextable-strings.h"
30 
31 #undef LWS_ROLE_WS
32 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
33 #undef LWS_ROLE_H2
34 
35 /* bitfield for the 8 versions as to which strings exist... index layout
36  *
37  *        b0      b1 b2
38  *  0 =
39  *  1 = uncommon
40  *  2 =           ws
41  *  3 = uncommon  ws
42  *  4 =              h2
43  *  5 = uncommon     h2
44  *  6 =           ws h2
45  *  7 = uncommon  ws h2
46  */
47 
48 unsigned char filter_array[] = {
49 	0xff, /* get */
50 	0xff, /* post */
51 	0xaa, /* options */
52 	0xff, /* host */
53 	0xff, /* connection */
54 	0xff, /* upgrade */
55 	0xff, /* origin */
56 	0xcc, /* sec-ws-draft */
57 	0xff, /* crlf */
58 	0xcc, /* sec-ws-ext */
59 	0xcc, /* sec-ws-key1 */
60 	0xcc, /* sec-ws-key2 */
61 	0xcc, /* sec-ws-protocol */
62 	0xcc, /* sec-ws-accept */
63 	0xcc, /* sec-ws-nonce */
64 	0xff, /* http/1.1 */
65 	0xf0, /* http2-settings */
66 	0xff, /* accept */
67 	0xaa, /* access-control-req-hdrs */
68 	0xff, /* if-modified-since */
69 	0xff, /* if-none-match */
70 	0xff, /* accept-encoding */
71 	0xff, /* accept-language */
72 	0xff, /* pragma */
73 	0xff, /* cache-control */
74 	0xff, /* authorization */
75 	0xff, /* cookie */
76 	0xff, /* content-length */
77 	0xff, /* content-type */
78 	0xff, /* date */
79 	0xff, /* range */
80 	0xfa, /* referer */
81 	0xcc, /* sec-ws-key */
82 	0xcc, /* sec-ws-version */
83 	0xcc, /* sec-sc-origin */
84 	0xf0, /* authority */
85 	0xf0, /* method */
86 	0xf0, /* path */
87 	0xf0, /* scheme */
88 	0xf0, /* status */
89 	0xfa, /* accept-charset */
90 	0xff, /* accept-ranges */
91 	0xfa, /* access-control-allow-origin */
92 	0xff, /* age */
93 	0xff, /* allow */
94 	0xff, /* content-disposition */
95 	0xff, /* content-encoding */
96 	0xff, /* content-language */
97 	0xff, /* content-location */
98 	0xff, /* content-range */
99 	0xff, /* etag */
100 	0xff, /* expect */
101 	0xff, /* expires */
102 	0xff, /* from */
103 	0xff, /* if-match */
104 	0xff, /* if-range */
105 	0xff, /* if-unmodified-since */
106 	0xff, /* last-modified */
107 	0xff, /* link */
108 	0xff, /* location */
109 	0xfa, /* max-forwards */
110 	0xfa, /* proxy-authenticate */
111 	0xfa, /* proxy-authorization */
112 	0xff, /* refresh */
113 	0xff, /* retry-after */
114 	0xff, /* server */
115 	0xff, /* set-cookie */
116 	0xfa, /* strict-transport-security */
117 	0xff, /* transfer-encoding */
118 	0xfa, /* user-agent */
119 	0xfa, /* vary */
120 	0xfa, /* via */
121 	0xfa, /* www-authenticate */
122 	0xaa, /* patch */
123 	0xaa, /* put */
124 	0xaa, /* delete */
125 	0xff, /* uri-args */
126 	0xaa, /* proxy */
127 	0xaa, /* x-real-ip */
128 	0xff, /* http/1.0 */
129 	0xff, /* x-forwarded-for */
130 	0xff, /* connect */
131 	0xff, /* head */
132 	0xfa, /* te */
133 	0xfa, /* replay-nonce */
134 	0xf0, /* protocol */
135 	0xff, /* x-auth-token */
136 	0xff /* not matchable */
137 };
138 
139 static unsigned char lws_header_implies_psuedoheader_map[] = {
140 	0x07, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00 /* <-64 */,
141 	0x0e /* <- 72 */, 0x24 /* <- 80 */, 0, 0, 0, 0
142 };
143 
144 /*
145  * b7 = 0 = 1-byte seq
146  *	    0x08 = fail
147  *	    2-byte seq
148  *	    0x00 - 0x07, then terminal as given in 2nd byte
149 	    3-byte seq
150  *	    no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
151  *    = 1 = 1-byte seq
152  *	    no match: die, match go fwd 1 byte
153  */
154 
155 unsigned char lextable[][2000] = {
156 	{
157 			#include "lextable.h"
158 	},
159 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
160 	{
161 			#include "lextable.h"
162 	},
163 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
164 #define LWS_ROLE_WS 1
165 	{
166 			#include "lextable.h"
167 	},
168 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
169 	{
170 			#include "lextable.h"
171 	},
172 #undef LWS_ROLE_WS
173 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
174 #define LWS_ROLE_H2 1
175 	{
176 			#include "lextable.h"
177 	},
178 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
179 	{
180 			#include "lextable.h"
181 	},
182 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
183 #define LWS_ROLE_WS 1
184 	{
185 			#include "lextable.h"
186 	},
187 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
188 	{
189 			#include "lextable.h"
190 	},
191 };
192 
193 #define PARALLEL 30
194 
195 struct state {
196 	char c[PARALLEL];
197 	int state[PARALLEL];
198 	int count;
199 	int bytepos;
200 
201 	int real_pos;
202 };
203 
204 static unsigned char pseudomap[8][16];
205 
206 struct state state[1000];
207 int next = 1;
208 
209 #define FAIL_CHAR 0x08
210 
lextable_decode(int version,int pos,char c)211 int lextable_decode(int version, int pos, char c)
212 {
213 	while (1) {
214 		if (lextable[version][pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
215 			if ((lextable[version][pos] & 0x7f) != c)
216 				return -1;
217 			/* fall thru */
218 			pos++;
219 			if (lextable[version][pos] == FAIL_CHAR)
220 				return -1;
221 			return pos;
222 		} else { /* b7 = 0, end or 3-byte */
223 			if (lextable[version][pos] < FAIL_CHAR) /* terminal marker */
224 				return pos;
225 
226 			if (lextable[version][pos] == c) /* goto */
227 				return pos + (lextable[version][pos + 1]) +
228 						(lextable[version][pos + 2] << 8);
229 			/* fall thru goto */
230 			pos += 3;
231 			/* continue */
232 		}
233 	}
234 }
235 
issue(int version)236 int issue(int version)
237 {
238 	const char *rset[200];
239 	int n = 0;
240 	int m;
241 	int prev;
242 	int walk;
243 	int saw;
244 	int y;
245 	int j;
246 	int pos = 0;
247 
248 	int setmembers = 0;
249 
250 	memset(rset, 0, sizeof(rset));
251 
252 	if (version == 7)
253 		printf("#if defined(LWS_HTTP_HEADERS_ALL) || (%cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
254 			 "%cdefined(LWS_ROLE_WS) && "
255 			 "%cdefined(LWS_ROLE_H2))\n", version & 1 ? ' ' : '!',
256 			     version & 2 ? ' ' : '!', version & 4 ? ' ' : '!');
257 	else
258 		printf("#if !defined(LWS_HTTP_HEADERS_ALL) && %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
259 			 "%cdefined(LWS_ROLE_WS) && "
260 			 "%cdefined(LWS_ROLE_H2)\n", version & 1 ? ' ' : '!',
261 			     version & 2 ? ' ' : '!', version & 4 ? ' ' : '!');
262 
263 	/*
264 	 * let's create version's view of the set of strings
265 	 */
266 
267 	for (n = 0; n < sizeof(set) / sizeof(set[0]); n++)
268 		if (filter_array[n] & (1 << version)) {
269 			printf("\t/* %d: %d: %s */\n", setmembers, n, set[n]);
270 			if (lws_header_implies_psuedoheader_map[n >> 3] & (1 << (n & 7)))
271 				pseudomap[version][(setmembers >> 3)] |= 1 << (setmembers & 7);
272 			rset[setmembers++] = set[n];
273 		}
274 
275 	n = 0;
276 	while (n < setmembers) {
277 
278 		m = 0;
279 		walk = 0;
280 		prev = 0;
281 
282 		if (rset[n][0] == '\0') {
283 			n++;
284 			continue;
285 		}
286 
287 		while (rset[n][m]) {
288 
289 			saw = 0;
290 			for (y = 0; y < state[walk].count; y++)
291 				if (state[walk].c[y] == rset[n][m]) {
292 					/* exists -- go forward */
293 					walk = state[walk].state[y];
294 					saw = 1;
295 					break;
296 				}
297 
298 			if (saw)
299 				goto again;
300 
301 			/* something we didn't see before */
302 
303 			state[walk].c[state[walk].count] = rset[n][m];
304 
305 			state[walk].state[state[walk].count] = next;
306 			state[walk].count++;
307 			walk = next++;
308 again:
309 			m++;
310 		}
311 
312 		state[walk].c[0] = n++;
313 		state[walk].state[0] = 0; /* terminal marker */
314 		state[walk].count = 1;
315 	}
316 
317 	walk = 0;
318 	for (n = 0; n < next; n++) {
319 		state[n].bytepos = walk;
320 		walk += (2 * state[n].count);
321 	}
322 
323 	/* compute everyone's position first */
324 
325 	pos = 0;
326 	walk = 0;
327 	for (n = 0; n < next; n++) {
328 
329 		state[n].real_pos = pos;
330 
331 		for (m = 0; m < state[n].count; m++) {
332 
333 			if (state[n].state[m] == 0)
334 				pos += 2; /* terminal marker */
335 			else { /* c is a character */
336 				if ((state[state[n].state[m]].bytepos -
337 								walk) == 2)
338 					pos++;
339 				else {
340 					pos += 3;
341 					if (m == state[n].count - 1)
342 						pos++; /* fail */
343 				}
344 			}
345 			walk += 2;
346 		}
347 	}
348 
349 	walk = 0;
350 	pos = 0;
351 	for (n = 0; n < next; n++) {
352 		for (m = 0; m < state[n].count; m++) {
353 
354 			if (!m)
355 				fprintf(stdout, "/* pos %04x: %3d */ ",
356 							  state[n].real_pos, n);
357 			else
358 				fprintf(stdout, "                    ");
359 
360 			y = state[n].c[m];
361 			saw = state[n].state[m];
362 
363 			if (saw == 0) { // c is a terminal then
364 
365 				if (y > 0x7ff) {
366 					fprintf(stderr, "terminal too big\n");
367 					return 2;
368 				}
369 
370 				fprintf(stdout, "   0x%02X, 0x%02X           "
371 					"       "
372 					"/* - terminal marker %2d - */,\n",
373 						    y >> 8, y & 0xff, y & 0x7f);
374 				pos += 2;
375 				walk += 2;
376 				continue;
377 			}
378 
379 			/* c is a character */
380 
381 			prev = y &0x7f;
382 			if (prev < 32 || prev > 126)
383 				prev = '.';
384 
385 
386 			if ((state[saw].bytepos - walk) == 2) {
387 				fprintf(stdout, "   0x%02X /* '%c' -> */,\n",
388 						y | 0x80, prev);
389 				pos++;
390 				walk += 2;
391 				continue;
392 			}
393 
394 			j = state[saw].real_pos - pos;
395 
396 			if (j > 0xffff) {
397 				fprintf(stderr,
398 				  "Jump > 64K bytes ahead (%d to %d)\n",
399 					state[n].real_pos, state[saw].real_pos);
400 				return 1;
401 			}
402 			fprintf(stdout, "   0x%02X /* '%c' */, 0x%02X, 0x%02X  "
403 				"/* (to 0x%04X state %3d) */,\n",
404 				y, prev,
405 				j & 0xff, j >> 8,
406 				state[saw].real_pos, saw);
407 			pos += 3;
408 
409 			if (m == state[n].count - 1) {
410 				fprintf(stdout,
411 				  "                       0x%02X, /* fail */\n",
412 								FAIL_CHAR);
413 				pos++; /* fail */
414 			}
415 
416 			walk += 2;
417 		}
418 	}
419 
420 	fprintf(stdout, "/* total size %d bytes */\n", pos);
421 
422 	printf("#endif\n\n");
423 
424 	/*
425 	 * Try to parse every legal input string
426 	 */
427 
428 	for (n = 0; n < setmembers; n++) {
429 		walk = 0;
430 		m = 0;
431 		y = -1;
432 
433 		if (rset[n][0] == '\0')
434 			continue;
435 
436 		fprintf(stderr, "  trying %d '%s'\n", n, rset[n]);
437 
438 		while (rset[n][m]) {
439 			walk = lextable_decode(version, walk, rset[n][m]);
440 			if (walk < 0) {
441 				fprintf(stderr, "failed\n");
442 				return 3;
443 			}
444 
445 			if (lextable[version][walk] < FAIL_CHAR) {
446 				y = (lextable[version][walk] << 8) +
447 				     lextable[version][walk + 1];
448 				break;
449 			}
450 			m++;
451 		}
452 
453 		if (y != n) {
454 			fprintf(stderr, "decode failed %d\n", y);
455 			return 4;
456 		}
457 	}
458 
459 	fprintf(stderr, "All decode OK\n");
460 
461 	return 0;
462 }
463 
main(void)464 int main(void)
465 {
466 	int m, n;
467 
468 	for (n = 0; n < 8; n++) {
469 		issue(n);
470 	}
471 
472 	printf("\n/*\n");
473 
474 	for (n = 0; n < 8; n++) {
475 
476 		if (n == 7)
477 			printf("#if defined(LWS_HTTP_HEADERS_ALL) || (%cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
478 				 "%cdefined(LWS_ROLE_WS) && "
479 				 "%cdefined(LWS_ROLE_H2))\n", n & 1 ? ' ' : '!',
480 				     n & 2 ? ' ' : '!', n & 4 ? ' ' : '!');
481 		else
482 		printf("#if !defined(LWS_HTTP_HEADERS_ALL) && %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
483 			 "%cdefined(LWS_ROLE_WS) && "
484 			 "%cdefined(LWS_ROLE_H2)\n", n & 1 ? ' ' : '!',
485 			     n & 2 ? ' ' : '!', n & 4 ? ' ' : '!');
486 
487 		printf("static uint8_t lws_header_implies_psuedoheader_map[] = {\n\t");
488 
489 		for (m = 0; m < sizeof(pseudomap[n]); m++)
490 			printf("0x%02x,", pseudomap[n][m]);
491 
492 		printf("\n};\n");
493 
494 		printf("#endif\n");
495 	}
496 
497 	printf("*/\n");
498 
499 	fprintf(stderr, "did all the variants\n");
500 }
501