• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * minilex.c
3  *
4  * High efficiency lexical state parser
5  *
6  * Copyright (C)2011-2020 Andy Green <andy@warmcat.com>
7  *
8  * Licensed under MIT
9  *
10  * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
11  *
12  * Run it twice to test parsing on the generated table on stderr
13  *
14  * Whoo this got a bit complicated by lws-buildtime deselection of some
15  * headers optionally.  There are 3 x vars, UNCOMMON, WS, H2 so we make
16  * eight copies of the lextable selected by the appropriate #if defined()
17  */
18 
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22 
23 /* get all the strings */
24 
25 #define LWS_ROLE_WS 1
26 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
27 #define LWS_ROLE_H2 1
28 
29 #include "lextable-strings.h"
30 
31 #undef LWS_ROLE_WS
32 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
33 #undef LWS_ROLE_H2
34 
35 /* bitfield for the 8 versions as to which strings exist... index layout
36  *
37  *        b0      b1 b2
38  *  0 =
39  *  1 = uncommon
40  *  2 =           ws
41  *  3 = uncommon  ws
42  *  4 =              h2
43  *  5 = uncommon     h2
44  *  6 =           ws h2
45  *  7 = uncommon  ws h2
46  */
47 
48 unsigned char filter_array[] = {
49 	0xff, /* get */
50 	0xff, /* post */
51 	0xaa, /* options */
52 	0xff, /* host */
53 	0xff, /* connection */
54 	0xff, /* upgrade */
55 	0xff, /* origin */
56 	0xcc, /* sec-ws-draft */
57 	0xff, /* crlf */
58 	0xcc, /* sec-ws-ext */
59 	0xcc, /* sec-ws-key1 */
60 	0xcc, /* sec-ws-key2 */
61 	0xcc, /* sec-ws-protocol */
62 	0xcc, /* sec-ws-accept */
63 	0xcc, /* sec-ws-nonce */
64 	0xff, /* http/1.1 */
65 	0xf0, /* http2-settings */
66 	0xff, /* accept */
67 	0xaa, /* access-control-req-hdrs */
68 	0xff, /* if-modified-since */
69 	0xff, /* if-none-match */
70 	0xff, /* accept-encoding */
71 	0xff, /* accept-language */
72 	0xff, /* pragma */
73 	0xff, /* cache-control */
74 	0xff, /* authorization */
75 	0xff, /* cookie */
76 	0xff, /* content-length */
77 	0xff, /* content-type */
78 	0xff, /* date */
79 	0xff, /* range */
80 	0xfa, /* referer */
81 	0xcc, /* sec-ws-key */
82 	0xcc, /* sec-ws-version */
83 	0xcc, /* sec-sc-origin */
84 	0xf0, /* authority */
85 	0xf0, /* method */
86 	0xf0, /* path */
87 	0xf0, /* scheme */
88 	0xf0, /* status */
89 	0xfa, /* accept-charset */
90 	0xff, /* accept-ranges */
91 	0xfa, /* access-control-allow-origin */
92 	0xff, /* age */
93 	0xff, /* allow */
94 	0xff, /* content-disposition */
95 	0xff, /* content-encoding */
96 	0xff, /* content-language */
97 	0xff, /* content-location */
98 	0xff, /* content-range */
99 	0xff, /* etag */
100 	0xff, /* expect */
101 	0xff, /* expires */
102 	0xff, /* from */
103 	0xff, /* if-match */
104 	0xff, /* if-range */
105 	0xff, /* if-unmodified-since */
106 	0xff, /* last-modified */
107 	0xff, /* link */
108 	0xff, /* location */
109 	0xfa, /* max-forwards */
110 	0xfa, /* proxy-authenticate */
111 	0xfa, /* proxy-authorization */
112 	0xff, /* refresh */
113 	0xff, /* retry-after */
114 	0xff, /* server */
115 	0xff, /* set-cookie */
116 	0xfa, /* strict-transport-security */
117 	0xff, /* transfer-encoding */
118 	0xfa, /* user-agent */
119 	0xfa, /* vary */
120 	0xfa, /* via */
121 	0xfa, /* www-authenticate */
122 	0xaa, /* patch */
123 	0xaa, /* put */
124 	0xaa, /* delete */
125 	0xff, /* uri-args */
126 	0xaa, /* proxy */
127 	0xaa, /* x-real-ip */
128 	0xff, /* http/1.0 */
129 	0xff, /* x-forwarded-for */
130 	0xff, /* connect */
131 	0xff, /* head */
132 	0xfa, /* te */
133 	0xfa, /* replay-nonce */
134 	0xf0, /* protocol */
135 	0xff, /* x-auth-token */
136 	0xff /* not matchable */
137 };
138 
139 static unsigned char lws_header_implies_psuedoheader_map[] = {
140 	0x07, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00 /* <-64 */,
141 	0x0e /* <- 72 */, 0x04 /* <- 80 */, 0, 0, 0, 0
142 };
143 
144 /*
145  * b7 = 0 = 1-byte seq
146  *	    0x08 = fail
147  *	    2-byte seq
148  *	    0x00 - 0x07, then terminal as given in 2nd byte
149 	    3-byte seq
150  *	    no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
151  *    = 1 = 1-byte seq
152  *	    no match: die, match go fwd 1 byte
153  */
154 
155 unsigned char lextable[][2000] = {
156 	{
157 			#include "lextable.h"
158 	},
159 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
160 	{
161 			#include "lextable.h"
162 	},
163 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
164 #define LWS_ROLE_WS 1
165 	{
166 			#include "lextable.h"
167 	},
168 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
169 	{
170 			#include "lextable.h"
171 	},
172 #undef LWS_ROLE_WS
173 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
174 #define LWS_ROLE_H2 1
175 	{
176 			#include "lextable.h"
177 	},
178 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
179 	{
180 			#include "lextable.h"
181 	},
182 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
183 #define LWS_ROLE_WS 1
184 	{
185 			#include "lextable.h"
186 	},
187 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
188 	{
189 			#include "lextable.h"
190 	},
191 };
192 
193 #define PARALLEL 30
194 
195 struct state {
196 	char c[PARALLEL];
197 	int state[PARALLEL];
198 	int count;
199 	int bytepos;
200 
201 	int real_pos;
202 };
203 
204 static unsigned char pseudomap[8][16];
205 
206 struct state state[1000];
207 int next = 1;
208 
209 #define FAIL_CHAR 0x08
210 
lextable_decode(int version,int pos,char c)211 int lextable_decode(int version, int pos, char c)
212 {
213 	while (1) {
214 		if (lextable[version][pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
215 			if ((lextable[version][pos] & 0x7f) != c)
216 				return -1;
217 			/* fall thru */
218 			pos++;
219 			if (lextable[version][pos] == FAIL_CHAR)
220 				return -1;
221 			return pos;
222 		} else { /* b7 = 0, end or 3-byte */
223 			if (lextable[version][pos] < FAIL_CHAR) /* terminal marker */
224 				return pos;
225 
226 			if (lextable[version][pos] == c) /* goto */
227 				return pos + (lextable[version][pos + 1]) +
228 						(lextable[version][pos + 2] << 8);
229 			/* fall thru goto */
230 			pos += 3;
231 			/* continue */
232 		}
233 	}
234 }
235 
issue(int version)236 int issue(int version)
237 {
238 	const char *rset[200];
239 	int n = 0;
240 	int m;
241 	int prev;
242 	int walk;
243 	int saw;
244 	int y;
245 	int j;
246 	int pos = 0;
247 
248 	int setmembers = 0;
249 
250 	memset(rset, 0, sizeof(rset));
251 
252 	printf("#if %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
253 		 "%cdefined(LWS_ROLE_WS) && "
254 		 "%cdefined(LWS_ROLE_H2)\n", version & 1 ? ' ' : '!',
255 		     version & 2 ? ' ' : '!', version & 4 ? ' ' : '!');
256 
257 	/*
258 	 * let's create version's view of the set of strings
259 	 */
260 
261 	for (n = 0; n < sizeof(set) / sizeof(set[0]); n++)
262 		if (filter_array[n] & (1 << version)) {
263 			printf("\t/* %d: %d: %s */\n", setmembers, n, set[n]);
264 			if (lws_header_implies_psuedoheader_map[n >> 3] & (1 << (n & 7)))
265 				pseudomap[version][(setmembers >> 3)] |= 1 << (setmembers & 7);
266 			rset[setmembers++] = set[n];
267 		}
268 
269 	n = 0;
270 	while (n < setmembers) {
271 
272 		m = 0;
273 		walk = 0;
274 		prev = 0;
275 
276 		if (rset[n][0] == '\0') {
277 			n++;
278 			continue;
279 		}
280 
281 		while (rset[n][m]) {
282 
283 			saw = 0;
284 			for (y = 0; y < state[walk].count; y++)
285 				if (state[walk].c[y] == rset[n][m]) {
286 					/* exists -- go forward */
287 					walk = state[walk].state[y];
288 					saw = 1;
289 					break;
290 				}
291 
292 			if (saw)
293 				goto again;
294 
295 			/* something we didn't see before */
296 
297 			state[walk].c[state[walk].count] = rset[n][m];
298 
299 			state[walk].state[state[walk].count] = next;
300 			state[walk].count++;
301 			walk = next++;
302 again:
303 			m++;
304 		}
305 
306 		state[walk].c[0] = n++;
307 		state[walk].state[0] = 0; /* terminal marker */
308 		state[walk].count = 1;
309 	}
310 
311 	walk = 0;
312 	for (n = 0; n < next; n++) {
313 		state[n].bytepos = walk;
314 		walk += (2 * state[n].count);
315 	}
316 
317 	/* compute everyone's position first */
318 
319 	pos = 0;
320 	walk = 0;
321 	for (n = 0; n < next; n++) {
322 
323 		state[n].real_pos = pos;
324 
325 		for (m = 0; m < state[n].count; m++) {
326 
327 			if (state[n].state[m] == 0)
328 				pos += 2; /* terminal marker */
329 			else { /* c is a character */
330 				if ((state[state[n].state[m]].bytepos -
331 								walk) == 2)
332 					pos++;
333 				else {
334 					pos += 3;
335 					if (m == state[n].count - 1)
336 						pos++; /* fail */
337 				}
338 			}
339 			walk += 2;
340 		}
341 	}
342 
343 	walk = 0;
344 	pos = 0;
345 	for (n = 0; n < next; n++) {
346 		for (m = 0; m < state[n].count; m++) {
347 
348 			if (!m)
349 				fprintf(stdout, "/* pos %04x: %3d */ ",
350 							  state[n].real_pos, n);
351 			else
352 				fprintf(stdout, "                    ");
353 
354 			y = state[n].c[m];
355 			saw = state[n].state[m];
356 
357 			if (saw == 0) { // c is a terminal then
358 
359 				if (y > 0x7ff) {
360 					fprintf(stderr, "terminal too big\n");
361 					return 2;
362 				}
363 
364 				fprintf(stdout, "   0x%02X, 0x%02X           "
365 					"       "
366 					"/* - terminal marker %2d - */,\n",
367 						    y >> 8, y & 0xff, y & 0x7f);
368 				pos += 2;
369 				walk += 2;
370 				continue;
371 			}
372 
373 			/* c is a character */
374 
375 			prev = y &0x7f;
376 			if (prev < 32 || prev > 126)
377 				prev = '.';
378 
379 
380 			if ((state[saw].bytepos - walk) == 2) {
381 				fprintf(stdout, "   0x%02X /* '%c' -> */,\n",
382 						y | 0x80, prev);
383 				pos++;
384 				walk += 2;
385 				continue;
386 			}
387 
388 			j = state[saw].real_pos - pos;
389 
390 			if (j > 0xffff) {
391 				fprintf(stderr,
392 				  "Jump > 64K bytes ahead (%d to %d)\n",
393 					state[n].real_pos, state[saw].real_pos);
394 				return 1;
395 			}
396 			fprintf(stdout, "   0x%02X /* '%c' */, 0x%02X, 0x%02X  "
397 				"/* (to 0x%04X state %3d) */,\n",
398 				y, prev,
399 				j & 0xff, j >> 8,
400 				state[saw].real_pos, saw);
401 			pos += 3;
402 
403 			if (m == state[n].count - 1) {
404 				fprintf(stdout,
405 				  "                       0x%02X, /* fail */\n",
406 								FAIL_CHAR);
407 				pos++; /* fail */
408 			}
409 
410 			walk += 2;
411 		}
412 	}
413 
414 	fprintf(stdout, "/* total size %d bytes */\n", pos);
415 
416 	printf("#endif\n\n");
417 
418 	/*
419 	 * Try to parse every legal input string
420 	 */
421 
422 	for (n = 0; n < setmembers; n++) {
423 		walk = 0;
424 		m = 0;
425 		y = -1;
426 
427 		if (rset[n][0] == '\0')
428 			continue;
429 
430 		fprintf(stderr, "  trying %d '%s'\n", n, rset[n]);
431 
432 		while (rset[n][m]) {
433 			walk = lextable_decode(version, walk, rset[n][m]);
434 			if (walk < 0) {
435 				fprintf(stderr, "failed\n");
436 				return 3;
437 			}
438 
439 			if (lextable[version][walk] < FAIL_CHAR) {
440 				y = (lextable[version][walk] << 8) +
441 				     lextable[version][walk + 1];
442 				break;
443 			}
444 			m++;
445 		}
446 
447 		if (y != n) {
448 			fprintf(stderr, "decode failed %d\n", y);
449 			return 4;
450 		}
451 	}
452 
453 	fprintf(stderr, "All decode OK\n");
454 
455 	return 0;
456 }
457 
main(void)458 int main(void)
459 {
460 	int m, n;
461 
462 	for (n = 0; n < 8; n++) {
463 		issue(n);
464 	}
465 
466 	printf("\n/*\n");
467 
468 	for (n = 0; n < 8; n++) {
469 
470 		printf("#if %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
471 			 "%cdefined(LWS_ROLE_WS) && "
472 			 "%cdefined(LWS_ROLE_H2)\n", n & 1 ? ' ' : '!',
473 			     n & 2 ? ' ' : '!', n & 4 ? ' ' : '!');
474 
475 		printf("static uint8_t lws_header_implies_psuedoheader_map[] = {\n\t");
476 
477 		for (m = 0; m < sizeof(pseudomap[n]); m++)
478 			printf("0x%02x,", pseudomap[n][m]);
479 
480 		printf("\n};\n");
481 
482 		printf("#endif\n");
483 	}
484 
485 	printf("*/\n");
486 
487 	fprintf(stderr, "did all the variants\n");
488 }
489