1 /*
2 * minilex.c
3 *
4 * High efficiency lexical state parser
5 *
6 * Copyright (C)2011-2020 Andy Green <andy@warmcat.com>
7 *
8 * Licensed under MIT
9 *
10 * Usage: gcc minilex.c -o minilex && ./minilex > lextable.h
11 *
12 * Run it twice to test parsing on the generated table on stderr
13 *
14 * Whoo this got a bit complicated by lws-buildtime deselection of some
15 * headers optionally. There are 3 x vars, UNCOMMON, WS, H2 so we make
16 * eight copies of the lextable selected by the appropriate #if defined()
17 */
18
19 #include <stdio.h>
20 #include <stdlib.h>
21 #include <string.h>
22
23 /* get all the strings */
24
25 #define LWS_ROLE_WS 1
26 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
27 #define LWS_ROLE_H2 1
28
29 #include "lextable-strings.h"
30
31 #undef LWS_ROLE_WS
32 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
33 #undef LWS_ROLE_H2
34
35 /* bitfield for the 8 versions as to which strings exist... index layout
36 *
37 * b0 b1 b2
38 * 0 =
39 * 1 = uncommon
40 * 2 = ws
41 * 3 = uncommon ws
42 * 4 = h2
43 * 5 = uncommon h2
44 * 6 = ws h2
45 * 7 = uncommon ws h2
46 */
47
48 unsigned char filter_array[] = {
49 0xff, /* get */
50 0xff, /* post */
51 0xaa, /* options */
52 0xff, /* host */
53 0xff, /* connection */
54 0xff, /* upgrade */
55 0xff, /* origin */
56 0xcc, /* sec-ws-draft */
57 0xff, /* crlf */
58 0xcc, /* sec-ws-ext */
59 0xcc, /* sec-ws-key1 */
60 0xcc, /* sec-ws-key2 */
61 0xcc, /* sec-ws-protocol */
62 0xcc, /* sec-ws-accept */
63 0xcc, /* sec-ws-nonce */
64 0xff, /* http/1.1 */
65 0xf0, /* http2-settings */
66 0xff, /* accept */
67 0xaa, /* access-control-req-hdrs */
68 0xff, /* if-modified-since */
69 0xff, /* if-none-match */
70 0xff, /* accept-encoding */
71 0xff, /* accept-language */
72 0xff, /* pragma */
73 0xff, /* cache-control */
74 0xff, /* authorization */
75 0xff, /* cookie */
76 0xff, /* content-length */
77 0xff, /* content-type */
78 0xff, /* date */
79 0xff, /* range */
80 0xfa, /* referer */
81 0xcc, /* sec-ws-key */
82 0xcc, /* sec-ws-version */
83 0xcc, /* sec-sc-origin */
84 0xf0, /* authority */
85 0xf0, /* method */
86 0xf0, /* path */
87 0xf0, /* scheme */
88 0xf0, /* status */
89 0xfa, /* accept-charset */
90 0xff, /* accept-ranges */
91 0xfa, /* access-control-allow-origin */
92 0xff, /* age */
93 0xff, /* allow */
94 0xff, /* content-disposition */
95 0xff, /* content-encoding */
96 0xff, /* content-language */
97 0xff, /* content-location */
98 0xff, /* content-range */
99 0xff, /* etag */
100 0xff, /* expect */
101 0xff, /* expires */
102 0xff, /* from */
103 0xff, /* if-match */
104 0xff, /* if-range */
105 0xff, /* if-unmodified-since */
106 0xff, /* last-modified */
107 0xff, /* link */
108 0xff, /* location */
109 0xfa, /* max-forwards */
110 0xfa, /* proxy-authenticate */
111 0xfa, /* proxy-authorization */
112 0xff, /* refresh */
113 0xff, /* retry-after */
114 0xff, /* server */
115 0xff, /* set-cookie */
116 0xfa, /* strict-transport-security */
117 0xff, /* transfer-encoding */
118 0xfa, /* user-agent */
119 0xfa, /* vary */
120 0xfa, /* via */
121 0xfa, /* www-authenticate */
122 0xaa, /* patch */
123 0xaa, /* put */
124 0xaa, /* delete */
125 0xff, /* uri-args */
126 0xaa, /* proxy */
127 0xaa, /* x-real-ip */
128 0xff, /* http/1.0 */
129 0xff, /* x-forwarded-for */
130 0xff, /* connect */
131 0xff, /* head */
132 0xfa, /* te */
133 0xfa, /* replay-nonce */
134 0xf0, /* protocol */
135 0xff, /* x-auth-token */
136 0xff /* not matchable */
137 };
138
139 static unsigned char lws_header_implies_psuedoheader_map[] = {
140 0x07, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x00, 0x00, 0x00 /* <-64 */,
141 0x0e /* <- 72 */, 0x24 /* <- 80 */, 0, 0, 0, 0
142 };
143
144 /*
145 * b7 = 0 = 1-byte seq
146 * 0x08 = fail
147 * 2-byte seq
148 * 0x00 - 0x07, then terminal as given in 2nd byte
149 3-byte seq
150 * no match: go fwd 3 byte, match: jump fwd by amt in +1/+2 bytes
151 * = 1 = 1-byte seq
152 * no match: die, match go fwd 1 byte
153 */
154
155 unsigned char lextable[][2000] = {
156 {
157 #include "lextable.h"
158 },
159 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
160 {
161 #include "lextable.h"
162 },
163 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
164 #define LWS_ROLE_WS 1
165 {
166 #include "lextable.h"
167 },
168 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
169 {
170 #include "lextable.h"
171 },
172 #undef LWS_ROLE_WS
173 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
174 #define LWS_ROLE_H2 1
175 {
176 #include "lextable.h"
177 },
178 #define LWS_WITH_HTTP_UNCOMMON_HEADERS
179 {
180 #include "lextable.h"
181 },
182 #undef LWS_WITH_HTTP_UNCOMMON_HEADERS
183 #define LWS_ROLE_WS 1
184 {
185 #include "lextable.h"
186 },
187 #define LWS_WITH_HTTP_UNCOMMON_HEADERS 1
188 {
189 #include "lextable.h"
190 },
191 };
192
193 #define PARALLEL 30
194
195 struct state {
196 char c[PARALLEL];
197 int state[PARALLEL];
198 int count;
199 int bytepos;
200
201 int real_pos;
202 };
203
204 static unsigned char pseudomap[8][16];
205
206 struct state state[1000];
207 int next = 1;
208
209 #define FAIL_CHAR 0x08
210
lextable_decode(int version,int pos,char c)211 int lextable_decode(int version, int pos, char c)
212 {
213 while (1) {
214 if (lextable[version][pos] & (1 << 7)) { /* 1-byte, fail on mismatch */
215 if ((lextable[version][pos] & 0x7f) != c)
216 return -1;
217 /* fall thru */
218 pos++;
219 if (lextable[version][pos] == FAIL_CHAR)
220 return -1;
221 return pos;
222 } else { /* b7 = 0, end or 3-byte */
223 if (lextable[version][pos] < FAIL_CHAR) /* terminal marker */
224 return pos;
225
226 if (lextable[version][pos] == c) /* goto */
227 return pos + (lextable[version][pos + 1]) +
228 (lextable[version][pos + 2] << 8);
229 /* fall thru goto */
230 pos += 3;
231 /* continue */
232 }
233 }
234 }
235
issue(int version)236 int issue(int version)
237 {
238 const char *rset[200];
239 int n = 0;
240 int m;
241 int prev;
242 int walk;
243 int saw;
244 int y;
245 int j;
246 int pos = 0;
247
248 int setmembers = 0;
249
250 memset(rset, 0, sizeof(rset));
251
252 if (version == 7)
253 printf("#if defined(LWS_HTTP_HEADERS_ALL) || (%cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
254 "%cdefined(LWS_ROLE_WS) && "
255 "%cdefined(LWS_ROLE_H2))\n", version & 1 ? ' ' : '!',
256 version & 2 ? ' ' : '!', version & 4 ? ' ' : '!');
257 else
258 printf("#if !defined(LWS_HTTP_HEADERS_ALL) && %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
259 "%cdefined(LWS_ROLE_WS) && "
260 "%cdefined(LWS_ROLE_H2)\n", version & 1 ? ' ' : '!',
261 version & 2 ? ' ' : '!', version & 4 ? ' ' : '!');
262
263 /*
264 * let's create version's view of the set of strings
265 */
266
267 for (n = 0; n < sizeof(set) / sizeof(set[0]); n++)
268 if (filter_array[n] & (1 << version)) {
269 printf("\t/* %d: %d: %s */\n", setmembers, n, set[n]);
270 if (lws_header_implies_psuedoheader_map[n >> 3] & (1 << (n & 7)))
271 pseudomap[version][(setmembers >> 3)] |= 1 << (setmembers & 7);
272 rset[setmembers++] = set[n];
273 }
274
275 n = 0;
276 while (n < setmembers) {
277
278 m = 0;
279 walk = 0;
280 prev = 0;
281
282 if (rset[n][0] == '\0') {
283 n++;
284 continue;
285 }
286
287 while (rset[n][m]) {
288
289 saw = 0;
290 for (y = 0; y < state[walk].count; y++)
291 if (state[walk].c[y] == rset[n][m]) {
292 /* exists -- go forward */
293 walk = state[walk].state[y];
294 saw = 1;
295 break;
296 }
297
298 if (saw)
299 goto again;
300
301 /* something we didn't see before */
302
303 state[walk].c[state[walk].count] = rset[n][m];
304
305 state[walk].state[state[walk].count] = next;
306 state[walk].count++;
307 walk = next++;
308 again:
309 m++;
310 }
311
312 state[walk].c[0] = n++;
313 state[walk].state[0] = 0; /* terminal marker */
314 state[walk].count = 1;
315 }
316
317 walk = 0;
318 for (n = 0; n < next; n++) {
319 state[n].bytepos = walk;
320 walk += (2 * state[n].count);
321 }
322
323 /* compute everyone's position first */
324
325 pos = 0;
326 walk = 0;
327 for (n = 0; n < next; n++) {
328
329 state[n].real_pos = pos;
330
331 for (m = 0; m < state[n].count; m++) {
332
333 if (state[n].state[m] == 0)
334 pos += 2; /* terminal marker */
335 else { /* c is a character */
336 if ((state[state[n].state[m]].bytepos -
337 walk) == 2)
338 pos++;
339 else {
340 pos += 3;
341 if (m == state[n].count - 1)
342 pos++; /* fail */
343 }
344 }
345 walk += 2;
346 }
347 }
348
349 walk = 0;
350 pos = 0;
351 for (n = 0; n < next; n++) {
352 for (m = 0; m < state[n].count; m++) {
353
354 if (!m)
355 fprintf(stdout, "/* pos %04x: %3d */ ",
356 state[n].real_pos, n);
357 else
358 fprintf(stdout, " ");
359
360 y = state[n].c[m];
361 saw = state[n].state[m];
362
363 if (saw == 0) { // c is a terminal then
364
365 if (y > 0x7ff) {
366 fprintf(stderr, "terminal too big\n");
367 return 2;
368 }
369
370 fprintf(stdout, " 0x%02X, 0x%02X "
371 " "
372 "/* - terminal marker %2d - */,\n",
373 y >> 8, y & 0xff, y & 0x7f);
374 pos += 2;
375 walk += 2;
376 continue;
377 }
378
379 /* c is a character */
380
381 prev = y &0x7f;
382 if (prev < 32 || prev > 126)
383 prev = '.';
384
385
386 if ((state[saw].bytepos - walk) == 2) {
387 fprintf(stdout, " 0x%02X /* '%c' -> */,\n",
388 y | 0x80, prev);
389 pos++;
390 walk += 2;
391 continue;
392 }
393
394 j = state[saw].real_pos - pos;
395
396 if (j > 0xffff) {
397 fprintf(stderr,
398 "Jump > 64K bytes ahead (%d to %d)\n",
399 state[n].real_pos, state[saw].real_pos);
400 return 1;
401 }
402 fprintf(stdout, " 0x%02X /* '%c' */, 0x%02X, 0x%02X "
403 "/* (to 0x%04X state %3d) */,\n",
404 y, prev,
405 j & 0xff, j >> 8,
406 state[saw].real_pos, saw);
407 pos += 3;
408
409 if (m == state[n].count - 1) {
410 fprintf(stdout,
411 " 0x%02X, /* fail */\n",
412 FAIL_CHAR);
413 pos++; /* fail */
414 }
415
416 walk += 2;
417 }
418 }
419
420 fprintf(stdout, "/* total size %d bytes */\n", pos);
421
422 printf("#endif\n\n");
423
424 /*
425 * Try to parse every legal input string
426 */
427
428 for (n = 0; n < setmembers; n++) {
429 walk = 0;
430 m = 0;
431 y = -1;
432
433 if (rset[n][0] == '\0')
434 continue;
435
436 fprintf(stderr, " trying %d '%s'\n", n, rset[n]);
437
438 while (rset[n][m]) {
439 walk = lextable_decode(version, walk, rset[n][m]);
440 if (walk < 0) {
441 fprintf(stderr, "failed\n");
442 return 3;
443 }
444
445 if (lextable[version][walk] < FAIL_CHAR) {
446 y = (lextable[version][walk] << 8) +
447 lextable[version][walk + 1];
448 break;
449 }
450 m++;
451 }
452
453 if (y != n) {
454 fprintf(stderr, "decode failed %d\n", y);
455 return 4;
456 }
457 }
458
459 fprintf(stderr, "All decode OK\n");
460
461 return 0;
462 }
463
main(void)464 int main(void)
465 {
466 int m, n;
467
468 for (n = 0; n < 8; n++) {
469 issue(n);
470 }
471
472 printf("\n/*\n");
473
474 for (n = 0; n < 8; n++) {
475
476 if (n == 7)
477 printf("#if defined(LWS_HTTP_HEADERS_ALL) || (%cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
478 "%cdefined(LWS_ROLE_WS) && "
479 "%cdefined(LWS_ROLE_H2))\n", n & 1 ? ' ' : '!',
480 n & 2 ? ' ' : '!', n & 4 ? ' ' : '!');
481 else
482 printf("#if !defined(LWS_HTTP_HEADERS_ALL) && %cdefined(LWS_WITH_HTTP_UNCOMMON_HEADERS) && "
483 "%cdefined(LWS_ROLE_WS) && "
484 "%cdefined(LWS_ROLE_H2)\n", n & 1 ? ' ' : '!',
485 n & 2 ? ' ' : '!', n & 4 ? ' ' : '!');
486
487 printf("static uint8_t lws_header_implies_psuedoheader_map[] = {\n\t");
488
489 for (m = 0; m < sizeof(pseudomap[n]); m++)
490 printf("0x%02x,", pseudomap[n][m]);
491
492 printf("\n};\n");
493
494 printf("#endif\n");
495 }
496
497 printf("*/\n");
498
499 fprintf(stderr, "did all the variants\n");
500 }
501