• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2  * libwebsockets - small server side websockets and web server implementation
3  *
4  * Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22  * IN THE SOFTWARE.
23  *
24  * The functions allow
25  *
26  *  - collecting a concordance of strings from one or more files (eg, a
27  *    directory of files) into a single in-memory, lac-backed trie;
28  *
29  *  - to optimize and serialize the in-memory trie to an fd;
30  *
31  *  - to very quickly report any instances of a string in any of the files
32  *    indexed by the trie, by a seeking around a serialized trie fd, without
33  *    having to load it all in memory
34  */
35 
36 #include "private-lib-core.h"
37 #include "private-lib-misc-fts.h"
38 
39 #include <stdio.h>
40 #include <string.h>
41 #include <assert.h>
42 #include <fcntl.h>
43 #include <errno.h>
44 #include <sys/types.h>
45 
46 struct lws_fts_entry;
47 
48 /* notice these are stored in t->lwsac_input_head which has input file scope */
49 
50 struct lws_fts_filepath {
51 	struct lws_fts_filepath *next;
52 	struct lws_fts_filepath *prev;
53 	char filepath[256];
54 	jg2_file_offset ofs;
55 	jg2_file_offset line_table_ofs;
56 	int filepath_len;
57 	int file_index;
58 	int total_lines;
59 	int priority;
60 };
61 
62 /* notice these are stored in t->lwsac_input_head which has input file scope */
63 
64 struct lws_fts_lines {
65 	struct lws_fts_lines *lines_next;
66 	/*
67 	 * amount of line numbers needs to meet average count for best
68 	 * efficiency.
69 	 *
70 	 * Line numbers are stored in VLI format since if we don't, around half
71 	 * the total lac allocation consists of struct lws_fts_lines...
72 	 * size chosen to maintain 8-byte struct alignment
73 	 */
74 	uint8_t vli[119];
75 	char count;
76 };
77 
78 /* this represents the instances of a symbol inside a given filepath */
79 
80 struct lws_fts_instance_file {
81 	/* linked-list of tifs generated for current file */
82 	struct lws_fts_instance_file *inst_file_next;
83 	struct lws_fts_entry *owner;
84 	struct lws_fts_lines *lines_list, *lines_tail;
85 	uint32_t file_index;
86 	uint32_t total;
87 
88 	/*
89 	 * optimization for the common case there's only 1 - ~3 matches, so we
90 	 * don't have to allocate any lws_fts_lines struct
91 	 *
92 	 * Using 8 bytes total for this maintains 8-byte struct alignment...
93 	 */
94 
95 	uint8_t vli[7];
96 	char count;
97 };
98 
99 /*
100  * this is the main trie in-memory allocation object
101  */
102 
103 struct lws_fts_entry {
104 	struct lws_fts_entry *parent;
105 
106 	struct lws_fts_entry *child_list;
107 	struct lws_fts_entry *sibling;
108 
109 	/*
110 	 * care... this points to content in t->lwsac_input_head, it goes
111 	 * out of scope when the input file being indexed completes
112 	 */
113 	struct lws_fts_instance_file *inst_file_list;
114 
115 	jg2_file_offset ofs_last_inst_file;
116 
117 	char *suffix; /* suffix string or NULL if one char (in .c) */
118 	jg2_file_offset ofs;
119 	uint32_t child_count;
120 	uint32_t instance_count;
121 	uint32_t agg_inst_count;
122 	uint32_t agg_child_count;
123 	uint32_t suffix_len;
124 	unsigned char c;
125 };
126 
127 /* there's only one of these per trie file */
128 
129 struct lws_fts {
130 	struct lwsac *lwsac_head;
131 	struct lwsac *lwsac_input_head;
132 	struct lws_fts_entry *root;
133 	struct lws_fts_filepath *filepath_list;
134 	struct lws_fts_filepath *fp;
135 
136 	struct lws_fts_entry *parser;
137 	struct lws_fts_entry *root_lookup[256];
138 
139 	/*
140 	 * head of linked-list of tifs generated for current file
141 	 * care... this points to content in t->lwsac_input_head
142 	 */
143 	struct lws_fts_instance_file *tif_list;
144 
145 	jg2_file_offset c; /* length of output file so far */
146 
147 	uint64_t agg_trie_creation_us;
148 	uint64_t agg_raw_input;
149 	uint64_t worst_lwsac_input_size;
150 	int last_file_index;
151 	int chars_in_line;
152 	jg2_file_offset last_block_len_ofs;
153 	int line_number;
154 	int lines_in_unsealed_linetable;
155 	int next_file_index;
156 	int count_entries;
157 
158 	int fd;
159 	unsigned int agg_pos;
160 	unsigned int str_match_pos;
161 
162 	unsigned char aggregate;
163 	unsigned char agg[128];
164 };
165 
166 /* since the kernel case allocates >300MB, no point keeping this too low */
167 
168 #define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
169 
170 #define spill(margin, force) \
171 	if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || (force))) { \
172 		if (write(t->fd, buf, bp) != bp) { \
173 			lwsl_err("%s: write %d failed (%d)\n", __func__, \
174 				 bp, errno); \
175 			return 1; \
176 		} \
177 		t->c += bp; \
178 		bp = 0; \
179 	}
180 
181 static int
g32(unsigned char * b,uint32_t d)182 g32(unsigned char *b, uint32_t d)
183 {
184 	*b++ = (d >> 24) & 0xff;
185 	*b++ = (d >> 16) & 0xff;
186 	*b++ = (d >> 8) & 0xff;
187 	*b = d & 0xff;
188 
189 	return 4;
190 }
191 
192 static int
g16(unsigned char * b,int d)193 g16(unsigned char *b, int d)
194 {
195 	*b++ = (d >> 8) & 0xff;
196 	*b = d & 0xff;
197 
198 	return 2;
199 }
200 
201 static int
wq32(unsigned char * b,uint32_t d)202 wq32(unsigned char *b, uint32_t d)
203 {
204 	unsigned char *ob = b;
205 
206 	if (d > (1 << 28) - 1)
207 		*b++ = ((d >> 28) | 0x80) & 0xff;
208 
209 	if (d > (1 << 21) - 1)
210 		*b++ = ((d >> 21) | 0x80) & 0xff;
211 
212 	if (d > (1 << 14) - 1)
213 		*b++ = ((d >> 14) | 0x80) & 0xff;
214 
215 	if (d > (1 << 7) - 1)
216 		*b++ = ((d >> 7) | 0x80) & 0xff;
217 
218 	*b++ = d & 0x7f;
219 
220 	return (int)(b - ob);
221 }
222 
223 
224 /* read a VLI, return the number of bytes used */
225 
226 int
rq32(unsigned char * b,uint32_t * d)227 rq32(unsigned char *b, uint32_t *d)
228 {
229 	unsigned char *ob = b;
230 	uint32_t t = 0;
231 
232 	t = *b & 0x7f;
233 	if (*(b++) & 0x80) {
234 		t = (t << 7) | (*b & 0x7f);
235 		if (*(b++) & 0x80) {
236 			t = (t << 7) | (*b & 0x7f);
237 			if (*(b++) & 0x80) {
238 				t = (t << 7) | (*b & 0x7f);
239 				if (*(b++) & 0x80) {
240 					t = (t << 7) | (*b & 0x7f);
241 					b++;
242 				}
243 			}
244 		}
245 	}
246 
247 	*d = t;
248 
249 	return (int)(b - ob);
250 }
251 
252 struct lws_fts *
lws_fts_create(int fd)253 lws_fts_create(int fd)
254 {
255 	struct lws_fts *t;
256 	struct lwsac *lwsac_head = NULL;
257 	unsigned char buf[TRIE_FILE_HDR_SIZE];
258 
259 	t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
260 	if (!t)
261 		return NULL;
262 
263 	memset(t, 0, sizeof(*t));
264 
265 	t->fd = fd;
266 	t->lwsac_head = lwsac_head;
267 	t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
268 			    TRIE_LWSAC_BLOCK_SIZE);
269 	if (!t->root)
270 		goto unwind;
271 
272 	memset(t->root, 0, sizeof(*t->root));
273 	t->parser = t->root;
274 	t->last_file_index = -1;
275 	t->line_number = 1;
276 	t->filepath_list = NULL;
277 
278 	memset(t->root_lookup, 0, sizeof(*t->root_lookup));
279 
280 	/* write the header */
281 
282 	buf[0] = 0xca;
283 	buf[1] = 0x7a;
284 	buf[2] = 0x5f;
285 	buf[3] = 0x75;
286 
287 	/* (these are filled in with correct data at the end) */
288 
289 	/* file offset to root trie entry */
290 	g32(&buf[4], 0);
291 	/* file length when it was created */
292 	g32(&buf[8], 0);
293 	/* fileoffset to the filepath table */
294 	g32(&buf[0xc], 0);
295 	/* count of filepaths */
296 	g32(&buf[0x10], 0);
297 
298 	if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
299 		lwsl_err("%s: trie header write failed\n", __func__);
300 		goto unwind;
301 	}
302 
303 	t->c = TRIE_FILE_HDR_SIZE;
304 
305 	return t;
306 
307 unwind:
308 	lwsac_free(&lwsac_head);
309 
310 	return NULL;
311 }
312 
313 void
lws_fts_destroy(struct lws_fts ** trie)314 lws_fts_destroy(struct lws_fts **trie)
315 {
316 	struct lwsac *lwsac_head = (*trie)->lwsac_head;
317 	lwsac_free(&(*trie)->lwsac_input_head);
318 	lwsac_free(&lwsac_head);
319 	*trie = NULL;
320 }
321 
322 int
lws_fts_file_index(struct lws_fts * t,const char * filepath,int filepath_len,int priority)323 lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
324 		    int priority)
325 {
326 	struct lws_fts_filepath *fp = t->filepath_list;
327 #if 0
328 	while (fp) {
329 		if (fp->filepath_len == filepath_len &&
330 		    !strcmp(fp->filepath, filepath))
331 			return fp->file_index;
332 
333 		fp = fp->next;
334 	}
335 #endif
336 	fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
337 	if (!fp)
338 		return -1;
339 
340 	fp->next = t->filepath_list;
341 	t->filepath_list = fp;
342 	strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
343 	fp->filepath[sizeof(fp->filepath) - 1] = '\0';
344 	fp->filepath_len = filepath_len;
345 	fp->file_index = t->next_file_index++;
346 	fp->line_table_ofs = t->c;
347 	fp->priority = priority;
348 	fp->total_lines = 0;
349 	t->fp = fp;
350 
351 	return fp->file_index;
352 }
353 
354 static struct lws_fts_entry *
lws_fts_entry_child_add(struct lws_fts * t,unsigned char c,struct lws_fts_entry * parent)355 lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
356 			struct lws_fts_entry *parent)
357 {
358 	struct lws_fts_entry *e, **pe;
359 
360 	e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
361 	if (!e)
362 		return NULL;
363 
364 	memset(e, 0, sizeof(*e));
365 
366 	e->c = c;
367 	parent->child_count++;
368 	e->parent = parent;
369 	t->count_entries++;
370 
371 	/* keep the parent child list in ascending sort order for c */
372 
373 	pe = &parent->child_list;
374 	while (*pe) {
375 		assert((*pe)->parent == parent);
376 		if ((*pe)->c > c) {
377 			/* add it before */
378 			e->sibling = *pe;
379 			*pe = e;
380 			break;
381 		}
382 		pe = &(*pe)->sibling;
383 	}
384 
385 	if (!*pe) {
386 		/* add it at the end */
387 		e->sibling = NULL;
388 		*pe = e;
389 	}
390 
391 	return e;
392 }
393 
394 static int
finalize_per_input(struct lws_fts * t)395 finalize_per_input(struct lws_fts *t)
396 {
397 	struct lws_fts_instance_file *tif;
398 	unsigned char buf[8192];
399 	uint64_t lwsac_input_size;
400 	jg2_file_offset temp;
401 	int bp = 0;
402 
403 	bp += g16(&buf[bp], 0);
404 	bp += g16(&buf[bp], 0);
405 	bp += g32(&buf[bp], 0);
406 	if (write(t->fd, buf, bp) != bp)
407 		return 1;
408 	t->c += bp;
409 	bp = 0;
410 
411 	/*
412 	 * Write the generated file index + instances (if any)
413 	 *
414 	 * Notice the next same-parent file instance fileoffset list is
415 	 * backwards, so it does not require seeks to fill in.  The first
416 	 * entry has 0 but the second entry points to the first entry (whose
417 	 * fileoffset is known).
418 	 *
419 	 * After all the file instance structs are finalized,
420 	 * .ofs_last_inst_file contains the fileoffset of that child's tif
421 	 * list head in the file.
422 	 *
423 	 * The file instances are written to disk in the order that the files
424 	 * were indexed, along with their prev pointers inline.
425 	 */
426 
427 	tif = t->tif_list;
428 	while (tif) {
429 		struct lws_fts_lines *i;
430 
431 		spill((3 * MAX_VLI) + tif->count, 0);
432 
433 		temp = tif->owner->ofs_last_inst_file;
434 		if (tif->total)
435 			tif->owner->ofs_last_inst_file = t->c + bp;
436 
437 		assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
438 
439 		/* fileoffset of prev instance file for this entry, or 0 */
440 		bp += wq32(&buf[bp], temp);
441 		bp += wq32(&buf[bp], tif->file_index);
442 		bp += wq32(&buf[bp], tif->total);
443 
444 		/* remove any pointers into this disposable lac footprint */
445 		tif->owner->inst_file_list = NULL;
446 
447 		memcpy(&buf[bp], &tif->vli, tif->count);
448 		bp += tif->count;
449 
450 		i = tif->lines_list;
451 		while (i) {
452 			spill(i->count, 0);
453 			memcpy(&buf[bp], &i->vli, i->count);
454 			bp += i->count;
455 
456 			i = i->lines_next;
457 		}
458 
459 		tif = tif->inst_file_next;
460 	}
461 
462 	spill(0, 1);
463 
464 	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
465 
466 	if (t->lwsac_input_head) {
467 		lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
468 		if (lwsac_input_size > t->worst_lwsac_input_size)
469 			t->worst_lwsac_input_size = lwsac_input_size;
470 	}
471 
472 	/*
473 	 * those per-file allocations are all on a separate lac so we can
474 	 * free it cleanly afterwards
475 	 */
476 	lwsac_free(&t->lwsac_input_head);
477 
478 	/* and lose the pointer into the deallocated lac */
479 	t->tif_list = NULL;
480 
481 	return 0;
482 }
483 
484 /*
485  * 0 = punctuation, whitespace, brackets etc
486  * 1 = character inside symbol set
487  * 2 = upper-case character inside symbol set
488  */
489 
490 static char classify[] = {
491 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 	1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
495 	0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
496 	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
497 	0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
498 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
499 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
500 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
501 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
503 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
504 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
505 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
506 	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
507 };
508 
509 #if 0
510 static const char *
511 name_entry(struct lws_fts_entry *e1, char *s, int len)
512 {
513 	struct lws_fts_entry *e2;
514 	int n = len;
515 
516 	s[--n] = '\0';
517 
518 	e2 = e1;
519 	while (e2) {
520 		if (e2->suffix) {
521 			if ((int)e2->suffix_len < n) {
522 				n -= e2->suffix_len;
523 				memcpy(&s[n], e2->suffix, e2->suffix_len);
524 			}
525 		} else {
526 			n--;
527 			s[n] = e2->c;
528 		}
529 
530 		e2 = e2->parent;
531 	}
532 
533 	return &s[n + 1];
534 }
535 #endif
536 
537 /*
538  * as we parse the input, we create a line length table for the file index.
539  * Only the file header has been written before we start doing this.
540  */
541 
542 int
lws_fts_fill(struct lws_fts * t,uint32_t file_index,const char * buf,size_t len)543 lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
544 	     size_t len)
545 {
546 	unsigned long long tf = lws_now_usecs();
547 	unsigned char c, linetable[256], vlibuf[8];
548 	struct lws_fts_entry *e, *e1, *dcl;
549 	struct lws_fts_instance_file *tif;
550 	int bp = 0, sline, chars, m;
551 	char *osuff, skipline = 0;
552 	struct lws_fts_lines *tl;
553 	unsigned int olen, n;
554 	off_t lbh;
555 
556 	if ((int)file_index != t->last_file_index) {
557 		if (t->last_file_index >= 0)
558 			finalize_per_input(t);
559 		t->last_file_index = file_index;
560 		t->line_number = 1;
561 		t->chars_in_line = 0;
562 		t->lines_in_unsealed_linetable = 0;
563 	}
564 
565 	t->agg_raw_input += len;
566 
567 resume:
568 
569 	chars = 0;
570 	lbh = t->c;
571 	sline = t->line_number;
572 	bp += g16(&linetable[bp], 0);
573 	bp += g16(&linetable[bp], 0);
574 	bp += g32(&linetable[bp], 0);
575 
576 	while (len) {
577 		char go_around = 0;
578 
579 		if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
580 			break;
581 
582 		len--;
583 
584 		c = (unsigned char)*buf++;
585 		t->chars_in_line++;
586 		if (c == '\n') {
587 			skipline = 0;
588 			t->filepath_list->total_lines++;
589 			t->lines_in_unsealed_linetable++;
590 			t->line_number++;
591 
592 			bp += wq32(&linetable[bp], t->chars_in_line);
593 			if ((unsigned int)bp > sizeof(linetable) - 6) {
594 				if (write(t->fd, linetable, bp) != bp) {
595 					lwsl_err("%s: linetable write failed\n",
596 							__func__);
597 					return 1;
598 				}
599 				t->c += bp;
600 				bp = 0;
601 				// assert(lseek(t->fd, 0, SEEK_END) == t->c);
602 			}
603 
604 			chars += t->chars_in_line;
605 			t->chars_in_line = 0;
606 
607 			/*
608 			 * Detect overlength lines and skip them (eg, BASE64
609 			 * in css etc)
610 			 */
611 
612 			if (len > 200) {
613 				n = 0;
614 				m = 0;
615 				while (n < 200 && m < 80 && buf[n] != '\n') {
616 				       if (buf[n] == ' ' || buf[n] == '\t')
617 					       m = 0;
618 					n++;
619 					m++;
620 				}
621 
622 				/* 80 lines no whitespace, or >=200-char line */
623 
624 				if (m == 80 || n == 200)
625 					skipline = 1;
626 			}
627 
628 			goto seal;
629 		}
630 		if (skipline)
631 			continue;
632 
633 		m = classify[(int)c];
634 		if (!m)
635 			goto seal;
636 		if (m == 2)
637 			c += 'a' - 'A';
638 
639 		if (t->aggregate) {
640 
641 			/*
642 			 * We created a trie entry for an earlier char in this
643 			 * symbol already.  So we know at the moment, any
644 			 * further chars in the symbol are the only children.
645 			 *
646 			 * Aggregate them and add them as a string suffix to
647 			 * the trie symbol at the end (when we know how much to
648 			 * allocate).
649 			 */
650 
651 			if (t->agg_pos < sizeof(t->agg) - 1)
652 				/* symbol is not too long to stash */
653 				t->agg[t->agg_pos++] = c;
654 
655 			continue;
656 		}
657 
658 		if (t->str_match_pos) {
659 			go_around = 1;
660 			goto seal;
661 		}
662 
663 		/* zeroth-iteration child matching */
664 
665 		if (t->parser == t->root) {
666 			e = t->root_lookup[(int)c];
667 			if (e) {
668 				t->parser = e;
669 				continue;
670 			}
671 		} else {
672 
673 			/* look for the char amongst the children */
674 
675 			e = t->parser->child_list;
676 			while (e) {
677 
678 				/* since they're alpha ordered... */
679 				if (e->c > c) {
680 					e = NULL;
681 					break;
682 				}
683 				if (e->c == c) {
684 					t->parser = e;
685 
686 					if (e->suffix)
687 						t->str_match_pos = 1;
688 
689 					break;
690 				}
691 
692 				e = e->sibling;
693 			}
694 
695 			if (e)
696 				continue;
697 		}
698 
699 		/*
700 		 * we are blazing a new trail, add a new child representing
701 		 * the whole suffix that couldn't be matched until now.
702 		 */
703 
704 		e = lws_fts_entry_child_add(t, c, t->parser);
705 		if (!e) {
706 			lwsl_err("%s: lws_fts_entry_child_add failed\n",
707 					__func__);
708 			return 1;
709 		}
710 
711 		/* if it's the root node, keep the root_lookup table in sync */
712 
713 		if (t->parser == t->root)
714 			t->root_lookup[(int)c] = e;
715 
716 		/* follow the new path */
717 		t->parser = e;
718 
719 		{
720 			struct lws_fts_entry **pe = &e->child_list;
721 			while (*pe) {
722 				assert((*pe)->parent == e);
723 
724 				pe = &(*pe)->sibling;
725 			}
726 		}
727 
728 		/*
729 		 * If there are any more symbol characters coming, just
730 		 * create a suffix string on t->parser instead of what must
731 		 * currently be single-child nodes, since we just created e
732 		 * as a child with a single character due to no existing match
733 		 * on that single character... so if no match on 'h' with this
734 		 * guy's parent, we created e that matches on the single char
735 		 * 'h'.  If the symbol continues ... 'a' 'p' 'p' 'y', then
736 		 * instead of creating singleton child nodes under e,
737 		 * modify e to match on the whole string suffix "happy".
738 		 *
739 		 * If later "hoppy" appears, we will remove the suffix on e,
740 		 * so it reverts to a char match for 'h', add singleton children
741 		 * for 'a' and 'o', and attach a "ppy" suffix child to each of
742 		 * those.
743 		 *
744 		 * We want to do this so we don't have to allocate trie entries
745 		 * for every char in the string to save memory and consequently
746 		 * time.
747 		 *
748 		 * Don't try this optimization if the parent is the root node...
749 		 * it's not compatible with it's root_lookup table and it's
750 		 * highly likely children off the root entry are going to have
751 		 * to be fragmented.
752 		 */
753 
754 		if (e->parent != t->root) {
755 			t->aggregate = 1;
756 			t->agg_pos = 0;
757 		}
758 
759 		continue;
760 
761 seal:
762 		if (t->str_match_pos) {
763 
764 			/*
765 			 * We're partway through matching an elaborated string
766 			 * on a child, not just a character.  String matches
767 			 * only exist when we met a child entry that only had
768 			 * one path until now... so we had an 'h', and the
769 			 * only child had a string "hello".
770 			 *
771 			 * We are following the right path and will not need
772 			 * to back up, but we may find as we go we have the
773 			 * first instance of a second child path, eg, "help".
774 			 *
775 			 * When we get to the 'p', we have to split what was
776 			 * the only string option "hello" into "hel" and then
777 			 * two child entries, for "lo" and 'p'.
778 			 */
779 
780 			if (c == t->parser->suffix[t->str_match_pos++]) {
781 				if (t->str_match_pos < t->parser->suffix_len)
782 					continue;
783 
784 				/*
785 				 * We simply matched everything, continue
786 				 * parsing normally from this trie entry.
787 				 */
788 
789 				t->str_match_pos = 0;
790 				continue;
791 			}
792 
793 			/*
794 			 * So... we hit a mismatch somewhere... it means we
795 			 * have to split this string entry.
796 			 *
797 			 * We know the first char actually matched in order to
798 			 * start down this road.  So for the current trie entry,
799 			 * we need to truncate his suffix at the char before
800 			 * this mismatched one, where we diverged (if the
801 			 * second char, simply remove the suffix string from the
802 			 * current trie entry to turn it back to a 1-char match)
803 			 *
804 			 * The original entry, which becomes the lhs post-split,
805 			 * is t->parser.
806 			 */
807 
808 			olen = t->parser->suffix_len;
809 			osuff = t->parser->suffix;
810 
811 			if (t->str_match_pos == 2)
812 				t->parser->suffix = NULL;
813 			else
814 				t->parser->suffix_len = t->str_match_pos - 1;
815 
816 			/*
817 			 * Then we need to create a new child trie entry that
818 			 * represents the remainder of the original string
819 			 * path that we didn't match.  For the "hello" /
820 			 * "help" case, this guy will have "lo".
821 			 *
822 			 * Any instances or children (not siblings...) that were
823 			 * attached to the original trie entry must be detached
824 			 * first and then migrate to this new guy that completes
825 			 * the original string.
826 			 */
827 
828 			dcl = t->parser->child_list;
829 			m = t->parser->child_count;
830 
831 			t->parser->child_list = NULL;
832 			t->parser->child_count = 0;
833 
834 			e = lws_fts_entry_child_add(t,
835 					osuff[t->str_match_pos - 1], t->parser);
836 			if (!e) {
837 				lwsl_err("%s: lws_fts_entry_child_add fail1\n",
838 						__func__);
839 				return 1;
840 			}
841 
842 			e->child_list = dcl;
843 			e->child_count = m;
844 			/*
845 			 * any children we took over must point to us as the
846 			 * parent now they appear on our child list
847 			 */
848 			e1 = e->child_list;
849 			while (e1) {
850 				e1->parent = e;
851 				e1 = e1->sibling;
852 			}
853 
854 			/*
855 			 * We detached any children, gave them to the new guy
856 			 * and replaced them with just our new guy
857 			 */
858 			t->parser->child_count = 1;
859 			t->parser->child_list = e;
860 
861 			/*
862 			 * any instances that belonged to the original entry we
863 			 * are splitting now must be reassigned to the end
864 			 * part
865 			 */
866 
867 			e->inst_file_list = t->parser->inst_file_list;
868 			if (e->inst_file_list)
869 				e->inst_file_list->owner = e;
870 			t->parser->inst_file_list = NULL;
871 			e->instance_count = t->parser->instance_count;
872 			t->parser->instance_count = 0;
873 
874 			e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
875 			t->parser->ofs_last_inst_file = 0;
876 
877 			if (t->str_match_pos != olen) {
878 				/* we diverged partway */
879 				e->suffix = &osuff[t->str_match_pos - 1];
880 				e->suffix_len = olen - (t->str_match_pos - 1);
881 			}
882 
883 			/*
884 			 * if the current char is a terminal, skip creating a
885 			 * new way forward.
886 			 */
887 
888 			if (classify[(int)c]) {
889 
890 				/*
891 				 * Lastly we need to create a new child trie
892 				 * entry that represents the new way forward
893 				 * from the point that we diverged.  For the
894 				 * "hello" / "help" case, this guy will start
895 				 * as a child of "hel" with the single
896 				 * character match 'p'.
897 				 *
898 				 * Since he becomes the current parser context,
899 				 * more symbol characters may be coming to make
900 				 * him into, eg, "helping", in which case he
901 				 * will acquire a suffix eventually of "ping"
902 				 * via the aggregation stuff
903 				 */
904 
905 				e = lws_fts_entry_child_add(t, c, t->parser);
906 				if (!e) {
907 					lwsl_err("%s: child_add fail2\n",
908 						 __func__);
909 					return 1;
910 				}
911 			}
912 
913 			/* go on following this path */
914 			t->parser = e;
915 
916 			t->aggregate = 1;
917 			t->agg_pos = 0;
918 
919 			t->str_match_pos = 0;
920 
921 			if (go_around)
922 				continue;
923 
924 			/* this is intended to be a seal */
925 		}
926 
927 
928 		/* end of token */
929 
930 		if (t->aggregate && t->agg_pos) {
931 
932 			/* if nothing in agg[]: leave as single char match */
933 
934 			/* otherwise copy out the symbol aggregation */
935 			t->parser->suffix = lwsac_use(&t->lwsac_head,
936 						    t->agg_pos + 1,
937 						    TRIE_LWSAC_BLOCK_SIZE);
938 			if (!t->parser->suffix) {
939 				lwsl_err("%s: lac for suffix failed\n",
940 						__func__);
941 				return 1;
942 			}
943 
944 			/* add the first char at the beginning */
945 			*t->parser->suffix = t->parser->c;
946 			/* and then add the agg buffer stuff */
947 			memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
948 			t->parser->suffix_len = t->agg_pos + 1;
949 		}
950 		t->aggregate = 0;
951 
952 		if (t->parser == t->root) /* multiple terminal chars */
953 			continue;
954 
955 		if (!t->parser->inst_file_list ||
956 		    t->parser->inst_file_list->file_index != file_index) {
957 			tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
958 				      TRIE_LWSAC_BLOCK_SIZE);
959 			if (!tif) {
960 				lwsl_err("%s: lac for tif failed\n",
961 						__func__);
962 				return 1;
963 			}
964 
965 			tif->file_index = file_index;
966 			tif->owner = t->parser;
967 			tif->lines_list = NULL;
968 			tif->lines_tail = NULL;
969 			tif->total = 0;
970 			tif->count = 0;
971 			tif->inst_file_next = t->tif_list;
972 			t->tif_list = tif;
973 
974 			t->parser->inst_file_list = tif;
975 		}
976 
977 		/*
978 		 * A naive allocation strategy for this leads to 50% of the
979 		 * total inmem lac allocation being for line numbers...
980 		 *
981 		 * It's mainly solved by only holding the instance and line
982 		 * number tables for the duration of a file being input, as soon
983 		 * as one input file is finished it is written to disk.
984 		 *
985 		 * For the common case of 1 - ~3 matches the line number are
986 		 * stored in a small VLI array inside the filepath inst.  If the
987 		 * next one won't fit, it allocates a line number struct with
988 		 * more vli space and continues chaining those if needed.
989 		 */
990 
991 		n = wq32(vlibuf, t->line_number);
992 		tif = t->parser->inst_file_list;
993 
994 		if (!tif->lines_list) {
995 			/* we are still trying to use the file inst vli */
996 			if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
997 				tif->count += wq32(tif->vli + tif->count,
998 						   t->line_number);
999 				goto after;
1000 			}
1001 			/* we are going to have to allocate */
1002 		}
1003 
1004 		/* can we add to an existing line numbers struct? */
1005 		if (tif->lines_tail &&
1006 		    LWS_ARRAY_SIZE(tif->lines_tail->vli) -
1007 		    	    tif->lines_tail->count >= n) {
1008 			tif->lines_tail->count += wq32(tif->lines_tail->vli +
1009 						       tif->lines_tail->count,
1010 						       t->line_number);
1011 			goto after;
1012 		}
1013 
1014 		/* either no existing line numbers struct at tail, or full */
1015 
1016 		/* have to create a(nother) line numbers struct */
1017 		tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
1018 			     TRIE_LWSAC_BLOCK_SIZE);
1019 		if (!tl) {
1020 			lwsl_err("%s: lac for tl failed\n", __func__);
1021 			return 1;
1022 		}
1023 		tl->lines_next = NULL;
1024 		if (tif->lines_tail)
1025 			tif->lines_tail->lines_next = tl;
1026 
1027 		tif->lines_tail = tl;
1028 		if (!tif->lines_list)
1029 			tif->lines_list = tl;
1030 
1031 		tl->count = wq32(tl->vli, t->line_number);
1032 after:
1033 		tif->total++;
1034 #if 0
1035 		{
1036 			char s[128];
1037 			const char *ne = name_entry(t->parser, s, sizeof(s));
1038 
1039 			if (!strcmp(ne, "describ")) {
1040 				lwsl_err("     %s %d\n", ne, t->str_match_pos);
1041 				write(1, buf - 10, 20);
1042 			}
1043 		}
1044 #endif
1045 		t->parser->instance_count++;
1046 		t->parser = t->root;
1047 		t->str_match_pos = 0;
1048 	}
1049 
1050 	/* seal off the line length table block */
1051 
1052 	if (bp) {
1053 		if (write(t->fd, linetable, bp) != bp)
1054 			return 1;
1055 		t->c += bp;
1056 		bp = 0;
1057 	}
1058 
1059 	if (lseek(t->fd, lbh, SEEK_SET) < 0) {
1060 		lwsl_err("%s: seek to 0x%llx failed\n", __func__,
1061 				(unsigned long long)lbh);
1062 		return 1;
1063 	}
1064 
1065 	g16(linetable, t->c - lbh);
1066 	g16(linetable + 2, t->line_number - sline);
1067 	g32(linetable + 4, chars);
1068 	if (write(t->fd, linetable, 8) != 8) {
1069 		lwsl_err("%s: write linetable header failed\n", __func__);
1070 		return 1;
1071 	}
1072 
1073 	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1074 
1075 	if (lseek(t->fd, t->c, SEEK_SET) < 0) {
1076 		lwsl_err("%s: end seek failed\n", __func__);
1077 		return 1;
1078 	}
1079 
1080 	bp = 0;
1081 
1082 	if (len) {
1083 		t->lines_in_unsealed_linetable = 0;
1084 		goto resume;
1085 	}
1086 
1087 	/* dump the collected per-input instance and line data, and free it */
1088 
1089 	t->agg_trie_creation_us += lws_now_usecs() - tf;
1090 
1091 	return 0;
1092 }
1093 
1094 /* refer to ./README.md */
1095 
1096 int
lws_fts_serialize(struct lws_fts * t)1097 lws_fts_serialize(struct lws_fts *t)
1098 {
1099 	struct lws_fts_filepath *fp = t->filepath_list, *ofp;
1100 	unsigned long long tf = lws_now_usecs();
1101 	struct lws_fts_entry *e, *e1, *s[256];
1102 	unsigned char buf[8192], stasis;
1103 	int n, bp, sp = 0, do_parent;
1104 
1105 	(void)tf;
1106 	finalize_per_input(t);
1107 
1108 	/*
1109 	 * Compute aggregated instance counts (parents should know the total
1110 	 * number of instances below each child path)
1111 	 *
1112 	 *
1113 	 * If we have
1114 	 *
1115 	 * (root) -> (c1) -> (c2)
1116 	 *        -> (c3)
1117 	 *
1118 	 * we need to visit the nodes in the order
1119 	 *
1120 	 * c2, c1, c3, root
1121 	 */
1122 
1123 	sp = 0;
1124 	s[0] = t->root;
1125 	do_parent = 0;
1126 	while (sp >= 0) {
1127 		int n;
1128 
1129 		/* aggregate in every antecedent */
1130 
1131 		for (n = 0; n <= sp; n++) {
1132 			s[n]->agg_inst_count += s[sp]->instance_count;
1133 			s[n]->agg_child_count += s[sp]->child_count;
1134 		}
1135 
1136 		/* handle any children before the parent */
1137 
1138 		if (s[sp]->child_list) {
1139 			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1140 				lwsl_err("Stack too deep\n");
1141 
1142 				goto bail;
1143 			}
1144 
1145 			s[sp + 1] = s[sp]->child_list;
1146 			sp++;
1147 			continue;
1148 		}
1149 
1150 		do {
1151 			if (s[sp]->sibling) {
1152 				s[sp] = s[sp]->sibling;
1153 				break;
1154 			} else
1155 				sp--;
1156 		} while (sp >= 0);
1157 	}
1158 
1159 	/* dump the filepaths and set prev */
1160 
1161 	fp = t->filepath_list;
1162 	ofp = NULL;
1163 	bp = 0;
1164 	while (fp) {
1165 
1166 		fp->ofs = t->c + bp;
1167 		n = (int)strlen(fp->filepath);
1168 		spill(15 + n, 0);
1169 
1170 		bp += wq32(&buf[bp], fp->line_table_ofs);
1171 		bp += wq32(&buf[bp], fp->total_lines);
1172 		bp += wq32(&buf[bp], n);
1173 		memcpy(&buf[bp], fp->filepath, n);
1174 		bp += n;
1175 
1176 		fp->prev = ofp;
1177 		ofp = fp;
1178 		fp = fp->next;
1179 	}
1180 
1181 	spill(0, 1);
1182 
1183 	/* record the fileoffset of the filepath map and filepath count */
1184 
1185 	if (lseek(t->fd, 0xc, SEEK_SET) < 0)
1186 		goto bail_seek;
1187 
1188 	g32(buf, t->c + bp);
1189 	g32(buf + 4, t->next_file_index);
1190 	if (write(t->fd, buf, 8) != 8)
1191 		goto bail;
1192 
1193 	if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
1194 		goto bail_seek;
1195 
1196 	/* dump the filepath map, starting from index 0, which is at the tail */
1197 
1198 	fp = ofp;
1199 	bp = 0;
1200 	while (fp) {
1201 		spill(5, 0);
1202 		g32(buf + bp, fp->ofs);
1203 		bp += 4;
1204 		fp = fp->prev;
1205 	}
1206 	spill(0, 1);
1207 
1208 	/*
1209 	 * The trie entries in reverse order... because of the reversal, we have
1210 	 * always written children first, and marked them with their file offset
1211 	 * before we come to refer to them.
1212 	 */
1213 
1214 	bp = 0;
1215 	sp = 0;
1216 	s[0] = t->root;
1217 	do_parent = 0;
1218 	while (s[sp]) {
1219 
1220 		/* handle any children before the parent */
1221 
1222 		if (!do_parent && s[sp]->child_list) {
1223 
1224 			if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1225 				lwsl_err("Stack too deep\n");
1226 
1227 				goto bail;
1228 			}
1229 
1230 			s[sp + 1] = s[sp]->child_list;
1231 			sp++;
1232 			continue;
1233 		}
1234 
1235 		/* leaf nodes with no children */
1236 
1237 		e = s[sp];
1238 		e->ofs = t->c + bp;
1239 
1240 		/* write the trie entry header */
1241 
1242 		spill((3 * MAX_VLI), 0);
1243 
1244 		bp += wq32(&buf[bp], e->ofs_last_inst_file);
1245 		bp += wq32(&buf[bp], e->child_count);
1246 		bp += wq32(&buf[bp], e->instance_count);
1247 		bp += wq32(&buf[bp], e->agg_inst_count);
1248 
1249 		/* sort the children in order of highest aggregate hits first */
1250 
1251 		do {
1252 			struct lws_fts_entry **pe, *te1, *te2;
1253 
1254 			stasis = 1;
1255 
1256 			/* bubble sort keeps going until nothing changed */
1257 
1258 			pe = &e->child_list;
1259 			while (*pe) {
1260 
1261 				te1 = *pe;
1262 				te2 = te1->sibling;
1263 
1264 				if (te2 && te1->agg_inst_count <
1265 					   te2->agg_inst_count) {
1266 					stasis = 0;
1267 
1268 					*pe = te2;
1269 					te1->sibling = te2->sibling;
1270 					te2->sibling = te1;
1271 				}
1272 
1273 				pe = &(*pe)->sibling;
1274 			}
1275 
1276 		} while (!stasis);
1277 
1278 		/* write the children */
1279 
1280 		e1 = e->child_list;
1281 		while (e1) {
1282 			spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
1283 
1284 			bp += wq32(&buf[bp], e1->ofs);
1285 			bp += wq32(&buf[bp], e1->instance_count);
1286 			bp += wq32(&buf[bp], e1->agg_inst_count);
1287 			bp += wq32(&buf[bp], e1->agg_child_count);
1288 
1289 			if (e1->suffix) { /* string  */
1290 				bp += wq32(&buf[bp], e1->suffix_len);
1291 				memmove(&buf[bp], e1->suffix, e1->suffix_len);
1292 				bp += e1->suffix_len;
1293 			} else { /* char */
1294 				bp += wq32(&buf[bp], 1);
1295 				buf[bp++] = e1->c;
1296 			}
1297 #if 0
1298 			if (e1->suffix && e1->suffix_len == 3 &&
1299 			    !memcmp(e1->suffix, "cri", 3)) {
1300 				struct lws_fts_entry *e2;
1301 
1302 				e2 = e1;
1303 				while (e2){
1304 					if (e2->suffix)
1305 						lwsl_notice("%s\n", e2->suffix);
1306 					else
1307 						lwsl_notice("%c\n", e2->c);
1308 
1309 					e2 = e2->parent;
1310 				}
1311 
1312 				lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
1313 						e1->instance_count, e1->child_count);
1314 			}
1315 #endif
1316 			e1 = e1->sibling;
1317 		}
1318 
1319 		/* if there are siblings, do those next */
1320 
1321 		if (do_parent) {
1322 			do_parent = 0;
1323 			sp--;
1324 		}
1325 
1326 		if (s[sp]->sibling)
1327 			s[sp] = s[sp]->sibling;
1328 		else {
1329 			/* if there are no siblings, do the parent */
1330 			do_parent = 1;
1331 			s[sp] = s[sp]->parent;
1332 		}
1333 	}
1334 
1335 	spill(0, 1);
1336 
1337 	assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1338 
1339 	/* drop the correct root trie offset + file length into the header */
1340 
1341 	if (lseek(t->fd, 4, SEEK_SET) < 0) {
1342 		lwsl_err("%s: unable to seek\n", __func__);
1343 
1344 		goto bail;
1345 	}
1346 
1347 	g32(buf, t->root->ofs);
1348 	g32(buf + 4, t->c);
1349 	if (write(t->fd, buf, 0x8) != 0x8)
1350 		goto bail;
1351 
1352 	lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
1353 		    "alloc: %dKiB + %dKiB, "
1354 		    "serialize: %dms, file: %dKiB\n", __func__,
1355 		    t->next_file_index,
1356 		    (int)(t->agg_raw_input / (1024 * 1024)),
1357 		    (int)(t->agg_trie_creation_us / 1000),
1358 		    (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
1359 		    (int)(t->worst_lwsac_input_size / 1024),
1360 		    (int)((lws_now_usecs() - tf) / 1000),
1361 		    (int)(t->c / 1024));
1362 
1363 	return 0;
1364 
1365 bail_seek:
1366 	lwsl_err("%s: problem seekings\n", __func__);
1367 
1368 bail:
1369 	return 1;
1370 }
1371 
1372 
1373