1 /*
2 * libwebsockets - small server side websockets and web server implementation
3 *
4 * Copyright (C) 2010 - 2019 Andy Green <andy@warmcat.com>
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
22 * IN THE SOFTWARE.
23 *
24 * The functions allow
25 *
26 * - collecting a concordance of strings from one or more files (eg, a
27 * directory of files) into a single in-memory, lac-backed trie;
28 *
29 * - to optimize and serialize the in-memory trie to an fd;
30 *
31 * - to very quickly report any instances of a string in any of the files
32 * indexed by the trie, by a seeking around a serialized trie fd, without
33 * having to load it all in memory
34 */
35
36 #include "private-lib-core.h"
37 #include "private-lib-misc-fts.h"
38
39 #include <stdio.h>
40 #include <string.h>
41 #include <assert.h>
42 #include <fcntl.h>
43 #include <errno.h>
44 #include <sys/types.h>
45
46 struct lws_fts_entry;
47
48 /* notice these are stored in t->lwsac_input_head which has input file scope */
49
50 struct lws_fts_filepath {
51 struct lws_fts_filepath *next;
52 struct lws_fts_filepath *prev;
53 char filepath[256];
54 jg2_file_offset ofs;
55 jg2_file_offset line_table_ofs;
56 int filepath_len;
57 int file_index;
58 int total_lines;
59 int priority;
60 };
61
62 /* notice these are stored in t->lwsac_input_head which has input file scope */
63
64 struct lws_fts_lines {
65 struct lws_fts_lines *lines_next;
66 /*
67 * amount of line numbers needs to meet average count for best
68 * efficiency.
69 *
70 * Line numbers are stored in VLI format since if we don't, around half
71 * the total lac allocation consists of struct lws_fts_lines...
72 * size chosen to maintain 8-byte struct alignment
73 */
74 uint8_t vli[119];
75 char count;
76 };
77
78 /* this represents the instances of a symbol inside a given filepath */
79
80 struct lws_fts_instance_file {
81 /* linked-list of tifs generated for current file */
82 struct lws_fts_instance_file *inst_file_next;
83 struct lws_fts_entry *owner;
84 struct lws_fts_lines *lines_list, *lines_tail;
85 uint32_t file_index;
86 uint32_t total;
87
88 /*
89 * optimization for the common case there's only 1 - ~3 matches, so we
90 * don't have to allocate any lws_fts_lines struct
91 *
92 * Using 8 bytes total for this maintains 8-byte struct alignment...
93 */
94
95 uint8_t vli[7];
96 char count;
97 };
98
99 /*
100 * this is the main trie in-memory allocation object
101 */
102
103 struct lws_fts_entry {
104 struct lws_fts_entry *parent;
105
106 struct lws_fts_entry *child_list;
107 struct lws_fts_entry *sibling;
108
109 /*
110 * care... this points to content in t->lwsac_input_head, it goes
111 * out of scope when the input file being indexed completes
112 */
113 struct lws_fts_instance_file *inst_file_list;
114
115 jg2_file_offset ofs_last_inst_file;
116
117 char *suffix; /* suffix string or NULL if one char (in .c) */
118 jg2_file_offset ofs;
119 uint32_t child_count;
120 uint32_t instance_count;
121 uint32_t agg_inst_count;
122 uint32_t agg_child_count;
123 uint32_t suffix_len;
124 unsigned char c;
125 };
126
127 /* there's only one of these per trie file */
128
129 struct lws_fts {
130 struct lwsac *lwsac_head;
131 struct lwsac *lwsac_input_head;
132 struct lws_fts_entry *root;
133 struct lws_fts_filepath *filepath_list;
134 struct lws_fts_filepath *fp;
135
136 struct lws_fts_entry *parser;
137 struct lws_fts_entry *root_lookup[256];
138
139 /*
140 * head of linked-list of tifs generated for current file
141 * care... this points to content in t->lwsac_input_head
142 */
143 struct lws_fts_instance_file *tif_list;
144
145 jg2_file_offset c; /* length of output file so far */
146
147 uint64_t agg_trie_creation_us;
148 uint64_t agg_raw_input;
149 uint64_t worst_lwsac_input_size;
150 int last_file_index;
151 int chars_in_line;
152 jg2_file_offset last_block_len_ofs;
153 int line_number;
154 int lines_in_unsealed_linetable;
155 int next_file_index;
156 int count_entries;
157
158 int fd;
159 unsigned int agg_pos;
160 unsigned int str_match_pos;
161
162 unsigned char aggregate;
163 unsigned char agg[128];
164 };
165
166 /* since the kernel case allocates >300MB, no point keeping this too low */
167
168 #define TRIE_LWSAC_BLOCK_SIZE (1024 * 1024)
169
170 #define spill(margin, force) \
171 if (bp && ((uint32_t)bp >= (sizeof(buf) - (margin)) || (force))) { \
172 if (write(t->fd, buf, bp) != bp) { \
173 lwsl_err("%s: write %d failed (%d)\n", __func__, \
174 bp, errno); \
175 return 1; \
176 } \
177 t->c += bp; \
178 bp = 0; \
179 }
180
181 static int
g32(unsigned char * b,uint32_t d)182 g32(unsigned char *b, uint32_t d)
183 {
184 *b++ = (d >> 24) & 0xff;
185 *b++ = (d >> 16) & 0xff;
186 *b++ = (d >> 8) & 0xff;
187 *b = d & 0xff;
188
189 return 4;
190 }
191
192 static int
g16(unsigned char * b,int d)193 g16(unsigned char *b, int d)
194 {
195 *b++ = (d >> 8) & 0xff;
196 *b = d & 0xff;
197
198 return 2;
199 }
200
201 static int
wq32(unsigned char * b,uint32_t d)202 wq32(unsigned char *b, uint32_t d)
203 {
204 unsigned char *ob = b;
205
206 if (d > (1 << 28) - 1)
207 *b++ = ((d >> 28) | 0x80) & 0xff;
208
209 if (d > (1 << 21) - 1)
210 *b++ = ((d >> 21) | 0x80) & 0xff;
211
212 if (d > (1 << 14) - 1)
213 *b++ = ((d >> 14) | 0x80) & 0xff;
214
215 if (d > (1 << 7) - 1)
216 *b++ = ((d >> 7) | 0x80) & 0xff;
217
218 *b++ = d & 0x7f;
219
220 return (int)(b - ob);
221 }
222
223
224 /* read a VLI, return the number of bytes used */
225
226 int
rq32(unsigned char * b,uint32_t * d)227 rq32(unsigned char *b, uint32_t *d)
228 {
229 unsigned char *ob = b;
230 uint32_t t = 0;
231
232 t = *b & 0x7f;
233 if (*(b++) & 0x80) {
234 t = (t << 7) | (*b & 0x7f);
235 if (*(b++) & 0x80) {
236 t = (t << 7) | (*b & 0x7f);
237 if (*(b++) & 0x80) {
238 t = (t << 7) | (*b & 0x7f);
239 if (*(b++) & 0x80) {
240 t = (t << 7) | (*b & 0x7f);
241 b++;
242 }
243 }
244 }
245 }
246
247 *d = t;
248
249 return (int)(b - ob);
250 }
251
252 struct lws_fts *
lws_fts_create(int fd)253 lws_fts_create(int fd)
254 {
255 struct lws_fts *t;
256 struct lwsac *lwsac_head = NULL;
257 unsigned char buf[TRIE_FILE_HDR_SIZE];
258
259 t = lwsac_use(&lwsac_head, sizeof(*t), TRIE_LWSAC_BLOCK_SIZE);
260 if (!t)
261 return NULL;
262
263 memset(t, 0, sizeof(*t));
264
265 t->fd = fd;
266 t->lwsac_head = lwsac_head;
267 t->root = lwsac_use(&lwsac_head, sizeof(*t->root),
268 TRIE_LWSAC_BLOCK_SIZE);
269 if (!t->root)
270 goto unwind;
271
272 memset(t->root, 0, sizeof(*t->root));
273 t->parser = t->root;
274 t->last_file_index = -1;
275 t->line_number = 1;
276 t->filepath_list = NULL;
277
278 memset(t->root_lookup, 0, sizeof(*t->root_lookup));
279
280 /* write the header */
281
282 buf[0] = 0xca;
283 buf[1] = 0x7a;
284 buf[2] = 0x5f;
285 buf[3] = 0x75;
286
287 /* (these are filled in with correct data at the end) */
288
289 /* file offset to root trie entry */
290 g32(&buf[4], 0);
291 /* file length when it was created */
292 g32(&buf[8], 0);
293 /* fileoffset to the filepath table */
294 g32(&buf[0xc], 0);
295 /* count of filepaths */
296 g32(&buf[0x10], 0);
297
298 if (write(t->fd, buf, TRIE_FILE_HDR_SIZE) != TRIE_FILE_HDR_SIZE) {
299 lwsl_err("%s: trie header write failed\n", __func__);
300 goto unwind;
301 }
302
303 t->c = TRIE_FILE_HDR_SIZE;
304
305 return t;
306
307 unwind:
308 lwsac_free(&lwsac_head);
309
310 return NULL;
311 }
312
313 void
lws_fts_destroy(struct lws_fts ** trie)314 lws_fts_destroy(struct lws_fts **trie)
315 {
316 struct lwsac *lwsac_head = (*trie)->lwsac_head;
317 lwsac_free(&(*trie)->lwsac_input_head);
318 lwsac_free(&lwsac_head);
319 *trie = NULL;
320 }
321
322 int
lws_fts_file_index(struct lws_fts * t,const char * filepath,int filepath_len,int priority)323 lws_fts_file_index(struct lws_fts *t, const char *filepath, int filepath_len,
324 int priority)
325 {
326 struct lws_fts_filepath *fp = t->filepath_list;
327 #if 0
328 while (fp) {
329 if (fp->filepath_len == filepath_len &&
330 !strcmp(fp->filepath, filepath))
331 return fp->file_index;
332
333 fp = fp->next;
334 }
335 #endif
336 fp = lwsac_use(&t->lwsac_head, sizeof(*fp), TRIE_LWSAC_BLOCK_SIZE);
337 if (!fp)
338 return -1;
339
340 fp->next = t->filepath_list;
341 t->filepath_list = fp;
342 strncpy(fp->filepath, filepath, sizeof(fp->filepath) - 1);
343 fp->filepath[sizeof(fp->filepath) - 1] = '\0';
344 fp->filepath_len = filepath_len;
345 fp->file_index = t->next_file_index++;
346 fp->line_table_ofs = t->c;
347 fp->priority = priority;
348 fp->total_lines = 0;
349 t->fp = fp;
350
351 return fp->file_index;
352 }
353
354 static struct lws_fts_entry *
lws_fts_entry_child_add(struct lws_fts * t,unsigned char c,struct lws_fts_entry * parent)355 lws_fts_entry_child_add(struct lws_fts *t, unsigned char c,
356 struct lws_fts_entry *parent)
357 {
358 struct lws_fts_entry *e, **pe;
359
360 e = lwsac_use(&t->lwsac_head, sizeof(*e), TRIE_LWSAC_BLOCK_SIZE);
361 if (!e)
362 return NULL;
363
364 memset(e, 0, sizeof(*e));
365
366 e->c = c;
367 parent->child_count++;
368 e->parent = parent;
369 t->count_entries++;
370
371 /* keep the parent child list in ascending sort order for c */
372
373 pe = &parent->child_list;
374 while (*pe) {
375 assert((*pe)->parent == parent);
376 if ((*pe)->c > c) {
377 /* add it before */
378 e->sibling = *pe;
379 *pe = e;
380 break;
381 }
382 pe = &(*pe)->sibling;
383 }
384
385 if (!*pe) {
386 /* add it at the end */
387 e->sibling = NULL;
388 *pe = e;
389 }
390
391 return e;
392 }
393
394 static int
finalize_per_input(struct lws_fts * t)395 finalize_per_input(struct lws_fts *t)
396 {
397 struct lws_fts_instance_file *tif;
398 unsigned char buf[8192];
399 uint64_t lwsac_input_size;
400 jg2_file_offset temp;
401 int bp = 0;
402
403 bp += g16(&buf[bp], 0);
404 bp += g16(&buf[bp], 0);
405 bp += g32(&buf[bp], 0);
406 if (write(t->fd, buf, bp) != bp)
407 return 1;
408 t->c += bp;
409 bp = 0;
410
411 /*
412 * Write the generated file index + instances (if any)
413 *
414 * Notice the next same-parent file instance fileoffset list is
415 * backwards, so it does not require seeks to fill in. The first
416 * entry has 0 but the second entry points to the first entry (whose
417 * fileoffset is known).
418 *
419 * After all the file instance structs are finalized,
420 * .ofs_last_inst_file contains the fileoffset of that child's tif
421 * list head in the file.
422 *
423 * The file instances are written to disk in the order that the files
424 * were indexed, along with their prev pointers inline.
425 */
426
427 tif = t->tif_list;
428 while (tif) {
429 struct lws_fts_lines *i;
430
431 spill((3 * MAX_VLI) + tif->count, 0);
432
433 temp = tif->owner->ofs_last_inst_file;
434 if (tif->total)
435 tif->owner->ofs_last_inst_file = t->c + bp;
436
437 assert(!temp || (temp > TRIE_FILE_HDR_SIZE && temp < t->c));
438
439 /* fileoffset of prev instance file for this entry, or 0 */
440 bp += wq32(&buf[bp], temp);
441 bp += wq32(&buf[bp], tif->file_index);
442 bp += wq32(&buf[bp], tif->total);
443
444 /* remove any pointers into this disposable lac footprint */
445 tif->owner->inst_file_list = NULL;
446
447 memcpy(&buf[bp], &tif->vli, tif->count);
448 bp += tif->count;
449
450 i = tif->lines_list;
451 while (i) {
452 spill(i->count, 0);
453 memcpy(&buf[bp], &i->vli, i->count);
454 bp += i->count;
455
456 i = i->lines_next;
457 }
458
459 tif = tif->inst_file_next;
460 }
461
462 spill(0, 1);
463
464 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
465
466 if (t->lwsac_input_head) {
467 lwsac_input_size = lwsac_total_alloc(t->lwsac_input_head);
468 if (lwsac_input_size > t->worst_lwsac_input_size)
469 t->worst_lwsac_input_size = lwsac_input_size;
470 }
471
472 /*
473 * those per-file allocations are all on a separate lac so we can
474 * free it cleanly afterwards
475 */
476 lwsac_free(&t->lwsac_input_head);
477
478 /* and lose the pointer into the deallocated lac */
479 t->tif_list = NULL;
480
481 return 0;
482 }
483
484 /*
485 * 0 = punctuation, whitespace, brackets etc
486 * 1 = character inside symbol set
487 * 2 = upper-case character inside symbol set
488 */
489
490 static char classify[] = {
491 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
492 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
494 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
495 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
496 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, //1,
497 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
498 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
499 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
500 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
501 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
502 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
503 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
504 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
505 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
506 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
507 };
508
509 #if 0
510 static const char *
511 name_entry(struct lws_fts_entry *e1, char *s, int len)
512 {
513 struct lws_fts_entry *e2;
514 int n = len;
515
516 s[--n] = '\0';
517
518 e2 = e1;
519 while (e2) {
520 if (e2->suffix) {
521 if ((int)e2->suffix_len < n) {
522 n -= e2->suffix_len;
523 memcpy(&s[n], e2->suffix, e2->suffix_len);
524 }
525 } else {
526 n--;
527 s[n] = e2->c;
528 }
529
530 e2 = e2->parent;
531 }
532
533 return &s[n + 1];
534 }
535 #endif
536
537 /*
538 * as we parse the input, we create a line length table for the file index.
539 * Only the file header has been written before we start doing this.
540 */
541
542 int
lws_fts_fill(struct lws_fts * t,uint32_t file_index,const char * buf,size_t len)543 lws_fts_fill(struct lws_fts *t, uint32_t file_index, const char *buf,
544 size_t len)
545 {
546 unsigned long long tf = lws_now_usecs();
547 unsigned char c, linetable[256], vlibuf[8];
548 struct lws_fts_entry *e, *e1, *dcl;
549 struct lws_fts_instance_file *tif;
550 int bp = 0, sline, chars, m;
551 char *osuff, skipline = 0;
552 struct lws_fts_lines *tl;
553 unsigned int olen, n;
554 off_t lbh;
555
556 if ((int)file_index != t->last_file_index) {
557 if (t->last_file_index >= 0)
558 finalize_per_input(t);
559 t->last_file_index = file_index;
560 t->line_number = 1;
561 t->chars_in_line = 0;
562 t->lines_in_unsealed_linetable = 0;
563 }
564
565 t->agg_raw_input += len;
566
567 resume:
568
569 chars = 0;
570 lbh = t->c;
571 sline = t->line_number;
572 bp += g16(&linetable[bp], 0);
573 bp += g16(&linetable[bp], 0);
574 bp += g32(&linetable[bp], 0);
575
576 while (len) {
577 char go_around = 0;
578
579 if (t->lines_in_unsealed_linetable >= LWS_FTS_LINES_PER_CHUNK)
580 break;
581
582 len--;
583
584 c = (unsigned char)*buf++;
585 t->chars_in_line++;
586 if (c == '\n') {
587 skipline = 0;
588 t->filepath_list->total_lines++;
589 t->lines_in_unsealed_linetable++;
590 t->line_number++;
591
592 bp += wq32(&linetable[bp], t->chars_in_line);
593 if ((unsigned int)bp > sizeof(linetable) - 6) {
594 if (write(t->fd, linetable, bp) != bp) {
595 lwsl_err("%s: linetable write failed\n",
596 __func__);
597 return 1;
598 }
599 t->c += bp;
600 bp = 0;
601 // assert(lseek(t->fd, 0, SEEK_END) == t->c);
602 }
603
604 chars += t->chars_in_line;
605 t->chars_in_line = 0;
606
607 /*
608 * Detect overlength lines and skip them (eg, BASE64
609 * in css etc)
610 */
611
612 if (len > 200) {
613 n = 0;
614 m = 0;
615 while (n < 200 && m < 80 && buf[n] != '\n') {
616 if (buf[n] == ' ' || buf[n] == '\t')
617 m = 0;
618 n++;
619 m++;
620 }
621
622 /* 80 lines no whitespace, or >=200-char line */
623
624 if (m == 80 || n == 200)
625 skipline = 1;
626 }
627
628 goto seal;
629 }
630 if (skipline)
631 continue;
632
633 m = classify[(int)c];
634 if (!m)
635 goto seal;
636 if (m == 2)
637 c += 'a' - 'A';
638
639 if (t->aggregate) {
640
641 /*
642 * We created a trie entry for an earlier char in this
643 * symbol already. So we know at the moment, any
644 * further chars in the symbol are the only children.
645 *
646 * Aggregate them and add them as a string suffix to
647 * the trie symbol at the end (when we know how much to
648 * allocate).
649 */
650
651 if (t->agg_pos < sizeof(t->agg) - 1)
652 /* symbol is not too long to stash */
653 t->agg[t->agg_pos++] = c;
654
655 continue;
656 }
657
658 if (t->str_match_pos) {
659 go_around = 1;
660 goto seal;
661 }
662
663 /* zeroth-iteration child matching */
664
665 if (t->parser == t->root) {
666 e = t->root_lookup[(int)c];
667 if (e) {
668 t->parser = e;
669 continue;
670 }
671 } else {
672
673 /* look for the char amongst the children */
674
675 e = t->parser->child_list;
676 while (e) {
677
678 /* since they're alpha ordered... */
679 if (e->c > c) {
680 e = NULL;
681 break;
682 }
683 if (e->c == c) {
684 t->parser = e;
685
686 if (e->suffix)
687 t->str_match_pos = 1;
688
689 break;
690 }
691
692 e = e->sibling;
693 }
694
695 if (e)
696 continue;
697 }
698
699 /*
700 * we are blazing a new trail, add a new child representing
701 * the whole suffix that couldn't be matched until now.
702 */
703
704 e = lws_fts_entry_child_add(t, c, t->parser);
705 if (!e) {
706 lwsl_err("%s: lws_fts_entry_child_add failed\n",
707 __func__);
708 return 1;
709 }
710
711 /* if it's the root node, keep the root_lookup table in sync */
712
713 if (t->parser == t->root)
714 t->root_lookup[(int)c] = e;
715
716 /* follow the new path */
717 t->parser = e;
718
719 {
720 struct lws_fts_entry **pe = &e->child_list;
721 while (*pe) {
722 assert((*pe)->parent == e);
723
724 pe = &(*pe)->sibling;
725 }
726 }
727
728 /*
729 * If there are any more symbol characters coming, just
730 * create a suffix string on t->parser instead of what must
731 * currently be single-child nodes, since we just created e
732 * as a child with a single character due to no existing match
733 * on that single character... so if no match on 'h' with this
734 * guy's parent, we created e that matches on the single char
735 * 'h'. If the symbol continues ... 'a' 'p' 'p' 'y', then
736 * instead of creating singleton child nodes under e,
737 * modify e to match on the whole string suffix "happy".
738 *
739 * If later "hoppy" appears, we will remove the suffix on e,
740 * so it reverts to a char match for 'h', add singleton children
741 * for 'a' and 'o', and attach a "ppy" suffix child to each of
742 * those.
743 *
744 * We want to do this so we don't have to allocate trie entries
745 * for every char in the string to save memory and consequently
746 * time.
747 *
748 * Don't try this optimization if the parent is the root node...
749 * it's not compatible with it's root_lookup table and it's
750 * highly likely children off the root entry are going to have
751 * to be fragmented.
752 */
753
754 if (e->parent != t->root) {
755 t->aggregate = 1;
756 t->agg_pos = 0;
757 }
758
759 continue;
760
761 seal:
762 if (t->str_match_pos) {
763
764 /*
765 * We're partway through matching an elaborated string
766 * on a child, not just a character. String matches
767 * only exist when we met a child entry that only had
768 * one path until now... so we had an 'h', and the
769 * only child had a string "hello".
770 *
771 * We are following the right path and will not need
772 * to back up, but we may find as we go we have the
773 * first instance of a second child path, eg, "help".
774 *
775 * When we get to the 'p', we have to split what was
776 * the only string option "hello" into "hel" and then
777 * two child entries, for "lo" and 'p'.
778 */
779
780 if (c == t->parser->suffix[t->str_match_pos++]) {
781 if (t->str_match_pos < t->parser->suffix_len)
782 continue;
783
784 /*
785 * We simply matched everything, continue
786 * parsing normally from this trie entry.
787 */
788
789 t->str_match_pos = 0;
790 continue;
791 }
792
793 /*
794 * So... we hit a mismatch somewhere... it means we
795 * have to split this string entry.
796 *
797 * We know the first char actually matched in order to
798 * start down this road. So for the current trie entry,
799 * we need to truncate his suffix at the char before
800 * this mismatched one, where we diverged (if the
801 * second char, simply remove the suffix string from the
802 * current trie entry to turn it back to a 1-char match)
803 *
804 * The original entry, which becomes the lhs post-split,
805 * is t->parser.
806 */
807
808 olen = t->parser->suffix_len;
809 osuff = t->parser->suffix;
810
811 if (t->str_match_pos == 2)
812 t->parser->suffix = NULL;
813 else
814 t->parser->suffix_len = t->str_match_pos - 1;
815
816 /*
817 * Then we need to create a new child trie entry that
818 * represents the remainder of the original string
819 * path that we didn't match. For the "hello" /
820 * "help" case, this guy will have "lo".
821 *
822 * Any instances or children (not siblings...) that were
823 * attached to the original trie entry must be detached
824 * first and then migrate to this new guy that completes
825 * the original string.
826 */
827
828 dcl = t->parser->child_list;
829 m = t->parser->child_count;
830
831 t->parser->child_list = NULL;
832 t->parser->child_count = 0;
833
834 e = lws_fts_entry_child_add(t,
835 osuff[t->str_match_pos - 1], t->parser);
836 if (!e) {
837 lwsl_err("%s: lws_fts_entry_child_add fail1\n",
838 __func__);
839 return 1;
840 }
841
842 e->child_list = dcl;
843 e->child_count = m;
844 /*
845 * any children we took over must point to us as the
846 * parent now they appear on our child list
847 */
848 e1 = e->child_list;
849 while (e1) {
850 e1->parent = e;
851 e1 = e1->sibling;
852 }
853
854 /*
855 * We detached any children, gave them to the new guy
856 * and replaced them with just our new guy
857 */
858 t->parser->child_count = 1;
859 t->parser->child_list = e;
860
861 /*
862 * any instances that belonged to the original entry we
863 * are splitting now must be reassigned to the end
864 * part
865 */
866
867 e->inst_file_list = t->parser->inst_file_list;
868 if (e->inst_file_list)
869 e->inst_file_list->owner = e;
870 t->parser->inst_file_list = NULL;
871 e->instance_count = t->parser->instance_count;
872 t->parser->instance_count = 0;
873
874 e->ofs_last_inst_file = t->parser->ofs_last_inst_file;
875 t->parser->ofs_last_inst_file = 0;
876
877 if (t->str_match_pos != olen) {
878 /* we diverged partway */
879 e->suffix = &osuff[t->str_match_pos - 1];
880 e->suffix_len = olen - (t->str_match_pos - 1);
881 }
882
883 /*
884 * if the current char is a terminal, skip creating a
885 * new way forward.
886 */
887
888 if (classify[(int)c]) {
889
890 /*
891 * Lastly we need to create a new child trie
892 * entry that represents the new way forward
893 * from the point that we diverged. For the
894 * "hello" / "help" case, this guy will start
895 * as a child of "hel" with the single
896 * character match 'p'.
897 *
898 * Since he becomes the current parser context,
899 * more symbol characters may be coming to make
900 * him into, eg, "helping", in which case he
901 * will acquire a suffix eventually of "ping"
902 * via the aggregation stuff
903 */
904
905 e = lws_fts_entry_child_add(t, c, t->parser);
906 if (!e) {
907 lwsl_err("%s: child_add fail2\n",
908 __func__);
909 return 1;
910 }
911 }
912
913 /* go on following this path */
914 t->parser = e;
915
916 t->aggregate = 1;
917 t->agg_pos = 0;
918
919 t->str_match_pos = 0;
920
921 if (go_around)
922 continue;
923
924 /* this is intended to be a seal */
925 }
926
927
928 /* end of token */
929
930 if (t->aggregate && t->agg_pos) {
931
932 /* if nothing in agg[]: leave as single char match */
933
934 /* otherwise copy out the symbol aggregation */
935 t->parser->suffix = lwsac_use(&t->lwsac_head,
936 t->agg_pos + 1,
937 TRIE_LWSAC_BLOCK_SIZE);
938 if (!t->parser->suffix) {
939 lwsl_err("%s: lac for suffix failed\n",
940 __func__);
941 return 1;
942 }
943
944 /* add the first char at the beginning */
945 *t->parser->suffix = t->parser->c;
946 /* and then add the agg buffer stuff */
947 memcpy(t->parser->suffix + 1, t->agg, t->agg_pos);
948 t->parser->suffix_len = t->agg_pos + 1;
949 }
950 t->aggregate = 0;
951
952 if (t->parser == t->root) /* multiple terminal chars */
953 continue;
954
955 if (!t->parser->inst_file_list ||
956 t->parser->inst_file_list->file_index != file_index) {
957 tif = lwsac_use(&t->lwsac_input_head, sizeof(*tif),
958 TRIE_LWSAC_BLOCK_SIZE);
959 if (!tif) {
960 lwsl_err("%s: lac for tif failed\n",
961 __func__);
962 return 1;
963 }
964
965 tif->file_index = file_index;
966 tif->owner = t->parser;
967 tif->lines_list = NULL;
968 tif->lines_tail = NULL;
969 tif->total = 0;
970 tif->count = 0;
971 tif->inst_file_next = t->tif_list;
972 t->tif_list = tif;
973
974 t->parser->inst_file_list = tif;
975 }
976
977 /*
978 * A naive allocation strategy for this leads to 50% of the
979 * total inmem lac allocation being for line numbers...
980 *
981 * It's mainly solved by only holding the instance and line
982 * number tables for the duration of a file being input, as soon
983 * as one input file is finished it is written to disk.
984 *
985 * For the common case of 1 - ~3 matches the line number are
986 * stored in a small VLI array inside the filepath inst. If the
987 * next one won't fit, it allocates a line number struct with
988 * more vli space and continues chaining those if needed.
989 */
990
991 n = wq32(vlibuf, t->line_number);
992 tif = t->parser->inst_file_list;
993
994 if (!tif->lines_list) {
995 /* we are still trying to use the file inst vli */
996 if (LWS_ARRAY_SIZE(tif->vli) - tif->count >= n) {
997 tif->count += wq32(tif->vli + tif->count,
998 t->line_number);
999 goto after;
1000 }
1001 /* we are going to have to allocate */
1002 }
1003
1004 /* can we add to an existing line numbers struct? */
1005 if (tif->lines_tail &&
1006 LWS_ARRAY_SIZE(tif->lines_tail->vli) -
1007 tif->lines_tail->count >= n) {
1008 tif->lines_tail->count += wq32(tif->lines_tail->vli +
1009 tif->lines_tail->count,
1010 t->line_number);
1011 goto after;
1012 }
1013
1014 /* either no existing line numbers struct at tail, or full */
1015
1016 /* have to create a(nother) line numbers struct */
1017 tl = lwsac_use(&t->lwsac_input_head, sizeof(*tl),
1018 TRIE_LWSAC_BLOCK_SIZE);
1019 if (!tl) {
1020 lwsl_err("%s: lac for tl failed\n", __func__);
1021 return 1;
1022 }
1023 tl->lines_next = NULL;
1024 if (tif->lines_tail)
1025 tif->lines_tail->lines_next = tl;
1026
1027 tif->lines_tail = tl;
1028 if (!tif->lines_list)
1029 tif->lines_list = tl;
1030
1031 tl->count = wq32(tl->vli, t->line_number);
1032 after:
1033 tif->total++;
1034 #if 0
1035 {
1036 char s[128];
1037 const char *ne = name_entry(t->parser, s, sizeof(s));
1038
1039 if (!strcmp(ne, "describ")) {
1040 lwsl_err(" %s %d\n", ne, t->str_match_pos);
1041 write(1, buf - 10, 20);
1042 }
1043 }
1044 #endif
1045 t->parser->instance_count++;
1046 t->parser = t->root;
1047 t->str_match_pos = 0;
1048 }
1049
1050 /* seal off the line length table block */
1051
1052 if (bp) {
1053 if (write(t->fd, linetable, bp) != bp)
1054 return 1;
1055 t->c += bp;
1056 bp = 0;
1057 }
1058
1059 if (lseek(t->fd, lbh, SEEK_SET) < 0) {
1060 lwsl_err("%s: seek to 0x%llx failed\n", __func__,
1061 (unsigned long long)lbh);
1062 return 1;
1063 }
1064
1065 g16(linetable, t->c - lbh);
1066 g16(linetable + 2, t->line_number - sline);
1067 g32(linetable + 4, chars);
1068 if (write(t->fd, linetable, 8) != 8) {
1069 lwsl_err("%s: write linetable header failed\n", __func__);
1070 return 1;
1071 }
1072
1073 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1074
1075 if (lseek(t->fd, t->c, SEEK_SET) < 0) {
1076 lwsl_err("%s: end seek failed\n", __func__);
1077 return 1;
1078 }
1079
1080 bp = 0;
1081
1082 if (len) {
1083 t->lines_in_unsealed_linetable = 0;
1084 goto resume;
1085 }
1086
1087 /* dump the collected per-input instance and line data, and free it */
1088
1089 t->agg_trie_creation_us += lws_now_usecs() - tf;
1090
1091 return 0;
1092 }
1093
1094 /* refer to ./README.md */
1095
1096 int
lws_fts_serialize(struct lws_fts * t)1097 lws_fts_serialize(struct lws_fts *t)
1098 {
1099 struct lws_fts_filepath *fp = t->filepath_list, *ofp;
1100 unsigned long long tf = lws_now_usecs();
1101 struct lws_fts_entry *e, *e1, *s[256];
1102 unsigned char buf[8192], stasis;
1103 int n, bp, sp = 0, do_parent;
1104
1105 (void)tf;
1106 finalize_per_input(t);
1107
1108 /*
1109 * Compute aggregated instance counts (parents should know the total
1110 * number of instances below each child path)
1111 *
1112 *
1113 * If we have
1114 *
1115 * (root) -> (c1) -> (c2)
1116 * -> (c3)
1117 *
1118 * we need to visit the nodes in the order
1119 *
1120 * c2, c1, c3, root
1121 */
1122
1123 sp = 0;
1124 s[0] = t->root;
1125 do_parent = 0;
1126 while (sp >= 0) {
1127 int n;
1128
1129 /* aggregate in every antecedent */
1130
1131 for (n = 0; n <= sp; n++) {
1132 s[n]->agg_inst_count += s[sp]->instance_count;
1133 s[n]->agg_child_count += s[sp]->child_count;
1134 }
1135
1136 /* handle any children before the parent */
1137
1138 if (s[sp]->child_list) {
1139 if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1140 lwsl_err("Stack too deep\n");
1141
1142 goto bail;
1143 }
1144
1145 s[sp + 1] = s[sp]->child_list;
1146 sp++;
1147 continue;
1148 }
1149
1150 do {
1151 if (s[sp]->sibling) {
1152 s[sp] = s[sp]->sibling;
1153 break;
1154 } else
1155 sp--;
1156 } while (sp >= 0);
1157 }
1158
1159 /* dump the filepaths and set prev */
1160
1161 fp = t->filepath_list;
1162 ofp = NULL;
1163 bp = 0;
1164 while (fp) {
1165
1166 fp->ofs = t->c + bp;
1167 n = (int)strlen(fp->filepath);
1168 spill(15 + n, 0);
1169
1170 bp += wq32(&buf[bp], fp->line_table_ofs);
1171 bp += wq32(&buf[bp], fp->total_lines);
1172 bp += wq32(&buf[bp], n);
1173 memcpy(&buf[bp], fp->filepath, n);
1174 bp += n;
1175
1176 fp->prev = ofp;
1177 ofp = fp;
1178 fp = fp->next;
1179 }
1180
1181 spill(0, 1);
1182
1183 /* record the fileoffset of the filepath map and filepath count */
1184
1185 if (lseek(t->fd, 0xc, SEEK_SET) < 0)
1186 goto bail_seek;
1187
1188 g32(buf, t->c + bp);
1189 g32(buf + 4, t->next_file_index);
1190 if (write(t->fd, buf, 8) != 8)
1191 goto bail;
1192
1193 if (lseek(t->fd, t->c + bp, SEEK_SET) < 0)
1194 goto bail_seek;
1195
1196 /* dump the filepath map, starting from index 0, which is at the tail */
1197
1198 fp = ofp;
1199 bp = 0;
1200 while (fp) {
1201 spill(5, 0);
1202 g32(buf + bp, fp->ofs);
1203 bp += 4;
1204 fp = fp->prev;
1205 }
1206 spill(0, 1);
1207
1208 /*
1209 * The trie entries in reverse order... because of the reversal, we have
1210 * always written children first, and marked them with their file offset
1211 * before we come to refer to them.
1212 */
1213
1214 bp = 0;
1215 sp = 0;
1216 s[0] = t->root;
1217 do_parent = 0;
1218 while (s[sp]) {
1219
1220 /* handle any children before the parent */
1221
1222 if (!do_parent && s[sp]->child_list) {
1223
1224 if (sp + 1 == LWS_ARRAY_SIZE(s)) {
1225 lwsl_err("Stack too deep\n");
1226
1227 goto bail;
1228 }
1229
1230 s[sp + 1] = s[sp]->child_list;
1231 sp++;
1232 continue;
1233 }
1234
1235 /* leaf nodes with no children */
1236
1237 e = s[sp];
1238 e->ofs = t->c + bp;
1239
1240 /* write the trie entry header */
1241
1242 spill((3 * MAX_VLI), 0);
1243
1244 bp += wq32(&buf[bp], e->ofs_last_inst_file);
1245 bp += wq32(&buf[bp], e->child_count);
1246 bp += wq32(&buf[bp], e->instance_count);
1247 bp += wq32(&buf[bp], e->agg_inst_count);
1248
1249 /* sort the children in order of highest aggregate hits first */
1250
1251 do {
1252 struct lws_fts_entry **pe, *te1, *te2;
1253
1254 stasis = 1;
1255
1256 /* bubble sort keeps going until nothing changed */
1257
1258 pe = &e->child_list;
1259 while (*pe) {
1260
1261 te1 = *pe;
1262 te2 = te1->sibling;
1263
1264 if (te2 && te1->agg_inst_count <
1265 te2->agg_inst_count) {
1266 stasis = 0;
1267
1268 *pe = te2;
1269 te1->sibling = te2->sibling;
1270 te2->sibling = te1;
1271 }
1272
1273 pe = &(*pe)->sibling;
1274 }
1275
1276 } while (!stasis);
1277
1278 /* write the children */
1279
1280 e1 = e->child_list;
1281 while (e1) {
1282 spill((5 * MAX_VLI) + e1->suffix_len + 1, 0);
1283
1284 bp += wq32(&buf[bp], e1->ofs);
1285 bp += wq32(&buf[bp], e1->instance_count);
1286 bp += wq32(&buf[bp], e1->agg_inst_count);
1287 bp += wq32(&buf[bp], e1->agg_child_count);
1288
1289 if (e1->suffix) { /* string */
1290 bp += wq32(&buf[bp], e1->suffix_len);
1291 memmove(&buf[bp], e1->suffix, e1->suffix_len);
1292 bp += e1->suffix_len;
1293 } else { /* char */
1294 bp += wq32(&buf[bp], 1);
1295 buf[bp++] = e1->c;
1296 }
1297 #if 0
1298 if (e1->suffix && e1->suffix_len == 3 &&
1299 !memcmp(e1->suffix, "cri", 3)) {
1300 struct lws_fts_entry *e2;
1301
1302 e2 = e1;
1303 while (e2){
1304 if (e2->suffix)
1305 lwsl_notice("%s\n", e2->suffix);
1306 else
1307 lwsl_notice("%c\n", e2->c);
1308
1309 e2 = e2->parent;
1310 }
1311
1312 lwsl_err("*** %c CRI inst %d ch %d\n", e1->parent->c,
1313 e1->instance_count, e1->child_count);
1314 }
1315 #endif
1316 e1 = e1->sibling;
1317 }
1318
1319 /* if there are siblings, do those next */
1320
1321 if (do_parent) {
1322 do_parent = 0;
1323 sp--;
1324 }
1325
1326 if (s[sp]->sibling)
1327 s[sp] = s[sp]->sibling;
1328 else {
1329 /* if there are no siblings, do the parent */
1330 do_parent = 1;
1331 s[sp] = s[sp]->parent;
1332 }
1333 }
1334
1335 spill(0, 1);
1336
1337 assert(lseek(t->fd, 0, SEEK_END) == (off_t)t->c);
1338
1339 /* drop the correct root trie offset + file length into the header */
1340
1341 if (lseek(t->fd, 4, SEEK_SET) < 0) {
1342 lwsl_err("%s: unable to seek\n", __func__);
1343
1344 goto bail;
1345 }
1346
1347 g32(buf, t->root->ofs);
1348 g32(buf + 4, t->c);
1349 if (write(t->fd, buf, 0x8) != 0x8)
1350 goto bail;
1351
1352 lwsl_notice("%s: index %d files (%uMiB) cpu time %dms, "
1353 "alloc: %dKiB + %dKiB, "
1354 "serialize: %dms, file: %dKiB\n", __func__,
1355 t->next_file_index,
1356 (int)(t->agg_raw_input / (1024 * 1024)),
1357 (int)(t->agg_trie_creation_us / 1000),
1358 (int)(lwsac_total_alloc(t->lwsac_head) / 1024),
1359 (int)(t->worst_lwsac_input_size / 1024),
1360 (int)((lws_now_usecs() - tf) / 1000),
1361 (int)(t->c / 1024));
1362
1363 return 0;
1364
1365 bail_seek:
1366 lwsl_err("%s: problem seekings\n", __func__);
1367
1368 bail:
1369 return 1;
1370 }
1371
1372
1373