1 /*
2 regcomp.c - TRE POSIX compatible regex compilation functions.
3
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <string.h>
33 #include <stdlib.h>
34 #include <regex.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <ctype.h>
38
39 #include "tre.h"
40
41 #include <assert.h>
42
43 /***********************************************************************
44 from tre-compile.h
45 ***********************************************************************/
46
47 typedef struct {
48 int position;
49 int code_min;
50 int code_max;
51 int *tags;
52 int assertions;
53 tre_ctype_t class;
54 tre_ctype_t *neg_classes;
55 int backref;
56 } tre_pos_and_tags_t;
57
58
59 /***********************************************************************
60 from tre-ast.c and tre-ast.h
61 ***********************************************************************/
62
63 /* The different AST node types. */
64 typedef enum {
65 LITERAL,
66 CATENATION,
67 ITERATION,
68 UNION
69 } tre_ast_type_t;
70
71 /* Special subtypes of TRE_LITERAL. */
72 #define EMPTY -1 /* Empty leaf (denotes empty string). */
73 #define ASSERTION -2 /* Assertion leaf. */
74 #define TAG -3 /* Tag leaf. */
75 #define BACKREF -4 /* Back reference leaf. */
76
77 #define IS_SPECIAL(x) ((x)->code_min < 0)
78 #define IS_EMPTY(x) ((x)->code_min == EMPTY)
79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80 #define IS_TAG(x) ((x)->code_min == TAG)
81 #define IS_BACKREF(x) ((x)->code_min == BACKREF)
82
83
84 /* A generic AST node. All AST nodes consist of this node on the top
85 level with `obj' pointing to the actual content. */
86 typedef struct {
87 tre_ast_type_t type; /* Type of the node. */
88 void *obj; /* Pointer to actual node. */
89 int nullable;
90 int submatch_id;
91 int num_submatches;
92 int num_tags;
93 tre_pos_and_tags_t *firstpos;
94 tre_pos_and_tags_t *lastpos;
95 } tre_ast_node_t;
96
97
98 /* A "literal" node. These are created for assertions, back references,
99 tags, matching parameter settings, and all expressions that match one
100 character. */
101 typedef struct {
102 long code_min;
103 long code_max;
104 int position;
105 tre_ctype_t class;
106 tre_ctype_t *neg_classes;
107 } tre_literal_t;
108
109 /* A "catenation" node. These are created when two regexps are concatenated.
110 If there are more than one subexpressions in sequence, the `left' part
111 holds all but the last, and `right' part holds the last subexpression
112 (catenation is left associative). */
113 typedef struct {
114 tre_ast_node_t *left;
115 tre_ast_node_t *right;
116 } tre_catenation_t;
117
118 /* An "iteration" node. These are created for the "*", "+", "?", and "{m,n}"
119 operators. */
120 typedef struct {
121 /* Subexpression to match. */
122 tre_ast_node_t *arg;
123 /* Minimum number of consecutive matches. */
124 int min;
125 /* Maximum number of consecutive matches. */
126 int max;
127 /* If 0, match as many characters as possible, if 1 match as few as
128 possible. Note that this does not always mean the same thing as
129 matching as many/few repetitions as possible. */
130 unsigned int minimal:1;
131 } tre_iteration_t;
132
133 /* An "union" node. These are created for the "|" operator. */
134 typedef struct {
135 tre_ast_node_t *left;
136 tre_ast_node_t *right;
137 } tre_union_t;
138
139
140 static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem,tre_ast_type_t type,void * obj)141 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, void *obj)
142 {
143 tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144 if (!node || !obj)
145 return 0;
146 node->obj = obj;
147 node->type = type;
148 node->nullable = -1;
149 node->submatch_id = -1;
150 return node;
151 }
152
153 static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem,int code_min,int code_max,int position)154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155 {
156 tre_ast_node_t *node;
157 tre_literal_t *lit;
158
159 lit = tre_mem_calloc(mem, sizeof *lit);
160 node = tre_ast_new_node(mem, LITERAL, lit);
161 if (!node)
162 return 0;
163 lit->code_min = code_min;
164 lit->code_max = code_max;
165 lit->position = position;
166 return node;
167 }
168
169 static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem,tre_ast_node_t * arg,int min,int max,int minimal)170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171 {
172 tre_ast_node_t *node;
173 tre_iteration_t *iter;
174
175 iter = tre_mem_calloc(mem, sizeof *iter);
176 node = tre_ast_new_node(mem, ITERATION, iter);
177 if (!node)
178 return 0;
179 iter->arg = arg;
180 iter->min = min;
181 iter->max = max;
182 iter->minimal = minimal;
183 node->num_submatches = arg->num_submatches;
184 return node;
185 }
186
187 static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189 {
190 tre_ast_node_t *node;
191 tre_union_t *un;
192
193 if (!left)
194 return right;
195 un = tre_mem_calloc(mem, sizeof *un);
196 node = tre_ast_new_node(mem, UNION, un);
197 if (!node || !right)
198 return 0;
199 un->left = left;
200 un->right = right;
201 node->num_submatches = left->num_submatches + right->num_submatches;
202 return node;
203 }
204
205 static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207 {
208 tre_ast_node_t *node;
209 tre_catenation_t *cat;
210
211 if (!left)
212 return right;
213 cat = tre_mem_calloc(mem, sizeof *cat);
214 node = tre_ast_new_node(mem, CATENATION, cat);
215 if (!node)
216 return 0;
217 cat->left = left;
218 cat->right = right;
219 node->num_submatches = left->num_submatches + right->num_submatches;
220 return node;
221 }
222
223
224 /***********************************************************************
225 from tre-stack.c and tre-stack.h
226 ***********************************************************************/
227
228 typedef struct tre_stack_rec tre_stack_t;
229
230 /* Creates a new stack object. `size' is initial size in bytes, `max_size'
231 is maximum size, and `increment' specifies how much more space will be
232 allocated with realloc() if all space gets used up. Returns the stack
233 object or NULL if out of memory. */
234 static tre_stack_t *
235 tre_stack_new(int size, int max_size, int increment);
236
237 /* Frees the stack object. */
238 static void
239 tre_stack_destroy(tre_stack_t *s);
240
241 /* Returns the current number of objects in the stack. */
242 static int
243 tre_stack_num_objects(tre_stack_t *s);
244
245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246 `value' on top of stack `s'. Returns REG_ESPACE if out of memory.
247 This tries to realloc() more space before failing if maximum size
248 has not yet been reached. Returns REG_OK if successful. */
249 #define declare_pushf(typetag, type) \
250 static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251
252 declare_pushf(voidptr, void *);
253 declare_pushf(int, int);
254
255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256 element off of stack `s' and returns it. The stack must not be
257 empty. */
258 #define declare_popf(typetag, type) \
259 static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260
261 declare_popf(voidptr, void *);
262 declare_popf(int, int);
263
264 /* Just to save some typing. */
265 #define STACK_PUSH(s, typetag, value) \
266 do \
267 { \
268 status = tre_stack_push_ ## typetag(s, value); \
269 } \
270 while (/*CONSTCOND*/0)
271
272 #define STACK_PUSHX(s, typetag, value) \
273 { \
274 status = tre_stack_push_ ## typetag(s, value); \
275 if (status != REG_OK) \
276 break; \
277 }
278
279 #define STACK_PUSHR(s, typetag, value) \
280 { \
281 reg_errcode_t _status; \
282 _status = tre_stack_push_ ## typetag(s, value); \
283 if (_status != REG_OK) \
284 return _status; \
285 }
286
287 union tre_stack_item {
288 void *voidptr_value;
289 int int_value;
290 };
291
292 struct tre_stack_rec {
293 int size;
294 int max_size;
295 int increment;
296 int ptr;
297 union tre_stack_item *stack;
298 };
299
300
301 static tre_stack_t *
tre_stack_new(int size,int max_size,int increment)302 tre_stack_new(int size, int max_size, int increment)
303 {
304 tre_stack_t *s;
305
306 s = xmalloc(sizeof(*s));
307 if (s != NULL)
308 {
309 s->stack = xmalloc(sizeof(*s->stack) * size);
310 if (s->stack == NULL)
311 {
312 xfree(s);
313 return NULL;
314 }
315 s->size = size;
316 s->max_size = max_size;
317 s->increment = increment;
318 s->ptr = 0;
319 }
320 return s;
321 }
322
323 static void
tre_stack_destroy(tre_stack_t * s)324 tre_stack_destroy(tre_stack_t *s)
325 {
326 xfree(s->stack);
327 xfree(s);
328 }
329
330 static int
tre_stack_num_objects(tre_stack_t * s)331 tre_stack_num_objects(tre_stack_t *s)
332 {
333 return s->ptr;
334 }
335
336 static reg_errcode_t
tre_stack_push(tre_stack_t * s,union tre_stack_item value)337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338 {
339 if (s->ptr < s->size)
340 {
341 s->stack[s->ptr] = value;
342 s->ptr++;
343 }
344 else
345 {
346 if (s->size >= s->max_size)
347 {
348 return REG_ESPACE;
349 }
350 else
351 {
352 union tre_stack_item *new_buffer;
353 int new_size;
354 new_size = s->size + s->increment;
355 if (new_size > s->max_size)
356 new_size = s->max_size;
357 new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358 if (new_buffer == NULL)
359 {
360 return REG_ESPACE;
361 }
362 assert(new_size > s->size);
363 s->size = new_size;
364 s->stack = new_buffer;
365 tre_stack_push(s, value);
366 }
367 }
368 return REG_OK;
369 }
370
371 #define define_pushf(typetag, type) \
372 declare_pushf(typetag, type) { \
373 union tre_stack_item item; \
374 item.typetag ## _value = value; \
375 return tre_stack_push(s, item); \
376 }
377
378 define_pushf(int, int)
379 define_pushf(voidptr, void *)
380
381 #define define_popf(typetag, type) \
382 declare_popf(typetag, type) { \
383 return s->stack[--s->ptr].typetag ## _value; \
384 }
385
386 define_popf(int, int)
387 define_popf(voidptr, void *)
388
389
390 /***********************************************************************
391 from tre-parse.c and tre-parse.h
392 ***********************************************************************/
393
394 /* Parse context. */
395 typedef struct {
396 /* Memory allocator. The AST is allocated using this. */
397 tre_mem_t mem;
398 /* Stack used for keeping track of regexp syntax. */
399 tre_stack_t *stack;
400 /* The parsed node after a parse function returns. */
401 tre_ast_node_t *n;
402 /* Position in the regexp pattern after a parse function returns. */
403 const char *s;
404 /* The first character of the last subexpression parsed. */
405 const char *start;
406 /* Current submatch ID. */
407 int submatch_id;
408 /* Current position (number of literal). */
409 int position;
410 /* The highest back reference or -1 if none seen so far. */
411 int max_backref;
412 /* Compilation flags. */
413 int cflags;
414 } tre_parse_ctx_t;
415
416 /* Some macros for expanding \w, \s, etc. */
417 static const struct {
418 char c;
419 const char *expansion;
420 } tre_macros[] = {
421 {'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422 {'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423 {'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424 {'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425 { 0, 0 }
426 };
427
428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429 must have at least `len' items. Sets buf[0] to zero if the there
430 is no match in `tre_macros'. */
tre_expand_macro(const char * s)431 static const char *tre_expand_macro(const char *s)
432 {
433 int i;
434 for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435 return tre_macros[i].expansion;
436 }
437
438 static int
tre_compare_lit(const void * a,const void * b)439 tre_compare_lit(const void *a, const void *b)
440 {
441 const tre_literal_t *const *la = a;
442 const tre_literal_t *const *lb = b;
443 /* assumes the range of valid code_min is < INT_MAX */
444 return la[0]->code_min - lb[0]->code_min;
445 }
446
447 struct literals {
448 tre_mem_t mem;
449 tre_literal_t **a;
450 int len;
451 int cap;
452 };
453
tre_new_lit(struct literals * p)454 static tre_literal_t *tre_new_lit(struct literals *p)
455 {
456 tre_literal_t **a;
457 if (p->len >= p->cap) {
458 if (p->cap >= 1<<15)
459 return 0;
460 p->cap *= 2;
461 a = xrealloc(p->a, p->cap * sizeof *p->a);
462 if (!a)
463 return 0;
464 p->a = a;
465 }
466 a = p->a + p->len++;
467 *a = tre_mem_calloc(p->mem, sizeof **a);
468 return *a;
469 }
470
add_icase_literals(struct literals * ls,int min,int max)471 static int add_icase_literals(struct literals *ls, int min, int max)
472 {
473 tre_literal_t *lit;
474 int b, e, c;
475 for (c=min; c<=max; ) {
476 /* assumes islower(c) and isupper(c) are exclusive
477 and toupper(c)!=c if islower(c).
478 multiple opposite case characters are not supported */
479 if (tre_islower(c)) {
480 b = e = tre_toupper(c);
481 for (c++, e++; c<=max; c++, e++)
482 if (tre_toupper(c) != e) break;
483 } else if (tre_isupper(c)) {
484 b = e = tre_tolower(c);
485 for (c++, e++; c<=max; c++, e++)
486 if (tre_tolower(c) != e) break;
487 } else {
488 c++;
489 continue;
490 }
491 lit = tre_new_lit(ls);
492 if (!lit)
493 return -1;
494 lit->code_min = b;
495 lit->code_max = e-1;
496 lit->position = -1;
497 }
498 return 0;
499 }
500
501
502 /* Maximum number of character classes in a negated bracket expression. */
503 #define MAX_NEG_CLASSES 64
504
505 struct neg {
506 int negate;
507 int len;
508 tre_ctype_t a[MAX_NEG_CLASSES];
509 };
510
511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512
513 /*
514 bracket grammar:
515 Bracket = '[' List ']' | '[^' List ']'
516 List = Term | List Term
517 Term = Char | Range | Chclass | Eqclass
518 Range = Char '-' Char | Char '-' '-'
519 Char = Coll | coll_single
520 Meta = ']' | '-'
521 Coll = '[.' coll_single '.]' | '[.' coll_multi '.]' | '[.' Meta '.]'
522 Eqclass = '[=' coll_single '=]' | '[=' coll_multi '=]'
523 Chclass = '[:' class ':]'
524
525 coll_single is a single char collating element but it can be
526 '-' only at the beginning or end of a List and
527 ']' only at the beginning of a List and
528 '^' anywhere except after the openning '['
529 */
530
parse_bracket_terms(tre_parse_ctx_t * ctx,const char * s,struct literals * ls,struct neg * neg)531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532 {
533 const char *start = s;
534 tre_ctype_t class;
535 int min, max;
536 wchar_t wc;
537 int len;
538
539 for (;;) {
540 class = 0;
541 len = mbtowc(&wc, s, -1);
542 if (len <= 0)
543 return *s ? REG_BADPAT : REG_EBRACK;
544 if (*s == ']' && s != start) {
545 ctx->s = s+1;
546 return REG_OK;
547 }
548 if (*s == '-' && s != start && s[1] != ']' &&
549 /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550 (s[1] != '-' || s[2] == ']'))
551 return REG_ERANGE;
552 if (*s == '[' && (s[1] == '.' || s[1] == '='))
553 /* collating symbols and equivalence classes are not supported */
554 return REG_ECOLLATE;
555 if (*s == '[' && s[1] == ':') {
556 char tmp[CHARCLASS_NAME_MAX+1];
557 s += 2;
558 for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559 if (s[len] == ':') {
560 memcpy(tmp, s, len);
561 tmp[len] = 0;
562 class = tre_ctype(tmp);
563 break;
564 }
565 }
566 if (!class || s[len+1] != ']')
567 return REG_ECTYPE;
568 min = 0;
569 max = TRE_CHAR_MAX;
570 s += len+2;
571 } else {
572 min = max = wc;
573 s += len;
574 if (*s == '-' && s[1] != ']') {
575 s++;
576 len = mbtowc(&wc, s, -1);
577 max = wc;
578 /* XXX - Should use collation order instead of
579 encoding values in character ranges. */
580 if (len <= 0 || min > max)
581 return REG_ERANGE;
582 s += len;
583 }
584 }
585
586 if (class && neg->negate) {
587 if (neg->len >= MAX_NEG_CLASSES)
588 return REG_ESPACE;
589 neg->a[neg->len++] = class;
590 } else {
591 tre_literal_t *lit = tre_new_lit(ls);
592 if (!lit)
593 return REG_ESPACE;
594 lit->code_min = min;
595 lit->code_max = max;
596 lit->class = class;
597 lit->position = -1;
598
599 /* Add opposite-case codepoints if REG_ICASE is present.
600 It seems that POSIX requires that bracket negation
601 should happen before case-folding, but most practical
602 implementations do it the other way around. Changing
603 the order would need efficient representation of
604 case-fold ranges and bracket range sets even with
605 simple patterns so this is ok for now. */
606 if (ctx->cflags & REG_ICASE && !class)
607 if (add_icase_literals(ls, min, max))
608 return REG_ESPACE;
609 }
610 }
611 }
612
parse_bracket(tre_parse_ctx_t * ctx,const char * s)613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614 {
615 int i, max, min, negmax, negmin;
616 tre_ast_node_t *node = 0, *n;
617 tre_ctype_t *nc = 0;
618 tre_literal_t *lit;
619 struct literals ls;
620 struct neg neg;
621 reg_errcode_t err;
622
623 ls.mem = ctx->mem;
624 ls.len = 0;
625 ls.cap = 32;
626 ls.a = xmalloc(ls.cap * sizeof *ls.a);
627 if (!ls.a)
628 return REG_ESPACE;
629 neg.len = 0;
630 neg.negate = *s == '^';
631 if (neg.negate)
632 s++;
633
634 err = parse_bracket_terms(ctx, s, &ls, &neg);
635 if (err != REG_OK)
636 goto parse_bracket_done;
637
638 if (neg.negate) {
639 /*
640 * With REG_NEWLINE, POSIX requires that newlines are not matched by
641 * any form of a non-matching list.
642 */
643 if (ctx->cflags & REG_NEWLINE) {
644 lit = tre_new_lit(&ls);
645 if (!lit) {
646 err = REG_ESPACE;
647 goto parse_bracket_done;
648 }
649 lit->code_min = '\n';
650 lit->code_max = '\n';
651 lit->position = -1;
652 }
653 /* Sort the array if we need to negate it. */
654 qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655 /* extra lit for the last negated range */
656 lit = tre_new_lit(&ls);
657 if (!lit) {
658 err = REG_ESPACE;
659 goto parse_bracket_done;
660 }
661 lit->code_min = TRE_CHAR_MAX+1;
662 lit->code_max = TRE_CHAR_MAX+1;
663 lit->position = -1;
664 /* negated classes */
665 if (neg.len) {
666 nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
667 if (!nc) {
668 err = REG_ESPACE;
669 goto parse_bracket_done;
670 }
671 memcpy(nc, neg.a, neg.len*sizeof *neg.a);
672 nc[neg.len] = 0;
673 }
674 }
675
676 /* Build a union of the items in the array, negated if necessary. */
677 negmax = negmin = 0;
678 for (i = 0; i < ls.len; i++) {
679 lit = ls.a[i];
680 min = lit->code_min;
681 max = lit->code_max;
682 if (neg.negate) {
683 if (min <= negmin) {
684 /* Overlap. */
685 negmin = MAX(max + 1, negmin);
686 continue;
687 }
688 negmax = min - 1;
689 lit->code_min = negmin;
690 lit->code_max = negmax;
691 negmin = max + 1;
692 }
693 lit->position = ctx->position;
694 lit->neg_classes = nc;
695 n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696 node = tre_ast_new_union(ctx->mem, node, n);
697 if (!node) {
698 err = REG_ESPACE;
699 break;
700 }
701 }
702
703 parse_bracket_done:
704 xfree(ls.a);
705 ctx->position++;
706 ctx->n = node;
707 return err;
708 }
709
parse_dup_count(const char * s,int * n)710 static const char *parse_dup_count(const char *s, int *n)
711 {
712 *n = -1;
713 if (!isdigit(*s))
714 return s;
715 *n = 0;
716 for (;;) {
717 *n = 10 * *n + (*s - '0');
718 s++;
719 if (!isdigit(*s) || *n > RE_DUP_MAX)
720 break;
721 }
722 return s;
723 }
724
parse_dup(const char * s,int ere,int * pmin,int * pmax)725 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
726 {
727 int min, max;
728
729 s = parse_dup_count(s, &min);
730 if (*s == ',')
731 s = parse_dup_count(s+1, &max);
732 else
733 max = min;
734
735 if (
736 (max < min && max >= 0) ||
737 max > RE_DUP_MAX ||
738 min > RE_DUP_MAX ||
739 min < 0 ||
740 (!ere && *s++ != '\\') ||
741 *s++ != '}'
742 )
743 return 0;
744 *pmin = min;
745 *pmax = max;
746 return s;
747 }
748
hexval(unsigned c)749 static int hexval(unsigned c)
750 {
751 if (c-'0'<10) return c-'0';
752 c |= 32;
753 if (c-'a'<6) return c-'a'+10;
754 return -1;
755 }
756
marksub(tre_parse_ctx_t * ctx,tre_ast_node_t * node,int subid)757 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
758 {
759 if (node->submatch_id >= 0) {
760 tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761 if (!n)
762 return REG_ESPACE;
763 n = tre_ast_new_catenation(ctx->mem, n, node);
764 if (!n)
765 return REG_ESPACE;
766 n->num_submatches = node->num_submatches;
767 node = n;
768 }
769 node->submatch_id = subid;
770 node->num_submatches++;
771 ctx->n = node;
772 return REG_OK;
773 }
774
775 /*
776 BRE grammar:
777 Regex = Branch | '^' | '$' | '^$' | '^' Branch | Branch '$' | '^' Branch '$'
778 Branch = Atom | Branch Atom
779 Atom = char | quoted_char | '.' | Bracket | Atom Dup | '\(' Branch '\)' | back_ref
780 Dup = '*' | '\{' Count '\}' | '\{' Count ',\}' | '\{' Count ',' Count '\}'
781
782 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
783
784 ERE grammar:
785 Regex = Branch | Regex '|' Branch
786 Branch = Atom | Branch Atom
787 Atom = char | quoted_char | '.' | Bracket | Atom Dup | '(' Regex ')' | '^' | '$'
788 Dup = '*' | '+' | '?' | '{' Count '}' | '{' Count ',}' | '{' Count ',' Count '}'
789
790 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
791 */
792
parse_atom(tre_parse_ctx_t * ctx,const char * s)793 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
794 {
795 int len, ere = ctx->cflags & REG_EXTENDED;
796 const char *p;
797 tre_ast_node_t *node;
798 wchar_t wc;
799 switch (*s) {
800 case '[':
801 return parse_bracket(ctx, s+1);
802 case '\\':
803 p = tre_expand_macro(s+1);
804 if (p) {
805 /* assume \X expansion is a single atom */
806 reg_errcode_t err = parse_atom(ctx, p);
807 ctx->s = s+2;
808 return err;
809 }
810 /* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811 switch (*++s) {
812 case 0:
813 return REG_EESCAPE;
814 case 'b':
815 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816 break;
817 case 'B':
818 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819 break;
820 case '<':
821 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822 break;
823 case '>':
824 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825 break;
826 case 'x':
827 s++;
828 int i, v, c;
829 v = 0;
830 len = 2;
831 if (*s == '{') {
832 len = 8;
833 s++;
834 }
835 for (i=0; i<len && v<0x110000; i++) {
836 c = hexval(s[i]);
837 if (c < 0) break;
838 v = 16*v + c;
839 }
840 s += i;
841 if (len == 8) {
842 if (*s != '}')
843 return REG_EBRACE;
844 s++;
845 }
846 node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
847 s--;
848 break;
849 case '{':
850 case '+':
851 case '?':
852 /* extension: treat \+, \? as repetitions in BRE */
853 /* reject repetitions after empty expression in BRE */
854 if (!ere)
855 return REG_BADRPT;
856 case '|':
857 /* extension: treat \| as alternation in BRE */
858 if (!ere) {
859 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
860 s--;
861 goto end;
862 }
863 /* fallthrough */
864 default:
865 if (!ere && (unsigned)*s-'1' < 9) {
866 /* back reference */
867 int val = *s - '0';
868 node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
869 ctx->max_backref = MAX(val, ctx->max_backref);
870 } else {
871 /* extension: accept unknown escaped char
872 as a literal */
873 goto parse_literal;
874 }
875 }
876 s++;
877 break;
878 case '.':
879 if (ctx->cflags & REG_NEWLINE) {
880 tre_ast_node_t *tmp1, *tmp2;
881 tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
882 tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
883 if (tmp1 && tmp2)
884 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
885 else
886 node = 0;
887 } else {
888 node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
889 }
890 s++;
891 break;
892 case '^':
893 /* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
894 if (!ere && s != ctx->start)
895 goto parse_literal;
896 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
897 s++;
898 break;
899 case '$':
900 /* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
901 if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
902 goto parse_literal;
903 node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
904 s++;
905 break;
906 case '*':
907 case '{':
908 case '+':
909 case '?':
910 /* reject repetitions after empty expression in ERE */
911 if (ere)
912 return REG_BADRPT;
913 case '|':
914 if (!ere)
915 goto parse_literal;
916 case 0:
917 node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
918 break;
919 default:
920 parse_literal:
921 len = mbtowc(&wc, s, -1);
922 if (len < 0)
923 return REG_BADPAT;
924 if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
925 tre_ast_node_t *tmp1, *tmp2;
926 /* multiple opposite case characters are not supported */
927 tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
928 tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
929 if (tmp1 && tmp2)
930 node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
931 else
932 node = 0;
933 } else {
934 node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
935 }
936 ctx->position++;
937 s += len;
938 break;
939 }
940 end:
941 if (!node)
942 return REG_ESPACE;
943 ctx->n = node;
944 ctx->s = s;
945 return REG_OK;
946 }
947
948 #define PUSHPTR(err, s, v) do { \
949 if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
950 return err; \
951 } while(0)
952
953 #define PUSHINT(err, s, v) do { \
954 if ((err = tre_stack_push_int(s, v)) != REG_OK) \
955 return err; \
956 } while(0)
957
tre_parse(tre_parse_ctx_t * ctx)958 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
959 {
960 tre_ast_node_t *nbranch=0, *nunion=0;
961 int ere = ctx->cflags & REG_EXTENDED;
962 const char *s = ctx->start;
963 int subid = 0;
964 int depth = 0;
965 reg_errcode_t err;
966 tre_stack_t *stack = ctx->stack;
967
968 PUSHINT(err, stack, subid++);
969 for (;;) {
970 if ((!ere && *s == '\\' && s[1] == '(') ||
971 (ere && *s == '(')) {
972 PUSHPTR(err, stack, nunion);
973 PUSHPTR(err, stack, nbranch);
974 PUSHINT(err, stack, subid++);
975 s++;
976 if (!ere)
977 s++;
978 depth++;
979 nbranch = nunion = 0;
980 ctx->start = s;
981 continue;
982 }
983 if ((!ere && *s == '\\' && s[1] == ')') ||
984 (ere && *s == ')' && depth)) {
985 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
986 if (!ctx->n)
987 return REG_ESPACE;
988 } else {
989 err = parse_atom(ctx, s);
990 if (err != REG_OK)
991 return err;
992 s = ctx->s;
993 }
994
995 parse_iter:
996 for (;;) {
997 int min, max;
998
999 if (*s!='\\' && *s!='*') {
1000 if (!ere)
1001 break;
1002 if (*s!='+' && *s!='?' && *s!='{')
1003 break;
1004 }
1005 if (*s=='\\' && ere)
1006 break;
1007 /* extension: treat \+, \? as repetitions in BRE */
1008 if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1009 break;
1010 if (*s=='\\')
1011 s++;
1012
1013 /* handle ^* at the start of a BRE. */
1014 if (!ere && s==ctx->start+1 && s[-1]=='^')
1015 break;
1016
1017 /* extension: multiple consecutive *+?{,} is unspecified,
1018 but (a+)+ has to be supported so accepting a++ makes
1019 sense, note however that the RE_DUP_MAX limit can be
1020 circumvented: (a{255}){255} uses a lot of memory.. */
1021 if (*s=='{') {
1022 s = parse_dup(s+1, ere, &min, &max);
1023 if (!s)
1024 return REG_BADBR;
1025 } else {
1026 min=0;
1027 max=-1;
1028 if (*s == '+')
1029 min = 1;
1030 if (*s == '?')
1031 max = 1;
1032 s++;
1033 }
1034 if (max == 0)
1035 ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1036 else
1037 ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1038 if (!ctx->n)
1039 return REG_ESPACE;
1040 }
1041
1042 nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1043 if ((ere && *s == '|') ||
1044 (ere && *s == ')' && depth) ||
1045 (!ere && *s == '\\' && s[1] == ')') ||
1046 /* extension: treat \| as alternation in BRE */
1047 (!ere && *s == '\\' && s[1] == '|') ||
1048 !*s) {
1049 /* extension: empty branch is unspecified (), (|a), (a|)
1050 here they are not rejected but match on empty string */
1051 int c = *s;
1052 nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1053 nbranch = 0;
1054
1055 if (c == '\\' && s[1] == '|') {
1056 s+=2;
1057 ctx->start = s;
1058 } else if (c == '|') {
1059 s++;
1060 ctx->start = s;
1061 } else {
1062 if (c == '\\') {
1063 if (!depth) return REG_EPAREN;
1064 s+=2;
1065 } else if (c == ')')
1066 s++;
1067 depth--;
1068 err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1069 if (err != REG_OK)
1070 return err;
1071 if (!c && depth<0) {
1072 ctx->submatch_id = subid;
1073 return REG_OK;
1074 }
1075 if (!c || depth<0)
1076 return REG_EPAREN;
1077 nbranch = tre_stack_pop_voidptr(stack);
1078 nunion = tre_stack_pop_voidptr(stack);
1079 goto parse_iter;
1080 }
1081 }
1082 }
1083 }
1084
1085
1086 /***********************************************************************
1087 from tre-compile.c
1088 ***********************************************************************/
1089
1090
1091 /*
1092 TODO:
1093 - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1094 function calls.
1095 */
1096
1097 /*
1098 Algorithms to setup tags so that submatch addressing can be done.
1099 */
1100
1101
1102 /* Inserts a catenation node to the root of the tree given in `node'.
1103 As the left child a new tag with number `tag_id' to `node' is added,
1104 and the right child is the old root. */
1105 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1106 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1107 {
1108 tre_catenation_t *c;
1109
1110 c = tre_mem_alloc(mem, sizeof(*c));
1111 if (c == NULL)
1112 return REG_ESPACE;
1113 c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1114 if (c->left == NULL)
1115 return REG_ESPACE;
1116 c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1117 if (c->right == NULL)
1118 return REG_ESPACE;
1119
1120 c->right->obj = node->obj;
1121 c->right->type = node->type;
1122 c->right->nullable = -1;
1123 c->right->submatch_id = -1;
1124 c->right->firstpos = NULL;
1125 c->right->lastpos = NULL;
1126 c->right->num_tags = 0;
1127 c->right->num_submatches = 0;
1128 node->obj = c;
1129 node->type = CATENATION;
1130 return REG_OK;
1131 }
1132
1133 /* Inserts a catenation node to the root of the tree given in `node'.
1134 As the right child a new tag with number `tag_id' to `node' is added,
1135 and the left child is the old root. */
1136 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1137 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1138 {
1139 tre_catenation_t *c;
1140
1141 c = tre_mem_alloc(mem, sizeof(*c));
1142 if (c == NULL)
1143 return REG_ESPACE;
1144 c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1145 if (c->right == NULL)
1146 return REG_ESPACE;
1147 c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1148 if (c->left == NULL)
1149 return REG_ESPACE;
1150
1151 c->left->obj = node->obj;
1152 c->left->type = node->type;
1153 c->left->nullable = -1;
1154 c->left->submatch_id = -1;
1155 c->left->firstpos = NULL;
1156 c->left->lastpos = NULL;
1157 c->left->num_tags = 0;
1158 c->left->num_submatches = 0;
1159 node->obj = c;
1160 node->type = CATENATION;
1161 return REG_OK;
1162 }
1163
1164 typedef enum {
1165 ADDTAGS_RECURSE,
1166 ADDTAGS_AFTER_ITERATION,
1167 ADDTAGS_AFTER_UNION_LEFT,
1168 ADDTAGS_AFTER_UNION_RIGHT,
1169 ADDTAGS_AFTER_CAT_LEFT,
1170 ADDTAGS_AFTER_CAT_RIGHT,
1171 ADDTAGS_SET_SUBMATCH_END
1172 } tre_addtags_symbol_t;
1173
1174
1175 typedef struct {
1176 int tag;
1177 int next_tag;
1178 } tre_tag_states_t;
1179
1180
1181 /* Go through `regset' and set submatch data for submatches that are
1182 using this tag. */
1183 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)1184 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1185 {
1186 int i;
1187
1188 for (i = 0; regset[i] >= 0; i++)
1189 {
1190 int id = regset[i] / 2;
1191 int start = !(regset[i] % 2);
1192 if (start)
1193 tnfa->submatch_data[id].so_tag = tag;
1194 else
1195 tnfa->submatch_data[id].eo_tag = tag;
1196 }
1197 regset[0] = -1;
1198 }
1199
1200
1201 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1202 subexpressions marked for submatch addressing can be traced. */
1203 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)1204 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1205 tre_tnfa_t *tnfa)
1206 {
1207 reg_errcode_t status = REG_OK;
1208 tre_addtags_symbol_t symbol;
1209 tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1210 int bottom = tre_stack_num_objects(stack);
1211 /* True for first pass (counting number of needed tags) */
1212 int first_pass = (mem == NULL || tnfa == NULL);
1213 int *regset, *orig_regset;
1214 int num_tags = 0; /* Total number of tags. */
1215 int num_minimals = 0; /* Number of special minimal tags. */
1216 int tag = 0; /* The tag that is to be added next. */
1217 int next_tag = 1; /* Next tag to use after this one. */
1218 int *parents; /* Stack of submatches the current submatch is
1219 contained in. */
1220 int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1221 tre_tag_states_t *saved_states;
1222
1223 tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1224 if (!first_pass)
1225 {
1226 tnfa->end_tag = 0;
1227 tnfa->minimal_tags[0] = -1;
1228 }
1229
1230 regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1231 if (regset == NULL)
1232 return REG_ESPACE;
1233 regset[0] = -1;
1234 orig_regset = regset;
1235
1236 parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1237 if (parents == NULL)
1238 {
1239 xfree(regset);
1240 return REG_ESPACE;
1241 }
1242 parents[0] = -1;
1243
1244 saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1245 if (saved_states == NULL)
1246 {
1247 xfree(regset);
1248 xfree(parents);
1249 return REG_ESPACE;
1250 }
1251 else
1252 {
1253 unsigned int i;
1254 for (i = 0; i <= tnfa->num_submatches; i++)
1255 saved_states[i].tag = -1;
1256 }
1257
1258 STACK_PUSH(stack, voidptr, node);
1259 STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1260
1261 while (tre_stack_num_objects(stack) > bottom)
1262 {
1263 if (status != REG_OK)
1264 break;
1265
1266 symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1267 switch (symbol)
1268 {
1269
1270 case ADDTAGS_SET_SUBMATCH_END:
1271 {
1272 int id = tre_stack_pop_int(stack);
1273 int i;
1274
1275 /* Add end of this submatch to regset. */
1276 for (i = 0; regset[i] >= 0; i++);
1277 regset[i] = id * 2 + 1;
1278 regset[i + 1] = -1;
1279
1280 /* Pop this submatch from the parents stack. */
1281 for (i = 0; parents[i] >= 0; i++);
1282 parents[i - 1] = -1;
1283 break;
1284 }
1285
1286 case ADDTAGS_RECURSE:
1287 node = tre_stack_pop_voidptr(stack);
1288
1289 if (node->submatch_id >= 0)
1290 {
1291 int id = node->submatch_id;
1292 int i;
1293
1294
1295 /* Add start of this submatch to regset. */
1296 for (i = 0; regset[i] >= 0; i++);
1297 regset[i] = id * 2;
1298 regset[i + 1] = -1;
1299
1300 if (!first_pass)
1301 {
1302 for (i = 0; parents[i] >= 0; i++);
1303 tnfa->submatch_data[id].parents = NULL;
1304 if (i > 0)
1305 {
1306 int *p = xmalloc(sizeof(*p) * (i + 1));
1307 if (p == NULL)
1308 {
1309 status = REG_ESPACE;
1310 break;
1311 }
1312 assert(tnfa->submatch_data[id].parents == NULL);
1313 tnfa->submatch_data[id].parents = p;
1314 for (i = 0; parents[i] >= 0; i++)
1315 p[i] = parents[i];
1316 p[i] = -1;
1317 }
1318 }
1319
1320 /* Add end of this submatch to regset after processing this
1321 node. */
1322 STACK_PUSHX(stack, int, node->submatch_id);
1323 STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1324 }
1325
1326 switch (node->type)
1327 {
1328 case LITERAL:
1329 {
1330 tre_literal_t *lit = node->obj;
1331
1332 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1333 {
1334 int i;
1335 if (regset[0] >= 0)
1336 {
1337 /* Regset is not empty, so add a tag before the
1338 literal or backref. */
1339 if (!first_pass)
1340 {
1341 status = tre_add_tag_left(mem, node, tag);
1342 tnfa->tag_directions[tag] = direction;
1343 if (minimal_tag >= 0)
1344 {
1345 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1346 tnfa->minimal_tags[i] = tag;
1347 tnfa->minimal_tags[i + 1] = minimal_tag;
1348 tnfa->minimal_tags[i + 2] = -1;
1349 minimal_tag = -1;
1350 num_minimals++;
1351 }
1352 tre_purge_regset(regset, tnfa, tag);
1353 }
1354 else
1355 {
1356 node->num_tags = 1;
1357 }
1358
1359 regset[0] = -1;
1360 tag = next_tag;
1361 num_tags++;
1362 next_tag++;
1363 }
1364 }
1365 else
1366 {
1367 assert(!IS_TAG(lit));
1368 }
1369 break;
1370 }
1371 case CATENATION:
1372 {
1373 tre_catenation_t *cat = node->obj;
1374 tre_ast_node_t *left = cat->left;
1375 tre_ast_node_t *right = cat->right;
1376 int reserved_tag = -1;
1377
1378
1379 /* After processing right child. */
1380 STACK_PUSHX(stack, voidptr, node);
1381 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1382
1383 /* Process right child. */
1384 STACK_PUSHX(stack, voidptr, right);
1385 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1386
1387 /* After processing left child. */
1388 STACK_PUSHX(stack, int, next_tag + left->num_tags);
1389 if (left->num_tags > 0 && right->num_tags > 0)
1390 {
1391 /* Reserve the next tag to the right child. */
1392 reserved_tag = next_tag;
1393 next_tag++;
1394 }
1395 STACK_PUSHX(stack, int, reserved_tag);
1396 STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1397
1398 /* Process left child. */
1399 STACK_PUSHX(stack, voidptr, left);
1400 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1401
1402 }
1403 break;
1404 case ITERATION:
1405 {
1406 tre_iteration_t *iter = node->obj;
1407
1408 if (first_pass)
1409 {
1410 STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1411 }
1412 else
1413 {
1414 STACK_PUSHX(stack, int, tag);
1415 STACK_PUSHX(stack, int, iter->minimal);
1416 }
1417 STACK_PUSHX(stack, voidptr, node);
1418 STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1419
1420 STACK_PUSHX(stack, voidptr, iter->arg);
1421 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1422
1423 /* Regset is not empty, so add a tag here. */
1424 if (regset[0] >= 0 || iter->minimal)
1425 {
1426 if (!first_pass)
1427 {
1428 int i;
1429 status = tre_add_tag_left(mem, node, tag);
1430 if (iter->minimal)
1431 tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1432 else
1433 tnfa->tag_directions[tag] = direction;
1434 if (minimal_tag >= 0)
1435 {
1436 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1437 tnfa->minimal_tags[i] = tag;
1438 tnfa->minimal_tags[i + 1] = minimal_tag;
1439 tnfa->minimal_tags[i + 2] = -1;
1440 minimal_tag = -1;
1441 num_minimals++;
1442 }
1443 tre_purge_regset(regset, tnfa, tag);
1444 }
1445
1446 regset[0] = -1;
1447 tag = next_tag;
1448 num_tags++;
1449 next_tag++;
1450 }
1451 direction = TRE_TAG_MINIMIZE;
1452 }
1453 break;
1454 case UNION:
1455 {
1456 tre_union_t *uni = node->obj;
1457 tre_ast_node_t *left = uni->left;
1458 tre_ast_node_t *right = uni->right;
1459 int left_tag;
1460 int right_tag;
1461
1462 if (regset[0] >= 0)
1463 {
1464 left_tag = next_tag;
1465 right_tag = next_tag + 1;
1466 }
1467 else
1468 {
1469 left_tag = tag;
1470 right_tag = next_tag;
1471 }
1472
1473 /* After processing right child. */
1474 STACK_PUSHX(stack, int, right_tag);
1475 STACK_PUSHX(stack, int, left_tag);
1476 STACK_PUSHX(stack, voidptr, regset);
1477 STACK_PUSHX(stack, int, regset[0] >= 0);
1478 STACK_PUSHX(stack, voidptr, node);
1479 STACK_PUSHX(stack, voidptr, right);
1480 STACK_PUSHX(stack, voidptr, left);
1481 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1482
1483 /* Process right child. */
1484 STACK_PUSHX(stack, voidptr, right);
1485 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1486
1487 /* After processing left child. */
1488 STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1489
1490 /* Process left child. */
1491 STACK_PUSHX(stack, voidptr, left);
1492 STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1493
1494 /* Regset is not empty, so add a tag here. */
1495 if (regset[0] >= 0)
1496 {
1497 if (!first_pass)
1498 {
1499 int i;
1500 status = tre_add_tag_left(mem, node, tag);
1501 tnfa->tag_directions[tag] = direction;
1502 if (minimal_tag >= 0)
1503 {
1504 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1505 tnfa->minimal_tags[i] = tag;
1506 tnfa->minimal_tags[i + 1] = minimal_tag;
1507 tnfa->minimal_tags[i + 2] = -1;
1508 minimal_tag = -1;
1509 num_minimals++;
1510 }
1511 tre_purge_regset(regset, tnfa, tag);
1512 }
1513
1514 regset[0] = -1;
1515 tag = next_tag;
1516 num_tags++;
1517 next_tag++;
1518 }
1519
1520 if (node->num_submatches > 0)
1521 {
1522 /* The next two tags are reserved for markers. */
1523 next_tag++;
1524 tag = next_tag;
1525 next_tag++;
1526 }
1527
1528 break;
1529 }
1530 }
1531
1532 if (node->submatch_id >= 0)
1533 {
1534 int i;
1535 /* Push this submatch on the parents stack. */
1536 for (i = 0; parents[i] >= 0; i++);
1537 parents[i] = node->submatch_id;
1538 parents[i + 1] = -1;
1539 }
1540
1541 break; /* end case: ADDTAGS_RECURSE */
1542
1543 case ADDTAGS_AFTER_ITERATION:
1544 {
1545 int minimal = 0;
1546 int enter_tag;
1547 node = tre_stack_pop_voidptr(stack);
1548 if (first_pass)
1549 {
1550 node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1551 + tre_stack_pop_int(stack);
1552 minimal_tag = -1;
1553 }
1554 else
1555 {
1556 minimal = tre_stack_pop_int(stack);
1557 enter_tag = tre_stack_pop_int(stack);
1558 if (minimal)
1559 minimal_tag = enter_tag;
1560 }
1561
1562 if (!first_pass)
1563 {
1564 if (minimal)
1565 direction = TRE_TAG_MINIMIZE;
1566 else
1567 direction = TRE_TAG_MAXIMIZE;
1568 }
1569 break;
1570 }
1571
1572 case ADDTAGS_AFTER_CAT_LEFT:
1573 {
1574 int new_tag = tre_stack_pop_int(stack);
1575 next_tag = tre_stack_pop_int(stack);
1576 if (new_tag >= 0)
1577 {
1578 tag = new_tag;
1579 }
1580 break;
1581 }
1582
1583 case ADDTAGS_AFTER_CAT_RIGHT:
1584 node = tre_stack_pop_voidptr(stack);
1585 if (first_pass)
1586 node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1587 + ((tre_catenation_t *)node->obj)->right->num_tags;
1588 break;
1589
1590 case ADDTAGS_AFTER_UNION_LEFT:
1591 /* Lift the bottom of the `regset' array so that when processing
1592 the right operand the items currently in the array are
1593 invisible. The original bottom was saved at ADDTAGS_UNION and
1594 will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1595 while (*regset >= 0)
1596 regset++;
1597 break;
1598
1599 case ADDTAGS_AFTER_UNION_RIGHT:
1600 {
1601 int added_tags, tag_left, tag_right;
1602 tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1603 tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1604 node = tre_stack_pop_voidptr(stack);
1605 added_tags = tre_stack_pop_int(stack);
1606 if (first_pass)
1607 {
1608 node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1609 + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1610 + ((node->num_submatches > 0) ? 2 : 0);
1611 }
1612 regset = tre_stack_pop_voidptr(stack);
1613 tag_left = tre_stack_pop_int(stack);
1614 tag_right = tre_stack_pop_int(stack);
1615
1616 /* Add tags after both children, the left child gets a smaller
1617 tag than the right child. This guarantees that we prefer
1618 the left child over the right child. */
1619 /* XXX - This is not always necessary (if the children have
1620 tags which must be seen for every match of that child). */
1621 /* XXX - Check if this is the only place where tre_add_tag_right
1622 is used. If so, use tre_add_tag_left (putting the tag before
1623 the child as opposed after the child) and throw away
1624 tre_add_tag_right. */
1625 if (node->num_submatches > 0)
1626 {
1627 if (!first_pass)
1628 {
1629 status = tre_add_tag_right(mem, left, tag_left);
1630 tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1631 if (status == REG_OK)
1632 status = tre_add_tag_right(mem, right, tag_right);
1633 tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1634 }
1635 num_tags += 2;
1636 }
1637 direction = TRE_TAG_MAXIMIZE;
1638 break;
1639 }
1640
1641 default:
1642 assert(0);
1643 break;
1644
1645 } /* end switch(symbol) */
1646 } /* end while(tre_stack_num_objects(stack) > bottom) */
1647
1648 if (!first_pass)
1649 tre_purge_regset(regset, tnfa, tag);
1650
1651 if (!first_pass && minimal_tag >= 0)
1652 {
1653 int i;
1654 for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1655 tnfa->minimal_tags[i] = tag;
1656 tnfa->minimal_tags[i + 1] = minimal_tag;
1657 tnfa->minimal_tags[i + 2] = -1;
1658 minimal_tag = -1;
1659 num_minimals++;
1660 }
1661
1662 assert(tree->num_tags == num_tags);
1663 tnfa->end_tag = num_tags;
1664 tnfa->num_tags = num_tags;
1665 tnfa->num_minimals = num_minimals;
1666 xfree(orig_regset);
1667 xfree(parents);
1668 xfree(saved_states);
1669 return status;
1670 }
1671
1672
1673
1674 /*
1675 AST to TNFA compilation routines.
1676 */
1677
1678 typedef enum {
1679 COPY_RECURSE,
1680 COPY_SET_RESULT_PTR
1681 } tre_copyast_symbol_t;
1682
1683 /* Flags for tre_copy_ast(). */
1684 #define COPY_REMOVE_TAGS 1
1685 #define COPY_MAXIMIZE_FIRST_TAG 2
1686
1687 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)1688 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1689 int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1690 tre_ast_node_t **copy, int *max_pos)
1691 {
1692 reg_errcode_t status = REG_OK;
1693 int bottom = tre_stack_num_objects(stack);
1694 int num_copied = 0;
1695 int first_tag = 1;
1696 tre_ast_node_t **result = copy;
1697 tre_copyast_symbol_t symbol;
1698
1699 STACK_PUSH(stack, voidptr, ast);
1700 STACK_PUSH(stack, int, COPY_RECURSE);
1701
1702 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1703 {
1704 tre_ast_node_t *node;
1705 if (status != REG_OK)
1706 break;
1707
1708 symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1709 switch (symbol)
1710 {
1711 case COPY_SET_RESULT_PTR:
1712 result = tre_stack_pop_voidptr(stack);
1713 break;
1714 case COPY_RECURSE:
1715 node = tre_stack_pop_voidptr(stack);
1716 switch (node->type)
1717 {
1718 case LITERAL:
1719 {
1720 tre_literal_t *lit = node->obj;
1721 int pos = lit->position;
1722 int min = lit->code_min;
1723 int max = lit->code_max;
1724 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1725 {
1726 /* XXX - e.g. [ab] has only one position but two
1727 nodes, so we are creating holes in the state space
1728 here. Not fatal, just wastes memory. */
1729 pos += *pos_add;
1730 num_copied++;
1731 }
1732 else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1733 {
1734 /* Change this tag to empty. */
1735 min = EMPTY;
1736 max = pos = -1;
1737 }
1738 else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1739 && first_tag)
1740 {
1741 /* Maximize the first tag. */
1742 tag_directions[max] = TRE_TAG_MAXIMIZE;
1743 first_tag = 0;
1744 }
1745 *result = tre_ast_new_literal(mem, min, max, pos);
1746 if (*result == NULL)
1747 status = REG_ESPACE;
1748 else {
1749 tre_literal_t *p = (*result)->obj;
1750 p->class = lit->class;
1751 p->neg_classes = lit->neg_classes;
1752 }
1753
1754 if (pos > *max_pos)
1755 *max_pos = pos;
1756 break;
1757 }
1758 case UNION:
1759 {
1760 tre_union_t *uni = node->obj;
1761 tre_union_t *tmp;
1762 *result = tre_ast_new_union(mem, uni->left, uni->right);
1763 if (*result == NULL)
1764 {
1765 status = REG_ESPACE;
1766 break;
1767 }
1768 tmp = (*result)->obj;
1769 result = &tmp->left;
1770 STACK_PUSHX(stack, voidptr, uni->right);
1771 STACK_PUSHX(stack, int, COPY_RECURSE);
1772 STACK_PUSHX(stack, voidptr, &tmp->right);
1773 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1774 STACK_PUSHX(stack, voidptr, uni->left);
1775 STACK_PUSHX(stack, int, COPY_RECURSE);
1776 break;
1777 }
1778 case CATENATION:
1779 {
1780 tre_catenation_t *cat = node->obj;
1781 tre_catenation_t *tmp;
1782 *result = tre_ast_new_catenation(mem, cat->left, cat->right);
1783 if (*result == NULL)
1784 {
1785 status = REG_ESPACE;
1786 break;
1787 }
1788 tmp = (*result)->obj;
1789 tmp->left = NULL;
1790 tmp->right = NULL;
1791 result = &tmp->left;
1792
1793 STACK_PUSHX(stack, voidptr, cat->right);
1794 STACK_PUSHX(stack, int, COPY_RECURSE);
1795 STACK_PUSHX(stack, voidptr, &tmp->right);
1796 STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1797 STACK_PUSHX(stack, voidptr, cat->left);
1798 STACK_PUSHX(stack, int, COPY_RECURSE);
1799 break;
1800 }
1801 case ITERATION:
1802 {
1803 tre_iteration_t *iter = node->obj;
1804 STACK_PUSHX(stack, voidptr, iter->arg);
1805 STACK_PUSHX(stack, int, COPY_RECURSE);
1806 *result = tre_ast_new_iter(mem, iter->arg, iter->min,
1807 iter->max, iter->minimal);
1808 if (*result == NULL)
1809 {
1810 status = REG_ESPACE;
1811 break;
1812 }
1813 iter = (*result)->obj;
1814 result = &iter->arg;
1815 break;
1816 }
1817 default:
1818 assert(0);
1819 break;
1820 }
1821 break;
1822 }
1823 }
1824 *pos_add += num_copied;
1825 return status;
1826 }
1827
1828 typedef enum {
1829 EXPAND_RECURSE,
1830 EXPAND_AFTER_ITER
1831 } tre_expand_ast_symbol_t;
1832
1833 /* Expands each iteration node that has a finite nonzero minimum or maximum
1834 iteration count to a catenated sequence of copies of the node. */
1835 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions)1836 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1837 int *position, tre_tag_direction_t *tag_directions)
1838 {
1839 reg_errcode_t status = REG_OK;
1840 int bottom = tre_stack_num_objects(stack);
1841 int pos_add = 0;
1842 int pos_add_total = 0;
1843 int max_pos = 0;
1844 int iter_depth = 0;
1845
1846 STACK_PUSHR(stack, voidptr, ast);
1847 STACK_PUSHR(stack, int, EXPAND_RECURSE);
1848 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1849 {
1850 tre_ast_node_t *node;
1851 tre_expand_ast_symbol_t symbol;
1852
1853 if (status != REG_OK)
1854 break;
1855
1856 symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1857 node = tre_stack_pop_voidptr(stack);
1858 switch (symbol)
1859 {
1860 case EXPAND_RECURSE:
1861 switch (node->type)
1862 {
1863 case LITERAL:
1864 {
1865 tre_literal_t *lit= node->obj;
1866 if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1867 {
1868 lit->position += pos_add;
1869 if (lit->position > max_pos)
1870 max_pos = lit->position;
1871 }
1872 break;
1873 }
1874 case UNION:
1875 {
1876 tre_union_t *uni = node->obj;
1877 STACK_PUSHX(stack, voidptr, uni->right);
1878 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1879 STACK_PUSHX(stack, voidptr, uni->left);
1880 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1881 break;
1882 }
1883 case CATENATION:
1884 {
1885 tre_catenation_t *cat = node->obj;
1886 STACK_PUSHX(stack, voidptr, cat->right);
1887 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1888 STACK_PUSHX(stack, voidptr, cat->left);
1889 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1890 break;
1891 }
1892 case ITERATION:
1893 {
1894 tre_iteration_t *iter = node->obj;
1895 STACK_PUSHX(stack, int, pos_add);
1896 STACK_PUSHX(stack, voidptr, node);
1897 STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1898 STACK_PUSHX(stack, voidptr, iter->arg);
1899 STACK_PUSHX(stack, int, EXPAND_RECURSE);
1900 /* If we are going to expand this node at EXPAND_AFTER_ITER
1901 then don't increase the `pos' fields of the nodes now, it
1902 will get done when expanding. */
1903 if (iter->min > 1 || iter->max > 1)
1904 pos_add = 0;
1905 iter_depth++;
1906 break;
1907 }
1908 default:
1909 assert(0);
1910 break;
1911 }
1912 break;
1913 case EXPAND_AFTER_ITER:
1914 {
1915 tre_iteration_t *iter = node->obj;
1916 int pos_add_last;
1917 pos_add = tre_stack_pop_int(stack);
1918 pos_add_last = pos_add;
1919 if (iter->min > 1 || iter->max > 1)
1920 {
1921 tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1922 int j;
1923 int pos_add_save = pos_add;
1924
1925 /* Create a catenated sequence of copies of the node. */
1926 for (j = 0; j < iter->min; j++)
1927 {
1928 tre_ast_node_t *copy;
1929 /* Remove tags from all but the last copy. */
1930 int flags = ((j + 1 < iter->min)
1931 ? COPY_REMOVE_TAGS
1932 : COPY_MAXIMIZE_FIRST_TAG);
1933 pos_add_save = pos_add;
1934 status = tre_copy_ast(mem, stack, iter->arg, flags,
1935 &pos_add, tag_directions, ©,
1936 &max_pos);
1937 if (status != REG_OK)
1938 return status;
1939 if (seq1 != NULL)
1940 seq1 = tre_ast_new_catenation(mem, seq1, copy);
1941 else
1942 seq1 = copy;
1943 if (seq1 == NULL)
1944 return REG_ESPACE;
1945 }
1946
1947 if (iter->max == -1)
1948 {
1949 /* No upper limit. */
1950 pos_add_save = pos_add;
1951 status = tre_copy_ast(mem, stack, iter->arg, 0,
1952 &pos_add, NULL, &seq2, &max_pos);
1953 if (status != REG_OK)
1954 return status;
1955 seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1956 if (seq2 == NULL)
1957 return REG_ESPACE;
1958 }
1959 else
1960 {
1961 for (j = iter->min; j < iter->max; j++)
1962 {
1963 tre_ast_node_t *tmp, *copy;
1964 pos_add_save = pos_add;
1965 status = tre_copy_ast(mem, stack, iter->arg, 0,
1966 &pos_add, NULL, ©, &max_pos);
1967 if (status != REG_OK)
1968 return status;
1969 if (seq2 != NULL)
1970 seq2 = tre_ast_new_catenation(mem, copy, seq2);
1971 else
1972 seq2 = copy;
1973 if (seq2 == NULL)
1974 return REG_ESPACE;
1975 tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1976 if (tmp == NULL)
1977 return REG_ESPACE;
1978 seq2 = tre_ast_new_union(mem, tmp, seq2);
1979 if (seq2 == NULL)
1980 return REG_ESPACE;
1981 }
1982 }
1983
1984 pos_add = pos_add_save;
1985 if (seq1 == NULL)
1986 seq1 = seq2;
1987 else if (seq2 != NULL)
1988 seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1989 if (seq1 == NULL)
1990 return REG_ESPACE;
1991 node->obj = seq1->obj;
1992 node->type = seq1->type;
1993 }
1994
1995 iter_depth--;
1996 pos_add_total += pos_add - pos_add_last;
1997 if (iter_depth == 0)
1998 pos_add = pos_add_total;
1999
2000 break;
2001 }
2002 default:
2003 assert(0);
2004 break;
2005 }
2006 }
2007
2008 *position += pos_add_total;
2009
2010 /* `max_pos' should never be larger than `*position' if the above
2011 code works, but just an extra safeguard let's make sure
2012 `*position' is set large enough so enough memory will be
2013 allocated for the transition table. */
2014 if (max_pos > *position)
2015 *position = max_pos;
2016
2017 return status;
2018 }
2019
2020 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)2021 tre_set_empty(tre_mem_t mem)
2022 {
2023 tre_pos_and_tags_t *new_set;
2024
2025 new_set = tre_mem_calloc(mem, sizeof(*new_set));
2026 if (new_set == NULL)
2027 return NULL;
2028
2029 new_set[0].position = -1;
2030 new_set[0].code_min = -1;
2031 new_set[0].code_max = -1;
2032
2033 return new_set;
2034 }
2035
2036 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)2037 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2038 tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2039 {
2040 tre_pos_and_tags_t *new_set;
2041
2042 new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2043 if (new_set == NULL)
2044 return NULL;
2045
2046 new_set[0].position = position;
2047 new_set[0].code_min = code_min;
2048 new_set[0].code_max = code_max;
2049 new_set[0].class = class;
2050 new_set[0].neg_classes = neg_classes;
2051 new_set[0].backref = backref;
2052 new_set[1].position = -1;
2053 new_set[1].code_min = -1;
2054 new_set[1].code_max = -1;
2055
2056 return new_set;
2057 }
2058
2059 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions)2060 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2061 int *tags, int assertions)
2062 {
2063 int s1, s2, i, j;
2064 tre_pos_and_tags_t *new_set;
2065 int *new_tags;
2066 int num_tags;
2067
2068 for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2069 for (s1 = 0; set1[s1].position >= 0; s1++);
2070 for (s2 = 0; set2[s2].position >= 0; s2++);
2071 new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2072 if (!new_set )
2073 return NULL;
2074
2075 for (s1 = 0; set1[s1].position >= 0; s1++)
2076 {
2077 new_set[s1].position = set1[s1].position;
2078 new_set[s1].code_min = set1[s1].code_min;
2079 new_set[s1].code_max = set1[s1].code_max;
2080 new_set[s1].assertions = set1[s1].assertions | assertions;
2081 new_set[s1].class = set1[s1].class;
2082 new_set[s1].neg_classes = set1[s1].neg_classes;
2083 new_set[s1].backref = set1[s1].backref;
2084 if (set1[s1].tags == NULL && tags == NULL)
2085 new_set[s1].tags = NULL;
2086 else
2087 {
2088 for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2089 new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2090 * (i + num_tags + 1)));
2091 if (new_tags == NULL)
2092 return NULL;
2093 for (j = 0; j < i; j++)
2094 new_tags[j] = set1[s1].tags[j];
2095 for (i = 0; i < num_tags; i++)
2096 new_tags[j + i] = tags[i];
2097 new_tags[j + i] = -1;
2098 new_set[s1].tags = new_tags;
2099 }
2100 }
2101
2102 for (s2 = 0; set2[s2].position >= 0; s2++)
2103 {
2104 new_set[s1 + s2].position = set2[s2].position;
2105 new_set[s1 + s2].code_min = set2[s2].code_min;
2106 new_set[s1 + s2].code_max = set2[s2].code_max;
2107 /* XXX - why not | assertions here as well? */
2108 new_set[s1 + s2].assertions = set2[s2].assertions;
2109 new_set[s1 + s2].class = set2[s2].class;
2110 new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2111 new_set[s1 + s2].backref = set2[s2].backref;
2112 if (set2[s2].tags == NULL)
2113 new_set[s1 + s2].tags = NULL;
2114 else
2115 {
2116 for (i = 0; set2[s2].tags[i] >= 0; i++);
2117 new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2118 if (new_tags == NULL)
2119 return NULL;
2120 for (j = 0; j < i; j++)
2121 new_tags[j] = set2[s2].tags[j];
2122 new_tags[j] = -1;
2123 new_set[s1 + s2].tags = new_tags;
2124 }
2125 }
2126 new_set[s1 + s2].position = -1;
2127 return new_set;
2128 }
2129
2130 /* Finds the empty path through `node' which is the one that should be
2131 taken according to POSIX.2 rules, and adds the tags on that path to
2132 `tags'. `tags' may be NULL. If `num_tags_seen' is not NULL, it is
2133 set to the number of tags seen on the path. */
2134 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * num_tags_seen)2135 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2136 int *assertions, int *num_tags_seen)
2137 {
2138 tre_literal_t *lit;
2139 tre_union_t *uni;
2140 tre_catenation_t *cat;
2141 tre_iteration_t *iter;
2142 int i;
2143 int bottom = tre_stack_num_objects(stack);
2144 reg_errcode_t status = REG_OK;
2145 if (num_tags_seen)
2146 *num_tags_seen = 0;
2147
2148 status = tre_stack_push_voidptr(stack, node);
2149
2150 /* Walk through the tree recursively. */
2151 while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2152 {
2153 node = tre_stack_pop_voidptr(stack);
2154
2155 switch (node->type)
2156 {
2157 case LITERAL:
2158 lit = (tre_literal_t *)node->obj;
2159 switch (lit->code_min)
2160 {
2161 case TAG:
2162 if (lit->code_max >= 0)
2163 {
2164 if (tags != NULL)
2165 {
2166 /* Add the tag to `tags'. */
2167 for (i = 0; tags[i] >= 0; i++)
2168 if (tags[i] == lit->code_max)
2169 break;
2170 if (tags[i] < 0)
2171 {
2172 tags[i] = lit->code_max;
2173 tags[i + 1] = -1;
2174 }
2175 }
2176 if (num_tags_seen)
2177 (*num_tags_seen)++;
2178 }
2179 break;
2180 case ASSERTION:
2181 assert(lit->code_max >= 1
2182 || lit->code_max <= ASSERT_LAST);
2183 if (assertions != NULL)
2184 *assertions |= lit->code_max;
2185 break;
2186 case EMPTY:
2187 break;
2188 default:
2189 assert(0);
2190 break;
2191 }
2192 break;
2193
2194 case UNION:
2195 /* Subexpressions starting earlier take priority over ones
2196 starting later, so we prefer the left subexpression over the
2197 right subexpression. */
2198 uni = (tre_union_t *)node->obj;
2199 if (uni->left->nullable)
2200 STACK_PUSHX(stack, voidptr, uni->left)
2201 else if (uni->right->nullable)
2202 STACK_PUSHX(stack, voidptr, uni->right)
2203 else
2204 assert(0);
2205 break;
2206
2207 case CATENATION:
2208 /* The path must go through both children. */
2209 cat = (tre_catenation_t *)node->obj;
2210 assert(cat->left->nullable);
2211 assert(cat->right->nullable);
2212 STACK_PUSHX(stack, voidptr, cat->left);
2213 STACK_PUSHX(stack, voidptr, cat->right);
2214 break;
2215
2216 case ITERATION:
2217 /* A match with an empty string is preferred over no match at
2218 all, so we go through the argument if possible. */
2219 iter = (tre_iteration_t *)node->obj;
2220 if (iter->arg->nullable)
2221 STACK_PUSHX(stack, voidptr, iter->arg);
2222 break;
2223
2224 default:
2225 assert(0);
2226 break;
2227 }
2228 }
2229
2230 return status;
2231 }
2232
2233
2234 typedef enum {
2235 NFL_RECURSE,
2236 NFL_POST_UNION,
2237 NFL_POST_CATENATION,
2238 NFL_POST_ITERATION
2239 } tre_nfl_stack_symbol_t;
2240
2241
2242 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2243 the nodes of the AST `tree'. */
2244 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)2245 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2246 {
2247 int bottom = tre_stack_num_objects(stack);
2248
2249 STACK_PUSHR(stack, voidptr, tree);
2250 STACK_PUSHR(stack, int, NFL_RECURSE);
2251
2252 while (tre_stack_num_objects(stack) > bottom)
2253 {
2254 tre_nfl_stack_symbol_t symbol;
2255 tre_ast_node_t *node;
2256
2257 symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2258 node = tre_stack_pop_voidptr(stack);
2259 switch (symbol)
2260 {
2261 case NFL_RECURSE:
2262 switch (node->type)
2263 {
2264 case LITERAL:
2265 {
2266 tre_literal_t *lit = (tre_literal_t *)node->obj;
2267 if (IS_BACKREF(lit))
2268 {
2269 /* Back references: nullable = false, firstpos = {i},
2270 lastpos = {i}. */
2271 node->nullable = 0;
2272 node->firstpos = tre_set_one(mem, lit->position, 0,
2273 TRE_CHAR_MAX, 0, NULL, -1);
2274 if (!node->firstpos)
2275 return REG_ESPACE;
2276 node->lastpos = tre_set_one(mem, lit->position, 0,
2277 TRE_CHAR_MAX, 0, NULL,
2278 (int)lit->code_max);
2279 if (!node->lastpos)
2280 return REG_ESPACE;
2281 }
2282 else if (lit->code_min < 0)
2283 {
2284 /* Tags, empty strings, params, and zero width assertions:
2285 nullable = true, firstpos = {}, and lastpos = {}. */
2286 node->nullable = 1;
2287 node->firstpos = tre_set_empty(mem);
2288 if (!node->firstpos)
2289 return REG_ESPACE;
2290 node->lastpos = tre_set_empty(mem);
2291 if (!node->lastpos)
2292 return REG_ESPACE;
2293 }
2294 else
2295 {
2296 /* Literal at position i: nullable = false, firstpos = {i},
2297 lastpos = {i}. */
2298 node->nullable = 0;
2299 node->firstpos =
2300 tre_set_one(mem, lit->position, (int)lit->code_min,
2301 (int)lit->code_max, 0, NULL, -1);
2302 if (!node->firstpos)
2303 return REG_ESPACE;
2304 node->lastpos = tre_set_one(mem, lit->position,
2305 (int)lit->code_min,
2306 (int)lit->code_max,
2307 lit->class, lit->neg_classes,
2308 -1);
2309 if (!node->lastpos)
2310 return REG_ESPACE;
2311 }
2312 break;
2313 }
2314
2315 case UNION:
2316 /* Compute the attributes for the two subtrees, and after that
2317 for this node. */
2318 STACK_PUSHR(stack, voidptr, node);
2319 STACK_PUSHR(stack, int, NFL_POST_UNION);
2320 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2321 STACK_PUSHR(stack, int, NFL_RECURSE);
2322 STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2323 STACK_PUSHR(stack, int, NFL_RECURSE);
2324 break;
2325
2326 case CATENATION:
2327 /* Compute the attributes for the two subtrees, and after that
2328 for this node. */
2329 STACK_PUSHR(stack, voidptr, node);
2330 STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2331 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2332 STACK_PUSHR(stack, int, NFL_RECURSE);
2333 STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2334 STACK_PUSHR(stack, int, NFL_RECURSE);
2335 break;
2336
2337 case ITERATION:
2338 /* Compute the attributes for the subtree, and after that for
2339 this node. */
2340 STACK_PUSHR(stack, voidptr, node);
2341 STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2342 STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2343 STACK_PUSHR(stack, int, NFL_RECURSE);
2344 break;
2345 }
2346 break; /* end case: NFL_RECURSE */
2347
2348 case NFL_POST_UNION:
2349 {
2350 tre_union_t *uni = (tre_union_t *)node->obj;
2351 node->nullable = uni->left->nullable || uni->right->nullable;
2352 node->firstpos = tre_set_union(mem, uni->left->firstpos,
2353 uni->right->firstpos, NULL, 0);
2354 if (!node->firstpos)
2355 return REG_ESPACE;
2356 node->lastpos = tre_set_union(mem, uni->left->lastpos,
2357 uni->right->lastpos, NULL, 0);
2358 if (!node->lastpos)
2359 return REG_ESPACE;
2360 break;
2361 }
2362
2363 case NFL_POST_ITERATION:
2364 {
2365 tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2366
2367 if (iter->min == 0 || iter->arg->nullable)
2368 node->nullable = 1;
2369 else
2370 node->nullable = 0;
2371 node->firstpos = iter->arg->firstpos;
2372 node->lastpos = iter->arg->lastpos;
2373 break;
2374 }
2375
2376 case NFL_POST_CATENATION:
2377 {
2378 int num_tags, *tags, assertions;
2379 reg_errcode_t status;
2380 tre_catenation_t *cat = node->obj;
2381 node->nullable = cat->left->nullable && cat->right->nullable;
2382
2383 /* Compute firstpos. */
2384 if (cat->left->nullable)
2385 {
2386 /* The left side matches the empty string. Make a first pass
2387 with tre_match_empty() to get the number of tags and
2388 parameters. */
2389 status = tre_match_empty(stack, cat->left,
2390 NULL, NULL, &num_tags);
2391 if (status != REG_OK)
2392 return status;
2393 /* Allocate arrays for the tags and parameters. */
2394 tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2395 if (!tags)
2396 return REG_ESPACE;
2397 tags[0] = -1;
2398 assertions = 0;
2399 /* Second pass with tre_mach_empty() to get the list of
2400 tags and parameters. */
2401 status = tre_match_empty(stack, cat->left, tags,
2402 &assertions, NULL);
2403 if (status != REG_OK)
2404 {
2405 xfree(tags);
2406 return status;
2407 }
2408 node->firstpos =
2409 tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2410 tags, assertions);
2411 xfree(tags);
2412 if (!node->firstpos)
2413 return REG_ESPACE;
2414 }
2415 else
2416 {
2417 node->firstpos = cat->left->firstpos;
2418 }
2419
2420 /* Compute lastpos. */
2421 if (cat->right->nullable)
2422 {
2423 /* The right side matches the empty string. Make a first pass
2424 with tre_match_empty() to get the number of tags and
2425 parameters. */
2426 status = tre_match_empty(stack, cat->right,
2427 NULL, NULL, &num_tags);
2428 if (status != REG_OK)
2429 return status;
2430 /* Allocate arrays for the tags and parameters. */
2431 tags = xmalloc(sizeof(int) * (num_tags + 1));
2432 if (!tags)
2433 return REG_ESPACE;
2434 tags[0] = -1;
2435 assertions = 0;
2436 /* Second pass with tre_mach_empty() to get the list of
2437 tags and parameters. */
2438 status = tre_match_empty(stack, cat->right, tags,
2439 &assertions, NULL);
2440 if (status != REG_OK)
2441 {
2442 xfree(tags);
2443 return status;
2444 }
2445 node->lastpos =
2446 tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2447 tags, assertions);
2448 xfree(tags);
2449 if (!node->lastpos)
2450 return REG_ESPACE;
2451 }
2452 else
2453 {
2454 node->lastpos = cat->right->lastpos;
2455 }
2456 break;
2457 }
2458
2459 default:
2460 assert(0);
2461 break;
2462 }
2463 }
2464
2465 return REG_OK;
2466 }
2467
2468
2469 /* Adds a transition from each position in `p1' to each position in `p2'. */
2470 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)2471 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2472 tre_tnfa_transition_t *transitions,
2473 int *counts, int *offs)
2474 {
2475 tre_pos_and_tags_t *orig_p2 = p2;
2476 tre_tnfa_transition_t *trans;
2477 int i, j, k, l, dup, prev_p2_pos;
2478
2479 if (transitions != NULL)
2480 while (p1->position >= 0)
2481 {
2482 p2 = orig_p2;
2483 prev_p2_pos = -1;
2484 while (p2->position >= 0)
2485 {
2486 /* Optimization: if this position was already handled, skip it. */
2487 if (p2->position == prev_p2_pos)
2488 {
2489 p2++;
2490 continue;
2491 }
2492 prev_p2_pos = p2->position;
2493 /* Set `trans' to point to the next unused transition from
2494 position `p1->position'. */
2495 trans = transitions + offs[p1->position];
2496 while (trans->state != NULL)
2497 {
2498 #if 0
2499 /* If we find a previous transition from `p1->position' to
2500 `p2->position', it is overwritten. This can happen only
2501 if there are nested loops in the regexp, like in "((a)*)*".
2502 In POSIX.2 repetition using the outer loop is always
2503 preferred over using the inner loop. Therefore the
2504 transition for the inner loop is useless and can be thrown
2505 away. */
2506 /* XXX - The same position is used for all nodes in a bracket
2507 expression, so this optimization cannot be used (it will
2508 break bracket expressions) unless I figure out a way to
2509 detect it here. */
2510 if (trans->state_id == p2->position)
2511 {
2512 break;
2513 }
2514 #endif
2515 trans++;
2516 }
2517
2518 if (trans->state == NULL)
2519 (trans + 1)->state = NULL;
2520 /* Use the character ranges, assertions, etc. from `p1' for
2521 the transition from `p1' to `p2'. */
2522 trans->code_min = p1->code_min;
2523 trans->code_max = p1->code_max;
2524 trans->state = transitions + offs[p2->position];
2525 trans->state_id = p2->position;
2526 trans->assertions = p1->assertions | p2->assertions
2527 | (p1->class ? ASSERT_CHAR_CLASS : 0)
2528 | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2529 if (p1->backref >= 0)
2530 {
2531 assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2532 assert(p2->backref < 0);
2533 trans->u.backref = p1->backref;
2534 trans->assertions |= ASSERT_BACKREF;
2535 }
2536 else
2537 trans->u.class = p1->class;
2538 if (p1->neg_classes != NULL)
2539 {
2540 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2541 trans->neg_classes =
2542 xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2543 if (trans->neg_classes == NULL)
2544 return REG_ESPACE;
2545 for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2546 trans->neg_classes[i] = p1->neg_classes[i];
2547 trans->neg_classes[i] = (tre_ctype_t)0;
2548 }
2549 else
2550 trans->neg_classes = NULL;
2551
2552 /* Find out how many tags this transition has. */
2553 i = 0;
2554 if (p1->tags != NULL)
2555 while(p1->tags[i] >= 0)
2556 i++;
2557 j = 0;
2558 if (p2->tags != NULL)
2559 while(p2->tags[j] >= 0)
2560 j++;
2561
2562 /* If we are overwriting a transition, free the old tag array. */
2563 if (trans->tags != NULL)
2564 xfree(trans->tags);
2565 trans->tags = NULL;
2566
2567 /* If there were any tags, allocate an array and fill it. */
2568 if (i + j > 0)
2569 {
2570 trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2571 if (!trans->tags)
2572 return REG_ESPACE;
2573 i = 0;
2574 if (p1->tags != NULL)
2575 while(p1->tags[i] >= 0)
2576 {
2577 trans->tags[i] = p1->tags[i];
2578 i++;
2579 }
2580 l = i;
2581 j = 0;
2582 if (p2->tags != NULL)
2583 while (p2->tags[j] >= 0)
2584 {
2585 /* Don't add duplicates. */
2586 dup = 0;
2587 for (k = 0; k < i; k++)
2588 if (trans->tags[k] == p2->tags[j])
2589 {
2590 dup = 1;
2591 break;
2592 }
2593 if (!dup)
2594 trans->tags[l++] = p2->tags[j];
2595 j++;
2596 }
2597 trans->tags[l] = -1;
2598 }
2599
2600 p2++;
2601 }
2602 p1++;
2603 }
2604 else
2605 /* Compute a maximum limit for the number of transitions leaving
2606 from each state. */
2607 while (p1->position >= 0)
2608 {
2609 p2 = orig_p2;
2610 while (p2->position >= 0)
2611 {
2612 counts[p1->position]++;
2613 p2++;
2614 }
2615 p1++;
2616 }
2617 return REG_OK;
2618 }
2619
2620 /* Converts the syntax tree to a TNFA. All the transitions in the TNFA are
2621 labelled with one character range (there are no transitions on empty
2622 strings). The TNFA takes O(n^2) space in the worst case, `n' is size of
2623 the regexp. */
2624 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)2625 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2626 int *counts, int *offs)
2627 {
2628 tre_union_t *uni;
2629 tre_catenation_t *cat;
2630 tre_iteration_t *iter;
2631 reg_errcode_t errcode = REG_OK;
2632
2633 /* XXX - recurse using a stack!. */
2634 switch (node->type)
2635 {
2636 case LITERAL:
2637 break;
2638 case UNION:
2639 uni = (tre_union_t *)node->obj;
2640 errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2641 if (errcode != REG_OK)
2642 return errcode;
2643 errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2644 break;
2645
2646 case CATENATION:
2647 cat = (tre_catenation_t *)node->obj;
2648 /* Add a transition from each position in cat->left->lastpos
2649 to each position in cat->right->firstpos. */
2650 errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2651 transitions, counts, offs);
2652 if (errcode != REG_OK)
2653 return errcode;
2654 errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2655 if (errcode != REG_OK)
2656 return errcode;
2657 errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2658 break;
2659
2660 case ITERATION:
2661 iter = (tre_iteration_t *)node->obj;
2662 assert(iter->max == -1 || iter->max == 1);
2663
2664 if (iter->max == -1)
2665 {
2666 assert(iter->min == 0 || iter->min == 1);
2667 /* Add a transition from each last position in the iterated
2668 expression to each first position. */
2669 errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2670 transitions, counts, offs);
2671 if (errcode != REG_OK)
2672 return errcode;
2673 }
2674 errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2675 break;
2676 }
2677 return errcode;
2678 }
2679
2680
2681 #define ERROR_EXIT(err) \
2682 do \
2683 { \
2684 errcode = err; \
2685 if (/*CONSTCOND*/1) \
2686 goto error_exit; \
2687 } \
2688 while (/*CONSTCOND*/0)
2689
2690
2691 int
regcomp(regex_t * restrict preg,const char * restrict regex,int cflags)2692 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2693 {
2694 tre_stack_t *stack;
2695 tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2696 tre_pos_and_tags_t *p;
2697 int *counts = NULL, *offs = NULL;
2698 int i, add = 0;
2699 tre_tnfa_transition_t *transitions, *initial;
2700 tre_tnfa_t *tnfa = NULL;
2701 tre_submatch_data_t *submatch_data;
2702 tre_tag_direction_t *tag_directions = NULL;
2703 reg_errcode_t errcode;
2704 tre_mem_t mem;
2705
2706 /* Parse context. */
2707 tre_parse_ctx_t parse_ctx;
2708
2709 /* Allocate a stack used throughout the compilation process for various
2710 purposes. */
2711 stack = tre_stack_new(512, 1024000, 128);
2712 if (!stack)
2713 return REG_ESPACE;
2714 /* Allocate a fast memory allocator. */
2715 mem = tre_mem_new();
2716 if (!mem)
2717 {
2718 tre_stack_destroy(stack);
2719 return REG_ESPACE;
2720 }
2721
2722 /* Parse the regexp. */
2723 memset(&parse_ctx, 0, sizeof(parse_ctx));
2724 parse_ctx.mem = mem;
2725 parse_ctx.stack = stack;
2726 parse_ctx.start = regex;
2727 parse_ctx.cflags = cflags;
2728 parse_ctx.max_backref = -1;
2729 errcode = tre_parse(&parse_ctx);
2730 if (errcode != REG_OK)
2731 ERROR_EXIT(errcode);
2732 preg->re_nsub = parse_ctx.submatch_id - 1;
2733 tree = parse_ctx.n;
2734
2735 #ifdef TRE_DEBUG
2736 tre_ast_print(tree);
2737 #endif /* TRE_DEBUG */
2738
2739 /* Referring to nonexistent subexpressions is illegal. */
2740 if (parse_ctx.max_backref > (int)preg->re_nsub)
2741 ERROR_EXIT(REG_ESUBREG);
2742
2743 /* Allocate the TNFA struct. */
2744 tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2745 if (tnfa == NULL)
2746 ERROR_EXIT(REG_ESPACE);
2747 tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2748 tnfa->have_approx = 0;
2749 tnfa->num_submatches = parse_ctx.submatch_id;
2750
2751 /* Set up tags for submatch addressing. If REG_NOSUB is set and the
2752 regexp does not have back references, this can be skipped. */
2753 if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2754 {
2755
2756 /* Figure out how many tags we will need. */
2757 errcode = tre_add_tags(NULL, stack, tree, tnfa);
2758 if (errcode != REG_OK)
2759 ERROR_EXIT(errcode);
2760
2761 if (tnfa->num_tags > 0)
2762 {
2763 tag_directions = xmalloc(sizeof(*tag_directions)
2764 * (tnfa->num_tags + 1));
2765 if (tag_directions == NULL)
2766 ERROR_EXIT(REG_ESPACE);
2767 tnfa->tag_directions = tag_directions;
2768 memset(tag_directions, -1,
2769 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2770 }
2771 tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2772 sizeof(*tnfa->minimal_tags));
2773 if (tnfa->minimal_tags == NULL)
2774 ERROR_EXIT(REG_ESPACE);
2775
2776 submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2777 sizeof(*submatch_data));
2778 if (submatch_data == NULL)
2779 ERROR_EXIT(REG_ESPACE);
2780 tnfa->submatch_data = submatch_data;
2781
2782 errcode = tre_add_tags(mem, stack, tree, tnfa);
2783 if (errcode != REG_OK)
2784 ERROR_EXIT(errcode);
2785
2786 }
2787
2788 /* Expand iteration nodes. */
2789 errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2790 tag_directions);
2791 if (errcode != REG_OK)
2792 ERROR_EXIT(errcode);
2793
2794 /* Add a dummy node for the final state.
2795 XXX - For certain patterns this dummy node can be optimized away,
2796 for example "a*" or "ab*". Figure out a simple way to detect
2797 this possibility. */
2798 tmp_ast_l = tree;
2799 tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2800 if (tmp_ast_r == NULL)
2801 ERROR_EXIT(REG_ESPACE);
2802
2803 tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2804 if (tree == NULL)
2805 ERROR_EXIT(REG_ESPACE);
2806
2807 errcode = tre_compute_nfl(mem, stack, tree);
2808 if (errcode != REG_OK)
2809 ERROR_EXIT(errcode);
2810
2811 counts = xmalloc(sizeof(int) * parse_ctx.position);
2812 if (counts == NULL)
2813 ERROR_EXIT(REG_ESPACE);
2814
2815 offs = xmalloc(sizeof(int) * parse_ctx.position);
2816 if (offs == NULL)
2817 ERROR_EXIT(REG_ESPACE);
2818
2819 for (i = 0; i < parse_ctx.position; i++)
2820 counts[i] = 0;
2821 tre_ast_to_tnfa(tree, NULL, counts, NULL);
2822
2823 add = 0;
2824 for (i = 0; i < parse_ctx.position; i++)
2825 {
2826 offs[i] = add;
2827 add += counts[i] + 1;
2828 counts[i] = 0;
2829 }
2830 transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2831 if (transitions == NULL)
2832 ERROR_EXIT(REG_ESPACE);
2833 tnfa->transitions = transitions;
2834 tnfa->num_transitions = add;
2835
2836 errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2837 if (errcode != REG_OK)
2838 ERROR_EXIT(errcode);
2839
2840 tnfa->firstpos_chars = NULL;
2841
2842 p = tree->firstpos;
2843 i = 0;
2844 while (p->position >= 0)
2845 {
2846 i++;
2847 p++;
2848 }
2849
2850 initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2851 if (initial == NULL)
2852 ERROR_EXIT(REG_ESPACE);
2853 tnfa->initial = initial;
2854
2855 i = 0;
2856 for (p = tree->firstpos; p->position >= 0; p++)
2857 {
2858 initial[i].state = transitions + offs[p->position];
2859 initial[i].state_id = p->position;
2860 initial[i].tags = NULL;
2861 /* Copy the arrays p->tags, and p->params, they are allocated
2862 from a tre_mem object. */
2863 if (p->tags)
2864 {
2865 int j;
2866 for (j = 0; p->tags[j] >= 0; j++);
2867 initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2868 if (!initial[i].tags)
2869 ERROR_EXIT(REG_ESPACE);
2870 memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2871 }
2872 initial[i].assertions = p->assertions;
2873 i++;
2874 }
2875 initial[i].state = NULL;
2876
2877 tnfa->num_transitions = add;
2878 tnfa->final = transitions + offs[tree->lastpos[0].position];
2879 tnfa->num_states = parse_ctx.position;
2880 tnfa->cflags = cflags;
2881
2882 tre_mem_destroy(mem);
2883 tre_stack_destroy(stack);
2884 xfree(counts);
2885 xfree(offs);
2886
2887 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2888 return REG_OK;
2889
2890 error_exit:
2891 /* Free everything that was allocated and return the error code. */
2892 tre_mem_destroy(mem);
2893 if (stack != NULL)
2894 tre_stack_destroy(stack);
2895 if (counts != NULL)
2896 xfree(counts);
2897 if (offs != NULL)
2898 xfree(offs);
2899 preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2900 regfree(preg);
2901 return errcode;
2902 }
2903
2904
2905
2906
2907 void
regfree(regex_t * preg)2908 regfree(regex_t *preg)
2909 {
2910 tre_tnfa_t *tnfa;
2911 unsigned int i;
2912 tre_tnfa_transition_t *trans;
2913
2914 tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2915 if (!tnfa)
2916 return;
2917
2918 for (i = 0; i < tnfa->num_transitions; i++)
2919 if (tnfa->transitions[i].state)
2920 {
2921 if (tnfa->transitions[i].tags)
2922 xfree(tnfa->transitions[i].tags);
2923 if (tnfa->transitions[i].neg_classes)
2924 xfree(tnfa->transitions[i].neg_classes);
2925 }
2926 if (tnfa->transitions)
2927 xfree(tnfa->transitions);
2928
2929 if (tnfa->initial)
2930 {
2931 for (trans = tnfa->initial; trans->state; trans++)
2932 {
2933 if (trans->tags)
2934 xfree(trans->tags);
2935 }
2936 xfree(tnfa->initial);
2937 }
2938
2939 if (tnfa->submatch_data)
2940 {
2941 for (i = 0; i < tnfa->num_submatches; i++)
2942 if (tnfa->submatch_data[i].parents)
2943 xfree(tnfa->submatch_data[i].parents);
2944 xfree(tnfa->submatch_data);
2945 }
2946
2947 if (tnfa->tag_directions)
2948 xfree(tnfa->tag_directions);
2949 if (tnfa->firstpos_chars)
2950 xfree(tnfa->firstpos_chars);
2951 if (tnfa->minimal_tags)
2952 xfree(tnfa->minimal_tags);
2953 xfree(tnfa);
2954 }
2955