• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2   regcomp.c - TRE POSIX compatible regex compilation functions.
3 
4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5   All rights reserved.
6 
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10 
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13 
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17 
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 */
31 
32 #include <string.h>
33 #include <stdlib.h>
34 #include <regex.h>
35 #include <limits.h>
36 #include <stdint.h>
37 #include <ctype.h>
38 
39 #include "tre.h"
40 
41 #include <assert.h>
42 
43 /***********************************************************************
44  from tre-compile.h
45 ***********************************************************************/
46 
47 typedef struct {
48   int position;
49   int code_min;
50   int code_max;
51   int *tags;
52   int assertions;
53   tre_ctype_t class;
54   tre_ctype_t *neg_classes;
55   int backref;
56 } tre_pos_and_tags_t;
57 
58 
59 /***********************************************************************
60  from tre-ast.c and tre-ast.h
61 ***********************************************************************/
62 
63 /* The different AST node types. */
64 typedef enum {
65   LITERAL,
66   CATENATION,
67   ITERATION,
68   UNION
69 } tre_ast_type_t;
70 
71 /* Special subtypes of TRE_LITERAL. */
72 #define EMPTY	  -1   /* Empty leaf (denotes empty string). */
73 #define ASSERTION -2   /* Assertion leaf. */
74 #define TAG	  -3   /* Tag leaf. */
75 #define BACKREF	  -4   /* Back reference leaf. */
76 
77 #define IS_SPECIAL(x)	((x)->code_min < 0)
78 #define IS_EMPTY(x)	((x)->code_min == EMPTY)
79 #define IS_ASSERTION(x) ((x)->code_min == ASSERTION)
80 #define IS_TAG(x)	((x)->code_min == TAG)
81 #define IS_BACKREF(x)	((x)->code_min == BACKREF)
82 
83 
84 /* A generic AST node.  All AST nodes consist of this node on the top
85    level with `obj' pointing to the actual content. */
86 typedef struct {
87   tre_ast_type_t type;   /* Type of the node. */
88   void *obj;             /* Pointer to actual node. */
89   int nullable;
90   int submatch_id;
91   int num_submatches;
92   int num_tags;
93   tre_pos_and_tags_t *firstpos;
94   tre_pos_and_tags_t *lastpos;
95 } tre_ast_node_t;
96 
97 
98 /* A "literal" node.  These are created for assertions, back references,
99    tags, matching parameter settings, and all expressions that match one
100    character. */
101 typedef struct {
102   long code_min;
103   long code_max;
104   int position;
105   tre_ctype_t class;
106   tre_ctype_t *neg_classes;
107 } tre_literal_t;
108 
109 /* A "catenation" node.	 These are created when two regexps are concatenated.
110    If there are more than one subexpressions in sequence, the `left' part
111    holds all but the last, and `right' part holds the last subexpression
112    (catenation is left associative). */
113 typedef struct {
114   tre_ast_node_t *left;
115   tre_ast_node_t *right;
116 } tre_catenation_t;
117 
118 /* An "iteration" node.	 These are created for the "*", "+", "?", and "{m,n}"
119    operators. */
120 typedef struct {
121   /* Subexpression to match. */
122   tre_ast_node_t *arg;
123   /* Minimum number of consecutive matches. */
124   int min;
125   /* Maximum number of consecutive matches. */
126   int max;
127   /* If 0, match as many characters as possible, if 1 match as few as
128      possible.	Note that this does not always mean the same thing as
129      matching as many/few repetitions as possible. */
130   unsigned int minimal:1;
131 } tre_iteration_t;
132 
133 /* An "union" node.  These are created for the "|" operator. */
134 typedef struct {
135   tre_ast_node_t *left;
136   tre_ast_node_t *right;
137 } tre_union_t;
138 
139 
140 static tre_ast_node_t *
tre_ast_new_node(tre_mem_t mem,tre_ast_type_t type,void * obj)141 tre_ast_new_node(tre_mem_t mem, tre_ast_type_t type, void *obj)
142 {
143 	tre_ast_node_t *node = tre_mem_calloc(mem, sizeof *node);
144 	if (!node || !obj)
145 		return 0;
146 	node->obj = obj;
147 	node->type = type;
148 	node->nullable = -1;
149 	node->submatch_id = -1;
150 	return node;
151 }
152 
153 static tre_ast_node_t *
tre_ast_new_literal(tre_mem_t mem,int code_min,int code_max,int position)154 tre_ast_new_literal(tre_mem_t mem, int code_min, int code_max, int position)
155 {
156 	tre_ast_node_t *node;
157 	tre_literal_t *lit;
158 
159 	lit = tre_mem_calloc(mem, sizeof *lit);
160 	node = tre_ast_new_node(mem, LITERAL, lit);
161 	if (!node)
162 		return 0;
163 	lit->code_min = code_min;
164 	lit->code_max = code_max;
165 	lit->position = position;
166 	return node;
167 }
168 
169 static tre_ast_node_t *
tre_ast_new_iter(tre_mem_t mem,tre_ast_node_t * arg,int min,int max,int minimal)170 tre_ast_new_iter(tre_mem_t mem, tre_ast_node_t *arg, int min, int max, int minimal)
171 {
172 	tre_ast_node_t *node;
173 	tre_iteration_t *iter;
174 
175 	iter = tre_mem_calloc(mem, sizeof *iter);
176 	node = tre_ast_new_node(mem, ITERATION, iter);
177 	if (!node)
178 		return 0;
179 	iter->arg = arg;
180 	iter->min = min;
181 	iter->max = max;
182 	iter->minimal = minimal;
183 	node->num_submatches = arg->num_submatches;
184 	return node;
185 }
186 
187 static tre_ast_node_t *
tre_ast_new_union(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)188 tre_ast_new_union(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
189 {
190 	tre_ast_node_t *node;
191 	tre_union_t *un;
192 
193 	if (!left)
194 		return right;
195 	un = tre_mem_calloc(mem, sizeof *un);
196 	node = tre_ast_new_node(mem, UNION, un);
197 	if (!node || !right)
198 		return 0;
199 	un->left = left;
200 	un->right = right;
201 	node->num_submatches = left->num_submatches + right->num_submatches;
202 	return node;
203 }
204 
205 static tre_ast_node_t *
tre_ast_new_catenation(tre_mem_t mem,tre_ast_node_t * left,tre_ast_node_t * right)206 tre_ast_new_catenation(tre_mem_t mem, tre_ast_node_t *left, tre_ast_node_t *right)
207 {
208 	tre_ast_node_t *node;
209 	tre_catenation_t *cat;
210 
211 	if (!left)
212 		return right;
213 	cat = tre_mem_calloc(mem, sizeof *cat);
214 	node = tre_ast_new_node(mem, CATENATION, cat);
215 	if (!node)
216 		return 0;
217 	cat->left = left;
218 	cat->right = right;
219 	node->num_submatches = left->num_submatches + right->num_submatches;
220 	return node;
221 }
222 
223 
224 /***********************************************************************
225  from tre-stack.c and tre-stack.h
226 ***********************************************************************/
227 
228 typedef struct tre_stack_rec tre_stack_t;
229 
230 /* Creates a new stack object.	`size' is initial size in bytes, `max_size'
231    is maximum size, and `increment' specifies how much more space will be
232    allocated with realloc() if all space gets used up.	Returns the stack
233    object or NULL if out of memory. */
234 static tre_stack_t *
235 tre_stack_new(int size, int max_size, int increment);
236 
237 /* Frees the stack object. */
238 static void
239 tre_stack_destroy(tre_stack_t *s);
240 
241 /* Returns the current number of objects in the stack. */
242 static int
243 tre_stack_num_objects(tre_stack_t *s);
244 
245 /* Each tre_stack_push_*(tre_stack_t *s, <type> value) function pushes
246    `value' on top of stack `s'.  Returns REG_ESPACE if out of memory.
247    This tries to realloc() more space before failing if maximum size
248    has not yet been reached.  Returns REG_OK if successful. */
249 #define declare_pushf(typetag, type)					      \
250   static reg_errcode_t tre_stack_push_ ## typetag(tre_stack_t *s, type value)
251 
252 declare_pushf(voidptr, void *);
253 declare_pushf(int, int);
254 
255 /* Each tre_stack_pop_*(tre_stack_t *s) function pops the topmost
256    element off of stack `s' and returns it.  The stack must not be
257    empty. */
258 #define declare_popf(typetag, type)		  \
259   static type tre_stack_pop_ ## typetag(tre_stack_t *s)
260 
261 declare_popf(voidptr, void *);
262 declare_popf(int, int);
263 
264 /* Just to save some typing. */
265 #define STACK_PUSH(s, typetag, value)					      \
266   do									      \
267     {									      \
268       status = tre_stack_push_ ## typetag(s, value);			      \
269     }									      \
270   while (/*CONSTCOND*/0)
271 
272 #define STACK_PUSHX(s, typetag, value)					      \
273   {									      \
274     status = tre_stack_push_ ## typetag(s, value);			      \
275     if (status != REG_OK)						      \
276       break;								      \
277   }
278 
279 #define STACK_PUSHR(s, typetag, value)					      \
280   {									      \
281     reg_errcode_t _status;						      \
282     _status = tre_stack_push_ ## typetag(s, value);			      \
283     if (_status != REG_OK)						      \
284       return _status;							      \
285   }
286 
287 union tre_stack_item {
288   void *voidptr_value;
289   int int_value;
290 };
291 
292 struct tre_stack_rec {
293   int size;
294   int max_size;
295   int increment;
296   int ptr;
297   union tre_stack_item *stack;
298 };
299 
300 
301 static tre_stack_t *
tre_stack_new(int size,int max_size,int increment)302 tre_stack_new(int size, int max_size, int increment)
303 {
304   tre_stack_t *s;
305 
306   s = xmalloc(sizeof(*s));
307   if (s != NULL)
308     {
309       s->stack = xmalloc(sizeof(*s->stack) * size);
310       if (s->stack == NULL)
311 	{
312 	  xfree(s);
313 	  return NULL;
314 	}
315       s->size = size;
316       s->max_size = max_size;
317       s->increment = increment;
318       s->ptr = 0;
319     }
320   return s;
321 }
322 
323 static void
tre_stack_destroy(tre_stack_t * s)324 tre_stack_destroy(tre_stack_t *s)
325 {
326   xfree(s->stack);
327   xfree(s);
328 }
329 
330 static int
tre_stack_num_objects(tre_stack_t * s)331 tre_stack_num_objects(tre_stack_t *s)
332 {
333   return s->ptr;
334 }
335 
336 static reg_errcode_t
tre_stack_push(tre_stack_t * s,union tre_stack_item value)337 tre_stack_push(tre_stack_t *s, union tre_stack_item value)
338 {
339   if (s->ptr < s->size)
340     {
341       s->stack[s->ptr] = value;
342       s->ptr++;
343     }
344   else
345     {
346       if (s->size >= s->max_size)
347 	{
348 	  return REG_ESPACE;
349 	}
350       else
351 	{
352 	  union tre_stack_item *new_buffer;
353 	  int new_size;
354 	  new_size = s->size + s->increment;
355 	  if (new_size > s->max_size)
356 	    new_size = s->max_size;
357 	  new_buffer = xrealloc(s->stack, sizeof(*new_buffer) * new_size);
358 	  if (new_buffer == NULL)
359 	    {
360 	      return REG_ESPACE;
361 	    }
362 	  assert(new_size > s->size);
363 	  s->size = new_size;
364 	  s->stack = new_buffer;
365 	  tre_stack_push(s, value);
366 	}
367     }
368   return REG_OK;
369 }
370 
371 #define define_pushf(typetag, type)  \
372   declare_pushf(typetag, type) {     \
373     union tre_stack_item item;	     \
374     item.typetag ## _value = value;  \
375     return tre_stack_push(s, item);  \
376 }
377 
378 define_pushf(int, int)
379 define_pushf(voidptr, void *)
380 
381 #define define_popf(typetag, type)		    \
382   declare_popf(typetag, type) {			    \
383     return s->stack[--s->ptr].typetag ## _value;    \
384   }
385 
386 define_popf(int, int)
387 define_popf(voidptr, void *)
388 
389 
390 /***********************************************************************
391  from tre-parse.c and tre-parse.h
392 ***********************************************************************/
393 
394 /* Parse context. */
395 typedef struct {
396 	/* Memory allocator. The AST is allocated using this. */
397 	tre_mem_t mem;
398 	/* Stack used for keeping track of regexp syntax. */
399 	tre_stack_t *stack;
400 	/* The parsed node after a parse function returns. */
401 	tre_ast_node_t *n;
402 	/* Position in the regexp pattern after a parse function returns. */
403 	const char *s;
404 	/* The first character of the last subexpression parsed. */
405 	const char *start;
406 	/* Current submatch ID. */
407 	int submatch_id;
408 	/* Current position (number of literal). */
409 	int position;
410 	/* The highest back reference or -1 if none seen so far. */
411 	int max_backref;
412 	/* Compilation flags. */
413 	int cflags;
414 } tre_parse_ctx_t;
415 
416 /* Some macros for expanding \w, \s, etc. */
417 static const struct {
418 	char c;
419 	const char *expansion;
420 } tre_macros[] = {
421 	{'t', "\t"}, {'n', "\n"}, {'r', "\r"},
422 	{'f', "\f"}, {'a', "\a"}, {'e', "\033"},
423 	{'w', "[[:alnum:]_]"}, {'W', "[^[:alnum:]_]"}, {'s', "[[:space:]]"},
424 	{'S', "[^[:space:]]"}, {'d', "[[:digit:]]"}, {'D', "[^[:digit:]]"},
425 	{ 0, 0 }
426 };
427 
428 /* Expands a macro delimited by `regex' and `regex_end' to `buf', which
429    must have at least `len' items.  Sets buf[0] to zero if the there
430    is no match in `tre_macros'. */
tre_expand_macro(const char * s)431 static const char *tre_expand_macro(const char *s)
432 {
433 	int i;
434 	for (i = 0; tre_macros[i].c && tre_macros[i].c != *s; i++);
435 	return tre_macros[i].expansion;
436 }
437 
438 static int
tre_compare_lit(const void * a,const void * b)439 tre_compare_lit(const void *a, const void *b)
440 {
441 	const tre_literal_t *const *la = a;
442 	const tre_literal_t *const *lb = b;
443 	/* assumes the range of valid code_min is < INT_MAX */
444 	return la[0]->code_min - lb[0]->code_min;
445 }
446 
447 struct literals {
448 	tre_mem_t mem;
449 	tre_literal_t **a;
450 	int len;
451 	int cap;
452 };
453 
tre_new_lit(struct literals * p)454 static tre_literal_t *tre_new_lit(struct literals *p)
455 {
456 	tre_literal_t **a;
457 	if (p->len >= p->cap) {
458 		if (p->cap >= 1<<15)
459 			return 0;
460 		p->cap *= 2;
461 		a = xrealloc(p->a, p->cap * sizeof *p->a);
462 		if (!a)
463 			return 0;
464 		p->a = a;
465 	}
466 	a = p->a + p->len++;
467 	*a = tre_mem_calloc(p->mem, sizeof **a);
468 	return *a;
469 }
470 
add_icase_literals(struct literals * ls,int min,int max)471 static int add_icase_literals(struct literals *ls, int min, int max)
472 {
473 	tre_literal_t *lit;
474 	int b, e, c;
475 	for (c=min; c<=max; ) {
476 		/* assumes islower(c) and isupper(c) are exclusive
477 		   and toupper(c)!=c if islower(c).
478 		   multiple opposite case characters are not supported */
479 		if (tre_islower(c)) {
480 			b = e = tre_toupper(c);
481 			for (c++, e++; c<=max; c++, e++)
482 				if (tre_toupper(c) != e) break;
483 		} else if (tre_isupper(c)) {
484 			b = e = tre_tolower(c);
485 			for (c++, e++; c<=max; c++, e++)
486 				if (tre_tolower(c) != e) break;
487 		} else {
488 			c++;
489 			continue;
490 		}
491 		lit = tre_new_lit(ls);
492 		if (!lit)
493 			return -1;
494 		lit->code_min = b;
495 		lit->code_max = e-1;
496 		lit->position = -1;
497 	}
498 	return 0;
499 }
500 
501 
502 /* Maximum number of character classes in a negated bracket expression. */
503 #define MAX_NEG_CLASSES 64
504 
505 struct neg {
506 	int negate;
507 	int len;
508 	tre_ctype_t a[MAX_NEG_CLASSES];
509 };
510 
511 // TODO: parse bracket into a set of non-overlapping [lo,hi] ranges
512 
513 /*
514 bracket grammar:
515 Bracket  =  '[' List ']'  |  '[^' List ']'
516 List     =  Term  |  List Term
517 Term     =  Char  |  Range  |  Chclass  |  Eqclass
518 Range    =  Char '-' Char  |  Char '-' '-'
519 Char     =  Coll  |  coll_single
520 Meta     =  ']'  |  '-'
521 Coll     =  '[.' coll_single '.]'  |  '[.' coll_multi '.]'  |  '[.' Meta '.]'
522 Eqclass  =  '[=' coll_single '=]'  |  '[=' coll_multi '=]'
523 Chclass  =  '[:' class ':]'
524 
525 coll_single is a single char collating element but it can be
526  '-' only at the beginning or end of a List and
527  ']' only at the beginning of a List and
528  '^' anywhere except after the openning '['
529 */
530 
parse_bracket_terms(tre_parse_ctx_t * ctx,const char * s,struct literals * ls,struct neg * neg)531 static reg_errcode_t parse_bracket_terms(tre_parse_ctx_t *ctx, const char *s, struct literals *ls, struct neg *neg)
532 {
533 	const char *start = s;
534 	tre_ctype_t class;
535 	int min, max;
536 	wchar_t wc;
537 	int len;
538 
539 	for (;;) {
540 		class = 0;
541 		len = mbtowc(&wc, s, -1);
542 		if (len <= 0)
543 			return *s ? REG_BADPAT : REG_EBRACK;
544 		if (*s == ']' && s != start) {
545 			ctx->s = s+1;
546 			return REG_OK;
547 		}
548 		if (*s == '-' && s != start && s[1] != ']' &&
549 		    /* extension: [a-z--@] is accepted as [a-z]|[--@] */
550 		    (s[1] != '-' || s[2] == ']'))
551 			return REG_ERANGE;
552 		if (*s == '[' && (s[1] == '.' || s[1] == '='))
553 			/* collating symbols and equivalence classes are not supported */
554 			return REG_ECOLLATE;
555 		if (*s == '[' && s[1] == ':') {
556 			char tmp[CHARCLASS_NAME_MAX+1];
557 			s += 2;
558 			for (len=0; len < CHARCLASS_NAME_MAX && s[len]; len++) {
559 				if (s[len] == ':') {
560 					memcpy(tmp, s, len);
561 					tmp[len] = 0;
562 					class = tre_ctype(tmp);
563 					break;
564 				}
565 			}
566 			if (!class || s[len+1] != ']')
567 				return REG_ECTYPE;
568 			min = 0;
569 			max = TRE_CHAR_MAX;
570 			s += len+2;
571 		} else {
572 			min = max = wc;
573 			s += len;
574 			if (*s == '-' && s[1] != ']') {
575 				s++;
576 				len = mbtowc(&wc, s, -1);
577 				max = wc;
578 				/* XXX - Should use collation order instead of
579 				   encoding values in character ranges. */
580 				if (len <= 0 || min > max)
581 					return REG_ERANGE;
582 				s += len;
583 			}
584 		}
585 
586 		if (class && neg->negate) {
587 			if (neg->len >= MAX_NEG_CLASSES)
588 				return REG_ESPACE;
589 			neg->a[neg->len++] = class;
590 		} else  {
591 			tre_literal_t *lit = tre_new_lit(ls);
592 			if (!lit)
593 				return REG_ESPACE;
594 			lit->code_min = min;
595 			lit->code_max = max;
596 			lit->class = class;
597 			lit->position = -1;
598 
599 			/* Add opposite-case codepoints if REG_ICASE is present.
600 			   It seems that POSIX requires that bracket negation
601 			   should happen before case-folding, but most practical
602 			   implementations do it the other way around. Changing
603 			   the order would need efficient representation of
604 			   case-fold ranges and bracket range sets even with
605 			   simple patterns so this is ok for now. */
606 			if (ctx->cflags & REG_ICASE && !class)
607 				if (add_icase_literals(ls, min, max))
608 					return REG_ESPACE;
609 		}
610 	}
611 }
612 
parse_bracket(tre_parse_ctx_t * ctx,const char * s)613 static reg_errcode_t parse_bracket(tre_parse_ctx_t *ctx, const char *s)
614 {
615 	int i, max, min, negmax, negmin;
616 	tre_ast_node_t *node = 0, *n;
617 	tre_ctype_t *nc = 0;
618 	tre_literal_t *lit;
619 	struct literals ls;
620 	struct neg neg;
621 	reg_errcode_t err;
622 
623 	ls.mem = ctx->mem;
624 	ls.len = 0;
625 	ls.cap = 32;
626 	ls.a = xmalloc(ls.cap * sizeof *ls.a);
627 	if (!ls.a)
628 		return REG_ESPACE;
629 	neg.len = 0;
630 	neg.negate = *s == '^';
631 	if (neg.negate)
632 		s++;
633 
634 	err = parse_bracket_terms(ctx, s, &ls, &neg);
635 	if (err != REG_OK)
636 		goto parse_bracket_done;
637 
638 	if (neg.negate) {
639 		/*
640 		 * With REG_NEWLINE, POSIX requires that newlines are not matched by
641 		 * any form of a non-matching list.
642 		 */
643 		if (ctx->cflags & REG_NEWLINE) {
644 			lit = tre_new_lit(&ls);
645 			if (!lit) {
646 				err = REG_ESPACE;
647 				goto parse_bracket_done;
648 			}
649 			lit->code_min = '\n';
650 			lit->code_max = '\n';
651 			lit->position = -1;
652 		}
653 		/* Sort the array if we need to negate it. */
654 		qsort(ls.a, ls.len, sizeof *ls.a, tre_compare_lit);
655 		/* extra lit for the last negated range */
656 		lit = tre_new_lit(&ls);
657 		if (!lit) {
658 			err = REG_ESPACE;
659 			goto parse_bracket_done;
660 		}
661 		lit->code_min = TRE_CHAR_MAX+1;
662 		lit->code_max = TRE_CHAR_MAX+1;
663 		lit->position = -1;
664 		/* negated classes */
665 		if (neg.len) {
666 			nc = tre_mem_alloc(ctx->mem, (neg.len+1)*sizeof *neg.a);
667 			if (!nc) {
668 				err = REG_ESPACE;
669 				goto parse_bracket_done;
670 			}
671 			memcpy(nc, neg.a, neg.len*sizeof *neg.a);
672 			nc[neg.len] = 0;
673 		}
674 	}
675 
676 	/* Build a union of the items in the array, negated if necessary. */
677 	negmax = negmin = 0;
678 	for (i = 0; i < ls.len; i++) {
679 		lit = ls.a[i];
680 		min = lit->code_min;
681 		max = lit->code_max;
682 		if (neg.negate) {
683 			if (min <= negmin) {
684 				/* Overlap. */
685 				negmin = MAX(max + 1, negmin);
686 				continue;
687 			}
688 			negmax = min - 1;
689 			lit->code_min = negmin;
690 			lit->code_max = negmax;
691 			negmin = max + 1;
692 		}
693 		lit->position = ctx->position;
694 		lit->neg_classes = nc;
695 		n = tre_ast_new_node(ctx->mem, LITERAL, lit);
696 		node = tre_ast_new_union(ctx->mem, node, n);
697 		if (!node) {
698 			err = REG_ESPACE;
699 			break;
700 		}
701 	}
702 
703 parse_bracket_done:
704 	xfree(ls.a);
705 	ctx->position++;
706 	ctx->n = node;
707 	return err;
708 }
709 
parse_dup_count(const char * s,int * n)710 static const char *parse_dup_count(const char *s, int *n)
711 {
712 	*n = -1;
713 	if (!isdigit(*s))
714 		return s;
715 	*n = 0;
716 	for (;;) {
717 		*n = 10 * *n + (*s - '0');
718 		s++;
719 		if (!isdigit(*s) || *n > RE_DUP_MAX)
720 			break;
721 	}
722 	return s;
723 }
724 
parse_dup(const char * s,int ere,int * pmin,int * pmax)725 static const char *parse_dup(const char *s, int ere, int *pmin, int *pmax)
726 {
727 	int min, max;
728 
729 	s = parse_dup_count(s, &min);
730 	if (*s == ',')
731 		s = parse_dup_count(s+1, &max);
732 	else
733 		max = min;
734 
735 	if (
736 		(max < min && max >= 0) ||
737 		max > RE_DUP_MAX ||
738 		min > RE_DUP_MAX ||
739 		min < 0 ||
740 		(!ere && *s++ != '\\') ||
741 		*s++ != '}'
742 	)
743 		return 0;
744 	*pmin = min;
745 	*pmax = max;
746 	return s;
747 }
748 
hexval(unsigned c)749 static int hexval(unsigned c)
750 {
751 	if (c-'0'<10) return c-'0';
752 	c |= 32;
753 	if (c-'a'<6) return c-'a'+10;
754 	return -1;
755 }
756 
marksub(tre_parse_ctx_t * ctx,tre_ast_node_t * node,int subid)757 static reg_errcode_t marksub(tre_parse_ctx_t *ctx, tre_ast_node_t *node, int subid)
758 {
759 	if (node->submatch_id >= 0) {
760 		tre_ast_node_t *n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
761 		if (!n)
762 			return REG_ESPACE;
763 		n = tre_ast_new_catenation(ctx->mem, n, node);
764 		if (!n)
765 			return REG_ESPACE;
766 		n->num_submatches = node->num_submatches;
767 		node = n;
768 	}
769 	node->submatch_id = subid;
770 	node->num_submatches++;
771 	ctx->n = node;
772 	return REG_OK;
773 }
774 
775 /*
776 BRE grammar:
777 Regex  =  Branch  |  '^'  |  '$'  |  '^$'  |  '^' Branch  |  Branch '$'  |  '^' Branch '$'
778 Branch =  Atom  |  Branch Atom
779 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '\(' Branch '\)'  |  back_ref
780 Dup    =  '*'  |  '\{' Count '\}'  |  '\{' Count ',\}'  |  '\{' Count ',' Count '\}'
781 
782 (leading ^ and trailing $ in a sub expr may be an anchor or literal as well)
783 
784 ERE grammar:
785 Regex  =  Branch  |  Regex '|' Branch
786 Branch =  Atom  |  Branch Atom
787 Atom   =  char  |  quoted_char  |  '.'  |  Bracket  |  Atom Dup  |  '(' Regex ')'  |  '^'  |  '$'
788 Dup    =  '*'  |  '+'  |  '?'  |  '{' Count '}'  |  '{' Count ',}'  |  '{' Count ',' Count '}'
789 
790 (a*+?, ^*, $+, \X, {, (|a) are unspecified)
791 */
792 
parse_atom(tre_parse_ctx_t * ctx,const char * s)793 static reg_errcode_t parse_atom(tre_parse_ctx_t *ctx, const char *s)
794 {
795 	int len, ere = ctx->cflags & REG_EXTENDED;
796 	const char *p;
797 	tre_ast_node_t *node;
798 	wchar_t wc;
799 	switch (*s) {
800 	case '[':
801 		return parse_bracket(ctx, s+1);
802 	case '\\':
803 		p = tre_expand_macro(s+1);
804 		if (p) {
805 			/* assume \X expansion is a single atom */
806 			reg_errcode_t err = parse_atom(ctx, p);
807 			ctx->s = s+2;
808 			return err;
809 		}
810 		/* extensions: \b, \B, \<, \>, \xHH \x{HHHH} */
811 		switch (*++s) {
812 		case 0:
813 			return REG_EESCAPE;
814 		case 'b':
815 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB, -1);
816 			break;
817 		case 'B':
818 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_WB_NEG, -1);
819 			break;
820 		case '<':
821 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOW, -1);
822 			break;
823 		case '>':
824 			node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOW, -1);
825 			break;
826 		case 'x':
827 			s++;
828 			int i, v, c;
829 			v = 0;
830 			len = 2;
831 			if (*s == '{') {
832 				len = 8;
833 				s++;
834 			}
835 			for (i=0; i<len && v<0x110000; i++) {
836 				c = hexval(s[i]);
837 				if (c < 0) break;
838 				v = 16*v + c;
839 			}
840 			s += i;
841 			if (len == 8) {
842 				if (*s != '}')
843 					return REG_EBRACE;
844 				s++;
845 			}
846 			node = tre_ast_new_literal(ctx->mem, v, v, ctx->position++);
847 			s--;
848 			break;
849 		case '{':
850 		case '+':
851 		case '?':
852 			/* extension: treat \+, \? as repetitions in BRE */
853 			/* reject repetitions after empty expression in BRE */
854 			if (!ere)
855 				return REG_BADRPT;
856 		case '|':
857 			/* extension: treat \| as alternation in BRE */
858 			if (!ere) {
859 				node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
860 				s--;
861 				goto end;
862 			}
863 			/* fallthrough */
864 		default:
865 			if (!ere && (unsigned)*s-'1' < 9) {
866 				/* back reference */
867 				int val = *s - '0';
868 				node = tre_ast_new_literal(ctx->mem, BACKREF, val, ctx->position++);
869 				ctx->max_backref = MAX(val, ctx->max_backref);
870 			} else {
871 				/* extension: accept unknown escaped char
872 				   as a literal */
873 				goto parse_literal;
874 			}
875 		}
876 		s++;
877 		break;
878 	case '.':
879 		if (ctx->cflags & REG_NEWLINE) {
880 			tre_ast_node_t *tmp1, *tmp2;
881 			tmp1 = tre_ast_new_literal(ctx->mem, 0, '\n'-1, ctx->position++);
882 			tmp2 = tre_ast_new_literal(ctx->mem, '\n'+1, TRE_CHAR_MAX, ctx->position++);
883 			if (tmp1 && tmp2)
884 				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
885 			else
886 				node = 0;
887 		} else {
888 			node = tre_ast_new_literal(ctx->mem, 0, TRE_CHAR_MAX, ctx->position++);
889 		}
890 		s++;
891 		break;
892 	case '^':
893 		/* '^' has a special meaning everywhere in EREs, and at beginning of BRE. */
894 		if (!ere && s != ctx->start)
895 			goto parse_literal;
896 		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_BOL, -1);
897 		s++;
898 		break;
899 	case '$':
900 		/* '$' is special everywhere in EREs, and at the end of a BRE subexpression. */
901 		if (!ere && s[1] && (s[1]!='\\'|| (s[2]!=')' && s[2]!='|')))
902 			goto parse_literal;
903 		node = tre_ast_new_literal(ctx->mem, ASSERTION, ASSERT_AT_EOL, -1);
904 		s++;
905 		break;
906 	case '*':
907 	case '{':
908 	case '+':
909 	case '?':
910 		/* reject repetitions after empty expression in ERE */
911 		if (ere)
912 			return REG_BADRPT;
913 	case '|':
914 		if (!ere)
915 			goto parse_literal;
916 	case 0:
917 		node = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
918 		break;
919 	default:
920 parse_literal:
921 		len = mbtowc(&wc, s, -1);
922 		if (len < 0)
923 			return REG_BADPAT;
924 		if (ctx->cflags & REG_ICASE && (tre_isupper(wc) || tre_islower(wc))) {
925 			tre_ast_node_t *tmp1, *tmp2;
926 			/* multiple opposite case characters are not supported */
927 			tmp1 = tre_ast_new_literal(ctx->mem, tre_toupper(wc), tre_toupper(wc), ctx->position);
928 			tmp2 = tre_ast_new_literal(ctx->mem, tre_tolower(wc), tre_tolower(wc), ctx->position);
929 			if (tmp1 && tmp2)
930 				node = tre_ast_new_union(ctx->mem, tmp1, tmp2);
931 			else
932 				node = 0;
933 		} else {
934 			node = tre_ast_new_literal(ctx->mem, wc, wc, ctx->position);
935 		}
936 		ctx->position++;
937 		s += len;
938 		break;
939 	}
940 end:
941 	if (!node)
942 		return REG_ESPACE;
943 	ctx->n = node;
944 	ctx->s = s;
945 	return REG_OK;
946 }
947 
948 #define PUSHPTR(err, s, v) do { \
949 	if ((err = tre_stack_push_voidptr(s, v)) != REG_OK) \
950 		return err; \
951 } while(0)
952 
953 #define PUSHINT(err, s, v) do { \
954 	if ((err = tre_stack_push_int(s, v)) != REG_OK) \
955 		return err; \
956 } while(0)
957 
tre_parse(tre_parse_ctx_t * ctx)958 static reg_errcode_t tre_parse(tre_parse_ctx_t *ctx)
959 {
960 	tre_ast_node_t *nbranch=0, *nunion=0;
961 	int ere = ctx->cflags & REG_EXTENDED;
962 	const char *s = ctx->start;
963 	int subid = 0;
964 	int depth = 0;
965 	reg_errcode_t err;
966 	tre_stack_t *stack = ctx->stack;
967 
968 	PUSHINT(err, stack, subid++);
969 	for (;;) {
970 		if ((!ere && *s == '\\' && s[1] == '(') ||
971 		    (ere && *s == '(')) {
972 			PUSHPTR(err, stack, nunion);
973 			PUSHPTR(err, stack, nbranch);
974 			PUSHINT(err, stack, subid++);
975 			s++;
976 			if (!ere)
977 				s++;
978 			depth++;
979 			nbranch = nunion = 0;
980 			ctx->start = s;
981 			continue;
982 		}
983 		if ((!ere && *s == '\\' && s[1] == ')') ||
984 		    (ere && *s == ')' && depth)) {
985 			ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
986 			if (!ctx->n)
987 				return REG_ESPACE;
988 		} else {
989 			err = parse_atom(ctx, s);
990 			if (err != REG_OK)
991 				return err;
992 			s = ctx->s;
993 		}
994 
995 	parse_iter:
996 		for (;;) {
997 			int min, max;
998 
999 			if (*s!='\\' && *s!='*') {
1000 				if (!ere)
1001 					break;
1002 				if (*s!='+' && *s!='?' && *s!='{')
1003 					break;
1004 			}
1005 			if (*s=='\\' && ere)
1006 				break;
1007 			/* extension: treat \+, \? as repetitions in BRE */
1008 			if (*s=='\\' && s[1]!='+' && s[1]!='?' && s[1]!='{')
1009 				break;
1010 			if (*s=='\\')
1011 				s++;
1012 
1013 			/* handle ^* at the start of a BRE. */
1014 			if (!ere && s==ctx->start+1 && s[-1]=='^')
1015 				break;
1016 
1017 			/* extension: multiple consecutive *+?{,} is unspecified,
1018 			   but (a+)+ has to be supported so accepting a++ makes
1019 			   sense, note however that the RE_DUP_MAX limit can be
1020 			   circumvented: (a{255}){255} uses a lot of memory.. */
1021 			if (*s=='{') {
1022 				s = parse_dup(s+1, ere, &min, &max);
1023 				if (!s)
1024 					return REG_BADBR;
1025 			} else {
1026 				min=0;
1027 				max=-1;
1028 				if (*s == '+')
1029 					min = 1;
1030 				if (*s == '?')
1031 					max = 1;
1032 				s++;
1033 			}
1034 			if (max == 0)
1035 				ctx->n = tre_ast_new_literal(ctx->mem, EMPTY, -1, -1);
1036 			else
1037 				ctx->n = tre_ast_new_iter(ctx->mem, ctx->n, min, max, 0);
1038 			if (!ctx->n)
1039 				return REG_ESPACE;
1040 		}
1041 
1042 		nbranch = tre_ast_new_catenation(ctx->mem, nbranch, ctx->n);
1043 		if ((ere && *s == '|') ||
1044 		    (ere && *s == ')' && depth) ||
1045 		    (!ere && *s == '\\' && s[1] == ')') ||
1046 		    /* extension: treat \| as alternation in BRE */
1047 		    (!ere && *s == '\\' && s[1] == '|') ||
1048 		    !*s) {
1049 			/* extension: empty branch is unspecified (), (|a), (a|)
1050 			   here they are not rejected but match on empty string */
1051 			int c = *s;
1052 			nunion = tre_ast_new_union(ctx->mem, nunion, nbranch);
1053 			nbranch = 0;
1054 
1055 			if (c == '\\' && s[1] == '|') {
1056 				s+=2;
1057 				ctx->start = s;
1058 			} else if (c == '|') {
1059 				s++;
1060 				ctx->start = s;
1061 			} else {
1062 				if (c == '\\') {
1063 					if (!depth) return REG_EPAREN;
1064 					s+=2;
1065 				} else if (c == ')')
1066 					s++;
1067 				depth--;
1068 				err = marksub(ctx, nunion, tre_stack_pop_int(stack));
1069 				if (err != REG_OK)
1070 					return err;
1071 				if (!c && depth<0) {
1072 					ctx->submatch_id = subid;
1073 					return REG_OK;
1074 				}
1075 				if (!c || depth<0)
1076 					return REG_EPAREN;
1077 				nbranch = tre_stack_pop_voidptr(stack);
1078 				nunion = tre_stack_pop_voidptr(stack);
1079 				goto parse_iter;
1080 			}
1081 		}
1082 	}
1083 }
1084 
1085 
1086 /***********************************************************************
1087  from tre-compile.c
1088 ***********************************************************************/
1089 
1090 
1091 /*
1092   TODO:
1093    - Fix tre_ast_to_tnfa() to recurse using a stack instead of recursive
1094      function calls.
1095 */
1096 
1097 /*
1098   Algorithms to setup tags so that submatch addressing can be done.
1099 */
1100 
1101 
1102 /* Inserts a catenation node to the root of the tree given in `node'.
1103    As the left child a new tag with number `tag_id' to `node' is added,
1104    and the right child is the old root. */
1105 static reg_errcode_t
tre_add_tag_left(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1106 tre_add_tag_left(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1107 {
1108   tre_catenation_t *c;
1109 
1110   c = tre_mem_alloc(mem, sizeof(*c));
1111   if (c == NULL)
1112     return REG_ESPACE;
1113   c->left = tre_ast_new_literal(mem, TAG, tag_id, -1);
1114   if (c->left == NULL)
1115     return REG_ESPACE;
1116   c->right = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1117   if (c->right == NULL)
1118     return REG_ESPACE;
1119 
1120   c->right->obj = node->obj;
1121   c->right->type = node->type;
1122   c->right->nullable = -1;
1123   c->right->submatch_id = -1;
1124   c->right->firstpos = NULL;
1125   c->right->lastpos = NULL;
1126   c->right->num_tags = 0;
1127   c->right->num_submatches = 0;
1128   node->obj = c;
1129   node->type = CATENATION;
1130   return REG_OK;
1131 }
1132 
1133 /* Inserts a catenation node to the root of the tree given in `node'.
1134    As the right child a new tag with number `tag_id' to `node' is added,
1135    and the left child is the old root. */
1136 static reg_errcode_t
tre_add_tag_right(tre_mem_t mem,tre_ast_node_t * node,int tag_id)1137 tre_add_tag_right(tre_mem_t mem, tre_ast_node_t *node, int tag_id)
1138 {
1139   tre_catenation_t *c;
1140 
1141   c = tre_mem_alloc(mem, sizeof(*c));
1142   if (c == NULL)
1143     return REG_ESPACE;
1144   c->right = tre_ast_new_literal(mem, TAG, tag_id, -1);
1145   if (c->right == NULL)
1146     return REG_ESPACE;
1147   c->left = tre_mem_alloc(mem, sizeof(tre_ast_node_t));
1148   if (c->left == NULL)
1149     return REG_ESPACE;
1150 
1151   c->left->obj = node->obj;
1152   c->left->type = node->type;
1153   c->left->nullable = -1;
1154   c->left->submatch_id = -1;
1155   c->left->firstpos = NULL;
1156   c->left->lastpos = NULL;
1157   c->left->num_tags = 0;
1158   c->left->num_submatches = 0;
1159   node->obj = c;
1160   node->type = CATENATION;
1161   return REG_OK;
1162 }
1163 
1164 typedef enum {
1165   ADDTAGS_RECURSE,
1166   ADDTAGS_AFTER_ITERATION,
1167   ADDTAGS_AFTER_UNION_LEFT,
1168   ADDTAGS_AFTER_UNION_RIGHT,
1169   ADDTAGS_AFTER_CAT_LEFT,
1170   ADDTAGS_AFTER_CAT_RIGHT,
1171   ADDTAGS_SET_SUBMATCH_END
1172 } tre_addtags_symbol_t;
1173 
1174 
1175 typedef struct {
1176   int tag;
1177   int next_tag;
1178 } tre_tag_states_t;
1179 
1180 
1181 /* Go through `regset' and set submatch data for submatches that are
1182    using this tag. */
1183 static void
tre_purge_regset(int * regset,tre_tnfa_t * tnfa,int tag)1184 tre_purge_regset(int *regset, tre_tnfa_t *tnfa, int tag)
1185 {
1186   int i;
1187 
1188   for (i = 0; regset[i] >= 0; i++)
1189     {
1190       int id = regset[i] / 2;
1191       int start = !(regset[i] % 2);
1192       if (start)
1193 	tnfa->submatch_data[id].so_tag = tag;
1194       else
1195 	tnfa->submatch_data[id].eo_tag = tag;
1196     }
1197   regset[0] = -1;
1198 }
1199 
1200 
1201 /* Adds tags to appropriate locations in the parse tree in `tree', so that
1202    subexpressions marked for submatch addressing can be traced. */
1203 static reg_errcode_t
tre_add_tags(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree,tre_tnfa_t * tnfa)1204 tre_add_tags(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree,
1205 	     tre_tnfa_t *tnfa)
1206 {
1207   reg_errcode_t status = REG_OK;
1208   tre_addtags_symbol_t symbol;
1209   tre_ast_node_t *node = tree; /* Tree node we are currently looking at. */
1210   int bottom = tre_stack_num_objects(stack);
1211   /* True for first pass (counting number of needed tags) */
1212   int first_pass = (mem == NULL || tnfa == NULL);
1213   int *regset, *orig_regset;
1214   int num_tags = 0; /* Total number of tags. */
1215   int num_minimals = 0;	 /* Number of special minimal tags. */
1216   int tag = 0;	    /* The tag that is to be added next. */
1217   int next_tag = 1; /* Next tag to use after this one. */
1218   int *parents;	    /* Stack of submatches the current submatch is
1219 		       contained in. */
1220   int minimal_tag = -1; /* Tag that marks the beginning of a minimal match. */
1221   tre_tag_states_t *saved_states;
1222 
1223   tre_tag_direction_t direction = TRE_TAG_MINIMIZE;
1224   if (!first_pass)
1225     {
1226       tnfa->end_tag = 0;
1227       tnfa->minimal_tags[0] = -1;
1228     }
1229 
1230   regset = xmalloc(sizeof(*regset) * ((tnfa->num_submatches + 1) * 2));
1231   if (regset == NULL)
1232     return REG_ESPACE;
1233   regset[0] = -1;
1234   orig_regset = regset;
1235 
1236   parents = xmalloc(sizeof(*parents) * (tnfa->num_submatches + 1));
1237   if (parents == NULL)
1238     {
1239       xfree(regset);
1240       return REG_ESPACE;
1241     }
1242   parents[0] = -1;
1243 
1244   saved_states = xmalloc(sizeof(*saved_states) * (tnfa->num_submatches + 1));
1245   if (saved_states == NULL)
1246     {
1247       xfree(regset);
1248       xfree(parents);
1249       return REG_ESPACE;
1250     }
1251   else
1252     {
1253       unsigned int i;
1254       for (i = 0; i <= tnfa->num_submatches; i++)
1255 	saved_states[i].tag = -1;
1256     }
1257 
1258   STACK_PUSH(stack, voidptr, node);
1259   STACK_PUSH(stack, int, ADDTAGS_RECURSE);
1260 
1261   while (tre_stack_num_objects(stack) > bottom)
1262     {
1263       if (status != REG_OK)
1264 	break;
1265 
1266       symbol = (tre_addtags_symbol_t)tre_stack_pop_int(stack);
1267       switch (symbol)
1268 	{
1269 
1270 	case ADDTAGS_SET_SUBMATCH_END:
1271 	  {
1272 	    int id = tre_stack_pop_int(stack);
1273 	    int i;
1274 
1275 	    /* Add end of this submatch to regset. */
1276 	    for (i = 0; regset[i] >= 0; i++);
1277 	    regset[i] = id * 2 + 1;
1278 	    regset[i + 1] = -1;
1279 
1280 	    /* Pop this submatch from the parents stack. */
1281 	    for (i = 0; parents[i] >= 0; i++);
1282 	    parents[i - 1] = -1;
1283 	    break;
1284 	  }
1285 
1286 	case ADDTAGS_RECURSE:
1287 	  node = tre_stack_pop_voidptr(stack);
1288 
1289 	  if (node->submatch_id >= 0)
1290 	    {
1291 	      int id = node->submatch_id;
1292 	      int i;
1293 
1294 
1295 	      /* Add start of this submatch to regset. */
1296 	      for (i = 0; regset[i] >= 0; i++);
1297 	      regset[i] = id * 2;
1298 	      regset[i + 1] = -1;
1299 
1300 	      if (!first_pass)
1301 		{
1302 		  for (i = 0; parents[i] >= 0; i++);
1303 		  tnfa->submatch_data[id].parents = NULL;
1304 		  if (i > 0)
1305 		    {
1306 		      int *p = xmalloc(sizeof(*p) * (i + 1));
1307 		      if (p == NULL)
1308 			{
1309 			  status = REG_ESPACE;
1310 			  break;
1311 			}
1312 		      assert(tnfa->submatch_data[id].parents == NULL);
1313 		      tnfa->submatch_data[id].parents = p;
1314 		      for (i = 0; parents[i] >= 0; i++)
1315 			p[i] = parents[i];
1316 		      p[i] = -1;
1317 		    }
1318 		}
1319 
1320 	      /* Add end of this submatch to regset after processing this
1321 		 node. */
1322 	      STACK_PUSHX(stack, int, node->submatch_id);
1323 	      STACK_PUSHX(stack, int, ADDTAGS_SET_SUBMATCH_END);
1324 	    }
1325 
1326 	  switch (node->type)
1327 	    {
1328 	    case LITERAL:
1329 	      {
1330 		tre_literal_t *lit = node->obj;
1331 
1332 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1333 		  {
1334 		    int i;
1335 		    if (regset[0] >= 0)
1336 		      {
1337 			/* Regset is not empty, so add a tag before the
1338 			   literal or backref. */
1339 			if (!first_pass)
1340 			  {
1341 			    status = tre_add_tag_left(mem, node, tag);
1342 			    tnfa->tag_directions[tag] = direction;
1343 			    if (minimal_tag >= 0)
1344 			      {
1345 				for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1346 				tnfa->minimal_tags[i] = tag;
1347 				tnfa->minimal_tags[i + 1] = minimal_tag;
1348 				tnfa->minimal_tags[i + 2] = -1;
1349 				minimal_tag = -1;
1350 				num_minimals++;
1351 			      }
1352 			    tre_purge_regset(regset, tnfa, tag);
1353 			  }
1354 			else
1355 			  {
1356 			    node->num_tags = 1;
1357 			  }
1358 
1359 			regset[0] = -1;
1360 			tag = next_tag;
1361 			num_tags++;
1362 			next_tag++;
1363 		      }
1364 		  }
1365 		else
1366 		  {
1367 		    assert(!IS_TAG(lit));
1368 		  }
1369 		break;
1370 	      }
1371 	    case CATENATION:
1372 	      {
1373 		tre_catenation_t *cat = node->obj;
1374 		tre_ast_node_t *left = cat->left;
1375 		tre_ast_node_t *right = cat->right;
1376 		int reserved_tag = -1;
1377 
1378 
1379 		/* After processing right child. */
1380 		STACK_PUSHX(stack, voidptr, node);
1381 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_RIGHT);
1382 
1383 		/* Process right child. */
1384 		STACK_PUSHX(stack, voidptr, right);
1385 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1386 
1387 		/* After processing left child. */
1388 		STACK_PUSHX(stack, int, next_tag + left->num_tags);
1389 		if (left->num_tags > 0 && right->num_tags > 0)
1390 		  {
1391 		    /* Reserve the next tag to the right child. */
1392 		    reserved_tag = next_tag;
1393 		    next_tag++;
1394 		  }
1395 		STACK_PUSHX(stack, int, reserved_tag);
1396 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_CAT_LEFT);
1397 
1398 		/* Process left child. */
1399 		STACK_PUSHX(stack, voidptr, left);
1400 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1401 
1402 		}
1403 	      break;
1404 	    case ITERATION:
1405 	      {
1406 		tre_iteration_t *iter = node->obj;
1407 
1408 		if (first_pass)
1409 		  {
1410 		    STACK_PUSHX(stack, int, regset[0] >= 0 || iter->minimal);
1411 		  }
1412 		else
1413 		  {
1414 		    STACK_PUSHX(stack, int, tag);
1415 		    STACK_PUSHX(stack, int, iter->minimal);
1416 		  }
1417 		STACK_PUSHX(stack, voidptr, node);
1418 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_ITERATION);
1419 
1420 		STACK_PUSHX(stack, voidptr, iter->arg);
1421 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1422 
1423 		/* Regset is not empty, so add a tag here. */
1424 		if (regset[0] >= 0 || iter->minimal)
1425 		  {
1426 		    if (!first_pass)
1427 		      {
1428 			int i;
1429 			status = tre_add_tag_left(mem, node, tag);
1430 			if (iter->minimal)
1431 			  tnfa->tag_directions[tag] = TRE_TAG_MAXIMIZE;
1432 			else
1433 			  tnfa->tag_directions[tag] = direction;
1434 			if (minimal_tag >= 0)
1435 			  {
1436 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1437 			    tnfa->minimal_tags[i] = tag;
1438 			    tnfa->minimal_tags[i + 1] = minimal_tag;
1439 			    tnfa->minimal_tags[i + 2] = -1;
1440 			    minimal_tag = -1;
1441 			    num_minimals++;
1442 			  }
1443 			tre_purge_regset(regset, tnfa, tag);
1444 		      }
1445 
1446 		    regset[0] = -1;
1447 		    tag = next_tag;
1448 		    num_tags++;
1449 		    next_tag++;
1450 		  }
1451 		direction = TRE_TAG_MINIMIZE;
1452 	      }
1453 	      break;
1454 	    case UNION:
1455 	      {
1456 		tre_union_t *uni = node->obj;
1457 		tre_ast_node_t *left = uni->left;
1458 		tre_ast_node_t *right = uni->right;
1459 		int left_tag;
1460 		int right_tag;
1461 
1462 		if (regset[0] >= 0)
1463 		  {
1464 		    left_tag = next_tag;
1465 		    right_tag = next_tag + 1;
1466 		  }
1467 		else
1468 		  {
1469 		    left_tag = tag;
1470 		    right_tag = next_tag;
1471 		  }
1472 
1473 		/* After processing right child. */
1474 		STACK_PUSHX(stack, int, right_tag);
1475 		STACK_PUSHX(stack, int, left_tag);
1476 		STACK_PUSHX(stack, voidptr, regset);
1477 		STACK_PUSHX(stack, int, regset[0] >= 0);
1478 		STACK_PUSHX(stack, voidptr, node);
1479 		STACK_PUSHX(stack, voidptr, right);
1480 		STACK_PUSHX(stack, voidptr, left);
1481 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_RIGHT);
1482 
1483 		/* Process right child. */
1484 		STACK_PUSHX(stack, voidptr, right);
1485 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1486 
1487 		/* After processing left child. */
1488 		STACK_PUSHX(stack, int, ADDTAGS_AFTER_UNION_LEFT);
1489 
1490 		/* Process left child. */
1491 		STACK_PUSHX(stack, voidptr, left);
1492 		STACK_PUSHX(stack, int, ADDTAGS_RECURSE);
1493 
1494 		/* Regset is not empty, so add a tag here. */
1495 		if (regset[0] >= 0)
1496 		  {
1497 		    if (!first_pass)
1498 		      {
1499 			int i;
1500 			status = tre_add_tag_left(mem, node, tag);
1501 			tnfa->tag_directions[tag] = direction;
1502 			if (minimal_tag >= 0)
1503 			  {
1504 			    for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1505 			    tnfa->minimal_tags[i] = tag;
1506 			    tnfa->minimal_tags[i + 1] = minimal_tag;
1507 			    tnfa->minimal_tags[i + 2] = -1;
1508 			    minimal_tag = -1;
1509 			    num_minimals++;
1510 			  }
1511 			tre_purge_regset(regset, tnfa, tag);
1512 		      }
1513 
1514 		    regset[0] = -1;
1515 		    tag = next_tag;
1516 		    num_tags++;
1517 		    next_tag++;
1518 		  }
1519 
1520 		if (node->num_submatches > 0)
1521 		  {
1522 		    /* The next two tags are reserved for markers. */
1523 		    next_tag++;
1524 		    tag = next_tag;
1525 		    next_tag++;
1526 		  }
1527 
1528 		break;
1529 	      }
1530 	    }
1531 
1532 	  if (node->submatch_id >= 0)
1533 	    {
1534 	      int i;
1535 	      /* Push this submatch on the parents stack. */
1536 	      for (i = 0; parents[i] >= 0; i++);
1537 	      parents[i] = node->submatch_id;
1538 	      parents[i + 1] = -1;
1539 	    }
1540 
1541 	  break; /* end case: ADDTAGS_RECURSE */
1542 
1543 	case ADDTAGS_AFTER_ITERATION:
1544 	  {
1545 	    int minimal = 0;
1546 	    int enter_tag;
1547 	    node = tre_stack_pop_voidptr(stack);
1548 	    if (first_pass)
1549 	      {
1550 		node->num_tags = ((tre_iteration_t *)node->obj)->arg->num_tags
1551 		  + tre_stack_pop_int(stack);
1552 		minimal_tag = -1;
1553 	      }
1554 	    else
1555 	      {
1556 		minimal = tre_stack_pop_int(stack);
1557 		enter_tag = tre_stack_pop_int(stack);
1558 		if (minimal)
1559 		  minimal_tag = enter_tag;
1560 	      }
1561 
1562 	    if (!first_pass)
1563 	      {
1564 		if (minimal)
1565 		  direction = TRE_TAG_MINIMIZE;
1566 		else
1567 		  direction = TRE_TAG_MAXIMIZE;
1568 	      }
1569 	    break;
1570 	  }
1571 
1572 	case ADDTAGS_AFTER_CAT_LEFT:
1573 	  {
1574 	    int new_tag = tre_stack_pop_int(stack);
1575 	    next_tag = tre_stack_pop_int(stack);
1576 	    if (new_tag >= 0)
1577 	      {
1578 		tag = new_tag;
1579 	      }
1580 	    break;
1581 	  }
1582 
1583 	case ADDTAGS_AFTER_CAT_RIGHT:
1584 	  node = tre_stack_pop_voidptr(stack);
1585 	  if (first_pass)
1586 	    node->num_tags = ((tre_catenation_t *)node->obj)->left->num_tags
1587 	      + ((tre_catenation_t *)node->obj)->right->num_tags;
1588 	  break;
1589 
1590 	case ADDTAGS_AFTER_UNION_LEFT:
1591 	  /* Lift the bottom of the `regset' array so that when processing
1592 	     the right operand the items currently in the array are
1593 	     invisible.	 The original bottom was saved at ADDTAGS_UNION and
1594 	     will be restored at ADDTAGS_AFTER_UNION_RIGHT below. */
1595 	  while (*regset >= 0)
1596 	    regset++;
1597 	  break;
1598 
1599 	case ADDTAGS_AFTER_UNION_RIGHT:
1600 	  {
1601 	    int added_tags, tag_left, tag_right;
1602 	    tre_ast_node_t *left = tre_stack_pop_voidptr(stack);
1603 	    tre_ast_node_t *right = tre_stack_pop_voidptr(stack);
1604 	    node = tre_stack_pop_voidptr(stack);
1605 	    added_tags = tre_stack_pop_int(stack);
1606 	    if (first_pass)
1607 	      {
1608 		node->num_tags = ((tre_union_t *)node->obj)->left->num_tags
1609 		  + ((tre_union_t *)node->obj)->right->num_tags + added_tags
1610 		  + ((node->num_submatches > 0) ? 2 : 0);
1611 	      }
1612 	    regset = tre_stack_pop_voidptr(stack);
1613 	    tag_left = tre_stack_pop_int(stack);
1614 	    tag_right = tre_stack_pop_int(stack);
1615 
1616 	    /* Add tags after both children, the left child gets a smaller
1617 	       tag than the right child.  This guarantees that we prefer
1618 	       the left child over the right child. */
1619 	    /* XXX - This is not always necessary (if the children have
1620 	       tags which must be seen for every match of that child). */
1621 	    /* XXX - Check if this is the only place where tre_add_tag_right
1622 	       is used.	 If so, use tre_add_tag_left (putting the tag before
1623 	       the child as opposed after the child) and throw away
1624 	       tre_add_tag_right. */
1625 	    if (node->num_submatches > 0)
1626 	      {
1627 		if (!first_pass)
1628 		  {
1629 		    status = tre_add_tag_right(mem, left, tag_left);
1630 		    tnfa->tag_directions[tag_left] = TRE_TAG_MAXIMIZE;
1631 		    if (status == REG_OK)
1632 		      status = tre_add_tag_right(mem, right, tag_right);
1633 		    tnfa->tag_directions[tag_right] = TRE_TAG_MAXIMIZE;
1634 		  }
1635 		num_tags += 2;
1636 	      }
1637 	    direction = TRE_TAG_MAXIMIZE;
1638 	    break;
1639 	  }
1640 
1641 	default:
1642 	  assert(0);
1643 	  break;
1644 
1645 	} /* end switch(symbol) */
1646     } /* end while(tre_stack_num_objects(stack) > bottom) */
1647 
1648   if (!first_pass)
1649     tre_purge_regset(regset, tnfa, tag);
1650 
1651   if (!first_pass && minimal_tag >= 0)
1652     {
1653       int i;
1654       for (i = 0; tnfa->minimal_tags[i] >= 0; i++);
1655       tnfa->minimal_tags[i] = tag;
1656       tnfa->minimal_tags[i + 1] = minimal_tag;
1657       tnfa->minimal_tags[i + 2] = -1;
1658       minimal_tag = -1;
1659       num_minimals++;
1660     }
1661 
1662   assert(tree->num_tags == num_tags);
1663   tnfa->end_tag = num_tags;
1664   tnfa->num_tags = num_tags;
1665   tnfa->num_minimals = num_minimals;
1666   xfree(orig_regset);
1667   xfree(parents);
1668   xfree(saved_states);
1669   return status;
1670 }
1671 
1672 
1673 
1674 /*
1675   AST to TNFA compilation routines.
1676 */
1677 
1678 typedef enum {
1679   COPY_RECURSE,
1680   COPY_SET_RESULT_PTR
1681 } tre_copyast_symbol_t;
1682 
1683 /* Flags for tre_copy_ast(). */
1684 #define COPY_REMOVE_TAGS	 1
1685 #define COPY_MAXIMIZE_FIRST_TAG	 2
1686 
1687 static reg_errcode_t
tre_copy_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int flags,int * pos_add,tre_tag_direction_t * tag_directions,tre_ast_node_t ** copy,int * max_pos)1688 tre_copy_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1689 	     int flags, int *pos_add, tre_tag_direction_t *tag_directions,
1690 	     tre_ast_node_t **copy, int *max_pos)
1691 {
1692   reg_errcode_t status = REG_OK;
1693   int bottom = tre_stack_num_objects(stack);
1694   int num_copied = 0;
1695   int first_tag = 1;
1696   tre_ast_node_t **result = copy;
1697   tre_copyast_symbol_t symbol;
1698 
1699   STACK_PUSH(stack, voidptr, ast);
1700   STACK_PUSH(stack, int, COPY_RECURSE);
1701 
1702   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1703     {
1704       tre_ast_node_t *node;
1705       if (status != REG_OK)
1706 	break;
1707 
1708       symbol = (tre_copyast_symbol_t)tre_stack_pop_int(stack);
1709       switch (symbol)
1710 	{
1711 	case COPY_SET_RESULT_PTR:
1712 	  result = tre_stack_pop_voidptr(stack);
1713 	  break;
1714 	case COPY_RECURSE:
1715 	  node = tre_stack_pop_voidptr(stack);
1716 	  switch (node->type)
1717 	    {
1718 	    case LITERAL:
1719 	      {
1720 		tre_literal_t *lit = node->obj;
1721 		int pos = lit->position;
1722 		int min = lit->code_min;
1723 		int max = lit->code_max;
1724 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1725 		  {
1726 		    /* XXX - e.g. [ab] has only one position but two
1727 		       nodes, so we are creating holes in the state space
1728 		       here.  Not fatal, just wastes memory. */
1729 		    pos += *pos_add;
1730 		    num_copied++;
1731 		  }
1732 		else if (IS_TAG(lit) && (flags & COPY_REMOVE_TAGS))
1733 		  {
1734 		    /* Change this tag to empty. */
1735 		    min = EMPTY;
1736 		    max = pos = -1;
1737 		  }
1738 		else if (IS_TAG(lit) && (flags & COPY_MAXIMIZE_FIRST_TAG)
1739 			 && first_tag)
1740 		  {
1741 		    /* Maximize the first tag. */
1742 		    tag_directions[max] = TRE_TAG_MAXIMIZE;
1743 		    first_tag = 0;
1744 		  }
1745 		*result = tre_ast_new_literal(mem, min, max, pos);
1746 		if (*result == NULL)
1747 		  status = REG_ESPACE;
1748 		else {
1749 		  tre_literal_t *p = (*result)->obj;
1750 		  p->class = lit->class;
1751 		  p->neg_classes = lit->neg_classes;
1752 		}
1753 
1754 		if (pos > *max_pos)
1755 		  *max_pos = pos;
1756 		break;
1757 	      }
1758 	    case UNION:
1759 	      {
1760 		tre_union_t *uni = node->obj;
1761 		tre_union_t *tmp;
1762 		*result = tre_ast_new_union(mem, uni->left, uni->right);
1763 		if (*result == NULL)
1764 		  {
1765 		    status = REG_ESPACE;
1766 		    break;
1767 		  }
1768 		tmp = (*result)->obj;
1769 		result = &tmp->left;
1770 		STACK_PUSHX(stack, voidptr, uni->right);
1771 		STACK_PUSHX(stack, int, COPY_RECURSE);
1772 		STACK_PUSHX(stack, voidptr, &tmp->right);
1773 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1774 		STACK_PUSHX(stack, voidptr, uni->left);
1775 		STACK_PUSHX(stack, int, COPY_RECURSE);
1776 		break;
1777 	      }
1778 	    case CATENATION:
1779 	      {
1780 		tre_catenation_t *cat = node->obj;
1781 		tre_catenation_t *tmp;
1782 		*result = tre_ast_new_catenation(mem, cat->left, cat->right);
1783 		if (*result == NULL)
1784 		  {
1785 		    status = REG_ESPACE;
1786 		    break;
1787 		  }
1788 		tmp = (*result)->obj;
1789 		tmp->left = NULL;
1790 		tmp->right = NULL;
1791 		result = &tmp->left;
1792 
1793 		STACK_PUSHX(stack, voidptr, cat->right);
1794 		STACK_PUSHX(stack, int, COPY_RECURSE);
1795 		STACK_PUSHX(stack, voidptr, &tmp->right);
1796 		STACK_PUSHX(stack, int, COPY_SET_RESULT_PTR);
1797 		STACK_PUSHX(stack, voidptr, cat->left);
1798 		STACK_PUSHX(stack, int, COPY_RECURSE);
1799 		break;
1800 	      }
1801 	    case ITERATION:
1802 	      {
1803 		tre_iteration_t *iter = node->obj;
1804 		STACK_PUSHX(stack, voidptr, iter->arg);
1805 		STACK_PUSHX(stack, int, COPY_RECURSE);
1806 		*result = tre_ast_new_iter(mem, iter->arg, iter->min,
1807 					   iter->max, iter->minimal);
1808 		if (*result == NULL)
1809 		  {
1810 		    status = REG_ESPACE;
1811 		    break;
1812 		  }
1813 		iter = (*result)->obj;
1814 		result = &iter->arg;
1815 		break;
1816 	      }
1817 	    default:
1818 	      assert(0);
1819 	      break;
1820 	    }
1821 	  break;
1822 	}
1823     }
1824   *pos_add += num_copied;
1825   return status;
1826 }
1827 
1828 typedef enum {
1829   EXPAND_RECURSE,
1830   EXPAND_AFTER_ITER
1831 } tre_expand_ast_symbol_t;
1832 
1833 /* Expands each iteration node that has a finite nonzero minimum or maximum
1834    iteration count to a catenated sequence of copies of the node. */
1835 static reg_errcode_t
tre_expand_ast(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * ast,int * position,tre_tag_direction_t * tag_directions)1836 tre_expand_ast(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *ast,
1837 	       int *position, tre_tag_direction_t *tag_directions)
1838 {
1839   reg_errcode_t status = REG_OK;
1840   int bottom = tre_stack_num_objects(stack);
1841   int pos_add = 0;
1842   int pos_add_total = 0;
1843   int max_pos = 0;
1844   int iter_depth = 0;
1845 
1846   STACK_PUSHR(stack, voidptr, ast);
1847   STACK_PUSHR(stack, int, EXPAND_RECURSE);
1848   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
1849     {
1850       tre_ast_node_t *node;
1851       tre_expand_ast_symbol_t symbol;
1852 
1853       if (status != REG_OK)
1854 	break;
1855 
1856       symbol = (tre_expand_ast_symbol_t)tre_stack_pop_int(stack);
1857       node = tre_stack_pop_voidptr(stack);
1858       switch (symbol)
1859 	{
1860 	case EXPAND_RECURSE:
1861 	  switch (node->type)
1862 	    {
1863 	    case LITERAL:
1864 	      {
1865 		tre_literal_t *lit= node->obj;
1866 		if (!IS_SPECIAL(lit) || IS_BACKREF(lit))
1867 		  {
1868 		    lit->position += pos_add;
1869 		    if (lit->position > max_pos)
1870 		      max_pos = lit->position;
1871 		  }
1872 		break;
1873 	      }
1874 	    case UNION:
1875 	      {
1876 		tre_union_t *uni = node->obj;
1877 		STACK_PUSHX(stack, voidptr, uni->right);
1878 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1879 		STACK_PUSHX(stack, voidptr, uni->left);
1880 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1881 		break;
1882 	      }
1883 	    case CATENATION:
1884 	      {
1885 		tre_catenation_t *cat = node->obj;
1886 		STACK_PUSHX(stack, voidptr, cat->right);
1887 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1888 		STACK_PUSHX(stack, voidptr, cat->left);
1889 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1890 		break;
1891 	      }
1892 	    case ITERATION:
1893 	      {
1894 		tre_iteration_t *iter = node->obj;
1895 		STACK_PUSHX(stack, int, pos_add);
1896 		STACK_PUSHX(stack, voidptr, node);
1897 		STACK_PUSHX(stack, int, EXPAND_AFTER_ITER);
1898 		STACK_PUSHX(stack, voidptr, iter->arg);
1899 		STACK_PUSHX(stack, int, EXPAND_RECURSE);
1900 		/* If we are going to expand this node at EXPAND_AFTER_ITER
1901 		   then don't increase the `pos' fields of the nodes now, it
1902 		   will get done when expanding. */
1903 		if (iter->min > 1 || iter->max > 1)
1904 		  pos_add = 0;
1905 		iter_depth++;
1906 		break;
1907 	      }
1908 	    default:
1909 	      assert(0);
1910 	      break;
1911 	    }
1912 	  break;
1913 	case EXPAND_AFTER_ITER:
1914 	  {
1915 	    tre_iteration_t *iter = node->obj;
1916 	    int pos_add_last;
1917 	    pos_add = tre_stack_pop_int(stack);
1918 	    pos_add_last = pos_add;
1919 	    if (iter->min > 1 || iter->max > 1)
1920 	      {
1921 		tre_ast_node_t *seq1 = NULL, *seq2 = NULL;
1922 		int j;
1923 		int pos_add_save = pos_add;
1924 
1925 		/* Create a catenated sequence of copies of the node. */
1926 		for (j = 0; j < iter->min; j++)
1927 		  {
1928 		    tre_ast_node_t *copy;
1929 		    /* Remove tags from all but the last copy. */
1930 		    int flags = ((j + 1 < iter->min)
1931 				 ? COPY_REMOVE_TAGS
1932 				 : COPY_MAXIMIZE_FIRST_TAG);
1933 		    pos_add_save = pos_add;
1934 		    status = tre_copy_ast(mem, stack, iter->arg, flags,
1935 					  &pos_add, tag_directions, &copy,
1936 					  &max_pos);
1937 		    if (status != REG_OK)
1938 		      return status;
1939 		    if (seq1 != NULL)
1940 		      seq1 = tre_ast_new_catenation(mem, seq1, copy);
1941 		    else
1942 		      seq1 = copy;
1943 		    if (seq1 == NULL)
1944 		      return REG_ESPACE;
1945 		  }
1946 
1947 		if (iter->max == -1)
1948 		  {
1949 		    /* No upper limit. */
1950 		    pos_add_save = pos_add;
1951 		    status = tre_copy_ast(mem, stack, iter->arg, 0,
1952 					  &pos_add, NULL, &seq2, &max_pos);
1953 		    if (status != REG_OK)
1954 		      return status;
1955 		    seq2 = tre_ast_new_iter(mem, seq2, 0, -1, 0);
1956 		    if (seq2 == NULL)
1957 		      return REG_ESPACE;
1958 		  }
1959 		else
1960 		  {
1961 		    for (j = iter->min; j < iter->max; j++)
1962 		      {
1963 			tre_ast_node_t *tmp, *copy;
1964 			pos_add_save = pos_add;
1965 			status = tre_copy_ast(mem, stack, iter->arg, 0,
1966 					      &pos_add, NULL, &copy, &max_pos);
1967 			if (status != REG_OK)
1968 			  return status;
1969 			if (seq2 != NULL)
1970 			  seq2 = tre_ast_new_catenation(mem, copy, seq2);
1971 			else
1972 			  seq2 = copy;
1973 			if (seq2 == NULL)
1974 			  return REG_ESPACE;
1975 			tmp = tre_ast_new_literal(mem, EMPTY, -1, -1);
1976 			if (tmp == NULL)
1977 			  return REG_ESPACE;
1978 			seq2 = tre_ast_new_union(mem, tmp, seq2);
1979 			if (seq2 == NULL)
1980 			  return REG_ESPACE;
1981 		      }
1982 		  }
1983 
1984 		pos_add = pos_add_save;
1985 		if (seq1 == NULL)
1986 		  seq1 = seq2;
1987 		else if (seq2 != NULL)
1988 		  seq1 = tre_ast_new_catenation(mem, seq1, seq2);
1989 		if (seq1 == NULL)
1990 		  return REG_ESPACE;
1991 		node->obj = seq1->obj;
1992 		node->type = seq1->type;
1993 	      }
1994 
1995 	    iter_depth--;
1996 	    pos_add_total += pos_add - pos_add_last;
1997 	    if (iter_depth == 0)
1998 	      pos_add = pos_add_total;
1999 
2000 	    break;
2001 	  }
2002 	default:
2003 	  assert(0);
2004 	  break;
2005 	}
2006     }
2007 
2008   *position += pos_add_total;
2009 
2010   /* `max_pos' should never be larger than `*position' if the above
2011      code works, but just an extra safeguard let's make sure
2012      `*position' is set large enough so enough memory will be
2013      allocated for the transition table. */
2014   if (max_pos > *position)
2015     *position = max_pos;
2016 
2017   return status;
2018 }
2019 
2020 static tre_pos_and_tags_t *
tre_set_empty(tre_mem_t mem)2021 tre_set_empty(tre_mem_t mem)
2022 {
2023   tre_pos_and_tags_t *new_set;
2024 
2025   new_set = tre_mem_calloc(mem, sizeof(*new_set));
2026   if (new_set == NULL)
2027     return NULL;
2028 
2029   new_set[0].position = -1;
2030   new_set[0].code_min = -1;
2031   new_set[0].code_max = -1;
2032 
2033   return new_set;
2034 }
2035 
2036 static tre_pos_and_tags_t *
tre_set_one(tre_mem_t mem,int position,int code_min,int code_max,tre_ctype_t class,tre_ctype_t * neg_classes,int backref)2037 tre_set_one(tre_mem_t mem, int position, int code_min, int code_max,
2038 	    tre_ctype_t class, tre_ctype_t *neg_classes, int backref)
2039 {
2040   tre_pos_and_tags_t *new_set;
2041 
2042   new_set = tre_mem_calloc(mem, sizeof(*new_set) * 2);
2043   if (new_set == NULL)
2044     return NULL;
2045 
2046   new_set[0].position = position;
2047   new_set[0].code_min = code_min;
2048   new_set[0].code_max = code_max;
2049   new_set[0].class = class;
2050   new_set[0].neg_classes = neg_classes;
2051   new_set[0].backref = backref;
2052   new_set[1].position = -1;
2053   new_set[1].code_min = -1;
2054   new_set[1].code_max = -1;
2055 
2056   return new_set;
2057 }
2058 
2059 static tre_pos_and_tags_t *
tre_set_union(tre_mem_t mem,tre_pos_and_tags_t * set1,tre_pos_and_tags_t * set2,int * tags,int assertions)2060 tre_set_union(tre_mem_t mem, tre_pos_and_tags_t *set1, tre_pos_and_tags_t *set2,
2061 	      int *tags, int assertions)
2062 {
2063   int s1, s2, i, j;
2064   tre_pos_and_tags_t *new_set;
2065   int *new_tags;
2066   int num_tags;
2067 
2068   for (num_tags = 0; tags != NULL && tags[num_tags] >= 0; num_tags++);
2069   for (s1 = 0; set1[s1].position >= 0; s1++);
2070   for (s2 = 0; set2[s2].position >= 0; s2++);
2071   new_set = tre_mem_calloc(mem, sizeof(*new_set) * (s1 + s2 + 1));
2072   if (!new_set )
2073     return NULL;
2074 
2075   for (s1 = 0; set1[s1].position >= 0; s1++)
2076     {
2077       new_set[s1].position = set1[s1].position;
2078       new_set[s1].code_min = set1[s1].code_min;
2079       new_set[s1].code_max = set1[s1].code_max;
2080       new_set[s1].assertions = set1[s1].assertions | assertions;
2081       new_set[s1].class = set1[s1].class;
2082       new_set[s1].neg_classes = set1[s1].neg_classes;
2083       new_set[s1].backref = set1[s1].backref;
2084       if (set1[s1].tags == NULL && tags == NULL)
2085 	new_set[s1].tags = NULL;
2086       else
2087 	{
2088 	  for (i = 0; set1[s1].tags != NULL && set1[s1].tags[i] >= 0; i++);
2089 	  new_tags = tre_mem_alloc(mem, (sizeof(*new_tags)
2090 					 * (i + num_tags + 1)));
2091 	  if (new_tags == NULL)
2092 	    return NULL;
2093 	  for (j = 0; j < i; j++)
2094 	    new_tags[j] = set1[s1].tags[j];
2095 	  for (i = 0; i < num_tags; i++)
2096 	    new_tags[j + i] = tags[i];
2097 	  new_tags[j + i] = -1;
2098 	  new_set[s1].tags = new_tags;
2099 	}
2100     }
2101 
2102   for (s2 = 0; set2[s2].position >= 0; s2++)
2103     {
2104       new_set[s1 + s2].position = set2[s2].position;
2105       new_set[s1 + s2].code_min = set2[s2].code_min;
2106       new_set[s1 + s2].code_max = set2[s2].code_max;
2107       /* XXX - why not | assertions here as well? */
2108       new_set[s1 + s2].assertions = set2[s2].assertions;
2109       new_set[s1 + s2].class = set2[s2].class;
2110       new_set[s1 + s2].neg_classes = set2[s2].neg_classes;
2111       new_set[s1 + s2].backref = set2[s2].backref;
2112       if (set2[s2].tags == NULL)
2113 	new_set[s1 + s2].tags = NULL;
2114       else
2115 	{
2116 	  for (i = 0; set2[s2].tags[i] >= 0; i++);
2117 	  new_tags = tre_mem_alloc(mem, sizeof(*new_tags) * (i + 1));
2118 	  if (new_tags == NULL)
2119 	    return NULL;
2120 	  for (j = 0; j < i; j++)
2121 	    new_tags[j] = set2[s2].tags[j];
2122 	  new_tags[j] = -1;
2123 	  new_set[s1 + s2].tags = new_tags;
2124 	}
2125     }
2126   new_set[s1 + s2].position = -1;
2127   return new_set;
2128 }
2129 
2130 /* Finds the empty path through `node' which is the one that should be
2131    taken according to POSIX.2 rules, and adds the tags on that path to
2132    `tags'.   `tags' may be NULL.  If `num_tags_seen' is not NULL, it is
2133    set to the number of tags seen on the path. */
2134 static reg_errcode_t
tre_match_empty(tre_stack_t * stack,tre_ast_node_t * node,int * tags,int * assertions,int * num_tags_seen)2135 tre_match_empty(tre_stack_t *stack, tre_ast_node_t *node, int *tags,
2136 		int *assertions, int *num_tags_seen)
2137 {
2138   tre_literal_t *lit;
2139   tre_union_t *uni;
2140   tre_catenation_t *cat;
2141   tre_iteration_t *iter;
2142   int i;
2143   int bottom = tre_stack_num_objects(stack);
2144   reg_errcode_t status = REG_OK;
2145   if (num_tags_seen)
2146     *num_tags_seen = 0;
2147 
2148   status = tre_stack_push_voidptr(stack, node);
2149 
2150   /* Walk through the tree recursively. */
2151   while (status == REG_OK && tre_stack_num_objects(stack) > bottom)
2152     {
2153       node = tre_stack_pop_voidptr(stack);
2154 
2155       switch (node->type)
2156 	{
2157 	case LITERAL:
2158 	  lit = (tre_literal_t *)node->obj;
2159 	  switch (lit->code_min)
2160 	    {
2161 	    case TAG:
2162 	      if (lit->code_max >= 0)
2163 		{
2164 		  if (tags != NULL)
2165 		    {
2166 		      /* Add the tag to `tags'. */
2167 		      for (i = 0; tags[i] >= 0; i++)
2168 			if (tags[i] == lit->code_max)
2169 			  break;
2170 		      if (tags[i] < 0)
2171 			{
2172 			  tags[i] = lit->code_max;
2173 			  tags[i + 1] = -1;
2174 			}
2175 		    }
2176 		  if (num_tags_seen)
2177 		    (*num_tags_seen)++;
2178 		}
2179 	      break;
2180 	    case ASSERTION:
2181 	      assert(lit->code_max >= 1
2182 		     || lit->code_max <= ASSERT_LAST);
2183 	      if (assertions != NULL)
2184 		*assertions |= lit->code_max;
2185 	      break;
2186 	    case EMPTY:
2187 	      break;
2188 	    default:
2189 	      assert(0);
2190 	      break;
2191 	    }
2192 	  break;
2193 
2194 	case UNION:
2195 	  /* Subexpressions starting earlier take priority over ones
2196 	     starting later, so we prefer the left subexpression over the
2197 	     right subexpression. */
2198 	  uni = (tre_union_t *)node->obj;
2199 	  if (uni->left->nullable)
2200 	    STACK_PUSHX(stack, voidptr, uni->left)
2201 	  else if (uni->right->nullable)
2202 	    STACK_PUSHX(stack, voidptr, uni->right)
2203 	  else
2204 	    assert(0);
2205 	  break;
2206 
2207 	case CATENATION:
2208 	  /* The path must go through both children. */
2209 	  cat = (tre_catenation_t *)node->obj;
2210 	  assert(cat->left->nullable);
2211 	  assert(cat->right->nullable);
2212 	  STACK_PUSHX(stack, voidptr, cat->left);
2213 	  STACK_PUSHX(stack, voidptr, cat->right);
2214 	  break;
2215 
2216 	case ITERATION:
2217 	  /* A match with an empty string is preferred over no match at
2218 	     all, so we go through the argument if possible. */
2219 	  iter = (tre_iteration_t *)node->obj;
2220 	  if (iter->arg->nullable)
2221 	    STACK_PUSHX(stack, voidptr, iter->arg);
2222 	  break;
2223 
2224 	default:
2225 	  assert(0);
2226 	  break;
2227 	}
2228     }
2229 
2230   return status;
2231 }
2232 
2233 
2234 typedef enum {
2235   NFL_RECURSE,
2236   NFL_POST_UNION,
2237   NFL_POST_CATENATION,
2238   NFL_POST_ITERATION
2239 } tre_nfl_stack_symbol_t;
2240 
2241 
2242 /* Computes and fills in the fields `nullable', `firstpos', and `lastpos' for
2243    the nodes of the AST `tree'. */
2244 static reg_errcode_t
tre_compute_nfl(tre_mem_t mem,tre_stack_t * stack,tre_ast_node_t * tree)2245 tre_compute_nfl(tre_mem_t mem, tre_stack_t *stack, tre_ast_node_t *tree)
2246 {
2247   int bottom = tre_stack_num_objects(stack);
2248 
2249   STACK_PUSHR(stack, voidptr, tree);
2250   STACK_PUSHR(stack, int, NFL_RECURSE);
2251 
2252   while (tre_stack_num_objects(stack) > bottom)
2253     {
2254       tre_nfl_stack_symbol_t symbol;
2255       tre_ast_node_t *node;
2256 
2257       symbol = (tre_nfl_stack_symbol_t)tre_stack_pop_int(stack);
2258       node = tre_stack_pop_voidptr(stack);
2259       switch (symbol)
2260 	{
2261 	case NFL_RECURSE:
2262 	  switch (node->type)
2263 	    {
2264 	    case LITERAL:
2265 	      {
2266 		tre_literal_t *lit = (tre_literal_t *)node->obj;
2267 		if (IS_BACKREF(lit))
2268 		  {
2269 		    /* Back references: nullable = false, firstpos = {i},
2270 		       lastpos = {i}. */
2271 		    node->nullable = 0;
2272 		    node->firstpos = tre_set_one(mem, lit->position, 0,
2273 					     TRE_CHAR_MAX, 0, NULL, -1);
2274 		    if (!node->firstpos)
2275 		      return REG_ESPACE;
2276 		    node->lastpos = tre_set_one(mem, lit->position, 0,
2277 						TRE_CHAR_MAX, 0, NULL,
2278 						(int)lit->code_max);
2279 		    if (!node->lastpos)
2280 		      return REG_ESPACE;
2281 		  }
2282 		else if (lit->code_min < 0)
2283 		  {
2284 		    /* Tags, empty strings, params, and zero width assertions:
2285 		       nullable = true, firstpos = {}, and lastpos = {}. */
2286 		    node->nullable = 1;
2287 		    node->firstpos = tre_set_empty(mem);
2288 		    if (!node->firstpos)
2289 		      return REG_ESPACE;
2290 		    node->lastpos = tre_set_empty(mem);
2291 		    if (!node->lastpos)
2292 		      return REG_ESPACE;
2293 		  }
2294 		else
2295 		  {
2296 		    /* Literal at position i: nullable = false, firstpos = {i},
2297 		       lastpos = {i}. */
2298 		    node->nullable = 0;
2299 		    node->firstpos =
2300 		      tre_set_one(mem, lit->position, (int)lit->code_min,
2301 				  (int)lit->code_max, 0, NULL, -1);
2302 		    if (!node->firstpos)
2303 		      return REG_ESPACE;
2304 		    node->lastpos = tre_set_one(mem, lit->position,
2305 						(int)lit->code_min,
2306 						(int)lit->code_max,
2307 						lit->class, lit->neg_classes,
2308 						-1);
2309 		    if (!node->lastpos)
2310 		      return REG_ESPACE;
2311 		  }
2312 		break;
2313 	      }
2314 
2315 	    case UNION:
2316 	      /* Compute the attributes for the two subtrees, and after that
2317 		 for this node. */
2318 	      STACK_PUSHR(stack, voidptr, node);
2319 	      STACK_PUSHR(stack, int, NFL_POST_UNION);
2320 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->right);
2321 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2322 	      STACK_PUSHR(stack, voidptr, ((tre_union_t *)node->obj)->left);
2323 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2324 	      break;
2325 
2326 	    case CATENATION:
2327 	      /* Compute the attributes for the two subtrees, and after that
2328 		 for this node. */
2329 	      STACK_PUSHR(stack, voidptr, node);
2330 	      STACK_PUSHR(stack, int, NFL_POST_CATENATION);
2331 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->right);
2332 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2333 	      STACK_PUSHR(stack, voidptr, ((tre_catenation_t *)node->obj)->left);
2334 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2335 	      break;
2336 
2337 	    case ITERATION:
2338 	      /* Compute the attributes for the subtree, and after that for
2339 		 this node. */
2340 	      STACK_PUSHR(stack, voidptr, node);
2341 	      STACK_PUSHR(stack, int, NFL_POST_ITERATION);
2342 	      STACK_PUSHR(stack, voidptr, ((tre_iteration_t *)node->obj)->arg);
2343 	      STACK_PUSHR(stack, int, NFL_RECURSE);
2344 	      break;
2345 	    }
2346 	  break; /* end case: NFL_RECURSE */
2347 
2348 	case NFL_POST_UNION:
2349 	  {
2350 	    tre_union_t *uni = (tre_union_t *)node->obj;
2351 	    node->nullable = uni->left->nullable || uni->right->nullable;
2352 	    node->firstpos = tre_set_union(mem, uni->left->firstpos,
2353 					   uni->right->firstpos, NULL, 0);
2354 	    if (!node->firstpos)
2355 	      return REG_ESPACE;
2356 	    node->lastpos = tre_set_union(mem, uni->left->lastpos,
2357 					  uni->right->lastpos, NULL, 0);
2358 	    if (!node->lastpos)
2359 	      return REG_ESPACE;
2360 	    break;
2361 	  }
2362 
2363 	case NFL_POST_ITERATION:
2364 	  {
2365 	    tre_iteration_t *iter = (tre_iteration_t *)node->obj;
2366 
2367 	    if (iter->min == 0 || iter->arg->nullable)
2368 	      node->nullable = 1;
2369 	    else
2370 	      node->nullable = 0;
2371 	    node->firstpos = iter->arg->firstpos;
2372 	    node->lastpos = iter->arg->lastpos;
2373 	    break;
2374 	  }
2375 
2376 	case NFL_POST_CATENATION:
2377 	  {
2378 	    int num_tags, *tags, assertions;
2379 	    reg_errcode_t status;
2380 	    tre_catenation_t *cat = node->obj;
2381 	    node->nullable = cat->left->nullable && cat->right->nullable;
2382 
2383 	    /* Compute firstpos. */
2384 	    if (cat->left->nullable)
2385 	      {
2386 		/* The left side matches the empty string.  Make a first pass
2387 		   with tre_match_empty() to get the number of tags and
2388 		   parameters. */
2389 		status = tre_match_empty(stack, cat->left,
2390 					 NULL, NULL, &num_tags);
2391 		if (status != REG_OK)
2392 		  return status;
2393 		/* Allocate arrays for the tags and parameters. */
2394 		tags = xmalloc(sizeof(*tags) * (num_tags + 1));
2395 		if (!tags)
2396 		  return REG_ESPACE;
2397 		tags[0] = -1;
2398 		assertions = 0;
2399 		/* Second pass with tre_mach_empty() to get the list of
2400 		   tags and parameters. */
2401 		status = tre_match_empty(stack, cat->left, tags,
2402 					 &assertions, NULL);
2403 		if (status != REG_OK)
2404 		  {
2405 		    xfree(tags);
2406 		    return status;
2407 		  }
2408 		node->firstpos =
2409 		  tre_set_union(mem, cat->right->firstpos, cat->left->firstpos,
2410 				tags, assertions);
2411 		xfree(tags);
2412 		if (!node->firstpos)
2413 		  return REG_ESPACE;
2414 	      }
2415 	    else
2416 	      {
2417 		node->firstpos = cat->left->firstpos;
2418 	      }
2419 
2420 	    /* Compute lastpos. */
2421 	    if (cat->right->nullable)
2422 	      {
2423 		/* The right side matches the empty string.  Make a first pass
2424 		   with tre_match_empty() to get the number of tags and
2425 		   parameters. */
2426 		status = tre_match_empty(stack, cat->right,
2427 					 NULL, NULL, &num_tags);
2428 		if (status != REG_OK)
2429 		  return status;
2430 		/* Allocate arrays for the tags and parameters. */
2431 		tags = xmalloc(sizeof(int) * (num_tags + 1));
2432 		if (!tags)
2433 		  return REG_ESPACE;
2434 		tags[0] = -1;
2435 		assertions = 0;
2436 		/* Second pass with tre_mach_empty() to get the list of
2437 		   tags and parameters. */
2438 		status = tre_match_empty(stack, cat->right, tags,
2439 					 &assertions, NULL);
2440 		if (status != REG_OK)
2441 		  {
2442 		    xfree(tags);
2443 		    return status;
2444 		  }
2445 		node->lastpos =
2446 		  tre_set_union(mem, cat->left->lastpos, cat->right->lastpos,
2447 				tags, assertions);
2448 		xfree(tags);
2449 		if (!node->lastpos)
2450 		  return REG_ESPACE;
2451 	      }
2452 	    else
2453 	      {
2454 		node->lastpos = cat->right->lastpos;
2455 	      }
2456 	    break;
2457 	  }
2458 
2459 	default:
2460 	  assert(0);
2461 	  break;
2462 	}
2463     }
2464 
2465   return REG_OK;
2466 }
2467 
2468 
2469 /* Adds a transition from each position in `p1' to each position in `p2'. */
2470 static reg_errcode_t
tre_make_trans(tre_pos_and_tags_t * p1,tre_pos_and_tags_t * p2,tre_tnfa_transition_t * transitions,int * counts,int * offs)2471 tre_make_trans(tre_pos_and_tags_t *p1, tre_pos_and_tags_t *p2,
2472 	       tre_tnfa_transition_t *transitions,
2473 	       int *counts, int *offs)
2474 {
2475   tre_pos_and_tags_t *orig_p2 = p2;
2476   tre_tnfa_transition_t *trans;
2477   int i, j, k, l, dup, prev_p2_pos;
2478 
2479   if (transitions != NULL)
2480     while (p1->position >= 0)
2481       {
2482 	p2 = orig_p2;
2483 	prev_p2_pos = -1;
2484 	while (p2->position >= 0)
2485 	  {
2486 	    /* Optimization: if this position was already handled, skip it. */
2487 	    if (p2->position == prev_p2_pos)
2488 	      {
2489 		p2++;
2490 		continue;
2491 	      }
2492 	    prev_p2_pos = p2->position;
2493 	    /* Set `trans' to point to the next unused transition from
2494 	       position `p1->position'. */
2495 	    trans = transitions + offs[p1->position];
2496 	    while (trans->state != NULL)
2497 	      {
2498 #if 0
2499 		/* If we find a previous transition from `p1->position' to
2500 		   `p2->position', it is overwritten.  This can happen only
2501 		   if there are nested loops in the regexp, like in "((a)*)*".
2502 		   In POSIX.2 repetition using the outer loop is always
2503 		   preferred over using the inner loop.	 Therefore the
2504 		   transition for the inner loop is useless and can be thrown
2505 		   away. */
2506 		/* XXX - The same position is used for all nodes in a bracket
2507 		   expression, so this optimization cannot be used (it will
2508 		   break bracket expressions) unless I figure out a way to
2509 		   detect it here. */
2510 		if (trans->state_id == p2->position)
2511 		  {
2512 		    break;
2513 		  }
2514 #endif
2515 		trans++;
2516 	      }
2517 
2518 	    if (trans->state == NULL)
2519 	      (trans + 1)->state = NULL;
2520 	    /* Use the character ranges, assertions, etc. from `p1' for
2521 	       the transition from `p1' to `p2'. */
2522 	    trans->code_min = p1->code_min;
2523 	    trans->code_max = p1->code_max;
2524 	    trans->state = transitions + offs[p2->position];
2525 	    trans->state_id = p2->position;
2526 	    trans->assertions = p1->assertions | p2->assertions
2527 	      | (p1->class ? ASSERT_CHAR_CLASS : 0)
2528 	      | (p1->neg_classes != NULL ? ASSERT_CHAR_CLASS_NEG : 0);
2529 	    if (p1->backref >= 0)
2530 	      {
2531 		assert((trans->assertions & ASSERT_CHAR_CLASS) == 0);
2532 		assert(p2->backref < 0);
2533 		trans->u.backref = p1->backref;
2534 		trans->assertions |= ASSERT_BACKREF;
2535 	      }
2536 	    else
2537 	      trans->u.class = p1->class;
2538 	    if (p1->neg_classes != NULL)
2539 	      {
2540 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++);
2541 		trans->neg_classes =
2542 		  xmalloc(sizeof(*trans->neg_classes) * (i + 1));
2543 		if (trans->neg_classes == NULL)
2544 		  return REG_ESPACE;
2545 		for (i = 0; p1->neg_classes[i] != (tre_ctype_t)0; i++)
2546 		  trans->neg_classes[i] = p1->neg_classes[i];
2547 		trans->neg_classes[i] = (tre_ctype_t)0;
2548 	      }
2549 	    else
2550 	      trans->neg_classes = NULL;
2551 
2552 	    /* Find out how many tags this transition has. */
2553 	    i = 0;
2554 	    if (p1->tags != NULL)
2555 	      while(p1->tags[i] >= 0)
2556 		i++;
2557 	    j = 0;
2558 	    if (p2->tags != NULL)
2559 	      while(p2->tags[j] >= 0)
2560 		j++;
2561 
2562 	    /* If we are overwriting a transition, free the old tag array. */
2563 	    if (trans->tags != NULL)
2564 	      xfree(trans->tags);
2565 	    trans->tags = NULL;
2566 
2567 	    /* If there were any tags, allocate an array and fill it. */
2568 	    if (i + j > 0)
2569 	      {
2570 		trans->tags = xmalloc(sizeof(*trans->tags) * (i + j + 1));
2571 		if (!trans->tags)
2572 		  return REG_ESPACE;
2573 		i = 0;
2574 		if (p1->tags != NULL)
2575 		  while(p1->tags[i] >= 0)
2576 		    {
2577 		      trans->tags[i] = p1->tags[i];
2578 		      i++;
2579 		    }
2580 		l = i;
2581 		j = 0;
2582 		if (p2->tags != NULL)
2583 		  while (p2->tags[j] >= 0)
2584 		    {
2585 		      /* Don't add duplicates. */
2586 		      dup = 0;
2587 		      for (k = 0; k < i; k++)
2588 			if (trans->tags[k] == p2->tags[j])
2589 			  {
2590 			    dup = 1;
2591 			    break;
2592 			  }
2593 		      if (!dup)
2594 			trans->tags[l++] = p2->tags[j];
2595 		      j++;
2596 		    }
2597 		trans->tags[l] = -1;
2598 	      }
2599 
2600 	    p2++;
2601 	  }
2602 	p1++;
2603       }
2604   else
2605     /* Compute a maximum limit for the number of transitions leaving
2606        from each state. */
2607     while (p1->position >= 0)
2608       {
2609 	p2 = orig_p2;
2610 	while (p2->position >= 0)
2611 	  {
2612 	    counts[p1->position]++;
2613 	    p2++;
2614 	  }
2615 	p1++;
2616       }
2617   return REG_OK;
2618 }
2619 
2620 /* Converts the syntax tree to a TNFA.	All the transitions in the TNFA are
2621    labelled with one character range (there are no transitions on empty
2622    strings).  The TNFA takes O(n^2) space in the worst case, `n' is size of
2623    the regexp. */
2624 static reg_errcode_t
tre_ast_to_tnfa(tre_ast_node_t * node,tre_tnfa_transition_t * transitions,int * counts,int * offs)2625 tre_ast_to_tnfa(tre_ast_node_t *node, tre_tnfa_transition_t *transitions,
2626 		int *counts, int *offs)
2627 {
2628   tre_union_t *uni;
2629   tre_catenation_t *cat;
2630   tre_iteration_t *iter;
2631   reg_errcode_t errcode = REG_OK;
2632 
2633   /* XXX - recurse using a stack!. */
2634   switch (node->type)
2635     {
2636     case LITERAL:
2637       break;
2638     case UNION:
2639       uni = (tre_union_t *)node->obj;
2640       errcode = tre_ast_to_tnfa(uni->left, transitions, counts, offs);
2641       if (errcode != REG_OK)
2642 	return errcode;
2643       errcode = tre_ast_to_tnfa(uni->right, transitions, counts, offs);
2644       break;
2645 
2646     case CATENATION:
2647       cat = (tre_catenation_t *)node->obj;
2648       /* Add a transition from each position in cat->left->lastpos
2649 	 to each position in cat->right->firstpos. */
2650       errcode = tre_make_trans(cat->left->lastpos, cat->right->firstpos,
2651 			       transitions, counts, offs);
2652       if (errcode != REG_OK)
2653 	return errcode;
2654       errcode = tre_ast_to_tnfa(cat->left, transitions, counts, offs);
2655       if (errcode != REG_OK)
2656 	return errcode;
2657       errcode = tre_ast_to_tnfa(cat->right, transitions, counts, offs);
2658       break;
2659 
2660     case ITERATION:
2661       iter = (tre_iteration_t *)node->obj;
2662       assert(iter->max == -1 || iter->max == 1);
2663 
2664       if (iter->max == -1)
2665 	{
2666 	  assert(iter->min == 0 || iter->min == 1);
2667 	  /* Add a transition from each last position in the iterated
2668 	     expression to each first position. */
2669 	  errcode = tre_make_trans(iter->arg->lastpos, iter->arg->firstpos,
2670 				   transitions, counts, offs);
2671 	  if (errcode != REG_OK)
2672 	    return errcode;
2673 	}
2674       errcode = tre_ast_to_tnfa(iter->arg, transitions, counts, offs);
2675       break;
2676     }
2677   return errcode;
2678 }
2679 
2680 
2681 #define ERROR_EXIT(err)		  \
2682   do				  \
2683     {				  \
2684       errcode = err;		  \
2685       if (/*CONSTCOND*/1)	  \
2686       	goto error_exit;	  \
2687     }				  \
2688  while (/*CONSTCOND*/0)
2689 
2690 
2691 int
regcomp(regex_t * restrict preg,const char * restrict regex,int cflags)2692 regcomp(regex_t *restrict preg, const char *restrict regex, int cflags)
2693 {
2694   tre_stack_t *stack;
2695   tre_ast_node_t *tree, *tmp_ast_l, *tmp_ast_r;
2696   tre_pos_and_tags_t *p;
2697   int *counts = NULL, *offs = NULL;
2698   int i, add = 0;
2699   tre_tnfa_transition_t *transitions, *initial;
2700   tre_tnfa_t *tnfa = NULL;
2701   tre_submatch_data_t *submatch_data;
2702   tre_tag_direction_t *tag_directions = NULL;
2703   reg_errcode_t errcode;
2704   tre_mem_t mem;
2705 
2706   /* Parse context. */
2707   tre_parse_ctx_t parse_ctx;
2708 
2709   /* Allocate a stack used throughout the compilation process for various
2710      purposes. */
2711   stack = tre_stack_new(512, 1024000, 128);
2712   if (!stack)
2713     return REG_ESPACE;
2714   /* Allocate a fast memory allocator. */
2715   mem = tre_mem_new();
2716   if (!mem)
2717     {
2718       tre_stack_destroy(stack);
2719       return REG_ESPACE;
2720     }
2721 
2722   /* Parse the regexp. */
2723   memset(&parse_ctx, 0, sizeof(parse_ctx));
2724   parse_ctx.mem = mem;
2725   parse_ctx.stack = stack;
2726   parse_ctx.start = regex;
2727   parse_ctx.cflags = cflags;
2728   parse_ctx.max_backref = -1;
2729   errcode = tre_parse(&parse_ctx);
2730   if (errcode != REG_OK)
2731     ERROR_EXIT(errcode);
2732   preg->re_nsub = parse_ctx.submatch_id - 1;
2733   tree = parse_ctx.n;
2734 
2735 #ifdef TRE_DEBUG
2736   tre_ast_print(tree);
2737 #endif /* TRE_DEBUG */
2738 
2739   /* Referring to nonexistent subexpressions is illegal. */
2740   if (parse_ctx.max_backref > (int)preg->re_nsub)
2741     ERROR_EXIT(REG_ESUBREG);
2742 
2743   /* Allocate the TNFA struct. */
2744   tnfa = xcalloc(1, sizeof(tre_tnfa_t));
2745   if (tnfa == NULL)
2746     ERROR_EXIT(REG_ESPACE);
2747   tnfa->have_backrefs = parse_ctx.max_backref >= 0;
2748   tnfa->have_approx = 0;
2749   tnfa->num_submatches = parse_ctx.submatch_id;
2750 
2751   /* Set up tags for submatch addressing.  If REG_NOSUB is set and the
2752      regexp does not have back references, this can be skipped. */
2753   if (tnfa->have_backrefs || !(cflags & REG_NOSUB))
2754     {
2755 
2756       /* Figure out how many tags we will need. */
2757       errcode = tre_add_tags(NULL, stack, tree, tnfa);
2758       if (errcode != REG_OK)
2759 	ERROR_EXIT(errcode);
2760 
2761       if (tnfa->num_tags > 0)
2762 	{
2763 	  tag_directions = xmalloc(sizeof(*tag_directions)
2764 				   * (tnfa->num_tags + 1));
2765 	  if (tag_directions == NULL)
2766 	    ERROR_EXIT(REG_ESPACE);
2767 	  tnfa->tag_directions = tag_directions;
2768 	  memset(tag_directions, -1,
2769 		 sizeof(*tag_directions) * (tnfa->num_tags + 1));
2770 	}
2771       tnfa->minimal_tags = xcalloc((unsigned)tnfa->num_tags * 2 + 1,
2772 				   sizeof(*tnfa->minimal_tags));
2773       if (tnfa->minimal_tags == NULL)
2774 	ERROR_EXIT(REG_ESPACE);
2775 
2776       submatch_data = xcalloc((unsigned)parse_ctx.submatch_id,
2777 			      sizeof(*submatch_data));
2778       if (submatch_data == NULL)
2779 	ERROR_EXIT(REG_ESPACE);
2780       tnfa->submatch_data = submatch_data;
2781 
2782       errcode = tre_add_tags(mem, stack, tree, tnfa);
2783       if (errcode != REG_OK)
2784 	ERROR_EXIT(errcode);
2785 
2786     }
2787 
2788   /* Expand iteration nodes. */
2789   errcode = tre_expand_ast(mem, stack, tree, &parse_ctx.position,
2790 			   tag_directions);
2791   if (errcode != REG_OK)
2792     ERROR_EXIT(errcode);
2793 
2794   /* Add a dummy node for the final state.
2795      XXX - For certain patterns this dummy node can be optimized away,
2796 	   for example "a*" or "ab*".	Figure out a simple way to detect
2797 	   this possibility. */
2798   tmp_ast_l = tree;
2799   tmp_ast_r = tre_ast_new_literal(mem, 0, 0, parse_ctx.position++);
2800   if (tmp_ast_r == NULL)
2801     ERROR_EXIT(REG_ESPACE);
2802 
2803   tree = tre_ast_new_catenation(mem, tmp_ast_l, tmp_ast_r);
2804   if (tree == NULL)
2805     ERROR_EXIT(REG_ESPACE);
2806 
2807   errcode = tre_compute_nfl(mem, stack, tree);
2808   if (errcode != REG_OK)
2809     ERROR_EXIT(errcode);
2810 
2811   counts = xmalloc(sizeof(int) * parse_ctx.position);
2812   if (counts == NULL)
2813     ERROR_EXIT(REG_ESPACE);
2814 
2815   offs = xmalloc(sizeof(int) * parse_ctx.position);
2816   if (offs == NULL)
2817     ERROR_EXIT(REG_ESPACE);
2818 
2819   for (i = 0; i < parse_ctx.position; i++)
2820     counts[i] = 0;
2821   tre_ast_to_tnfa(tree, NULL, counts, NULL);
2822 
2823   add = 0;
2824   for (i = 0; i < parse_ctx.position; i++)
2825     {
2826       offs[i] = add;
2827       add += counts[i] + 1;
2828       counts[i] = 0;
2829     }
2830   transitions = xcalloc((unsigned)add + 1, sizeof(*transitions));
2831   if (transitions == NULL)
2832     ERROR_EXIT(REG_ESPACE);
2833   tnfa->transitions = transitions;
2834   tnfa->num_transitions = add;
2835 
2836   errcode = tre_ast_to_tnfa(tree, transitions, counts, offs);
2837   if (errcode != REG_OK)
2838     ERROR_EXIT(errcode);
2839 
2840   tnfa->firstpos_chars = NULL;
2841 
2842   p = tree->firstpos;
2843   i = 0;
2844   while (p->position >= 0)
2845     {
2846       i++;
2847       p++;
2848     }
2849 
2850   initial = xcalloc((unsigned)i + 1, sizeof(tre_tnfa_transition_t));
2851   if (initial == NULL)
2852     ERROR_EXIT(REG_ESPACE);
2853   tnfa->initial = initial;
2854 
2855   i = 0;
2856   for (p = tree->firstpos; p->position >= 0; p++)
2857     {
2858       initial[i].state = transitions + offs[p->position];
2859       initial[i].state_id = p->position;
2860       initial[i].tags = NULL;
2861       /* Copy the arrays p->tags, and p->params, they are allocated
2862 	 from a tre_mem object. */
2863       if (p->tags)
2864 	{
2865 	  int j;
2866 	  for (j = 0; p->tags[j] >= 0; j++);
2867 	  initial[i].tags = xmalloc(sizeof(*p->tags) * (j + 1));
2868 	  if (!initial[i].tags)
2869 	    ERROR_EXIT(REG_ESPACE);
2870 	  memcpy(initial[i].tags, p->tags, sizeof(*p->tags) * (j + 1));
2871 	}
2872       initial[i].assertions = p->assertions;
2873       i++;
2874     }
2875   initial[i].state = NULL;
2876 
2877   tnfa->num_transitions = add;
2878   tnfa->final = transitions + offs[tree->lastpos[0].position];
2879   tnfa->num_states = parse_ctx.position;
2880   tnfa->cflags = cflags;
2881 
2882   tre_mem_destroy(mem);
2883   tre_stack_destroy(stack);
2884   xfree(counts);
2885   xfree(offs);
2886 
2887   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2888   return REG_OK;
2889 
2890  error_exit:
2891   /* Free everything that was allocated and return the error code. */
2892   tre_mem_destroy(mem);
2893   if (stack != NULL)
2894     tre_stack_destroy(stack);
2895   if (counts != NULL)
2896     xfree(counts);
2897   if (offs != NULL)
2898     xfree(offs);
2899   preg->TRE_REGEX_T_FIELD = (void *)tnfa;
2900   regfree(preg);
2901   return errcode;
2902 }
2903 
2904 
2905 
2906 
2907 void
regfree(regex_t * preg)2908 regfree(regex_t *preg)
2909 {
2910   tre_tnfa_t *tnfa;
2911   unsigned int i;
2912   tre_tnfa_transition_t *trans;
2913 
2914   tnfa = (void *)preg->TRE_REGEX_T_FIELD;
2915   if (!tnfa)
2916     return;
2917 
2918   for (i = 0; i < tnfa->num_transitions; i++)
2919     if (tnfa->transitions[i].state)
2920       {
2921 	if (tnfa->transitions[i].tags)
2922 	  xfree(tnfa->transitions[i].tags);
2923 	if (tnfa->transitions[i].neg_classes)
2924 	  xfree(tnfa->transitions[i].neg_classes);
2925       }
2926   if (tnfa->transitions)
2927     xfree(tnfa->transitions);
2928 
2929   if (tnfa->initial)
2930     {
2931       for (trans = tnfa->initial; trans->state; trans++)
2932 	{
2933 	  if (trans->tags)
2934 	    xfree(trans->tags);
2935 	}
2936       xfree(tnfa->initial);
2937     }
2938 
2939   if (tnfa->submatch_data)
2940     {
2941       for (i = 0; i < tnfa->num_submatches; i++)
2942 	if (tnfa->submatch_data[i].parents)
2943 	  xfree(tnfa->submatch_data[i].parents);
2944       xfree(tnfa->submatch_data);
2945     }
2946 
2947   if (tnfa->tag_directions)
2948     xfree(tnfa->tag_directions);
2949   if (tnfa->firstpos_chars)
2950     xfree(tnfa->firstpos_chars);
2951   if (tnfa->minimal_tags)
2952     xfree(tnfa->minimal_tags);
2953   xfree(tnfa);
2954 }
2955