• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2   regexec.c - TRE POSIX compatible matching functions (and more).
3 
4   Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5   All rights reserved.
6 
7   Redistribution and use in source and binary forms, with or without
8   modification, are permitted provided that the following conditions
9   are met:
10 
11     1. Redistributions of source code must retain the above copyright
12        notice, this list of conditions and the following disclaimer.
13 
14     2. Redistributions in binary form must reproduce the above copyright
15        notice, this list of conditions and the following disclaimer in the
16        documentation and/or other materials provided with the distribution.
17 
18   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT
22   HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 
30 */
31 
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37 #include <stdint.h>
38 
39 #include <regex.h>
40 
41 #include "tre.h"
42 
43 #include <assert.h>
44 
45 static void
46 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47 		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48 
49 /***********************************************************************
50  from tre-match-utils.h
51 ***********************************************************************/
52 
53 #define GET_NEXT_WCHAR() do {                                                 \
54     prev_c = next_c; pos += pos_add_next;                                     \
55     if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) {        \
56         if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; }         \
57         else pos_add_next++;                                                  \
58     }                                                                         \
59     str_byte += pos_add_next;                                                 \
60   } while (0)
61 
62 #define IS_WORD_CHAR(c)	 ((c) == L'_' || tre_isalnum(c))
63 
64 #define CHECK_ASSERTIONS(assertions)					      \
65   (((assertions & ASSERT_AT_BOL)					      \
66     && (pos > 0 || reg_notbol)						      \
67     && (prev_c != L'\n' || !reg_newline))				      \
68    || ((assertions & ASSERT_AT_EOL)					      \
69        && (next_c != L'\0' || reg_noteol)				      \
70        && (next_c != L'\n' || !reg_newline))				      \
71    || ((assertions & ASSERT_AT_BOW)					      \
72        && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c)))	              \
73    || ((assertions & ASSERT_AT_EOW)					      \
74        && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c)))		      \
75    || ((assertions & ASSERT_AT_WB)					      \
76        && (pos != 0 && next_c != L'\0'					      \
77 	   && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c)))		      \
78    || ((assertions & ASSERT_AT_WB_NEG)					      \
79        && (pos == 0 || next_c == L'\0'					      \
80 	   || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81 
82 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)                             \
83   (((trans_i->assertions & ASSERT_CHAR_CLASS)                                 \
84        && !(tnfa->cflags & REG_ICASE)                                         \
85        && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class))                 \
86     || ((trans_i->assertions & ASSERT_CHAR_CLASS)                             \
87         && (tnfa->cflags & REG_ICASE)                                         \
88         && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class)     \
89 	&& !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class))    \
90     || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG)                         \
91         && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92                                       tnfa->cflags & REG_ICASE)))
93 
94 
95 
96 
97 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
98 static int
tre_tag_order(int num_tags,tre_tag_direction_t * tag_directions,regoff_t * t1,regoff_t * t2)99 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100 	      regoff_t *t1, regoff_t *t2)
101 {
102   int i;
103   for (i = 0; i < num_tags; i++)
104     {
105       if (tag_directions[i] == TRE_TAG_MINIMIZE)
106 	{
107 	  if (t1[i] < t2[i])
108 	    return 1;
109 	  if (t1[i] > t2[i])
110 	    return 0;
111 	}
112       else
113 	{
114 	  if (t1[i] > t2[i])
115 	    return 1;
116 	  if (t1[i] < t2[i])
117 	    return 0;
118 	}
119     }
120   /*  assert(0);*/
121   return 0;
122 }
123 
124 static int
tre_neg_char_classes_match(tre_ctype_t * classes,tre_cint_t wc,int icase)125 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126 {
127   while (*classes != (tre_ctype_t)0)
128     if ((!icase && tre_isctype(wc, *classes))
129 	|| (icase && (tre_isctype(tre_toupper(wc), *classes)
130 		      || tre_isctype(tre_tolower(wc), *classes))))
131       return 1; /* Match. */
132     else
133       classes++;
134   return 0; /* No match. */
135 }
136 
137 
138 /***********************************************************************
139  from tre-match-parallel.c
140 ***********************************************************************/
141 
142 /*
143   This algorithm searches for matches basically by reading characters
144   in the searched string one by one, starting at the beginning.	 All
145   matching paths in the TNFA are traversed in parallel.	 When two or
146   more paths reach the same state, exactly one is chosen according to
147   tag ordering rules; if returning submatches is not required it does
148   not matter which path is chosen.
149 
150   The worst case time required for finding the leftmost and longest
151   match, or determining that there is no match, is always linearly
152   dependent on the length of the text being searched.
153 
154   This algorithm cannot handle TNFAs with back referencing nodes.
155   See `tre-match-backtrack.c'.
156 */
157 
158 typedef struct {
159   tre_tnfa_transition_t *state;
160   regoff_t *tags;
161 } tre_tnfa_reach_t;
162 
163 typedef struct {
164   regoff_t pos;
165   regoff_t **tags;
166 } tre_reach_pos_t;
167 
168 
169 static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)170 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171 		      regoff_t *match_tags, int eflags,
172 		      regoff_t *match_end_ofs)
173 {
174   /* State variables required by GET_NEXT_WCHAR. */
175   tre_char_t prev_c = 0, next_c = 0;
176   const char *str_byte = string;
177   regoff_t pos = -1;
178   regoff_t pos_add_next = 1;
179 #ifdef TRE_MBSTATE
180   mbstate_t mbstate;
181 #endif /* TRE_MBSTATE */
182   int reg_notbol = eflags & REG_NOTBOL;
183   int reg_noteol = eflags & REG_NOTEOL;
184   int reg_newline = tnfa->cflags & REG_NEWLINE;
185   reg_errcode_t ret;
186 
187   char *buf;
188   tre_tnfa_transition_t *trans_i;
189   tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190   tre_reach_pos_t *reach_pos;
191   int *tag_i;
192   int num_tags, i;
193 
194   regoff_t match_eo = -1;	   /* end offset of match (-1 if no match found yet) */
195   int new_match = 0;
196   regoff_t *tmp_tags = NULL;
197   regoff_t *tmp_iptr;
198 
199 #ifdef TRE_MBSTATE
200   memset(&mbstate, '\0', sizeof(mbstate));
201 #endif /* TRE_MBSTATE */
202 
203   if (!match_tags)
204     num_tags = 0;
205   else
206     num_tags = tnfa->num_tags;
207 
208   /* Allocate memory for temporary data required for matching.	This needs to
209      be done for every matching operation to be thread safe.  This allocates
210      everything in a single large block with calloc(). */
211   {
212     size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213     char *tmp_buf;
214 
215     /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216      * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217     if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218       return REG_ESPACE;
219 
220     /* Likewise check rbytes. */
221     if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222       return REG_ESPACE;
223 
224     /* Likewise check pbytes. */
225     if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226       return REG_ESPACE;
227 
228     /* Compute the length of the block we need. */
229     tbytes = sizeof(*tmp_tags) * num_tags;
230     rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231     pbytes = sizeof(*reach_pos) * tnfa->num_states;
232     xbytes = sizeof(regoff_t) * num_tags;
233     total_bytes =
234       (sizeof(long) - 1) * 4 /* for alignment paddings */
235       + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236 
237     /* Allocate the memory. */
238     buf = calloc(total_bytes, 1);
239     if (buf == NULL)
240       return REG_ESPACE;
241 
242     /* Get the various pointers within tmp_buf (properly aligned). */
243     tmp_tags = (void *)buf;
244     tmp_buf = buf + tbytes;
245     tmp_buf += ALIGN(tmp_buf, long);
246     reach_next = (void *)tmp_buf;
247     tmp_buf += rbytes;
248     tmp_buf += ALIGN(tmp_buf, long);
249     reach = (void *)tmp_buf;
250     tmp_buf += rbytes;
251     tmp_buf += ALIGN(tmp_buf, long);
252     reach_pos = (void *)tmp_buf;
253     tmp_buf += pbytes;
254     tmp_buf += ALIGN(tmp_buf, long);
255     for (i = 0; i < tnfa->num_states; i++)
256       {
257 	reach[i].tags = (void *)tmp_buf;
258 	tmp_buf += xbytes;
259 	reach_next[i].tags = (void *)tmp_buf;
260 	tmp_buf += xbytes;
261       }
262   }
263 
264   for (i = 0; i < tnfa->num_states; i++)
265     reach_pos[i].pos = -1;
266 
267   GET_NEXT_WCHAR();
268   pos = 0;
269 
270   reach_next_i = reach_next;
271   while (1)
272     {
273       /* If no match found yet, add the initial states to `reach_next'. */
274       if (match_eo < 0)
275 	{
276 	  trans_i = tnfa->initial;
277 	  while (trans_i->state != NULL)
278 	    {
279 	      if (reach_pos[trans_i->state_id].pos < pos)
280 		{
281 		  if (trans_i->assertions
282 		      && CHECK_ASSERTIONS(trans_i->assertions))
283 		    {
284 		      trans_i++;
285 		      continue;
286 		    }
287 
288 		  reach_next_i->state = trans_i->state;
289 		  for (i = 0; i < num_tags; i++)
290 		    reach_next_i->tags[i] = -1;
291 		  tag_i = trans_i->tags;
292 		  if (tag_i)
293 		    while (*tag_i >= 0)
294 		      {
295 			if (*tag_i < num_tags)
296 			  reach_next_i->tags[*tag_i] = pos;
297 			tag_i++;
298 		      }
299 		  if (reach_next_i->state == tnfa->final)
300 		    {
301 		      match_eo = pos;
302 		      new_match = 1;
303 		      for (i = 0; i < num_tags; i++)
304 			match_tags[i] = reach_next_i->tags[i];
305 		    }
306 		  reach_pos[trans_i->state_id].pos = pos;
307 		  reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308 		  reach_next_i++;
309 		}
310 	      trans_i++;
311 	    }
312 	  reach_next_i->state = NULL;
313 	}
314       else
315 	{
316 	  if (num_tags == 0 || reach_next_i == reach_next)
317 	    /* We have found a match. */
318 	    break;
319 	}
320 
321       /* Check for end of string. */
322       if (!next_c) break;
323 
324       GET_NEXT_WCHAR();
325 
326       /* Swap `reach' and `reach_next'. */
327       reach_i = reach;
328       reach = reach_next;
329       reach_next = reach_i;
330 
331       /* For each state in `reach', weed out states that don't fulfill the
332 	 minimal matching conditions. */
333       if (tnfa->num_minimals && new_match)
334 	{
335 	  new_match = 0;
336 	  reach_next_i = reach_next;
337 	  for (reach_i = reach; reach_i->state; reach_i++)
338 	    {
339 	      int skip = 0;
340 	      for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341 		{
342 		  int end = tnfa->minimal_tags[i];
343 		  int start = tnfa->minimal_tags[i + 1];
344 		  if (end >= num_tags)
345 		    {
346 		      skip = 1;
347 		      break;
348 		    }
349 		  else if (reach_i->tags[start] == match_tags[start]
350 			   && reach_i->tags[end] < match_tags[end])
351 		    {
352 		      skip = 1;
353 		      break;
354 		    }
355 		}
356 	      if (!skip)
357 		{
358 		  reach_next_i->state = reach_i->state;
359 		  tmp_iptr = reach_next_i->tags;
360 		  reach_next_i->tags = reach_i->tags;
361 		  reach_i->tags = tmp_iptr;
362 		  reach_next_i++;
363 		}
364 	    }
365 	  reach_next_i->state = NULL;
366 
367 	  /* Swap `reach' and `reach_next'. */
368 	  reach_i = reach;
369 	  reach = reach_next;
370 	  reach_next = reach_i;
371 	}
372 
373       /* For each state in `reach' see if there is a transition leaving with
374 	 the current input symbol to a state not yet in `reach_next', and
375 	 add the destination states to `reach_next'. */
376       reach_next_i = reach_next;
377       for (reach_i = reach; reach_i->state; reach_i++)
378 	{
379 	  for (trans_i = reach_i->state; trans_i->state; trans_i++)
380 	    {
381 	      /* Does this transition match the input symbol? */
382 	      if (trans_i->code_min <= (tre_cint_t)prev_c &&
383 		  trans_i->code_max >= (tre_cint_t)prev_c)
384 		{
385 		  if (trans_i->assertions
386 		      && (CHECK_ASSERTIONS(trans_i->assertions)
387 			  || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388 		    {
389 		      continue;
390 		    }
391 
392 		  /* Compute the tags after this transition. */
393 		  for (i = 0; i < num_tags; i++)
394 		    tmp_tags[i] = reach_i->tags[i];
395 		  tag_i = trans_i->tags;
396 		  if (tag_i != NULL)
397 		    while (*tag_i >= 0)
398 		      {
399 			if (*tag_i < num_tags)
400 			  tmp_tags[*tag_i] = pos;
401 			tag_i++;
402 		      }
403 
404 		  if (reach_pos[trans_i->state_id].pos < pos)
405 		    {
406 		      /* Found an unvisited node. */
407 		      reach_next_i->state = trans_i->state;
408 		      tmp_iptr = reach_next_i->tags;
409 		      reach_next_i->tags = tmp_tags;
410 		      tmp_tags = tmp_iptr;
411 		      reach_pos[trans_i->state_id].pos = pos;
412 		      reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413 
414 		      if (reach_next_i->state == tnfa->final
415 			  && (match_eo == -1
416 			      || (num_tags > 0
417 				  && reach_next_i->tags[0] <= match_tags[0])))
418 			{
419 			  match_eo = pos;
420 			  new_match = 1;
421 			  for (i = 0; i < num_tags; i++)
422 			    match_tags[i] = reach_next_i->tags[i];
423 			}
424 		      reach_next_i++;
425 
426 		    }
427 		  else
428 		    {
429 		      assert(reach_pos[trans_i->state_id].pos == pos);
430 		      /* Another path has also reached this state.  We choose
431 			 the winner by examining the tag values for both
432 			 paths. */
433 		      if (tre_tag_order(num_tags, tnfa->tag_directions,
434 					tmp_tags,
435 					*reach_pos[trans_i->state_id].tags))
436 			{
437 			  /* The new path wins. */
438 			  tmp_iptr = *reach_pos[trans_i->state_id].tags;
439 			  *reach_pos[trans_i->state_id].tags = tmp_tags;
440 			  if (trans_i->state == tnfa->final)
441 			    {
442 			      match_eo = pos;
443 			      new_match = 1;
444 			      for (i = 0; i < num_tags; i++)
445 				match_tags[i] = tmp_tags[i];
446 			    }
447 			  tmp_tags = tmp_iptr;
448 			}
449 		    }
450 		}
451 	    }
452 	}
453       reach_next_i->state = NULL;
454     }
455 
456   *match_end_ofs = match_eo;
457   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458 error_exit:
459   xfree(buf);
460   return ret;
461 }
462 
463 
464 
465 /***********************************************************************
466  from tre-match-backtrack.c
467 ***********************************************************************/
468 
469 /*
470   This matcher is for regexps that use back referencing.  Regexp matching
471   with back referencing is an NP-complete problem on the number of back
472   references.  The easiest way to match them is to use a backtracking
473   routine which basically goes through all possible paths in the TNFA
474   and chooses the one which results in the best (leftmost and longest)
475   match.  This can be spectacularly expensive and may run out of stack
476   space, but there really is no better known generic algorithm.	 Quoting
477   Henry Spencer from comp.compilers:
478   <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479 
480     POSIX.2 REs require longest match, which is really exciting to
481     implement since the obsolete ("basic") variant also includes
482     \<digit>.  I haven't found a better way of tackling this than doing
483     a preliminary match using a DFA (or simulation) on a modified RE
484     that just replicates subREs for \<digit>, and then doing a
485     backtracking match to determine whether the subRE matches were
486     right.  This can be rather slow, but I console myself with the
487     thought that people who use \<digit> deserve very slow execution.
488     (Pun unintentional but very appropriate.)
489 
490 */
491 
492 typedef struct {
493   regoff_t pos;
494   const char *str_byte;
495   tre_tnfa_transition_t *state;
496   int state_id;
497   int next_c;
498   regoff_t *tags;
499 #ifdef TRE_MBSTATE
500   mbstate_t mbstate;
501 #endif /* TRE_MBSTATE */
502 } tre_backtrack_item_t;
503 
504 typedef struct tre_backtrack_struct {
505   tre_backtrack_item_t item;
506   struct tre_backtrack_struct *prev;
507   struct tre_backtrack_struct *next;
508 } *tre_backtrack_t;
509 
510 #ifdef TRE_MBSTATE
511 #define BT_STACK_MBSTATE_IN  stack->item.mbstate = (mbstate)
512 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513 #else /* !TRE_MBSTATE */
514 #define BT_STACK_MBSTATE_IN
515 #define BT_STACK_MBSTATE_OUT
516 #endif /* !TRE_MBSTATE */
517 
518 #define tre_bt_mem_new		  tre_mem_new
519 #define tre_bt_mem_alloc	  tre_mem_alloc
520 #define tre_bt_mem_destroy	  tre_mem_destroy
521 
522 
523 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524   do									      \
525     {									      \
526       int i;								      \
527       if (!stack->next)							      \
528 	{								      \
529 	  tre_backtrack_t s;						      \
530 	  s = tre_bt_mem_alloc(mem, sizeof(*s));			      \
531 	  if (!s)							      \
532 	    {								      \
533 	      tre_bt_mem_destroy(mem);					      \
534 	      if (tags)							      \
535 		xfree(tags);						      \
536 	      if (pmatch)						      \
537 		xfree(pmatch);						      \
538 	      if (states_seen)						      \
539 		xfree(states_seen);					      \
540 	      return REG_ESPACE;					      \
541 	    }								      \
542 	  s->prev = stack;						      \
543 	  s->next = NULL;						      \
544 	  s->item.tags = tre_bt_mem_alloc(mem,				      \
545 					  sizeof(*tags) * tnfa->num_tags);    \
546 	  if (!s->item.tags)						      \
547 	    {								      \
548 	      tre_bt_mem_destroy(mem);					      \
549 	      if (tags)							      \
550 		xfree(tags);						      \
551 	      if (pmatch)						      \
552 		xfree(pmatch);						      \
553 	      if (states_seen)						      \
554 		xfree(states_seen);					      \
555 	      return REG_ESPACE;					      \
556 	    }								      \
557 	  stack->next = s;						      \
558 	  stack = s;							      \
559 	}								      \
560       else								      \
561 	stack = stack->next;						      \
562       stack->item.pos = (_pos);						      \
563       stack->item.str_byte = (_str_byte);				      \
564       stack->item.state = (_state);					      \
565       stack->item.state_id = (_state_id);				      \
566       stack->item.next_c = (_next_c);					      \
567       for (i = 0; i < tnfa->num_tags; i++)				      \
568 	stack->item.tags[i] = (_tags)[i];				      \
569       BT_STACK_MBSTATE_IN;						      \
570     }									      \
571   while (0)
572 
573 #define BT_STACK_POP()							      \
574   do									      \
575     {									      \
576       int i;								      \
577       assert(stack->prev);						      \
578       pos = stack->item.pos;						      \
579       str_byte = stack->item.str_byte;					      \
580       state = stack->item.state;					      \
581       next_c = stack->item.next_c;					      \
582       for (i = 0; i < tnfa->num_tags; i++)				      \
583 	tags[i] = stack->item.tags[i];					      \
584       BT_STACK_MBSTATE_OUT;						      \
585       stack = stack->prev;						      \
586     }									      \
587   while (0)
588 
589 #undef MIN
590 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
591 
592 static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)593 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
594 		       regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
595 {
596   /* State variables required by GET_NEXT_WCHAR. */
597   tre_char_t prev_c = 0, next_c = 0;
598   const char *str_byte = string;
599   regoff_t pos = 0;
600   regoff_t pos_add_next = 1;
601 #ifdef TRE_MBSTATE
602   mbstate_t mbstate;
603 #endif /* TRE_MBSTATE */
604   int reg_notbol = eflags & REG_NOTBOL;
605   int reg_noteol = eflags & REG_NOTEOL;
606   int reg_newline = tnfa->cflags & REG_NEWLINE;
607 
608   /* These are used to remember the necessary values of the above
609      variables to return to the position where the current search
610      started from. */
611   int next_c_start;
612   const char *str_byte_start;
613   regoff_t pos_start = -1;
614 #ifdef TRE_MBSTATE
615   mbstate_t mbstate_start;
616 #endif /* TRE_MBSTATE */
617 
618   /* End offset of best match so far, or -1 if no match found yet. */
619   regoff_t match_eo = -1;
620   /* Tag arrays. */
621   int *next_tags;
622   regoff_t *tags = NULL;
623   /* Current TNFA state. */
624   tre_tnfa_transition_t *state;
625   int *states_seen = NULL;
626 
627   /* Memory allocator to for allocating the backtracking stack. */
628   tre_mem_t mem = tre_bt_mem_new();
629 
630   /* The backtracking stack. */
631   tre_backtrack_t stack;
632 
633   tre_tnfa_transition_t *trans_i;
634   regmatch_t *pmatch = NULL;
635   int ret;
636 
637 #ifdef TRE_MBSTATE
638   memset(&mbstate, '\0', sizeof(mbstate));
639 #endif /* TRE_MBSTATE */
640 
641   if (!mem)
642     return REG_ESPACE;
643   stack = tre_bt_mem_alloc(mem, sizeof(*stack));
644   if (!stack)
645     {
646       ret = REG_ESPACE;
647       goto error_exit;
648     }
649   stack->prev = NULL;
650   stack->next = NULL;
651 
652   if (tnfa->num_tags)
653     {
654       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
655       if (!tags)
656 	{
657 	  ret = REG_ESPACE;
658 	  goto error_exit;
659 	}
660     }
661   if (tnfa->num_submatches)
662     {
663       pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
664       if (!pmatch)
665 	{
666 	  ret = REG_ESPACE;
667 	  goto error_exit;
668 	}
669     }
670   if (tnfa->num_states)
671     {
672       states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
673       if (!states_seen)
674 	{
675 	  ret = REG_ESPACE;
676 	  goto error_exit;
677 	}
678     }
679 
680  retry:
681   {
682     int i;
683     for (i = 0; i < tnfa->num_tags; i++)
684       {
685 	tags[i] = -1;
686 	if (match_tags)
687 	  match_tags[i] = -1;
688       }
689     for (i = 0; i < tnfa->num_states; i++)
690       states_seen[i] = 0;
691   }
692 
693   state = NULL;
694   pos = pos_start;
695   GET_NEXT_WCHAR();
696   pos_start = pos;
697   next_c_start = next_c;
698   str_byte_start = str_byte;
699 #ifdef TRE_MBSTATE
700   mbstate_start = mbstate;
701 #endif /* TRE_MBSTATE */
702 
703   /* Handle initial states. */
704   next_tags = NULL;
705   for (trans_i = tnfa->initial; trans_i->state; trans_i++)
706     {
707       if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
708 	{
709 	  continue;
710 	}
711       if (state == NULL)
712 	{
713 	  /* Start from this state. */
714 	  state = trans_i->state;
715 	  next_tags = trans_i->tags;
716 	}
717       else
718 	{
719 	  /* Backtrack to this state. */
720 	  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
721 			trans_i->state_id, next_c, tags, mbstate);
722 	  {
723 	    int *tmp = trans_i->tags;
724 	    if (tmp)
725 	      while (*tmp >= 0)
726 		stack->item.tags[*tmp++] = pos;
727 	  }
728 	}
729     }
730 
731   if (next_tags)
732     for (; *next_tags >= 0; next_tags++)
733       tags[*next_tags] = pos;
734 
735 
736   if (state == NULL)
737     goto backtrack;
738 
739   while (1)
740     {
741       tre_tnfa_transition_t *next_state;
742       int empty_br_match;
743 
744       if (state == tnfa->final)
745 	{
746 	  if (match_eo < pos
747 	      || (match_eo == pos
748 		  && match_tags
749 		  && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
750 				   tags, match_tags)))
751 	    {
752 	      int i;
753 	      /* This match wins the previous match. */
754 	      match_eo = pos;
755 	      if (match_tags)
756 		for (i = 0; i < tnfa->num_tags; i++)
757 		  match_tags[i] = tags[i];
758 	    }
759 	  /* Our TNFAs never have transitions leaving from the final state,
760 	     so we jump right to backtracking. */
761 	  goto backtrack;
762 	}
763 
764       /* Go to the next character in the input string. */
765       empty_br_match = 0;
766       trans_i = state;
767       if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
768 	{
769 	  /* This is a back reference state.  All transitions leaving from
770 	     this state have the same back reference "assertion".  Instead
771 	     of reading the next character, we match the back reference. */
772 	  regoff_t so, eo;
773 	  int bt = trans_i->u.backref;
774 	  regoff_t bt_len;
775 	  int result;
776 
777 	  /* Get the substring we need to match against.  Remember to
778 	     turn off REG_NOSUB temporarily. */
779 	  tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
780 			  tnfa, tags, pos);
781 	  so = pmatch[bt].rm_so;
782 	  eo = pmatch[bt].rm_eo;
783 	  bt_len = eo - so;
784 
785 	  result = strncmp((const char*)string + so, str_byte - 1,
786 				 (size_t)bt_len);
787 
788 	  if (result == 0)
789 	    {
790 	      /* Back reference matched.  Check for infinite loop. */
791 	      if (bt_len == 0)
792 		empty_br_match = 1;
793 	      if (empty_br_match && states_seen[trans_i->state_id])
794 		{
795 		  goto backtrack;
796 		}
797 
798 	      states_seen[trans_i->state_id] = empty_br_match;
799 
800 	      /* Advance in input string and resync `prev_c', `next_c'
801 		 and pos. */
802 	      str_byte += bt_len - 1;
803 	      pos += bt_len - 1;
804 	      GET_NEXT_WCHAR();
805 	    }
806 	  else
807 	    {
808 	      goto backtrack;
809 	    }
810 	}
811       else
812 	{
813 	  /* Check for end of string. */
814 	  if (next_c == L'\0')
815 		goto backtrack;
816 
817 	  /* Read the next character. */
818 	  GET_NEXT_WCHAR();
819 	}
820 
821       next_state = NULL;
822       for (trans_i = state; trans_i->state; trans_i++)
823 	{
824 	  if (trans_i->code_min <= (tre_cint_t)prev_c
825 	      && trans_i->code_max >= (tre_cint_t)prev_c)
826 	    {
827 	      if (trans_i->assertions
828 		  && (CHECK_ASSERTIONS(trans_i->assertions)
829 		      || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
830 		{
831 		  continue;
832 		}
833 
834 	      if (next_state == NULL)
835 		{
836 		  /* First matching transition. */
837 		  next_state = trans_i->state;
838 		  next_tags = trans_i->tags;
839 		}
840 	      else
841 		{
842 		  /* Second matching transition.  We may need to backtrack here
843 		     to take this transition instead of the first one, so we
844 		     push this transition in the backtracking stack so we can
845 		     jump back here if needed. */
846 		  BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
847 				trans_i->state_id, next_c, tags, mbstate);
848 		  {
849 		    int *tmp;
850 		    for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
851 		      stack->item.tags[*tmp] = pos;
852 		  }
853 #if 0 /* XXX - it's important not to look at all transitions here to keep
854 	 the stack small! */
855 		  break;
856 #endif
857 		}
858 	    }
859 	}
860 
861       if (next_state != NULL)
862 	{
863 	  /* Matching transitions were found.  Take the first one. */
864 	  state = next_state;
865 
866 	  /* Update the tag values. */
867 	  if (next_tags)
868 	    while (*next_tags >= 0)
869 	      tags[*next_tags++] = pos;
870 	}
871       else
872 	{
873 	backtrack:
874 	  /* A matching transition was not found.  Try to backtrack. */
875 	  if (stack->prev)
876 	    {
877 	      if (stack->item.state->assertions & ASSERT_BACKREF)
878 		{
879 		  states_seen[stack->item.state_id] = 0;
880 		}
881 
882 	      BT_STACK_POP();
883 	    }
884 	  else if (match_eo < 0)
885 	    {
886 	      /* Try starting from a later position in the input string. */
887 	      /* Check for end of string. */
888 	      if (next_c == L'\0')
889 		    {
890 		      break;
891 		    }
892 	      next_c = next_c_start;
893 #ifdef TRE_MBSTATE
894 	      mbstate = mbstate_start;
895 #endif /* TRE_MBSTATE */
896 	      str_byte = str_byte_start;
897 	      goto retry;
898 	    }
899 	  else
900 	    {
901 	      break;
902 	    }
903 	}
904     }
905 
906   ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
907   *match_end_ofs = match_eo;
908 
909  error_exit:
910   tre_bt_mem_destroy(mem);
911 #ifndef TRE_USE_ALLOCA
912   if (tags)
913     xfree(tags);
914   if (pmatch)
915     xfree(pmatch);
916   if (states_seen)
917     xfree(states_seen);
918 #endif /* !TRE_USE_ALLOCA */
919 
920   return ret;
921 }
922 
923 /***********************************************************************
924  from regexec.c
925 ***********************************************************************/
926 
927 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
928    endpoint values. */
929 static void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,regoff_t * tags,regoff_t match_eo)930 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
931 		const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
932 {
933   tre_submatch_data_t *submatch_data;
934   unsigned int i, j;
935   int *parents;
936 
937   i = 0;
938   if (match_eo >= 0 && !(cflags & REG_NOSUB))
939     {
940       /* Construct submatch offsets from the tags. */
941       submatch_data = tnfa->submatch_data;
942       while (i < tnfa->num_submatches && i < nmatch)
943 	{
944 	  if (submatch_data[i].so_tag == tnfa->end_tag)
945 	    pmatch[i].rm_so = match_eo;
946 	  else
947 	    pmatch[i].rm_so = tags[submatch_data[i].so_tag];
948 
949 	  if (submatch_data[i].eo_tag == tnfa->end_tag)
950 	    pmatch[i].rm_eo = match_eo;
951 	  else
952 	    pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
953 
954 	  /* If either of the endpoints were not used, this submatch
955 	     was not part of the match. */
956 	  if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
957 	    pmatch[i].rm_so = pmatch[i].rm_eo = -1;
958 
959 	  i++;
960 	}
961       /* Reset all submatches that are not within all of their parent
962 	 submatches. */
963       i = 0;
964       while (i < tnfa->num_submatches && i < nmatch)
965 	{
966 	  if (pmatch[i].rm_eo == -1)
967 	    assert(pmatch[i].rm_so == -1);
968 	  assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
969 
970 	  parents = submatch_data[i].parents;
971 	  if (parents != NULL)
972 	    for (j = 0; parents[j] >= 0; j++)
973 	      {
974 		if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
975 		    || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
976 		  pmatch[i].rm_so = pmatch[i].rm_eo = -1;
977 	      }
978 	  i++;
979 	}
980     }
981 
982   while (i < nmatch)
983     {
984       pmatch[i].rm_so = -1;
985       pmatch[i].rm_eo = -1;
986       i++;
987     }
988 }
989 
990 
991 /*
992   Wrapper functions for POSIX compatible regexp matching.
993 */
994 
995 int
regexec(const regex_t * restrict preg,const char * restrict string,size_t nmatch,regmatch_t pmatch[restrict],int eflags)996 regexec(const regex_t *restrict preg, const char *restrict string,
997 	  size_t nmatch, regmatch_t pmatch[restrict], int eflags)
998 {
999   tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1000   reg_errcode_t status;
1001   regoff_t *tags = NULL, eo;
1002   if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1003   if (tnfa->num_tags > 0 && nmatch > 0)
1004     {
1005       tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1006       if (tags == NULL)
1007 	return REG_ESPACE;
1008     }
1009 
1010   /* Dispatch to the appropriate matcher. */
1011   if (tnfa->have_backrefs)
1012     {
1013       /* The regex has back references, use the backtracking matcher. */
1014       status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1015     }
1016   else
1017     {
1018       /* Exact matching, no back references, use the parallel matcher. */
1019       status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1020     }
1021 
1022   if (status == REG_OK)
1023     /* A match was found, so fill the submatch registers. */
1024     tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1025   if (tags)
1026     xfree(tags);
1027   return status;
1028 }
1029