1 /*
2 regexec.c - TRE POSIX compatible matching functions (and more).
3
4 Copyright (c) 2001-2009 Ville Laurikari <vl@iki.fi>
5 All rights reserved.
6
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions
9 are met:
10
11 1. Redistributions of source code must retain the above copyright
12 notice, this list of conditions and the following disclaimer.
13
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
17
18 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
19 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 */
31
32 #include <stdlib.h>
33 #include <string.h>
34 #include <wchar.h>
35 #include <wctype.h>
36 #include <limits.h>
37 #include <stdint.h>
38
39 #include <regex.h>
40
41 #include "tre.h"
42
43 #include <assert.h>
44
45 static void
46 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
47 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo);
48
49 /***********************************************************************
50 from tre-match-utils.h
51 ***********************************************************************/
52
53 #define GET_NEXT_WCHAR() do { \
54 prev_c = next_c; pos += pos_add_next; \
55 if ((pos_add_next = mbtowc(&next_c, str_byte, MB_LEN_MAX)) <= 0) { \
56 if (pos_add_next < 0) { ret = REG_NOMATCH; goto error_exit; } \
57 else pos_add_next++; \
58 } \
59 str_byte += pos_add_next; \
60 } while (0)
61
62 #define IS_WORD_CHAR(c) ((c) == L'_' || tre_isalnum(c))
63
64 #define CHECK_ASSERTIONS(assertions) \
65 (((assertions & ASSERT_AT_BOL) \
66 && (pos > 0 || reg_notbol) \
67 && (prev_c != L'\n' || !reg_newline)) \
68 || ((assertions & ASSERT_AT_EOL) \
69 && (next_c != L'\0' || reg_noteol) \
70 && (next_c != L'\n' || !reg_newline)) \
71 || ((assertions & ASSERT_AT_BOW) \
72 && (IS_WORD_CHAR(prev_c) || !IS_WORD_CHAR(next_c))) \
73 || ((assertions & ASSERT_AT_EOW) \
74 && (!IS_WORD_CHAR(prev_c) || IS_WORD_CHAR(next_c))) \
75 || ((assertions & ASSERT_AT_WB) \
76 && (pos != 0 && next_c != L'\0' \
77 && IS_WORD_CHAR(prev_c) == IS_WORD_CHAR(next_c))) \
78 || ((assertions & ASSERT_AT_WB_NEG) \
79 && (pos == 0 || next_c == L'\0' \
80 || IS_WORD_CHAR(prev_c) != IS_WORD_CHAR(next_c))))
81
82 #define CHECK_CHAR_CLASSES(trans_i, tnfa, eflags) \
83 (((trans_i->assertions & ASSERT_CHAR_CLASS) \
84 && !(tnfa->cflags & REG_ICASE) \
85 && !tre_isctype((tre_cint_t)prev_c, trans_i->u.class)) \
86 || ((trans_i->assertions & ASSERT_CHAR_CLASS) \
87 && (tnfa->cflags & REG_ICASE) \
88 && !tre_isctype(tre_tolower((tre_cint_t)prev_c),trans_i->u.class) \
89 && !tre_isctype(tre_toupper((tre_cint_t)prev_c),trans_i->u.class)) \
90 || ((trans_i->assertions & ASSERT_CHAR_CLASS_NEG) \
91 && tre_neg_char_classes_match(trans_i->neg_classes,(tre_cint_t)prev_c,\
92 tnfa->cflags & REG_ICASE)))
93
94
95
96
97 /* Returns 1 if `t1' wins `t2', 0 otherwise. */
98 static int
tre_tag_order(int num_tags,tre_tag_direction_t * tag_directions,regoff_t * t1,regoff_t * t2)99 tre_tag_order(int num_tags, tre_tag_direction_t *tag_directions,
100 regoff_t *t1, regoff_t *t2)
101 {
102 int i;
103 for (i = 0; i < num_tags; i++)
104 {
105 if (tag_directions[i] == TRE_TAG_MINIMIZE)
106 {
107 if (t1[i] < t2[i])
108 return 1;
109 if (t1[i] > t2[i])
110 return 0;
111 }
112 else
113 {
114 if (t1[i] > t2[i])
115 return 1;
116 if (t1[i] < t2[i])
117 return 0;
118 }
119 }
120 /* assert(0);*/
121 return 0;
122 }
123
124 static int
tre_neg_char_classes_match(tre_ctype_t * classes,tre_cint_t wc,int icase)125 tre_neg_char_classes_match(tre_ctype_t *classes, tre_cint_t wc, int icase)
126 {
127 while (*classes != (tre_ctype_t)0)
128 if ((!icase && tre_isctype(wc, *classes))
129 || (icase && (tre_isctype(tre_toupper(wc), *classes)
130 || tre_isctype(tre_tolower(wc), *classes))))
131 return 1; /* Match. */
132 else
133 classes++;
134 return 0; /* No match. */
135 }
136
137
138 /***********************************************************************
139 from tre-match-parallel.c
140 ***********************************************************************/
141
142 /*
143 This algorithm searches for matches basically by reading characters
144 in the searched string one by one, starting at the beginning. All
145 matching paths in the TNFA are traversed in parallel. When two or
146 more paths reach the same state, exactly one is chosen according to
147 tag ordering rules; if returning submatches is not required it does
148 not matter which path is chosen.
149
150 The worst case time required for finding the leftmost and longest
151 match, or determining that there is no match, is always linearly
152 dependent on the length of the text being searched.
153
154 This algorithm cannot handle TNFAs with back referencing nodes.
155 See `tre-match-backtrack.c'.
156 */
157
158 typedef struct {
159 tre_tnfa_transition_t *state;
160 regoff_t *tags;
161 } tre_tnfa_reach_t;
162
163 typedef struct {
164 regoff_t pos;
165 regoff_t **tags;
166 } tre_reach_pos_t;
167
168
169 static reg_errcode_t
tre_tnfa_run_parallel(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)170 tre_tnfa_run_parallel(const tre_tnfa_t *tnfa, const void *string,
171 regoff_t *match_tags, int eflags,
172 regoff_t *match_end_ofs)
173 {
174 /* State variables required by GET_NEXT_WCHAR. */
175 tre_char_t prev_c = 0, next_c = 0;
176 const char *str_byte = string;
177 regoff_t pos = -1;
178 regoff_t pos_add_next = 1;
179 #ifdef TRE_MBSTATE
180 mbstate_t mbstate;
181 #endif /* TRE_MBSTATE */
182 int reg_notbol = eflags & REG_NOTBOL;
183 int reg_noteol = eflags & REG_NOTEOL;
184 int reg_newline = tnfa->cflags & REG_NEWLINE;
185 reg_errcode_t ret;
186
187 char *buf;
188 tre_tnfa_transition_t *trans_i;
189 tre_tnfa_reach_t *reach, *reach_next, *reach_i, *reach_next_i;
190 tre_reach_pos_t *reach_pos;
191 int *tag_i;
192 int num_tags, i;
193
194 regoff_t match_eo = -1; /* end offset of match (-1 if no match found yet) */
195 int new_match = 0;
196 regoff_t *tmp_tags = NULL;
197 regoff_t *tmp_iptr;
198
199 #ifdef TRE_MBSTATE
200 memset(&mbstate, '\0', sizeof(mbstate));
201 #endif /* TRE_MBSTATE */
202
203 if (!match_tags)
204 num_tags = 0;
205 else
206 num_tags = tnfa->num_tags;
207
208 /* Allocate memory for temporary data required for matching. This needs to
209 be done for every matching operation to be thread safe. This allocates
210 everything in a single large block with calloc(). */
211 {
212 size_t tbytes, rbytes, pbytes, xbytes, total_bytes;
213 char *tmp_buf;
214
215 /* Ensure that tbytes and xbytes*num_states cannot overflow, and that
216 * they don't contribute more than 1/8 of SIZE_MAX to total_bytes. */
217 if (num_tags > SIZE_MAX/(8 * sizeof(regoff_t) * tnfa->num_states))
218 return REG_ESPACE;
219
220 /* Likewise check rbytes. */
221 if (tnfa->num_states+1 > SIZE_MAX/(8 * sizeof(*reach_next)))
222 return REG_ESPACE;
223
224 /* Likewise check pbytes. */
225 if (tnfa->num_states > SIZE_MAX/(8 * sizeof(*reach_pos)))
226 return REG_ESPACE;
227
228 /* Compute the length of the block we need. */
229 tbytes = sizeof(*tmp_tags) * num_tags;
230 rbytes = sizeof(*reach_next) * (tnfa->num_states + 1);
231 pbytes = sizeof(*reach_pos) * tnfa->num_states;
232 xbytes = sizeof(regoff_t) * num_tags;
233 total_bytes =
234 (sizeof(long) - 1) * 4 /* for alignment paddings */
235 + (rbytes + xbytes * tnfa->num_states) * 2 + tbytes + pbytes;
236
237 /* Allocate the memory. */
238 buf = calloc(total_bytes, 1);
239 if (buf == NULL)
240 return REG_ESPACE;
241
242 /* Get the various pointers within tmp_buf (properly aligned). */
243 tmp_tags = (void *)buf;
244 tmp_buf = buf + tbytes;
245 tmp_buf += ALIGN(tmp_buf, long);
246 reach_next = (void *)tmp_buf;
247 tmp_buf += rbytes;
248 tmp_buf += ALIGN(tmp_buf, long);
249 reach = (void *)tmp_buf;
250 tmp_buf += rbytes;
251 tmp_buf += ALIGN(tmp_buf, long);
252 reach_pos = (void *)tmp_buf;
253 tmp_buf += pbytes;
254 tmp_buf += ALIGN(tmp_buf, long);
255 for (i = 0; i < tnfa->num_states; i++)
256 {
257 reach[i].tags = (void *)tmp_buf;
258 tmp_buf += xbytes;
259 reach_next[i].tags = (void *)tmp_buf;
260 tmp_buf += xbytes;
261 }
262 }
263
264 for (i = 0; i < tnfa->num_states; i++)
265 reach_pos[i].pos = -1;
266
267 GET_NEXT_WCHAR();
268 pos = 0;
269
270 reach_next_i = reach_next;
271 while (1)
272 {
273 /* If no match found yet, add the initial states to `reach_next'. */
274 if (match_eo < 0)
275 {
276 trans_i = tnfa->initial;
277 while (trans_i->state != NULL)
278 {
279 if (reach_pos[trans_i->state_id].pos < pos)
280 {
281 if (trans_i->assertions
282 && CHECK_ASSERTIONS(trans_i->assertions))
283 {
284 trans_i++;
285 continue;
286 }
287
288 reach_next_i->state = trans_i->state;
289 for (i = 0; i < num_tags; i++)
290 reach_next_i->tags[i] = -1;
291 tag_i = trans_i->tags;
292 if (tag_i)
293 while (*tag_i >= 0)
294 {
295 if (*tag_i < num_tags)
296 reach_next_i->tags[*tag_i] = pos;
297 tag_i++;
298 }
299 if (reach_next_i->state == tnfa->final)
300 {
301 match_eo = pos;
302 new_match = 1;
303 for (i = 0; i < num_tags; i++)
304 match_tags[i] = reach_next_i->tags[i];
305 }
306 reach_pos[trans_i->state_id].pos = pos;
307 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
308 reach_next_i++;
309 }
310 trans_i++;
311 }
312 reach_next_i->state = NULL;
313 }
314 else
315 {
316 if (num_tags == 0 || reach_next_i == reach_next)
317 /* We have found a match. */
318 break;
319 }
320
321 /* Check for end of string. */
322 if (!next_c) break;
323
324 GET_NEXT_WCHAR();
325
326 /* Swap `reach' and `reach_next'. */
327 reach_i = reach;
328 reach = reach_next;
329 reach_next = reach_i;
330
331 /* For each state in `reach', weed out states that don't fulfill the
332 minimal matching conditions. */
333 if (tnfa->num_minimals && new_match)
334 {
335 new_match = 0;
336 reach_next_i = reach_next;
337 for (reach_i = reach; reach_i->state; reach_i++)
338 {
339 int skip = 0;
340 for (i = 0; tnfa->minimal_tags[i] >= 0; i += 2)
341 {
342 int end = tnfa->minimal_tags[i];
343 int start = tnfa->minimal_tags[i + 1];
344 if (end >= num_tags)
345 {
346 skip = 1;
347 break;
348 }
349 else if (reach_i->tags[start] == match_tags[start]
350 && reach_i->tags[end] < match_tags[end])
351 {
352 skip = 1;
353 break;
354 }
355 }
356 if (!skip)
357 {
358 reach_next_i->state = reach_i->state;
359 tmp_iptr = reach_next_i->tags;
360 reach_next_i->tags = reach_i->tags;
361 reach_i->tags = tmp_iptr;
362 reach_next_i++;
363 }
364 }
365 reach_next_i->state = NULL;
366
367 /* Swap `reach' and `reach_next'. */
368 reach_i = reach;
369 reach = reach_next;
370 reach_next = reach_i;
371 }
372
373 /* For each state in `reach' see if there is a transition leaving with
374 the current input symbol to a state not yet in `reach_next', and
375 add the destination states to `reach_next'. */
376 reach_next_i = reach_next;
377 for (reach_i = reach; reach_i->state; reach_i++)
378 {
379 for (trans_i = reach_i->state; trans_i->state; trans_i++)
380 {
381 /* Does this transition match the input symbol? */
382 if (trans_i->code_min <= (tre_cint_t)prev_c &&
383 trans_i->code_max >= (tre_cint_t)prev_c)
384 {
385 if (trans_i->assertions
386 && (CHECK_ASSERTIONS(trans_i->assertions)
387 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
388 {
389 continue;
390 }
391
392 /* Compute the tags after this transition. */
393 for (i = 0; i < num_tags; i++)
394 tmp_tags[i] = reach_i->tags[i];
395 tag_i = trans_i->tags;
396 if (tag_i != NULL)
397 while (*tag_i >= 0)
398 {
399 if (*tag_i < num_tags)
400 tmp_tags[*tag_i] = pos;
401 tag_i++;
402 }
403
404 if (reach_pos[trans_i->state_id].pos < pos)
405 {
406 /* Found an unvisited node. */
407 reach_next_i->state = trans_i->state;
408 tmp_iptr = reach_next_i->tags;
409 reach_next_i->tags = tmp_tags;
410 tmp_tags = tmp_iptr;
411 reach_pos[trans_i->state_id].pos = pos;
412 reach_pos[trans_i->state_id].tags = &reach_next_i->tags;
413
414 if (reach_next_i->state == tnfa->final
415 && (match_eo == -1
416 || (num_tags > 0
417 && reach_next_i->tags[0] <= match_tags[0])))
418 {
419 match_eo = pos;
420 new_match = 1;
421 for (i = 0; i < num_tags; i++)
422 match_tags[i] = reach_next_i->tags[i];
423 }
424 reach_next_i++;
425
426 }
427 else
428 {
429 assert(reach_pos[trans_i->state_id].pos == pos);
430 /* Another path has also reached this state. We choose
431 the winner by examining the tag values for both
432 paths. */
433 if (tre_tag_order(num_tags, tnfa->tag_directions,
434 tmp_tags,
435 *reach_pos[trans_i->state_id].tags))
436 {
437 /* The new path wins. */
438 tmp_iptr = *reach_pos[trans_i->state_id].tags;
439 *reach_pos[trans_i->state_id].tags = tmp_tags;
440 if (trans_i->state == tnfa->final)
441 {
442 match_eo = pos;
443 new_match = 1;
444 for (i = 0; i < num_tags; i++)
445 match_tags[i] = tmp_tags[i];
446 }
447 tmp_tags = tmp_iptr;
448 }
449 }
450 }
451 }
452 }
453 reach_next_i->state = NULL;
454 }
455
456 *match_end_ofs = match_eo;
457 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
458 error_exit:
459 xfree(buf);
460 return ret;
461 }
462
463
464
465 /***********************************************************************
466 from tre-match-backtrack.c
467 ***********************************************************************/
468
469 /*
470 This matcher is for regexps that use back referencing. Regexp matching
471 with back referencing is an NP-complete problem on the number of back
472 references. The easiest way to match them is to use a backtracking
473 routine which basically goes through all possible paths in the TNFA
474 and chooses the one which results in the best (leftmost and longest)
475 match. This can be spectacularly expensive and may run out of stack
476 space, but there really is no better known generic algorithm. Quoting
477 Henry Spencer from comp.compilers:
478 <URL: http://compilers.iecc.com/comparch/article/93-03-102>
479
480 POSIX.2 REs require longest match, which is really exciting to
481 implement since the obsolete ("basic") variant also includes
482 \<digit>. I haven't found a better way of tackling this than doing
483 a preliminary match using a DFA (or simulation) on a modified RE
484 that just replicates subREs for \<digit>, and then doing a
485 backtracking match to determine whether the subRE matches were
486 right. This can be rather slow, but I console myself with the
487 thought that people who use \<digit> deserve very slow execution.
488 (Pun unintentional but very appropriate.)
489
490 */
491
492 typedef struct {
493 regoff_t pos;
494 const char *str_byte;
495 tre_tnfa_transition_t *state;
496 int state_id;
497 int next_c;
498 regoff_t *tags;
499 #ifdef TRE_MBSTATE
500 mbstate_t mbstate;
501 #endif /* TRE_MBSTATE */
502 } tre_backtrack_item_t;
503
504 typedef struct tre_backtrack_struct {
505 tre_backtrack_item_t item;
506 struct tre_backtrack_struct *prev;
507 struct tre_backtrack_struct *next;
508 } *tre_backtrack_t;
509
510 #ifdef TRE_MBSTATE
511 #define BT_STACK_MBSTATE_IN stack->item.mbstate = (mbstate)
512 #define BT_STACK_MBSTATE_OUT (mbstate) = stack->item.mbstate
513 #else /* !TRE_MBSTATE */
514 #define BT_STACK_MBSTATE_IN
515 #define BT_STACK_MBSTATE_OUT
516 #endif /* !TRE_MBSTATE */
517
518 #define tre_bt_mem_new tre_mem_new
519 #define tre_bt_mem_alloc tre_mem_alloc
520 #define tre_bt_mem_destroy tre_mem_destroy
521
522
523 #define BT_STACK_PUSH(_pos, _str_byte, _str_wide, _state, _state_id, _next_c, _tags, _mbstate) \
524 do \
525 { \
526 int i; \
527 if (!stack->next) \
528 { \
529 tre_backtrack_t s; \
530 s = tre_bt_mem_alloc(mem, sizeof(*s)); \
531 if (!s) \
532 { \
533 tre_bt_mem_destroy(mem); \
534 if (tags) \
535 xfree(tags); \
536 if (pmatch) \
537 xfree(pmatch); \
538 if (states_seen) \
539 xfree(states_seen); \
540 return REG_ESPACE; \
541 } \
542 s->prev = stack; \
543 s->next = NULL; \
544 s->item.tags = tre_bt_mem_alloc(mem, \
545 sizeof(*tags) * tnfa->num_tags); \
546 if (!s->item.tags) \
547 { \
548 tre_bt_mem_destroy(mem); \
549 if (tags) \
550 xfree(tags); \
551 if (pmatch) \
552 xfree(pmatch); \
553 if (states_seen) \
554 xfree(states_seen); \
555 return REG_ESPACE; \
556 } \
557 stack->next = s; \
558 stack = s; \
559 } \
560 else \
561 stack = stack->next; \
562 stack->item.pos = (_pos); \
563 stack->item.str_byte = (_str_byte); \
564 stack->item.state = (_state); \
565 stack->item.state_id = (_state_id); \
566 stack->item.next_c = (_next_c); \
567 for (i = 0; i < tnfa->num_tags; i++) \
568 stack->item.tags[i] = (_tags)[i]; \
569 BT_STACK_MBSTATE_IN; \
570 } \
571 while (0)
572
573 #define BT_STACK_POP() \
574 do \
575 { \
576 int i; \
577 assert(stack->prev); \
578 pos = stack->item.pos; \
579 str_byte = stack->item.str_byte; \
580 state = stack->item.state; \
581 next_c = stack->item.next_c; \
582 for (i = 0; i < tnfa->num_tags; i++) \
583 tags[i] = stack->item.tags[i]; \
584 BT_STACK_MBSTATE_OUT; \
585 stack = stack->prev; \
586 } \
587 while (0)
588
589 #undef MIN
590 #define MIN(a, b) ((a) <= (b) ? (a) : (b))
591
592 static reg_errcode_t
tre_tnfa_run_backtrack(const tre_tnfa_t * tnfa,const void * string,regoff_t * match_tags,int eflags,regoff_t * match_end_ofs)593 tre_tnfa_run_backtrack(const tre_tnfa_t *tnfa, const void *string,
594 regoff_t *match_tags, int eflags, regoff_t *match_end_ofs)
595 {
596 /* State variables required by GET_NEXT_WCHAR. */
597 tre_char_t prev_c = 0, next_c = 0;
598 const char *str_byte = string;
599 regoff_t pos = 0;
600 regoff_t pos_add_next = 1;
601 #ifdef TRE_MBSTATE
602 mbstate_t mbstate;
603 #endif /* TRE_MBSTATE */
604 int reg_notbol = eflags & REG_NOTBOL;
605 int reg_noteol = eflags & REG_NOTEOL;
606 int reg_newline = tnfa->cflags & REG_NEWLINE;
607
608 /* These are used to remember the necessary values of the above
609 variables to return to the position where the current search
610 started from. */
611 int next_c_start;
612 const char *str_byte_start;
613 regoff_t pos_start = -1;
614 #ifdef TRE_MBSTATE
615 mbstate_t mbstate_start;
616 #endif /* TRE_MBSTATE */
617
618 /* End offset of best match so far, or -1 if no match found yet. */
619 regoff_t match_eo = -1;
620 /* Tag arrays. */
621 int *next_tags;
622 regoff_t *tags = NULL;
623 /* Current TNFA state. */
624 tre_tnfa_transition_t *state;
625 int *states_seen = NULL;
626
627 /* Memory allocator to for allocating the backtracking stack. */
628 tre_mem_t mem = tre_bt_mem_new();
629
630 /* The backtracking stack. */
631 tre_backtrack_t stack;
632
633 tre_tnfa_transition_t *trans_i;
634 regmatch_t *pmatch = NULL;
635 int ret;
636
637 #ifdef TRE_MBSTATE
638 memset(&mbstate, '\0', sizeof(mbstate));
639 #endif /* TRE_MBSTATE */
640
641 if (!mem)
642 return REG_ESPACE;
643 stack = tre_bt_mem_alloc(mem, sizeof(*stack));
644 if (!stack)
645 {
646 ret = REG_ESPACE;
647 goto error_exit;
648 }
649 stack->prev = NULL;
650 stack->next = NULL;
651
652 if (tnfa->num_tags)
653 {
654 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
655 if (!tags)
656 {
657 ret = REG_ESPACE;
658 goto error_exit;
659 }
660 }
661 if (tnfa->num_submatches)
662 {
663 pmatch = xmalloc(sizeof(*pmatch) * tnfa->num_submatches);
664 if (!pmatch)
665 {
666 ret = REG_ESPACE;
667 goto error_exit;
668 }
669 }
670 if (tnfa->num_states)
671 {
672 states_seen = xmalloc(sizeof(*states_seen) * tnfa->num_states);
673 if (!states_seen)
674 {
675 ret = REG_ESPACE;
676 goto error_exit;
677 }
678 }
679
680 retry:
681 {
682 int i;
683 for (i = 0; i < tnfa->num_tags; i++)
684 {
685 tags[i] = -1;
686 if (match_tags)
687 match_tags[i] = -1;
688 }
689 for (i = 0; i < tnfa->num_states; i++)
690 states_seen[i] = 0;
691 }
692
693 state = NULL;
694 pos = pos_start;
695 GET_NEXT_WCHAR();
696 pos_start = pos;
697 next_c_start = next_c;
698 str_byte_start = str_byte;
699 #ifdef TRE_MBSTATE
700 mbstate_start = mbstate;
701 #endif /* TRE_MBSTATE */
702
703 /* Handle initial states. */
704 next_tags = NULL;
705 for (trans_i = tnfa->initial; trans_i->state; trans_i++)
706 {
707 if (trans_i->assertions && CHECK_ASSERTIONS(trans_i->assertions))
708 {
709 continue;
710 }
711 if (state == NULL)
712 {
713 /* Start from this state. */
714 state = trans_i->state;
715 next_tags = trans_i->tags;
716 }
717 else
718 {
719 /* Backtrack to this state. */
720 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
721 trans_i->state_id, next_c, tags, mbstate);
722 {
723 int *tmp = trans_i->tags;
724 if (tmp)
725 while (*tmp >= 0)
726 stack->item.tags[*tmp++] = pos;
727 }
728 }
729 }
730
731 if (next_tags)
732 for (; *next_tags >= 0; next_tags++)
733 tags[*next_tags] = pos;
734
735
736 if (state == NULL)
737 goto backtrack;
738
739 while (1)
740 {
741 tre_tnfa_transition_t *next_state;
742 int empty_br_match;
743
744 if (state == tnfa->final)
745 {
746 if (match_eo < pos
747 || (match_eo == pos
748 && match_tags
749 && tre_tag_order(tnfa->num_tags, tnfa->tag_directions,
750 tags, match_tags)))
751 {
752 int i;
753 /* This match wins the previous match. */
754 match_eo = pos;
755 if (match_tags)
756 for (i = 0; i < tnfa->num_tags; i++)
757 match_tags[i] = tags[i];
758 }
759 /* Our TNFAs never have transitions leaving from the final state,
760 so we jump right to backtracking. */
761 goto backtrack;
762 }
763
764 /* Go to the next character in the input string. */
765 empty_br_match = 0;
766 trans_i = state;
767 if (trans_i->state && trans_i->assertions & ASSERT_BACKREF)
768 {
769 /* This is a back reference state. All transitions leaving from
770 this state have the same back reference "assertion". Instead
771 of reading the next character, we match the back reference. */
772 regoff_t so, eo;
773 int bt = trans_i->u.backref;
774 regoff_t bt_len;
775 int result;
776
777 /* Get the substring we need to match against. Remember to
778 turn off REG_NOSUB temporarily. */
779 tre_fill_pmatch(bt + 1, pmatch, tnfa->cflags & ~REG_NOSUB,
780 tnfa, tags, pos);
781 so = pmatch[bt].rm_so;
782 eo = pmatch[bt].rm_eo;
783 bt_len = eo - so;
784
785 result = strncmp((const char*)string + so, str_byte - 1,
786 (size_t)bt_len);
787
788 if (result == 0)
789 {
790 /* Back reference matched. Check for infinite loop. */
791 if (bt_len == 0)
792 empty_br_match = 1;
793 if (empty_br_match && states_seen[trans_i->state_id])
794 {
795 goto backtrack;
796 }
797
798 states_seen[trans_i->state_id] = empty_br_match;
799
800 /* Advance in input string and resync `prev_c', `next_c'
801 and pos. */
802 str_byte += bt_len - 1;
803 pos += bt_len - 1;
804 GET_NEXT_WCHAR();
805 }
806 else
807 {
808 goto backtrack;
809 }
810 }
811 else
812 {
813 /* Check for end of string. */
814 if (next_c == L'\0')
815 goto backtrack;
816
817 /* Read the next character. */
818 GET_NEXT_WCHAR();
819 }
820
821 next_state = NULL;
822 for (trans_i = state; trans_i->state; trans_i++)
823 {
824 if (trans_i->code_min <= (tre_cint_t)prev_c
825 && trans_i->code_max >= (tre_cint_t)prev_c)
826 {
827 if (trans_i->assertions
828 && (CHECK_ASSERTIONS(trans_i->assertions)
829 || CHECK_CHAR_CLASSES(trans_i, tnfa, eflags)))
830 {
831 continue;
832 }
833
834 if (next_state == NULL)
835 {
836 /* First matching transition. */
837 next_state = trans_i->state;
838 next_tags = trans_i->tags;
839 }
840 else
841 {
842 /* Second matching transition. We may need to backtrack here
843 to take this transition instead of the first one, so we
844 push this transition in the backtracking stack so we can
845 jump back here if needed. */
846 BT_STACK_PUSH(pos, str_byte, 0, trans_i->state,
847 trans_i->state_id, next_c, tags, mbstate);
848 {
849 int *tmp;
850 for (tmp = trans_i->tags; tmp && *tmp >= 0; tmp++)
851 stack->item.tags[*tmp] = pos;
852 }
853 #if 0 /* XXX - it's important not to look at all transitions here to keep
854 the stack small! */
855 break;
856 #endif
857 }
858 }
859 }
860
861 if (next_state != NULL)
862 {
863 /* Matching transitions were found. Take the first one. */
864 state = next_state;
865
866 /* Update the tag values. */
867 if (next_tags)
868 while (*next_tags >= 0)
869 tags[*next_tags++] = pos;
870 }
871 else
872 {
873 backtrack:
874 /* A matching transition was not found. Try to backtrack. */
875 if (stack->prev)
876 {
877 if (stack->item.state->assertions & ASSERT_BACKREF)
878 {
879 states_seen[stack->item.state_id] = 0;
880 }
881
882 BT_STACK_POP();
883 }
884 else if (match_eo < 0)
885 {
886 /* Try starting from a later position in the input string. */
887 /* Check for end of string. */
888 if (next_c == L'\0')
889 {
890 break;
891 }
892 next_c = next_c_start;
893 #ifdef TRE_MBSTATE
894 mbstate = mbstate_start;
895 #endif /* TRE_MBSTATE */
896 str_byte = str_byte_start;
897 goto retry;
898 }
899 else
900 {
901 break;
902 }
903 }
904 }
905
906 ret = match_eo >= 0 ? REG_OK : REG_NOMATCH;
907 *match_end_ofs = match_eo;
908
909 error_exit:
910 tre_bt_mem_destroy(mem);
911 #ifndef TRE_USE_ALLOCA
912 if (tags)
913 xfree(tags);
914 if (pmatch)
915 xfree(pmatch);
916 if (states_seen)
917 xfree(states_seen);
918 #endif /* !TRE_USE_ALLOCA */
919
920 return ret;
921 }
922
923 /***********************************************************************
924 from regexec.c
925 ***********************************************************************/
926
927 /* Fills the POSIX.2 regmatch_t array according to the TNFA tag and match
928 endpoint values. */
929 static void
tre_fill_pmatch(size_t nmatch,regmatch_t pmatch[],int cflags,const tre_tnfa_t * tnfa,regoff_t * tags,regoff_t match_eo)930 tre_fill_pmatch(size_t nmatch, regmatch_t pmatch[], int cflags,
931 const tre_tnfa_t *tnfa, regoff_t *tags, regoff_t match_eo)
932 {
933 tre_submatch_data_t *submatch_data;
934 unsigned int i, j;
935 int *parents;
936
937 i = 0;
938 if (match_eo >= 0 && !(cflags & REG_NOSUB))
939 {
940 /* Construct submatch offsets from the tags. */
941 submatch_data = tnfa->submatch_data;
942 while (i < tnfa->num_submatches && i < nmatch)
943 {
944 if (submatch_data[i].so_tag == tnfa->end_tag)
945 pmatch[i].rm_so = match_eo;
946 else
947 pmatch[i].rm_so = tags[submatch_data[i].so_tag];
948
949 if (submatch_data[i].eo_tag == tnfa->end_tag)
950 pmatch[i].rm_eo = match_eo;
951 else
952 pmatch[i].rm_eo = tags[submatch_data[i].eo_tag];
953
954 /* If either of the endpoints were not used, this submatch
955 was not part of the match. */
956 if (pmatch[i].rm_so == -1 || pmatch[i].rm_eo == -1)
957 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
958
959 i++;
960 }
961 /* Reset all submatches that are not within all of their parent
962 submatches. */
963 i = 0;
964 while (i < tnfa->num_submatches && i < nmatch)
965 {
966 if (pmatch[i].rm_eo == -1)
967 assert(pmatch[i].rm_so == -1);
968 assert(pmatch[i].rm_so <= pmatch[i].rm_eo);
969
970 parents = submatch_data[i].parents;
971 if (parents != NULL)
972 for (j = 0; parents[j] >= 0; j++)
973 {
974 if (pmatch[i].rm_so < pmatch[parents[j]].rm_so
975 || pmatch[i].rm_eo > pmatch[parents[j]].rm_eo)
976 pmatch[i].rm_so = pmatch[i].rm_eo = -1;
977 }
978 i++;
979 }
980 }
981
982 while (i < nmatch)
983 {
984 pmatch[i].rm_so = -1;
985 pmatch[i].rm_eo = -1;
986 i++;
987 }
988 }
989
990
991 /*
992 Wrapper functions for POSIX compatible regexp matching.
993 */
994
995 int
regexec(const regex_t * restrict preg,const char * restrict string,size_t nmatch,regmatch_t pmatch[restrict],int eflags)996 regexec(const regex_t *restrict preg, const char *restrict string,
997 size_t nmatch, regmatch_t pmatch[restrict], int eflags)
998 {
999 tre_tnfa_t *tnfa = (void *)preg->TRE_REGEX_T_FIELD;
1000 reg_errcode_t status;
1001 regoff_t *tags = NULL, eo;
1002 if (tnfa->cflags & REG_NOSUB) nmatch = 0;
1003 if (tnfa->num_tags > 0 && nmatch > 0)
1004 {
1005 tags = xmalloc(sizeof(*tags) * tnfa->num_tags);
1006 if (tags == NULL)
1007 return REG_ESPACE;
1008 }
1009
1010 /* Dispatch to the appropriate matcher. */
1011 if (tnfa->have_backrefs)
1012 {
1013 /* The regex has back references, use the backtracking matcher. */
1014 status = tre_tnfa_run_backtrack(tnfa, string, tags, eflags, &eo);
1015 }
1016 else
1017 {
1018 /* Exact matching, no back references, use the parallel matcher. */
1019 status = tre_tnfa_run_parallel(tnfa, string, tags, eflags, &eo);
1020 }
1021
1022 if (status == REG_OK)
1023 /* A match was found, so fill the submatch registers. */
1024 tre_fill_pmatch(nmatch, pmatch, tnfa->cflags, tnfa, tags, eo);
1025 if (tags)
1026 xfree(tags);
1027 return status;
1028 }
1029