1 /* $NetBSD: engine.c,v 1.24 2012/03/13 21:13:42 christos Exp $ */
2
3 /*-
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)engine.c 8.5 (Berkeley) 3/20/94
35 */
36
37 /*-
38 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
39 *
40 * This code is derived from software contributed to Berkeley by
41 * Henry Spencer.
42 *
43 * Redistribution and use in source and binary forms, with or without
44 * modification, are permitted provided that the following conditions
45 * are met:
46 * 1. Redistributions of source code must retain the above copyright
47 * notice, this list of conditions and the following disclaimer.
48 * 2. Redistributions in binary form must reproduce the above copyright
49 * notice, this list of conditions and the following disclaimer in the
50 * documentation and/or other materials provided with the distribution.
51 * 3. All advertising materials mentioning features or use of this software
52 * must display the following acknowledgement:
53 * This product includes software developed by the University of
54 * California, Berkeley and its contributors.
55 * 4. Neither the name of the University nor the names of its contributors
56 * may be used to endorse or promote products derived from this software
57 * without specific prior written permission.
58 *
59 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
60 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
61 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
62 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
63 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
64 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
65 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
66 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
67 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
68 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
69 * SUCH DAMAGE.
70 *
71 * @(#)engine.c 8.5 (Berkeley) 3/20/94
72 */
73
74 /*
75 * The matching engine and friends. This file is #included by regexec.c
76 * after suitable #defines of a variety of macros used herein, so that
77 * different state representations can be used without duplicating masses
78 * of code.
79 */
80
81 #ifdef SNAMES
82 #define matcher smatcher
83 #define fast sfast
84 #define slow sslow
85 #define dissect sdissect
86 #define backref sbackref
87 #define step sstep
88 #define print sprint
89 #define at sat
90 #define match smat
91 #define nope snope
92 #endif
93 #ifdef LNAMES
94 #define matcher lmatcher
95 #define fast lfast
96 #define slow lslow
97 #define dissect ldissect
98 #define backref lbackref
99 #define step lstep
100 #define print lprint
101 #define at lat
102 #define match lmat
103 #define nope lnope
104 #endif
105
106 /* another structure passed up and down to avoid zillions of parameters */
107 struct match {
108 struct re_guts *g;
109 int eflags;
110 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
111 const char *offp; /* offsets work from here */
112 const char *beginp; /* start of string -- virtual NUL precedes */
113 const char *endp; /* end of string -- virtual NUL here */
114 const char *coldp; /* can be no match starting before here */
115 const char **lastpos; /* [nplus+1] */
116 STATEVARS;
117 states st; /* current states */
118 states fresh; /* states for a fresh start */
119 states tmp; /* temporary */
120 states empty; /* empty set of states */
121 };
122
123 /* ========= begin header generated by ./mkh ========= */
124 #ifdef __cplusplus
125 extern "C" {
126 #endif
127
128 /* === engine.c === */
129 static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatch_t pmatch[], int eflags);
130 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
131 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev);
132 static const char *fast(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
133 static const char *slow(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
134 static states step(struct re_guts *g, sopno start, sopno stop, states bef, int ch, states aft);
135 #define BOL (OUT+1)
136 #define EOL (BOL+1)
137 #define BOLEOL (BOL+2)
138 #define NOTHING (BOL+3)
139 #define BOW (BOL+4)
140 #define EOW (BOL+5)
141 #define CODEMAX (BOL+5) /* highest code used */
142 #define NONCHAR(c) ((c) > CHAR_MAX)
143 #define NNONCHAR (CODEMAX-CHAR_MAX)
144 #ifdef REDEBUG
145 static void print(struct match *m, char *caption, states st, int ch, FILE *d);
146 #endif
147 #ifdef REDEBUG
148 static void at(struct match *m, char *title, char *start, char *stop, sopno startst, sopno stopst);
149 #endif
150 #ifdef REDEBUG
151 static char *pchar(int ch);
152 #endif
153
154 #ifdef __cplusplus
155 }
156 #endif
157 /* ========= end header generated by ./mkh ========= */
158
159 #ifdef REDEBUG
160 #define SP(t, s, c) print(m, t, s, c, stdout)
161 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
162 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
163 static int nope = 0;
164 #else
165 #define SP(t, s, c) /* nothing */
166 #define AT(t, p1, p2, s1, s2) /* nothing */
167 #define NOTE(s) /* nothing */
168 #endif
169
170 /*
171 - matcher - the actual matching engine
172 == static int matcher(struct re_guts *g, char *string, \
173 == size_t nmatch, regmatch_t pmatch[], int eflags);
174 */
175 static int /* 0 success, REG_NOMATCH failure */
matcher(struct re_guts * g,const char * string,size_t nmatch,regmatch_t pmatch[],int eflags)176 matcher(
177 struct re_guts *g,
178 const char *string,
179 size_t nmatch,
180 regmatch_t pmatch[],
181 int eflags)
182 {
183 const char *endp;
184 size_t i;
185 struct match mv;
186 struct match *m = &mv;
187 const char *dp;
188 const sopno gf = g->firststate+1; /* +1 for OEND */
189 const sopno gl = g->laststate;
190 const char *start;
191 const char *stop;
192 int error = 0;
193
194 _DIAGASSERT(g != NULL);
195 _DIAGASSERT(string != NULL);
196 /* pmatch checked below */
197
198 /* simplify the situation where possible */
199 if (g->cflags®_NOSUB)
200 nmatch = 0;
201 if (eflags®_STARTEND) {
202 _DIAGASSERT(pmatch != NULL);
203 start = string + (size_t)pmatch[0].rm_so;
204 stop = string + (size_t)pmatch[0].rm_eo;
205 } else {
206 start = string;
207 stop = start + strlen(start);
208 }
209 if (stop < start)
210 return(REG_INVARG);
211
212 /* prescreening; this does wonders for this rather slow code */
213 if (g->must != NULL) {
214 for (dp = start; dp < stop; dp++)
215 if (*dp == g->must[0] && (size_t)(stop - dp) >= g->mlen &&
216 memcmp(dp, g->must, g->mlen) == 0)
217 break;
218 if (dp == stop) /* we didn't find g->must */
219 return(REG_NOMATCH);
220 }
221
222 /* match struct setup */
223 m->g = g;
224 m->eflags = eflags;
225 m->pmatch = NULL;
226 m->lastpos = NULL;
227 m->offp = string;
228 m->beginp = start;
229 m->endp = stop;
230 STATESETUP(m, 4);
231 SETUP(m->st);
232 SETUP(m->fresh);
233 SETUP(m->tmp);
234 SETUP(m->empty);
235 CLEAR(m->empty);
236
237 /* this loop does only one repetition except for backrefs */
238 for (;;) {
239 endp = fast(m, start, stop, gf, gl);
240 if (endp == NULL) { /* a miss */
241 error = REG_NOMATCH;
242 goto done;
243 }
244 if (nmatch == 0 && !g->backrefs)
245 break; /* no further info needed */
246
247 /* where? */
248 assert(m->coldp != NULL);
249 for (;;) {
250 NOTE("finding start");
251 endp = slow(m, m->coldp, stop, gf, gl);
252 if (endp != NULL)
253 break;
254 assert(m->coldp < m->endp);
255 m->coldp++;
256 }
257 if (nmatch == 1 && !g->backrefs)
258 break; /* no further info needed */
259
260 /* oh my, he wants the subexpressions... */
261 if (m->pmatch == NULL)
262 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
263 sizeof(regmatch_t));
264 if (m->pmatch == NULL) {
265 error = REG_ESPACE;
266 goto done;
267 }
268 for (i = 1; i <= m->g->nsub; i++)
269 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = (regoff_t)-1;
270 if (!g->backrefs && !(m->eflags®_BACKR)) {
271 NOTE("dissecting");
272 dp = dissect(m, m->coldp, endp, gf, gl);
273 } else {
274 if (g->nplus > 0 && m->lastpos == NULL)
275 m->lastpos = malloc((g->nplus+1) *
276 sizeof(const char *));
277 if (g->nplus > 0 && m->lastpos == NULL) {
278 error = REG_ESPACE;
279 goto done;
280 }
281 NOTE("backref dissect");
282 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
283 }
284 if (dp != NULL)
285 break;
286
287 /* uh-oh... we couldn't find a subexpression-level match */
288 assert(g->backrefs); /* must be back references doing it */
289 assert(g->nplus == 0 || m->lastpos != NULL);
290 for (;;) {
291 if (dp != NULL || endp <= m->coldp)
292 break; /* defeat */
293 NOTE("backoff");
294 endp = slow(m, m->coldp, endp-1, gf, gl);
295 if (endp == NULL)
296 break; /* defeat */
297 /* try it on a shorter possibility */
298 #ifndef NDEBUG
299 for (i = 1; i <= m->g->nsub; i++) {
300 assert(m->pmatch[i].rm_so == (regoff_t)-1);
301 assert(m->pmatch[i].rm_eo == (regoff_t)-1);
302 }
303 #endif
304 NOTE("backoff dissect");
305 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
306 }
307 assert(dp == NULL || dp == endp);
308 if (dp != NULL) /* found a shorter one */
309 break;
310
311 /* despite initial appearances, there is no match here */
312 NOTE("false alarm");
313 start = m->coldp + 1; /* recycle starting later */
314 assert(start <= stop);
315 }
316
317 /* fill in the details if requested */
318 if (nmatch > 0) {
319 _DIAGASSERT(pmatch != NULL);
320 pmatch[0].rm_so = m->coldp - m->offp;
321 pmatch[0].rm_eo = endp - m->offp;
322 }
323 if (nmatch > 1) {
324 assert(m->pmatch != NULL);
325 for (i = 1; i < nmatch; i++)
326 if (i <= m->g->nsub)
327 pmatch[i] = m->pmatch[i];
328 else {
329 pmatch[i].rm_so = (regoff_t)-1;
330 pmatch[i].rm_eo = (regoff_t)-1;
331 }
332 }
333
334 done:
335 if (m->pmatch != NULL) {
336 free(m->pmatch);
337 m->pmatch = NULL;
338 }
339 if (m->lastpos != NULL) {
340 free(m->lastpos);
341 m->lastpos = NULL;
342 }
343 STATETEARDOWN(m);
344 return error;
345 }
346
347 /*
348 - dissect - figure out what matched what, no back references
349 == static const char *dissect(struct match *m, const char *start, \
350 == const char *stop, sopno startst, sopno stopst);
351 */
352 static const char * /* == stop (success) always */
dissect(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst)353 dissect(
354 struct match *m,
355 const char *start,
356 const char *stop,
357 sopno startst,
358 sopno stopst)
359 {
360 int i;
361 sopno ss; /* start sop of current subRE */
362 sopno es; /* end sop of current subRE */
363 const char *sp; /* start of string matched by it */
364 const char *stp; /* string matched by it cannot pass here */
365 const char *rest; /* start of rest of string */
366 const char *tail; /* string unmatched by rest of RE */
367 sopno ssub; /* start sop of subsubRE */
368 sopno esub; /* end sop of subsubRE */
369 const char *ssp; /* start of string matched by subsubRE */
370 const char *sep; /* end of string matched by subsubRE */
371 const char *oldssp; /* previous ssp */
372 #ifndef NDEBUG
373 const char *dp;
374 #endif
375
376 _DIAGASSERT(m != NULL);
377 _DIAGASSERT(start != NULL);
378 _DIAGASSERT(stop != NULL);
379
380 AT("diss", start, stop, startst, stopst);
381 sp = start;
382 for (ss = startst; ss < stopst; ss = es) {
383 /* identify end of subRE */
384 es = ss;
385 switch (OP(m->g->strip[es])) {
386 case OPLUS_:
387 case OQUEST_:
388 es += OPND(m->g->strip[es]);
389 break;
390 case OCH_:
391 while (OP(m->g->strip[es]) != O_CH)
392 es += OPND(m->g->strip[es]);
393 break;
394 }
395 es++;
396
397 /* figure out what it matched */
398 switch (OP(m->g->strip[ss])) {
399 case OEND:
400 assert(nope);
401 break;
402 case OCHAR:
403 sp++;
404 break;
405 case OBOL:
406 case OEOL:
407 case OBOW:
408 case OEOW:
409 break;
410 case OANY:
411 case OANYOF:
412 sp++;
413 break;
414 case OBACK_:
415 case O_BACK:
416 assert(nope);
417 break;
418 /* cases where length of match is hard to find */
419 case OQUEST_:
420 stp = stop;
421 for (;;) {
422 /* how long could this one be? */
423 rest = slow(m, sp, stp, ss, es);
424 assert(rest != NULL); /* it did match */
425 /* could the rest match the rest? */
426 tail = slow(m, rest, stop, es, stopst);
427 if (tail == stop)
428 break; /* yes! */
429 /* no -- try a shorter match for this one */
430 stp = rest - 1;
431 assert(stp >= sp); /* it did work */
432 }
433 ssub = ss + 1;
434 esub = es - 1;
435 /* did innards match? */
436 if (slow(m, sp, rest, ssub, esub) != NULL) {
437 #ifdef NDEBUG
438 (void)
439 #else
440 dp =
441 #endif
442 dissect(m, sp, rest, ssub, esub);
443 assert(dp == rest);
444 } else /* no */
445 assert(sp == rest);
446 sp = rest;
447 break;
448 case OPLUS_:
449 stp = stop;
450 for (;;) {
451 /* how long could this one be? */
452 rest = slow(m, sp, stp, ss, es);
453 assert(rest != NULL); /* it did match */
454 /* could the rest match the rest? */
455 tail = slow(m, rest, stop, es, stopst);
456 if (tail == stop)
457 break; /* yes! */
458 /* no -- try a shorter match for this one */
459 stp = rest - 1;
460 assert(stp >= sp); /* it did work */
461 }
462 ssub = ss + 1;
463 esub = es - 1;
464 ssp = sp;
465 oldssp = ssp;
466 for (;;) { /* find last match of innards */
467 sep = slow(m, ssp, rest, ssub, esub);
468 if (sep == NULL || sep == ssp)
469 break; /* failed or matched null */
470 oldssp = ssp; /* on to next try */
471 ssp = sep;
472 }
473 if (sep == NULL) {
474 /* last successful match */
475 sep = ssp;
476 ssp = oldssp;
477 }
478 assert(sep == rest); /* must exhaust substring */
479 assert(slow(m, ssp, sep, ssub, esub) == rest);
480 #ifdef NDEBUG
481 (void)
482 #else
483 dp =
484 #endif
485 dissect(m, ssp, sep, ssub, esub);
486 assert(dp == sep);
487 sp = rest;
488 break;
489 case OCH_:
490 stp = stop;
491 for (;;) {
492 /* how long could this one be? */
493 rest = slow(m, sp, stp, ss, es);
494 assert(rest != NULL); /* it did match */
495 /* could the rest match the rest? */
496 tail = slow(m, rest, stop, es, stopst);
497 if (tail == stop)
498 break; /* yes! */
499 /* no -- try a shorter match for this one */
500 stp = rest - 1;
501 assert(stp >= sp); /* it did work */
502 }
503 ssub = ss + 1;
504 esub = ss + OPND(m->g->strip[ss]) - 1;
505 assert(OP(m->g->strip[esub]) == OOR1);
506 for (;;) { /* find first matching branch */
507 if (slow(m, sp, rest, ssub, esub) == rest)
508 break; /* it matched all of it */
509 /* that one missed, try next one */
510 assert(OP(m->g->strip[esub]) == OOR1);
511 esub++;
512 assert(OP(m->g->strip[esub]) == OOR2);
513 ssub = esub + 1;
514 esub += OPND(m->g->strip[esub]);
515 if (OP(m->g->strip[esub]) == OOR2)
516 esub--;
517 else
518 assert(OP(m->g->strip[esub]) == O_CH);
519 }
520 #ifdef NDEBUG
521 (void)
522 #else
523 dp =
524 #endif
525 dissect(m, sp, rest, ssub, esub);
526 assert(dp == rest);
527 sp = rest;
528 break;
529 case O_PLUS:
530 case O_QUEST:
531 case OOR1:
532 case OOR2:
533 case O_CH:
534 assert(nope);
535 break;
536 case OLPAREN:
537 i = OPND(m->g->strip[ss]);
538 assert(0 < i && i <= m->g->nsub);
539 m->pmatch[i].rm_so = sp - m->offp;
540 break;
541 case ORPAREN:
542 i = OPND(m->g->strip[ss]);
543 assert(0 < i && i <= m->g->nsub);
544 m->pmatch[i].rm_eo = sp - m->offp;
545 break;
546 default: /* uh oh */
547 assert(nope);
548 break;
549 }
550 }
551
552 assert(sp == stop);
553 return(sp);
554 }
555
556 /*
557 - backref - figure out what matched what, figuring in back references
558 == static const char *backref(struct match *m, const char *start, \
559 == const char *stop, sopno startst, sopno stopst, sopno lev);
560 */
561 static const char * /* == stop (success) or NULL (failure) */
backref(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst,sopno lev)562 backref(
563 struct match *m,
564 const char *start,
565 const char *stop,
566 sopno startst,
567 sopno stopst,
568 sopno lev) /* PLUS nesting level */
569 {
570 int i;
571 sopno ss; /* start sop of current subRE */
572 const char *sp; /* start of string matched by it */
573 sopno ssub; /* start sop of subsubRE */
574 sopno esub; /* end sop of subsubRE */
575 const char *ssp; /* start of string matched by subsubRE */
576 const char *dp;
577 size_t len;
578 int hard;
579 sop s;
580 regoff_t offsave;
581 cset *cs;
582
583 _DIAGASSERT(m != NULL);
584 _DIAGASSERT(start != NULL);
585 _DIAGASSERT(stop != NULL);
586
587 AT("back", start, stop, startst, stopst);
588 sp = start;
589
590 /* get as far as we can with easy stuff */
591 hard = 0;
592 for (ss = startst; !hard && ss < stopst; ss++)
593 switch (OP(s = m->g->strip[ss])) {
594 case OCHAR:
595 if (sp == stop || *sp++ != (char)OPND(s))
596 return(NULL);
597 break;
598 case OANY:
599 if (sp == stop)
600 return(NULL);
601 sp++;
602 break;
603 case OANYOF:
604 cs = &m->g->sets[OPND(s)];
605 if (sp == stop || !CHIN(cs, *sp++))
606 return(NULL);
607 break;
608 case OBOL:
609 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
610 (sp < m->endp && *(sp-1) == '\n' &&
611 (m->g->cflags®_NEWLINE)) )
612 { /* yes */ }
613 else
614 return(NULL);
615 break;
616 case OEOL:
617 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
618 (sp < m->endp && *sp == '\n' &&
619 (m->g->cflags®_NEWLINE)) )
620 { /* yes */ }
621 else
622 return(NULL);
623 break;
624 case OBOW:
625 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
626 (sp < m->endp && *(sp-1) == '\n' &&
627 (m->g->cflags®_NEWLINE)) ||
628 (sp > m->beginp &&
629 !ISWORD(*(sp-1))) ) &&
630 (sp < m->endp && ISWORD(*sp)) )
631 { /* yes */ }
632 else
633 return(NULL);
634 break;
635 case OEOW:
636 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
637 (sp < m->endp && *sp == '\n' &&
638 (m->g->cflags®_NEWLINE)) ||
639 (sp < m->endp && !ISWORD(*sp)) ) &&
640 (sp > m->beginp && ISWORD(*(sp-1))) )
641 { /* yes */ }
642 else
643 return(NULL);
644 break;
645 case O_QUEST:
646 break;
647 case OOR1: /* matches null but needs to skip */
648 ss++;
649 s = m->g->strip[ss];
650 do {
651 assert(OP(s) == OOR2);
652 ss += OPND(s);
653 } while (OP(s = m->g->strip[ss]) != O_CH);
654 /* note that the ss++ gets us past the O_CH */
655 break;
656 default: /* have to make a choice */
657 hard = 1;
658 break;
659 }
660 if (!hard) { /* that was it! */
661 if (sp != stop)
662 return(NULL);
663 return(sp);
664 }
665 ss--; /* adjust for the for's final increment */
666
667 /* the hard stuff */
668 AT("hard", sp, stop, ss, stopst);
669 s = m->g->strip[ss];
670 switch (OP(s)) {
671 case OBACK_: /* the vilest depths */
672 i = OPND(s);
673 assert(0 < i && i <= m->g->nsub);
674 if (m->pmatch[i].rm_eo == (regoff_t)-1)
675 return(NULL);
676 assert(m->pmatch[i].rm_so != (regoff_t)-1);
677 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
678 if (len == 0)
679 return(NULL);
680 assert(stop - m->beginp >= len);
681 if (sp > stop - len)
682 return(NULL); /* not enough left to match */
683 ssp = m->offp + (size_t)m->pmatch[i].rm_so;
684 if (memcmp(sp, ssp, len) != 0)
685 return(NULL);
686 while (m->g->strip[ss] != SOP(O_BACK, i))
687 ss++;
688 return(backref(m, sp+len, stop, ss+1, stopst, lev));
689
690 case OQUEST_: /* to null or not */
691 dp = backref(m, sp, stop, ss+1, stopst, lev);
692 if (dp != NULL)
693 return(dp); /* not */
694 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
695
696 case OPLUS_:
697 assert(m->lastpos != NULL);
698 assert(lev+1 <= m->g->nplus);
699 m->lastpos[lev+1] = sp;
700 return(backref(m, sp, stop, ss+1, stopst, lev+1));
701
702 case O_PLUS:
703 if (sp == m->lastpos[lev]) /* last pass matched null */
704 return(backref(m, sp, stop, ss+1, stopst, lev-1));
705 /* try another pass */
706 m->lastpos[lev] = sp;
707 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
708 if (dp == NULL)
709 dp = backref(m, sp, stop, ss+1, stopst, lev-1);
710 return(dp);
711
712 case OCH_: /* find the right one, if any */
713 ssub = ss + 1;
714 esub = ss + OPND(s) - 1;
715 assert(OP(m->g->strip[esub]) == OOR1);
716 for (;;) { /* find first matching branch */
717 dp = backref(m, sp, stop, ssub, esub, lev);
718 if (dp != NULL)
719 return(dp);
720 /* that one missed, try next one */
721 if (OP(m->g->strip[esub]) == O_CH)
722 return(NULL); /* there is none */
723 esub++;
724 assert(OP(m->g->strip[esub]) == OOR2);
725 ssub = esub + 1;
726 esub += OPND(m->g->strip[esub]);
727 if (OP(m->g->strip[esub]) == OOR2)
728 esub--;
729 else
730 assert(OP(m->g->strip[esub]) == O_CH);
731 }
732
733 case OLPAREN: /* must undo assignment if rest fails */
734 i = OPND(s);
735 assert(0 < i && i <= m->g->nsub);
736 offsave = m->pmatch[i].rm_so;
737 m->pmatch[i].rm_so = sp - m->offp;
738 dp = backref(m, sp, stop, ss+1, stopst, lev);
739 if (dp != NULL)
740 return(dp);
741 m->pmatch[i].rm_so = offsave;
742 return(NULL);
743
744 case ORPAREN: /* must undo assignment if rest fails */
745 i = OPND(s);
746 assert(0 < i && i <= m->g->nsub);
747 offsave = m->pmatch[i].rm_eo;
748 m->pmatch[i].rm_eo = sp - m->offp;
749 dp = backref(m, sp, stop, ss+1, stopst, lev);
750 if (dp != NULL)
751 return(dp);
752 m->pmatch[i].rm_eo = offsave;
753 return(NULL);
754
755 default: /* uh oh */
756 assert(nope);
757 break;
758 }
759
760 /* "can't happen" */
761 assert(nope);
762 /* NOTREACHED */
763 return NULL;
764 }
765
766 /*
767 - fast - step through the string at top speed
768 == static const char *fast(struct match *m, const char *start, \
769 == const char *stop, sopno startst, sopno stopst);
770 */
771 static const char * /* where tentative match ended, or NULL */
fast(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst)772 fast(
773 struct match *m,
774 const char *start,
775 const char *stop,
776 sopno startst,
777 sopno stopst)
778 {
779 states st = m->st;
780 states fresh = m->fresh;
781 states tmp = m->tmp;
782 const char *p = start;
783 int c = (start == m->beginp) ? OUT : *(start-1);
784 int lastc; /* previous c */
785 int flagch;
786 size_t i;
787 const char *coldp; /* last p after which no match was underway */
788
789 _DIAGASSERT(m != NULL);
790 _DIAGASSERT(start != NULL);
791 _DIAGASSERT(stop != NULL);
792
793 CLEAR(st);
794 SET1(st, startst);
795 st = step(m->g, startst, stopst, st, NOTHING, st);
796 ASSIGN(fresh, st);
797 SP("start", st, *p);
798 coldp = NULL;
799 for (;;) {
800 /* next character */
801 lastc = c;
802 c = (p == m->endp) ? OUT : *p;
803 if (EQ(st, fresh))
804 coldp = p;
805
806 /* is there an EOL and/or BOL between lastc and c? */
807 flagch = '\0';
808 i = 0;
809 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
810 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
811 flagch = BOL;
812 i = m->g->nbol;
813 }
814 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
815 (c == OUT && !(m->eflags®_NOTEOL)) ) {
816 flagch = (flagch == BOL) ? BOLEOL : EOL;
817 i += m->g->neol;
818 }
819 if (i != 0) {
820 for (; i > 0; i--)
821 st = step(m->g, startst, stopst, st, flagch, st);
822 SP("boleol", st, c);
823 }
824
825 /* how about a word boundary? */
826 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
827 (c != OUT && ISWORD(c)) ) {
828 flagch = BOW;
829 }
830 if ( (lastc != OUT && ISWORD(lastc)) &&
831 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
832 flagch = EOW;
833 }
834 if (flagch == BOW || flagch == EOW) {
835 st = step(m->g, startst, stopst, st, flagch, st);
836 SP("boweow", st, c);
837 }
838
839 /* are we done? */
840 if (ISSET(st, stopst) || p == stop)
841 break; /* NOTE BREAK OUT */
842
843 /* no, we must deal with this character */
844 ASSIGN(tmp, st);
845 ASSIGN(st, fresh);
846 assert(c != OUT);
847 st = step(m->g, startst, stopst, tmp, c, st);
848 SP("aft", st, c);
849 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
850 p++;
851 }
852
853 assert(coldp != NULL);
854 m->coldp = coldp;
855 if (ISSET(st, stopst))
856 return(p+1);
857 else
858 return(NULL);
859 }
860
861 /*
862 - slow - step through the string more deliberately
863 == static const char *slow(struct match *m, const char *start, \
864 == const char *stop, sopno startst, sopno stopst);
865 */
866 static const char * /* where it ended */
slow(struct match * m,const char * start,const char * stop,sopno startst,sopno stopst)867 slow(
868 struct match *m,
869 const char *start,
870 const char *stop,
871 sopno startst,
872 sopno stopst)
873 {
874 states st = m->st;
875 states empty = m->empty;
876 states tmp = m->tmp;
877 const char *p = start;
878 int c = (start == m->beginp) ? OUT : *(start-1);
879 int lastc; /* previous c */
880 int flagch;
881 size_t i;
882 const char *matchp; /* last p at which a match ended */
883
884 _DIAGASSERT(m != NULL);
885 _DIAGASSERT(start != NULL);
886 _DIAGASSERT(stop != NULL);
887
888 AT("slow", start, stop, startst, stopst);
889 CLEAR(st);
890 SET1(st, startst);
891 SP("sstart", st, *p);
892 st = step(m->g, startst, stopst, st, NOTHING, st);
893 matchp = NULL;
894 for (;;) {
895 /* next character */
896 lastc = c;
897 c = (p == m->endp) ? OUT : *p;
898
899 /* is there an EOL and/or BOL between lastc and c? */
900 flagch = '\0';
901 i = 0;
902 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
903 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
904 flagch = BOL;
905 i = m->g->nbol;
906 }
907 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
908 (c == OUT && !(m->eflags®_NOTEOL)) ) {
909 flagch = (flagch == BOL) ? BOLEOL : EOL;
910 i += m->g->neol;
911 }
912 if (i != 0) {
913 for (; i > 0; i--)
914 st = step(m->g, startst, stopst, st, flagch, st);
915 SP("sboleol", st, c);
916 }
917
918 /* how about a word boundary? */
919 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
920 (c != OUT && ISWORD(c)) ) {
921 flagch = BOW;
922 }
923 if ( (lastc != OUT && ISWORD(lastc)) &&
924 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
925 flagch = EOW;
926 }
927 if (flagch == BOW || flagch == EOW) {
928 st = step(m->g, startst, stopst, st, flagch, st);
929 SP("sboweow", st, c);
930 }
931
932 /* are we done? */
933 if (ISSET(st, stopst))
934 matchp = p;
935 if (EQ(st, empty) || p == stop)
936 break; /* NOTE BREAK OUT */
937
938 /* no, we must deal with this character */
939 ASSIGN(tmp, st);
940 ASSIGN(st, empty);
941 assert(c != OUT);
942 st = step(m->g, startst, stopst, tmp, c, st);
943 SP("saft", st, c);
944 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
945 p++;
946 }
947
948 return(matchp);
949 }
950
951
952 /*
953 - step - map set of states reachable before char to set reachable after
954 == static states step(struct re_guts *g, sopno start, sopno stop, \
955 == states bef, int ch, states aft);
956 == #define BOL (OUT+1)
957 == #define EOL (BOL+1)
958 == #define BOLEOL (BOL+2)
959 == #define NOTHING (BOL+3)
960 == #define BOW (BOL+4)
961 == #define EOW (BOL+5)
962 == #define CODEMAX (BOL+5) // highest code used
963 == #define NONCHAR(c) ((c) > CHAR_MAX)
964 == #define NNONCHAR (CODEMAX-CHAR_MAX)
965 */
966 static states
step(struct re_guts * g,sopno start,sopno stop,states bef,int ch,states aft)967 step(
968 struct re_guts *g,
969 sopno start, /* start state within strip */
970 sopno stop, /* state after stop state within strip */
971 states bef, /* states reachable before */
972 int ch, /* character or NONCHAR code */
973 states aft) /* states already known reachable after */
974 {
975 cset *cs;
976 sop s;
977 sopno pc;
978 onestate here; /* note, macros know this name */
979 sopno look;
980 int i;
981
982 _DIAGASSERT(g != NULL);
983
984 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
985 s = g->strip[pc];
986 switch (OP(s)) {
987 case OEND:
988 assert(pc == stop-1);
989 break;
990 case OCHAR:
991 /* only characters can match */
992 assert(!NONCHAR(ch) || ch != (char)OPND(s));
993 if (ch == (char)OPND(s))
994 FWD(aft, bef, 1);
995 break;
996 case OBOL:
997 if (ch == BOL || ch == BOLEOL)
998 FWD(aft, bef, 1);
999 break;
1000 case OEOL:
1001 if (ch == EOL || ch == BOLEOL)
1002 FWD(aft, bef, 1);
1003 break;
1004 case OBOW:
1005 if (ch == BOW)
1006 FWD(aft, bef, 1);
1007 break;
1008 case OEOW:
1009 if (ch == EOW)
1010 FWD(aft, bef, 1);
1011 break;
1012 case OANY:
1013 if (!NONCHAR(ch))
1014 FWD(aft, bef, 1);
1015 break;
1016 case OANYOF:
1017 cs = &g->sets[OPND(s)];
1018 if (!NONCHAR(ch) && CHIN(cs, ch))
1019 FWD(aft, bef, 1);
1020 break;
1021 case OBACK_: /* ignored here */
1022 case O_BACK:
1023 FWD(aft, aft, 1);
1024 break;
1025 case OPLUS_: /* forward, this is just an empty */
1026 FWD(aft, aft, 1);
1027 break;
1028 case O_PLUS: /* both forward and back */
1029 FWD(aft, aft, 1);
1030 i = ISSETBACK(aft, OPND(s));
1031 BACK(aft, aft, OPND(s));
1032 if (!i && ISSETBACK(aft, OPND(s))) {
1033 /* oho, must reconsider loop body */
1034 pc -= OPND(s) + 1;
1035 INIT(here, pc);
1036 }
1037 break;
1038 case OQUEST_: /* two branches, both forward */
1039 FWD(aft, aft, 1);
1040 FWD(aft, aft, OPND(s));
1041 break;
1042 case O_QUEST: /* just an empty */
1043 FWD(aft, aft, 1);
1044 break;
1045 case OLPAREN: /* not significant here */
1046 case ORPAREN:
1047 FWD(aft, aft, 1);
1048 break;
1049 case OCH_: /* mark the first two branches */
1050 FWD(aft, aft, 1);
1051 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1052 FWD(aft, aft, OPND(s));
1053 break;
1054 case OOR1: /* done a branch, find the O_CH */
1055 if (ISSTATEIN(aft, here)) {
1056 for (look = 1;
1057 OP(s = g->strip[pc+look]) != O_CH;
1058 look += OPND(s))
1059 assert(OP(s) == OOR2);
1060 FWD(aft, aft, look);
1061 }
1062 break;
1063 case OOR2: /* propagate OCH_'s marking */
1064 FWD(aft, aft, 1);
1065 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
1066 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
1067 FWD(aft, aft, OPND(s));
1068 }
1069 break;
1070 case O_CH: /* just empty */
1071 FWD(aft, aft, 1);
1072 break;
1073 default: /* ooooops... */
1074 assert(nope);
1075 break;
1076 }
1077 }
1078
1079 return(aft);
1080 }
1081
1082 #ifdef REDEBUG
1083 /*
1084 - print - print a set of states
1085 == #ifdef REDEBUG
1086 == static void print(struct match *m, char *caption, states st, \
1087 == int ch, FILE *d);
1088 == #endif
1089 */
1090 static void
print(struct match * m,char * caption,states st,int ch,FILE * d)1091 print(
1092 struct match *m,
1093 char *caption,
1094 states st,
1095 int ch,
1096 FILE *d)
1097 {
1098 struct re_guts *g = m->g;
1099 int i;
1100 int first = 1;
1101
1102 _DIAGASSERT(m != NULL);
1103 _DIAGASSERT(caption != NULL);
1104
1105 if (!(m->eflags®_TRACE))
1106 return;
1107
1108 _DIAGASSERT(d != NULL);
1109
1110 fprintf(d, "%s", caption);
1111 if (ch != '\0')
1112 fprintf(d, " %s", pchar(ch));
1113 for (i = 0; i < g->nstates; i++)
1114 if (ISSET(st, i)) {
1115 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
1116 first = 0;
1117 }
1118 fprintf(d, "\n");
1119 }
1120
1121 /*
1122 - at - print current situation
1123 == #ifdef REDEBUG
1124 == static void at(struct match *m, char *title, char *start, char *stop, \
1125 == sopno startst, sopno stopst);
1126 == #endif
1127 */
1128 static void
at(struct match * m,char * title,char * start,char * stop,sopno startst,sopno stopst)1129 at(
1130 struct match *m,
1131 char *title,
1132 char *start,
1133 char *stop,
1134 sopno startst,
1135 sopno stopst)
1136 {
1137
1138 _DIAGASSERT(m != NULL);
1139 _DIAGASSERT(title != NULL);
1140 _DIAGASSERT(start != NULL);
1141 _DIAGASSERT(stop != NULL);
1142
1143 if (!(m->eflags®_TRACE))
1144 return;
1145
1146 printf("%s %s-", title, pchar(*start));
1147 printf("%s ", pchar(*stop));
1148 printf("%ld-%ld\n", (long)startst, (long)stopst);
1149 }
1150
1151 #ifndef PCHARDONE
1152 #define PCHARDONE /* never again */
1153 /*
1154 - pchar - make a character printable
1155 == #ifdef REDEBUG
1156 == static char *pchar(int ch);
1157 == #endif
1158 *
1159 * Is this identical to regchar() over in debug.c? Well, yes. But a
1160 * duplicate here avoids having a debugging-capable regexec.o tied to
1161 * a matching debug.o, and this is convenient. It all disappears in
1162 * the non-debug compilation anyway, so it doesn't matter much.
1163 */
1164 static char * /* -> representation */
pchar(int ch)1165 pchar(
1166 int ch)
1167 {
1168 static char pbuf[10];
1169
1170 if (isprint(ch) || ch == ' ')
1171 (void)snprintf(pbuf, sizeof pbuf, "%c", ch);
1172 else
1173 (void)snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1174 return(pbuf);
1175 }
1176 #endif
1177 #endif
1178
1179 #undef matcher
1180 #undef fast
1181 #undef slow
1182 #undef dissect
1183 #undef backref
1184 #undef step
1185 #undef print
1186 #undef at
1187 #undef match
1188 #undef nope
1189