1 /*
2 * The matching engine and friends. This file is #included by regexec.c
3 * after suitable #defines of a variety of macros used herein, so that
4 * different state representations can be used without duplicating masses
5 * of code.
6 */
7
8 #ifdef SNAMES
9 #define matcher smatcher
10 #define fast sfast
11 #define slow sslow
12 #define dissect sdissect
13 #define backref sbackref
14 #define step sstep
15 #define print sprint
16 #define at sat
17 #define match smat
18 #endif
19 #ifdef LNAMES
20 #define matcher lmatcher
21 #define fast lfast
22 #define slow lslow
23 #define dissect ldissect
24 #define backref lbackref
25 #define step lstep
26 #define print lprint
27 #define at lat
28 #define match lmat
29 #endif
30
31 /* another structure passed up and down to avoid zillions of parameters */
32 struct match {
33 struct re_guts *g;
34 int eflags;
35 regmatch_t *pmatch; /* [nsub+1] (0 element unused) */
36 char *offp; /* offsets work from here */
37 char *beginp; /* start of string -- virtual NUL precedes */
38 char *endp; /* end of string -- virtual NUL here */
39 char *coldp; /* can be no match starting before here */
40 char **lastpos; /* [nplus+1] */
41 STATEVARS;
42 states st; /* current states */
43 states fresh; /* states for a fresh start */
44 states tmp; /* temporary */
45 states empty; /* empty set of states */
46 };
47
48 #ifdef REDEBUG
49 #define SP(t, s, c) print(m, t, s, c, stdout)
50 #define AT(t, p1, p2, s1, s2) at(m, t, p1, p2, s1, s2)
51 #define NOTE(str) { if (m->eflags®_TRACE) printf("=%s\n", (str)); }
52 #else
53 #define SP(t, s, c) /* nothing */
54 #define AT(t, p1, p2, s1, s2) /* nothing */
55 #define NOTE(s) /* nothing */
56 #endif
57
58 static char *dissect(struct match *m, char *start,
59 char *stop, sopno startst, sopno stopst);
60 static char *backref(struct match *m, char *start,
61 char *stop, sopno startst, sopno stopst, sopno lev);
62 static char *fast(struct match *m, char *start,
63 char *stop, sopno startst, sopno stopst);
64 static char *slow(struct match *m, char *start,
65 char *stop, sopno startst, sopno stopst);
66 static states step(struct re_guts *g, sopno start, sopno stop,
67 states bef, int ch, states aft);
68 #ifdef REDEBUG
69 static void print(struct match *m, char *caption, states st,
70 int ch, FILE *d);
71 static void at(struct match *m, char *title, char *start, char *stop,
72 sopno startst, sopno stopst);
73 static char *pchar(int ch);
74 #endif
75
76 #define BOL (OUT+1)
77 #define EOL (BOL+1)
78 #define BOLEOL (BOL+2)
79 #define NOTHING (BOL+3)
80 #define BOW (BOL+4)
81 #define EOW (BOL+5)
82 #define CODEMAX (BOL+5) // highest code used
83 #define NONCHAR(c) ((c) > CHAR_MAX)
84 #define NNONCHAR (CODEMAX-CHAR_MAX)
85
86 /*
87 - matcher - the actual matching engine
88 */
89 static int /* 0 success, REG_NOMATCH failure */
matcher(g,string,nmatch,pmatch,eflags)90 matcher(g, string, nmatch, pmatch, eflags)
91 struct re_guts *g;
92 char *string;
93 size_t nmatch;
94 regmatch_t pmatch[];
95 int eflags;
96 {
97 char *endp;
98 int i;
99 struct match mv;
100 struct match *m = &mv;
101 char *dp;
102 const sopno gf = g->firststate+1; /* +1 for OEND */
103 const sopno gl = g->laststate;
104 char *start;
105 char *stop;
106
107 /* simplify the situation where possible */
108 if (g->cflags®_NOSUB)
109 nmatch = 0;
110 if (eflags®_STARTEND) {
111 start = string + pmatch[0].rm_so;
112 stop = string + pmatch[0].rm_eo;
113 } else {
114 start = string;
115 stop = start + strlen(start);
116 }
117 if (stop < start)
118 return(REG_INVARG);
119
120 /* prescreening; this does wonders for this rather slow code */
121 if (g->must != NULL) {
122 for (dp = start; dp < stop; dp++)
123 if (*dp == g->must[0] && stop - dp >= g->mlen &&
124 memcmp(dp, g->must, (size_t)g->mlen) == 0)
125 break;
126 if (dp == stop) /* we didn't find g->must */
127 return(REG_NOMATCH);
128 }
129
130 /* match struct setup */
131 m->g = g;
132 m->eflags = eflags;
133 m->pmatch = NULL;
134 m->lastpos = NULL;
135 m->offp = string;
136 m->beginp = start;
137 m->endp = stop;
138 STATESETUP(m, 4);
139 SETUP(m->st);
140 SETUP(m->fresh);
141 SETUP(m->tmp);
142 SETUP(m->empty);
143 CLEAR(m->empty);
144
145 /* this loop does only one repetition except for backrefs */
146 for (;;) {
147 endp = fast(m, start, stop, gf, gl);
148 if (endp == NULL) { /* a miss */
149 STATETEARDOWN(m);
150 return(REG_NOMATCH);
151 }
152 if (nmatch == 0 && !g->backrefs)
153 break; /* no further info needed */
154
155 /* where? */
156 assert(m->coldp != NULL);
157 for (;;) {
158 NOTE("finding start");
159 endp = slow(m, m->coldp, stop, gf, gl);
160 if (endp != NULL)
161 break;
162 assert(m->coldp < m->endp);
163 m->coldp++;
164 }
165 if (nmatch == 1 && !g->backrefs)
166 break; /* no further info needed */
167
168 /* oh my, he wants the subexpressions... */
169 if (m->pmatch == NULL)
170 m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
171 sizeof(regmatch_t));
172 if (m->pmatch == NULL) {
173 STATETEARDOWN(m);
174 return(REG_ESPACE);
175 }
176 for (i = 1; (size_t) i <= m->g->nsub; i++)
177 m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
178 if (!g->backrefs && !(m->eflags®_BACKR)) {
179 NOTE("dissecting");
180 dp = dissect(m, m->coldp, endp, gf, gl);
181 } else {
182 if (g->nplus > 0 && m->lastpos == NULL)
183 m->lastpos = (char **)malloc(((size_t)g->nplus+1) *
184 sizeof(char *));
185 if (g->nplus > 0 && m->lastpos == NULL) {
186 free(m->pmatch);
187 STATETEARDOWN(m);
188 return(REG_ESPACE);
189 }
190 NOTE("backref dissect");
191 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
192 }
193 if (dp != NULL)
194 break;
195
196 /* uh-oh... we couldn't find a subexpression-level match */
197 assert(g->backrefs); /* must be back references doing it */
198 assert(g->nplus == 0 || m->lastpos != NULL);
199 for (;;) {
200 if (dp != NULL || endp <= m->coldp)
201 break; /* defeat */
202 NOTE("backoff");
203 endp = slow(m, m->coldp, endp-1, gf, gl);
204 if (endp == NULL)
205 break; /* defeat */
206 /* try it on a shorter possibility */
207 #ifndef NDEBUG
208 for (i = 1; (size_t) i <= m->g->nsub; i++) {
209 assert(m->pmatch[i].rm_so == -1);
210 assert(m->pmatch[i].rm_eo == -1);
211 }
212 #endif
213 NOTE("backoff dissect");
214 dp = backref(m, m->coldp, endp, gf, gl, (sopno)0);
215 }
216 assert(dp == NULL || dp == endp);
217 if (dp != NULL) /* found a shorter one */
218 break;
219
220 /* despite initial appearances, there is no match here */
221 NOTE("false alarm");
222 start = m->coldp + 1; /* recycle starting later */
223 assert(start <= stop);
224 }
225
226 /* fill in the details if requested */
227 if (nmatch > 0) {
228 pmatch[0].rm_so = (regoff_t)(m->coldp - m->offp);
229 pmatch[0].rm_eo = (regoff_t)(endp - m->offp);
230 }
231 if (nmatch > 1) {
232 assert(m->pmatch != NULL);
233 for (i = 1; (size_t) i < nmatch; i++)
234 if ((size_t) i <= m->g->nsub)
235 pmatch[i] = m->pmatch[i];
236 else {
237 pmatch[i].rm_so = -1;
238 pmatch[i].rm_eo = -1;
239 }
240 }
241
242 if (m->pmatch != NULL)
243 free((char *)m->pmatch);
244 if (m->lastpos != NULL)
245 free((char *)m->lastpos);
246 STATETEARDOWN(m);
247 return(0);
248 }
249
250 /*
251 - dissect - figure out what matched what, no back references
252 */
253 static char * /* == stop (success) always */
dissect(m,start,stop,startst,stopst)254 dissect(m, start, stop, startst, stopst)
255 struct match *m;
256 char *start;
257 char *stop;
258 sopno startst;
259 sopno stopst;
260 {
261 int i;
262 sopno ss; /* start sop of current subRE */
263 sopno es; /* end sop of current subRE */
264 char *sp; /* start of string matched by it */
265 char *stp; /* string matched by it cannot pass here */
266 char *rest; /* start of rest of string */
267 char *tail; /* string unmatched by rest of RE */
268 sopno ssub; /* start sop of subsubRE */
269 sopno esub; /* end sop of subsubRE */
270 char *ssp; /* start of string matched by subsubRE */
271 char *sep; /* end of string matched by subsubRE */
272 char *oldssp; /* previous ssp */
273 char *dp;
274
275 AT("diss", start, stop, startst, stopst);
276 sp = start;
277 for (ss = startst; ss < stopst; ss = es) {
278 /* identify end of subRE */
279 es = ss;
280 switch (OP(m->g->strip[es])) {
281 case OPLUS_:
282 case OQUEST_:
283 es += OPND(m->g->strip[es]);
284 break;
285 case OCH_:
286 while (OP(m->g->strip[es]) != O_CH)
287 es += OPND(m->g->strip[es]);
288 break;
289 }
290 es++;
291
292 /* figure out what it matched */
293 switch (OP(m->g->strip[ss])) {
294 case OEND:
295 assert(nope);
296 break;
297 case OCHAR:
298 sp++;
299 break;
300 case OBOL:
301 case OEOL:
302 case OBOW:
303 case OEOW:
304 break;
305 case OANY:
306 case OANYOF:
307 sp++;
308 break;
309 case OBACK_:
310 case O_BACK:
311 assert(nope);
312 break;
313 /* cases where length of match is hard to find */
314 case OQUEST_:
315 stp = stop;
316 for (;;) {
317 /* how long could this one be? */
318 rest = slow(m, sp, stp, ss, es);
319 assert(rest != NULL); /* it did match */
320 /* could the rest match the rest? */
321 tail = slow(m, rest, stop, es, stopst);
322 if (tail == stop)
323 break; /* yes! */
324 /* no -- try a shorter match for this one */
325 stp = rest - 1;
326 assert(stp >= sp); /* it did work */
327 }
328 ssub = ss + 1;
329 esub = es - 1;
330 /* did innards match? */
331 if (slow(m, sp, rest, ssub, esub) != NULL) {
332 dp = dissect(m, sp, rest, ssub, esub);
333 assert(dp == rest);
334 } else /* no */
335 assert(sp == rest);
336 sp = rest;
337 break;
338 case OPLUS_:
339 stp = stop;
340 for (;;) {
341 /* how long could this one be? */
342 rest = slow(m, sp, stp, ss, es);
343 assert(rest != NULL); /* it did match */
344 /* could the rest match the rest? */
345 tail = slow(m, rest, stop, es, stopst);
346 if (tail == stop)
347 break; /* yes! */
348 /* no -- try a shorter match for this one */
349 stp = rest - 1;
350 assert(stp >= sp); /* it did work */
351 }
352 ssub = ss + 1;
353 esub = es - 1;
354 ssp = sp;
355 oldssp = ssp;
356 for (;;) { /* find last match of innards */
357 sep = slow(m, ssp, rest, ssub, esub);
358 if (sep == NULL || sep == ssp)
359 break; /* failed or matched null */
360 oldssp = ssp; /* on to next try */
361 ssp = sep;
362 }
363 if (sep == NULL) {
364 /* last successful match */
365 sep = ssp;
366 ssp = oldssp;
367 }
368 assert(sep == rest); /* must exhaust substring */
369 assert(slow(m, ssp, sep, ssub, esub) == rest);
370 dp = dissect(m, ssp, sep, ssub, esub);
371 assert(dp == sep);
372 sp = rest;
373 break;
374 case OCH_:
375 stp = stop;
376 for (;;) {
377 /* how long could this one be? */
378 rest = slow(m, sp, stp, ss, es);
379 assert(rest != NULL); /* it did match */
380 /* could the rest match the rest? */
381 tail = slow(m, rest, stop, es, stopst);
382 if (tail == stop)
383 break; /* yes! */
384 /* no -- try a shorter match for this one */
385 stp = rest - 1;
386 assert(stp >= sp); /* it did work */
387 }
388 ssub = ss + 1;
389 esub = ss + OPND(m->g->strip[ss]) - 1;
390 assert(OP(m->g->strip[esub]) == OOR1);
391 for (;;) { /* find first matching branch */
392 if (slow(m, sp, rest, ssub, esub) == rest)
393 break; /* it matched all of it */
394 /* that one missed, try next one */
395 assert(OP(m->g->strip[esub]) == OOR1);
396 esub++;
397 assert(OP(m->g->strip[esub]) == OOR2);
398 ssub = esub + 1;
399 esub += OPND(m->g->strip[esub]);
400 if (OP(m->g->strip[esub]) == OOR2)
401 esub--;
402 else
403 assert(OP(m->g->strip[esub]) == O_CH);
404 }
405 dp = dissect(m, sp, rest, ssub, esub);
406 assert(dp == rest);
407 sp = rest;
408 break;
409 case O_PLUS:
410 case O_QUEST:
411 case OOR1:
412 case OOR2:
413 case O_CH:
414 assert(nope);
415 break;
416 case OLPAREN:
417 i = OPND(m->g->strip[ss]);
418 assert(0 < i && (size_t) i <= m->g->nsub);
419 m->pmatch[i].rm_so = (regoff_t)(sp - m->offp);
420 break;
421 case ORPAREN:
422 i = OPND(m->g->strip[ss]);
423 assert(0 < i && (size_t) i <= m->g->nsub);
424 m->pmatch[i].rm_eo = (regoff_t)(sp - m->offp);
425 break;
426 default: /* uh oh */
427 assert(nope);
428 break;
429 }
430 }
431
432 assert(sp == stop);
433 return(sp);
434 }
435
436 /*
437 - backref - figure out what matched what, figuring in back references
438 */
439 static char * /* == stop (success) or NULL (failure) */
backref(m,start,stop,startst,stopst,lev)440 backref(m, start, stop, startst, stopst, lev)
441 struct match *m;
442 char *start;
443 char *stop;
444 sopno startst;
445 sopno stopst;
446 sopno lev; /* PLUS nesting level */
447 {
448 int i;
449 sopno ss; /* start sop of current subRE */
450 char *sp; /* start of string matched by it */
451 sopno ssub; /* start sop of subsubRE */
452 sopno esub; /* end sop of subsubRE */
453 char *ssp; /* start of string matched by subsubRE */
454 char *dp;
455 size_t len;
456 int hard;
457 sop s;
458 regoff_t offsave;
459 cset *cs;
460
461 AT("back", start, stop, startst, stopst);
462 sp = start;
463
464 /* get as far as we can with easy stuff */
465 hard = 0;
466 for (ss = startst; !hard && ss < stopst; ss++)
467 switch (OP(s = m->g->strip[ss])) {
468 case OCHAR:
469 if (sp == stop || *sp++ != (char)OPND(s))
470 return(NULL);
471 break;
472 case OANY:
473 if (sp == stop)
474 return(NULL);
475 sp++;
476 break;
477 case OANYOF:
478 cs = &m->g->sets[OPND(s)];
479 if (sp == stop || !CHIN(cs, *sp++))
480 return(NULL);
481 break;
482 case OBOL:
483 if ( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
484 (sp < m->endp && *(sp-1) == '\n' &&
485 (m->g->cflags®_NEWLINE)) )
486 { /* yes */ }
487 else
488 return(NULL);
489 break;
490 case OEOL:
491 if ( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
492 (sp < m->endp && *sp == '\n' &&
493 (m->g->cflags®_NEWLINE)) )
494 { /* yes */ }
495 else
496 return(NULL);
497 break;
498 case OBOW:
499 if (( (sp == m->beginp && !(m->eflags®_NOTBOL)) ||
500 (sp < m->endp && *(sp-1) == '\n' &&
501 (m->g->cflags®_NEWLINE)) ||
502 (sp > m->beginp &&
503 !ISWORD(*(sp-1))) ) &&
504 (sp < m->endp && ISWORD(*sp)) )
505 { /* yes */ }
506 else
507 return(NULL);
508 break;
509 case OEOW:
510 if (( (sp == m->endp && !(m->eflags®_NOTEOL)) ||
511 (sp < m->endp && *sp == '\n' &&
512 (m->g->cflags®_NEWLINE)) ||
513 (sp < m->endp && !ISWORD(*sp)) ) &&
514 (sp > m->beginp && ISWORD(*(sp-1))) )
515 { /* yes */ }
516 else
517 return(NULL);
518 break;
519 case O_QUEST:
520 break;
521 case OOR1: /* matches null but needs to skip */
522 ss++;
523 s = m->g->strip[ss];
524 do {
525 assert(OP(s) == OOR2);
526 ss += OPND(s);
527 } while (OP(s = m->g->strip[ss]) != O_CH);
528 /* note that the ss++ gets us past the O_CH */
529 break;
530 default: /* have to make a choice */
531 hard = 1;
532 break;
533 }
534 if (!hard) { /* that was it! */
535 if (sp != stop)
536 return(NULL);
537 return(sp);
538 }
539 ss--; /* adjust for the for's final increment */
540
541 /* the hard stuff */
542 AT("hard", sp, stop, ss, stopst);
543 s = m->g->strip[ss];
544 switch (OP(s)) {
545 case OBACK_: /* the vilest depths */
546 i = OPND(s);
547 assert(0 < i && (size_t) i <= m->g->nsub);
548 if (m->pmatch[i].rm_eo == -1)
549 return(NULL);
550 assert(m->pmatch[i].rm_so != -1);
551 len = (size_t)(m->pmatch[i].rm_eo - m->pmatch[i].rm_so);
552 assert((size_t) (stop - m->beginp) >= len);
553 if (sp > stop - len)
554 return(NULL); /* not enough left to match */
555 ssp = m->offp + m->pmatch[i].rm_so;
556 if (memcmp(sp, ssp, len) != 0)
557 return(NULL);
558 while (m->g->strip[ss] != SOP(O_BACK, i))
559 ss++;
560 return(backref(m, sp+len, stop, ss+1, stopst, lev));
561 break;
562 case OQUEST_: /* to null or not */
563 dp = backref(m, sp, stop, ss+1, stopst, lev);
564 if (dp != NULL)
565 return(dp); /* not */
566 return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev));
567 break;
568 case OPLUS_:
569 assert(m->lastpos != NULL);
570 assert(lev+1 <= m->g->nplus);
571 m->lastpos[lev+1] = sp;
572 return(backref(m, sp, stop, ss+1, stopst, lev+1));
573 break;
574 case O_PLUS:
575 if (sp == m->lastpos[lev]) /* last pass matched null */
576 return(backref(m, sp, stop, ss+1, stopst, lev-1));
577 /* try another pass */
578 m->lastpos[lev] = sp;
579 dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev);
580 if (dp == NULL)
581 return(backref(m, sp, stop, ss+1, stopst, lev-1));
582 else
583 return(dp);
584 break;
585 case OCH_: /* find the right one, if any */
586 ssub = ss + 1;
587 esub = ss + OPND(s) - 1;
588 assert(OP(m->g->strip[esub]) == OOR1);
589 for (;;) { /* find first matching branch */
590 dp = backref(m, sp, stop, ssub, esub, lev);
591 if (dp != NULL)
592 return(dp);
593 /* that one missed, try next one */
594 if (OP(m->g->strip[esub]) == O_CH)
595 return(NULL); /* there is none */
596 esub++;
597 assert(OP(m->g->strip[esub]) == OOR2);
598 ssub = esub + 1;
599 esub += OPND(m->g->strip[esub]);
600 if (OP(m->g->strip[esub]) == OOR2)
601 esub--;
602 else
603 assert(OP(m->g->strip[esub]) == O_CH);
604 }
605 break;
606 case OLPAREN: /* must undo assignment if rest fails */
607 i = OPND(s);
608 assert(0 < i && (size_t) i <= m->g->nsub);
609 offsave = m->pmatch[i].rm_so;
610 m->pmatch[i].rm_so = (regoff_t)(sp - m->offp);
611 dp = backref(m, sp, stop, ss+1, stopst, lev);
612 if (dp != NULL)
613 return(dp);
614 m->pmatch[i].rm_so = offsave;
615 return(NULL);
616 break;
617 case ORPAREN: /* must undo assignment if rest fails */
618 i = OPND(s);
619 assert(0 < i && (size_t) i <= m->g->nsub);
620 offsave = m->pmatch[i].rm_eo;
621 m->pmatch[i].rm_eo = (regoff_t)(sp - m->offp);
622 dp = backref(m, sp, stop, ss+1, stopst, lev);
623 if (dp != NULL)
624 return(dp);
625 m->pmatch[i].rm_eo = offsave;
626 return(NULL);
627 break;
628 default: /* uh oh */
629 assert(nope);
630 break;
631 }
632
633 /* "can't happen" */
634 assert(nope);
635 /* NOTREACHED */
636 return((char *)NULL); /* dummy */
637 }
638
639 /*
640 - fast - step through the string at top speed
641 */
642 static char * /* where tentative match ended, or NULL */
fast(m,start,stop,startst,stopst)643 fast(m, start, stop, startst, stopst)
644 struct match *m;
645 char *start;
646 char *stop;
647 sopno startst;
648 sopno stopst;
649 {
650 states st = m->st;
651 states fresh = m->fresh;
652 states tmp = m->tmp;
653 char *p = start;
654 int c = (start == m->beginp) ? OUT : *(start-1);
655 int lastc; /* previous c */
656 int flagch;
657 int i;
658 char *coldp; /* last p after which no match was underway */
659
660 CLEAR(st);
661 SET1(st, startst);
662 st = step(m->g, startst, stopst, st, NOTHING, st);
663 ASSIGN(fresh, st);
664 SP("start", st, *p);
665 coldp = NULL;
666 for (;;) {
667 /* next character */
668 lastc = c;
669 c = (p == m->endp) ? OUT : *p;
670 if (EQ(st, fresh))
671 coldp = p;
672
673 /* is there an EOL and/or BOL between lastc and c? */
674 flagch = '\0';
675 i = 0;
676 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
677 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
678 flagch = BOL;
679 i = m->g->nbol;
680 }
681 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
682 (c == OUT && !(m->eflags®_NOTEOL)) ) {
683 flagch = (flagch == BOL) ? BOLEOL : EOL;
684 i += m->g->neol;
685 }
686 if (i != 0) {
687 for (; i > 0; i--)
688 st = step(m->g, startst, stopst, st, flagch, st);
689 SP("boleol", st, c);
690 }
691
692 /* how about a word boundary? */
693 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
694 (c != OUT && ISWORD(c)) ) {
695 flagch = BOW;
696 }
697 if ( (lastc != OUT && ISWORD(lastc)) &&
698 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
699 flagch = EOW;
700 }
701 if (flagch == BOW || flagch == EOW) {
702 st = step(m->g, startst, stopst, st, flagch, st);
703 SP("boweow", st, c);
704 }
705
706 /* are we done? */
707 if (ISSET(st, stopst) || p == stop)
708 break; /* NOTE BREAK OUT */
709
710 /* no, we must deal with this character */
711 ASSIGN(tmp, st);
712 ASSIGN(st, fresh);
713 assert(c != OUT);
714 st = step(m->g, startst, stopst, tmp, c, st);
715 SP("aft", st, c);
716 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
717 p++;
718 }
719
720 assert(coldp != NULL);
721 m->coldp = coldp;
722 if (ISSET(st, stopst))
723 return(p+1);
724 else
725 return(NULL);
726 }
727
728 /*
729 - slow - step through the string more deliberately
730 */
731 static char * /* where it ended */
slow(m,start,stop,startst,stopst)732 slow(m, start, stop, startst, stopst)
733 struct match *m;
734 char *start;
735 char *stop;
736 sopno startst;
737 sopno stopst;
738 {
739 states st = m->st;
740 states empty = m->empty;
741 char *p = start;
742 int c = (start == m->beginp) ? OUT : *(start-1);
743 int lastc; /* previous c */
744 int flagch;
745 int i;
746 char *matchp; /* last p at which a match ended */
747
748 AT("slow", start, stop, startst, stopst);
749 CLEAR(st);
750 SET1(st, startst);
751 SP("sstart", st, *p);
752 st = step(m->g, startst, stopst, st, NOTHING, st);
753 matchp = NULL;
754 for (;;) {
755 /* next character */
756 lastc = c;
757 c = (p == m->endp) ? OUT : *p;
758
759 /* is there an EOL and/or BOL between lastc and c? */
760 flagch = '\0';
761 i = 0;
762 if ( (lastc == '\n' && m->g->cflags®_NEWLINE) ||
763 (lastc == OUT && !(m->eflags®_NOTBOL)) ) {
764 flagch = BOL;
765 i = m->g->nbol;
766 }
767 if ( (c == '\n' && m->g->cflags®_NEWLINE) ||
768 (c == OUT && !(m->eflags®_NOTEOL)) ) {
769 flagch = (flagch == BOL) ? BOLEOL : EOL;
770 i += m->g->neol;
771 }
772 if (i != 0) {
773 for (; i > 0; i--)
774 st = step(m->g, startst, stopst, st, flagch, st);
775 SP("sboleol", st, c);
776 }
777
778 /* how about a word boundary? */
779 if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
780 (c != OUT && ISWORD(c)) ) {
781 flagch = BOW;
782 }
783 if ( (lastc != OUT && ISWORD(lastc)) &&
784 (flagch == EOL || (c != OUT && !ISWORD(c))) ) {
785 flagch = EOW;
786 }
787 if (flagch == BOW || flagch == EOW) {
788 st = step(m->g, startst, stopst, st, flagch, st);
789 SP("sboweow", st, c);
790 }
791
792 /* are we done? */
793 if (ISSET(st, stopst))
794 matchp = p;
795 if (EQ(st, empty) || p == stop)
796 break; /* NOTE BREAK OUT */
797
798 /* no, we must deal with this character */
799 assert(c != OUT);
800 st = step(m->g, startst, stopst, st, c, empty);
801 SP("saft", st, c);
802 assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
803 p++;
804 }
805
806 return(matchp);
807 }
808
809
810 /*
811 - step - map set of states reachable before char to set reachable after
812 */
813 static states
step(g,start,stop,bef,ch,aft)814 step(g, start, stop, bef, ch, aft)
815 struct re_guts *g;
816 sopno start; /* start state within strip */
817 sopno stop; /* state after stop state within strip */
818 states bef; /* states reachable before */
819 int ch; /* character or NONCHAR code */
820 states aft; /* states already known reachable after */
821 {
822 cset *cs;
823 sop s;
824 sopno pc;
825 onestate here; /* note, macros know this name */
826 sopno look;
827 long i;
828
829 for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
830 s = g->strip[pc];
831 switch (OP(s)) {
832 case OEND:
833 assert(pc == stop-1);
834 break;
835 case OCHAR:
836 /* only characters can match */
837 assert(!NONCHAR(ch) || ch != (char)OPND(s));
838 if (ch == (char)OPND(s))
839 FWD(aft, bef, 1);
840 break;
841 case OBOL:
842 if (ch == BOL || ch == BOLEOL)
843 FWD(aft, bef, 1);
844 break;
845 case OEOL:
846 if (ch == EOL || ch == BOLEOL)
847 FWD(aft, bef, 1);
848 break;
849 case OBOW:
850 if (ch == BOW)
851 FWD(aft, bef, 1);
852 break;
853 case OEOW:
854 if (ch == EOW)
855 FWD(aft, bef, 1);
856 break;
857 case OANY:
858 if (!NONCHAR(ch))
859 FWD(aft, bef, 1);
860 break;
861 case OANYOF:
862 cs = &g->sets[OPND(s)];
863 if (!NONCHAR(ch) && CHIN(cs, ch))
864 FWD(aft, bef, 1);
865 break;
866 case OBACK_: /* ignored here */
867 case O_BACK:
868 FWD(aft, aft, 1);
869 break;
870 case OPLUS_: /* forward, this is just an empty */
871 FWD(aft, aft, 1);
872 break;
873 case O_PLUS: /* both forward and back */
874 FWD(aft, aft, 1);
875 i = ISSETBACK(aft, OPND(s));
876 BACK(aft, aft, OPND(s));
877 if (!i && ISSETBACK(aft, OPND(s))) {
878 /* oho, must reconsider loop body */
879 pc -= OPND(s) + 1;
880 INIT(here, pc);
881 }
882 break;
883 case OQUEST_: /* two branches, both forward */
884 FWD(aft, aft, 1);
885 FWD(aft, aft, OPND(s));
886 break;
887 case O_QUEST: /* just an empty */
888 FWD(aft, aft, 1);
889 break;
890 case OLPAREN: /* not significant here */
891 case ORPAREN:
892 FWD(aft, aft, 1);
893 break;
894 case OCH_: /* mark the first two branches */
895 FWD(aft, aft, 1);
896 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
897 FWD(aft, aft, OPND(s));
898 break;
899 case OOR1: /* done a branch, find the O_CH */
900 if (ISSTATEIN(aft, here)) {
901 for (look = 1;
902 OP(s = g->strip[pc+look]) != O_CH;
903 look += OPND(s))
904 assert(OP(s) == OOR2);
905 FWD(aft, aft, look);
906 }
907 break;
908 case OOR2: /* propagate OCH_'s marking */
909 FWD(aft, aft, 1);
910 if (OP(g->strip[pc+OPND(s)]) != O_CH) {
911 assert(OP(g->strip[pc+OPND(s)]) == OOR2);
912 FWD(aft, aft, OPND(s));
913 }
914 break;
915 case O_CH: /* just empty */
916 FWD(aft, aft, 1);
917 break;
918 default: /* ooooops... */
919 assert(nope);
920 break;
921 }
922 }
923
924 return(aft);
925 }
926
927 #ifdef REDEBUG
928 /*
929 - print - print a set of states
930 */
931 static void
print(m,caption,st,ch,d)932 print(m, caption, st, ch, d)
933 struct match *m;
934 char *caption;
935 states st;
936 int ch;
937 FILE *d;
938 {
939 struct re_guts *g = m->g;
940 int i;
941 int first = 1;
942
943 if (!(m->eflags®_TRACE))
944 return;
945
946 fprintf(d, "%s", caption);
947 if (ch != '\0')
948 fprintf(d, " %s", pchar(ch));
949 for (i = 0; i < g->nstates; i++)
950 if (ISSET(st, i)) {
951 fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
952 first = 0;
953 }
954 fprintf(d, "\n");
955 }
956
957 /*
958 - at - print current situation
959 */
960 static void
at(m,title,start,stop,startst,stopst)961 at(m, title, start, stop, startst, stopst)
962 struct match *m;
963 char *title;
964 char *start;
965 char *stop;
966 sopno startst;
967 sopno stopst;
968 {
969 if (!(m->eflags®_TRACE))
970 return;
971
972 printf("%s %s-", title, pchar(*start));
973 printf("%s ", pchar(*stop));
974 printf("%ld-%ld\n", (long)startst, (long)stopst);
975 }
976
977 #ifndef PCHARDONE
978 #define PCHARDONE /* never again */
979 /*
980 - pchar - make a character printable
981 *
982 * Is this identical to regchar() over in debug.c? Well, yes. But a
983 * duplicate here avoids having a debugging-capable regexec.o tied to
984 * a matching debug.o, and this is convenient. It all disappears in
985 * the non-debug compilation anyway, so it doesn't matter much.
986 */
987 static char * /* -> representation */
pchar(ch)988 pchar(ch)
989 int ch;
990 {
991 static char pbuf[10];
992
993 if (isprint(ch) || ch == ' ')
994 sprintf(pbuf, "%c", ch);
995 else
996 sprintf(pbuf, "\\%o", ch);
997 return(pbuf);
998 }
999 #endif
1000 #endif
1001
1002 #undef matcher
1003 #undef fast
1004 #undef slow
1005 #undef dissect
1006 #undef backref
1007 #undef step
1008 #undef print
1009 #undef at
1010 #undef match
1011