• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*	$OpenBSD: engine.c,v 1.15 2005/08/05 13:03:00 espie Exp $	*/
2  
3  /*-
4   * Copyright (c) 1992, 1993, 1994 Henry Spencer.
5   * Copyright (c) 1992, 1993, 1994
6   *	The Regents of the University of California.  All rights reserved.
7   *
8   * This code is derived from software contributed to Berkeley by
9   * Henry Spencer.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions
13   * are met:
14   * 1. Redistributions of source code must retain the above copyright
15   *    notice, this list of conditions and the following disclaimer.
16   * 2. Redistributions in binary form must reproduce the above copyright
17   *    notice, this list of conditions and the following disclaimer in the
18   *    documentation and/or other materials provided with the distribution.
19   * 3. Neither the name of the University nor the names of its contributors
20   *    may be used to endorse or promote products derived from this software
21   *    without specific prior written permission.
22   *
23   * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
24   * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26   * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
27   * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28   * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29   * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32   * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33   * SUCH DAMAGE.
34   *
35   *	@(#)engine.c	8.5 (Berkeley) 3/20/94
36   */
37  
38  /*
39   * The matching engine and friends.  This file is #included by regexec.c
40   * after suitable #defines of a variety of macros used herein, so that
41   * different state representations can be used without duplicating masses
42   * of code.
43   */
44  
45  #ifdef SNAMES
46  #define	matcher	smatcher
47  #define	fast	sfast
48  #define	slow	sslow
49  #define	dissect	sdissect
50  #define	backref	sbackref
51  #define	step	sstep
52  #define	print	sprint
53  #define	at	sat
54  #define	match	smat
55  #define	nope	snope
56  #endif
57  #ifdef LNAMES
58  #define	matcher	lmatcher
59  #define	fast	lfast
60  #define	slow	lslow
61  #define	dissect	ldissect
62  #define	backref	lbackref
63  #define	step	lstep
64  #define	print	lprint
65  #define	at	lat
66  #define	match	lmat
67  #define	nope	lnope
68  #endif
69  
70  /* another structure passed up and down to avoid zillions of parameters */
71  struct match {
72  	struct re_guts *g;
73  	int eflags;
74  	regmatch_t *pmatch;	/* [nsub+1] (0 element unused) */
75  	char *offp;		/* offsets work from here */
76  	char *beginp;		/* start of string -- virtual NUL precedes */
77  	char *endp;		/* end of string -- virtual NUL here */
78  	char *coldp;		/* can be no match starting before here */
79  	char **lastpos;		/* [nplus+1] */
80  	STATEVARS;
81  	states st;		/* current states */
82  	states fresh;		/* states for a fresh start */
83  	states tmp;		/* temporary */
84  	states empty;		/* empty set of states */
85  };
86  
87  static int matcher(struct re_guts *, char *, size_t, regmatch_t[], int);
88  static char *dissect(struct match *, char *, char *, sopno, sopno);
89  static char *backref(struct match *, char *, char *, sopno, sopno, sopno, int);
90  static char *fast(struct match *, char *, char *, sopno, sopno);
91  static char *slow(struct match *, char *, char *, sopno, sopno);
92  static states step(struct re_guts *, sopno, sopno, states, int, states);
93  #define MAX_RECURSION	100
94  #define	BOL	(OUT+1)
95  #define	EOL	(BOL+1)
96  #define	BOLEOL	(BOL+2)
97  #define	NOTHING	(BOL+3)
98  #define	BOW	(BOL+4)
99  #define	EOW	(BOL+5)
100  #define	CODEMAX	(BOL+5)		/* highest code used */
101  #define	NONCHAR(c)	((c) > CHAR_MAX)
102  #define	NNONCHAR	(CODEMAX-CHAR_MAX)
103  #ifdef REDEBUG
104  static void print(struct match *, char *, states, int, FILE *);
105  #endif
106  #ifdef REDEBUG
107  static void at(struct match *, char *, char *, char *, sopno, sopno);
108  #endif
109  #ifdef REDEBUG
110  static char *pchar(int);
111  #endif
112  
113  #ifdef REDEBUG
114  #define	SP(t, s, c)	print(m, t, s, c, stdout)
115  #define	AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
116  #define	NOTE(str)	{ if (m->eflags&REG_TRACE) (void)printf("=%s\n", (str)); }
117  static int nope = 0;
118  #else
119  #define	SP(t, s, c)	/* nothing */
120  #define	AT(t, p1, p2, s1, s2)	/* nothing */
121  #define	NOTE(s)	/* nothing */
122  #endif
123  
124  /*
125   - matcher - the actual matching engine
126   */
127  static int			/* 0 success, REG_NOMATCH failure */
matcher(struct re_guts * g,char * string,size_t nmatch,regmatch_t pmatch[],int eflags)128  matcher(struct re_guts *g, char *string, size_t nmatch, regmatch_t pmatch[],
129      int eflags)
130  {
131  	char *endp;
132  	int i;
133  	struct match mv;
134  	struct match *m = &mv;
135  	char *dp;
136  	const sopno gf = g->firststate+1;	/* +1 for OEND */
137  	const sopno gl = g->laststate;
138  	char *start;
139  	char *stop;
140  
141  	/* simplify the situation where possible */
142  	if (g->cflags&REG_NOSUB)
143  		nmatch = 0;
144  	if (eflags&REG_STARTEND) {
145  		start = string + pmatch[0].rm_so;
146  		stop = string + pmatch[0].rm_eo;
147  	} else {
148  		start = string;
149  		stop = start + strlen(start);
150  	}
151  	if (stop < start)
152  		return(REG_INVARG);
153  
154  	/* prescreening; this does wonders for this rather slow code */
155  	if (g->must != NULL) {
156  		for (dp = start; dp < stop; dp++)
157  			if (*dp == g->must[0] && stop - dp >= g->mlen &&
158  				memcmp(dp, g->must, (size_t)g->mlen) == 0)
159  				break;
160  		if (dp == stop)		/* we didn't find g->must */
161  			return(REG_NOMATCH);
162  	}
163  
164  	/* match struct setup */
165  	m->g = g;
166  	m->eflags = eflags;
167  	m->pmatch = NULL;
168  	m->lastpos = NULL;
169  	m->offp = string;
170  	m->beginp = start;
171  	m->endp = (char*)stop;
172  	STATESETUP(m, 4);
173  	SETUP(m->st);
174  	SETUP(m->fresh);
175  	SETUP(m->tmp);
176  	SETUP(m->empty);
177  	CLEAR(m->empty);
178  
179  	/* this loop does only one repetition except for backrefs */
180  	for (;;) {
181  		endp = fast(m, start, stop, gf, gl);
182  		if (endp == NULL) {		/* a miss */
183  			free(m->pmatch);
184  			free(m->lastpos);
185  			STATETEARDOWN(m);
186  			return(REG_NOMATCH);
187  		}
188  		if (nmatch == 0 && !g->backrefs)
189  			break;		/* no further info needed */
190  
191  		/* where? */
192  		assert(m->coldp != NULL);
193  		for (;;) {
194  			NOTE("finding start");
195  			endp = slow(m, m->coldp, stop, gf, gl);
196  			if (endp != NULL)
197  				break;
198  			assert(m->coldp < m->endp);
199  			m->coldp++;
200  		}
201  		if (nmatch == 1 && !g->backrefs)
202  			break;		/* no further info needed */
203  
204  		/* oh my, he wants the subexpressions... */
205  		if (m->pmatch == NULL)
206  			m->pmatch = (regmatch_t *)malloc((m->g->nsub + 1) *
207  							sizeof(regmatch_t));
208  		if (m->pmatch == NULL) {
209  			STATETEARDOWN(m);
210  			return(REG_ESPACE);
211  		}
212  		for (i = 1; i <= (int)m->g->nsub; i++)
213  			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
214  		if (!g->backrefs && !(m->eflags&REG_BACKR)) {
215  			NOTE("dissecting");
216  			dp = dissect(m, m->coldp, endp, gf, gl);
217  		} else {
218  			if (g->nplus > 0 && m->lastpos == NULL)
219  				m->lastpos = (char **)malloc((g->nplus+1) *
220  							sizeof(char *));
221  			if (g->nplus > 0 && m->lastpos == NULL) {
222  				free(m->pmatch);
223  				STATETEARDOWN(m);
224  				return(REG_ESPACE);
225  			}
226  			NOTE("backref dissect");
227  			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
228  		}
229  		if (dp != NULL)
230  			break;
231  
232  		/* uh-oh... we couldn't find a subexpression-level match */
233  		assert(g->backrefs);	/* must be back references doing it */
234  		assert(g->nplus == 0 || m->lastpos != NULL);
235  		for (;;) {
236  			if (dp != NULL || endp <= m->coldp)
237  				break;		/* defeat */
238  			NOTE("backoff");
239  			endp = slow(m, m->coldp, endp-1, gf, gl);
240  			if (endp == NULL)
241  				break;		/* defeat */
242  			/* try it on a shorter possibility */
243  #ifndef NDEBUG
244  			for (i = 1; i <= m->g->nsub; i++) {
245  				assert(m->pmatch[i].rm_so == -1);
246  				assert(m->pmatch[i].rm_eo == -1);
247  			}
248  #endif
249  			NOTE("backoff dissect");
250  			dp = backref(m, m->coldp, endp, gf, gl, (sopno)0, 0);
251  		}
252  		assert(dp == NULL || dp == endp);
253  		if (dp != NULL)		/* found a shorter one */
254  			break;
255  
256  		/* despite initial appearances, there is no match here */
257  		NOTE("false alarm");
258  		if (m->coldp == stop)
259  			break;
260  		start = m->coldp + 1;	/* recycle starting later */
261  	}
262  
263  	/* fill in the details if requested */
264  	if (nmatch > 0) {
265  		pmatch[0].rm_so = m->coldp - m->offp;
266  		pmatch[0].rm_eo = endp - m->offp;
267  	}
268  	if (nmatch > 1) {
269  		assert(m->pmatch != NULL);
270  		for (i = 1; i < (ssize_t)nmatch; i++)
271  			if (i <= (int)m->g->nsub)
272  				pmatch[i] = m->pmatch[i];
273  			else {
274  				pmatch[i].rm_so = -1;
275  				pmatch[i].rm_eo = -1;
276  			}
277  	}
278  
279  	if (m->pmatch != NULL)
280  		free((char *)m->pmatch);
281  	if (m->lastpos != NULL)
282  		free((char *)m->lastpos);
283  	STATETEARDOWN(m);
284  	return(0);
285  }
286  
287  /*
288   - dissect - figure out what matched what, no back references
289   */
290  static char *			/* == stop (success) always */
dissect(struct match * m,char * start,char * stop,sopno startst,sopno stopst)291  dissect(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
292  {
293  	int i;
294  	sopno ss;	/* start sop of current subRE */
295  	sopno es;	/* end sop of current subRE */
296  	char *sp;	/* start of string matched by it */
297  	char *stp;	/* string matched by it cannot pass here */
298  	char *rest;	/* start of rest of string */
299  	char *tail;	/* string unmatched by rest of RE */
300  	sopno ssub;	/* start sop of subsubRE */
301  	sopno esub;	/* end sop of subsubRE */
302  	char *ssp;	/* start of string matched by subsubRE */
303  	char *sep;	/* end of string matched by subsubRE */
304  	char *oldssp;	/* previous ssp */
305  
306  	AT("diss", start, stop, startst, stopst);
307  	sp = start;
308  	for (ss = startst; ss < stopst; ss = es) {
309  		/* identify end of subRE */
310  		es = ss;
311  		switch (OP(m->g->strip[es])) {
312  		case OPLUS_:
313  		case OQUEST_:
314  			es += OPND(m->g->strip[es]);
315  			break;
316  		case OCH_:
317  			while (OP(m->g->strip[es]) != O_CH)
318  				es += OPND(m->g->strip[es]);
319  			break;
320  		}
321  		es++;
322  
323  		/* figure out what it matched */
324  		switch (OP(m->g->strip[ss])) {
325  		case OEND:
326  			assert(nope);
327  			break;
328  		case OCHAR:
329  			sp++;
330  			break;
331  		case OBOL:
332  		case OEOL:
333  		case OBOW:
334  		case OEOW:
335  			break;
336  		case OANY:
337  		case OANYOF:
338  			sp++;
339  			break;
340  		case OBACK_:
341  		case O_BACK:
342  			assert(nope);
343  			break;
344  		/* cases where length of match is hard to find */
345  		case OQUEST_:
346  			stp = stop;
347  			for (;;) {
348  				/* how long could this one be? */
349  				rest = slow(m, sp, stp, ss, es);
350  				assert(rest != NULL);	/* it did match */
351  				/* could the rest match the rest? */
352  				tail = slow(m, rest, stop, es, stopst);
353  				if (tail == stop)
354  					break;		/* yes! */
355  				/* no -- try a shorter match for this one */
356  				stp = rest - 1;
357  				assert(stp >= sp);	/* it did work */
358  			}
359  			ssub = ss + 1;
360  			esub = es - 1;
361  			/* did innards match? */
362  			if (slow(m, sp, rest, ssub, esub) != NULL) {
363  				if (dissect(m, sp, rest, ssub, esub) != rest) {
364  					assert(0 && "dissect(...) should return rest");
365  				}
366  			} else		/* no */
367  				assert(sp == rest);
368  			sp = rest;
369  			break;
370  		case OPLUS_:
371  			stp = stop;
372  			for (;;) {
373  				/* how long could this one be? */
374  				rest = slow(m, sp, stp, ss, es);
375  				assert(rest != NULL);	/* it did match */
376  				/* could the rest match the rest? */
377  				tail = slow(m, rest, stop, es, stopst);
378  				if (tail == stop)
379  					break;		/* yes! */
380  				/* no -- try a shorter match for this one */
381  				stp = rest - 1;
382  				assert(stp >= sp);	/* it did work */
383  			}
384  			ssub = ss + 1;
385  			esub = es - 1;
386  			ssp = sp;
387  			oldssp = ssp;
388  			for (;;) {	/* find last match of innards */
389  				sep = slow(m, ssp, rest, ssub, esub);
390  				if (sep == NULL || sep == ssp)
391  					break;	/* failed or matched null */
392  				oldssp = ssp;	/* on to next try */
393  				ssp = sep;
394  			}
395  			if (sep == NULL) {
396  				/* last successful match */
397  				sep = ssp;
398  				ssp = oldssp;
399  			}
400  			assert(sep == rest);	/* must exhaust substring */
401  			assert(slow(m, ssp, sep, ssub, esub) == rest);
402  			if (dissect(m, ssp, sep, ssub, esub) != sep) {
403  				assert(0 && "dissect(...) should return sep");
404  			}
405  			sp = rest;
406  			break;
407  		case OCH_:
408  			stp = stop;
409  			for (;;) {
410  				/* how long could this one be? */
411  				rest = slow(m, sp, stp, ss, es);
412  				assert(rest != NULL);	/* it did match */
413  				/* could the rest match the rest? */
414  				tail = slow(m, rest, stop, es, stopst);
415  				if (tail == stop)
416  					break;		/* yes! */
417  				/* no -- try a shorter match for this one */
418  				stp = rest - 1;
419  				assert(stp >= sp);	/* it did work */
420  			}
421  			ssub = ss + 1;
422  			esub = ss + OPND(m->g->strip[ss]) - 1;
423  			assert(OP(m->g->strip[esub]) == OOR1);
424  			for (;;) {	/* find first matching branch */
425  				if (slow(m, sp, rest, ssub, esub) == rest)
426  					break;	/* it matched all of it */
427  				/* that one missed, try next one */
428  				assert(OP(m->g->strip[esub]) == OOR1);
429  				esub++;
430  				assert(OP(m->g->strip[esub]) == OOR2);
431  				ssub = esub + 1;
432  				esub += OPND(m->g->strip[esub]);
433  				if (OP(m->g->strip[esub]) == OOR2)
434  					esub--;
435  				else
436  					assert(OP(m->g->strip[esub]) == O_CH);
437  			}
438  			if (dissect(m, sp, rest, ssub, esub) != rest) {
439  				assert(0 && "dissect(...) should return rest");
440  			}
441  			sp = rest;
442  			break;
443  		case O_PLUS:
444  		case O_QUEST:
445  		case OOR1:
446  		case OOR2:
447  		case O_CH:
448  			assert(nope);
449  			break;
450  		case OLPAREN:
451  			i = OPND(m->g->strip[ss]);
452  			assert(0 < i && i <= m->g->nsub);
453  			m->pmatch[i].rm_so = sp - m->offp;
454  			break;
455  		case ORPAREN:
456  			i = OPND(m->g->strip[ss]);
457  			assert(0 < i && i <= m->g->nsub);
458  			m->pmatch[i].rm_eo = sp - m->offp;
459  			break;
460  		default:		/* uh oh */
461  			assert(nope);
462  			break;
463  		}
464  	}
465  
466  	assert(sp == stop);
467  	return(sp);
468  }
469  
470  /*
471   - backref - figure out what matched what, figuring in back references
472   */
473  static char *			/* == stop (success) or NULL (failure) */
backref(struct match * m,char * start,char * stop,sopno startst,sopno stopst,sopno lev,int rec)474  backref(struct match *m, char *start, char *stop, sopno startst, sopno stopst,
475      sopno lev, int rec)			/* PLUS nesting level */
476  {
477  	int i;
478  	sopno ss;	/* start sop of current subRE */
479  	char *sp;	/* start of string matched by it */
480  	sopno ssub;	/* start sop of subsubRE */
481  	sopno esub;	/* end sop of subsubRE */
482  	char *ssp;	/* start of string matched by subsubRE */
483  	char *dp;
484  	size_t len;
485  	int hard;
486  	sop s;
487  	regoff_t offsave;
488  	cset *cs;
489  
490  	AT("back", start, stop, startst, stopst);
491  	sp = start;
492  
493  	/* get as far as we can with easy stuff */
494  	hard = 0;
495  	for (ss = startst; !hard && ss < stopst; ss++)
496  		switch (OP(s = m->g->strip[ss])) {
497  		case OCHAR:
498  			if (sp == stop || *sp++ != (char)OPND(s))
499  				return(NULL);
500  			break;
501  		case OANY:
502  			if (sp == stop)
503  				return(NULL);
504  			sp++;
505  			break;
506  		case OANYOF:
507  			cs = &m->g->sets[OPND(s)];
508  			if (sp == stop || !CHIN(cs, *sp++))
509  				return(NULL);
510  			break;
511  		case OBOL:
512  			if ( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
513  					(sp < m->endp && *(sp-1) == '\n' &&
514  						(m->g->cflags&REG_NEWLINE)) )
515  				{ /* yes */ }
516  			else
517  				return(NULL);
518  			break;
519  		case OEOL:
520  			if ( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
521  					(sp < m->endp && *sp == '\n' &&
522  						(m->g->cflags&REG_NEWLINE)) )
523  				{ /* yes */ }
524  			else
525  				return(NULL);
526  			break;
527  		case OBOW:
528  			if (( (sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
529  					(sp < m->endp && *(sp-1) == '\n' &&
530  						(m->g->cflags&REG_NEWLINE)) ||
531  					(sp > m->beginp &&
532  							!ISWORD(*(sp-1))) ) &&
533  					(sp < m->endp && ISWORD(*sp)) )
534  				{ /* yes */ }
535  			else
536  				return(NULL);
537  			break;
538  		case OEOW:
539  			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
540  					(sp < m->endp && *sp == '\n' &&
541  						(m->g->cflags&REG_NEWLINE)) ||
542  					(sp < m->endp && !ISWORD(*sp)) ) &&
543  					(sp > m->beginp && ISWORD(*(sp-1))) )
544  				{ /* yes */ }
545  			else
546  				return(NULL);
547  			break;
548  		case O_QUEST:
549  			break;
550  		case OOR1:	/* matches null but needs to skip */
551  			ss++;
552  			s = m->g->strip[ss];
553  			do {
554  				assert(OP(s) == OOR2);
555  				ss += OPND(s);
556  			} while (OP(s = m->g->strip[ss]) != O_CH);
557  			/* note that the ss++ gets us past the O_CH */
558  			break;
559  		default:	/* have to make a choice */
560  			hard = 1;
561  			break;
562  		}
563  	if (!hard) {		/* that was it! */
564  		if (sp != stop)
565  			return(NULL);
566  		return(sp);
567  	}
568  	ss--;			/* adjust for the for's final increment */
569  
570  	/* the hard stuff */
571  	AT("hard", sp, stop, ss, stopst);
572  	s = m->g->strip[ss];
573  	switch (OP(s)) {
574  	case OBACK_:		/* the vilest depths */
575  		i = OPND(s);
576  		assert(0 < i && i <= m->g->nsub);
577  		if (m->pmatch[i].rm_eo == -1)
578  			return(NULL);
579  		assert(m->pmatch[i].rm_so != -1);
580  		len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
581  		if (len == 0 && rec++ > MAX_RECURSION)
582  			return(NULL);
583  		assert(stop - m->beginp >= len);
584  		if (sp > stop - len)
585  			return(NULL);	/* not enough left to match */
586  		ssp = m->offp + m->pmatch[i].rm_so;
587  		if (memcmp(sp, ssp, len) != 0)
588  			return(NULL);
589  		while (m->g->strip[ss] != SOP(O_BACK, i))
590  			ss++;
591  		return(backref(m, sp+len, stop, ss+1, stopst, lev, rec));
592  		break;
593  	case OQUEST_:		/* to null or not */
594  		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
595  		if (dp != NULL)
596  			return(dp);	/* not */
597  		return(backref(m, sp, stop, ss+OPND(s)+1, stopst, lev, rec));
598  		break;
599  	case OPLUS_:
600  		assert(m->lastpos != NULL);
601  		assert(lev+1 <= m->g->nplus);
602  		m->lastpos[lev+1] = sp;
603  		return(backref(m, sp, stop, ss+1, stopst, lev+1, rec));
604  		break;
605  	case O_PLUS:
606  		if (sp == m->lastpos[lev])	/* last pass matched null */
607  			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
608  		/* try another pass */
609  		m->lastpos[lev] = sp;
610  		dp = backref(m, sp, stop, ss-OPND(s)+1, stopst, lev, rec);
611  		if (dp == NULL)
612  			return(backref(m, sp, stop, ss+1, stopst, lev-1, rec));
613  		else
614  			return(dp);
615  		break;
616  	case OCH_:		/* find the right one, if any */
617  		ssub = ss + 1;
618  		esub = ss + OPND(s) - 1;
619  		assert(OP(m->g->strip[esub]) == OOR1);
620  		for (;;) {	/* find first matching branch */
621  			dp = backref(m, sp, stop, ssub, esub, lev, rec);
622  			if (dp != NULL)
623  				return(dp);
624  			/* that one missed, try next one */
625  			if (OP(m->g->strip[esub]) == O_CH)
626  				return(NULL);	/* there is none */
627  			esub++;
628  			assert(OP(m->g->strip[esub]) == OOR2);
629  			ssub = esub + 1;
630  			esub += OPND(m->g->strip[esub]);
631  			if (OP(m->g->strip[esub]) == OOR2)
632  				esub--;
633  			else
634  				assert(OP(m->g->strip[esub]) == O_CH);
635  		}
636  		break;
637  	case OLPAREN:		/* must undo assignment if rest fails */
638  		i = OPND(s);
639  		assert(0 < i && i <= m->g->nsub);
640  		offsave = m->pmatch[i].rm_so;
641  		m->pmatch[i].rm_so = sp - m->offp;
642  		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
643  		if (dp != NULL)
644  			return(dp);
645  		m->pmatch[i].rm_so = offsave;
646  		return(NULL);
647  		break;
648  	case ORPAREN:		/* must undo assignment if rest fails */
649  		i = OPND(s);
650  		assert(0 < i && i <= m->g->nsub);
651  		offsave = m->pmatch[i].rm_eo;
652  		m->pmatch[i].rm_eo = sp - m->offp;
653  		dp = backref(m, sp, stop, ss+1, stopst, lev, rec);
654  		if (dp != NULL)
655  			return(dp);
656  		m->pmatch[i].rm_eo = offsave;
657  		return(NULL);
658  		break;
659  	default:		/* uh oh */
660  		assert(nope);
661  		break;
662  	}
663  
664  	/* "can't happen" */
665  	assert(nope);
666  	/* NOTREACHED */
667  	return 0;
668  }
669  
670  /*
671   - fast - step through the string at top speed
672   */
673  static char *			/* where tentative match ended, or NULL */
fast(struct match * m,char * start,char * stop,sopno startst,sopno stopst)674  fast(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
675  {
676  	states st = m->st;
677  	states fresh = m->fresh;
678  	states tmp = m->tmp;
679  	char *p = start;
680  	int c = (start == m->beginp) ? OUT : *(start-1);
681  	int lastc;	/* previous c */
682  	int flagch;
683  	int i;
684  	char *coldp;	/* last p after which no match was underway */
685  
686  	CLEAR(st);
687  	SET1(st, startst);
688  	st = step(m->g, startst, stopst, st, NOTHING, st);
689  	ASSIGN(fresh, st);
690  	SP("start", st, *p);
691  	coldp = NULL;
692  	for (;;) {
693  		/* next character */
694  		lastc = c;
695  		c = (p == m->endp) ? OUT : *p;
696  		if (EQ(st, fresh))
697  			coldp = p;
698  
699  		/* is there an EOL and/or BOL between lastc and c? */
700  		flagch = '\0';
701  		i = 0;
702  		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
703  				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
704  			flagch = BOL;
705  			i = m->g->nbol;
706  		}
707  		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
708  				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
709  			flagch = (flagch == BOL) ? BOLEOL : EOL;
710  			i += m->g->neol;
711  		}
712  		if (i != 0) {
713  			for (; i > 0; i--)
714  				st = step(m->g, startst, stopst, st, flagch, st);
715  			SP("boleol", st, c);
716  		}
717  
718  		/* how about a word boundary? */
719  		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
720  					(c != OUT && ISWORD(c)) ) {
721  			flagch = BOW;
722  		}
723  		if ( (lastc != OUT && ISWORD(lastc)) &&
724  				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
725  			flagch = EOW;
726  		}
727  		if (flagch == BOW || flagch == EOW) {
728  			st = step(m->g, startst, stopst, st, flagch, st);
729  			SP("boweow", st, c);
730  		}
731  
732  		/* are we done? */
733  		if (ISSET(st, stopst) || p == stop)
734  			break;		/* NOTE BREAK OUT */
735  
736  		/* no, we must deal with this character */
737  		ASSIGN(tmp, st);
738  		ASSIGN(st, fresh);
739  		assert(c != OUT);
740  		st = step(m->g, startst, stopst, tmp, c, st);
741  		SP("aft", st, c);
742  		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
743  		p++;
744  	}
745  
746  	assert(coldp != NULL);
747  	m->coldp = coldp;
748  	if (ISSET(st, stopst))
749  		return(p+1);
750  	else
751  		return(NULL);
752  }
753  
754  /*
755   - slow - step through the string more deliberately
756   */
757  static char *			/* where it ended */
slow(struct match * m,char * start,char * stop,sopno startst,sopno stopst)758  slow(struct match *m, char *start, char *stop, sopno startst, sopno stopst)
759  {
760  	states st = m->st;
761  	states empty = m->empty;
762  	states tmp = m->tmp;
763  	char *p = start;
764  	int c = (start == m->beginp) ? OUT : *(start-1);
765  	int lastc;	/* previous c */
766  	int flagch;
767  	int i;
768  	char *matchp;	/* last p at which a match ended */
769  
770  	AT("slow", start, stop, startst, stopst);
771  	CLEAR(st);
772  	SET1(st, startst);
773  	SP("sstart", st, *p);
774  	st = step(m->g, startst, stopst, st, NOTHING, st);
775  	matchp = NULL;
776  	for (;;) {
777  		/* next character */
778  		lastc = c;
779  		c = (p == m->endp) ? OUT : *p;
780  
781  		/* is there an EOL and/or BOL between lastc and c? */
782  		flagch = '\0';
783  		i = 0;
784  		if ( (lastc == '\n' && m->g->cflags&REG_NEWLINE) ||
785  				(lastc == OUT && !(m->eflags&REG_NOTBOL)) ) {
786  			flagch = BOL;
787  			i = m->g->nbol;
788  		}
789  		if ( (c == '\n' && m->g->cflags&REG_NEWLINE) ||
790  				(c == OUT && !(m->eflags&REG_NOTEOL)) ) {
791  			flagch = (flagch == BOL) ? BOLEOL : EOL;
792  			i += m->g->neol;
793  		}
794  		if (i != 0) {
795  			for (; i > 0; i--)
796  				st = step(m->g, startst, stopst, st, flagch, st);
797  			SP("sboleol", st, c);
798  		}
799  
800  		/* how about a word boundary? */
801  		if ( (flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
802  					(c != OUT && ISWORD(c)) ) {
803  			flagch = BOW;
804  		}
805  		if ( (lastc != OUT && ISWORD(lastc)) &&
806  				(flagch == EOL || (c != OUT && !ISWORD(c))) ) {
807  			flagch = EOW;
808  		}
809  		if (flagch == BOW || flagch == EOW) {
810  			st = step(m->g, startst, stopst, st, flagch, st);
811  			SP("sboweow", st, c);
812  		}
813  
814  		/* are we done? */
815  		if (ISSET(st, stopst))
816  			matchp = p;
817  		if (EQ(st, empty) || p == stop)
818  			break;		/* NOTE BREAK OUT */
819  
820  		/* no, we must deal with this character */
821  		ASSIGN(tmp, st);
822  		ASSIGN(st, empty);
823  		assert(c != OUT);
824  		st = step(m->g, startst, stopst, tmp, c, st);
825  		SP("saft", st, c);
826  		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
827  		p++;
828  	}
829  
830  	return(matchp);
831  }
832  
833  
834  /*
835   - step - map set of states reachable before char to set reachable after
836   */
837  static states
step(struct re_guts * g,sopno start,sopno stop,states bef,int ch,states aft)838  step(struct re_guts *g,
839      sopno start,		/* start state within strip */
840      sopno stop,			/* state after stop state within strip */
841      states bef,			/* states reachable before */
842      int ch,			/* character or NONCHAR code */
843      states aft)			/* states already known reachable after */
844  {
845  	cset *cs;
846  	sop s;
847  	sopno pc;
848  	onestate here;		/* note, macros know this name */
849  	sopno look;
850  	int i;
851  
852  	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here)) {
853  		s = g->strip[pc];
854  		switch (OP(s)) {
855  		case OEND:
856  			assert(pc == stop-1);
857  			break;
858  		case OCHAR:
859  			/* only characters can match */
860  			assert(!NONCHAR(ch) || ch != (char)OPND(s));
861  			if (ch == (char)OPND(s))
862  				FWD(aft, bef, 1);
863  			break;
864  		case OBOL:
865  			if (ch == BOL || ch == BOLEOL)
866  				FWD(aft, bef, 1);
867  			break;
868  		case OEOL:
869  			if (ch == EOL || ch == BOLEOL)
870  				FWD(aft, bef, 1);
871  			break;
872  		case OBOW:
873  			if (ch == BOW)
874  				FWD(aft, bef, 1);
875  			break;
876  		case OEOW:
877  			if (ch == EOW)
878  				FWD(aft, bef, 1);
879  			break;
880  		case OANY:
881  			if (!NONCHAR(ch))
882  				FWD(aft, bef, 1);
883  			break;
884  		case OANYOF:
885  			cs = &g->sets[OPND(s)];
886  			if (!NONCHAR(ch) && CHIN(cs, ch))
887  				FWD(aft, bef, 1);
888  			break;
889  		case OBACK_:		/* ignored here */
890  		case O_BACK:
891  			FWD(aft, aft, 1);
892  			break;
893  		case OPLUS_:		/* forward, this is just an empty */
894  			FWD(aft, aft, 1);
895  			break;
896  		case O_PLUS:		/* both forward and back */
897  			FWD(aft, aft, 1);
898  			i = ISSETBACK(aft, OPND(s));
899  			BACK(aft, aft, OPND(s));
900  			if (!i && ISSETBACK(aft, OPND(s))) {
901  				/* oho, must reconsider loop body */
902  				pc -= OPND(s) + 1;
903  				INIT(here, pc);
904  			}
905  			break;
906  		case OQUEST_:		/* two branches, both forward */
907  			FWD(aft, aft, 1);
908  			FWD(aft, aft, OPND(s));
909  			break;
910  		case O_QUEST:		/* just an empty */
911  			FWD(aft, aft, 1);
912  			break;
913  		case OLPAREN:		/* not significant here */
914  		case ORPAREN:
915  			FWD(aft, aft, 1);
916  			break;
917  		case OCH_:		/* mark the first two branches */
918  			FWD(aft, aft, 1);
919  			assert(OP(g->strip[pc+OPND(s)]) == OOR2);
920  			FWD(aft, aft, OPND(s));
921  			break;
922  		case OOR1:		/* done a branch, find the O_CH */
923  			if (ISSTATEIN(aft, here)) {
924  				for (look = 1;
925  						OP(s = g->strip[pc+look]) != O_CH;
926  						look += OPND(s))
927  					assert(OP(s) == OOR2);
928  				FWD(aft, aft, look);
929  			}
930  			break;
931  		case OOR2:		/* propagate OCH_'s marking */
932  			FWD(aft, aft, 1);
933  			if (OP(g->strip[pc+OPND(s)]) != O_CH) {
934  				assert(OP(g->strip[pc+OPND(s)]) == OOR2);
935  				FWD(aft, aft, OPND(s));
936  			}
937  			break;
938  		case O_CH:		/* just empty */
939  			FWD(aft, aft, 1);
940  			break;
941  		default:		/* ooooops... */
942  			assert(nope);
943  			break;
944  		}
945  	}
946  
947  	return(aft);
948  }
949  
950  #ifdef REDEBUG
951  /*
952   - print - print a set of states
953   */
954  static void
print(struct match * m,char * caption,states st,int ch,FILE * d)955  print(struct match *m, char *caption, states st, int ch, FILE *d)
956  {
957  	struct re_guts *g = m->g;
958  	int i;
959  	int first = 1;
960  
961  	if (!(m->eflags&REG_TRACE))
962  		return;
963  
964  	(void)fprintf(d, "%s", caption);
965  	if (ch != '\0')
966  		(void)fprintf(d, " %s", pchar(ch));
967  	for (i = 0; i < g->nstates; i++)
968  		if (ISSET(st, i)) {
969  			(void)fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
970  			first = 0;
971  		}
972  	(void)fprintf(d, "\n");
973  }
974  
975  /*
976   - at - print current situation
977   */
978  static void
at(struct match * m,char * title,char * start,char * stop,sopno startst,sopno stopst)979  at(struct match *m, char *title, char *start, char *stop, sopno startst,
980      sopno stopst)
981  {
982  	if (!(m->eflags&REG_TRACE))
983  		return;
984  
985  	(void)printf("%s %s-", title, pchar(*start));
986  	(void)printf("%s ", pchar(*stop));
987  	(void)printf("%ld-%ld\n", (long)startst, (long)stopst);
988  }
989  
990  #ifndef PCHARDONE
991  #define	PCHARDONE	/* never again */
992  /*
993   - pchar - make a character printable
994   *
995   * Is this identical to regchar() over in debug.c?  Well, yes.  But a
996   * duplicate here avoids having a debugging-capable regexec.o tied to
997   * a matching debug.o, and this is convenient.  It all disappears in
998   * the non-debug compilation anyway, so it doesn't matter much.
999   */
1000  static char *			/* -> representation */
pchar(int ch)1001  pchar(int ch)
1002  {
1003  	static char pbuf[10];
1004  
1005  	if (isprint(ch) || ch == ' ')
1006  		(void)_snprintf(pbuf, sizeof pbuf, "%c", ch);
1007  	else
1008  		(void)_snprintf(pbuf, sizeof pbuf, "\\%o", ch);
1009  	return(pbuf);
1010  }
1011  #endif
1012  #endif
1013  
1014  #undef	matcher
1015  #undef	fast
1016  #undef	slow
1017  #undef	dissect
1018  #undef	backref
1019  #undef	step
1020  #undef	print
1021  #undef	at
1022  #undef	match
1023  #undef	nope
1024