• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <assert.h>
2 #include <sys/types.h>
3 #include <stdio.h>
4 #include <string.h>
5 #include <ctype.h>
6 #include <limits.h>
7 #include <stdlib.h>
8 
9 #include "regex.h"
10 #include "regex2.h"
11 
12 #ifdef _POSIX2_RE_DUP_MAX
13 #define	DUPMAX	_POSIX2_RE_DUP_MAX
14 #else
15 #define	DUPMAX	255
16 #endif
17 #define	INFINITY	(DUPMAX + 1)
18 
19 /*
20  * parse structure, passed up and down to avoid global variables and
21  * other clumsinesses
22  */
23 struct parse {
24 	char *next;		/* next character in RE */
25 	char *end;		/* end of string (-> NUL normally) */
26 	int error;		/* has an error been seen? */
27 	sop *strip;		/* malloced strip */
28 	sopno ssize;		/* malloced strip size (allocated) */
29 	sopno slen;		/* malloced strip length (used) */
30 	int ncsalloc;		/* number of csets allocated */
31 	struct re_guts *g;
32 #	define	NPAREN	10	/* we need to remember () 1-9 for back refs */
33 	sopno pbegin[NPAREN];	/* -> ( ([0] unused) */
34 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
35 };
36 
37 static char nuls[10];		/* place to point scanner in event of error */
38 
39 static void p_ere(struct parse *p, int stop);
40 static void p_ere_exp(struct parse *p);
41 static void p_str(struct parse *p);
42 static void p_bre(struct parse *p, int end1,
43 	int end2);
44 static int p_simp_re(struct parse *p, int starordinary);
45 static int p_count(struct parse *p);
46 static void p_bracket(struct parse *p);
47 static void p_b_term(struct parse *p, cset *cs);
48 static void p_b_cclass(struct parse *p, cset *cs);
49 static void p_b_eclass(struct parse *p, cset *cs);
50 static char p_b_symbol(struct parse *p);
51 static char p_b_coll_elem(struct parse *p, int endc);
52 static char othercase(int ch);
53 static void bothcases(struct parse *p, int ch);
54 static void ordinary(struct parse *p, int ch);
55 static void nonnewline(struct parse *p);
56 static void repeat(struct parse *p, sopno start, int from, int to);
57 static void seterr(struct parse *p, int e);
58 static cset *allocset(struct parse *p);
59 static void freeset(struct parse *p, cset *cs);
60 static int freezeset(struct parse *p, cset *cs);
61 static int firstch(struct parse *p, cset *cs);
62 static int nch(struct parse *p, cset *cs);
63 static void mcadd(struct parse *p, cset *cs,
64 	char *cp);
65 static int isinsets(struct re_guts *g, int c);
66 static int samesets(struct re_guts *g, int c1, int c2);
67 static void categorize(struct parse *p, struct re_guts *g);
68 static sopno dupl(struct parse *p, sopno start, sopno finish);
69 static void doemit(struct parse *p, sop op, sop opnd);
70 static void doinsert(struct parse *p, sop op, sopno opnd, sopno pos);
71 static void dofwd(struct parse *p, sopno pos, sop value);
72 static void enlarge(struct parse *p, sopno size);
73 static void stripsnug(struct parse *p, struct re_guts *g);
74 static void findmust(struct parse *p, struct re_guts *g);
75 static sopno pluscount(struct parse *p, struct re_guts *g);
76 
77 /*
78  * macros for use with parse structure
79  * BEWARE:  these know that the parse structure is named `p' !!!
80  */
81 #define	PEEK()	(*p->next)
82 #define	PEEK2()	(*(p->next+1))
83 #define	MORE()	(p->next < p->end)
84 #define	MORE2()	(p->next+1 < p->end)
85 #define	SEE(c)	(MORE() && PEEK() == (c))
86 #define	SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
87 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
88 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
89 #define	NEXT()	(p->next++)
90 #define	NEXT2()	(p->next += 2)
91 #define	NEXTn(n)	(p->next += (n))
92 #define	GETNEXT()	(*p->next++)
93 #define	REQUIRE(co, e)	((co) ? (void) 0 : seterr(p, (e)))
94 #define	MUSTEAT(c, e)	(REQUIRE(MORE() && GETNEXT() == (c), e))
95 #define	EMIT(op, sopnd)	doemit(p, (sop)(op), (sop)(sopnd))
96 #define	INSERT(op, pos)	doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
97 #define	AHEAD(pos)		dofwd(p, pos, HERE()-(pos))
98 #define	ASTERN(sop, pos)	EMIT(sop, HERE()-pos)
99 #define	HERE()		(p->slen)
100 #define	THERE()		(p->slen - 1)
101 #define	THERETHERE()	(p->slen - 2)
102 #define	DROP(n)	(p->slen -= (n))
103 
104 #ifndef NDEBUG
105 static int never = 0;		/* for use in asserts; shuts lint up */
106 #else
107 #define	never	0		/* some <assert.h>s have bugs too */
108 #endif
109 
110 int				/* 0 success, otherwise REG_something */
regcomp(preg,pattern,cflags)111 regcomp(preg, pattern, cflags)
112 regex_t *preg;
113 const char *pattern;
114 int cflags;
115 {
116 	struct parse pa;
117 	struct re_guts *g;
118 	struct parse *p = &pa;
119 	unsigned int i;
120 	size_t len;
121 
122 	if ((cflags&REG_EXTENDED) && (cflags&REG_NOSPEC))
123 		return(REG_INVARG);
124 
125 	if (cflags&REG_PEND) {
126 		if (preg->re_endp < pattern)
127 			return(REG_INVARG);
128 		len = (size_t) (preg->re_endp - pattern);
129 	} else
130 		len = strlen((char *)pattern);
131 
132 	/* do the mallocs early so failure handling is easy */
133 	g = (struct re_guts *)malloc(sizeof(struct re_guts) +
134 							(NC-1)*sizeof(cat_t));
135 	if (g == NULL)
136 		return(REG_ESPACE);
137 	{
138 	  /* Patched for CERT Vulnerability Note VU#695940, Feb 2015. */
139 	  size_t new_ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
140 	  if (new_ssize < len || new_ssize > LONG_MAX / sizeof(sop)) {
141 	    free((char *) g);
142 	    return REG_INVARG;
143 	  }
144 	  p->ssize = (sopno)new_ssize;
145 	}
146 	p->strip = (sop *)malloc((size_t)p->ssize * sizeof(sop));
147 	p->slen = 0;
148 	if (p->strip == NULL) {
149 		free((char *)g);
150 		return(REG_ESPACE);
151 	}
152 
153 	/* set things up */
154 	p->g = g;
155 	p->next = (char *)pattern;	/* convenience; we do not modify it */
156 	p->end = p->next + len;
157 	p->error = 0;
158 	p->ncsalloc = 0;
159 	for (i = 0; i < NPAREN; i++) {
160 		p->pbegin[i] = 0;
161 		p->pend[i] = 0;
162 	}
163 	g->csetsize = NC;
164 	g->sets = NULL;
165 	g->setbits = NULL;
166 	g->ncsets = 0;
167 	g->cflags = cflags;
168 	g->iflags = 0;
169 	g->nbol = 0;
170 	g->neol = 0;
171 	g->must = NULL;
172 	g->mlen = 0;
173 	g->nsub = 0;
174 	g->ncategories = 1;	/* category 0 is "everything else" */
175 	g->categories = &g->catspace[-(CHAR_MIN)];
176 	(void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
177 	g->backrefs = 0;
178 
179 	/* do it */
180 	EMIT(OEND, 0);
181 	g->firststate = THERE();
182 	if (cflags&REG_EXTENDED)
183 		p_ere(p, OUT);
184 	else if (cflags&REG_NOSPEC)
185 		p_str(p);
186 	else
187 		p_bre(p, OUT, OUT);
188 	EMIT(OEND, 0);
189 	g->laststate = THERE();
190 
191 	/* tidy up loose ends and fill things in */
192 	categorize(p, g);
193 	stripsnug(p, g);
194 	findmust(p, g);
195 	g->nplus = pluscount(p, g);
196 	g->magic = MAGIC2;
197 	preg->re_nsub = g->nsub;
198 	preg->re_g = g;
199 	preg->re_magic = MAGIC1;
200 #ifndef REDEBUG
201 	/* not debugging, so can't rely on the assert() in regexec() */
202 	if (g->iflags&BAD)
203 		seterr(p, REG_ASSERT);
204 #endif
205 
206 	/* win or lose, we're done */
207 	if (p->error != 0)	/* lose */
208 		regfree(preg);
209 	return(p->error);
210 }
211 
212 /*
213  - p_ere - ERE parser top level, concatenation and alternation
214  */
215 static void
p_ere(p,stop)216 p_ere(p, stop)
217 struct parse *p;
218 int stop;			/* character this ERE should end at */
219 {
220 	char c;
221 	sopno prevback;
222 	sopno prevfwd;
223 	sopno conc;
224 	int first = 1;		/* is this the first alternative? */
225 
226 	for (;;) {
227 		/* do a bunch of concatenated expressions */
228 		conc = HERE();
229 		while (MORE() && (c = PEEK()) != '|' && c != stop)
230 			p_ere_exp(p);
231 		REQUIRE(HERE() != conc, REG_EMPTY);	/* require nonempty */
232 
233 		if (!EAT('|'))
234 			break;		/* NOTE BREAK OUT */
235 
236 		if (first) {
237 			INSERT(OCH_, conc);	/* offset is wrong */
238 			prevfwd = conc;
239 			prevback = conc;
240 			first = 0;
241 		}
242 		ASTERN(OOR1, prevback);
243 		prevback = THERE();
244 		AHEAD(prevfwd);			/* fix previous offset */
245 		prevfwd = HERE();
246 		EMIT(OOR2, 0);			/* offset is very wrong */
247 	}
248 
249 	if (!first) {		/* tail-end fixups */
250 		AHEAD(prevfwd);
251 		ASTERN(O_CH, prevback);
252 	}
253 
254 	assert(!MORE() || SEE(stop));
255 }
256 
257 /*
258  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
259  */
260 static void
p_ere_exp(p)261 p_ere_exp(p)
262 struct parse *p;
263 {
264 	char c;
265 	sopno pos;
266 	int count;
267 	int count2;
268 	sopno subno;
269 	int wascaret = 0;
270 
271 	assert(MORE());		/* caller should have ensured this */
272 	c = GETNEXT();
273 
274 	pos = HERE();
275 	switch (c) {
276 	case '(':
277 		REQUIRE(MORE(), REG_EPAREN);
278 		p->g->nsub++;
279 		subno = (sopno)p->g->nsub;
280 		if (subno < NPAREN)
281 			p->pbegin[subno] = HERE();
282 		EMIT(OLPAREN, subno);
283 		if (!SEE(')'))
284 			p_ere(p, ')');
285 		if (subno < NPAREN) {
286 			p->pend[subno] = HERE();
287 			assert(p->pend[subno] != 0);
288 		}
289 		EMIT(ORPAREN, subno);
290 		MUSTEAT(')', REG_EPAREN);
291 		break;
292 #ifndef POSIX_MISTAKE
293 	case ')':		/* happens only if no current unmatched ( */
294 		/*
295 		 * You may ask, why the ifndef?  Because I didn't notice
296 		 * this until slightly too late for 1003.2, and none of the
297 		 * other 1003.2 regular-expression reviewers noticed it at
298 		 * all.  So an unmatched ) is legal POSIX, at least until
299 		 * we can get it fixed.
300 		 */
301 		seterr(p, REG_EPAREN);
302 		break;
303 #endif
304 	case '^':
305 		EMIT(OBOL, 0);
306 		p->g->iflags |= USEBOL;
307 		p->g->nbol++;
308 		wascaret = 1;
309 		break;
310 	case '$':
311 		EMIT(OEOL, 0);
312 		p->g->iflags |= USEEOL;
313 		p->g->neol++;
314 		break;
315 	case '|':
316 		seterr(p, REG_EMPTY);
317 		break;
318 	case '*':
319 	case '+':
320 	case '?':
321 		seterr(p, REG_BADRPT);
322 		break;
323 	case '.':
324 		if (p->g->cflags&REG_NEWLINE)
325 			nonnewline(p);
326 		else
327 			EMIT(OANY, 0);
328 		break;
329 	case '[':
330 		p_bracket(p);
331 		break;
332 	case '\\':
333 		REQUIRE(MORE(), REG_EESCAPE);
334 		c = GETNEXT();
335 		ordinary(p, c);
336 		break;
337 	case '{':		/* okay as ordinary except if digit follows */
338 		REQUIRE(!MORE() || !isdigit(PEEK()), REG_BADRPT);
339 		/* FALLTHROUGH */
340 	default:
341 		ordinary(p, c);
342 		break;
343 	}
344 
345 	if (!MORE())
346 		return;
347 	c = PEEK();
348 	/* we call { a repetition if followed by a digit */
349 	if (!( c == '*' || c == '+' || c == '?' ||
350 				(c == '{' && MORE2() && isdigit(PEEK2())) ))
351 		return;		/* no repetition, we're done */
352 	NEXT();
353 
354 	REQUIRE(!wascaret, REG_BADRPT);
355 	switch (c) {
356 	case '*':	/* implemented as +? */
357 		/* this case does not require the (y|) trick, noKLUDGE */
358 		INSERT(OPLUS_, pos);
359 		ASTERN(O_PLUS, pos);
360 		INSERT(OQUEST_, pos);
361 		ASTERN(O_QUEST, pos);
362 		break;
363 	case '+':
364 		INSERT(OPLUS_, pos);
365 		ASTERN(O_PLUS, pos);
366 		break;
367 	case '?':
368 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
369 		INSERT(OCH_, pos);		/* offset slightly wrong */
370 		ASTERN(OOR1, pos);		/* this one's right */
371 		AHEAD(pos);			/* fix the OCH_ */
372 		EMIT(OOR2, 0);			/* offset very wrong... */
373 		AHEAD(THERE());			/* ...so fix it */
374 		ASTERN(O_CH, THERETHERE());
375 		break;
376 	case '{':
377 		count = p_count(p);
378 		if (EAT(',')) {
379 			if (isdigit(PEEK())) {
380 				count2 = p_count(p);
381 				REQUIRE(count <= count2, REG_BADBR);
382 			} else		/* single number with comma */
383 				count2 = INFINITY;
384 		} else		/* just a single number */
385 			count2 = count;
386 		repeat(p, pos, count, count2);
387 		if (!EAT('}')) {	/* error heuristics */
388 			while (MORE() && PEEK() != '}')
389 				NEXT();
390 			REQUIRE(MORE(), REG_EBRACE);
391 			seterr(p, REG_BADBR);
392 		}
393 		break;
394 	}
395 
396 	if (!MORE())
397 		return;
398 	c = PEEK();
399 	if (!( c == '*' || c == '+' || c == '?' ||
400 				(c == '{' && MORE2() && isdigit(PEEK2())) ) )
401 		return;
402 	seterr(p, REG_BADRPT);
403 }
404 
405 /*
406  - p_str - string (no metacharacters) "parser"
407  */
408 static void
p_str(p)409 p_str(p)
410 struct parse *p;
411 {
412 	REQUIRE(MORE(), REG_EMPTY);
413 	while (MORE())
414 		ordinary(p, GETNEXT());
415 }
416 
417 /*
418  - p_bre - BRE parser top level, anchoring and concatenation
419  * Giving end1 as OUT essentially eliminates the end1/end2 check.
420  *
421  * This implementation is a bit of a kludge, in that a trailing $ is first
422  * taken as an ordinary character and then revised to be an anchor.  The
423  * only undesirable side effect is that '$' gets included as a character
424  * category in such cases.  This is fairly harmless; not worth fixing.
425  * The amount of lookahead needed to avoid this kludge is excessive.
426  */
427 static void
p_bre(p,end1,end2)428 p_bre(p, end1, end2)
429 struct parse *p;
430 int end1;		/* first terminating character */
431 int end2;		/* second terminating character */
432 {
433 	sopno start = HERE();
434 	int first = 1;			/* first subexpression? */
435 	int wasdollar = 0;
436 
437 	if (EAT('^')) {
438 		EMIT(OBOL, 0);
439 		p->g->iflags |= USEBOL;
440 		p->g->nbol++;
441 	}
442 	while (MORE() && !SEETWO(end1, end2)) {
443 		wasdollar = p_simp_re(p, first);
444 		first = 0;
445 	}
446 	if (wasdollar) {	/* oops, that was a trailing anchor */
447 		DROP(1);
448 		EMIT(OEOL, 0);
449 		p->g->iflags |= USEEOL;
450 		p->g->neol++;
451 	}
452 
453 	REQUIRE(HERE() != start, REG_EMPTY);	/* require nonempty */
454 }
455 
456 /*
457  - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
458  */
459 static int			/* was the simple RE an unbackslashed $? */
p_simp_re(p,starordinary)460 p_simp_re(p, starordinary)
461 struct parse *p;
462 int starordinary;		/* is a leading * an ordinary character? */
463 {
464 	int c;
465 	int count;
466 	int count2;
467 	sopno pos;
468 	int i;
469 	sopno subno;
470 #	define	BACKSL	(1<<CHAR_BIT)
471 
472 	pos = HERE();		/* repetition op, if any, covers from here */
473 
474 	assert(MORE());		/* caller should have ensured this */
475 	c = GETNEXT();
476 	if (c == '\\') {
477 		REQUIRE(MORE(), REG_EESCAPE);
478 		c = BACKSL | (unsigned char)GETNEXT();
479 	}
480 	switch (c) {
481 	case '.':
482 		if (p->g->cflags&REG_NEWLINE)
483 			nonnewline(p);
484 		else
485 			EMIT(OANY, 0);
486 		break;
487 	case '[':
488 		p_bracket(p);
489 		break;
490 	case BACKSL|'{':
491 		seterr(p, REG_BADRPT);
492 		break;
493 	case BACKSL|'(':
494 		p->g->nsub++;
495 		subno = (sopno)p->g->nsub;
496 		if (subno < NPAREN)
497 			p->pbegin[subno] = HERE();
498 		EMIT(OLPAREN, subno);
499 		/* the MORE here is an error heuristic */
500 		if (MORE() && !SEETWO('\\', ')'))
501 			p_bre(p, '\\', ')');
502 		if (subno < NPAREN) {
503 			p->pend[subno] = HERE();
504 			assert(p->pend[subno] != 0);
505 		}
506 		EMIT(ORPAREN, subno);
507 		REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
508 		break;
509 	case BACKSL|')':	/* should not get here -- must be user */
510 	case BACKSL|'}':
511 		seterr(p, REG_EPAREN);
512 		break;
513 	case BACKSL|'1':
514 	case BACKSL|'2':
515 	case BACKSL|'3':
516 	case BACKSL|'4':
517 	case BACKSL|'5':
518 	case BACKSL|'6':
519 	case BACKSL|'7':
520 	case BACKSL|'8':
521 	case BACKSL|'9':
522 		i = (c&~BACKSL) - '0';
523 		assert(i < NPAREN);
524 		if (p->pend[i] != 0) {
525 			assert((size_t) i <= p->g->nsub);
526 			EMIT(OBACK_, i);
527 			assert(p->pbegin[i] != 0);
528 			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
529 			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
530 			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
531 			EMIT(O_BACK, i);
532 		} else
533 			seterr(p, REG_ESUBREG);
534 		p->g->backrefs = 1;
535 		break;
536 	case '*':
537 		REQUIRE(starordinary, REG_BADRPT);
538 		/* FALLTHROUGH */
539 	default:
540 		ordinary(p, (char)c);	/* takes off BACKSL, if any */
541 		break;
542 	}
543 
544 	if (EAT('*')) {		/* implemented as +? */
545 		/* this case does not require the (y|) trick, noKLUDGE */
546 		INSERT(OPLUS_, pos);
547 		ASTERN(O_PLUS, pos);
548 		INSERT(OQUEST_, pos);
549 		ASTERN(O_QUEST, pos);
550 	} else if (EATTWO('\\', '{')) {
551 		count = p_count(p);
552 		if (EAT(',')) {
553 			if (MORE() && isdigit(PEEK())) {
554 				count2 = p_count(p);
555 				REQUIRE(count <= count2, REG_BADBR);
556 			} else		/* single number with comma */
557 				count2 = INFINITY;
558 		} else		/* just a single number */
559 			count2 = count;
560 		repeat(p, pos, count, count2);
561 		if (!EATTWO('\\', '}')) {	/* error heuristics */
562 			while (MORE() && !SEETWO('\\', '}'))
563 				NEXT();
564 			REQUIRE(MORE(), REG_EBRACE);
565 			seterr(p, REG_BADBR);
566 		}
567 	} else if (c == (unsigned char)'$')	/* $ (but not \$) ends it */
568 		return(1);
569 
570 	return(0);
571 }
572 
573 /*
574  - p_count - parse a repetition count
575  */
576 static int			/* the value */
p_count(p)577 p_count(p)
578 struct parse *p;
579 {
580 	int count = 0;
581 	int ndigits = 0;
582 
583 	while (MORE() && isdigit(PEEK()) && count <= DUPMAX) {
584 		count = count*10 + (GETNEXT() - '0');
585 		ndigits++;
586 	}
587 
588 	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
589 	return(count);
590 }
591 
592 /*
593  - p_bracket - parse a bracketed character list
594  *
595  * Note a significant property of this code:  if the allocset() did seterr,
596  * no set operations are done.
597  */
598 static void
p_bracket(p)599 p_bracket(p)
600 struct parse *p;
601 {
602 	cset *cs = allocset(p);
603 	int invert = 0;
604 
605 	/* Dept of Truly Sickening Special-Case Kludges */
606 	if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
607 		EMIT(OBOW, 0);
608 		NEXTn(6);
609 		return;
610 	}
611 	if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
612 		EMIT(OEOW, 0);
613 		NEXTn(6);
614 		return;
615 	}
616 
617 	if (EAT('^'))
618 		invert++;	/* make note to invert set at end */
619 	if (EAT(']'))
620 		CHadd(cs, ']');
621 	else if (EAT('-'))
622 		CHadd(cs, '-');
623 	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
624 		p_b_term(p, cs);
625 	if (EAT('-'))
626 		CHadd(cs, '-');
627 	MUSTEAT(']', REG_EBRACK);
628 
629 	if (p->error != 0)	/* don't mess things up further */
630 		return;
631 
632 	if (p->g->cflags&REG_ICASE) {
633 		int i;
634 		int ci;
635 
636 		for (i = p->g->csetsize - 1; i >= 0; i--)
637 			if (CHIN(cs, i) && isalpha(i)) {
638 				ci = othercase(i);
639 				if (ci != i)
640 					CHadd(cs, ci);
641 			}
642 		assert(cs->multis == NULL);	/* xxx */
643 #if 0
644 		if (cs->multis != NULL)
645 			mccase(p, cs);
646 #endif
647 	}
648 	if (invert) {
649 		int i;
650 
651 		for (i = p->g->csetsize - 1; i >= 0; i--)
652 			if (CHIN(cs, i))
653 				CHsub(cs, i);
654 			else
655 				CHadd(cs, i);
656 		if (p->g->cflags&REG_NEWLINE)
657 			CHsub(cs, '\n');
658 		assert(cs->multis == NULL);	/* xxx */
659 #if 0
660 		if (cs->multis != NULL)
661 			mcinvert(p, cs);
662 #endif
663 	}
664 
665 	assert(cs->multis == NULL);		/* xxx */
666 
667 	if (nch(p, cs) == 1) {		/* optimize singleton sets */
668 		ordinary(p, firstch(p, cs));
669 		freeset(p, cs);
670 	} else
671 		EMIT(OANYOF, freezeset(p, cs));
672 }
673 
674 /*
675  - p_b_term - parse one term of a bracketed character list
676  */
677 static void
p_b_term(p,cs)678 p_b_term(p, cs)
679 struct parse *p;
680 cset *cs;
681 {
682 	char c;
683 	char start, finish;
684 	int i;
685 
686 	/* classify what we've got */
687 	switch ((MORE()) ? PEEK() : '\0') {
688 	case '[':
689 		c = (MORE2()) ? PEEK2() : '\0';
690 		break;
691 	case '-':
692 		seterr(p, REG_ERANGE);
693 		return;			/* NOTE RETURN */
694 		break;
695 	default:
696 		c = '\0';
697 		break;
698 	}
699 
700 	switch (c) {
701 	case ':':		/* character class */
702 		NEXT2();
703 		REQUIRE(MORE(), REG_EBRACK);
704 		c = PEEK();
705 		REQUIRE(c != '-' && c != ']', REG_ECTYPE);
706 		p_b_cclass(p, cs);
707 		REQUIRE(MORE(), REG_EBRACK);
708 		REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
709 		break;
710 	case '=':		/* equivalence class */
711 		NEXT2();
712 		REQUIRE(MORE(), REG_EBRACK);
713 		c = PEEK();
714 		REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
715 		p_b_eclass(p, cs);
716 		REQUIRE(MORE(), REG_EBRACK);
717 		REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
718 		break;
719 	default:		/* symbol, ordinary character, or range */
720 /* xxx revision needed for multichar stuff */
721 		start = p_b_symbol(p);
722 		if (SEE('-') && MORE2() && PEEK2() != ']') {
723 			/* range */
724 			NEXT();
725 			if (EAT('-'))
726 				finish = '-';
727 			else
728 				finish = p_b_symbol(p);
729 		} else
730 			finish = start;
731 /* xxx what about signed chars here... */
732 		REQUIRE(start <= finish, REG_ERANGE);
733 		for (i = start; i <= finish; i++)
734 			CHadd(cs, i);
735 		break;
736 	}
737 }
738 
739 /* Character-class table. */
740 static struct cclass {
741 	char *name;
742 	char *chars;
743 	char *multis;
744 } cclasses[] = {
745 	{"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", ""},
746 	{"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
747 	{"blank", " \t", ""},
748 	{"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", ""},
749 	{"digit", "0123456789", ""},
750 	{"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
751 	{"lower", "abcdefghijklmnopqrstuvwxyz",	""},
752 	{"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", ""},
753 	{"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",	""},
754 	{"space", "\t\n\v\f\r ", ""},
755 	{"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",	""},
756 	{"xdigit", "0123456789ABCDEFabcdef", ""},
757 	{NULL, 0, ""}
758 };
759 
760 /*
761  - p_b_cclass - parse a character-class name and deal with it
762  */
763 static void
p_b_cclass(p,cs)764 p_b_cclass(p, cs)
765 struct parse *p;
766 cset *cs;
767 {
768 	char *sp = p->next;
769 	struct cclass *cp;
770 	size_t len;
771 	char *u;
772 	char c;
773 
774 	while (MORE() && isalpha(PEEK()))
775 		NEXT();
776 	len = (size_t)(p->next - sp);
777 	for (cp = cclasses; cp->name != NULL; cp++)
778 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
779 			break;
780 	if (cp->name == NULL) {
781 		/* oops, didn't find it */
782 		seterr(p, REG_ECTYPE);
783 		return;
784 	}
785 
786 	u = cp->chars;
787 	while ((c = *u++) != '\0')
788 		CHadd(cs, c);
789 	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
790 		MCadd(p, cs, u);
791 }
792 
793 /*
794  - p_b_eclass - parse an equivalence-class name and deal with it
795  *
796  * This implementation is incomplete. xxx
797  */
798 static void
p_b_eclass(p,cs)799 p_b_eclass(p, cs)
800 struct parse *p;
801 cset *cs;
802 {
803 	char c;
804 
805 	c = p_b_coll_elem(p, '=');
806 	CHadd(cs, c);
807 }
808 
809 /*
810  - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
811  */
812 static char			/* value of symbol */
p_b_symbol(p)813 p_b_symbol(p)
814 struct parse *p;
815 {
816 	char value;
817 
818 	REQUIRE(MORE(), REG_EBRACK);
819 	if (!EATTWO('[', '.'))
820 		return(GETNEXT());
821 
822 	/* collating symbol */
823 	value = p_b_coll_elem(p, '.');
824 	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
825 	return(value);
826 }
827 
828 /* character-name table */
829 static struct cname {
830 	char *name;
831 	char code;
832 } cnames[] = {
833 	{"NUL",	'\0'},
834 	{"SOH",	'\001'},
835 	{"STX",	'\002'},
836 	{"ETX",	'\003'},
837 	{"EOT",	'\004'},
838 	{"ENQ",	'\005'},
839 	{"ACK",	'\006'},
840 	{"BEL",	'\007'},
841 	{"alert",	'\007'},
842 	{"BS",		'\010'},
843 	{"backspace",	'\b'},
844 	{"HT",		'\011'},
845 	{"tab",		'\t'},
846 	{"LF",		'\012'},
847 	{"newline",	'\n'},
848 	{"VT",		'\013'},
849 	{"vertical-tab",	'\v'},
850 	{"FF",		'\014'},
851 	{"form-feed",	'\f'},
852 	{"CR",		'\015'},
853 	{"carriage-return",	'\r'},
854 	{"SO",	'\016'},
855 	{"SI",	'\017'},
856 	{"DLE",	'\020'},
857 	{"DC1",	'\021'},
858 	{"DC2",	'\022'},
859 	{"DC3",	'\023'},
860 	{"DC4",	'\024'},
861 	{"NAK",	'\025'},
862 	{"SYN",	'\026'},
863 	{"ETB",	'\027'},
864 	{"CAN",	'\030'},
865 	{"EM",	'\031'},
866 	{"SUB",	'\032'},
867 	{"ESC",	'\033'},
868 	{"IS4",	'\034'},
869 	{"FS",	'\034'},
870 	{"IS3",	'\035'},
871 	{"GS",	'\035'},
872 	{"IS2",	'\036'},
873 	{"RS",	'\036'},
874 	{"IS1",	'\037'},
875 	{"US",	'\037'},
876 	{"space",		' '},
877 	{"exclamation-mark",	'!'},
878 	{"quotation-mark",	'"'},
879 	{"number-sign",		'#'},
880 	{"dollar-sign",		'$'},
881 	{"percent-sign",		'%'},
882 	{"ampersand",		'&'},
883 	{"apostrophe",		'\''},
884 	{"left-parenthesis",	'('},
885 	{"right-parenthesis",	')'},
886 	{"asterisk",	'*'},
887 	{"plus-sign",	'+'},
888 	{"comma",	','},
889 	{"hyphen",	'-'},
890 	{"hyphen-minus",	'-'},
891 	{"period",	'.'},
892 	{"full-stop",	'.'},
893 	{"slash",	'/'},
894 	{"solidus",	'/'},
895 	{"zero",		'0'},
896 	{"one",		'1'},
897 	{"two",		'2'},
898 	{"three",	'3'},
899 	{"four",		'4'},
900 	{"five",		'5'},
901 	{"six",		'6'},
902 	{"seven",	'7'},
903 	{"eight",	'8'},
904 	{"nine",		'9'},
905 	{"colon",	':'},
906 	{"semicolon",	';'},
907 	{"less-than-sign",	'<'},
908 	{"equals-sign",		'='},
909 	{"greater-than-sign",	'>'},
910 	{"question-mark",	'?'},
911 	{"commercial-at",	'@'},
912 	{"left-square-bracket",	'['},
913 	{"backslash",		'\\'},
914 	{"reverse-solidus",	'\\'},
915 	{"right-square-bracket",	']'},
916 	{"circumflex",		'^'},
917 	{"circumflex-accent",	'^'},
918 	{"underscore",		'_'},
919 	{"low-line",		'_'},
920 	{"grave-accent",		'`'},
921 	{"left-brace",		'{'},
922 	{"left-curly-bracket",	'{'},
923 	{"vertical-line",	'|'},
924 	{"right-brace",		'}'},
925 	{"right-curly-bracket",	'}'},
926 	{"tilde",		'~'},
927 	{"DEL",	'\177'},
928 	{NULL,	0},
929 };
930 
931 /*
932  - p_b_coll_elem - parse a collating-element name and look it up
933  */
934 static char			/* value of collating element */
p_b_coll_elem(p,endc)935 p_b_coll_elem(p, endc)
936 struct parse *p;
937 int endc;			/* name ended by endc,']' */
938 {
939 	char *sp = p->next;
940 	struct cname *cp;
941 	size_t len;
942 
943 	while (MORE() && !SEETWO(endc, ']'))
944 		NEXT();
945 	if (!MORE()) {
946 		seterr(p, REG_EBRACK);
947 		return(0);
948 	}
949 	len = (size_t)(p->next - sp);
950 	for (cp = cnames; cp->name != NULL; cp++)
951 		if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
952 			return(cp->code);	/* known name */
953 	if (len == 1)
954 		return(*sp);	/* single character */
955 	seterr(p, REG_ECOLLATE);			/* neither */
956 	return(0);
957 }
958 
959 /*
960  - othercase - return the case counterpart of an alphabetic
961  */
962 static char			/* if no counterpart, return ch */
othercase(ch)963 othercase(ch)
964 int ch;
965 {
966 	assert(isalpha(ch));
967 	if (isupper(ch))
968 		return((char)tolower(ch));
969 	else if (islower(ch))
970 		return((char)toupper(ch));
971 	else			/* peculiar, but could happen */
972 		return((char)ch);
973 }
974 
975 /*
976  - bothcases - emit a dualcase version of a two-case character
977  *
978  * Boy, is this implementation ever a kludge...
979  */
980 static void
bothcases(p,ch)981 bothcases(p, ch)
982 struct parse *p;
983 int ch;
984 {
985 	char *oldnext = p->next;
986 	char *oldend = p->end;
987 	char bracket[3];
988 
989 	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
990 	p->next = bracket;
991 	p->end = bracket+2;
992 	bracket[0] = (char)ch;
993 	bracket[1] = ']';
994 	bracket[2] = '\0';
995 	p_bracket(p);
996 	assert(p->next == bracket+2);
997 	p->next = oldnext;
998 	p->end = oldend;
999 }
1000 
1001 /*
1002  - ordinary - emit an ordinary character
1003  */
1004 static void
ordinary(p,ch)1005 ordinary(p, ch)
1006 struct parse *p;
1007 int ch;
1008 {
1009 	cat_t *cap = p->g->categories;
1010 
1011 	if ((p->g->cflags&REG_ICASE) && isalpha(ch) && othercase(ch) != ch)
1012 		bothcases(p, ch);
1013 	else {
1014 		EMIT(OCHAR, (sopno)ch);
1015 		if (cap[ch] == 0)
1016 			cap[ch] = (cat_t)(p->g->ncategories++);
1017 	}
1018 }
1019 
1020 /*
1021  - nonnewline - emit REG_NEWLINE version of OANY
1022  *
1023  * Boy, is this implementation ever a kludge...
1024  */
1025 static void
nonnewline(p)1026 nonnewline(p)
1027 struct parse *p;
1028 {
1029 	char *oldnext = p->next;
1030 	char *oldend = p->end;
1031 	char bracket[4];
1032 
1033 	p->next = bracket;
1034 	p->end = bracket+3;
1035 	bracket[0] = '^';
1036 	bracket[1] = '\n';
1037 	bracket[2] = ']';
1038 	bracket[3] = '\0';
1039 	p_bracket(p);
1040 	assert(p->next == bracket+3);
1041 	p->next = oldnext;
1042 	p->end = oldend;
1043 }
1044 
1045 /*
1046  - repeat - generate code for a bounded repetition, recursively if needed
1047  */
1048 static void
repeat(p,start,from,to)1049 repeat(p, start, from, to)
1050 struct parse *p;
1051 sopno start;			/* operand from here to end of strip */
1052 int from;			/* repeated from this number */
1053 int to;				/* to this number of times (maybe INFINITY) */
1054 {
1055 	sopno finish = HERE();
1056 #	define	N	2
1057 #	define	INF	3
1058 #	define	REP(f, t)	((f)*8 + (t))
1059 #	define	MAP(n)	(((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
1060 	sopno copy;
1061 
1062 	if (p->error != 0)	/* head off possible runaway recursion */
1063 		return;
1064 
1065 	assert(from <= to);
1066 
1067 	switch (REP(MAP(from), MAP(to))) {
1068 	case REP(0, 0):			/* must be user doing this */
1069 		DROP(finish-start);	/* drop the operand */
1070 		break;
1071 	case REP(0, 1):			/* as x{1,1}? */
1072 	case REP(0, N):			/* as x{1,n}? */
1073 	case REP(0, INF):		/* as x{1,}? */
1074 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1075 		INSERT(OCH_, start);		/* offset is wrong... */
1076 		repeat(p, start+1, 1, to);
1077 		ASTERN(OOR1, start);
1078 		AHEAD(start);			/* ... fix it */
1079 		EMIT(OOR2, 0);
1080 		AHEAD(THERE());
1081 		ASTERN(O_CH, THERETHERE());
1082 		break;
1083 	case REP(1, 1):			/* trivial case */
1084 		/* done */
1085 		break;
1086 	case REP(1, N):			/* as x?x{1,n-1} */
1087 		/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
1088 		INSERT(OCH_, start);
1089 		ASTERN(OOR1, start);
1090 		AHEAD(start);
1091 		EMIT(OOR2, 0);			/* offset very wrong... */
1092 		AHEAD(THERE());			/* ...so fix it */
1093 		ASTERN(O_CH, THERETHERE());
1094 		copy = dupl(p, start+1, finish+1);
1095 		assert(copy == finish+4);
1096 		repeat(p, copy, 1, to-1);
1097 		break;
1098 	case REP(1, INF):		/* as x+ */
1099 		INSERT(OPLUS_, start);
1100 		ASTERN(O_PLUS, start);
1101 		break;
1102 	case REP(N, N):			/* as xx{m-1,n-1} */
1103 		copy = dupl(p, start, finish);
1104 		repeat(p, copy, from-1, to-1);
1105 		break;
1106 	case REP(N, INF):		/* as xx{n-1,INF} */
1107 		copy = dupl(p, start, finish);
1108 		repeat(p, copy, from-1, to);
1109 		break;
1110 	default:			/* "can't happen" */
1111 		seterr(p, REG_ASSERT);	/* just in case */
1112 		break;
1113 	}
1114 }
1115 
1116 /*
1117  - seterr - set an error condition
1118  */
1119 static void
seterr(p,e)1120 seterr(p, e)
1121 struct parse *p;
1122 int e;
1123 {
1124 	if (p->error == 0)	/* keep earliest error condition */
1125 		p->error = e;
1126 	p->next = nuls;		/* try to bring things to a halt */
1127 	p->end = nuls;
1128 }
1129 
1130 /*
1131  - allocset - allocate a set of characters for []
1132  */
1133 static cset *
allocset(p)1134 allocset(p)
1135 struct parse *p;
1136 {
1137 	int no = p->g->ncsets++;
1138 	int nc;
1139 	int nbytes;
1140 	cset *cs;
1141 	int css = p->g->csetsize;
1142 	int i;
1143 
1144 	if (no >= p->ncsalloc) {	/* need another column of space */
1145 		p->ncsalloc += CHAR_BIT;
1146 		nc = p->ncsalloc;
1147 		assert(nc % CHAR_BIT == 0);
1148 		nbytes = nc / CHAR_BIT * css;
1149 		if (p->g->sets == NULL)
1150 			p->g->sets = (cset *)malloc((size_t)nc * sizeof(cset));
1151 		else
1152 			p->g->sets = (cset *)realloc((char *)p->g->sets,
1153 							(size_t)nc * sizeof(cset));
1154 		if (p->g->setbits == NULL)
1155 			p->g->setbits = (uch *)malloc((size_t)nbytes);
1156 		else {
1157 			p->g->setbits = (uch *)realloc((char *)p->g->setbits,
1158 							(size_t)nbytes);
1159 			/* xxx this isn't right if setbits is now NULL */
1160 			for (i = 0; i < no; i++)
1161 				p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1162 		}
1163 		if (p->g->sets != NULL && p->g->setbits != NULL)
1164 			(void) memset((char *)p->g->setbits + (nbytes - css),
1165 							0, (size_t)css);
1166 		else {
1167 			no = 0;
1168 			seterr(p, REG_ESPACE);
1169 			/* caller's responsibility not to do set ops */
1170 		}
1171 	}
1172 
1173 	assert(p->g->sets != NULL);	/* xxx */
1174 	cs = &p->g->sets[no];
1175 	cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1176 	cs->mask = (uch) (1 << ((no) % CHAR_BIT));
1177 	cs->hash = 0;
1178 	cs->smultis = 0;
1179 	cs->multis = NULL;
1180 
1181 	return(cs);
1182 }
1183 
1184 /*
1185  - freeset - free a now-unused set
1186  */
1187 static void
freeset(p,cs)1188 freeset(p, cs)
1189 struct parse *p;
1190 cset *cs;
1191 {
1192 	size_t i;
1193 	cset *top = &p->g->sets[p->g->ncsets];
1194 	size_t css = (size_t)p->g->csetsize;
1195 
1196 	for (i = 0; i < css; i++)
1197 		CHsub(cs, i);
1198 	if (cs == top-1)	/* recover only the easy case */
1199 		p->g->ncsets--;
1200 }
1201 
1202 /*
1203  - freezeset - final processing on a set of characters
1204  *
1205  * The main task here is merging identical sets.  This is usually a waste
1206  * of time (although the hash code minimizes the overhead), but can win
1207  * big if REG_ICASE is being used.  REG_ICASE, by the way, is why the hash
1208  * is done using addition rather than xor -- all ASCII [aA] sets xor to
1209  * the same value!
1210  */
1211 static int			/* set number */
freezeset(p,cs)1212 freezeset(p, cs)
1213 struct parse *p;
1214 cset *cs;
1215 {
1216 	uch h = cs->hash;
1217 	size_t i;
1218 	cset *top = &p->g->sets[p->g->ncsets];
1219 	cset *cs2;
1220 	size_t css = (size_t)p->g->csetsize;
1221 
1222 	/* look for an earlier one which is the same */
1223 	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1224 		if (cs2->hash == h && cs2 != cs) {
1225 			/* maybe */
1226 			for (i = 0; i < css; i++)
1227 				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1228 					break;		/* no */
1229 			if (i == css)
1230 				break;			/* yes */
1231 		}
1232 
1233 	if (cs2 < top) {	/* found one */
1234 		freeset(p, cs);
1235 		cs = cs2;
1236 	}
1237 
1238 	return((int)(cs - p->g->sets));
1239 }
1240 
1241 /*
1242  - firstch - return first character in a set (which must have at least one)
1243  */
1244 static int			/* character; there is no "none" value */
firstch(p,cs)1245 firstch(p, cs)
1246 struct parse *p;
1247 cset *cs;
1248 {
1249 	size_t i;
1250 	size_t css = (size_t)p->g->csetsize;
1251 
1252 	for (i = 0; i < css; i++)
1253 		if (CHIN(cs, i))
1254 			return((char)i);
1255 	assert(never);
1256 	return(0);		/* arbitrary */
1257 }
1258 
1259 /*
1260  - nch - number of characters in a set
1261  */
1262 static int
nch(p,cs)1263 nch(p, cs)
1264 struct parse *p;
1265 cset *cs;
1266 {
1267 	size_t i;
1268 	size_t css = (size_t)p->g->csetsize;
1269 	int n = 0;
1270 
1271 	for (i = 0; i < css; i++)
1272 		if (CHIN(cs, i))
1273 			n++;
1274 	return(n);
1275 }
1276 
1277 /*
1278  - mcadd - add a collating element to a cset
1279  */
1280 static void
mcadd(p,cs,cp)1281 mcadd(p, cs, cp)
1282 struct parse *p;
1283 cset *cs;
1284 char *cp;
1285 {
1286 	size_t oldend = cs->smultis;
1287 
1288 	cs->smultis += strlen(cp) + 1;
1289 	if (cs->multis == NULL)
1290 		cs->multis = malloc(cs->smultis);
1291 	else
1292 		cs->multis = realloc(cs->multis, cs->smultis);
1293 	if (cs->multis == NULL) {
1294 		seterr(p, REG_ESPACE);
1295 		return;
1296 	}
1297 
1298 	(void) strcpy(cs->multis + oldend - 1, cp);
1299 	cs->multis[cs->smultis - 1] = '\0';
1300 }
1301 
1302 #if 0
1303 /*
1304  - mcsub - subtract a collating element from a cset
1305  */
1306 static void
1307 mcsub(cs, cp)
1308 cset *cs;
1309 char *cp;
1310 {
1311 	char *fp = mcfind(cs, cp);
1312 	size_t len = strlen(fp);
1313 
1314 	assert(fp != NULL);
1315 	(void) memmove(fp, fp + len + 1,
1316 				cs->smultis - (fp + len + 1 - cs->multis));
1317 	cs->smultis -= len;
1318 
1319 	if (cs->smultis == 0) {
1320 		free(cs->multis);
1321 		cs->multis = NULL;
1322 		return;
1323 	}
1324 
1325 	cs->multis = realloc(cs->multis, cs->smultis);
1326 	assert(cs->multis != NULL);
1327 }
1328 
1329 /*
1330  - mcin - is a collating element in a cset?
1331  */
1332 static int
1333 mcin(cs, cp)
1334 cset *cs;
1335 char *cp;
1336 {
1337 	return(mcfind(cs, cp) != NULL);
1338 }
1339 
1340 /*
1341  - mcfind - find a collating element in a cset
1342  */
1343 static char *
1344 mcfind(cs, cp)
1345 cset *cs;
1346 char *cp;
1347 {
1348 	char *p;
1349 
1350 	if (cs->multis == NULL)
1351 		return(NULL);
1352 	for (p = cs->multis; *p != '\0'; p += strlen(p) + 1)
1353 		if (strcmp(cp, p) == 0)
1354 			return(p);
1355 	return(NULL);
1356 }
1357 
1358 /*
1359  - mcinvert - invert the list of collating elements in a cset
1360  *
1361  * This would have to know the set of possibilities.  Implementation
1362  * is deferred.
1363  */
1364 static void
1365 mcinvert(p, cs)
1366 struct parse *p;
1367 cset *cs;
1368 {
1369 	assert(cs->multis == NULL);	/* xxx */
1370 }
1371 
1372 /*
1373  - mccase - add case counterparts of the list of collating elements in a cset
1374  *
1375  * This would have to know the set of possibilities.  Implementation
1376  * is deferred.
1377  */
1378 static void
1379 mccase(p, cs)
1380 struct parse *p;
1381 cset *cs;
1382 {
1383 	assert(cs->multis == NULL);	/* xxx */
1384 }
1385 #endif
1386 
1387 /*
1388  - isinsets - is this character in any sets?
1389  */
1390 static int			/* predicate */
isinsets(g,c)1391 isinsets(g, c)
1392 struct re_guts *g;
1393 int c;
1394 {
1395 	uch *col;
1396 	int i;
1397 	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1398 	unsigned uc = (unsigned char)c;
1399 
1400 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1401 		if (col[uc] != 0)
1402 			return(1);
1403 	return(0);
1404 }
1405 
1406 /*
1407  - samesets - are these two characters in exactly the same sets?
1408  */
1409 static int			/* predicate */
samesets(g,c1,c2)1410 samesets(g, c1, c2)
1411 struct re_guts *g;
1412 int c1;
1413 int c2;
1414 {
1415 	uch *col;
1416 	int i;
1417 	int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1418 	unsigned uc1 = (unsigned char)c1;
1419 	unsigned uc2 = (unsigned char)c2;
1420 
1421 	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1422 		if (col[uc1] != col[uc2])
1423 			return(0);
1424 	return(1);
1425 }
1426 
1427 /*
1428  - categorize - sort out character categories
1429  */
1430 static void
categorize(p,g)1431 categorize(p, g)
1432 struct parse *p;
1433 struct re_guts *g;
1434 {
1435 	cat_t *cats = g->categories;
1436 	int c;
1437 	int c2;
1438 	cat_t cat;
1439 
1440 	/* avoid making error situations worse */
1441 	if (p->error != 0)
1442 		return;
1443 
1444 	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1445 		if (cats[c] == 0 && isinsets(g, c)) {
1446 			cat = (cat_t)g->ncategories++;
1447 			cats[c] = cat;
1448 			for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1449 				if (cats[c2] == 0 && samesets(g, c, c2))
1450 					cats[c2] = cat;
1451 		}
1452 }
1453 
1454 /*
1455  - dupl - emit a duplicate of a bunch of sops
1456  */
1457 static sopno			/* start of duplicate */
dupl(p,start,finish)1458 dupl(p, start, finish)
1459 struct parse *p;
1460 sopno start;			/* from here */
1461 sopno finish;			/* to this less one */
1462 {
1463 	sopno ret = HERE();
1464 	sopno len = finish - start;
1465 
1466 	assert(finish >= start);
1467 	if (len == 0)
1468 		return(ret);
1469 	enlarge(p, p->ssize + len);	/* this many unexpected additions */
1470 	assert(p->ssize >= p->slen + len);
1471 	(void) memcpy((char *)(p->strip + p->slen),
1472 		(char *)(p->strip + start), (size_t)len*sizeof(sop));
1473 	p->slen += len;
1474 	return(ret);
1475 }
1476 
1477 /*
1478  - doemit - emit a strip operator
1479  *
1480  * It might seem better to implement this as a macro with a function as
1481  * hard-case backup, but it's just too big and messy unless there are
1482  * some changes to the data structures.  Maybe later.
1483  */
1484 static void
doemit(p,op,opnd)1485 doemit(p, op, opnd)
1486 struct parse *p;
1487 sop op;
1488 sop opnd;
1489 {
1490 	/* avoid making error situations worse */
1491 	if (p->error != 0)
1492 		return;
1493 
1494 	/* deal with oversize operands ("can't happen", more or less) */
1495 	assert(opnd < 1<<OPSHIFT);
1496 
1497 	/* deal with undersized strip */
1498 	if (p->slen >= p->ssize)
1499 		enlarge(p, (p->ssize+1) / 2 * 3);	/* +50% */
1500 	assert(p->slen < p->ssize);
1501 
1502 	/* finally, it's all reduced to the easy case */
1503 	p->strip[p->slen++] = SOP(op, opnd);
1504 }
1505 
1506 /*
1507  - doinsert - insert a sop into the strip
1508  */
1509 static void
doinsert(p,op,opnd,pos)1510 doinsert(p, op, opnd, pos)
1511 struct parse *p;
1512 sop op;
1513 sopno opnd;
1514 sopno pos;
1515 {
1516 	sopno sn;
1517 	sop s;
1518 	int i;
1519 
1520 	/* avoid making error situations worse */
1521 	if (p->error != 0)
1522 		return;
1523 
1524 	sn = HERE();
1525 	EMIT(op, opnd);		/* do checks, ensure space */
1526 	assert(HERE() == sn+1);
1527 	s = p->strip[sn];
1528 
1529 	/* adjust paren pointers */
1530 	assert(pos > 0);
1531 	for (i = 1; i < NPAREN; i++) {
1532 		if (p->pbegin[i] >= pos) {
1533 			p->pbegin[i]++;
1534 		}
1535 		if (p->pend[i] >= pos) {
1536 			p->pend[i]++;
1537 		}
1538 	}
1539 
1540 	memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1541 				(size_t)(HERE()-pos-1)*sizeof(sop));
1542 	p->strip[pos] = s;
1543 }
1544 
1545 /*
1546  - dofwd - complete a forward reference
1547  */
1548 static void
dofwd(p,pos,value)1549 dofwd(p, pos, value)
1550 struct parse *p;
1551 sopno pos;
1552 sop value;
1553 {
1554 	/* avoid making error situations worse */
1555 	if (p->error != 0)
1556 		return;
1557 
1558 	assert(value < 1<<OPSHIFT);
1559 	p->strip[pos] = OP(p->strip[pos]) | value;
1560 }
1561 
1562 /*
1563  - enlarge - enlarge the strip
1564  */
1565 static void
enlarge(p,size)1566 enlarge(p, size)
1567 struct parse *p;
1568 sopno size;
1569 {
1570 	sop *sp;
1571 
1572 	if (p->ssize >= size)
1573 		return;
1574 
1575 	sp = (sop *)realloc(p->strip, (size_t)size*sizeof(sop));
1576 	if (sp == NULL) {
1577 		seterr(p, REG_ESPACE);
1578 		return;
1579 	}
1580 	p->strip = sp;
1581 	p->ssize = size;
1582 }
1583 
1584 /*
1585  - stripsnug - compact the strip
1586  */
1587 static void
stripsnug(p,g)1588 stripsnug(p, g)
1589 struct parse *p;
1590 struct re_guts *g;
1591 {
1592 	g->nstates = p->slen;
1593 	g->strip = (sop *)realloc((char *)p->strip, (size_t)p->slen * sizeof(sop));
1594 	if (g->strip == NULL) {
1595 		seterr(p, REG_ESPACE);
1596 		g->strip = p->strip;
1597 	}
1598 }
1599 
1600 /*
1601  - findmust - fill in must and mlen with longest mandatory literal string
1602  *
1603  * This algorithm could do fancy things like analyzing the operands of |
1604  * for common subsequences.  Someday.  This code is simple and finds most
1605  * of the interesting cases.
1606  *
1607  * Note that must and mlen got initialized during setup.
1608  */
1609 static void
findmust(p,g)1610 findmust(p, g)
1611 struct parse *p;
1612 struct re_guts *g;
1613 {
1614 	sop *scan;
1615 	sop *start;
1616 	sop *newstart;
1617 	int newlen;
1618 	sop s;
1619 	char *cp;
1620 	sopno i;
1621 
1622 	/* avoid making error situations worse */
1623 	if (p->error != 0)
1624 		return;
1625 
1626 	/* find the longest OCHAR sequence in strip */
1627 	newlen = 0;
1628 	scan = g->strip + 1;
1629 	do {
1630 		s = *scan++;
1631 		switch (OP(s)) {
1632 		case OCHAR:		/* sequence member */
1633 			if (newlen == 0)		/* new sequence */
1634 				newstart = scan - 1;
1635 			newlen++;
1636 			break;
1637 		case OPLUS_:		/* things that don't break one */
1638 		case OLPAREN:
1639 		case ORPAREN:
1640 			break;
1641 		case OQUEST_:		/* things that must be skipped */
1642 		case OCH_:
1643 			scan--;
1644 			do {
1645 				scan += OPND(s);
1646 				s = *scan;
1647 				/* assert() interferes w debug printouts */
1648 				if (OP(s) != O_QUEST && OP(s) != O_CH &&
1649 							OP(s) != OOR2) {
1650 					g->iflags |= BAD;
1651 					return;
1652 				}
1653 			} while (OP(s) != O_QUEST && OP(s) != O_CH);
1654 			/* fallthrough */
1655 		default:		/* things that break a sequence */
1656 			if (newlen > g->mlen) {		/* ends one */
1657 				start = newstart;
1658 				g->mlen = newlen;
1659 			}
1660 			newlen = 0;
1661 			break;
1662 		}
1663 	} while (OP(s) != OEND);
1664 
1665 	if (g->mlen == 0)		/* there isn't one */
1666 		return;
1667 
1668 	/* turn it into a character string */
1669 	g->must = malloc((size_t)g->mlen + 1);
1670 	if (g->must == NULL) {		/* argh; just forget it */
1671 		g->mlen = 0;
1672 		return;
1673 	}
1674 	cp = g->must;
1675 	scan = start;
1676 	for (i = g->mlen; i > 0; i--) {
1677 		while (OP(s = *scan++) != OCHAR)
1678 			continue;
1679 		assert(cp < g->must + g->mlen);
1680 		*cp++ = (char)OPND(s);
1681 	}
1682 	assert(cp == g->must + g->mlen);
1683 	*cp++ = '\0';		/* just on general principles */
1684 }
1685 
1686 /*
1687  - pluscount - count + nesting
1688  */
1689 static sopno			/* nesting depth */
pluscount(p,g)1690 pluscount(p, g)
1691 struct parse *p;
1692 struct re_guts *g;
1693 {
1694 	sop *scan;
1695 	sop s;
1696 	sopno plusnest = 0;
1697 	sopno maxnest = 0;
1698 
1699 	if (p->error != 0)
1700 		return(0);	/* there may not be an OEND */
1701 
1702 	scan = g->strip + 1;
1703 	do {
1704 		s = *scan++;
1705 		switch (OP(s)) {
1706 		case OPLUS_:
1707 			plusnest++;
1708 			break;
1709 		case O_PLUS:
1710 			if (plusnest > maxnest)
1711 				maxnest = plusnest;
1712 			plusnest--;
1713 			break;
1714 		}
1715 	} while (OP(s) != OEND);
1716 	if (plusnest != 0)
1717 		g->iflags |= BAD;
1718 	return(maxnest);
1719 }
1720