1 /* $OpenBSD: regcomp.c,v 1.19 2008/02/23 08:13:07 otto Exp $ */
2 /*-
3 * Copyright (c) 1992, 1993, 1994 Henry Spencer.
4 * Copyright (c) 1992, 1993, 1994
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Henry Spencer.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * @(#)regcomp.c 8.5 (Berkeley) 3/20/94
35 */
36
37 #include <sys/types.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <ctype.h>
41 #include <limits.h>
42 #include <stdlib.h>
43 #include "./regex.h"
44
45 #include "utils.h"
46 #include "./regex2.h"
47
48 #include "cclass.h"
49 #include "cname.h"
50
51 /*
52 * parse structure, passed up and down to avoid global variables and
53 * other clumsinesses
54 */
55 struct parse {
56 char *next; /* next character in RE */
57 char *end; /* end of string (-> NUL normally) */
58 int error; /* has an error been seen? */
59 sop *strip; /* malloced strip */
60 sopno ssize; /* malloced strip size (allocated) */
61 sopno slen; /* malloced strip length (used) */
62 int ncsalloc; /* number of csets allocated */
63 struct re_guts *g;
64 # define NPAREN 10 /* we need to remember () 1-9 for back refs */
65 sopno pbegin[NPAREN]; /* -> ( ([0] unused) */
66 sopno pend[NPAREN]; /* -> ) ([0] unused) */
67 };
68
69 static void p_ere(struct parse *, int);
70 static void p_ere_exp(struct parse *);
71 static void p_str(struct parse *);
72 static void p_bre(struct parse *, int, int);
73 static int p_simp_re(struct parse *, int);
74 static int p_count(struct parse *);
75 static void p_bracket(struct parse *);
76 static void p_b_term(struct parse *, cset *);
77 static void p_b_cclass(struct parse *, cset *);
78 static void p_b_eclass(struct parse *, cset *);
79 static char p_b_symbol(struct parse *);
80 static char p_b_coll_elem(struct parse *, int);
81 static char othercase(int);
82 static void bothcases(struct parse *, int);
83 static void ordinary(struct parse *, int);
84 static void nonnewline(struct parse *);
85 static void repeat(struct parse *, sopno, int, int);
86 static int seterr(struct parse *, int);
87 static cset *allocset(struct parse *);
88 static void freeset(struct parse *, cset *);
89 static int freezeset(struct parse *, cset *);
90 static int firstch(struct parse *, cset *);
91 static int nch(struct parse *, cset *);
92 static void mcadd(struct parse *, cset *, const char *);
93 static void mcinvert(struct parse *, cset *);
94 static void mccase(struct parse *, cset *);
95 static int isinsets(struct re_guts *, int);
96 static int samesets(struct re_guts *, int, int);
97 static void categorize(struct parse *, struct re_guts *);
98 static sopno dupl(struct parse *, sopno, sopno);
99 static void doemit(struct parse *, sop, size_t);
100 static void doinsert(struct parse *, sop, size_t, sopno);
101 static void dofwd(struct parse *, sopno, sop);
102 static void enlarge(struct parse *, sopno);
103 static void stripsnug(struct parse *, struct re_guts *);
104 static void findmust(struct parse *, struct re_guts *);
105 static sopno pluscount(struct parse *, struct re_guts *);
106
107 static char nuls[10]; /* place to point scanner in event of error */
108
109 /*
110 * macros for use with parse structure
111 * BEWARE: these know that the parse structure is named `p' !!!
112 */
113 #define PEEK() (*p->next)
114 #define PEEK2() (*(p->next+1))
115 #define MORE() (p->next < p->end)
116 #define MORE2() (p->next+1 < p->end)
117 #define SEE(c) (MORE() && PEEK() == (c))
118 #define SEETWO(a, b) (MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
119 #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0)
120 #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
121 #define NEXT() (p->next++)
122 #define NEXT2() (p->next += 2)
123 #define NEXTn(n) (p->next += (n))
124 #define GETNEXT() (*p->next++)
125 #define SETERROR(e) seterr(p, (e))
126 #define REQUIRE(co, e) if (!(co)) { SETERROR(e); }
127 #define MUSTSEE(c, e) REQUIRE(MORE() && PEEK() == (c), e)
128 #define MUSTEAT(c, e) REQUIRE(MORE() && GETNEXT() == (c), e)
129 #define MUSTNOTSEE(c, e) REQUIRE(!MORE() || PEEK() != (c), e)
130 #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
131 #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
132 #define AHEAD(pos) dofwd(p, pos, HERE()-(pos))
133 #define ASTERN(sop, pos) EMIT(sop, HERE()-pos)
134 #define HERE() (p->slen)
135 #define THERE() (p->slen - 1)
136 #define THERETHERE() (p->slen - 2)
137 #define DROP(n) (p->slen -= (n))
138
139 #ifndef NDEBUG
140 static int never = 0; /* for use in asserts; shuts lint up */
141 #else
142 #define never 0 /* some <assert.h>s have bugs too */
143 #endif
144
145 /*
146 - regcomp - interface for parser and compilation
147 */
148 int /* 0 success, otherwise REG_something */
regcomp(regex_t * preg,const char * pattern,int cflags)149 regcomp(regex_t *preg, const char *pattern, int cflags)
150 {
151 struct parse pa;
152 struct re_guts *g;
153 struct parse *p = &pa;
154 int i;
155 size_t len;
156 #ifdef REDEBUG
157 # define GOODFLAGS(f) (f)
158 #else
159 # define GOODFLAGS(f) ((f)&~REG_DUMP)
160 #endif
161
162 cflags = GOODFLAGS(cflags);
163 if ((cflags®_EXTENDED) && (cflags®_NOSPEC))
164 return(REG_INVARG);
165
166 if (cflags®_PEND) {
167 if (preg->re_endp < pattern)
168 return(REG_INVARG);
169 len = preg->re_endp - pattern;
170 } else
171 len = strlen((char *)pattern);
172
173 /* do the mallocs early so failure handling is easy */
174 g = (struct re_guts *)malloc(sizeof(struct re_guts) +
175 (NC-1)*sizeof(cat_t));
176 if (g == NULL)
177 return(REG_ESPACE);
178 p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */
179 p->strip = (sop *)calloc(p->ssize, sizeof(sop));
180 p->slen = 0;
181 if (p->strip == NULL) {
182 free((char *)g);
183 return(REG_ESPACE);
184 }
185
186 /* set things up */
187 p->g = g;
188 p->next = (char *)pattern; /* convenience; we do not modify it */
189 p->end = p->next + len;
190 p->error = 0;
191 p->ncsalloc = 0;
192 for (i = 0; i < NPAREN; i++) {
193 p->pbegin[i] = 0;
194 p->pend[i] = 0;
195 }
196 g->csetsize = NC;
197 g->sets = NULL;
198 g->setbits = NULL;
199 g->ncsets = 0;
200 g->cflags = cflags;
201 g->iflags = 0;
202 g->nbol = 0;
203 g->neol = 0;
204 g->must = NULL;
205 g->mlen = 0;
206 g->nsub = 0;
207 g->ncategories = 1; /* category 0 is "everything else" */
208 g->categories = &g->catspace[-(CHAR_MIN)];
209 (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t));
210 g->backrefs = 0;
211
212 /* do it */
213 EMIT(OEND, 0);
214 g->firststate = THERE();
215 if (cflags®_EXTENDED)
216 p_ere(p, OUT);
217 else if (cflags®_NOSPEC)
218 p_str(p);
219 else
220 p_bre(p, OUT, OUT);
221 EMIT(OEND, 0);
222 g->laststate = THERE();
223
224 /* tidy up loose ends and fill things in */
225 categorize(p, g);
226 stripsnug(p, g);
227 findmust(p, g);
228 g->nplus = pluscount(p, g);
229 g->magic = MAGIC2;
230 preg->re_nsub = g->nsub;
231 preg->re_g = g;
232 preg->re_magic = MAGIC1;
233 #ifndef REDEBUG
234 /* not debugging, so can't rely on the assert() in regexec() */
235 if (g->iflags&BAD)
236 SETERROR(REG_ASSERT);
237 #endif
238
239 /* win or lose, we're done */
240 if (p->error != 0) /* lose */
241 regfree(preg);
242 return(p->error);
243 }
244
245 /*
246 - p_ere - ERE parser top level, concatenation and alternation
247 */
248 static void
p_ere(struct parse * p,int stop)249 p_ere(struct parse *p, int stop) /* character this ERE should end at */
250 {
251 char c;
252 sopno prevback = 0;
253 sopno prevfwd = 0;
254 sopno conc;
255 int first = 1; /* is this the first alternative? */
256
257 for (;;) {
258 /* do a bunch of concatenated expressions */
259 conc = HERE();
260 while (MORE() && (c = PEEK()) != '|' && c != stop)
261 p_ere_exp(p);
262 REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */
263
264 if (!EAT('|'))
265 break; /* NOTE BREAK OUT */
266
267 if (first) {
268 INSERT(OCH_, conc); /* offset is wrong */
269 prevfwd = conc;
270 prevback = conc;
271 first = 0;
272 }
273 ASTERN(OOR1, prevback);
274 prevback = THERE();
275 AHEAD(prevfwd); /* fix previous offset */
276 prevfwd = HERE();
277 EMIT(OOR2, 0); /* offset is very wrong */
278 }
279
280 if (!first) { /* tail-end fixups */
281 AHEAD(prevfwd);
282 ASTERN(O_CH, prevback);
283 }
284
285 assert(!MORE() || SEE(stop));
286 }
287
288 /*
289 - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
290 */
291 static void
p_ere_exp(struct parse * p)292 p_ere_exp(struct parse *p)
293 {
294 char c;
295 sopno pos;
296 int count;
297 int count2;
298 sopno subno;
299 int wascaret = 0;
300
301 assert(MORE()); /* caller should have ensured this */
302 c = GETNEXT();
303
304 pos = HERE();
305 switch (c) {
306 case '(':
307 REQUIRE(MORE(), REG_EPAREN);
308 p->g->nsub++;
309 subno = p->g->nsub;
310 if (subno < NPAREN)
311 p->pbegin[subno] = HERE();
312 EMIT(OLPAREN, subno);
313 if (!SEE(')'))
314 p_ere(p, ')');
315 if (subno < NPAREN) {
316 p->pend[subno] = HERE();
317 assert(p->pend[subno] != 0);
318 }
319 EMIT(ORPAREN, subno);
320 MUSTEAT(')', REG_EPAREN);
321 break;
322 #ifndef POSIX_MISTAKE
323 case ')': /* happens only if no current unmatched ( */
324 /*
325 * You may ask, why the ifndef? Because I didn't notice
326 * this until slightly too late for 1003.2, and none of the
327 * other 1003.2 regular-expression reviewers noticed it at
328 * all. So an unmatched ) is legal POSIX, at least until
329 * we can get it fixed.
330 */
331 SETERROR(REG_EPAREN);
332 break;
333 #endif
334 case '^':
335 EMIT(OBOL, 0);
336 p->g->iflags |= USEBOL;
337 p->g->nbol++;
338 wascaret = 1;
339 break;
340 case '$':
341 EMIT(OEOL, 0);
342 p->g->iflags |= USEEOL;
343 p->g->neol++;
344 break;
345 case '|':
346 SETERROR(REG_EMPTY);
347 break;
348 case '*':
349 case '+':
350 case '?':
351 SETERROR(REG_BADRPT);
352 break;
353 case '.':
354 if (p->g->cflags®_NEWLINE)
355 nonnewline(p);
356 else
357 EMIT(OANY, 0);
358 break;
359 case '[':
360 p_bracket(p);
361 break;
362 case '\\':
363 REQUIRE(MORE(), REG_EESCAPE);
364 c = GETNEXT();
365 ordinary(p, c);
366 break;
367 case '{': /* okay as ordinary except if digit follows */
368 REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT);
369 /* FALLTHROUGH */
370 default:
371 ordinary(p, c);
372 break;
373 }
374
375 if (!MORE())
376 return;
377 c = PEEK();
378 /* we call { a repetition if followed by a digit */
379 if (!( c == '*' || c == '+' || c == '?' ||
380 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ))
381 return; /* no repetition, we're done */
382 NEXT();
383
384 REQUIRE(!wascaret, REG_BADRPT);
385 switch (c) {
386 case '*': /* implemented as +? */
387 /* this case does not require the (y|) trick, noKLUDGE */
388 INSERT(OPLUS_, pos);
389 ASTERN(O_PLUS, pos);
390 INSERT(OQUEST_, pos);
391 ASTERN(O_QUEST, pos);
392 break;
393 case '+':
394 INSERT(OPLUS_, pos);
395 ASTERN(O_PLUS, pos);
396 break;
397 case '?':
398 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
399 INSERT(OCH_, pos); /* offset slightly wrong */
400 ASTERN(OOR1, pos); /* this one's right */
401 AHEAD(pos); /* fix the OCH_ */
402 EMIT(OOR2, 0); /* offset very wrong... */
403 AHEAD(THERE()); /* ...so fix it */
404 ASTERN(O_CH, THERETHERE());
405 break;
406 case '{':
407 count = p_count(p);
408 if (EAT(',')) {
409 if (isdigit((uch)PEEK())) {
410 count2 = p_count(p);
411 REQUIRE(count <= count2, REG_BADBR);
412 } else /* single number with comma */
413 count2 = INFINITY;
414 } else /* just a single number */
415 count2 = count;
416 repeat(p, pos, count, count2);
417 if (!EAT('}')) { /* error heuristics */
418 while (MORE() && PEEK() != '}')
419 NEXT();
420 REQUIRE(MORE(), REG_EBRACE);
421 SETERROR(REG_BADBR);
422 }
423 break;
424 }
425
426 if (!MORE())
427 return;
428 c = PEEK();
429 if (!( c == '*' || c == '+' || c == '?' ||
430 (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) )
431 return;
432 SETERROR(REG_BADRPT);
433 }
434
435 /*
436 - p_str - string (no metacharacters) "parser"
437 */
438 static void
p_str(struct parse * p)439 p_str(struct parse *p)
440 {
441 REQUIRE(MORE(), REG_EMPTY);
442 while (MORE())
443 ordinary(p, GETNEXT());
444 }
445
446 /*
447 - p_bre - BRE parser top level, anchoring and concatenation
448 * Giving end1 as OUT essentially eliminates the end1/end2 check.
449 *
450 * This implementation is a bit of a kludge, in that a trailing $ is first
451 * taken as an ordinary character and then revised to be an anchor. The
452 * only undesirable side effect is that '$' gets included as a character
453 * category in such cases. This is fairly harmless; not worth fixing.
454 * The amount of lookahead needed to avoid this kludge is excessive.
455 */
456 static void
p_bre(struct parse * p,int end1,int end2)457 p_bre(struct parse *p,
458 int end1, /* first terminating character */
459 int end2) /* second terminating character */
460 {
461 sopno start = HERE();
462 int first = 1; /* first subexpression? */
463 int wasdollar = 0;
464
465 if (EAT('^')) {
466 EMIT(OBOL, 0);
467 p->g->iflags |= USEBOL;
468 p->g->nbol++;
469 }
470 while (MORE() && !SEETWO(end1, end2)) {
471 wasdollar = p_simp_re(p, first);
472 first = 0;
473 }
474 if (wasdollar) { /* oops, that was a trailing anchor */
475 DROP(1);
476 EMIT(OEOL, 0);
477 p->g->iflags |= USEEOL;
478 p->g->neol++;
479 }
480
481 REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */
482 }
483
484 /*
485 - p_simp_re - parse a simple RE, an atom possibly followed by a repetition
486 */
487 static int /* was the simple RE an unbackslashed $? */
p_simp_re(struct parse * p,int starordinary)488 p_simp_re(struct parse *p,
489 int starordinary) /* is a leading * an ordinary character? */
490 {
491 int c;
492 int count;
493 int count2;
494 sopno pos;
495 int i;
496 sopno subno;
497 # define BACKSL (1<<CHAR_BIT)
498
499 pos = HERE(); /* repetion op, if any, covers from here */
500
501 assert(MORE()); /* caller should have ensured this */
502 c = GETNEXT();
503 if (c == '\\') {
504 REQUIRE(MORE(), REG_EESCAPE);
505 c = BACKSL | GETNEXT();
506 }
507 switch (c) {
508 case '.':
509 if (p->g->cflags®_NEWLINE)
510 nonnewline(p);
511 else
512 EMIT(OANY, 0);
513 break;
514 case '[':
515 p_bracket(p);
516 break;
517 case BACKSL|'{':
518 SETERROR(REG_BADRPT);
519 break;
520 case BACKSL|'(':
521 p->g->nsub++;
522 subno = p->g->nsub;
523 if (subno < NPAREN)
524 p->pbegin[subno] = HERE();
525 EMIT(OLPAREN, subno);
526 /* the MORE here is an error heuristic */
527 if (MORE() && !SEETWO('\\', ')'))
528 p_bre(p, '\\', ')');
529 if (subno < NPAREN) {
530 p->pend[subno] = HERE();
531 assert(p->pend[subno] != 0);
532 }
533 EMIT(ORPAREN, subno);
534 REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
535 break;
536 case BACKSL|')': /* should not get here -- must be user */
537 case BACKSL|'}':
538 SETERROR(REG_EPAREN);
539 break;
540 case BACKSL|'1':
541 case BACKSL|'2':
542 case BACKSL|'3':
543 case BACKSL|'4':
544 case BACKSL|'5':
545 case BACKSL|'6':
546 case BACKSL|'7':
547 case BACKSL|'8':
548 case BACKSL|'9':
549 i = (c&~BACKSL) - '0';
550 assert(i < NPAREN);
551 if (p->pend[i] != 0) {
552 assert(i <= p->g->nsub);
553 EMIT(OBACK_, i);
554 assert(p->pbegin[i] != 0);
555 assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
556 assert(OP(p->strip[p->pend[i]]) == ORPAREN);
557 (void) dupl(p, p->pbegin[i]+1, p->pend[i]);
558 EMIT(O_BACK, i);
559 } else
560 SETERROR(REG_ESUBREG);
561 p->g->backrefs = 1;
562 break;
563 case '*':
564 REQUIRE(starordinary, REG_BADRPT);
565 /* FALLTHROUGH */
566 default:
567 ordinary(p, (char)c);
568 break;
569 }
570
571 if (EAT('*')) { /* implemented as +? */
572 /* this case does not require the (y|) trick, noKLUDGE */
573 INSERT(OPLUS_, pos);
574 ASTERN(O_PLUS, pos);
575 INSERT(OQUEST_, pos);
576 ASTERN(O_QUEST, pos);
577 } else if (EATTWO('\\', '{')) {
578 count = p_count(p);
579 if (EAT(',')) {
580 if (MORE() && isdigit((uch)PEEK())) {
581 count2 = p_count(p);
582 REQUIRE(count <= count2, REG_BADBR);
583 } else /* single number with comma */
584 count2 = INFINITY;
585 } else /* just a single number */
586 count2 = count;
587 repeat(p, pos, count, count2);
588 if (!EATTWO('\\', '}')) { /* error heuristics */
589 while (MORE() && !SEETWO('\\', '}'))
590 NEXT();
591 REQUIRE(MORE(), REG_EBRACE);
592 SETERROR(REG_BADBR);
593 }
594 } else if (c == '$') /* $ (but not \$) ends it */
595 return(1);
596
597 return(0);
598 }
599
600 /*
601 - p_count - parse a repetition count
602 */
603 static int /* the value */
p_count(struct parse * p)604 p_count(struct parse *p)
605 {
606 int count = 0;
607 int ndigits = 0;
608
609 while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) {
610 count = count*10 + (GETNEXT() - '0');
611 ndigits++;
612 }
613
614 REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
615 return(count);
616 }
617
618 /*
619 - p_bracket - parse a bracketed character list
620 *
621 * Note a significant property of this code: if the allocset() did SETERROR,
622 * no set operations are done.
623 */
624 static void
p_bracket(struct parse * p)625 p_bracket(struct parse *p)
626 {
627 cset *cs;
628 int invert = 0;
629
630 /* Dept of Truly Sickening Special-Case Kludges */
631 if (p->next + 5 < p->end && strncmp(p->next, "[:<:]]", 6) == 0) {
632 EMIT(OBOW, 0);
633 NEXTn(6);
634 return;
635 }
636 if (p->next + 5 < p->end && strncmp(p->next, "[:>:]]", 6) == 0) {
637 EMIT(OEOW, 0);
638 NEXTn(6);
639 return;
640 }
641
642 if ((cs = allocset(p)) == NULL) {
643 /* allocset did set error status in p */
644 return;
645 }
646
647 if (EAT('^'))
648 invert++; /* make note to invert set at end */
649 if (EAT(']'))
650 CHadd(cs, ']');
651 else if (EAT('-'))
652 CHadd(cs, '-');
653 while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
654 p_b_term(p, cs);
655 if (EAT('-'))
656 CHadd(cs, '-');
657 MUSTEAT(']', REG_EBRACK);
658
659 if (p->error != 0) { /* don't mess things up further */
660 freeset(p, cs);
661 return;
662 }
663
664 if (p->g->cflags®_ICASE) {
665 int i;
666 int ci;
667
668 for (i = p->g->csetsize - 1; i >= 0; i--)
669 if (CHIN(cs, i) && isalpha(i)) {
670 ci = othercase(i);
671 if (ci != i)
672 CHadd(cs, ci);
673 }
674 if (cs->multis != NULL)
675 mccase(p, cs);
676 }
677 if (invert) {
678 int i;
679
680 for (i = p->g->csetsize - 1; i >= 0; i--)
681 if (CHIN(cs, i))
682 CHsub(cs, i);
683 else
684 CHadd(cs, i);
685 if (p->g->cflags®_NEWLINE)
686 CHsub(cs, '\n');
687 if (cs->multis != NULL)
688 mcinvert(p, cs);
689 }
690
691 assert(cs->multis == NULL); /* xxx */
692
693 if (nch(p, cs) == 1) { /* optimize singleton sets */
694 ordinary(p, firstch(p, cs));
695 freeset(p, cs);
696 } else
697 EMIT(OANYOF, freezeset(p, cs));
698 }
699
700 /*
701 - p_b_term - parse one term of a bracketed character list
702 */
703 static void
p_b_term(struct parse * p,cset * cs)704 p_b_term(struct parse *p, cset *cs)
705 {
706 char c;
707 char start, finish;
708 int i;
709
710 /* classify what we've got */
711 switch ((MORE()) ? PEEK() : '\0') {
712 case '[':
713 c = (MORE2()) ? PEEK2() : '\0';
714 break;
715 case '-':
716 SETERROR(REG_ERANGE);
717 return; /* NOTE RETURN */
718 break;
719 default:
720 c = '\0';
721 break;
722 }
723
724 switch (c) {
725 case ':': /* character class */
726 NEXT2();
727 REQUIRE(MORE(), REG_EBRACK);
728 c = PEEK();
729 REQUIRE(c != '-' && c != ']', REG_ECTYPE);
730 p_b_cclass(p, cs);
731 REQUIRE(MORE(), REG_EBRACK);
732 REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
733 break;
734 case '=': /* equivalence class */
735 NEXT2();
736 REQUIRE(MORE(), REG_EBRACK);
737 c = PEEK();
738 REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
739 p_b_eclass(p, cs);
740 REQUIRE(MORE(), REG_EBRACK);
741 REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
742 break;
743 default: /* symbol, ordinary character, or range */
744 /* xxx revision needed for multichar stuff */
745 start = p_b_symbol(p);
746 if (SEE('-') && MORE2() && PEEK2() != ']') {
747 /* range */
748 NEXT();
749 if (EAT('-'))
750 finish = '-';
751 else
752 finish = p_b_symbol(p);
753 } else
754 finish = start;
755 /* xxx what about signed chars here... */
756 REQUIRE(start <= finish, REG_ERANGE);
757 for (i = start; i <= finish; i++)
758 CHadd(cs, i);
759 break;
760 }
761 }
762
763 /*
764 - p_b_cclass - parse a character-class name and deal with it
765 */
766 static void
p_b_cclass(struct parse * p,cset * cs)767 p_b_cclass(struct parse *p, cset *cs)
768 {
769 char *sp = p->next;
770 const struct cclass *cp;
771 size_t len;
772 const char *u;
773 char c;
774
775 while (MORE() && isalpha(PEEK()))
776 NEXT();
777 len = p->next - sp;
778 for (cp = cclasses; cp->name != NULL; cp++)
779 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
780 break;
781 if (cp->name == NULL) {
782 /* oops, didn't find it */
783 SETERROR(REG_ECTYPE);
784 return;
785 }
786
787 u = cp->chars;
788 while ((c = *u++) != '\0')
789 CHadd(cs, c);
790 for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
791 MCadd(p, cs, u);
792 }
793
794 /*
795 - p_b_eclass - parse an equivalence-class name and deal with it
796 *
797 * This implementation is incomplete. xxx
798 */
799 static void
p_b_eclass(struct parse * p,cset * cs)800 p_b_eclass(struct parse *p, cset *cs)
801 {
802 char c;
803
804 c = p_b_coll_elem(p, '=');
805 CHadd(cs, c);
806 }
807
808 /*
809 - p_b_symbol - parse a character or [..]ed multicharacter collating symbol
810 */
811 static char /* value of symbol */
p_b_symbol(struct parse * p)812 p_b_symbol(struct parse *p)
813 {
814 char value;
815
816 REQUIRE(MORE(), REG_EBRACK);
817 if (!EATTWO('[', '.'))
818 return(GETNEXT());
819
820 /* collating symbol */
821 value = p_b_coll_elem(p, '.');
822 REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
823 return(value);
824 }
825
826 /*
827 - p_b_coll_elem - parse a collating-element name and look it up
828 */
829 static char /* value of collating element */
p_b_coll_elem(struct parse * p,int endc)830 p_b_coll_elem(struct parse *p,
831 int endc) /* name ended by endc,']' */
832 {
833 char *sp = p->next;
834 const struct cname *cp;
835 int len;
836
837 while (MORE() && !SEETWO(endc, ']'))
838 NEXT();
839 if (!MORE()) {
840 SETERROR(REG_EBRACK);
841 return(0);
842 }
843 len = p->next - sp;
844 for (cp = cnames; cp->name != NULL; cp++)
845 if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
846 return(cp->code); /* known name */
847 if (len == 1)
848 return(*sp); /* single character */
849 SETERROR(REG_ECOLLATE); /* neither */
850 return(0);
851 }
852
853 /*
854 - othercase - return the case counterpart of an alphabetic
855 */
856 static char /* if no counterpart, return ch */
othercase(int ch)857 othercase(int ch)
858 {
859 ch = (uch)ch;
860 assert(isalpha(ch));
861 if (isupper(ch))
862 return ((uch)tolower(ch));
863 else if (islower(ch))
864 return ((uch)toupper(ch));
865 else /* peculiar, but could happen */
866 return(ch);
867 }
868
869 /*
870 - bothcases - emit a dualcase version of a two-case character
871 *
872 * Boy, is this implementation ever a kludge...
873 */
874 static void
bothcases(struct parse * p,int ch)875 bothcases(struct parse *p, int ch)
876 {
877 char *oldnext = p->next;
878 char *oldend = p->end;
879 char bracket[3];
880
881 ch = (uch)ch;
882 assert(othercase(ch) != ch); /* p_bracket() would recurse */
883 p->next = bracket;
884 p->end = bracket+2;
885 bracket[0] = ch;
886 bracket[1] = ']';
887 bracket[2] = '\0';
888 p_bracket(p);
889 assert(p->next == bracket+2);
890 p->next = oldnext;
891 p->end = oldend;
892 }
893
894 /*
895 - ordinary - emit an ordinary character
896 */
897 static void
ordinary(struct parse * p,int ch)898 ordinary(struct parse *p, int ch)
899 {
900 cat_t *cap = p->g->categories;
901
902 if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch)
903 bothcases(p, ch);
904 else {
905 EMIT(OCHAR, (uch)ch);
906 if (cap[ch] == 0)
907 cap[ch] = p->g->ncategories++;
908 }
909 }
910
911 /*
912 - nonnewline - emit REG_NEWLINE version of OANY
913 *
914 * Boy, is this implementation ever a kludge...
915 */
916 static void
nonnewline(struct parse * p)917 nonnewline(struct parse *p)
918 {
919 char *oldnext = p->next;
920 char *oldend = p->end;
921 char bracket[4];
922
923 p->next = bracket;
924 p->end = bracket+3;
925 bracket[0] = '^';
926 bracket[1] = '\n';
927 bracket[2] = ']';
928 bracket[3] = '\0';
929 p_bracket(p);
930 assert(p->next == bracket+3);
931 p->next = oldnext;
932 p->end = oldend;
933 }
934
935 /*
936 - repeat - generate code for a bounded repetition, recursively if needed
937 */
938 static void
repeat(struct parse * p,sopno start,int from,int to)939 repeat(struct parse *p,
940 sopno start, /* operand from here to end of strip */
941 int from, /* repeated from this number */
942 int to) /* to this number of times (maybe INFINITY) */
943 {
944 sopno finish = HERE();
945 # define N 2
946 # define INF 3
947 # define REP(f, t) ((f)*8 + (t))
948 # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
949 sopno copy;
950
951 if (p->error != 0) /* head off possible runaway recursion */
952 return;
953
954 assert(from <= to);
955
956 switch (REP(MAP(from), MAP(to))) {
957 case REP(0, 0): /* must be user doing this */
958 DROP(finish-start); /* drop the operand */
959 break;
960 case REP(0, 1): /* as x{1,1}? */
961 case REP(0, N): /* as x{1,n}? */
962 case REP(0, INF): /* as x{1,}? */
963 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
964 INSERT(OCH_, start); /* offset is wrong... */
965 repeat(p, start+1, 1, to);
966 ASTERN(OOR1, start);
967 AHEAD(start); /* ... fix it */
968 EMIT(OOR2, 0);
969 AHEAD(THERE());
970 ASTERN(O_CH, THERETHERE());
971 break;
972 case REP(1, 1): /* trivial case */
973 /* done */
974 break;
975 case REP(1, N): /* as x?x{1,n-1} */
976 /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
977 INSERT(OCH_, start);
978 ASTERN(OOR1, start);
979 AHEAD(start);
980 EMIT(OOR2, 0); /* offset very wrong... */
981 AHEAD(THERE()); /* ...so fix it */
982 ASTERN(O_CH, THERETHERE());
983 copy = dupl(p, start+1, finish+1);
984 assert(copy == finish+4);
985 repeat(p, copy, 1, to-1);
986 break;
987 case REP(1, INF): /* as x+ */
988 INSERT(OPLUS_, start);
989 ASTERN(O_PLUS, start);
990 break;
991 case REP(N, N): /* as xx{m-1,n-1} */
992 copy = dupl(p, start, finish);
993 repeat(p, copy, from-1, to-1);
994 break;
995 case REP(N, INF): /* as xx{n-1,INF} */
996 copy = dupl(p, start, finish);
997 repeat(p, copy, from-1, to);
998 break;
999 default: /* "can't happen" */
1000 SETERROR(REG_ASSERT); /* just in case */
1001 break;
1002 }
1003 }
1004
1005 /*
1006 - seterr - set an error condition
1007 */
1008 static int /* useless but makes type checking happy */
seterr(struct parse * p,int e)1009 seterr(struct parse *p, int e)
1010 {
1011 if (p->error == 0) /* keep earliest error condition */
1012 p->error = e;
1013 p->next = nuls; /* try to bring things to a halt */
1014 p->end = nuls;
1015 return(0); /* make the return value well-defined */
1016 }
1017
1018 /*
1019 - allocset - allocate a set of characters for []
1020 */
1021 static cset *
allocset(struct parse * p)1022 allocset(struct parse *p)
1023 {
1024 int no = p->g->ncsets++;
1025 size_t nc;
1026 size_t nbytes;
1027 cset *cs;
1028 size_t css = (size_t)p->g->csetsize;
1029 int i;
1030
1031 if (no >= p->ncsalloc) { /* need another column of space */
1032 void *ptr;
1033
1034 p->ncsalloc += CHAR_BIT;
1035 nc = p->ncsalloc;
1036 assert(nc % CHAR_BIT == 0);
1037 nbytes = nc / CHAR_BIT * css;
1038
1039 ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset));
1040 if (ptr == NULL)
1041 goto nomem;
1042 p->g->sets = (cset*)ptr;
1043
1044 ptr = (uch *)realloc((char *)p->g->setbits, nbytes);
1045 if (ptr == NULL)
1046 goto nomem;
1047 p->g->setbits = (uch*)ptr;
1048
1049 for (i = 0; i < no; i++)
1050 p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT);
1051
1052 (void) memset((char *)p->g->setbits + (nbytes - css), 0, css);
1053 }
1054 /* XXX should not happen */
1055 if (p->g->sets == NULL || p->g->setbits == NULL)
1056 goto nomem;
1057
1058 cs = &p->g->sets[no];
1059 cs->ptr = p->g->setbits + css*((no)/CHAR_BIT);
1060 cs->mask = 1 << ((no) % CHAR_BIT);
1061 cs->hash = 0;
1062 cs->smultis = 0;
1063 cs->multis = NULL;
1064
1065 return(cs);
1066 nomem:
1067 free(p->g->sets);
1068 p->g->sets = NULL;
1069 free(p->g->setbits);
1070 p->g->setbits = NULL;
1071
1072 SETERROR(REG_ESPACE);
1073 /* caller's responsibility not to do set ops */
1074 return(NULL);
1075 }
1076
1077 /*
1078 - freeset - free a now-unused set
1079 */
1080 static void
freeset(struct parse * p,cset * cs)1081 freeset(struct parse *p, cset *cs)
1082 {
1083 int i;
1084 cset *top = &p->g->sets[p->g->ncsets];
1085 size_t css = (size_t)p->g->csetsize;
1086
1087 for (i = 0; i < (ssize_t)css; i++)
1088 CHsub(cs, i);
1089 if (cs == top-1) /* recover only the easy case */
1090 p->g->ncsets--;
1091 }
1092
1093 /*
1094 - freezeset - final processing on a set of characters
1095 *
1096 * The main task here is merging identical sets. This is usually a waste
1097 * of time (although the hash code minimizes the overhead), but can win
1098 * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash
1099 * is done using addition rather than xor -- all ASCII [aA] sets xor to
1100 * the same value!
1101 */
1102 static int /* set number */
freezeset(struct parse * p,cset * cs)1103 freezeset(struct parse *p, cset *cs)
1104 {
1105 uch h = cs->hash;
1106 int i;
1107 cset *top = &p->g->sets[p->g->ncsets];
1108 cset *cs2;
1109 size_t css = (size_t)p->g->csetsize;
1110
1111 /* look for an earlier one which is the same */
1112 for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
1113 if (cs2->hash == h && cs2 != cs) {
1114 /* maybe */
1115 for (i = 0; i < (ssize_t)css; i++)
1116 if (!!CHIN(cs2, i) != !!CHIN(cs, i))
1117 break; /* no */
1118 if (i == (ssize_t)css)
1119 break; /* yes */
1120 }
1121
1122 if (cs2 < top) { /* found one */
1123 freeset(p, cs);
1124 cs = cs2;
1125 }
1126
1127 return((int)(cs - p->g->sets));
1128 }
1129
1130 /*
1131 - firstch - return first character in a set (which must have at least one)
1132 */
1133 static int /* character; there is no "none" value */
firstch(struct parse * p,cset * cs)1134 firstch(struct parse *p, cset *cs)
1135 {
1136 int i;
1137 size_t css = (size_t)p->g->csetsize;
1138
1139 for (i = 0; i < (ssize_t)css; i++)
1140 if (CHIN(cs, i))
1141 return((char)i);
1142 assert(never);
1143 return(0); /* arbitrary */
1144 }
1145
1146 /*
1147 - nch - number of characters in a set
1148 */
1149 static int
nch(struct parse * p,cset * cs)1150 nch(struct parse *p, cset *cs)
1151 {
1152 int i;
1153 size_t css = (size_t)p->g->csetsize;
1154 int n = 0;
1155
1156 for (i = 0; i < (ssize_t)css; i++)
1157 if (CHIN(cs, i))
1158 n++;
1159 return(n);
1160 }
1161
1162 /*
1163 - mcadd - add a collating element to a cset
1164 */
1165 static void
mcadd(struct parse * p,cset * cs,const char * cp)1166 mcadd( struct parse *p, cset *cs, const char *cp)
1167 {
1168 size_t oldend = cs->smultis;
1169 void *np;
1170
1171 cs->smultis += strlen(cp) + 1;
1172 np = realloc(cs->multis, cs->smultis);
1173 if (np == NULL) {
1174 if (cs->multis)
1175 free(cs->multis);
1176 cs->multis = NULL;
1177 SETERROR(REG_ESPACE);
1178 return;
1179 }
1180 cs->multis = (char*)np;
1181
1182 strncpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1);
1183 }
1184
1185 /*
1186 - mcinvert - invert the list of collating elements in a cset
1187 *
1188 * This would have to know the set of possibilities. Implementation
1189 * is deferred.
1190 */
1191 /* ARGSUSED */
1192 static void
mcinvert(struct parse * p,cset * cs)1193 mcinvert(struct parse *p, cset *cs)
1194 {
1195 assert(cs->multis == NULL); /* xxx */
1196 }
1197
1198 /*
1199 - mccase - add case counterparts of the list of collating elements in a cset
1200 *
1201 * This would have to know the set of possibilities. Implementation
1202 * is deferred.
1203 */
1204 /* ARGSUSED */
1205 static void
mccase(struct parse * p,cset * cs)1206 mccase(struct parse *p, cset *cs)
1207 {
1208 assert(cs->multis == NULL); /* xxx */
1209 }
1210
1211 /*
1212 - isinsets - is this character in any sets?
1213 */
1214 static int /* predicate */
isinsets(struct re_guts * g,int c)1215 isinsets(struct re_guts *g, int c)
1216 {
1217 uch *col;
1218 int i;
1219 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1220 unsigned uc = (uch)c;
1221
1222 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1223 if (col[uc] != 0)
1224 return(1);
1225 return(0);
1226 }
1227
1228 /*
1229 - samesets - are these two characters in exactly the same sets?
1230 */
1231 static int /* predicate */
samesets(struct re_guts * g,int c1,int c2)1232 samesets(struct re_guts *g, int c1, int c2)
1233 {
1234 uch *col;
1235 int i;
1236 int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT;
1237 unsigned uc1 = (uch)c1;
1238 unsigned uc2 = (uch)c2;
1239
1240 for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
1241 if (col[uc1] != col[uc2])
1242 return(0);
1243 return(1);
1244 }
1245
1246 /*
1247 - categorize - sort out character categories
1248 */
1249 static void
categorize(struct parse * p,struct re_guts * g)1250 categorize(struct parse *p, struct re_guts *g)
1251 {
1252 cat_t *cats = g->categories;
1253 int c;
1254 int c2;
1255 cat_t cat;
1256
1257 /* avoid making error situations worse */
1258 if (p->error != 0)
1259 return;
1260
1261 for (c = CHAR_MIN; c <= CHAR_MAX; c++)
1262 if (cats[c] == 0 && isinsets(g, c)) {
1263 cat = g->ncategories++;
1264 cats[c] = cat;
1265 for (c2 = c+1; c2 <= CHAR_MAX; c2++)
1266 if (cats[c2] == 0 && samesets(g, c, c2))
1267 cats[c2] = cat;
1268 }
1269 }
1270
1271 /*
1272 - dupl - emit a duplicate of a bunch of sops
1273 */
1274 static sopno /* start of duplicate */
dupl(struct parse * p,sopno start,sopno finish)1275 dupl(struct parse *p,
1276 sopno start, /* from here */
1277 sopno finish) /* to this less one */
1278 {
1279 sopno ret = HERE();
1280 sopno len = finish - start;
1281
1282 assert(finish >= start);
1283 if (len == 0)
1284 return(ret);
1285 enlarge(p, p->ssize + len); /* this many unexpected additions */
1286 assert(p->ssize >= p->slen + len);
1287 (void) memcpy((char *)(p->strip + p->slen),
1288 (char *)(p->strip + start), (size_t)len*sizeof(sop));
1289 p->slen += len;
1290 return(ret);
1291 }
1292
1293 /*
1294 - doemit - emit a strip operator
1295 *
1296 * It might seem better to implement this as a macro with a function as
1297 * hard-case backup, but it's just too big and messy unless there are
1298 * some changes to the data structures. Maybe later.
1299 */
1300 static void
doemit(struct parse * p,sop op,size_t opnd)1301 doemit(struct parse *p, sop op, size_t opnd)
1302 {
1303 /* avoid making error situations worse */
1304 if (p->error != 0)
1305 return;
1306
1307 /* deal with oversize operands ("can't happen", more or less) */
1308 assert(opnd < 1<<OPSHIFT);
1309
1310 /* deal with undersized strip */
1311 if (p->slen >= p->ssize)
1312 enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */
1313 assert(p->slen < p->ssize);
1314
1315 /* finally, it's all reduced to the easy case */
1316 p->strip[p->slen++] = SOP(op, opnd);
1317 }
1318
1319 /*
1320 - doinsert - insert a sop into the strip
1321 */
1322 static void
doinsert(struct parse * p,sop op,size_t opnd,sopno pos)1323 doinsert(struct parse *p, sop op, size_t opnd, sopno pos)
1324 {
1325 sopno sn;
1326 sop s;
1327 int i;
1328
1329 /* avoid making error situations worse */
1330 if (p->error != 0)
1331 return;
1332
1333 sn = HERE();
1334 EMIT(op, opnd); /* do checks, ensure space */
1335 assert(HERE() == sn+1);
1336 s = p->strip[sn];
1337
1338 /* adjust paren pointers */
1339 assert(pos > 0);
1340 for (i = 1; i < NPAREN; i++) {
1341 if (p->pbegin[i] >= pos) {
1342 p->pbegin[i]++;
1343 }
1344 if (p->pend[i] >= pos) {
1345 p->pend[i]++;
1346 }
1347 }
1348
1349 memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos],
1350 (HERE()-pos-1)*sizeof(sop));
1351 p->strip[pos] = s;
1352 }
1353
1354 /*
1355 - dofwd - complete a forward reference
1356 */
1357 static void
dofwd(struct parse * p,sopno pos,sop value)1358 dofwd(struct parse *p, sopno pos, sop value)
1359 {
1360 /* avoid making error situations worse */
1361 if (p->error != 0)
1362 return;
1363
1364 assert(value < 1<<OPSHIFT);
1365 p->strip[pos] = OP(p->strip[pos]) | value;
1366 }
1367
1368 /*
1369 - enlarge - enlarge the strip
1370 */
1371 static void
enlarge(struct parse * p,sopno size)1372 enlarge(struct parse *p, sopno size)
1373 {
1374 sop *sp;
1375
1376 if (p->ssize >= size)
1377 return;
1378
1379 sp = (sop *)realloc(p->strip, size*sizeof(sop));
1380 if (sp == NULL) {
1381 SETERROR(REG_ESPACE);
1382 return;
1383 }
1384 p->strip = sp;
1385 p->ssize = size;
1386 }
1387
1388 /*
1389 - stripsnug - compact the strip
1390 */
1391 static void
stripsnug(struct parse * p,struct re_guts * g)1392 stripsnug(struct parse *p, struct re_guts *g)
1393 {
1394 g->nstates = p->slen;
1395 g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop));
1396 if (g->strip == NULL) {
1397 SETERROR(REG_ESPACE);
1398 g->strip = p->strip;
1399 }
1400 }
1401
1402 /*
1403 - findmust - fill in must and mlen with longest mandatory literal string
1404 *
1405 * This algorithm could do fancy things like analyzing the operands of |
1406 * for common subsequences. Someday. This code is simple and finds most
1407 * of the interesting cases.
1408 *
1409 * Note that must and mlen got initialized during setup.
1410 */
1411 static void
findmust(struct parse * p,struct re_guts * g)1412 findmust(struct parse *p, struct re_guts *g)
1413 {
1414 sop *scan;
1415 sop *start = NULL; /* start initialized in the default case, after that */
1416 sop *newstart = NULL; /* newstart was initialized in the OCHAR case */
1417 sopno newlen;
1418 sop s;
1419 char *cp;
1420 sopno i;
1421
1422 /* avoid making error situations worse */
1423 if (p->error != 0)
1424 return;
1425
1426 /* find the longest OCHAR sequence in strip */
1427 newlen = 0;
1428 scan = g->strip + 1;
1429 do {
1430 s = *scan++;
1431 switch (OP(s)) {
1432 case OCHAR: /* sequence member */
1433 if (newlen == 0) /* new sequence */
1434 newstart = scan - 1;
1435 newlen++;
1436 break;
1437 case OPLUS_: /* things that don't break one */
1438 case OLPAREN:
1439 case ORPAREN:
1440 break;
1441 case OQUEST_: /* things that must be skipped */
1442 case OCH_:
1443 scan--;
1444 do {
1445 scan += OPND(s);
1446 s = *scan;
1447 /* assert() interferes w debug printouts */
1448 if (OP(s) != O_QUEST && OP(s) != O_CH &&
1449 OP(s) != OOR2) {
1450 g->iflags |= BAD;
1451 return;
1452 }
1453 } while (OP(s) != O_QUEST && OP(s) != O_CH);
1454 /* fallthrough */
1455 default: /* things that break a sequence */
1456 if (newlen > g->mlen) { /* ends one */
1457 start = newstart;
1458 g->mlen = newlen;
1459 }
1460 newlen = 0;
1461 break;
1462 }
1463 } while (OP(s) != OEND);
1464
1465 if (g->mlen == 0) /* there isn't one */
1466 return;
1467
1468 /* turn it into a character string */
1469 g->must = (char*)malloc((size_t)g->mlen + 1);
1470 if (g->must == NULL) { /* argh; just forget it */
1471 g->mlen = 0;
1472 return;
1473 }
1474 cp = g->must;
1475 scan = start;
1476 for (i = g->mlen; i > 0; i--) {
1477 while (OP(s = *scan++) != OCHAR)
1478 continue;
1479 assert(cp < g->must + g->mlen);
1480 *cp++ = (char)OPND(s);
1481 }
1482 assert(cp == g->must + g->mlen);
1483 *cp++ = '\0'; /* just on general principles */
1484 }
1485
1486 /*
1487 - pluscount - count + nesting
1488 */
1489 static sopno /* nesting depth */
pluscount(struct parse * p,struct re_guts * g)1490 pluscount(struct parse *p, struct re_guts *g)
1491 {
1492 sop *scan;
1493 sop s;
1494 sopno plusnest = 0;
1495 sopno maxnest = 0;
1496
1497 if (p->error != 0)
1498 return(0); /* there may not be an OEND */
1499
1500 scan = g->strip + 1;
1501 do {
1502 s = *scan++;
1503 switch (OP(s)) {
1504 case OPLUS_:
1505 plusnest++;
1506 break;
1507 case O_PLUS:
1508 if (plusnest > maxnest)
1509 maxnest = plusnest;
1510 plusnest--;
1511 break;
1512 }
1513 } while (OP(s) != OEND);
1514 if (plusnest != 0)
1515 g->iflags |= BAD;
1516 return(maxnest);
1517 }
1518