1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #define DEBUG
28
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "ytab.h"
36
37 #define MAXLIN 22
38
39 #define type(v) (v)->nobj /* badly overloaded here */
40 #define info(v) (v)->ntype /* badly overloaded here */
41 #define left(v) (v)->narg[0]
42 #define right(v) (v)->narg[1]
43 #define parent(v) (v)->nnext
44
45 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
46 #define ELEAF case EMPTYRE: /* empty string in regexp */
47 #define UNARY case STAR: case PLUS: case QUEST:
48
49 /* encoding in tree Nodes:
50 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
51 left is index, right contains value or pointer to value
52 unary (STAR, PLUS, QUEST): left is child, right is null
53 binary (CAT, OR): left and right are children
54 parent contains pointer to parent
55 */
56
57
58 int *setvec;
59 int *tmpset;
60 int maxsetvec = 0;
61
62 int rtok; /* next token in current re */
63 int rlxval;
64 static const uschar *rlxstr;
65 static const uschar *prestr; /* current position in current re */
66 static const uschar *lastre; /* origin of last re */
67 static const uschar *lastatom; /* origin of last Atom */
68 static const uschar *starttok;
69 static const uschar *basestr; /* starts with original, replaced during
70 repetition processing */
71 static const uschar *firstbasestr;
72
73 static int setcnt;
74 static int poscnt;
75
76 const char *patbeg;
77 int patlen;
78
79 #define NFA 128 /* cache this many dynamic fa's */
80 fa *fatab[NFA];
81 int nfatab = 0; /* entries in fatab */
82
83 static int *
intalloc(size_t n,const char * f)84 intalloc(size_t n, const char *f)
85 {
86 void *p = calloc(n, sizeof(int));
87 if (p == NULL)
88 overflo(f);
89 return p;
90 }
91
92 static void
resizesetvec(const char * f)93 resizesetvec(const char *f)
94 {
95 if (maxsetvec == 0)
96 maxsetvec = MAXLIN;
97 else
98 maxsetvec *= 4;
99 setvec = realloc(setvec, maxsetvec * sizeof(*setvec));
100 tmpset = realloc(tmpset, maxsetvec * sizeof(*tmpset));
101 if (setvec == NULL || tmpset == NULL)
102 overflo(f);
103 }
104
105 static void
resize_state(fa * f,int state)106 resize_state(fa *f, int state)
107 {
108 void *p;
109 int i, new_count;
110
111 if (++state < f->state_count)
112 return;
113
114 new_count = state + 10; /* needs to be tuned */
115
116 p = realloc(f->gototab, new_count * sizeof(f->gototab[0]));
117 if (p == NULL)
118 goto out;
119 f->gototab = p;
120
121 p = realloc(f->out, new_count * sizeof(f->out[0]));
122 if (p == NULL)
123 goto out;
124 f->out = p;
125
126 p = realloc(f->posns, new_count * sizeof(f->posns[0]));
127 if (p == NULL)
128 goto out;
129 f->posns = p;
130
131 for (i = f->state_count; i < new_count; ++i) {
132 f->gototab[i] = calloc(NCHARS, sizeof(**f->gototab));
133 if (f->gototab[i] == NULL)
134 goto out;
135 f->out[i] = 0;
136 f->posns[i] = NULL;
137 }
138 f->state_count = new_count;
139 return;
140 out:
141 overflo(__func__);
142 }
143
makedfa(const char * s,bool anchor)144 fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
145 {
146 int i, use, nuse;
147 fa *pfa;
148 static int now = 1;
149
150 if (setvec == NULL) { /* first time through any RE */
151 resizesetvec(__func__);
152 }
153
154 if (compile_time != RUNNING) /* a constant for sure */
155 return mkdfa(s, anchor);
156 for (i = 0; i < nfatab; i++) /* is it there already? */
157 if (fatab[i]->anchor == anchor
158 && strcmp((const char *) fatab[i]->restr, s) == 0) {
159 fatab[i]->use = now++;
160 return fatab[i];
161 }
162 pfa = mkdfa(s, anchor);
163 if (nfatab < NFA) { /* room for another */
164 fatab[nfatab] = pfa;
165 fatab[nfatab]->use = now++;
166 nfatab++;
167 return pfa;
168 }
169 use = fatab[0]->use; /* replace least-recently used */
170 nuse = 0;
171 for (i = 1; i < nfatab; i++)
172 if (fatab[i]->use < use) {
173 use = fatab[i]->use;
174 nuse = i;
175 }
176 freefa(fatab[nuse]);
177 fatab[nuse] = pfa;
178 pfa->use = now++;
179 return pfa;
180 }
181
mkdfa(const char * s,bool anchor)182 fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
183 /* anchor = true for anchored matches, else false */
184 {
185 Node *p, *p1;
186 fa *f;
187
188 firstbasestr = (const uschar *) s;
189 basestr = firstbasestr;
190 p = reparse(s);
191 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
192 /* put ALL STAR in front of reg. exp. */
193 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
194 /* put FINAL after reg. exp. */
195
196 poscnt = 0;
197 penter(p1); /* enter parent pointers and leaf indices */
198 if ((f = calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
199 overflo(__func__);
200 f->accept = poscnt-1; /* penter has computed number of positions in re */
201 cfoll(f, p1); /* set up follow sets */
202 freetr(p1);
203 resize_state(f, 1);
204 f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
205 f->posns[1] = intalloc(1, __func__);
206 *f->posns[1] = 0;
207 f->initstat = makeinit(f, anchor);
208 f->anchor = anchor;
209 f->restr = (uschar *) tostring(s);
210 if (firstbasestr != basestr) {
211 if (basestr)
212 xfree(basestr);
213 }
214 return f;
215 }
216
makeinit(fa * f,bool anchor)217 int makeinit(fa *f, bool anchor)
218 {
219 int i, k;
220
221 f->curstat = 2;
222 f->out[2] = 0;
223 k = *(f->re[0].lfollow);
224 xfree(f->posns[2]);
225 f->posns[2] = intalloc(k + 1, __func__);
226 for (i = 0; i <= k; i++) {
227 (f->posns[2])[i] = (f->re[0].lfollow)[i];
228 }
229 if ((f->posns[2])[1] == f->accept)
230 f->out[2] = 1;
231 for (i = 0; i < NCHARS; i++)
232 f->gototab[2][i] = 0;
233 f->curstat = cgoto(f, 2, HAT);
234 if (anchor) {
235 *f->posns[2] = k-1; /* leave out position 0 */
236 for (i = 0; i < k; i++) {
237 (f->posns[0])[i] = (f->posns[2])[i];
238 }
239
240 f->out[0] = f->out[2];
241 if (f->curstat != 2)
242 --(*f->posns[f->curstat]);
243 }
244 return f->curstat;
245 }
246
penter(Node * p)247 void penter(Node *p) /* set up parent pointers and leaf indices */
248 {
249 switch (type(p)) {
250 ELEAF
251 LEAF
252 info(p) = poscnt;
253 poscnt++;
254 break;
255 UNARY
256 penter(left(p));
257 parent(left(p)) = p;
258 break;
259 case CAT:
260 case OR:
261 penter(left(p));
262 penter(right(p));
263 parent(left(p)) = p;
264 parent(right(p)) = p;
265 break;
266 case ZERO:
267 break;
268 default: /* can't happen */
269 FATAL("can't happen: unknown type %d in penter", type(p));
270 break;
271 }
272 }
273
freetr(Node * p)274 void freetr(Node *p) /* free parse tree */
275 {
276 switch (type(p)) {
277 ELEAF
278 LEAF
279 xfree(p);
280 break;
281 UNARY
282 case ZERO:
283 freetr(left(p));
284 xfree(p);
285 break;
286 case CAT:
287 case OR:
288 freetr(left(p));
289 freetr(right(p));
290 xfree(p);
291 break;
292 default: /* can't happen */
293 FATAL("can't happen: unknown type %d in freetr", type(p));
294 break;
295 }
296 }
297
298 /* in the parsing of regular expressions, metacharacters like . have */
299 /* to be seen literally; \056 is not a metacharacter. */
300
hexstr(const uschar ** pp)301 int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
302 { /* only pick up one 8-bit byte (2 chars) */
303 const uschar *p;
304 int n = 0;
305 int i;
306
307 for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
308 if (isdigit(*p))
309 n = 16 * n + *p - '0';
310 else if (*p >= 'a' && *p <= 'f')
311 n = 16 * n + *p - 'a' + 10;
312 else if (*p >= 'A' && *p <= 'F')
313 n = 16 * n + *p - 'A' + 10;
314 }
315 *pp = p;
316 return n;
317 }
318
319 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
320
quoted(const uschar ** pp)321 int quoted(const uschar **pp) /* pick up next thing after a \\ */
322 /* and increment *pp */
323 {
324 const uschar *p = *pp;
325 int c;
326
327 if ((c = *p++) == 't')
328 c = '\t';
329 else if (c == 'n')
330 c = '\n';
331 else if (c == 'f')
332 c = '\f';
333 else if (c == 'r')
334 c = '\r';
335 else if (c == 'b')
336 c = '\b';
337 else if (c == 'v')
338 c = '\v';
339 else if (c == 'a')
340 c = '\a';
341 else if (c == '\\')
342 c = '\\';
343 else if (c == 'x') { /* hexadecimal goo follows */
344 c = hexstr(&p); /* this adds a null if number is invalid */
345 } else if (isoctdigit(c)) { /* \d \dd \ddd */
346 int n = c - '0';
347 if (isoctdigit(*p)) {
348 n = 8 * n + *p++ - '0';
349 if (isoctdigit(*p))
350 n = 8 * n + *p++ - '0';
351 }
352 c = n;
353 } /* else */
354 /* c = c; */
355 *pp = p;
356 return c;
357 }
358
cclenter(const char * argp)359 char *cclenter(const char *argp) /* add a character class */
360 {
361 int i, c, c2;
362 const uschar *op, *p = (const uschar *) argp;
363 uschar *bp;
364 static uschar *buf = NULL;
365 static int bufsz = 100;
366
367 op = p;
368 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
369 FATAL("out of space for character class [%.10s...] 1", p);
370 bp = buf;
371 for (i = 0; (c = *p++) != 0; ) {
372 if (c == '\\') {
373 c = quoted(&p);
374 } else if (c == '-' && i > 0 && bp[-1] != 0) {
375 if (*p != 0) {
376 c = bp[-1];
377 c2 = *p++;
378 if (c2 == '\\')
379 c2 = quoted(&p);
380 if (c > c2) { /* empty; ignore */
381 bp--;
382 i--;
383 continue;
384 }
385 while (c < c2) {
386 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
387 FATAL("out of space for character class [%.10s...] 2", p);
388 *bp++ = ++c;
389 i++;
390 }
391 continue;
392 }
393 }
394 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
395 FATAL("out of space for character class [%.10s...] 3", p);
396 *bp++ = c;
397 i++;
398 }
399 *bp = 0;
400 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
401 xfree(op);
402 return (char *) tostring((char *) buf);
403 }
404
overflo(const char * s)405 void overflo(const char *s)
406 {
407 FATAL("regular expression too big: out of space in %.30s...", s);
408 }
409
cfoll(fa * f,Node * v)410 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
411 {
412 int i;
413 int *p;
414
415 switch (type(v)) {
416 ELEAF
417 LEAF
418 f->re[info(v)].ltype = type(v);
419 f->re[info(v)].lval.np = right(v);
420 while (f->accept >= maxsetvec) { /* guessing here! */
421 resizesetvec(__func__);
422 }
423 for (i = 0; i <= f->accept; i++)
424 setvec[i] = 0;
425 setcnt = 0;
426 follow(v); /* computes setvec and setcnt */
427 p = intalloc(setcnt + 1, __func__);
428 f->re[info(v)].lfollow = p;
429 *p = setcnt;
430 for (i = f->accept; i >= 0; i--)
431 if (setvec[i] == 1)
432 *++p = i;
433 break;
434 UNARY
435 cfoll(f,left(v));
436 break;
437 case CAT:
438 case OR:
439 cfoll(f,left(v));
440 cfoll(f,right(v));
441 break;
442 case ZERO:
443 break;
444 default: /* can't happen */
445 FATAL("can't happen: unknown type %d in cfoll", type(v));
446 }
447 }
448
first(Node * p)449 int first(Node *p) /* collects initially active leaves of p into setvec */
450 /* returns 0 if p matches empty string */
451 {
452 int b, lp;
453
454 switch (type(p)) {
455 ELEAF
456 LEAF
457 lp = info(p); /* look for high-water mark of subscripts */
458 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
459 resizesetvec(__func__);
460 }
461 if (type(p) == EMPTYRE) {
462 setvec[lp] = 0;
463 return(0);
464 }
465 if (setvec[lp] != 1) {
466 setvec[lp] = 1;
467 setcnt++;
468 }
469 if (type(p) == CCL && (*(char *) right(p)) == '\0')
470 return(0); /* empty CCL */
471 return(1);
472 case PLUS:
473 if (first(left(p)) == 0)
474 return(0);
475 return(1);
476 case STAR:
477 case QUEST:
478 first(left(p));
479 return(0);
480 case CAT:
481 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
482 return(1);
483 case OR:
484 b = first(right(p));
485 if (first(left(p)) == 0 || b == 0) return(0);
486 return(1);
487 case ZERO:
488 return 0;
489 }
490 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
491 return(-1);
492 }
493
follow(Node * v)494 void follow(Node *v) /* collects leaves that can follow v into setvec */
495 {
496 Node *p;
497
498 if (type(v) == FINAL)
499 return;
500 p = parent(v);
501 switch (type(p)) {
502 case STAR:
503 case PLUS:
504 first(v);
505 follow(p);
506 return;
507
508 case OR:
509 case QUEST:
510 follow(p);
511 return;
512
513 case CAT:
514 if (v == left(p)) { /* v is left child of p */
515 if (first(right(p)) == 0) {
516 follow(p);
517 return;
518 }
519 } else /* v is right child */
520 follow(p);
521 return;
522 }
523 }
524
member(int c,const char * sarg)525 int member(int c, const char *sarg) /* is c in s? */
526 {
527 const uschar *s = (const uschar *) sarg;
528
529 while (*s)
530 if (c == *s++)
531 return(1);
532 return(0);
533 }
534
match(fa * f,const char * p0)535 int match(fa *f, const char *p0) /* shortest match ? */
536 {
537 int s, ns;
538 const uschar *p = (const uschar *) p0;
539
540 s = f->initstat;
541 assert (s < f->state_count);
542
543 if (f->out[s])
544 return(1);
545 do {
546 /* assert(*p < NCHARS); */
547 if ((ns = f->gototab[s][*p]) != 0)
548 s = ns;
549 else
550 s = cgoto(f, s, *p);
551 if (f->out[s])
552 return(1);
553 } while (*p++ != 0);
554 return(0);
555 }
556
pmatch(fa * f,const char * p0)557 int pmatch(fa *f, const char *p0) /* longest match, for sub */
558 {
559 int s, ns;
560 const uschar *p = (const uschar *) p0;
561 const uschar *q;
562
563 s = f->initstat;
564 assert(s < f->state_count);
565
566 patbeg = (const char *)p;
567 patlen = -1;
568 do {
569 q = p;
570 do {
571 if (f->out[s]) /* final state */
572 patlen = q-p;
573 /* assert(*q < NCHARS); */
574 if ((ns = f->gototab[s][*q]) != 0)
575 s = ns;
576 else
577 s = cgoto(f, s, *q);
578
579 assert(s < f->state_count);
580
581 if (s == 1) { /* no transition */
582 if (patlen >= 0) {
583 patbeg = (const char *) p;
584 return(1);
585 }
586 else
587 goto nextin; /* no match */
588 }
589 } while (*q++ != 0);
590 if (f->out[s])
591 patlen = q-p-1; /* don't count $ */
592 if (patlen >= 0) {
593 patbeg = (const char *) p;
594 return(1);
595 }
596 nextin:
597 s = 2;
598 } while (*p++);
599 return (0);
600 }
601
nematch(fa * f,const char * p0)602 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
603 {
604 int s, ns;
605 const uschar *p = (const uschar *) p0;
606 const uschar *q;
607
608 s = f->initstat;
609 assert(s < f->state_count);
610
611 patbeg = (const char *)p;
612 patlen = -1;
613 while (*p) {
614 q = p;
615 do {
616 if (f->out[s]) /* final state */
617 patlen = q-p;
618 /* assert(*q < NCHARS); */
619 if ((ns = f->gototab[s][*q]) != 0)
620 s = ns;
621 else
622 s = cgoto(f, s, *q);
623 if (s == 1) { /* no transition */
624 if (patlen > 0) {
625 patbeg = (const char *) p;
626 return(1);
627 } else
628 goto nnextin; /* no nonempty match */
629 }
630 } while (*q++ != 0);
631 if (f->out[s])
632 patlen = q-p-1; /* don't count $ */
633 if (patlen > 0 ) {
634 patbeg = (const char *) p;
635 return(1);
636 }
637 nnextin:
638 s = 2;
639 p++;
640 }
641 return (0);
642 }
643
644
645 /*
646 * NAME
647 * fnematch
648 *
649 * DESCRIPTION
650 * A stream-fed version of nematch which transfers characters to a
651 * null-terminated buffer. All characters up to and including the last
652 * character of the matching text or EOF are placed in the buffer. If
653 * a match is found, patbeg and patlen are set appropriately.
654 *
655 * RETURN VALUES
656 * false No match found.
657 * true Match found.
658 */
659
fnematch(fa * pfa,FILE * f,char ** pbuf,int * pbufsize,int quantum)660 bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
661 {
662 char *buf = *pbuf;
663 int bufsize = *pbufsize;
664 int c, i, j, k, ns, s;
665
666 s = pfa->initstat;
667 patlen = 0;
668
669 /*
670 * All indices relative to buf.
671 * i <= j <= k <= bufsize
672 *
673 * i: origin of active substring
674 * j: current character
675 * k: destination of next getc()
676 */
677 i = -1, k = 0;
678 do {
679 j = i++;
680 do {
681 if (++j == k) {
682 if (k == bufsize)
683 if (!adjbuf((char **) &buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
684 FATAL("stream '%.30s...' too long", buf);
685 buf[k++] = (c = getc(f)) != EOF ? c : 0;
686 }
687 c = buf[j];
688 /* assert(c < NCHARS); */
689
690 if ((ns = pfa->gototab[s][c]) != 0)
691 s = ns;
692 else
693 s = cgoto(pfa, s, c);
694
695 if (pfa->out[s]) { /* final state */
696 patlen = j - i + 1;
697 if (c == 0) /* don't count $ */
698 patlen--;
699 }
700 } while (buf[j] && s != 1);
701 s = 2;
702 } while (buf[i] && !patlen);
703
704 /* adjbuf() may have relocated a resized buffer. Inform the world. */
705 *pbuf = buf;
706 *pbufsize = bufsize;
707
708 if (patlen) {
709 patbeg = (char *) buf + i;
710 /*
711 * Under no circumstances is the last character fed to
712 * the automaton part of the match. It is EOF's nullbyte,
713 * or it sent the automaton into a state with no further
714 * transitions available (s==1), or both. Room for a
715 * terminating nullbyte is guaranteed.
716 *
717 * ungetc any chars after the end of matching text
718 * (except for EOF's nullbyte, if present) and null
719 * terminate the buffer.
720 */
721 do
722 if (buf[--k] && ungetc(buf[k], f) == EOF)
723 FATAL("unable to ungetc '%c'", buf[k]);
724 while (k > i + patlen);
725 buf[k] = '\0';
726 return true;
727 }
728 else
729 return false;
730 }
731
reparse(const char * p)732 Node *reparse(const char *p) /* parses regular expression pointed to by p */
733 { /* uses relex() to scan regular expression */
734 Node *np;
735
736 dprintf( ("reparse <%s>\n", p) );
737 lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
738 rtok = relex();
739 /* GNU compatibility: an empty regexp matches anything */
740 if (rtok == '\0') {
741 /* FATAL("empty regular expression"); previous */
742 return(op2(EMPTYRE, NIL, NIL));
743 }
744 np = regexp();
745 if (rtok != '\0')
746 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
747 return(np);
748 }
749
regexp(void)750 Node *regexp(void) /* top-level parse of reg expr */
751 {
752 return (alt(concat(primary())));
753 }
754
primary(void)755 Node *primary(void)
756 {
757 Node *np;
758 int savelastatom;
759
760 switch (rtok) {
761 case CHAR:
762 lastatom = starttok;
763 np = op2(CHAR, NIL, itonp(rlxval));
764 rtok = relex();
765 return (unary(np));
766 case ALL:
767 rtok = relex();
768 return (unary(op2(ALL, NIL, NIL)));
769 case EMPTYRE:
770 rtok = relex();
771 return (unary(op2(EMPTYRE, NIL, NIL)));
772 case DOT:
773 lastatom = starttok;
774 rtok = relex();
775 return (unary(op2(DOT, NIL, NIL)));
776 case CCL:
777 np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
778 lastatom = starttok;
779 rtok = relex();
780 return (unary(np));
781 case NCCL:
782 np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
783 lastatom = starttok;
784 rtok = relex();
785 return (unary(np));
786 case '^':
787 rtok = relex();
788 return (unary(op2(CHAR, NIL, itonp(HAT))));
789 case '$':
790 rtok = relex();
791 return (unary(op2(CHAR, NIL, NIL)));
792 case '(':
793 lastatom = starttok;
794 savelastatom = starttok - basestr; /* Retain over recursion */
795 rtok = relex();
796 if (rtok == ')') { /* special pleading for () */
797 rtok = relex();
798 return unary(op2(CCL, NIL, (Node *) tostring("")));
799 }
800 np = regexp();
801 if (rtok == ')') {
802 lastatom = basestr + savelastatom; /* Restore */
803 rtok = relex();
804 return (unary(np));
805 }
806 else
807 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
808 default:
809 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
810 }
811 return 0; /*NOTREACHED*/
812 }
813
concat(Node * np)814 Node *concat(Node *np)
815 {
816 switch (rtok) {
817 case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
818 return (concat(op2(CAT, np, primary())));
819 case EMPTYRE:
820 rtok = relex();
821 return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
822 primary())));
823 }
824 return (np);
825 }
826
alt(Node * np)827 Node *alt(Node *np)
828 {
829 if (rtok == OR) {
830 rtok = relex();
831 return (alt(op2(OR, np, concat(primary()))));
832 }
833 return (np);
834 }
835
unary(Node * np)836 Node *unary(Node *np)
837 {
838 switch (rtok) {
839 case STAR:
840 rtok = relex();
841 return (unary(op2(STAR, np, NIL)));
842 case PLUS:
843 rtok = relex();
844 return (unary(op2(PLUS, np, NIL)));
845 case QUEST:
846 rtok = relex();
847 return (unary(op2(QUEST, np, NIL)));
848 case ZERO:
849 rtok = relex();
850 return (unary(op2(ZERO, np, NIL)));
851 default:
852 return (np);
853 }
854 }
855
856 /*
857 * Character class definitions conformant to the POSIX locale as
858 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
859 * and operating character sets are both ASCII (ISO646) or supersets
860 * thereof.
861 *
862 * Note that to avoid overflowing the temporary buffer used in
863 * relex(), the expanded character class (prior to range expansion)
864 * must be less than twice the size of their full name.
865 */
866
867 /* Because isblank doesn't show up in any of the header files on any
868 * system i use, it's defined here. if some other locale has a richer
869 * definition of "blank", define HAS_ISBLANK and provide your own
870 * version.
871 * the parentheses here are an attempt to find a path through the maze
872 * of macro definition and/or function and/or version provided. thanks
873 * to nelson beebe for the suggestion; let's see if it works everywhere.
874 */
875
876 /* #define HAS_ISBLANK */
877 #ifndef HAS_ISBLANK
878
879 int (xisblank)(int c)
880 {
881 return c==' ' || c=='\t';
882 }
883
884 #endif
885
886 static const struct charclass {
887 const char *cc_name;
888 int cc_namelen;
889 int (*cc_func)(int);
890 } charclasses[] = {
891 { "alnum", 5, isalnum },
892 { "alpha", 5, isalpha },
893 #ifndef HAS_ISBLANK
894 { "blank", 5, xisblank },
895 #else
896 { "blank", 5, isblank },
897 #endif
898 { "cntrl", 5, iscntrl },
899 { "digit", 5, isdigit },
900 { "graph", 5, isgraph },
901 { "lower", 5, islower },
902 { "print", 5, isprint },
903 { "punct", 5, ispunct },
904 { "space", 5, isspace },
905 { "upper", 5, isupper },
906 { "xdigit", 6, isxdigit },
907 { NULL, 0, NULL },
908 };
909
910 #define REPEAT_SIMPLE 0
911 #define REPEAT_PLUS_APPENDED 1
912 #define REPEAT_WITH_Q 2
913 #define REPEAT_ZERO 3
914
915 static int
replace_repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum,int special_case)916 replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
917 int atomlen, int firstnum, int secondnum, int special_case)
918 {
919 int i, j;
920 uschar *buf = 0;
921 int ret = 1;
922 int init_q = (firstnum == 0); /* first added char will be ? */
923 int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
924 int prefix_length = reptok - basestr; /* prefix includes first rep */
925 int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
926 int size = prefix_length + suffix_length;
927
928 if (firstnum > 1) { /* add room for reps 2 through firstnum */
929 size += atomlen*(firstnum-1);
930 }
931
932 /* Adjust size of buffer for special cases */
933 if (special_case == REPEAT_PLUS_APPENDED) {
934 size++; /* for the final + */
935 } else if (special_case == REPEAT_WITH_Q) {
936 size += init_q + (atomlen+1)* n_q_reps;
937 } else if (special_case == REPEAT_ZERO) {
938 size += 2; /* just a null ERE: () */
939 }
940 if ((buf = malloc(size + 1)) == NULL)
941 FATAL("out of space in reg expr %.10s..", lastre);
942 memcpy(buf, basestr, prefix_length); /* copy prefix */
943 j = prefix_length;
944 if (special_case == REPEAT_ZERO) {
945 j -= atomlen;
946 buf[j++] = '(';
947 buf[j++] = ')';
948 }
949 for (i = 1; i < firstnum; i++) { /* copy x reps */
950 memcpy(&buf[j], atom, atomlen);
951 j += atomlen;
952 }
953 if (special_case == REPEAT_PLUS_APPENDED) {
954 buf[j++] = '+';
955 } else if (special_case == REPEAT_WITH_Q) {
956 if (init_q)
957 buf[j++] = '?';
958 for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
959 memcpy(&buf[j], atom, atomlen);
960 j += atomlen;
961 buf[j++] = '?';
962 }
963 }
964 memcpy(&buf[j], reptok+reptoklen, suffix_length);
965 if (special_case == REPEAT_ZERO) {
966 buf[j+suffix_length] = '\0';
967 } else {
968 buf[size] = '\0';
969 }
970 /* free old basestr */
971 if (firstbasestr != basestr) {
972 if (basestr)
973 xfree(basestr);
974 }
975 basestr = buf;
976 prestr = buf + prefix_length;
977 if (special_case == REPEAT_ZERO) {
978 prestr -= atomlen;
979 ret++;
980 }
981 return ret;
982 }
983
repeat(const uschar * reptok,int reptoklen,const uschar * atom,int atomlen,int firstnum,int secondnum)984 static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
985 int atomlen, int firstnum, int secondnum)
986 {
987 /*
988 In general, the repetition specifier or "bound" is replaced here
989 by an equivalent ERE string, repeating the immediately previous atom
990 and appending ? and + as needed. Note that the first copy of the
991 atom is left in place, except in the special_case of a zero-repeat
992 (i.e., {0}).
993 */
994 if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
995 if (firstnum < 2) {
996 /* 0 or 1: should be handled before you get here */
997 FATAL("internal error");
998 } else {
999 return replace_repeat(reptok, reptoklen, atom, atomlen,
1000 firstnum, secondnum, REPEAT_PLUS_APPENDED);
1001 }
1002 } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1003 if (firstnum == 0) { /* {0} or {0,0} */
1004 /* This case is unusual because the resulting
1005 replacement string might actually be SMALLER than
1006 the original ERE */
1007 return replace_repeat(reptok, reptoklen, atom, atomlen,
1008 firstnum, secondnum, REPEAT_ZERO);
1009 } else { /* (firstnum >= 1) */
1010 return replace_repeat(reptok, reptoklen, atom, atomlen,
1011 firstnum, secondnum, REPEAT_SIMPLE);
1012 }
1013 } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1014 /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1015 return replace_repeat(reptok, reptoklen, atom, atomlen,
1016 firstnum, secondnum, REPEAT_WITH_Q);
1017 } else { /* Error - shouldn't be here (n>m) */
1018 FATAL("internal error");
1019 }
1020 return 0;
1021 }
1022
relex(void)1023 int relex(void) /* lexical analyzer for reparse */
1024 {
1025 int c, n;
1026 int cflag;
1027 static uschar *buf = NULL;
1028 static int bufsz = 100;
1029 uschar *bp;
1030 const struct charclass *cc;
1031 int i;
1032 int num, m;
1033 bool commafound, digitfound;
1034 const uschar *startreptok;
1035 static int parens = 0;
1036
1037 rescan:
1038 starttok = prestr;
1039
1040 switch (c = *prestr++) {
1041 case '|': return OR;
1042 case '*': return STAR;
1043 case '+': return PLUS;
1044 case '?': return QUEST;
1045 case '.': return DOT;
1046 case '\0': prestr--; return '\0';
1047 case '^':
1048 case '$':
1049 return c;
1050 case '(':
1051 parens++;
1052 return c;
1053 case ')':
1054 if (parens) {
1055 parens--;
1056 return c;
1057 }
1058 /* unmatched close parenthesis; per POSIX, treat as literal */
1059 rlxval = c;
1060 return CHAR;
1061 case '\\':
1062 rlxval = quoted(&prestr);
1063 return CHAR;
1064 default:
1065 rlxval = c;
1066 return CHAR;
1067 case '[':
1068 if (buf == NULL && (buf = malloc(bufsz)) == NULL)
1069 FATAL("out of space in reg expr %.10s..", lastre);
1070 bp = buf;
1071 if (*prestr == '^') {
1072 cflag = 1;
1073 prestr++;
1074 }
1075 else
1076 cflag = 0;
1077 n = 2 * strlen((const char *) prestr)+1;
1078 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1079 FATAL("out of space for reg expr %.10s...", lastre);
1080 for (; ; ) {
1081 if ((c = *prestr++) == '\\') {
1082 *bp++ = '\\';
1083 if ((c = *prestr++) == '\0')
1084 FATAL("nonterminated character class %.20s...", lastre);
1085 *bp++ = c;
1086 /* } else if (c == '\n') { */
1087 /* FATAL("newline in character class %.20s...", lastre); */
1088 } else if (c == '[' && *prestr == ':') {
1089 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1090 for (cc = charclasses; cc->cc_name; cc++)
1091 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1092 break;
1093 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1094 prestr[2 + cc->cc_namelen] == ']') {
1095 prestr += cc->cc_namelen + 3;
1096 /*
1097 * BUG: We begin at 1, instead of 0, since we
1098 * would otherwise prematurely terminate the
1099 * string for classes like [[:cntrl:]]. This
1100 * means that we can't match the NUL character,
1101 * not without first adapting the entire
1102 * program to track each string's length.
1103 */
1104 for (i = 1; i <= UCHAR_MAX; i++) {
1105 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1106 FATAL("out of space for reg expr %.10s...", lastre);
1107 if (cc->cc_func(i)) {
1108 *bp++ = i;
1109 n++;
1110 }
1111 }
1112 } else
1113 *bp++ = c;
1114 } else if (c == '[' && *prestr == '.') {
1115 char collate_char;
1116 prestr++;
1117 collate_char = *prestr++;
1118 if (*prestr == '.' && prestr[1] == ']') {
1119 prestr += 2;
1120 /* Found it: map via locale TBD: for
1121 now, simply return this char. This
1122 is sufficient to pass conformance
1123 test awk.ex 156
1124 */
1125 if (*prestr == ']') {
1126 prestr++;
1127 rlxval = collate_char;
1128 return CHAR;
1129 }
1130 }
1131 } else if (c == '[' && *prestr == '=') {
1132 char equiv_char;
1133 prestr++;
1134 equiv_char = *prestr++;
1135 if (*prestr == '=' && prestr[1] == ']') {
1136 prestr += 2;
1137 /* Found it: map via locale TBD: for now
1138 simply return this char. This is
1139 sufficient to pass conformance test
1140 awk.ex 156
1141 */
1142 if (*prestr == ']') {
1143 prestr++;
1144 rlxval = equiv_char;
1145 return CHAR;
1146 }
1147 }
1148 } else if (c == '\0') {
1149 FATAL("nonterminated character class %.20s", lastre);
1150 } else if (bp == buf) { /* 1st char is special */
1151 *bp++ = c;
1152 } else if (c == ']') {
1153 *bp++ = 0;
1154 rlxstr = (uschar *) tostring((char *) buf);
1155 if (cflag == 0)
1156 return CCL;
1157 else
1158 return NCCL;
1159 } else
1160 *bp++ = c;
1161 }
1162 break;
1163 case '{':
1164 if (isdigit(*(prestr))) {
1165 num = 0; /* Process as a repetition */
1166 n = -1; m = -1;
1167 commafound = false;
1168 digitfound = false;
1169 startreptok = prestr-1;
1170 /* Remember start of previous atom here ? */
1171 } else { /* just a { char, not a repetition */
1172 rlxval = c;
1173 return CHAR;
1174 }
1175 for (; ; ) {
1176 if ((c = *prestr++) == '}') {
1177 if (commafound) {
1178 if (digitfound) { /* {n,m} */
1179 m = num;
1180 if (m < n)
1181 FATAL("illegal repetition expression: class %.20s",
1182 lastre);
1183 if (n == 0 && m == 1) {
1184 return QUEST;
1185 }
1186 } else { /* {n,} */
1187 if (n == 0)
1188 return STAR;
1189 else if (n == 1)
1190 return PLUS;
1191 }
1192 } else {
1193 if (digitfound) { /* {n} same as {n,n} */
1194 n = num;
1195 m = num;
1196 } else { /* {} */
1197 FATAL("illegal repetition expression: class %.20s",
1198 lastre);
1199 }
1200 }
1201 if (repeat(starttok, prestr-starttok, lastatom,
1202 startreptok - lastatom, n, m) > 0) {
1203 if (n == 0 && m == 0) {
1204 return ZERO;
1205 }
1206 /* must rescan input for next token */
1207 goto rescan;
1208 }
1209 /* Failed to replace: eat up {...} characters
1210 and treat like just PLUS */
1211 return PLUS;
1212 } else if (c == '\0') {
1213 FATAL("nonterminated character class %.20s",
1214 lastre);
1215 } else if (isdigit(c)) {
1216 num = 10 * num + c - '0';
1217 digitfound = true;
1218 } else if (c == ',') {
1219 if (commafound)
1220 FATAL("illegal repetition expression: class %.20s",
1221 lastre);
1222 /* looking for {n,} or {n,m} */
1223 commafound = true;
1224 n = num;
1225 digitfound = false; /* reset */
1226 num = 0;
1227 } else {
1228 FATAL("illegal repetition expression: class %.20s",
1229 lastre);
1230 }
1231 }
1232 break;
1233 }
1234 }
1235
cgoto(fa * f,int s,int c)1236 int cgoto(fa *f, int s, int c)
1237 {
1238 int *p, *q;
1239 int i, j, k;
1240
1241 assert(c == HAT || c < NCHARS);
1242 while (f->accept >= maxsetvec) { /* guessing here! */
1243 resizesetvec(__func__);
1244 }
1245 for (i = 0; i <= f->accept; i++)
1246 setvec[i] = 0;
1247 setcnt = 0;
1248 resize_state(f, s);
1249 /* compute positions of gototab[s,c] into setvec */
1250 p = f->posns[s];
1251 for (i = 1; i <= *p; i++) {
1252 if ((k = f->re[p[i]].ltype) != FINAL) {
1253 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1254 || (k == DOT && c != 0 && c != HAT)
1255 || (k == ALL && c != 0)
1256 || (k == EMPTYRE && c != 0)
1257 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1258 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1259 q = f->re[p[i]].lfollow;
1260 for (j = 1; j <= *q; j++) {
1261 if (q[j] >= maxsetvec) {
1262 resizesetvec(__func__);
1263 }
1264 if (setvec[q[j]] == 0) {
1265 setcnt++;
1266 setvec[q[j]] = 1;
1267 }
1268 }
1269 }
1270 }
1271 }
1272 /* determine if setvec is a previous state */
1273 tmpset[0] = setcnt;
1274 j = 1;
1275 for (i = f->accept; i >= 0; i--)
1276 if (setvec[i]) {
1277 tmpset[j++] = i;
1278 }
1279 resize_state(f, f->curstat > s ? f->curstat : s);
1280 /* tmpset == previous state? */
1281 for (i = 1; i <= f->curstat; i++) {
1282 p = f->posns[i];
1283 if ((k = tmpset[0]) != p[0])
1284 goto different;
1285 for (j = 1; j <= k; j++)
1286 if (tmpset[j] != p[j])
1287 goto different;
1288 /* setvec is state i */
1289 if (c != HAT)
1290 f->gototab[s][c] = i;
1291 return i;
1292 different:;
1293 }
1294
1295 /* add tmpset to current set of states */
1296 ++(f->curstat);
1297 resize_state(f, f->curstat);
1298 for (i = 0; i < NCHARS; i++)
1299 f->gototab[f->curstat][i] = 0;
1300 xfree(f->posns[f->curstat]);
1301 p = intalloc(setcnt + 1, __func__);
1302
1303 f->posns[f->curstat] = p;
1304 if (c != HAT)
1305 f->gototab[s][c] = f->curstat;
1306 for (i = 0; i <= setcnt; i++)
1307 p[i] = tmpset[i];
1308 if (setvec[f->accept])
1309 f->out[f->curstat] = 1;
1310 else
1311 f->out[f->curstat] = 0;
1312 return f->curstat;
1313 }
1314
1315
freefa(fa * f)1316 void freefa(fa *f) /* free a finite automaton */
1317 {
1318 int i;
1319
1320 if (f == NULL)
1321 return;
1322 for (i = 0; i < f->state_count; i++)
1323 xfree(f->gototab[i])
1324 for (i = 0; i <= f->curstat; i++)
1325 xfree(f->posns[i]);
1326 for (i = 0; i <= f->accept; i++) {
1327 xfree(f->re[i].lfollow);
1328 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1329 xfree(f->re[i].lval.np);
1330 }
1331 xfree(f->restr);
1332 xfree(f->out);
1333 xfree(f->posns);
1334 xfree(f->gototab);
1335 xfree(f);
1336 }
1337