1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 /* lasciate ogne speranza, voi ch'intrate. */
26
27 #define DEBUG
28
29 #include <ctype.h>
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "awk.h"
35 #include "ytab.h"
36
37 #define HAT (NCHARS+2) /* matches ^ in regular expr */
38 /* NCHARS is 2**n */
39 #define MAXLIN 22
40
41 #define type(v) (v)->nobj /* badly overloaded here */
42 #define info(v) (v)->ntype /* badly overloaded here */
43 #define left(v) (v)->narg[0]
44 #define right(v) (v)->narg[1]
45 #define parent(v) (v)->nnext
46
47 #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
48 #define ELEAF case EMPTYRE: /* empty string in regexp */
49 #define UNARY case STAR: case PLUS: case QUEST:
50
51 /* encoding in tree Nodes:
52 leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
53 left is index, right contains value or pointer to value
54 unary (STAR, PLUS, QUEST): left is child, right is null
55 binary (CAT, OR): left and right are children
56 parent contains pointer to parent
57 */
58
59
60 int *setvec;
61 int *tmpset;
62 int maxsetvec = 0;
63
64 int rtok; /* next token in current re */
65 int rlxval;
66 static uschar *rlxstr;
67 static uschar *prestr; /* current position in current re */
68 static uschar *lastre; /* origin of last re */
69
70 static int setcnt;
71 static int poscnt;
72
73 char *patbeg;
74 int patlen;
75
76 #define NFA 20 /* cache this many dynamic fa's */
77 fa *fatab[NFA];
78 int nfatab = 0; /* entries in fatab */
79
makedfa(const char * s,int anchor)80 fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
81 {
82 int i, use, nuse;
83 fa *pfa;
84 static int now = 1;
85
86 if (setvec == 0) { /* first time through any RE */
87 maxsetvec = MAXLIN;
88 setvec = (int *) malloc(maxsetvec * sizeof(int));
89 tmpset = (int *) malloc(maxsetvec * sizeof(int));
90 if (setvec == 0 || tmpset == 0)
91 overflo("out of space initializing makedfa");
92 }
93
94 if (compile_time) /* a constant for sure */
95 return mkdfa(s, anchor);
96 for (i = 0; i < nfatab; i++) /* is it there already? */
97 if (fatab[i]->anchor == anchor
98 && strcmp((const char *) fatab[i]->restr, s) == 0) {
99 fatab[i]->use = now++;
100 return fatab[i];
101 }
102 pfa = mkdfa(s, anchor);
103 if (nfatab < NFA) { /* room for another */
104 fatab[nfatab] = pfa;
105 fatab[nfatab]->use = now++;
106 nfatab++;
107 return pfa;
108 }
109 use = fatab[0]->use; /* replace least-recently used */
110 nuse = 0;
111 for (i = 1; i < nfatab; i++)
112 if (fatab[i]->use < use) {
113 use = fatab[i]->use;
114 nuse = i;
115 }
116 freefa(fatab[nuse]);
117 fatab[nuse] = pfa;
118 pfa->use = now++;
119 return pfa;
120 }
121
mkdfa(const char * s,int anchor)122 fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
123 /* anchor = 1 for anchored matches, else 0 */
124 {
125 Node *p, *p1;
126 fa *f;
127
128 p = reparse(s);
129 p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
130 /* put ALL STAR in front of reg. exp. */
131 p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
132 /* put FINAL after reg. exp. */
133
134 poscnt = 0;
135 penter(p1); /* enter parent pointers and leaf indices */
136 if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
137 overflo("out of space for fa");
138 f->accept = poscnt-1; /* penter has computed number of positions in re */
139 cfoll(f, p1); /* set up follow sets */
140 freetr(p1);
141 if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
142 overflo("out of space in makedfa");
143 if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
144 overflo("out of space in makedfa");
145 *f->posns[1] = 0;
146 f->initstat = makeinit(f, anchor);
147 f->anchor = anchor;
148 f->restr = (uschar *) tostring(s);
149 return f;
150 }
151
makeinit(fa * f,int anchor)152 int makeinit(fa *f, int anchor)
153 {
154 int i, k;
155
156 f->curstat = 2;
157 f->out[2] = 0;
158 f->reset = 0;
159 k = *(f->re[0].lfollow);
160 xfree(f->posns[2]);
161 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
162 overflo("out of space in makeinit");
163 for (i=0; i <= k; i++) {
164 (f->posns[2])[i] = (f->re[0].lfollow)[i];
165 }
166 if ((f->posns[2])[1] == f->accept)
167 f->out[2] = 1;
168 for (i=0; i < NCHARS; i++)
169 f->gototab[2][i] = 0;
170 f->curstat = cgoto(f, 2, HAT);
171 if (anchor) {
172 *f->posns[2] = k-1; /* leave out position 0 */
173 for (i=0; i < k; i++) {
174 (f->posns[0])[i] = (f->posns[2])[i];
175 }
176
177 f->out[0] = f->out[2];
178 if (f->curstat != 2)
179 --(*f->posns[f->curstat]);
180 }
181 return f->curstat;
182 }
183
penter(Node * p)184 void penter(Node *p) /* set up parent pointers and leaf indices */
185 {
186 switch (type(p)) {
187 ELEAF
188 LEAF
189 info(p) = poscnt;
190 poscnt++;
191 break;
192 UNARY
193 penter(left(p));
194 parent(left(p)) = p;
195 break;
196 case CAT:
197 case OR:
198 penter(left(p));
199 penter(right(p));
200 parent(left(p)) = p;
201 parent(right(p)) = p;
202 break;
203 default: /* can't happen */
204 FATAL("can't happen: unknown type %d in penter", type(p));
205 break;
206 }
207 }
208
freetr(Node * p)209 void freetr(Node *p) /* free parse tree */
210 {
211 switch (type(p)) {
212 ELEAF
213 LEAF
214 xfree(p);
215 break;
216 UNARY
217 freetr(left(p));
218 xfree(p);
219 break;
220 case CAT:
221 case OR:
222 freetr(left(p));
223 freetr(right(p));
224 xfree(p);
225 break;
226 default: /* can't happen */
227 FATAL("can't happen: unknown type %d in freetr", type(p));
228 break;
229 }
230 }
231
232 /* in the parsing of regular expressions, metacharacters like . have */
233 /* to be seen literally; \056 is not a metacharacter. */
234
hexstr(uschar ** pp)235 int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */
236 { /* only pick up one 8-bit byte (2 chars) */
237 uschar *p;
238 int n = 0;
239 int i;
240
241 for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
242 if (isdigit(*p))
243 n = 16 * n + *p - '0';
244 else if (*p >= 'a' && *p <= 'f')
245 n = 16 * n + *p - 'a' + 10;
246 else if (*p >= 'A' && *p <= 'F')
247 n = 16 * n + *p - 'A' + 10;
248 }
249 *pp = (uschar *) p;
250 return n;
251 }
252
253 #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
254
quoted(uschar ** pp)255 int quoted(uschar **pp) /* pick up next thing after a \\ */
256 /* and increment *pp */
257 {
258 uschar *p = *pp;
259 int c;
260
261 if ((c = *p++) == 't')
262 c = '\t';
263 else if (c == 'n')
264 c = '\n';
265 else if (c == 'f')
266 c = '\f';
267 else if (c == 'r')
268 c = '\r';
269 else if (c == 'b')
270 c = '\b';
271 else if (c == '\\')
272 c = '\\';
273 else if (c == 'x') { /* hexadecimal goo follows */
274 c = hexstr(&p); /* this adds a null if number is invalid */
275 } else if (isoctdigit(c)) { /* \d \dd \ddd */
276 int n = c - '0';
277 if (isoctdigit(*p)) {
278 n = 8 * n + *p++ - '0';
279 if (isoctdigit(*p))
280 n = 8 * n + *p++ - '0';
281 }
282 c = n;
283 } /* else */
284 /* c = c; */
285 *pp = p;
286 return c;
287 }
288
cclenter(const char * argp)289 char *cclenter(const char *argp) /* add a character class */
290 {
291 int i, c, c2;
292 uschar *p = (uschar *) argp;
293 uschar *op, *bp;
294 static uschar *buf = 0;
295 static int bufsz = 100;
296
297 op = p;
298 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
299 FATAL("out of space for character class [%.10s...] 1", p);
300 bp = buf;
301 for (i = 0; (c = *p++) != 0; ) {
302 if (c == '\\') {
303 c = quoted(&p);
304 } else if (c == '-' && i > 0 && bp[-1] != 0) {
305 if (*p != 0) {
306 c = bp[-1];
307 c2 = *p++;
308 if (c2 == '\\')
309 c2 = quoted(&p);
310 if (c > c2) { /* empty; ignore */
311 bp--;
312 i--;
313 continue;
314 }
315 while (c < c2) {
316 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
317 FATAL("out of space for character class [%.10s...] 2", p);
318 *bp++ = ++c;
319 i++;
320 }
321 continue;
322 }
323 }
324 if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
325 FATAL("out of space for character class [%.10s...] 3", p);
326 *bp++ = c;
327 i++;
328 }
329 *bp = 0;
330 dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
331 xfree(op);
332 return (char *) tostring((char *) buf);
333 }
334
overflo(const char * s)335 void overflo(const char *s)
336 {
337 FATAL("regular expression too big: %.30s...", s);
338 }
339
cfoll(fa * f,Node * v)340 void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
341 {
342 int i;
343 int *p;
344
345 switch (type(v)) {
346 ELEAF
347 LEAF
348 f->re[info(v)].ltype = type(v);
349 f->re[info(v)].lval.np = right(v);
350 while (f->accept >= maxsetvec) { /* guessing here! */
351 maxsetvec *= 4;
352 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
353 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
354 if (setvec == 0 || tmpset == 0)
355 overflo("out of space in cfoll()");
356 }
357 for (i = 0; i <= f->accept; i++)
358 setvec[i] = 0;
359 setcnt = 0;
360 follow(v); /* computes setvec and setcnt */
361 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
362 overflo("out of space building follow set");
363 f->re[info(v)].lfollow = p;
364 *p = setcnt;
365 for (i = f->accept; i >= 0; i--)
366 if (setvec[i] == 1)
367 *++p = i;
368 break;
369 UNARY
370 cfoll(f,left(v));
371 break;
372 case CAT:
373 case OR:
374 cfoll(f,left(v));
375 cfoll(f,right(v));
376 break;
377 default: /* can't happen */
378 FATAL("can't happen: unknown type %d in cfoll", type(v));
379 }
380 }
381
first(Node * p)382 int first(Node *p) /* collects initially active leaves of p into setvec */
383 /* returns 0 if p matches empty string */
384 {
385 int b, lp;
386
387 switch (type(p)) {
388 ELEAF
389 LEAF
390 lp = info(p); /* look for high-water mark of subscripts */
391 while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
392 maxsetvec *= 4;
393 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
394 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
395 if (setvec == 0 || tmpset == 0)
396 overflo("out of space in first()");
397 }
398 if (type(p) == EMPTYRE) {
399 setvec[lp] = 0;
400 return(0);
401 }
402 if (setvec[lp] != 1) {
403 setvec[lp] = 1;
404 setcnt++;
405 }
406 if (type(p) == CCL && (*(char *) right(p)) == '\0')
407 return(0); /* empty CCL */
408 else return(1);
409 case PLUS:
410 if (first(left(p)) == 0) return(0);
411 return(1);
412 case STAR:
413 case QUEST:
414 first(left(p));
415 return(0);
416 case CAT:
417 if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
418 return(1);
419 case OR:
420 b = first(right(p));
421 if (first(left(p)) == 0 || b == 0) return(0);
422 return(1);
423 }
424 FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
425 return(-1);
426 }
427
follow(Node * v)428 void follow(Node *v) /* collects leaves that can follow v into setvec */
429 {
430 Node *p;
431
432 if (type(v) == FINAL)
433 return;
434 p = parent(v);
435 switch (type(p)) {
436 case STAR:
437 case PLUS:
438 first(v);
439 follow(p);
440 return;
441
442 case OR:
443 case QUEST:
444 follow(p);
445 return;
446
447 case CAT:
448 if (v == left(p)) { /* v is left child of p */
449 if (first(right(p)) == 0) {
450 follow(p);
451 return;
452 }
453 } else /* v is right child */
454 follow(p);
455 return;
456 }
457 }
458
member(int c,const char * sarg)459 int member(int c, const char *sarg) /* is c in s? */
460 {
461 uschar *s = (uschar *) sarg;
462
463 while (*s)
464 if (c == *s++)
465 return(1);
466 return(0);
467 }
468
match(fa * f,const char * p0)469 int match(fa *f, const char *p0) /* shortest match ? */
470 {
471 int s, ns;
472 uschar *p = (uschar *) p0;
473
474 s = f->reset ? makeinit(f,0) : f->initstat;
475 if (f->out[s])
476 return(1);
477 do {
478 /* assert(*p < NCHARS); */
479 if ((ns = f->gototab[s][*p]) != 0)
480 s = ns;
481 else
482 s = cgoto(f, s, *p);
483 if (f->out[s])
484 return(1);
485 } while (*p++ != 0);
486 return(0);
487 }
488
pmatch(fa * f,const char * p0)489 int pmatch(fa *f, const char *p0) /* longest match, for sub */
490 {
491 int s, ns;
492 uschar *p = (uschar *) p0;
493 uschar *q;
494 int i, k;
495
496 /* s = f->reset ? makeinit(f,1) : f->initstat; */
497 if (f->reset) {
498 f->initstat = s = makeinit(f,1);
499 } else {
500 s = f->initstat;
501 }
502 patbeg = (char *) p;
503 patlen = -1;
504 do {
505 q = p;
506 do {
507 if (f->out[s]) /* final state */
508 patlen = q-p;
509 /* assert(*q < NCHARS); */
510 if ((ns = f->gototab[s][*q]) != 0)
511 s = ns;
512 else
513 s = cgoto(f, s, *q);
514 if (s == 1) { /* no transition */
515 if (patlen >= 0) {
516 patbeg = (char *) p;
517 return(1);
518 }
519 else
520 goto nextin; /* no match */
521 }
522 } while (*q++ != 0);
523 if (f->out[s])
524 patlen = q-p-1; /* don't count $ */
525 if (patlen >= 0) {
526 patbeg = (char *) p;
527 return(1);
528 }
529 nextin:
530 s = 2;
531 if (f->reset) {
532 for (i = 2; i <= f->curstat; i++)
533 xfree(f->posns[i]);
534 k = *f->posns[0];
535 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
536 overflo("out of space in pmatch");
537 for (i = 0; i <= k; i++)
538 (f->posns[2])[i] = (f->posns[0])[i];
539 f->initstat = f->curstat = 2;
540 f->out[2] = f->out[0];
541 for (i = 0; i < NCHARS; i++)
542 f->gototab[2][i] = 0;
543 }
544 } while (*p++ != 0);
545 return (0);
546 }
547
nematch(fa * f,const char * p0)548 int nematch(fa *f, const char *p0) /* non-empty match, for sub */
549 {
550 int s, ns;
551 uschar *p = (uschar *) p0;
552 uschar *q;
553 int i, k;
554
555 /* s = f->reset ? makeinit(f,1) : f->initstat; */
556 if (f->reset) {
557 f->initstat = s = makeinit(f,1);
558 } else {
559 s = f->initstat;
560 }
561 patlen = -1;
562 while (*p) {
563 q = p;
564 do {
565 if (f->out[s]) /* final state */
566 patlen = q-p;
567 /* assert(*q < NCHARS); */
568 if ((ns = f->gototab[s][*q]) != 0)
569 s = ns;
570 else
571 s = cgoto(f, s, *q);
572 if (s == 1) { /* no transition */
573 if (patlen > 0) {
574 patbeg = (char *) p;
575 return(1);
576 } else
577 goto nnextin; /* no nonempty match */
578 }
579 } while (*q++ != 0);
580 if (f->out[s])
581 patlen = q-p-1; /* don't count $ */
582 if (patlen > 0 ) {
583 patbeg = (char *) p;
584 return(1);
585 }
586 nnextin:
587 s = 2;
588 if (f->reset) {
589 for (i = 2; i <= f->curstat; i++)
590 xfree(f->posns[i]);
591 k = *f->posns[0];
592 if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
593 overflo("out of state space");
594 for (i = 0; i <= k; i++)
595 (f->posns[2])[i] = (f->posns[0])[i];
596 f->initstat = f->curstat = 2;
597 f->out[2] = f->out[0];
598 for (i = 0; i < NCHARS; i++)
599 f->gototab[2][i] = 0;
600 }
601 p++;
602 }
603 return (0);
604 }
605
reparse(const char * p)606 Node *reparse(const char *p) /* parses regular expression pointed to by p */
607 { /* uses relex() to scan regular expression */
608 Node *np;
609
610 dprintf( ("reparse <%s>\n", p) );
611 lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
612 rtok = relex();
613 /* GNU compatibility: an empty regexp matches anything */
614 if (rtok == '\0') {
615 /* FATAL("empty regular expression"); previous */
616 return(op2(EMPTYRE, NIL, NIL));
617 }
618 np = regexp();
619 if (rtok != '\0')
620 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
621 return(np);
622 }
623
regexp(void)624 Node *regexp(void) /* top-level parse of reg expr */
625 {
626 return (alt(concat(primary())));
627 }
628
primary(void)629 Node *primary(void)
630 {
631 Node *np;
632
633 switch (rtok) {
634 case CHAR:
635 np = op2(CHAR, NIL, itonp(rlxval));
636 rtok = relex();
637 return (unary(np));
638 case ALL:
639 rtok = relex();
640 return (unary(op2(ALL, NIL, NIL)));
641 case EMPTYRE:
642 rtok = relex();
643 return (unary(op2(ALL, NIL, NIL)));
644 case DOT:
645 rtok = relex();
646 return (unary(op2(DOT, NIL, NIL)));
647 case CCL:
648 np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
649 rtok = relex();
650 return (unary(np));
651 case NCCL:
652 np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
653 rtok = relex();
654 return (unary(np));
655 case '^':
656 rtok = relex();
657 return (unary(op2(CHAR, NIL, itonp(HAT))));
658 case '$':
659 rtok = relex();
660 return (unary(op2(CHAR, NIL, NIL)));
661 case '(':
662 rtok = relex();
663 if (rtok == ')') { /* special pleading for () */
664 rtok = relex();
665 return unary(op2(CCL, NIL, (Node *) tostring("")));
666 }
667 np = regexp();
668 if (rtok == ')') {
669 rtok = relex();
670 return (unary(np));
671 }
672 else
673 FATAL("syntax error in regular expression %s at %s", lastre, prestr);
674 default:
675 FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
676 }
677 return 0; /*NOTREACHED*/
678 }
679
concat(Node * np)680 Node *concat(Node *np)
681 {
682 switch (rtok) {
683 case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
684 return (concat(op2(CAT, np, primary())));
685 }
686 return (np);
687 }
688
alt(Node * np)689 Node *alt(Node *np)
690 {
691 if (rtok == OR) {
692 rtok = relex();
693 return (alt(op2(OR, np, concat(primary()))));
694 }
695 return (np);
696 }
697
unary(Node * np)698 Node *unary(Node *np)
699 {
700 switch (rtok) {
701 case STAR:
702 rtok = relex();
703 return (unary(op2(STAR, np, NIL)));
704 case PLUS:
705 rtok = relex();
706 return (unary(op2(PLUS, np, NIL)));
707 case QUEST:
708 rtok = relex();
709 return (unary(op2(QUEST, np, NIL)));
710 default:
711 return (np);
712 }
713 }
714
715 /*
716 * Character class definitions conformant to the POSIX locale as
717 * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
718 * and operating character sets are both ASCII (ISO646) or supersets
719 * thereof.
720 *
721 * Note that to avoid overflowing the temporary buffer used in
722 * relex(), the expanded character class (prior to range expansion)
723 * must be less than twice the size of their full name.
724 */
725
726 /* Because isblank doesn't show up in any of the header files on any
727 * system i use, it's defined here. if some other locale has a richer
728 * definition of "blank", define HAS_ISBLANK and provide your own
729 * version.
730 * the parentheses here are an attempt to find a path through the maze
731 * of macro definition and/or function and/or version provided. thanks
732 * to nelson beebe for the suggestion; let's see if it works everywhere.
733 */
734
735 /* #define HAS_ISBLANK */
736 #ifndef HAS_ISBLANK
737
738 int (xisblank)(int c)
739 {
740 return c==' ' || c=='\t';
741 }
742
743 #endif
744
745 struct charclass {
746 const char *cc_name;
747 int cc_namelen;
748 int (*cc_func)(int);
749 } charclasses[] = {
750 { "alnum", 5, isalnum },
751 { "alpha", 5, isalpha },
752 #ifndef HAS_ISBLANK
753 { "blank", 5, xisblank },
754 #else
755 { "blank", 5, isblank },
756 #endif
757 { "cntrl", 5, iscntrl },
758 { "digit", 5, isdigit },
759 { "graph", 5, isgraph },
760 { "lower", 5, islower },
761 { "print", 5, isprint },
762 { "punct", 5, ispunct },
763 { "space", 5, isspace },
764 { "upper", 5, isupper },
765 { "xdigit", 6, isxdigit },
766 { NULL, 0, NULL },
767 };
768
769
relex(void)770 int relex(void) /* lexical analyzer for reparse */
771 {
772 int c, n;
773 int cflag;
774 static uschar *buf = 0;
775 static int bufsz = 100;
776 uschar *bp;
777 struct charclass *cc;
778 int i;
779
780 switch (c = *prestr++) {
781 case '|': return OR;
782 case '*': return STAR;
783 case '+': return PLUS;
784 case '?': return QUEST;
785 case '.': return DOT;
786 case '\0': prestr--; return '\0';
787 case '^':
788 case '$':
789 case '(':
790 case ')':
791 return c;
792 case '\\':
793 rlxval = quoted(&prestr);
794 return CHAR;
795 default:
796 rlxval = c;
797 return CHAR;
798 case '[':
799 if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
800 FATAL("out of space in reg expr %.10s..", lastre);
801 bp = buf;
802 if (*prestr == '^') {
803 cflag = 1;
804 prestr++;
805 }
806 else
807 cflag = 0;
808 n = 2 * strlen((const char *) prestr)+1;
809 if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
810 FATAL("out of space for reg expr %.10s...", lastre);
811 for (; ; ) {
812 if ((c = *prestr++) == '\\') {
813 *bp++ = '\\';
814 if ((c = *prestr++) == '\0')
815 FATAL("nonterminated character class %.20s...", lastre);
816 *bp++ = c;
817 /* } else if (c == '\n') { */
818 /* FATAL("newline in character class %.20s...", lastre); */
819 } else if (c == '[' && *prestr == ':') {
820 /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
821 for (cc = charclasses; cc->cc_name; cc++)
822 if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
823 break;
824 if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
825 prestr[2 + cc->cc_namelen] == ']') {
826 prestr += cc->cc_namelen + 3;
827 /*
828 * BUG: We begin at 1, instead of 0, since we
829 * would otherwise prematurely terminate the
830 * string for classes like [[:cntrl:]]. This
831 * means that we can't match the NUL character,
832 * not without first adapting the entire
833 * program to track each string's length.
834 */
835 for (i = 1; i <= UCHAR_MAX; i++) {
836 if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
837 FATAL("out of space for reg expr %.10s...", lastre);
838 if (cc->cc_func(i)) {
839 *bp++ = i;
840 n++;
841 }
842 }
843 } else
844 *bp++ = c;
845 } else if (c == '\0') {
846 FATAL("nonterminated character class %.20s", lastre);
847 } else if (bp == buf) { /* 1st char is special */
848 *bp++ = c;
849 } else if (c == ']') {
850 *bp++ = 0;
851 rlxstr = (uschar *) tostring((char *) buf);
852 if (cflag == 0)
853 return CCL;
854 else
855 return NCCL;
856 } else
857 *bp++ = c;
858 }
859 }
860 }
861
cgoto(fa * f,int s,int c)862 int cgoto(fa *f, int s, int c)
863 {
864 int i, j, k;
865 int *p, *q;
866
867 assert(c == HAT || c < NCHARS);
868 while (f->accept >= maxsetvec) { /* guessing here! */
869 maxsetvec *= 4;
870 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
871 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
872 if (setvec == 0 || tmpset == 0)
873 overflo("out of space in cgoto()");
874 }
875 for (i = 0; i <= f->accept; i++)
876 setvec[i] = 0;
877 setcnt = 0;
878 /* compute positions of gototab[s,c] into setvec */
879 p = f->posns[s];
880 for (i = 1; i <= *p; i++) {
881 if ((k = f->re[p[i]].ltype) != FINAL) {
882 if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
883 || (k == DOT && c != 0 && c != HAT)
884 || (k == ALL && c != 0)
885 || (k == EMPTYRE && c != 0)
886 || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
887 || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
888 q = f->re[p[i]].lfollow;
889 for (j = 1; j <= *q; j++) {
890 if (q[j] >= maxsetvec) {
891 maxsetvec *= 4;
892 setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
893 tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
894 if (setvec == 0 || tmpset == 0)
895 overflo("cgoto overflow");
896 }
897 if (setvec[q[j]] == 0) {
898 setcnt++;
899 setvec[q[j]] = 1;
900 }
901 }
902 }
903 }
904 }
905 /* determine if setvec is a previous state */
906 tmpset[0] = setcnt;
907 j = 1;
908 for (i = f->accept; i >= 0; i--)
909 if (setvec[i]) {
910 tmpset[j++] = i;
911 }
912 /* tmpset == previous state? */
913 for (i = 1; i <= f->curstat; i++) {
914 p = f->posns[i];
915 if ((k = tmpset[0]) != p[0])
916 goto different;
917 for (j = 1; j <= k; j++)
918 if (tmpset[j] != p[j])
919 goto different;
920 /* setvec is state i */
921 f->gototab[s][c] = i;
922 return i;
923 different:;
924 }
925
926 /* add tmpset to current set of states */
927 if (f->curstat >= NSTATES-1) {
928 f->curstat = 2;
929 f->reset = 1;
930 for (i = 2; i < NSTATES; i++)
931 xfree(f->posns[i]);
932 } else
933 ++(f->curstat);
934 for (i = 0; i < NCHARS; i++)
935 f->gototab[f->curstat][i] = 0;
936 xfree(f->posns[f->curstat]);
937 if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
938 overflo("out of space in cgoto");
939
940 f->posns[f->curstat] = p;
941 f->gototab[s][c] = f->curstat;
942 for (i = 0; i <= setcnt; i++)
943 p[i] = tmpset[i];
944 if (setvec[f->accept])
945 f->out[f->curstat] = 1;
946 else
947 f->out[f->curstat] = 0;
948 return f->curstat;
949 }
950
951
freefa(fa * f)952 void freefa(fa *f) /* free a finite automaton */
953 {
954 int i;
955
956 if (f == NULL)
957 return;
958 for (i = 0; i <= f->curstat; i++)
959 xfree(f->posns[i]);
960 for (i = 0; i <= f->accept; i++) {
961 xfree(f->re[i].lfollow);
962 if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
963 xfree((f->re[i].lval.np));
964 }
965 xfree(f->restr);
966 xfree(f);
967 }
968