1 #include <time.h>
2 #include <string.h>
3 #include <stdio.h>
4 #include <ctype.h>
5
6 #include "tools/re2c/globals.h"
7 #include "tools/re2c/parse.h"
8 #include "tools/re2c/dfa.h"
9
10 static Symbol *first = NULL;
11
12 void
Symbol_init(Symbol * r,const SubStr * str)13 Symbol_init(Symbol *r, const SubStr *str)
14 {
15 r->next = first;
16 Str_init(&r->name, str);
17 r->re = NULL;
18 first = r;
19 }
20
21 Symbol *
Symbol_find(const SubStr * str)22 Symbol_find(const SubStr *str)
23 {
24 Symbol *sym;
25 for(sym = first; sym; sym = sym->next)
26 if(SubStr_eq(&sym->name, str)) return sym;
27 return Symbol_new(str);
28 }
29
30 /*
31 void showIns(FILE *o, const Ins *i, const Ins *base){
32 o.width(3);
33 o << &i - &base << ": ";
34 switch(i.i.tag){
35 case CHAR: {
36 o << "match ";
37 for(const Ins *j = &(&i)[1]; j < (Ins*) i.i.link; ++j)
38 prtCh(o, j->c.value);
39 break;
40 } case GOTO:
41 o << "goto " << ((Ins*) i.i.link - &base);
42 break;
43 case FORK:
44 o << "fork " << ((Ins*) i.i.link - &base);
45 break;
46 case CTXT:
47 o << "term " << ((RuleOp*) i.i.link)->accept;
48 break;
49 case TERM:
50 o << "term " << ((RuleOp*) i.i.link)->accept;
51 break;
52 }
53 o << "\n";
54 }
55 */
56
57 static unsigned int
AltOp_fixedLength(RegExp * r)58 AltOp_fixedLength(RegExp *r)
59 {
60 unsigned int l1 = RegExp_fixedLength(r->d.AltCatOp.exp1);
61 /* XXX? Should be exp2? */
62 unsigned int l2 = RegExp_fixedLength(r->d.AltCatOp.exp1);
63 if(l1 != l2 || l1 == ~0u)
64 return ~0u;
65 return l1;
66 }
67
68 static unsigned int
CatOp_fixedLength(RegExp * r)69 CatOp_fixedLength(RegExp *r)
70 {
71 unsigned int l1, l2;
72 if((l1 = RegExp_fixedLength(r->d.AltCatOp.exp1)) != ~0u )
73 if((l2 = RegExp_fixedLength(r->d.AltCatOp.exp2)) != ~0u)
74 return l1+l2;
75 return ~0u;
76 }
77
78 unsigned int
RegExp_fixedLength(RegExp * r)79 RegExp_fixedLength(RegExp *r)
80 {
81 switch (r->type) {
82 case NULLOP:
83 return 0;
84 case MATCHOP:
85 return 1;
86 case ALTOP:
87 return AltOp_fixedLength(r);
88 case CATOP:
89 return CatOp_fixedLength(r);
90 default:
91 return ~0u;
92 }
93 return ~0u;
94 }
95
96 void
RegExp_calcSize(RegExp * re,Char * rep)97 RegExp_calcSize(RegExp *re, Char *rep)
98 {
99 Range *r;
100 unsigned int c;
101
102 switch (re->type) {
103 case NULLOP:
104 re->size = 0;
105 break;
106 case MATCHOP:
107 re->size = 1;
108 for(r = re->d.match; r; r = r->next)
109 for(c = r->lb; c < r->ub; ++c)
110 if(rep[c] == c)
111 ++re->size;
112 break;
113 case RULEOP:
114 RegExp_calcSize(re->d.RuleOp.exp, rep);
115 RegExp_calcSize(re->d.RuleOp.ctx, rep);
116 re->size = re->d.RuleOp.exp->size + re->d.RuleOp.ctx->size + 1;
117 break;
118 case ALTOP:
119 RegExp_calcSize(re->d.AltCatOp.exp1, rep);
120 RegExp_calcSize(re->d.AltCatOp.exp2, rep);
121 re->size = re->d.AltCatOp.exp1->size
122 + re->d.AltCatOp.exp2->size + 2;
123 break;
124 case CATOP:
125 RegExp_calcSize(re->d.AltCatOp.exp1, rep);
126 RegExp_calcSize(re->d.AltCatOp.exp2, rep);
127 re->size = re->d.AltCatOp.exp1->size + re->d.AltCatOp.exp2->size;
128 break;
129 case CLOSEOP:
130 RegExp_calcSize(re->d.exp, rep);
131 re->size = re->d.exp->size + 1;
132 break;
133 case CLOSEVOP:
134 RegExp_calcSize(re->d.CloseVOp.exp, rep);
135
136 if (re->d.CloseVOp.max >= 0)
137 re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) +
138 ((1 + re->d.CloseVOp.exp->size) *
139 (re->d.CloseVOp.max - re->d.CloseVOp.min));
140 else
141 re->size = (re->d.CloseVOp.exp->size * re->d.CloseVOp.min) + 1;
142 break;
143 }
144 }
145
146 static void
MatchOp_compile(RegExp * re,Char * rep,Ins * i)147 MatchOp_compile(RegExp *re, Char *rep, Ins *i)
148 {
149 Ins *j;
150 unsigned int bump;
151 Range *r;
152 unsigned int c;
153
154 i->i.tag = CHAR;
155 i->i.link = &i[re->size];
156 j = &i[1];
157 bump = re->size;
158 for(r = re->d.match; r; r = r->next){
159 for(c = r->lb; c < r->ub; ++c){
160 if(rep[c] == c){
161 j->c.value = c;
162 j->c.bump = --bump;
163 j++;
164 }
165 }
166 }
167 }
168
169 static void
AltOp_compile(RegExp * re,Char * rep,Ins * i)170 AltOp_compile(RegExp *re, Char *rep, Ins *i){
171 Ins *j;
172
173 i->i.tag = FORK;
174 j = &i[re->d.AltCatOp.exp1->size + 1];
175 i->i.link = &j[1];
176 RegExp_compile(re->d.AltCatOp.exp1, rep, &i[1]);
177 j->i.tag = GOTO;
178 j->i.link = &j[re->d.AltCatOp.exp2->size + 1];
179 RegExp_compile(re->d.AltCatOp.exp2, rep, &j[1]);
180 }
181
182 void
RegExp_compile(RegExp * re,Char * rep,Ins * i)183 RegExp_compile(RegExp *re, Char *rep, Ins *i)
184 {
185 Ins *jumppoint;
186 int st = 0;
187
188 switch (re->type) {
189 case NULLOP:
190 break;
191 case MATCHOP:
192 MatchOp_compile(re, rep, i);
193 break;
194 case RULEOP:
195 re->d.RuleOp.ins = i;
196 RegExp_compile(re->d.RuleOp.exp, rep, &i[0]);
197 i += re->d.RuleOp.exp->size;
198 RegExp_compile(re->d.RuleOp.ctx, rep, &i[0]);
199 i += re->d.RuleOp.ctx->size;
200 i->i.tag = TERM;
201 i->i.link = re;
202 break;
203 case ALTOP:
204 AltOp_compile(re, rep, i);
205 break;
206 case CATOP:
207 RegExp_compile(re->d.AltCatOp.exp1, rep, &i[0]);
208 RegExp_compile(re->d.AltCatOp.exp2, rep,
209 &i[re->d.AltCatOp.exp1->size]);
210 break;
211 case CLOSEOP:
212 RegExp_compile(re->d.exp, rep, &i[0]);
213 i += re->d.exp->size;
214 i->i.tag = FORK;
215 i->i.link = i - re->d.exp->size;
216 break;
217 case CLOSEVOP:
218 jumppoint = i + ((1 + re->d.CloseVOp.exp->size) *
219 (re->d.CloseVOp.max - re->d.CloseVOp.min));
220 for(st = re->d.CloseVOp.min; st < re->d.CloseVOp.max; st++) {
221 i->i.tag = FORK;
222 i->i.link = jumppoint;
223 i+=1;
224 RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]);
225 i += re->d.CloseVOp.exp->size;
226 }
227 for(st = 0; st < re->d.CloseVOp.min; st++) {
228 RegExp_compile(re->d.CloseVOp.exp, rep, &i[0]);
229 i += re->d.CloseVOp.exp->size;
230 if(re->d.CloseVOp.max < 0 && st == 0) {
231 i->i.tag = FORK;
232 i->i.link = i - re->d.CloseVOp.exp->size;
233 i++;
234 }
235 }
236 break;
237 }
238 }
239
240 static void
MatchOp_split(RegExp * re,CharSet * s)241 MatchOp_split(RegExp *re, CharSet *s)
242 {
243 Range *r;
244 unsigned int c;
245
246 for(r = re->d.match; r; r = r->next){
247 for(c = r->lb; c < r->ub; ++c){
248 CharPtn *x = s->rep[c], *a = x->nxt;
249 if(!a){
250 if(x->card == 1)
251 continue;
252 x->nxt = a = s->freeHead;
253 if(!(s->freeHead = s->freeHead->nxt))
254 s->freeTail = &s->freeHead;
255 a->nxt = NULL;
256 x->fix = s->fix;
257 s->fix = x;
258 }
259 if(--(x->card) == 0){
260 *s->freeTail = x;
261 *(s->freeTail = &x->nxt) = NULL;
262 }
263 s->rep[c] = a;
264 ++(a->card);
265 }
266 }
267 for(; s->fix; s->fix = s->fix->fix)
268 if(s->fix->card)
269 s->fix->nxt = NULL;
270 }
271
272 void
RegExp_split(RegExp * re,CharSet * s)273 RegExp_split(RegExp *re, CharSet *s)
274 {
275 switch (re->type) {
276 case NULLOP:
277 break;
278 case MATCHOP:
279 MatchOp_split(re, s);
280 break;
281 case RULEOP:
282 RegExp_split(re->d.RuleOp.exp, s);
283 RegExp_split(re->d.RuleOp.ctx, s);
284 break;
285 case ALTOP:
286 /* FALLTHROUGH */
287 case CATOP:
288 RegExp_split(re->d.AltCatOp.exp1, s);
289 RegExp_split(re->d.AltCatOp.exp2, s);
290 break;
291 case CLOSEOP:
292 RegExp_split(re->d.exp, s);
293 break;
294 case CLOSEVOP:
295 RegExp_split(re->d.CloseVOp.exp, s);
296 break;
297 }
298 }
299
300 void
RegExp_display(RegExp * re,FILE * o)301 RegExp_display(RegExp *re, FILE *o)
302 {
303 switch (re->type) {
304 case NULLOP:
305 fputc('_', o);
306 break;
307 case MATCHOP:
308 Range_out(o, re->d.match);
309 break;
310 case RULEOP:
311 RegExp_display(re->d.RuleOp.exp, o);
312 fputc('/', o);
313 RegExp_display(re->d.RuleOp.ctx, o);
314 fputc(';', o);
315 break;
316 case ALTOP:
317 RegExp_display(re->d.AltCatOp.exp1, o);
318 fputc('|', o);
319 RegExp_display(re->d.AltCatOp.exp2, o);
320 break;
321 case CATOP:
322 RegExp_display(re->d.AltCatOp.exp1, o);
323 RegExp_display(re->d.AltCatOp.exp2, o);
324 break;
325 case CLOSEOP:
326 RegExp_display(re->d.exp, o);
327 fputc('+', o);
328 break;
329 }
330 }
331
332 void
Range_out(FILE * o,const Range * r)333 Range_out(FILE *o, const Range *r)
334 {
335 if(!r)
336 return;
337
338 if((r->ub - r->lb) == 1){
339 prtCh(o, r->lb);
340 } else {
341 prtCh(o, r->lb);
342 fputc('-', o);
343 prtCh(o, r->ub-1);
344 }
345 Range_out(o, r->next);
346 }
347
doUnion(Range * r1,Range * r2)348 static Range *doUnion(Range *r1, Range *r2){
349 Range *r, **rP = &r;
350 for(;;){
351 Range *s;
352 if(r1->lb <= r2->lb){
353 s = Range_new_copy(r1);
354 } else {
355 s = Range_new_copy(r2);
356 }
357 *rP = s;
358 rP = &s->next;
359 for(;;){
360 if(r1->lb <= r2->lb){
361 if(r1->lb > s->ub)
362 break;
363 if(r1->ub > s->ub)
364 s->ub = r1->ub;
365 if(!(r1 = r1->next)){
366 unsigned int ub = 0;
367 for(; r2 && r2->lb <= s->ub; r2 = r2->next)
368 ub = r2->ub;
369 if(ub > s->ub)
370 s->ub = ub;
371 *rP = r2;
372 return r;
373 }
374 } else {
375 if(r2->lb > s->ub)
376 break;
377 if(r2->ub > s->ub)
378 s->ub = r2->ub;
379 if(!(r2 = r2->next)){
380 unsigned int ub = 0;
381 for(; r1 && r1->lb <= s->ub; r1 = r1->next)
382 ub = r1->ub;
383 if(ub > s->ub)
384 s->ub = ub;
385 *rP = r1;
386 return r;
387 }
388 }
389 }
390 }
391 *rP = NULL;
392 return r;
393 }
394
doDiff(Range * r1,Range * r2)395 static Range *doDiff(Range *r1, Range *r2){
396 Range *r, *s, **rP = &r;
397 for(; r1; r1 = r1->next){
398 unsigned int lb = r1->lb;
399 for(; r2 && r2->ub <= r1->lb; r2 = r2->next);
400 for(; r2 && r2->lb < r1->ub; r2 = r2->next){
401 if(lb < r2->lb){
402 *rP = s = Range_new(lb, r2->lb);
403 rP = &s->next;
404 }
405 if((lb = r2->ub) >= r1->ub)
406 goto noMore;
407 }
408 *rP = s = Range_new(lb, r1->ub);
409 rP = &s->next;
410 noMore:;
411 }
412 *rP = NULL;
413 return r;
414 }
415
merge(RegExp * m1,RegExp * m2)416 static RegExp *merge(RegExp *m1, RegExp *m2){
417 if(!m1)
418 return m2;
419 if(!m2)
420 return m1;
421 return RegExp_new_MatchOp(doUnion(m1->d.match, m2->d.match));
422 }
423
mkDiff(RegExp * e1,RegExp * e2)424 RegExp *mkDiff(RegExp *e1, RegExp *e2){
425 RegExp *m1, *m2;
426 Range *r;
427 if(!(m1 = RegExp_isA(e1, MATCHOP)))
428 return NULL;
429 if(!(m2 = RegExp_isA(e2, MATCHOP)))
430 return NULL;
431 r = doDiff(m1->d.match, m2->d.match);
432 return r? RegExp_new_MatchOp(r) : RegExp_new_NullOp();
433 }
434
doAlt(RegExp * e1,RegExp * e2)435 static RegExp *doAlt(RegExp *e1, RegExp *e2){
436 if(!e1)
437 return e2;
438 if(!e2)
439 return e1;
440 return RegExp_new_AltOp(e1, e2);
441 }
442
mkAlt(RegExp * e1,RegExp * e2)443 RegExp *mkAlt(RegExp *e1, RegExp *e2){
444 RegExp *a;
445 RegExp *m1, *m2;
446 if((a = RegExp_isA(e1, ALTOP))){
447 if((m1 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP)))
448 e1 = a->d.AltCatOp.exp2;
449 } else if((m1 = RegExp_isA(e1, MATCHOP))){
450 e1 = NULL;
451 }
452 if((a = RegExp_isA(e2, ALTOP))){
453 if((m2 = RegExp_isA(a->d.AltCatOp.exp1, MATCHOP)))
454 e2 = a->d.AltCatOp.exp2;
455 } else if((m2 = RegExp_isA(e2, MATCHOP))){
456 e2 = NULL;
457 }
458 return doAlt(merge(m1, m2), doAlt(e1, e2));
459 }
460
unescape(SubStr * s)461 static unsigned char unescape(SubStr *s){
462 unsigned char c;
463 unsigned char v;
464 s->len--;
465 if((c = *s->str++) != '\\' || s->len == 0)
466 return xlat[c];
467 s->len--;
468 switch(c = *s->str++){
469 case 'n':
470 return xlat['\n'];
471 case 't':
472 return xlat['\t'];
473 case 'v':
474 return xlat['\v'];
475 case 'b':
476 return xlat['\b'];
477 case 'r':
478 return xlat['\r'];
479 case 'f':
480 return xlat['\f'];
481 case 'a':
482 return xlat['\a'];
483 case '0': case '1': case '2': case '3':
484 case '4': case '5': case '6': case '7': {
485 v = c - '0';
486 for(; s->len != 0 && '0' <= (c = *s->str) && c <= '7'; s->len--, s->str++)
487 v = v*8 + (c - '0');
488 return v;
489 } default:
490 return xlat[c];
491 }
492 }
493
getRange(SubStr * s)494 static Range *getRange(SubStr *s){
495 unsigned char lb = unescape(s), ub;
496 if(s->len < 2 || *s->str != '-'){
497 ub = lb;
498 } else {
499 s->len--; s->str++;
500 ub = unescape(s);
501 if(ub < lb){
502 unsigned char tmp;
503 tmp = lb; lb = ub; ub = tmp;
504 }
505 }
506 return Range_new(lb, ub+1);
507 }
508
matchChar(unsigned int c)509 static RegExp *matchChar(unsigned int c){
510 return RegExp_new_MatchOp(Range_new(c, c+1));
511 }
512
strToRE(SubStr s)513 RegExp *strToRE(SubStr s){
514 RegExp *re;
515 s.len -= 2; s.str += 1;
516 if(s.len == 0)
517 return RegExp_new_NullOp();
518 re = matchChar(unescape(&s));
519 while(s.len > 0)
520 re = RegExp_new_CatOp(re, matchChar(unescape(&s)));
521 return re;
522 }
523
strToCaseInsensitiveRE(SubStr s)524 RegExp *strToCaseInsensitiveRE(SubStr s){
525 unsigned char c;
526 RegExp *re, *reL, *reU;
527 s.len -= 2; s.str += 1;
528 if(s.len == 0)
529 return RegExp_new_NullOp();
530 c = unescape(&s);
531 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
532 reL = matchChar(tolower(c));
533 reU = matchChar(toupper(c));
534 re = mkAlt(reL, reU);
535 } else {
536 re = matchChar(c);
537 }
538 while(s.len > 0) {
539 c = unescape(&s);
540 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) {
541 reL = matchChar(tolower(c));
542 reU = matchChar(toupper(c));
543 re = RegExp_new_CatOp(re, mkAlt(reL, reU));
544 } else {
545 re = RegExp_new_CatOp(re, matchChar(c));
546 }
547 }
548 return re;
549 }
550
ranToRE(SubStr s)551 RegExp *ranToRE(SubStr s){
552 Range *r;
553 s.len -= 2; s.str += 1;
554 if(s.len == 0)
555 return RegExp_new_NullOp();
556 r = getRange(&s);
557 while(s.len > 0)
558 r = doUnion(r, getRange(&s));
559 return RegExp_new_MatchOp(r);
560 }
561
invToRE(SubStr s)562 RegExp *invToRE(SubStr s)
563 {
564 RegExp *any, *ran, *inv;
565 SubStr *ss;
566
567
568 s.len--;
569 s.str++;
570
571 ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]"));
572 any = ranToRE(*ss);
573 free(ss);
574 if (s.len <= 2)
575 return any;
576
577 ran = ranToRE(s);
578 inv = mkDiff(any, ran);
579
580 free(ran);
581 free(any);
582
583 return inv;
584 }
585
mkDot()586 RegExp *mkDot()
587 {
588 SubStr *ss = SubStr_new("[\\000-\\377]", strlen("[\\000-\\377]"));
589 RegExp * any = ranToRE(*ss);
590 RegExp * ran = matchChar('\n');
591 RegExp * inv = mkDiff(any, ran);
592
593 free(ss);
594 free(ran);
595 free(any);
596
597 return inv;
598 }
599
600 RegExp *
RegExp_new_RuleOp(RegExp * e,RegExp * c,Token * t,unsigned int a)601 RegExp_new_RuleOp(RegExp *e, RegExp *c, Token *t, unsigned int a)
602 {
603 RegExp *r = malloc(sizeof(RegExp));
604 r->type = RULEOP;
605 r->d.RuleOp.exp = e;
606 r->d.RuleOp.ctx = c;
607 r->d.RuleOp.ins = NULL;
608 r->d.RuleOp.accept = a;
609 r->d.RuleOp.code = t;
610 return r;
611 }
612
optimize(Ins * i)613 static void optimize(Ins *i){
614 while(!isMarked(i)){
615 mark(i);
616 if(i->i.tag == CHAR){
617 i = (Ins*) i->i.link;
618 } else if(i->i.tag == GOTO || i->i.tag == FORK){
619 Ins *target = (Ins*) i->i.link;
620 optimize(target);
621 if(target->i.tag == GOTO)
622 i->i.link = target->i.link == target? i : target;
623 if(i->i.tag == FORK){
624 Ins *follow = (Ins*) &i[1];
625 optimize(follow);
626 if(follow->i.tag == GOTO && follow->i.link == follow){
627 i->i.tag = GOTO;
628 } else if(i->i.link == i){
629 i->i.tag = GOTO;
630 i->i.link = follow;
631 }
632 }
633 return;
634 } else {
635 ++i;
636 }
637 }
638 }
639
genCode(FILE * o,RegExp * re)640 void genCode(FILE *o, RegExp *re){
641 CharSet cs;
642 unsigned int j;
643 Char rep[nChars];
644 Ins *ins, *eoi;
645 DFA *dfa;
646
647 memset(&cs, 0, sizeof(cs));
648 for(j = 0; j < nChars; ++j){
649 cs.rep[j] = &cs.ptn[0];
650 cs.ptn[j].nxt = &cs.ptn[j+1];
651 }
652 cs.freeHead = &cs.ptn[1];
653 *(cs.freeTail = &cs.ptn[nChars-1].nxt) = NULL;
654 cs.ptn[0].card = nChars;
655 cs.ptn[0].nxt = NULL;
656 RegExp_split(re, &cs);
657 /*
658 for(unsigned int k = 0; k < nChars;){
659 for(j = k; ++k < nChars && cs.rep[k] == cs.rep[j];);
660 printSpan(cerr, j, k);
661 cerr << "\t" << cs.rep[j] - &cs.ptn[0] << endl;
662 }
663 */
664 for(j = 0; j < nChars; ++j){
665 if(!cs.rep[j]->nxt)
666 cs.rep[j]->nxt = &cs.ptn[j];
667 rep[j] = (Char) (cs.rep[j]->nxt - &cs.ptn[0]);
668 }
669
670 RegExp_calcSize(re, rep);
671 ins = malloc(sizeof(Ins)*(re->size+1));
672 memset(ins, 0, (re->size+1)*sizeof(Ins));
673 RegExp_compile(re, rep, ins);
674 eoi = &ins[re->size];
675 eoi->i.tag = GOTO;
676 eoi->i.link = eoi;
677
678 optimize(ins);
679 for(j = 0; j < re->size;){
680 unmark(&ins[j]);
681 if(ins[j].i.tag == CHAR){
682 j = (Ins*) ins[j].i.link - ins;
683 } else {
684 j++;
685 }
686 }
687
688 dfa = DFA_new(ins, re->size, 0, 256, rep);
689 DFA_emit(dfa, o);
690 DFA_delete(dfa);
691 free(ins);
692 }
693