• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * This file contains code for
3  *
4  *      int rexpr(char *expr, char *s);
5  *
6  * which answers
7  *
8  *      1 if 's' is in the language described by the regular expression 'expr'
9  *      0 if it is not
10  *     -1 if the regular expression is invalid
11  *
12  * Language membership is determined by constructing a non-deterministic
13  * finite automata (NFA) from the regular expression.  A depth-
14  * first-search is performed on the NFA (graph) to check for a match of 's'.
15  * Each non-epsilon arc consumes one character from 's'.  Backtracking is
16  * performed to check all possible paths through the NFA.
17  *
18  * Regular expressions follow the meta-language:
19  *
20  * <regExpr>        ::= <andExpr> ( '|' <andExpr> )*
21  *
22  * <andExpr>        ::= <expr> ( <expr> )*
23  *
24  * <expr>           ::= {'~'} '[' <atomList> ']' <repeatSymbol>
25  *                      | '(' <regExpr> ')' <repeatSymbol>
26  *                      | '{' <regExpr> '}' <repeatSymbol>
27  *                      | <atom> <repeatSymbol>
28  *
29  * <repeatSymbol>   ::= { '*' | '+' }
30  *
31  * <atomList>       ::= <atom> ( <atom> )*
32  *                      | { <atomList> } <atom> '-' <atom> { <atomList> }
33  *
34  * <atom>           ::= Token[Atom]
35  *
36  * Notes:
37  *		~	means complement the set in [..]. i.e. all characters not listed
38  *		*	means match 0 or more times (can be on expression or atom)
39  *		+	means match 1 or more times (can be on expression or atom)
40  *		{}	optional
41  *		()	grouping
42  *		[]	set of atoms
43  *		x-y	all characters from x to y (found only in [..])
44  *		\xx the character with value xx
45  *
46  * Examples:
47  *		[a-z]+
48  *			match 1 or more lower-case letters (e.g. variable)
49  *
50  *		0x[0-9A-Fa-f]+
51  *			match a hex number with 0x on front (e.g. 0xA1FF)
52  *
53  *		[0-9]+.[0-9]+{e[0-9]+}
54  *			match a floating point number (e.g. 3.14e21)
55  *
56  * Code example:
57  *		if ( rexpr("[a-zA-Z][a-zA-Z0-9]+", str) ) then str is keyword
58  *
59  * Terence Parr
60  * Purdue University
61  * April 1991
62  */
63 
64 #include <stdio.h>
65 #include <ctype.h>
66 #ifdef __STDC__
67 #include <stdlib.h>
68 #else
69 #include <malloc.h>
70 #endif
71 #include "rexpr.h"
72 
73 #ifdef __USE_PROTOS
74 static int regExpr( GraphPtr g );
75 static int andExpr( GraphPtr g );
76 static int expr( GraphPtr g );
77 static int repeatSymbol( GraphPtr g );
78 static int atomList( char *p, int complement );
79 static void next( void );
80 static ArcPtr newGraphArc( void );
81 static NodePtr newNode( void );
82 static int ArcBetweenGraphNode( NodePtr i, NodePtr j, int label );
83 static Graph BuildNFA_atom( int label );
84 static Graph BuildNFA_AB( Graph A, Graph B );
85 static Graph BuildNFA_AorB( Graph A, Graph B );
86 static Graph BuildNFA_set( char *s );
87 static Graph BuildNFA_Astar( Graph A );
88 static Graph BuildNFA_Aplus( Graph A );
89 static Graph BuildNFA_Aoptional( Graph A );
90 #else
91 static int regExpr();
92 static int andExpr();
93 static int expr();
94 static int repeatSymbol();
95 static int atomList();
96 static void next();
97 static ArcPtr newGraphArc();
98 static NodePtr newNode();
99 static int ArcBetweenGraphNode();
100 static Graph BuildNFA_atom();
101 static Graph BuildNFA_AB();
102 static Graph BuildNFA_AorB();
103 static Graph BuildNFA_set();
104 static Graph BuildNFA_Astar();
105 static Graph BuildNFA_Aplus();
106 static Graph BuildNFA_Aoptional();
107 #endif
108 
109 static char *_c;
110 static int token, tokchar;
111 static NodePtr accept;
112 static NodePtr freelist = NULL;
113 
114 /*
115  * return 1 if s in language described by expr
116  *        0 if s is not
117  *       -1 if expr is an invalid regular expression
118  */
119 #ifdef __USE_PROTOS
rexpr(char * expr,char * s)120 static int rexpr(char *expr,char *s)
121 #else
122 static int rexpr(expr, s)
123 char *expr, *s;
124 #endif
125 {
126 	NodePtr p,q;
127 	Graph nfa;
128 	int result;
129 
130 	fprintf(stderr, "rexpr(%s,%s);\n", expr,s);
131 	freelist = NULL;
132 	_c = expr;
133 	next();
134 	if ( regExpr(&nfa) == -1 ) return -1;
135 	accept = nfa.right;
136 	result = match(nfa.left, s);
137 	/* free all your memory */
138 	p = q = freelist;
139 	while ( p!=NULL ) { q = p->track; free(p); p = q; }
140 	return result;
141 }
142 
143 /*
144  * do a depth-first-search on the NFA looking for a path from start to
145  * accept state labelled with the characters of 's'.
146  */
147 
148 #ifdef __USE_PROTOS
match(NodePtr automaton,char * s)149 static int match(NodePtr automaton,char *s)
150 #else
151 static int match(automaton, s)
152 NodePtr automaton;
153 char *s;
154 #endif
155 {
156 	ArcPtr p;
157 
158 	if ( automaton == accept && *s == '\0' ) return 1;	/* match */
159 
160 	for (p=automaton->arcs; p!=NULL; p=p->next)			/* try all arcs */
161 	{
162 		if ( p->label == Epsilon )
163 		{
164 			if ( match(p->target, s) ) return 1;
165 		}
166 		else if ( p->label == *s )
167 				if ( match(p->target, s+1) ) return 1;
168 	}
169 	return 0;
170 }
171 
172 /*
173  * <regExpr>        ::= <andExpr> ( '|' {<andExpr>} )*
174  *
175  * Return -1 if syntax error
176  * Return  0 if none found
177  * Return  1 if a regExrp was found
178  */
179 
180 #ifdef __USE_PROTOS
regExpr(GraphPtr g)181 static int regExpr(GraphPtr g)
182 #else
183 static int regExpr(g)
184 GraphPtr g;
185 #endif
186 {
187 	Graph g1, g2;
188 
189 	if ( andExpr(&g1) == -1 )
190 	{
191 		return -1;
192 	}
193 
194 	while ( token == '|' )
195 	{
196 		int a;
197 		next();
198 		a = andExpr(&g2);
199 		if ( a == -1 ) return -1;	/* syntax error below */
200 		else if ( !a ) return 1;	/* empty alternative */
201 		g1 = BuildNFA_AorB(g1, g2);
202 	}
203 
204 	if ( token!='\0' ) return -1;
205 
206 	*g = g1;
207 	return 1;
208 }
209 
210 /*
211  * <andExpr>        ::= <expr> ( <expr> )*
212  */
213 
214 #ifdef __USE_PROTOS
andExpr(GraphPtr g)215 static int andExpr(GraphPtr g)
216 #else
217 static int andExpr(g)
218 GraphPtr g;
219 #endif
220 {
221 	Graph g1, g2;
222 
223 	if ( expr(&g1) == -1 )
224 	{
225 		return -1;
226 	}
227 
228 	while ( token==Atom || token=='{' || token=='(' || token=='~' || token=='[' )
229 	{
230 		if (expr(&g2) == -1) return -1;
231 		g1 = BuildNFA_AB(g1, g2);
232 	}
233 
234 	*g = g1;
235 	return 1;
236 }
237 
238 /*
239  * <expr>           ::=    {'~'} '[' <atomList> ']' <repeatSymbol>
240  *                      | '(' <regExpr> ')' <repeatSymbol>
241  *                      | '{' <regExpr> '}' <repeatSymbol>
242  *                      | <atom> <repeatSymbol>
243  */
244 
245 #ifdef __USE_PROTOS
expr(GraphPtr g)246 static int expr(GraphPtr g)
247 #else
248 static int expr(g)
249 GraphPtr g;
250 #endif
251 {
252 	int complement = 0;
253 	char s[257];    /* alloc space for string of char in [] */
254 
255 	if ( token == '~' || token == '[' )
256 	{
257 		if ( token == '~' ) {complement = 1; next();}
258 		if ( token != '[' ) return -1;
259 		next();
260 		if ( atomList( s, complement ) == -1 ) return -1;
261 		*g = BuildNFA_set( s );
262 		if ( token != ']' ) return -1;
263 		next();
264 		repeatSymbol( g );
265 		return 1;
266 	}
267 	if ( token == '(' )
268 	{
269 		next();
270 		if ( regExpr( g ) == -1 ) return -1;
271 		if ( token != ')' ) return -1;
272 		next();
273 		repeatSymbol( g );
274 		return 1;
275 	}
276 	if ( token == '{' )
277 	{
278 		next();
279 		if ( regExpr( g ) == -1 ) return -1;
280 		if ( token != '}' ) return -1;
281 		next();
282 		/* S p e c i a l  C a s e   O p t i o n a l  {  } */
283 		if ( token != '*' && token != '+' )
284 		{
285 			*g = BuildNFA_Aoptional( *g );
286 		}
287 		repeatSymbol( g );
288 		return 1;
289 	}
290 	if ( token == Atom )
291 	{
292 		*g = BuildNFA_atom( tokchar );
293 		next();
294 		repeatSymbol( g );
295 		return 1;
296 	}
297 
298 	return -1;
299 }
300 
301 /*
302  * <repeatSymbol>   ::= { '*' | '+' }
303  */
304 #ifdef __USE_PROTOS
repeatSymbol(GraphPtr g)305 static int repeatSymbol(GraphPtr g)
306 #else
307 static int repeatSymbol(g)
308 GraphPtr g;
309 #endif
310 {
311 	switch ( token )
312 	{
313 		case '*' : *g = BuildNFA_Astar( *g ); next(); break;
314 		case '+' : *g = BuildNFA_Aplus( *g ); next(); break;
315 	}
316 	return 1;
317 }
318 
319 /*
320  * <atomList>       ::= <atom> { <atom> }*
321  *                      { <atomList> } <atom> '-' <atom> { <atomList> }
322  *
323  * a-b is same as ab
324  * q-a is same as q
325  */
326 
327 #ifdef __USE_PROTOS
atomList(char * p,int complement)328 static int atomList(char *p, int complement)
329 #else
330 static int atomList(p, complement)
331 char *p;
332 int complement;
333 #endif
334 {
335 	static unsigned char set[256];		/* no duplicates */
336 	int first, last, i;
337 	char *s = p;
338 
339 	if ( token != Atom ) return -1;
340 
341 	for (i=0; i<256; i++) set[i] = 0;
342 	while ( token == Atom )
343 	{
344 		if ( !set[tokchar] ) *s++ = tokchar;
345 		set[tokchar] = 1;    			/* Add atom to set */
346 		next();
347 		if ( token == '-' )         	/* have we found '-' */
348 		{
349 			first = *(s-1);             /* Get last char */
350 			next();
351 			if ( token != Atom ) return -1;
352 			else
353 			{
354 				last = tokchar;
355 			}
356 			for (i = first+1; i <= last; i++)
357 			{
358 				if ( !set[tokchar] ) *s++ = i;
359 				set[i] = 1;    			/* Add atom to set */
360 			}
361 			next();
362 		}
363 	}
364 	*s = '\0';
365 	if ( complement )
366 	{
367 		for (i=0; i<256; i++) set[i] = !set[i];
368 		for (i=1,s=p; i<256; i++) if ( set[i] ) *s++ = i;
369 		*s = '\0';
370 	}
371 	return 1;
372 }
373 
374 /* a somewhat stupid lexical analyzer */
375 
376 #ifdef __USE_PROTOS
next(void)377 static void next(void)
378 #else
379 static void next()
380 #endif
381 {
382 	while ( *_c==' ' || *_c=='\t' || *_c=='\n' ) _c++;
383 	if ( *_c=='\\' )
384 	{
385 		_c++;
386 		if ( isdigit(*_c) )
387 		{
388 			int n=0;
389 			while ( isdigit(*_c) )
390 			{
391 				n = n*10 + (*_c++ - '0');
392 			}
393 			if ( n>255 ) n=255;
394 			tokchar = n;
395 		}
396 		else
397 		{
398 			switch (*_c)
399 			{
400 				case 'n' : tokchar = '\n'; break;
401 				case 't' : tokchar = '\t'; break;
402 				case 'r' : tokchar = '\r'; break;
403 				default  : tokchar = *_c;
404 			}
405 			_c++;
406 		}
407 		token = Atom;
408 	}
409 	else if ( isgraph(*_c) && *_c!='[' && *_c!='(' && *_c!='{' &&
410 			  *_c!='-' && *_c!='}' && *_c!=')' && *_c!=']' &&
411 			  *_c!='+' && *_c!='*' && *_c!='~' && *_c!='|' )
412 	{
413 		token = Atom;
414 		tokchar = *_c++;
415 	}
416 	else
417 	{
418 		token = tokchar = *_c++;
419 	}
420 }
421 
422 /* N F A  B u i l d i n g  R o u t i n e s */
423 
424 #ifdef __USE_PROTOS
newGraphArc(void)425 static ArcPtr newGraphArc(void)
426 #else
427 static ArcPtr newGraphArc()
428 #endif
429 {
430 	ArcPtr p;
431 	p = (ArcPtr) calloc(1, sizeof(Arc));
432 	if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
433 	if ( freelist != NULL ) p->track = (ArcPtr) freelist;
434 	freelist = (NodePtr) p;
435 	return p;
436 }
437 
438 #ifdef __USE_PROTOS
newNode(void)439 static NodePtr newNode(void)
440 #else
441 static NodePtr newNode()
442 #endif
443 {
444 	NodePtr p;
445 	p = (NodePtr) calloc(1, sizeof(Node));
446 	if ( p==NULL ) {fprintf(stderr,"rexpr: out of memory\n"); exit(-1);}
447 	if ( freelist != NULL ) p->track = freelist;
448 	freelist = p;
449 	return p;
450 }
451 
452 #ifdef __USE_PROTOS
ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)453 static void ArcBetweenGraphNodes(NodePtr i,NodePtr j,int label)
454 #else
455 static void ArcBetweenGraphNodes(i, j, label)
456 NodePtr i, j;
457 int label;
458 #endif
459 {
460 	ArcPtr a;
461 
462 	a = newGraphArc();
463 	if ( i->arcs == NULL ) i->arctail = i->arcs = a;
464 	else {(i->arctail)->next = a; i->arctail = a;}
465 	a->label = label;
466 	a->target = j;
467 }
468 
469 #ifdef __USE_PROTOS
BuildNFA_atom(int label)470 static Graph BuildNFA_atom(int label)
471 #else
472 static Graph BuildNFA_atom(label)
473 int label;
474 #endif
475 {
476 	Graph g;
477 
478 	g.left = newNode();
479 	g.right = newNode();
480 	ArcBetweenGraphNodes(g.left, g.right, label);
481 	return( g );
482 }
483 
484 #ifdef __USE_PROTOS
BuildNFA_AB(Graph A,Graph B)485 static Graph BuildNFA_AB(Graph A,Graph B)
486 #else
487 static Graph BuildNFA_AB(A, B)
488 Graph A, B;
489 #endif
490 {
491 	Graph g;
492 
493 	ArcBetweenGraphNodes(A.right, B.left, Epsilon);
494 	g.left = A.left;
495 	g.right = B.right;
496 	return( g );
497 }
498 
499 #ifdef __USE_PROTOS
BuildNFA_AorB(Graph A,Graph B)500 static Graph BuildNFA_AorB(Graph A,Graph B)
501 #else
502 static Graph BuildNFA_AorB(A, B)
503 Graph A, B;
504 #endif
505 {
506 	Graph g;
507 
508 	g.left = newNode();
509 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
510 	ArcBetweenGraphNodes(g.left, B.left, Epsilon);
511 	g.right = newNode();
512 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
513 	ArcBetweenGraphNodes(B.right, g.right, Epsilon);
514 	return( g );
515 }
516 
517 #ifdef __USE_PROTOS
BuildNFA_set(char * s)518 static Graph BuildNFA_set(char *s)
519 #else
520 static Graph BuildNFA_set( s )
521 char *s;
522 #endif
523 {
524 	Graph g;
525 
526 	if ( s == NULL ) return g;
527 
528 	g.left = newNode();
529 	g.right = newNode();
530 	while ( *s != '\0' )
531 	{
532 		ArcBetweenGraphNodes(g.left, g.right, *s++);
533 	}
534 	return g;
535 }
536 
537 #ifdef __USE_PROTOS
BuildNFA_Astar(Graph A)538 static Graph BuildNFA_Astar(Graph A)
539 #else
540 static Graph BuildNFA_Astar( A )
541 Graph A;
542 #endif
543 {
544 	Graph g;
545 
546 	g.left = newNode();
547 	g.right = newNode();
548 
549 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
550 	ArcBetweenGraphNodes(g.left, g.right, Epsilon);
551 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
552 	ArcBetweenGraphNodes(A.right, A.left, Epsilon);
553 
554 	return( g );
555 }
556 
557 #ifdef __USE_PROTOS
BuildNFA_Aplus(Graph A)558 static Graph BuildNFA_Aplus(Graph A)
559 #else
560 static Graph BuildNFA_Aplus( A )
561 Graph A;
562 #endif
563 {
564 	ArcBetweenGraphNodes(A.right, A.left, Epsilon);
565 
566 	return( A );
567 }
568 
569 #ifdef __USE_PROTOS
BuildNFA_Aoptional(Graph A)570 static Graph BuildNFA_Aoptional(Graph A)
571 #else
572 static Graph BuildNFA_Aoptional( A )
573 Graph A;
574 #endif
575 {
576 	Graph g;
577 
578 	g.left = newNode();
579 	g.right = newNode();
580 
581 	ArcBetweenGraphNodes(g.left, A.left, Epsilon);
582 	ArcBetweenGraphNodes(g.left, g.right, Epsilon);
583 	ArcBetweenGraphNodes(A.right, g.right, Epsilon);
584 
585 	return( g );
586 }
587