1 /****************************************************************
2 Copyright (C) Lucent Technologies 1997
3 All Rights Reserved
4
5 Permission to use, copy, modify, and distribute this software and
6 its documentation for any purpose and without fee is hereby
7 granted, provided that the above copyright notice appear in all
8 copies and that both that the copyright notice and this
9 permission notice and warranty disclaimer appear in supporting
10 documentation, and that the name Lucent Technologies or any of
11 its entities not be used in advertising or publicity pertaining
12 to distribution of the software without specific, written prior
13 permission.
14
15 LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
16 INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
17 IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
18 SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
20 IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
21 ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
22 THIS SOFTWARE.
23 ****************************************************************/
24
25 #include <stdio.h>
26 #include <stdlib.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include "awk.h"
30 #include "ytab.h"
31
extern YYSTYPE	yylval;	/* semantic value of the token being returned; filled in by yylex() and its helpers */
extern int	infunc;	/* tested by word() and yylex(): nonzero means we are inside a function; set elsewhere */

int	lineno	= 1;	/* current input line; bumped when a newline is consumed */
int	bracecnt = 0;	/* '{' seen minus '}' seen; going negative reports "extra }" */
int	brackcnt  = 0;	/* '[' seen minus ']' seen; going negative reports "extra ]" */
int	parencnt = 0;	/* '(' seen minus ')' seen; going negative reports "extra )" */
39
/* One entry of the reserved-word table searched by word(). */
typedef struct Keyword {
	const char *word;	/* spelling of the keyword; the binary-search key */
	int	sub;		/* value word() stores in yylval.i for this keyword */
	int	type;		/* token code word() returns for this keyword */
} Keyword;
45
/*
 * Reserved words and built-in function names.  word() looks names up
 * here with binsearch(), which compares with strcmp(), so the entries
 * must stay in strcmp() order (upper-case names sort before lower-case).
 * Built-in functions share the token type BLTIN and are told apart by
 * their sub value.
 */
Keyword keywords[] ={	/* keep sorted: binary searched */
	{ "BEGIN",	XBEGIN,		XBEGIN },
	{ "END",	XEND,		XEND },
	{ "NF",		VARNF,		VARNF },
	{ "atan2",	FATAN,		BLTIN },
	{ "break",	BREAK,		BREAK },
	{ "close",	CLOSE,		CLOSE },
	{ "continue",	CONTINUE,	CONTINUE },
	{ "cos",	FCOS,		BLTIN },
	{ "delete",	DELETE,		DELETE },
	{ "do",		DO,		DO },
	{ "else",	ELSE,		ELSE },
	{ "exit",	EXIT,		EXIT },
	{ "exp",	FEXP,		BLTIN },
	{ "fflush",	FFLUSH,		BLTIN },
	{ "for",	FOR,		FOR },
	{ "func",	FUNC,		FUNC },
	{ "function",	FUNC,		FUNC },
	{ "getline",	GETLINE,	GETLINE },
	{ "gsub",	GSUB,		GSUB },
	{ "if",		IF,		IF },
	{ "in",		IN,		IN },
	{ "index",	INDEX,		INDEX },
	{ "int",	FINT,		BLTIN },
	{ "length",	FLENGTH,	BLTIN },
	{ "log",	FLOG,		BLTIN },
	{ "match",	MATCHFCN,	MATCHFCN },
	{ "next",	NEXT,		NEXT },
	{ "nextfile",	NEXTFILE,	NEXTFILE },
	{ "print",	PRINT,		PRINT },
	{ "printf",	PRINTF,		PRINTF },
	{ "rand",	FRAND,		BLTIN },
	{ "return",	RETURN,		RETURN },
	{ "sin",	FSIN,		BLTIN },
	{ "split",	SPLIT,		SPLIT },
	{ "sprintf",	SPRINTF,	SPRINTF },
	{ "sqrt",	FSQRT,		BLTIN },
	{ "srand",	FSRAND,		BLTIN },
	{ "sub",	SUB,		SUB },
	{ "substr",	SUBSTR,		SUBSTR },
	{ "system",	FSYSTEM,	BLTIN },
	{ "tolower",	FTOLOWER,	BLTIN },
	{ "toupper",	FTOUPPER,	BLTIN },
	{ "while",	WHILE,		WHILE },
};
91
/*
 * RET(x): return token x from the enclosing function; when dbg is
 * nonzero, first print the token's name (via tokname) as a trace line.
 * Note that x is evaluated twice and that the expansion is a bare
 * brace block, so every use is written as "RET(x);".
 */
#define	RET(x)	{ if(dbg)printf("lex %s\n", tokname(x)); return(x); }
93
/*
 * peek: look at the next input character without consuming it.
 * The character is read with input() and immediately handed back
 * with unput(), so the following input() returns it again.
 */
int peek(void)
{
	const int next = input();

	unput(next);
	return next;
}
100
/*
 * gettok: read the next raw token from the input.
 *
 * pbuf/psz point at the caller's token buffer and its size; the buffer
 * may be grown with adjbuf(), in which case *pbuf and *psz are updated
 * before returning.  On return the buffer holds the NUL-terminated
 * token text.
 *
 * Returns:
 *	0	at end of input
 *	'a'	the token is a name ([A-Za-z_][A-Za-z0-9_]*)
 *	'0'	the token is a number (the longest prefix strtod accepts)
 *	else	the single character read, which is its own token type
 */
int gettok(char **pbuf, int *psz)	/* get next input token */
{
	int c, retc;
	char *buf = *pbuf;
	int sz = *psz;
	char *bp = buf;

	c = input();
	if (c == 0)
		return 0;
	buf[0] = c;
	buf[1] = 0;
	if (!isalnum(c) && c != '.' && c != '_')
		return c;

	*bp++ = c;
	if (isalpha(c) || c == '_') {	/* it's a varname */
		for ( ; (c = input()) != 0; ) {
			/*
			 * Keep room for this character AND the terminating
			 * NUL; otherwise hitting end of input with a full
			 * buffer makes the final "*bp = 0" write buf[sz].
			 */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for name %.10s...", buf );
			if (isalnum(c) || c == '_')
				*bp++ = c;
			else {
				*bp = 0;
				unput(c);
				break;
			}
		}
		*bp = 0;
		retc = 'a';	/* alphanumeric */
	} else {	/* maybe it's a number, but could be . */
		char *rem;
		/* read input until can't be a number */
		for ( ; (c = input()) != 0; ) {
			/* same reservation as above: character plus NUL */
			if (bp-buf+2 > sz)
				if (!adjbuf(&buf, &sz, bp-buf+2, 100, &bp, "gettok"))
					FATAL( "out of space for number %.10s...", buf );
			if (isdigit(c) || c == 'e' || c == 'E'
			  || c == '.' || c == '+' || c == '-')
				*bp++ = c;
			else {
				unput(c);
				break;
			}
		}
		*bp = 0;
		strtod(buf, &rem);	/* parse the number */
		if (rem == buf) {	/* it wasn't a valid number at all */
			retc = buf[0];	/* character is its own type */
			/*
			 * Push the rest back BEFORE truncating: rem+1 is
			 * buf+1, so truncating first would push back an
			 * empty string and silently drop those characters.
			 */
			unputstr(rem+1);	/* put rest back for later */
			buf[1] = 0;	/* return one character as token */
		} else {	/* some prefix was a number */
			unputstr(rem);	/* put rest back for later */
			rem[0] = 0;	/* truncate buf after number part */
			retc = '0';	/* type is number */
		}
	}
	*pbuf = buf;
	*psz = sz;
	return retc;
}
163
int	word(char *);	/* classify a name: keyword token, ARG, CALL or VAR */
int	string(void);	/* scan the remainder of a "..." literal */
int	regexpr(void);	/* scan the remainder of a /.../ regular expression */
int	sc	= 0;	/* 1 => return a } right now */
int	reg	= 0;	/* 1 => return a REGEXPR now */
169
/*
 * yylex: lexer entry point; returns the next token code, or 0 at end
 * of input, and leaves the token's value in yylval.
 *
 * Two one-shot flags are honoured before any input is read:
 *	sc	set by the '}' case below; makes this call return '}'
 *	reg	set by startreg(); makes this call return regexpr()
 * Otherwise raw tokens are fetched with gettok() until one produces a
 * token: names go to word(), numbers become NUMBER constants entered
 * in symtab, and everything else is handled character by character in
 * the switch.  Cases that "break" (white space, comments, escaped
 * newlines) produce no token and go round the loop again.
 */
int yylex(void)
{
	int c;
	static char *buf = 0;	/* token text buffer, allocated on first call; gettok() may grow it */
	static int bufsize = 5; /* BUG: setting this small causes core dump! */

	if (buf == 0 && (buf = (char *) malloc(bufsize)) == NULL)
		FATAL( "out of space in yylex" );
	if (sc) {	/* previous call returned the ';' for a '}'; now deliver the '}' itself */
		sc = 0;
		RET('}');
	}
	if (reg) {	/* startreg() asked for a regular expression */
		reg = 0;
		return regexpr();
	}
	for (;;) {
		c = gettok(&buf, &bufsize);
		if (c == 0)
			return 0;
		if (isalpha(c) || c == '_')	/* gettok returns 'a' for any name */
			return word(buf);
		if (isdigit(c)) {	/* gettok returns '0' for a number */
			yylval.cp = setsymtab(buf, tostring(buf), atof(buf), CON|NUM, symtab);
			/* should this also have STR set? */
			RET(NUMBER);
		}

		yylval.i = c;	/* default value; most cases below overwrite it */
		switch (c) {
		case '\n':	/* {EOL} */
			lineno++;
			RET(NL);
		case '\r':	/* assume \n is coming */
		case ' ':	/* {WS}+ */
		case '\t':
			break;
		case '#':	/* #.* strip comments */
			while ((c = input()) != '\n' && c != 0)
				;
			unput(c);	/* leave the newline (or end-of-input 0) for the next round */
			break;
		case ';':
			RET(';');
		case '\\':	/* backslash-newline is swallowed; any other backslash is returned as is */
			if (peek() == '\n') {
				input();
				lineno++;
			} else if (peek() == '\r') {
				input(); input();	/* \n */
				lineno++;
			} else {
				RET(c);
			}
			break;
		case '&':	/* && or & */
			if (peek() == '&') {
				input(); RET(AND);
			} else
				RET('&');
		case '|':	/* || or | */
			if (peek() == '|') {
				input(); RET(BOR);
			} else
				RET('|');
		case '!':	/* != or !~ or ! */
			if (peek() == '=') {
				input(); yylval.i = NE; RET(NE);
			} else if (peek() == '~') {
				input(); yylval.i = NOTMATCH; RET(MATCHOP);
			} else
				RET(NOT);
		case '~':
			yylval.i = MATCH;
			RET(MATCHOP);
		case '<':	/* <= or < */
			if (peek() == '=') {
				input(); yylval.i = LE; RET(LE);
			} else {
				yylval.i = LT; RET(LT);
			}
		case '=':	/* == or = */
			if (peek() == '=') {
				input(); yylval.i = EQ; RET(EQ);
			} else {
				yylval.i = ASSIGN; RET(ASGNOP);
			}
		case '>':	/* >= or >> or > */
			if (peek() == '=') {
				input(); yylval.i = GE; RET(GE);
			} else if (peek() == '>') {
				input(); yylval.i = APPEND; RET(APPEND);
			} else {
				yylval.i = GT; RET(GT);
			}
		case '+':	/* ++ or += or + */
			if (peek() == '+') {
				input(); yylval.i = INCR; RET(INCR);
			} else if (peek() == '=') {
				input(); yylval.i = ADDEQ; RET(ASGNOP);
			} else
				RET('+');
		case '-':	/* -- or -= or - */
			if (peek() == '-') {
				input(); yylval.i = DECR; RET(DECR);
			} else if (peek() == '=') {
				input(); yylval.i = SUBEQ; RET(ASGNOP);
			} else
				RET('-');
		case '*':
			if (peek() == '=') {	/* *= */
				input(); yylval.i = MULTEQ; RET(ASGNOP);
			} else if (peek() == '*') {	/* ** or **= */
				input();	/* eat 2nd * */
				if (peek() == '=') {
					input(); yylval.i = POWEQ; RET(ASGNOP);
				} else {
					RET(POWER);
				}
			} else
				RET('*');
		case '/':
			RET('/');
		case '%':	/* %= or % */
			if (peek() == '=') {
				input(); yylval.i = MODEQ; RET(ASGNOP);
			} else
				RET('%');
		case '^':	/* ^= or ^; same tokens as **= and ** */
			if (peek() == '=') {
				input(); yylval.i = POWEQ; RET(ASGNOP);
			} else
				RET(POWER);

		case '$':
			/* BUG: awkward, if not wrong */
			c = gettok(&buf, &bufsize);	/* what follows the $ decides the token */
			if (isalpha(c)) {	/* a name ('a' from gettok) */
				if (strcmp(buf, "NF") == 0) {	/* very special */
					unputstr("(NF)");	/* $NF is rescanned as $(NF) */
					RET(INDIRECT);
				}
				c = peek();
				if (c == '(' || c == '[' || (infunc && isarg(buf) >= 0)) {
					/* call, subscript or function argument: rescan the name after INDIRECT */
					unputstr(buf);
					RET(INDIRECT);
				}
				yylval.cp = setsymtab(buf, "", 0.0, STR|NUM, symtab);
				RET(IVAR);
			} else if (c == 0) {	/* gettok hit end of input */
				SYNTAX( "unexpected end of input after $" );
				RET(';');
			} else {	/* number or punctuation: push it back and rescan after INDIRECT */
				unputstr(buf);
				RET(INDIRECT);
			}

		case '}':
			if (--bracecnt < 0)
				SYNTAX( "extra }" );
			sc = 1;	/* return ';' now; the next call returns the '}' */
			RET(';');
		case ']':
			if (--brackcnt < 0)
				SYNTAX( "extra ]" );
			RET(']');
		case ')':
			if (--parencnt < 0)
				SYNTAX( "extra )" );
			RET(')');
		case '{':
			bracecnt++;
			RET('{');
		case '[':
			brackcnt++;
			RET('[');
		case '(':
			parencnt++;
			RET('(');

		case '"':
			return string();	/* BUG: should be like tran.c ? */

		default:
			RET(c);
		}
	}
}
358
string(void)359 int string(void)
360 {
361 int c, n;
362 char *s, *bp;
363 static char *buf = 0;
364 static int bufsz = 500;
365
366 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
367 FATAL("out of space for strings");
368 for (bp = buf; (c = input()) != '"'; ) {
369 if (!adjbuf(&buf, &bufsz, bp-buf+2, 500, &bp, "string"))
370 FATAL("out of space for string %.10s...", buf);
371 switch (c) {
372 case '\n':
373 case '\r':
374 case 0:
375 *bp = '\0';
376 SYNTAX( "non-terminated string %.10s...", buf );
377 if (c == 0) /* hopeless */
378 FATAL( "giving up" );
379 lineno++;
380 break;
381 case '\\':
382 c = input();
383 switch (c) {
384 case '"': *bp++ = '"'; break;
385 case 'n': *bp++ = '\n'; break;
386 case 't': *bp++ = '\t'; break;
387 case 'f': *bp++ = '\f'; break;
388 case 'r': *bp++ = '\r'; break;
389 case 'b': *bp++ = '\b'; break;
390 case 'v': *bp++ = '\v'; break;
391 case 'a': *bp++ = '\007'; break;
392 case '\\': *bp++ = '\\'; break;
393
394 case '0': case '1': case '2': /* octal: \d \dd \ddd */
395 case '3': case '4': case '5': case '6': case '7':
396 n = c - '0';
397 if ((c = peek()) >= '0' && c < '8') {
398 n = 8 * n + input() - '0';
399 if ((c = peek()) >= '0' && c < '8')
400 n = 8 * n + input() - '0';
401 }
402 *bp++ = n;
403 break;
404
405 case 'x': /* hex \x0-9a-fA-F + */
406 { char xbuf[100], *px;
407 for (px = xbuf; (c = input()) != 0 && px-xbuf < 100-2; ) {
408 if (isdigit(c)
409 || (c >= 'a' && c <= 'f')
410 || (c >= 'A' && c <= 'F'))
411 *px++ = c;
412 else
413 break;
414 }
415 *px = 0;
416 unput(c);
417 sscanf(xbuf, "%x", (unsigned int *) &n);
418 *bp++ = n;
419 break;
420 }
421
422 default:
423 *bp++ = c;
424 break;
425 }
426 break;
427 default:
428 *bp++ = c;
429 break;
430 }
431 }
432 *bp = 0;
433 s = tostring(buf);
434 *bp++ = ' '; *bp++ = 0;
435 yylval.cp = setsymtab(buf, s, 0.0, CON|STR|DONTFREE, symtab);
436 RET(STRING);
437 }
438
439
/*
 * binsearch: binary search for w among the n entries of kp, which
 * must be sorted by word in strcmp() order.
 * Returns the index of the matching entry, or -1 if w is not present.
 */
int binsearch(char *w, Keyword *kp, int n)
{
	int lo = 0;
	int hi = n - 1;

	while (lo <= hi) {
		int probe = (lo + hi) / 2;
		int cmp = strcmp(w, kp[probe].word);

		if (cmp == 0)
			return probe;
		if (cmp < 0)
			hi = probe - 1;	/* w sorts before the probe: search the lower half */
		else
			lo = probe + 1;	/* w sorts after the probe: search the upper half */
	}
	return -1;
}
457
word(char * w)458 int word(char *w)
459 {
460 Keyword *kp;
461 int c, n;
462
463 n = binsearch(w, keywords, sizeof(keywords)/sizeof(keywords[0]));
464 /* BUG: this ought to be inside the if; in theory could fault (daniel barrett) */
465 kp = keywords + n;
466 if (n != -1) { /* found in table */
467 yylval.i = kp->sub;
468 switch (kp->type) { /* special handling */
469 case BLTIN:
470 if (kp->sub == FSYSTEM && safe)
471 SYNTAX( "system is unsafe" );
472 RET(kp->type);
473 case FUNC:
474 if (infunc)
475 SYNTAX( "illegal nested function" );
476 RET(kp->type);
477 case RETURN:
478 if (!infunc)
479 SYNTAX( "return not in function" );
480 RET(kp->type);
481 case VARNF:
482 yylval.cp = setsymtab("NF", "", 0.0, NUM, symtab);
483 RET(VARNF);
484 default:
485 RET(kp->type);
486 }
487 }
488 c = peek(); /* look for '(' */
489 if (c != '(' && infunc && (n=isarg(w)) >= 0) {
490 yylval.i = n;
491 RET(ARG);
492 } else {
493 yylval.cp = setsymtab(w, "", 0.0, STR|NUM|DONTFREE, symtab);
494 if (c == '(') {
495 RET(CALL);
496 } else {
497 RET(VAR);
498 }
499 }
500 }
501
startreg(void)502 void startreg(void) /* next call to yylex will return a regular expression */
503 {
504 reg = 1;
505 }
506
regexpr(void)507 int regexpr(void)
508 {
509 int c;
510 static char *buf = 0;
511 static int bufsz = 500;
512 char *bp;
513
514 if (buf == 0 && (buf = (char *) malloc(bufsz)) == NULL)
515 FATAL("out of space for rex expr");
516 bp = buf;
517 for ( ; (c = input()) != '/' && c != 0; ) {
518 if (!adjbuf(&buf, &bufsz, bp-buf+3, 500, &bp, "regexpr"))
519 FATAL("out of space for reg expr %.10s...", buf);
520 if (c == '\n') {
521 *bp = '\0';
522 SYNTAX( "newline in regular expression %.10s...", buf );
523 unput('\n');
524 break;
525 } else if (c == '\\') {
526 *bp++ = '\\';
527 *bp++ = input();
528 } else {
529 *bp++ = c;
530 }
531 }
532 *bp = 0;
533 if (c == 0)
534 SYNTAX("non-terminated regular expression %.10s...", buf);
535 yylval.s = tostring(buf);
536 unput('/');
537 RET(REGEXPR);
538 }
539
540 /* low-level lexical stuff, sort of inherited from lex */
541
char	ebuf[300];	/* circular record of characters read: input() stores here, unput() steps back */
char	*ep = ebuf;	/* where input() will store the next character in ebuf */
char	yysbuf[100];	/* pushback buffer */
char	*yysptr = yysbuf;	/* one past the most recently pushed-back character; == yysbuf when empty */
FILE	*yyin = 0;	/* not referenced by the functions in this part of the file */
547
input(void)548 int input(void) /* get next lexical input character */
549 {
550 int c;
551 extern char *lexprog;
552
553 if (yysptr > yysbuf)
554 c = (uschar)*--yysptr;
555 else if (lexprog != NULL) { /* awk '...' */
556 if ((c = (uschar)*lexprog) != 0)
557 lexprog++;
558 } else /* awk -f ... */
559 c = pgetc();
560 if (c == EOF)
561 c = 0;
562 if (ep >= ebuf + sizeof ebuf)
563 ep = ebuf;
564 *ep = c;
565 if (c != 0) {
566 ep++;
567 }
568 return (c);
569 }
570
unput(int c)571 void unput(int c) /* put lexical character back on input */
572 {
573 if (yysptr >= yysbuf + sizeof(yysbuf))
574 FATAL("pushed back too much: %.20s...", yysbuf);
575 *yysptr++ = c;
576 if (--ep < ebuf)
577 ep = ebuf + sizeof(ebuf) - 1;
578 }
579
/*
 * unputstr: push the string s back onto the input so that subsequent
 * input() calls return its characters in their original order.
 * The characters are pushed last-to-first because the pushback buffer
 * is a stack.  An empty string pushes nothing.
 */
void unputstr(const char *s)	/* put a string back on input */
{
	const char *p;

	/*
	 * Walk backwards with a pointer rather than an int index: storing
	 * strlen(s)-1 in an int relies on an implementation-defined
	 * conversion (it wraps through SIZE_MAX for an empty string).
	 */
	for (p = s + strlen(s); p > s; )
		unput(*--p);
}
587