/*
 * Copyright 1993-2002 Christopher Seiwald and Perforce Software, Inc.
 *
 * This file is part of Jam - see jam.c for Copyright information.
 */

/*
 * scan.c - the jam yacc scanner
 *
 */

#include "jam.h"
#include "scan.h"
#include "output.h"
#include "constants.h"
#include "jamgram.hpp"


/*
 * Keyword table consulted by yylex(). The entries are pulled in from
 * jamgramtab.h; the trailing { 0, 0 } entry terminates the table.
 */
struct keyword
{
    const char * word;  /* keyword spelling */
    int type;           /* token type returned for that spelling */
} keywords[] =
{
#include "jamgramtab.h"
    { 0, 0 }
};

typedef struct include include;

/*
 * One input source on the scanner's stack. An entry reads either from a file
 * (opened on first use by yyline()) or from a null-terminated array of strings
 * installed by yysparse().
 */
struct include
{
    include   * next;        /* next serial include file */
    char      * string;      /* pointer into current line */
    char    * * strings;     /* for yysparse() -- text to parse */
    LISTITER    pos;         /* not referenced by the functions below */
    LIST      * list;        /* not referenced by the functions below */
    FILE      * file;        /* for yyfparse() -- file being read */
    OBJECT    * fname;       /* for yyfparse() -- file name */
    int         line;        /* line counter for error messages */
    char        buf[ 512 ];  /* for yyfparse() -- line buffer */
};

static include * incp = 0;  /* current file; head of chain */

static int scanmode = SCAN_NORMAL;
static int anyerrors = 0;


static char * symdump( YYSTYPE * );

#define BIGGEST_TOKEN 10240  /* no single token can be larger */


/*
 * yymode() - set the scanner mode.
 *
 * Installs n (one of the SCAN_* values) as the current mode and returns the
 * mode that was in effect before the call, so the caller can restore it.
 */

int yymode( int n )
{
    int result = scanmode;
    scanmode = n;
    return result;
}


/*
 * yyerror() - report a parse error.
 *
 * Prints "<file>:<line>: <s> at <token description>" through out_printf() and
 * bumps the error counter reported by yyanyerrors().
 */

void yyerror( char const * s )
{
    /* We use yylval instead of incp to access the error location information as
     * the incp pointer will already be reset to 0 in case the error occurred at
     * EOF.
     *
     * The two may differ only if ran into an unexpected EOF or we get an error
     * while reading a lexical token spanning multiple lines, e.g. a multi-line
     * string literal or action body, in which case yylval location information
     * will hold the information about where the token started while incp will
     * hold the information about where reading it broke.
     */
    out_printf( "%s:%d: %s at %s\n", object_str( yylval.file ), yylval.line, s,
            symdump( &yylval ) );
    ++anyerrors;
}


/*
 * yyanyerrors() - return non-zero if yyerror() has been called at least once.
 */

int yyanyerrors()
{
    return anyerrors != 0;
}


/*
 * yyfparse() - push a file onto the input stack.
 *
 * The file itself is not opened here; yyline() opens it on the first read.
 * The name is retained through object_copy() and released by yyfdone().
 */

void yyfparse( OBJECT * s )
{
    include * i = (include *)BJAM_MALLOC( sizeof( *i ) );

    /* Push this onto the incp chain. */
    i->string = (char*)"";
    i->strings = 0;
    i->file = 0;
    i->fname = object_copy( s );
    i->line = 0;
    /* Empty buffer == "no chunk read yet"; yyline() inspects the previous
     * chunk to decide whether the next one starts a new line.
     */
    i->buf[ 0 ] = '\0';
    i->next = incp;
    incp = i;
}


/*
 * yysparse() - push an in-memory, null-terminated array of lines onto the
 * input stack. name is only used for error messages.
 */

void yysparse( OBJECT * name, const char * * lines )
{
    yyfparse( name );
    incp->strings = (char * *)lines;
}


/*
 * yyfdone() - cleanup after we're done parsing a file.
 *
 * Pops the head of the input stack. The caller must ensure the stack is not
 * empty: incp is dereferenced unconditionally.
 */

void yyfdone( void )
{
    include * const i = incp;
    incp = i->next;

    /* Close file, free name. */
    if(i->file && (i->file != stdin))
        fclose(i->file);
    object_free(i->fname);
    BJAM_FREE((char *)i);
}


/*
 * yyline() - read new line and return first character.
 *
 * Fabricates a continuous stream of characters across include files, returning
 * EOF at the bitter end.
 *
 * Characters are returned as unsigned char values converted to int (the same
 * convention as fgetc()), so that a byte >= 0x80 can neither compare equal to
 * EOF nor reach the <ctype.h> classifiers as a negative value.
 */

int yyline()
{
    include * const i = incp;

    if ( !incp )
        return EOF;

    /* Once we start reading from the input stream, we reset the include
     * insertion point so that the next include file becomes the head of the
     * list.
     *
     * NOTE(review): no statement in this function implements the above; the
     * remark appears to be left over from an earlier revision.
     */

    /* If there is more data in this line, return it. */
    if ( *i->string )
        return (unsigned char)*i->string++;

    /* If we are reading from an internal string list, go to the next string. */
    if ( i->strings )
    {
        /* Skip empty strings: returning their terminator would hand a NUL to
         * the scanner and leave i->string pointing one past the end of the
         * string.
         */
        while ( *i->strings )
        {
            ++i->line;
            i->string = *(i->strings++);
            if ( *i->string )
                return (unsigned char)*i->string++;
        }
    }
    else
    {
        /* If necessary, open the file. */
        if ( !i->file )
        {
            FILE * f = stdin;
            if ( strcmp( object_str( i->fname ), "-" ) && !( f = fopen( object_str( i->fname ), "r" ) ) )
                perror( object_str( i->fname ) );
            i->file = f;
        }

        /* If there is another line in this file, start it. */
        if ( i->file )
        {
            /* A physical line longer than the buffer arrives in several
             * fgets() chunks. Only a chunk that follows a newline (or the very
             * first chunk) starts a new line, so only then bump the counter.
             */
            size_t const prev_len = strlen( i->buf );
            int const continues_line = prev_len && ( i->buf[ prev_len - 1 ] != '\n' );

            if ( fgets( i->buf, sizeof( i->buf ), i->file ) )
            {
                if ( !continues_line )
                    ++i->line;
                i->string = i->buf;
                return (unsigned char)*i->string++;
            }
        }
    }

    /* This include is done. Return EOF so yyparse() returns to
     * parse_file().
     */

    return EOF;
}

/* This allows us to get an extra character of lookahead.
* There are a few places where we need to look ahead two * characters and yyprev only guarantees a single character * of putback. */ int yypeek() { if ( *incp->string ) { return *incp->string; } else if ( incp->strings ) { if ( *incp->strings ) return **incp->strings; } else if ( incp->file ) { /* Don't bother opening the file. yypeek is * only used in special cases and never at the * beginning of a file. */ int ch = fgetc( incp->file ); if ( ch != EOF ) ungetc( ch, incp->file ); return ch; } return EOF; } /* * yylex() - set yylval to current token; return its type. * * Macros to move things along: * * yychar() - return and advance character; invalid after EOF. * yyprev() - back up one character; invalid before yychar(). * * yychar() returns a continuous stream of characters, until it hits the EOF of * the current include file. */ #define yychar() ( *incp->string ? *incp->string++ : yyline() ) #define yyprev() ( incp->string-- ) static int use_new_scanner = 0; #define yystartkeyword() if(use_new_scanner) break; else token_warning() #define yyendkeyword() if(use_new_scanner) break; else if ( 1 ) { expect_whitespace = 1; continue; } else (void)0 void do_token_warning() { out_printf( "%s:%d: %s %s\n", object_str( yylval.file ), yylval.line, "Unescaped special character in", symdump( &yylval ) ); } #define token_warning() has_token_warning = 1 int yylex() { int c; char buf[ BIGGEST_TOKEN ]; char * b = buf; if ( !incp ) goto eof; /* Get first character (whitespace or of token). */ c = yychar(); if ( scanmode == SCAN_STRING ) { /* If scanning for a string (action's {}'s), look for the closing brace. * We handle matching braces, if they match. */ int nest = 1; while ( ( c != EOF ) && ( b < buf + sizeof( buf ) ) ) { if ( c == '{' ) ++nest; if ( ( c == '}' ) && !--nest ) break; *b++ = c; c = yychar(); /* Turn trailing "\r\n" sequences into plain "\n" for Cygwin. */ if ( ( c == '\n' ) && ( b[ -1 ] == '\r' ) ) --b; } /* We ate the ending brace -- regurgitate it. 
*/ if ( c != EOF ) yyprev(); /* Check for obvious errors. */ if ( b == buf + sizeof( buf ) ) { yyerror( "action block too big" ); goto eof; } if ( nest ) { yyerror( "unmatched {} in action block" ); goto eof; } *b = 0; yylval.type = STRING; yylval.string = object_new( buf ); yylval.file = incp->fname; yylval.line = incp->line; } else { char * b = buf; struct keyword * k; int inquote = 0; int notkeyword; int hastoken = 0; int hasquote = 0; int ingrist = 0; int invarexpand = 0; int expect_whitespace = 0; int has_token_warning = 0; /* Eat white space. */ for ( ; ; ) { /* Skip past white space. */ while ( ( c != EOF ) && isspace( c ) ) c = yychar(); /* Not a comment? */ if ( c != '#' ) break; c = yychar(); if ( ( c != EOF ) && c == '|' ) { /* Swallow up block comment. */ int c0 = yychar(); int c1 = yychar(); while ( ! ( c0 == '|' && c1 == '#' ) && ( c0 != EOF && c1 != EOF ) ) { c0 = c1; c1 = yychar(); } c = yychar(); } else { /* Swallow up comment line. */ while ( ( c != EOF ) && ( c != '\n' ) ) c = yychar(); } } /* c now points to the first character of a token. */ if ( c == EOF ) goto eof; yylval.file = incp->fname; yylval.line = incp->line; /* While scanning the word, disqualify it for (expensive) keyword lookup * when we can: $anything, "anything", \anything */ notkeyword = c == '$'; /* Look for white space to delimit word. "'s get stripped but preserve * white space. \ protects next character. */ while ( ( c != EOF ) && ( b < buf + sizeof( buf ) ) && ( inquote || invarexpand || !isspace( c ) ) ) { if ( expect_whitespace || ( isspace( c ) && ! inquote ) ) { token_warning(); expect_whitespace = 0; } if ( !inquote && !invarexpand ) { if ( scanmode == SCAN_COND || scanmode == SCAN_CONDB ) { if ( hastoken && ( c == '=' || c == '<' || c == '>' || c == '!' || c == '(' || c == ')' || c == '&' || c == '|' ) ) { /* Don't treat > as special if we started with a grist. */ if ( ! 
( scanmode == SCAN_CONDB && ingrist == 1 && c == '>' ) ) { yystartkeyword(); } } else if ( c == '=' || c == '(' || c == ')' ) { *b++ = c; c = yychar(); yyendkeyword(); } else if ( c == '!' || ( scanmode == SCAN_COND && ( c == '<' || c == '>' ) ) ) { *b++ = c; if ( ( c = yychar() ) == '=' ) { *b++ = c; c = yychar(); } yyendkeyword(); } else if ( c == '&' || c == '|' ) { *b++ = c; if ( yychar() == c ) { *b++ = c; c = yychar(); } yyendkeyword(); } } else if ( scanmode == SCAN_PARAMS ) { if ( c == '*' || c == '+' || c == '?' || c == '(' || c == ')' ) { if ( !hastoken ) { *b++ = c; c = yychar(); yyendkeyword(); } else { yystartkeyword(); } } } else if ( scanmode == SCAN_XASSIGN && ! hastoken ) { if ( c == '=' ) { *b++ = c; c = yychar(); yyendkeyword(); } else if ( c == '+' || c == '?' ) { if ( yypeek() == '=' ) { *b++ = c; *b++ = yychar(); c = yychar(); yyendkeyword(); } } } else if ( scanmode == SCAN_NORMAL || scanmode == SCAN_ASSIGN ) { if ( c == '=' ) { if ( !hastoken ) { *b++ = c; c = yychar(); yyendkeyword(); } else { yystartkeyword(); } } else if ( c == '+' || c == '?' ) { if ( yypeek() == '=' ) { if ( hastoken ) { yystartkeyword(); } else { *b++ = c; *b++ = yychar(); c = yychar(); yyendkeyword(); } } } } if ( scanmode != SCAN_CASE && ( c == ';' || c == '{' || c == '}' || ( scanmode != SCAN_PARAMS && ( c == '[' || c == ']' ) ) ) ) { if ( ! hastoken ) { *b++ = c; c = yychar(); yyendkeyword(); } else { yystartkeyword(); } } else if ( c == ':' ) { if ( ! hastoken ) { *b++ = c; c = yychar(); yyendkeyword(); break; } else if ( hasquote ) { /* Special rules for ':' do not apply after we quote anything. */ yystartkeyword(); } else if ( ingrist == 0 ) { int next = yychar(); int is_win_path = 0; int is_conditional = 0; if ( next == '\\' ) { if( yypeek() == '\\' ) { is_win_path = 1; } } else if ( next == '/' ) { is_win_path = 1; } yyprev(); if ( is_win_path ) { /* Accept windows paths iff they are at the start or immediately follow a grist. 
*/ if ( b > buf && isalpha( b[ -1 ] ) && ( b == buf + 1 || b[ -2 ] == '>' ) ) { is_win_path = 1; } else { is_win_path = 0; } } if ( next == '<' ) { /* Accept conditionals only for tokens that start with "<" or "!<" */ if ( ( (b > buf) && (buf[ 0 ] == '<') ) || ( (b > (buf + 1)) && (buf[ 0 ] == '!') && (buf[ 1 ] == '<') )) { is_conditional = 1; } } if ( !is_conditional && !is_win_path ) { yystartkeyword(); } } } } hastoken = 1; if ( c == '"' ) { /* begin or end " */ inquote = !inquote; hasquote = 1; notkeyword = 1; } else if ( c != '\\' ) { if ( !invarexpand && c == '<' ) { if ( ingrist == 0 ) ingrist = 1; else ingrist = -1; } else if ( !invarexpand && c == '>' ) { if ( ingrist == 1 ) ingrist = 0; else ingrist = -1; } else if ( c == '$' ) { if ( ( c = yychar() ) == EOF ) { *b++ = '$'; break; } else if ( c == '(' ) { /* inside $(), we only care about quotes */ *b++ = '$'; c = '('; ++invarexpand; } else { c = '$'; yyprev(); } } else if ( c == '@' ) { if ( ( c = yychar() ) == EOF ) { *b++ = '@'; break; } else if ( c == '(' ) { /* inside @(), we only care about quotes */ *b++ = '@'; c = '('; ++invarexpand; } else { c = '@'; yyprev(); } } else if ( invarexpand && c == '(' ) { ++invarexpand; } else if ( invarexpand && c == ')' ) { --invarexpand; } /* normal char */ *b++ = c; } else if ( ( c = yychar() ) != EOF ) { /* \c */ if (c == 'n') c = '\n'; else if (c == 'r') c = '\r'; else if (c == 't') c = '\t'; *b++ = c; notkeyword = 1; } else { /* \EOF */ break; } c = yychar(); } /* Automatically switch modes after reading the token. */ if ( scanmode == SCAN_CONDB ) scanmode = SCAN_COND; /* Check obvious errors. */ if ( b == buf + sizeof( buf ) ) { yyerror( "string too big" ); goto eof; } if ( inquote ) { yyerror( "unmatched \" in string" ); goto eof; } /* We looked ahead a character - back up. */ if ( c != EOF ) yyprev(); /* Scan token table. Do not scan if it is obviously not a keyword or if * it is an alphabetic when were looking for punctuation. 
*/ *b = 0; yylval.type = ARG; if ( !notkeyword && !( isalpha( *buf ) && ( scanmode == SCAN_PUNCT || scanmode == SCAN_PARAMS || scanmode == SCAN_ASSIGN ) ) ) for ( k = keywords; k->word; ++k ) if ( ( *buf == *k->word ) && !strcmp( k->word, buf ) ) { yylval.type = k->type; yylval.keyword = k->word; /* used by symdump */ break; } if ( yylval.type == ARG ) yylval.string = object_new( buf ); if ( scanmode == SCAN_NORMAL && yylval.type == ARG ) scanmode = SCAN_XASSIGN; if ( has_token_warning ) do_token_warning(); } if ( DEBUG_SCAN ) out_printf( "scan %s\n", symdump( &yylval ) ); return yylval.type; eof: /* We do not reset yylval.file & yylval.line here so unexpected EOF error * messages would include correct error location information. */ yylval.type = EOF; return yylval.type; } static char * symdump( YYSTYPE * s ) { static char buf[ BIGGEST_TOKEN + 20 ]; switch ( s->type ) { case EOF : sprintf( buf, "EOF" ); break; case 0 : sprintf( buf, "unknown symbol %s", object_str( s->string ) ); break; case ARG : sprintf( buf, "argument %s" , object_str( s->string ) ); break; case STRING: sprintf( buf, "string \"%s\"" , object_str( s->string ) ); break; default : sprintf( buf, "keyword %s" , s->keyword ); break; } return buf; } /* * Get information about the current file and line, for those epsilon * transitions that produce a parse. */ void yyinput_last_read_token( OBJECT * * name, int * line ) { /* TODO: Consider whether and when we might want to report where the last * read token ended, e.g. EOF errors inside string literals. */ *name = yylval.file; *line = yylval.line; }