/** A generic recognizer that can handle recognizers generated from * lexer, parser, and tree grammars. This is all the parsing * support code essentially; most of it is error recovery stuff and * backtracking. * *
This class should not be instantiated directly. Instead, use one of its * subclasses.
* * @class * @param {org.antlr.runtime.RecognizerSharedState} [state] state object with * which to initialize this recognizer. */ org.antlr.runtime.BaseRecognizer = function(state) { /** State of a lexer, parser, or tree parser are collected into a state * object so the state can be shared. This sharing is needed to * have one grammar import others and share same error variables * and other state variables. It's a kind of explicit multiple * inheritance via delegation of methods and shared state. * @type org.antlr.runtime.RecognizerSharedState */ this.state = state || new org.antlr.runtime.RecognizerSharedState(); }; org.antlr.lang.augmentObject(org.antlr.runtime.BaseRecognizer, { /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ MEMO_RULE_FAILED: -2, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ MEMO_RULE_UNKNOWN: -1, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ INITIAL_FOLLOW_STACK_SIZE: 100, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ MEMO_RULE_FAILED_I: -2, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ DEFAULT_TOKEN_CHANNEL: org.antlr.runtime.Token.DEFAULT_CHANNEL, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type Number */ HIDDEN: org.antlr.runtime.Token.HIDDEN_CHANNEL, /** * @memberOf org.antlr.runtime.BaseRecognizer * @type String */ NEXT_TOKEN_RULE_NAME: "nextToken" }); org.antlr.runtime.BaseRecognizer.prototype = { /** Reset the parser's state. 
Subclasses must rewinds the input stream */ reset: function() { var i, len; // wack everything related to error recovery if (!this.state) { return; // no shared state work to do } this.state._fsp = -1; this.state.errorRecovery = false; this.state.lastErrorIndex = -1; this.state.failed = false; this.state.syntaxErrors = 0; // wack everything related to backtracking and memoization this.state.backtracking = 0; // wipe cache if (this.state.ruleMemo) { for (i=0, len=this.state.ruleMemo.length; iThis method sets errorRecovery to indicate the parser is recovering * not parsing. Once in recovery mode, no errors are generated. * To get out of recovery mode, the parser must successfully match * a token (after a resync). So it will go:
*
* If you override, make sure to update this.state.syntaxErrors if you
* care about that.
* @param {org.antlr.runtime.RecognitionException} e the error to be reported. */ reportError: function(e) { // if we've already reported an error and have not matched a token // yet successfully, don't report any errors. if ( this.state.errorRecovery ) { return; } this.state.syntaxErrors++; this.state.errorRecovery = true; this.displayRecognitionError(this.getTokenNames(), e); }, /** * Assemble recognition error message. * @param {Array} tokenNames array of token names (strings). * @param {org.antlr.runtime.RecognitionException} e the error to be reported. */ displayRecognitionError: function(tokenNames, e) { var hdr = this.getErrorHeader(e), msg = this.getErrorMessage(e, tokenNames); this.emitErrorMessage(hdr+" "+msg); }, /** * Create error header message. Format isline * lineNumber:positionInLine. * @param {org.antlr.runtime.RecognitionException} e the error to be reported. * @returns {String} The error header. */ getErrorHeader: function(e) { /* handle null input */ if (!org.antlr.lang.isNumber(e.line)) { e.line = 0; } return "line "+e.line+":"+e.charPositionInLine; }, /** * Override this method to change where error messages go. * Defaults to "alert"-ing the error in browsers and "print"-ing the error * in other environments (e.g. Rhino, SpiderMonkey). * @param {String} msg the error message to be displayed. */ emitErrorMessage: function(msg) { if (typeof(window) != 'undefined' && window.alert) { alert(msg); } else { print(msg); } }, /** What error message should be generated for the various * exception types? * *
Not very object-oriented code, but I like having all error message * generation within one method rather than spread among all of the * exception classes. This also makes it much easier for the exception * handling because the exception classes do not have to have pointers back * to this object to access utility routines and so on. Also, changing * the message for an exception type would be difficult because you * would have to be subclassing exceptions, but then somehow get ANTLR * to make those kinds of exception objects instead of the default.
* *For grammar debugging, you will want to override this to add * more information such as the stack frame and no viable alts.
* *Override this to change the message generated for one or more * exception types.
* * @param {Array} tokenNames array of token names (strings). * @param {org.antlr.runtime.RecognitionException} e the error to be reported. * @returns {String} the error message to be emitted. */ getErrorMessage: function(e, tokenNames) { var msg = (e && e.getMessage) ? e.getMessage() : null, mte, tokenName; if ( e instanceof org.antlr.runtime.UnwantedTokenException ) { var ute = e; tokenName="Get number of recognition errors (lexer, parser, tree parser). Each * recognizer tracks its own number. So parser and lexer each have * separate count. Does not count the spurious errors found between * an error and next valid token match.
* *See also {@link #reportError}()
* @returns {Number} number of syntax errors encountered
*/
getNumberOfSyntaxErrors: function() {
    // The counter is incremented by reportError, which skips errors raised
    // while already in recovery mode, so spurious follow-on errors are
    // excluded from this count.
    return this.state.syntaxErrors;
},
/** How should a token be displayed in an error message? The default
* is to display just the text, but during development you might
* want to have a lot of information spit out. Override in that case
* to use t.toString() (which, for CommonToken, dumps everything about
* the token).
* @param {org.antlr.runtime.Token} t token that will be displayed in an error message
* @return {String} the string representation of the token
*/
getTokenErrorDisplay: function(t) {
var s = t.getText();
if ( !org.antlr.lang.isValue(s) ) {
if ( t.getType()==org.antlr.runtime.Token.EOF ) {
s = " During rule invocation, the parser pushes the set of tokens that can
* follow that rule reference on the stack; this amounts to
* computing FIRST of what follows the rule reference in the
* enclosing rule. This local follow set only includes tokens
* from within the rule; i.e., the FIRST computation done by
* ANTLR stops at the end of a rule. EXAMPLE When you find a "no viable alt exception", the input is not
* consistent with any of the alternatives for rule r. The best
* thing to do is to consume tokens until you see something that
* can legally follow a call to r *or* any rule that called r.
* You don't want the exact set of viable next tokens because the
* input might just be missing a token--you might consume the
* rest of the input looking for one of the missing tokens. Consider grammar: At each rule invocation, the set of tokens that could follow
* that rule is pushed on a stack. Here are the various "local"
* follow sets: Upon erroneous input "[]", the call chain is and, hence, the follow context stack is: Notice that ')' is not included, because b would have to have
* been called from a different context in rule a for ')' to be
* included. For error recovery, we cannot consider FOLLOW(c)
* (context-sensitive or otherwise). We need the combined set of
* all context-sensitive FOLLOW sets--the set of all tokens that
* could follow any reference in the call chain. We need to
* resync to one of those tokens. Note that FOLLOW(c)='^' and if
* we resync'd to that token, we'd consume until EOF. We need to
* sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}.
* In this case, for input "[]", LA(1) is in this set so we would
* not consume anything and after printing an error rule c would
* return normally. It would not find the required '^' though.
* At this point, it gets a mismatched token error and throws an
* exception (since LA(1) is not in the viable following token
* set). The rule exception handler tries to recover, but finds
* the same recovery set and doesn't consume anything. Rule b
* exits normally returning to rule a. Now it finds the ']' (and
* with the successful match exits errorRecovery mode). So, you can see that the parser walks up the call chain looking
* for the token that was a member of the recovery set. Errors are not generated in errorRecovery mode. ANTLR's error recovery mechanism is based upon original ideas: "Algorithms + Data Structures = Programs" by Niklaus Wirth and "A note on error recovery in recursive descent parsers":
* http://portal.acm.org/citation.cfm?id=947902.947905 Later, Josef Grosch had some good ideas: "Efficient and Comfortable Error Recovery in Recursive Descent
* Parsers":
* ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip Like Grosch I implemented local FOLLOW sets that are combined
* at run-time upon error to avoid overhead during parsing. This is set of token types that can follow a specific rule
* reference given a specific call chain. You get the set of
* viable tokens that can possibly come next (lookahead depth 1)
* given the current call chain. Contrast this with the
* definition of plain FOLLOW for rule r: where x in T* and alpha, beta in V*; T is set of terminals and
* V is the set of terminals and nonterminals. In other words,
* FOLLOW(r) is the set of all tokens that can possibly follow
* references to r in *any* sentential form (context). At
* runtime, however, we know precisely which context applies as
* we have the call chain. We may compute the exact (rather
* than covering superset) set of following tokens. For example, consider grammar: The FOLLOW sets are all inclusive whereas context-sensitive
* FOLLOW sets are precisely what could follow a rule reference.
* For input "i=(3);", here is the derivation: At the "3" token, you'd have a call chain of What can follow that specific nested ref to atom? Exactly ')'
* as you can see by looking at the derivation of this specific
* input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. You want the exact viable token set when recovering from a
* token mismatch. Upon token mismatch, if LA(1) is member of
* the viable next token set, then you know there is most likely
* a missing token in the input stream. "Insert" one by just not
* throwing an exception. EXTRA TOKEN LA(1) is not what we are looking for. If LA(2) has the right token,
* however, then assume LA(1) is some extra spurious token. Delete it
* and LA(2) as if we were doing a normal match(), which advances the
* input. MISSING TOKEN If current token is consistent with what could come after
* ttype then it is ok to "insert" the missing token, else throw
* exception For example, Input "i=(3;" is clearly missing the
* ')'. When the parser returns from the nested call to expr, it
* will have call chain: and it will be trying to match the ')' at this point in the
* derivation: match() will see that ';' doesn't match ')' and report a
* mismatched token error. To recover, it sees that LA(1)==';'
* is in the set of tokens that can follow the ')' token
* reference in rule atom. It can assume that you forgot the ')'. This is ignored for lexers. The recognizer attempts to recover from single missing
* symbols. But, actions might refer to that missing symbol.
* For example, x=ID {f($x);}. The action clearly assumes
* that there has been an identifier matched previously and that
* $x points at that token. If that token is missing, but
* the next token in the stream is what we want we assume that
* this token is missing and we keep going. Because we
* have to return some token to replace the missing token,
* we have to conjure one up. This method gives the user control
* over the tokens returned for missing tokens. Mostly,
* you will want to create something special for identifier
* tokens. For literals such as '{' and ',', the default
* action in the parser or tree parser works. It simply creates
* a CommonToken of the appropriate type. The text will be the token.
* If you change what tokens must be created by the lexer,
* override this method to create the appropriate tokens.
*
*
* a : '[' b ']'
* | '(' b ')'
* ;
* b : c '^' INT ;
* c : ID
* | INT
* ;
*
*
*
* FOLLOW(b1_in_a) = FIRST(']') = ']'
* FOLLOW(b2_in_a) = FIRST(')') = ')'
* FOLLOW(c_in_b) = FIRST('^') = '^'
* a -> b -> c
*
*
*
*
* depth local follow set after call to rule
* 0 FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)}
*
*
*
*
* stat : ID '=' expr ';' // FOLLOW(stat)=={EOF}
* | "return" expr '.'
* ;
* expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'}
* atom : INT // FOLLOW(atom)=={'+',')',';','.'}
* | '(' expr ')'
* ;
*
*
*
* stat => ID '=' expr ';'
* => ID '=' atom ('+' atom)* ';'
* => ID '=' '(' expr ')' ('+' atom)* ';'
* => ID '=' '(' atom ')' ('+' atom)* ';'
* => ID '=' '(' INT ')' ('+' atom)* ';'
* => ID '=' '(' INT ')' ';'
* stat -> expr -> atom -> expr -> atom
*
*
*
* stat -> expr -> atom
* ^
* => ID '=' '(' INT ')' ('+' atom)* ';'
This method has a side-effect: if we have seen this input for * this rule and successfully parsed before, then seek ahead to * 1 past the stop token matched for this rule last time.
* @param {org.antlr.runtime.IntStream} input * @param {Number} ruleIndex * @returns {Boolean} */ alreadyParsedRule: function(input, ruleIndex) { var stopIndex = this.getRuleMemoization(ruleIndex, input.index()); if ( stopIndex==org.antlr.runtime.BaseRecognizer.MEMO_RULE_UNKNOWN ) { return false; } if ( stopIndex==org.antlr.runtime.BaseRecognizer.MEMO_RULE_FAILED ) { //System.out.println("rule "+ruleIndex+" will never succeed"); this.state.failed=true; } else { input.seek(stopIndex+1); // jump to one past stop token } return true; }, /** Record whether or not this rule parsed the input at this position * successfully. Use a standard java hashtable for now. * @param {org.antlr.runtime.IntStream} input * @param {Number} ruleIndex * @param {Number} ruleStartIndex */ memoize: function(input, ruleIndex, ruleStartIndex) { var stopTokenIndex = this.state.failed ? org.antlr.runtime.BaseRecognizer.MEMO_RULE_FAILED : input.index()-1; if ( !org.antlr.lang.isValue(this.state.ruleMemo) ) { throw new Error("!!!!!!!!! memo array is null for "+ this.getGrammarFileName()); } if ( ruleIndex >= this.state.ruleMemo.length ) { throw new Error("!!!!!!!!! memo size is "+this.state.ruleMemo.length+", but rule index is "+ruleIndex); } if ( org.antlr.lang.isValue(this.state.ruleMemo[ruleIndex]) ) { this.state.ruleMemo[ruleIndex][ruleStartIndex] = stopTokenIndex; } }, /** return how many rule/input-index pairs there are in total. * TODO: this includes synpreds. * @returns {Number} */ getRuleMemoizationCacheSize: function() { var n = 0, i; for (i = 0; this.state.ruleMemo && i < this.state.ruleMemo.length; i++) { var ruleMap = this.state.ruleMemo[i]; if ( ruleMap ) { // @todo need to get size of rulemap? n += ruleMap.length; // how many input indexes are recorded? } } return n; }, /** * When a grammar is compiled with the tracing flag enabled, this method is invoked * at the start of each rule. 
* @param {String} ruleName the current ruleName * @param {Number} ruleIndex * @param {Object} inputSymbol */ traceIn: function(ruleName, ruleIndex, inputSymbol) { this.emitErrorMessage("enter "+ruleName+" "+inputSymbol); if ( this.state.failed ) { this.emitErrorMessage(" failed="+this.failed); } if ( this.state.backtracking>0 ) { this.emitErrorMessage(" backtracking="+this.state.backtracking); } // System.out.println(); }, /** * When a grammar is compiled with the tracing flag enabled, this method is invoked * at the end of each rule. * @param {String} ruleName the current ruleName * @param {Number} ruleIndex * @param {Object} inputSymbol */ traceOut: function(ruleName, ruleIndex, inputSymbol) { this.emitErrorMessage("exit "+ruleName+" "+inputSymbol); if ( this.state.failed ) { this.emitErrorMessage(" failed="+this.state.failed); } if ( this.state.backtracking>0 ) { this.emitErrorMessage(" backtracking="+this.state.backtracking); } } };