1 /* 2 [The "BSD license"] 3 Copyright (c) 2005-2009 Terence Parr 4 All rights reserved. 5 6 Redistribution and use in source and binary forms, with or without 7 modification, are permitted provided that the following conditions 8 are met: 9 1. Redistributions of source code must retain the above copyright 10 notice, this list of conditions and the following disclaimer. 11 2. Redistributions in binary form must reproduce the above copyright 12 notice, this list of conditions and the following disclaimer in the 13 documentation and/or other materials provided with the distribution. 14 3. The name of the author may not be used to endorse or promote products 15 derived from this software without specific prior written permission. 16 17 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR 18 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 19 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 20 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, 21 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT 22 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF 26 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 */ 28 package org.antlr.runtime; 29 30 import java.util.ArrayList; 31 import java.util.HashMap; 32 import java.util.List; 33 import java.util.Map; 34 35 /** A generic recognizer that can handle recognizers generated from 36 * lexer, parser, and tree grammars. This is all the parsing 37 * support code essentially; most of it is error recovery stuff and 38 * backtracking. 39 */ 40 public abstract class BaseRecognizer { 41 public static final int MEMO_RULE_FAILED = -2; 42 public static final int MEMO_RULE_UNKNOWN = -1; 43 public static final int INITIAL_FOLLOW_STACK_SIZE = 100; 44 45 // copies from Token object for convenience in actions 46 public static final int DEFAULT_TOKEN_CHANNEL = Token.DEFAULT_CHANNEL; 47 public static final int HIDDEN = Token.HIDDEN_CHANNEL; 48 49 public static final String NEXT_TOKEN_RULE_NAME = "nextToken"; 50 51 /** State of a lexer, parser, or tree parser are collected into a state 52 * object so the state can be shared. This sharing is needed to 53 * have one grammar import others and share same error variables 54 * and other state variables. It's a kind of explicit multiple 55 * inheritance via delegation of methods and shared state. 56 */ 57 protected RecognizerSharedState state; 58 BaseRecognizer()59 public BaseRecognizer() { 60 state = new RecognizerSharedState(); 61 } 62 BaseRecognizer(RecognizerSharedState state)63 public BaseRecognizer(RecognizerSharedState state) { 64 if ( state==null ) { 65 state = new RecognizerSharedState(); 66 } 67 this.state = state; 68 } 69 70 /** reset the parser's state; subclasses must rewinds the input stream */ reset()71 public void reset() { 72 // wack everything related to error recovery 73 if ( state==null ) { 74 return; // no shared state work to do 75 } 76 state._fsp = -1; 77 state.errorRecovery = false; 78 state.lastErrorIndex = -1; 79 state.failed = false; 80 state.syntaxErrors = 0; 81 // wack everything related to backtracking and memoization 82 state.backtracking = 0; 83 for (int i = 0; state.ruleMemo!=null && i < state.ruleMemo.length; i++) { // wipe cache 84 state.ruleMemo[i] = null; 85 } 86 } 87 88 89 /** Match current input symbol against ttype. Attempt 90 * single token insertion or deletion error recovery. If 91 * that fails, throw MismatchedTokenException. 92 * 93 * To turn off single token insertion or deletion error 94 * recovery, override recoverFromMismatchedToken() and have it 95 * throw an exception. See TreeParser.recoverFromMismatchedToken(). 96 * This way any error in a rule will cause an exception and 97 * immediate exit from rule. Rule would recover by resynchronizing 98 * to the set of symbols that can follow rule ref. 99 */ match(IntStream input, int ttype, BitSet follow)100 public Object match(IntStream input, int ttype, BitSet follow) 101 throws RecognitionException 102 { 103 //System.out.println("match "+((TokenStream)input).LT(1)); 104 Object matchedSymbol = getCurrentInputSymbol(input); 105 if ( input.LA(1)==ttype ) { 106 input.consume(); 107 state.errorRecovery = false; 108 state.failed = false; 109 return matchedSymbol; 110 } 111 if ( state.backtracking>0 ) { 112 state.failed = true; 113 return matchedSymbol; 114 } 115 matchedSymbol = recoverFromMismatchedToken(input, ttype, follow); 116 return matchedSymbol; 117 } 118 119 /** Match the wildcard: in a symbol */ matchAny(IntStream input)120 public void matchAny(IntStream input) { 121 state.errorRecovery = false; 122 state.failed = false; 123 input.consume(); 124 } 125 mismatchIsUnwantedToken(IntStream input, int ttype)126 public boolean mismatchIsUnwantedToken(IntStream input, int ttype) { 127 return input.LA(2)==ttype; 128 } 129 mismatchIsMissingToken(IntStream input, BitSet follow)130 public boolean mismatchIsMissingToken(IntStream input, BitSet follow) { 131 if ( follow==null ) { 132 // we have no information about the follow; we can only consume 133 // a single token and hope for the best 134 return false; 135 } 136 // compute what can follow this grammar element reference 137 if ( follow.member(Token.EOR_TOKEN_TYPE) ) { 138 BitSet viableTokensFollowingThisRule = computeContextSensitiveRuleFOLLOW(); 139 follow = follow.or(viableTokensFollowingThisRule); 140 if ( state._fsp>=0 ) { // remove EOR if we're not the start symbol 141 follow.remove(Token.EOR_TOKEN_TYPE); 142 } 143 } 144 // if current token is consistent with what could come after set 145 // then we know we're missing a token; error recovery is free to 146 // "insert" the missing token 147 148 //System.out.println("viable tokens="+follow.toString(getTokenNames())); 149 //System.out.println("LT(1)="+((TokenStream)input).LT(1)); 150 151 // BitSet cannot handle negative numbers like -1 (EOF) so I leave EOR 152 // in follow set to indicate that the fall of the start symbol is 153 // in the set (EOF can follow). 154 if ( follow.member(input.LA(1)) || follow.member(Token.EOR_TOKEN_TYPE) ) { 155 //System.out.println("LT(1)=="+((TokenStream)input).LT(1)+" is consistent with what follows; inserting..."); 156 return true; 157 } 158 return false; 159 } 160 161 /** Report a recognition problem. 162 * 163 * This method sets errorRecovery to indicate the parser is recovering 164 * not parsing. Once in recovery mode, no errors are generated. 165 * To get out of recovery mode, the parser must successfully match 166 * a token (after a resync). So it will go: 167 * 168 * 1. error occurs 169 * 2. enter recovery mode, report error 170 * 3. consume until token found in resynch set 171 * 4. try to resume parsing 172 * 5. next match() will reset errorRecovery mode 173 * 174 * If you override, make sure to update syntaxErrors if you care about that. 175 */ reportError(RecognitionException e)176 public void reportError(RecognitionException e) { 177 // if we've already reported an error and have not matched a token 178 // yet successfully, don't report any errors. 179 if ( state.errorRecovery ) { 180 //System.err.print("[SPURIOUS] "); 181 return; 182 } 183 state.syntaxErrors++; // don't count spurious 184 state.errorRecovery = true; 185 186 displayRecognitionError(this.getTokenNames(), e); 187 } 188 displayRecognitionError(String[] tokenNames, RecognitionException e)189 public void displayRecognitionError(String[] tokenNames, 190 RecognitionException e) 191 { 192 String hdr = getErrorHeader(e); 193 String msg = getErrorMessage(e, tokenNames); 194 emitErrorMessage(hdr+" "+msg); 195 } 196 197 /** What error message should be generated for the various 198 * exception types? 199 * 200 * Not very object-oriented code, but I like having all error message 201 * generation within one method rather than spread among all of the 202 * exception classes. This also makes it much easier for the exception 203 * handling because the exception classes do not have to have pointers back 204 * to this object to access utility routines and so on. Also, changing 205 * the message for an exception type would be difficult because you 206 * would have to subclassing exception, but then somehow get ANTLR 207 * to make those kinds of exception objects instead of the default. 208 * This looks weird, but trust me--it makes the most sense in terms 209 * of flexibility. 210 * 211 * For grammar debugging, you will want to override this to add 212 * more information such as the stack frame with 213 * getRuleInvocationStack(e, this.getClass().getName()) and, 214 * for no viable alts, the decision description and state etc... 215 * 216 * Override this to change the message generated for one or more 217 * exception types. 218 */ getErrorMessage(RecognitionException e, String[] tokenNames)219 public String getErrorMessage(RecognitionException e, String[] tokenNames) { 220 String msg = e.getMessage(); 221 if ( e instanceof UnwantedTokenException ) { 222 UnwantedTokenException ute = (UnwantedTokenException)e; 223 String tokenName="<unknown>"; 224 if ( ute.expecting== Token.EOF ) { 225 tokenName = "EOF"; 226 } 227 else { 228 tokenName = tokenNames[ute.expecting]; 229 } 230 msg = "extraneous input "+getTokenErrorDisplay(ute.getUnexpectedToken())+ 231 " expecting "+tokenName; 232 } 233 else if ( e instanceof MissingTokenException ) { 234 MissingTokenException mte = (MissingTokenException)e; 235 String tokenName="<unknown>"; 236 if ( mte.expecting== Token.EOF ) { 237 tokenName = "EOF"; 238 } 239 else { 240 tokenName = tokenNames[mte.expecting]; 241 } 242 msg = "missing "+tokenName+" at "+getTokenErrorDisplay(e.token); 243 } 244 else if ( e instanceof MismatchedTokenException ) { 245 MismatchedTokenException mte = (MismatchedTokenException)e; 246 String tokenName="<unknown>"; 247 if ( mte.expecting== Token.EOF ) { 248 tokenName = "EOF"; 249 } 250 else { 251 tokenName = tokenNames[mte.expecting]; 252 } 253 msg = "mismatched input "+getTokenErrorDisplay(e.token)+ 254 " expecting "+tokenName; 255 } 256 else if ( e instanceof MismatchedTreeNodeException ) { 257 MismatchedTreeNodeException mtne = (MismatchedTreeNodeException)e; 258 String tokenName="<unknown>"; 259 if ( mtne.expecting==Token.EOF ) { 260 tokenName = "EOF"; 261 } 262 else { 263 tokenName = tokenNames[mtne.expecting]; 264 } 265 msg = "mismatched tree node: "+mtne.node+ 266 " expecting "+tokenName; 267 } 268 else if ( e instanceof NoViableAltException ) { 269 //NoViableAltException nvae = (NoViableAltException)e; 270 // for development, can add "decision=<<"+nvae.grammarDecisionDescription+">>" 271 // and "(decision="+nvae.decisionNumber+") and 272 // "state "+nvae.stateNumber 273 msg = "no viable alternative at input "+getTokenErrorDisplay(e.token); 274 } 275 else if ( e instanceof EarlyExitException ) { 276 //EarlyExitException eee = (EarlyExitException)e; 277 // for development, can add "(decision="+eee.decisionNumber+")" 278 msg = "required (...)+ loop did not match anything at input "+ 279 getTokenErrorDisplay(e.token); 280 } 281 else if ( e instanceof MismatchedSetException ) { 282 MismatchedSetException mse = (MismatchedSetException)e; 283 msg = "mismatched input "+getTokenErrorDisplay(e.token)+ 284 " expecting set "+mse.expecting; 285 } 286 else if ( e instanceof MismatchedNotSetException ) { 287 MismatchedNotSetException mse = (MismatchedNotSetException)e; 288 msg = "mismatched input "+getTokenErrorDisplay(e.token)+ 289 " expecting set "+mse.expecting; 290 } 291 else if ( e instanceof FailedPredicateException ) { 292 FailedPredicateException fpe = (FailedPredicateException)e; 293 msg = "rule "+fpe.ruleName+" failed predicate: {"+ 294 fpe.predicateText+"}?"; 295 } 296 return msg; 297 } 298 299 /** Get number of recognition errors (lexer, parser, tree parser). Each 300 * recognizer tracks its own number. So parser and lexer each have 301 * separate count. Does not count the spurious errors found between 302 * an error and next valid token match 303 * 304 * See also reportError() 305 */ getNumberOfSyntaxErrors()306 public int getNumberOfSyntaxErrors() { 307 return state.syntaxErrors; 308 } 309 310 /** What is the error header, normally line/character position information? */ getErrorHeader(RecognitionException e)311 public String getErrorHeader(RecognitionException e) { 312 if ( getSourceName()!=null ) 313 return getSourceName()+" line "+e.line+":"+e.charPositionInLine; 314 315 return "line "+e.line+":"+e.charPositionInLine; 316 } 317 318 /** How should a token be displayed in an error message? The default 319 * is to display just the text, but during development you might 320 * want to have a lot of information spit out. Override in that case 321 * to use t.toString() (which, for CommonToken, dumps everything about 322 * the token). This is better than forcing you to override a method in 323 * your token objects because you don't have to go modify your lexer 324 * so that it creates a new Java type. 325 */ getTokenErrorDisplay(Token t)326 public String getTokenErrorDisplay(Token t) { 327 String s = t.getText(); 328 if ( s==null ) { 329 if ( t.getType()==Token.EOF ) { 330 s = "<EOF>"; 331 } 332 else { 333 s = "<"+t.getType()+">"; 334 } 335 } 336 s = s.replaceAll("\n","\\\\n"); 337 s = s.replaceAll("\r","\\\\r"); 338 s = s.replaceAll("\t","\\\\t"); 339 return "'"+s+"'"; 340 } 341 342 /** Override this method to change where error messages go */ emitErrorMessage(String msg)343 public void emitErrorMessage(String msg) { 344 System.err.println(msg); 345 } 346 347 /** Recover from an error found on the input stream. This is 348 * for NoViableAlt and mismatched symbol exceptions. If you enable 349 * single token insertion and deletion, this will usually not 350 * handle mismatched symbol exceptions but there could be a mismatched 351 * token that the match() routine could not recover from. 352 */ recover(IntStream input, RecognitionException re)353 public void recover(IntStream input, RecognitionException re) { 354 if ( state.lastErrorIndex==input.index() ) { 355 // uh oh, another error at same token index; must be a case 356 // where LT(1) is in the recovery token set so nothing is 357 // consumed; consume a single token so at least to prevent 358 // an infinite loop; this is a failsafe. 359 input.consume(); 360 } 361 state.lastErrorIndex = input.index(); 362 BitSet followSet = computeErrorRecoverySet(); 363 beginResync(); 364 consumeUntil(input, followSet); 365 endResync(); 366 } 367 368 /** A hook to listen in on the token consumption during error recovery. 369 * The DebugParser subclasses this to fire events to the listenter. 370 */ beginResync()371 public void beginResync() { 372 } 373 endResync()374 public void endResync() { 375 } 376 377 /* Compute the error recovery set for the current rule. During 378 * rule invocation, the parser pushes the set of tokens that can 379 * follow that rule reference on the stack; this amounts to 380 * computing FIRST of what follows the rule reference in the 381 * enclosing rule. This local follow set only includes tokens 382 * from within the rule; i.e., the FIRST computation done by 383 * ANTLR stops at the end of a rule. 384 * 385 * EXAMPLE 386 * 387 * When you find a "no viable alt exception", the input is not 388 * consistent with any of the alternatives for rule r. The best 389 * thing to do is to consume tokens until you see something that 390 * can legally follow a call to r *or* any rule that called r. 391 * You don't want the exact set of viable next tokens because the 392 * input might just be missing a token--you might consume the 393 * rest of the input looking for one of the missing tokens. 394 * 395 * Consider grammar: 396 * 397 * a : '[' b ']' 398 * | '(' b ')' 399 * ; 400 * b : c '^' INT ; 401 * c : ID 402 * | INT 403 * ; 404 * 405 * At each rule invocation, the set of tokens that could follow 406 * that rule is pushed on a stack. Here are the various "local" 407 * follow sets: 408 * 409 * FOLLOW(b1_in_a) = FIRST(']') = ']' 410 * FOLLOW(b2_in_a) = FIRST(')') = ')' 411 * FOLLOW(c_in_b) = FIRST('^') = '^' 412 * 413 * Upon erroneous input "[]", the call chain is 414 * 415 * a -> b -> c 416 * 417 * and, hence, the follow context stack is: 418 * 419 * depth local follow set after call to rule 420 * 0 <EOF> a (from main()) 421 * 1 ']' b 422 * 3 '^' c 423 * 424 * Notice that ')' is not included, because b would have to have 425 * been called from a different context in rule a for ')' to be 426 * included. 427 * 428 * For error recovery, we cannot consider FOLLOW(c) 429 * (context-sensitive or otherwise). We need the combined set of 430 * all context-sensitive FOLLOW sets--the set of all tokens that 431 * could follow any reference in the call chain. We need to 432 * resync to one of those tokens. Note that FOLLOW(c)='^' and if 433 * we resync'd to that token, we'd consume until EOF. We need to 434 * sync to context-sensitive FOLLOWs for a, b, and c: {']','^'}. 435 * In this case, for input "[]", LA(1) is in this set so we would 436 * not consume anything and after printing an error rule c would 437 * return normally. It would not find the required '^' though. 438 * At this point, it gets a mismatched token error and throws an 439 * exception (since LA(1) is not in the viable following token 440 * set). The rule exception handler tries to recover, but finds 441 * the same recovery set and doesn't consume anything. Rule b 442 * exits normally returning to rule a. Now it finds the ']' (and 443 * with the successful match exits errorRecovery mode). 444 * 445 * So, you cna see that the parser walks up call chain looking 446 * for the token that was a member of the recovery set. 447 * 448 * Errors are not generated in errorRecovery mode. 449 * 450 * ANTLR's error recovery mechanism is based upon original ideas: 451 * 452 * "Algorithms + Data Structures = Programs" by Niklaus Wirth 453 * 454 * and 455 * 456 * "A note on error recovery in recursive descent parsers": 457 * http://portal.acm.org/citation.cfm?id=947902.947905 458 * 459 * Later, Josef Grosch had some good ideas: 460 * 461 * "Efficient and Comfortable Error Recovery in Recursive Descent 462 * Parsers": 463 * ftp://www.cocolab.com/products/cocktail/doca4.ps/ell.ps.zip 464 * 465 * Like Grosch I implemented local FOLLOW sets that are combined 466 * at run-time upon error to avoid overhead during parsing. 467 */ computeErrorRecoverySet()468 protected BitSet computeErrorRecoverySet() { 469 return combineFollows(false); 470 } 471 472 /** Compute the context-sensitive FOLLOW set for current rule. 473 * This is set of token types that can follow a specific rule 474 * reference given a specific call chain. You get the set of 475 * viable tokens that can possibly come next (lookahead depth 1) 476 * given the current call chain. Contrast this with the 477 * definition of plain FOLLOW for rule r: 478 * 479 * FOLLOW(r)={x | S=>*alpha r beta in G and x in FIRST(beta)} 480 * 481 * where x in T* and alpha, beta in V*; T is set of terminals and 482 * V is the set of terminals and nonterminals. In other words, 483 * FOLLOW(r) is the set of all tokens that can possibly follow 484 * references to r in *any* sentential form (context). At 485 * runtime, however, we know precisely which context applies as 486 * we have the call chain. We may compute the exact (rather 487 * than covering superset) set of following tokens. 488 * 489 * For example, consider grammar: 490 * 491 * stat : ID '=' expr ';' // FOLLOW(stat)=={EOF} 492 * | "return" expr '.' 493 * ; 494 * expr : atom ('+' atom)* ; // FOLLOW(expr)=={';','.',')'} 495 * atom : INT // FOLLOW(atom)=={'+',')',';','.'} 496 * | '(' expr ')' 497 * ; 498 * 499 * The FOLLOW sets are all inclusive whereas context-sensitive 500 * FOLLOW sets are precisely what could follow a rule reference. 501 * For input input "i=(3);", here is the derivation: 502 * 503 * stat => ID '=' expr ';' 504 * => ID '=' atom ('+' atom)* ';' 505 * => ID '=' '(' expr ')' ('+' atom)* ';' 506 * => ID '=' '(' atom ')' ('+' atom)* ';' 507 * => ID '=' '(' INT ')' ('+' atom)* ';' 508 * => ID '=' '(' INT ')' ';' 509 * 510 * At the "3" token, you'd have a call chain of 511 * 512 * stat -> expr -> atom -> expr -> atom 513 * 514 * What can follow that specific nested ref to atom? Exactly ')' 515 * as you can see by looking at the derivation of this specific 516 * input. Contrast this with the FOLLOW(atom)={'+',')',';','.'}. 517 * 518 * You want the exact viable token set when recovering from a 519 * token mismatch. Upon token mismatch, if LA(1) is member of 520 * the viable next token set, then you know there is most likely 521 * a missing token in the input stream. "Insert" one by just not 522 * throwing an exception. 523 */ computeContextSensitiveRuleFOLLOW()524 protected BitSet computeContextSensitiveRuleFOLLOW() { 525 return combineFollows(true); 526 } 527 528 // what is exact? it seems to only add sets from above on stack 529 // if EOR is in set i. When it sees a set w/o EOR, it stops adding. 530 // Why would we ever want them all? Maybe no viable alt instead of 531 // mismatched token? combineFollows(boolean exact)532 protected BitSet combineFollows(boolean exact) { 533 int top = state._fsp; 534 BitSet followSet = new BitSet(); 535 for (int i=top; i>=0; i--) { 536 BitSet localFollowSet = (BitSet)state.following[i]; 537 /* 538 System.out.println("local follow depth "+i+"="+ 539 localFollowSet.toString(getTokenNames())+")"); 540 */ 541 followSet.orInPlace(localFollowSet); 542 if ( exact ) { 543 // can we see end of rule? 544 if ( localFollowSet.member(Token.EOR_TOKEN_TYPE) ) { 545 // Only leave EOR in set if at top (start rule); this lets 546 // us know if have to include follow(start rule); i.e., EOF 547 if ( i>0 ) { 548 followSet.remove(Token.EOR_TOKEN_TYPE); 549 } 550 } 551 else { // can't see end of rule, quit 552 break; 553 } 554 } 555 } 556 return followSet; 557 } 558 559 /** Attempt to recover from a single missing or extra token. 560 * 561 * EXTRA TOKEN 562 * 563 * LA(1) is not what we are looking for. If LA(2) has the right token, 564 * however, then assume LA(1) is some extra spurious token. Delete it 565 * and LA(2) as if we were doing a normal match(), which advances the 566 * input. 567 * 568 * MISSING TOKEN 569 * 570 * If current token is consistent with what could come after 571 * ttype then it is ok to "insert" the missing token, else throw 572 * exception For example, Input "i=(3;" is clearly missing the 573 * ')'. When the parser returns from the nested call to expr, it 574 * will have call chain: 575 * 576 * stat -> expr -> atom 577 * 578 * and it will be trying to match the ')' at this point in the 579 * derivation: 580 * 581 * => ID '=' '(' INT ')' ('+' atom)* ';' 582 * ^ 583 * match() will see that ';' doesn't match ')' and report a 584 * mismatched token error. To recover, it sees that LA(1)==';' 585 * is in the set of tokens that can follow the ')' token 586 * reference in rule atom. It can assume that you forgot the ')'. 587 */ recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)588 protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow) 589 throws RecognitionException 590 { 591 RecognitionException e = null; 592 // if next token is what we are looking for then "delete" this token 593 if ( mismatchIsUnwantedToken(input, ttype) ) { 594 e = new UnwantedTokenException(ttype, input); 595 /* 596 System.err.println("recoverFromMismatchedToken deleting "+ 597 ((TokenStream)input).LT(1)+ 598 " since "+((TokenStream)input).LT(2)+" is what we want"); 599 */ 600 beginResync(); 601 input.consume(); // simply delete extra token 602 endResync(); 603 reportError(e); // report after consuming so AW sees the token in the exception 604 // we want to return the token we're actually matching 605 Object matchedSymbol = getCurrentInputSymbol(input); 606 input.consume(); // move past ttype token as if all were ok 607 return matchedSymbol; 608 } 609 // can't recover with single token deletion, try insertion 610 if ( mismatchIsMissingToken(input, follow) ) { 611 Object inserted = getMissingSymbol(input, e, ttype, follow); 612 e = new MissingTokenException(ttype, input, inserted); 613 reportError(e); // report after inserting so AW sees the token in the exception 614 return inserted; 615 } 616 // even that didn't work; must throw the exception 617 e = new MismatchedTokenException(ttype, input); 618 throw e; 619 } 620 621 /** Not currently used */ recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)622 public Object recoverFromMismatchedSet(IntStream input, 623 RecognitionException e, 624 BitSet follow) 625 throws RecognitionException 626 { 627 if ( mismatchIsMissingToken(input, follow) ) { 628 // System.out.println("missing token"); 629 reportError(e); 630 // we don't know how to conjure up a token for sets yet 631 return getMissingSymbol(input, e, Token.INVALID_TOKEN_TYPE, follow); 632 } 633 // TODO do single token deletion like above for Token mismatch 634 throw e; 635 } 636 637 /** Match needs to return the current input symbol, which gets put 638 * into the label for the associated token ref; e.g., x=ID. Token 639 * and tree parsers need to return different objects. Rather than test 640 * for input stream type or change the IntStream interface, I use 641 * a simple method to ask the recognizer to tell me what the current 642 * input symbol is. 643 * 644 * This is ignored for lexers. 645 */ getCurrentInputSymbol(IntStream input)646 protected Object getCurrentInputSymbol(IntStream input) { return null; } 647 648 /** Conjure up a missing token during error recovery. 649 * 650 * The recognizer attempts to recover from single missing 651 * symbols. But, actions might refer to that missing symbol. 652 * For example, x=ID {f($x);}. The action clearly assumes 653 * that there has been an identifier matched previously and that 654 * $x points at that token. If that token is missing, but 655 * the next token in the stream is what we want we assume that 656 * this token is missing and we keep going. Because we 657 * have to return some token to replace the missing token, 658 * we have to conjure one up. This method gives the user control 659 * over the tokens returned for missing tokens. Mostly, 660 * you will want to create something special for identifier 661 * tokens. For literals such as '{' and ',', the default 662 * action in the parser or tree parser works. It simply creates 663 * a CommonToken of the appropriate type. The text will be the token. 664 * If you change what tokens must be created by the lexer, 665 * override this method to create the appropriate tokens. 666 */ getMissingSymbol(IntStream input, RecognitionException e, int expectedTokenType, BitSet follow)667 protected Object getMissingSymbol(IntStream input, 668 RecognitionException e, 669 int expectedTokenType, 670 BitSet follow) 671 { 672 return null; 673 } 674 consumeUntil(IntStream input, int tokenType)675 public void consumeUntil(IntStream input, int tokenType) { 676 //System.out.println("consumeUntil "+tokenType); 677 int ttype = input.LA(1); 678 while (ttype != Token.EOF && ttype != tokenType) { 679 input.consume(); 680 ttype = input.LA(1); 681 } 682 } 683 684 /** Consume tokens until one matches the given token set */ consumeUntil(IntStream input, BitSet set)685 public void consumeUntil(IntStream input, BitSet set) { 686 //System.out.println("consumeUntil("+set.toString(getTokenNames())+")"); 687 int ttype = input.LA(1); 688 while (ttype != Token.EOF && !set.member(ttype) ) { 689 //System.out.println("consume during recover LA(1)="+getTokenNames()[input.LA(1)]); 690 input.consume(); 691 ttype = input.LA(1); 692 } 693 } 694 695 /** Push a rule's follow set using our own hardcoded stack */ pushFollow(BitSet fset)696 protected void pushFollow(BitSet fset) { 697 if ( (state._fsp +1)>=state.following.length ) { 698 BitSet[] f = new BitSet[state.following.length*2]; 699 System.arraycopy(state.following, 0, f, 0, state.following.length); 700 state.following = f; 701 } 702 state.following[++state._fsp] = fset; 703 } 704 705 /** Return List<String> of the rules in your parser instance 706 * leading up to a call to this method. You could override if 707 * you want more details such as the file/line info of where 708 * in the parser java code a rule is invoked. 709 * 710 * This is very useful for error messages and for context-sensitive 711 * error recovery. 712 */ getRuleInvocationStack()713 public List getRuleInvocationStack() { 714 String parserClassName = getClass().getName(); 715 return getRuleInvocationStack(new Throwable(), parserClassName); 716 } 717 718 /** A more general version of getRuleInvocationStack where you can 719 * pass in, for example, a RecognitionException to get it's rule 720 * stack trace. This routine is shared with all recognizers, hence, 721 * static. 722 * 723 * TODO: move to a utility class or something; weird having lexer call this 724 */ getRuleInvocationStack(Throwable e, String recognizerClassName)725 public static List getRuleInvocationStack(Throwable e, 726 String recognizerClassName) 727 { 728 List rules = new ArrayList(); 729 StackTraceElement[] stack = e.getStackTrace(); 730 int i = 0; 731 for (i=stack.length-1; i>=0; i--) { 732 StackTraceElement t = stack[i]; 733 if ( t.getClassName().startsWith("org.antlr.runtime.") ) { 734 continue; // skip support code such as this method 735 } 736 if ( t.getMethodName().equals(NEXT_TOKEN_RULE_NAME) ) { 737 continue; 738 } 739 if ( !t.getClassName().equals(recognizerClassName) ) { 740 continue; // must not be part of this parser 741 } 742 rules.add(t.getMethodName()); 743 } 744 return rules; 745 } 746 getBacktrackingLevel()747 public int getBacktrackingLevel() { return state.backtracking; } 748 setBacktrackingLevel(int n)749 public void setBacktrackingLevel(int n) { state.backtracking = n; } 750 751 /** Return whether or not a backtracking attempt failed. */ failed()752 public boolean failed() { return state.failed; } 753 754 /** Used to print out token names like ID during debugging and 755 * error reporting. The generated parsers implement a method 756 * that overrides this to point to their String[] tokenNames. 757 */ getTokenNames()758 public String[] getTokenNames() { 759 return null; 760 } 761 762 /** For debugging and other purposes, might want the grammar name. 763 * Have ANTLR generate an implementation for this method. 764 */ getGrammarFileName()765 public String getGrammarFileName() { 766 return null; 767 } 768 getSourceName()769 public abstract String getSourceName(); 770 771 /** A convenience method for use most often with template rewrites. 772 * Convert a List<Token> to List<String> 773 */ toStrings(List tokens)774 public List toStrings(List tokens) { 775 if ( tokens==null ) return null; 776 List strings = new ArrayList(tokens.size()); 777 for (int i=0; i<tokens.size(); i++) { 778 strings.add(((Token)tokens.get(i)).getText()); 779 } 780 return strings; 781 } 782 783 /** Given a rule number and a start token index number, return 784 * MEMO_RULE_UNKNOWN if the rule has not parsed input starting from 785 * start index. If this rule has parsed input starting from the 786 * start index before, then return where the rule stopped parsing. 787 * It returns the index of the last token matched by the rule. 788 * 789 * For now we use a hashtable and just the slow Object-based one. 790 * Later, we can make a special one for ints and also one that 791 * tosses out data after we commit past input position i. 792 */ getRuleMemoization(int ruleIndex, int ruleStartIndex)793 public int getRuleMemoization(int ruleIndex, int ruleStartIndex) { 794 if ( state.ruleMemo[ruleIndex]==null ) { 795 state.ruleMemo[ruleIndex] = new HashMap(); 796 } 797 Integer stopIndexI = 798 (Integer)state.ruleMemo[ruleIndex].get(new Integer(ruleStartIndex)); 799 if ( stopIndexI==null ) { 800 return MEMO_RULE_UNKNOWN; 801 } 802 return stopIndexI.intValue(); 803 } 804 805 /** Has this rule already parsed input at the current index in the 806 * input stream? Return the stop token index or MEMO_RULE_UNKNOWN. 807 * If we attempted but failed to parse properly before, return 808 * MEMO_RULE_FAILED. 809 * 810 * This method has a side-effect: if we have seen this input for 811 * this rule and successfully parsed before, then seek ahead to 812 * 1 past the stop token matched for this rule last time. 813 */ alreadyParsedRule(IntStream input, int ruleIndex)814 public boolean alreadyParsedRule(IntStream input, int ruleIndex) { 815 int stopIndex = getRuleMemoization(ruleIndex, input.index()); 816 if ( stopIndex==MEMO_RULE_UNKNOWN ) { 817 return false; 818 } 819 if ( stopIndex==MEMO_RULE_FAILED ) { 820 //System.out.println("rule "+ruleIndex+" will never succeed"); 821 state.failed=true; 822 } 823 else { 824 //System.out.println("seen rule "+ruleIndex+" before; skipping ahead to @"+(stopIndex+1)+" failed="+state.failed); 825 input.seek(stopIndex+1); // jump to one past stop token 826 } 827 return true; 828 } 829 830 /** Record whether or not this rule parsed the input at this position 831 * successfully. Use a standard java hashtable for now. 832 */ memoize(IntStream input, int ruleIndex, int ruleStartIndex)833 public void memoize(IntStream input, 834 int ruleIndex, 835 int ruleStartIndex) 836 { 837 int stopTokenIndex = state.failed?MEMO_RULE_FAILED:input.index()-1; 838 if ( state.ruleMemo==null ) { 839 System.err.println("!!!!!!!!! memo array is null for "+ getGrammarFileName()); 840 } 841 if ( ruleIndex >= state.ruleMemo.length ) { 842 System.err.println("!!!!!!!!! memo size is "+state.ruleMemo.length+", but rule index is "+ruleIndex); 843 } 844 if ( state.ruleMemo[ruleIndex]!=null ) { 845 state.ruleMemo[ruleIndex].put( 846 new Integer(ruleStartIndex), new Integer(stopTokenIndex) 847 ); 848 } 849 } 850 851 /** return how many rule/input-index pairs there are in total. 852 * TODO: this includes synpreds. :( 853 */ getRuleMemoizationCacheSize()854 public int getRuleMemoizationCacheSize() { 855 int n = 0; 856 for (int i = 0; state.ruleMemo!=null && i < state.ruleMemo.length; i++) { 857 Map ruleMap = state.ruleMemo[i]; 858 if ( ruleMap!=null ) { 859 n += ruleMap.size(); // how many input indexes are recorded? 860 } 861 } 862 return n; 863 } 864 traceIn(String ruleName, int ruleIndex, Object inputSymbol)865 public void traceIn(String ruleName, int ruleIndex, Object inputSymbol) { 866 System.out.print("enter "+ruleName+" "+inputSymbol); 867 if ( state.backtracking>0 ) { 868 System.out.print(" backtracking="+state.backtracking); 869 } 870 System.out.println(); 871 } 872 traceOut(String ruleName, int ruleIndex, Object inputSymbol)873 public void traceOut(String ruleName, 874 int ruleIndex, 875 Object inputSymbol) 876 { 877 System.out.print("exit "+ruleName+" "+inputSymbol); 878 if ( state.backtracking>0 ) { 879 System.out.print(" backtracking="+state.backtracking); 880 if ( state.failed ) System.out.print(" failed"); 881 else System.out.print(" succeeded"); 882 } 883 System.out.println(); 884 } 885 886 } 887