/*
 [The 'BSD licence']
 Copyright (c) 2004 Terence Parr and Loring Craymer
 All rights reserved.

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:
 1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.
 2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.
 3. The name of the author may not be used to endorse or promote products
    derived from this software without specific prior written permission.

 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/** Python 2.3.3 Grammar
 *
 *  Terence Parr and Loring Craymer
 *  February 2004
 *
 *  Converted to ANTLR v3 November 2005 by Terence Parr.
 *
 *  This grammar was derived automatically from the Python 2.3.3
 *  parser grammar to get a syntactically correct ANTLR grammar
 *  for Python.  Then Terence hand tweaked it to be semantically
 *  correct; i.e., removed lookahead issues etc...  It is LL(1)
 *  except for the (sometimes optional) trailing commas and semi-colons.
 *  It needs two symbols of lookahead in this case.
 *
 *  Starting with Loring's preliminary lexer for Python, I modified it
 *  to do my version of the whole nasty INDENT/DEDENT issue just so I
 *  could understand the problem better.  This grammar requires
 *  PythonTokenStream.java to work.  Also I used some rules from the
 *  semi-formal grammar on the web for Python (automatically
 *  translated to ANTLR format by an ANTLR grammar, naturally <grin>).
 *  The lexical rules for python are particularly nasty and it took me
 *  a long time to get it 'right'; i.e., think about it in the proper
 *  way.  Resist changing the lexer unless you've used ANTLR a lot. ;)
 *
 *  I (Terence) tested this by running it on the jython-2.1/Lib
 *  directory of 40k lines of Python.
 *
 *  REQUIRES ANTLR v3
 */
grammar Python;
options {language=JavaScript;}

// INDENT and DEDENT have no lexer rule in this grammar; they are only
// declared here and used by `suite`.  Presumably the token-stream filter
// mentioned in the header synthesises them from LEADING_WS -- confirm
// against the host code.
tokens {
    INDENT;
    DEDENT;
}

// Lexer state shared by the rules at the bottom of the file.
//   implicitLineJoiningLevel: incremented by LPAREN/LBRACK/LCURLY and
//       decremented by RPAREN/RBRACK/RCURLY; read by NEWLINE and LEADING_WS.
//   startPos: initialised to -1 and never assigned anywhere in this grammar,
//       yet WS, LEADING_WS, NEWLINE and COMMENT test it against 0.
//       NOTE(review): something outside the grammar (presumably an
//       overridden nextToken in the host code) has to store the current
//       character position in line here before each token -- confirm.
@lexer::members {
/** Handles context-sensitive lexing of implicit line joining such as
 *  the case where newline is ignored in cases like this:
 *  a = [3,
 *       4]
 */
this.implicitLineJoiningLevel= 0;
this.startPos = -1;
}

// ---------------------------------------------------------------------
// Start rules (none of these is referenced by another rule in this file)
// ---------------------------------------------------------------------

single_input
    :   NEWLINE
    |   simple_stmt
    |   compound_stmt NEWLINE
    ;

file_input
    :   (NEWLINE | stmt)*
    ;

eval_input
    :   (NEWLINE)* testlist (NEWLINE)*
    ;

// ---------------------------------------------------------------------
// Function definitions and parameter lists
// ---------------------------------------------------------------------

// The action calls xlog, which is not defined in this file; the host
// environment must provide it.
funcdef
    :   'def' NAME parameters COLON suite
        {xlog("found method def "+$NAME.text);}
    ;

parameters
    :   LPAREN (varargslist)? RPAREN
    ;

varargslist
    :   defparameter (options {greedy=true;}:COMMA defparameter)*
        (COMMA
            (   STAR NAME (COMMA DOUBLESTAR NAME)?
            |   DOUBLESTAR NAME
            )?
        )?
    |   STAR NAME (COMMA DOUBLESTAR NAME)?
    |   DOUBLESTAR NAME
    ;

defparameter
    :   fpdef (ASSIGN test)?
    ;

fpdef
    :   NAME
    |   LPAREN fplist RPAREN
    ;

fplist
    :   fpdef (options {greedy=true;}:COMMA fpdef)* (COMMA)?
    ;

// ---------------------------------------------------------------------
// Statements
// ---------------------------------------------------------------------

stmt:   simple_stmt
    |   compound_stmt
    ;

simple_stmt
    :   small_stmt (options {greedy=true;}:SEMI small_stmt)* (SEMI)? NEWLINE
    ;

small_stmt
    :   expr_stmt
    |   print_stmt
    |   del_stmt
    |   pass_stmt
    |   flow_stmt
    |   import_stmt
    |   global_stmt
    |   exec_stmt
    |   assert_stmt
    ;

expr_stmt
    :   testlist
        (   augassign testlist
        |   (ASSIGN testlist)+
        )?
    ;

augassign
    :   PLUSEQUAL
    |   MINUSEQUAL
    |   STAREQUAL
    |   SLASHEQUAL
    |   PERCENTEQUAL
    |   AMPEREQUAL
    |   VBAREQUAL
    |   CIRCUMFLEXEQUAL
    |   LEFTSHIFTEQUAL
    |   RIGHTSHIFTEQUAL
    |   DOUBLESTAREQUAL
    |   DOUBLESLASHEQUAL
    ;

print_stmt
    :   'print'
        (   testlist
        |   RIGHTSHIFT testlist
        )?
    ;

del_stmt
    :   'del' exprlist
    ;

pass_stmt
    :   'pass'
    ;

flow_stmt
    :   break_stmt
    |   continue_stmt
    |   return_stmt
    |   raise_stmt
    |   yield_stmt
    ;

break_stmt
    :   'break'
    ;

continue_stmt
    :   'continue'
    ;

return_stmt
    :   'return' (testlist)?
    ;

yield_stmt
    :   'yield' testlist
    ;

raise_stmt
    :   'raise' (test (COMMA test (COMMA test)?)?)?
    ;

import_stmt
    :   'import' dotted_as_name (COMMA dotted_as_name)*
    |   'from' dotted_name 'import'
        (STAR | import_as_name (COMMA import_as_name)*)
    ;

// The optional (NAME NAME) tail presumably matches `as alias`, with `as`
// lexed as an ordinary NAME; the rule does not check that the middle
// NAME is literally `as`.
import_as_name
    :   NAME (NAME NAME)?
    ;

// Same (NAME NAME) alias form as import_as_name, applied to a dotted name.
dotted_as_name
    :   dotted_name (NAME NAME)?
    ;

dotted_name
    :   NAME (DOT NAME)*
    ;

global_stmt
    :   'global' NAME (COMMA NAME)*
    ;

exec_stmt
    :   'exec' expr ('in' test (COMMA test)?)?
    ;

assert_stmt
    :   'assert' test (COMMA test)?
    ;


compound_stmt
    :   if_stmt
    |   while_stmt
    |   for_stmt
    |   try_stmt
    |   funcdef
    |   classdef
    ;

if_stmt
    :   'if' test COLON suite ('elif' test COLON suite)* ('else' COLON suite)?
    ;

while_stmt
    :   'while' test COLON suite ('else' COLON suite)?
    ;

for_stmt
    :   'for' exprlist 'in' testlist COLON suite ('else' COLON suite)?
    ;

try_stmt
    :   'try' COLON suite
        (   (except_clause COLON suite)+ ('else' COLON suite)?
        |   'finally' COLON suite
        )
    ;

except_clause
    :   'except' (test (COMMA test)?)?
    ;

// Either a one-line body or an indented block delimited by the
// INDENT/DEDENT tokens declared in the tokens section.
suite
    :   simple_stmt
    |   NEWLINE INDENT (stmt)+ DEDENT
    ;

// ---------------------------------------------------------------------
// Expressions, lowest-binding operators first
// ---------------------------------------------------------------------

test:   and_test ('or' and_test)*
    |   lambdef
    ;

and_test
    :   not_test ('and' not_test)*
    ;

not_test
    :   'not' not_test
    |   comparison
    ;

comparison
    :   expr (comp_op expr)*
    ;

comp_op
    :   LESS
    |   GREATER
    |   EQUAL
    |   GREATEREQUAL
    |   LESSEQUAL
    |   ALT_NOTEQUAL
    |   NOTEQUAL
    |   'in'
    |   'not' 'in'
    |   'is'
    |   'is' 'not'
    ;

expr:   xor_expr (VBAR xor_expr)*
    ;

xor_expr
    :   and_expr (CIRCUMFLEX and_expr)*
    ;

and_expr
    :   shift_expr (AMPER shift_expr)*
    ;

shift_expr
    :   arith_expr ((LEFTSHIFT|RIGHTSHIFT) arith_expr)*
    ;

arith_expr
    :   term ((PLUS|MINUS) term)*
    ;

term:   factor ((STAR | SLASH | PERCENT | DOUBLESLASH ) factor)*
    ;

factor
    :   (PLUS|MINUS|TILDE) factor
    |   power
    ;

power
    :   atom (trailer)* (options {greedy=true;}:DOUBLESTAR factor)?
    ;

atom:   LPAREN (testlist)? RPAREN
    |   LBRACK (listmaker)? RBRACK
    |   LCURLY (dictmaker)? RCURLY
    |   BACKQUOTE testlist BACKQUOTE
    |   NAME
    |   INT
    |   LONGINT
    |   FLOAT
    |   COMPLEX
    |   (STRING)+
    ;

listmaker
    :   test ( list_for | (options {greedy=true;}:COMMA test)* ) (COMMA)?
    ;

lambdef
    :   'lambda' (varargslist)? COLON test
    ;

trailer
    :   LPAREN (arglist)? RPAREN
    |   LBRACK subscriptlist RBRACK
    |   DOT NAME
    ;

subscriptlist
    :   subscript (options {greedy=true;}:COMMA subscript)* (COMMA)?
    ;

subscript
    :   DOT DOT DOT
    |   test (COLON (test)? (sliceop)?)?
    |   COLON (test)? (sliceop)?
    ;

sliceop
    :   COLON (test)?
    ;

// exprlist, testlist and dictmaker use k=2 on their loops: after a COMMA
// the next token decides between another element and the optional
// trailing comma (see the lookahead note in the header comment).
exprlist
    :   expr (options {k=2;}:COMMA expr)* (COMMA)?
    ;

testlist
    :   test (options {k=2;}: COMMA test)* (COMMA)?
    ;

dictmaker
    :   test COLON test
        (options {k=2;}:COMMA test COLON test)* (COMMA)?
    ;

// The action calls xlog, which is not defined in this file; the host
// environment must provide it.
classdef
    :   'class' NAME (LPAREN testlist RPAREN)? COLON suite
        {xlog("found class def "+$NAME.text);}
    ;

arglist
    :   argument (COMMA argument)*
        (   COMMA
            (   STAR test (COMMA DOUBLESTAR test)?
            |   DOUBLESTAR test
            )?
        )?
    |   STAR test (COMMA DOUBLESTAR test)?
    |   DOUBLESTAR test
    ;

argument
    :   test (ASSIGN test)?
    ;

list_iter
    :   list_for
    |   list_if
    ;

list_for
    :   'for' exprlist 'in' testlist (list_iter)?
    ;

list_if
    :   'if' test (list_iter)?
    ;

// ---------------------------------------------------------------------
// Lexer: punctuation and operators
//
// The three opening brackets increment implicitLineJoiningLevel and the
// three closing brackets decrement it; NEWLINE and LEADING_WS hide
// themselves while the level is above zero.
// ---------------------------------------------------------------------

LPAREN  : '(' {this.implicitLineJoiningLevel++;} ;

RPAREN  : ')' {this.implicitLineJoiningLevel--;} ;

LBRACK  : '[' {this.implicitLineJoiningLevel++;} ;

RBRACK  : ']' {this.implicitLineJoiningLevel--;} ;

COLON   : ':' ;

COMMA   : ',' ;

SEMI    : ';' ;

PLUS    : '+' ;

MINUS   : '-' ;

STAR    : '*' ;

SLASH   : '/' ;

VBAR    : '|' ;

AMPER   : '&' ;

LESS    : '<' ;

GREATER : '>' ;

ASSIGN  : '=' ;

PERCENT : '%' ;

BACKQUOTE   : '`' ;

LCURLY  : '{' {this.implicitLineJoiningLevel++;} ;

RCURLY  : '}' {this.implicitLineJoiningLevel--;} ;

CIRCUMFLEX  : '^' ;

TILDE   : '~' ;

EQUAL   : '==' ;

NOTEQUAL    : '!=' ;

ALT_NOTEQUAL: '<>' ;

LESSEQUAL   : '<=' ;

LEFTSHIFT   : '<<' ;

GREATEREQUAL    : '>=' ;

RIGHTSHIFT  : '>>' ;

PLUSEQUAL   : '+=' ;

MINUSEQUAL  : '-=' ;

DOUBLESTAR  : '**' ;

STAREQUAL   : '*=' ;

DOUBLESLASH : '//' ;

SLASHEQUAL  : '/=' ;

VBAREQUAL   : '|=' ;

PERCENTEQUAL    : '%=' ;

AMPEREQUAL  : '&=' ;

CIRCUMFLEXEQUAL : '^=' ;

LEFTSHIFTEQUAL  : '<<=' ;

RIGHTSHIFTEQUAL : '>>=' ;

DOUBLESTAREQUAL : '**=' ;

DOUBLESLASHEQUAL    : '//=' ;

DOT : '.' ;

// ---------------------------------------------------------------------
// Lexer: numeric literals, names and strings
// ---------------------------------------------------------------------

FLOAT
    :   '.' DIGITS (Exponent)?
    |   DIGITS ('.' (DIGITS (Exponent)?)? | Exponent)
    ;

LONGINT
    :   INT ('l'|'L')
    ;

fragment
Exponent
    :   ('e' | 'E') ( '+' | '-' )? DIGITS
    ;

INT :   // Hex
        '0' ('x' | 'X') ( '0' .. '9' | 'a' .. 'f' | 'A' .. 'F' )+
        ('l' | 'L')?
    |   // Octal
        '0' DIGITS*
    |   '1'..'9' DIGITS*
    ;

COMPLEX
    :   INT ('j'|'J')
    |   FLOAT ('j'|'J')
    ;

fragment
DIGITS : ( '0' .. '9' )+ ;

NAME:   ( 'a' .. 'z' | 'A' .. 'Z' | '_')
        ( 'a' .. 'z' | 'A' .. 'Z' | '_' | '0' .. '9' )*
    ;

/** Match various string types.  Note that greedy=false implies '''
 *  should make us exit loop not continue.
 *
 *  The optional prefix accepts r, u and ur in any letter case
 *  (r, u, ur, R, U, UR, Ur, uR), matching Python's stringprefix.
 */
STRING
    :   ('r'|'u'|'ur'|'R'|'U'|'UR'|'Ur'|'uR')?
        (   '\'\'\'' (options {greedy=false;}:.)* '\'\'\''
        |   '"""' (options {greedy=false;}:.)* '"""'
        |   '"' (ESC|~('\\'|'\n'|'"'))* '"'
        |   '\'' (ESC|~('\\'|'\n'|'\''))* '\''
        )
    ;

fragment
ESC
    :   '\\' .
    ;

// ---------------------------------------------------------------------
// Lexer: line structure (continuations, newlines, whitespace, comments)
// ---------------------------------------------------------------------

/** Consume a newline and any whitespace at start of next line */
CONTINUED_LINE
    :   '\\' ('\r')? '\n' (' '|'\t')* { $channel=HIDDEN; }
    ;

/** Treat a sequence of blank lines as a single blank line.  If
 *  nested within a (..), {..}, or [..], then ignore newlines.
 *  If the first newline starts in column one, they are to be ignored.
 */
NEWLINE
    :   (('\r')? '\n' )+
        {if ( this.startPos==0 || this.implicitLineJoiningLevel>0 )
            $channel=HIDDEN;
        }
    ;

// Whitespace that does not start a line (startPos > 0) is always hidden.
WS  :   {this.startPos>0}?=> (' '|'\t')+ {$channel=HIDDEN;}
    ;

/** Grab everything before a real symbol.  Then if newline, kill it
 *  as this is a blank line.  If whitespace followed by comment, kill it
 *  as it's a comment on a line by itself.
 *
 *  Ignore leading whitespace when nested in [..], (..), {..}.
 *
 *  Otherwise the width is counted in `spaces`: a blank adds one and a
 *  tab advances the count to the next multiple of 8.  The emitted
 *  LEADING_WS token's text is that many blanks.
 */
LEADING_WS
@init {
    var spaces = 0;
}
    :   {this.startPos==0}?=>
        (   {this.implicitLineJoiningLevel>0}? ( ' ' | '\t' )+ {$channel=HIDDEN;}
        |   (   ' '  { spaces++; }
            |   '\t' { spaces += 8; spaces -= (spaces \% 8); }
            )+
            {
            // make a string of n spaces where n is column number - 1
            var indentation = new Array(spaces);
            for (var i=0; i<spaces; i++) {
                indentation[i] = ' ';
            }
            var s = indentation.join("");
            this.emit(new org.antlr.runtime.CommonToken(this.LEADING_WS,s));
            }
            // kill trailing newline if present and then ignore
            ( ('\r')? '\n' {if (this.state.token!=null) this.state.token.setChannel(HIDDEN); else $channel=HIDDEN;})*
        )
    ;

/** Comments not on line by themselves are turned into newlines.

    b = a # end of line comment

    or

    a = [1, # weird
         2]

    This rule is invoked directly by nextToken when the comment is in
    first column or when comment is on end of nonwhitespace line.

    Only match \n here if we didn't start on left edge; let NEWLINE return that.
    Kill the newlines if we live on a line by ourselves

    Consume any leading whitespace if it starts on left edge.
 */
COMMENT
@init {
    $channel=HIDDEN;
}
    :   {this.startPos==0}?=> (' '|'\t')* '#' (~'\n')* '\n'+
    |   {this.startPos>0}?=> '#' (~'\n')* // let NEWLINE handle \n unless char pos==0 for '#'
    ;