1 /* 2 * Copyright 2016 Google Inc. All Rights Reserved. 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.google.turbine.parse; 18 19 import static com.google.turbine.parse.UnicodeEscapePreprocessor.ASCII_SUB; 20 21 import com.google.common.base.Verify; 22 import com.google.turbine.diag.SourceFile; 23 import com.google.turbine.diag.TurbineError; 24 import com.google.turbine.diag.TurbineError.ErrorKind; 25 26 /** A {@link Lexer} that streams input from a {@link UnicodeEscapePreprocessor}. */ 27 public class StreamLexer implements Lexer { 28 29 private final UnicodeEscapePreprocessor reader; 30 31 /** The current input character. */ 32 private char ch; 33 34 /** The start position of the current token. */ 35 private int position; 36 37 /** The start position of the current numeric literal or identifier token. */ 38 private int readFrom; 39 40 /** The value of the current string or character literal token. */ 41 private String value = null; 42 StreamLexer(UnicodeEscapePreprocessor reader)43 public StreamLexer(UnicodeEscapePreprocessor reader) { 44 this.reader = reader; 45 eat(); 46 } 47 48 /** Records the value of a literal. */ saveValue(String value)49 private void saveValue(String value) { 50 this.value = value; 51 } 52 53 /** Records the start position of a literal. */ readFrom()54 private void readFrom() { 55 value = null; 56 readFrom = reader.position(); 57 } 58 59 /** Consumes an input character. */ eat()60 private void eat() { 61 ch = reader.next(); 62 } 63 64 @Override stringValue()65 public String stringValue() { 66 if (value != null) { 67 return value; 68 } 69 return reader.readString(readFrom, reader.position()); 70 } 71 72 @Override position()73 public int position() { 74 return position; 75 } 76 77 @Override source()78 public SourceFile source() { 79 return reader.source(); 80 } 81 82 @Override next()83 public Token next() { 84 OUTER: 85 while (true) { 86 position = reader.position(); 87 switch (ch) { 88 case '\r': 89 case '\n': 90 case ' ': 91 case '\t': 92 case '\f': 93 eat(); 94 continue OUTER; 95 96 case '/': 97 { 98 eat(); 99 switch (ch) { 100 case '/': 101 while (true) { 102 eat(); 103 switch (ch) { 104 case '\n': 105 case '\r': 106 eat(); 107 continue OUTER; 108 case ASCII_SUB: 109 if (reader.done()) { 110 return Token.EOF; 111 } 112 eat(); 113 break; 114 } 115 } 116 case '*': 117 boolean sawStar = false; 118 while (true) { 119 eat(); 120 switch (ch) { 121 case '*': 122 sawStar = true; 123 break; 124 case '/': 125 if (sawStar) { 126 eat(); 127 continue OUTER; 128 } 129 sawStar = false; 130 break; 131 case ASCII_SUB: 132 if (reader.done()) { 133 return Token.EOF; 134 } 135 eat(); 136 break; 137 default: 138 sawStar = false; 139 break; 140 } 141 } 142 default: 143 if (ch == '=') { 144 eat(); 145 return Token.DIVEQ; 146 } 147 return Token.DIV; 148 } 149 } 150 151 case 'a': 152 case 'b': 153 case 'c': 154 case 'd': 155 case 'e': 156 case 'f': 157 case 'g': 158 case 'h': 159 case 'i': 160 case 'j': 161 case 'k': 162 case 'l': 163 case 'm': 164 case 'n': 165 case 'o': 166 case 'p': 167 case 'q': 168 case 'r': 169 case 's': 170 case 't': 171 case 'u': 172 case 'v': 173 case 'w': 174 case 'x': 175 case 'y': 176 case 'z': 177 case 'A': 178 case 'B': 179 case 'C': 180 case 'D': 181 case 'E': 182 case 'F': 183 case 'G': 184 case 'H': 185 case 'I': 186 case 'J': 187 case 'K': 188 case 'L': 189 case 'M': 190 case 'N': 191 case 'O': 192 case 'P': 193 case 'Q': 194 case 'R': 195 case 'S': 196 case 'T': 197 case 'U': 198 case 'V': 199 case 'W': 200 case 'X': 201 case 'Y': 202 case 'Z': 203 case '_': 204 case '$': 205 return identifier(); 206 207 case ASCII_SUB: 208 Verify.verify(reader.done()); 209 return Token.EOF; 210 211 case '-': 212 case '=': 213 case '>': 214 case '<': 215 case '!': 216 case '~': 217 case '+': 218 case '?': 219 case ':': 220 case '*': 221 case '&': 222 case '|': 223 case '^': 224 case '%': 225 return operator(); 226 case '(': 227 eat(); 228 return Token.LPAREN; 229 case ')': 230 eat(); 231 return Token.RPAREN; 232 case '{': 233 eat(); 234 return Token.LBRACE; 235 case '}': 236 eat(); 237 return Token.RBRACE; 238 case '[': 239 eat(); 240 return Token.LBRACK; 241 case ']': 242 eat(); 243 return Token.RBRACK; 244 case ';': 245 eat(); 246 return Token.SEMI; 247 case ',': 248 eat(); 249 return Token.COMMA; 250 case '@': 251 eat(); 252 return Token.AT; // what about frac, etc.? 253 254 case '0': 255 { 256 readFrom(); 257 eat(); 258 switch (ch) { 259 case 'x': 260 case 'X': 261 eat(); 262 return hexLiteral(); 263 case 'b': 264 case 'B': 265 eat(); 266 return boolLiteral(); 267 case '0': 268 case '1': 269 case '2': 270 case '3': 271 case '4': 272 case '5': 273 case '6': 274 case '7': 275 case '_': 276 return octalLiteral(); 277 case '.': 278 eat(); 279 return floatLiteral(); 280 case 'f': 281 case 'F': 282 eat(); 283 return Token.FLOAT_LITERAL; 284 case 'd': 285 case 'D': 286 eat(); 287 return Token.DOUBLE_LITERAL; 288 case 'l': 289 case 'L': 290 eat(); 291 return Token.LONG_LITERAL; 292 default: 293 return Token.INT_LITERAL; 294 } 295 } 296 case '1': 297 case '2': 298 case '3': 299 case '4': 300 case '5': 301 case '6': 302 case '7': 303 case '8': 304 case '9': 305 readFrom(); 306 return decimalLiteral(); 307 case '.': 308 { 309 readFrom(); 310 eat(); 311 switch (ch) { 312 case '.': 313 { 314 eat(); 315 if (ch == '.') { 316 eat(); 317 return Token.ELLIPSIS; 318 } else { 319 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 320 } 321 } 322 case '0': 323 case '1': 324 case '2': 325 case '3': 326 case '4': 327 case '5': 328 case '6': 329 case '7': 330 case '8': 331 case '9': 332 return floatLiteral(); 333 default: 334 return Token.DOT; 335 } 336 } 337 338 case '\'': 339 { 340 eat(); 341 char value; 342 switch (ch) { 343 case '\\': 344 eat(); 345 value = escape(); 346 break; 347 case '\'': 348 throw error(ErrorKind.EMPTY_CHARACTER_LITERAL); 349 default: 350 value = ch; 351 eat(); 352 } 353 if (ch == '\'') { 354 saveValue(String.valueOf(value)); 355 eat(); 356 return Token.CHAR_LITERAL; 357 } 358 throw error(ErrorKind.UNTERMINATED_CHARACTER_LITERAL); 359 } 360 361 case '"': 362 { 363 eat(); 364 readFrom(); 365 StringBuilder sb = new StringBuilder(); 366 STRING: 367 while (true) { 368 switch (ch) { 369 case '\\': 370 eat(); 371 sb.append(escape()); 372 continue STRING; 373 case '"': 374 saveValue(sb.toString()); 375 eat(); 376 return Token.STRING_LITERAL; 377 case '\n': 378 throw error(ErrorKind.UNTERMINATED_STRING); 379 case ASCII_SUB: 380 if (reader.done()) { 381 return Token.EOF; 382 } 383 // falls through 384 default: 385 sb.append(ch); 386 eat(); 387 continue STRING; 388 } 389 } 390 } 391 default: 392 if (Character.isJavaIdentifierStart(ch)) { 393 // TODO(cushon): the style guide disallows non-ascii identifiers 394 return identifier(); 395 } 396 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 397 } 398 } 399 } 400 escape()401 private char escape() { 402 boolean zeroToThree = false; 403 switch (ch) { 404 case 'b': 405 eat(); 406 return '\b'; 407 case 't': 408 eat(); 409 return '\t'; 410 case 'n': 411 eat(); 412 return '\n'; 413 case 'f': 414 eat(); 415 return '\f'; 416 case 'r': 417 eat(); 418 return '\r'; 419 case '"': 420 eat(); 421 return '\"'; 422 case '\'': 423 eat(); 424 return '\''; 425 case '\\': 426 eat(); 427 return '\\'; 428 case '0': 429 case '1': 430 case '2': 431 case '3': 432 zeroToThree = true; 433 // falls through 434 case '4': 435 case '5': 436 case '6': 437 case '7': 438 { 439 char value = (char) (ch - '0'); 440 eat(); 441 switch (ch) { 442 case '0': 443 case '1': 444 case '2': 445 case '3': 446 case '4': 447 case '5': 448 case '6': 449 case '7': 450 { 451 value = (char) ((value << 3) | (ch - '0')); 452 eat(); 453 if (zeroToThree) { 454 switch (ch) { 455 case '0': 456 case '1': 457 case '2': 458 case '3': 459 case '4': 460 case '5': 461 case '6': 462 case '7': 463 value = (char) ((value << 3) | (ch - '0')); 464 eat(); 465 return value; 466 default: 467 return value; 468 } 469 } 470 } 471 // fall through 472 default: 473 return value; 474 } 475 } 476 default: 477 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 478 } 479 } 480 decimalLiteral()481 private Token decimalLiteral() { 482 readDigits(); 483 switch (ch) { 484 case 'e': 485 case 'E': 486 return floatLiteral(); 487 case '.': 488 eat(); 489 return floatLiteral(); 490 case 'f': 491 case 'F': 492 eat(); 493 return Token.FLOAT_LITERAL; 494 case 'd': 495 case 'D': 496 eat(); 497 return Token.DOUBLE_LITERAL; 498 case 'l': 499 case 'L': 500 eat(); 501 return Token.LONG_LITERAL; 502 default: 503 return Token.INT_LITERAL; 504 } 505 } 506 hexFloatLiteral()507 private Token hexFloatLiteral() { 508 readHexDigits(); 509 switch (ch) { 510 case 'p': 511 case 'P': 512 eat(); 513 signedInteger(); 514 break; 515 } 516 return floatTypeSuffix(); 517 } 518 floatLiteral()519 private Token floatLiteral() { 520 if ('0' <= ch && ch <= '9') { 521 readDigits(); 522 } 523 switch (ch) { 524 case 'e': 525 case 'E': 526 eat(); 527 signedInteger(); 528 break; 529 } 530 return floatTypeSuffix(); 531 } 532 floatTypeSuffix()533 private Token floatTypeSuffix() { 534 switch (ch) { 535 case 'd': 536 case 'D': 537 eat(); 538 return Token.DOUBLE_LITERAL; 539 case 'f': 540 case 'F': 541 eat(); 542 return Token.FLOAT_LITERAL; 543 default: 544 return Token.DOUBLE_LITERAL; 545 } 546 } 547 signedInteger()548 private void signedInteger() { 549 switch (ch) { 550 case '-': 551 case '+': 552 eat(); 553 break; 554 default: 555 break; 556 } 557 readDigits(); 558 } 559 readHexDigits()560 private void readHexDigits() { 561 switch (ch) { 562 case 'A': 563 case 'B': 564 case 'C': 565 case 'D': 566 case 'E': 567 case 'F': 568 case 'a': 569 case 'b': 570 case 'c': 571 case 'd': 572 case 'e': 573 case 'f': 574 case '0': 575 case '1': 576 case '2': 577 case '3': 578 case '4': 579 case '5': 580 case '6': 581 case '7': 582 case '8': 583 case '9': 584 eat(); 585 break; 586 default: 587 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 588 } 589 OUTER: 590 while (true) { 591 switch (ch) { 592 case '_': 593 { 594 do { 595 eat(); 596 } while (ch == '_'); 597 switch (ch) { 598 case 'A': 599 case 'B': 600 case 'C': 601 case 'D': 602 case 'E': 603 case 'F': 604 case 'a': 605 case 'b': 606 case 'c': 607 case 'd': 608 case 'e': 609 case 'f': 610 case '0': 611 case '1': 612 case '2': 613 case '3': 614 case '4': 615 case '5': 616 case '6': 617 case '7': 618 case '8': 619 case '9': 620 continue OUTER; 621 default: 622 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 623 } 624 } 625 case 'A': 626 case 'B': 627 case 'C': 628 case 'D': 629 case 'E': 630 case 'F': 631 case 'a': 632 case 'b': 633 case 'c': 634 case 'd': 635 case 'e': 636 case 'f': 637 case '0': 638 case '1': 639 case '2': 640 case '3': 641 case '4': 642 case '5': 643 case '6': 644 case '7': 645 case '8': 646 case '9': 647 eat(); 648 break; 649 default: 650 return; 651 } 652 } 653 } 654 readDigits()655 private void readDigits() { 656 if ('0' <= ch && ch <= '9') { 657 eat(); 658 } else { 659 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 660 } 661 OUTER: 662 while (true) { 663 switch (ch) { 664 case '_': 665 do { 666 eat(); 667 } while (ch == '_'); 668 if ('0' <= ch && ch <= '9') { 669 continue OUTER; 670 } else { 671 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 672 } 673 case '0': 674 case '1': 675 case '2': 676 case '3': 677 case '4': 678 case '5': 679 case '6': 680 case '7': 681 case '8': 682 case '9': 683 eat(); 684 continue OUTER; 685 default: 686 return; 687 } 688 } 689 } 690 boolLiteral()691 private Token boolLiteral() { 692 readBinaryDigits(); 693 switch (ch) { 694 case 'l': 695 case 'L': 696 eat(); 697 return Token.LONG_LITERAL; 698 default: 699 return Token.INT_LITERAL; 700 } 701 } 702 readBinaryDigits()703 private void readBinaryDigits() { 704 switch (ch) { 705 case '0': 706 case '1': 707 eat(); 708 break; 709 default: 710 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 711 } 712 OUTER: 713 while (true) { 714 switch (ch) { 715 case '_': 716 do { 717 eat(); 718 } while (ch == '_'); 719 switch (ch) { 720 case '0': 721 case '1': 722 continue OUTER; 723 default: 724 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 725 } 726 case '0': 727 case '1': 728 eat(); 729 continue OUTER; 730 default: 731 return; 732 } 733 } 734 } 735 octalLiteral()736 private Token octalLiteral() { 737 readOctalDigits(); 738 switch (ch) { 739 case 'l': 740 case 'L': 741 eat(); 742 return Token.LONG_LITERAL; 743 default: 744 return Token.INT_LITERAL; 745 } 746 } 747 readOctalDigits()748 private void readOctalDigits() { 749 switch (ch) { 750 case '0': 751 case '1': 752 case '2': 753 case '3': 754 case '4': 755 case '5': 756 case '6': 757 case '7': 758 case '_': 759 eat(); 760 break; 761 default: 762 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 763 } 764 OUTER: 765 while (true) { 766 switch (ch) { 767 case '_': 768 do { 769 eat(); 770 } while (ch == '_'); 771 switch (ch) { 772 case '0': 773 case '1': 774 case '2': 775 case '3': 776 case '4': 777 case '5': 778 case '6': 779 case '7': 780 continue OUTER; 781 default: 782 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 783 } 784 case '0': 785 case '1': 786 case '2': 787 case '3': 788 case '4': 789 case '5': 790 case '6': 791 case '7': 792 eat(); 793 continue OUTER; 794 default: 795 return; 796 } 797 } 798 } 799 hexLiteral()800 private Token hexLiteral() { 801 readHexDigits(); 802 switch (ch) { 803 case '.': 804 eat(); 805 return hexFloatLiteral(); 806 case 'l': 807 case 'L': 808 eat(); 809 return Token.LONG_LITERAL; 810 case 'p': 811 case 'P': 812 eat(); 813 signedInteger(); 814 return floatTypeSuffix(); 815 default: 816 return Token.INT_LITERAL; 817 } 818 } 819 operator()820 private Token operator() { 821 switch (ch) { 822 case '=': 823 eat(); 824 if (ch == '=') { 825 eat(); 826 return Token.EQ; 827 } else { 828 return Token.ASSIGN; 829 } 830 case '>': 831 eat(); 832 switch (ch) { 833 case '=': 834 eat(); 835 return Token.GTE; 836 case '>': 837 eat(); 838 switch (ch) { 839 case '>': 840 eat(); 841 if (ch == '=') { 842 eat(); 843 return Token.GTGTGTE; 844 } else { 845 return Token.GTGTGT; 846 } 847 case '=': 848 eat(); 849 return Token.GTGTE; 850 default: 851 return Token.GTGT; 852 } 853 default: 854 return Token.GT; 855 } 856 case '<': 857 eat(); 858 switch (ch) { 859 case '=': 860 eat(); 861 return Token.LTE; 862 case '<': 863 eat(); 864 if (ch == '=') { 865 eat(); 866 return Token.LTLTE; 867 } else { 868 return Token.LTLT; 869 } 870 default: 871 return Token.LT; 872 } 873 case '!': 874 eat(); 875 if (ch == '=') { 876 eat(); 877 return Token.NOTEQ; 878 } else { 879 return Token.NOT; 880 } 881 case '~': 882 eat(); 883 return Token.TILDE; 884 case '?': 885 eat(); 886 return Token.COND; 887 case ':': 888 eat(); 889 if (ch == ':') { 890 eat(); 891 return Token.COLONCOLON; 892 } else { 893 return Token.COLON; 894 } 895 case '-': 896 eat(); 897 switch (ch) { 898 case '>': 899 eat(); 900 return Token.ARROW; 901 case '-': 902 eat(); 903 return Token.DECR; 904 case '=': 905 eat(); 906 return Token.MINUSEQ; 907 default: 908 return Token.MINUS; 909 } 910 case '&': 911 eat(); 912 switch (ch) { 913 case '&': 914 eat(); 915 return Token.ANDAND; 916 case '=': 917 eat(); 918 return Token.ANDEQ; 919 default: 920 return Token.AND; 921 } 922 case '|': 923 eat(); 924 switch (ch) { 925 case '=': 926 eat(); 927 return Token.OREQ; 928 case '|': 929 eat(); 930 return Token.OROR; 931 default: 932 return Token.OR; 933 } 934 case '+': 935 eat(); 936 switch (ch) { 937 case '+': 938 eat(); 939 return Token.INCR; 940 case '=': 941 eat(); 942 return Token.PLUSEQ; 943 default: 944 return Token.PLUS; 945 } 946 case '*': 947 eat(); 948 if (ch == '=') { 949 eat(); 950 return Token.MULTEQ; 951 } else { 952 return Token.MULT; 953 } 954 case '/': 955 // handled with comments 956 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 957 958 case '%': 959 eat(); 960 if (ch == '=') { 961 eat(); 962 return Token.MODEQ; 963 } else { 964 return Token.MOD; 965 } 966 case '^': 967 eat(); 968 if (ch == '=') { 969 eat(); 970 return Token.XOREQ; 971 } else { 972 return Token.XOR; 973 } 974 default: 975 throw error(ErrorKind.UNEXPECTED_INPUT, ch); 976 } 977 } 978 identifier()979 private Token identifier() { 980 readFrom(); 981 eat(); 982 // TODO(cushon): the style guide disallows non-ascii identifiers 983 while (Character.isJavaIdentifierPart(ch)) { 984 if (ch == ASCII_SUB && reader.done()) { 985 break; 986 } 987 eat(); 988 } 989 return makeIdent(stringValue()); 990 } 991 makeIdent(String s)992 private Token makeIdent(String s) { 993 switch (s) { 994 case "abstract": 995 return Token.ABSTRACT; 996 case "assert": 997 return Token.ASSERT; 998 case "boolean": 999 return Token.BOOLEAN; 1000 case "break": 1001 return Token.BREAK; 1002 case "byte": 1003 return Token.BYTE; 1004 case "case": 1005 return Token.CASE; 1006 case "catch": 1007 return Token.CATCH; 1008 case "char": 1009 return Token.CHAR; 1010 case "class": 1011 return Token.CLASS; 1012 case "const": 1013 return Token.CONST; 1014 case "continue": 1015 return Token.CONTINUE; 1016 case "default": 1017 return Token.DEFAULT; 1018 case "do": 1019 return Token.DO; 1020 case "double": 1021 return Token.DOUBLE; 1022 case "else": 1023 return Token.ELSE; 1024 case "enum": 1025 return Token.ENUM; 1026 case "extends": 1027 return Token.EXTENDS; 1028 case "final": 1029 return Token.FINAL; 1030 case "finally": 1031 return Token.FINALLY; 1032 case "float": 1033 return Token.FLOAT; 1034 case "for": 1035 return Token.FOR; 1036 case "goto": 1037 return Token.GOTO; 1038 case "if": 1039 return Token.IF; 1040 case "implements": 1041 return Token.IMPLEMENTS; 1042 case "import": 1043 return Token.IMPORT; 1044 case "instanceof": 1045 return Token.INSTANCEOF; 1046 case "int": 1047 return Token.INT; 1048 case "interface": 1049 return Token.INTERFACE; 1050 case "long": 1051 return Token.LONG; 1052 case "native": 1053 return Token.NATIVE; 1054 case "new": 1055 return Token.NEW; 1056 case "package": 1057 return Token.PACKAGE; 1058 case "private": 1059 return Token.PRIVATE; 1060 case "protected": 1061 return Token.PROTECTED; 1062 case "public": 1063 return Token.PUBLIC; 1064 case "return": 1065 return Token.RETURN; 1066 case "short": 1067 return Token.SHORT; 1068 case "static": 1069 return Token.STATIC; 1070 case "strictfp": 1071 return Token.STRICTFP; 1072 case "super": 1073 return Token.SUPER; 1074 case "switch": 1075 return Token.SWITCH; 1076 case "synchronized": 1077 return Token.SYNCHRONIZED; 1078 case "this": 1079 return Token.THIS; 1080 case "throw": 1081 return Token.THROW; 1082 case "throws": 1083 return Token.THROWS; 1084 case "transient": 1085 return Token.TRANSIENT; 1086 case "try": 1087 return Token.TRY; 1088 case "void": 1089 return Token.VOID; 1090 case "volatile": 1091 return Token.VOLATILE; 1092 case "while": 1093 return Token.WHILE; 1094 case "true": 1095 return Token.TRUE; 1096 case "false": 1097 return Token.FALSE; 1098 case "null": 1099 return Token.NULL; 1100 default: 1101 return Token.IDENT; 1102 } 1103 } 1104 error(ErrorKind kind, Object... args)1105 private TurbineError error(ErrorKind kind, Object... args) { 1106 return TurbineError.format(reader.source(), reader.position(), kind, args); 1107 } 1108 } 1109