1// Copyright 2017 The Bazel Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package syntax 6 7// A lexical scanner for Starlark. 8 9import ( 10 "fmt" 11 "io" 12 "io/ioutil" 13 "log" 14 "math/big" 15 "os" 16 "strconv" 17 "strings" 18 "unicode" 19 "unicode/utf8" 20) 21 22// A Token represents a Starlark lexical token. 23type Token int8 24 25const ( 26 ILLEGAL Token = iota 27 EOF 28 29 NEWLINE 30 INDENT 31 OUTDENT 32 33 // Tokens with values 34 IDENT // x 35 INT // 123 36 FLOAT // 1.23e45 37 STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo" 38 BYTES // b"foo", etc 39 40 // Punctuation 41 PLUS // + 42 MINUS // - 43 STAR // * 44 SLASH // / 45 SLASHSLASH // // 46 PERCENT // % 47 AMP // & 48 PIPE // | 49 CIRCUMFLEX // ^ 50 LTLT // << 51 GTGT // >> 52 TILDE // ~ 53 DOT // . 54 COMMA // , 55 EQ // = 56 SEMI // ; 57 COLON // : 58 LPAREN // ( 59 RPAREN // ) 60 LBRACK // [ 61 RBRACK // ] 62 LBRACE // { 63 RBRACE // } 64 LT // < 65 GT // > 66 GE // >= 67 LE // <= 68 EQL // == 69 NEQ // != 70 PLUS_EQ // += (keep order consistent with PLUS..GTGT) 71 MINUS_EQ // -= 72 STAR_EQ // *= 73 SLASH_EQ // /= 74 SLASHSLASH_EQ // //= 75 PERCENT_EQ // %= 76 AMP_EQ // &= 77 PIPE_EQ // |= 78 CIRCUMFLEX_EQ // ^= 79 LTLT_EQ // <<= 80 GTGT_EQ // >>= 81 STARSTAR // ** 82 83 // Keywords 84 AND 85 BREAK 86 CONTINUE 87 DEF 88 ELIF 89 ELSE 90 FOR 91 IF 92 IN 93 LAMBDA 94 LOAD 95 NOT 96 NOT_IN // synthesized by parser from NOT IN 97 OR 98 PASS 99 RETURN 100 WHILE 101 102 maxToken 103) 104 105func (tok Token) String() string { return tokenNames[tok] } 106 107// GoString is like String but quotes punctuation tokens. 108// Use Sprintf("%#v", tok) when constructing error messages. 109func (tok Token) GoString() string { 110 if tok >= PLUS && tok <= STARSTAR { 111 return "'" + tokenNames[tok] + "'" 112 } 113 return tokenNames[tok] 114} 115 116var tokenNames = [...]string{ 117 ILLEGAL: "illegal token", 118 EOF: "end of file", 119 NEWLINE: "newline", 120 INDENT: "indent", 121 OUTDENT: "outdent", 122 IDENT: "identifier", 123 INT: "int literal", 124 FLOAT: "float literal", 125 STRING: "string literal", 126 PLUS: "+", 127 MINUS: "-", 128 STAR: "*", 129 SLASH: "/", 130 SLASHSLASH: "//", 131 PERCENT: "%", 132 AMP: "&", 133 PIPE: "|", 134 CIRCUMFLEX: "^", 135 LTLT: "<<", 136 GTGT: ">>", 137 TILDE: "~", 138 DOT: ".", 139 COMMA: ",", 140 EQ: "=", 141 SEMI: ";", 142 COLON: ":", 143 LPAREN: "(", 144 RPAREN: ")", 145 LBRACK: "[", 146 RBRACK: "]", 147 LBRACE: "{", 148 RBRACE: "}", 149 LT: "<", 150 GT: ">", 151 GE: ">=", 152 LE: "<=", 153 EQL: "==", 154 NEQ: "!=", 155 PLUS_EQ: "+=", 156 MINUS_EQ: "-=", 157 STAR_EQ: "*=", 158 SLASH_EQ: "/=", 159 SLASHSLASH_EQ: "//=", 160 PERCENT_EQ: "%=", 161 AMP_EQ: "&=", 162 PIPE_EQ: "|=", 163 CIRCUMFLEX_EQ: "^=", 164 LTLT_EQ: "<<=", 165 GTGT_EQ: ">>=", 166 STARSTAR: "**", 167 AND: "and", 168 BREAK: "break", 169 CONTINUE: "continue", 170 DEF: "def", 171 ELIF: "elif", 172 ELSE: "else", 173 FOR: "for", 174 IF: "if", 175 IN: "in", 176 LAMBDA: "lambda", 177 LOAD: "load", 178 NOT: "not", 179 NOT_IN: "not in", 180 OR: "or", 181 PASS: "pass", 182 RETURN: "return", 183 WHILE: "while", 184} 185 186// A FilePortion describes the content of a portion of a file. 187// Callers may provide a FilePortion for the src argument of Parse 188// when the desired initial line and column numbers are not (1, 1), 189// such as when an expression is parsed from within larger file. 190type FilePortion struct { 191 Content []byte 192 FirstLine, FirstCol int32 193} 194 195// A Position describes the location of a rune of input. 196type Position struct { 197 file *string // filename (indirect for compactness) 198 Line int32 // 1-based line number; 0 if line unknown 199 Col int32 // 1-based column (rune) number; 0 if column unknown 200} 201 202// IsValid reports whether the position is valid. 203func (p Position) IsValid() bool { return p.file != nil } 204 205// Filename returns the name of the file containing this position. 206func (p Position) Filename() string { 207 if p.file != nil { 208 return *p.file 209 } 210 return "<invalid>" 211} 212 213// MakePosition returns position with the specified components. 214func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} } 215 216// add returns the position at the end of s, assuming it starts at p. 217func (p Position) add(s string) Position { 218 if n := strings.Count(s, "\n"); n > 0 { 219 p.Line += int32(n) 220 s = s[strings.LastIndex(s, "\n")+1:] 221 p.Col = 1 222 } 223 p.Col += int32(utf8.RuneCountInString(s)) 224 return p 225} 226 227func (p Position) String() string { 228 file := p.Filename() 229 if p.Line > 0 { 230 if p.Col > 0 { 231 return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col) 232 } 233 return fmt.Sprintf("%s:%d", file, p.Line) 234 } 235 return file 236} 237 238func (p Position) isBefore(q Position) bool { 239 if p.Line != q.Line { 240 return p.Line < q.Line 241 } 242 return p.Col < q.Col 243} 244 245// An scanner represents a single input file being parsed. 246type scanner struct { 247 rest []byte // rest of input (in REPL, a line of input) 248 token []byte // token being scanned 249 pos Position // current input position 250 depth int // nesting of [ ] { } ( ) 251 indentstk []int // stack of indentation levels 252 dents int // number of saved INDENT (>0) or OUTDENT (<0) tokens to return 253 lineStart bool // after NEWLINE; convert spaces to indentation tokens 254 keepComments bool // accumulate comments in slice 255 lineComments []Comment // list of full line comments (if keepComments) 256 suffixComments []Comment // list of suffix comments (if keepComments) 257 258 readline func() ([]byte, error) // read next line of input (REPL only) 259} 260 261func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) { 262 var firstLine, firstCol int32 = 1, 1 263 if portion, ok := src.(FilePortion); ok { 264 firstLine, firstCol = portion.FirstLine, portion.FirstCol 265 } 266 sc := &scanner{ 267 pos: MakePosition(&filename, firstLine, firstCol), 268 indentstk: make([]int, 1, 10), // []int{0} + spare capacity 269 lineStart: true, 270 keepComments: keepComments, 271 } 272 sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only 273 if sc.readline == nil { 274 data, err := readSource(filename, src) 275 if err != nil { 276 return nil, err 277 } 278 sc.rest = data 279 } 280 return sc, nil 281} 282 283func readSource(filename string, src interface{}) ([]byte, error) { 284 switch src := src.(type) { 285 case string: 286 return []byte(src), nil 287 case []byte: 288 return src, nil 289 case io.Reader: 290 data, err := ioutil.ReadAll(src) 291 if err != nil { 292 err = &os.PathError{Op: "read", Path: filename, Err: err} 293 return nil, err 294 } 295 return data, nil 296 case FilePortion: 297 return src.Content, nil 298 case nil: 299 return ioutil.ReadFile(filename) 300 default: 301 return nil, fmt.Errorf("invalid source: %T", src) 302 } 303} 304 305// An Error describes the nature and position of a scanner or parser error. 306type Error struct { 307 Pos Position 308 Msg string 309} 310 311func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg } 312 313// errorf is called to report an error. 314// errorf does not return: it panics. 315func (sc *scanner) error(pos Position, s string) { 316 panic(Error{pos, s}) 317} 318 319func (sc *scanner) errorf(pos Position, format string, args ...interface{}) { 320 sc.error(pos, fmt.Sprintf(format, args...)) 321} 322 323func (sc *scanner) recover(err *error) { 324 // The scanner and parser panic both for routine errors like 325 // syntax errors and for programmer bugs like array index 326 // errors. Turn both into error returns. Catching bug panics 327 // is especially important when processing many files. 328 switch e := recover().(type) { 329 case nil: 330 // no panic 331 case Error: 332 *err = e 333 default: 334 *err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)} 335 if debug { 336 log.Fatal(*err) 337 } 338 } 339} 340 341// eof reports whether the input has reached end of file. 342func (sc *scanner) eof() bool { 343 return len(sc.rest) == 0 && !sc.readLine() 344} 345 346// readLine attempts to read another line of input. 347// Precondition: len(sc.rest)==0. 348func (sc *scanner) readLine() bool { 349 if sc.readline != nil { 350 var err error 351 sc.rest, err = sc.readline() 352 if err != nil { 353 sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt 354 } 355 return len(sc.rest) > 0 356 } 357 return false 358} 359 360// peekRune returns the next rune in the input without consuming it. 361// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 362func (sc *scanner) peekRune() rune { 363 // TODO(adonovan): opt: measure and perhaps inline eof. 364 if sc.eof() { 365 return 0 366 } 367 368 // fast path: ASCII 369 if b := sc.rest[0]; b < utf8.RuneSelf { 370 if b == '\r' { 371 return '\n' 372 } 373 return rune(b) 374 } 375 376 r, _ := utf8.DecodeRune(sc.rest) 377 return r 378} 379 380// readRune consumes and returns the next rune in the input. 381// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'. 382func (sc *scanner) readRune() rune { 383 // eof() has been inlined here, both to avoid a call 384 // and to establish len(rest)>0 to avoid a bounds check. 385 if len(sc.rest) == 0 { 386 if !sc.readLine() { 387 sc.error(sc.pos, "internal scanner error: readRune at EOF") 388 } 389 // Redundant, but eliminates the bounds-check below. 390 if len(sc.rest) == 0 { 391 return 0 392 } 393 } 394 395 // fast path: ASCII 396 if b := sc.rest[0]; b < utf8.RuneSelf { 397 r := rune(b) 398 sc.rest = sc.rest[1:] 399 if r == '\r' { 400 if len(sc.rest) > 0 && sc.rest[0] == '\n' { 401 sc.rest = sc.rest[1:] 402 } 403 r = '\n' 404 } 405 if r == '\n' { 406 sc.pos.Line++ 407 sc.pos.Col = 1 408 } else { 409 sc.pos.Col++ 410 } 411 return r 412 } 413 414 r, size := utf8.DecodeRune(sc.rest) 415 sc.rest = sc.rest[size:] 416 sc.pos.Col++ 417 return r 418} 419 420// tokenValue records the position and value associated with each token. 421type tokenValue struct { 422 raw string // raw text of token 423 int int64 // decoded int 424 bigInt *big.Int // decoded integers > int64 425 float float64 // decoded float 426 string string // decoded string or bytes 427 pos Position // start position of token 428} 429 430// startToken marks the beginning of the next input token. 431// It must be followed by a call to endToken once the token has 432// been consumed using readRune. 433func (sc *scanner) startToken(val *tokenValue) { 434 sc.token = sc.rest 435 val.raw = "" 436 val.pos = sc.pos 437} 438 439// endToken marks the end of an input token. 440// It records the actual token string in val.raw if the caller 441// has not done that already. 442func (sc *scanner) endToken(val *tokenValue) { 443 if val.raw == "" { 444 val.raw = string(sc.token[:len(sc.token)-len(sc.rest)]) 445 } 446} 447 448// nextToken is called by the parser to obtain the next input token. 449// It returns the token value and sets val to the data associated with 450// the token. 451// 452// For all our input tokens, the associated data is val.pos (the 453// position where the token begins), val.raw (the input string 454// corresponding to the token). For string and int tokens, the string 455// and int fields additionally contain the token's interpreted value. 456func (sc *scanner) nextToken(val *tokenValue) Token { 457 458 // The following distribution of tokens guides case ordering: 459 // 460 // COMMA 27 % 461 // STRING 23 % 462 // IDENT 15 % 463 // EQL 11 % 464 // LBRACK 5.5 % 465 // RBRACK 5.5 % 466 // NEWLINE 3 % 467 // LPAREN 2.9 % 468 // RPAREN 2.9 % 469 // INT 2 % 470 // others < 1 % 471 // 472 // Although NEWLINE tokens are infrequent, and lineStart is 473 // usually (~97%) false on entry, skipped newlines account for 474 // about 50% of all iterations of the 'start' loop. 475 476start: 477 var c rune 478 479 // Deal with leading spaces and indentation. 480 blank := false 481 savedLineStart := sc.lineStart 482 if sc.lineStart { 483 sc.lineStart = false 484 col := 0 485 for { 486 c = sc.peekRune() 487 if c == ' ' { 488 col++ 489 sc.readRune() 490 } else if c == '\t' { 491 const tab = 8 492 col += int(tab - (sc.pos.Col-1)%tab) 493 sc.readRune() 494 } else { 495 break 496 } 497 } 498 499 // The third clause matches EOF. 500 if c == '#' || c == '\n' || c == 0 { 501 blank = true 502 } 503 504 // Compute indentation level for non-blank lines not 505 // inside an expression. This is not the common case. 506 if !blank && sc.depth == 0 { 507 cur := sc.indentstk[len(sc.indentstk)-1] 508 if col > cur { 509 // indent 510 sc.dents++ 511 sc.indentstk = append(sc.indentstk, col) 512 } else if col < cur { 513 // outdent(s) 514 for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] { 515 sc.dents-- 516 sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop 517 } 518 if col != sc.indentstk[len(sc.indentstk)-1] { 519 sc.error(sc.pos, "unindent does not match any outer indentation level") 520 } 521 } 522 } 523 } 524 525 // Return saved indentation tokens. 526 if sc.dents != 0 { 527 sc.startToken(val) 528 sc.endToken(val) 529 if sc.dents < 0 { 530 sc.dents++ 531 return OUTDENT 532 } else { 533 sc.dents-- 534 return INDENT 535 } 536 } 537 538 // start of line proper 539 c = sc.peekRune() 540 541 // Skip spaces. 542 for c == ' ' || c == '\t' { 543 sc.readRune() 544 c = sc.peekRune() 545 } 546 547 // comment 548 if c == '#' { 549 if sc.keepComments { 550 sc.startToken(val) 551 } 552 // Consume up to newline (included). 553 for c != 0 && c != '\n' { 554 sc.readRune() 555 c = sc.peekRune() 556 } 557 if sc.keepComments { 558 sc.endToken(val) 559 if blank { 560 sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw}) 561 } else { 562 sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw}) 563 } 564 } 565 } 566 567 // newline 568 if c == '\n' { 569 sc.lineStart = true 570 571 // Ignore newlines within expressions (common case). 572 if sc.depth > 0 { 573 sc.readRune() 574 goto start 575 } 576 577 // Ignore blank lines, except in the REPL, 578 // where they emit OUTDENTs and NEWLINE. 579 if blank { 580 if sc.readline == nil { 581 sc.readRune() 582 goto start 583 } else if len(sc.indentstk) > 1 { 584 sc.dents = 1 - len(sc.indentstk) 585 sc.indentstk = sc.indentstk[:1] 586 goto start 587 } 588 } 589 590 // At top-level (not in an expression). 591 sc.startToken(val) 592 sc.readRune() 593 val.raw = "\n" 594 return NEWLINE 595 } 596 597 // end of file 598 if c == 0 { 599 // Emit OUTDENTs for unfinished indentation, 600 // preceded by a NEWLINE if we haven't just emitted one. 601 if len(sc.indentstk) > 1 { 602 if savedLineStart { 603 sc.dents = 1 - len(sc.indentstk) 604 sc.indentstk = sc.indentstk[:1] 605 goto start 606 } else { 607 sc.lineStart = true 608 sc.startToken(val) 609 val.raw = "\n" 610 return NEWLINE 611 } 612 } 613 614 sc.startToken(val) 615 sc.endToken(val) 616 return EOF 617 } 618 619 // line continuation 620 if c == '\\' { 621 sc.readRune() 622 if sc.peekRune() != '\n' { 623 sc.errorf(sc.pos, "stray backslash in program") 624 } 625 sc.readRune() 626 goto start 627 } 628 629 // start of the next token 630 sc.startToken(val) 631 632 // comma (common case) 633 if c == ',' { 634 sc.readRune() 635 sc.endToken(val) 636 return COMMA 637 } 638 639 // string literal 640 if c == '"' || c == '\'' { 641 return sc.scanString(val, c) 642 } 643 644 // identifier or keyword 645 if isIdentStart(c) { 646 if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') { 647 // r"..." 648 // b"..." 649 sc.readRune() 650 c = sc.peekRune() 651 return sc.scanString(val, c) 652 } else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') { 653 // rb"..." 654 sc.readRune() 655 sc.readRune() 656 c = sc.peekRune() 657 return sc.scanString(val, c) 658 } 659 660 for isIdent(c) { 661 sc.readRune() 662 c = sc.peekRune() 663 } 664 sc.endToken(val) 665 if k, ok := keywordToken[val.raw]; ok { 666 return k 667 } 668 669 return IDENT 670 } 671 672 // brackets 673 switch c { 674 case '[', '(', '{': 675 sc.depth++ 676 sc.readRune() 677 sc.endToken(val) 678 switch c { 679 case '[': 680 return LBRACK 681 case '(': 682 return LPAREN 683 case '{': 684 return LBRACE 685 } 686 panic("unreachable") 687 688 case ']', ')', '}': 689 if sc.depth == 0 { 690 sc.errorf(sc.pos, "unexpected %q", c) 691 } else { 692 sc.depth-- 693 } 694 sc.readRune() 695 sc.endToken(val) 696 switch c { 697 case ']': 698 return RBRACK 699 case ')': 700 return RPAREN 701 case '}': 702 return RBRACE 703 } 704 panic("unreachable") 705 } 706 707 // int or float literal, or period 708 if isdigit(c) || c == '.' { 709 return sc.scanNumber(val, c) 710 } 711 712 // other punctuation 713 defer sc.endToken(val) 714 switch c { 715 case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '=' 716 start := sc.pos 717 sc.readRune() 718 if sc.peekRune() == '=' { 719 sc.readRune() 720 switch c { 721 case '<': 722 return LE 723 case '>': 724 return GE 725 case '=': 726 return EQL 727 case '!': 728 return NEQ 729 case '+': 730 return PLUS_EQ 731 case '-': 732 return MINUS_EQ 733 case '/': 734 return SLASH_EQ 735 case '%': 736 return PERCENT_EQ 737 case '&': 738 return AMP_EQ 739 case '|': 740 return PIPE_EQ 741 case '^': 742 return CIRCUMFLEX_EQ 743 } 744 } 745 switch c { 746 case '=': 747 return EQ 748 case '<': 749 if sc.peekRune() == '<' { 750 sc.readRune() 751 if sc.peekRune() == '=' { 752 sc.readRune() 753 return LTLT_EQ 754 } else { 755 return LTLT 756 } 757 } 758 return LT 759 case '>': 760 if sc.peekRune() == '>' { 761 sc.readRune() 762 if sc.peekRune() == '=' { 763 sc.readRune() 764 return GTGT_EQ 765 } else { 766 return GTGT 767 } 768 } 769 return GT 770 case '!': 771 sc.error(start, "unexpected input character '!'") 772 case '+': 773 return PLUS 774 case '-': 775 return MINUS 776 case '/': 777 if sc.peekRune() == '/' { 778 sc.readRune() 779 if sc.peekRune() == '=' { 780 sc.readRune() 781 return SLASHSLASH_EQ 782 } else { 783 return SLASHSLASH 784 } 785 } 786 return SLASH 787 case '%': 788 return PERCENT 789 case '&': 790 return AMP 791 case '|': 792 return PIPE 793 case '^': 794 return CIRCUMFLEX 795 } 796 panic("unreachable") 797 798 case ':', ';', '~': // single-char tokens (except comma) 799 sc.readRune() 800 switch c { 801 case ':': 802 return COLON 803 case ';': 804 return SEMI 805 case '~': 806 return TILDE 807 } 808 panic("unreachable") 809 810 case '*': // possibly followed by '*' or '=' 811 sc.readRune() 812 switch sc.peekRune() { 813 case '*': 814 sc.readRune() 815 return STARSTAR 816 case '=': 817 sc.readRune() 818 return STAR_EQ 819 } 820 return STAR 821 } 822 823 sc.errorf(sc.pos, "unexpected input character %#q", c) 824 panic("unreachable") 825} 826 827func (sc *scanner) scanString(val *tokenValue, quote rune) Token { 828 start := sc.pos 829 triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote) 830 sc.readRune() 831 832 // String literals may contain escaped or unescaped newlines, 833 // causing them to span multiple lines (gulps) of REPL input; 834 // they are the only such token. Thus we cannot call endToken, 835 // as it assumes sc.rest is unchanged since startToken. 836 // Instead, buffer the token here. 837 // TODO(adonovan): opt: buffer only if we encounter a newline. 838 raw := new(strings.Builder) 839 840 // Copy the prefix, e.g. r' or " (see startToken). 841 raw.Write(sc.token[:len(sc.token)-len(sc.rest)]) 842 843 if !triple { 844 // single-quoted string literal 845 for { 846 if sc.eof() { 847 sc.error(val.pos, "unexpected EOF in string") 848 } 849 c := sc.readRune() 850 raw.WriteRune(c) 851 if c == quote { 852 break 853 } 854 if c == '\n' { 855 sc.error(val.pos, "unexpected newline in string") 856 } 857 if c == '\\' { 858 if sc.eof() { 859 sc.error(val.pos, "unexpected EOF in string") 860 } 861 c = sc.readRune() 862 raw.WriteRune(c) 863 } 864 } 865 } else { 866 // triple-quoted string literal 867 sc.readRune() 868 raw.WriteRune(quote) 869 sc.readRune() 870 raw.WriteRune(quote) 871 872 quoteCount := 0 873 for { 874 if sc.eof() { 875 sc.error(val.pos, "unexpected EOF in string") 876 } 877 c := sc.readRune() 878 raw.WriteRune(c) 879 if c == quote { 880 quoteCount++ 881 if quoteCount == 3 { 882 break 883 } 884 } else { 885 quoteCount = 0 886 } 887 if c == '\\' { 888 if sc.eof() { 889 sc.error(val.pos, "unexpected EOF in string") 890 } 891 c = sc.readRune() 892 raw.WriteRune(c) 893 } 894 } 895 } 896 val.raw = raw.String() 897 898 s, _, isByte, err := unquote(val.raw) 899 if err != nil { 900 sc.error(start, err.Error()) 901 } 902 val.string = s 903 if isByte { 904 return BYTES 905 } else { 906 return STRING 907 } 908} 909 910func (sc *scanner) scanNumber(val *tokenValue, c rune) Token { 911 // https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements 912 // 913 // Python features not supported: 914 // - integer literals of >64 bits of precision 915 // - 123L or 123l long suffix 916 // - traditional octal: 0755 917 // https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals 918 919 start := sc.pos 920 fraction, exponent := false, false 921 922 if c == '.' { 923 // dot or start of fraction 924 sc.readRune() 925 c = sc.peekRune() 926 if !isdigit(c) { 927 sc.endToken(val) 928 return DOT 929 } 930 fraction = true 931 } else if c == '0' { 932 // hex, octal, binary or float 933 sc.readRune() 934 c = sc.peekRune() 935 936 if c == '.' { 937 fraction = true 938 } else if c == 'x' || c == 'X' { 939 // hex 940 sc.readRune() 941 c = sc.peekRune() 942 if !isxdigit(c) { 943 sc.error(start, "invalid hex literal") 944 } 945 for isxdigit(c) { 946 sc.readRune() 947 c = sc.peekRune() 948 } 949 } else if c == 'o' || c == 'O' { 950 // octal 951 sc.readRune() 952 c = sc.peekRune() 953 if !isodigit(c) { 954 sc.error(sc.pos, "invalid octal literal") 955 } 956 for isodigit(c) { 957 sc.readRune() 958 c = sc.peekRune() 959 } 960 } else if c == 'b' || c == 'B' { 961 // binary 962 sc.readRune() 963 c = sc.peekRune() 964 if !isbdigit(c) { 965 sc.error(sc.pos, "invalid binary literal") 966 } 967 for isbdigit(c) { 968 sc.readRune() 969 c = sc.peekRune() 970 } 971 } else { 972 // float (or obsolete octal "0755") 973 allzeros, octal := true, true 974 for isdigit(c) { 975 if c != '0' { 976 allzeros = false 977 } 978 if c > '7' { 979 octal = false 980 } 981 sc.readRune() 982 c = sc.peekRune() 983 } 984 if c == '.' { 985 fraction = true 986 } else if c == 'e' || c == 'E' { 987 exponent = true 988 } else if octal && !allzeros { 989 sc.endToken(val) 990 sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:]) 991 } 992 } 993 } else { 994 // decimal 995 for isdigit(c) { 996 sc.readRune() 997 c = sc.peekRune() 998 } 999 1000 if c == '.' { 1001 fraction = true 1002 } else if c == 'e' || c == 'E' { 1003 exponent = true 1004 } 1005 } 1006 1007 if fraction { 1008 sc.readRune() // consume '.' 1009 c = sc.peekRune() 1010 for isdigit(c) { 1011 sc.readRune() 1012 c = sc.peekRune() 1013 } 1014 1015 if c == 'e' || c == 'E' { 1016 exponent = true 1017 } 1018 } 1019 1020 if exponent { 1021 sc.readRune() // consume [eE] 1022 c = sc.peekRune() 1023 if c == '+' || c == '-' { 1024 sc.readRune() 1025 c = sc.peekRune() 1026 if !isdigit(c) { 1027 sc.error(sc.pos, "invalid float literal") 1028 } 1029 } 1030 for isdigit(c) { 1031 sc.readRune() 1032 c = sc.peekRune() 1033 } 1034 } 1035 1036 sc.endToken(val) 1037 if fraction || exponent { 1038 var err error 1039 val.float, err = strconv.ParseFloat(val.raw, 64) 1040 if err != nil { 1041 sc.error(sc.pos, "invalid float literal") 1042 } 1043 return FLOAT 1044 } else { 1045 var err error 1046 s := val.raw 1047 val.bigInt = nil 1048 if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') { 1049 val.int, err = strconv.ParseInt(s[2:], 8, 64) 1050 } else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') { 1051 val.int, err = strconv.ParseInt(s[2:], 2, 64) 1052 } else { 1053 val.int, err = strconv.ParseInt(s, 0, 64) 1054 if err != nil { 1055 num := new(big.Int) 1056 var ok bool 1057 val.bigInt, ok = num.SetString(s, 0) 1058 if ok { 1059 err = nil 1060 } 1061 } 1062 } 1063 if err != nil { 1064 sc.error(start, "invalid int literal") 1065 } 1066 return INT 1067 } 1068} 1069 1070// isIdent reports whether c is an identifier rune. 1071func isIdent(c rune) bool { 1072 return isdigit(c) || isIdentStart(c) 1073} 1074 1075func isIdentStart(c rune) bool { 1076 return 'a' <= c && c <= 'z' || 1077 'A' <= c && c <= 'Z' || 1078 c == '_' || 1079 unicode.IsLetter(c) 1080} 1081 1082func isdigit(c rune) bool { return '0' <= c && c <= '9' } 1083func isodigit(c rune) bool { return '0' <= c && c <= '7' } 1084func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' } 1085func isbdigit(c rune) bool { return '0' == c || c == '1' } 1086 1087// keywordToken records the special tokens for 1088// strings that should not be treated as ordinary identifiers. 1089var keywordToken = map[string]Token{ 1090 "and": AND, 1091 "break": BREAK, 1092 "continue": CONTINUE, 1093 "def": DEF, 1094 "elif": ELIF, 1095 "else": ELSE, 1096 "for": FOR, 1097 "if": IF, 1098 "in": IN, 1099 "lambda": LAMBDA, 1100 "load": LOAD, 1101 "not": NOT, 1102 "or": OR, 1103 "pass": PASS, 1104 "return": RETURN, 1105 "while": WHILE, 1106 1107 // reserved words: 1108 "as": ILLEGAL, 1109 // "assert": ILLEGAL, // heavily used by our tests 1110 "class": ILLEGAL, 1111 "del": ILLEGAL, 1112 "except": ILLEGAL, 1113 "finally": ILLEGAL, 1114 "from": ILLEGAL, 1115 "global": ILLEGAL, 1116 "import": ILLEGAL, 1117 "is": ILLEGAL, 1118 "nonlocal": ILLEGAL, 1119 "raise": ILLEGAL, 1120 "try": ILLEGAL, 1121 "with": ILLEGAL, 1122 "yield": ILLEGAL, 1123} 1124