// Copyright 2017 The Bazel Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package syntax

// A lexical scanner for Starlark.

import (
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"math/big"
	"os"
	"strconv"
	"strings"
	"unicode"
	"unicode/utf8"
)

// A Token represents a Starlark lexical token.
type Token int8

const (
	ILLEGAL Token = iota
	EOF

	NEWLINE
	INDENT
	OUTDENT

	// Tokens with values
	IDENT  // x
	INT    // 123
	FLOAT  // 1.23e45
	STRING // "foo" or 'foo' or '''foo''' or r'foo' or r"foo"
	BYTES  // b"foo", etc

	// Punctuation
	PLUS          // +
	MINUS         // -
	STAR          // *
	SLASH         // /
	SLASHSLASH    // //
	PERCENT       // %
	AMP           // &
	PIPE          // |
	CIRCUMFLEX    // ^
	LTLT          // <<
	GTGT          // >>
	TILDE         // ~
	DOT           // .
	COMMA         // ,
	EQ            // =
	SEMI          // ;
	COLON         // :
	LPAREN        // (
	RPAREN        // )
	LBRACK        // [
	RBRACK        // ]
	LBRACE        // {
	RBRACE        // }
	LT            // <
	GT            // >
	GE            // >=
	LE            // <=
	EQL           // ==
	NEQ           // !=
	PLUS_EQ       // += (keep order consistent with PLUS..GTGT)
	MINUS_EQ      // -=
	STAR_EQ       // *=
	SLASH_EQ      // /=
	SLASHSLASH_EQ // //=
	PERCENT_EQ    // %=
	AMP_EQ        // &=
	PIPE_EQ       // |=
	CIRCUMFLEX_EQ // ^=
	LTLT_EQ       // <<=
	GTGT_EQ       // >>=
	STARSTAR      // **

	// Keywords
	AND
	BREAK
	CONTINUE
	DEF
	ELIF
	ELSE
	FOR
	IF
	IN
	LAMBDA
	LOAD
	NOT
	NOT_IN // synthesized by parser from NOT IN
	OR
	PASS
	RETURN
	WHILE

	maxToken
)

func (tok Token) String() string { return tokenNames[tok] }

// GoString is like String but quotes punctuation tokens.
// Use Sprintf("%#v", tok) when constructing error messages.
func (tok Token) GoString() string {
	if tok >= PLUS && tok <= STARSTAR {
		return "'" + tokenNames[tok] + "'"
	}
	return tokenNames[tok]
}

var tokenNames = [...]string{
	ILLEGAL:       "illegal token",
	EOF:           "end of file",
	NEWLINE:       "newline",
	INDENT:        "indent",
	OUTDENT:       "outdent",
	IDENT:         "identifier",
	INT:           "int literal",
	FLOAT:         "float literal",
	STRING:        "string literal",
	BYTES:         "bytes literal",
	PLUS:          "+",
	MINUS:         "-",
	STAR:          "*",
	SLASH:         "/",
	SLASHSLASH:    "//",
	PERCENT:       "%",
	AMP:           "&",
	PIPE:          "|",
	CIRCUMFLEX:    "^",
	LTLT:          "<<",
	GTGT:          ">>",
	TILDE:         "~",
	DOT:           ".",
	COMMA:         ",",
	EQ:            "=",
	SEMI:          ";",
	COLON:         ":",
	LPAREN:        "(",
	RPAREN:        ")",
	LBRACK:        "[",
	RBRACK:        "]",
	LBRACE:        "{",
	RBRACE:        "}",
	LT:            "<",
	GT:            ">",
	GE:            ">=",
	LE:            "<=",
	EQL:           "==",
	NEQ:           "!=",
	PLUS_EQ:       "+=",
	MINUS_EQ:      "-=",
	STAR_EQ:       "*=",
	SLASH_EQ:      "/=",
	SLASHSLASH_EQ: "//=",
	PERCENT_EQ:    "%=",
	AMP_EQ:        "&=",
	PIPE_EQ:       "|=",
	CIRCUMFLEX_EQ: "^=",
	LTLT_EQ:       "<<=",
	GTGT_EQ:       ">>=",
	STARSTAR:      "**",
	AND:           "and",
	BREAK:         "break",
	CONTINUE:      "continue",
	DEF:           "def",
	ELIF:          "elif",
	ELSE:          "else",
	FOR:           "for",
	IF:            "if",
	IN:            "in",
	LAMBDA:        "lambda",
	LOAD:          "load",
	NOT:           "not",
	NOT_IN:        "not in",
	OR:            "or",
	PASS:          "pass",
	RETURN:        "return",
	WHILE:         "while",
}

// A FilePortion describes the content of a portion of a file.
// Callers may provide a FilePortion for the src argument of Parse
// when the desired initial line and column numbers are not (1, 1),
// such as when an expression is parsed from within a larger file.
type FilePortion struct {
	Content             []byte
	FirstLine, FirstCol int32
}

// A Position describes the location of a rune of input.
type Position struct {
	file *string // filename (indirect for compactness)
	Line int32   // 1-based line number; 0 if line unknown
	Col  int32   // 1-based column (rune) number; 0 if column unknown
}

// IsValid reports whether the position is valid.
func (p Position) IsValid() bool { return p.file != nil }

// Filename returns the name of the file containing this position.
func (p Position) Filename() string {
	if p.file != nil {
		return *p.file
	}
	return ""
}

// MakePosition returns position with the specified components.
func MakePosition(file *string, line, col int32) Position { return Position{file, line, col} }

// add returns the position at the end of s, assuming it starts at p.
func (p Position) add(s string) Position {
	if n := strings.Count(s, "\n"); n > 0 {
		p.Line += int32(n)
		s = s[strings.LastIndex(s, "\n")+1:]
		p.Col = 1
	}
	p.Col += int32(utf8.RuneCountInString(s))
	return p
}

func (p Position) String() string {
	file := p.Filename()
	if p.Line > 0 {
		if p.Col > 0 {
			return fmt.Sprintf("%s:%d:%d", file, p.Line, p.Col)
		}
		return fmt.Sprintf("%s:%d", file, p.Line)
	}
	return file
}

func (p Position) isBefore(q Position) bool {
	if p.Line != q.Line {
		return p.Line < q.Line
	}
	return p.Col < q.Col
}

// A scanner represents a single input file being parsed.
type scanner struct {
	rest           []byte    // rest of input (in REPL, a line of input)
	token          []byte    // token being scanned
	pos            Position  // current input position
	depth          int       // nesting of [ ] { } ( )
	indentstk      []int     // stack of indentation levels
	dents          int       // number of saved INDENT (>0) or OUTDENT (<0) tokens to return
	lineStart      bool      // after NEWLINE; convert spaces to indentation tokens
	keepComments   bool      // accumulate comments in slice
	lineComments   []Comment // list of full line comments (if keepComments)
	suffixComments []Comment // list of suffix comments (if keepComments)

	readline func() ([]byte, error) // read next line of input (REPL only)
}

func newScanner(filename string, src interface{}, keepComments bool) (*scanner, error) {
	var firstLine, firstCol int32 = 1, 1
	if portion, ok := src.(FilePortion); ok {
		firstLine, firstCol = portion.FirstLine, portion.FirstCol
	}
	sc := &scanner{
		pos:          MakePosition(&filename, firstLine, firstCol),
		indentstk:    make([]int, 1, 10), // []int{0} + spare capacity
		lineStart:    true,
		keepComments: keepComments,
	}
	sc.readline, _ = src.(func() ([]byte, error)) // ParseCompoundStmt (REPL) only
	if sc.readline == nil {
		data, err := readSource(filename, src)
		if err != nil {
			return nil, err
		}
		sc.rest = data
	}
	return sc, nil
}

func readSource(filename string, src interface{}) ([]byte, error) {
	switch src := src.(type) {
	case string:
		return []byte(src), nil
	case []byte:
		return src, nil
	case io.Reader:
		data, err := ioutil.ReadAll(src)
		if err != nil {
			err = &os.PathError{Op: "read", Path: filename, Err: err}
			return nil, err
		}
		return data, nil
	case FilePortion:
		return src.Content, nil
	case nil:
		return ioutil.ReadFile(filename)
	default:
		return nil, fmt.Errorf("invalid source: %T", src)
	}
}

// An Error describes the nature and position of a scanner or parser error.
type Error struct {
	Pos Position
	Msg string
}

func (e Error) Error() string { return e.Pos.String() + ": " + e.Msg }

// error reports an error at the specified position.
// It does not return: it panics.
func (sc *scanner) error(pos Position, s string) { panic(Error{pos, s}) }

func (sc *scanner) errorf(pos Position, format string, args ...interface{}) {
	sc.error(pos, fmt.Sprintf(format, args...))
}

func (sc *scanner) recover(err *error) {
	// The scanner and parser panic both for routine errors like
	// syntax errors and for programmer bugs like array index
	// errors. Turn both into error returns. Catching bug panics
	// is especially important when processing many files.
	switch e := recover().(type) {
	case nil:
		// no panic
	case Error:
		*err = e
	default:
		*err = Error{sc.pos, fmt.Sprintf("internal error: %v", e)}
		if debug {
			log.Fatal(*err)
		}
	}
}

// eof reports whether the input has reached end of file.
func (sc *scanner) eof() bool {
	return len(sc.rest) == 0 && !sc.readLine()
}

// readLine attempts to read another line of input.
// Precondition: len(sc.rest)==0.
func (sc *scanner) readLine() bool {
	if sc.readline != nil {
		var err error
		sc.rest, err = sc.readline()
		if err != nil {
			sc.errorf(sc.pos, "%v", err) // EOF or ErrInterrupt
		}
		return len(sc.rest) > 0
	}
	return false
}

// peekRune returns the next rune in the input without consuming it.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
func (sc *scanner) peekRune() rune {
	// TODO(adonovan): opt: measure and perhaps inline eof.
	if sc.eof() {
		return 0
	}

	// fast path: ASCII
	if b := sc.rest[0]; b < utf8.RuneSelf {
		if b == '\r' {
			return '\n'
		}
		return rune(b)
	}

	r, _ := utf8.DecodeRune(sc.rest)
	return r
}

// readRune consumes and returns the next rune in the input.
// Newlines in Unix, DOS, or Mac format are treated as one rune, '\n'.
func (sc *scanner) readRune() rune {
	// eof() has been inlined here, both to avoid a call
	// and to establish len(rest)>0 to avoid a bounds check.
	if len(sc.rest) == 0 {
		if !sc.readLine() {
			sc.error(sc.pos, "internal scanner error: readRune at EOF")
		}
		// Redundant, but eliminates the bounds-check below.
		if len(sc.rest) == 0 {
			return 0
		}
	}

	// fast path: ASCII
	if b := sc.rest[0]; b < utf8.RuneSelf {
		r := rune(b)
		sc.rest = sc.rest[1:]
		if r == '\r' {
			if len(sc.rest) > 0 && sc.rest[0] == '\n' {
				sc.rest = sc.rest[1:]
			}
			r = '\n'
		}
		if r == '\n' {
			sc.pos.Line++
			sc.pos.Col = 1
		} else {
			sc.pos.Col++
		}
		return r
	}

	r, size := utf8.DecodeRune(sc.rest)
	sc.rest = sc.rest[size:]
	sc.pos.Col++
	return r
}

// tokenValue records the position and value associated with each token.
type tokenValue struct {
	raw    string   // raw text of token
	int    int64    // decoded int
	bigInt *big.Int // decoded integers > int64
	float  float64  // decoded float
	string string   // decoded string or bytes
	pos    Position // start position of token
}

// startToken marks the beginning of the next input token.
// It must be followed by a call to endToken once the token has
// been consumed using readRune.
func (sc *scanner) startToken(val *tokenValue) {
	sc.token = sc.rest
	val.raw = ""
	val.pos = sc.pos
}

// endToken marks the end of an input token.
// It records the actual token string in val.raw if the caller
// has not done that already.
func (sc *scanner) endToken(val *tokenValue) {
	if val.raw == "" {
		val.raw = string(sc.token[:len(sc.token)-len(sc.rest)])
	}
}

// nextToken is called by the parser to obtain the next input token.
// It returns the token value and sets val to the data associated with
// the token.
//
// For all our input tokens, the associated data is val.pos (the
// position where the token begins), val.raw (the input string
// corresponding to the token). For string and int tokens, the string
// and int fields additionally contain the token's interpreted value.
func (sc *scanner) nextToken(val *tokenValue) Token {

	// The following distribution of tokens guides case ordering:
	//
	//	COMMA    27   %
	//	STRING   23   %
	//	IDENT    15   %
	//	EQL      11   %
	//	LBRACK    5.5 %
	//	RBRACK    5.5 %
	//	NEWLINE   3   %
	//	LPAREN    2.9 %
	//	RPAREN    2.9 %
	//	INT       2   %
	//	others  < 1   %
	//
	// Although NEWLINE tokens are infrequent, and lineStart is
	// usually (~97%) false on entry, skipped newlines account for
	// about 50% of all iterations of the 'start' loop.

start:
	var c rune

	// Deal with leading spaces and indentation.
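	// For example (an illustrative sketch; the identifiers f and x below are
	// arbitrary): starting from indentstk = {0}, the lines of
	//
	//	def f():
	//	    if x:
	//	        pass
	//	    return None
	//
	// begin at columns 0, 4, 8, and 4. Each increase pushes the new column
	// onto indentstk and buffers an INDENT in sc.dents; each decrease pops
	// levels until a matching column is found, buffering one OUTDENT per
	// pop. A column that matches no enclosing level reports "unindent does
	// not match any outer indentation level". Blank and comment-only lines
	// are skipped before this computation, and any levels still open at end
	// of file are closed by the EOF case further below.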
	blank := false
	savedLineStart := sc.lineStart
	if sc.lineStart {
		sc.lineStart = false
		col := 0
		for {
			c = sc.peekRune()
			if c == ' ' {
				col++
				sc.readRune()
			} else if c == '\t' {
				const tab = 8
				col += int(tab - (sc.pos.Col-1)%tab)
				sc.readRune()
			} else {
				break
			}
		}

		// The third clause matches EOF.
		if c == '#' || c == '\n' || c == 0 {
			blank = true
		}

		// Compute indentation level for non-blank lines not
		// inside an expression. This is not the common case.
		if !blank && sc.depth == 0 {
			cur := sc.indentstk[len(sc.indentstk)-1]
			if col > cur {
				// indent
				sc.dents++
				sc.indentstk = append(sc.indentstk, col)
			} else if col < cur {
				// outdent(s)
				for len(sc.indentstk) > 0 && col < sc.indentstk[len(sc.indentstk)-1] {
					sc.dents--
					sc.indentstk = sc.indentstk[:len(sc.indentstk)-1] // pop
				}
				if col != sc.indentstk[len(sc.indentstk)-1] {
					sc.error(sc.pos, "unindent does not match any outer indentation level")
				}
			}
		}
	}

	// Return saved indentation tokens.
	if sc.dents != 0 {
		sc.startToken(val)
		sc.endToken(val)
		if sc.dents < 0 {
			sc.dents++
			return OUTDENT
		} else {
			sc.dents--
			return INDENT
		}
	}

	// start of line proper
	c = sc.peekRune()

	// Skip spaces.
	for c == ' ' || c == '\t' {
		sc.readRune()
		c = sc.peekRune()
	}

	// comment
	if c == '#' {
		if sc.keepComments {
			sc.startToken(val)
		}

		// Consume up to newline (included).
		for c != 0 && c != '\n' {
			sc.readRune()
			c = sc.peekRune()
		}
		if sc.keepComments {
			sc.endToken(val)
			if blank {
				sc.lineComments = append(sc.lineComments, Comment{val.pos, val.raw})
			} else {
				sc.suffixComments = append(sc.suffixComments, Comment{val.pos, val.raw})
			}
		}
	}

	// newline
	if c == '\n' {
		sc.lineStart = true

		// Ignore newlines within expressions (common case).
		if sc.depth > 0 {
			sc.readRune()
			goto start
		}

		// Ignore blank lines, except in the REPL,
		// where they emit OUTDENTs and NEWLINE.
		if blank {
			if sc.readline == nil {
				sc.readRune()
				goto start
			} else if len(sc.indentstk) > 1 {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			}
		}

		// At top-level (not in an expression).
		sc.startToken(val)
		sc.readRune()
		val.raw = "\n"
		return NEWLINE
	}

	// end of file
	if c == 0 {
		// Emit OUTDENTs for unfinished indentation,
		// preceded by a NEWLINE if we haven't just emitted one.
		if len(sc.indentstk) > 1 {
			if savedLineStart {
				sc.dents = 1 - len(sc.indentstk)
				sc.indentstk = sc.indentstk[:1]
				goto start
			} else {
				sc.lineStart = true
				sc.startToken(val)
				val.raw = "\n"
				return NEWLINE
			}
		}

		sc.startToken(val)
		sc.endToken(val)
		return EOF
	}

	// line continuation
	if c == '\\' {
		sc.readRune()
		if sc.peekRune() != '\n' {
			sc.errorf(sc.pos, "stray backslash in program")
		}
		sc.readRune()
		goto start
	}

	// start of the next token
	sc.startToken(val)

	// comma (common case)
	if c == ',' {
		sc.readRune()
		sc.endToken(val)
		return COMMA
	}

	// string literal
	if c == '"' || c == '\'' {
		return sc.scanString(val, c)
	}

	// identifier or keyword
	if isIdentStart(c) {
		if (c == 'r' || c == 'b') && len(sc.rest) > 1 && (sc.rest[1] == '"' || sc.rest[1] == '\'') {
			// r"..."
			// b"..."
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		} else if c == 'r' && len(sc.rest) > 2 && sc.rest[1] == 'b' && (sc.rest[2] == '"' || sc.rest[2] == '\'') {
			// rb"..."
			sc.readRune()
			sc.readRune()
			c = sc.peekRune()
			return sc.scanString(val, c)
		}
		for isIdent(c) {
			sc.readRune()
			c = sc.peekRune()
		}
		sc.endToken(val)
		if k, ok := keywordToken[val.raw]; ok {
			return k
		}
		return IDENT
	}

	// brackets
	switch c {
	case '[', '(', '{':
		sc.depth++
		sc.readRune()
		sc.endToken(val)
		switch c {
		case '[':
			return LBRACK
		case '(':
			return LPAREN
		case '{':
			return LBRACE
		}
		panic("unreachable")

	case ']', ')', '}':
		if sc.depth == 0 {
			sc.errorf(sc.pos, "unexpected %q", c)
		} else {
			sc.depth--
		}
		sc.readRune()
		sc.endToken(val)
		switch c {
		case ']':
			return RBRACK
		case ')':
			return RPAREN
		case '}':
			return RBRACE
		}
		panic("unreachable")
	}

	// int or float literal, or period
	if isdigit(c) || c == '.' {
		return sc.scanNumber(val, c)
	}

	// other punctuation
	defer sc.endToken(val)
	switch c {
	case '=', '<', '>', '!', '+', '-', '%', '/', '&', '|', '^': // possibly followed by '='
		start := sc.pos
		sc.readRune()
		if sc.peekRune() == '=' {
			sc.readRune()
			switch c {
			case '<':
				return LE
			case '>':
				return GE
			case '=':
				return EQL
			case '!':
				return NEQ
			case '+':
				return PLUS_EQ
			case '-':
				return MINUS_EQ
			case '/':
				return SLASH_EQ
			case '%':
				return PERCENT_EQ
			case '&':
				return AMP_EQ
			case '|':
				return PIPE_EQ
			case '^':
				return CIRCUMFLEX_EQ
			}
		}
		switch c {
		case '=':
			return EQ
		case '<':
			if sc.peekRune() == '<' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return LTLT_EQ
				} else {
					return LTLT
				}
			}
			return LT
		case '>':
			if sc.peekRune() == '>' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return GTGT_EQ
				} else {
					return GTGT
				}
			}
			return GT
		case '!':
			sc.error(start, "unexpected input character '!'")
		case '+':
			return PLUS
		case '-':
			return MINUS
		case '/':
			if sc.peekRune() == '/' {
				sc.readRune()
				if sc.peekRune() == '=' {
					sc.readRune()
					return SLASHSLASH_EQ
				} else {
					return SLASHSLASH
				}
			}
			return SLASH
		case '%':
			return PERCENT
		case '&':
			return AMP
		case '|':
			return PIPE
		case '^':
			return CIRCUMFLEX
		}
		panic("unreachable")

	case ':', ';', '~': // single-char tokens (except comma)
		sc.readRune()
		switch c {
		case ':':
			return COLON
		case ';':
			return SEMI
		case '~':
			return TILDE
		}
		panic("unreachable")

	case '*': // possibly followed by '*' or '='
		sc.readRune()
		switch sc.peekRune() {
		case '*':
			sc.readRune()
			return STARSTAR
		case '=':
			sc.readRune()
			return STAR_EQ
		}
		return STAR
	}

	sc.errorf(sc.pos, "unexpected input character %#q", c)
	panic("unreachable")
}

func (sc *scanner) scanString(val *tokenValue, quote rune) Token {
	start := sc.pos
	triple := len(sc.rest) >= 3 && sc.rest[0] == byte(quote) && sc.rest[1] == byte(quote) && sc.rest[2] == byte(quote)
	sc.readRune()

	// String literals may contain escaped or unescaped newlines,
	// causing them to span multiple lines (gulps) of REPL input;
	// they are the only such token. Thus we cannot call endToken,
	// as it assumes sc.rest is unchanged since startToken.
	// Instead, buffer the token here.
	// TODO(adonovan): opt: buffer only if we encounter a newline.
	raw := new(strings.Builder)

	// Copy the prefix, e.g. r' or " (see startToken).
	raw.Write(sc.token[:len(sc.token)-len(sc.rest)])

	if !triple {
		// single-quoted string literal
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				break
			}
			if c == '\n' {
				sc.error(val.pos, "unexpected newline in string")
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	} else {
		// triple-quoted string literal
		sc.readRune()
		raw.WriteRune(quote)
		sc.readRune()
		raw.WriteRune(quote)

		quoteCount := 0
		for {
			if sc.eof() {
				sc.error(val.pos, "unexpected EOF in string")
			}
			c := sc.readRune()
			raw.WriteRune(c)
			if c == quote {
				quoteCount++
				if quoteCount == 3 {
					break
				}
			} else {
				quoteCount = 0
			}
			if c == '\\' {
				if sc.eof() {
					sc.error(val.pos, "unexpected EOF in string")
				}
				c = sc.readRune()
				raw.WriteRune(c)
			}
		}
	}
	val.raw = raw.String()

	s, _, isByte, err := unquote(val.raw)
	if err != nil {
		sc.error(start, err.Error())
	}
	val.string = s
	if isByte {
		return BYTES
	} else {
		return STRING
	}
}

func (sc *scanner) scanNumber(val *tokenValue, c rune) Token {
	// https://github.com/google/starlark-go/blob/master/doc/spec.md#lexical-elements
	//
	// Python features not supported:
	// - integer literals of >64 bits of precision
	// - 123L or 123l long suffix
	// - traditional octal: 0755
	// https://docs.python.org/2/reference/lexical_analysis.html#integer-and-long-integer-literals
	start := sc.pos
	fraction, exponent := false, false

	if c == '.' {
		// dot or start of fraction
		sc.readRune()
		c = sc.peekRune()
		if !isdigit(c) {
			sc.endToken(val)
			return DOT
		}
		fraction = true
	} else if c == '0' {
		// hex, octal, binary or float
		sc.readRune()
		c = sc.peekRune()

		if c == '.' {
			fraction = true
		} else if c == 'x' || c == 'X' {
			// hex
			sc.readRune()
			c = sc.peekRune()
			if !isxdigit(c) {
				sc.error(start, "invalid hex literal")
			}
			for isxdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'o' || c == 'O' {
			// octal
			sc.readRune()
			c = sc.peekRune()
			if !isodigit(c) {
				sc.error(sc.pos, "invalid octal literal")
			}
			for isodigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else if c == 'b' || c == 'B' {
			// binary
			sc.readRune()
			c = sc.peekRune()
			if !isbdigit(c) {
				sc.error(sc.pos, "invalid binary literal")
			}
			for isbdigit(c) {
				sc.readRune()
				c = sc.peekRune()
			}
		} else {
			// float (or obsolete octal "0755")
			allzeros, octal := true, true
			for isdigit(c) {
				if c != '0' {
					allzeros = false
				}
				if c > '7' {
					octal = false
				}
				sc.readRune()
				c = sc.peekRune()
			}
			if c == '.' {
				fraction = true
			} else if c == 'e' || c == 'E' {
				exponent = true
			} else if octal && !allzeros {
				sc.endToken(val)
				sc.errorf(sc.pos, "obsolete form of octal literal; use 0o%s", val.raw[1:])
			}
		}
	} else {
		// decimal
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == '.' {
			fraction = true
		} else if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if fraction {
		sc.readRune() // consume '.'
		c = sc.peekRune()
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}

		if c == 'e' || c == 'E' {
			exponent = true
		}
	}

	if exponent {
		sc.readRune() // consume [eE]
		c = sc.peekRune()
		if c == '+' || c == '-' {
			sc.readRune()
			c = sc.peekRune()
			if !isdigit(c) {
				sc.error(sc.pos, "invalid float literal")
			}
		}
		for isdigit(c) {
			sc.readRune()
			c = sc.peekRune()
		}
	}

	sc.endToken(val)
	if fraction || exponent {
		var err error
		val.float, err = strconv.ParseFloat(val.raw, 64)
		if err != nil {
			sc.error(sc.pos, "invalid float literal")
		}
		return FLOAT
	} else {
		var err error
		s := val.raw
		val.bigInt = nil
		if len(s) > 2 && s[0] == '0' && (s[1] == 'o' || s[1] == 'O') {
			val.int, err = strconv.ParseInt(s[2:], 8, 64)
		} else if len(s) > 2 && s[0] == '0' && (s[1] == 'b' || s[1] == 'B') {
			val.int, err = strconv.ParseInt(s[2:], 2, 64)
		} else {
			val.int, err = strconv.ParseInt(s, 0, 64)
			if err != nil {
				num := new(big.Int)
				var ok bool
				val.bigInt, ok = num.SetString(s, 0)
				if ok {
					err = nil
				}
			}
		}
		if err != nil {
			sc.error(start, "invalid int literal")
		}
		return INT
	}
}

// isIdent reports whether c is an identifier rune.
func isIdent(c rune) bool {
	return isdigit(c) || isIdentStart(c)
}

func isIdentStart(c rune) bool {
	return 'a' <= c && c <= 'z' ||
		'A' <= c && c <= 'Z' ||
		c == '_' ||
		unicode.IsLetter(c)
}

func isdigit(c rune) bool  { return '0' <= c && c <= '9' }
func isodigit(c rune) bool { return '0' <= c && c <= '7' }
func isxdigit(c rune) bool { return isdigit(c) || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f' }
func isbdigit(c rune) bool { return '0' == c || c == '1' }

// keywordToken records the special tokens for
// strings that should not be treated as ordinary identifiers.
var keywordToken = map[string]Token{
	"and":      AND,
	"break":    BREAK,
	"continue": CONTINUE,
	"def":      DEF,
	"elif":     ELIF,
	"else":     ELSE,
	"for":      FOR,
	"if":       IF,
	"in":       IN,
	"lambda":   LAMBDA,
	"load":     LOAD,
	"not":      NOT,
	"or":       OR,
	"pass":     PASS,
	"return":   RETURN,
	"while":    WHILE,

	// reserved words:
	"as": ILLEGAL,
	// "assert":  ILLEGAL, // heavily used by our tests
	"class":    ILLEGAL,
	"del":      ILLEGAL,
	"except":   ILLEGAL,
	"finally":  ILLEGAL,
	"from":     ILLEGAL,
	"global":   ILLEGAL,
	"import":   ILLEGAL,
	"is":       ILLEGAL,
	"nonlocal": ILLEGAL,
	"raise":    ILLEGAL,
	"try":      ILLEGAL,
	"with":     ILLEGAL,
	"yield":    ILLEGAL,
}
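
// As an illustration of how the parser drives this scanner (a sketch only,
// not part of the package API; the scanner type is unexported and is
// normally reached through Parse and related entry points), a minimal
// token loop might look like this, where "example.star" and the input
// string are arbitrary:
//
//	sc, err := newScanner("example.star", "x = [1, 2]\n", false)
//	if err != nil {
//		// handle error
//	}
//	var val tokenValue
//	for {
//		tok := sc.nextToken(&val) // panics with Error on bad input; see (*scanner).recover
//		fmt.Printf("%-8s %q at %s\n", tok, val.raw, val.pos)
//		if tok == EOF {
//			break
//		}
//	}
//
// For the input above this would print IDENT "x", EQ "=", LBRACK "[",
// INT "1", COMMA ",", INT "2", RBRACK "]", NEWLINE "\n", and finally EOF.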