1// Copyright 2023 The Bazel Authors. All rights reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package python 16 17import ( 18 "context" 19 "fmt" 20 "log" 21 "os" 22 "path/filepath" 23 "strings" 24 25 sitter "github.com/smacker/go-tree-sitter" 26 "github.com/smacker/go-tree-sitter/python" 27) 28 29const ( 30 sitterNodeTypeString = "string" 31 sitterNodeTypeComment = "comment" 32 sitterNodeTypeIdentifier = "identifier" 33 sitterNodeTypeDottedName = "dotted_name" 34 sitterNodeTypeIfStatement = "if_statement" 35 sitterNodeTypeAliasedImport = "aliased_import" 36 sitterNodeTypeWildcardImport = "wildcard_import" 37 sitterNodeTypeImportStatement = "import_statement" 38 sitterNodeTypeComparisonOperator = "comparison_operator" 39 sitterNodeTypeImportFromStatement = "import_from_statement" 40) 41 42type ParserOutput struct { 43 FileName string 44 Modules []module 45 Comments []comment 46 HasMain bool 47} 48 49type FileParser struct { 50 code []byte 51 relFilepath string 52 output ParserOutput 53} 54 55func NewFileParser() *FileParser { 56 return &FileParser{} 57} 58 59// ParseCode instantiates a new tree-sitter Parser and parses the python code, returning 60// the tree-sitter RootNode. 61// It prints a warning if parsing fails. 62func ParseCode(code []byte, path string) (*sitter.Node, error) { 63 parser := sitter.NewParser() 64 parser.SetLanguage(python.GetLanguage()) 65 66 tree, err := parser.ParseCtx(context.Background(), nil, code) 67 if err != nil { 68 return nil, err 69 } 70 71 root := tree.RootNode() 72 if !root.HasError() { 73 return root, nil 74 } 75 76 log.Printf("WARNING: failed to parse %q. The resulting BUILD target may be incorrect.", path) 77 78 // Note: we intentionally do not return an error even when root.HasError because the parse 79 // failure may be in some part of the code that Gazelle doesn't care about. 80 verbose, envExists := os.LookupEnv("RULES_PYTHON_GAZELLE_VERBOSE") 81 if !envExists || verbose != "1" { 82 return root, nil 83 } 84 85 for i := 0; i < int(root.ChildCount()); i++ { 86 child := root.Child(i) 87 if child.IsError() { 88 // Example logs: 89 // gazelle: Parse error at {Row:1 Column:0}: 90 // def search_one_more_level[T](): 91 log.Printf("Parse error at %+v:\n%+v", child.StartPoint(), child.Content(code)) 92 // Log the internal tree-sitter representation of what was parsed. Eg: 93 // gazelle: The above was parsed as: (ERROR (identifier) (call function: (list (identifier)) arguments: (argument_list))) 94 log.Printf("The above was parsed as: %v", child.String()) 95 } 96 } 97 98 return root, nil 99} 100 101// parseMain returns true if the python file has an `if __name__ == "__main__":` block, 102// which is a common idiom for python scripts/binaries. 103func (p *FileParser) parseMain(ctx context.Context, node *sitter.Node) bool { 104 for i := 0; i < int(node.ChildCount()); i++ { 105 if err := ctx.Err(); err != nil { 106 return false 107 } 108 child := node.Child(i) 109 if child.Type() == sitterNodeTypeIfStatement && 110 child.Child(1).Type() == sitterNodeTypeComparisonOperator && child.Child(1).Child(1).Type() == "==" { 111 statement := child.Child(1) 112 a, b := statement.Child(0), statement.Child(2) 113 // convert "'__main__' == __name__" to "__name__ == '__main__'" 114 if b.Type() == sitterNodeTypeIdentifier { 115 a, b = b, a 116 } 117 if a.Type() == sitterNodeTypeIdentifier && a.Content(p.code) == "__name__" && 118 // at github.com/smacker/go-tree-sitter@latest (after v0.0.0-20240422154435-0628b34cbf9c we used) 119 // "__main__" is the second child of b. But now, it isn't. 120 // we cannot use the latest go-tree-sitter because of the top level reference in scanner.c. 121 // https://github.com/smacker/go-tree-sitter/blob/04d6b33fe138a98075210f5b770482ded024dc0f/python/scanner.c#L1 122 b.Type() == sitterNodeTypeString && string(p.code[b.StartByte()+1:b.EndByte()-1]) == "__main__" { 123 return true 124 } 125 } 126 } 127 return false 128} 129 130// parseImportStatement parses a node for an import statement, returning a `module` and a boolean 131// representing if the parse was OK or not. 132func parseImportStatement(node *sitter.Node, code []byte) (module, bool) { 133 switch node.Type() { 134 case sitterNodeTypeDottedName: 135 return module{ 136 Name: node.Content(code), 137 LineNumber: node.StartPoint().Row + 1, 138 }, true 139 case sitterNodeTypeAliasedImport: 140 return parseImportStatement(node.Child(0), code) 141 case sitterNodeTypeWildcardImport: 142 return module{ 143 Name: "*", 144 LineNumber: node.StartPoint().Row + 1, 145 }, true 146 } 147 return module{}, false 148} 149 150// parseImportStatements parses a node for import statements, returning true if the node is 151// an import statement. It updates FileParser.output.Modules with the `module` that the 152// import represents. 153func (p *FileParser) parseImportStatements(node *sitter.Node) bool { 154 if node.Type() == sitterNodeTypeImportStatement { 155 for j := 1; j < int(node.ChildCount()); j++ { 156 m, ok := parseImportStatement(node.Child(j), p.code) 157 if !ok { 158 continue 159 } 160 m.Filepath = p.relFilepath 161 if strings.HasPrefix(m.Name, ".") { 162 continue 163 } 164 p.output.Modules = append(p.output.Modules, m) 165 } 166 } else if node.Type() == sitterNodeTypeImportFromStatement { 167 from := node.Child(1).Content(p.code) 168 if strings.HasPrefix(from, ".") { 169 return true 170 } 171 for j := 3; j < int(node.ChildCount()); j++ { 172 m, ok := parseImportStatement(node.Child(j), p.code) 173 if !ok { 174 continue 175 } 176 m.Filepath = p.relFilepath 177 m.From = from 178 m.Name = fmt.Sprintf("%s.%s", from, m.Name) 179 p.output.Modules = append(p.output.Modules, m) 180 } 181 } else { 182 return false 183 } 184 return true 185} 186 187// parseComments parses a node for comments, returning true if the node is a comment. 188// It updates FileParser.output.Comments with the parsed comment. 189func (p *FileParser) parseComments(node *sitter.Node) bool { 190 if node.Type() == sitterNodeTypeComment { 191 p.output.Comments = append(p.output.Comments, comment(node.Content(p.code))) 192 return true 193 } 194 return false 195} 196 197func (p *FileParser) SetCodeAndFile(code []byte, relPackagePath, filename string) { 198 p.code = code 199 p.relFilepath = filepath.Join(relPackagePath, filename) 200 p.output.FileName = filename 201} 202 203func (p *FileParser) parse(ctx context.Context, node *sitter.Node) { 204 if node == nil { 205 return 206 } 207 for i := 0; i < int(node.ChildCount()); i++ { 208 if err := ctx.Err(); err != nil { 209 return 210 } 211 child := node.Child(i) 212 if p.parseImportStatements(child) { 213 continue 214 } 215 if p.parseComments(child) { 216 continue 217 } 218 p.parse(ctx, child) 219 } 220} 221 222func (p *FileParser) Parse(ctx context.Context) (*ParserOutput, error) { 223 rootNode, err := ParseCode(p.code, p.relFilepath) 224 if err != nil { 225 return nil, err 226 } 227 228 p.output.HasMain = p.parseMain(ctx, rootNode) 229 230 p.parse(ctx, rootNode) 231 return &p.output, nil 232} 233 234func (p *FileParser) ParseFile(ctx context.Context, repoRoot, relPackagePath, filename string) (*ParserOutput, error) { 235 code, err := os.ReadFile(filepath.Join(repoRoot, relPackagePath, filename)) 236 if err != nil { 237 return nil, err 238 } 239 p.SetCodeAndFile(code, relPackagePath, filename) 240 return p.Parse(ctx) 241} 242