# Copyright 2015 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from __future__ import print_function

import parser
import symbol
import sys
import token
import tokenize

from py_utils.refactor import offset_token


class Snippet(object):
  """A node in the Python parse tree.

  The Python grammar is defined at:
  https://docs.python.org/2/reference/grammar.html

  There are two types of Snippets:
    TokenSnippets are leaf nodes containing actual text.
    Symbols are internal nodes representing higher-level groupings, and are
        defined by the left-hand sides of the BNFs in the above link.
  """

  @property
  def type(self):
    """Return the integer type of this node. Subclasses must override."""
    raise NotImplementedError()

  @property
  def type_name(self):
    """Return the printable name of this node's type. Subclasses override."""
    raise NotImplementedError()

  @property
  def children(self):
    """Return a list of this node's children."""
    raise NotImplementedError()

  @property
  def tokens(self):
    """Return a tuple of the tokens this Snippet contains."""
    raise NotImplementedError()

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Spew a pretty-printed parse tree. Mostly useful for debugging."""
    raise NotImplementedError()

  def __str__(self):
    """Return the text produced by untokenizing this node's tokens."""
    return offset_token.Untokenize(self.tokens)

  def FindAll(self, snippet_type):
    """Yield this node and every descendant that matches snippet_type.

    The node itself is tested first, then each child's subtree is searched in
    order (pre-order traversal).

    Args:
      snippet_type: Either an int, compared against each node's `type`, or a
          class (or tuple of classes), tested with isinstance().

    Yields:
      Every matching Snippet in the subtree rooted at this node.
    """
    if isinstance(snippet_type, int):
      if self.type == snippet_type:
        yield self
    else:
      if isinstance(self, snippet_type):
        yield self

    for child in self.children:
      for snippet in child.FindAll(snippet_type):
        yield snippet

  def FindChild(self, snippet_type, **kwargs):
    """Return the first direct child matching a type and attribute values.

    Args:
      snippet_type: Either an int, compared against each child's `type`, or a
          class (or tuple of classes), tested with isinstance().
      **kwargs: Optional attribute filters. A child matches only if, for every
          `name=value` pair, `getattr(child, name) == value`.

    Returns:
      The first child, in order, that satisfies all the conditions.

    Raises:
      ValueError: If no direct child matches.
    """
    for child in self.children:
      if isinstance(snippet_type, int):
        if child.type != snippet_type:
          continue
      else:
        if not isinstance(child, snippet_type):
          continue

      # Iterate over items(): iterating the dict itself yields only the keys,
      # which cannot be unpacked into an (attribute, value) pair.
      for attribute, value in kwargs.items():
        if getattr(child, attribute) != value:
          break
      else:
        # No filter rejected the child (or there were no filters at all).
        return child
    raise ValueError('%s is not in %s. Children are: %s' %
                     (snippet_type, self, self.children))

  def FindChildren(self, snippet_type):
    """Yield every direct child that matches snippet_type.

    Args:
      snippet_type: Either an int, compared against each child's `type`, or a
          class (or tuple of classes), tested with isinstance().

    Yields:
      The matching children, in order. Descendants are not searched.
    """
    if isinstance(snippet_type, int):
      for child in self.children:
        if child.type == snippet_type:
          yield child
    else:
      for child in self.children:
        if isinstance(child, snippet_type):
          yield child


class TokenSnippet(Snippet):
  """A Snippet containing a list of tokens.

  A list of tokens may start with any number of comments and non-terminating
  newlines, but must end with a syntactically meaningful token.
  """

  def __init__(self, token_type, tokens):
    """Store the snippet type and its token list; start out unmodified."""
    # For operators and delimiters, the TokenSnippet's type may be more specific
    # than the type of the constituent token. E.g. the TokenSnippet type is
    # token.DOT, but the token type is token.OP. This is because the parser
    # has more context than the tokenizer.
    self._type = token_type
    self._tokens = tokens
    self._modified = False

  @classmethod
  def Create(cls, token_type, string, offset=(0, 0)):
    """Build a snippet holding one new OffsetToken with the given fields."""
    return cls(token_type,
               [offset_token.OffsetToken(token_type, string, offset)])

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    """Return the name of this snippet's type from token.tok_name."""
    return token.tok_name[self.type]

  @property
  def value(self):
    """Return the string of the last token in this snippet."""
    return self._tokens[-1].string

  @value.setter
  def value(self, value):
    """Replace the last token's string and mark this snippet as modified."""
    self._tokens[-1].string = value
    self._modified = True

  @property
  def children(self):
    """Return an empty list; a TokenSnippet has no children."""
    return []

  @property
  def tokens(self):
    """Return the stored tokens as a tuple."""
    return tuple(self._tokens)

  @property
  def modified(self):
    """Return True once `value` has been assigned through its setter."""
    return self._modified

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this snippet's type name and token strings to stream.

    The first token is printed next to the type name; each further token is
    printed on its own line, padded to line up under the first one.

    Args:
      indent: Number of spaces written before each line.
      stream: File-like object to write to.
    """
    stream.write(' ' * indent)
    if not self.tokens:
      print(self.type_name, file=stream)
      return

    print('%-4s' % self.type_name, repr(self.tokens[0].string), file=stream)
    for tok in self.tokens[1:]:
      stream.write(' ' * indent)
      # Pad with the width the type name occupied on the first line.
      print(' ' * max(len(self.type_name), 4), repr(tok.string), file=stream)


class Symbol(Snippet):
  """A Snippet containing sub-Snippets.

  The possible types and type_names are defined in Python's symbol module."""

  def __init__(self, symbol_type, children):
    """Store the symbol type and the sequence of child Snippets."""
    self._type = symbol_type
    self._children = children

  @property
  def type(self):
    return self._type

  @property
  def type_name(self):
    """Return the name of this symbol's type from symbol.sym_name."""
    return symbol.sym_name[self.type]

  @property
  def children(self):
    return self._children

  @children.setter
  def children(self, value):  # pylint: disable=arguments-differ
    self._children = value

  @property
  def tokens(self):
    """Return the tokens of all children, concatenated in child order."""
    tokens = []
    for child in self.children:
      tokens += child.tokens
    return tuple(tokens)

  @property
  def modified(self):
    """Return True if any child reports itself as modified."""
    return any(child.modified for child in self.children)

  def PrintTree(self, indent=0, stream=sys.stdout):
    """Write this subtree to stream, one node per line.

    Children are printed two columns deeper than their parent.

    Args:
      indent: Number of spaces written before this node's line.
      stream: File-like object to write to.
    """
    stream.write(' ' * indent)

    # If there's only one child, collapse it onto the same line.
    node = self
    while len(node.children) == 1 and len(node.children[0].children) == 1:
      print(node.type_name, end=' ', file=stream)
      node = node.children[0]

    print(node.type_name, file=stream)
    for child in node.children:
      child.PrintTree(indent + 2, stream)


def Snippetize(f):
  """Return the syntax tree of the given file.

  Args:
    f: A file object supporting seek() and read(). It is rewound, parsed with
        the parser module, and also passed to offset_token.Tokenize().

  Returns:
    The root Snippet of the tree built from the file's contents.
  """
  f.seek(0)
  syntax_tree = parser.st2list(parser.suite(f.read()))
  tokens = offset_token.Tokenize(f)

  snippet = _SnippetizeNode(syntax_tree, tokens)
  # Every token must have been attached to some node of the tree.
  assert not tokens
  return snippet


def _SnippetizeNode(node, tokens):
  """Convert one parser syntax-tree node, and its subtree, into Snippets.

  Args:
    node: A syntax-tree node in list form: node[0] is the node type and, for
        symbols, node[1:] are the child nodes.
    tokens: The remaining tokens, consumed from the left as they are attached
        to leaf nodes. Must support popleft() (presumably a collections.deque
        returned by offset_token.Tokenize -- confirm against that module).

  Returns:
    A Symbol for non-terminal nodes, or a TokenSnippet for terminal nodes.
  """
  # The parser module gives a syntax tree that discards comments,
  # non-terminating newlines, and whitespace information. Use the tokens given
  # by the tokenize module to annotate the syntax tree with the information
  # needed to exactly reproduce the original source code.
  node_type = node[0]

  if node_type >= token.NT_OFFSET:
    # Symbol.
    children = tuple(_SnippetizeNode(child, tokens) for child in node[1:])
    return Symbol(node_type, children)
  else:
    # Token.
    grabbed_tokens = []
    # Leading comments and non-terminating newlines belong to this leaf.
    while tokens and (
        tokens[0].type == tokenize.COMMENT or tokens[0].type == tokenize.NL):
      grabbed_tokens.append(tokens.popleft())

    # parser has 2 NEWLINEs right before the end.
    # tokenize has 0 or 1 depending on if the file has one.
    # Create extra nodes without consuming tokens to account for this.
    if node_type == token.NEWLINE:
      for tok in tokens:
        if tok.type == token.ENDMARKER:
          return TokenSnippet(node_type, grabbed_tokens)
        if tok.type != token.DEDENT:
          break

    # The tokenizer reports operators and delimiters as the generic OP type,
    # while the parser node carries the more specific type.
    assert tokens[0].type == token.OP or node_type == tokens[0].type

    grabbed_tokens.append(tokens.popleft())
    return TokenSnippet(node_type, grabbed_tokens)