# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Comment splicer for lib2to3 trees.

The lib2to3 syntax tree produced by the parser holds comments and whitespace in
prefix attributes of nodes, rather than nodes themselves. This module provides
functionality to splice comments out of prefixes and into nodes of their own,
making them easier to process.

  SpliceComments(): the main function exported by this module.
"""

from lib2to3 import pygram
from lib2to3 import pytree
from lib2to3.pgen2 import token

from yapf.yapflib import pytree_utils


def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
      function.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Recursively visit each node to splice comments into the AST.

    Args:
      node: the pytree node whose children are examined. New comment nodes may
        be inserted into the tree while it is being visited.
    """
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # Clear the prefix now that we hold a copy of it. Mopping up the
          # prefix is important because we may go over this same child in the
          # next iteration...
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of indentation
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                # A comment at a shallower column starts a new group.
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                insert_nodes = pytree_utils.InsertNodesBefore
              else:
                insert_nodes = pytree_utils.InsertNodesAfter
              insert_nodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif leaf_in_parent is child:
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)

                if pytree_utils.NodeName(
                    node_with_line_parent.parent) in {'funcdef', 'classdef'}:
                  # Keep a comment that's not attached to a function or class
                  # next to the object it is attached to.
                  comment_end = (
                      comment_lineno + comment_prefix.rstrip('\n').count('\n'))
                  if comment_end < node_with_line_parent.lineno - 1:
                    node_with_line_parent = node_with_line_parent.parent

                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                # The first non-NEWLINE leaf of the statement is not child, so
                # the comment sits inside the statement.
                if comment_lineno == prev_leaf[0].lineno:
                  # The first prefix line starts on the previous leaf's line:
                  # splice it out as a trailing comment right after that leaf.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # Column of the remaining comment: the leading whitespace of
                # the last line of the prefix (trailing whitespace ignored).
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) -
                    len(comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        prev_leaf[0] = child

  _VisitNodeRec(tree)


def _CreateCommentsFromPrefix(comment_prefix,
                              comment_lineno,
                              comment_column,
                              standalone=False):
  """Create pytree nodes to represent the given comment prefix.

  Args:
    comment_prefix: (unicode) the text of the comment from the node's prefix.
    comment_lineno: (int) the line number for the start of the comment.
    comment_column: (int) the column for the start of the comment.
    standalone: (bool) determines if the comment is standalone or not.

  Returns:
    The simple_stmt nodes if this is a standalone comment, otherwise a list of
    new COMMENT leafs. The prefix may consist of multiple comment blocks,
    separated by blank lines. Each block gets its own leaf.
  """
  # The comment is stored in the prefix attribute, with no lineno of its
  # own. So we only know at which line it ends. To find out at which line it
  # starts, look at how many newlines the comment itself contains.
  comments = []

  lines = comment_prefix.split('\n')
  index = 0
  while index < len(lines):
    block_start = index

    # Gather a run of consecutive comment lines. Every entry is stripped as it
    # is collected, so no further stripping of the block is needed.
    comment_block = []
    while index < len(lines) and lines[index].lstrip().startswith('#'):
      comment_block.append(lines[index].strip())
      index += 1

    if comment_block:
      # 'index' is one past the block, so this is the block's last line.
      new_lineno = comment_lineno + index - 1
      comment_leaf = pytree.Leaf(
          type=token.COMMENT,
          value='\n'.join(comment_block),
          context=('', (new_lineno, comment_column)))
      comment_node = comment_leaf if not standalone else pytree.Node(
          pygram.python_symbols.simple_stmt, [comment_leaf])
      comments.append(comment_node)

    # Skip the blank lines separating this block from the next one.
    while index < len(lines) and not lines[index].lstrip():
      index += 1

    if index == block_start:
      # The line is neither a comment nor blank. Step over it so the scan is
      # guaranteed to make progress instead of looping forever.
      index += 1

  return comments


# "Standalone line nodes" are tree nodes that have to start a new line in Python
# code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as
# parents of other nodes but can come later in a line. This is a list of
# standalone line nodes in the grammar. It is meant to be exhaustive
# *eventually*, and we'll modify it with time as we discover more corner cases
# in the parse tree.
#
# When splicing a standalone comment (i.e. a comment that appears on its own
# line, not on the same line with other code), it's important to insert it into
# an appropriate parent of the node it's attached to. An appropriate parent
# is the first "standalone line node" in the parent chain of a node.
_STANDALONE_LINE_NODES = frozenset([
    'suite', 'if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt',
    'funcdef', 'classdef', 'decorated', 'file_input'
])


def _FindNodeWithStandaloneLineParent(node):
  """Find a node whose parent is a 'standalone line' node.

  See the comment above _STANDALONE_LINE_NODES for more details.

  Arguments:
    node: node to start from

  Returns:
    Suitable node that's either the node itself or one of its ancestors.
  """
  # This is guaranteed to terminate because 'file_input' is the root node of
  # any pytree.
  while pytree_utils.NodeName(node.parent) not in _STANDALONE_LINE_NODES:
    node = node.parent
  return node


# "Statement nodes" are standalone statements. They don't have to start a new
# line.
_STATEMENT_NODES = frozenset(['simple_stmt']) | _STANDALONE_LINE_NODES


def _FindStmtParent(node):
  """Find the nearest parent of node that is a statement node.

  Arguments:
    node: node to start from

  Returns:
    Nearest parent (or node itself, if suitable).
  """
  while pytree_utils.NodeName(node) not in _STATEMENT_NODES:
    node = node.parent
  return node


def _FindAncestorAtIndent(node, indent):
  """Find an ancestor of node with the given indentation.

  Arguments:
    node: node to start from. This must not be the tree root.
    indent: indentation string for the ancestor we're looking for.
      See _AnnotateIndents for more details.

  Returns:
    An ancestor node with suitable indentation. If no suitable ancestor is
    found, the closest ancestor to the tree root is returned.
  """
  # Stop once the parent is the tree root: there's nowhere else to go.
  while node.parent.parent is not None:
    # If the parent has an indent annotation, and that annotation is a prefix
    # of the requested indent, this is a suitable ancestor.
    # The reason for a prefix match rather than equality is that comments may
    # be improperly indented (i.e. by three spaces, where surrounding
    # statements have either zero or two or four), and we don't want to
    # propagate them all the way to the root.
    parent_indent = pytree_utils.GetNodeAnnotation(
        node.parent, pytree_utils.Annotation.CHILD_INDENT)
    if parent_indent is not None and indent.startswith(parent_indent):
      return node
    # Keep looking up the tree.
    node = node.parent
  return node


def _AnnotateIndents(tree):
  """Annotate the tree with child_indent annotations.

  A child_indent annotation on a node specifies the indentation (as a string,
  like "  ") of its children. It is inferred from the INDENT child of a node.

  Arguments:
    tree: root of a pytree. The pytree is modified to add annotations to nodes.

  Raises:
    RuntimeError: if the tree is malformed.
  """
  # Annotate the root of the tree with zero indent.
  if tree.parent is None:
    pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                   '')
  for child in tree.children:
    if child.type == token.INDENT:
      child_indent = pytree_utils.GetNodeAnnotation(
          tree, pytree_utils.Annotation.CHILD_INDENT)
      # A node that already carries an annotation must agree with its INDENT
      # child; anything else means the tree is malformed.
      if child_indent is not None and child_indent != child.value:
        raise RuntimeError('inconsistent indentation for child', (tree, child))
      pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                     child.value)
    _AnnotateIndents(child)