1# Copyright 2015 Google Inc. All Rights Reserved. 2# 3# Licensed under the Apache License, Version 2.0 (the "License"); 4# you may not use this file except in compliance with the License. 5# You may obtain a copy of the License at 6# 7# http://www.apache.org/licenses/LICENSE-2.0 8# 9# Unless required by applicable law or agreed to in writing, software 10# distributed under the License is distributed on an "AS IS" BASIS, 11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12# See the License for the specific language governing permissions and 13# limitations under the License. 14"""Comment splicer for lib2to3 trees. 15 16The lib2to3 syntax tree produced by the parser holds comments and whitespace in 17prefix attributes of nodes, rather than nodes themselves. This module provides 18functionality to splice comments out of prefixes and into nodes of their own, 19making them easier to process. 20 21 SpliceComments(): the main function exported by this module. 22""" 23 24from lib2to3 import pygram 25from lib2to3 import pytree 26from lib2to3.pgen2 import token 27 28from yapf.yapflib import pytree_utils 29 30 31def SpliceComments(tree): 32 """Given a pytree, splice comments into nodes of their own right. 33 34 Extract comments from the prefixes where they are housed after parsing. 35 The prefixes that previously housed the comments become empty. 36 37 Args: 38 tree: a pytree.Node - the tree to work on. The tree is modified by this 39 function. 40 """ 41 # The previous leaf node encountered in the traversal. 42 # This is a list because Python 2.x doesn't have 'nonlocal' :) 43 prev_leaf = [None] 44 _AnnotateIndents(tree) 45 46 def _VisitNodeRec(node): 47 # This loop may insert into node.children, so we'll iterate over a copy. 48 for child in node.children[:]: 49 if isinstance(child, pytree.Node): 50 # Nodes don't have prefixes. 51 _VisitNodeRec(child) 52 else: 53 if child.prefix.lstrip().startswith('#'): 54 # We have a comment prefix in this child, so splicing is needed. 55 comment_prefix = child.prefix 56 comment_lineno = child.lineno - comment_prefix.count('\n') 57 comment_column = child.column 58 59 # Remember the leading indentation of this prefix and clear it. 60 # Mopping up the prefix is important because we may go over this same 61 # child in the next iteration... 62 child_prefix = child.prefix.lstrip('\n') 63 prefix_indent = child_prefix[:child_prefix.find('#')] 64 if '\n' in prefix_indent: 65 prefix_indent = prefix_indent[prefix_indent.rfind('\n') + 1:] 66 child.prefix = '' 67 68 if child.type == token.NEWLINE: 69 # If the prefix was on a NEWLINE leaf, it's part of the line so it 70 # will be inserted after the previously encountered leaf. 71 # We can't just insert it before the NEWLINE node, because as a 72 # result of the way pytrees are organized, this node can be under 73 # an inappropriate parent. 74 comment_column -= len(comment_prefix.lstrip()) 75 pytree_utils.InsertNodesAfter( 76 _CreateCommentsFromPrefix( 77 comment_prefix, 78 comment_lineno, 79 comment_column, 80 standalone=False), prev_leaf[0]) 81 elif child.type == token.DEDENT: 82 # Comment prefixes on DEDENT nodes also deserve special treatment, 83 # because their final placement depends on their prefix. 84 # We'll look for an ancestor of this child with a matching 85 # indentation, and insert the comment before it if the ancestor is 86 # on a DEDENT node and after it otherwise. 87 # 88 # lib2to3 places comments that should be separated into the same 89 # DEDENT node. For example, "comment 1" and "comment 2" will be 90 # combined. 91 # 92 # def _(): 93 # for x in y: 94 # pass 95 # # comment 1 96 # 97 # # comment 2 98 # pass 99 # 100 # In this case, we need to split them up ourselves. 101 102 # Split into groups of comments at decreasing levels of indentation 103 comment_groups = [] 104 comment_column = None 105 for cmt in comment_prefix.split('\n'): 106 col = cmt.find('#') 107 if col < 0: 108 if comment_column is None: 109 # Skip empty lines at the top of the first comment group 110 comment_lineno += 1 111 continue 112 elif comment_column is None or col < comment_column: 113 comment_column = col 114 comment_indent = cmt[:comment_column] 115 comment_groups.append((comment_column, comment_indent, [])) 116 comment_groups[-1][-1].append(cmt) 117 118 # Insert a node for each group 119 for comment_column, comment_indent, comment_group in comment_groups: 120 ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent) 121 if ancestor_at_indent.type == token.DEDENT: 122 InsertNodes = pytree_utils.InsertNodesBefore # pylint: disable=invalid-name 123 else: 124 InsertNodes = pytree_utils.InsertNodesAfter # pylint: disable=invalid-name 125 InsertNodes( 126 _CreateCommentsFromPrefix( 127 '\n'.join(comment_group) + '\n', 128 comment_lineno, 129 comment_column, 130 standalone=True), ancestor_at_indent) 131 comment_lineno += len(comment_group) 132 else: 133 # Otherwise there are two cases. 134 # 135 # 1. The comment is on its own line 136 # 2. The comment is part of an expression. 137 # 138 # Unfortunately, it's fairly difficult to distinguish between the 139 # two in lib2to3 trees. The algorithm here is to determine whether 140 # child is the first leaf in the statement it belongs to. If it is, 141 # then the comment (which is a prefix) belongs on a separate line. 142 # If it is not, it means the comment is buried deep in the statement 143 # and is part of some expression. 144 stmt_parent = _FindStmtParent(child) 145 146 for leaf_in_parent in stmt_parent.leaves(): 147 if leaf_in_parent.type == token.NEWLINE: 148 continue 149 elif id(leaf_in_parent) == id(child): 150 # This comment stands on its own line, and it has to be inserted 151 # into the appropriate parent. We'll have to find a suitable 152 # parent to insert into. See comments above 153 # _STANDALONE_LINE_NODES for more details. 154 node_with_line_parent = _FindNodeWithStandaloneLineParent(child) 155 pytree_utils.InsertNodesBefore( 156 _CreateCommentsFromPrefix( 157 comment_prefix, comment_lineno, 0, standalone=True), 158 node_with_line_parent) 159 break 160 else: 161 if comment_lineno == prev_leaf[0].lineno: 162 comment_lines = comment_prefix.splitlines() 163 value = comment_lines[0].lstrip() 164 if value.rstrip('\n'): 165 comment_column = prev_leaf[0].column 166 comment_column += len(prev_leaf[0].value) 167 comment_column += ( 168 len(comment_lines[0]) - len(comment_lines[0].lstrip())) 169 comment_leaf = pytree.Leaf( 170 type=token.COMMENT, 171 value=value.rstrip('\n'), 172 context=('', (comment_lineno, comment_column))) 173 pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0]) 174 comment_prefix = '\n'.join(comment_lines[1:]) 175 comment_lineno += 1 176 177 rindex = (0 if '\n' not in comment_prefix.rstrip() else 178 comment_prefix.rstrip().rindex('\n') + 1) 179 comment_column = ( 180 len(comment_prefix[rindex:]) - len( 181 comment_prefix[rindex:].lstrip())) 182 comments = _CreateCommentsFromPrefix( 183 comment_prefix, 184 comment_lineno, 185 comment_column, 186 standalone=False) 187 pytree_utils.InsertNodesBefore(comments, child) 188 break 189 190 prev_leaf[0] = child 191 192 _VisitNodeRec(tree) 193 194 195def _CreateCommentsFromPrefix(comment_prefix, 196 comment_lineno, 197 comment_column, 198 standalone=False): 199 """Create pytree nodes to represent the given comment prefix. 200 201 Args: 202 comment_prefix: (unicode) the text of the comment from the node's prefix. 203 comment_lineno: (int) the line number for the start of the comment. 204 comment_column: (int) the column for the start of the comment. 205 standalone: (bool) determines if the comment is standalone or not. 206 207 Returns: 208 The simple_stmt nodes if this is a standalone comment, otherwise a list of 209 new COMMENT leafs. The prefix may consist of multiple comment blocks, 210 separated by blank lines. Each block gets its own leaf. 211 """ 212 # The comment is stored in the prefix attribute, with no lineno of its 213 # own. So we only know at which line it ends. To find out at which line it 214 # starts, look at how many newlines the comment itself contains. 215 comments = [] 216 217 lines = comment_prefix.split('\n') 218 index = 0 219 while index < len(lines): 220 comment_block = [] 221 while index < len(lines) and lines[index].lstrip().startswith('#'): 222 comment_block.append(lines[index].strip()) 223 index += 1 224 225 if comment_block: 226 new_lineno = comment_lineno + index - 1 227 comment_block[0] = comment_block[0].strip() 228 comment_block[-1] = comment_block[-1].strip() 229 comment_leaf = pytree.Leaf( 230 type=token.COMMENT, 231 value='\n'.join(comment_block), 232 context=('', (new_lineno, comment_column))) 233 comment_node = comment_leaf if not standalone else pytree.Node( 234 pygram.python_symbols.simple_stmt, [comment_leaf]) 235 comments.append(comment_node) 236 237 while index < len(lines) and not lines[index].lstrip(): 238 index += 1 239 240 return comments 241 242 243# "Standalone line nodes" are tree nodes that have to start a new line in Python 244# code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as 245# parents of other nodes but can come later in a line. This is a list of 246# standalone line nodes in the grammar. It is meant to be exhaustive 247# *eventually*, and we'll modify it with time as we discover more corner cases 248# in the parse tree. 249# 250# When splicing a standalone comment (i.e. a comment that appears on its own 251# line, not on the same line with other code), it's important to insert it into 252# an appropriate parent of the node it's attached to. An appropriate parent 253# is the first "standaline line node" in the parent chain of a node. 254_STANDALONE_LINE_NODES = frozenset([ 255 'suite', 'if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt', 256 'funcdef', 'classdef', 'decorated', 'file_input' 257]) 258 259 260def _FindNodeWithStandaloneLineParent(node): 261 """Find a node whose parent is a 'standalone line' node. 262 263 See the comment above _STANDALONE_LINE_NODES for more details. 264 265 Arguments: 266 node: node to start from 267 268 Returns: 269 Suitable node that's either the node itself or one of its ancestors. 270 """ 271 if pytree_utils.NodeName(node.parent) in _STANDALONE_LINE_NODES: 272 return node 273 else: 274 # This is guaranteed to terminate because 'file_input' is the root node of 275 # any pytree. 276 return _FindNodeWithStandaloneLineParent(node.parent) 277 278 279# "Statement nodes" are standalone statements. The don't have to start a new 280# line. 281_STATEMENT_NODES = frozenset(['simple_stmt']) | _STANDALONE_LINE_NODES 282 283 284def _FindStmtParent(node): 285 """Find the nearest parent of node that is a statement node. 286 287 Arguments: 288 node: node to start from 289 290 Returns: 291 Nearest parent (or node itself, if suitable). 292 """ 293 if pytree_utils.NodeName(node) in _STATEMENT_NODES: 294 return node 295 else: 296 return _FindStmtParent(node.parent) 297 298 299def _FindAncestorAtIndent(node, indent): 300 """Find an ancestor of node with the given indentation. 301 302 Arguments: 303 node: node to start from. This must not be the tree root. 304 indent: indentation string for the ancestor we're looking for. 305 See _AnnotateIndents for more details. 306 307 Returns: 308 An ancestor node with suitable indentation. If no suitable ancestor is 309 found, the closest ancestor to the tree root is returned. 310 """ 311 if node.parent.parent is None: 312 # Our parent is the tree root, so there's nowhere else to go. 313 return node 314 315 # If the parent has an indent annotation, and it's shorter than node's 316 # indent, this is a suitable ancestor. 317 # The reason for "shorter" rather than "equal" is that comments may be 318 # improperly indented (i.e. by three spaces, where surrounding statements 319 # have either zero or two or four), and we don't want to propagate them all 320 # the way to the root. 321 parent_indent = pytree_utils.GetNodeAnnotation( 322 node.parent, pytree_utils.Annotation.CHILD_INDENT) 323 if parent_indent is not None and indent.startswith(parent_indent): 324 return node 325 else: 326 # Keep looking up the tree. 327 return _FindAncestorAtIndent(node.parent, indent) 328 329 330def _AnnotateIndents(tree): 331 """Annotate the tree with child_indent annotations. 332 333 A child_indent annotation on a node specifies the indentation (as a string, 334 like " ") of its children. It is inferred from the INDENT child of a node. 335 336 Arguments: 337 tree: root of a pytree. The pytree is modified to add annotations to nodes. 338 339 Raises: 340 RuntimeError: if the tree is malformed. 341 """ 342 # Annotate the root of the tree with zero indent. 343 if tree.parent is None: 344 pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT, 345 '') 346 for child in tree.children: 347 if child.type == token.INDENT: 348 child_indent = pytree_utils.GetNodeAnnotation( 349 tree, pytree_utils.Annotation.CHILD_INDENT) 350 if child_indent is not None and child_indent != child.value: 351 raise RuntimeError('inconsistent indentation for child', (tree, child)) 352 pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT, 353 child.value) 354 _AnnotateIndents(child) 355