# Copyright 2015 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Comment splicer for lib2to3 trees.

The lib2to3 syntax tree produced by the parser holds comments and whitespace in
prefix attributes of nodes, rather than nodes themselves. This module provides
functionality to splice comments out of prefixes and into nodes of their own,
making them easier to process.

  SpliceComments(): the main function exported by this module.
"""

from lib2to3 import pygram
from lib2to3 import pytree
from lib2to3.pgen2 import token

from yapf.yapflib import pytree_utils


def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  Three situations are distinguished, based on the leaf carrying the prefix:

    * NEWLINE leaf: the comment trails the current line and is inserted after
      the previously visited leaf.
    * DEDENT leaf: the prefix may hold several comment groups at decreasing
      indentation; each group is placed relative to the ancestor found for
      that indentation.
    * Any other leaf: the comment is standalone when the leaf is the first
      (non-NEWLINE) leaf of its statement, and otherwise belongs inside an
      expression.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
      function.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Recursively visit each node to splice comments into the AST."""
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          # The leaf's lineno is where the leaf itself sits; the prefix starts
          # as many lines earlier as it contains newlines.
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # Clear the prefix now that its text is saved in comment_prefix.
          # Mopping up the prefix is important because we may go over this
          # same child in the next iteration...
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of indentation
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                # A comment at a shallower column than anything seen so far
                # opens a new group.
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              # Comment lines (and blank lines following a comment) are
              # collected into the most recently opened group.
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                insert_nodes = pytree_utils.InsertNodesBefore
              else:
                insert_nodes = pytree_utils.InsertNodesAfter
              insert_nodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              # The next group starts right after the lines of this one.
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            # Only the first non-NEWLINE leaf of the statement is examined:
            # both remaining branches end with 'break'.
            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif leaf_in_parent is child:
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)

                if pytree_utils.NodeName(
                    node_with_line_parent.parent) in {'funcdef', 'classdef'}:
                  # Keep a comment that's not attached to a function or class
                  # next to the object it is attached to.
                  # If the comment ends more than one line above the node,
                  # the insertion point moves up to the funcdef/classdef
                  # itself.
                  comment_end = (
                      comment_lineno + comment_prefix.rstrip('\n').count('\n'))
                  if comment_end < node_with_line_parent.lineno - 1:
                    node_with_line_parent = node_with_line_parent.parent

                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                if comment_lineno == prev_leaf[0].lineno:
                  # The prefix starts on the same line as the previous leaf,
                  # so its first line is a trailing comment of that leaf and
                  # gets a COMMENT leaf of its own right after it.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    # Column = end of the previous leaf plus the whitespace
                    # that preceded the comment text.
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    # The remaining lines are handled below.
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # Offset of the last line of the prefix (trailing whitespace
                # ignored); the width of that line's leading whitespace is
                # used as the comment column.
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) -
                    len(comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        prev_leaf[0] = child

  _VisitNodeRec(tree)


def _CreateCommentsFromPrefix(comment_prefix,
                              comment_lineno,
                              comment_column,
                              standalone=False):
  """Create pytree nodes to represent the given comment prefix.

  Args:
    comment_prefix: (unicode) the text of the comment from the node's prefix.
    comment_lineno: (int) the line number for the start of the comment.
    comment_column: (int) the column for the start of the comment.
    standalone: (bool) determines if the comment is standalone or not.

  Returns:
    A list with one entry per comment block. The prefix may consist of
    multiple comment blocks, separated by blank lines. Each block gets its own
    COMMENT leaf; for a standalone comment each leaf is additionally wrapped
    in a simple_stmt node. The list is empty if the prefix has no comment
    lines.
  """
  # The comment is stored in the prefix attribute, with no lineno of its
  # own. So we only know at which line it ends. To find out at which line it
  # starts, look at how many newlines the comment itself contains.
  comments = []

  lines = comment_prefix.split('\n')
  index = 0
  while index < len(lines):
    # Gather a run of consecutive comment lines. Every line is stripped as it
    # is collected, so no further trimming of the block is needed.
    comment_block = []
    while index < len(lines) and lines[index].lstrip().startswith('#'):
      comment_block.append(lines[index].strip())
      index += 1

    if comment_block:
      # 'index' now points just past the block, so the recorded line number
      # is that of the block's last line.
      new_lineno = comment_lineno + index - 1
      comment_leaf = pytree.Leaf(
          type=token.COMMENT,
          value='\n'.join(comment_block),
          context=('', (new_lineno, comment_column)))
      comment_node = comment_leaf if not standalone else pytree.Node(
          pygram.python_symbols.simple_stmt, [comment_leaf])
      comments.append(comment_node)

    # Skip the blank (whitespace-only) lines separating comment blocks.
    while index < len(lines) and not lines[index].lstrip():
      index += 1

  return comments


# "Standalone line nodes" are tree nodes that have to start a new line in Python
# code (and cannot follow a ';' or ':'). Other nodes, like 'expr_stmt', serve as
# parents of other nodes but can come later in a line. This is a list of
# standalone line nodes in the grammar. It is meant to be exhaustive
# *eventually*, and we'll modify it with time as we discover more corner cases
# in the parse tree.
#
# When splicing a standalone comment (i.e. a comment that appears on its own
# line, not on the same line with other code), it's important to insert it into
# an appropriate parent of the node it's attached to. An appropriate parent
# is the first "standalone line node" in the parent chain of a node.
_STANDALONE_LINE_NODES = frozenset([
    'suite', 'if_stmt', 'while_stmt', 'for_stmt', 'try_stmt', 'with_stmt',
    'funcdef', 'classdef', 'decorated', 'file_input'
])


def _FindNodeWithStandaloneLineParent(node):
  """Find a node whose parent is a 'standalone line' node.

  See the comment above _STANDALONE_LINE_NODES for more details.

  Arguments:
    node: node to start from

  Returns:
    Suitable node that's either the node itself or one of its ancestors.
  """
  # This is guaranteed to terminate because 'file_input' is the root node of
  # any pytree.
  while pytree_utils.NodeName(node.parent) not in _STANDALONE_LINE_NODES:
    node = node.parent
  return node


# "Statement nodes" are standalone statements. They don't have to start a new
# line.
_STATEMENT_NODES = frozenset(['simple_stmt']) | _STANDALONE_LINE_NODES


def _FindStmtParent(node):
  """Find the nearest parent of node that is a statement node.

  Arguments:
    node: node to start from

  Returns:
    Nearest parent (or node itself, if suitable).
  """
  # Walk up the parent chain until a statement node is reached.
  while pytree_utils.NodeName(node) not in _STATEMENT_NODES:
    node = node.parent
  return node


def _FindAncestorAtIndent(node, indent):
  """Find an ancestor of node with the given indentation.

  Arguments:
    node: node to start from. This must not be the tree root.
    indent: indentation string for the ancestor we're looking for.
        See _AnnotateIndents for more details.

  Returns:
    An ancestor node with suitable indentation. If no suitable ancestor is
    found, the closest ancestor to the tree root is returned.
  """
  # Stop climbing once our parent is the tree root: there's nowhere else to go.
  while node.parent.parent is not None:
    # If the parent has an indent annotation, and that annotation is a prefix
    # of the requested indent, this is a suitable ancestor.
    # The reason for "prefix of" rather than "equal to" is that comments may be
    # improperly indented (i.e. by three spaces, where surrounding statements
    # have either zero or two or four), and we don't want to propagate them all
    # the way to the root.
    parent_indent = pytree_utils.GetNodeAnnotation(
        node.parent, pytree_utils.Annotation.CHILD_INDENT)
    if parent_indent is not None and indent.startswith(parent_indent):
      return node
    # Keep looking up the tree.
    node = node.parent
  return node


def _AnnotateIndents(tree):
  """Annotate the tree with child_indent annotations.

  A child_indent annotation on a node specifies the indentation (as a string,
  like "  ") of its children. It is inferred from the INDENT child of a node.

  Arguments:
    tree: root of a pytree. The pytree is modified to add annotations to nodes.

  Raises:
    RuntimeError: if the tree is malformed.
  """
  # Annotate the root of the tree with zero indent.
  if tree.parent is None:
    pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                   '')
  for child in tree.children:
    if child.type == token.INDENT:
      # A node that already carries an annotation must agree with the INDENT
      # token found among its children.
      child_indent = pytree_utils.GetNodeAnnotation(
          tree, pytree_utils.Annotation.CHILD_INDENT)
      if child_indent is not None and child_indent != child.value:
        raise RuntimeError('inconsistent indentation for child', (tree, child))
      pytree_utils.SetNodeAnnotation(tree, pytree_utils.Annotation.CHILD_INDENT,
                                     child.value)
    _AnnotateIndents(child)