• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2015 Google Inc. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License");
4# you may not use this file except in compliance with the License.
5# You may obtain a copy of the License at
6#
7#     http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS,
11# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12# See the License for the specific language governing permissions and
13# limitations under the License.
14"""Comment splicer for lib2to3 trees.
15
16The lib2to3 syntax tree produced by the parser holds comments and whitespace in
17prefix attributes of nodes, rather than nodes themselves. This module provides
18functionality to splice comments out of prefixes and into nodes of their own,
19making them easier to process.
20
21  SpliceComments(): the main function exported by this module.
22"""
23
24from lib2to3 import pygram
25from lib2to3 import pytree
26from lib2to3.pgen2 import token
27
28from yapf.yapflib import pytree_utils
29
30
def SpliceComments(tree):
  """Given a pytree, splice comments into nodes of their own right.

  Extract comments from the prefixes where they are housed after parsing.
  The prefixes that previously housed the comments become empty.

  The tree is walked depth-first. For every leaf whose prefix begins (after
  leading whitespace) with '#', the prefix is cleared and replaced by new
  COMMENT nodes. Where those nodes go depends on the leaf's type:

    * NEWLINE leaf: the comment trails the code on that line, so it is
      inserted after the previously visited leaf.
    * DEDENT leaf: the prefix is split into groups of decreasing indentation
      and each group is placed next to an ancestor with matching indentation.
    * any other leaf: the comment is either a standalone comment preceding a
      statement or a comment embedded in an expression; see the comments in
      the final branch below.

  Args:
    tree: a pytree.Node - the tree to work on. The tree is modified by this
        function.
  """
  # The previous leaf node encountered in the traversal.
  # This is a list because Python 2.x doesn't have 'nonlocal' :)
  prev_leaf = [None]
  # The DEDENT handling below relies on the child_indent annotations, so they
  # are computed for the whole tree up front.
  _AnnotateIndents(tree)

  def _VisitNodeRec(node):
    """Recursively visit each node to splice comments into the AST."""
    # This loop may insert into node.children, so we'll iterate over a copy.
    for child in node.children[:]:
      if isinstance(child, pytree.Node):
        # Nodes don't have prefixes.
        _VisitNodeRec(child)
      else:
        if child.prefix.lstrip().startswith('#'):
          # We have a comment prefix in this child, so splicing is needed.
          comment_prefix = child.prefix
          # Line on which the prefix text starts: the leaf's own line minus
          # the number of newlines contained in the prefix.
          comment_lineno = child.lineno - comment_prefix.count('\n')
          comment_column = child.column

          # Remember the leading indentation of this prefix and clear it.
          # Mopping up the prefix is important because we may go over this same
          # child in the next iteration...
          child_prefix = child.prefix.lstrip('\n')
          prefix_indent = child_prefix[:child_prefix.find('#')]
          if '\n' in prefix_indent:
            prefix_indent = prefix_indent[prefix_indent.rfind('\n') + 1:]
          child.prefix = ''

          if child.type == token.NEWLINE:
            # If the prefix was on a NEWLINE leaf, it's part of the line so it
            # will be inserted after the previously encountered leaf.
            # We can't just insert it before the NEWLINE node, because as a
            # result of the way pytrees are organized, this node can be under
            # an inappropriate parent.
            #
            # The NEWLINE leaf's column is moved back by the length of the
            # comment text to obtain the column recorded for the comment.
            comment_column -= len(comment_prefix.lstrip())
            pytree_utils.InsertNodesAfter(
                _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False), prev_leaf[0])
          elif child.type == token.DEDENT:
            # Comment prefixes on DEDENT nodes also deserve special treatment,
            # because their final placement depends on their prefix.
            # We'll look for an ancestor of this child with a matching
            # indentation, and insert the comment before it if the ancestor is
            # on a DEDENT node and after it otherwise.
            #
            # lib2to3 places comments that should be separated into the same
            # DEDENT node. For example, "comment 1" and "comment 2" will be
            # combined.
            #
            #   def _():
            #     for x in y:
            #       pass
            #       # comment 1
            #
            #     # comment 2
            #     pass
            #
            # In this case, we need to split them up ourselves.

            # Split into groups of comments at decreasing levels of indentation
            #
            # Each group is a (column, indent string, lines) tuple. A new
            # group starts at the first comment line and whenever a comment
            # line is indented less than the current group; every other line
            # (including blank lines once a group exists) joins the most
            # recent group.
            comment_groups = []
            comment_column = None
            for cmt in comment_prefix.split('\n'):
              col = cmt.find('#')
              if col < 0:
                if comment_column is None:
                  # Skip empty lines at the top of the first comment group
                  comment_lineno += 1
                  continue
              elif comment_column is None or col < comment_column:
                comment_column = col
                comment_indent = cmt[:comment_column]
                comment_groups.append((comment_column, comment_indent, []))
              comment_groups[-1][-1].append(cmt)

            # Insert a node for each group
            for comment_column, comment_indent, comment_group in comment_groups:
              ancestor_at_indent = _FindAncestorAtIndent(child, comment_indent)
              if ancestor_at_indent.type == token.DEDENT:
                InsertNodes = pytree_utils.InsertNodesBefore  # pylint: disable=invalid-name # noqa
              else:
                InsertNodes = pytree_utils.InsertNodesAfter  # pylint: disable=invalid-name # noqa
              InsertNodes(
                  _CreateCommentsFromPrefix(
                      '\n'.join(comment_group) + '\n',
                      comment_lineno,
                      comment_column,
                      standalone=True), ancestor_at_indent)
              # The next group starts right after the lines of this one.
              comment_lineno += len(comment_group)
          else:
            # Otherwise there are two cases.
            #
            # 1. The comment is on its own line
            # 2. The comment is part of an expression.
            #
            # Unfortunately, it's fairly difficult to distinguish between the
            # two in lib2to3 trees. The algorithm here is to determine whether
            # child is the first leaf in the statement it belongs to. If it is,
            # then the comment (which is a prefix) belongs on a separate line.
            # If it is not, it means the comment is buried deep in the statement
            # and is part of some expression.
            stmt_parent = _FindStmtParent(child)

            # Only the first non-NEWLINE leaf of the statement is examined:
            # both branches below end with a break.
            for leaf_in_parent in stmt_parent.leaves():
              if leaf_in_parent.type == token.NEWLINE:
                continue
              elif id(leaf_in_parent) == id(child):
                # This comment stands on its own line, and it has to be inserted
                # into the appropriate parent. We'll have to find a suitable
                # parent to insert into. See comments above
                # _STANDALONE_LINE_NODES for more details.
                node_with_line_parent = _FindNodeWithStandaloneLineParent(child)

                if pytree_utils.NodeName(
                    node_with_line_parent.parent) in {'funcdef', 'classdef'}:
                  # Keep a comment that's not attached to a function or class
                  # next to the object it is attached to.
                  #
                  # comment_end is the line of the last comment line. If it
                  # lies more than one line above node_with_line_parent, the
                  # comment is inserted before the funcdef/classdef parent
                  # instead.
                  comment_end = (
                      comment_lineno + comment_prefix.rstrip('\n').count('\n'))
                  if comment_end < node_with_line_parent.lineno - 1:
                    node_with_line_parent = node_with_line_parent.parent

                pytree_utils.InsertNodesBefore(
                    _CreateCommentsFromPrefix(
                        comment_prefix, comment_lineno, 0, standalone=True),
                    node_with_line_parent)
                break
              else:
                # The child is not the first leaf of its statement, so the
                # comment sits inside an expression.
                if comment_lineno == prev_leaf[0].lineno:
                  # The prefix starts on the same line as the previous leaf:
                  # its first line is a trailing comment for that leaf and
                  # gets its own COMMENT leaf right after it. The rest of the
                  # prefix is handled below.
                  comment_lines = comment_prefix.splitlines()
                  value = comment_lines[0].lstrip()
                  if value.rstrip('\n'):
                    # Column = end of the previous leaf plus the whitespace
                    # that separated it from the comment.
                    comment_column = prev_leaf[0].column
                    comment_column += len(prev_leaf[0].value)
                    comment_column += (
                        len(comment_lines[0]) - len(comment_lines[0].lstrip()))
                    comment_leaf = pytree.Leaf(
                        type=token.COMMENT,
                        value=value.rstrip('\n'),
                        context=('', (comment_lineno, comment_column)))
                    pytree_utils.InsertNodesAfter([comment_leaf], prev_leaf[0])
                    comment_prefix = '\n'.join(comment_lines[1:])
                    comment_lineno += 1

                # The column for the remaining comments is the width of the
                # leading whitespace on the last line of the prefix, ignoring
                # trailing whitespace.
                rindex = (0 if '\n' not in comment_prefix.rstrip() else
                          comment_prefix.rstrip().rindex('\n') + 1)
                comment_column = (
                    len(comment_prefix[rindex:]) -
                    len(comment_prefix[rindex:].lstrip()))
                comments = _CreateCommentsFromPrefix(
                    comment_prefix,
                    comment_lineno,
                    comment_column,
                    standalone=False)
                pytree_utils.InsertNodesBefore(comments, child)
                break

        # Every leaf, with or without a comment prefix, becomes the "previous
        # leaf" for the next one visited.
        prev_leaf[0] = child

  _VisitNodeRec(tree)
204
205
def _CreateCommentsFromPrefix(comment_prefix,
                              comment_lineno,
                              comment_column,
                              standalone=False):
  """Create pytree nodes to represent the given comment prefix.

  The prefix is split into lines. Each maximal run of consecutive comment
  lines (lines whose first non-whitespace character is '#') forms one block
  and becomes one COMMENT leaf whose value is the stripped lines joined with
  newlines. Lines between blocks are skipped.

  Args:
    comment_prefix: (unicode) the text of the comment from the node's prefix.
    comment_lineno: (int) the line number of the first line of comment_prefix.
    comment_column: (int) the column recorded on every created leaf.
    standalone: (bool) determines if the comment is standalone or not.

  Returns:
    The simple_stmt nodes if this is a standalone comment, otherwise a list of
    new COMMENT leafs. The prefix may consist of multiple comment blocks,
    separated by blank lines. Each block gets its own leaf. The line number
    recorded on a leaf is that of the *last* line of its block. An empty list
    is returned if the prefix contains no comment lines.
  """
  comments = []

  lines = comment_prefix.split('\n')
  index = 0
  while index < len(lines):
    # Collect one run of consecutive comment lines.
    comment_block = []
    while index < len(lines) and lines[index].lstrip().startswith('#'):
      comment_block.append(lines[index].strip())
      index += 1

    if comment_block:
      # index now points one past the block, so this is the line number of the
      # block's last line.
      new_lineno = comment_lineno + index - 1
      comment_leaf = pytree.Leaf(
          type=token.COMMENT,
          value='\n'.join(comment_block),
          context=('', (new_lineno, comment_column)))
      comment_node = comment_leaf if not standalone else pytree.Node(
          pygram.python_symbols.simple_stmt, [comment_leaf])
      comments.append(comment_node)

    # Skip ahead to the next comment line. Besides blank lines this also steps
    # over any other non-comment text (e.g. a lone backslash continuation);
    # skipping only blank lines would leave index stuck on such a line and the
    # outer loop would never terminate.
    while index < len(lines) and not lines[index].lstrip().startswith('#'):
      index += 1

  return comments
252
253
# A "standalone line node" is a tree node that must begin a new line in Python
# code; it can never follow a ';' or a ':'. Other nodes (for example
# 'expr_stmt') are parents of other nodes too, but may appear later in a line.
# The set below names the standalone line nodes of the grammar. It is intended
# to become exhaustive *eventually*; entries get added as new corner cases in
# the parse tree are discovered.
#
# This matters when splicing a standalone comment, i.e. a comment that sits on
# a line of its own rather than next to code. Such a comment has to be inserted
# under an appropriate parent of the node it is attached to, and that parent is
# the first standalone line node found in the node's parent chain.
_STANDALONE_LINE_NODES = frozenset((
    'suite',
    'if_stmt',
    'while_stmt',
    'for_stmt',
    'try_stmt',
    'with_stmt',
    'funcdef',
    'classdef',
    'decorated',
    'file_input',
))
269
270
def _FindNodeWithStandaloneLineParent(node):
  """Return the closest node whose parent is a 'standalone line' node.

  The search starts at node itself and climbs the parent chain. See the
  comment above _STANDALONE_LINE_NODES for what qualifies as a standalone
  line node.

  Arguments:
    node: node to start from

  Returns:
    Either node or one of its ancestors, whichever is the first one whose
    parent's name is in _STANDALONE_LINE_NODES.
  """
  candidate = node
  # 'file_input' is the root node of any pytree and is itself a standalone
  # line node, so this climb is guaranteed to stop.
  while pytree_utils.NodeName(candidate.parent) not in _STANDALONE_LINE_NODES:
    candidate = candidate.parent
  return candidate
288
289
# "Statement nodes" are standalone statements. Unlike standalone line nodes,
# they don't have to start a new line.
_STATEMENT_NODES = _STANDALONE_LINE_NODES.union(('simple_stmt',))
293
294
def _FindStmtParent(node):
  """Return the closest statement node at or above node.

  Arguments:
    node: node to start from

  Returns:
    node itself if its name is in _STATEMENT_NODES, otherwise the nearest
    ancestor whose name is.
  """
  candidate = node
  while pytree_utils.NodeName(candidate) not in _STATEMENT_NODES:
    candidate = candidate.parent
  return candidate
308
309
def _FindAncestorAtIndent(node, indent):
  """Locate the ancestor of node that sits at the given indentation.

  Arguments:
    node: node to start from. This must not be the tree root.
    indent: indentation string for the ancestor we're looking for.
        See _AnnotateIndents for more details.

  Returns:
    The first node on the way up (node itself included) whose parent carries a
    child_indent annotation that is a prefix of indent. If there is none, the
    node directly below the tree root is returned.
  """
  current = node
  # Stop climbing once the parent is the tree root; there is nowhere else to
  # go from there.
  while current.parent.parent is not None:
    annotated_indent = pytree_utils.GetNodeAnnotation(
        current.parent, pytree_utils.Annotation.CHILD_INDENT)
    # A prefix match is used instead of strict equality because comments may
    # be improperly indented (e.g. by three spaces where the surrounding
    # statements use zero, two or four); such comments should not be pushed
    # all the way up to the root.
    if annotated_indent is not None and indent.startswith(annotated_indent):
      break
    current = current.parent
  return current
339
340
def _AnnotateIndents(tree):
  """Attach child_indent annotations throughout the tree.

  The child_indent annotation of a node is the indentation string (such as
  "  ") shared by its children. It is taken from the value of the node's
  INDENT child; the root of the tree is annotated with the empty string.

  Arguments:
    tree: root of a pytree. The pytree is modified to add annotations to nodes.

  Raises:
    RuntimeError: if the tree is malformed, i.e. a node's INDENT child
      disagrees with an indentation already recorded for that node.
  """
  indent_key = pytree_utils.Annotation.CHILD_INDENT

  # Only the root has no parent; its children sit at zero indentation.
  if tree.parent is None:
    pytree_utils.SetNodeAnnotation(tree, indent_key, '')

  for kid in tree.children:
    if kid.type == token.INDENT:
      recorded = pytree_utils.GetNodeAnnotation(tree, indent_key)
      if not (recorded is None or recorded == kid.value):
        raise RuntimeError('inconsistent indentation for child', (tree, kid))
      pytree_utils.SetNodeAnnotation(tree, indent_key, kid.value)
    # Descend after handling this child so annotations are produced in
    # depth-first order.
    _AnnotateIndents(kid)
366