1# Copyright 2006 Google, Inc. All Rights Reserved. 2# Licensed to PSF under a Contributor Agreement. 3""" 4Python parse tree definitions. 5 6This is a very concrete parse tree; we need to keep every token and 7even the comments and whitespace between tokens. 8 9There's also a pattern matching implementation here. 10""" 11 12__author__ = 'Guido van Rossum <guido@python.org>' 13 14import sys 15from io import StringIO 16from typing import List 17from typing import Optional 18from typing import Text 19from typing import Tuple 20from typing import Union 21 22HUGE = 0x7FFFFFFF # maximum repeat count, default max 23 24_type_reprs = {} 25 26 27def type_repr(type_num): 28 global _type_reprs 29 if not _type_reprs: 30 from .pygram import python_symbols 31 32 # printing tokens is possible but not as useful 33 # from .pgen2 import token // token.__dict__.items(): 34 for name, val in python_symbols.__dict__.items(): 35 if isinstance(val, int): 36 _type_reprs[val] = name 37 return _type_reprs.setdefault(type_num, type_num) 38 39 40NL = Union['Node', 'Leaf'] 41Context = Tuple[Text, Tuple[int, int]] 42RawNode = Tuple[int, Optional[Text], Optional[Context], Optional[List[NL]]] 43 44 45class Base(object): 46 """ 47 Abstract base class for Node and Leaf. 48 49 This provides some default functionality and boilerplate using the 50 template pattern. 51 52 A node may be a subnode of at most one parent. 53 """ 54 55 # Default values for instance variables 56 type = None # int: token number (< 256) or symbol number (>= 256) 57 parent = None # Parent node pointer, or None 58 children = () # Tuple of subnodes 59 was_changed = False 60 was_checked = False 61 62 def __new__(cls, *args, **kwds): 63 """Constructor that prevents Base from being instantiated.""" 64 assert cls is not Base, 'Cannot instantiate Base' 65 return object.__new__(cls) 66 67 def __eq__(self, other): 68 """ 69 Compare two nodes for equality. 70 71 This calls the method _eq(). 72 """ 73 if self.__class__ is not other.__class__: 74 return NotImplemented 75 return self._eq(other) 76 77 __hash__ = None # For Py3 compatibility. 78 79 def _eq(self, other): 80 """ 81 Compare two nodes for equality. 82 83 This is called by __eq__ and __ne__. It is only called if the two nodes 84 have the same type. This must be implemented by the concrete subclass. 85 Nodes should be considered equal if they have the same structure, 86 ignoring the prefix string and other context information. 87 """ 88 raise NotImplementedError 89 90 def clone(self): 91 """ 92 Return a cloned (deep) copy of self. 93 94 This must be implemented by the concrete subclass. 95 """ 96 raise NotImplementedError 97 98 def post_order(self): 99 """ 100 Return a post-order iterator for the tree. 101 102 This must be implemented by the concrete subclass. 103 """ 104 raise NotImplementedError 105 106 def pre_order(self): 107 """ 108 Return a pre-order iterator for the tree. 109 110 This must be implemented by the concrete subclass. 111 """ 112 raise NotImplementedError 113 114 def replace(self, new): 115 """Replace this node with a new one in the parent.""" 116 assert self.parent is not None, str(self) 117 assert new is not None 118 if not isinstance(new, list): 119 new = [new] 120 l_children = [] 121 found = False 122 for ch in self.parent.children: 123 if ch is self: 124 assert not found, (self.parent.children, self, new) 125 if new is not None: 126 l_children.extend(new) 127 found = True 128 else: 129 l_children.append(ch) 130 assert found, (self.children, self, new) 131 self.parent.changed() 132 self.parent.children = l_children 133 for x in new: 134 x.parent = self.parent 135 self.parent = None 136 137 def get_lineno(self): 138 """Return the line number which generated the invocant node.""" 139 node = self 140 while not isinstance(node, Leaf): 141 if not node.children: 142 return 143 node = node.children[0] 144 return node.lineno 145 146 def changed(self): 147 if self.parent: 148 self.parent.changed() 149 self.was_changed = True 150 151 def remove(self): 152 """ 153 Remove the node from the tree. Returns the position of the node in its 154 parent's children before it was removed. 155 """ 156 if self.parent: 157 for i, node in enumerate(self.parent.children): 158 if node is self: 159 self.parent.changed() 160 del self.parent.children[i] 161 self.parent = None 162 return i 163 164 @property 165 def next_sibling(self): 166 """ 167 The node immediately following the invocant in their parent's children 168 list. If the invocant does not have a next sibling, it is None 169 """ 170 if self.parent is None: 171 return None 172 173 # Can't use index(); we need to test by identity 174 for i, child in enumerate(self.parent.children): 175 if child is self: 176 try: 177 return self.parent.children[i + 1] 178 except IndexError: 179 return None 180 181 @property 182 def prev_sibling(self): 183 """ 184 The node immediately preceding the invocant in their parent's children 185 list. If the invocant does not have a previous sibling, it is None. 186 """ 187 if self.parent is None: 188 return None 189 190 # Can't use index(); we need to test by identity 191 for i, child in enumerate(self.parent.children): 192 if child is self: 193 if i == 0: 194 return None 195 return self.parent.children[i - 1] 196 197 def leaves(self): 198 for child in self.children: 199 yield from child.leaves() 200 201 def depth(self): 202 if self.parent is None: 203 return 0 204 return 1 + self.parent.depth() 205 206 def get_suffix(self): 207 """ 208 Return the string immediately following the invocant node. This is 209 effectively equivalent to node.next_sibling.prefix 210 """ 211 next_sib = self.next_sibling 212 if next_sib is None: 213 return '' 214 return next_sib.prefix 215 216 if sys.version_info < (3, 0): 217 218 def __str__(self): 219 return str(self).encode('ascii') 220 221 222class Node(Base): 223 """Concrete implementation for interior nodes.""" 224 225 def __init__(self, 226 type, 227 children, 228 context=None, 229 prefix=None, 230 fixers_applied=None): 231 """ 232 Initializer. 233 234 Takes a type constant (a symbol number >= 256), a sequence of 235 child nodes, and an optional context keyword argument. 236 237 As a side effect, the parent pointers of the children are updated. 238 """ 239 assert type >= 256, type 240 self.type = type 241 self.children = list(children) 242 for ch in self.children: 243 assert ch.parent is None, repr(ch) 244 ch.parent = self 245 if prefix is not None: 246 self.prefix = prefix 247 if fixers_applied: 248 self.fixers_applied = fixers_applied[:] 249 else: 250 self.fixers_applied = None 251 252 def __repr__(self): 253 """Return a canonical string representation.""" 254 return '%s(%s, %r)' % (self.__class__.__name__, type_repr( 255 self.type), self.children) 256 257 def __unicode__(self): 258 """ 259 Return a pretty string representation. 260 261 This reproduces the input source exactly. 262 """ 263 return ''.join(map(str, self.children)) 264 265 if sys.version_info > (3, 0): 266 __str__ = __unicode__ 267 268 def _eq(self, other): 269 """Compare two nodes for equality.""" 270 return (self.type, self.children) == (other.type, other.children) 271 272 def clone(self): 273 """Return a cloned (deep) copy of self.""" 274 return Node( 275 self.type, [ch.clone() for ch in self.children], 276 fixers_applied=self.fixers_applied) 277 278 def post_order(self): 279 """Return a post-order iterator for the tree.""" 280 for child in self.children: 281 yield from child.post_order() 282 yield self 283 284 def pre_order(self): 285 """Return a pre-order iterator for the tree.""" 286 yield self 287 for child in self.children: 288 yield from child.pre_order() 289 290 @property 291 def prefix(self): 292 """ 293 The whitespace and comments preceding this node in the input. 294 """ 295 if not self.children: 296 return '' 297 return self.children[0].prefix 298 299 @prefix.setter 300 def prefix(self, prefix): 301 if self.children: 302 self.children[0].prefix = prefix 303 304 def set_child(self, i, child): 305 """ 306 Equivalent to 'node.children[i] = child'. This method also sets the 307 child's parent attribute appropriately. 308 """ 309 child.parent = self 310 self.children[i].parent = None 311 self.children[i] = child 312 self.changed() 313 314 def insert_child(self, i, child): 315 """ 316 Equivalent to 'node.children.insert(i, child)'. This method also sets 317 the child's parent attribute appropriately. 318 """ 319 child.parent = self 320 self.children.insert(i, child) 321 self.changed() 322 323 def append_child(self, child): 324 """ 325 Equivalent to 'node.children.append(child)'. This method also sets the 326 child's parent attribute appropriately. 327 """ 328 child.parent = self 329 self.children.append(child) 330 self.changed() 331 332 333class Leaf(Base): 334 """Concrete implementation for leaf nodes.""" 335 336 # Default values for instance variables 337 _prefix = '' # Whitespace and comments preceding this token in the input 338 lineno = 0 # Line where this token starts in the input 339 column = 0 # Column where this token tarts in the input 340 341 def __init__(self, type, value, context=None, prefix=None, fixers_applied=[]): 342 """ 343 Initializer. 344 345 Takes a type constant (a token number < 256), a string value, and an 346 optional context keyword argument. 347 """ 348 assert 0 <= type < 256, type 349 if context is not None: 350 self._prefix, (self.lineno, self.column) = context 351 self.type = type 352 self.value = value 353 if prefix is not None: 354 self._prefix = prefix 355 self.fixers_applied = fixers_applied[:] 356 357 def __repr__(self): 358 """Return a canonical string representation.""" 359 return '%s(%r, %r)' % (self.__class__.__name__, self.type, self.value) 360 361 def __unicode__(self): 362 """ 363 Return a pretty string representation. 364 365 This reproduces the input source exactly. 366 """ 367 return self.prefix + str(self.value) 368 369 if sys.version_info > (3, 0): 370 __str__ = __unicode__ 371 372 def _eq(self, other): 373 """Compare two nodes for equality.""" 374 return (self.type, self.value) == (other.type, other.value) 375 376 def clone(self): 377 """Return a cloned (deep) copy of self.""" 378 return Leaf( 379 self.type, 380 self.value, (self.prefix, (self.lineno, self.column)), 381 fixers_applied=self.fixers_applied) 382 383 def leaves(self): 384 yield self 385 386 def post_order(self): 387 """Return a post-order iterator for the tree.""" 388 yield self 389 390 def pre_order(self): 391 """Return a pre-order iterator for the tree.""" 392 yield self 393 394 @property 395 def prefix(self): 396 """ 397 The whitespace and comments preceding this token in the input. 398 """ 399 return self._prefix 400 401 @prefix.setter 402 def prefix(self, prefix): 403 self.changed() 404 self._prefix = prefix 405 406 407def convert(gr, raw_node): 408 """ 409 Convert raw node information to a Node or Leaf instance. 410 411 This is passed to the parser driver which calls it whenever a reduction of a 412 grammar rule produces a new complete node, so that the tree is build 413 strictly bottom-up. 414 """ 415 type, value, context, children = raw_node 416 if children or type in gr.number2symbol: 417 # If there's exactly one child, return that child instead of 418 # creating a new node. 419 if len(children) == 1: 420 return children[0] 421 return Node(type, children, context=context) 422 else: 423 return Leaf(type, value, context=context) 424 425 426class BasePattern(object): 427 """ 428 A pattern is a tree matching pattern. 429 430 It looks for a specific node type (token or symbol), and 431 optionally for a specific content. 432 433 This is an abstract base class. There are three concrete 434 subclasses: 435 436 - LeafPattern matches a single leaf node; 437 - NodePattern matches a single node (usually non-leaf); 438 - WildcardPattern matches a sequence of nodes of variable length. 439 """ 440 441 # Defaults for instance variables 442 type = None # Node type (token if < 256, symbol if >= 256) 443 content = None # Optional content matching pattern 444 name = None # Optional name used to store match in results dict 445 446 def __new__(cls, *args, **kwds): 447 """Constructor that prevents BasePattern from being instantiated.""" 448 assert cls is not BasePattern, 'Cannot instantiate BasePattern' 449 return object.__new__(cls) 450 451 def __repr__(self): 452 args = [type_repr(self.type), self.content, self.name] 453 while args and args[-1] is None: 454 del args[-1] 455 return '%s(%s)' % (self.__class__.__name__, ', '.join(map(repr, args))) 456 457 def optimize(self): 458 """ 459 A subclass can define this as a hook for optimizations. 460 461 Returns either self or another node with the same effect. 462 """ 463 return self 464 465 def match(self, node, results=None): 466 """ 467 Does this pattern exactly match a node? 468 469 Returns True if it matches, False if not. 470 471 If results is not None, it must be a dict which will be 472 updated with the nodes matching named subpatterns. 473 474 Default implementation for non-wildcard patterns. 475 """ 476 if self.type is not None and node.type != self.type: 477 return False 478 if self.content is not None: 479 r = None 480 if results is not None: 481 r = {} 482 if not self._submatch(node, r): 483 return False 484 if r: 485 results.update(r) 486 if results is not None and self.name: 487 results[self.name] = node 488 return True 489 490 def match_seq(self, nodes, results=None): 491 """ 492 Does this pattern exactly match a sequence of nodes? 493 494 Default implementation for non-wildcard patterns. 495 """ 496 if len(nodes) != 1: 497 return False 498 return self.match(nodes[0], results) 499 500 def generate_matches(self, nodes): 501 """ 502 Generator yielding all matches for this pattern. 503 504 Default implementation for non-wildcard patterns. 505 """ 506 r = {} 507 if nodes and self.match(nodes[0], r): 508 yield 1, r 509 510 511class LeafPattern(BasePattern): 512 513 def __init__(self, type=None, content=None, name=None): 514 """ 515 Initializer. Takes optional type, content, and name. 516 517 The type, if given must be a token type (< 256). If not given, 518 this matches any *leaf* node; the content may still be required. 519 520 The content, if given, must be a string. 521 522 If a name is given, the matching node is stored in the results 523 dict under that key. 524 """ 525 if type is not None: 526 assert 0 <= type < 256, type 527 if content is not None: 528 assert isinstance(content, str), repr(content) 529 self.type = type 530 self.content = content 531 self.name = name 532 533 def match(self, node, results=None): 534 """Override match() to insist on a leaf node.""" 535 if not isinstance(node, Leaf): 536 return False 537 return BasePattern.match(self, node, results) 538 539 def _submatch(self, node, results=None): 540 """ 541 Match the pattern's content to the node's children. 542 543 This assumes the node type matches and self.content is not None. 544 545 Returns True if it matches, False if not. 546 547 If results is not None, it must be a dict which will be 548 updated with the nodes matching named subpatterns. 549 550 When returning False, the results dict may still be updated. 551 """ 552 return self.content == node.value 553 554 555class NodePattern(BasePattern): 556 557 wildcards = False 558 559 def __init__(self, type=None, content=None, name=None): 560 """ 561 Initializer. Takes optional type, content, and name. 562 563 The type, if given, must be a symbol type (>= 256). If the 564 type is None this matches *any* single node (leaf or not), 565 except if content is not None, in which it only matches 566 non-leaf nodes that also match the content pattern. 567 568 The content, if not None, must be a sequence of Patterns that 569 must match the node's children exactly. If the content is 570 given, the type must not be None. 571 572 If a name is given, the matching node is stored in the results 573 dict under that key. 574 """ 575 if type is not None: 576 assert type >= 256, type 577 if content is not None: 578 assert not isinstance(content, str), repr(content) 579 content = list(content) 580 for i, item in enumerate(content): 581 assert isinstance(item, BasePattern), (i, item) 582 if isinstance(item, WildcardPattern): 583 self.wildcards = True 584 self.type = type 585 self.content = content 586 self.name = name 587 588 def _submatch(self, node, results=None): 589 """ 590 Match the pattern's content to the node's children. 591 592 This assumes the node type matches and self.content is not None. 593 594 Returns True if it matches, False if not. 595 596 If results is not None, it must be a dict which will be 597 updated with the nodes matching named subpatterns. 598 599 When returning False, the results dict may still be updated. 600 """ 601 if self.wildcards: 602 for c, r in generate_matches(self.content, node.children): 603 if c == len(node.children): 604 if results is not None: 605 results.update(r) 606 return True 607 return False 608 if len(self.content) != len(node.children): 609 return False 610 for subpattern, child in zip(self.content, node.children): 611 if not subpattern.match(child, results): 612 return False 613 return True 614 615 616class WildcardPattern(BasePattern): 617 """ 618 A wildcard pattern can match zero or more nodes. 619 620 This has all the flexibility needed to implement patterns like: 621 622 .* .+ .? .{m,n} 623 (a b c | d e | f) 624 (...)* (...)+ (...)? (...){m,n} 625 626 except it always uses non-greedy matching. 627 """ 628 629 def __init__(self, content=None, min=0, max=HUGE, name=None): 630 """ 631 Initializer. 632 633 Args: 634 content: optional sequence of subsequences of patterns; 635 if absent, matches one node; 636 if present, each subsequence is an alternative [*] 637 min: optional minimum number of times to match, default 0 638 max: optional maximum number of times to match, default HUGE 639 name: optional name assigned to this match 640 641 [*] Thus, if content is [[a, b, c], [d, e], [f, g, h]] this is 642 equivalent to (a b c | d e | f g h); if content is None, 643 this is equivalent to '.' in regular expression terms. 644 The min and max parameters work as follows: 645 min=0, max=maxint: .* 646 min=1, max=maxint: .+ 647 min=0, max=1: .? 648 min=1, max=1: . 649 If content is not None, replace the dot with the parenthesized 650 list of alternatives, e.g. (a b c | d e | f g h)* 651 """ 652 assert 0 <= min <= max <= HUGE, (min, max) 653 if content is not None: 654 content = tuple(map(tuple, content)) # Protect against alterations 655 # Check sanity of alternatives 656 assert len(content), repr(content) # Can't have zero alternatives 657 for alt in content: 658 assert len(alt), repr(alt) # Can have empty alternatives 659 self.content = content 660 self.min = min 661 self.max = max 662 self.name = name 663 664 def optimize(self): 665 """Optimize certain stacked wildcard patterns.""" 666 subpattern = None 667 if (self.content is not None and len(self.content) == 1 and 668 len(self.content[0]) == 1): 669 subpattern = self.content[0][0] 670 if self.min == 1 and self.max == 1: 671 if self.content is None: 672 return NodePattern(name=self.name) 673 if subpattern is not None and self.name == subpattern.name: 674 return subpattern.optimize() 675 if (self.min <= 1 and isinstance(subpattern, WildcardPattern) and 676 subpattern.min <= 1 and self.name == subpattern.name): 677 return WildcardPattern(subpattern.content, self.min * subpattern.min, 678 self.max * subpattern.max, subpattern.name) 679 return self 680 681 def match(self, node, results=None): 682 """Does this pattern exactly match a node?""" 683 return self.match_seq([node], results) 684 685 def match_seq(self, nodes, results=None): 686 """Does this pattern exactly match a sequence of nodes?""" 687 for c, r in self.generate_matches(nodes): 688 if c == len(nodes): 689 if results is not None: 690 results.update(r) 691 if self.name: 692 results[self.name] = list(nodes) 693 return True 694 return False 695 696 def generate_matches(self, nodes): 697 """ 698 Generator yielding matches for a sequence of nodes. 699 700 Args: 701 nodes: sequence of nodes 702 703 Yields: 704 (count, results) tuples where: 705 count: the match comprises nodes[:count]; 706 results: dict containing named submatches. 707 """ 708 if self.content is None: 709 # Shortcut for special case (see __init__.__doc__) 710 for count in range(self.min, 1 + min(len(nodes), self.max)): 711 r = {} 712 if self.name: 713 r[self.name] = nodes[:count] 714 yield count, r 715 elif self.name == 'bare_name': 716 yield self._bare_name_matches(nodes) 717 else: 718 # The reason for this is that hitting the recursion limit usually 719 # results in some ugly messages about how RuntimeErrors are being 720 # ignored. We only have to do this on CPython, though, because other 721 # implementations don't have this nasty bug in the first place. 722 if hasattr(sys, 'getrefcount'): 723 save_stderr = sys.stderr 724 sys.stderr = StringIO() 725 try: 726 for count, r in self._recursive_matches(nodes, 0): 727 if self.name: 728 r[self.name] = nodes[:count] 729 yield count, r 730 except RuntimeError: 731 # We fall back to the iterative pattern matching scheme if the recursive 732 # scheme hits the recursion limit. 733 for count, r in self._iterative_matches(nodes): 734 if self.name: 735 r[self.name] = nodes[:count] 736 yield count, r 737 finally: 738 if hasattr(sys, 'getrefcount'): 739 sys.stderr = save_stderr 740 741 def _iterative_matches(self, nodes): 742 """Helper to iteratively yield the matches.""" 743 nodelen = len(nodes) 744 if 0 >= self.min: 745 yield 0, {} 746 747 results = [] 748 # generate matches that use just one alt from self.content 749 for alt in self.content: 750 for c, r in generate_matches(alt, nodes): 751 yield c, r 752 results.append((c, r)) 753 754 # for each match, iterate down the nodes 755 while results: 756 new_results = [] 757 for c0, r0 in results: 758 # stop if the entire set of nodes has been matched 759 if c0 < nodelen and c0 <= self.max: 760 for alt in self.content: 761 for c1, r1 in generate_matches(alt, nodes[c0:]): 762 if c1 > 0: 763 r = {} 764 r.update(r0) 765 r.update(r1) 766 yield c0 + c1, r 767 new_results.append((c0 + c1, r)) 768 results = new_results 769 770 def _bare_name_matches(self, nodes): 771 """Special optimized matcher for bare_name.""" 772 count = 0 773 r = {} 774 done = False 775 max = len(nodes) 776 while not done and count < max: 777 done = True 778 for leaf in self.content: 779 if leaf[0].match(nodes[count], r): 780 count += 1 781 done = False 782 break 783 r[self.name] = nodes[:count] 784 return count, r 785 786 def _recursive_matches(self, nodes, count): 787 """Helper to recursively yield the matches.""" 788 assert self.content is not None 789 if count >= self.min: 790 yield 0, {} 791 if count < self.max: 792 for alt in self.content: 793 for c0, r0 in generate_matches(alt, nodes): 794 for c1, r1 in self._recursive_matches(nodes[c0:], count + 1): 795 r = {} 796 r.update(r0) 797 r.update(r1) 798 yield c0 + c1, r 799 800 801class NegatedPattern(BasePattern): 802 803 def __init__(self, content=None): 804 """ 805 Initializer. 806 807 The argument is either a pattern or None. If it is None, this 808 only matches an empty sequence (effectively '$' in regex 809 lingo). If it is not None, this matches whenever the argument 810 pattern doesn't have any matches. 811 """ 812 if content is not None: 813 assert isinstance(content, BasePattern), repr(content) 814 self.content = content 815 816 def match(self, node): 817 # We never match a node in its entirety 818 return False 819 820 def match_seq(self, nodes): 821 # We only match an empty sequence of nodes in its entirety 822 return len(nodes) == 0 823 824 def generate_matches(self, nodes): 825 if self.content is None: 826 # Return a match if there is an empty sequence 827 if len(nodes) == 0: 828 yield 0, {} 829 else: 830 # Return a match if the argument pattern has no matches 831 for c, r in self.content.generate_matches(nodes): 832 return 833 yield 0, {} 834 835 836def generate_matches(patterns, nodes): 837 """ 838 Generator yielding matches for a sequence of patterns and nodes. 839 840 Args: 841 patterns: a sequence of patterns 842 nodes: a sequence of nodes 843 844 Yields: 845 (count, results) tuples where: 846 count: the entire sequence of patterns matches nodes[:count]; 847 results: dict containing named submatches. 848 """ 849 if not patterns: 850 yield 0, {} 851 else: 852 p, rest = patterns[0], patterns[1:] 853 for c0, r0 in p.generate_matches(nodes): 854 if not rest: 855 yield c0, r0 856 else: 857 for c1, r1 in generate_matches(rest, nodes[c0:]): 858 r = {} 859 r.update(r0) 860 r.update(r1) 861 yield c0 + c1, r 862