# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import collections
import pickle

# Local imports
from . import token, tokenize


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the sentinel "EMPTY" entry.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # 256 is the conventional first symbol number (token numbers
        # occupy 0-255); subclasses overwrite this with the real start.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled file
        is not exactly the same as what was passed in to dump(). load() uses the
        pickled file to create the tables, but only changes OrderedDict to dict
        at the top level; it does not recursively change OrderedDict to dict.
        So, the loaded tables are different from the original tables that were
        passed to load() in that some of the OrderedDict (from the pickled file)
        are not changed back to dict. For parsing, this has no effect on
        performance because OrderedDict uses dict's __getitem__ with nothing in
        between.
        """
        with open(filename, "wb") as f:
            d = _make_deterministic(self.__dict__)
            # Protocol 2 is kept for compatibility with existing pickle files.
            pickle.dump(d, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        # Use a context manager so the file handle is closed even if
        # pickle.load raises (the original leaked the handle in that case).
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """
        Copy the grammar.
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)


def _make_deterministic(top):
    """Return a copy of *top* with every dict replaced by a sorted OrderedDict.

    Recurses through dicts, lists and tuples so that pickling the result
    (see Grammar.dump) produces byte-identical output across runs.
    """
    if isinstance(top, dict):
        return collections.OrderedDict(
            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
    if isinstance(top, list):
        return [_make_deterministic(e) for e in top]
    if isinstance(top, tuple):
        return tuple(_make_deterministic(e) for e in top)
    return top


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)