# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import collections
import pickle

# Local imports
from . import token


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine reads
    the instance variables directly; no initialization of the tables
    is done here -- subclasses (see the conv and pgen modules) fill
    them in.

    load() restores the tables from a pickle file, which is much
    faster than rebuilding them via a subclass; such a file is
    produced by dump().  report() prints a readable rendering of the
    tables to stdout for debugging.

    Instance variables:

    symbol2number -- maps symbol names to numbers.  Symbol numbers
                     start at 256 so they never collide with token
                     numbers, which occupy 0-255 inclusive.

    number2symbol -- maps numbers back to symbol names; the exact
                     inverse of symbol2number.

    states        -- a list of DFAs.  Each DFA is a list of states,
                     each state a list of arcs, and each arc an
                     (i, j) pair where i is a label and j a state
                     number.  The DFA number is the index into this
                     list.  (The name is slightly confusing.)  A
                     final state is marked by a special arc (0, j)
                     where j is the state's own number.

    dfas          -- maps symbol numbers to (DFA, first) pairs, where
                     DFA is an entry of the states list above and
                     first is the set of tokens that can begin the
                     rule (a dict whose values are always 1).

    labels        -- a list of (x, y) pairs; x is a token or symbol
                     number and y is either None or a keyword string.
                     The label number is the index in this list;
                     label numbers mark the state transitions (arcs)
                     in the DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- maps keyword strings to arc labels.

    tokens        -- maps token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively converts every dict to an OrderedDict, so
        the pickled file is not byte-identical to the live tables.
        load() only converts the *top-level* OrderedDict back to a
        plain dict; nested OrderedDicts from the pickle are left as
        they are.  This has no effect on parsing performance, since
        OrderedDict lookup goes straight through dict.__getitem__.
        """
        with open(filename, "wb") as stream:
            # Sort every mapping so the pickle output is reproducible.
            pickle.dump(_make_deterministic(self.__dict__), stream, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as stream:
            payload = pickle.load(stream)
        self.__dict__.update(payload)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """Return a copy of the grammar.

        The table containers are copied one level deep; their
        contents are shared with the original.
        """
        clone = self.__class__()
        dict_attrs = ("symbol2number", "number2symbol", "dfas",
                      "keywords", "tokens", "symbol2label")
        for attr in dict_attrs:
            setattr(clone, attr, getattr(self, attr).copy())
        clone.labels = list(self.labels)
        clone.states = list(self.states)
        clone.start = self.start
        return clone

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        sections = (("s2n", self.symbol2number),
                    ("n2s", self.number2symbol),
                    ("states", self.states),
                    ("dfas", self.dfas),
                    ("labels", self.labels))
        for heading, table in sections:
            print(heading)
            pprint(table)
        print("start", self.start)


def _make_deterministic(top):
    # Recursively rebuild *top* with every dict replaced by an
    # OrderedDict sorted by key, descending into lists and tuples,
    # so that pickling the result is deterministic.
    if isinstance(top, dict):
        items = sorted((key, _make_deterministic(value))
                       for key, value in top.items())
        return collections.OrderedDict(items)
    if isinstance(top, list):
        return [_make_deterministic(elem) for elem in top]
    if isinstance(top, tuple):
        return tuple(_make_deterministic(elem) for elem in top)
    return top


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

opmap = {}
for line in opmap_raw.splitlines():
    if not line:
        continue
    op, name = line.split()
    opmap[op] = getattr(token, name)