# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import pickle

# Local imports
from . import token


class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    symbol2label  -- a dict mapping symbol names to arc labels,
                     parallel to the keywords and tokens dicts above
                     (initialized empty here; filled in by the
                     table-generating subclasses).

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved as the sentinel "EMPTY" label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # Symbol numbers start at 256, just above the token-number range.
        self.start = 256

    def dump(self, filename):
        """Dump the grammar tables to a pickle file."""
        with open(filename, "wb") as f:
            pickle.dump(self.__dict__, f, pickle.HIGHEST_PROTOCOL)

    def load(self, filename):
        """Load the grammar tables from a pickle file."""
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """
        Copy the grammar.
        """
        new = self.__class__()
        # Dict-valued tables get a shallow copy; list-valued tables get a
        # slice copy; start is an int and can be assigned directly.
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start", self.start)


# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
:= COLONEQUAL
"""

# Resolve each "operator  TOKEN_NAME" pair against the token module, so the
# parser can translate the generic OP token into the specific token code.
opmap = {}
for line in opmap_raw.splitlines():
    if line:
        op, name = line.split()
        opmap[op] = getattr(token, name)