# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

"""This module defines the data structures used to represent a grammar.

These are a bit arcane because they are derived from the data
structures used by Python's 'pgen' parser generator.

There's also a table here mapping operators to their names in the
token module; the Python tokenize module reports all operators as the
fallback token code OP, but the parser needs the actual token code.

"""

# Python imports
import collections
import pickle

# Local imports
from . import token

class Grammar(object):
    """Holds the parsing tables that drive the engine in parse.py.

    The tables are not built here; several subclasses exist to do that
    (see the conv and pgen modules).  load() restores the tables from a
    pickle file written by dump(), which is much faster than rebuilding
    them via a subclass, and report() pretty-prints everything to
    stdout for debugging.  The parsing engine reads the instance
    variables directly.

    Instance variables:

    symbol2number -- maps symbol names to numbers.  Symbol numbers
                     start at 256 so they never collide with token
                     numbers, which occupy 0 through 255 inclusive.

    number2symbol -- the inverse mapping, numbers back to symbol names.

    states        -- a list of DFAs indexed by DFA number (the name is
                     slightly confusing).  Each DFA is a list of
                     states, each state a list of (label, state) arcs.
                     A final state is marked by a special arc (0, j)
                     where j is that state's own number.

    dfas          -- maps a symbol number to a (DFA, first) pair, where
                     DFA is an entry of the states list above and first
                     is the set of tokens that can begin the rule
                     (represented as a dict whose values are always 1).

    labels        -- a list of (x, y) pairs where x is a token or
                     symbol number and y is either None or a keyword
                     string.  A label's index in this list is its label
                     number; label numbers mark the arcs in the DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- maps keyword strings to arc labels.

    tokens        -- maps token numbers to arc labels.

    """

    def __init__(self):
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        # Label 0 is reserved for the EMPTY label.
        self.labels = [(0, "EMPTY")]
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        # First nonterminal number; see symbol2number above.
        self.start = 256

    def dump(self, filename):
        """Pickle the grammar tables into the file named *filename*.

        Every dict is recursively converted to an OrderedDict before
        pickling, so the pickle is not byte-identical to the in-memory
        tables.  load() only converts the top-level OrderedDicts back
        to plain dicts, so loaded tables still contain nested
        OrderedDicts from the pickle.  This is harmless for parsing
        performance: OrderedDict reuses dict's __getitem__ directly.
        """
        with open(filename, "wb") as pickle_file:
            # Sort keys so identical grammars always pickle identically.
            tables = _make_deterministic(self.__dict__)
            pickle.dump(tables, pickle_file, 2)

    def load(self, filename):
        """Restore the grammar tables from a pickle file."""
        with open(filename, "rb") as pickle_file:
            tables = pickle.load(pickle_file)
        self.__dict__.update(tables)

    def loads(self, pkl):
        """Restore the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """Return a new grammar with shallow copies of this one's tables."""
        clone = self.__class__()
        dict_attrs = ("symbol2number", "number2symbol", "dfas", "keywords",
                      "tokens", "symbol2label")
        for attr in dict_attrs:
            setattr(clone, attr, getattr(self, attr).copy())
        clone.labels = list(self.labels)
        clone.states = list(self.states)
        clone.start = self.start
        return clone

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        for heading, table in (("s2n", self.symbol2number),
                               ("n2s", self.number2symbol),
                               ("states", self.states),
                               ("dfas", self.dfas),
                               ("labels", self.labels)):
            print(heading)
            pprint(table)
        print("start", self.start)
142
143
144def _make_deterministic(top):
145    if isinstance(top, dict):
146        return collections.OrderedDict(
147            sorted(((k, _make_deterministic(v)) for k, v in top.items())))
148    if isinstance(top, list):
149        return [_make_deterministic(e) for e in top]
150    if isinstance(top, tuple):
151        return tuple(_make_deterministic(e) for e in top)
152    return top
153
154
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Parse the table above into a real mapping: operator string -> token code.
# Blank lines (the first and last of the triple-quoted string) are skipped.
opmap = {
    op: getattr(token, name)
    for op, name in (entry.split() for entry in opmap_raw.splitlines() if entry)
}
212