• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""This module defines the data structures used to represent a grammar.
5
6These are a bit arcane because they are derived from the data
7structures used by Python's 'pgen' parser generator.
8
9There's also a table here mapping operators to their names in the
10token module; the Python tokenize module reports all operators as the
11fallback token code OP, but the parser needs the actual token code.
12
13"""
14
15# Python imports
16import collections
17import pickle
18
19# Local imports
20from . import token, tokenize
21
22
class Grammar(object):
    """Pgen parsing tables conversion class.

    Once initialized, this class supplies the grammar tables for the
    parsing engine implemented by parse.py.  The parsing engine
    accesses the instance variables directly.  The class here does not
    provide initialization of the tables; several subclasses exist to
    do this (see the conv and pgen modules).

    The load() method reads the tables from a pickle file, which is
    much faster than the other ways offered by subclasses.  The pickle
    file is written by calling dump() (after loading the grammar
    tables using a subclass).  The report() method prints a readable
    representation of the tables to stdout, for debugging.

    The instance variables are as follows:

    symbol2number -- a dict mapping symbol names to numbers.  Symbol
                     numbers are always 256 or higher, to distinguish
                     them from token numbers, which are between 0 and
                     255 (inclusive).

    number2symbol -- a dict mapping numbers to symbol names;
                     these two are each other's inverse.

    states        -- a list of DFAs, where each DFA is a list of
                     states, each state is a list of arcs, and each
                     arc is a (i, j) pair where i is a label and j is
                     a state number.  The DFA number is the index into
                     this list.  (This name is slightly confusing.)
                     Final states are represented by a special arc of
                     the form (0, j) where j is its own state number.

    dfas          -- a dict mapping symbol numbers to (DFA, first)
                     pairs, where DFA is an item from the states list
                     above, and first is a set of tokens that can
                     begin this grammar rule (represented by a dict
                     whose values are always 1).

    labels        -- a list of (x, y) pairs where x is either a token
                     number or a symbol number, and y is either None
                     or a string; the strings are keywords.  The label
                     number is the index in this list; label numbers
                     are used to mark state transitions (arcs) in the
                     DFAs.

    start         -- the number of the grammar's start symbol.

    keywords      -- a dict mapping keyword strings to arc labels.

    tokens        -- a dict mapping token numbers to arc labels.

    """

    def __init__(self):
        # See the class docstring for the meaning of each table.
        self.symbol2number = {}
        self.number2symbol = {}
        self.states = []
        self.dfas = {}
        self.labels = [(0, "EMPTY")]  # label 0 is reserved
        self.keywords = {}
        self.tokens = {}
        self.symbol2label = {}
        self.start = 256  # lowest possible symbol number

    def dump(self, filename):
        """Dump the grammar tables to a pickle file.

        dump() recursively changes all dict to OrderedDict, so the pickled file
        is not exactly the same as what was passed in to dump(). load() uses the
        pickled file to create the tables, but only changes OrderedDict to dict
        at the top level; it does not recursively change OrderedDict to dict.
        So, the loaded tables are different from the original tables that were
        passed to load() in that some of the OrderedDict (from the pickled file)
        are not changed back to dict. For parsing, this has no effect on
        performance because OrderedDict uses dict's __getitem__ with nothing in
        between.
        """
        with open(filename, "wb") as f:
            d = _make_deterministic(self.__dict__)
            # Protocol 2 keeps the pickle loadable by older interpreters.
            pickle.dump(d, f, 2)

    def load(self, filename):
        """Load the grammar tables from a pickle file.

        Replaces all instance tables with the pickled ones.
        """
        # Use a context manager so the file is closed even if
        # pickle.load() raises; the original open/close pair leaked
        # the handle on error.
        with open(filename, "rb") as f:
            d = pickle.load(f)
        self.__dict__.update(d)

    def loads(self, pkl):
        """Load the grammar tables from a pickle bytes object."""
        self.__dict__.update(pickle.loads(pkl))

    def copy(self):
        """Return a shallow copy of the grammar.

        The table containers are copied one level deep, so the copy's
        tables can be extended without mutating this grammar's tables.
        (The table *contents* are still shared.)
        """
        new = self.__class__()
        for dict_attr in ("symbol2number", "number2symbol", "dfas", "keywords",
                          "tokens", "symbol2label"):
            setattr(new, dict_attr, getattr(self, dict_attr).copy())
        new.labels = self.labels[:]
        new.states = self.states[:]
        new.start = self.start
        return new

    def report(self):
        """Dump the grammar tables to standard output, for debugging."""
        from pprint import pprint
        # Parenthesized single-argument print calls behave identically
        # under Python 2 and Python 3; the original Python 2 print
        # statements are a syntax error under Python 3.
        print("s2n")
        pprint(self.symbol2number)
        print("n2s")
        pprint(self.number2symbol)
        print("states")
        pprint(self.states)
        print("dfas")
        pprint(self.dfas)
        print("labels")
        pprint(self.labels)
        print("start %d" % self.start)
143
144
145def _make_deterministic(top):
146    if isinstance(top, dict):
147        return collections.OrderedDict(
148            sorted(((k, _make_deterministic(v)) for k, v in top.iteritems())))
149    if isinstance(top, list):
150        return [_make_deterministic(e) for e in top]
151    if isinstance(top, tuple):
152        return tuple(_make_deterministic(e) for e in top)
153    return top
154
155
# Map from operator to number (since tokenize doesn't do this)

opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
` BACKQUOTE
{ LBRACE
} RBRACE
@ AT
@= ATEQUAL
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
-> RARROW
"""

# Each non-blank line of opmap_raw is "<operator> <TOKEN_NAME>"; resolve
# the name against the token module to build operator -> token number.
_op_name_pairs = (entry.split() for entry in opmap_raw.splitlines() if entry)
opmap = {op: getattr(token, name) for op, name in _op_name_pairs}
213