import collections
import tokenize  # from stdlib

from . import grammar, token


class ParserGenerator(object):
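    """Generate parser tables from a grammar file.

    The grammar file is parsed rule by rule; each rule's right-hand side
    is built up as an NFA, converted to a DFA by make_dfa(), simplified,
    and finally packed into a grammar.Grammar object by make_grammar().

    A minimal usage sketch (the file paths are illustrative, not fixed by
    this module):

        pgen = ParserGenerator("Grammar/Grammar", "Grammar/Tokens")
        g = pgen.make_grammar()
    """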

    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
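        """Parse *grammar_file* and precompute the FIRST set of every rule.

        If *stream* is given, it is read instead of opening *grammar_file*;
        *grammar_file* is still used as the filename in error messages.
        """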
        close_stream = None
        if stream is None:
            stream = open(grammar_file)
            close_stream = stream.close
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=
        self.opmap['<>'] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.stream = stream
        self.generator = tokenize.generate_tokens(stream.readline)
        self.gettoken()  # Initialize lookahead
        self.dfas, self.startsymbol = self.parse()
        if close_stream is not None:
            close_stream()
        self.first = {}  # map from symbol name to set of tokens
        self.addfirstsets()

    def make_grammar(self):
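        """Pack the parsed DFAs into a grammar.Grammar object.

        Nonterminals are numbered upwards from 256, with the start symbol
        first; a (0, state) arc on a final state marks acceptance.
        """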
        c = grammar.Grammar()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next)))
                if state.isfinal:
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first(self, c, name):
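        """Map the FIRST set of rule *name* from labels to ilabel numbers."""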
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
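        """Return the ilabel for *label*, registering it in *c* if new.

        A label is either a nonterminal name, a named token type such as
        NAME or NUMBER, or a quoted keyword or operator string.
        """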
        # XXX Maybe this should be a method on a subclass of converter?
        ilabel = len(c.labels)
        if label[0].isalpha():
            # Either a symbol name or a named token
            if label in c.symbol2number:
                # A symbol name (a non-terminal)
                if label in c.symbol2label:
                    return c.symbol2label[label]
                else:
                    c.labels.append((c.symbol2number[label], None))
                    c.symbol2label[label] = ilabel
                    return ilabel
            else:
                # A named token (NAME, NUMBER, STRING)
                itoken = self.tokens.get(label, None)
                assert isinstance(itoken, int), label
                assert itoken in self.tokens.values(), label
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel
        else:
            # Either a keyword or an operator
            assert label[0] in ('"', "'"), label
            value = eval(label)
            if value[0].isalpha():
                # A keyword
                if value in c.keywords:
                    return c.keywords[value]
                else:
                    c.labels.append((self.tokens["NAME"], value))
                    c.keywords[value] = ilabel
                    return ilabel
            else:
                # An operator (any non-alphabetic token)
                tok_name = self.opmap[value]  # Fails if unknown token
                itoken = self.tokens[tok_name]
                if itoken in c.tokens:
                    return c.tokens[itoken]
                else:
                    c.labels.append((itoken, None))
                    c.tokens[itoken] = ilabel
                    return ilabel

    def addfirstsets(self):
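        """Compute the FIRST set of every rule (see calcfirst)."""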
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calcfirst(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calcfirst(self, name):
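        """Recursively compute the FIRST set of rule *name*.

        Raises ValueError if the rule is left-recursive or if two of its
        alternatives can begin with the same token, since the generated
        parser could not choose between them with one token of lookahead.
        """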
        dfa = self.dfas[name]
        self.first[name] = None  # dummy to detect left recursion
        state = dfa[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calcfirst(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError("rule %s is ambiguous; %s is in the"
                                     " first sets of %s as well as %s" %
                                     (name, symbol, label, inverse[symbol]))
                inverse[symbol] = label
        self.first[name] = totalset

    def parse(self):
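        """Parse the grammar file; return (dfas, startsymbol).

        *dfas* maps each rule name to its simplified DFA, in definition
        order; *startsymbol* is the name of the first rule.  Each rule
        has the form NAME ':' RHS NEWLINE, for example (an illustrative
        rule, not taken from any real grammar):

            term: factor (('*' | '/') factor)*
        """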
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != tokenize.ENDMARKER:
            while self.type == tokenize.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(tokenize.NAME)
            if self.verbose:
                print("Processing rule {dfa_name}".format(dfa_name=name))
            self.expect(tokenize.OP, ":")
            a, z = self.parse_rhs()
            self.expect(tokenize.NEWLINE)
            if self.verbose:
                self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            if self.verbose:
                self.dump_dfa(name, dfa)
            self.simplify_dfa(dfa)
            dfas[name] = dfa
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

    def make_dfa(self, start, finish):
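        """Convert the NFA rooted at *start* into a DFA that accepts at
        *finish*, using the standard subset construction."""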
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  The NFA state sets are represented as
        # Python set objects.
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state):
            base = set()
            addclosure(state, base)
            return base

        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base.add(state)
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, set()))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, "(final)" if state is finish else "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, "(final)" if state.isfinal else "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
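        """Collapse equivalent states of *dfa* in place."""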
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        # print("  unify", i, j)
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
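        """Parse an RHS; return (start, end) NFA states for it.

        Alternatives are wired in parallel between a fresh start state
        and a fresh end state using epsilon (label-less) arcs.
        """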
        # RHS: ALT ('|' ALT)*
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (tokenize.NAME, tokenize.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(tokenize.OP, ")")
            return a, z
        elif self.type in (tokenize.NAME, tokenize.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
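        """Advance the one-token lookahead, skipping comments and
        non-logical newlines."""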
        tup = next(self.generator)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.generator)
        self.type, self.value, self.begin, self.end, self.line = tup
        # print(tokenize.tok_name[self.type], repr(self.value))

    def raise_error(self, msg, *args):
        if args:
            try:
                msg = msg % args
            except Exception:
                msg = " ".join([msg] + list(map(str, args)))
        raise SyntaxError(msg, (self.filename, self.end[0],
                                self.end[1], self.line))


class NFAState(object):
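    """A state of an NFA, holding a list of labeled outgoing arcs.

    An arc labeled None is an epsilon transition (it consumes no input).
    """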

    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))


class DFAState(object):
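    """A DFA state: the set of NFA states it represents, plus its arcs.

    Arcs map labels to successor DFAState instances; a state is final if
    the NFA's finish state belongs to its set.
    """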

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, set)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # For Py3 compatibility.