• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3"""Convert graminit.[ch] spit out by pgen to Python code.
4
5Pgen is the Python parser generator.  It is useful to quickly create a
6parser from a grammar file in Python's grammar notation.  But I don't
7want my parsers to be written in C (yet), so I'm translating the
8parsing tables to Python data structures and writing a Python parse
9engine.
10
11Note that the token numbers are constants determined by the standard
12Python tokenizer.  The standard token module defines these numbers and
13their names (the names are not used much).  The token numbers are
14hardcoded into the Python tokenizer and into pgen.  A Python
15implementation of the Python tokenizer is also available, in the
16standard tokenize module.
17
18On the other hand, symbol numbers (representing the grammar's
19non-terminals) are assigned by pgen based on the actual grammar
20input.
21
22Note: this module is pretty much obsolete; the pgen module generates
23equivalent grammar tables directly from the Grammar.txt input file
24without having to invoke the Python pgen C program.
25
26"""
27
28# Python imports
29import re
30
31# Local imports
32from pgen2 import grammar
33from pgen2 import token
34
35
36class Converter(grammar.Grammar):
37  """Grammar subclass that reads classic pgen output files.
38
39    The run() method reads the tables as produced by the pgen parser
40    generator, typically contained in two C files, graminit.h and
41    graminit.c.  The other methods are for internal use only.
42
43    See the base class for more documentation.
44
45    """
46
47  def run(self, graminit_h, graminit_c):
48    """Load the grammar tables from the text files written by pgen."""
49    self.parse_graminit_h(graminit_h)
50    self.parse_graminit_c(graminit_c)
51    self.finish_off()
52
53  def parse_graminit_h(self, filename):
54    """Parse the .h file written by pgen.  (Internal)
55
56        This file is a sequence of #define statements defining the
57        nonterminals of the grammar as numbers.  We build two tables
58        mapping the numbers to names and back.
59
60        """
61    try:
62      f = open(filename)
63    except OSError as err:
64      print("Can't open %s: %s" % (filename, err))
65      return False
66    self.symbol2number = {}
67    self.number2symbol = {}
68    lineno = 0
69    for line in f:
70      lineno += 1
71      mo = re.match(r'^#define\s+(\w+)\s+(\d+)$', line)
72      if not mo and line.strip():
73        print("%s(%s): can't parse %s" % (filename, lineno, line.strip()))
74      else:
75        symbol, number = mo.groups()
76        number = int(number)
77        assert symbol not in self.symbol2number
78        assert number not in self.number2symbol
79        self.symbol2number[symbol] = number
80        self.number2symbol[number] = symbol
81    return True
82
83  def parse_graminit_c(self, filename):
84    """Parse the .c file written by pgen.  (Internal)
85
86        The file looks as follows.  The first two lines are always this:
87
88        #include "pgenheaders.h"
89        #include "grammar.h"
90
91        After that come four blocks:
92
93        1) one or more state definitions
94        2) a table defining dfas
95        3) a table defining labels
96        4) a struct defining the grammar
97
98        A state definition has the following form:
99        - one or more arc arrays, each of the form:
100          static arc arcs_<n>_<m>[<k>] = {
101                  {<i>, <j>},
102                  ...
103          };
104        - followed by a state array, of the form:
105          static state states_<s>[<t>] = {
106                  {<k>, arcs_<n>_<m>},
107                  ...
108          };
109
110        """
111    try:
112      f = open(filename)
113    except OSError as err:
114      print("Can't open %s: %s" % (filename, err))
115      return False
116    # The code below essentially uses f's iterator-ness!
117    lineno = 0
118
119    # Expect the two #include lines
120    lineno, line = lineno + 1, next(f)
121    assert line == '#include "pgenheaders.h"\n', (lineno, line)
122    lineno, line = lineno + 1, next(f)
123    assert line == '#include "grammar.h"\n', (lineno, line)
124
125    # Parse the state definitions
126    lineno, line = lineno + 1, next(f)
127    allarcs = {}
128    states = []
129    while line.startswith('static arc '):
130      while line.startswith('static arc '):
131        mo = re.match(r'static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$', line)
132        assert mo, (lineno, line)
133        n, m, k = list(map(int, mo.groups()))
134        arcs = []
135        for _ in range(k):
136          lineno, line = lineno + 1, next(f)
137          mo = re.match(r'\s+{(\d+), (\d+)},$', line)
138          assert mo, (lineno, line)
139          i, j = list(map(int, mo.groups()))
140          arcs.append((i, j))
141        lineno, line = lineno + 1, next(f)
142        assert line == '};\n', (lineno, line)
143        allarcs[(n, m)] = arcs
144        lineno, line = lineno + 1, next(f)
145      mo = re.match(r'static state states_(\d+)\[(\d+)\] = {$', line)
146      assert mo, (lineno, line)
147      s, t = list(map(int, mo.groups()))
148      assert s == len(states), (lineno, line)
149      state = []
150      for _ in range(t):
151        lineno, line = lineno + 1, next(f)
152        mo = re.match(r'\s+{(\d+), arcs_(\d+)_(\d+)},$', line)
153        assert mo, (lineno, line)
154        k, n, m = list(map(int, mo.groups()))
155        arcs = allarcs[n, m]
156        assert k == len(arcs), (lineno, line)
157        state.append(arcs)
158      states.append(state)
159      lineno, line = lineno + 1, next(f)
160      assert line == '};\n', (lineno, line)
161      lineno, line = lineno + 1, next(f)
162    self.states = states
163
164    # Parse the dfas
165    dfas = {}
166    mo = re.match(r'static dfa dfas\[(\d+)\] = {$', line)
167    assert mo, (lineno, line)
168    ndfas = int(mo.group(1))
169    for i in range(ndfas):
170      lineno, line = lineno + 1, next(f)
171      mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', line)
172      assert mo, (lineno, line)
173      symbol = mo.group(2)
174      number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
175      assert self.symbol2number[symbol] == number, (lineno, line)
176      assert self.number2symbol[number] == symbol, (lineno, line)
177      assert x == 0, (lineno, line)
178      state = states[z]
179      assert y == len(state), (lineno, line)
180      lineno, line = lineno + 1, next(f)
181      mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
182      assert mo, (lineno, line)
183      first = {}
184      rawbitset = eval(mo.group(1))
185      for i, c in enumerate(rawbitset):
186        byte = ord(c)
187        for j in range(8):
188          if byte & (1 << j):
189            first[i * 8 + j] = 1
190      dfas[number] = (state, first)
191    lineno, line = lineno + 1, next(f)
192    assert line == '};\n', (lineno, line)
193    self.dfas = dfas
194
195    # Parse the labels
196    labels = []
197    lineno, line = lineno + 1, next(f)
198    mo = re.match(r'static label labels\[(\d+)\] = {$', line)
199    assert mo, (lineno, line)
200    nlabels = int(mo.group(1))
201    for i in range(nlabels):
202      lineno, line = lineno + 1, next(f)
203      mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
204      assert mo, (lineno, line)
205      x, y = mo.groups()
206      x = int(x)
207      if y == '0':
208        y = None
209      else:
210        y = eval(y)
211      labels.append((x, y))
212    lineno, line = lineno + 1, next(f)
213    assert line == '};\n', (lineno, line)
214    self.labels = labels
215
216    # Parse the grammar struct
217    lineno, line = lineno + 1, next(f)
218    assert line == 'grammar _PyParser_Grammar = {\n', (lineno, line)
219    lineno, line = lineno + 1, next(f)
220    mo = re.match(r'\s+(\d+),$', line)
221    assert mo, (lineno, line)
222    ndfas = int(mo.group(1))
223    assert ndfas == len(self.dfas)
224    lineno, line = lineno + 1, next(f)
225    assert line == '\tdfas,\n', (lineno, line)
226    lineno, line = lineno + 1, next(f)
227    mo = re.match(r'\s+{(\d+), labels},$', line)
228    assert mo, (lineno, line)
229    nlabels = int(mo.group(1))
230    assert nlabels == len(self.labels), (lineno, line)
231    lineno, line = lineno + 1, next(f)
232    mo = re.match(r'\s+(\d+)$', line)
233    assert mo, (lineno, line)
234    start = int(mo.group(1))
235    assert start in self.number2symbol, (lineno, line)
236    self.start = start
237    lineno, line = lineno + 1, next(f)
238    assert line == '};\n', (lineno, line)
239    try:
240      lineno, line = lineno + 1, next(f)
241    except StopIteration:
242      pass
243    else:
244      assert 0, (lineno, line)
245
246  def finish_off(self):
247    """Create additional useful structures.  (Internal)."""
248    self.keywords = {}  # map from keyword strings to arc labels
249    self.tokens = {}  # map from numeric token values to arc labels
250    for ilabel, (type, value) in enumerate(self.labels):
251      if type == token.NAME and value is not None:
252        self.keywords[value] = ilabel
253      elif value is None:
254        self.tokens[type] = ilabel
255