1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. 2# Licensed to PSF under a Contributor Agreement. 3 4"""Convert graminit.[ch] spit out by pgen to Python code. 5 6Pgen is the Python parser generator. It is useful to quickly create a 7parser from a grammar file in Python's grammar notation. But I don't 8want my parsers to be written in C (yet), so I'm translating the 9parsing tables to Python data structures and writing a Python parse 10engine. 11 12Note that the token numbers are constants determined by the standard 13Python tokenizer. The standard token module defines these numbers and 14their names (the names are not used much). The token numbers are 15hardcoded into the Python tokenizer and into pgen. A Python 16implementation of the Python tokenizer is also available, in the 17standard tokenize module. 18 19On the other hand, symbol numbers (representing the grammar's 20non-terminals) are assigned by pgen based on the actual grammar 21input. 22 23Note: this module is pretty much obsolete; the pgen module generates 24equivalent grammar tables directly from the Grammar.txt input file 25without having to invoke the Python pgen C program. 26 27""" 28 29# Python imports 30import re 31 32# Local imports 33from pgen2 import grammar, token 34 35 36class Converter(grammar.Grammar): 37 """Grammar subclass that reads classic pgen output files. 38 39 The run() method reads the tables as produced by the pgen parser 40 generator, typically contained in two C files, graminit.h and 41 graminit.c. The other methods are for internal use only. 42 43 See the base class for more documentation. 44 45 """ 46 47 def run(self, graminit_h, graminit_c): 48 """Load the grammar tables from the text files written by pgen.""" 49 self.parse_graminit_h(graminit_h) 50 self.parse_graminit_c(graminit_c) 51 self.finish_off() 52 53 def parse_graminit_h(self, filename): 54 """Parse the .h file written by pgen. (Internal) 55 56 This file is a sequence of #define statements defining the 57 nonterminals of the grammar as numbers. We build two tables 58 mapping the numbers to names and back. 59 60 """ 61 try: 62 f = open(filename) 63 except IOError, err: 64 print "Can't open %s: %s" % (filename, err) 65 return False 66 self.symbol2number = {} 67 self.number2symbol = {} 68 lineno = 0 69 for line in f: 70 lineno += 1 71 mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line) 72 if not mo and line.strip(): 73 print "%s(%s): can't parse %s" % (filename, lineno, 74 line.strip()) 75 else: 76 symbol, number = mo.groups() 77 number = int(number) 78 assert symbol not in self.symbol2number 79 assert number not in self.number2symbol 80 self.symbol2number[symbol] = number 81 self.number2symbol[number] = symbol 82 return True 83 84 def parse_graminit_c(self, filename): 85 """Parse the .c file written by pgen. (Internal) 86 87 The file looks as follows. The first two lines are always this: 88 89 #include "pgenheaders.h" 90 #include "grammar.h" 91 92 After that come four blocks: 93 94 1) one or more state definitions 95 2) a table defining dfas 96 3) a table defining labels 97 4) a struct defining the grammar 98 99 A state definition has the following form: 100 - one or more arc arrays, each of the form: 101 static arc arcs_<n>_<m>[<k>] = { 102 {<i>, <j>}, 103 ... 104 }; 105 - followed by a state array, of the form: 106 static state states_<s>[<t>] = { 107 {<k>, arcs_<n>_<m>}, 108 ... 109 }; 110 111 """ 112 try: 113 f = open(filename) 114 except IOError, err: 115 print "Can't open %s: %s" % (filename, err) 116 return False 117 # The code below essentially uses f's iterator-ness! 118 lineno = 0 119 120 # Expect the two #include lines 121 lineno, line = lineno+1, f.next() 122 assert line == '#include "pgenheaders.h"\n', (lineno, line) 123 lineno, line = lineno+1, f.next() 124 assert line == '#include "grammar.h"\n', (lineno, line) 125 126 # Parse the state definitions 127 lineno, line = lineno+1, f.next() 128 allarcs = {} 129 states = [] 130 while line.startswith("static arc "): 131 while line.startswith("static arc "): 132 mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$", 133 line) 134 assert mo, (lineno, line) 135 n, m, k = map(int, mo.groups()) 136 arcs = [] 137 for _ in range(k): 138 lineno, line = lineno+1, f.next() 139 mo = re.match(r"\s+{(\d+), (\d+)},$", line) 140 assert mo, (lineno, line) 141 i, j = map(int, mo.groups()) 142 arcs.append((i, j)) 143 lineno, line = lineno+1, f.next() 144 assert line == "};\n", (lineno, line) 145 allarcs[(n, m)] = arcs 146 lineno, line = lineno+1, f.next() 147 mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line) 148 assert mo, (lineno, line) 149 s, t = map(int, mo.groups()) 150 assert s == len(states), (lineno, line) 151 state = [] 152 for _ in range(t): 153 lineno, line = lineno+1, f.next() 154 mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line) 155 assert mo, (lineno, line) 156 k, n, m = map(int, mo.groups()) 157 arcs = allarcs[n, m] 158 assert k == len(arcs), (lineno, line) 159 state.append(arcs) 160 states.append(state) 161 lineno, line = lineno+1, f.next() 162 assert line == "};\n", (lineno, line) 163 lineno, line = lineno+1, f.next() 164 self.states = states 165 166 # Parse the dfas 167 dfas = {} 168 mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line) 169 assert mo, (lineno, line) 170 ndfas = int(mo.group(1)) 171 for i in range(ndfas): 172 lineno, line = lineno+1, f.next() 173 mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$', 174 line) 175 assert mo, (lineno, line) 176 symbol = mo.group(2) 177 number, x, y, z = map(int, mo.group(1, 3, 4, 5)) 178 assert self.symbol2number[symbol] == number, (lineno, line) 179 assert self.number2symbol[number] == symbol, (lineno, line) 180 assert x == 0, (lineno, line) 181 state = states[z] 182 assert y == len(state), (lineno, line) 183 lineno, line = lineno+1, f.next() 184 mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line) 185 assert mo, (lineno, line) 186 first = {} 187 rawbitset = eval(mo.group(1)) 188 for i, c in enumerate(rawbitset): 189 byte = ord(c) 190 for j in range(8): 191 if byte & (1<<j): 192 first[i*8 + j] = 1 193 dfas[number] = (state, first) 194 lineno, line = lineno+1, f.next() 195 assert line == "};\n", (lineno, line) 196 self.dfas = dfas 197 198 # Parse the labels 199 labels = [] 200 lineno, line = lineno+1, f.next() 201 mo = re.match(r"static label labels\[(\d+)\] = {$", line) 202 assert mo, (lineno, line) 203 nlabels = int(mo.group(1)) 204 for i in range(nlabels): 205 lineno, line = lineno+1, f.next() 206 mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line) 207 assert mo, (lineno, line) 208 x, y = mo.groups() 209 x = int(x) 210 if y == "0": 211 y = None 212 else: 213 y = eval(y) 214 labels.append((x, y)) 215 lineno, line = lineno+1, f.next() 216 assert line == "};\n", (lineno, line) 217 self.labels = labels 218 219 # Parse the grammar struct 220 lineno, line = lineno+1, f.next() 221 assert line == "grammar _PyParser_Grammar = {\n", (lineno, line) 222 lineno, line = lineno+1, f.next() 223 mo = re.match(r"\s+(\d+),$", line) 224 assert mo, (lineno, line) 225 ndfas = int(mo.group(1)) 226 assert ndfas == len(self.dfas) 227 lineno, line = lineno+1, f.next() 228 assert line == "\tdfas,\n", (lineno, line) 229 lineno, line = lineno+1, f.next() 230 mo = re.match(r"\s+{(\d+), labels},$", line) 231 assert mo, (lineno, line) 232 nlabels = int(mo.group(1)) 233 assert nlabels == len(self.labels), (lineno, line) 234 lineno, line = lineno+1, f.next() 235 mo = re.match(r"\s+(\d+)$", line) 236 assert mo, (lineno, line) 237 start = int(mo.group(1)) 238 assert start in self.number2symbol, (lineno, line) 239 self.start = start 240 lineno, line = lineno+1, f.next() 241 assert line == "};\n", (lineno, line) 242 try: 243 lineno, line = lineno+1, f.next() 244 except StopIteration: 245 pass 246 else: 247 assert 0, (lineno, line) 248 249 def finish_off(self): 250 """Create additional useful structures. (Internal).""" 251 self.keywords = {} # map from keyword strings to arc labels 252 self.tokens = {} # map from numeric token values to arc labels 253 for ilabel, (type, value) in enumerate(self.labels): 254 if type == token.NAME and value is not None: 255 self.keywords[value] = ilabel 256 elif value is None: 257 self.tokens[type] = ilabel 258