# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.
"""Convert graminit.[ch] spit out by pgen to Python code.

Pgen is the Python parser generator. It is useful to quickly create a
parser from a grammar file in Python's grammar notation. But I don't
want my parsers to be written in C (yet), so I'm translating the
parsing tables to Python data structures and writing a Python parse
engine.

Note that the token numbers are constants determined by the standard
Python tokenizer. The standard token module defines these numbers and
their names (the names are not used much). The token numbers are
hardcoded into the Python tokenizer and into pgen. A Python
implementation of the Python tokenizer is also available, in the
standard tokenize module.

On the other hand, symbol numbers (representing the grammar's
non-terminals) are assigned by pgen based on the actual grammar
input.

Note: this module is pretty much obsolete; the pgen module generates
equivalent grammar tables directly from the Grammar.txt input file
without having to invoke the pgen C program.

"""

# Python imports
import re

# Local imports
from pgen2 import grammar
from pgen2 import token


class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c. The other methods are for internal use only.

    See the base class for more documentation.

    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen."""
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen. (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers. We build two tables
        mapping the numbers to names and back.

        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        self.symbol2number = {}
        self.number2symbol = {}
        lineno = 0
        for line in f:
            lineno += 1
            mo = re.match(r'^#define\s+(\w+)\s+(\d+)$', line)
            if not mo:
                # Blank lines are harmless; report anything else we
                # can't parse, but keep going.
                if line.strip():
                    print("%s(%s): can't parse %s" %
                          (filename, lineno, line.strip()))
            else:
                symbol, number = mo.groups()
                number = int(number)
                assert symbol not in self.symbol2number
                assert number not in self.number2symbol
                self.symbol2number[symbol] = number
                self.number2symbol[number] = symbol
        return True
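
    # For reference, graminit.h consists of #define lines such as (the
    # names and numbers below are illustrative only, not taken from any
    # particular grammar):
    #
    #     #define single_input 256
    #     #define file_input 257
    #
    # which parse_graminit_h() above turns into the two mappings
    #     symbol2number == {"single_input": 256, "file_input": 257, ...}
    #     number2symbol == {256: "single_input", 257: "file_input", ...}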

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen. (Internal)

        The file looks as follows. The first two lines are always this:

        #include "pgenheaders.h"
        #include "grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
                  {<i>, <j>},
                  ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
                  {<k>, arcs_<n>_<m>},
                  ...
          };

        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # The code below essentially uses f's iterator-ness!
        lineno = 0

        # Expect the two #include lines
        lineno, line = lineno + 1, next(f)
        assert line == '#include "pgenheaders.h"\n', (lineno, line)
        lineno, line = lineno + 1, next(f)
        assert line == '#include "grammar.h"\n', (lineno, line)

        # Parse the state definitions
        lineno, line = lineno + 1, next(f)
        allarcs = {}
        states = []
        while line.startswith('static arc '):
            while line.startswith('static arc '):
                mo = re.match(r'static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$',
                              line)
                assert mo, (lineno, line)
                n, m, k = list(map(int, mo.groups()))
                arcs = []
                for _ in range(k):
                    lineno, line = lineno + 1, next(f)
                    mo = re.match(r'\s+{(\d+), (\d+)},$', line)
                    assert mo, (lineno, line)
                    i, j = list(map(int, mo.groups()))
                    arcs.append((i, j))
                lineno, line = lineno + 1, next(f)
                assert line == '};\n', (lineno, line)
                allarcs[(n, m)] = arcs
                lineno, line = lineno + 1, next(f)
            mo = re.match(r'static state states_(\d+)\[(\d+)\] = {$', line)
            assert mo, (lineno, line)
            s, t = list(map(int, mo.groups()))
            assert s == len(states), (lineno, line)
            state = []
            for _ in range(t):
                lineno, line = lineno + 1, next(f)
                mo = re.match(r'\s+{(\d+), arcs_(\d+)_(\d+)},$', line)
                assert mo, (lineno, line)
                k, n, m = list(map(int, mo.groups()))
                arcs = allarcs[n, m]
                assert k == len(arcs), (lineno, line)
                state.append(arcs)
            states.append(state)
            lineno, line = lineno + 1, next(f)
            assert line == '};\n', (lineno, line)
            lineno, line = lineno + 1, next(f)
        self.states = states

        # Parse the dfas
        dfas = {}
        mo = re.match(r'static dfa dfas\[(\d+)\] = {$', line)
        assert mo, (lineno, line)
        ndfas = int(mo.group(1))
        for i in range(ndfas):
            lineno, line = lineno + 1, next(f)
            mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                          line)
            assert mo, (lineno, line)
            symbol = mo.group(2)
            number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
            assert self.symbol2number[symbol] == number, (lineno, line)
            assert self.number2symbol[number] == symbol, (lineno, line)
            assert x == 0, (lineno, line)
            state = states[z]
            assert y == len(state), (lineno, line)
            lineno, line = lineno + 1, next(f)
            # The second line of each dfa entry is the first-set bitmap,
            # written as a C string of octal escapes; each set bit marks a
            # label that can begin this nonterminal.
            mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
            assert mo, (lineno, line)
            first = {}
            rawbitset = eval(mo.group(1))
            for i, c in enumerate(rawbitset):
                byte = ord(c)
                for j in range(8):
                    if byte & (1 << j):
                        first[i * 8 + j] = 1
            dfas[number] = (state, first)
        lineno, line = lineno + 1, next(f)
        assert line == '};\n', (lineno, line)
        self.dfas = dfas

        # Parse the labels
        labels = []
        lineno, line = lineno + 1, next(f)
        mo = re.match(r'static label labels\[(\d+)\] = {$', line)
        assert mo, (lineno, line)
        nlabels = int(mo.group(1))
        for i in range(nlabels):
            lineno, line = lineno + 1, next(f)
            mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
            assert mo, (lineno, line)
            x, y = mo.groups()
            x = int(x)
            if y == '0':
                y = None
            else:
                y = eval(y)
            labels.append((x, y))
        lineno, line = lineno + 1, next(f)
        assert line == '};\n', (lineno, line)
        self.labels = labels

        # Parse the grammar struct
        lineno, line = lineno + 1, next(f)
        assert line == 'grammar _PyParser_Grammar = {\n', (lineno, line)
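        # The struct body lists, in order: the number of dfas, the dfas
        # array, the label count together with the labels array, and the
        # start symbol; the counts and the start symbol are cross-checked
        # against the tables parsed above.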
        lineno, line = lineno + 1, next(f)
        mo = re.match(r'\s+(\d+),$', line)
        assert mo, (lineno, line)
        ndfas = int(mo.group(1))
        assert ndfas == len(self.dfas)
        lineno, line = lineno + 1, next(f)
        assert line == '\tdfas,\n', (lineno, line)
        lineno, line = lineno + 1, next(f)
        mo = re.match(r'\s+{(\d+), labels},$', line)
        assert mo, (lineno, line)
        nlabels = int(mo.group(1))
        assert nlabels == len(self.labels), (lineno, line)
        lineno, line = lineno + 1, next(f)
        mo = re.match(r'\s+(\d+)$', line)
        assert mo, (lineno, line)
        start = int(mo.group(1))
        assert start in self.number2symbol, (lineno, line)
        self.start = start
        lineno, line = lineno + 1, next(f)
        assert line == '};\n', (lineno, line)
        try:
            lineno, line = lineno + 1, next(f)
        except StopIteration:
            pass
        else:
            assert 0, (lineno, line)

    def finish_off(self):
        """Create additional useful structures. (Internal)."""
        self.keywords = {}  # map from keyword strings to arc labels
        self.tokens = {}    # map from numeric token values to arc labels
        for ilabel, (type, value) in enumerate(self.labels):
            if type == token.NAME and value is not None:
                self.keywords[value] = ilabel
            elif value is None:
                self.tokens[type] = ilabel
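

# A minimal usage sketch: point the converter at a graminit.h/graminit.c
# pair produced by the C pgen program and inspect the resulting tables.
# The file names below are the conventional ones; adjust them to wherever
# pgen wrote its output.
if __name__ == "__main__":
    import sys

    g = Converter()
    h_file = sys.argv[1] if len(sys.argv) > 1 else "graminit.h"
    c_file = sys.argv[2] if len(sys.argv) > 2 else "graminit.c"
    g.run(h_file, c_file)
    print("%d dfas, %d labels, start symbol %s (%d)"
          % (len(g.dfas), len(g.labels), g.number2symbol[g.start], g.start))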