1"""Python parser generator 2 3 4This parser generator transforms a Python grammar file into parsing tables 5that can be consumed by Python's LL(1) parser written in C. 6 7Concepts 8-------- 9 10* An LL(1) parser (Left-to-right, Leftmost derivation, 1 token-lookahead) is a 11 top-down parser for a subset of context-free languages. It parses the input 12 from Left to right, performing Leftmost derivation of the sentence, and can 13 only use 1 token of lookahead when parsing a sentence. 14 15* A parsing table is a collection of data that a generic implementation of the 16 LL(1) parser consumes to know how to parse a given context-free grammar. In 17 this case the collection of data involves Deterministic Finite Automatons, 18 calculated first sets, keywords and transition labels. 19 20* A grammar is defined by production rules (or just 'productions') that specify 21 which symbols may replace which other symbols; these rules may be used to 22 generate strings, or to parse them. Each such rule has a head, or left-hand 23 side, which consists of the string that may be replaced, and a body, or 24 right-hand side, which consists of a string that may replace it. In the 25 Python grammar, rules are written in the form 26 27 rule_name: rule_description; 28 29 meaning the rule 'a: b' specifies that a can be replaced by b. A context-free 30 grammar is a grammar in which the left-hand side of each production rule 31 consists of only a single nonterminal symbol. Context-free grammars can 32 always be recognized by a Non-Deterministic Automatons. 33 34* Terminal symbols are literal symbols which may appear in the outputs of the 35 production rules of the grammar and which cannot be changed using the rules 36 of the grammar. Applying the rules recursively to a source string of symbols 37 will usually terminate in a final output string consisting only of terminal 38 symbols. 39 40* Nonterminal symbols are those symbols which can be replaced. The grammar 41 includes a start symbol a designated member of the set of nonterminals from 42 which all the strings in the language may be derived by successive 43 applications of the production rules. 44 45* The language defined by the grammar is defined as the set of terminal strings 46 that can be derived using the production rules. 47 48* The first sets of a rule (FIRST(rule)) are defined to be the set of terminals 49 that can appear in the first position of any string derived from the rule. 50 This is useful for LL(1) parsers as the parser is only allowed to look at the 51 next token in the input to know which rule needs to parse. For example, given 52 this grammar: 53 54 start: '(' A | B ')' 55 A: 'a' '<' 56 B: 'b' '<' 57 58 and the input '(b<)' the parser can only look at 'b' to know if it needs 59 to parse A o B. Because FIRST(A) = {'a'} and FIRST(B) = {'b'} it knows 60 that needs to continue parsing rule B because only that rule can start 61 with 'b'. 62 63Description 64----------- 65 66The input for the parser generator is a grammar in extended BNF form (using * 67for repetition, + for at-least-once repetition, [] for optional parts, | for 68alternatives and () for grouping). 69 70Each rule in the grammar file is considered as a regular expression in its 71own right. It is turned into a Non-deterministic Finite Automaton (NFA), 72which is then turned into a Deterministic Finite Automaton (DFA), which is 73then optimized to reduce the number of states. 

Description
-----------

The input for the parser generator is a grammar in extended BNF form (using *
for repetition, + for at-least-once repetition, [] for optional parts, | for
alternatives and () for grouping).

Each rule in the grammar file is considered as a regular expression in its
own right. It is turned into a Non-deterministic Finite Automaton (NFA), which
is then turned into a Deterministic Finite Automaton (DFA), which is then
optimized to reduce the number of states. See [Aho&Ullman 77] chapter 3, or
similar compiler books (this technique is more often used for lexical
analyzers).

The DFAs are used by the parser as parsing tables in a special way that's
probably unique. Before they are usable, the FIRST sets of all non-terminals
are computed so the LL(1) parser consuming the parsing tables can distinguish
between different transitions.

Reference
---------

[Aho&Ullman 77]
    Aho&Ullman, Principles of Compiler Design, Addison-Wesley 1977
    (first edition)
"""

from ast import literal_eval
import collections
import enum

from . import grammar, token
from .automata import DFA
from .metaparser import GrammarParser


class LabelType(enum.Enum):
    NONTERMINAL = 0
    NAMED_TOKEN = 1
    KEYWORD = 2
    OPERATOR = 3
    NONE = 4


class Label(str):
    def __init__(self, value):
        self.type = self._get_type()

    def _get_type(self):
        if self[0].isalpha():
            if self.upper() == self:
                # Named tokens (ASYNC, NAME, ...) are all uppercase by convention.
                return LabelType.NAMED_TOKEN
            else:
                # If it is not all uppercase, it must be a nonterminal.
                return LabelType.NONTERMINAL
        else:
            # Keywords and operators are wrapped in quotes.
            assert self[0] == self[-1] in ('"', "'"), self
            value = literal_eval(self)
            if value[0].isalpha():
                return LabelType.KEYWORD
            else:
                return LabelType.OPERATOR

    def __repr__(self):
        return "{}({})".format(self.type, super().__repr__())
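
# For illustration only (these lines are not executed by this module), the
# categorization above yields:
#
#     Label("NAME").type  ->  LabelType.NAMED_TOKEN  (all-uppercase named token)
#     Label("expr").type  ->  LabelType.NONTERMINAL  (not all uppercase)
#     Label("'if'").type  ->  LabelType.KEYWORD      (quoted, starts with a letter)
#     Label("'+'").type   ->  LabelType.OPERATOR     (quoted, non-alphabetic)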

class ParserGenerator(object):
    def __init__(self, grammar_file, token_file, verbose=False, graph_file=None):
        with open(grammar_file) as f:
            self.grammar = f.read()
        with open(token_file) as tok_file:
            token_lines = tok_file.readlines()
        self.tokens = dict(token.generate_tokens(token_lines))
        self.opmap = dict(token.generate_opmap(token_lines))
        # Manually add <> so it does not collide with !=.
        self.opmap["<>"] = "NOTEQUAL"
        self.verbose = verbose
        self.filename = grammar_file
        self.graph_file = graph_file
        self.dfas, self.startsymbol = self.create_dfas()
        self.first = {}  # Map from symbol name to set of tokens.
        self.calculate_first_sets()

    def create_dfas(self):
        rule_to_dfas = collections.OrderedDict()
        start_nonterminal = None
        for nfa in GrammarParser(self.grammar).parse():
            if self.verbose:
                print("Dump of NFA for", nfa.name)
                nfa.dump()
            if self.graph_file is not None:
                nfa.dump_graph(self.graph_file.write)
            dfa = DFA.from_nfa(nfa)
            if self.verbose:
                print("Dump of DFA for", dfa.name)
                dfa.dump()
            dfa.simplify()
            if self.graph_file is not None:
                dfa.dump_graph(self.graph_file.write)
            rule_to_dfas[dfa.name] = dfa

            if start_nonterminal is None:
                start_nonterminal = dfa.name

        return rule_to_dfas, start_nonterminal

    def make_grammar(self):
        c = grammar.Grammar()
        c.all_labels = set()
        names = list(self.dfas.keys())
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[Label(name)] = i
            c.number2symbol[i] = Label(name)
            c.all_labels.add(name)
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in sorted(state.arcs.items()):
                    c.all_labels.add(label)
                    arcs.append((self.make_label(c, label), dfa.states.index(next)))
                if state.is_final:
                    arcs.append((0, dfa.states.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first_sets(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if self.verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c

    def make_first_sets(self, c, name):
        rawfirst = self.first[name]
        first = set()
        for label in sorted(rawfirst):
            ilabel = self.make_label(c, label)
            ##assert ilabel not in first # XXX failed on <> ... !=
            first.add(ilabel)
        return first

    def make_label(self, c, label):
        label = Label(label)
        ilabel = len(c.labels)

        if label.type == LabelType.NONTERMINAL:
            if label in c.symbol2label:
                return c.symbol2label[label]
            else:
                c.labels.append((c.symbol2number[label], None))
                c.symbol2label[label] = ilabel
                return ilabel
        elif label.type == LabelType.NAMED_TOKEN:
            # A named token (NAME, NUMBER, STRING).
            itoken = self.tokens.get(label, None)
            assert isinstance(itoken, int), label
            assert itoken in self.tokens.values(), label
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        elif label.type == LabelType.KEYWORD:
            # A keyword.
            value = literal_eval(label)
            if value in c.keywords:
                return c.keywords[value]
            else:
                c.labels.append((self.tokens["NAME"], value))
                c.keywords[value] = ilabel
                return ilabel
        elif label.type == LabelType.OPERATOR:
            # An operator (any non-alphabetic token).
            value = literal_eval(label)
            tok_name = self.opmap[value]  # Fails if unknown token.
            itoken = self.tokens[tok_name]
            if itoken in c.tokens:
                return c.tokens[itoken]
            else:
                c.labels.append((itoken, None))
                c.tokens[itoken] = ilabel
                return ilabel
        else:
            raise ValueError("Cannot categorize label {}".format(label))
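
    # For illustration, the shape of the (token, value) pairs that make_label
    # appends to c.labels for each branch above (a sketch; the concrete token
    # numbers depend on the Tokens file):
    #
    #     nonterminal 'expr'  ->  (c.symbol2number['expr'], None)
    #     named token 'NAME'  ->  (token number of NAME,    None)
    #     keyword "'if'"      ->  (token number of NAME,    'if')
    #     operator "'+'"      ->  (token number of PLUS,    None)
    #
    # Keywords are stored as NAME tokens with their string attached, which is
    # how the parser consuming the tables can tell 'if' apart from an
    # ordinary NAME.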

    def calculate_first_sets(self):
        names = list(self.dfas.keys())
        for name in names:
            if name not in self.first:
                self.calculate_first_sets_for_rule(name)

            if self.verbose:
                print("First set for {dfa_name}".format(dfa_name=name))
                for item in self.first[name]:
                    print("    - {terminal}".format(terminal=item))

    def calculate_first_sets_for_rule(self, name):
        dfa = self.dfas[name]
        self.first[name] = None  # Dummy value to detect left recursion.
        state = dfa.states[0]
        totalset = set()
        overlapcheck = {}
        for label, next in state.arcs.items():
            if label in self.dfas:
                if label in self.first:
                    fset = self.first[label]
                    if fset is None:
                        raise ValueError("recursion for rule %r" % name)
                else:
                    self.calculate_first_sets_for_rule(label)
                    fset = self.first[label]
                totalset.update(fset)
                overlapcheck[label] = fset
            else:
                totalset.add(label)
                overlapcheck[label] = {label}
        inverse = {}
        for label, itsfirst in overlapcheck.items():
            for symbol in itsfirst:
                if symbol in inverse:
                    raise ValueError(
                        "rule %s is ambiguous; %s is in the"
                        " first sets of %s as well as %s"
                        % (name, symbol, label, inverse[symbol])
                    )
                inverse[symbol] = label
        self.first[name] = totalset
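
# A minimal usage sketch (illustrative only; the file paths below are
# hypothetical, and this is left as a comment so importing the module stays
# side-effect free):
#
#     p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
#     tables = p.make_grammar()  # grammar.Grammar object with the parsing tables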