1#------------------------------------------------------------------------------- 2# Parser for ASDL [1] definition files. Reads in an ASDL description and parses 3# it into an AST that describes it. 4# 5# The EBNF we're parsing here: Figure 1 of the paper [1]. Extended to support 6# modules and attributes after a product. Words starting with Capital letters 7# are terminals. Literal tokens are in "double quotes". Others are 8# non-terminals. Id is either TokenId or ConstructorId. 9# 10# module ::= "module" Id "{" [definitions] "}" 11# definitions ::= { TypeId "=" type } 12# type ::= product | sum 13# product ::= fields ["attributes" fields] 14# fields ::= "(" { field, "," } field ")" 15# field ::= TypeId ["?" | "*"] [Id] 16# sum ::= constructor { "|" constructor } ["attributes" fields] 17# constructor ::= ConstructorId [fields] 18# 19# [1] "The Zephyr Abstract Syntax Description Language" by Wang, et. al. See 20# http://asdl.sourceforge.net/ 21#------------------------------------------------------------------------------- 22from collections import namedtuple 23import re 24 25__all__ = [ 26 'builtin_types', 'parse', 'AST', 'Module', 'Type', 'Constructor', 27 'Field', 'Sum', 'Product', 'VisitorBase', 'Check', 'check'] 28 29# The following classes define nodes into which the ASDL description is parsed. 30# Note: this is a "meta-AST". ASDL files (such as Python.asdl) describe the AST 31# structure used by a programming language. But ASDL files themselves need to be 32# parsed. This module parses ASDL files and uses a simple AST to represent them. 33# See the EBNF at the top of the file to understand the logical connection 34# between the various node types. 35 36builtin_types = {'identifier', 'string', 'int', 'constant'} 37 38class AST: 39 def __repr__(self): 40 raise NotImplementedError 41 42class Module(AST): 43 def __init__(self, name, dfns): 44 self.name = name 45 self.dfns = dfns 46 self.types = {type.name: type.value for type in dfns} 47 48 def __repr__(self): 49 return 'Module({0.name}, {0.dfns})'.format(self) 50 51class Type(AST): 52 def __init__(self, name, value): 53 self.name = name 54 self.value = value 55 56 def __repr__(self): 57 return 'Type({0.name}, {0.value})'.format(self) 58 59class Constructor(AST): 60 def __init__(self, name, fields=None): 61 self.name = name 62 self.fields = fields or [] 63 64 def __repr__(self): 65 return 'Constructor({0.name}, {0.fields})'.format(self) 66 67class Field(AST): 68 def __init__(self, type, name=None, seq=False, opt=False): 69 self.type = type 70 self.name = name 71 self.seq = seq 72 self.opt = opt 73 74 def __str__(self): 75 if self.seq: 76 extra = "*" 77 elif self.opt: 78 extra = "?" 79 else: 80 extra = "" 81 82 return "{}{} {}".format(self.type, extra, self.name) 83 84 def __repr__(self): 85 if self.seq: 86 extra = ", seq=True" 87 elif self.opt: 88 extra = ", opt=True" 89 else: 90 extra = "" 91 if self.name is None: 92 return 'Field({0.type}{1})'.format(self, extra) 93 else: 94 return 'Field({0.type}, {0.name}{1})'.format(self, extra) 95 96class Sum(AST): 97 def __init__(self, types, attributes=None): 98 self.types = types 99 self.attributes = attributes or [] 100 101 def __repr__(self): 102 if self.attributes: 103 return 'Sum({0.types}, {0.attributes})'.format(self) 104 else: 105 return 'Sum({0.types})'.format(self) 106 107class Product(AST): 108 def __init__(self, fields, attributes=None): 109 self.fields = fields 110 self.attributes = attributes or [] 111 112 def __repr__(self): 113 if self.attributes: 114 return 'Product({0.fields}, {0.attributes})'.format(self) 115 else: 116 return 'Product({0.fields})'.format(self) 117 118# A generic visitor for the meta-AST that describes ASDL. This can be used by 119# emitters. Note that this visitor does not provide a generic visit method, so a 120# subclass needs to define visit methods from visitModule to as deep as the 121# interesting node. 122# We also define a Check visitor that makes sure the parsed ASDL is well-formed. 123 124class VisitorBase(object): 125 """Generic tree visitor for ASTs.""" 126 def __init__(self): 127 self.cache = {} 128 129 def visit(self, obj, *args): 130 klass = obj.__class__ 131 meth = self.cache.get(klass) 132 if meth is None: 133 methname = "visit" + klass.__name__ 134 meth = getattr(self, methname, None) 135 self.cache[klass] = meth 136 if meth: 137 try: 138 meth(obj, *args) 139 except Exception as e: 140 print("Error visiting %r: %s" % (obj, e)) 141 raise 142 143class Check(VisitorBase): 144 """A visitor that checks a parsed ASDL tree for correctness. 145 146 Errors are printed and accumulated. 147 """ 148 def __init__(self): 149 super(Check, self).__init__() 150 self.cons = {} 151 self.errors = 0 152 self.types = {} 153 154 def visitModule(self, mod): 155 for dfn in mod.dfns: 156 self.visit(dfn) 157 158 def visitType(self, type): 159 self.visit(type.value, str(type.name)) 160 161 def visitSum(self, sum, name): 162 for t in sum.types: 163 self.visit(t, name) 164 165 def visitConstructor(self, cons, name): 166 key = str(cons.name) 167 conflict = self.cons.get(key) 168 if conflict is None: 169 self.cons[key] = name 170 else: 171 print('Redefinition of constructor {}'.format(key)) 172 print('Defined in {} and {}'.format(conflict, name)) 173 self.errors += 1 174 for f in cons.fields: 175 self.visit(f, key) 176 177 def visitField(self, field, name): 178 key = str(field.type) 179 l = self.types.setdefault(key, []) 180 l.append(name) 181 182 def visitProduct(self, prod, name): 183 for f in prod.fields: 184 self.visit(f, name) 185 186def check(mod): 187 """Check the parsed ASDL tree for correctness. 188 189 Return True if success. For failure, the errors are printed out and False 190 is returned. 191 """ 192 v = Check() 193 v.visit(mod) 194 195 for t in v.types: 196 if t not in mod.types and not t in builtin_types: 197 v.errors += 1 198 uses = ", ".join(v.types[t]) 199 print('Undefined type {}, used in {}'.format(t, uses)) 200 return not v.errors 201 202# The ASDL parser itself comes next. The only interesting external interface 203# here is the top-level parse function. 204 205def parse(filename): 206 """Parse ASDL from the given file and return a Module node describing it.""" 207 with open(filename, encoding="utf-8") as f: 208 parser = ASDLParser() 209 return parser.parse(f.read()) 210 211# Types for describing tokens in an ASDL specification. 212class TokenKind: 213 """TokenKind is provides a scope for enumerated token kinds.""" 214 (ConstructorId, TypeId, Equals, Comma, Question, Pipe, Asterisk, 215 LParen, RParen, LBrace, RBrace) = range(11) 216 217 operator_table = { 218 '=': Equals, ',': Comma, '?': Question, '|': Pipe, '(': LParen, 219 ')': RParen, '*': Asterisk, '{': LBrace, '}': RBrace} 220 221Token = namedtuple('Token', 'kind value lineno') 222 223class ASDLSyntaxError(Exception): 224 def __init__(self, msg, lineno=None): 225 self.msg = msg 226 self.lineno = lineno or '<unknown>' 227 228 def __str__(self): 229 return 'Syntax error on line {0.lineno}: {0.msg}'.format(self) 230 231def tokenize_asdl(buf): 232 """Tokenize the given buffer. Yield Token objects.""" 233 for lineno, line in enumerate(buf.splitlines(), 1): 234 for m in re.finditer(r'\s*(\w+|--.*|.)', line.strip()): 235 c = m.group(1) 236 if c[0].isalpha(): 237 # Some kind of identifier 238 if c[0].isupper(): 239 yield Token(TokenKind.ConstructorId, c, lineno) 240 else: 241 yield Token(TokenKind.TypeId, c, lineno) 242 elif c[:2] == '--': 243 # Comment 244 break 245 else: 246 # Operators 247 try: 248 op_kind = TokenKind.operator_table[c] 249 except KeyError: 250 raise ASDLSyntaxError('Invalid operator %s' % c, lineno) 251 yield Token(op_kind, c, lineno) 252 253class ASDLParser: 254 """Parser for ASDL files. 255 256 Create, then call the parse method on a buffer containing ASDL. 257 This is a simple recursive descent parser that uses tokenize_asdl for the 258 lexing. 259 """ 260 def __init__(self): 261 self._tokenizer = None 262 self.cur_token = None 263 264 def parse(self, buf): 265 """Parse the ASDL in the buffer and return an AST with a Module root. 266 """ 267 self._tokenizer = tokenize_asdl(buf) 268 self._advance() 269 return self._parse_module() 270 271 def _parse_module(self): 272 if self._at_keyword('module'): 273 self._advance() 274 else: 275 raise ASDLSyntaxError( 276 'Expected "module" (found {})'.format(self.cur_token.value), 277 self.cur_token.lineno) 278 name = self._match(self._id_kinds) 279 self._match(TokenKind.LBrace) 280 defs = self._parse_definitions() 281 self._match(TokenKind.RBrace) 282 return Module(name, defs) 283 284 def _parse_definitions(self): 285 defs = [] 286 while self.cur_token.kind == TokenKind.TypeId: 287 typename = self._advance() 288 self._match(TokenKind.Equals) 289 type = self._parse_type() 290 defs.append(Type(typename, type)) 291 return defs 292 293 def _parse_type(self): 294 if self.cur_token.kind == TokenKind.LParen: 295 # If we see a (, it's a product 296 return self._parse_product() 297 else: 298 # Otherwise it's a sum. Look for ConstructorId 299 sumlist = [Constructor(self._match(TokenKind.ConstructorId), 300 self._parse_optional_fields())] 301 while self.cur_token.kind == TokenKind.Pipe: 302 # More constructors 303 self._advance() 304 sumlist.append(Constructor( 305 self._match(TokenKind.ConstructorId), 306 self._parse_optional_fields())) 307 return Sum(sumlist, self._parse_optional_attributes()) 308 309 def _parse_product(self): 310 return Product(self._parse_fields(), self._parse_optional_attributes()) 311 312 def _parse_fields(self): 313 fields = [] 314 self._match(TokenKind.LParen) 315 while self.cur_token.kind == TokenKind.TypeId: 316 typename = self._advance() 317 is_seq, is_opt = self._parse_optional_field_quantifier() 318 id = (self._advance() if self.cur_token.kind in self._id_kinds 319 else None) 320 fields.append(Field(typename, id, seq=is_seq, opt=is_opt)) 321 if self.cur_token.kind == TokenKind.RParen: 322 break 323 elif self.cur_token.kind == TokenKind.Comma: 324 self._advance() 325 self._match(TokenKind.RParen) 326 return fields 327 328 def _parse_optional_fields(self): 329 if self.cur_token.kind == TokenKind.LParen: 330 return self._parse_fields() 331 else: 332 return None 333 334 def _parse_optional_attributes(self): 335 if self._at_keyword('attributes'): 336 self._advance() 337 return self._parse_fields() 338 else: 339 return None 340 341 def _parse_optional_field_quantifier(self): 342 is_seq, is_opt = False, False 343 if self.cur_token.kind == TokenKind.Asterisk: 344 is_seq = True 345 self._advance() 346 elif self.cur_token.kind == TokenKind.Question: 347 is_opt = True 348 self._advance() 349 return is_seq, is_opt 350 351 def _advance(self): 352 """ Return the value of the current token and read the next one into 353 self.cur_token. 354 """ 355 cur_val = None if self.cur_token is None else self.cur_token.value 356 try: 357 self.cur_token = next(self._tokenizer) 358 except StopIteration: 359 self.cur_token = None 360 return cur_val 361 362 _id_kinds = (TokenKind.ConstructorId, TokenKind.TypeId) 363 364 def _match(self, kind): 365 """The 'match' primitive of RD parsers. 366 367 * Verifies that the current token is of the given kind (kind can 368 be a tuple, in which the kind must match one of its members). 369 * Returns the value of the current token 370 * Reads in the next token 371 """ 372 if (isinstance(kind, tuple) and self.cur_token.kind in kind or 373 self.cur_token.kind == kind 374 ): 375 value = self.cur_token.value 376 self._advance() 377 return value 378 else: 379 raise ASDLSyntaxError( 380 'Unmatched {} (found {})'.format(kind, self.cur_token.kind), 381 self.cur_token.lineno) 382 383 def _at_keyword(self, keyword): 384 return (self.cur_token.kind == TokenKind.TypeId and 385 self.cur_token.value == keyword) 386