import argparse
import sys
import time
import token
import tokenize
import traceback
from abc import abstractmethod
from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast

from pegen.tokenizer import Mark, Tokenizer, exact_token_types

T = TypeVar("T")
F = TypeVar("F", bound=Callable[..., Any])


def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice this is only used for non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: "Parser", *args: object) -> Any:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return cast(F, logger_wrapper)


def memoize(method: F) -> F:
    """Memoize a symbol method."""
    method_name = method.__name__

    def memoize_wrapper(self: "Parser", *args: object) -> Any:
        mark = self._mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return cast(F, memoize_wrapper)

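
# Note (editorial sketch, not part of the original module): in a parser class
# generated by pegen, every grammar rule becomes a method on a Parser
# subclass.  Plain rules are wrapped with @memoize, the leader rule of each
# left-recursive cycle with @memoize_left_rec, and the remaining (non-leader)
# left-recursive rules with @logger, e.g. roughly:
#
#     @memoize
#     def term(self) -> Optional[Any]:
#         ...
#
# so any rule invocation at a given input position is computed at most once.
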
def memoize_left_rec(
    method: Callable[["Parser"], Optional[T]]
) -> Callable[["Parser"], Optional[T]]:
    """Memoize a left-recursive symbol method."""
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: "Parser") -> Optional[T]:
        mark = self._mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result.  For an explanation why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; perhaps this is similar to a paper by Warth et al.,
            # http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08.)

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                self._reset(mark)
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
                self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return memoize_left_rec_wrapper

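
# Illustration (editorial, not part of the original module) of the
# seed-growing loop above, assuming a hypothetical left-recursive rule
#
#     expr: expr '+' term | term
#
# applied to the input "1 + 2 + 3":
#
#   pass 0: the cache is primed with a failure, so the recursive call to
#           expr fails and the `term` alternative matches "1"; that result
#           is stored in the cache.
#   pass 1: the recursive call now returns the cached "1", so the rule
#           matches "1 + 2"; the cache advances.
#   pass 2: the rule matches "1 + 2 + 3"; the cache advances again.
#   pass 3: no '+' follows the cached result, so the rule falls back to
#           matching just "1", which ends no later than the previous result
#           (endmark <= lastmark); we bail and keep "1 + 2 + 3".
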
class Parser:
    """Parsing base class."""

    KEYWORDS: ClassVar[Tuple[str, ...]]

    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Counter tracking how deeply we are nested in left-recursive rules.
        # Can be useful for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string not in self.KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.TYPE_COMMENT:
            return self._tokenizer.getnext()
        return None

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return not ok

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        tok = self._tokenizer.diagnose()
        return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line))

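
# A minimal sketch of a hand-written Parser subclass (illustrative only;
# _ToyParser and its grammar are hypothetical, not part of pegen).  It shows
# the usual shape of a rule method: remember the current mark, try an
# alternative, and reset on failure.
class _ToyParser(Parser):
    """Accepts a single NUMBER followed by NEWLINE and ENDMARKER."""

    KEYWORDS = ()
    SOFT_KEYWORDS = ()

    @memoize
    def start(self) -> Optional[tokenize.TokenInfo]:
        mark = self._mark()
        num = self.number()
        if num and self.expect("NEWLINE") and self.expect("ENDMARKER"):
            return num
        self._reset(mark)
        return None


# Example use of the sketch above (kept as a comment to avoid extra imports):
#     tokengen = tokenize.generate_tokens(io.StringIO("42\n").readline)
#     tree = _ToyParser(Tokenizer(tokengen)).start()   # -> the NUMBER token
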
def simple_parser_main(parser_class: Type[Parser]) -> None:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()

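
# A parser module generated by pegen typically ends with a block like the
# following so it can be run directly on a source file ("GeneratedParser"
# is a hypothetical name for the class pegen emits for a grammar):
#
#     if __name__ == "__main__":
#         simple_parser_main(GeneratedParser)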