import argparse
import sys
import time
import token
import tokenize
import traceback

from abc import abstractmethod
from typing import Any, Callable, cast, Dict, Optional, Tuple, Type, TypeVar

from pegen.tokenizer import exact_token_types
from pegen.tokenizer import Mark
from pegen.tokenizer import Tokenizer

T = TypeVar("T")
P = TypeVar("P", bound="Parser")
F = TypeVar("F", bound=Callable[..., Any])


def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice this is only non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: P, *args: object) -> T:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        tree = method(self, *args)
        self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, logger_wrapper)


def memoize(method: F) -> F:
    """Memoize a symbol method."""
    method_name = method.__name__

    def memoize_wrapper(self: P, *args: object) -> T:
        mark = self.mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self.reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            tree = method(self, *args)
            self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self.mark()
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self.reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore
    return cast(F, memoize_wrapper)


def memoize_left_rec(method: Callable[[P], Optional[T]]) -> Callable[[P], Optional[T]]:
    """Memoize a left-recursive symbol method."""
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: P) -> Optional[T]:
        mark = self.mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self.reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result.  For an explanation why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; perhaps this is similar to a technique described
            # in a paper by Warth et al.,
            # http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08.)
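            #
            # A hypothetical illustration (not part of the original comment):
            # for a rule like ``expr: expr '+' term | term`` on input
            # ``1+2+3``, the seeded failure below makes the inner ``expr``
            # reference fail on the first pass, so only ``term`` matches
            # (``1``).  That result is cached, the input is rewound, and the
            # next pass uses it to match ``1+2``, then ``1+2+3``; once a pass
            # makes no further progress, the previous (longest) result wins.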

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                self.reset(mark)
                result = method(self)
                endmark = self.mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                self._cache[key] = lastresult, lastmark = result, endmark

            self.reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            if tree:
                endmark = self.mark()
            else:
                endmark = mark
                self.reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self.reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore
    return memoize_left_rec_wrapper


class Parser:
    """Parsing base class."""

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        self._level = 0
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Pass through common tokenizer methods.
        # TODO: Rename to _mark and _reset.
        self.mark = self._tokenizer.mark
        self.reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        pass

    def showpeek(self) -> str:
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NAME:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        mark = self.mark()
        ok = func(*args)
        self.reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        mark = self.mark()
        ok = func(*args)
        self.reset(mark)
        return not ok
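
    # Illustrative note (not in the original): generated rule methods
    # typically invoke these helpers as, e.g.,
    # ``self.negative_lookahead(self.expect, 'else')`` to guard an
    # alternative without consuming input; both helpers reset the
    # tokenizer position before returning.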
    def make_syntax_error(self, filename: str = "<unknown>") -> SyntaxError:
        tok = self._tokenizer.diagnose()
        return SyntaxError(
            "pegen parse failure", (filename, tok.start[0], 1 + tok.start[1], tok.line)
        )


def simple_parser_main(parser_class: Type[Parser]) -> None:
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(args.filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        err = parser.make_syntax_error(filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Cache sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()
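

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; the ToyParser class and its grammar are
# hypothetical and not part of this module).  A hand-written subclass shows
# how the memoize/expect helpers combine into rule methods; parsers generated
# by pegen follow the same shape and end with a simple_parser_main() call.
#
#     from pegen.parser import Parser, memoize, simple_parser_main
#
#     class ToyParser(Parser):
#         # Grammar: start: NAME '=' NUMBER NEWLINE ENDMARKER
#         @memoize
#         def start(self):
#             mark = self.mark()
#             if (
#                 (name := self.name())
#                 and self.expect("=")
#                 and (value := self.number())
#                 and self.expect("NEWLINE")
#                 and self.expect("ENDMARKER")
#             ):
#                 return (name.string, value.string)
#             self.reset(mark)
#             return None
#
#     if __name__ == "__main__":
#         simple_parser_main(ToyParser)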