• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1import argparse
2import sys
3import time
4import token
5import tokenize
6import traceback
7from abc import abstractmethod
8from typing import Any, Callable, ClassVar, Dict, Optional, Tuple, Type, TypeVar, cast
9
10from pegen.tokenizer import Mark, Tokenizer, exact_token_types
11
12T = TypeVar("T")
13F = TypeVar("F", bound=Callable[..., Any])
14
15
def logger(method: F) -> F:
    """For non-memoized functions that we want to be logged.

    (In practice this is only non-leader left-recursive functions.)
    """
    method_name = method.__name__

    def logger_wrapper(self: "Parser", *args: object) -> Any:
        if not self._verbose:
            return method(self, *args)
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        print(f"{fill}{method_name}({argsr}) .... (looking at {self.showpeek()})")
        self._level += 1
        try:
            tree = method(self, *args)
        finally:
            # Restore the trace indentation even if the rule raises, so
            # later verbose output is not permanently mis-indented.
            self._level -= 1
        print(f"{fill}... {method_name}({argsr}) --> {tree!s:.200}")
        return tree

    logger_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return cast(F, logger_wrapper)
37
38
def memoize(method: F) -> F:
    """Memoize a symbol method.

    Results are cached in ``self._cache`` keyed by (input position,
    method name, args).  On a cache hit the tokenizer is reset to the
    recorded end position instead of re-parsing.
    """
    method_name = method.__name__

    def memoize_wrapper(self: "Parser", *args: object) -> Any:
        mark = self._mark()
        key = mark, method_name, args
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        argsr = ",".join(repr(arg) for arg in args)
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name}({argsr}) ... (looking at {self.showpeek()})")
            self._level += 1
            try:
                tree = method(self, *args)
            finally:
                # Keep the verbose trace indentation balanced even if
                # the rule raises.
                self._level -= 1
            if verbose:
                print(f"{fill}... {method_name}({argsr}) -> {tree!s:.200}")
            endmark = self._mark()
            self._cache[key] = tree, endmark
        else:
            # Cache hit while verbose: report it, then replay the
            # recorded end position.
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}({argsr}) -> {tree!s:.200}")
            self._reset(endmark)
        return tree

    memoize_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return cast(F, memoize_wrapper)
74
75
def memoize_left_rec(
    method: Callable[["Parser"], Optional[T]]
) -> Callable[["Parser"], Optional[T]]:
    """Memoize a left-recursive symbol method.

    Unlike plain ``memoize``, this grows the parse iteratively: the
    cache is primed with a failure, the rule is re-run from the same
    start position until it stops making progress, and the longest
    successful result is kept.
    """
    method_name = method.__name__

    def memoize_left_rec_wrapper(self: "Parser") -> Optional[T]:
        mark = self._mark()
        key = mark, method_name, ()
        # Fast path: cache hit, and not verbose.
        if key in self._cache and not self._verbose:
            tree, endmark = self._cache[key]
            self._reset(endmark)
            return tree
        # Slow path: no cache hit, or verbose.
        verbose = self._verbose
        fill = "  " * self._level
        if key not in self._cache:
            if verbose:
                print(f"{fill}{method_name} ... (looking at {self.showpeek()})")
            self._level += 1

            # For left-recursive rules we manipulate the cache and
            # loop until the rule shows no progress, then pick the
            # previous result.  For an explanation why this works, see
            # https://github.com/PhilippeSigaud/Pegged/wiki/Left-Recursion
            # (But we use the memoization cache instead of a static
            # variable; perhaps this is similar to a paper by Warth et al.
            # (http://web.cs.ucla.edu/~todd/research/pub.php?id=pepm08).

            # Prime the cache with a failure.
            self._cache[key] = None, mark
            lastresult, lastmark = None, mark
            depth = 0
            if verbose:
                print(f"{fill}Recursive {method_name} at {mark} depth {depth}")

            while True:
                # Re-run the rule from the original start position; the
                # recursive call inside it will hit the cache entry we
                # stored on the previous iteration.
                self._reset(mark)
                # Counter (not a bool: rules can nest) used elsewhere
                # for error reporting.
                self.in_recursive_rule += 1
                try:
                    result = method(self)
                finally:
                    self.in_recursive_rule -= 1
                endmark = self._mark()
                depth += 1
                if verbose:
                    print(
                        f"{fill}Recursive {method_name} at {mark} depth {depth}: {result!s:.200} to {endmark}"
                    )
                if not result:
                    # The rule failed outright; keep the previous result.
                    if verbose:
                        print(f"{fill}Fail with {lastresult!s:.200} to {lastmark}")
                    break
                if endmark <= lastmark:
                    # No forward progress over the last iteration; keep
                    # the previous (longest) result.
                    if verbose:
                        print(f"{fill}Bailing with {lastresult!s:.200} to {lastmark}")
                    break
                # Progress was made: record the longer parse and loop.
                self._cache[key] = lastresult, lastmark = result, endmark

            self._reset(lastmark)
            tree = lastresult

            self._level -= 1
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [cached]")
            # Store the final (result, end position) pair, resetting to
            # the start position on overall failure.
            if tree:
                endmark = self._mark()
            else:
                endmark = mark
                self._reset(endmark)
            self._cache[key] = tree, endmark
        else:
            tree, endmark = self._cache[key]
            if verbose:
                print(f"{fill}{method_name}() -> {tree!s:.200} [fresh]")
            if tree:
                self._reset(endmark)
        return tree

    memoize_left_rec_wrapper.__wrapped__ = method  # type: ignore[attr-defined]
    return memoize_left_rec_wrapper
158
159
class Parser:
    """Parsing base class.

    Generated parsers subclass this and implement ``start()``.  Token
    access and backtracking are delegated to the wrapped ``Tokenizer``.
    """

    # Hard keywords of the grammar; ``name()`` refuses to match them.
    KEYWORDS: ClassVar[Tuple[str, ...]]

    # Soft keywords: ordinary NAME tokens matched by ``soft_keyword()``.
    SOFT_KEYWORDS: ClassVar[Tuple[str, ...]]

    def __init__(self, tokenizer: Tokenizer, *, verbose: bool = False):
        self._tokenizer = tokenizer
        self._verbose = verbose
        # Indentation level for verbose trace output (see logger/memoize).
        self._level = 0
        # Memoization cache: (mark, rule name, args) -> (result, end mark).
        self._cache: Dict[Tuple[Mark, str, Tuple[Any, ...]], Tuple[Any, Mark]] = {}
        # Integer tracking whether we are in a left recursive rule or not. Can be useful
        # for error reporting.
        self.in_recursive_rule = 0
        # Pass through common tokenizer methods.
        self._mark = self._tokenizer.mark
        self._reset = self._tokenizer.reset

    @abstractmethod
    def start(self) -> Any:
        """Parse the grammar's start symbol; supplied by the generated parser."""
        pass

    def showpeek(self) -> str:
        """Return a human-readable description of the next token (for tracing)."""
        tok = self._tokenizer.peek()
        return f"{tok.start[0]}.{tok.start[1]}: {token.tok_name[tok.type]}:{tok.string!r}"

    @memoize
    def name(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return a NAME token that is not a keyword, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string not in self.KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def number(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return a NUMBER token, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.NUMBER:
            return self._tokenizer.getnext()
        return None

    @memoize
    def string(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return a STRING token, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.STRING:
            return self._tokenizer.getnext()
        return None

    @memoize
    def op(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return an OP token, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.OP:
            return self._tokenizer.getnext()
        return None

    @memoize
    def type_comment(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return a TYPE_COMMENT token, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.TYPE_COMMENT:
            return self._tokenizer.getnext()
        return None

    @memoize
    def soft_keyword(self) -> Optional[tokenize.TokenInfo]:
        """Consume and return a NAME token that is a soft keyword, else None."""
        tok = self._tokenizer.peek()
        if tok.type == token.NAME and tok.string in self.SOFT_KEYWORDS:
            return self._tokenizer.getnext()
        return None

    @memoize
    def expect(self, type: str) -> Optional[tokenize.TokenInfo]:
        """Consume and return the next token if it matches *type*, else None.

        *type* may be the literal token text (e.g. ``"if"`` or ``"+"``),
        an exact token name from ``exact_token_types``, or a token type
        name defined in the ``token`` module (e.g. ``"NEWLINE"``).
        (The parameter shadows the ``type`` builtin; kept for
        compatibility with generated callers.)
        """
        tok = self._tokenizer.peek()
        if tok.string == type:
            return self._tokenizer.getnext()
        if type in exact_token_types:
            if tok.type == exact_token_types[type]:
                return self._tokenizer.getnext()
        if type in token.__dict__:
            if tok.type == token.__dict__[type]:
                return self._tokenizer.getnext()
        if tok.type == token.OP and tok.string == type:
            return self._tokenizer.getnext()
        return None

    def expect_forced(self, res: Any, expectation: str) -> Optional[tokenize.TokenInfo]:
        """Return *res*, raising a syntax error if it is None (forced match)."""
        if res is None:
            raise self.make_syntax_error(f"expected {expectation}")
        return res

    def positive_lookahead(self, func: Callable[..., T], *args: object) -> T:
        """Run *func* and return its result without consuming any input."""
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return ok

    def negative_lookahead(self, func: Callable[..., object], *args: object) -> bool:
        """Return True if *func* fails, without consuming any input."""
        mark = self._mark()
        ok = func(*args)
        self._reset(mark)
        return not ok

    def make_syntax_error(self, message: str, filename: str = "<unknown>") -> SyntaxError:
        """Build a SyntaxError located at the tokenizer's diagnosis token."""
        tok = self._tokenizer.diagnose()
        # SyntaxError wants a 1-based column offset; tok.start[1] is 0-based.
        return SyntaxError(message, (filename, tok.start[0], 1 + tok.start[1], tok.line))
264
265
def simple_parser_main(parser_class: Type[Parser]) -> None:
    """Command-line driver for a generated parser.

    Parses the file named on the command line (or stdin for ``-``) with
    *parser_class*, prints the resulting tree unless ``-q`` is given,
    and with ``-v`` reports timing and cache statistics.  Exits with
    status 1 on a syntax error.
    """
    argparser = argparse.ArgumentParser()
    argparser.add_argument(
        "-v",
        "--verbose",
        action="count",
        default=0,
        help="Print timing stats; repeat for more debug output",
    )
    argparser.add_argument(
        "-q", "--quiet", action="store_true", help="Don't print the parsed program"
    )
    argparser.add_argument("filename", help="Input file ('-' to use stdin)")

    args = argparser.parse_args()
    verbose = args.verbose
    # -vvv enables the verbose tokenizer; -vv or -vvvv the verbose parser.
    verbose_tokenizer = verbose >= 3
    verbose_parser = verbose == 2 or verbose >= 4

    t0 = time.time()

    filename = args.filename
    if filename == "" or filename == "-":
        filename = "<stdin>"
        file = sys.stdin
    else:
        file = open(filename)
    try:
        tokengen = tokenize.generate_tokens(file.readline)
        tokenizer = Tokenizer(tokengen, verbose=verbose_tokenizer)
        parser = parser_class(tokenizer, verbose=verbose_parser)
        tree = parser.start()
        try:
            # Byte count for the stats line; a tty (interactive stdin)
            # cannot tell() and some streams raise instead.
            if file.isatty():
                endpos = 0
            else:
                endpos = file.tell()
        except IOError:
            endpos = 0
    finally:
        if file is not sys.stdin:
            file.close()

    t1 = time.time()

    if not tree:
        # Fix: the filename used to be passed as the *message* argument,
        # so errors reported "<unknown>" as the file name and showed the
        # filename as the error text.
        err = parser.make_syntax_error("invalid syntax", filename)
        traceback.print_exception(err.__class__, err, None)
        sys.exit(1)

    if not args.quiet:
        print(tree)

    if verbose:
        dt = t1 - t0
        diag = tokenizer.diagnose()
        nlines = diag.end[0]
        if diag.type == token.ENDMARKER:
            # The ENDMARKER sits on a line past the real input.
            nlines -= 1
        print(f"Total time: {dt:.3f} sec; {nlines} lines", end="")
        if endpos:
            print(f" ({endpos} bytes)", end="")
        if dt:
            print(f"; {nlines / dt:.0f} lines/sec")
        else:
            print()
        print("Caches sizes:")
        print(f"  token array : {len(tokenizer._tokens):10}")
        print(f"        cache : {len(parser._cache):10}")
        ## print_memstats()