• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1"""Define partial Python code Parser used by editor and hyperparser.
2
3Instances of ParseMap are used with str.translate.
4
5The following bound search and match functions are defined:
6_synchre - start of popular statement;
7_junkre - whitespace or comment line;
8_match_stringre: string, possibly without closer;
9_itemre - line that may have bracket structure start;
10_closere - line that must be followed by dedent.
11_chew_ordinaryre - non-special characters.
12"""
13import re
14
15# Reason last statement is continued (or C_NONE if it's not).
16(C_NONE, C_BACKSLASH, C_STRING_FIRST_LINE,
17 C_STRING_NEXT_LINES, C_BRACKET) = range(5)
18
19# Find what looks like the start of a popular statement.
20
21_synchre = re.compile(r"""
22    ^
23    [ \t]*
24    (?: while
25    |   else
26    |   def
27    |   return
28    |   assert
29    |   break
30    |   class
31    |   continue
32    |   elif
33    |   try
34    |   except
35    |   raise
36    |   import
37    |   yield
38    )
39    \b
40""", re.VERBOSE | re.MULTILINE).search
41
42# Match blank line or non-indenting comment line.
43
44_junkre = re.compile(r"""
45    [ \t]*
46    (?: \# \S .* )?
47    \n
48""", re.VERBOSE).match
49
50# Match any flavor of string; the terminating quote is optional
51# so that we're robust in the face of incomplete program text.
52
53_match_stringre = re.compile(r"""
54    \""" [^"\\]* (?:
55                     (?: \\. | "(?!"") )
56                     [^"\\]*
57                 )*
58    (?: \""" )?
59
60|   " [^"\\\n]* (?: \\. [^"\\\n]* )* "?
61
62|   ''' [^'\\]* (?:
63                   (?: \\. | '(?!'') )
64                   [^'\\]*
65                )*
66    (?: ''' )?
67
68|   ' [^'\\\n]* (?: \\. [^'\\\n]* )* '?
69""", re.VERBOSE | re.DOTALL).match
70
71# Match a line that starts with something interesting;
72# used to find the first item of a bracket structure.
73
74_itemre = re.compile(r"""
75    [ \t]*
76    [^\s#\\]    # if we match, m.end()-1 is the interesting char
77""", re.VERBOSE).match
78
79# Match start of statements that should be followed by a dedent.
80
81_closere = re.compile(r"""
82    \s*
83    (?: return
84    |   break
85    |   continue
86    |   raise
87    |   pass
88    )
89    \b
90""", re.VERBOSE).match
91
92# Chew up non-special chars as quickly as possible.  If match is
93# successful, m.end() less 1 is the index of the last boring char
94# matched.  If match is unsuccessful, the string starts with an
95# interesting char.
96
97_chew_ordinaryre = re.compile(r"""
98    [^[\](){}#'"\\]+
99""", re.VERBOSE).match
100
101
102class ParseMap(dict):
103    r"""Dict subclass that maps anything not in dict to 'x'.
104
105    This is designed to be used with str.translate in study1.
106    Anything not specifically mapped otherwise becomes 'x'.
107    Example: replace everything except whitespace with 'x'.
108
109    >>> keepwhite = ParseMap((ord(c), ord(c)) for c in ' \t\n\r')
110    >>> "a + b\tc\nd".translate(keepwhite)
111    'x x x\tx\nx'
112    """
113    # Calling this triples access time; see bpo-32940
114    def __missing__(self, key):
115        return 120  # ord('x')
116
117
118# Map all ascii to 120 to avoid __missing__ call, then replace some.
119trans = ParseMap.fromkeys(range(128), 120)
120trans.update((ord(c), ord('(')) for c in "({[")  # open brackets => '(';
121trans.update((ord(c), ord(')')) for c in ")}]")  # close brackets => ')'.
122trans.update((ord(c), ord(c)) for c in "\"'\\\n#")  # Keep these.
123
124
125class Parser:
126
127    def __init__(self, indentwidth, tabwidth):
128        self.indentwidth = indentwidth
129        self.tabwidth = tabwidth
130
131    def set_code(self, s):
132        assert len(s) == 0 or s[-1] == '\n'
133        self.code = s
134        self.study_level = 0
135
136    def find_good_parse_start(self, is_char_in_string=None,
137                              _synchre=_synchre):
138        """
139        Return index of a good place to begin parsing, as close to the
140        end of the string as possible.  This will be the start of some
141        popular stmt like "if" or "def".  Return None if none found:
142        the caller should pass more prior context then, if possible, or
143        if not (the entire program text up until the point of interest
144        has already been tried) pass 0 to set_lo().
145
146        This will be reliable iff given a reliable is_char_in_string()
147        function, meaning that when it says "no", it's absolutely
148        guaranteed that the char is not in a string.
149        """
150        code, pos = self.code, None
151
152        if not is_char_in_string:
153            # no clue -- make the caller pass everything
154            return None
155
156        # Peek back from the end for a good place to start,
157        # but don't try too often; pos will be left None, or
158        # bumped to a legitimate synch point.
159        limit = len(code)
160        for tries in range(5):
161            i = code.rfind(":\n", 0, limit)
162            if i < 0:
163                break
164            i = code.rfind('\n', 0, i) + 1  # start of colon line (-1+1=0)
165            m = _synchre(code, i, limit)
166            if m and not is_char_in_string(m.start()):
167                pos = m.start()
168                break
169            limit = i
170        if pos is None:
171            # Nothing looks like a block-opener, or stuff does
172            # but is_char_in_string keeps returning true; most likely
173            # we're in or near a giant string, the colorizer hasn't
174            # caught up enough to be helpful, or there simply *aren't*
175            # any interesting stmts.  In any of these cases we're
176            # going to have to parse the whole thing to be sure, so
177            # give it one last try from the start, but stop wasting
178            # time here regardless of the outcome.
179            m = _synchre(code)
180            if m and not is_char_in_string(m.start()):
181                pos = m.start()
182            return pos
183
184        # Peeking back worked; look forward until _synchre no longer
185        # matches.
186        i = pos + 1
187        while 1:
188            m = _synchre(code, i)
189            if m:
190                s, i = m.span()
191                if not is_char_in_string(s):
192                    pos = s
193            else:
194                break
195        return pos
196
197    def set_lo(self, lo):
198        """ Throw away the start of the string.
199
200        Intended to be called with the result of find_good_parse_start().
201        """
202        assert lo == 0 or self.code[lo-1] == '\n'
203        if lo > 0:
204            self.code = self.code[lo:]
205
206    def _study1(self):
207        """Find the line numbers of non-continuation lines.
208
209        As quickly as humanly possible <wink>, find the line numbers (0-
210        based) of the non-continuation lines.
211        Creates self.{goodlines, continuation}.
212        """
213        if self.study_level >= 1:
214            return
215        self.study_level = 1
216
217        # Map all uninteresting characters to "x", all open brackets
218        # to "(", all close brackets to ")", then collapse runs of
219        # uninteresting characters.  This can cut the number of chars
220        # by a factor of 10-40, and so greatly speed the following loop.
221        code = self.code
222        code = code.translate(trans)
223        code = code.replace('xxxxxxxx', 'x')
224        code = code.replace('xxxx', 'x')
225        code = code.replace('xx', 'x')
226        code = code.replace('xx', 'x')
227        code = code.replace('\nx', '\n')
228        # Replacing x\n with \n would be incorrect because
229        # x may be preceded by a backslash.
230
231        # March over the squashed version of the program, accumulating
232        # the line numbers of non-continued stmts, and determining
233        # whether & why the last stmt is a continuation.
234        continuation = C_NONE
235        level = lno = 0     # level is nesting level; lno is line number
236        self.goodlines = goodlines = [0]
237        push_good = goodlines.append
238        i, n = 0, len(code)
239        while i < n:
240            ch = code[i]
241            i = i+1
242
243            # cases are checked in decreasing order of frequency
244            if ch == 'x':
245                continue
246
247            if ch == '\n':
248                lno = lno + 1
249                if level == 0:
250                    push_good(lno)
251                    # else we're in an unclosed bracket structure
252                continue
253
254            if ch == '(':
255                level = level + 1
256                continue
257
258            if ch == ')':
259                if level:
260                    level = level - 1
261                    # else the program is invalid, but we can't complain
262                continue
263
264            if ch == '"' or ch == "'":
265                # consume the string
266                quote = ch
267                if code[i-1:i+2] == quote * 3:
268                    quote = quote * 3
269                firstlno = lno
270                w = len(quote) - 1
271                i = i+w
272                while i < n:
273                    ch = code[i]
274                    i = i+1
275
276                    if ch == 'x':
277                        continue
278
279                    if code[i-1:i+w] == quote:
280                        i = i+w
281                        break
282
283                    if ch == '\n':
284                        lno = lno + 1
285                        if w == 0:
286                            # unterminated single-quoted string
287                            if level == 0:
288                                push_good(lno)
289                            break
290                        continue
291
292                    if ch == '\\':
293                        assert i < n
294                        if code[i] == '\n':
295                            lno = lno + 1
296                        i = i+1
297                        continue
298
299                    # else comment char or paren inside string
300
301                else:
302                    # didn't break out of the loop, so we're still
303                    # inside a string
304                    if (lno - 1) == firstlno:
305                        # before the previous \n in code, we were in the first
306                        # line of the string
307                        continuation = C_STRING_FIRST_LINE
308                    else:
309                        continuation = C_STRING_NEXT_LINES
310                continue    # with outer loop
311
312            if ch == '#':
313                # consume the comment
314                i = code.find('\n', i)
315                assert i >= 0
316                continue
317
318            assert ch == '\\'
319            assert i < n
320            if code[i] == '\n':
321                lno = lno + 1
322                if i+1 == n:
323                    continuation = C_BACKSLASH
324            i = i+1
325
326        # The last stmt may be continued for all 3 reasons.
327        # String continuation takes precedence over bracket
328        # continuation, which beats backslash continuation.
329        if (continuation != C_STRING_FIRST_LINE
330            and continuation != C_STRING_NEXT_LINES and level > 0):
331            continuation = C_BRACKET
332        self.continuation = continuation
333
334        # Push the final line number as a sentinel value, regardless of
335        # whether it's continued.
336        assert (continuation == C_NONE) == (goodlines[-1] == lno)
337        if goodlines[-1] != lno:
338            push_good(lno)
339
340    def get_continuation_type(self):
341        self._study1()
342        return self.continuation
343
344    def _study2(self):
345        """
346        study1 was sufficient to determine the continuation status,
347        but doing more requires looking at every character.  study2
348        does this for the last interesting statement in the block.
349        Creates:
350            self.stmt_start, stmt_end
351                slice indices of last interesting stmt
352            self.stmt_bracketing
353                the bracketing structure of the last interesting stmt; for
354                example, for the statement "say(boo) or die",
355                stmt_bracketing will be ((0, 0), (0, 1), (2, 0), (2, 1),
356                (4, 0)). Strings and comments are treated as brackets, for
357                the matter.
358            self.lastch
359                last interesting character before optional trailing comment
360            self.lastopenbracketpos
361                if continuation is C_BRACKET, index of last open bracket
362        """
363        if self.study_level >= 2:
364            return
365        self._study1()
366        self.study_level = 2
367
368        # Set p and q to slice indices of last interesting stmt.
369        code, goodlines = self.code, self.goodlines
370        i = len(goodlines) - 1  # Index of newest line.
371        p = len(code)  # End of goodlines[i]
372        while i:
373            assert p
374            # Make p be the index of the stmt at line number goodlines[i].
375            # Move p back to the stmt at line number goodlines[i-1].
376            q = p
377            for nothing in range(goodlines[i-1], goodlines[i]):
378                # tricky: sets p to 0 if no preceding newline
379                p = code.rfind('\n', 0, p-1) + 1
380            # The stmt code[p:q] isn't a continuation, but may be blank
381            # or a non-indenting comment line.
382            if  _junkre(code, p):
383                i = i-1
384            else:
385                break
386        if i == 0:
387            # nothing but junk!
388            assert p == 0
389            q = p
390        self.stmt_start, self.stmt_end = p, q
391
392        # Analyze this stmt, to find the last open bracket (if any)
393        # and last interesting character (if any).
394        lastch = ""
395        stack = []  # stack of open bracket indices
396        push_stack = stack.append
397        bracketing = [(p, 0)]
398        while p < q:
399            # suck up all except ()[]{}'"#\\
400            m = _chew_ordinaryre(code, p, q)
401            if m:
402                # we skipped at least one boring char
403                newp = m.end()
404                # back up over totally boring whitespace
405                i = newp - 1    # index of last boring char
406                while i >= p and code[i] in " \t\n":
407                    i = i-1
408                if i >= p:
409                    lastch = code[i]
410                p = newp
411                if p >= q:
412                    break
413
414            ch = code[p]
415
416            if ch in "([{":
417                push_stack(p)
418                bracketing.append((p, len(stack)))
419                lastch = ch
420                p = p+1
421                continue
422
423            if ch in ")]}":
424                if stack:
425                    del stack[-1]
426                lastch = ch
427                p = p+1
428                bracketing.append((p, len(stack)))
429                continue
430
431            if ch == '"' or ch == "'":
432                # consume string
433                # Note that study1 did this with a Python loop, but
434                # we use a regexp here; the reason is speed in both
435                # cases; the string may be huge, but study1 pre-squashed
436                # strings to a couple of characters per line.  study1
437                # also needed to keep track of newlines, and we don't
438                # have to.
439                bracketing.append((p, len(stack)+1))
440                lastch = ch
441                p = _match_stringre(code, p, q).end()
442                bracketing.append((p, len(stack)))
443                continue
444
445            if ch == '#':
446                # consume comment and trailing newline
447                bracketing.append((p, len(stack)+1))
448                p = code.find('\n', p, q) + 1
449                assert p > 0
450                bracketing.append((p, len(stack)))
451                continue
452
453            assert ch == '\\'
454            p = p+1     # beyond backslash
455            assert p < q
456            if code[p] != '\n':
457                # the program is invalid, but can't complain
458                lastch = ch + code[p]
459            p = p+1     # beyond escaped char
460
461        # end while p < q:
462
463        self.lastch = lastch
464        self.lastopenbracketpos = stack[-1] if stack else None
465        self.stmt_bracketing = tuple(bracketing)
466
467    def compute_bracket_indent(self):
468        """Return number of spaces the next line should be indented.
469
470        Line continuation must be C_BRACKET.
471        """
472        self._study2()
473        assert self.continuation == C_BRACKET
474        j = self.lastopenbracketpos
475        code = self.code
476        n = len(code)
477        origi = i = code.rfind('\n', 0, j) + 1
478        j = j+1     # one beyond open bracket
479        # find first list item; set i to start of its line
480        while j < n:
481            m = _itemre(code, j)
482            if m:
483                j = m.end() - 1     # index of first interesting char
484                extra = 0
485                break
486            else:
487                # this line is junk; advance to next line
488                i = j = code.find('\n', j) + 1
489        else:
490            # nothing interesting follows the bracket;
491            # reproduce the bracket line's indentation + a level
492            j = i = origi
493            while code[j] in " \t":
494                j = j+1
495            extra = self.indentwidth
496        return len(code[i:j].expandtabs(self.tabwidth)) + extra
497
498    def get_num_lines_in_stmt(self):
499        """Return number of physical lines in last stmt.
500
501        The statement doesn't have to be an interesting statement.  This is
502        intended to be called when continuation is C_BACKSLASH.
503        """
504        self._study1()
505        goodlines = self.goodlines
506        return goodlines[-1] - goodlines[-2]
507
508    def compute_backslash_indent(self):
509        """Return number of spaces the next line should be indented.
510
511        Line continuation must be C_BACKSLASH.  Also assume that the new
512        line is the first one following the initial line of the stmt.
513        """
514        self._study2()
515        assert self.continuation == C_BACKSLASH
516        code = self.code
517        i = self.stmt_start
518        while code[i] in " \t":
519            i = i+1
520        startpos = i
521
522        # See whether the initial line starts an assignment stmt; i.e.,
523        # look for an = operator
524        endpos = code.find('\n', startpos) + 1
525        found = level = 0
526        while i < endpos:
527            ch = code[i]
528            if ch in "([{":
529                level = level + 1
530                i = i+1
531            elif ch in ")]}":
532                if level:
533                    level = level - 1
534                i = i+1
535            elif ch == '"' or ch == "'":
536                i = _match_stringre(code, i, endpos).end()
537            elif ch == '#':
538                # This line is unreachable because the # makes a comment of
539                # everything after it.
540                break
541            elif level == 0 and ch == '=' and \
542                   (i == 0 or code[i-1] not in "=<>!") and \
543                   code[i+1] != '=':
544                found = 1
545                break
546            else:
547                i = i+1
548
549        if found:
550            # found a legit =, but it may be the last interesting
551            # thing on the line
552            i = i+1     # move beyond the =
553            found = re.match(r"\s*\\", code[i:endpos]) is None
554
555        if not found:
556            # oh well ... settle for moving beyond the first chunk
557            # of non-whitespace chars
558            i = startpos
559            while code[i] not in " \t\n":
560                i = i+1
561
562        return len(code[self.stmt_start:i].expandtabs(\
563                                     self.tabwidth)) + 1
564
565    def get_base_indent_string(self):
566        """Return the leading whitespace on the initial line of the last
567        interesting stmt.
568        """
569        self._study2()
570        i, n = self.stmt_start, self.stmt_end
571        j = i
572        code = self.code
573        while j < n and code[j] in " \t":
574            j = j + 1
575        return code[i:j]
576
577    def is_block_opener(self):
578        "Return True if the last interesting statemtent opens a block."
579        self._study2()
580        return self.lastch == ':'
581
582    def is_block_closer(self):
583        "Return True if the last interesting statement closes a block."
584        self._study2()
585        return _closere(self.code, self.stmt_start) is not None
586
587    def get_last_stmt_bracketing(self):
588        """Return bracketing structure of the last interesting statement.
589
590        The returned tuple is in the format defined in _study2().
591        """
592        self._study2()
593        return self.stmt_bracketing
594
595
596if __name__ == '__main__':
597    from unittest import main
598    main('idlelib.idle_test.test_pyparse', verbosity=2)
599