# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""
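
# For example, feeding the single line "x = 1\n" to generate_tokens() yields
# 5-tuples along the lines of
#     (NAME,   'x', (1, 0), (1, 1), 'x = 1\n')
#     (OP,     '=', (1, 2), (1, 3), 'x = 1\n')
#     (NUMBER, '1', (1, 4), (1, 5), 'x = 1\n')
# followed by NEWLINE and ENDMARKER tokens (an illustrative sketch, not
# output copied from a real run).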

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
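# For example, group('a', 'b') == '(a|b)', any('x') == '(x)*' and
# maybe('y') == '(y)?'; the regular expressions below are built from these
# helpers.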

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
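# Illustrative matches for the patterns above: '0x1fL', '0o17' and '0b101'
# match Intnumber; '3.14e-2' and '.5' match Floatnumber; '2j' and '1.5J'
# match Imagnumber.  The optional [lL] suffixes reflect the Python 2 long
# literals this tokenizer still accepts.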

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = list(map(
    re.compile, (Token, PseudoToken, Single3, Double3)))
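# Illustrative sketch of how the compiled patterns are used (the snippet
# below is an example, not code from this module):
#
#     m = pseudoprog.match("if x:  # test\n", 0)
#     m.span(1)    # (0, 2) -- the token text "if"
#
# generate_tokens() repeats such matches, advancing pos past each token.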
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print("%d,%d-%d,%d:\t%s\t%s" %
        (srow, scol, erow, ecol, tok_name[type], repr(token)))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
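
# Typical use of the legacy interface (an illustrative sketch; "example.py"
# is a placeholder file name):
#
#     with open("example.py") as f:
#         tokenize(f.readline)        # prints one line per token via printtoken
#
# Passing a custom tokeneater callback instead receives the same five fields
# as positional arguments for each token.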

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
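
# For example, _get_normal_name("UTF-8") and _get_normal_name("utf_8") both
# return "utf-8", _get_normal_name("Latin-1") returns "iso-8859-1", and any
# unrecognised spelling is returned unchanged.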

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, a SyntaxError will be raised.  Note that if a UTF-8 BOM is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
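
# Illustrative use of detect_encoding() (a sketch, not part of the original
# module):
#
#     import io
#     src = b"# -*- coding: latin-1 -*-\nx = 1\n"
#     encoding, lines = detect_encoding(io.BytesIO(src).readline)
#     # encoding == "iso-8859-1", lines == [b"# -*- coding: latin-1 -*-\n"]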

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token == 'def':
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            async_def = True
                            async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)