1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens.  It decodes the bytes according to PEP-0263 for
5determining source file encoding.
6
7It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF).  It generates 5-tuples with these
9members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators.  Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""
22
23__author__ = 'Ka-Ping Yee <ping@lfw.org>'
24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26               'Michael Foord')
27from builtins import open as _builtin_open
28from codecs import lookup, BOM_UTF8
29import collections
30from io import TextIOWrapper
31import itertools as _itertools
32import re
33import sys
34from token import *
35from token import EXACT_TOKEN_TYPES
36
37cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
38blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
39
40import token
41__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
42                           "untokenize", "TokenInfo"]
43del token
44
45class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
46    def __repr__(self):
47        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
48        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
49                self._replace(type=annotated_type))
50
51    @property
52    def exact_type(self):
53        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
54            return EXACT_TOKEN_TYPES[self.string]
55        else:
56            return self.type
57
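# Illustrative sketch (editorial addition, not part of the original module):
# exact_type refines an OP token into its specific operator type, e.g. '+'
# maps to PLUS while .type stays OP.  The helper name below is hypothetical.
def _example_exact_type():
    info = TokenInfo(OP, '+', (1, 2), (1, 3), 'x + 1\n')
    assert info.type == OP
    assert info.exact_type == PLUS
    assert tok_name[info.exact_type] == 'PLUS'
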
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

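# Illustrative sketch (editorial addition, not part of the original module):
# the Number pattern accepts underscore digit separators and all of Python's
# numeric literal forms.  The helper name below is hypothetical.
def _example_number_pattern():
    pattern = re.compile(Number)
    for literal in ('0x_1f', '0b1010', '1_000_000', '3.14e-2', '2j'):
        assert pattern.fullmatch(literal), literal
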
# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes.  The list contains only the lowercase
    #  versions and no permutations (it includes 'fr' but not 'rf');
    #  the case and order permutations are generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

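# Illustrative sketch (editorial addition, not part of the original module):
# every case and order permutation of 'fr' is produced, so "fR'...'" and
# "Rf'...'" both tokenize as prefixed strings.  The helper name below is
# hypothetical.
def _example_string_prefixes():
    prefixes = _all_string_prefixes()
    assert {'fr', 'fR', 'Fr', 'FR', 'rf', 'rF', 'Rf', 'RF'} <= prefixes
    assert '' in prefixes          # the empty prefix is included on purpose
    assert 'rb' in prefixes and 'bR' in prefixes
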
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

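# Illustrative sketch (editorial addition, not part of the original module):
# for the opening quote "r'''" the table yields the triple-quote tail pattern,
# which matches everything up to and including the closing quotes.  The helper
# name below is hypothetical.
def _example_endpats():
    assert endpats["r'''"] is Single3
    tail = _compile(endpats["'''"]).match("one\ntwo'''rest")
    assert tail.group(0) == "one\ntwo'''"
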
# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

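    # Worked example (editorial commentary, not in the original source):
    # moving from a previous end of (1, 5) to a token start of (3, 2)
    # appends "\\\n" twice (one backslash continuation per skipped row)
    # and then two spaces, since prev_col is reset to 0 after a row jump.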
    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements: a token number and a token value.  If
    only the two-element form is supplied, positions are unknown and
    the spacing of the resulting output is only approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

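# Illustrative sketch (editorial addition, not part of the original module):
# a self-contained version of the limited-input round trip described in the
# docstring above, using an in-memory bytes buffer.  The helper name below is
# hypothetical.
def _example_untokenize_roundtrip():
    from io import BytesIO
    source = b"def f(x):\n    return x + 1\n"
    t1 = [tok[:2] for tok in tokenize(BytesIO(source).readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in tokenize(BytesIO(newcode).readline)]
    assert t1 == t2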

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present
    but disagree, a SyntaxError is raised.  A SyntaxError is also raised if
    the encoding cookie names an unknown codec.  Note that if a UTF-8 BOM is
    found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

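# Illustrative sketch (editorial addition, not part of the original module):
# an encoding cookie on the first line is detected and the consumed lines are
# returned as bytes.  The helper name below is hypothetical.
def _example_detect_encoding():
    from io import BytesIO
    source = b"# -*- coding: latin-1 -*-\nname = 'caf\xe9'\n"
    encoding, lines = detect_encoding(BytesIO(source).readline)
    assert encoding == "iso-8859-1"
    assert lines == [b"# -*- coding: latin-1 -*-\n"]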

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise


def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable that raises StopIteration when the input is exhausted:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)

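# Illustrative sketch (editorial addition, not part of the original module):
# iterating over the generator; the first token is always ENCODING and the
# positions are (row, col) pairs.  The helper name below is hypothetical.
def _example_tokenize():
    from io import BytesIO
    source = b"total = 1 + 2  # sum\n"
    tokens = list(tokenize(BytesIO(source).readline))
    assert tokens[0].type == ENCODING and tokens[0].string == 'utf-8'
    assert (tokens[1].type, tokens[1].string, tokens[1].start) == (NAME, 'total', (1, 0))
    comment_tokens = [tok for tok in tokens if tok.type == COMMENT]
    assert comment_tokens and comment_tokens[0].string == '# sum'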

def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

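# Illustrative sketch (editorial addition, not part of the original module):
# generate_tokens() works on str input, so no ENCODING token is emitted.
# The helper name below is hypothetical.
def _example_generate_tokens():
    from io import StringIO
    tokens = list(generate_tokens(StringIO("pi = 3.14\n").readline))
    assert tokens[0].type == NAME and tokens[0].string == 'pi'
    assert [tok.type for tok in tokens[:4]] == [NAME, OP, NUMBER, NEWLINE]
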
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()