1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens.  It decodes the bytes according to PEP-0263 for
5determining source file encoding.
6
7It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF).  It generates 5-tuples with these
9members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators.  Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
               'Michael Foord')
from builtins import open as _builtin_open
from codecs import lookup, BOM_UTF8
import collections
import functools
from io import TextIOWrapper
import itertools as _itertools
import re
import sys
from token import *
from token import EXACT_TOKEN_TYPES

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)

import token
__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
                           "untokenize", "TokenInfo"]
del token

class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

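# Illustrative sketch of exact_type (assumes the standard token constants
# LPAR/RPAR made available above by `from token import *`):
#
#     from io import BytesIO
#     toks = list(tokenize(BytesIO(b"()\n").readline))
#     # toks[1].type is OP, while toks[1].exact_type is LPAR;
#     # toks[2].exact_type is RPAR.
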
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only contain the lower case versions,
    #  and don't contain any permutations (include 'fr', but not
    #  'rf'). The various permutations will be generated.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

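# For illustration, the set produced above contains '', the one-character
# prefixes in both cases ('b', 'B', 'r', 'R', 'u', 'U', 'f', 'F'), and every
# case/order permutation of the two-character prefixes, e.g. 'br', 'Rb',
# 'fR', 'rF'.
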
@functools.lru_cache
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3

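# For example (illustrative lookups): endpats["'"] is Single, and
# endpats['rb"""'] is Double3, so once an opening prefix-plus-quote has been
# matched, the pattern for the rest of the string can be found directly.
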
# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out

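# A hedged round-trip sketch for untokenize(); the source bytes are made up,
# and the limited-input check mirrors the invariant in the docstring above:
#
#     from io import BytesIO
#     source = b"def f(x):\n    return x + 1\n"
#     toks = list(tokenize(BytesIO(source).readline))
#     assert untokenize(toks) == source            # full 5-tuples round-trip
#     t1 = [t[:2] for t in toks]
#     t2 = [t[:2] for t in tokenize(BytesIO(untokenize(t1)).readline)]
#     assert t1 == t2                              # limited (2-tuple) round-trip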

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263.  If both a bom and a cookie are present,
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

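# Small usage sketch for detect_encoding() (the source bytes are made up):
#
#     from io import BytesIO
#     buf = BytesIO(b"# -*- coding: latin-1 -*-\nprint('hi')\n")
#     enc, lines = detect_encoding(buf.readline)
#     # enc == 'iso-8859-1' (normalized from 'latin-1');
#     # lines holds the single cookie line that was read.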

def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise

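# Usage sketch for this open() wrapper (the path below is hypothetical):
#
#     with open("some_module.py") as f:   # text stream decoded per cookie/BOM
#         source_text = f.read()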

def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    empty = _itertools.repeat(b"")
    rl_gen = _itertools.chain(consumed, iter(readline, b""), empty)
    return _tokenize(rl_gen.__next__, encoding)


def _tokenize(readline, encoding):
    lnum = parenlev = continued = 0
    numchars = '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    last_line = b''
    line = b''
    while True:                                # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = b''

        if encoding is not None:
            line = line.decode(encoding)
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield TokenInfo(STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield TokenInfo(ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    yield TokenInfo(COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    pos += len(comment_token)

                yield TokenInfo(NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = _compile(PseudoToken).match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if (initial in numchars or                 # ordinary number
                    (initial == '.' and token != '.' and token != '...')):
                    yield TokenInfo(NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    if parenlev > 0:
                        yield TokenInfo(NL, token, spos, epos, line)
                    else:
                        yield TokenInfo(NEWLINE, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    yield TokenInfo(COMMENT, token, spos, epos, line)

                elif token in triple_quoted:
                    endprog = _compile(endpats[token])
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield TokenInfo(STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break

                # Check up to the first 3 chars of the token to see if
                #  they're in the single_quoted set. If so, they start
                #  a string.
                # We're using the first 3, because we're looking for
                #  "rb'" (for example) at the start of the token. If
                #  we switch to longer prefixes, this needs to be
                #  adjusted.
                # Note that initial == token[:1].
                # Also note that single quote checking must come after
                #  triple quote checking (above).
                elif (initial in single_quoted or
                      token[:2] in single_quoted or
                      token[:3] in single_quoted):
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Again, using the first 3 chars of the
                        #  token. This is looking for the matching end
                        #  regex for the correct type of quote
                        #  character. So it's really looking for
                        #  endpats["'"] or endpats['"'], by trying to
                        #  skip string prefix characters, if any.
                        endprog = _compile(endpats.get(initial) or
                                           endpats.get(token[1]) or
                                           endpats.get(token[2]))
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield TokenInfo(STRING, token, spos, epos, line)

                elif initial.isidentifier():               # ordinary name
                    yield TokenInfo(NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield TokenInfo(OP, token, spos, epos, line)
            else:
                yield TokenInfo(ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"):
        yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '')


def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _tokenize(readline, None)

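# A str-based usage sketch for generate_tokens() (the source text is made up);
# unlike tokenize(), no ENCODING token is produced because no decoding happens:
#
#     from io import StringIO
#     for tok in generate_tokens(StringIO("a = [1, 2]\n").readline):
#         print(tok.type, tok.string)
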
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _tokenize(sys.stdin.readline, None)

        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

if __name__ == "__main__":
    main()