1"""Tokenization help for Python programs.
2
3tokenize(readline) is a generator that breaks a stream of bytes into
4Python tokens.  It decodes the bytes according to PEP-0263 for
5determining source file encoding.
6
7It accepts a readline-like method which is called repeatedly to get the
8next line of input (or b"" for EOF).  It generates 5-tuples with these
9members:
10
11    the token type (see token.py)
12    the token (a string)
13    the starting (row, column) indices of the token (a 2-tuple of ints)
14    the ending (row, column) indices of the token (a 2-tuple of ints)
15    the original line (string)
16
17It is designed to match the working of the Python tokenizer exactly, except
18that it produces COMMENT tokens for comments and gives type OP for all
19operators.  Additionally, all token lists start with an ENCODING token
20which tells you which encoding was used to decode the bytes stream.
21"""
22
23__author__ = 'Ka-Ping Yee <ping@lfw.org>'
24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
25               'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
26               'Michael Foord')
27from builtins import open as _builtin_open
28from codecs import lookup, BOM_UTF8
29import collections
30import functools
31from io import TextIOWrapper
32import itertools as _itertools
33import re
34import sys
35from token import *
36from token import EXACT_TOKEN_TYPES
37import _tokenize
38
39cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
40blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
41
42import token
43__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding",
44                           "untokenize", "TokenInfo", "open", "TokenError"]
45del token
46
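# Illustrative usage sketch (editor's addition, not part of the module API):
# feed tokenize() a bytes-returning readline and iterate the TokenInfo tuples
# it yields; the first token is always ENCODING.
def _example_tokenize_bytes():
    """Tokenize a tiny byte string and return (token name, string) pairs."""
    from io import BytesIO
    source = b"x = 1\n"
    return [(tok_name[tok.type], tok.string)
            for tok in tokenize(BytesIO(source).readline)]
    # -> [('ENCODING', 'utf-8'), ('NAME', 'x'), ('OP', '='),
    #     ('NUMBER', '1'), ('NEWLINE', '\n'), ('ENDMARKER', '')]
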
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
    def __repr__(self):
        annotated_type = '%d (%s)' % (self.type, tok_name[self.type])
        return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
                self._replace(type=annotated_type))

    @property
    def exact_type(self):
        if self.type == OP and self.string in EXACT_TOKEN_TYPES:
            return EXACT_TOKEN_TYPES[self.string]
        else:
            return self.type

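# Editor's note: an operator token carries the generic OP type, while
# .exact_type resolves the specific kind via EXACT_TOKEN_TYPES: for example,
# a "+" token has type == OP but exact_type == PLUS.  Non-OP tokens simply
# return their own type.
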
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

# Note: we use unicode matching for names ("\w") but ascii matching for
# number literals.
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
Binnumber = r'0[bB](?:_?[01])+'
Octnumber = r'0[oO](?:_?[0-7])+'
Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
                   r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Return the empty string, plus all of the valid string prefixes.
def _all_string_prefixes():
    # The valid string prefixes. Only the lower-case versions are listed
    #  here, and no permutations are included ('fr' appears, but not 'rf');
    #  every casing and ordering is generated below.
    _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
    # if we add binary f-strings, add: ['fb', 'fbr']
    result = {''}
    for prefix in _valid_string_prefixes:
        for t in _itertools.permutations(prefix):
            # create a list with upper and lower versions of each
            #  character
            for u in _itertools.product(*[(c, c.upper()) for c in t]):
                result.add(''.join(u))
    return result

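# For illustration (editor's note): the generated set contains entries such
# as '', 'b', 'B', 'rb', 'bR', 'F' and 'Rf', covering every casing of every
# ordering of the prefixes listed above.
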
@functools.lru_cache
def _compile(expr):
    return re.compile(expr, re.UNICODE)

# Note that since _all_string_prefixes includes the empty string,
#  StringPrefix can be the empty string (making it optional).
StringPrefix = group(*_all_string_prefixes())

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Sorting in reverse order puts the long operators before their prefixes.
# Otherwise if = came before ==, == would get recognized as two instances
# of =.
Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True)))
Funny = group(r'\r?\n', Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

# For a given string prefix plus quotes, endpats maps it to a regex
#  to match the remainder of that string. _prefix can be empty, for
#  a normal single or triple quoted string (with no prefix).
endpats = {}
for _prefix in _all_string_prefixes():
    endpats[_prefix + "'"] = Single
    endpats[_prefix + '"'] = Double
    endpats[_prefix + "'''"] = Single3
    endpats[_prefix + '"""'] = Double3
del _prefix
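# For example (editor's note), endpats['"'] is Double and endpats["f'''"] is
# Single3: the key is a prefix plus its opening quote(s), and the value
# matches up to and including the corresponding closing quote(s).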

# A set of all of the single and triple quoted string prefixes,
#  including the opening quotes.
single_quoted = set()
triple_quoted = set()
for t in _all_string_prefixes():
    for u in (t + '"', t + "'"):
        single_quoted.add(u)
    for u in (t + '"""', t + "'''"):
        triple_quoted.add(u)
del t, u

tabsize = 8

class TokenError(Exception): pass


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0
        self.prev_type = None
        self.encoding = None

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

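    # Editor's note: escape_brackets() re-doubles the literal braces inside
    # FSTRING_MIDDLE tokens (e.g. "{" becomes "{{" and "}" becomes "}}") so
    # that untokenized source round-trips, while leaving the braces of
    # \N{...} named escapes such as "\N{BULLET}" untouched.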
    def escape_brackets(self, token):
        characters = []
        consume_until_next_bracket = False
        for character in token:
            if character == "}":
                if consume_until_next_bracket:
                    consume_until_next_bracket = False
                else:
                    characters.append(character)
            if character == "{":
                n_backslashes = sum(
                    1 for char in _itertools.takewhile(
                        "\\".__eq__,
                        characters[-2::-1]
                    )
                )
                if n_backslashes % 2 == 0 or characters[-1] != "N":
                    characters.append(character)
                else:
                    consume_until_next_bracket = True
            characters.append(character)
        return "".join(characters)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENCODING:
                self.encoding = token
                continue
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            elif tok_type == FSTRING_MIDDLE:
                if '{' in token or '}' in token:
                    token = self.escape_brackets(token)
                    last_line = token.splitlines()[-1]
                    end_line, end_col = end
                    extra_chars = last_line.count("{{") + last_line.count("}}")
                    end = (end_line, end_col + extra_chars)
            elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
            self.prev_type = tok_type
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False
        in_fstring = 0

        for tok in _itertools.chain([token], iterable):
            toknum, tokval = tok[:2]
            if toknum == ENCODING:
                self.encoding = tokval
                continue

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == FSTRING_START:
                in_fstring += 1
            elif toknum == FSTRING_END:
                in_fstring -= 1
            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            elif toknum == FSTRING_MIDDLE:
                tokval = self.escape_brackets(tokval)

            # Insert a space between two consecutive brackets if we are in an f-string
            if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring:
                tokval = ' ' + tokval

            # Insert a space between two consecutive f-strings
            if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END):
                self.tokens.append(" ")

            toks_append(tokval)
            self.prev_type = toknum

def untokenize(iterable):
    """Transform tokens back into Python source code.
    It returns a bytes object, encoded using the ENCODING
    token, which is the first token sequence output by tokenize.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only the first two elements are supplied for each token, the output
    contains the right tokens but not the original whitespace and layout.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output bytes will tokenize back to the input
        t1 = [tok[:2] for tok in tokenize(f.readline)]
        newcode = untokenize(t1)
        readline = BytesIO(newcode).readline
        t2 = [tok[:2] for tok in tokenize(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    out = ut.untokenize(iterable)
    if ut.encoding is not None:
        out = out.encode(ut.encoding)
    return out


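# Illustrative round-trip sketch (editor's addition, not part of the module
# API): with full five-element tokens, untokenize() reproduces the original
# source bytes exactly.
def _example_untokenize_roundtrip():
    """Tokenize a small program and rebuild identical source bytes."""
    from io import BytesIO
    source = b"def f(a, b):\n    return a + b\n"
    tokens = list(tokenize(BytesIO(source).readline))
    rebuilt = untokenize(tokens)
    assert rebuilt == source
    return rebuilt

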
def _get_normal_name(orig_enc):
    """Imitates get_normal_name in Parser/tokenizer/helpers.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263.  If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised.  A SyntaxError is also raised
    if the encoding cookie names an invalid charset.  Note that if a UTF-8 BOM
    is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    try:
        filename = readline.__self__.name
    except AttributeError:
        filename = None
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return b''

    def find_cookie(line):
        try:
            # Decode as UTF-8. Either the line is an encoding declaration,
            # in which case it should be pure ASCII, or it must be UTF-8
            # per default encoding.
            line_string = line.decode('utf-8')
        except UnicodeDecodeError:
            msg = "invalid or missing encoding declaration"
            if filename is not None:
                msg = '{} for {!r}'.format(msg, filename)
            raise SyntaxError(msg)

        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            if filename is None:
                msg = "unknown encoding: " + encoding
            else:
                msg = "unknown encoding for {!r}: {}".format(filename,
                        encoding)
            raise SyntaxError(msg)

        if bom_found:
            if encoding != 'utf-8':
                # This behaviour mimics the Python interpreter
                if filename is None:
                    msg = 'encoding problem: utf-8'
                else:
                    msg = 'encoding problem for {!r}: utf-8'.format(filename)
                raise SyntaxError(msg)
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


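# Illustrative sketch (editor's addition, not part of the module API):
# detect_encoding() reads at most two lines and reports the normalized
# encoding plus the raw lines it consumed.
def _example_detect_encoding():
    """Detect the encoding declared by a PEP 263 coding cookie."""
    from io import BytesIO
    source = b"# -*- coding: latin-1 -*-\nx = 1\n"
    encoding, consumed = detect_encoding(BytesIO(source).readline)
    # encoding == 'iso-8859-1' (the normalized name for latin-1) and
    # consumed holds the single cookie line that was read.
    return encoding, consumed
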
def open(filename):
    """Open a file in read only mode using the encoding detected by
    detect_encoding().
    """
    buffer = _builtin_open(filename, 'rb')
    try:
        encoding, lines = detect_encoding(buffer.readline)
        buffer.seek(0)
        text = TextIOWrapper(buffer, encoding, line_buffering=True)
        text.mode = 'r'
        return text
    except:
        buffer.close()
        raise

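# Example (editor's note): because tokenize.open() applies detect_encoding(),
# a source file with a coding cookie or UTF-8 BOM can be read as text without
# naming the encoding yourself:
#
#     with open("example_module.py") as f:   # tokenize.open, not builtins.open
#         source = f.read()
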
def tokenize(readline):
    """
    The tokenize() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as bytes.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile, 'rb').__next__  # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    physical line.

    The first token sequence will always be an ENCODING token
    which tells you which encoding was used to decode the bytes stream.
    """
    encoding, consumed = detect_encoding(readline)
    rl_gen = _itertools.chain(consumed, iter(readline, b""))
    if encoding is not None:
        if encoding == "utf-8-sig":
            # BOM will already have been stripped.
            encoding = "utf-8"
        yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '')
    yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True)

def generate_tokens(readline):
    """Tokenize a source reading Python code as unicode strings.

    This has the same API as tokenize(), except that it expects the *readline*
    callable to return str objects instead of bytes.
    """
    return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True)

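# Illustrative sketch (editor's addition, not part of the module API):
# generate_tokens() is the str-based counterpart of tokenize(); it takes a
# readline that returns str and does not emit an ENCODING token.
def _example_generate_tokens():
    """Tokenize a string of source without dealing with encodings."""
    from io import StringIO
    return [tok.string for tok in generate_tokens(StringIO("x = 1\n").readline)]
    # -> ['x', '=', '1', '\n', '']
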
def main():
    import argparse

    # Helper error handling routines
    def perror(message):
        sys.stderr.write(message)
        sys.stderr.write('\n')

    def error(message, filename=None, location=None):
        if location:
            args = (filename,) + location + (message,)
            perror("%s:%d:%d: error: %s" % args)
        elif filename:
            perror("%s: error: %s" % (filename, message))
        else:
            perror("error: %s" % message)
        sys.exit(1)

    # Parse the arguments and options
    parser = argparse.ArgumentParser(prog='python -m tokenize')
    parser.add_argument(dest='filename', nargs='?',
                        metavar='filename.py',
                        help='the file to tokenize; defaults to stdin')
    parser.add_argument('-e', '--exact', dest='exact', action='store_true',
                        help='display token names using the exact type')
    args = parser.parse_args()

    try:
        # Tokenize the input
        if args.filename:
            filename = args.filename
            with _builtin_open(filename, 'rb') as f:
                tokens = list(tokenize(f.readline))
        else:
            filename = "<stdin>"
            tokens = _generate_tokens_from_c_tokenizer(
                sys.stdin.readline, extra_tokens=True)


        # Output the tokenization
        for token in tokens:
            token_type = token.type
            if args.exact:
                token_type = token.exact_type
            token_range = "%d,%d-%d,%d:" % (token.start + token.end)
            print("%-20s%-15s%-15r" %
                  (token_range, tok_name[token_type], token.string))
    except IndentationError as err:
        line, column = err.args[1][1:3]
        error(err.args[0], filename, (line, column))
    except TokenError as err:
        line, column = err.args[1]
        error(err.args[0], filename, (line, column))
    except SyntaxError as err:
        error(err, filename)
    except OSError as err:
        error(err)
    except KeyboardInterrupt:
        print("interrupted\n")
    except Exception as err:
        perror("unexpected error: %s" % err)
        raise

def _transform_msg(msg):
    """Transform error messages from the C tokenizer to match those of the
    Python tokenize module.

    The C tokenizer is more picky than the Python one, so we need to massage
    the error messages a bit for backwards compatibility.
    """
    if "unterminated triple-quoted string literal" in msg:
        return "EOF in multi-line string"
    return msg

def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False):
    """Tokenize a source reading Python code as unicode strings using the internal C tokenizer"""
    if encoding is None:
        it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens)
    else:
        it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens)
    try:
        for info in it:
            yield TokenInfo._make(info)
    except SyntaxError as e:
        if type(e) != SyntaxError:
            raise e from None
        msg = _transform_msg(e.msg)
        raise TokenError(msg, (e.lineno, e.offset)) from None


if __name__ == "__main__":
    main()