# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

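# group('a', 'b') yields '(a|b)', any('x') yields '(x)*', and maybe('y')
# yields '(y)?'; the token regular expressions below are built from these
# helpers.
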
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

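# Illustrative usage sketch (not part of the original module): drive
# tokenize() with its default printtoken callback from an in-memory string.
# Assumes Python 2, matching the print-statement syntax used in this file;
# the helper name _example_print_tokens is hypothetical.
def _example_print_tokens(source):
    """Print every token of `source` via tokenize()'s default callback."""
    from StringIO import StringIO
    tokenize(StringIO(source).readline)
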
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)')

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
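# For example, _get_normal_name("UTF_8") returns "utf-8" and
# _get_normal_name("Latin-1") returns "iso-8859-1".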

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 263. If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised. If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a
    UTF-8 BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

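# Illustrative usage sketch (not part of the original module): hand a
# bytes-producing readline to detect_encoding().  The helper name
# _example_detect_file_encoding is hypothetical.
def _example_detect_file_encoding(path):
    """Return the declared (or default) encoding of the file at `path`."""
    f = open(path, 'rb')
    try:
        encoding, first_lines = detect_encoding(f.readline)
    finally:
        f.close()
    return encoding
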
def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

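# Illustrative usage sketch (not part of the original module): a runnable
# version of the "limited input" round trip documented above.  The helper
# name _example_roundtrip is hypothetical; iter(...).next assumes Python 2.
def _example_roundtrip(readline):
    """Check that a 2-tuple token stream survives untokenize/re-tokenize."""
    t1 = [tok[:2] for tok in generate_tokens(readline)]
    newcode = untokenize(t1)
    t2 = [tok[:2] for tok in generate_tokens(iter(newcode.splitlines(1)).next)]
    return t1 == t2
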
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

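# Illustrative usage sketch (not part of the original module): unpack the
# 5-tuples produced by generate_tokens() and keep only NAME tokens.  The
# helper name _example_list_names is hypothetical; StringIO assumes Python 2.
def _example_list_names(source):
    """Return every NAME token string found in `source`."""
    from StringIO import StringIO
    names = []
    for tok_type, tok_string, start, end, line in generate_tokens(
            StringIO(source).readline):
        if tok_type == NAME:
            names.append(tok_string)
    return names
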
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)