1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens.  It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF).  It generates
65-tuples with these members:
7
8    the token type (see token.py)
9    the token (a string)
10    the starting (row, column) indices of the token (a 2-tuple of ints)
11    the ending (row, column) indices of the token (a 2-tuple of ints)
12    the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19    tokenize_loop(readline, tokeneater)
20    tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
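
# For illustration (an added note, not in the original module): these
# helpers just build regex alternations, e.g.
#     group('a', 'b')  -> '(a|b)'
#     any('a', 'b')    -> '(a|b)*'
#     maybe('a', 'b')  -> '(a|b)?'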

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
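
# For illustration (an added note, not in the original module), a few
# literals Number accepts:
#     re.match(Number, '0x1fL').group()   -> '0x1fL'   (hex, long suffix)
#     re.match(Number, '3.14e-2').group() -> '3.14e-2' (float with exponent)
#     re.match(Number, '1j').group()      -> '1j'      (imaginary)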

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
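
# For illustration (an added note, not in the original module): with this
# ordering, re.match(Operator, '**=').group() returns '**='; if the
# single-character class came first, the match would stop after '*'.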

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"'):
    single_quoted[t] = t

tabsize = 8
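
# For illustration (an added note, not in the original module): the
# indentation loop in generate_tokens below advances the column to the
# next multiple of tabsize on a tab, so a tab seen at column 3 moves the
# column to (3//tabsize + 1)*tabsize == 8.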

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

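# For illustration (an added note, not in the original module): printtoken
# writes lines of the form
#     1,0-1,1:<TAB>NAME<TAB>'x'
# i.e. the start and end (row,col), the token type name, and the repr of
# the token string.
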
def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
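
# A minimal usage sketch (an addition, not part of the original module),
# assuming Python 2 and its StringIO module; it runs tokenize() with the
# default printtoken callback over a one-line snippet:
def _example_tokenize():
    from StringIO import StringIO
    tokenize(StringIO("x = 1\n").readline)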

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:
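    """Helper for untokenize(); rebuilds source text from a token stream."""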

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
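
# A round-trip sketch (an addition, not part of the original module),
# following the limited-input invariant documented above; f is assumed to
# be an open Python 2 file-like object:
def _example_roundtrip(f):
    t1 = [tok[:2] for tok in generate_tokens(f.readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(1)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    return t1 == t2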

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    last_line = b''
    line = b''
    while 1:                                   # loop over lines in stream
        try:
            # We capture the value of the line variable here because
            # readline uses the empty string '' to signal end of input,
            # hence `line` itself will always be overwritten at the end
            # of this loop.
            last_line = line
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos += 1

    # Add an implicit NEWLINE if the input doesn't end in one
    if last_line and last_line[-1] not in '\r\n':
        yield (NEWLINE, '', (lnum - 1, len(last_line)),
               (lnum - 1, len(last_line) + 1), '')
    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

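# A minimal usage sketch (an addition, not part of the original module),
# assuming Python 2 and its StringIO module; it collects the
# (type name, token string) pairs generate_tokens produces for a small
# snippet:
def _example_generate_tokens():
    from StringIO import StringIO
    source = "if x:\n    y = 1\n"
    return [(tok_name[tok[0]], tok[1])
            for tok in generate_tokens(StringIO(source).readline)]
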
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)