1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens.  It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF).  It generates
65-tuples with these members:
7
8    the token type (see token.py)
9    the token (a string)
10    the starting (row, column) indices of the token (a 2-tuple of ints)
11    the ending (row, column) indices of the token (a 2-tuple of ints)
12    the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19    tokenize_loop(readline, tokeneater)
20    tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
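
# Illustrative usage sketch (not part of the original module).  Any object
# with a suitable readline method works; StringIO is used here purely as an
# example:
#
#     from StringIO import StringIO
#     for tok_type, tok_str, start, end, logical_line in \
#             generate_tokens(StringIO("x = 1\n").readline):
#         print tok_name[tok_type], repr(tok_str), start, end
#
# For that input this prints NAME, OP, NUMBER, NEWLINE and ENDMARKER tokens,
# each with its (row, column) start and end positions.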

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Alternation in Python regexes takes the first alternative that matches
# (leftmost match, not longest), so be sure to put the longest operators
# first (e.g., if = came before ==, == would get recognized as two
# instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
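
# Illustration of the ordering rule above (not part of the original module):
# because the first matching alternative wins, the shorter operator would be
# chosen if it were listed first:
#
#     >>> import re
#     >>> re.match(group('=', '=='), '==').group(0)
#     '='
#     >>> re.match(group('==', '='), '==').group(0)
#     '=='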

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

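# Illustrative sketch (not part of the original module) of the callback-based
# entry point described in the module docstring; _example_count_names is a
# hypothetical helper, shown only to demonstrate the tokeneater interface.
def _example_count_names(readline):
    """Return the number of NAME tokens produced for the given readline."""
    counts = [0]
    def eater(type, token, start, end, line):    # receives the 5 token fields
        if type == NAME:
            counts[0] += 1
    tokenize(readline, eater)
    return counts[0]
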
class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only these two elements are supplied per token (no position
    information), the spacing of the resulting output is approximate.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)

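# Illustrative round-trip sketch (not part of the original module), following
# the docstring above: with full 5-tuples the untokenized text should match
# the input source exactly.  _example_roundtrip is a hypothetical helper.
def _example_roundtrip(source):
    """Return True if untokenizing the full token stream reproduces source."""
    readline = iter(source.splitlines(True)).next    # Python 2 iterator protocol
    tokens = list(generate_tokens(readline))
    return untokenize(tokens) == source
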
def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object that provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable that signals end of input by raising StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

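# Illustrative sketch (not part of the original module): collecting the token
# type names for a file, using a bound readline method as described in the
# generate_tokens() docstring above.  _example_token_names is a hypothetical
# helper shown only for demonstration.
def _example_token_names(path):
    """Return the list of token type names produced for the file at path."""
    f = open(path)
    try:
        return [tok_name[tok[0]] for tok in generate_tokens(f.readline)]
    finally:
        f.close()
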
if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)