1"""Tokenization help for Python programs.
2
3generate_tokens(readline) is a generator that breaks a stream of
4text into Python tokens.  It accepts a readline-like method which is called
5repeatedly to get the next line of input (or "" for EOF).  It generates
65-tuples with these members:
7
8    the token type (see token.py)
9    the token (a string)
10    the starting (row, column) indices of the token (a 2-tuple of ints)
11    the ending (row, column) indices of the token (a 2-tuple of ints)
12    the original line (string)
13
14It is designed to match the working of the Python tokenizer exactly, except
15that it produces COMMENT tokens for comments and gives type OP for all
16operators
17
18Older entry points
19    tokenize_loop(readline, tokeneater)
20    tokenize(readline, tokeneater=printtoken)
21are the same, except instead of generating tokens, tokeneater is a callback
22function to which the 5 fields described above are passed as 5 arguments,
23each time a new token is found."""
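
# Illustrative sketch: one way a caller could consume the 5-tuples described
# above.  The helper name and the sample source are hypothetical;
# generate_tokens and tok_name are defined/imported further down in this file
# and are only resolved when the function is called.
def _example_dump_tokens(source="x = 1  # set x\n"):
    readline = iter(source.splitlines(True)).next      # readline-like callable
    for tok_type, tok_string, start, end, logical_line in generate_tokens(readline):
        print "%s %r %s-%s" % (tok_name[tok_type], tok_string, start, end)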

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
               'Skip Montanaro, Raymond Hettinger')

from itertools import chain
import string, re
from token import *

import token
__all__ = [x for x in dir(token) if not x.startswith("_")]
__all__ += ["COMMENT", "tokenize", "generate_tokens", "NL", "untokenize"]
del x
del token

COMMENT = N_TOKENS
tok_name[COMMENT] = 'COMMENT'
NL = N_TOKENS + 1
tok_name[NL] = 'NL'
N_TOKENS += 2

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
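
# Illustrative check of what the helpers above build: group() joins
# alternatives, any() allows zero or more repetitions, maybe() makes the
# group optional.
assert group('a', 'b') == '(a|b)'
assert any('a', 'b') == '(a|b)*'
assert maybe('a', 'b') == '(a|b)?'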

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Hexnumber = r'0[xX][\da-fA-F]+[lL]?'
Octnumber = r'(0[oO][0-7]+)|(0[0-7]*)[lL]?'
Binnumber = r'0[bB][01]+[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
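
# Illustrative check: the composed Number pattern accepts the usual Python 2
# numeric literal forms (hex/long, float with exponent, imaginary).
assert re.match(Number, '0x1fL').group() == '0x1fL'
assert re.match(Number, '3.14e-2').group() == '3.14e-2'
assert re.match(Number, '10j').group() == '10j'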

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[uUbB]?[rR]?'''", '[uUbB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because alternatives are tried left to right (the first match wins), be
# sure to put the longest operators first (e.g., if = came before ==, ==
# would get recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?",
                 r"[+\-*/%&|^=<>]=?",
                 r"~")
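
# Illustrative check: because the longer forms are listed first, compound
# operators are matched as a single token rather than as a prefix plus
# leftovers.
assert re.match(Operator, '**=').group() == '**='
assert re.match(Operator, '//=').group() == '//='
assert re.match(Operator, '<=').group() == '<='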

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None, 'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""'):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "b'", 'b"', "B'", 'B"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"' ):
    single_quoted[t] = t
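
# Illustrative check: every recognised opening quote (with any prefix)
# appears in triple_quoted or single_quoted, and maps in endprogs to the
# regex for the matching closing quote.
assert "uR'''" in triple_quoted and "bR'" in single_quoted
assert endprogs["r'''"] is single3prog
assert endprogs['b"""'] is double3prog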

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, srow_scol, erow_ecol, line): # for testing
    srow, scol = srow_scol
    erow, ecol = erow_ecol
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)
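
# Illustrative sketch: the older tokenize() entry point with a custom
# tokeneater callback instead of the default printtoken.  The helper name and
# the sample source are hypothetical.
def _example_collect_names(source="spam = eggs + 1\n"):
    names = []
    def tokeneater(tok_type, tok_string, start, end, logical_line):
        if tok_type == NAME:
            names.append(tok_string)
    tokenize(iter(source.splitlines(True)).next, tokeneater)
    return names            # ['spam', 'eggs'] for the default source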

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        if row < self.prev_row or row == self.prev_row and col < self.prev_col:
            raise ValueError("start ({},{}) precedes previous end ({},{})"
                             .format(row, col, self.prev_row, self.prev_col))
        row_offset = row - self.prev_row
        if row_offset:
            self.tokens.append("\\\n" * row_offset)
            self.prev_col = 0
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        it = iter(iterable)
        indents = []
        startline = False
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            if tok_type == ENDMARKER:
                break
            if tok_type == INDENT:
                indents.append(token)
                continue
            elif tok_type == DEDENT:
                indents.pop()
                self.prev_row, self.prev_col = end
                continue
            elif tok_type in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                indent = indents[-1]
                if start[1] >= len(indent):
                    self.tokens.append(indent)
                    self.prev_col = len(indent)
                startline = False
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        indents = []
        toks_append = self.tokens.append
        startline = token[0] in (NEWLINE, NL)
        prevstring = False

        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            # Insert a space between two consecutive strings
            if toknum == STRING:
                if prevstring:
                    tokval = ' ' + tokval
                prevstring = True
            else:
                prevstring = False

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two elements are passed per token, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
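
# Illustrative sketch: the "limited input" round trip promised in the
# untokenize() docstring, run on an in-memory string instead of a file.  The
# helper name and the sample source are hypothetical.
def _example_roundtrip(source="if x:\n    y = 2  # done\n"):
    readline = iter(source.splitlines(True)).next
    t1 = [tok[:2] for tok in generate_tokens(readline)]
    newcode = untokenize(t1)
    readline = iter(newcode.splitlines(True)).next
    t2 = [tok[:2] for tok in generate_tokens(readline)]
    assert t1 == t2
    return newcode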

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum += 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                           strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ':
                    column += 1
                elif line[pos] == '\t':
                    column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos += 1
            if pos == max:
                break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                if start == end:
                    continue
                token, initial = line[start:end], line[start]

                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    yield (NL if parenlev > 0 else NEWLINE,
                           token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev += 1
                    elif initial in ')]}':
                        parenlev -= 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                           (lnum, pos), (lnum, pos+1), line)
                pos += 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)