# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import string, re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize",
           "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str

def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'[a-zA-Z_]\w*'

Binnumber = r'0[bB][01]*'
Hexnumber = r'0[xX][\da-fA-F]*[lL]?'
Octnumber = r'0[oO]?[0-7]*[lL]?'
Decnumber = r'[1-9]\d*[lL]?'
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+'
Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent)
Expfloat = r'\d+' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""')
# Single-line ' or " string.
String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
                 r"//=?", r"->",
                 r"[+\-*/%&@|^=<>]=?",
                 r"~")

Bracket = '[][(){}]'
Special = group(r'\r?\n', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
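
# Illustrative example (not part of the original module): the composed
# regular expressions above can be tried directly with re, e.g.:
#
#   >>> re.match(Number, "0x1fL").group()
#   '0x1fL'
#   >>> re.match(Floatnumber, "3.14e-2").group()
#   '3.14e-2'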

# First (or only) line of ' or " string.
ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            "r'''": single3prog, 'r"""': double3prog,
            "u'''": single3prog, 'u"""': double3prog,
            "b'''": single3prog, 'b"""': double3prog,
            "ur'''": single3prog, 'ur"""': double3prog,
            "br'''": single3prog, 'br"""': double3prog,
            "R'''": single3prog, 'R"""': double3prog,
            "U'''": single3prog, 'U"""': double3prog,
            "B'''": single3prog, 'B"""': double3prog,
            "uR'''": single3prog, 'uR"""': double3prog,
            "Ur'''": single3prog, 'Ur"""': double3prog,
            "UR'''": single3prog, 'UR"""': double3prog,
            "bR'''": single3prog, 'bR"""': double3prog,
            "Br'''": single3prog, 'Br"""': double3prog,
            "BR'''": single3prog, 'BR"""': double3prog,
            'r': None, 'R': None,
            'u': None, 'U': None,
            'b': None, 'B': None}

triple_quoted = {}
for t in ("'''", '"""',
          "r'''", 'r"""', "R'''", 'R"""',
          "u'''", 'u"""', "U'''", 'U"""',
          "b'''", 'b"""', "B'''", 'B"""',
          "ur'''", 'ur"""', "Ur'''", 'Ur"""',
          "uR'''", 'uR"""', "UR'''", 'UR"""',
          "br'''", 'br"""', "Br'''", 'Br"""',
          "bR'''", 'bR"""', "BR'''", 'BR"""',):
    triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
          "r'", 'r"', "R'", 'R"',
          "u'", 'u"', "U'", 'U"',
          "b'", 'b"', "B'", 'B"',
          "ur'", 'ur"', "Ur'", 'Ur"',
          "uR'", 'uR"', "UR'", 'UR"',
          "br'", 'br"', "Br'", 'Br"',
          "bR'", 'bR"', "BR'", 'BR"', ):
    single_quoted[t] = t

tabsize = 8

class TokenError(Exception): pass

class StopTokenizing(Exception): pass

def printtoken(type, token, start, end, line): # for testing
    (srow, scol) = start
    (erow, ecol) = end
    print "%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token))

def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object.  It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
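
# Illustrative example (not part of the original module): running tokenize()
# with the default printtoken callback over a one-line source prints one
# line per token (output columns are tab-separated):
#
#   >>> import StringIO
#   >>> tokenize(StringIO.StringIO("x = 1\n").readline)
#   1,0-1,1:        NAME    'x'
#   1,2-1,3:        OP      '='
#   1,4-1,5:        NUMBER  '1'
#   1,5-1,6:        NEWLINE '\n'
#   2,0-2,0:        ENDMARKER       ''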

# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)

class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)

cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)')
blank_re = re.compile(r'^[ \t\f]*(?:[#\r\n]|$)')

def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc
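
# Illustrative example (not part of the original module): _get_normal_name()
# canonicalizes the common spellings of the two encodings the tokenizer
# treats specially:
#
#   >>> _get_normal_name("UTF_8")
#   'utf-8'
#   >>> _get_normal_name("Latin-1")
#   'iso-8859-1'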

def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file.  It requires one argument,
    readline, in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read in.

    It detects the encoding from the presence of a UTF-8 BOM or an encoding
    cookie as specified in PEP 0263.  If both a BOM and a cookie are present
    but disagree, a SyntaxError will be raised.  If the encoding cookie is an
    invalid charset, a SyntaxError will also be raised.  Note that if a UTF-8
    BOM is found, 'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'
    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]

def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two-element tuples are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).next
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
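
# Illustrative example (not part of the original module): detecting a
# PEP 0263 coding cookie from raw source lines:
#
#   >>> import StringIO
#   >>> buf = StringIO.StringIO("# -*- coding: latin-1 -*-\nx = 1\n")
#   >>> detect_encoding(buf.readline)
#   ('iso-8859-1', ['# -*- coding: latin-1 -*-\n'])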

def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects.  Each call to the function
    should return one line of input as a string.  Alternatively, readline
    can be a callable terminating with StopIteration:
        readline = open(myfile).next    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found.  The line passed is the
    logical line; continuation lines are included.
    """
    lnum = parenlev = continued = 0
    namechars, numchars = string.ascii_letters + '_', '0123456789'
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError, ("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]
                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

        else:                                  # continued statement
            if not line:
                raise TokenError, ("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]
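
                # Dispatch on the token's first character (and, for
                # strings, on its prefix): numbers, newlines, comments,
                # triple- and single-quoted strings, names, backslash
                # continuations, and finally operators and brackets.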
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    yield (newline, token, spos, epos, line)
                elif initial == '#':
                    assert not token.endswith("\n")
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial in namechars:                 # ordinary name
                    yield (NAME, token, spos, epos, line)
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    for indent in indents[1:]:                 # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')

if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline)
    else: tokenize(sys.stdin.readline)
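
# Illustrative example (not part of the original module): the full-tuple
# round trip described in untokenize()'s docstring holds for complete input:
#
#   >>> import StringIO
#   >>> src = "x = 1\nif x:\n    y = 2\n"
#   >>> toks = generate_tokens(StringIO.StringIO(src).readline)
#   >>> untokenize(toks) == src
#   True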