1# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 2# All rights reserved. 3 4"""Tokenization help for Python programs. 5 6generate_tokens(readline) is a generator that breaks a stream of 7text into Python tokens. It accepts a readline-like method which is called 8repeatedly to get the next line of input (or "" for EOF). It generates 95-tuples with these members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators 20 21Older entry points 22 tokenize_loop(readline, tokeneater) 23 tokenize(readline, tokeneater=printtoken) 24are the same, except instead of generating tokens, tokeneater is a callback 25function to which the 5 fields described above are passed as 5 arguments, 26each time a new token is found.""" 27 28__author__ = 'Ka-Ping Yee <ping@lfw.org>' 29__credits__ = \ 30 'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro' 31 32import string, re 33from codecs import BOM_UTF8, lookup 34from lib2to3.pgen2.token import * 35 36from . import token 37__all__ = [x for x in dir(token) if x[0] != '_'] + ["tokenize", 38 "generate_tokens", "untokenize"] 39del token 40 41try: 42 bytes 43except NameError: 44 # Support bytes type in Python <= 2.5, so 2to3 turns itself into 45 # valid Python 3 code. 46 bytes = str 47 48def group(*choices): return '(' + '|'.join(choices) + ')' 49def any(*choices): return group(*choices) + '*' 50def maybe(*choices): return group(*choices) + '?' 51 52Whitespace = r'[ \f\t]*' 53Comment = r'#[^\r\n]*' 54Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 55Name = r'[a-zA-Z_]\w*' 56 57Binnumber = r'0[bB][01]*' 58Hexnumber = r'0[xX][\da-fA-F]*[lL]?' 59Octnumber = r'0[oO]?[0-7]*[lL]?' 60Decnumber = r'[1-9]\d*[lL]?' 61Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber) 62Exponent = r'[eE][-+]?\d+' 63Pointfloat = group(r'\d+\.\d*', r'\.\d+') + maybe(Exponent) 64Expfloat = r'\d+' + Exponent 65Floatnumber = group(Pointfloat, Expfloat) 66Imagnumber = group(r'\d+[jJ]', Floatnumber + r'[jJ]') 67Number = group(Imagnumber, Floatnumber, Intnumber) 68 69# Tail end of ' string. 70Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 71# Tail end of " string. 72Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 73# Tail end of ''' string. 74Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 75# Tail end of """ string. 76Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 77Triple = group("[ubUB]?[rR]?'''", '[ubUB]?[rR]?"""') 78# Single-line ' or " string. 79String = group(r"[uU]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 80 r'[uU]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 81 82# Because of leftmost-then-longest match semantics, be sure to put the 83# longest operators first (e.g., if = came before ==, == would get 84# recognized as two instances of =). 85Operator = group(r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=", 86 r"//=?", r"->", 87 r"[+\-*/%&@|^=<>]=?", 88 r"~") 89 90Bracket = '[][(){}]' 91Special = group(r'\r?\n', r'[:;.,`@]') 92Funny = group(Operator, Bracket, Special) 93 94PlainToken = group(Number, Funny, String, Name) 95Token = Ignore + PlainToken 96 97# First (or only) line of ' or " string. 98ContStr = group(r"[uUbB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 99 group("'", r'\\\r?\n'), 100 r'[uUbB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 101 group('"', r'\\\r?\n')) 102PseudoExtras = group(r'\\\r?\n', Comment, Triple) 103PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 104 105tokenprog, pseudoprog, single3prog, double3prog = list(map( 106 re.compile, (Token, PseudoToken, Single3, Double3))) 107endprogs = {"'": re.compile(Single), '"': re.compile(Double), 108 "'''": single3prog, '"""': double3prog, 109 "r'''": single3prog, 'r"""': double3prog, 110 "u'''": single3prog, 'u"""': double3prog, 111 "b'''": single3prog, 'b"""': double3prog, 112 "ur'''": single3prog, 'ur"""': double3prog, 113 "br'''": single3prog, 'br"""': double3prog, 114 "R'''": single3prog, 'R"""': double3prog, 115 "U'''": single3prog, 'U"""': double3prog, 116 "B'''": single3prog, 'B"""': double3prog, 117 "uR'''": single3prog, 'uR"""': double3prog, 118 "Ur'''": single3prog, 'Ur"""': double3prog, 119 "UR'''": single3prog, 'UR"""': double3prog, 120 "bR'''": single3prog, 'bR"""': double3prog, 121 "Br'''": single3prog, 'Br"""': double3prog, 122 "BR'''": single3prog, 'BR"""': double3prog, 123 'r': None, 'R': None, 124 'u': None, 'U': None, 125 'b': None, 'B': None} 126 127triple_quoted = {} 128for t in ("'''", '"""', 129 "r'''", 'r"""', "R'''", 'R"""', 130 "u'''", 'u"""', "U'''", 'U"""', 131 "b'''", 'b"""', "B'''", 'B"""', 132 "ur'''", 'ur"""', "Ur'''", 'Ur"""', 133 "uR'''", 'uR"""', "UR'''", 'UR"""', 134 "br'''", 'br"""', "Br'''", 'Br"""', 135 "bR'''", 'bR"""', "BR'''", 'BR"""',): 136 triple_quoted[t] = t 137single_quoted = {} 138for t in ("'", '"', 139 "r'", 'r"', "R'", 'R"', 140 "u'", 'u"', "U'", 'U"', 141 "b'", 'b"', "B'", 'B"', 142 "ur'", 'ur"', "Ur'", 'Ur"', 143 "uR'", 'uR"', "UR'", 'UR"', 144 "br'", 'br"', "Br'", 'Br"', 145 "bR'", 'bR"', "BR'", 'BR"', ): 146 single_quoted[t] = t 147 148tabsize = 8 149 150class TokenError(Exception): pass 151 152class StopTokenizing(Exception): pass 153 154def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing 155 (srow, scol) = xxx_todo_changeme 156 (erow, ecol) = xxx_todo_changeme1 157 print("%d,%d-%d,%d:\t%s\t%s" % \ 158 (srow, scol, erow, ecol, tok_name[type], repr(token))) 159 160def tokenize(readline, tokeneater=printtoken): 161 """ 162 The tokenize() function accepts two parameters: one representing the 163 input stream, and one providing an output mechanism for tokenize(). 164 165 The first parameter, readline, must be a callable object which provides 166 the same interface as the readline() method of built-in file objects. 167 Each call to the function should return one line of input as a string. 168 169 The second parameter, tokeneater, must also be a callable object. It is 170 called once for each token, with five arguments, corresponding to the 171 tuples generated by generate_tokens(). 172 """ 173 try: 174 tokenize_loop(readline, tokeneater) 175 except StopTokenizing: 176 pass 177 178# backwards compatible interface 179def tokenize_loop(readline, tokeneater): 180 for token_info in generate_tokens(readline): 181 tokeneater(*token_info) 182 183class Untokenizer: 184 185 def __init__(self): 186 self.tokens = [] 187 self.prev_row = 1 188 self.prev_col = 0 189 190 def add_whitespace(self, start): 191 row, col = start 192 assert row <= self.prev_row 193 col_offset = col - self.prev_col 194 if col_offset: 195 self.tokens.append(" " * col_offset) 196 197 def untokenize(self, iterable): 198 for t in iterable: 199 if len(t) == 2: 200 self.compat(t, iterable) 201 break 202 tok_type, token, start, end, line = t 203 self.add_whitespace(start) 204 self.tokens.append(token) 205 self.prev_row, self.prev_col = end 206 if tok_type in (NEWLINE, NL): 207 self.prev_row += 1 208 self.prev_col = 0 209 return "".join(self.tokens) 210 211 def compat(self, token, iterable): 212 startline = False 213 indents = [] 214 toks_append = self.tokens.append 215 toknum, tokval = token 216 if toknum in (NAME, NUMBER): 217 tokval += ' ' 218 if toknum in (NEWLINE, NL): 219 startline = True 220 for tok in iterable: 221 toknum, tokval = tok[:2] 222 223 if toknum in (NAME, NUMBER, ASYNC, AWAIT): 224 tokval += ' ' 225 226 if toknum == INDENT: 227 indents.append(tokval) 228 continue 229 elif toknum == DEDENT: 230 indents.pop() 231 continue 232 elif toknum in (NEWLINE, NL): 233 startline = True 234 elif startline and indents: 235 toks_append(indents[-1]) 236 startline = False 237 toks_append(tokval) 238 239cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 240blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 241 242def _get_normal_name(orig_enc): 243 """Imitates get_normal_name in tokenizer.c.""" 244 # Only care about the first 12 characters. 245 enc = orig_enc[:12].lower().replace("_", "-") 246 if enc == "utf-8" or enc.startswith("utf-8-"): 247 return "utf-8" 248 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 249 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 250 return "iso-8859-1" 251 return orig_enc 252 253def detect_encoding(readline): 254 """ 255 The detect_encoding() function is used to detect the encoding that should 256 be used to decode a Python source file. It requires one argument, readline, 257 in the same way as the tokenize() generator. 258 259 It will call readline a maximum of twice, and return the encoding used 260 (as a string) and a list of any lines (left as bytes) it has read 261 in. 262 263 It detects the encoding from the presence of a utf-8 bom or an encoding 264 cookie as specified in pep-0263. If both a bom and a cookie are present, but 265 disagree, a SyntaxError will be raised. If the encoding cookie is an invalid 266 charset, raise a SyntaxError. Note that if a utf-8 bom is found, 267 'utf-8-sig' is returned. 268 269 If no encoding is specified, then the default of 'utf-8' will be returned. 270 """ 271 bom_found = False 272 encoding = None 273 default = 'utf-8' 274 def read_or_stop(): 275 try: 276 return readline() 277 except StopIteration: 278 return bytes() 279 280 def find_cookie(line): 281 try: 282 line_string = line.decode('ascii') 283 except UnicodeDecodeError: 284 return None 285 match = cookie_re.match(line_string) 286 if not match: 287 return None 288 encoding = _get_normal_name(match.group(1)) 289 try: 290 codec = lookup(encoding) 291 except LookupError: 292 # This behaviour mimics the Python interpreter 293 raise SyntaxError("unknown encoding: " + encoding) 294 295 if bom_found: 296 if codec.name != 'utf-8': 297 # This behaviour mimics the Python interpreter 298 raise SyntaxError('encoding problem: utf-8') 299 encoding += '-sig' 300 return encoding 301 302 first = read_or_stop() 303 if first.startswith(BOM_UTF8): 304 bom_found = True 305 first = first[3:] 306 default = 'utf-8-sig' 307 if not first: 308 return default, [] 309 310 encoding = find_cookie(first) 311 if encoding: 312 return encoding, [first] 313 if not blank_re.match(first): 314 return default, [first] 315 316 second = read_or_stop() 317 if not second: 318 return default, [first] 319 320 encoding = find_cookie(second) 321 if encoding: 322 return encoding, [first, second] 323 324 return default, [first, second] 325 326def untokenize(iterable): 327 """Transform tokens back into Python source code. 328 329 Each element returned by the iterable must be a token sequence 330 with at least two elements, a token number and token value. If 331 only two tokens are passed, the resulting output is poor. 332 333 Round-trip invariant for full input: 334 Untokenized source will match input source exactly 335 336 Round-trip invariant for limited intput: 337 # Output text will tokenize the back to the input 338 t1 = [tok[:2] for tok in generate_tokens(f.readline)] 339 newcode = untokenize(t1) 340 readline = iter(newcode.splitlines(1)).next 341 t2 = [tok[:2] for tokin generate_tokens(readline)] 342 assert t1 == t2 343 """ 344 ut = Untokenizer() 345 return ut.untokenize(iterable) 346 347def generate_tokens(readline): 348 """ 349 The generate_tokens() generator requires one argument, readline, which 350 must be a callable object which provides the same interface as the 351 readline() method of built-in file objects. Each call to the function 352 should return one line of input as a string. Alternately, readline 353 can be a callable function terminating with StopIteration: 354 readline = open(myfile).next # Example of alternate readline 355 356 The generator produces 5-tuples with these members: the token type; the 357 token string; a 2-tuple (srow, scol) of ints specifying the row and 358 column where the token begins in the source; a 2-tuple (erow, ecol) of 359 ints specifying the row and column where the token ends in the source; 360 and the line on which the token was found. The line passed is the 361 logical line; continuation lines are included. 362 """ 363 lnum = parenlev = continued = 0 364 namechars, numchars = string.ascii_letters + '_', '0123456789' 365 contstr, needcont = '', 0 366 contline = None 367 indents = [0] 368 369 # 'stashed' and 'async_*' are used for async/await parsing 370 stashed = None 371 async_def = False 372 async_def_indent = 0 373 async_def_nl = False 374 375 while 1: # loop over lines in stream 376 try: 377 line = readline() 378 except StopIteration: 379 line = '' 380 lnum = lnum + 1 381 pos, max = 0, len(line) 382 383 if contstr: # continued string 384 if not line: 385 raise TokenError("EOF in multi-line string", strstart) 386 endmatch = endprog.match(line) 387 if endmatch: 388 pos = end = endmatch.end(0) 389 yield (STRING, contstr + line[:end], 390 strstart, (lnum, end), contline + line) 391 contstr, needcont = '', 0 392 contline = None 393 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 394 yield (ERRORTOKEN, contstr + line, 395 strstart, (lnum, len(line)), contline) 396 contstr = '' 397 contline = None 398 continue 399 else: 400 contstr = contstr + line 401 contline = contline + line 402 continue 403 404 elif parenlev == 0 and not continued: # new statement 405 if not line: break 406 column = 0 407 while pos < max: # measure leading whitespace 408 if line[pos] == ' ': column = column + 1 409 elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize 410 elif line[pos] == '\f': column = 0 411 else: break 412 pos = pos + 1 413 if pos == max: break 414 415 if stashed: 416 yield stashed 417 stashed = None 418 419 if line[pos] in '#\r\n': # skip comments or blank lines 420 if line[pos] == '#': 421 comment_token = line[pos:].rstrip('\r\n') 422 nl_pos = pos + len(comment_token) 423 yield (COMMENT, comment_token, 424 (lnum, pos), (lnum, pos + len(comment_token)), line) 425 yield (NL, line[nl_pos:], 426 (lnum, nl_pos), (lnum, len(line)), line) 427 else: 428 yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], 429 (lnum, pos), (lnum, len(line)), line) 430 continue 431 432 if column > indents[-1]: # count indents or dedents 433 indents.append(column) 434 yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 435 while column < indents[-1]: 436 if column not in indents: 437 raise IndentationError( 438 "unindent does not match any outer indentation level", 439 ("<tokenize>", lnum, pos, line)) 440 indents = indents[:-1] 441 442 if async_def and async_def_indent >= indents[-1]: 443 async_def = False 444 async_def_nl = False 445 async_def_indent = 0 446 447 yield (DEDENT, '', (lnum, pos), (lnum, pos), line) 448 449 if async_def and async_def_nl and async_def_indent >= indents[-1]: 450 async_def = False 451 async_def_nl = False 452 async_def_indent = 0 453 454 else: # continued statement 455 if not line: 456 raise TokenError("EOF in multi-line statement", (lnum, 0)) 457 continued = 0 458 459 while pos < max: 460 pseudomatch = pseudoprog.match(line, pos) 461 if pseudomatch: # scan for tokens 462 start, end = pseudomatch.span(1) 463 spos, epos, pos = (lnum, start), (lnum, end), end 464 token, initial = line[start:end], line[start] 465 466 if initial in numchars or \ 467 (initial == '.' and token != '.'): # ordinary number 468 yield (NUMBER, token, spos, epos, line) 469 elif initial in '\r\n': 470 newline = NEWLINE 471 if parenlev > 0: 472 newline = NL 473 elif async_def: 474 async_def_nl = True 475 if stashed: 476 yield stashed 477 stashed = None 478 yield (newline, token, spos, epos, line) 479 480 elif initial == '#': 481 assert not token.endswith("\n") 482 if stashed: 483 yield stashed 484 stashed = None 485 yield (COMMENT, token, spos, epos, line) 486 elif token in triple_quoted: 487 endprog = endprogs[token] 488 endmatch = endprog.match(line, pos) 489 if endmatch: # all on one line 490 pos = endmatch.end(0) 491 token = line[start:pos] 492 if stashed: 493 yield stashed 494 stashed = None 495 yield (STRING, token, spos, (lnum, pos), line) 496 else: 497 strstart = (lnum, start) # multiple lines 498 contstr = line[start:] 499 contline = line 500 break 501 elif initial in single_quoted or \ 502 token[:2] in single_quoted or \ 503 token[:3] in single_quoted: 504 if token[-1] == '\n': # continued string 505 strstart = (lnum, start) 506 endprog = (endprogs[initial] or endprogs[token[1]] or 507 endprogs[token[2]]) 508 contstr, needcont = line[start:], 1 509 contline = line 510 break 511 else: # ordinary string 512 if stashed: 513 yield stashed 514 stashed = None 515 yield (STRING, token, spos, epos, line) 516 elif initial in namechars: # ordinary name 517 if token in ('async', 'await'): 518 if async_def: 519 yield (ASYNC if token == 'async' else AWAIT, 520 token, spos, epos, line) 521 continue 522 523 tok = (NAME, token, spos, epos, line) 524 if token == 'async' and not stashed: 525 stashed = tok 526 continue 527 528 if token == 'def': 529 if (stashed 530 and stashed[0] == NAME 531 and stashed[1] == 'async'): 532 533 async_def = True 534 async_def_indent = indents[-1] 535 536 yield (ASYNC, stashed[1], 537 stashed[2], stashed[3], 538 stashed[4]) 539 stashed = None 540 541 if stashed: 542 yield stashed 543 stashed = None 544 545 yield tok 546 elif initial == '\\': # continued stmt 547 # This yield is new; needed for better idempotency: 548 if stashed: 549 yield stashed 550 stashed = None 551 yield (NL, token, spos, (lnum, pos), line) 552 continued = 1 553 else: 554 if initial in '([{': parenlev = parenlev + 1 555 elif initial in ')]}': parenlev = parenlev - 1 556 if stashed: 557 yield stashed 558 stashed = None 559 yield (OP, token, spos, epos, line) 560 else: 561 yield (ERRORTOKEN, line[pos], 562 (lnum, pos), (lnum, pos+1), line) 563 pos = pos + 1 564 565 if stashed: 566 yield stashed 567 stashed = None 568 569 for indent in indents[1:]: # pop remaining indent levels 570 yield (DEDENT, '', (lnum, 0), (lnum, 0), '') 571 yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '') 572 573if __name__ == '__main__': # testing 574 import sys 575 if len(sys.argv) > 1: tokenize(open(sys.argv[1]).readline) 576 else: tokenize(sys.stdin.readline) 577