# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens. It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF). It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import re
import string
from codecs import BOM_UTF8
from codecs import lookup

from . import token
from .token import ASYNC
from .token import AWAIT
from .token import COMMENT
from .token import DEDENT
from .token import ENDMARKER
from .token import ERRORTOKEN
from .token import INDENT
from .token import NAME
from .token import NEWLINE
from .token import NL
from .token import NUMBER
from .token import OP
from .token import STRING
from .token import tok_name

__all__ = [x for x in dir(token) if x[0] != '_'
           ] + ['tokenize', 'generate_tokens', 'untokenize']
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str


def group(*choices):
    return '(' + '|'.join(choices) + ')'


def any(*choices):
    return group(*choices) + '*'


def maybe(*choices):
    return group(*choices) + '?'


def _combinations(*l):  # noqa: E741
    return set(
        x + y for x in l for y in l + ('',) if x.casefold() != y.casefold())
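

# For illustration only (examples spelled out for the reader; they are not
# used by the code): the helpers above compose alternation patterns, e.g.
#   group('a', 'b') -> '(a|b)'
#   any('a', 'b')   -> '(a|b)*'
#   maybe('a', 'b') -> '(a|b)?'
# and _combinations('r', 'R', 'b', 'B') yields every one- or two-character
# mix of the given prefixes whose characters differ case-insensitively
# (e.g. 'r', 'Rb', 'bR', but not 'rR'). These helpers are used to build the
# token regular expressions that follow.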
Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?',
                   r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r'(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?'
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r'\*\*=?', r'>>=?', r'<<=?', r'<>', r'!=', r'//=?', r'->',
                 r'[+\-*/%&@|^=<>]=?', r'~')

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r'\\\r?\n'),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') | _combinations('r', 'R', 'b', 'B')
    | {'u', 'U', 'ur', 'uR', 'Ur', 'UR'})

endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{
        f"{prefix}'''": single3prog for prefix in _strprefixes
    },
    **{
        f'{prefix}"""': double3prog for prefix in _strprefixes
    },
    **{
        prefix: None for prefix in _strprefixes
    }
}

triple_quoted = ({"'''", '"""'} | {f"{prefix}'''" for prefix in _strprefixes}
                 | {f'{prefix}"""' for prefix in _strprefixes})
single_quoted = ({"'", '"'} | {f"{prefix}'" for prefix in _strprefixes}
                | {f'{prefix}"' for prefix in _strprefixes})

tabsize = 8


class TokenError(Exception):
    pass


class StopTokenizing(Exception):
    pass


def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1,
               line):  # for testing
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print('%d,%d-%d,%d:\t%s\t%s' %
          (srow, scol, erow, ecol, tok_name[type], repr(token)))


def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass
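

# Example (not executed; a minimal sketch of the classic interface): calling
# tokenize() with a readline over the source "x = 1\n" reports each token
# through the default tokeneater, printtoken(), printing something like:
#
#   import io
#   tokenize(io.StringIO('x = 1\n').readline)
#   # 1,0-1,1:  NAME      'x'
#   # 1,2-1,3:  OP        '='
#   # 1,4-1,5:  NUMBER    '1'
#   # 1,5-1,6:  NEWLINE   '\n'
#   # 2,0-2,0:  ENDMARKER ''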


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


class Untokenizer:

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(' ' * col_offset)

    def untokenize(self, iterable):
        for t in iterable:
            if len(t) == 2:
                self.compat(t, iterable)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return ''.join(self.tokens)

    def compat(self, token, iterable):
        startline = False
        indents = []
        toks_append = self.tokens.append
        toknum, tokval = token
        if toknum in (NAME, NUMBER):
            tokval += ' '
        if toknum in (NEWLINE, NL):
            startline = True
        for tok in iterable:
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace('_', '-')
    if enc == 'utf-8' or enc.startswith('utf-8-'):
        return 'utf-8'
    if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
       enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
        return 'iso-8859-1'
    return orig_enc


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError. Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError('unknown encoding: ' + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]
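

# Example (not executed; a minimal sketch): detecting an encoding cookie from
# a byte stream, e.g.
#
#   import io
#   buf = io.BytesIO(b'# -*- coding: latin-1 -*-\nx = 1\n')
#   encoding, lines = detect_encoding(buf.readline)
#
# would return encoding == 'iso-8859-1' (the normalized name for 'latin-1')
# and lines == [b'# -*- coding: latin-1 -*-\n'], the single line read while
# searching for the cookie.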


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value. If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)
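

# Example (not executed; a minimal sketch of driving the generator directly):
#
#   import io
#   for tok_type, tok_str, start, end, line in generate_tokens(
#           io.StringIO('if x:\n    y = 1\n').readline):
#       print(tok_name[tok_type], repr(tok_str), start, end)
#
# This walks the 5-tuples described in the docstring below, ending with
# DEDENT and ENDMARKER tokens.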


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string. Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.
    """
    strstart = ''
    endprog = ''
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:  # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, max = 0, len(line)

        if contstr:  # continued string
            if not line:
                raise TokenError('EOF in multi-line string', strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end], strstart, (lnum, end),
                       contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                yield (ERRORTOKEN, contstr + line, strstart, (lnum, len(line)),
                       contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line:
                break
            column = 0
            while pos < max:  # measure leading whitespace
                if line[pos] == ' ':
                    column = column + 1
                elif line[pos] == '\t':
                    column = (column // tabsize + 1) * tabsize
                elif line[pos] == '\f':
                    column = 0
                else:
                    break
                pos = pos + 1
            if pos == max:
                break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':  # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token, (lnum, pos),
                           (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], (lnum, pos),
                           (lnum, len(line)), line)
                continue

            if column > indents[-1]:  # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        'unindent does not match any outer indentation level',
                        ('<tokenize>', lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:  # continued statement
            if not line:
                raise TokenError('EOF in multi-line statement', (lnum, 0))
            continued = 0

        while pos < max:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:  # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]
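
                # The branches below classify the matched token: numbers,
                # line endings (NEWLINE, or NL inside parentheses), comments,
                # triple-quoted and single-quoted strings (which may continue
                # onto later lines), names (including async/await handling via
                # 'stashed'), backslash continuations, and operators/brackets.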
                if initial in string.digits or \
                        (initial == '.' and token != '.'):  # ordinary number
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith('\n')
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if endmatch:  # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)  # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                        token[:2] in single_quoted or \
                        token[:3] in single_quoted:
                    if token[-1] == '\n':  # continued string
                        strstart = (lnum, start)  # noqa: F841
                        endprog = (
                            endprogs[initial] or endprogs[token[1]] or endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:  # ordinary string
                        if stashed:
                            yield stashed
                            stashed = None
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():  # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT, token, spos, epos,
                                   line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed and stashed[0] == NAME and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1], stashed[2], stashed[3], stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':  # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{':
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for indent in indents[1:]:  # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':  # testing
    import sys
    if len(sys.argv) > 1:
        tokenize(open(sys.argv[1]).readline)
    else:
        tokenize(sys.stdin.readline)