1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27from builtins import open as _builtin_open 28from codecs import lookup, BOM_UTF8 29import collections 30import functools 31from io import TextIOWrapper 32import itertools as _itertools 33import re 34import sys 35from token import * 36from token import EXACT_TOKEN_TYPES 37 38cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 39blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 40 41import token 42__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", 43 "untokenize", "TokenInfo"] 44del token 45 46class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): 47 def __repr__(self): 48 annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) 49 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % 50 self._replace(type=annotated_type)) 51 52 @property 53 def exact_type(self): 54 if self.type == OP and self.string in EXACT_TOKEN_TYPES: 55 return EXACT_TOKEN_TYPES[self.string] 56 else: 57 return self.type 58 59def group(*choices): return '(' + '|'.join(choices) + ')' 60def any(*choices): return group(*choices) + '*' 61def maybe(*choices): return group(*choices) + '?' 62 63# Note: we use unicode matching for names ("\w") but ascii matching for 64# number literals. 65Whitespace = r'[ \f\t]*' 66Comment = r'#[^\r\n]*' 67Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 68Name = r'\w+' 69 70Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' 71Binnumber = r'0[bB](?:_?[01])+' 72Octnumber = r'0[oO](?:_?[0-7])+' 73Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' 74Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 75Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' 76Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', 77 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) 78Expfloat = r'[0-9](?:_?[0-9])*' + Exponent 79Floatnumber = group(Pointfloat, Expfloat) 80Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') 81Number = group(Imagnumber, Floatnumber, Intnumber) 82 83# Return the empty string, plus all of the valid string prefixes. 84def _all_string_prefixes(): 85 # The valid string prefixes. Only contain the lower case versions, 86 # and don't contain any permutations (include 'fr', but not 87 # 'rf'). The various permutations will be generated. 88 _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] 89 # if we add binary f-strings, add: ['fb', 'fbr'] 90 result = {''} 91 for prefix in _valid_string_prefixes: 92 for t in _itertools.permutations(prefix): 93 # create a list with upper and lower versions of each 94 # character 95 for u in _itertools.product(*[(c, c.upper()) for c in t]): 96 result.add(''.join(u)) 97 return result 98 99@functools.lru_cache 100def _compile(expr): 101 return re.compile(expr, re.UNICODE) 102 103# Note that since _all_string_prefixes includes the empty string, 104# StringPrefix can be the empty string (making it optional). 105StringPrefix = group(*_all_string_prefixes()) 106 107# Tail end of ' string. 108Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 109# Tail end of " string. 110Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 111# Tail end of ''' string. 112Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 113# Tail end of """ string. 114Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 115Triple = group(StringPrefix + "'''", StringPrefix + '"""') 116# Single-line ' or " string. 117String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 118 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 119 120# Sorting in reverse order puts the long operators before their prefixes. 121# Otherwise if = came before ==, == would get recognized as two instances 122# of =. 123Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) 124Funny = group(r'\r?\n', Special) 125 126PlainToken = group(Number, Funny, String, Name) 127Token = Ignore + PlainToken 128 129# First (or only) line of ' or " string. 130ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 131 group("'", r'\\\r?\n'), 132 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 133 group('"', r'\\\r?\n')) 134PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) 135PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 136 137# For a given string prefix plus quotes, endpats maps it to a regex 138# to match the remainder of that string. _prefix can be empty, for 139# a normal single or triple quoted string (with no prefix). 140endpats = {} 141for _prefix in _all_string_prefixes(): 142 endpats[_prefix + "'"] = Single 143 endpats[_prefix + '"'] = Double 144 endpats[_prefix + "'''"] = Single3 145 endpats[_prefix + '"""'] = Double3 146 147# A set of all of the single and triple quoted string prefixes, 148# including the opening quotes. 149single_quoted = set() 150triple_quoted = set() 151for t in _all_string_prefixes(): 152 for u in (t + '"', t + "'"): 153 single_quoted.add(u) 154 for u in (t + '"""', t + "'''"): 155 triple_quoted.add(u) 156 157tabsize = 8 158 159class TokenError(Exception): pass 160 161class StopTokenizing(Exception): pass 162 163 164class Untokenizer: 165 166 def __init__(self): 167 self.tokens = [] 168 self.prev_row = 1 169 self.prev_col = 0 170 self.encoding = None 171 172 def add_whitespace(self, start): 173 row, col = start 174 if row < self.prev_row or row == self.prev_row and col < self.prev_col: 175 raise ValueError("start ({},{}) precedes previous end ({},{})" 176 .format(row, col, self.prev_row, self.prev_col)) 177 row_offset = row - self.prev_row 178 if row_offset: 179 self.tokens.append("\\\n" * row_offset) 180 self.prev_col = 0 181 col_offset = col - self.prev_col 182 if col_offset: 183 self.tokens.append(" " * col_offset) 184 185 def untokenize(self, iterable): 186 it = iter(iterable) 187 indents = [] 188 startline = False 189 for t in it: 190 if len(t) == 2: 191 self.compat(t, it) 192 break 193 tok_type, token, start, end, line = t 194 if tok_type == ENCODING: 195 self.encoding = token 196 continue 197 if tok_type == ENDMARKER: 198 break 199 if tok_type == INDENT: 200 indents.append(token) 201 continue 202 elif tok_type == DEDENT: 203 indents.pop() 204 self.prev_row, self.prev_col = end 205 continue 206 elif tok_type in (NEWLINE, NL): 207 startline = True 208 elif startline and indents: 209 indent = indents[-1] 210 if start[1] >= len(indent): 211 self.tokens.append(indent) 212 self.prev_col = len(indent) 213 startline = False 214 self.add_whitespace(start) 215 self.tokens.append(token) 216 self.prev_row, self.prev_col = end 217 if tok_type in (NEWLINE, NL): 218 self.prev_row += 1 219 self.prev_col = 0 220 return "".join(self.tokens) 221 222 def compat(self, token, iterable): 223 indents = [] 224 toks_append = self.tokens.append 225 startline = token[0] in (NEWLINE, NL) 226 prevstring = False 227 228 for tok in _itertools.chain([token], iterable): 229 toknum, tokval = tok[:2] 230 if toknum == ENCODING: 231 self.encoding = tokval 232 continue 233 234 if toknum in (NAME, NUMBER): 235 tokval += ' ' 236 237 # Insert a space between two consecutive strings 238 if toknum == STRING: 239 if prevstring: 240 tokval = ' ' + tokval 241 prevstring = True 242 else: 243 prevstring = False 244 245 if toknum == INDENT: 246 indents.append(tokval) 247 continue 248 elif toknum == DEDENT: 249 indents.pop() 250 continue 251 elif toknum in (NEWLINE, NL): 252 startline = True 253 elif startline and indents: 254 toks_append(indents[-1]) 255 startline = False 256 toks_append(tokval) 257 258 259def untokenize(iterable): 260 """Transform tokens back into Python source code. 261 It returns a bytes object, encoded using the ENCODING 262 token, which is the first token sequence output by tokenize. 263 264 Each element returned by the iterable must be a token sequence 265 with at least two elements, a token number and token value. If 266 only two tokens are passed, the resulting output is poor. 267 268 Round-trip invariant for full input: 269 Untokenized source will match input source exactly 270 271 Round-trip invariant for limited input: 272 # Output bytes will tokenize back to the input 273 t1 = [tok[:2] for tok in tokenize(f.readline)] 274 newcode = untokenize(t1) 275 readline = BytesIO(newcode).readline 276 t2 = [tok[:2] for tok in tokenize(readline)] 277 assert t1 == t2 278 """ 279 ut = Untokenizer() 280 out = ut.untokenize(iterable) 281 if ut.encoding is not None: 282 out = out.encode(ut.encoding) 283 return out 284 285 286def _get_normal_name(orig_enc): 287 """Imitates get_normal_name in tokenizer.c.""" 288 # Only care about the first 12 characters. 289 enc = orig_enc[:12].lower().replace("_", "-") 290 if enc == "utf-8" or enc.startswith("utf-8-"): 291 return "utf-8" 292 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 293 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 294 return "iso-8859-1" 295 return orig_enc 296 297def detect_encoding(readline): 298 """ 299 The detect_encoding() function is used to detect the encoding that should 300 be used to decode a Python source file. It requires one argument, readline, 301 in the same way as the tokenize() generator. 302 303 It will call readline a maximum of twice, and return the encoding used 304 (as a string) and a list of any lines (left as bytes) it has read in. 305 306 It detects the encoding from the presence of a utf-8 bom or an encoding 307 cookie as specified in pep-0263. If both a bom and a cookie are present, 308 but disagree, a SyntaxError will be raised. If the encoding cookie is an 309 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 310 'utf-8-sig' is returned. 311 312 If no encoding is specified, then the default of 'utf-8' will be returned. 313 """ 314 try: 315 filename = readline.__self__.name 316 except AttributeError: 317 filename = None 318 bom_found = False 319 encoding = None 320 default = 'utf-8' 321 def read_or_stop(): 322 try: 323 return readline() 324 except StopIteration: 325 return b'' 326 327 def find_cookie(line): 328 try: 329 # Decode as UTF-8. Either the line is an encoding declaration, 330 # in which case it should be pure ASCII, or it must be UTF-8 331 # per default encoding. 332 line_string = line.decode('utf-8') 333 except UnicodeDecodeError: 334 msg = "invalid or missing encoding declaration" 335 if filename is not None: 336 msg = '{} for {!r}'.format(msg, filename) 337 raise SyntaxError(msg) 338 339 match = cookie_re.match(line_string) 340 if not match: 341 return None 342 encoding = _get_normal_name(match.group(1)) 343 try: 344 codec = lookup(encoding) 345 except LookupError: 346 # This behaviour mimics the Python interpreter 347 if filename is None: 348 msg = "unknown encoding: " + encoding 349 else: 350 msg = "unknown encoding for {!r}: {}".format(filename, 351 encoding) 352 raise SyntaxError(msg) 353 354 if bom_found: 355 if encoding != 'utf-8': 356 # This behaviour mimics the Python interpreter 357 if filename is None: 358 msg = 'encoding problem: utf-8' 359 else: 360 msg = 'encoding problem for {!r}: utf-8'.format(filename) 361 raise SyntaxError(msg) 362 encoding += '-sig' 363 return encoding 364 365 first = read_or_stop() 366 if first.startswith(BOM_UTF8): 367 bom_found = True 368 first = first[3:] 369 default = 'utf-8-sig' 370 if not first: 371 return default, [] 372 373 encoding = find_cookie(first) 374 if encoding: 375 return encoding, [first] 376 if not blank_re.match(first): 377 return default, [first] 378 379 second = read_or_stop() 380 if not second: 381 return default, [first] 382 383 encoding = find_cookie(second) 384 if encoding: 385 return encoding, [first, second] 386 387 return default, [first, second] 388 389 390def open(filename): 391 """Open a file in read only mode using the encoding detected by 392 detect_encoding(). 393 """ 394 buffer = _builtin_open(filename, 'rb') 395 try: 396 encoding, lines = detect_encoding(buffer.readline) 397 buffer.seek(0) 398 text = TextIOWrapper(buffer, encoding, line_buffering=True) 399 text.mode = 'r' 400 return text 401 except: 402 buffer.close() 403 raise 404 405 406def tokenize(readline): 407 """ 408 The tokenize() generator requires one argument, readline, which 409 must be a callable object which provides the same interface as the 410 readline() method of built-in file objects. Each call to the function 411 should return one line of input as bytes. Alternatively, readline 412 can be a callable function terminating with StopIteration: 413 readline = open(myfile, 'rb').__next__ # Example of alternate readline 414 415 The generator produces 5-tuples with these members: the token type; the 416 token string; a 2-tuple (srow, scol) of ints specifying the row and 417 column where the token begins in the source; a 2-tuple (erow, ecol) of 418 ints specifying the row and column where the token ends in the source; 419 and the line on which the token was found. The line passed is the 420 physical line. 421 422 The first token sequence will always be an ENCODING token 423 which tells you which encoding was used to decode the bytes stream. 424 """ 425 encoding, consumed = detect_encoding(readline) 426 empty = _itertools.repeat(b"") 427 rl_gen = _itertools.chain(consumed, iter(readline, b""), empty) 428 return _tokenize(rl_gen.__next__, encoding) 429 430 431def _tokenize(readline, encoding): 432 lnum = parenlev = continued = 0 433 numchars = '0123456789' 434 contstr, needcont = '', 0 435 contline = None 436 indents = [0] 437 438 if encoding is not None: 439 if encoding == "utf-8-sig": 440 # BOM will already have been stripped. 441 encoding = "utf-8" 442 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 443 last_line = b'' 444 line = b'' 445 while True: # loop over lines in stream 446 try: 447 # We capture the value of the line variable here because 448 # readline uses the empty string '' to signal end of input, 449 # hence `line` itself will always be overwritten at the end 450 # of this loop. 451 last_line = line 452 line = readline() 453 except StopIteration: 454 line = b'' 455 456 if encoding is not None: 457 line = line.decode(encoding) 458 lnum += 1 459 pos, max = 0, len(line) 460 461 if contstr: # continued string 462 if not line: 463 raise TokenError("EOF in multi-line string", strstart) 464 endmatch = endprog.match(line) 465 if endmatch: 466 pos = end = endmatch.end(0) 467 yield TokenInfo(STRING, contstr + line[:end], 468 strstart, (lnum, end), contline + line) 469 contstr, needcont = '', 0 470 contline = None 471 elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n': 472 yield TokenInfo(ERRORTOKEN, contstr + line, 473 strstart, (lnum, len(line)), contline) 474 contstr = '' 475 contline = None 476 continue 477 else: 478 contstr = contstr + line 479 contline = contline + line 480 continue 481 482 elif parenlev == 0 and not continued: # new statement 483 if not line: break 484 column = 0 485 while pos < max: # measure leading whitespace 486 if line[pos] == ' ': 487 column += 1 488 elif line[pos] == '\t': 489 column = (column//tabsize + 1)*tabsize 490 elif line[pos] == '\f': 491 column = 0 492 else: 493 break 494 pos += 1 495 if pos == max: 496 break 497 498 if line[pos] in '#\r\n': # skip comments or blank lines 499 if line[pos] == '#': 500 comment_token = line[pos:].rstrip('\r\n') 501 yield TokenInfo(COMMENT, comment_token, 502 (lnum, pos), (lnum, pos + len(comment_token)), line) 503 pos += len(comment_token) 504 505 yield TokenInfo(NL, line[pos:], 506 (lnum, pos), (lnum, len(line)), line) 507 continue 508 509 if column > indents[-1]: # count indents or dedents 510 indents.append(column) 511 yield TokenInfo(INDENT, line[:pos], (lnum, 0), (lnum, pos), line) 512 while column < indents[-1]: 513 if column not in indents: 514 raise IndentationError( 515 "unindent does not match any outer indentation level", 516 ("<tokenize>", lnum, pos, line)) 517 indents = indents[:-1] 518 519 yield TokenInfo(DEDENT, '', (lnum, pos), (lnum, pos), line) 520 521 else: # continued statement 522 if not line: 523 raise TokenError("EOF in multi-line statement", (lnum, 0)) 524 continued = 0 525 526 while pos < max: 527 pseudomatch = _compile(PseudoToken).match(line, pos) 528 if pseudomatch: # scan for tokens 529 start, end = pseudomatch.span(1) 530 spos, epos, pos = (lnum, start), (lnum, end), end 531 if start == end: 532 continue 533 token, initial = line[start:end], line[start] 534 535 if (initial in numchars or # ordinary number 536 (initial == '.' and token != '.' and token != '...')): 537 yield TokenInfo(NUMBER, token, spos, epos, line) 538 elif initial in '\r\n': 539 if parenlev > 0: 540 yield TokenInfo(NL, token, spos, epos, line) 541 else: 542 yield TokenInfo(NEWLINE, token, spos, epos, line) 543 544 elif initial == '#': 545 assert not token.endswith("\n") 546 yield TokenInfo(COMMENT, token, spos, epos, line) 547 548 elif token in triple_quoted: 549 endprog = _compile(endpats[token]) 550 endmatch = endprog.match(line, pos) 551 if endmatch: # all on one line 552 pos = endmatch.end(0) 553 token = line[start:pos] 554 yield TokenInfo(STRING, token, spos, (lnum, pos), line) 555 else: 556 strstart = (lnum, start) # multiple lines 557 contstr = line[start:] 558 contline = line 559 break 560 561 # Check up to the first 3 chars of the token to see if 562 # they're in the single_quoted set. If so, they start 563 # a string. 564 # We're using the first 3, because we're looking for 565 # "rb'" (for example) at the start of the token. If 566 # we switch to longer prefixes, this needs to be 567 # adjusted. 568 # Note that initial == token[:1]. 569 # Also note that single quote checking must come after 570 # triple quote checking (above). 571 elif (initial in single_quoted or 572 token[:2] in single_quoted or 573 token[:3] in single_quoted): 574 if token[-1] == '\n': # continued string 575 strstart = (lnum, start) 576 # Again, using the first 3 chars of the 577 # token. This is looking for the matching end 578 # regex for the correct type of quote 579 # character. So it's really looking for 580 # endpats["'"] or endpats['"'], by trying to 581 # skip string prefix characters, if any. 582 endprog = _compile(endpats.get(initial) or 583 endpats.get(token[1]) or 584 endpats.get(token[2])) 585 contstr, needcont = line[start:], 1 586 contline = line 587 break 588 else: # ordinary string 589 yield TokenInfo(STRING, token, spos, epos, line) 590 591 elif initial.isidentifier(): # ordinary name 592 yield TokenInfo(NAME, token, spos, epos, line) 593 elif initial == '\\': # continued stmt 594 continued = 1 595 else: 596 if initial in '([{': 597 parenlev += 1 598 elif initial in ')]}': 599 parenlev -= 1 600 yield TokenInfo(OP, token, spos, epos, line) 601 else: 602 yield TokenInfo(ERRORTOKEN, line[pos], 603 (lnum, pos), (lnum, pos+1), line) 604 pos += 1 605 606 # Add an implicit NEWLINE if the input doesn't end in one 607 if last_line and last_line[-1] not in '\r\n' and not last_line.strip().startswith("#"): 608 yield TokenInfo(NEWLINE, '', (lnum - 1, len(last_line)), (lnum - 1, len(last_line) + 1), '') 609 for indent in indents[1:]: # pop remaining indent levels 610 yield TokenInfo(DEDENT, '', (lnum, 0), (lnum, 0), '') 611 yield TokenInfo(ENDMARKER, '', (lnum, 0), (lnum, 0), '') 612 613 614def generate_tokens(readline): 615 """Tokenize a source reading Python code as unicode strings. 616 617 This has the same API as tokenize(), except that it expects the *readline* 618 callable to return str objects instead of bytes. 619 """ 620 return _tokenize(readline, None) 621 622def main(): 623 import argparse 624 625 # Helper error handling routines 626 def perror(message): 627 sys.stderr.write(message) 628 sys.stderr.write('\n') 629 630 def error(message, filename=None, location=None): 631 if location: 632 args = (filename,) + location + (message,) 633 perror("%s:%d:%d: error: %s" % args) 634 elif filename: 635 perror("%s: error: %s" % (filename, message)) 636 else: 637 perror("error: %s" % message) 638 sys.exit(1) 639 640 # Parse the arguments and options 641 parser = argparse.ArgumentParser(prog='python -m tokenize') 642 parser.add_argument(dest='filename', nargs='?', 643 metavar='filename.py', 644 help='the file to tokenize; defaults to stdin') 645 parser.add_argument('-e', '--exact', dest='exact', action='store_true', 646 help='display token names using the exact type') 647 args = parser.parse_args() 648 649 try: 650 # Tokenize the input 651 if args.filename: 652 filename = args.filename 653 with _builtin_open(filename, 'rb') as f: 654 tokens = list(tokenize(f.readline)) 655 else: 656 filename = "<stdin>" 657 tokens = _tokenize(sys.stdin.readline, None) 658 659 # Output the tokenization 660 for token in tokens: 661 token_type = token.type 662 if args.exact: 663 token_type = token.exact_type 664 token_range = "%d,%d-%d,%d:" % (token.start + token.end) 665 print("%-20s%-15s%-15r" % 666 (token_range, tok_name[token_type], token.string)) 667 except IndentationError as err: 668 line, column = err.args[1][1:3] 669 error(err.args[0], filename, (line, column)) 670 except TokenError as err: 671 line, column = err.args[1] 672 error(err.args[0], filename, (line, column)) 673 except SyntaxError as err: 674 error(err, filename) 675 except OSError as err: 676 error(err) 677 except KeyboardInterrupt: 678 print("interrupted\n") 679 except Exception as err: 680 perror("unexpected error: %s" % err) 681 raise 682 683if __name__ == "__main__": 684 main() 685