1"""Tokenization help for Python programs. 2 3tokenize(readline) is a generator that breaks a stream of bytes into 4Python tokens. It decodes the bytes according to PEP-0263 for 5determining source file encoding. 6 7It accepts a readline-like method which is called repeatedly to get the 8next line of input (or b"" for EOF). It generates 5-tuples with these 9members: 10 11 the token type (see token.py) 12 the token (a string) 13 the starting (row, column) indices of the token (a 2-tuple of ints) 14 the ending (row, column) indices of the token (a 2-tuple of ints) 15 the original line (string) 16 17It is designed to match the working of the Python tokenizer exactly, except 18that it produces COMMENT tokens for comments and gives type OP for all 19operators. Additionally, all token lists start with an ENCODING token 20which tells you which encoding was used to decode the bytes stream. 21""" 22 23__author__ = 'Ka-Ping Yee <ping@lfw.org>' 24__credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, ' 25 'Skip Montanaro, Raymond Hettinger, Trent Nelson, ' 26 'Michael Foord') 27from builtins import open as _builtin_open 28from codecs import lookup, BOM_UTF8 29import collections 30import functools 31from io import TextIOWrapper 32import itertools as _itertools 33import re 34import sys 35from token import * 36from token import EXACT_TOKEN_TYPES 37import _tokenize 38 39cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII) 40blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII) 41 42import token 43__all__ = token.__all__ + ["tokenize", "generate_tokens", "detect_encoding", 44 "untokenize", "TokenInfo", "open", "TokenError"] 45del token 46 47class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')): 48 def __repr__(self): 49 annotated_type = '%d (%s)' % (self.type, tok_name[self.type]) 50 return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' % 51 self._replace(type=annotated_type)) 52 53 @property 54 def exact_type(self): 55 if self.type == OP and self.string in EXACT_TOKEN_TYPES: 56 return EXACT_TOKEN_TYPES[self.string] 57 else: 58 return self.type 59 60def group(*choices): return '(' + '|'.join(choices) + ')' 61def any(*choices): return group(*choices) + '*' 62def maybe(*choices): return group(*choices) + '?' 63 64# Note: we use unicode matching for names ("\w") but ascii matching for 65# number literals. 66Whitespace = r'[ \f\t]*' 67Comment = r'#[^\r\n]*' 68Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment) 69Name = r'\w+' 70 71Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+' 72Binnumber = r'0[bB](?:_?[01])+' 73Octnumber = r'0[oO](?:_?[0-7])+' 74Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)' 75Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber) 76Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*' 77Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?', 78 r'\.[0-9](?:_?[0-9])*') + maybe(Exponent) 79Expfloat = r'[0-9](?:_?[0-9])*' + Exponent 80Floatnumber = group(Pointfloat, Expfloat) 81Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]') 82Number = group(Imagnumber, Floatnumber, Intnumber) 83 84# Return the empty string, plus all of the valid string prefixes. 85def _all_string_prefixes(): 86 # The valid string prefixes. Only contain the lower case versions, 87 # and don't contain any permutations (include 'fr', but not 88 # 'rf'). The various permutations will be generated. 89 _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr'] 90 # if we add binary f-strings, add: ['fb', 'fbr'] 91 result = {''} 92 for prefix in _valid_string_prefixes: 93 for t in _itertools.permutations(prefix): 94 # create a list with upper and lower versions of each 95 # character 96 for u in _itertools.product(*[(c, c.upper()) for c in t]): 97 result.add(''.join(u)) 98 return result 99 100@functools.lru_cache 101def _compile(expr): 102 return re.compile(expr, re.UNICODE) 103 104# Note that since _all_string_prefixes includes the empty string, 105# StringPrefix can be the empty string (making it optional). 106StringPrefix = group(*_all_string_prefixes()) 107 108# Tail end of ' string. 109Single = r"[^'\\]*(?:\\.[^'\\]*)*'" 110# Tail end of " string. 111Double = r'[^"\\]*(?:\\.[^"\\]*)*"' 112# Tail end of ''' string. 113Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''" 114# Tail end of """ string. 115Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""' 116Triple = group(StringPrefix + "'''", StringPrefix + '"""') 117# Single-line ' or " string. 118String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'", 119 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"') 120 121# Sorting in reverse order puts the long operators before their prefixes. 122# Otherwise if = came before ==, == would get recognized as two instances 123# of =. 124Special = group(*map(re.escape, sorted(EXACT_TOKEN_TYPES, reverse=True))) 125Funny = group(r'\r?\n', Special) 126 127PlainToken = group(Number, Funny, String, Name) 128Token = Ignore + PlainToken 129 130# First (or only) line of ' or " string. 131ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + 132 group("'", r'\\\r?\n'), 133 StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + 134 group('"', r'\\\r?\n')) 135PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple) 136PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name) 137 138# For a given string prefix plus quotes, endpats maps it to a regex 139# to match the remainder of that string. _prefix can be empty, for 140# a normal single or triple quoted string (with no prefix). 141endpats = {} 142for _prefix in _all_string_prefixes(): 143 endpats[_prefix + "'"] = Single 144 endpats[_prefix + '"'] = Double 145 endpats[_prefix + "'''"] = Single3 146 endpats[_prefix + '"""'] = Double3 147del _prefix 148 149# A set of all of the single and triple quoted string prefixes, 150# including the opening quotes. 151single_quoted = set() 152triple_quoted = set() 153for t in _all_string_prefixes(): 154 for u in (t + '"', t + "'"): 155 single_quoted.add(u) 156 for u in (t + '"""', t + "'''"): 157 triple_quoted.add(u) 158del t, u 159 160tabsize = 8 161 162class TokenError(Exception): pass 163 164 165class Untokenizer: 166 167 def __init__(self): 168 self.tokens = [] 169 self.prev_row = 1 170 self.prev_col = 0 171 self.prev_type = None 172 self.encoding = None 173 174 def add_whitespace(self, start): 175 row, col = start 176 if row < self.prev_row or row == self.prev_row and col < self.prev_col: 177 raise ValueError("start ({},{}) precedes previous end ({},{})" 178 .format(row, col, self.prev_row, self.prev_col)) 179 row_offset = row - self.prev_row 180 if row_offset: 181 self.tokens.append("\\\n" * row_offset) 182 self.prev_col = 0 183 col_offset = col - self.prev_col 184 if col_offset: 185 self.tokens.append(" " * col_offset) 186 187 def escape_brackets(self, token): 188 characters = [] 189 consume_until_next_bracket = False 190 for character in token: 191 if character == "}": 192 if consume_until_next_bracket: 193 consume_until_next_bracket = False 194 else: 195 characters.append(character) 196 if character == "{": 197 n_backslashes = sum( 198 1 for char in _itertools.takewhile( 199 "\\".__eq__, 200 characters[-2::-1] 201 ) 202 ) 203 if n_backslashes % 2 == 0 or characters[-1] != "N": 204 characters.append(character) 205 else: 206 consume_until_next_bracket = True 207 characters.append(character) 208 return "".join(characters) 209 210 def untokenize(self, iterable): 211 it = iter(iterable) 212 indents = [] 213 startline = False 214 for t in it: 215 if len(t) == 2: 216 self.compat(t, it) 217 break 218 tok_type, token, start, end, line = t 219 if tok_type == ENCODING: 220 self.encoding = token 221 continue 222 if tok_type == ENDMARKER: 223 break 224 if tok_type == INDENT: 225 indents.append(token) 226 continue 227 elif tok_type == DEDENT: 228 indents.pop() 229 self.prev_row, self.prev_col = end 230 continue 231 elif tok_type in (NEWLINE, NL): 232 startline = True 233 elif startline and indents: 234 indent = indents[-1] 235 if start[1] >= len(indent): 236 self.tokens.append(indent) 237 self.prev_col = len(indent) 238 startline = False 239 elif tok_type == FSTRING_MIDDLE: 240 if '{' in token or '}' in token: 241 token = self.escape_brackets(token) 242 last_line = token.splitlines()[-1] 243 end_line, end_col = end 244 extra_chars = last_line.count("{{") + last_line.count("}}") 245 end = (end_line, end_col + extra_chars) 246 elif tok_type in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): 247 self.tokens.append(" ") 248 249 self.add_whitespace(start) 250 self.tokens.append(token) 251 self.prev_row, self.prev_col = end 252 if tok_type in (NEWLINE, NL): 253 self.prev_row += 1 254 self.prev_col = 0 255 self.prev_type = tok_type 256 return "".join(self.tokens) 257 258 def compat(self, token, iterable): 259 indents = [] 260 toks_append = self.tokens.append 261 startline = token[0] in (NEWLINE, NL) 262 prevstring = False 263 in_fstring = 0 264 265 for tok in _itertools.chain([token], iterable): 266 toknum, tokval = tok[:2] 267 if toknum == ENCODING: 268 self.encoding = tokval 269 continue 270 271 if toknum in (NAME, NUMBER): 272 tokval += ' ' 273 274 # Insert a space between two consecutive strings 275 if toknum == STRING: 276 if prevstring: 277 tokval = ' ' + tokval 278 prevstring = True 279 else: 280 prevstring = False 281 282 if toknum == FSTRING_START: 283 in_fstring += 1 284 elif toknum == FSTRING_END: 285 in_fstring -= 1 286 if toknum == INDENT: 287 indents.append(tokval) 288 continue 289 elif toknum == DEDENT: 290 indents.pop() 291 continue 292 elif toknum in (NEWLINE, NL): 293 startline = True 294 elif startline and indents: 295 toks_append(indents[-1]) 296 startline = False 297 elif toknum == FSTRING_MIDDLE: 298 tokval = self.escape_brackets(tokval) 299 300 # Insert a space between two consecutive brackets if we are in an f-string 301 if tokval in {"{", "}"} and self.tokens and self.tokens[-1] == tokval and in_fstring: 302 tokval = ' ' + tokval 303 304 # Insert a space between two consecutive f-strings 305 if toknum in (STRING, FSTRING_START) and self.prev_type in (STRING, FSTRING_END): 306 self.tokens.append(" ") 307 308 toks_append(tokval) 309 self.prev_type = toknum 310 311 312def untokenize(iterable): 313 """Transform tokens back into Python source code. 314 It returns a bytes object, encoded using the ENCODING 315 token, which is the first token sequence output by tokenize. 316 317 Each element returned by the iterable must be a token sequence 318 with at least two elements, a token number and token value. If 319 only two tokens are passed, the resulting output is poor. 320 321 Round-trip invariant for full input: 322 Untokenized source will match input source exactly 323 324 Round-trip invariant for limited input: 325 # Output bytes will tokenize back to the input 326 t1 = [tok[:2] for tok in tokenize(f.readline)] 327 newcode = untokenize(t1) 328 readline = BytesIO(newcode).readline 329 t2 = [tok[:2] for tok in tokenize(readline)] 330 assert t1 == t2 331 """ 332 ut = Untokenizer() 333 out = ut.untokenize(iterable) 334 if ut.encoding is not None: 335 out = out.encode(ut.encoding) 336 return out 337 338 339def _get_normal_name(orig_enc): 340 """Imitates get_normal_name in Parser/tokenizer/helpers.c.""" 341 # Only care about the first 12 characters. 342 enc = orig_enc[:12].lower().replace("_", "-") 343 if enc == "utf-8" or enc.startswith("utf-8-"): 344 return "utf-8" 345 if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \ 346 enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")): 347 return "iso-8859-1" 348 return orig_enc 349 350def detect_encoding(readline): 351 """ 352 The detect_encoding() function is used to detect the encoding that should 353 be used to decode a Python source file. It requires one argument, readline, 354 in the same way as the tokenize() generator. 355 356 It will call readline a maximum of twice, and return the encoding used 357 (as a string) and a list of any lines (left as bytes) it has read in. 358 359 It detects the encoding from the presence of a utf-8 bom or an encoding 360 cookie as specified in pep-0263. If both a bom and a cookie are present, 361 but disagree, a SyntaxError will be raised. If the encoding cookie is an 362 invalid charset, raise a SyntaxError. Note that if a utf-8 bom is found, 363 'utf-8-sig' is returned. 364 365 If no encoding is specified, then the default of 'utf-8' will be returned. 366 """ 367 try: 368 filename = readline.__self__.name 369 except AttributeError: 370 filename = None 371 bom_found = False 372 encoding = None 373 default = 'utf-8' 374 def read_or_stop(): 375 try: 376 return readline() 377 except StopIteration: 378 return b'' 379 380 def find_cookie(line): 381 try: 382 # Decode as UTF-8. Either the line is an encoding declaration, 383 # in which case it should be pure ASCII, or it must be UTF-8 384 # per default encoding. 385 line_string = line.decode('utf-8') 386 except UnicodeDecodeError: 387 msg = "invalid or missing encoding declaration" 388 if filename is not None: 389 msg = '{} for {!r}'.format(msg, filename) 390 raise SyntaxError(msg) 391 392 match = cookie_re.match(line_string) 393 if not match: 394 return None 395 encoding = _get_normal_name(match.group(1)) 396 try: 397 codec = lookup(encoding) 398 except LookupError: 399 # This behaviour mimics the Python interpreter 400 if filename is None: 401 msg = "unknown encoding: " + encoding 402 else: 403 msg = "unknown encoding for {!r}: {}".format(filename, 404 encoding) 405 raise SyntaxError(msg) 406 407 if bom_found: 408 if encoding != 'utf-8': 409 # This behaviour mimics the Python interpreter 410 if filename is None: 411 msg = 'encoding problem: utf-8' 412 else: 413 msg = 'encoding problem for {!r}: utf-8'.format(filename) 414 raise SyntaxError(msg) 415 encoding += '-sig' 416 return encoding 417 418 first = read_or_stop() 419 if first.startswith(BOM_UTF8): 420 bom_found = True 421 first = first[3:] 422 default = 'utf-8-sig' 423 if not first: 424 return default, [] 425 426 encoding = find_cookie(first) 427 if encoding: 428 return encoding, [first] 429 if not blank_re.match(first): 430 return default, [first] 431 432 second = read_or_stop() 433 if not second: 434 return default, [first] 435 436 encoding = find_cookie(second) 437 if encoding: 438 return encoding, [first, second] 439 440 return default, [first, second] 441 442 443def open(filename): 444 """Open a file in read only mode using the encoding detected by 445 detect_encoding(). 446 """ 447 buffer = _builtin_open(filename, 'rb') 448 try: 449 encoding, lines = detect_encoding(buffer.readline) 450 buffer.seek(0) 451 text = TextIOWrapper(buffer, encoding, line_buffering=True) 452 text.mode = 'r' 453 return text 454 except: 455 buffer.close() 456 raise 457 458def tokenize(readline): 459 """ 460 The tokenize() generator requires one argument, readline, which 461 must be a callable object which provides the same interface as the 462 readline() method of built-in file objects. Each call to the function 463 should return one line of input as bytes. Alternatively, readline 464 can be a callable function terminating with StopIteration: 465 readline = open(myfile, 'rb').__next__ # Example of alternate readline 466 467 The generator produces 5-tuples with these members: the token type; the 468 token string; a 2-tuple (srow, scol) of ints specifying the row and 469 column where the token begins in the source; a 2-tuple (erow, ecol) of 470 ints specifying the row and column where the token ends in the source; 471 and the line on which the token was found. The line passed is the 472 physical line. 473 474 The first token sequence will always be an ENCODING token 475 which tells you which encoding was used to decode the bytes stream. 476 """ 477 encoding, consumed = detect_encoding(readline) 478 rl_gen = _itertools.chain(consumed, iter(readline, b"")) 479 if encoding is not None: 480 if encoding == "utf-8-sig": 481 # BOM will already have been stripped. 482 encoding = "utf-8" 483 yield TokenInfo(ENCODING, encoding, (0, 0), (0, 0), '') 484 yield from _generate_tokens_from_c_tokenizer(rl_gen.__next__, encoding, extra_tokens=True) 485 486def generate_tokens(readline): 487 """Tokenize a source reading Python code as unicode strings. 488 489 This has the same API as tokenize(), except that it expects the *readline* 490 callable to return str objects instead of bytes. 491 """ 492 return _generate_tokens_from_c_tokenizer(readline, extra_tokens=True) 493 494def main(): 495 import argparse 496 497 # Helper error handling routines 498 def perror(message): 499 sys.stderr.write(message) 500 sys.stderr.write('\n') 501 502 def error(message, filename=None, location=None): 503 if location: 504 args = (filename,) + location + (message,) 505 perror("%s:%d:%d: error: %s" % args) 506 elif filename: 507 perror("%s: error: %s" % (filename, message)) 508 else: 509 perror("error: %s" % message) 510 sys.exit(1) 511 512 # Parse the arguments and options 513 parser = argparse.ArgumentParser(prog='python -m tokenize') 514 parser.add_argument(dest='filename', nargs='?', 515 metavar='filename.py', 516 help='the file to tokenize; defaults to stdin') 517 parser.add_argument('-e', '--exact', dest='exact', action='store_true', 518 help='display token names using the exact type') 519 args = parser.parse_args() 520 521 try: 522 # Tokenize the input 523 if args.filename: 524 filename = args.filename 525 with _builtin_open(filename, 'rb') as f: 526 tokens = list(tokenize(f.readline)) 527 else: 528 filename = "<stdin>" 529 tokens = _generate_tokens_from_c_tokenizer( 530 sys.stdin.readline, extra_tokens=True) 531 532 533 # Output the tokenization 534 for token in tokens: 535 token_type = token.type 536 if args.exact: 537 token_type = token.exact_type 538 token_range = "%d,%d-%d,%d:" % (token.start + token.end) 539 print("%-20s%-15s%-15r" % 540 (token_range, tok_name[token_type], token.string)) 541 except IndentationError as err: 542 line, column = err.args[1][1:3] 543 error(err.args[0], filename, (line, column)) 544 except TokenError as err: 545 line, column = err.args[1] 546 error(err.args[0], filename, (line, column)) 547 except SyntaxError as err: 548 error(err, filename) 549 except OSError as err: 550 error(err) 551 except KeyboardInterrupt: 552 print("interrupted\n") 553 except Exception as err: 554 perror("unexpected error: %s" % err) 555 raise 556 557def _transform_msg(msg): 558 """Transform error messages from the C tokenizer into the Python tokenize 559 560 The C tokenizer is more picky than the Python one, so we need to massage 561 the error messages a bit for backwards compatibility. 562 """ 563 if "unterminated triple-quoted string literal" in msg: 564 return "EOF in multi-line string" 565 return msg 566 567def _generate_tokens_from_c_tokenizer(source, encoding=None, extra_tokens=False): 568 """Tokenize a source reading Python code as unicode strings using the internal C tokenizer""" 569 if encoding is None: 570 it = _tokenize.TokenizerIter(source, extra_tokens=extra_tokens) 571 else: 572 it = _tokenize.TokenizerIter(source, encoding=encoding, extra_tokens=extra_tokens) 573 try: 574 for info in it: 575 yield TokenInfo._make(info) 576 except SyntaxError as e: 577 if type(e) != SyntaxError: 578 raise e from None 579 msg = _transform_msg(e.msg) 580 raise TokenError(msg, (e.lineno, e.offset)) from None 581 582 583if __name__ == "__main__": 584 main() 585