# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.

"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = (
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'
)

import string
import re
from codecs import BOM_UTF8, lookup
from lib2to3.pgen2.token import *

from . import token
# Re-export every public name of the token module next to this module's own
# entry points; the module object itself is not part of the namespace.
__all__ = [name for name in dir(token) if name[0] != '_']
__all__ += ["tokenize", "generate_tokens", "untokenize"]
del token

try:
    bytes
except NameError:
    # Support bytes type in Python <= 2.5, so 2to3 turns itself into
    # valid Python 3 code.
    bytes = str


def group(*choices):
    """Return a regex source string: the alternation of *choices* in parens."""
    return '(%s)' % '|'.join(choices)


def any(*choices):
    """Return a regex source string matching zero or more of *choices*.

    NOTE: this intentionally shadows the builtin any() inside this module.
    """
    return '%s*' % group(*choices)


def maybe(*choices):
    """Return a regex source string matching at most one of *choices*."""
    return '%s?' % group(*choices)
def _combinations(*l):
    """Return the set of strings x + y built from the given strings.

    x is drawn from *l* and y from *l* plus the empty string; pairs whose
    two members are equal ignoring case are skipped.
    """
    second_choices = l + ("",)
    return {
        first + second
        for first in l
        for second in second_choices
        if first.casefold() != second.casefold()
    }


# --- Regular-expression source fragments -----------------------------------

Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
# Whitespace, any number of backslash-continued line ends, then an optional
# comment.
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

# Integer literals: underscores may separate digit groups and a trailing
# l/L suffix is accepted on hex, octal and decimal forms.
Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)

# Floating point and imaginary literals.
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = (
    group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?', r'\.\d+(?:_\d+)*')
    + maybe(Exponent)
)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
# Optional string-literal prefix (one or two letters).
_litprefix = r"(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?"
# Opening delimiter of a triple-quoted string.
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"',
)

# Alternatives are tried left to right and the first one that matches wins,
# so the longest operators must be listed first (e.g., if = came before ==,
# == would get recognized as two instances of =).
Operator = group(
    r"\*\*=?", r">>=?", r"<<=?", r"<>", r"!=",
    r"//=?", r"->",
    r"[+\-*/%&@|^=<>]=?",
    r"~",
)

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
                group("'", r'\\\r?\n'),
                _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
                group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

# Every accepted string prefix, in every accepted letter-case combination.
_strprefixes = (
    _combinations('r', 'R', 'f', 'F') |
    _combinations('r', 'R', 'b', 'B') |
    {'u', 'U', 'ur', 'uR', 'Ur', 'UR'}
)

# Maps an opening quote (optionally prefixed) to the compiled pattern that
# matches the rest of the string.  Bare prefixes map to None so that
# generate_tokens() can chain lookups with `or` until it reaches the quote.
endprogs = {"'": re.compile(Single), '"': re.compile(Double),
            "'''": single3prog, '"""': double3prog,
            **{f"{prefix}'''": single3prog for prefix in _strprefixes},
            **{f'{prefix}"""': double3prog for prefix in _strprefixes},
            **{prefix: None for prefix in _strprefixes}}

triple_quoted = (
    {"'''", '"""'} |
    {f"{prefix}'''" for prefix in _strprefixes} |
    {f'{prefix}"""' for prefix in _strprefixes}
)
single_quoted = (
    {"'", '"'} |
    {f"{prefix}'" for prefix in _strprefixes} |
    {f'{prefix}"' for prefix in _strprefixes}
)

# Width of a tab stop when measuring indentation.
tabsize = 8


class TokenError(Exception):
    """Raised by generate_tokens() when EOF is hit inside a multi-line
    string or a multi-line statement."""


class StopTokenizing(Exception):
    """Caught by tokenize(); raising it from a tokeneater ends tokenizing."""


def printtoken(type, token, xxx_todo_changeme, xxx_todo_changeme1, line): # for testing
    """Default tokeneater: print one token's position, type name and repr.

    The third and fourth arguments are the (row, col) start and end pairs.
    """
    (srow, scol) = xxx_todo_changeme
    (erow, ecol) = xxx_todo_changeme1
    print("%d,%d-%d,%d:\t%s\t%s" % \
        (srow, scol, erow, ecol, tok_name[type], repr(token)))


def tokenize(readline, tokeneater=printtoken):
    """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
    try:
        tokenize_loop(readline, tokeneater)
    except StopTokenizing:
        pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
    """Call tokeneater(*token) for each token from generate_tokens(readline)."""
    for token_info in generate_tokens(readline):
        tokeneater(*token_info)


class Untokenizer:
    """Rebuild source text from a token stream.

    Pieces of output accumulate in self.tokens; prev_row/prev_col record
    where the previously emitted token ended.
    """

    def __init__(self):
        self.tokens = []
        self.prev_row = 1
        self.prev_col = 0

    def add_whitespace(self, start):
        """Pad with spaces from the previous token's end column to *start*."""
        row, col = start
        assert row <= self.prev_row
        col_offset = col - self.prev_col
        if col_offset:
            self.tokens.append(" " * col_offset)

    def untokenize(self, iterable):
        """Return the source text for the tokens in *iterable*.

        5-tuples are placed using their recorded positions.  As soon as a
        2-tuple is seen, that token and every remaining one are handed to
        compat(), which spaces tokens heuristically.
        """
        # A single shared iterator guarantees compat() resumes right after
        # the current token, even when *iterable* is a re-iterable container
        # such as a list.
        it = iter(iterable)
        for t in it:
            if len(t) == 2:
                self.compat(t, it)
                break
            tok_type, token, start, end, line = t
            self.add_whitespace(start)
            self.tokens.append(token)
            self.prev_row, self.prev_col = end
            if tok_type in (NEWLINE, NL):
                self.prev_row += 1
                self.prev_col = 0
        return "".join(self.tokens)

    def compat(self, token, iterable):
        """Append text for *token* followed by the tokens left in *iterable*.

        Only the first two fields (type, string) of each token are used.
        """
        from itertools import chain

        startline = False
        indents = []
        toks_append = self.tokens.append
        # Run *token* through the same loop as the rest; handling it
        # separately used to lose it from the output.
        for tok in chain([token], iterable):
            toknum, tokval = tok[:2]

            if toknum in (NAME, NUMBER, ASYNC, AWAIT):
                tokval += ' '

            if toknum == INDENT:
                indents.append(tokval)
                continue
            elif toknum == DEDENT:
                indents.pop()
                continue
            elif toknum in (NEWLINE, NL):
                startline = True
            elif startline and indents:
                # First token on a new line: re-emit the current indentation.
                toks_append(indents[-1])
                startline = False
            toks_append(tokval)


cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)


def _get_normal_name(orig_enc):
    """Imitates get_normal_name in tokenizer.c."""
    # Only care about the first 12 characters.
    enc = orig_enc[:12].lower().replace("_", "-")
    if enc == "utf-8" or enc.startswith("utf-8-"):
        return "utf-8"
    if enc in ("latin-1", "iso-8859-1", "iso-latin-1") or \
       enc.startswith(("latin-1-", "iso-8859-1-", "iso-latin-1-")):
        return "iso-8859-1"
    return orig_enc


def detect_encoding(readline):
    """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
    bom_found = False
    encoding = None
    default = 'utf-8'

    def read_or_stop():
        # A readline that signals EOF with StopIteration is treated the same
        # as one that returns empty bytes.
        try:
            return readline()
        except StopIteration:
            return bytes()

    def find_cookie(line):
        # Return the normalized encoding named by a coding cookie on *line*,
        # or None when the line is not ASCII or carries no cookie.
        try:
            line_string = line.decode('ascii')
        except UnicodeDecodeError:
            return None
        match = cookie_re.match(line_string)
        if not match:
            return None
        encoding = _get_normal_name(match.group(1))
        try:
            codec = lookup(encoding)
        except LookupError:
            # This behaviour mimics the Python interpreter
            raise SyntaxError("unknown encoding: " + encoding)

        if bom_found:
            if codec.name != 'utf-8':
                # This behaviour mimics the Python interpreter
                raise SyntaxError('encoding problem: utf-8')
            encoding += '-sig'
        return encoding

    first = read_or_stop()
    if first.startswith(BOM_UTF8):
        bom_found = True
        first = first[3:]
        default = 'utf-8-sig'
    if not first:
        return default, []

    encoding = find_cookie(first)
    if encoding:
        return encoding, [first]
    # A second line is only consulted when the first one is blank or a
    # comment.
    if not blank_re.match(first):
        return default, [first]

    second = read_or_stop()
    if not second:
        return default, [first]

    encoding = find_cookie(second)
    if encoding:
        return encoding, [first, second]

    return default, [first, second]


def untokenize(iterable):
    """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
    ut = Untokenizer()
    return ut.untokenize(iterable)


def generate_tokens(readline):
    """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.

    Raises TokenError on EOF inside a multi-line string or statement, and
    IndentationError when a dedent matches no outer indentation level.
    """
    lnum = parenlev = continued = 0
    contstr, needcont = '', 0
    contline = None
    indents = [0]

    # 'stashed' and 'async_*' are used for async/await parsing.
    # 'stashed' holds an 'async' NAME token whose final type (NAME or ASYNC)
    # depends on the token that follows it; it is always flushed before any
    # other token is emitted so that tokens come out in source order.
    stashed = None
    async_def = False
    async_def_indent = 0
    async_def_nl = False

    while 1:                                   # loop over lines in stream
        try:
            line = readline()
        except StopIteration:
            line = ''
        lnum = lnum + 1
        pos, maxpos = 0, len(line)

        if contstr:                            # continued string
            if not line:
                raise TokenError("EOF in multi-line string", strstart)
            endmatch = endprog.match(line)
            if endmatch:
                pos = end = endmatch.end(0)
                yield (STRING, contstr + line[:end],
                       strstart, (lnum, end), contline + line)
                contstr, needcont = '', 0
                contline = None
            elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
                # A single-quoted string was continued with a backslash but
                # this line neither closes it nor continues it again.
                yield (ERRORTOKEN, contstr + line,
                       strstart, (lnum, len(line)), contline)
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < maxpos:                # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column//tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == maxpos: break

            if stashed:
                yield stashed
                stashed = None

            if line[pos] in '#\r\n':           # skip comments or blank lines
                if line[pos] == '#':
                    comment_token = line[pos:].rstrip('\r\n')
                    nl_pos = pos + len(comment_token)
                    yield (COMMENT, comment_token,
                           (lnum, pos), (lnum, pos + len(comment_token)), line)
                    yield (NL, line[nl_pos:],
                           (lnum, nl_pos), (lnum, len(line)), line)
                else:
                    # Only a line ending can reach this branch.
                    yield (NL, line[pos:],
                           (lnum, pos), (lnum, len(line)), line)
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
            while column < indents[-1]:
                if column not in indents:
                    raise IndentationError(
                        "unindent does not match any outer indentation level",
                        ("<tokenize>", lnum, pos, line))
                indents = indents[:-1]

                if async_def and async_def_indent >= indents[-1]:
                    async_def = False
                    async_def_nl = False
                    async_def_indent = 0

                yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

            if async_def and async_def_nl and async_def_indent >= indents[-1]:
                async_def = False
                async_def_nl = False
                async_def_indent = 0

        else:                                  # continued statement
            if not line:
                raise TokenError("EOF in multi-line statement", (lnum, 0))
            continued = 0

        while pos < maxpos:
            pseudomatch = pseudoprog.match(line, pos)
            if pseudomatch:                                # scan for tokens
                start, end = pseudomatch.span(1)
                spos, epos, pos = (lnum, start), (lnum, end), end
                token, initial = line[start:end], line[start]

                if initial in string.digits or \
                   (initial == '.' and token != '.'):      # ordinary number
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NUMBER, token, spos, epos, line)
                elif initial in '\r\n':
                    newline = NEWLINE
                    if parenlev > 0:
                        newline = NL
                    elif async_def:
                        async_def_nl = True
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (newline, token, spos, epos, line)

                elif initial == '#':
                    assert not token.endswith("\n")
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (COMMENT, token, spos, epos, line)
                elif token in triple_quoted:
                    endprog = endprogs[token]
                    endmatch = endprog.match(line, pos)
                    if stashed:
                        yield stashed
                        stashed = None
                    if endmatch:                           # all on one line
                        pos = endmatch.end(0)
                        token = line[start:pos]
                        yield (STRING, token, spos, (lnum, pos), line)
                    else:
                        strstart = (lnum, start)           # multiple lines
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if stashed:
                        yield stashed
                        stashed = None
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start)
                        # Skip over up to two prefix letters (which map to
                        # None in endprogs) to reach the quote's pattern.
                        endprog = (endprogs[initial] or endprogs[token[1]] or
                                   endprogs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        yield (STRING, token, spos, epos, line)
                elif initial.isidentifier():               # ordinary name
                    if token in ('async', 'await'):
                        if async_def:
                            yield (ASYNC if token == 'async' else AWAIT,
                                   token, spos, epos, line)
                            continue

                    tok = (NAME, token, spos, epos, line)
                    if token == 'async' and not stashed:
                        # Hold the token back until the next one shows
                        # whether this is 'async def' / 'async for'.
                        stashed = tok
                        continue

                    if token in ('def', 'for'):
                        if (stashed
                                and stashed[0] == NAME
                                and stashed[1] == 'async'):

                            if token == 'def':
                                async_def = True
                                async_def_indent = indents[-1]

                            yield (ASYNC, stashed[1],
                                   stashed[2], stashed[3],
                                   stashed[4])
                            stashed = None

                    if stashed:
                        yield stashed
                        stashed = None

                    yield tok
                elif initial == '\\':                      # continued stmt
                    # This yield is new; needed for better idempotency:
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (NL, token, spos, (lnum, pos), line)
                    continued = 1
                else:
                    if initial in '([{': parenlev = parenlev + 1
                    elif initial in ')]}': parenlev = parenlev - 1
                    if stashed:
                        yield stashed
                        stashed = None
                    yield (OP, token, spos, epos, line)
            else:
                if stashed:
                    yield stashed
                    stashed = None
                yield (ERRORTOKEN, line[pos],
                       (lnum, pos), (lnum, pos+1), line)
                pos = pos + 1

    if stashed:
        yield stashed
        stashed = None

    for _ in indents[1:]:                      # pop remaining indent levels
        yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
    yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')


if __name__ == '__main__':                     # testing
    import sys
    if len(sys.argv) > 1:
        # Close the source file once tokenizing is done.
        with open(sys.argv[1]) as source:
            tokenize(source.readline)
    else:
        tokenize(sys.stdin.readline)