from test import support
from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
                      STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
                      open as tokenize_open, Untokenizer, generate_tokens,
                      NEWLINE)
from io import BytesIO, StringIO
import unittest
from unittest import TestCase, mock
from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
                               INVALID_UNDERSCORE_LITERALS)
import os
import token


# Converts a source string into a list of textual representations
# of the tokens such as:
# `    NAME       'if'          (1, 0) (1, 2)`
# to make writing tests easier.
def stringify_tokens_from_source(token_generator, source_string):
    result = []
    num_lines = len(source_string.splitlines())
    missing_trailing_nl = source_string[-1] not in '\r\n'

    for type, token, start, end, line in token_generator:
        if type == ENDMARKER:
            break
        # Ignore the new line on the last line if the input lacks one
        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
            continue
        type = tok_name[type]
        result.append(f"    {type:10} {token!r:13} {start} {end}")

    return result

class TokenizeTest(TestCase):
    # Tests for the tokenize module.

    # The tests can be really simple. Given a small fragment of source
    # code, print out a table with tokens. The ENDMARKER, ENCODING and
    # final NEWLINE are omitted for brevity.

    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER and final NEWLINE are omitted.
        f = BytesIO(s.encode('utf-8'))
        result = stringify_tokens_from_source(tokenize(f.readline), s)

        self.assertEqual(result,
                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
                         expected.rstrip().splitlines())

    def test_implicit_newline(self):
        # Make sure that the tokenizer puts in an implicit NEWLINE
        # when the input lacks a trailing new line.
        f = BytesIO("x".encode('utf-8'))
        tokens = list(tokenize(f.readline))
        self.assertEqual(tokens[-2].type, NEWLINE)
        self.assertEqual(tokens[-1].type, ENDMARKER)

    def test_basic(self):
        self.check_tokenize("1 + 1", """\
    NUMBER     '1'           (1, 0) (1, 1)
    OP         '+'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    """)
        self.check_tokenize("if False:\n"
                            "    # NL\n"
                            "    \n"
                            "    True = False # NEWLINE\n", """\
    NAME       'if'          (1, 0) (1, 2)
    NAME       'False'       (1, 3) (1, 8)
    OP         ':'           (1, 8) (1, 9)
    NEWLINE    '\\n'          (1, 9) (1, 10)
    COMMENT    '# NL'        (2, 4) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NL         '\\n'          (3, 4) (3, 5)
    INDENT     '    '        (4, 0) (4, 4)
    NAME       'True'        (4, 4) (4, 8)
    OP         '='           (4, 9) (4, 10)
    NAME       'False'       (4, 11) (4, 16)
    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
    NEWLINE    '\\n'          (4, 26) (4, 27)
    DEDENT     ''            (5, 0) (5, 0)
    """)
        indent_error_file = b"""\
def k(x):
    x += 2
  x += 5
"""
        readline = BytesIO(indent_error_file).readline
        with self.assertRaisesRegex(IndentationError,
                                    "unindent does not match any "
                                    "outer indentation level"):
            for tok in tokenize(readline):
                pass

    def test_int(self):
        # Ordinary integers and binary operators
        self.check_tokenize("0xff <= 255", """\
    NUMBER     '0xff'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0b10 <= 255", """\
    NUMBER     '0b10'        (1, 0) (1, 4)
    OP         '<='          (1, 5) (1, 7)
    NUMBER     '255'         (1, 8) (1, 11)
    """)
        self.check_tokenize("0o123 <= 0O123", """\
    NUMBER     '0o123'       (1, 0) (1, 5)
    OP         '<='          (1, 6) (1, 8)
    NUMBER     '0O123'       (1, 9) (1, 14)
    """)
        self.check_tokenize("1234567 > ~0x15", """\
    NUMBER     '1234567'     (1, 0) (1, 7)
    OP         '>'           (1, 8) (1, 9)
    OP         '~'           (1, 10) (1, 11)
    NUMBER     '0x15'        (1, 11) (1, 15)
    """)
        self.check_tokenize("2134568 != 1231515", """\
    NUMBER     '2134568'     (1, 0) (1, 7)
    OP         '!='          (1, 8) (1, 10)
    NUMBER     '1231515'     (1, 11) (1, 18)
    """)
        self.check_tokenize("(-124561-1) & 200000000", """\
    OP         '('           (1, 0) (1, 1)
    OP         '-'           (1, 1) (1, 2)
    NUMBER     '124561'      (1, 2) (1, 8)
    OP         '-'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         ')'           (1, 10) (1, 11)
    OP         '&'           (1, 12) (1, 13)
    NUMBER     '200000000'   (1, 14) (1, 23)
    """)
        self.check_tokenize("0xdeadbeef != -1", """\
    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
    OP         '!='          (1, 11) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    NUMBER     '1'           (1, 15) (1, 16)
    """)
        self.check_tokenize("0xdeadc0de & 12345", """\
    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
    OP         '&'           (1, 11) (1, 12)
    NUMBER     '12345'       (1, 13) (1, 18)
    """)
        self.check_tokenize("0xFF & 0x15 | 1234", """\
    NUMBER     '0xFF'        (1, 0) (1, 4)
    OP         '&'           (1, 5) (1, 6)
    NUMBER     '0x15'        (1, 7) (1, 11)
    OP         '|'           (1, 12) (1, 13)
    NUMBER     '1234'        (1, 14) (1, 18)
    """)

    def test_long(self):
        # Long integers
        self.check_tokenize("x = 0", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0'           (1, 4) (1, 5)
    """)
        self.check_tokenize("x = 0xfffffffffff", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
    """)
        self.check_tokenize("x = 123141242151251616110", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '123141242151251616110' (1, 4) (1, 25)
    """)
        self.check_tokenize("x = -15921590215012591", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '-'           (1, 4) (1, 5)
    NUMBER     '15921590215012591' (1, 5) (1, 22)
    """)

    def test_float(self):
        # Floating point numbers
self.check_tokenize("x = 3.14159", """\ 181 NAME 'x' (1, 0) (1, 1) 182 OP '=' (1, 2) (1, 3) 183 NUMBER '3.14159' (1, 4) (1, 11) 184 """) 185 self.check_tokenize("x = 314159.", """\ 186 NAME 'x' (1, 0) (1, 1) 187 OP '=' (1, 2) (1, 3) 188 NUMBER '314159.' (1, 4) (1, 11) 189 """) 190 self.check_tokenize("x = .314159", """\ 191 NAME 'x' (1, 0) (1, 1) 192 OP '=' (1, 2) (1, 3) 193 NUMBER '.314159' (1, 4) (1, 11) 194 """) 195 self.check_tokenize("x = 3e14159", """\ 196 NAME 'x' (1, 0) (1, 1) 197 OP '=' (1, 2) (1, 3) 198 NUMBER '3e14159' (1, 4) (1, 11) 199 """) 200 self.check_tokenize("x = 3E123", """\ 201 NAME 'x' (1, 0) (1, 1) 202 OP '=' (1, 2) (1, 3) 203 NUMBER '3E123' (1, 4) (1, 9) 204 """) 205 self.check_tokenize("x+y = 3e-1230", """\ 206 NAME 'x' (1, 0) (1, 1) 207 OP '+' (1, 1) (1, 2) 208 NAME 'y' (1, 2) (1, 3) 209 OP '=' (1, 4) (1, 5) 210 NUMBER '3e-1230' (1, 6) (1, 13) 211 """) 212 self.check_tokenize("x = 3.14e159", """\ 213 NAME 'x' (1, 0) (1, 1) 214 OP '=' (1, 2) (1, 3) 215 NUMBER '3.14e159' (1, 4) (1, 12) 216 """) 217 218 def test_underscore_literals(self): 219 def number_token(s): 220 f = BytesIO(s.encode('utf-8')) 221 for toktype, token, start, end, line in tokenize(f.readline): 222 if toktype == NUMBER: 223 return token 224 return 'invalid token' 225 for lit in VALID_UNDERSCORE_LITERALS: 226 if '(' in lit: 227 # this won't work with compound complex inputs 228 continue 229 self.assertEqual(number_token(lit), lit) 230 for lit in INVALID_UNDERSCORE_LITERALS: 231 self.assertNotEqual(number_token(lit), lit) 232 233 def test_string(self): 234 # String literals 235 self.check_tokenize("x = ''; y = \"\"", """\ 236 NAME 'x' (1, 0) (1, 1) 237 OP '=' (1, 2) (1, 3) 238 STRING "''" (1, 4) (1, 6) 239 OP ';' (1, 6) (1, 7) 240 NAME 'y' (1, 8) (1, 9) 241 OP '=' (1, 10) (1, 11) 242 STRING '""' (1, 12) (1, 14) 243 """) 244 self.check_tokenize("x = '\"'; y = \"'\"", """\ 245 NAME 'x' (1, 0) (1, 1) 246 OP '=' (1, 2) (1, 3) 247 STRING '\\'"\\'' (1, 4) (1, 7) 248 OP ';' (1, 7) (1, 8) 249 NAME 'y' (1, 9) (1, 10) 250 OP '=' (1, 11) (1, 12) 251 STRING '"\\'"' (1, 13) (1, 16) 252 """) 253 self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\ 254 NAME 'x' (1, 0) (1, 1) 255 OP '=' (1, 2) (1, 3) 256 STRING '"doesn\\'t "' (1, 4) (1, 14) 257 NAME 'shrink' (1, 14) (1, 20) 258 STRING '", does it"' (1, 20) (1, 31) 259 """) 260 self.check_tokenize("x = 'abc' + 'ABC'", """\ 261 NAME 'x' (1, 0) (1, 1) 262 OP '=' (1, 2) (1, 3) 263 STRING "'abc'" (1, 4) (1, 9) 264 OP '+' (1, 10) (1, 11) 265 STRING "'ABC'" (1, 12) (1, 17) 266 """) 267 self.check_tokenize('y = "ABC" + "ABC"', """\ 268 NAME 'y' (1, 0) (1, 1) 269 OP '=' (1, 2) (1, 3) 270 STRING '"ABC"' (1, 4) (1, 9) 271 OP '+' (1, 10) (1, 11) 272 STRING '"ABC"' (1, 12) (1, 17) 273 """) 274 self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\ 275 NAME 'x' (1, 0) (1, 1) 276 OP '=' (1, 2) (1, 3) 277 STRING "r'abc'" (1, 4) (1, 10) 278 OP '+' (1, 11) (1, 12) 279 STRING "r'ABC'" (1, 13) (1, 19) 280 OP '+' (1, 20) (1, 21) 281 STRING "R'ABC'" (1, 22) (1, 28) 282 OP '+' (1, 29) (1, 30) 283 STRING "R'ABC'" (1, 31) (1, 37) 284 """) 285 self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\ 286 NAME 'y' (1, 0) (1, 1) 287 OP '=' (1, 2) (1, 3) 288 STRING 'r"abc"' (1, 4) (1, 10) 289 OP '+' (1, 11) (1, 12) 290 STRING 'r"ABC"' (1, 13) (1, 19) 291 OP '+' (1, 20) (1, 21) 292 STRING 'R"ABC"' (1, 22) (1, 28) 293 OP '+' (1, 29) (1, 30) 294 STRING 'R"ABC"' (1, 31) (1, 37) 295 """) 296 297 self.check_tokenize("u'abc' + U'abc'", """\ 298 STRING "u'abc'" (1, 0) (1, 
    OP         '+'           (1, 7) (1, 8)
    STRING     "U'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('u"abc" + U"abc"', """\
    STRING     'u"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'U"abc"'      (1, 9) (1, 15)
    """)

        self.check_tokenize("b'abc' + B'abc'", """\
    STRING     "b'abc'"      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     "B'abc'"      (1, 9) (1, 15)
    """)
        self.check_tokenize('b"abc" + B"abc"', """\
    STRING     'b"abc"'      (1, 0) (1, 6)
    OP         '+'           (1, 7) (1, 8)
    STRING     'B"abc"'      (1, 9) (1, 15)
    """)
        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
    STRING     "br'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "bR'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Br'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "BR'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
    STRING     'br"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'bR"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Br"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'BR"abc"'     (1, 30) (1, 37)
    """)
        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
    STRING     "rb'abc'"     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     "rB'abc'"     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     "Rb'abc'"     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     "RB'abc'"     (1, 30) (1, 37)
    """)
        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
    STRING     'rb"abc"'     (1, 0) (1, 7)
    OP         '+'           (1, 8) (1, 9)
    STRING     'rB"abc"'     (1, 10) (1, 17)
    OP         '+'           (1, 18) (1, 19)
    STRING     'Rb"abc"'     (1, 20) (1, 27)
    OP         '+'           (1, 28) (1, 29)
    STRING     'RB"abc"'     (1, 30) (1, 37)
    """)
        # Check 0, 1, and 2 character string prefixes.
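        # Several of the cases below also put a backslash line continuation
        # inside the literal, so the STRING token spans more than one line.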
        self.check_tokenize(r'"a\
de\
fg"', """\
    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
    """)
        self.check_tokenize(r'u"a\
de"', """\
    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
    """)
        self.check_tokenize(r'rb"a\
d"', """\
    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
    """)
        self.check_tokenize(r'"""a\
b"""', """\
    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'u"""a\
b"""', """\
    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'rb"""a\
b\
c"""', """\
    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
    """)
        self.check_tokenize('f"abc"', """\
    STRING     'f"abc"'      (1, 0) (1, 6)
    """)
        self.check_tokenize('fR"a{b}c"', """\
    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
    """)
        self.check_tokenize('f"""abc"""', """\
    STRING     'f\"\"\"abc\"\"\"' (1, 0) (1, 10)
    """)
        self.check_tokenize(r'f"abc\
def"', """\
    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)
        self.check_tokenize(r'Rf"abc\
def"', """\
    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
    """)

    def test_function(self):
        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd22'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ','           (1, 9) (1, 10)
    NAME       'b'           (1, 11) (1, 12)
    OP         ','           (1, 12) (1, 13)
    NAME       'c'           (1, 14) (1, 15)
    OP         '='           (1, 15) (1, 16)
    NUMBER     '2'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    NAME       'd'           (1, 19) (1, 20)
    OP         '='           (1, 20) (1, 21)
    NUMBER     '2'           (1, 21) (1, 22)
    OP         ','           (1, 22) (1, 23)
    OP         '*'           (1, 24) (1, 25)
    NAME       'k'           (1, 25) (1, 26)
    OP         ')'           (1, 26) (1, 27)
    OP         ':'           (1, 27) (1, 28)
    NAME       'pass'        (1, 29) (1, 33)
    """)
        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd01v_'       (1, 4) (1, 9)
    OP         '('           (1, 9) (1, 10)
    NAME       'a'           (1, 10) (1, 11)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         ','           (1, 13) (1, 14)
    OP         '*'           (1, 15) (1, 16)
    NAME       'k'           (1, 16) (1, 17)
    OP         ','           (1, 17) (1, 18)
    OP         '**'          (1, 19) (1, 21)
    NAME       'w'           (1, 21) (1, 22)
    OP         ')'           (1, 22) (1, 23)
    OP         ':'           (1, 23) (1, 24)
    NAME       'pass'        (1, 25) (1, 29)
    """)
        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'd23'         (1, 4) (1, 7)
    OP         '('           (1, 7) (1, 8)
    NAME       'a'           (1, 8) (1, 9)
    OP         ':'           (1, 9) (1, 10)
    NAME       'str'         (1, 11) (1, 14)
    OP         ','           (1, 14) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'int'         (1, 19) (1, 22)
    OP         '='           (1, 22) (1, 23)
    NUMBER     '3'           (1, 23) (1, 24)
    OP         ')'           (1, 24) (1, 25)
    OP         '->'          (1, 26) (1, 28)
    NAME       'int'         (1, 29) (1, 32)
    OP         ':'           (1, 32) (1, 33)
    NAME       'pass'        (1, 34) (1, 38)
    """)

    def test_comparison(self):
        # Comparison
        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
    NAME       'if'          (1, 0) (1, 2)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '<'           (1, 5) (1, 6)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '>'           (1, 9) (1, 10)
    NUMBER     '1'           (1, 11) (1, 12)
    OP         '=='          (1, 13) (1, 15)
    NUMBER     '1'           (1, 16) (1, 17)
    OP         '>='          (1, 18) (1, 20)
    NUMBER     '5'           (1, 21) (1, 22)
    OP         '<='          (1, 23) (1, 25)
    NUMBER     '0x15'        (1, 26) (1, 30)
    OP         '<='          (1, 31) (1, 33)
    NUMBER     '0x12'        (1, 34) (1, 38)
    OP         '!='          (1, 39) (1, 41)
    NUMBER     '1'           (1, 42) (1, 43)
    NAME       'and'         (1, 44) (1, 47)
    NUMBER     '5'           (1, 48) (1, 49)
    NAME       'in'          (1, 50) (1, 52)
    NUMBER     '1'           (1, 53) (1, 54)
    NAME       'not'         (1, 55) (1, 58)
    NAME       'in'          (1, 59) (1, 61)
    NUMBER     '1'           (1, 62) (1, 63)
    NAME       'is'          (1, 64) (1, 66)
    NUMBER     '1'           (1, 67) (1, 68)
    NAME       'or'          (1, 69) (1, 71)
    NUMBER     '5'           (1, 72) (1, 73)
    NAME       'is'          (1, 74) (1, 76)
    NAME       'not'         (1, 77) (1, 80)
    NUMBER     '1'           (1, 81) (1, 82)
    OP         ':'           (1, 82) (1, 83)
    NAME       'pass'        (1, 84) (1, 88)
    """)

    def test_shift(self):
        # Shift
        self.check_tokenize("x = 1 << 1 >> 5", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '<<'          (1, 6) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '>>'          (1, 11) (1, 13)
    NUMBER     '5'           (1, 14) (1, 15)
    """)

    def test_additive(self):
        # Additive
        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '-'           (1, 6) (1, 7)
    NAME       'y'           (1, 8) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '15'          (1, 12) (1, 14)
    OP         '-'           (1, 15) (1, 16)
    NUMBER     '1'           (1, 17) (1, 18)
    OP         '+'           (1, 19) (1, 20)
    NUMBER     '0x124'       (1, 21) (1, 26)
    OP         '+'           (1, 27) (1, 28)
    NAME       'z'           (1, 29) (1, 30)
    OP         '+'           (1, 31) (1, 32)
    NAME       'a'           (1, 33) (1, 34)
    OP         '['           (1, 34) (1, 35)
    NUMBER     '5'           (1, 35) (1, 36)
    OP         ']'           (1, 36) (1, 37)
    """)

    def test_multiplicative(self):
        # Multiplicative
        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
    NAME       'x'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    NUMBER     '1'           (1, 4) (1, 5)
    OP         '//'          (1, 5) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '/'           (1, 10) (1, 11)
    NUMBER     '5'           (1, 11) (1, 12)
    OP         '*'           (1, 12) (1, 13)
    NUMBER     '12'          (1, 13) (1, 15)
    OP         '%'           (1, 15) (1, 16)
    NUMBER     '0x12'        (1, 16) (1, 20)
    OP         '@'           (1, 20) (1, 21)
    NUMBER     '42'          (1, 21) (1, 23)
    """)

    def test_unary(self):
        # Unary
        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
    OP         '~'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '^'           (1, 3) (1, 4)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '&'           (1, 7) (1, 8)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '|'           (1, 11) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '^'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    NUMBER     '1'           (1, 17) (1, 18)
    """)
        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
    OP         '-'           (1, 0) (1, 1)
    NUMBER     '1'           (1, 1) (1, 2)
    OP         '*'           (1, 2) (1, 3)
    NUMBER     '1'           (1, 3) (1, 4)
    OP         '/'           (1, 4) (1, 5)
    NUMBER     '1'           (1, 5) (1, 6)
    OP         '+'           (1, 6) (1, 7)
    NUMBER     '1'           (1, 7) (1, 8)
    OP         '*'           (1, 8) (1, 9)
    NUMBER     '1'           (1, 9) (1, 10)
    OP         '//'          (1, 10) (1, 12)
    NUMBER     '1'           (1, 12) (1, 13)
    OP         '-'           (1, 14) (1, 15)
    OP         '-'           (1, 16) (1, 17)
    OP         '-'           (1, 17) (1, 18)
    OP         '-'           (1, 18) (1, 19)
    NUMBER     '1'           (1, 19) (1, 20)
    OP         '**'          (1, 20) (1, 22)
    NUMBER     '1'           (1, 22) (1, 23)
    """)

    def test_selector(self):
        # Selector
        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
    NAME       'import'      (1, 0) (1, 6)
    NAME       'sys'         (1, 7) (1, 10)
    OP         ','           (1, 10) (1, 11)
    NAME       'time'        (1, 12) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'x'           (2, 0) (2, 1)
    OP         '='           (2, 2) (2, 3)
    NAME       'sys'         (2, 4) (2, 7)
    OP         '.'           (2, 7) (2, 8)
    NAME       'modules'     (2, 8) (2, 15)
    OP         '['           (2, 15) (2, 16)
    STRING     "'time'"      (2, 16) (2, 22)
    OP         ']'           (2, 22) (2, 23)
    OP         '.'           (2, 23) (2, 24)
    NAME       'time'        (2, 24) (2, 28)
    OP         '('           (2, 28) (2, 29)
    OP         ')'           (2, 29) (2, 30)
    """)

    def test_method(self):
        # Methods
        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
    OP         '@'           (1, 0) (1, 1)
    NAME       'staticmethod' (1, 1) (1, 13)
    NEWLINE    '\\n'          (1, 13) (1, 14)
    NAME       'def'         (2, 0) (2, 3)
    NAME       'foo'         (2, 4) (2, 7)
    OP         '('           (2, 7) (2, 8)
    NAME       'x'           (2, 8) (2, 9)
    OP         ','           (2, 9) (2, 10)
    NAME       'y'           (2, 10) (2, 11)
    OP         ')'           (2, 11) (2, 12)
    OP         ':'           (2, 12) (2, 13)
    NAME       'pass'        (2, 14) (2, 18)
    """)

    def test_tabs(self):
        # Evil tabs
        self.check_tokenize("def f():\n"
                            "\tif x\n"
                            "        \tpass", """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    INDENT     '\\t'          (2, 0) (2, 1)
    NAME       'if'          (2, 1) (2, 3)
    NAME       'x'           (2, 4) (2, 5)
    NEWLINE    '\\n'          (2, 5) (2, 6)
    INDENT     '        \\t'  (3, 0) (3, 9)
    NAME       'pass'        (3, 9) (3, 13)
    DEDENT     ''            (4, 0) (4, 0)
    DEDENT     ''            (4, 0) (4, 0)
    """)

    def test_non_ascii_identifiers(self):
        # Non-ascii identifiers
        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "'places'"    (1, 8) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "'green'"     (2, 7) (2, 14)
    """)

    def test_unicode(self):
        # Legacy unicode literals:
        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
    NAME       'Örter'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    STRING     "u'places'"   (1, 8) (1, 17)
    NEWLINE    '\\n'          (1, 17) (1, 18)
    NAME       'grün'        (2, 0) (2, 4)
    OP         '='           (2, 5) (2, 6)
    STRING     "U'green'"    (2, 7) (2, 15)
    """)

    def test_async(self):
        # Async/await extension:
        self.check_tokenize("async = 1", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("a = (async = 1)", """\
    NAME       'a'           (1, 0) (1, 1)
    OP         '='           (1, 2) (1, 3)
    OP         '('           (1, 4) (1, 5)
    NAME       'async'       (1, 5) (1, 10)
    OP         '='           (1, 11) (1, 12)
    NUMBER     '1'           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    """)

        self.check_tokenize("async()", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    """)

        self.check_tokenize("class async(Bar):pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         '('           (1, 11) (1, 12)
    NAME       'Bar'         (1, 12) (1, 15)
    OP         ')'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize("class async:pass", """\
    NAME       'class'       (1, 0) (1, 5)
    NAME       'async'       (1, 6) (1, 11)
    OP         ':'           (1, 11) (1, 12)
    NAME       'pass'        (1, 12) (1, 16)
    """)

        self.check_tokenize("await = 1", """\
    NAME       'await'       (1, 0) (1, 5)
    OP         '='           (1, 6) (1, 7)
    NUMBER     '1'           (1, 8) (1, 9)
    """)

        self.check_tokenize("foo.async", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    """)

        self.check_tokenize("async for a in b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'for'         (1, 6) (1, 9)
    NAME       'a'           (1, 10) (1, 11)
    NAME       'in'          (1, 12) (1, 14)
    NAME       'b'           (1, 15) (1, 16)
    OP         ':'           (1, 16) (1, 17)
    NAME       'pass'        (1, 18) (1, 22)
    """)

        self.check_tokenize("async with a as b: pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'with'        (1, 6) (1, 10)
    NAME       'a'           (1, 11) (1, 12)
    NAME       'as'          (1, 13) (1, 15)
    NAME       'b'           (1, 16) (1, 17)
    OP         ':'           (1, 17) (1, 18)
    NAME       'pass'        (1, 19) (1, 23)
    """)

        self.check_tokenize("async.foo", """\
    NAME       'async'       (1, 0) (1, 5)
    OP         '.'           (1, 5) (1, 6)
    NAME       'foo'         (1, 6) (1, 9)
    """)

        self.check_tokenize("async", """\
    NAME       'async'       (1, 0) (1, 5)
    """)

        self.check_tokenize("async\n#comment\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    COMMENT    '#comment'    (2, 0) (2, 8)
    NL         '\\n'          (2, 8) (2, 9)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\n...\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    OP         '...'         (2, 0) (2, 3)
    NEWLINE    '\\n'          (2, 3) (2, 4)
    NAME       'await'       (3, 0) (3, 5)
    """)

        self.check_tokenize("async\nawait", """\
    NAME       'async'       (1, 0) (1, 5)
    NEWLINE    '\\n'          (1, 5) (1, 6)
    NAME       'await'       (2, 0) (2, 5)
    """)

        self.check_tokenize("foo.async + 1", """\
    NAME       'foo'         (1, 0) (1, 3)
    OP         '.'           (1, 3) (1, 4)
    NAME       'async'       (1, 4) (1, 9)
    OP         '+'           (1, 10) (1, 11)
    NUMBER     '1'           (1, 12) (1, 13)
    """)

        self.check_tokenize("async def foo(): pass", """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NAME       'pass'        (1, 17) (1, 21)
    """)

        self.check_tokenize('''\
async def foo():
  def foo(await):
    await = 1
  if 1:
    await
async += 1
''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'def'         (2, 2) (2, 5)
    NAME       'foo'         (2, 6) (2, 9)
    OP         '('           (2, 9) (2, 10)
    NAME       'await'       (2, 10) (2, 15)
    OP         ')'           (2, 15) (2, 16)
    OP         ':'           (2, 16) (2, 17)
    NEWLINE    '\\n'          (2, 17) (2, 18)
    INDENT     '    '        (3, 0) (3, 4)
    NAME       'await'       (3, 4) (3, 9)
    OP         '='           (3, 10) (3, 11)
    NUMBER     '1'           (3, 12) (3, 13)
    NEWLINE    '\\n'          (3, 13) (3, 14)
    DEDENT     ''            (4, 2) (4, 2)
    NAME       'if'          (4, 2) (4, 4)
    NUMBER     '1'           (4, 5) (4, 6)
    OP         ':'           (4, 6) (4, 7)
    NEWLINE    '\\n'          (4, 7) (4, 8)
    INDENT     '    '        (5, 0) (5, 4)
    NAME       'await'       (5, 4) (5, 9)
    NEWLINE    '\\n'          (5, 9) (5, 10)
    DEDENT     ''            (6, 0) (6, 0)
    DEDENT     ''            (6, 0) (6, 0)
    NAME       'async'       (6, 0) (6, 5)
    OP         '+='          (6, 6) (6, 8)
    NUMBER     '1'           (6, 9) (6, 10)
    NEWLINE    '\\n'          (6, 10) (6, 11)
    """)

        self.check_tokenize('''\
async def foo():
  async for i in 1: pass''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    OP         ')'           (1, 14) (1, 15)
    OP         ':'           (1, 15) (1, 16)
    NEWLINE    '\\n'          (1, 16) (1, 17)
    INDENT     '  '          (2, 0) (2, 2)
    NAME       'async'       (2, 2) (2, 7)
    NAME       'for'         (2, 8) (2, 11)
    NAME       'i'           (2, 12) (2, 13)
    NAME       'in'          (2, 14) (2, 16)
    NUMBER     '1'           (2, 17) (2, 18)
    OP         ':'           (2, 18) (2, 19)
    NAME       'pass'        (2, 20) (2, 24)
    DEDENT     ''            (3, 0) (3, 0)
    """)

        self.check_tokenize('''async def foo(async): await''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'foo'         (1, 10) (1, 13)
    OP         '('           (1, 13) (1, 14)
    NAME       'async'       (1, 14) (1, 19)
    OP         ')'           (1, 19) (1, 20)
    OP         ':'           (1, 20) (1, 21)
    NAME       'await'       (1, 22) (1, 27)
    """)

        self.check_tokenize('''\
def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'def'         (1, 0) (1, 3)
    NAME       'f'           (1, 4) (1, 5)
    OP         '('           (1, 5) (1, 6)
    OP         ')'           (1, 6) (1, 7)
    OP         ':'           (1, 7) (1, 8)
    NEWLINE    '\\n'          (1, 8) (1, 9)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    NAME       'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

        self.check_tokenize('''\
async def f():

  def baz(): pass
  async def bar(): pass

  await = 2''', """\
    NAME       'async'       (1, 0) (1, 5)
    NAME       'def'         (1, 6) (1, 9)
    NAME       'f'           (1, 10) (1, 11)
    OP         '('           (1, 11) (1, 12)
    OP         ')'           (1, 12) (1, 13)
    OP         ':'           (1, 13) (1, 14)
    NEWLINE    '\\n'          (1, 14) (1, 15)
    NL         '\\n'          (2, 0) (2, 1)
    INDENT     '  '          (3, 0) (3, 2)
    NAME       'def'         (3, 2) (3, 5)
    NAME       'baz'         (3, 6) (3, 9)
    OP         '('           (3, 9) (3, 10)
    OP         ')'           (3, 10) (3, 11)
    OP         ':'           (3, 11) (3, 12)
    NAME       'pass'        (3, 13) (3, 17)
    NEWLINE    '\\n'          (3, 17) (3, 18)
    NAME       'async'       (4, 2) (4, 7)
    NAME       'def'         (4, 8) (4, 11)
    NAME       'bar'         (4, 12) (4, 15)
    OP         '('           (4, 15) (4, 16)
    OP         ')'           (4, 16) (4, 17)
    OP         ':'           (4, 17) (4, 18)
    NAME       'pass'        (4, 19) (4, 23)
    NEWLINE    '\\n'          (4, 23) (4, 24)
    NL         '\\n'          (5, 0) (5, 1)
    NAME       'await'       (6, 2) (6, 7)
    OP         '='           (6, 8) (6, 9)
    NUMBER     '2'           (6, 10) (6, 11)
    DEDENT     ''            (7, 0) (7, 0)
    """)

class GenerateTokensTest(TokenizeTest):
    def check_tokenize(self, s, expected):
        # Format the tokens in s in a table format.
        # The ENDMARKER and final NEWLINE are omitted.
        f = StringIO(s)
        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
        self.assertEqual(result, expected.rstrip().splitlines())


# Replace each float literal (NUMBER token containing '.') in the statement
# string s with an equivalent Decimal(...) call; see TestMisc.test_decistmt.
def decistmt(s):
    result = []
    g = tokenize(BytesIO(s.encode('utf-8')).readline)  # tokenize the string
    for toknum, tokval, _, _, _ in g:
        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
            result.extend([
                (NAME, 'Decimal'),
                (OP, '('),
                (STRING, repr(tokval)),
                (OP, ')')
            ])
        else:
            result.append((toknum, tokval))
    return untokenize(result).decode('utf-8')

class TestMisc(TestCase):

    def test_decistmt(self):
        # Substitute Decimals for floats in a string of statements.
        # This is an example from the docs.

        from decimal import Decimal
        s = '+21.3e-5*-.1234/81.7'
        self.assertEqual(decistmt(s),
                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")

        # The format of the exponent is inherited from the platform C library.
        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
        # we're only showing 11 digits, and the 12th isn't close to 5, the
        # rest of the output should be platform-independent.
        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')

        # Output from calculations with Decimal should be identical across all
        # platforms.
        self.assertEqual(eval(decistmt(s)),
                         Decimal('-3.217160342717258261933904529E-7'))


class TestTokenizerAdheresToPep0263(TestCase):
    """
    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
    """

    def _testFile(self, filename):
        path = os.path.join(os.path.dirname(__file__), filename)
        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))

    def test_utf8_coding_cookie_and_no_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
        self._testFile(f)

    def test_latin1_coding_cookie_and_utf8_bom(self):
        """
        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
        allowed encoding for the comment is 'utf-8'. The text file used in
        this test starts with a BOM signature, but specifies latin1 as the
        coding, so verify that a SyntaxError is raised, which matches the
        behaviour of the interpreter when it encounters a similar condition.
        """
        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
        self.assertRaises(SyntaxError, self._testFile, f)

    def test_no_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
        self._testFile(f)

    def test_utf8_coding_cookie_and_utf8_bom(self):
        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
        self._testFile(f)

    def test_bad_coding_cookie(self):
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')


class Test_Tokenize(TestCase):

    def test__tokenize_decodes_with_specified_encoding(self):
        literal = '"ЉЊЈЁЂ"'
        line = literal.encode('utf-8')
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return line
            else:
                return b''

        # skip the initial encoding token and the end tokens
        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "bytes not decoded with encoding")

    def test__tokenize_does_not_decode_with_encoding_none(self):
        literal = '"ЉЊЈЁЂ"'
        first = False
        def readline():
            nonlocal first
            if not first:
                first = True
                return literal
            else:
                return b''

        # skip the end tokens
        tokens = list(_tokenize(readline, encoding=None))[:-2]
        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
        self.assertEqual(tokens, expected_tokens,
                         "string not tokenized when encoding is None")


class TestDetectEncoding(TestCase):

    def get_readline(self, lines):
        index = 0
        def readline():
            nonlocal index
            if index == len(lines):
                raise StopIteration
            line = lines[index]
            index += 1
            return line
        return readline

    def test_no_bom_no_encoding_cookie(self):
        lines = (
            b'# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
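        # detect_encoding() also returns the lines (at most two) it consumed.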
        self.assertEqual(consumed_lines, list(lines[:2]))

    def test_bom_no_cookie(self):
        lines = (
            b'\xef\xbb\xbf# something\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'# something\n', b'print(something)\n'])

    def test_cookie_first_line_no_bom(self):
        lines = (
            b'# -*- coding: latin-1 -*-\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso-8859-1')
        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])

    def test_matched_bom_and_cookie_first_line(self):
        lines = (
            b'\xef\xbb\xbf# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_no_bom(self):
        lines = (
            b'#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'ascii')
        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
        self.assertEqual(consumed_lines, expected)

    def test_matched_bom_and_cookie_second_line(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'f# coding=utf-8\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines,
                         [b'#! something\n', b'f# coding=utf-8\n'])

    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
        lines = (
            b'\xef\xbb\xbf#! something\n',
            b'# vim: set fileencoding=ascii :\n',
            b'print(something)\n',
            b'do_something(else)\n'
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_cookie_second_line_noncommented_first_line(self):
        lines = (
            b"print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'utf-8')
        expected = [b"print('\xc2\xa3')\n"]
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_commented_first_line(self):
        lines = (
            b"#print('\xc2\xa3')\n",
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_cookie_second_line_empty_first_line(self):
        lines = (
            b'\n',
            b'# vim: set fileencoding=iso8859-15 :\n',
            b"print('\xe2\x82\xac')\n"
        )
        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
        self.assertEqual(encoding, 'iso8859-15')
        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
        self.assertEqual(consumed_lines, expected)

    def test_latin1_normalization(self):
        # See get_normal_name() in tokenizer.c.
        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
                     "iso-8859-1-unix", "iso-latin-1-mac")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"print(things)\n",
                         b"do_something += 4\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "iso-8859-1")

    def test_syntaxerror_latin1(self):
        # Issue 14629: need to raise SyntaxError if the first
        # line(s) have non-UTF-8 characters
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        readline = self.get_readline(lines)
        self.assertRaises(SyntaxError, detect_encoding, readline)


    def test_utf8_normalization(self):
        # See get_normal_name() in tokenizer.c.
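        # Each spelling below should be normalized to plain "utf-8".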
        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
        for encoding in encodings:
            for rep in ("-", "_"):
                enc = encoding.replace("-", rep)
                lines = (b"#!/usr/bin/python\n",
                         b"# coding: " + enc.encode("ascii") + b"\n",
                         b"1 + 3\n")
                rl = self.get_readline(lines)
                found, consumed_lines = detect_encoding(rl)
                self.assertEqual(found, "utf-8")

    def test_short_files(self):
        readline = self.get_readline((b'print(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        encoding, consumed_lines = detect_encoding(self.get_readline(()))
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [b'print(something)\n'])

        readline = self.get_readline((b'\xef\xbb\xbf',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8-sig')
        self.assertEqual(consumed_lines, [])

        readline = self.get_readline((b'# coding: bad\n',))
        self.assertRaises(SyntaxError, detect_encoding, readline)

    def test_false_encoding(self):
        # Issue 18873: "Encoding" detected in non-comment lines
        readline = self.get_readline((b'print("#coding=fake")',))
        encoding, consumed_lines = detect_encoding(readline)
        self.assertEqual(encoding, 'utf-8')
        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])

    def test_open(self):
        filename = support.TESTFN + '.py'
        self.addCleanup(support.unlink, filename)

        # test coding cookie
        for encoding in ('iso-8859-15', 'utf-8'):
            with open(filename, 'w', encoding=encoding) as fp:
                print("# coding: %s" % encoding, file=fp)
                print("print('euro:\u20ac')", file=fp)
            with tokenize_open(filename) as fp:
                self.assertEqual(fp.encoding, encoding)
                self.assertEqual(fp.mode, 'r')

        # test BOM (no coding cookie)
        with open(filename, 'w', encoding='utf-8-sig') as fp:
            print("print('euro:\u20ac')", file=fp)
        with tokenize_open(filename) as fp:
            self.assertEqual(fp.encoding, 'utf-8-sig')
            self.assertEqual(fp.mode, 'r')

    def test_filename_in_exception(self):
        # When possible, include the file name in the exception.
        path = 'some_file_path'
        lines = (
            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
        )
        class Bunk:
            def __init__(self, lines, path):
                self.name = path
                self._lines = lines
                self._index = 0

            def readline(self):
                if self._index == len(lines):
                    raise StopIteration
                line = lines[self._index]
                self._index += 1
                return line

        with self.assertRaises(SyntaxError):
            ins = Bunk(lines, path)
            # Make sure lacking a name isn't an issue.
            del ins.name
            detect_encoding(ins.readline)
        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
            ins = Bunk(lines, path)
            detect_encoding(ins.readline)

    def test_open_error(self):
        # Issue #23840: open() must close the binary file on error
        m = BytesIO(b'#coding:xxx')
        with mock.patch('tokenize._builtin_open', return_value=m):
            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
        self.assertTrue(m.closed)


class TestTokenize(TestCase):

    def test_tokenize(self):
        import tokenize as tokenize_module
        encoding = object()
        encoding_used = None
        def mock_detect_encoding(readline):
            return encoding, [b'first', b'second']

        def mock__tokenize(readline, encoding):
            nonlocal encoding_used
            encoding_used = encoding
            out = []
            while True:
                next_line = readline()
                if next_line:
                    out.append(next_line)
                    continue
                return out

        counter = 0
        def mock_readline():
            nonlocal counter
            counter += 1
            if counter == 5:
                return b''
            return str(counter).encode()

        orig_detect_encoding = tokenize_module.detect_encoding
        orig__tokenize = tokenize_module._tokenize
        tokenize_module.detect_encoding = mock_detect_encoding
        tokenize_module._tokenize = mock__tokenize
        try:
            results = tokenize(mock_readline)
            self.assertEqual(list(results),
                             [b'first', b'second', b'1', b'2', b'3', b'4'])
        finally:
            tokenize_module.detect_encoding = orig_detect_encoding
            tokenize_module._tokenize = orig__tokenize

        self.assertEqual(encoding_used, encoding)

    def test_oneline_defs(self):
        buf = []
        for i in range(500):
            buf.append('def i{i}(): return {i}'.format(i=i))
        buf.append('OK')
        buf = '\n'.join(buf)

        # Test that 500 consequent, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
                                                # [-2] is always NEWLINE

    def assertExactTypeEqual(self, opstr, *optypes):
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 3 + num_optypes)
        self.assertEqual(tok_name[tokens[0].exact_type],
                         tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(tok_name[tokens[i + 1].exact_type],
                             tok_name[optypes[i]])
        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
                         tok_name[token.NEWLINE])
        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                         tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual(':=', token.COLONEQUAL)
        self.assertExactTypeEqual('...', token.ELLIPSIS)
        self.assertExactTypeEqual('->', token.RARROW)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@ ', token.AT)


class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')

    def test_iter_compat(self):
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to both 5- and 2-tuples.
        Both sequences are converted back to source code via
        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
        The test fails if the 3 pair tokenizations do not match.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces.  A proper test should test this.
        """
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also\n")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace after the colon. Note that we use hex escapes to make the
        # two trailing blanks apparent in the expected output.

        self.check_roundtrip("if x == 1 : \n"
                             "  print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else:   print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions are
        # broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            if support.verbose >= 2:
                print('tokenize', testfile)
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)


    def roundtrip(self, code):
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    unittest.main()