• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from test import support
2from test.support import os_helper
3from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
4                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
5                     open as tokenize_open, Untokenizer, generate_tokens,
6                     NEWLINE)
7from io import BytesIO, StringIO
8import unittest
9from unittest import TestCase, mock
10from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
11                               INVALID_UNDERSCORE_LITERALS)
12import os
13import token
14
15
16# Converts a source string into a list of textual representation
17# of the tokens such as:
18# `    NAME       'if'          (1, 0) (1, 2)`
19# to make writing tests easier.
20def stringify_tokens_from_source(token_generator, source_string):
21    result = []
22    num_lines = len(source_string.splitlines())
23    missing_trailing_nl = source_string[-1] not in '\r\n'
24
25    for type, token, start, end, line in token_generator:
26        if type == ENDMARKER:
27            break
28        # Ignore the new line on the last line if the input lacks one
29        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
30            continue
31        type = tok_name[type]
32        result.append(f"    {type:10} {token!r:13} {start} {end}")
33
34    return result
35
36class TokenizeTest(TestCase):
37    # Tests for the tokenize module.
38
39    # The tests can be really simple. Given a small fragment of source
40    # code, print out a table with tokens. The ENDMARKER, ENCODING and
41    # final NEWLINE are omitted for brevity.
42
43    def check_tokenize(self, s, expected):
44        # Format the tokens in s in a table format.
45        # The ENDMARKER and final NEWLINE are omitted.
46        f = BytesIO(s.encode('utf-8'))
47        result = stringify_tokens_from_source(tokenize(f.readline), s)
48
49        self.assertEqual(result,
50                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
51                         expected.rstrip().splitlines())
52
53    def test_implicit_newline(self):
54        # Make sure that the tokenizer puts in an implicit NEWLINE
55        # when the input lacks a trailing new line.
56        f = BytesIO("x".encode('utf-8'))
57        tokens = list(tokenize(f.readline))
58        self.assertEqual(tokens[-2].type, NEWLINE)
59        self.assertEqual(tokens[-1].type, ENDMARKER)
60
61    def test_basic(self):
62        self.check_tokenize("1 + 1", """\
63    NUMBER     '1'           (1, 0) (1, 1)
64    OP         '+'           (1, 2) (1, 3)
65    NUMBER     '1'           (1, 4) (1, 5)
66    """)
67        self.check_tokenize("if False:\n"
68                            "    # NL\n"
69                            "    \n"
70                            "    True = False # NEWLINE\n", """\
71    NAME       'if'          (1, 0) (1, 2)
72    NAME       'False'       (1, 3) (1, 8)
73    OP         ':'           (1, 8) (1, 9)
74    NEWLINE    '\\n'          (1, 9) (1, 10)
75    COMMENT    '# NL'        (2, 4) (2, 8)
76    NL         '\\n'          (2, 8) (2, 9)
77    NL         '\\n'          (3, 4) (3, 5)
78    INDENT     '    '        (4, 0) (4, 4)
79    NAME       'True'        (4, 4) (4, 8)
80    OP         '='           (4, 9) (4, 10)
81    NAME       'False'       (4, 11) (4, 16)
82    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
83    NEWLINE    '\\n'          (4, 26) (4, 27)
84    DEDENT     ''            (5, 0) (5, 0)
85    """)
86        indent_error_file = b"""\
87def k(x):
88    x += 2
89  x += 5
90"""
91        readline = BytesIO(indent_error_file).readline
92        with self.assertRaisesRegex(IndentationError,
93                                    "unindent does not match any "
94                                    "outer indentation level"):
95            for tok in tokenize(readline):
96                pass
97
98    def test_int(self):
99        # Ordinary integers and binary operators
100        self.check_tokenize("0xff <= 255", """\
101    NUMBER     '0xff'        (1, 0) (1, 4)
102    OP         '<='          (1, 5) (1, 7)
103    NUMBER     '255'         (1, 8) (1, 11)
104    """)
105        self.check_tokenize("0b10 <= 255", """\
106    NUMBER     '0b10'        (1, 0) (1, 4)
107    OP         '<='          (1, 5) (1, 7)
108    NUMBER     '255'         (1, 8) (1, 11)
109    """)
110        self.check_tokenize("0o123 <= 0O123", """\
111    NUMBER     '0o123'       (1, 0) (1, 5)
112    OP         '<='          (1, 6) (1, 8)
113    NUMBER     '0O123'       (1, 9) (1, 14)
114    """)
115        self.check_tokenize("1234567 > ~0x15", """\
116    NUMBER     '1234567'     (1, 0) (1, 7)
117    OP         '>'           (1, 8) (1, 9)
118    OP         '~'           (1, 10) (1, 11)
119    NUMBER     '0x15'        (1, 11) (1, 15)
120    """)
121        self.check_tokenize("2134568 != 1231515", """\
122    NUMBER     '2134568'     (1, 0) (1, 7)
123    OP         '!='          (1, 8) (1, 10)
124    NUMBER     '1231515'     (1, 11) (1, 18)
125    """)
126        self.check_tokenize("(-124561-1) & 200000000", """\
127    OP         '('           (1, 0) (1, 1)
128    OP         '-'           (1, 1) (1, 2)
129    NUMBER     '124561'      (1, 2) (1, 8)
130    OP         '-'           (1, 8) (1, 9)
131    NUMBER     '1'           (1, 9) (1, 10)
132    OP         ')'           (1, 10) (1, 11)
133    OP         '&'           (1, 12) (1, 13)
134    NUMBER     '200000000'   (1, 14) (1, 23)
135    """)
136        self.check_tokenize("0xdeadbeef != -1", """\
137    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
138    OP         '!='          (1, 11) (1, 13)
139    OP         '-'           (1, 14) (1, 15)
140    NUMBER     '1'           (1, 15) (1, 16)
141    """)
142        self.check_tokenize("0xdeadc0de & 12345", """\
143    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
144    OP         '&'           (1, 11) (1, 12)
145    NUMBER     '12345'       (1, 13) (1, 18)
146    """)
147        self.check_tokenize("0xFF & 0x15 | 1234", """\
148    NUMBER     '0xFF'        (1, 0) (1, 4)
149    OP         '&'           (1, 5) (1, 6)
150    NUMBER     '0x15'        (1, 7) (1, 11)
151    OP         '|'           (1, 12) (1, 13)
152    NUMBER     '1234'        (1, 14) (1, 18)
153    """)
154
155    def test_long(self):
156        # Long integers
157        self.check_tokenize("x = 0", """\
158    NAME       'x'           (1, 0) (1, 1)
159    OP         '='           (1, 2) (1, 3)
160    NUMBER     '0'           (1, 4) (1, 5)
161    """)
162        self.check_tokenize("x = 0xfffffffffff", """\
163    NAME       'x'           (1, 0) (1, 1)
164    OP         '='           (1, 2) (1, 3)
165    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
166    """)
167        self.check_tokenize("x = 123141242151251616110", """\
168    NAME       'x'           (1, 0) (1, 1)
169    OP         '='           (1, 2) (1, 3)
170    NUMBER     '123141242151251616110' (1, 4) (1, 25)
171    """)
172        self.check_tokenize("x = -15921590215012591", """\
173    NAME       'x'           (1, 0) (1, 1)
174    OP         '='           (1, 2) (1, 3)
175    OP         '-'           (1, 4) (1, 5)
176    NUMBER     '15921590215012591' (1, 5) (1, 22)
177    """)
178
179    def test_float(self):
180        # Floating point numbers
181        self.check_tokenize("x = 3.14159", """\
182    NAME       'x'           (1, 0) (1, 1)
183    OP         '='           (1, 2) (1, 3)
184    NUMBER     '3.14159'     (1, 4) (1, 11)
185    """)
186        self.check_tokenize("x = 314159.", """\
187    NAME       'x'           (1, 0) (1, 1)
188    OP         '='           (1, 2) (1, 3)
189    NUMBER     '314159.'     (1, 4) (1, 11)
190    """)
191        self.check_tokenize("x = .314159", """\
192    NAME       'x'           (1, 0) (1, 1)
193    OP         '='           (1, 2) (1, 3)
194    NUMBER     '.314159'     (1, 4) (1, 11)
195    """)
196        self.check_tokenize("x = 3e14159", """\
197    NAME       'x'           (1, 0) (1, 1)
198    OP         '='           (1, 2) (1, 3)
199    NUMBER     '3e14159'     (1, 4) (1, 11)
200    """)
201        self.check_tokenize("x = 3E123", """\
202    NAME       'x'           (1, 0) (1, 1)
203    OP         '='           (1, 2) (1, 3)
204    NUMBER     '3E123'       (1, 4) (1, 9)
205    """)
206        self.check_tokenize("x+y = 3e-1230", """\
207    NAME       'x'           (1, 0) (1, 1)
208    OP         '+'           (1, 1) (1, 2)
209    NAME       'y'           (1, 2) (1, 3)
210    OP         '='           (1, 4) (1, 5)
211    NUMBER     '3e-1230'     (1, 6) (1, 13)
212    """)
213        self.check_tokenize("x = 3.14e159", """\
214    NAME       'x'           (1, 0) (1, 1)
215    OP         '='           (1, 2) (1, 3)
216    NUMBER     '3.14e159'    (1, 4) (1, 12)
217    """)
218
219    def test_underscore_literals(self):
220        def number_token(s):
221            f = BytesIO(s.encode('utf-8'))
222            for toktype, token, start, end, line in tokenize(f.readline):
223                if toktype == NUMBER:
224                    return token
225            return 'invalid token'
226        for lit in VALID_UNDERSCORE_LITERALS:
227            if '(' in lit:
228                # this won't work with compound complex inputs
229                continue
230            self.assertEqual(number_token(lit), lit)
231        for lit in INVALID_UNDERSCORE_LITERALS:
232            self.assertNotEqual(number_token(lit), lit)
233
234    def test_string(self):
235        # String literals
236        self.check_tokenize("x = ''; y = \"\"", """\
237    NAME       'x'           (1, 0) (1, 1)
238    OP         '='           (1, 2) (1, 3)
239    STRING     "''"          (1, 4) (1, 6)
240    OP         ';'           (1, 6) (1, 7)
241    NAME       'y'           (1, 8) (1, 9)
242    OP         '='           (1, 10) (1, 11)
243    STRING     '""'          (1, 12) (1, 14)
244    """)
245        self.check_tokenize("x = '\"'; y = \"'\"", """\
246    NAME       'x'           (1, 0) (1, 1)
247    OP         '='           (1, 2) (1, 3)
248    STRING     '\\'"\\''       (1, 4) (1, 7)
249    OP         ';'           (1, 7) (1, 8)
250    NAME       'y'           (1, 9) (1, 10)
251    OP         '='           (1, 11) (1, 12)
252    STRING     '"\\'"'        (1, 13) (1, 16)
253    """)
254        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
255    NAME       'x'           (1, 0) (1, 1)
256    OP         '='           (1, 2) (1, 3)
257    STRING     '"doesn\\'t "' (1, 4) (1, 14)
258    NAME       'shrink'      (1, 14) (1, 20)
259    STRING     '", does it"' (1, 20) (1, 31)
260    """)
261        self.check_tokenize("x = 'abc' + 'ABC'", """\
262    NAME       'x'           (1, 0) (1, 1)
263    OP         '='           (1, 2) (1, 3)
264    STRING     "'abc'"       (1, 4) (1, 9)
265    OP         '+'           (1, 10) (1, 11)
266    STRING     "'ABC'"       (1, 12) (1, 17)
267    """)
268        self.check_tokenize('y = "ABC" + "ABC"', """\
269    NAME       'y'           (1, 0) (1, 1)
270    OP         '='           (1, 2) (1, 3)
271    STRING     '"ABC"'       (1, 4) (1, 9)
272    OP         '+'           (1, 10) (1, 11)
273    STRING     '"ABC"'       (1, 12) (1, 17)
274    """)
275        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
276    NAME       'x'           (1, 0) (1, 1)
277    OP         '='           (1, 2) (1, 3)
278    STRING     "r'abc'"      (1, 4) (1, 10)
279    OP         '+'           (1, 11) (1, 12)
280    STRING     "r'ABC'"      (1, 13) (1, 19)
281    OP         '+'           (1, 20) (1, 21)
282    STRING     "R'ABC'"      (1, 22) (1, 28)
283    OP         '+'           (1, 29) (1, 30)
284    STRING     "R'ABC'"      (1, 31) (1, 37)
285    """)
286        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
287    NAME       'y'           (1, 0) (1, 1)
288    OP         '='           (1, 2) (1, 3)
289    STRING     'r"abc"'      (1, 4) (1, 10)
290    OP         '+'           (1, 11) (1, 12)
291    STRING     'r"ABC"'      (1, 13) (1, 19)
292    OP         '+'           (1, 20) (1, 21)
293    STRING     'R"ABC"'      (1, 22) (1, 28)
294    OP         '+'           (1, 29) (1, 30)
295    STRING     'R"ABC"'      (1, 31) (1, 37)
296    """)
297
298        self.check_tokenize("u'abc' + U'abc'", """\
299    STRING     "u'abc'"      (1, 0) (1, 6)
300    OP         '+'           (1, 7) (1, 8)
301    STRING     "U'abc'"      (1, 9) (1, 15)
302    """)
303        self.check_tokenize('u"abc" + U"abc"', """\
304    STRING     'u"abc"'      (1, 0) (1, 6)
305    OP         '+'           (1, 7) (1, 8)
306    STRING     'U"abc"'      (1, 9) (1, 15)
307    """)
308
309        self.check_tokenize("b'abc' + B'abc'", """\
310    STRING     "b'abc'"      (1, 0) (1, 6)
311    OP         '+'           (1, 7) (1, 8)
312    STRING     "B'abc'"      (1, 9) (1, 15)
313    """)
314        self.check_tokenize('b"abc" + B"abc"', """\
315    STRING     'b"abc"'      (1, 0) (1, 6)
316    OP         '+'           (1, 7) (1, 8)
317    STRING     'B"abc"'      (1, 9) (1, 15)
318    """)
319        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
320    STRING     "br'abc'"     (1, 0) (1, 7)
321    OP         '+'           (1, 8) (1, 9)
322    STRING     "bR'abc'"     (1, 10) (1, 17)
323    OP         '+'           (1, 18) (1, 19)
324    STRING     "Br'abc'"     (1, 20) (1, 27)
325    OP         '+'           (1, 28) (1, 29)
326    STRING     "BR'abc'"     (1, 30) (1, 37)
327    """)
328        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
329    STRING     'br"abc"'     (1, 0) (1, 7)
330    OP         '+'           (1, 8) (1, 9)
331    STRING     'bR"abc"'     (1, 10) (1, 17)
332    OP         '+'           (1, 18) (1, 19)
333    STRING     'Br"abc"'     (1, 20) (1, 27)
334    OP         '+'           (1, 28) (1, 29)
335    STRING     'BR"abc"'     (1, 30) (1, 37)
336    """)
337        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
338    STRING     "rb'abc'"     (1, 0) (1, 7)
339    OP         '+'           (1, 8) (1, 9)
340    STRING     "rB'abc'"     (1, 10) (1, 17)
341    OP         '+'           (1, 18) (1, 19)
342    STRING     "Rb'abc'"     (1, 20) (1, 27)
343    OP         '+'           (1, 28) (1, 29)
344    STRING     "RB'abc'"     (1, 30) (1, 37)
345    """)
346        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
347    STRING     'rb"abc"'     (1, 0) (1, 7)
348    OP         '+'           (1, 8) (1, 9)
349    STRING     'rB"abc"'     (1, 10) (1, 17)
350    OP         '+'           (1, 18) (1, 19)
351    STRING     'Rb"abc"'     (1, 20) (1, 27)
352    OP         '+'           (1, 28) (1, 29)
353    STRING     'RB"abc"'     (1, 30) (1, 37)
354    """)
355        # Check 0, 1, and 2 character string prefixes.
356        self.check_tokenize(r'"a\
357de\
358fg"', """\
359    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
360    """)
361        self.check_tokenize(r'u"a\
362de"', """\
363    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
364    """)
365        self.check_tokenize(r'rb"a\
366d"', """\
367    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
368    """)
369        self.check_tokenize(r'"""a\
370b"""', """\
371    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
372    """)
373        self.check_tokenize(r'u"""a\
374b"""', """\
375    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
376    """)
377        self.check_tokenize(r'rb"""a\
378b\
379c"""', """\
380    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
381    """)
382        self.check_tokenize('f"abc"', """\
383    STRING     'f"abc"'      (1, 0) (1, 6)
384    """)
385        self.check_tokenize('fR"a{b}c"', """\
386    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
387    """)
388        self.check_tokenize('f"""abc"""', """\
389    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
390    """)
391        self.check_tokenize(r'f"abc\
392def"', """\
393    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
394    """)
395        self.check_tokenize(r'Rf"abc\
396def"', """\
397    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
398    """)
399
400    def test_function(self):
401        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
402    NAME       'def'         (1, 0) (1, 3)
403    NAME       'd22'         (1, 4) (1, 7)
404    OP         '('           (1, 7) (1, 8)
405    NAME       'a'           (1, 8) (1, 9)
406    OP         ','           (1, 9) (1, 10)
407    NAME       'b'           (1, 11) (1, 12)
408    OP         ','           (1, 12) (1, 13)
409    NAME       'c'           (1, 14) (1, 15)
410    OP         '='           (1, 15) (1, 16)
411    NUMBER     '2'           (1, 16) (1, 17)
412    OP         ','           (1, 17) (1, 18)
413    NAME       'd'           (1, 19) (1, 20)
414    OP         '='           (1, 20) (1, 21)
415    NUMBER     '2'           (1, 21) (1, 22)
416    OP         ','           (1, 22) (1, 23)
417    OP         '*'           (1, 24) (1, 25)
418    NAME       'k'           (1, 25) (1, 26)
419    OP         ')'           (1, 26) (1, 27)
420    OP         ':'           (1, 27) (1, 28)
421    NAME       'pass'        (1, 29) (1, 33)
422    """)
423        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
424    NAME       'def'         (1, 0) (1, 3)
425    NAME       'd01v_'       (1, 4) (1, 9)
426    OP         '('           (1, 9) (1, 10)
427    NAME       'a'           (1, 10) (1, 11)
428    OP         '='           (1, 11) (1, 12)
429    NUMBER     '1'           (1, 12) (1, 13)
430    OP         ','           (1, 13) (1, 14)
431    OP         '*'           (1, 15) (1, 16)
432    NAME       'k'           (1, 16) (1, 17)
433    OP         ','           (1, 17) (1, 18)
434    OP         '**'          (1, 19) (1, 21)
435    NAME       'w'           (1, 21) (1, 22)
436    OP         ')'           (1, 22) (1, 23)
437    OP         ':'           (1, 23) (1, 24)
438    NAME       'pass'        (1, 25) (1, 29)
439    """)
440        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
441    NAME       'def'         (1, 0) (1, 3)
442    NAME       'd23'         (1, 4) (1, 7)
443    OP         '('           (1, 7) (1, 8)
444    NAME       'a'           (1, 8) (1, 9)
445    OP         ':'           (1, 9) (1, 10)
446    NAME       'str'         (1, 11) (1, 14)
447    OP         ','           (1, 14) (1, 15)
448    NAME       'b'           (1, 16) (1, 17)
449    OP         ':'           (1, 17) (1, 18)
450    NAME       'int'         (1, 19) (1, 22)
451    OP         '='           (1, 22) (1, 23)
452    NUMBER     '3'           (1, 23) (1, 24)
453    OP         ')'           (1, 24) (1, 25)
454    OP         '->'          (1, 26) (1, 28)
455    NAME       'int'         (1, 29) (1, 32)
456    OP         ':'           (1, 32) (1, 33)
457    NAME       'pass'        (1, 34) (1, 38)
458    """)
459
460    def test_comparison(self):
461        # Comparison
462        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
463                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
464    NAME       'if'          (1, 0) (1, 2)
465    NUMBER     '1'           (1, 3) (1, 4)
466    OP         '<'           (1, 5) (1, 6)
467    NUMBER     '1'           (1, 7) (1, 8)
468    OP         '>'           (1, 9) (1, 10)
469    NUMBER     '1'           (1, 11) (1, 12)
470    OP         '=='          (1, 13) (1, 15)
471    NUMBER     '1'           (1, 16) (1, 17)
472    OP         '>='          (1, 18) (1, 20)
473    NUMBER     '5'           (1, 21) (1, 22)
474    OP         '<='          (1, 23) (1, 25)
475    NUMBER     '0x15'        (1, 26) (1, 30)
476    OP         '<='          (1, 31) (1, 33)
477    NUMBER     '0x12'        (1, 34) (1, 38)
478    OP         '!='          (1, 39) (1, 41)
479    NUMBER     '1'           (1, 42) (1, 43)
480    NAME       'and'         (1, 44) (1, 47)
481    NUMBER     '5'           (1, 48) (1, 49)
482    NAME       'in'          (1, 50) (1, 52)
483    NUMBER     '1'           (1, 53) (1, 54)
484    NAME       'not'         (1, 55) (1, 58)
485    NAME       'in'          (1, 59) (1, 61)
486    NUMBER     '1'           (1, 62) (1, 63)
487    NAME       'is'          (1, 64) (1, 66)
488    NUMBER     '1'           (1, 67) (1, 68)
489    NAME       'or'          (1, 69) (1, 71)
490    NUMBER     '5'           (1, 72) (1, 73)
491    NAME       'is'          (1, 74) (1, 76)
492    NAME       'not'         (1, 77) (1, 80)
493    NUMBER     '1'           (1, 81) (1, 82)
494    OP         ':'           (1, 82) (1, 83)
495    NAME       'pass'        (1, 84) (1, 88)
496    """)
497
498    def test_shift(self):
499        # Shift
500        self.check_tokenize("x = 1 << 1 >> 5", """\
501    NAME       'x'           (1, 0) (1, 1)
502    OP         '='           (1, 2) (1, 3)
503    NUMBER     '1'           (1, 4) (1, 5)
504    OP         '<<'          (1, 6) (1, 8)
505    NUMBER     '1'           (1, 9) (1, 10)
506    OP         '>>'          (1, 11) (1, 13)
507    NUMBER     '5'           (1, 14) (1, 15)
508    """)
509
510    def test_additive(self):
511        # Additive
512        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
513    NAME       'x'           (1, 0) (1, 1)
514    OP         '='           (1, 2) (1, 3)
515    NUMBER     '1'           (1, 4) (1, 5)
516    OP         '-'           (1, 6) (1, 7)
517    NAME       'y'           (1, 8) (1, 9)
518    OP         '+'           (1, 10) (1, 11)
519    NUMBER     '15'          (1, 12) (1, 14)
520    OP         '-'           (1, 15) (1, 16)
521    NUMBER     '1'           (1, 17) (1, 18)
522    OP         '+'           (1, 19) (1, 20)
523    NUMBER     '0x124'       (1, 21) (1, 26)
524    OP         '+'           (1, 27) (1, 28)
525    NAME       'z'           (1, 29) (1, 30)
526    OP         '+'           (1, 31) (1, 32)
527    NAME       'a'           (1, 33) (1, 34)
528    OP         '['           (1, 34) (1, 35)
529    NUMBER     '5'           (1, 35) (1, 36)
530    OP         ']'           (1, 36) (1, 37)
531    """)
532
533    def test_multiplicative(self):
534        # Multiplicative
535        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
536    NAME       'x'           (1, 0) (1, 1)
537    OP         '='           (1, 2) (1, 3)
538    NUMBER     '1'           (1, 4) (1, 5)
539    OP         '//'          (1, 5) (1, 7)
540    NUMBER     '1'           (1, 7) (1, 8)
541    OP         '*'           (1, 8) (1, 9)
542    NUMBER     '1'           (1, 9) (1, 10)
543    OP         '/'           (1, 10) (1, 11)
544    NUMBER     '5'           (1, 11) (1, 12)
545    OP         '*'           (1, 12) (1, 13)
546    NUMBER     '12'          (1, 13) (1, 15)
547    OP         '%'           (1, 15) (1, 16)
548    NUMBER     '0x12'        (1, 16) (1, 20)
549    OP         '@'           (1, 20) (1, 21)
550    NUMBER     '42'          (1, 21) (1, 23)
551    """)
552
553    def test_unary(self):
554        # Unary
555        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
556    OP         '~'           (1, 0) (1, 1)
557    NUMBER     '1'           (1, 1) (1, 2)
558    OP         '^'           (1, 3) (1, 4)
559    NUMBER     '1'           (1, 5) (1, 6)
560    OP         '&'           (1, 7) (1, 8)
561    NUMBER     '1'           (1, 9) (1, 10)
562    OP         '|'           (1, 11) (1, 12)
563    NUMBER     '1'           (1, 12) (1, 13)
564    OP         '^'           (1, 14) (1, 15)
565    OP         '-'           (1, 16) (1, 17)
566    NUMBER     '1'           (1, 17) (1, 18)
567    """)
568        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
569    OP         '-'           (1, 0) (1, 1)
570    NUMBER     '1'           (1, 1) (1, 2)
571    OP         '*'           (1, 2) (1, 3)
572    NUMBER     '1'           (1, 3) (1, 4)
573    OP         '/'           (1, 4) (1, 5)
574    NUMBER     '1'           (1, 5) (1, 6)
575    OP         '+'           (1, 6) (1, 7)
576    NUMBER     '1'           (1, 7) (1, 8)
577    OP         '*'           (1, 8) (1, 9)
578    NUMBER     '1'           (1, 9) (1, 10)
579    OP         '//'          (1, 10) (1, 12)
580    NUMBER     '1'           (1, 12) (1, 13)
581    OP         '-'           (1, 14) (1, 15)
582    OP         '-'           (1, 16) (1, 17)
583    OP         '-'           (1, 17) (1, 18)
584    OP         '-'           (1, 18) (1, 19)
585    NUMBER     '1'           (1, 19) (1, 20)
586    OP         '**'          (1, 20) (1, 22)
587    NUMBER     '1'           (1, 22) (1, 23)
588    """)
589
590    def test_selector(self):
591        # Selector
592        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
593    NAME       'import'      (1, 0) (1, 6)
594    NAME       'sys'         (1, 7) (1, 10)
595    OP         ','           (1, 10) (1, 11)
596    NAME       'time'        (1, 12) (1, 16)
597    NEWLINE    '\\n'          (1, 16) (1, 17)
598    NAME       'x'           (2, 0) (2, 1)
599    OP         '='           (2, 2) (2, 3)
600    NAME       'sys'         (2, 4) (2, 7)
601    OP         '.'           (2, 7) (2, 8)
602    NAME       'modules'     (2, 8) (2, 15)
603    OP         '['           (2, 15) (2, 16)
604    STRING     "'time'"      (2, 16) (2, 22)
605    OP         ']'           (2, 22) (2, 23)
606    OP         '.'           (2, 23) (2, 24)
607    NAME       'time'        (2, 24) (2, 28)
608    OP         '('           (2, 28) (2, 29)
609    OP         ')'           (2, 29) (2, 30)
610    """)
611
612    def test_method(self):
613        # Methods
614        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
615    OP         '@'           (1, 0) (1, 1)
616    NAME       'staticmethod' (1, 1) (1, 13)
617    NEWLINE    '\\n'          (1, 13) (1, 14)
618    NAME       'def'         (2, 0) (2, 3)
619    NAME       'foo'         (2, 4) (2, 7)
620    OP         '('           (2, 7) (2, 8)
621    NAME       'x'           (2, 8) (2, 9)
622    OP         ','           (2, 9) (2, 10)
623    NAME       'y'           (2, 10) (2, 11)
624    OP         ')'           (2, 11) (2, 12)
625    OP         ':'           (2, 12) (2, 13)
626    NAME       'pass'        (2, 14) (2, 18)
627    """)
628
629    def test_tabs(self):
630        # Evil tabs
631        self.check_tokenize("def f():\n"
632                            "\tif x\n"
633                            "        \tpass", """\
634    NAME       'def'         (1, 0) (1, 3)
635    NAME       'f'           (1, 4) (1, 5)
636    OP         '('           (1, 5) (1, 6)
637    OP         ')'           (1, 6) (1, 7)
638    OP         ':'           (1, 7) (1, 8)
639    NEWLINE    '\\n'          (1, 8) (1, 9)
640    INDENT     '\\t'          (2, 0) (2, 1)
641    NAME       'if'          (2, 1) (2, 3)
642    NAME       'x'           (2, 4) (2, 5)
643    NEWLINE    '\\n'          (2, 5) (2, 6)
644    INDENT     '        \\t'  (3, 0) (3, 9)
645    NAME       'pass'        (3, 9) (3, 13)
646    DEDENT     ''            (4, 0) (4, 0)
647    DEDENT     ''            (4, 0) (4, 0)
648    """)
649
650    def test_non_ascii_identifiers(self):
651        # Non-ascii identifiers
652        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
653    NAME       'Örter'       (1, 0) (1, 5)
654    OP         '='           (1, 6) (1, 7)
655    STRING     "'places'"    (1, 8) (1, 16)
656    NEWLINE    '\\n'          (1, 16) (1, 17)
657    NAME       'grün'        (2, 0) (2, 4)
658    OP         '='           (2, 5) (2, 6)
659    STRING     "'green'"     (2, 7) (2, 14)
660    """)
661
662    def test_unicode(self):
663        # Legacy unicode literals:
664        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
665    NAME       'Örter'       (1, 0) (1, 5)
666    OP         '='           (1, 6) (1, 7)
667    STRING     "u'places'"   (1, 8) (1, 17)
668    NEWLINE    '\\n'          (1, 17) (1, 18)
669    NAME       'grün'        (2, 0) (2, 4)
670    OP         '='           (2, 5) (2, 6)
671    STRING     "U'green'"    (2, 7) (2, 15)
672    """)
673
674    def test_async(self):
675        # Async/await extension:
676        self.check_tokenize("async = 1", """\
677    NAME       'async'       (1, 0) (1, 5)
678    OP         '='           (1, 6) (1, 7)
679    NUMBER     '1'           (1, 8) (1, 9)
680    """)
681
682        self.check_tokenize("a = (async = 1)", """\
683    NAME       'a'           (1, 0) (1, 1)
684    OP         '='           (1, 2) (1, 3)
685    OP         '('           (1, 4) (1, 5)
686    NAME       'async'       (1, 5) (1, 10)
687    OP         '='           (1, 11) (1, 12)
688    NUMBER     '1'           (1, 13) (1, 14)
689    OP         ')'           (1, 14) (1, 15)
690    """)
691
692        self.check_tokenize("async()", """\
693    NAME       'async'       (1, 0) (1, 5)
694    OP         '('           (1, 5) (1, 6)
695    OP         ')'           (1, 6) (1, 7)
696    """)
697
698        self.check_tokenize("class async(Bar):pass", """\
699    NAME       'class'       (1, 0) (1, 5)
700    NAME       'async'       (1, 6) (1, 11)
701    OP         '('           (1, 11) (1, 12)
702    NAME       'Bar'         (1, 12) (1, 15)
703    OP         ')'           (1, 15) (1, 16)
704    OP         ':'           (1, 16) (1, 17)
705    NAME       'pass'        (1, 17) (1, 21)
706    """)
707
708        self.check_tokenize("class async:pass", """\
709    NAME       'class'       (1, 0) (1, 5)
710    NAME       'async'       (1, 6) (1, 11)
711    OP         ':'           (1, 11) (1, 12)
712    NAME       'pass'        (1, 12) (1, 16)
713    """)
714
715        self.check_tokenize("await = 1", """\
716    NAME       'await'       (1, 0) (1, 5)
717    OP         '='           (1, 6) (1, 7)
718    NUMBER     '1'           (1, 8) (1, 9)
719    """)
720
721        self.check_tokenize("foo.async", """\
722    NAME       'foo'         (1, 0) (1, 3)
723    OP         '.'           (1, 3) (1, 4)
724    NAME       'async'       (1, 4) (1, 9)
725    """)
726
727        self.check_tokenize("async for a in b: pass", """\
728    NAME       'async'       (1, 0) (1, 5)
729    NAME       'for'         (1, 6) (1, 9)
730    NAME       'a'           (1, 10) (1, 11)
731    NAME       'in'          (1, 12) (1, 14)
732    NAME       'b'           (1, 15) (1, 16)
733    OP         ':'           (1, 16) (1, 17)
734    NAME       'pass'        (1, 18) (1, 22)
735    """)
736
737        self.check_tokenize("async with a as b: pass", """\
738    NAME       'async'       (1, 0) (1, 5)
739    NAME       'with'        (1, 6) (1, 10)
740    NAME       'a'           (1, 11) (1, 12)
741    NAME       'as'          (1, 13) (1, 15)
742    NAME       'b'           (1, 16) (1, 17)
743    OP         ':'           (1, 17) (1, 18)
744    NAME       'pass'        (1, 19) (1, 23)
745    """)
746
747        self.check_tokenize("async.foo", """\
748    NAME       'async'       (1, 0) (1, 5)
749    OP         '.'           (1, 5) (1, 6)
750    NAME       'foo'         (1, 6) (1, 9)
751    """)
752
753        self.check_tokenize("async", """\
754    NAME       'async'       (1, 0) (1, 5)
755    """)
756
757        self.check_tokenize("async\n#comment\nawait", """\
758    NAME       'async'       (1, 0) (1, 5)
759    NEWLINE    '\\n'          (1, 5) (1, 6)
760    COMMENT    '#comment'    (2, 0) (2, 8)
761    NL         '\\n'          (2, 8) (2, 9)
762    NAME       'await'       (3, 0) (3, 5)
763    """)
764
765        self.check_tokenize("async\n...\nawait", """\
766    NAME       'async'       (1, 0) (1, 5)
767    NEWLINE    '\\n'          (1, 5) (1, 6)
768    OP         '...'         (2, 0) (2, 3)
769    NEWLINE    '\\n'          (2, 3) (2, 4)
770    NAME       'await'       (3, 0) (3, 5)
771    """)
772
773        self.check_tokenize("async\nawait", """\
774    NAME       'async'       (1, 0) (1, 5)
775    NEWLINE    '\\n'          (1, 5) (1, 6)
776    NAME       'await'       (2, 0) (2, 5)
777    """)
778
779        self.check_tokenize("foo.async + 1", """\
780    NAME       'foo'         (1, 0) (1, 3)
781    OP         '.'           (1, 3) (1, 4)
782    NAME       'async'       (1, 4) (1, 9)
783    OP         '+'           (1, 10) (1, 11)
784    NUMBER     '1'           (1, 12) (1, 13)
785    """)
786
787        self.check_tokenize("async def foo(): pass", """\
788    NAME       'async'       (1, 0) (1, 5)
789    NAME       'def'         (1, 6) (1, 9)
790    NAME       'foo'         (1, 10) (1, 13)
791    OP         '('           (1, 13) (1, 14)
792    OP         ')'           (1, 14) (1, 15)
793    OP         ':'           (1, 15) (1, 16)
794    NAME       'pass'        (1, 17) (1, 21)
795    """)
796
797        self.check_tokenize('''\
798async def foo():
799  def foo(await):
800    await = 1
801  if 1:
802    await
803async += 1
804''', """\
805    NAME       'async'       (1, 0) (1, 5)
806    NAME       'def'         (1, 6) (1, 9)
807    NAME       'foo'         (1, 10) (1, 13)
808    OP         '('           (1, 13) (1, 14)
809    OP         ')'           (1, 14) (1, 15)
810    OP         ':'           (1, 15) (1, 16)
811    NEWLINE    '\\n'          (1, 16) (1, 17)
812    INDENT     '  '          (2, 0) (2, 2)
813    NAME       'def'         (2, 2) (2, 5)
814    NAME       'foo'         (2, 6) (2, 9)
815    OP         '('           (2, 9) (2, 10)
816    NAME       'await'       (2, 10) (2, 15)
817    OP         ')'           (2, 15) (2, 16)
818    OP         ':'           (2, 16) (2, 17)
819    NEWLINE    '\\n'          (2, 17) (2, 18)
820    INDENT     '    '        (3, 0) (3, 4)
821    NAME       'await'       (3, 4) (3, 9)
822    OP         '='           (3, 10) (3, 11)
823    NUMBER     '1'           (3, 12) (3, 13)
824    NEWLINE    '\\n'          (3, 13) (3, 14)
825    DEDENT     ''            (4, 2) (4, 2)
826    NAME       'if'          (4, 2) (4, 4)
827    NUMBER     '1'           (4, 5) (4, 6)
828    OP         ':'           (4, 6) (4, 7)
829    NEWLINE    '\\n'          (4, 7) (4, 8)
830    INDENT     '    '        (5, 0) (5, 4)
831    NAME       'await'       (5, 4) (5, 9)
832    NEWLINE    '\\n'          (5, 9) (5, 10)
833    DEDENT     ''            (6, 0) (6, 0)
834    DEDENT     ''            (6, 0) (6, 0)
835    NAME       'async'       (6, 0) (6, 5)
836    OP         '+='          (6, 6) (6, 8)
837    NUMBER     '1'           (6, 9) (6, 10)
838    NEWLINE    '\\n'          (6, 10) (6, 11)
839    """)
840
841        self.check_tokenize('''\
842async def foo():
843  async for i in 1: pass''', """\
844    NAME       'async'       (1, 0) (1, 5)
845    NAME       'def'         (1, 6) (1, 9)
846    NAME       'foo'         (1, 10) (1, 13)
847    OP         '('           (1, 13) (1, 14)
848    OP         ')'           (1, 14) (1, 15)
849    OP         ':'           (1, 15) (1, 16)
850    NEWLINE    '\\n'          (1, 16) (1, 17)
851    INDENT     '  '          (2, 0) (2, 2)
852    NAME       'async'       (2, 2) (2, 7)
853    NAME       'for'         (2, 8) (2, 11)
854    NAME       'i'           (2, 12) (2, 13)
855    NAME       'in'          (2, 14) (2, 16)
856    NUMBER     '1'           (2, 17) (2, 18)
857    OP         ':'           (2, 18) (2, 19)
858    NAME       'pass'        (2, 20) (2, 24)
859    DEDENT     ''            (3, 0) (3, 0)
860    """)
861
862        self.check_tokenize('''async def foo(async): await''', """\
863    NAME       'async'       (1, 0) (1, 5)
864    NAME       'def'         (1, 6) (1, 9)
865    NAME       'foo'         (1, 10) (1, 13)
866    OP         '('           (1, 13) (1, 14)
867    NAME       'async'       (1, 14) (1, 19)
868    OP         ')'           (1, 19) (1, 20)
869    OP         ':'           (1, 20) (1, 21)
870    NAME       'await'       (1, 22) (1, 27)
871    """)
872
873        self.check_tokenize('''\
874def f():
875
876  def baz(): pass
877  async def bar(): pass
878
879  await = 2''', """\
880    NAME       'def'         (1, 0) (1, 3)
881    NAME       'f'           (1, 4) (1, 5)
882    OP         '('           (1, 5) (1, 6)
883    OP         ')'           (1, 6) (1, 7)
884    OP         ':'           (1, 7) (1, 8)
885    NEWLINE    '\\n'          (1, 8) (1, 9)
886    NL         '\\n'          (2, 0) (2, 1)
887    INDENT     '  '          (3, 0) (3, 2)
888    NAME       'def'         (3, 2) (3, 5)
889    NAME       'baz'         (3, 6) (3, 9)
890    OP         '('           (3, 9) (3, 10)
891    OP         ')'           (3, 10) (3, 11)
892    OP         ':'           (3, 11) (3, 12)
893    NAME       'pass'        (3, 13) (3, 17)
894    NEWLINE    '\\n'          (3, 17) (3, 18)
895    NAME       'async'       (4, 2) (4, 7)
896    NAME       'def'         (4, 8) (4, 11)
897    NAME       'bar'         (4, 12) (4, 15)
898    OP         '('           (4, 15) (4, 16)
899    OP         ')'           (4, 16) (4, 17)
900    OP         ':'           (4, 17) (4, 18)
901    NAME       'pass'        (4, 19) (4, 23)
902    NEWLINE    '\\n'          (4, 23) (4, 24)
903    NL         '\\n'          (5, 0) (5, 1)
904    NAME       'await'       (6, 2) (6, 7)
905    OP         '='           (6, 8) (6, 9)
906    NUMBER     '2'           (6, 10) (6, 11)
907    DEDENT     ''            (7, 0) (7, 0)
908    """)
909
910        self.check_tokenize('''\
911async def f():
912
913  def baz(): pass
914  async def bar(): pass
915
916  await = 2''', """\
917    NAME       'async'       (1, 0) (1, 5)
918    NAME       'def'         (1, 6) (1, 9)
919    NAME       'f'           (1, 10) (1, 11)
920    OP         '('           (1, 11) (1, 12)
921    OP         ')'           (1, 12) (1, 13)
922    OP         ':'           (1, 13) (1, 14)
923    NEWLINE    '\\n'          (1, 14) (1, 15)
924    NL         '\\n'          (2, 0) (2, 1)
925    INDENT     '  '          (3, 0) (3, 2)
926    NAME       'def'         (3, 2) (3, 5)
927    NAME       'baz'         (3, 6) (3, 9)
928    OP         '('           (3, 9) (3, 10)
929    OP         ')'           (3, 10) (3, 11)
930    OP         ':'           (3, 11) (3, 12)
931    NAME       'pass'        (3, 13) (3, 17)
932    NEWLINE    '\\n'          (3, 17) (3, 18)
933    NAME       'async'       (4, 2) (4, 7)
934    NAME       'def'         (4, 8) (4, 11)
935    NAME       'bar'         (4, 12) (4, 15)
936    OP         '('           (4, 15) (4, 16)
937    OP         ')'           (4, 16) (4, 17)
938    OP         ':'           (4, 17) (4, 18)
939    NAME       'pass'        (4, 19) (4, 23)
940    NEWLINE    '\\n'          (4, 23) (4, 24)
941    NL         '\\n'          (5, 0) (5, 1)
942    NAME       'await'       (6, 2) (6, 7)
943    OP         '='           (6, 8) (6, 9)
944    NUMBER     '2'           (6, 10) (6, 11)
945    DEDENT     ''            (7, 0) (7, 0)
946    """)
947
948class GenerateTokensTest(TokenizeTest):
949    def check_tokenize(self, s, expected):
950        # Format the tokens in s in a table format.
951        # The ENDMARKER and final NEWLINE are omitted.
952        f = StringIO(s)
953        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
954        self.assertEqual(result, expected.rstrip().splitlines())
955
956
957def decistmt(s):
958    result = []
959    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
960    for toknum, tokval, _, _, _  in g:
961        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
962            result.extend([
963                (NAME, 'Decimal'),
964                (OP, '('),
965                (STRING, repr(tokval)),
966                (OP, ')')
967            ])
968        else:
969            result.append((toknum, tokval))
970    return untokenize(result).decode('utf-8')
971
972class TestMisc(TestCase):
973
974    def test_decistmt(self):
975        # Substitute Decimals for floats in a string of statements.
976        # This is an example from the docs.
977
978        from decimal import Decimal
979        s = '+21.3e-5*-.1234/81.7'
980        self.assertEqual(decistmt(s),
981                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
982
983        # The format of the exponent is inherited from the platform C library.
984        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
985        # we're only showing 11 digits, and the 12th isn't close to 5, the
986        # rest of the output should be platform-independent.
987        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
988
989        # Output from calculations with Decimal should be identical across all
990        # platforms.
991        self.assertEqual(eval(decistmt(s)),
992                         Decimal('-3.217160342717258261933904529E-7'))
993
994
995class TestTokenizerAdheresToPep0263(TestCase):
996    """
997    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
998    """
999
1000    def _testFile(self, filename):
1001        path = os.path.join(os.path.dirname(__file__), filename)
1002        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
1003
1004    def test_utf8_coding_cookie_and_no_utf8_bom(self):
1005        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
1006        self._testFile(f)
1007
1008    def test_latin1_coding_cookie_and_utf8_bom(self):
1009        """
1010        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
1011        allowed encoding for the comment is 'utf-8'.  The text file used in
1012        this test starts with a BOM signature, but specifies latin1 as the
1013        coding, so verify that a SyntaxError is raised, which matches the
1014        behaviour of the interpreter when it encounters a similar condition.
1015        """
1016        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
1017        self.assertRaises(SyntaxError, self._testFile, f)
1018
1019    def test_no_coding_cookie_and_utf8_bom(self):
1020        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
1021        self._testFile(f)
1022
1023    def test_utf8_coding_cookie_and_utf8_bom(self):
1024        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
1025        self._testFile(f)
1026
1027    def test_bad_coding_cookie(self):
1028        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
1029        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
1030
1031
1032class Test_Tokenize(TestCase):
1033
1034    def test__tokenize_decodes_with_specified_encoding(self):
1035        literal = '"ЉЊЈЁЂ"'
1036        line = literal.encode('utf-8')
1037        first = False
1038        def readline():
1039            nonlocal first
1040            if not first:
1041                first = True
1042                return line
1043            else:
1044                return b''
1045
1046        # skip the initial encoding token and the end tokens
1047        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
1048        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1049        self.assertEqual(tokens, expected_tokens,
1050                         "bytes not decoded with encoding")
1051
1052    def test__tokenize_does_not_decode_with_encoding_none(self):
1053        literal = '"ЉЊЈЁЂ"'
1054        first = False
1055        def readline():
1056            nonlocal first
1057            if not first:
1058                first = True
1059                return literal
1060            else:
1061                return b''
1062
1063        # skip the end tokens
1064        tokens = list(_tokenize(readline, encoding=None))[:-2]
1065        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1066        self.assertEqual(tokens, expected_tokens,
1067                         "string not tokenized when encoding is None")
1068
1069
1070class TestDetectEncoding(TestCase):
1071
1072    def get_readline(self, lines):
1073        index = 0
1074        def readline():
1075            nonlocal index
1076            if index == len(lines):
1077                raise StopIteration
1078            line = lines[index]
1079            index += 1
1080            return line
1081        return readline
1082
1083    def test_no_bom_no_encoding_cookie(self):
1084        lines = (
1085            b'# something\n',
1086            b'print(something)\n',
1087            b'do_something(else)\n'
1088        )
1089        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1090        self.assertEqual(encoding, 'utf-8')
1091        self.assertEqual(consumed_lines, list(lines[:2]))
1092
1093    def test_bom_no_cookie(self):
1094        lines = (
1095            b'\xef\xbb\xbf# something\n',
1096            b'print(something)\n',
1097            b'do_something(else)\n'
1098        )
1099        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1100        self.assertEqual(encoding, 'utf-8-sig')
1101        self.assertEqual(consumed_lines,
1102                         [b'# something\n', b'print(something)\n'])
1103
1104    def test_cookie_first_line_no_bom(self):
1105        lines = (
1106            b'# -*- coding: latin-1 -*-\n',
1107            b'print(something)\n',
1108            b'do_something(else)\n'
1109        )
1110        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1111        self.assertEqual(encoding, 'iso-8859-1')
1112        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
1113
1114    def test_matched_bom_and_cookie_first_line(self):
1115        lines = (
1116            b'\xef\xbb\xbf# coding=utf-8\n',
1117            b'print(something)\n',
1118            b'do_something(else)\n'
1119        )
1120        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1121        self.assertEqual(encoding, 'utf-8-sig')
1122        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
1123
1124    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1125        lines = (
1126            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1127            b'print(something)\n',
1128            b'do_something(else)\n'
1129        )
1130        readline = self.get_readline(lines)
1131        self.assertRaises(SyntaxError, detect_encoding, readline)
1132
1133    def test_cookie_second_line_no_bom(self):
1134        lines = (
1135            b'#! something\n',
1136            b'# vim: set fileencoding=ascii :\n',
1137            b'print(something)\n',
1138            b'do_something(else)\n'
1139        )
1140        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1141        self.assertEqual(encoding, 'ascii')
1142        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
1143        self.assertEqual(consumed_lines, expected)
1144
1145    def test_matched_bom_and_cookie_second_line(self):
1146        lines = (
1147            b'\xef\xbb\xbf#! something\n',
1148            b'f# coding=utf-8\n',
1149            b'print(something)\n',
1150            b'do_something(else)\n'
1151        )
1152        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1153        self.assertEqual(encoding, 'utf-8-sig')
1154        self.assertEqual(consumed_lines,
1155                         [b'#! something\n', b'f# coding=utf-8\n'])
1156
1157    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1158        lines = (
1159            b'\xef\xbb\xbf#! something\n',
1160            b'# vim: set fileencoding=ascii :\n',
1161            b'print(something)\n',
1162            b'do_something(else)\n'
1163        )
1164        readline = self.get_readline(lines)
1165        self.assertRaises(SyntaxError, detect_encoding, readline)
1166
1167    def test_cookie_second_line_noncommented_first_line(self):
1168        lines = (
1169            b"print('\xc2\xa3')\n",
1170            b'# vim: set fileencoding=iso8859-15 :\n',
1171            b"print('\xe2\x82\xac')\n"
1172        )
1173        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1174        self.assertEqual(encoding, 'utf-8')
1175        expected = [b"print('\xc2\xa3')\n"]
1176        self.assertEqual(consumed_lines, expected)
1177
1178    def test_cookie_second_line_commented_first_line(self):
1179        lines = (
1180            b"#print('\xc2\xa3')\n",
1181            b'# vim: set fileencoding=iso8859-15 :\n',
1182            b"print('\xe2\x82\xac')\n"
1183        )
1184        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1185        self.assertEqual(encoding, 'iso8859-15')
1186        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1187        self.assertEqual(consumed_lines, expected)
1188
1189    def test_cookie_second_line_empty_first_line(self):
1190        lines = (
1191            b'\n',
1192            b'# vim: set fileencoding=iso8859-15 :\n',
1193            b"print('\xe2\x82\xac')\n"
1194        )
1195        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1196        self.assertEqual(encoding, 'iso8859-15')
1197        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1198        self.assertEqual(consumed_lines, expected)
1199
1200    def test_latin1_normalization(self):
1201        # See get_normal_name() in tokenizer.c.
1202        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1203                     "iso-8859-1-unix", "iso-latin-1-mac")
1204        for encoding in encodings:
1205            for rep in ("-", "_"):
1206                enc = encoding.replace("-", rep)
1207                lines = (b"#!/usr/bin/python\n",
1208                         b"# coding: " + enc.encode("ascii") + b"\n",
1209                         b"print(things)\n",
1210                         b"do_something += 4\n")
1211                rl = self.get_readline(lines)
1212                found, consumed_lines = detect_encoding(rl)
1213                self.assertEqual(found, "iso-8859-1")
1214
1215    def test_syntaxerror_latin1(self):
1216        # Issue 14629: need to raise SyntaxError if the first
1217        # line(s) have non-UTF-8 characters
1218        lines = (
1219            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1220            )
1221        readline = self.get_readline(lines)
1222        self.assertRaises(SyntaxError, detect_encoding, readline)
1223
1224
1225    def test_utf8_normalization(self):
1226        # See get_normal_name() in tokenizer.c.
1227        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1228        for encoding in encodings:
1229            for rep in ("-", "_"):
1230                enc = encoding.replace("-", rep)
1231                lines = (b"#!/usr/bin/python\n",
1232                         b"# coding: " + enc.encode("ascii") + b"\n",
1233                         b"1 + 3\n")
1234                rl = self.get_readline(lines)
1235                found, consumed_lines = detect_encoding(rl)
1236                self.assertEqual(found, "utf-8")
1237
1238    def test_short_files(self):
1239        readline = self.get_readline((b'print(something)\n',))
1240        encoding, consumed_lines = detect_encoding(readline)
1241        self.assertEqual(encoding, 'utf-8')
1242        self.assertEqual(consumed_lines, [b'print(something)\n'])
1243
1244        encoding, consumed_lines = detect_encoding(self.get_readline(()))
1245        self.assertEqual(encoding, 'utf-8')
1246        self.assertEqual(consumed_lines, [])
1247
1248        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1249        encoding, consumed_lines = detect_encoding(readline)
1250        self.assertEqual(encoding, 'utf-8-sig')
1251        self.assertEqual(consumed_lines, [b'print(something)\n'])
1252
1253        readline = self.get_readline((b'\xef\xbb\xbf',))
1254        encoding, consumed_lines = detect_encoding(readline)
1255        self.assertEqual(encoding, 'utf-8-sig')
1256        self.assertEqual(consumed_lines, [])
1257
1258        readline = self.get_readline((b'# coding: bad\n',))
1259        self.assertRaises(SyntaxError, detect_encoding, readline)
1260
1261    def test_false_encoding(self):
1262        # Issue 18873: "Encoding" detected in non-comment lines
1263        readline = self.get_readline((b'print("#coding=fake")',))
1264        encoding, consumed_lines = detect_encoding(readline)
1265        self.assertEqual(encoding, 'utf-8')
1266        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1267
1268    def test_open(self):
1269        filename = os_helper.TESTFN + '.py'
1270        self.addCleanup(os_helper.unlink, filename)
1271
1272        # test coding cookie
1273        for encoding in ('iso-8859-15', 'utf-8'):
1274            with open(filename, 'w', encoding=encoding) as fp:
1275                print("# coding: %s" % encoding, file=fp)
1276                print("print('euro:\u20ac')", file=fp)
1277            with tokenize_open(filename) as fp:
1278                self.assertEqual(fp.encoding, encoding)
1279                self.assertEqual(fp.mode, 'r')
1280
1281        # test BOM (no coding cookie)
1282        with open(filename, 'w', encoding='utf-8-sig') as fp:
1283            print("print('euro:\u20ac')", file=fp)
1284        with tokenize_open(filename) as fp:
1285            self.assertEqual(fp.encoding, 'utf-8-sig')
1286            self.assertEqual(fp.mode, 'r')
1287
1288    def test_filename_in_exception(self):
1289        # When possible, include the file name in the exception.
1290        path = 'some_file_path'
1291        lines = (
1292            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1293            )
1294        class Bunk:
1295            def __init__(self, lines, path):
1296                self.name = path
1297                self._lines = lines
1298                self._index = 0
1299
1300            def readline(self):
1301                if self._index == len(lines):
1302                    raise StopIteration
1303                line = lines[self._index]
1304                self._index += 1
1305                return line
1306
1307        with self.assertRaises(SyntaxError):
1308            ins = Bunk(lines, path)
1309            # Make sure lacking a name isn't an issue.
1310            del ins.name
1311            detect_encoding(ins.readline)
1312        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1313            ins = Bunk(lines, path)
1314            detect_encoding(ins.readline)
1315
1316    def test_open_error(self):
1317        # Issue #23840: open() must close the binary file on error
1318        m = BytesIO(b'#coding:xxx')
1319        with mock.patch('tokenize._builtin_open', return_value=m):
1320            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1321        self.assertTrue(m.closed)
1322
1323
1324class TestTokenize(TestCase):
1325
1326    def test_tokenize(self):
1327        import tokenize as tokenize_module
1328        encoding = object()
1329        encoding_used = None
1330        def mock_detect_encoding(readline):
1331            return encoding, [b'first', b'second']
1332
1333        def mock__tokenize(readline, encoding):
1334            nonlocal encoding_used
1335            encoding_used = encoding
1336            out = []
1337            while True:
1338                next_line = readline()
1339                if next_line:
1340                    out.append(next_line)
1341                    continue
1342                return out
1343
1344        counter = 0
1345        def mock_readline():
1346            nonlocal counter
1347            counter += 1
1348            if counter == 5:
1349                return b''
1350            return str(counter).encode()
1351
1352        orig_detect_encoding = tokenize_module.detect_encoding
1353        orig__tokenize = tokenize_module._tokenize
1354        tokenize_module.detect_encoding = mock_detect_encoding
1355        tokenize_module._tokenize = mock__tokenize
1356        try:
1357            results = tokenize(mock_readline)
1358            self.assertEqual(list(results),
1359                             [b'first', b'second', b'1', b'2', b'3', b'4'])
1360        finally:
1361            tokenize_module.detect_encoding = orig_detect_encoding
1362            tokenize_module._tokenize = orig__tokenize
1363
1364        self.assertEqual(encoding_used, encoding)
1365
1366    def test_oneline_defs(self):
1367        buf = []
1368        for i in range(500):
1369            buf.append('def i{i}(): return {i}'.format(i=i))
1370        buf.append('OK')
1371        buf = '\n'.join(buf)
1372
1373        # Test that 500 consequent, one-line defs is OK
        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
                                                # [-2] is always NEWLINE

    def assertExactTypeEqual(self, opstr, *optypes):
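        # Tokenizing `opstr` should yield exactly
        #     ENCODING, *optypes, NEWLINE, ENDMARKER
        # hence the 3 + num_optypes length check; each operator token is then
        # compared by its exact_type.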
        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
        num_optypes = len(optypes)
        self.assertEqual(len(tokens), 3 + num_optypes)
        self.assertEqual(tok_name[tokens[0].exact_type],
                         tok_name[ENCODING])
        for i in range(num_optypes):
            self.assertEqual(tok_name[tokens[i + 1].exact_type],
                             tok_name[optypes[i]])
        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
                         tok_name[token.NEWLINE])
        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
                         tok_name[token.ENDMARKER])

    def test_exact_type(self):
        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
        self.assertExactTypeEqual(':', token.COLON)
        self.assertExactTypeEqual(',', token.COMMA)
        self.assertExactTypeEqual(';', token.SEMI)
        self.assertExactTypeEqual('+', token.PLUS)
        self.assertExactTypeEqual('-', token.MINUS)
        self.assertExactTypeEqual('*', token.STAR)
        self.assertExactTypeEqual('/', token.SLASH)
        self.assertExactTypeEqual('|', token.VBAR)
        self.assertExactTypeEqual('&', token.AMPER)
        self.assertExactTypeEqual('<', token.LESS)
        self.assertExactTypeEqual('>', token.GREATER)
        self.assertExactTypeEqual('=', token.EQUAL)
        self.assertExactTypeEqual('.', token.DOT)
        self.assertExactTypeEqual('%', token.PERCENT)
        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
        self.assertExactTypeEqual('==', token.EQEQUAL)
        self.assertExactTypeEqual('!=', token.NOTEQUAL)
        self.assertExactTypeEqual('<=', token.LESSEQUAL)
        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
        self.assertExactTypeEqual('~', token.TILDE)
        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
        self.assertExactTypeEqual('**', token.DOUBLESTAR)
        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
        self.assertExactTypeEqual('-=', token.MINEQUAL)
        self.assertExactTypeEqual('*=', token.STAREQUAL)
        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
        self.assertExactTypeEqual('|=', token.VBAREQUAL)
        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
        self.assertExactTypeEqual('//', token.DOUBLESLASH)
        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
        self.assertExactTypeEqual(':=', token.COLONEQUAL)
        self.assertExactTypeEqual('...', token.ELLIPSIS)
        self.assertExactTypeEqual('->', token.RARROW)
        self.assertExactTypeEqual('@', token.AT)
        self.assertExactTypeEqual('@=', token.ATEQUAL)

        self.assertExactTypeEqual('a**2+b**2==c**2',
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.PLUS,
                                  NAME, token.DOUBLESTAR, NUMBER,
                                  token.EQEQUAL,
                                  NAME, token.DOUBLESTAR, NUMBER)
        self.assertExactTypeEqual('{1, 2, 3}',
                                  token.LBRACE,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER, token.COMMA,
                                  token.NUMBER,
                                  token.RBRACE)
        self.assertExactTypeEqual('^(x & 0x1)',
                                  token.CIRCUMFLEX,
                                  token.LPAR,
                                  token.NAME, token.AMPER, token.NUMBER,
                                  token.RPAR)

    def test_pathological_trailing_whitespace(self):
        # See http://bugs.python.org/issue16152
        self.assertExactTypeEqual('@          ', token.AT)

    def test_comment_at_the_end_of_the_source_without_newline(self):
        # See http://bugs.python.org/issue44667
        source = 'b = 1\n\n#test'
        expected_tokens = [token.NAME, token.EQUAL, token.NUMBER, token.NEWLINE, token.NL, token.COMMENT]

        tokens = list(tokenize(BytesIO(source.encode('utf-8')).readline))
        self.assertEqual(tok_name[tokens[0].exact_type], tok_name[ENCODING])
        for i in range(6):
            self.assertEqual(tok_name[tokens[i + 1].exact_type], tok_name[expected_tokens[i]])
        self.assertEqual(tok_name[tokens[-1].exact_type], tok_name[token.ENDMARKER])

class UntokenizeTest(TestCase):

    def test_bad_input_order(self):
        # raise if previous row
        u = Untokenizer()
        u.prev_row = 2
        u.prev_col = 2
        with self.assertRaises(ValueError) as cm:
            u.add_whitespace((1,3))
        self.assertEqual(cm.exception.args[0],
                'start (1,3) precedes previous end (2,2)')
        # raise if previous column in row
        self.assertRaises(ValueError, u.add_whitespace, (2,1))

    def test_backslash_continuation(self):
        # The problem is that <whitespace>\<newline> leaves no token
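        # add_whitespace() can only bridge a row gap by emitting explicit
        # '\\\n' line continuations, one per skipped row, before padding the
        # column with spaces; the assertions below check exactly that.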
        u = Untokenizer()
        u.prev_row = 1
        u.prev_col = 1
        u.tokens = []
        u.add_whitespace((2, 0))
        self.assertEqual(u.tokens, ['\\\n'])
        u.prev_row = 2
        u.add_whitespace((4, 4))
        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')

    def test_iter_compat(self):
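        # compat() works from bare (type, string) pairs, with no positions to
        # go on; it pads NAME/NUMBER tokens with a trailing space, which is
        # why 'Hello' comes back as 'Hello ' in every case below.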
        u = Untokenizer()
        token = (NAME, 'Hello')
        tokens = [(ENCODING, 'utf-8'), token]
        u.compat(token, iter([]))
        self.assertEqual(u.tokens, ["Hello "])
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
        u = Untokenizer()
        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
        self.assertEqual(u.encoding, 'utf-8')
        self.assertEqual(untokenize(iter(tokens)), b'Hello ')


class TestRoundtrip(TestCase):

    def check_roundtrip(self, f):
        """
        Test roundtrip for `untokenize`. `f` is an open file or a string.
        The source code in f is tokenized to 5-tuples, from which the
        (type, string) 2-tuples are also derived.  Both sequences are
        converted back to source code via tokenize.untokenize(), and the
        results are tokenized again into 2-tuples.  The test fails if the
        three 2-tuple tokenizations do not match.

        When untokenize bugs are fixed, untokenize with 5-tuples should
        reproduce code that does not contain a backslash continuation
        following spaces.  A proper test should test this.
        """
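        # For reference, the 5-tuples are (type, string, start, end, line),
        # e.g. (NAME, 'x', (1, 0), (1, 1), 'x = 1\n'); the 2-tuple form keeps
        # only (type, string), which forces untokenize() into its
        # position-free compat mode.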
        # Get source code and original tokenizations
        if isinstance(f, str):
            code = f.encode('utf-8')
        else:
            code = f.read()
            f.close()
        readline = iter(code.splitlines(keepends=True)).__next__
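        # tokenize() expects a readline-style callable, so fake one that
        # yields the encoded source line by line.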
        tokens5 = list(tokenize(readline))
        tokens2 = [tok[:2] for tok in tokens5]
        # Reproduce tokens2 from pairs
        bytes_from2 = untokenize(tokens2)
        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
        self.assertEqual(tokens2_from2, tokens2)
        # Reproduce tokens2 from 5-tuples
        bytes_from5 = untokenize(tokens5)
        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
        self.assertEqual(tokens2_from5, tokens2)

    def test_roundtrip(self):
        # There are some standard formatting practices that are easy to get right.

        self.check_roundtrip("if x == 1:\n"
                             "    print(x)\n")
        self.check_roundtrip("# This is a comment\n"
                             "# This also\n")

        # Some people use different formatting conventions, which makes
        # untokenize a little trickier. Note that this test involves trailing
        # whitespace around the colon, which is easy to overlook in the
        # source string below.

        self.check_roundtrip("if x == 1 : \n"
                             "  print(x)\n")
        fn = support.findfile("tokenize_tests.txt")
        with open(fn, 'rb') as f:
            self.check_roundtrip(f)
        self.check_roundtrip("if x == 1:\n"
                             "    # A comment by itself.\n"
                             "    print(x) # Comment here, too.\n"
                             "    # Another comment.\n"
                             "after_if = True\n")
        self.check_roundtrip("if (x # The comments need to go in the right place\n"
                             "    == 1):\n"
                             "    print('x==1')\n")
        self.check_roundtrip("class Test: # A comment here\n"
                             "  # A comment with weird indent\n"
                             "  after_com = 5\n"
                             "  def x(m): return m*5 # a one liner\n"
                             "  def y(m): # A whitespace after the colon\n"
                             "     return y*4 # 3-space indent\n")

        # Some error-handling code
        self.check_roundtrip("try: import somemodule\n"
                             "except ImportError: # comment\n"
                             "    print('Can not import' # comment2\n)"
                             "else:   print('Loaded')\n")

    def test_continuation(self):
        # Balancing continuation
        self.check_roundtrip("a = (3,4, \n"
                             "5,6)\n"
                             "y = [3, 4,\n"
                             "5]\n"
                             "z = {'a': 5,\n"
                             "'b':15, 'c':True}\n"
                             "x = len(y) + 5 - a[\n"
                             "3] - a[2]\n"
                             "+ len(z) - z[\n"
                             "'b']\n")

    def test_backslash_continuation(self):
        # Backslash means line continuation, except for comments
        self.check_roundtrip("x=1+\\\n"
                             "1\n"
                             "# This is a comment\\\n"
                             "# This also\n")
        self.check_roundtrip("# Comment \\\n"
                             "x = 0")

    def test_string_concatenation(self):
        # Two string literals on the same line
        self.check_roundtrip("'' ''")

    def test_random_files(self):
        # Test roundtrip on random python modules.
        # pass the '-ucpu' option to process the full directory.

        import glob, random
        fn = support.findfile("tokenize_tests.txt")
        tempdir = os.path.dirname(fn) or os.curdir
        testfiles = glob.glob(os.path.join(glob.escape(tempdir), "test*.py"))

        # Tokenize is broken on test_pep3131.py because regular expressions are
        # broken on the obscure unicode identifiers in it. *sigh*
        # With roundtrip extended to test the 5-tuple mode of untokenize,
        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.

        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
            testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)

        if not support.is_resource_enabled("cpu"):
            testfiles = random.sample(testfiles, 10)

        for testfile in testfiles:
            if support.verbose >= 2:
                print('tokenize', testfile)
            with open(testfile, 'rb') as f:
                with self.subTest(file=testfile):
                    self.check_roundtrip(f)


    def roundtrip(self, code):
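        # Helper: tokenize `code` and immediately untokenize it, returning
        # the regenerated source as text for line-by-line comparison.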
        if isinstance(code, str):
            code = code.encode('utf-8')
        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')

    def test_indentation_semantics_retained(self):
        """
        Ensure that although whitespace might be mutated in a roundtrip,
        the semantic meaning of the indentation remains consistent.
        """
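        # The roundtrip may rewrite the '\t' indentation used below, but both
        # body lines must come back with identical leading whitespace so that
        # the block structure is preserved.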
        code = "if False:\n\tx=3\n\tx=3\n"
        codelines = self.roundtrip(code).split('\n')
        self.assertEqual(codelines[1], codelines[2])
        self.check_roundtrip(code)


if __name__ == "__main__":
    unittest.main()