• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1from test import support
2from tokenize import (tokenize, _tokenize, untokenize, NUMBER, NAME, OP,
3                     STRING, ENDMARKER, ENCODING, tok_name, detect_encoding,
4                     open as tokenize_open, Untokenizer, generate_tokens,
5                     NEWLINE)
6from io import BytesIO, StringIO
7import unittest
8from unittest import TestCase, mock
9from test.test_grammar import (VALID_UNDERSCORE_LITERALS,
10                               INVALID_UNDERSCORE_LITERALS)
11import os
12import token
13
14
15# Converts a source string into a list of textual representation
16# of the tokens such as:
17# `    NAME       'if'          (1, 0) (1, 2)`
18# to make writing tests easier.
19def stringify_tokens_from_source(token_generator, source_string):
20    result = []
21    num_lines = len(source_string.splitlines())
22    missing_trailing_nl = source_string[-1] not in '\r\n'
23
24    for type, token, start, end, line in token_generator:
25        if type == ENDMARKER:
26            break
27        # Ignore the new line on the last line if the input lacks one
28        if missing_trailing_nl and type == NEWLINE and end[0] == num_lines:
29            continue
30        type = tok_name[type]
31        result.append(f"    {type:10} {token!r:13} {start} {end}")
32
33    return result
34
35class TokenizeTest(TestCase):
36    # Tests for the tokenize module.
37
38    # The tests can be really simple. Given a small fragment of source
39    # code, print out a table with tokens. The ENDMARKER, ENCODING and
40    # final NEWLINE are omitted for brevity.
41
42    def check_tokenize(self, s, expected):
43        # Format the tokens in s in a table format.
44        # The ENDMARKER and final NEWLINE are omitted.
45        f = BytesIO(s.encode('utf-8'))
46        result = stringify_tokens_from_source(tokenize(f.readline), s)
47
48        self.assertEqual(result,
49                         ["    ENCODING   'utf-8'       (0, 0) (0, 0)"] +
50                         expected.rstrip().splitlines())
51
52    def test_implicit_newline(self):
53        # Make sure that the tokenizer puts in an implicit NEWLINE
54        # when the input lacks a trailing new line.
55        f = BytesIO("x".encode('utf-8'))
56        tokens = list(tokenize(f.readline))
57        self.assertEqual(tokens[-2].type, NEWLINE)
58        self.assertEqual(tokens[-1].type, ENDMARKER)
59
60    def test_basic(self):
61        self.check_tokenize("1 + 1", """\
62    NUMBER     '1'           (1, 0) (1, 1)
63    OP         '+'           (1, 2) (1, 3)
64    NUMBER     '1'           (1, 4) (1, 5)
65    """)
66        self.check_tokenize("if False:\n"
67                            "    # NL\n"
68                            "    \n"
69                            "    True = False # NEWLINE\n", """\
70    NAME       'if'          (1, 0) (1, 2)
71    NAME       'False'       (1, 3) (1, 8)
72    OP         ':'           (1, 8) (1, 9)
73    NEWLINE    '\\n'          (1, 9) (1, 10)
74    COMMENT    '# NL'        (2, 4) (2, 8)
75    NL         '\\n'          (2, 8) (2, 9)
76    NL         '\\n'          (3, 4) (3, 5)
77    INDENT     '    '        (4, 0) (4, 4)
78    NAME       'True'        (4, 4) (4, 8)
79    OP         '='           (4, 9) (4, 10)
80    NAME       'False'       (4, 11) (4, 16)
81    COMMENT    '# NEWLINE'   (4, 17) (4, 26)
82    NEWLINE    '\\n'          (4, 26) (4, 27)
83    DEDENT     ''            (5, 0) (5, 0)
84    """)
85        indent_error_file = b"""\
86def k(x):
87    x += 2
88  x += 5
89"""
90        readline = BytesIO(indent_error_file).readline
91        with self.assertRaisesRegex(IndentationError,
92                                    "unindent does not match any "
93                                    "outer indentation level"):
94            for tok in tokenize(readline):
95                pass
96
97    def test_int(self):
98        # Ordinary integers and binary operators
99        self.check_tokenize("0xff <= 255", """\
100    NUMBER     '0xff'        (1, 0) (1, 4)
101    OP         '<='          (1, 5) (1, 7)
102    NUMBER     '255'         (1, 8) (1, 11)
103    """)
104        self.check_tokenize("0b10 <= 255", """\
105    NUMBER     '0b10'        (1, 0) (1, 4)
106    OP         '<='          (1, 5) (1, 7)
107    NUMBER     '255'         (1, 8) (1, 11)
108    """)
109        self.check_tokenize("0o123 <= 0O123", """\
110    NUMBER     '0o123'       (1, 0) (1, 5)
111    OP         '<='          (1, 6) (1, 8)
112    NUMBER     '0O123'       (1, 9) (1, 14)
113    """)
114        self.check_tokenize("1234567 > ~0x15", """\
115    NUMBER     '1234567'     (1, 0) (1, 7)
116    OP         '>'           (1, 8) (1, 9)
117    OP         '~'           (1, 10) (1, 11)
118    NUMBER     '0x15'        (1, 11) (1, 15)
119    """)
120        self.check_tokenize("2134568 != 1231515", """\
121    NUMBER     '2134568'     (1, 0) (1, 7)
122    OP         '!='          (1, 8) (1, 10)
123    NUMBER     '1231515'     (1, 11) (1, 18)
124    """)
125        self.check_tokenize("(-124561-1) & 200000000", """\
126    OP         '('           (1, 0) (1, 1)
127    OP         '-'           (1, 1) (1, 2)
128    NUMBER     '124561'      (1, 2) (1, 8)
129    OP         '-'           (1, 8) (1, 9)
130    NUMBER     '1'           (1, 9) (1, 10)
131    OP         ')'           (1, 10) (1, 11)
132    OP         '&'           (1, 12) (1, 13)
133    NUMBER     '200000000'   (1, 14) (1, 23)
134    """)
135        self.check_tokenize("0xdeadbeef != -1", """\
136    NUMBER     '0xdeadbeef'  (1, 0) (1, 10)
137    OP         '!='          (1, 11) (1, 13)
138    OP         '-'           (1, 14) (1, 15)
139    NUMBER     '1'           (1, 15) (1, 16)
140    """)
141        self.check_tokenize("0xdeadc0de & 12345", """\
142    NUMBER     '0xdeadc0de'  (1, 0) (1, 10)
143    OP         '&'           (1, 11) (1, 12)
144    NUMBER     '12345'       (1, 13) (1, 18)
145    """)
146        self.check_tokenize("0xFF & 0x15 | 1234", """\
147    NUMBER     '0xFF'        (1, 0) (1, 4)
148    OP         '&'           (1, 5) (1, 6)
149    NUMBER     '0x15'        (1, 7) (1, 11)
150    OP         '|'           (1, 12) (1, 13)
151    NUMBER     '1234'        (1, 14) (1, 18)
152    """)
153
154    def test_long(self):
155        # Long integers
156        self.check_tokenize("x = 0", """\
157    NAME       'x'           (1, 0) (1, 1)
158    OP         '='           (1, 2) (1, 3)
159    NUMBER     '0'           (1, 4) (1, 5)
160    """)
161        self.check_tokenize("x = 0xfffffffffff", """\
162    NAME       'x'           (1, 0) (1, 1)
163    OP         '='           (1, 2) (1, 3)
164    NUMBER     '0xfffffffffff' (1, 4) (1, 17)
165    """)
166        self.check_tokenize("x = 123141242151251616110", """\
167    NAME       'x'           (1, 0) (1, 1)
168    OP         '='           (1, 2) (1, 3)
169    NUMBER     '123141242151251616110' (1, 4) (1, 25)
170    """)
171        self.check_tokenize("x = -15921590215012591", """\
172    NAME       'x'           (1, 0) (1, 1)
173    OP         '='           (1, 2) (1, 3)
174    OP         '-'           (1, 4) (1, 5)
175    NUMBER     '15921590215012591' (1, 5) (1, 22)
176    """)
177
178    def test_float(self):
179        # Floating point numbers
180        self.check_tokenize("x = 3.14159", """\
181    NAME       'x'           (1, 0) (1, 1)
182    OP         '='           (1, 2) (1, 3)
183    NUMBER     '3.14159'     (1, 4) (1, 11)
184    """)
185        self.check_tokenize("x = 314159.", """\
186    NAME       'x'           (1, 0) (1, 1)
187    OP         '='           (1, 2) (1, 3)
188    NUMBER     '314159.'     (1, 4) (1, 11)
189    """)
190        self.check_tokenize("x = .314159", """\
191    NAME       'x'           (1, 0) (1, 1)
192    OP         '='           (1, 2) (1, 3)
193    NUMBER     '.314159'     (1, 4) (1, 11)
194    """)
195        self.check_tokenize("x = 3e14159", """\
196    NAME       'x'           (1, 0) (1, 1)
197    OP         '='           (1, 2) (1, 3)
198    NUMBER     '3e14159'     (1, 4) (1, 11)
199    """)
200        self.check_tokenize("x = 3E123", """\
201    NAME       'x'           (1, 0) (1, 1)
202    OP         '='           (1, 2) (1, 3)
203    NUMBER     '3E123'       (1, 4) (1, 9)
204    """)
205        self.check_tokenize("x+y = 3e-1230", """\
206    NAME       'x'           (1, 0) (1, 1)
207    OP         '+'           (1, 1) (1, 2)
208    NAME       'y'           (1, 2) (1, 3)
209    OP         '='           (1, 4) (1, 5)
210    NUMBER     '3e-1230'     (1, 6) (1, 13)
211    """)
212        self.check_tokenize("x = 3.14e159", """\
213    NAME       'x'           (1, 0) (1, 1)
214    OP         '='           (1, 2) (1, 3)
215    NUMBER     '3.14e159'    (1, 4) (1, 12)
216    """)
217
218    def test_underscore_literals(self):
219        def number_token(s):
220            f = BytesIO(s.encode('utf-8'))
221            for toktype, token, start, end, line in tokenize(f.readline):
222                if toktype == NUMBER:
223                    return token
224            return 'invalid token'
225        for lit in VALID_UNDERSCORE_LITERALS:
226            if '(' in lit:
227                # this won't work with compound complex inputs
228                continue
229            self.assertEqual(number_token(lit), lit)
230        for lit in INVALID_UNDERSCORE_LITERALS:
231            self.assertNotEqual(number_token(lit), lit)
232
233    def test_string(self):
234        # String literals
235        self.check_tokenize("x = ''; y = \"\"", """\
236    NAME       'x'           (1, 0) (1, 1)
237    OP         '='           (1, 2) (1, 3)
238    STRING     "''"          (1, 4) (1, 6)
239    OP         ';'           (1, 6) (1, 7)
240    NAME       'y'           (1, 8) (1, 9)
241    OP         '='           (1, 10) (1, 11)
242    STRING     '""'          (1, 12) (1, 14)
243    """)
244        self.check_tokenize("x = '\"'; y = \"'\"", """\
245    NAME       'x'           (1, 0) (1, 1)
246    OP         '='           (1, 2) (1, 3)
247    STRING     '\\'"\\''       (1, 4) (1, 7)
248    OP         ';'           (1, 7) (1, 8)
249    NAME       'y'           (1, 9) (1, 10)
250    OP         '='           (1, 11) (1, 12)
251    STRING     '"\\'"'        (1, 13) (1, 16)
252    """)
253        self.check_tokenize("x = \"doesn't \"shrink\", does it\"", """\
254    NAME       'x'           (1, 0) (1, 1)
255    OP         '='           (1, 2) (1, 3)
256    STRING     '"doesn\\'t "' (1, 4) (1, 14)
257    NAME       'shrink'      (1, 14) (1, 20)
258    STRING     '", does it"' (1, 20) (1, 31)
259    """)
260        self.check_tokenize("x = 'abc' + 'ABC'", """\
261    NAME       'x'           (1, 0) (1, 1)
262    OP         '='           (1, 2) (1, 3)
263    STRING     "'abc'"       (1, 4) (1, 9)
264    OP         '+'           (1, 10) (1, 11)
265    STRING     "'ABC'"       (1, 12) (1, 17)
266    """)
267        self.check_tokenize('y = "ABC" + "ABC"', """\
268    NAME       'y'           (1, 0) (1, 1)
269    OP         '='           (1, 2) (1, 3)
270    STRING     '"ABC"'       (1, 4) (1, 9)
271    OP         '+'           (1, 10) (1, 11)
272    STRING     '"ABC"'       (1, 12) (1, 17)
273    """)
274        self.check_tokenize("x = r'abc' + r'ABC' + R'ABC' + R'ABC'", """\
275    NAME       'x'           (1, 0) (1, 1)
276    OP         '='           (1, 2) (1, 3)
277    STRING     "r'abc'"      (1, 4) (1, 10)
278    OP         '+'           (1, 11) (1, 12)
279    STRING     "r'ABC'"      (1, 13) (1, 19)
280    OP         '+'           (1, 20) (1, 21)
281    STRING     "R'ABC'"      (1, 22) (1, 28)
282    OP         '+'           (1, 29) (1, 30)
283    STRING     "R'ABC'"      (1, 31) (1, 37)
284    """)
285        self.check_tokenize('y = r"abc" + r"ABC" + R"ABC" + R"ABC"', """\
286    NAME       'y'           (1, 0) (1, 1)
287    OP         '='           (1, 2) (1, 3)
288    STRING     'r"abc"'      (1, 4) (1, 10)
289    OP         '+'           (1, 11) (1, 12)
290    STRING     'r"ABC"'      (1, 13) (1, 19)
291    OP         '+'           (1, 20) (1, 21)
292    STRING     'R"ABC"'      (1, 22) (1, 28)
293    OP         '+'           (1, 29) (1, 30)
294    STRING     'R"ABC"'      (1, 31) (1, 37)
295    """)
296
297        self.check_tokenize("u'abc' + U'abc'", """\
298    STRING     "u'abc'"      (1, 0) (1, 6)
299    OP         '+'           (1, 7) (1, 8)
300    STRING     "U'abc'"      (1, 9) (1, 15)
301    """)
302        self.check_tokenize('u"abc" + U"abc"', """\
303    STRING     'u"abc"'      (1, 0) (1, 6)
304    OP         '+'           (1, 7) (1, 8)
305    STRING     'U"abc"'      (1, 9) (1, 15)
306    """)
307
308        self.check_tokenize("b'abc' + B'abc'", """\
309    STRING     "b'abc'"      (1, 0) (1, 6)
310    OP         '+'           (1, 7) (1, 8)
311    STRING     "B'abc'"      (1, 9) (1, 15)
312    """)
313        self.check_tokenize('b"abc" + B"abc"', """\
314    STRING     'b"abc"'      (1, 0) (1, 6)
315    OP         '+'           (1, 7) (1, 8)
316    STRING     'B"abc"'      (1, 9) (1, 15)
317    """)
318        self.check_tokenize("br'abc' + bR'abc' + Br'abc' + BR'abc'", """\
319    STRING     "br'abc'"     (1, 0) (1, 7)
320    OP         '+'           (1, 8) (1, 9)
321    STRING     "bR'abc'"     (1, 10) (1, 17)
322    OP         '+'           (1, 18) (1, 19)
323    STRING     "Br'abc'"     (1, 20) (1, 27)
324    OP         '+'           (1, 28) (1, 29)
325    STRING     "BR'abc'"     (1, 30) (1, 37)
326    """)
327        self.check_tokenize('br"abc" + bR"abc" + Br"abc" + BR"abc"', """\
328    STRING     'br"abc"'     (1, 0) (1, 7)
329    OP         '+'           (1, 8) (1, 9)
330    STRING     'bR"abc"'     (1, 10) (1, 17)
331    OP         '+'           (1, 18) (1, 19)
332    STRING     'Br"abc"'     (1, 20) (1, 27)
333    OP         '+'           (1, 28) (1, 29)
334    STRING     'BR"abc"'     (1, 30) (1, 37)
335    """)
336        self.check_tokenize("rb'abc' + rB'abc' + Rb'abc' + RB'abc'", """\
337    STRING     "rb'abc'"     (1, 0) (1, 7)
338    OP         '+'           (1, 8) (1, 9)
339    STRING     "rB'abc'"     (1, 10) (1, 17)
340    OP         '+'           (1, 18) (1, 19)
341    STRING     "Rb'abc'"     (1, 20) (1, 27)
342    OP         '+'           (1, 28) (1, 29)
343    STRING     "RB'abc'"     (1, 30) (1, 37)
344    """)
345        self.check_tokenize('rb"abc" + rB"abc" + Rb"abc" + RB"abc"', """\
346    STRING     'rb"abc"'     (1, 0) (1, 7)
347    OP         '+'           (1, 8) (1, 9)
348    STRING     'rB"abc"'     (1, 10) (1, 17)
349    OP         '+'           (1, 18) (1, 19)
350    STRING     'Rb"abc"'     (1, 20) (1, 27)
351    OP         '+'           (1, 28) (1, 29)
352    STRING     'RB"abc"'     (1, 30) (1, 37)
353    """)
354        # Check 0, 1, and 2 character string prefixes.
355        self.check_tokenize(r'"a\
356de\
357fg"', """\
358    STRING     '"a\\\\\\nde\\\\\\nfg"\' (1, 0) (3, 3)
359    """)
360        self.check_tokenize(r'u"a\
361de"', """\
362    STRING     'u"a\\\\\\nde"\'  (1, 0) (2, 3)
363    """)
364        self.check_tokenize(r'rb"a\
365d"', """\
366    STRING     'rb"a\\\\\\nd"\'  (1, 0) (2, 2)
367    """)
368        self.check_tokenize(r'"""a\
369b"""', """\
370    STRING     '\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
371    """)
372        self.check_tokenize(r'u"""a\
373b"""', """\
374    STRING     'u\"\""a\\\\\\nb\"\""' (1, 0) (2, 4)
375    """)
376        self.check_tokenize(r'rb"""a\
377b\
378c"""', """\
379    STRING     'rb"\""a\\\\\\nb\\\\\\nc"\""' (1, 0) (3, 4)
380    """)
381        self.check_tokenize('f"abc"', """\
382    STRING     'f"abc"'      (1, 0) (1, 6)
383    """)
384        self.check_tokenize('fR"a{b}c"', """\
385    STRING     'fR"a{b}c"'   (1, 0) (1, 9)
386    """)
387        self.check_tokenize('f"""abc"""', """\
388    STRING     'f\"\"\"abc\"\"\"'  (1, 0) (1, 10)
389    """)
390        self.check_tokenize(r'f"abc\
391def"', """\
392    STRING     'f"abc\\\\\\ndef"' (1, 0) (2, 4)
393    """)
394        self.check_tokenize(r'Rf"abc\
395def"', """\
396    STRING     'Rf"abc\\\\\\ndef"' (1, 0) (2, 4)
397    """)
398
399    def test_function(self):
400        self.check_tokenize("def d22(a, b, c=2, d=2, *k): pass", """\
401    NAME       'def'         (1, 0) (1, 3)
402    NAME       'd22'         (1, 4) (1, 7)
403    OP         '('           (1, 7) (1, 8)
404    NAME       'a'           (1, 8) (1, 9)
405    OP         ','           (1, 9) (1, 10)
406    NAME       'b'           (1, 11) (1, 12)
407    OP         ','           (1, 12) (1, 13)
408    NAME       'c'           (1, 14) (1, 15)
409    OP         '='           (1, 15) (1, 16)
410    NUMBER     '2'           (1, 16) (1, 17)
411    OP         ','           (1, 17) (1, 18)
412    NAME       'd'           (1, 19) (1, 20)
413    OP         '='           (1, 20) (1, 21)
414    NUMBER     '2'           (1, 21) (1, 22)
415    OP         ','           (1, 22) (1, 23)
416    OP         '*'           (1, 24) (1, 25)
417    NAME       'k'           (1, 25) (1, 26)
418    OP         ')'           (1, 26) (1, 27)
419    OP         ':'           (1, 27) (1, 28)
420    NAME       'pass'        (1, 29) (1, 33)
421    """)
422        self.check_tokenize("def d01v_(a=1, *k, **w): pass", """\
423    NAME       'def'         (1, 0) (1, 3)
424    NAME       'd01v_'       (1, 4) (1, 9)
425    OP         '('           (1, 9) (1, 10)
426    NAME       'a'           (1, 10) (1, 11)
427    OP         '='           (1, 11) (1, 12)
428    NUMBER     '1'           (1, 12) (1, 13)
429    OP         ','           (1, 13) (1, 14)
430    OP         '*'           (1, 15) (1, 16)
431    NAME       'k'           (1, 16) (1, 17)
432    OP         ','           (1, 17) (1, 18)
433    OP         '**'          (1, 19) (1, 21)
434    NAME       'w'           (1, 21) (1, 22)
435    OP         ')'           (1, 22) (1, 23)
436    OP         ':'           (1, 23) (1, 24)
437    NAME       'pass'        (1, 25) (1, 29)
438    """)
439        self.check_tokenize("def d23(a: str, b: int=3) -> int: pass", """\
440    NAME       'def'         (1, 0) (1, 3)
441    NAME       'd23'         (1, 4) (1, 7)
442    OP         '('           (1, 7) (1, 8)
443    NAME       'a'           (1, 8) (1, 9)
444    OP         ':'           (1, 9) (1, 10)
445    NAME       'str'         (1, 11) (1, 14)
446    OP         ','           (1, 14) (1, 15)
447    NAME       'b'           (1, 16) (1, 17)
448    OP         ':'           (1, 17) (1, 18)
449    NAME       'int'         (1, 19) (1, 22)
450    OP         '='           (1, 22) (1, 23)
451    NUMBER     '3'           (1, 23) (1, 24)
452    OP         ')'           (1, 24) (1, 25)
453    OP         '->'          (1, 26) (1, 28)
454    NAME       'int'         (1, 29) (1, 32)
455    OP         ':'           (1, 32) (1, 33)
456    NAME       'pass'        (1, 34) (1, 38)
457    """)
458
459    def test_comparison(self):
460        # Comparison
461        self.check_tokenize("if 1 < 1 > 1 == 1 >= 5 <= 0x15 <= 0x12 != "
462                            "1 and 5 in 1 not in 1 is 1 or 5 is not 1: pass", """\
463    NAME       'if'          (1, 0) (1, 2)
464    NUMBER     '1'           (1, 3) (1, 4)
465    OP         '<'           (1, 5) (1, 6)
466    NUMBER     '1'           (1, 7) (1, 8)
467    OP         '>'           (1, 9) (1, 10)
468    NUMBER     '1'           (1, 11) (1, 12)
469    OP         '=='          (1, 13) (1, 15)
470    NUMBER     '1'           (1, 16) (1, 17)
471    OP         '>='          (1, 18) (1, 20)
472    NUMBER     '5'           (1, 21) (1, 22)
473    OP         '<='          (1, 23) (1, 25)
474    NUMBER     '0x15'        (1, 26) (1, 30)
475    OP         '<='          (1, 31) (1, 33)
476    NUMBER     '0x12'        (1, 34) (1, 38)
477    OP         '!='          (1, 39) (1, 41)
478    NUMBER     '1'           (1, 42) (1, 43)
479    NAME       'and'         (1, 44) (1, 47)
480    NUMBER     '5'           (1, 48) (1, 49)
481    NAME       'in'          (1, 50) (1, 52)
482    NUMBER     '1'           (1, 53) (1, 54)
483    NAME       'not'         (1, 55) (1, 58)
484    NAME       'in'          (1, 59) (1, 61)
485    NUMBER     '1'           (1, 62) (1, 63)
486    NAME       'is'          (1, 64) (1, 66)
487    NUMBER     '1'           (1, 67) (1, 68)
488    NAME       'or'          (1, 69) (1, 71)
489    NUMBER     '5'           (1, 72) (1, 73)
490    NAME       'is'          (1, 74) (1, 76)
491    NAME       'not'         (1, 77) (1, 80)
492    NUMBER     '1'           (1, 81) (1, 82)
493    OP         ':'           (1, 82) (1, 83)
494    NAME       'pass'        (1, 84) (1, 88)
495    """)
496
497    def test_shift(self):
498        # Shift
499        self.check_tokenize("x = 1 << 1 >> 5", """\
500    NAME       'x'           (1, 0) (1, 1)
501    OP         '='           (1, 2) (1, 3)
502    NUMBER     '1'           (1, 4) (1, 5)
503    OP         '<<'          (1, 6) (1, 8)
504    NUMBER     '1'           (1, 9) (1, 10)
505    OP         '>>'          (1, 11) (1, 13)
506    NUMBER     '5'           (1, 14) (1, 15)
507    """)
508
509    def test_additive(self):
510        # Additive
511        self.check_tokenize("x = 1 - y + 15 - 1 + 0x124 + z + a[5]", """\
512    NAME       'x'           (1, 0) (1, 1)
513    OP         '='           (1, 2) (1, 3)
514    NUMBER     '1'           (1, 4) (1, 5)
515    OP         '-'           (1, 6) (1, 7)
516    NAME       'y'           (1, 8) (1, 9)
517    OP         '+'           (1, 10) (1, 11)
518    NUMBER     '15'          (1, 12) (1, 14)
519    OP         '-'           (1, 15) (1, 16)
520    NUMBER     '1'           (1, 17) (1, 18)
521    OP         '+'           (1, 19) (1, 20)
522    NUMBER     '0x124'       (1, 21) (1, 26)
523    OP         '+'           (1, 27) (1, 28)
524    NAME       'z'           (1, 29) (1, 30)
525    OP         '+'           (1, 31) (1, 32)
526    NAME       'a'           (1, 33) (1, 34)
527    OP         '['           (1, 34) (1, 35)
528    NUMBER     '5'           (1, 35) (1, 36)
529    OP         ']'           (1, 36) (1, 37)
530    """)
531
532    def test_multiplicative(self):
533        # Multiplicative
534        self.check_tokenize("x = 1//1*1/5*12%0x12@42", """\
535    NAME       'x'           (1, 0) (1, 1)
536    OP         '='           (1, 2) (1, 3)
537    NUMBER     '1'           (1, 4) (1, 5)
538    OP         '//'          (1, 5) (1, 7)
539    NUMBER     '1'           (1, 7) (1, 8)
540    OP         '*'           (1, 8) (1, 9)
541    NUMBER     '1'           (1, 9) (1, 10)
542    OP         '/'           (1, 10) (1, 11)
543    NUMBER     '5'           (1, 11) (1, 12)
544    OP         '*'           (1, 12) (1, 13)
545    NUMBER     '12'          (1, 13) (1, 15)
546    OP         '%'           (1, 15) (1, 16)
547    NUMBER     '0x12'        (1, 16) (1, 20)
548    OP         '@'           (1, 20) (1, 21)
549    NUMBER     '42'          (1, 21) (1, 23)
550    """)
551
552    def test_unary(self):
553        # Unary
554        self.check_tokenize("~1 ^ 1 & 1 |1 ^ -1", """\
555    OP         '~'           (1, 0) (1, 1)
556    NUMBER     '1'           (1, 1) (1, 2)
557    OP         '^'           (1, 3) (1, 4)
558    NUMBER     '1'           (1, 5) (1, 6)
559    OP         '&'           (1, 7) (1, 8)
560    NUMBER     '1'           (1, 9) (1, 10)
561    OP         '|'           (1, 11) (1, 12)
562    NUMBER     '1'           (1, 12) (1, 13)
563    OP         '^'           (1, 14) (1, 15)
564    OP         '-'           (1, 16) (1, 17)
565    NUMBER     '1'           (1, 17) (1, 18)
566    """)
567        self.check_tokenize("-1*1/1+1*1//1 - ---1**1", """\
568    OP         '-'           (1, 0) (1, 1)
569    NUMBER     '1'           (1, 1) (1, 2)
570    OP         '*'           (1, 2) (1, 3)
571    NUMBER     '1'           (1, 3) (1, 4)
572    OP         '/'           (1, 4) (1, 5)
573    NUMBER     '1'           (1, 5) (1, 6)
574    OP         '+'           (1, 6) (1, 7)
575    NUMBER     '1'           (1, 7) (1, 8)
576    OP         '*'           (1, 8) (1, 9)
577    NUMBER     '1'           (1, 9) (1, 10)
578    OP         '//'          (1, 10) (1, 12)
579    NUMBER     '1'           (1, 12) (1, 13)
580    OP         '-'           (1, 14) (1, 15)
581    OP         '-'           (1, 16) (1, 17)
582    OP         '-'           (1, 17) (1, 18)
583    OP         '-'           (1, 18) (1, 19)
584    NUMBER     '1'           (1, 19) (1, 20)
585    OP         '**'          (1, 20) (1, 22)
586    NUMBER     '1'           (1, 22) (1, 23)
587    """)
588
589    def test_selector(self):
590        # Selector
591        self.check_tokenize("import sys, time\nx = sys.modules['time'].time()", """\
592    NAME       'import'      (1, 0) (1, 6)
593    NAME       'sys'         (1, 7) (1, 10)
594    OP         ','           (1, 10) (1, 11)
595    NAME       'time'        (1, 12) (1, 16)
596    NEWLINE    '\\n'          (1, 16) (1, 17)
597    NAME       'x'           (2, 0) (2, 1)
598    OP         '='           (2, 2) (2, 3)
599    NAME       'sys'         (2, 4) (2, 7)
600    OP         '.'           (2, 7) (2, 8)
601    NAME       'modules'     (2, 8) (2, 15)
602    OP         '['           (2, 15) (2, 16)
603    STRING     "'time'"      (2, 16) (2, 22)
604    OP         ']'           (2, 22) (2, 23)
605    OP         '.'           (2, 23) (2, 24)
606    NAME       'time'        (2, 24) (2, 28)
607    OP         '('           (2, 28) (2, 29)
608    OP         ')'           (2, 29) (2, 30)
609    """)
610
611    def test_method(self):
612        # Methods
613        self.check_tokenize("@staticmethod\ndef foo(x,y): pass", """\
614    OP         '@'           (1, 0) (1, 1)
615    NAME       'staticmethod' (1, 1) (1, 13)
616    NEWLINE    '\\n'          (1, 13) (1, 14)
617    NAME       'def'         (2, 0) (2, 3)
618    NAME       'foo'         (2, 4) (2, 7)
619    OP         '('           (2, 7) (2, 8)
620    NAME       'x'           (2, 8) (2, 9)
621    OP         ','           (2, 9) (2, 10)
622    NAME       'y'           (2, 10) (2, 11)
623    OP         ')'           (2, 11) (2, 12)
624    OP         ':'           (2, 12) (2, 13)
625    NAME       'pass'        (2, 14) (2, 18)
626    """)
627
628    def test_tabs(self):
629        # Evil tabs
630        self.check_tokenize("def f():\n"
631                            "\tif x\n"
632                            "        \tpass", """\
633    NAME       'def'         (1, 0) (1, 3)
634    NAME       'f'           (1, 4) (1, 5)
635    OP         '('           (1, 5) (1, 6)
636    OP         ')'           (1, 6) (1, 7)
637    OP         ':'           (1, 7) (1, 8)
638    NEWLINE    '\\n'          (1, 8) (1, 9)
639    INDENT     '\\t'          (2, 0) (2, 1)
640    NAME       'if'          (2, 1) (2, 3)
641    NAME       'x'           (2, 4) (2, 5)
642    NEWLINE    '\\n'          (2, 5) (2, 6)
643    INDENT     '        \\t'  (3, 0) (3, 9)
644    NAME       'pass'        (3, 9) (3, 13)
645    DEDENT     ''            (4, 0) (4, 0)
646    DEDENT     ''            (4, 0) (4, 0)
647    """)
648
649    def test_non_ascii_identifiers(self):
650        # Non-ascii identifiers
651        self.check_tokenize("Örter = 'places'\ngrün = 'green'", """\
652    NAME       'Örter'       (1, 0) (1, 5)
653    OP         '='           (1, 6) (1, 7)
654    STRING     "'places'"    (1, 8) (1, 16)
655    NEWLINE    '\\n'          (1, 16) (1, 17)
656    NAME       'grün'        (2, 0) (2, 4)
657    OP         '='           (2, 5) (2, 6)
658    STRING     "'green'"     (2, 7) (2, 14)
659    """)
660
661    def test_unicode(self):
662        # Legacy unicode literals:
663        self.check_tokenize("Örter = u'places'\ngrün = U'green'", """\
664    NAME       'Örter'       (1, 0) (1, 5)
665    OP         '='           (1, 6) (1, 7)
666    STRING     "u'places'"   (1, 8) (1, 17)
667    NEWLINE    '\\n'          (1, 17) (1, 18)
668    NAME       'grün'        (2, 0) (2, 4)
669    OP         '='           (2, 5) (2, 6)
670    STRING     "U'green'"    (2, 7) (2, 15)
671    """)
672
673    def test_async(self):
674        # Async/await extension:
675        self.check_tokenize("async = 1", """\
676    NAME       'async'       (1, 0) (1, 5)
677    OP         '='           (1, 6) (1, 7)
678    NUMBER     '1'           (1, 8) (1, 9)
679    """)
680
681        self.check_tokenize("a = (async = 1)", """\
682    NAME       'a'           (1, 0) (1, 1)
683    OP         '='           (1, 2) (1, 3)
684    OP         '('           (1, 4) (1, 5)
685    NAME       'async'       (1, 5) (1, 10)
686    OP         '='           (1, 11) (1, 12)
687    NUMBER     '1'           (1, 13) (1, 14)
688    OP         ')'           (1, 14) (1, 15)
689    """)
690
691        self.check_tokenize("async()", """\
692    NAME       'async'       (1, 0) (1, 5)
693    OP         '('           (1, 5) (1, 6)
694    OP         ')'           (1, 6) (1, 7)
695    """)
696
697        self.check_tokenize("class async(Bar):pass", """\
698    NAME       'class'       (1, 0) (1, 5)
699    NAME       'async'       (1, 6) (1, 11)
700    OP         '('           (1, 11) (1, 12)
701    NAME       'Bar'         (1, 12) (1, 15)
702    OP         ')'           (1, 15) (1, 16)
703    OP         ':'           (1, 16) (1, 17)
704    NAME       'pass'        (1, 17) (1, 21)
705    """)
706
707        self.check_tokenize("class async:pass", """\
708    NAME       'class'       (1, 0) (1, 5)
709    NAME       'async'       (1, 6) (1, 11)
710    OP         ':'           (1, 11) (1, 12)
711    NAME       'pass'        (1, 12) (1, 16)
712    """)
713
714        self.check_tokenize("await = 1", """\
715    NAME       'await'       (1, 0) (1, 5)
716    OP         '='           (1, 6) (1, 7)
717    NUMBER     '1'           (1, 8) (1, 9)
718    """)
719
720        self.check_tokenize("foo.async", """\
721    NAME       'foo'         (1, 0) (1, 3)
722    OP         '.'           (1, 3) (1, 4)
723    NAME       'async'       (1, 4) (1, 9)
724    """)
725
726        self.check_tokenize("async for a in b: pass", """\
727    NAME       'async'       (1, 0) (1, 5)
728    NAME       'for'         (1, 6) (1, 9)
729    NAME       'a'           (1, 10) (1, 11)
730    NAME       'in'          (1, 12) (1, 14)
731    NAME       'b'           (1, 15) (1, 16)
732    OP         ':'           (1, 16) (1, 17)
733    NAME       'pass'        (1, 18) (1, 22)
734    """)
735
736        self.check_tokenize("async with a as b: pass", """\
737    NAME       'async'       (1, 0) (1, 5)
738    NAME       'with'        (1, 6) (1, 10)
739    NAME       'a'           (1, 11) (1, 12)
740    NAME       'as'          (1, 13) (1, 15)
741    NAME       'b'           (1, 16) (1, 17)
742    OP         ':'           (1, 17) (1, 18)
743    NAME       'pass'        (1, 19) (1, 23)
744    """)
745
746        self.check_tokenize("async.foo", """\
747    NAME       'async'       (1, 0) (1, 5)
748    OP         '.'           (1, 5) (1, 6)
749    NAME       'foo'         (1, 6) (1, 9)
750    """)
751
752        self.check_tokenize("async", """\
753    NAME       'async'       (1, 0) (1, 5)
754    """)
755
756        self.check_tokenize("async\n#comment\nawait", """\
757    NAME       'async'       (1, 0) (1, 5)
758    NEWLINE    '\\n'          (1, 5) (1, 6)
759    COMMENT    '#comment'    (2, 0) (2, 8)
760    NL         '\\n'          (2, 8) (2, 9)
761    NAME       'await'       (3, 0) (3, 5)
762    """)
763
764        self.check_tokenize("async\n...\nawait", """\
765    NAME       'async'       (1, 0) (1, 5)
766    NEWLINE    '\\n'          (1, 5) (1, 6)
767    OP         '...'         (2, 0) (2, 3)
768    NEWLINE    '\\n'          (2, 3) (2, 4)
769    NAME       'await'       (3, 0) (3, 5)
770    """)
771
772        self.check_tokenize("async\nawait", """\
773    NAME       'async'       (1, 0) (1, 5)
774    NEWLINE    '\\n'          (1, 5) (1, 6)
775    NAME       'await'       (2, 0) (2, 5)
776    """)
777
778        self.check_tokenize("foo.async + 1", """\
779    NAME       'foo'         (1, 0) (1, 3)
780    OP         '.'           (1, 3) (1, 4)
781    NAME       'async'       (1, 4) (1, 9)
782    OP         '+'           (1, 10) (1, 11)
783    NUMBER     '1'           (1, 12) (1, 13)
784    """)
785
786        self.check_tokenize("async def foo(): pass", """\
787    NAME       'async'       (1, 0) (1, 5)
788    NAME       'def'         (1, 6) (1, 9)
789    NAME       'foo'         (1, 10) (1, 13)
790    OP         '('           (1, 13) (1, 14)
791    OP         ')'           (1, 14) (1, 15)
792    OP         ':'           (1, 15) (1, 16)
793    NAME       'pass'        (1, 17) (1, 21)
794    """)
795
796        self.check_tokenize('''\
797async def foo():
798  def foo(await):
799    await = 1
800  if 1:
801    await
802async += 1
803''', """\
804    NAME       'async'       (1, 0) (1, 5)
805    NAME       'def'         (1, 6) (1, 9)
806    NAME       'foo'         (1, 10) (1, 13)
807    OP         '('           (1, 13) (1, 14)
808    OP         ')'           (1, 14) (1, 15)
809    OP         ':'           (1, 15) (1, 16)
810    NEWLINE    '\\n'          (1, 16) (1, 17)
811    INDENT     '  '          (2, 0) (2, 2)
812    NAME       'def'         (2, 2) (2, 5)
813    NAME       'foo'         (2, 6) (2, 9)
814    OP         '('           (2, 9) (2, 10)
815    NAME       'await'       (2, 10) (2, 15)
816    OP         ')'           (2, 15) (2, 16)
817    OP         ':'           (2, 16) (2, 17)
818    NEWLINE    '\\n'          (2, 17) (2, 18)
819    INDENT     '    '        (3, 0) (3, 4)
820    NAME       'await'       (3, 4) (3, 9)
821    OP         '='           (3, 10) (3, 11)
822    NUMBER     '1'           (3, 12) (3, 13)
823    NEWLINE    '\\n'          (3, 13) (3, 14)
824    DEDENT     ''            (4, 2) (4, 2)
825    NAME       'if'          (4, 2) (4, 4)
826    NUMBER     '1'           (4, 5) (4, 6)
827    OP         ':'           (4, 6) (4, 7)
828    NEWLINE    '\\n'          (4, 7) (4, 8)
829    INDENT     '    '        (5, 0) (5, 4)
830    NAME       'await'       (5, 4) (5, 9)
831    NEWLINE    '\\n'          (5, 9) (5, 10)
832    DEDENT     ''            (6, 0) (6, 0)
833    DEDENT     ''            (6, 0) (6, 0)
834    NAME       'async'       (6, 0) (6, 5)
835    OP         '+='          (6, 6) (6, 8)
836    NUMBER     '1'           (6, 9) (6, 10)
837    NEWLINE    '\\n'          (6, 10) (6, 11)
838    """)
839
840        self.check_tokenize('''\
841async def foo():
842  async for i in 1: pass''', """\
843    NAME       'async'       (1, 0) (1, 5)
844    NAME       'def'         (1, 6) (1, 9)
845    NAME       'foo'         (1, 10) (1, 13)
846    OP         '('           (1, 13) (1, 14)
847    OP         ')'           (1, 14) (1, 15)
848    OP         ':'           (1, 15) (1, 16)
849    NEWLINE    '\\n'          (1, 16) (1, 17)
850    INDENT     '  '          (2, 0) (2, 2)
851    NAME       'async'       (2, 2) (2, 7)
852    NAME       'for'         (2, 8) (2, 11)
853    NAME       'i'           (2, 12) (2, 13)
854    NAME       'in'          (2, 14) (2, 16)
855    NUMBER     '1'           (2, 17) (2, 18)
856    OP         ':'           (2, 18) (2, 19)
857    NAME       'pass'        (2, 20) (2, 24)
858    DEDENT     ''            (3, 0) (3, 0)
859    """)
860
861        self.check_tokenize('''async def foo(async): await''', """\
862    NAME       'async'       (1, 0) (1, 5)
863    NAME       'def'         (1, 6) (1, 9)
864    NAME       'foo'         (1, 10) (1, 13)
865    OP         '('           (1, 13) (1, 14)
866    NAME       'async'       (1, 14) (1, 19)
867    OP         ')'           (1, 19) (1, 20)
868    OP         ':'           (1, 20) (1, 21)
869    NAME       'await'       (1, 22) (1, 27)
870    """)
871
872        self.check_tokenize('''\
873def f():
874
875  def baz(): pass
876  async def bar(): pass
877
878  await = 2''', """\
879    NAME       'def'         (1, 0) (1, 3)
880    NAME       'f'           (1, 4) (1, 5)
881    OP         '('           (1, 5) (1, 6)
882    OP         ')'           (1, 6) (1, 7)
883    OP         ':'           (1, 7) (1, 8)
884    NEWLINE    '\\n'          (1, 8) (1, 9)
885    NL         '\\n'          (2, 0) (2, 1)
886    INDENT     '  '          (3, 0) (3, 2)
887    NAME       'def'         (3, 2) (3, 5)
888    NAME       'baz'         (3, 6) (3, 9)
889    OP         '('           (3, 9) (3, 10)
890    OP         ')'           (3, 10) (3, 11)
891    OP         ':'           (3, 11) (3, 12)
892    NAME       'pass'        (3, 13) (3, 17)
893    NEWLINE    '\\n'          (3, 17) (3, 18)
894    NAME       'async'       (4, 2) (4, 7)
895    NAME       'def'         (4, 8) (4, 11)
896    NAME       'bar'         (4, 12) (4, 15)
897    OP         '('           (4, 15) (4, 16)
898    OP         ')'           (4, 16) (4, 17)
899    OP         ':'           (4, 17) (4, 18)
900    NAME       'pass'        (4, 19) (4, 23)
901    NEWLINE    '\\n'          (4, 23) (4, 24)
902    NL         '\\n'          (5, 0) (5, 1)
903    NAME       'await'       (6, 2) (6, 7)
904    OP         '='           (6, 8) (6, 9)
905    NUMBER     '2'           (6, 10) (6, 11)
906    DEDENT     ''            (7, 0) (7, 0)
907    """)
908
909        self.check_tokenize('''\
910async def f():
911
912  def baz(): pass
913  async def bar(): pass
914
915  await = 2''', """\
916    NAME       'async'       (1, 0) (1, 5)
917    NAME       'def'         (1, 6) (1, 9)
918    NAME       'f'           (1, 10) (1, 11)
919    OP         '('           (1, 11) (1, 12)
920    OP         ')'           (1, 12) (1, 13)
921    OP         ':'           (1, 13) (1, 14)
922    NEWLINE    '\\n'          (1, 14) (1, 15)
923    NL         '\\n'          (2, 0) (2, 1)
924    INDENT     '  '          (3, 0) (3, 2)
925    NAME       'def'         (3, 2) (3, 5)
926    NAME       'baz'         (3, 6) (3, 9)
927    OP         '('           (3, 9) (3, 10)
928    OP         ')'           (3, 10) (3, 11)
929    OP         ':'           (3, 11) (3, 12)
930    NAME       'pass'        (3, 13) (3, 17)
931    NEWLINE    '\\n'          (3, 17) (3, 18)
932    NAME       'async'       (4, 2) (4, 7)
933    NAME       'def'         (4, 8) (4, 11)
934    NAME       'bar'         (4, 12) (4, 15)
935    OP         '('           (4, 15) (4, 16)
936    OP         ')'           (4, 16) (4, 17)
937    OP         ':'           (4, 17) (4, 18)
938    NAME       'pass'        (4, 19) (4, 23)
939    NEWLINE    '\\n'          (4, 23) (4, 24)
940    NL         '\\n'          (5, 0) (5, 1)
941    NAME       'await'       (6, 2) (6, 7)
942    OP         '='           (6, 8) (6, 9)
943    NUMBER     '2'           (6, 10) (6, 11)
944    DEDENT     ''            (7, 0) (7, 0)
945    """)
946
947class GenerateTokensTest(TokenizeTest):
948    def check_tokenize(self, s, expected):
949        # Format the tokens in s in a table format.
950        # The ENDMARKER and final NEWLINE are omitted.
951        f = StringIO(s)
952        result = stringify_tokens_from_source(generate_tokens(f.readline), s)
953        self.assertEqual(result, expected.rstrip().splitlines())
954
955
956def decistmt(s):
957    result = []
958    g = tokenize(BytesIO(s.encode('utf-8')).readline)   # tokenize the string
959    for toknum, tokval, _, _, _  in g:
960        if toknum == NUMBER and '.' in tokval:  # replace NUMBER tokens
961            result.extend([
962                (NAME, 'Decimal'),
963                (OP, '('),
964                (STRING, repr(tokval)),
965                (OP, ')')
966            ])
967        else:
968            result.append((toknum, tokval))
969    return untokenize(result).decode('utf-8')
970
971class TestMisc(TestCase):
972
973    def test_decistmt(self):
974        # Substitute Decimals for floats in a string of statements.
975        # This is an example from the docs.
976
977        from decimal import Decimal
978        s = '+21.3e-5*-.1234/81.7'
979        self.assertEqual(decistmt(s),
980                         "+Decimal ('21.3e-5')*-Decimal ('.1234')/Decimal ('81.7')")
981
982        # The format of the exponent is inherited from the platform C library.
983        # Known cases are "e-007" (Windows) and "e-07" (not Windows).  Since
984        # we're only showing 11 digits, and the 12th isn't close to 5, the
985        # rest of the output should be platform-independent.
986        self.assertRegex(repr(eval(s)), '-3.2171603427[0-9]*e-0+7')
987
988        # Output from calculations with Decimal should be identical across all
989        # platforms.
990        self.assertEqual(eval(decistmt(s)),
991                         Decimal('-3.217160342717258261933904529E-7'))
992
993
994class TestTokenizerAdheresToPep0263(TestCase):
995    """
996    Test that tokenizer adheres to the coding behaviour stipulated in PEP 0263.
997    """
998
999    def _testFile(self, filename):
1000        path = os.path.join(os.path.dirname(__file__), filename)
1001        TestRoundtrip.check_roundtrip(self, open(path, 'rb'))
1002
1003    def test_utf8_coding_cookie_and_no_utf8_bom(self):
1004        f = 'tokenize_tests-utf8-coding-cookie-and-no-utf8-bom-sig.txt'
1005        self._testFile(f)
1006
1007    def test_latin1_coding_cookie_and_utf8_bom(self):
1008        """
1009        As per PEP 0263, if a file starts with a utf-8 BOM signature, the only
1010        allowed encoding for the comment is 'utf-8'.  The text file used in
1011        this test starts with a BOM signature, but specifies latin1 as the
1012        coding, so verify that a SyntaxError is raised, which matches the
1013        behaviour of the interpreter when it encounters a similar condition.
1014        """
1015        f = 'tokenize_tests-latin1-coding-cookie-and-utf8-bom-sig.txt'
1016        self.assertRaises(SyntaxError, self._testFile, f)
1017
1018    def test_no_coding_cookie_and_utf8_bom(self):
1019        f = 'tokenize_tests-no-coding-cookie-and-utf8-bom-sig-only.txt'
1020        self._testFile(f)
1021
1022    def test_utf8_coding_cookie_and_utf8_bom(self):
1023        f = 'tokenize_tests-utf8-coding-cookie-and-utf8-bom-sig.txt'
1024        self._testFile(f)
1025
1026    def test_bad_coding_cookie(self):
1027        self.assertRaises(SyntaxError, self._testFile, 'bad_coding.py')
1028        self.assertRaises(SyntaxError, self._testFile, 'bad_coding2.py')
1029
1030
1031class Test_Tokenize(TestCase):
1032
1033    def test__tokenize_decodes_with_specified_encoding(self):
1034        literal = '"ЉЊЈЁЂ"'
1035        line = literal.encode('utf-8')
1036        first = False
1037        def readline():
1038            nonlocal first
1039            if not first:
1040                first = True
1041                return line
1042            else:
1043                return b''
1044
1045        # skip the initial encoding token and the end tokens
1046        tokens = list(_tokenize(readline, encoding='utf-8'))[1:-2]
1047        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1048        self.assertEqual(tokens, expected_tokens,
1049                         "bytes not decoded with encoding")
1050
1051    def test__tokenize_does_not_decode_with_encoding_none(self):
1052        literal = '"ЉЊЈЁЂ"'
1053        first = False
1054        def readline():
1055            nonlocal first
1056            if not first:
1057                first = True
1058                return literal
1059            else:
1060                return b''
1061
1062        # skip the end tokens
1063        tokens = list(_tokenize(readline, encoding=None))[:-2]
1064        expected_tokens = [(3, '"ЉЊЈЁЂ"', (1, 0), (1, 7), '"ЉЊЈЁЂ"')]
1065        self.assertEqual(tokens, expected_tokens,
1066                         "string not tokenized when encoding is None")
1067
1068
1069class TestDetectEncoding(TestCase):
1070
1071    def get_readline(self, lines):
1072        index = 0
1073        def readline():
1074            nonlocal index
1075            if index == len(lines):
1076                raise StopIteration
1077            line = lines[index]
1078            index += 1
1079            return line
1080        return readline
1081
1082    def test_no_bom_no_encoding_cookie(self):
1083        lines = (
1084            b'# something\n',
1085            b'print(something)\n',
1086            b'do_something(else)\n'
1087        )
1088        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1089        self.assertEqual(encoding, 'utf-8')
1090        self.assertEqual(consumed_lines, list(lines[:2]))
1091
1092    def test_bom_no_cookie(self):
1093        lines = (
1094            b'\xef\xbb\xbf# something\n',
1095            b'print(something)\n',
1096            b'do_something(else)\n'
1097        )
1098        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1099        self.assertEqual(encoding, 'utf-8-sig')
1100        self.assertEqual(consumed_lines,
1101                         [b'# something\n', b'print(something)\n'])
1102
1103    def test_cookie_first_line_no_bom(self):
1104        lines = (
1105            b'# -*- coding: latin-1 -*-\n',
1106            b'print(something)\n',
1107            b'do_something(else)\n'
1108        )
1109        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1110        self.assertEqual(encoding, 'iso-8859-1')
1111        self.assertEqual(consumed_lines, [b'# -*- coding: latin-1 -*-\n'])
1112
1113    def test_matched_bom_and_cookie_first_line(self):
1114        lines = (
1115            b'\xef\xbb\xbf# coding=utf-8\n',
1116            b'print(something)\n',
1117            b'do_something(else)\n'
1118        )
1119        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1120        self.assertEqual(encoding, 'utf-8-sig')
1121        self.assertEqual(consumed_lines, [b'# coding=utf-8\n'])
1122
1123    def test_mismatched_bom_and_cookie_first_line_raises_syntaxerror(self):
1124        lines = (
1125            b'\xef\xbb\xbf# vim: set fileencoding=ascii :\n',
1126            b'print(something)\n',
1127            b'do_something(else)\n'
1128        )
1129        readline = self.get_readline(lines)
1130        self.assertRaises(SyntaxError, detect_encoding, readline)
1131
1132    def test_cookie_second_line_no_bom(self):
1133        lines = (
1134            b'#! something\n',
1135            b'# vim: set fileencoding=ascii :\n',
1136            b'print(something)\n',
1137            b'do_something(else)\n'
1138        )
1139        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1140        self.assertEqual(encoding, 'ascii')
1141        expected = [b'#! something\n', b'# vim: set fileencoding=ascii :\n']
1142        self.assertEqual(consumed_lines, expected)
1143
1144    def test_matched_bom_and_cookie_second_line(self):
1145        lines = (
1146            b'\xef\xbb\xbf#! something\n',
1147            b'f# coding=utf-8\n',
1148            b'print(something)\n',
1149            b'do_something(else)\n'
1150        )
1151        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1152        self.assertEqual(encoding, 'utf-8-sig')
1153        self.assertEqual(consumed_lines,
1154                         [b'#! something\n', b'f# coding=utf-8\n'])
1155
1156    def test_mismatched_bom_and_cookie_second_line_raises_syntaxerror(self):
1157        lines = (
1158            b'\xef\xbb\xbf#! something\n',
1159            b'# vim: set fileencoding=ascii :\n',
1160            b'print(something)\n',
1161            b'do_something(else)\n'
1162        )
1163        readline = self.get_readline(lines)
1164        self.assertRaises(SyntaxError, detect_encoding, readline)
1165
1166    def test_cookie_second_line_noncommented_first_line(self):
1167        lines = (
1168            b"print('\xc2\xa3')\n",
1169            b'# vim: set fileencoding=iso8859-15 :\n',
1170            b"print('\xe2\x82\xac')\n"
1171        )
1172        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1173        self.assertEqual(encoding, 'utf-8')
1174        expected = [b"print('\xc2\xa3')\n"]
1175        self.assertEqual(consumed_lines, expected)
1176
1177    def test_cookie_second_line_commented_first_line(self):
1178        lines = (
1179            b"#print('\xc2\xa3')\n",
1180            b'# vim: set fileencoding=iso8859-15 :\n',
1181            b"print('\xe2\x82\xac')\n"
1182        )
1183        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1184        self.assertEqual(encoding, 'iso8859-15')
1185        expected = [b"#print('\xc2\xa3')\n", b'# vim: set fileencoding=iso8859-15 :\n']
1186        self.assertEqual(consumed_lines, expected)
1187
1188    def test_cookie_second_line_empty_first_line(self):
1189        lines = (
1190            b'\n',
1191            b'# vim: set fileencoding=iso8859-15 :\n',
1192            b"print('\xe2\x82\xac')\n"
1193        )
1194        encoding, consumed_lines = detect_encoding(self.get_readline(lines))
1195        self.assertEqual(encoding, 'iso8859-15')
1196        expected = [b'\n', b'# vim: set fileencoding=iso8859-15 :\n']
1197        self.assertEqual(consumed_lines, expected)
1198
1199    def test_latin1_normalization(self):
1200        # See get_normal_name() in tokenizer.c.
1201        encodings = ("latin-1", "iso-8859-1", "iso-latin-1", "latin-1-unix",
1202                     "iso-8859-1-unix", "iso-latin-1-mac")
1203        for encoding in encodings:
1204            for rep in ("-", "_"):
1205                enc = encoding.replace("-", rep)
1206                lines = (b"#!/usr/bin/python\n",
1207                         b"# coding: " + enc.encode("ascii") + b"\n",
1208                         b"print(things)\n",
1209                         b"do_something += 4\n")
1210                rl = self.get_readline(lines)
1211                found, consumed_lines = detect_encoding(rl)
1212                self.assertEqual(found, "iso-8859-1")
1213
1214    def test_syntaxerror_latin1(self):
1215        # Issue 14629: need to raise SyntaxError if the first
1216        # line(s) have non-UTF-8 characters
1217        lines = (
1218            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1219            )
1220        readline = self.get_readline(lines)
1221        self.assertRaises(SyntaxError, detect_encoding, readline)
1222
1223
1224    def test_utf8_normalization(self):
1225        # See get_normal_name() in tokenizer.c.
1226        encodings = ("utf-8", "utf-8-mac", "utf-8-unix")
1227        for encoding in encodings:
1228            for rep in ("-", "_"):
1229                enc = encoding.replace("-", rep)
1230                lines = (b"#!/usr/bin/python\n",
1231                         b"# coding: " + enc.encode("ascii") + b"\n",
1232                         b"1 + 3\n")
1233                rl = self.get_readline(lines)
1234                found, consumed_lines = detect_encoding(rl)
1235                self.assertEqual(found, "utf-8")
1236
1237    def test_short_files(self):
1238        readline = self.get_readline((b'print(something)\n',))
1239        encoding, consumed_lines = detect_encoding(readline)
1240        self.assertEqual(encoding, 'utf-8')
1241        self.assertEqual(consumed_lines, [b'print(something)\n'])
1242
1243        encoding, consumed_lines = detect_encoding(self.get_readline(()))
1244        self.assertEqual(encoding, 'utf-8')
1245        self.assertEqual(consumed_lines, [])
1246
1247        readline = self.get_readline((b'\xef\xbb\xbfprint(something)\n',))
1248        encoding, consumed_lines = detect_encoding(readline)
1249        self.assertEqual(encoding, 'utf-8-sig')
1250        self.assertEqual(consumed_lines, [b'print(something)\n'])
1251
1252        readline = self.get_readline((b'\xef\xbb\xbf',))
1253        encoding, consumed_lines = detect_encoding(readline)
1254        self.assertEqual(encoding, 'utf-8-sig')
1255        self.assertEqual(consumed_lines, [])
1256
1257        readline = self.get_readline((b'# coding: bad\n',))
1258        self.assertRaises(SyntaxError, detect_encoding, readline)
1259
1260    def test_false_encoding(self):
1261        # Issue 18873: "Encoding" detected in non-comment lines
1262        readline = self.get_readline((b'print("#coding=fake")',))
1263        encoding, consumed_lines = detect_encoding(readline)
1264        self.assertEqual(encoding, 'utf-8')
1265        self.assertEqual(consumed_lines, [b'print("#coding=fake")'])
1266
1267    def test_open(self):
1268        filename = support.TESTFN + '.py'
1269        self.addCleanup(support.unlink, filename)
1270
1271        # test coding cookie
1272        for encoding in ('iso-8859-15', 'utf-8'):
1273            with open(filename, 'w', encoding=encoding) as fp:
1274                print("# coding: %s" % encoding, file=fp)
1275                print("print('euro:\u20ac')", file=fp)
1276            with tokenize_open(filename) as fp:
1277                self.assertEqual(fp.encoding, encoding)
1278                self.assertEqual(fp.mode, 'r')
1279
1280        # test BOM (no coding cookie)
1281        with open(filename, 'w', encoding='utf-8-sig') as fp:
1282            print("print('euro:\u20ac')", file=fp)
1283        with tokenize_open(filename) as fp:
1284            self.assertEqual(fp.encoding, 'utf-8-sig')
1285            self.assertEqual(fp.mode, 'r')
1286
1287    def test_filename_in_exception(self):
1288        # When possible, include the file name in the exception.
1289        path = 'some_file_path'
1290        lines = (
1291            b'print("\xdf")', # Latin-1: LATIN SMALL LETTER SHARP S
1292            )
1293        class Bunk:
1294            def __init__(self, lines, path):
1295                self.name = path
1296                self._lines = lines
1297                self._index = 0
1298
1299            def readline(self):
1300                if self._index == len(lines):
1301                    raise StopIteration
1302                line = lines[self._index]
1303                self._index += 1
1304                return line
1305
1306        with self.assertRaises(SyntaxError):
1307            ins = Bunk(lines, path)
1308            # Make sure lacking a name isn't an issue.
1309            del ins.name
1310            detect_encoding(ins.readline)
1311        with self.assertRaisesRegex(SyntaxError, '.*{}'.format(path)):
1312            ins = Bunk(lines, path)
1313            detect_encoding(ins.readline)
1314
1315    def test_open_error(self):
1316        # Issue #23840: open() must close the binary file on error
1317        m = BytesIO(b'#coding:xxx')
1318        with mock.patch('tokenize._builtin_open', return_value=m):
1319            self.assertRaises(SyntaxError, tokenize_open, 'foobar')
1320        self.assertTrue(m.closed)
1321
1322
1323class TestTokenize(TestCase):
1324
1325    def test_tokenize(self):
1326        import tokenize as tokenize_module
1327        encoding = object()
1328        encoding_used = None
1329        def mock_detect_encoding(readline):
1330            return encoding, [b'first', b'second']
1331
1332        def mock__tokenize(readline, encoding):
1333            nonlocal encoding_used
1334            encoding_used = encoding
1335            out = []
1336            while True:
1337                next_line = readline()
1338                if next_line:
1339                    out.append(next_line)
1340                    continue
1341                return out
1342
1343        counter = 0
1344        def mock_readline():
1345            nonlocal counter
1346            counter += 1
1347            if counter == 5:
1348                return b''
1349            return str(counter).encode()
1350
1351        orig_detect_encoding = tokenize_module.detect_encoding
1352        orig__tokenize = tokenize_module._tokenize
1353        tokenize_module.detect_encoding = mock_detect_encoding
1354        tokenize_module._tokenize = mock__tokenize
1355        try:
1356            results = tokenize(mock_readline)
1357            self.assertEqual(list(results),
1358                             [b'first', b'second', b'1', b'2', b'3', b'4'])
1359        finally:
1360            tokenize_module.detect_encoding = orig_detect_encoding
1361            tokenize_module._tokenize = orig__tokenize
1362
1363        self.assertEqual(encoding_used, encoding)
1364
1365    def test_oneline_defs(self):
1366        buf = []
1367        for i in range(500):
1368            buf.append('def i{i}(): return {i}'.format(i=i))
1369        buf.append('OK')
1370        buf = '\n'.join(buf)
1371
1372        # Test that 500 consequent, one-line defs is OK
1373        toks = list(tokenize(BytesIO(buf.encode('utf-8')).readline))
1374        self.assertEqual(toks[-3].string, 'OK') # [-1] is always ENDMARKER
1375                                                # [-2] is always NEWLINE
1376
1377    def assertExactTypeEqual(self, opstr, *optypes):
1378        tokens = list(tokenize(BytesIO(opstr.encode('utf-8')).readline))
1379        num_optypes = len(optypes)
1380        self.assertEqual(len(tokens), 3 + num_optypes)
1381        self.assertEqual(tok_name[tokens[0].exact_type],
1382                         tok_name[ENCODING])
1383        for i in range(num_optypes):
1384            self.assertEqual(tok_name[tokens[i + 1].exact_type],
1385                             tok_name[optypes[i]])
1386        self.assertEqual(tok_name[tokens[1 + num_optypes].exact_type],
1387                         tok_name[token.NEWLINE])
1388        self.assertEqual(tok_name[tokens[2 + num_optypes].exact_type],
1389                         tok_name[token.ENDMARKER])
1390
1391    def test_exact_type(self):
1392        self.assertExactTypeEqual('()', token.LPAR, token.RPAR)
1393        self.assertExactTypeEqual('[]', token.LSQB, token.RSQB)
1394        self.assertExactTypeEqual(':', token.COLON)
1395        self.assertExactTypeEqual(',', token.COMMA)
1396        self.assertExactTypeEqual(';', token.SEMI)
1397        self.assertExactTypeEqual('+', token.PLUS)
1398        self.assertExactTypeEqual('-', token.MINUS)
1399        self.assertExactTypeEqual('*', token.STAR)
1400        self.assertExactTypeEqual('/', token.SLASH)
1401        self.assertExactTypeEqual('|', token.VBAR)
1402        self.assertExactTypeEqual('&', token.AMPER)
1403        self.assertExactTypeEqual('<', token.LESS)
1404        self.assertExactTypeEqual('>', token.GREATER)
1405        self.assertExactTypeEqual('=', token.EQUAL)
1406        self.assertExactTypeEqual('.', token.DOT)
1407        self.assertExactTypeEqual('%', token.PERCENT)
1408        self.assertExactTypeEqual('{}', token.LBRACE, token.RBRACE)
1409        self.assertExactTypeEqual('==', token.EQEQUAL)
1410        self.assertExactTypeEqual('!=', token.NOTEQUAL)
1411        self.assertExactTypeEqual('<=', token.LESSEQUAL)
1412        self.assertExactTypeEqual('>=', token.GREATEREQUAL)
1413        self.assertExactTypeEqual('~', token.TILDE)
1414        self.assertExactTypeEqual('^', token.CIRCUMFLEX)
1415        self.assertExactTypeEqual('<<', token.LEFTSHIFT)
1416        self.assertExactTypeEqual('>>', token.RIGHTSHIFT)
1417        self.assertExactTypeEqual('**', token.DOUBLESTAR)
1418        self.assertExactTypeEqual('+=', token.PLUSEQUAL)
1419        self.assertExactTypeEqual('-=', token.MINEQUAL)
1420        self.assertExactTypeEqual('*=', token.STAREQUAL)
1421        self.assertExactTypeEqual('/=', token.SLASHEQUAL)
1422        self.assertExactTypeEqual('%=', token.PERCENTEQUAL)
1423        self.assertExactTypeEqual('&=', token.AMPEREQUAL)
1424        self.assertExactTypeEqual('|=', token.VBAREQUAL)
1425        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1426        self.assertExactTypeEqual('^=', token.CIRCUMFLEXEQUAL)
1427        self.assertExactTypeEqual('<<=', token.LEFTSHIFTEQUAL)
1428        self.assertExactTypeEqual('>>=', token.RIGHTSHIFTEQUAL)
1429        self.assertExactTypeEqual('**=', token.DOUBLESTAREQUAL)
1430        self.assertExactTypeEqual('//', token.DOUBLESLASH)
1431        self.assertExactTypeEqual('//=', token.DOUBLESLASHEQUAL)
1432        self.assertExactTypeEqual(':=', token.COLONEQUAL)
1433        self.assertExactTypeEqual('...', token.ELLIPSIS)
1434        self.assertExactTypeEqual('->', token.RARROW)
1435        self.assertExactTypeEqual('@', token.AT)
1436        self.assertExactTypeEqual('@=', token.ATEQUAL)
1437
1438        self.assertExactTypeEqual('a**2+b**2==c**2',
1439                                  NAME, token.DOUBLESTAR, NUMBER,
1440                                  token.PLUS,
1441                                  NAME, token.DOUBLESTAR, NUMBER,
1442                                  token.EQEQUAL,
1443                                  NAME, token.DOUBLESTAR, NUMBER)
1444        self.assertExactTypeEqual('{1, 2, 3}',
1445                                  token.LBRACE,
1446                                  token.NUMBER, token.COMMA,
1447                                  token.NUMBER, token.COMMA,
1448                                  token.NUMBER,
1449                                  token.RBRACE)
1450        self.assertExactTypeEqual('^(x & 0x1)',
1451                                  token.CIRCUMFLEX,
1452                                  token.LPAR,
1453                                  token.NAME, token.AMPER, token.NUMBER,
1454                                  token.RPAR)
1455
1456    def test_pathological_trailing_whitespace(self):
1457        # See http://bugs.python.org/issue16152
1458        self.assertExactTypeEqual('@          ', token.AT)
1459
1460
1461class UntokenizeTest(TestCase):
1462
1463    def test_bad_input_order(self):
1464        # raise if previous row
1465        u = Untokenizer()
1466        u.prev_row = 2
1467        u.prev_col = 2
1468        with self.assertRaises(ValueError) as cm:
1469            u.add_whitespace((1,3))
1470        self.assertEqual(cm.exception.args[0],
1471                'start (1,3) precedes previous end (2,2)')
1472        # raise if previous column in row
1473        self.assertRaises(ValueError, u.add_whitespace, (2,1))
1474
1475    def test_backslash_continuation(self):
1476        # The problem is that <whitespace>\<newline> leaves no token
1477        u = Untokenizer()
1478        u.prev_row = 1
1479        u.prev_col =  1
1480        u.tokens = []
1481        u.add_whitespace((2, 0))
1482        self.assertEqual(u.tokens, ['\\\n'])
1483        u.prev_row = 2
1484        u.add_whitespace((4, 4))
1485        self.assertEqual(u.tokens, ['\\\n', '\\\n\\\n', '    '])
1486        TestRoundtrip.check_roundtrip(self, 'a\n  b\n    c\n  \\\n  c\n')
1487
1488    def test_iter_compat(self):
1489        u = Untokenizer()
1490        token = (NAME, 'Hello')
1491        tokens = [(ENCODING, 'utf-8'), token]
1492        u.compat(token, iter([]))
1493        self.assertEqual(u.tokens, ["Hello "])
1494        u = Untokenizer()
1495        self.assertEqual(u.untokenize(iter([token])), 'Hello ')
1496        u = Untokenizer()
1497        self.assertEqual(u.untokenize(iter(tokens)), 'Hello ')
1498        self.assertEqual(u.encoding, 'utf-8')
1499        self.assertEqual(untokenize(iter(tokens)), b'Hello ')
1500
1501
1502class TestRoundtrip(TestCase):
1503
1504    def check_roundtrip(self, f):
1505        """
1506        Test roundtrip for `untokenize`. `f` is an open file or a string.
1507        The source code in f is tokenized to both 5- and 2-tuples.
1508        Both sequences are converted back to source code via
1509        tokenize.untokenize(), and the latter tokenized again to 2-tuples.
1510        The test fails if the 3 pair tokenizations do not match.
1511
1512        When untokenize bugs are fixed, untokenize with 5-tuples should
1513        reproduce code that does not contain a backslash continuation
1514        following spaces.  A proper test should test this.
1515        """
1516        # Get source code and original tokenizations
1517        if isinstance(f, str):
1518            code = f.encode('utf-8')
1519        else:
1520            code = f.read()
1521            f.close()
1522        readline = iter(code.splitlines(keepends=True)).__next__
1523        tokens5 = list(tokenize(readline))
1524        tokens2 = [tok[:2] for tok in tokens5]
1525        # Reproduce tokens2 from pairs
1526        bytes_from2 = untokenize(tokens2)
1527        readline2 = iter(bytes_from2.splitlines(keepends=True)).__next__
1528        tokens2_from2 = [tok[:2] for tok in tokenize(readline2)]
1529        self.assertEqual(tokens2_from2, tokens2)
1530        # Reproduce tokens2 from 5-tuples
1531        bytes_from5 = untokenize(tokens5)
1532        readline5 = iter(bytes_from5.splitlines(keepends=True)).__next__
1533        tokens2_from5 = [tok[:2] for tok in tokenize(readline5)]
1534        self.assertEqual(tokens2_from5, tokens2)
1535
1536    def test_roundtrip(self):
1537        # There are some standard formatting practices that are easy to get right.
1538
1539        self.check_roundtrip("if x == 1:\n"
1540                             "    print(x)\n")
1541        self.check_roundtrip("# This is a comment\n"
1542                             "# This also\n")
1543
1544        # Some people use different formatting conventions, which makes
1545        # untokenize a little trickier. Note that this test involves trailing
1546        # whitespace after the colon. Note that we use hex escapes to make the
1547        # two trailing blanks apparent in the expected output.
1548
1549        self.check_roundtrip("if x == 1 : \n"
1550                             "  print(x)\n")
1551        fn = support.findfile("tokenize_tests.txt")
1552        with open(fn, 'rb') as f:
1553            self.check_roundtrip(f)
1554        self.check_roundtrip("if x == 1:\n"
1555                             "    # A comment by itself.\n"
1556                             "    print(x) # Comment here, too.\n"
1557                             "    # Another comment.\n"
1558                             "after_if = True\n")
1559        self.check_roundtrip("if (x # The comments need to go in the right place\n"
1560                             "    == 1):\n"
1561                             "    print('x==1')\n")
1562        self.check_roundtrip("class Test: # A comment here\n"
1563                             "  # A comment with weird indent\n"
1564                             "  after_com = 5\n"
1565                             "  def x(m): return m*5 # a one liner\n"
1566                             "  def y(m): # A whitespace after the colon\n"
1567                             "     return y*4 # 3-space indent\n")
1568
1569        # Some error-handling code
1570        self.check_roundtrip("try: import somemodule\n"
1571                             "except ImportError: # comment\n"
1572                             "    print('Can not import' # comment2\n)"
1573                             "else:   print('Loaded')\n")
1574
1575    def test_continuation(self):
1576        # Balancing continuation
1577        self.check_roundtrip("a = (3,4, \n"
1578                             "5,6)\n"
1579                             "y = [3, 4,\n"
1580                             "5]\n"
1581                             "z = {'a': 5,\n"
1582                             "'b':15, 'c':True}\n"
1583                             "x = len(y) + 5 - a[\n"
1584                             "3] - a[2]\n"
1585                             "+ len(z) - z[\n"
1586                             "'b']\n")
1587
1588    def test_backslash_continuation(self):
1589        # Backslash means line continuation, except for comments
1590        self.check_roundtrip("x=1+\\\n"
1591                             "1\n"
1592                             "# This is a comment\\\n"
1593                             "# This also\n")
1594        self.check_roundtrip("# Comment \\\n"
1595                             "x = 0")
1596
1597    def test_string_concatenation(self):
1598        # Two string literals on the same line
1599        self.check_roundtrip("'' ''")
1600
1601    def test_random_files(self):
1602        # Test roundtrip on random python modules.
1603        # pass the '-ucpu' option to process the full directory.
1604
1605        import glob, random
1606        fn = support.findfile("tokenize_tests.txt")
1607        tempdir = os.path.dirname(fn) or os.curdir
1608        testfiles = glob.glob(os.path.join(tempdir, "test*.py"))
1609
1610        # Tokenize is broken on test_pep3131.py because regular expressions are
1611        # broken on the obscure unicode identifiers in it. *sigh*
1612        # With roundtrip extended to test the 5-tuple mode of untokenize,
1613        # 7 more testfiles fail.  Remove them also until the failure is diagnosed.
1614
1615        testfiles.remove(os.path.join(tempdir, "test_unicode_identifiers.py"))
1616        for f in ('buffer', 'builtin', 'fileio', 'inspect', 'os', 'platform', 'sys'):
1617            testfiles.remove(os.path.join(tempdir, "test_%s.py") % f)
1618
1619        if not support.is_resource_enabled("cpu"):
1620            testfiles = random.sample(testfiles, 10)
1621
1622        for testfile in testfiles:
1623            if support.verbose >= 2:
1624                print('tokenize', testfile)
1625            with open(testfile, 'rb') as f:
1626                with self.subTest(file=testfile):
1627                    self.check_roundtrip(f)
1628
1629
1630    def roundtrip(self, code):
1631        if isinstance(code, str):
1632            code = code.encode('utf-8')
1633        return untokenize(tokenize(BytesIO(code).readline)).decode('utf-8')
1634
1635    def test_indentation_semantics_retained(self):
1636        """
1637        Ensure that although whitespace might be mutated in a roundtrip,
1638        the semantic meaning of the indentation remains consistent.
1639        """
1640        code = "if False:\n\tx=3\n\tx=3\n"
1641        codelines = self.roundtrip(code).split('\n')
1642        self.assertEqual(codelines[1], codelines[2])
1643        self.check_roundtrip(code)
1644
1645
1646if __name__ == "__main__":
1647    unittest.main()
1648