# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation.
# All rights reserved.
"""Tokenization help for Python programs.

generate_tokens(readline) is a generator that breaks a stream of
text into Python tokens.  It accepts a readline-like method which is called
repeatedly to get the next line of input (or "" for EOF).  It generates
5-tuples with these members:

    the token type (see token.py)
    the token (a string)
    the starting (row, column) indices of the token (a 2-tuple of ints)
    the ending (row, column) indices of the token (a 2-tuple of ints)
    the original line (string)

It is designed to match the working of the Python tokenizer exactly, except
that it produces COMMENT tokens for comments and gives type OP for all
operators.

Older entry points
    tokenize_loop(readline, tokeneater)
    tokenize(readline, tokeneater=printtoken)
are the same, except instead of generating tokens, tokeneater is a callback
function to which the 5 fields described above are passed as 5 arguments,
each time a new token is found."""

__author__ = 'Ka-Ping Yee <ping@lfw.org>'
__credits__ = \
    'GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, Skip Montanaro'

import re
import string
from codecs import BOM_UTF8
from codecs import lookup

from . import token
from .token import ASYNC
from .token import AWAIT
from .token import COMMENT
from .token import DEDENT
from .token import ENDMARKER
from .token import ERRORTOKEN
from .token import INDENT
from .token import NAME
from .token import NEWLINE
from .token import NL
from .token import NUMBER
from .token import OP
from .token import STRING
from .token import tok_name

__all__ = [x for x in dir(token) if x[0] != '_'
          ] + ['tokenize', 'generate_tokens', 'untokenize']
del token

try:
  bytes
except NameError:
  # Support bytes type in Python <= 2.5, so 2to3 turns itself into
  # valid Python 3 code.
  bytes = str


def group(*choices):
  return '(' + '|'.join(choices) + ')'


def any(*choices):
  return group(*choices) + '*'


def maybe(*choices):
  return group(*choices) + '?'


def _combinations(*l):  # noqa: E741
  return set(
      x + y for x in l for y in l + ('',) if x.casefold() != y.casefold())
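
# Illustrative examples (not in the original source) of what these small
# regex helpers build:
#   group('a', 'b')  -> '(a|b)'
#   any('a', 'b')    -> '(a|b)*'
#   maybe('a', 'b')  -> '(a|b)?'
#   _combinations('r', 'b') -> {'r', 'b', 'rb', 'br'}; pairs whose parts are
#   equal ignoring case (e.g. 'rr') are dropped.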


Whitespace = r'[ \f\t]*'
Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'

Binnumber = r'0[bB]_?[01]+(?:_[01]+)*'
Hexnumber = r'0[xX]_?[\da-fA-F]+(?:_[\da-fA-F]+)*[lL]?'
Octnumber = r'0[oO]?_?[0-7]+(?:_[0-7]+)*[lL]?'
Decnumber = group(r'[1-9]\d*(?:_\d+)*[lL]?', '0[lL]?')
Intnumber = group(Binnumber, Hexnumber, Octnumber, Decnumber)
Exponent = r'[eE][-+]?\d+(?:_\d+)*'
Pointfloat = group(r'\d+(?:_\d+)*\.(?:\d+(?:_\d+)*)?',
                   r'\.\d+(?:_\d+)*') + maybe(Exponent)
Expfloat = r'\d+(?:_\d+)*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'\d+(?:_\d+)*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)

# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
# Tail end of ''' string.
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
_litprefix = r'(?:[uUrRbBfF]|[rR][fFbB]|[fFbBuU][rR])?'
Triple = group(_litprefix + "'''", _litprefix + '"""')
# Single-line ' or " string.
String = group(_litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
               _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')

# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
# recognized as two instances of =).
Operator = group(r'\*\*=?', r'>>=?', r'<<=?', r'<>', r'!=', r'//=?', r'->',
                 r'[+\-*/%&@|^=<>]=?', r'~')

Bracket = '[][(){}]'
Special = group(r'\r?\n', r':=', r'[:;.,`@]')
Funny = group(Operator, Bracket, Special)

PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken

# First (or only) line of ' or " string.
ContStr = group(
    _litprefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" + group("'", r'\\\r?\n'),
    _litprefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' + group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)

tokenprog, pseudoprog, single3prog, double3prog = map(
    re.compile, (Token, PseudoToken, Single3, Double3))

_strprefixes = (
    _combinations('r', 'R', 'f', 'F') | _combinations('r', 'R', 'b', 'B')
    | {'u', 'U', 'ur', 'uR', 'Ur', 'UR'})

endprogs = {
    "'": re.compile(Single),
    '"': re.compile(Double),
    "'''": single3prog,
    '"""': double3prog,
    **{
        f"{prefix}'''": single3prog for prefix in _strprefixes
    },
    **{
        f'{prefix}"""': double3prog for prefix in _strprefixes
    },
    **{
        prefix: None for prefix in _strprefixes
    }
}

triple_quoted = ({"'''", '"""'} | {f"{prefix}'''" for prefix in _strprefixes}
                 | {f'{prefix}"""' for prefix in _strprefixes})
single_quoted = ({"'", '"'} | {f"{prefix}'" for prefix in _strprefixes}
                 | {f'{prefix}"' for prefix in _strprefixes})
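
# For example (illustrative, not in the original source): triple_quoted ends
# up containing entries such as "'''", 'f"""' and "Rb'''", and single_quoted
# the corresponding one-quote forms such as "r'" and 'b"'.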

tabsize = 8


class TokenError(Exception):
  pass


class StopTokenizing(Exception):
  pass


def printtoken(type, token, start, end, line):  # for testing
  (srow, scol) = start
  (erow, ecol) = end
  print('%d,%d-%d,%d:\t%s\t%s' %
        (srow, scol, erow, ecol, tok_name[type], repr(token)))
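
# For example (illustrative, not in the original source):
#   printtoken(NAME, 'foo', (1, 0), (1, 3), 'foo = 1\n')
# prints "1,0-1,3:\tNAME\t'foo'".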


def tokenize(readline, tokeneater=printtoken):
  """
    The tokenize() function accepts two parameters: one representing the
    input stream, and one providing an output mechanism for tokenize().

    The first parameter, readline, must be a callable object which provides
    the same interface as the readline() method of built-in file objects.
    Each call to the function should return one line of input as a string.

    The second parameter, tokeneater, must also be a callable object. It is
    called once for each token, with five arguments, corresponding to the
    tuples generated by generate_tokens().
    """
  try:
    tokenize_loop(readline, tokeneater)
  except StopTokenizing:
    pass


# backwards compatible interface
def tokenize_loop(readline, tokeneater):
  for token_info in generate_tokens(readline):
    tokeneater(*token_info)
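

# Illustrative sketch (not part of the original module): using the older
# tokenize()/tokeneater interface to collect tokens into a list.  The helper
# name below is hypothetical and nothing in this module calls it.
def _example_collect_tokens(source):
  """Return (token name, token string) pairs gathered via a tokeneater."""
  collected = []

  def eater(type, token, start, end, line):  # same signature as printtoken
    collected.append((tok_name[type], token))

  tokenize(iter(source.splitlines(True)).__next__, eater)
  return collected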


class Untokenizer:

  def __init__(self):
    self.tokens = []
    self.prev_row = 1
    self.prev_col = 0

  def add_whitespace(self, start):
    row, col = start
    assert row <= self.prev_row
    col_offset = col - self.prev_col
    if col_offset:
      self.tokens.append(' ' * col_offset)

  def untokenize(self, iterable):
    for t in iterable:
      if len(t) == 2:
        self.compat(t, iterable)
        break
      tok_type, token, start, end, line = t
      self.add_whitespace(start)
      self.tokens.append(token)
      self.prev_row, self.prev_col = end
      if tok_type in (NEWLINE, NL):
        self.prev_row += 1
        self.prev_col = 0
    return ''.join(self.tokens)

  def compat(self, token, iterable):
    startline = False
    indents = []
    toks_append = self.tokens.append
    toknum, tokval = token
    if toknum in (NAME, NUMBER):
      tokval += ' '
    if toknum in (NEWLINE, NL):
      startline = True
    for tok in iterable:
      toknum, tokval = tok[:2]

      if toknum in (NAME, NUMBER, ASYNC, AWAIT):
        tokval += ' '

      if toknum == INDENT:
        indents.append(tokval)
        continue
      elif toknum == DEDENT:
        indents.pop()
        continue
      elif toknum in (NEWLINE, NL):
        startline = True
      elif startline and indents:
        toks_append(indents[-1])
        startline = False
      toks_append(tokval)


cookie_re = re.compile(r'^[ \t\f]*#.*?coding[:=][ \t]*([-\w.]+)', re.ASCII)
blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
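
# For example (illustrative, not in the original source): cookie_re matches
# '# -*- coding: utf-8 -*-' and captures 'utf-8'; blank_re matches byte lines
# that are empty or whose first non-whitespace character is '#' or a newline.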


def _get_normal_name(orig_enc):
  """Imitates get_normal_name in tokenizer.c."""
  # Only care about the first 12 characters.
  enc = orig_enc[:12].lower().replace('_', '-')
  if enc == 'utf-8' or enc.startswith('utf-8-'):
    return 'utf-8'
  if enc in ('latin-1', 'iso-8859-1', 'iso-latin-1') or \
     enc.startswith(('latin-1-', 'iso-8859-1-', 'iso-latin-1-')):
    return 'iso-8859-1'
  return orig_enc
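
# For example (illustrative): _get_normal_name('UTF_8') returns 'utf-8' and
# _get_normal_name('Latin-1') returns 'iso-8859-1'.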


def detect_encoding(readline):
  """
    The detect_encoding() function is used to detect the encoding that should
    be used to decode a Python source file. It requires one argument, readline,
    in the same way as the tokenize() generator.

    It will call readline a maximum of twice, and return the encoding used
    (as a string) and a list of any lines (left as bytes) it has read
    in.

    It detects the encoding from the presence of a utf-8 bom or an encoding
    cookie as specified in pep-0263. If both a bom and a cookie are present, but
    disagree, a SyntaxError will be raised. If the encoding cookie is an invalid
    charset, raise a SyntaxError.  Note that if a utf-8 bom is found,
    'utf-8-sig' is returned.

    If no encoding is specified, then the default of 'utf-8' will be returned.
    """
  bom_found = False
  encoding = None
  default = 'utf-8'

  def read_or_stop():
    try:
      return readline()
    except StopIteration:
      return bytes()

  def find_cookie(line):
    try:
      line_string = line.decode('ascii')
    except UnicodeDecodeError:
      return None
    match = cookie_re.match(line_string)
    if not match:
      return None
    encoding = _get_normal_name(match.group(1))
    try:
      codec = lookup(encoding)
    except LookupError:
      # This behaviour mimics the Python interpreter
      raise SyntaxError('unknown encoding: ' + encoding)

    if bom_found:
      if codec.name != 'utf-8':
        # This behaviour mimics the Python interpreter
        raise SyntaxError('encoding problem: utf-8')
      encoding += '-sig'
    return encoding

  first = read_or_stop()
  if first.startswith(BOM_UTF8):
    bom_found = True
    first = first[3:]
    default = 'utf-8-sig'
  if not first:
    return default, []

  encoding = find_cookie(first)
  if encoding:
    return encoding, [first]
  if not blank_re.match(first):
    return default, [first]

  second = read_or_stop()
  if not second:
    return default, [first]

  encoding = find_cookie(second)
  if encoding:
    return encoding, [first, second]

  return default, [first, second]
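

# Illustrative sketch (not part of the original module): detect_encoding()
# driven by an in-memory byte stream.  The 'io' import and the helper name
# below are assumptions for demonstration only; nothing here calls it.
def _example_detect_encoding(source_bytes):
  """Return the detected encoding of a byte string of Python source."""
  import io
  encoding, _ = detect_encoding(io.BytesIO(source_bytes).readline)
  # e.g. _example_detect_encoding(b'# -*- coding: latin-1 -*-\n')
  # returns 'iso-8859-1'.
  return encoding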


def untokenize(iterable):
  """Transform tokens back into Python source code.

    Each element returned by the iterable must be a token sequence
    with at least two elements, a token number and token value.  If
    only two tokens are passed, the resulting output is poor.

    Round-trip invariant for full input:
        Untokenized source will match input source exactly

    Round-trip invariant for limited input:
        # Output text will tokenize back to the input
        t1 = [tok[:2] for tok in generate_tokens(f.readline)]
        newcode = untokenize(t1)
        readline = iter(newcode.splitlines(1)).__next__
        t2 = [tok[:2] for tok in generate_tokens(readline)]
        assert t1 == t2
    """
  ut = Untokenizer()
  return ut.untokenize(iterable)


def generate_tokens(readline):
  """
    The generate_tokens() generator requires one argument, readline, which
    must be a callable object which provides the same interface as the
    readline() method of built-in file objects. Each call to the function
    should return one line of input as a string.  Alternately, readline
    can be a callable function terminating with StopIteration:
        readline = open(myfile).__next__    # Example of alternate readline

    The generator produces 5-tuples with these members: the token type; the
    token string; a 2-tuple (srow, scol) of ints specifying the row and
    column where the token begins in the source; a 2-tuple (erow, ecol) of
    ints specifying the row and column where the token ends in the source;
    and the line on which the token was found. The line passed is the
    physical line.
    """
  strstart = ''
  endprog = ''
  lnum = parenlev = continued = 0
  contstr, needcont = '', 0
  contline = None
  indents = [0]

  # 'stashed' and 'async_*' are used for async/await parsing
  stashed = None
  async_def = False
  async_def_indent = 0
  async_def_nl = False

  while 1:  # loop over lines in stream
    try:
      line = readline()
    except StopIteration:
      line = ''
    lnum = lnum + 1
    pos, max = 0, len(line)

    if contstr:  # continued string
      if not line:
        raise TokenError('EOF in multi-line string', strstart)
      endmatch = endprog.match(line)
      if endmatch:
        pos = end = endmatch.end(0)
        yield (STRING, contstr + line[:end], strstart, (lnum, end),
               contline + line)
        contstr, needcont = '', 0
        contline = None
      elif needcont and line[-2:] != '\\\n' and line[-3:] != '\\\r\n':
        yield (ERRORTOKEN, contstr + line, strstart, (lnum, len(line)),
               contline)
        contstr = ''
        contline = None
        continue
      else:
        contstr = contstr + line
        contline = contline + line
        continue

    elif parenlev == 0 and not continued:  # new statement
      if not line:
        break
      column = 0
      while pos < max:  # measure leading whitespace
        if line[pos] == ' ':
          column = column + 1
        elif line[pos] == '\t':
          column = (column // tabsize + 1) * tabsize
        elif line[pos] == '\f':
          column = 0
        else:
          break
        pos = pos + 1
      if pos == max:
        break

      if stashed:
        yield stashed
        stashed = None

      if line[pos] in '#\r\n':  # skip comments or blank lines
        if line[pos] == '#':
          comment_token = line[pos:].rstrip('\r\n')
          nl_pos = pos + len(comment_token)
          yield (COMMENT, comment_token, (lnum, pos),
                 (lnum, pos + len(comment_token)), line)
          yield (NL, line[nl_pos:], (lnum, nl_pos), (lnum, len(line)), line)
        else:
          yield ((NL, COMMENT)[line[pos] == '#'], line[pos:], (lnum, pos),
                 (lnum, len(line)), line)
        continue

      if column > indents[-1]:  # count indents or dedents
        indents.append(column)
        yield (INDENT, line[:pos], (lnum, 0), (lnum, pos), line)
      while column < indents[-1]:
        if column not in indents:
          raise IndentationError(
              'unindent does not match any outer indentation level',
              ('<tokenize>', lnum, pos, line))
        indents = indents[:-1]

        if async_def and async_def_indent >= indents[-1]:
          async_def = False
          async_def_nl = False
          async_def_indent = 0

        yield (DEDENT, '', (lnum, pos), (lnum, pos), line)

      if async_def and async_def_nl and async_def_indent >= indents[-1]:
        async_def = False
        async_def_nl = False
        async_def_indent = 0

    else:  # continued statement
      if not line:
        raise TokenError('EOF in multi-line statement', (lnum, 0))
      continued = 0

    while pos < max:
      pseudomatch = pseudoprog.match(line, pos)
      if pseudomatch:  # scan for tokens
        start, end = pseudomatch.span(1)
        spos, epos, pos = (lnum, start), (lnum, end), end
        token, initial = line[start:end], line[start]

        if initial in string.digits or \
           (initial == '.' and token != '.'):      # ordinary number
          yield (NUMBER, token, spos, epos, line)
        elif initial in '\r\n':
          newline = NEWLINE
          if parenlev > 0:
            newline = NL
          elif async_def:
            async_def_nl = True
          if stashed:
            yield stashed
            stashed = None
          yield (newline, token, spos, epos, line)

        elif initial == '#':
          assert not token.endswith('\n')
          if stashed:
            yield stashed
            stashed = None
          yield (COMMENT, token, spos, epos, line)
        elif token in triple_quoted:
          endprog = endprogs[token]
          endmatch = endprog.match(line, pos)
          if endmatch:  # all on one line
            pos = endmatch.end(0)
            token = line[start:pos]
            if stashed:
              yield stashed
              stashed = None
            yield (STRING, token, spos, (lnum, pos), line)
          else:
            strstart = (lnum, start)  # multiple lines
            contstr = line[start:]
            contline = line
            break
        elif initial in single_quoted or \
            token[:2] in single_quoted or \
            token[:3] in single_quoted:
          if token[-1] == '\n':  # continued string
            strstart = (lnum, start)  # noqa: F841
            endprog = (
                endprogs[initial] or endprogs[token[1]] or endprogs[token[2]])
            contstr, needcont = line[start:], 1
            contline = line
            break
          else:  # ordinary string
            if stashed:
              yield stashed
              stashed = None
            yield (STRING, token, spos, epos, line)
        elif initial.isidentifier():  # ordinary name
          if token in ('async', 'await'):
            if async_def:
              yield (ASYNC if token == 'async' else AWAIT, token, spos, epos,
                     line)
              continue

          tok = (NAME, token, spos, epos, line)
          if token == 'async' and not stashed:
            stashed = tok
            continue

          if token in ('def', 'for'):
            if (stashed and stashed[0] == NAME and stashed[1] == 'async'):

              if token == 'def':
                async_def = True
                async_def_indent = indents[-1]

              yield (ASYNC, stashed[1], stashed[2], stashed[3], stashed[4])
              stashed = None

          if stashed:
            yield stashed
            stashed = None

          yield tok
        elif initial == '\\':  # continued stmt
          # This yield is new; needed for better idempotency:
          if stashed:
            yield stashed
            stashed = None
          yield (NL, token, spos, (lnum, pos), line)
          continued = 1
        else:
          if initial in '([{':
            parenlev = parenlev + 1
          elif initial in ')]}':
            parenlev = parenlev - 1
          if stashed:
            yield stashed
            stashed = None
          yield (OP, token, spos, epos, line)
      else:
        yield (ERRORTOKEN, line[pos], (lnum, pos), (lnum, pos + 1), line)
        pos = pos + 1

  if stashed:
    yield stashed
    stashed = None

  for indent in indents[1:]:  # pop remaining indent levels
    yield (DEDENT, '', (lnum, 0), (lnum, 0), '')
  yield (ENDMARKER, '', (lnum, 0), (lnum, 0), '')
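

# Illustrative sketch (not part of the original module): driving
# generate_tokens() over an in-memory source string, as described in the
# module docstring.  The helper name below is hypothetical and unused.
def _example_generate_tokens(source):
  """Return (token name, token string) pairs for the given source text."""
  readline = iter(source.splitlines(True)).__next__
  return [
      (tok_name[typ], tok) for typ, tok, _, _, _ in generate_tokens(readline)
  ]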


if __name__ == '__main__':  # testing
  import sys
  if len(sys.argv) > 1:
    tokenize(open(sys.argv[1]).readline)
  else:
    tokenize(sys.stdin.readline)