1 
2 /* Tokenizer implementation */
3 
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6 
7 #include <ctype.h>
8 #include <assert.h>
9 
10 #include "tokenizer.h"
11 #include "errcode.h"
12 
13 #include "unicodeobject.h"
14 #include "bytesobject.h"
15 #include "fileobject.h"
16 #include "abstract.h"
17 
18 /* Alternate tab spacing */
19 #define ALTTABSIZE 1
20 
21 #define is_potential_identifier_start(c) (\
22               (c >= 'a' && c <= 'z')\
23                || (c >= 'A' && c <= 'Z')\
24                || c == '_'\
25                || (c >= 128))
26 
27 #define is_potential_identifier_char(c) (\
28               (c >= 'a' && c <= 'z')\
29                || (c >= 'A' && c <= 'Z')\
30                || (c >= '0' && c <= '9')\
31                || c == '_'\
32                || (c >= 128))
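/* Editorial illustration (not part of the original source): a minimal sketch of
   how these byte-level macros classify characters.  Any byte >= 128 is only a
   *potential* identifier character here; full PEP 3131 validation is deferred to
   verify_identifier() further down in this file. */
#if 0
static void example_identifier_macros(void)
{
    assert(is_potential_identifier_start('_'));
    assert(is_potential_identifier_start('A'));
    assert(!is_potential_identifier_start('1'));  /* digits cannot start a name... */
    assert(is_potential_identifier_char('1'));    /* ...but may continue one */
    assert(is_potential_identifier_start(0xC3));  /* any non-ASCII byte passes this cheap test */
}
#endif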
33 
34 
35 /* Don't ever change this -- it would break the portability of Python code */
36 #define TABSIZE 8
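/* Editorial illustration (not part of the original source): with TABSIZE == 8 a
   tab advances the logical column to the next multiple of 8, mirroring the
   computation used later in tok_get(). */
#if 0
static void example_tab_columns(void)
{
    int col = 3;
    col = (col / TABSIZE + 1) * TABSIZE;   /* column 3 -> 8 */
    assert(col == 8);
    col = (col / TABSIZE + 1) * TABSIZE;   /* column 8 -> 16 */
    assert(col == 16);
}
#endif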
37 
38 /* Forward */
39 static struct tok_state *tok_new(void);
40 static int tok_nextc(struct tok_state *tok);
41 static void tok_backup(struct tok_state *tok, int c);
42 static int syntaxerror(struct tok_state *tok, const char *format, ...);
43 
44 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
45    tokenizing. */
46 static const char* type_comment_prefix = "# type: ";
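/* Editorial illustration (not part of the original source): the space characters
   in the prefix above match any run of spaces or tabs.  The hypothetical helper
   below re-implements the matching loop from tok_get() on a NUL-terminated
   string, purely for illustration. */
#if 0
static int example_matches_type_comment(const char *p)
{
    const char *prefix = type_comment_prefix;
    while (*prefix) {
        if (*prefix == ' ') {
            while (*p == ' ' || *p == '\t') {
                p++;                 /* a space in the prefix eats any whitespace run */
            }
        }
        else if (*prefix == *p) {
            p++;
        }
        else {
            return 0;
        }
        prefix++;
    }
    return 1;   /* e.g. "#  type:\tint" matches, "# typo: int" does not */
}
#endif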
47 
48 /* Create and initialize a new tok_state structure */
49 
50 static struct tok_state *
51 tok_new(void)
52 {
53     struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
54                                             sizeof(struct tok_state));
55     if (tok == NULL)
56         return NULL;
57     tok->buf = tok->cur = tok->inp = NULL;
58     tok->fp_interactive = 0;
59     tok->interactive_src_start = NULL;
60     tok->interactive_src_end = NULL;
61     tok->start = NULL;
62     tok->end = NULL;
63     tok->done = E_OK;
64     tok->fp = NULL;
65     tok->input = NULL;
66     tok->tabsize = TABSIZE;
67     tok->indent = 0;
68     tok->indstack[0] = 0;
69     tok->atbol = 1;
70     tok->pendin = 0;
71     tok->prompt = tok->nextprompt = NULL;
72     tok->lineno = 0;
73     tok->level = 0;
74     tok->altindstack[0] = 0;
75     tok->decoding_state = STATE_INIT;
76     tok->decoding_erred = 0;
77     tok->enc = NULL;
78     tok->encoding = NULL;
79     tok->cont_line = 0;
80     tok->filename = NULL;
81     tok->decoding_readline = NULL;
82     tok->decoding_buffer = NULL;
83     tok->type_comments = 0;
84     tok->async_hacks = 0;
85     tok->async_def = 0;
86     tok->async_def_indent = 0;
87     tok->async_def_nl = 0;
88     tok->interactive_underflow = IUNDERFLOW_NORMAL;
89     tok->str = NULL;
90     return tok;
91 }
92 
93 static char *
94 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
95 {
96     char* result = (char *)PyMem_Malloc(len + 1);
97     if (!result) {
98         tok->done = E_NOMEM;
99         return NULL;
100     }
101     memcpy(result, s, len);
102     result[len] = '\0';
103     return result;
104 }
105 
106 static char *
107 error_ret(struct tok_state *tok) /* XXX */
108 {
109     tok->decoding_erred = 1;
110     if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
111         PyMem_Free(tok->buf);
112     tok->buf = tok->cur = tok->inp = NULL;
113     tok->start = NULL;
114     tok->end = NULL;
115     tok->done = E_DECODE;
116     return NULL;                /* as if it were EOF */
117 }
118 
119 
120 static const char *
121 get_normal_name(const char *s)  /* for utf-8 and latin-1 */
122 {
123     char buf[13];
124     int i;
125     for (i = 0; i < 12; i++) {
126         int c = s[i];
127         if (c == '\0')
128             break;
129         else if (c == '_')
130             buf[i] = '-';
131         else
132             buf[i] = tolower(c);
133     }
134     buf[i] = '\0';
135     if (strcmp(buf, "utf-8") == 0 ||
136         strncmp(buf, "utf-8-", 6) == 0)
137         return "utf-8";
138     else if (strcmp(buf, "latin-1") == 0 ||
139              strcmp(buf, "iso-8859-1") == 0 ||
140              strcmp(buf, "iso-latin-1") == 0 ||
141              strncmp(buf, "latin-1-", 8) == 0 ||
142              strncmp(buf, "iso-8859-1-", 11) == 0 ||
143              strncmp(buf, "iso-latin-1-", 12) == 0)
144         return "iso-8859-1";
145     else
146         return s;
147 }
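/* Editorial illustration (not part of the original source): get_normal_name()
   lowercases the spec and maps '_' to '-', so several common spellings collapse
   onto the two canonical names handled above, and unknown names pass through. */
#if 0
static void example_get_normal_name(void)
{
    assert(strcmp(get_normal_name("UTF_8"), "utf-8") == 0);
    assert(strcmp(get_normal_name("Latin-1"), "iso-8859-1") == 0);
    assert(strcmp(get_normal_name("ascii"), "ascii") == 0);  /* unrecognized: returned unchanged */
}
#endif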
148 
149 /* Return the coding spec in S, or NULL if none is found.  */
150 
151 static int
152 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
153 {
154     Py_ssize_t i;
155     *spec = NULL;
156     /* Coding spec must be in a comment, and that comment must be
157      * the only statement on the source code line. */
158     for (i = 0; i < size - 6; i++) {
159         if (s[i] == '#')
160             break;
161         if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
162             return 1;
163     }
164     for (; i < size - 6; i++) { /* XXX inefficient search */
165         const char* t = s + i;
166         if (memcmp(t, "coding", 6) == 0) {
167             const char* begin = NULL;
168             t += 6;
169             if (t[0] != ':' && t[0] != '=')
170                 continue;
171             do {
172                 t++;
173             } while (t[0] == ' ' || t[0] == '\t');
174 
175             begin = t;
176             while (Py_ISALNUM(t[0]) ||
177                    t[0] == '-' || t[0] == '_' || t[0] == '.')
178                 t++;
179 
180             if (begin < t) {
181                 char* r = new_string(begin, t - begin, tok);
182                 const char* q;
183                 if (!r)
184                     return 0;
185                 q = get_normal_name(r);
186                 if (r != q) {
187                     PyMem_Free(r);
188                     r = new_string(q, strlen(q), tok);
189                     if (!r)
190                         return 0;
191                 }
192                 *spec = r;
193                 break;
194             }
195         }
196     }
197     return 1;
198 }
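/* Editorial illustration (not part of the original source): a coding spec is
   only picked up from a line whose first non-blank character starts a comment,
   and the name is normalized via get_normal_name().  A hedged sketch: */
#if 0
static void example_get_coding_spec(struct tok_state *tok)
{
    char *spec = NULL;
    const char *line = "# -*- coding: Latin-1 -*-\n";
    if (get_coding_spec(line, &spec, strlen(line), tok) && spec != NULL) {
        assert(strcmp(spec, "iso-8859-1") == 0);   /* normalized spelling */
        PyMem_Free(spec);
    }
    /* A line with code before the '#' never yields a spec: */
    line = "x = 1  # coding: utf-8\n";
    spec = NULL;
    get_coding_spec(line, &spec, strlen(line), tok);
    assert(spec == NULL);
}
#endif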
199 
200 /* Check whether the line contains a coding spec. If it does,
201    invoke the set_readline function for the new encoding.
202    This function receives the tok_state and the new encoding.
203    Return 1 on success, 0 on failure.  */
204 
205 static int
206 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
207                   int set_readline(struct tok_state *, const char *))
208 {
209     char *cs;
210     if (tok->cont_line) {
211         /* It's a continuation line, so it can't be a coding spec. */
212         tok->decoding_state = STATE_NORMAL;
213         return 1;
214     }
215     if (!get_coding_spec(line, &cs, size, tok)) {
216         return 0;
217     }
218     if (!cs) {
219         Py_ssize_t i;
220         for (i = 0; i < size; i++) {
221             if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222                 break;
223             if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224                 /* Stop checking coding spec after a line containing
225                  * anything except a comment. */
226                 tok->decoding_state = STATE_NORMAL;
227                 break;
228             }
229         }
230         return 1;
231     }
232     tok->decoding_state = STATE_NORMAL;
233     if (tok->encoding == NULL) {
234         assert(tok->decoding_readline == NULL);
235         if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236             error_ret(tok);
237             PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238             PyMem_Free(cs);
239             return 0;
240         }
241         tok->encoding = cs;
242     } else {                /* then, compare cs with BOM */
243         if (strcmp(tok->encoding, cs) != 0) {
244             error_ret(tok);
245             PyErr_Format(PyExc_SyntaxError,
246                          "encoding problem: %s with BOM", cs);
247             PyMem_Free(cs);
248             return 0;
249         }
250         PyMem_Free(cs);
251     }
252     return 1;
253 }
254 
255 /* See whether the file starts with a BOM. If it does,
256    invoke the set_readline function with the new encoding.
257    Return 1 on success, 0 on failure.  */
258 
259 static int
260 check_bom(int get_char(struct tok_state *),
261           void unget_char(int, struct tok_state *),
262           int set_readline(struct tok_state *, const char *),
263           struct tok_state *tok)
264 {
265     int ch1, ch2, ch3;
266     ch1 = get_char(tok);
267     tok->decoding_state = STATE_SEEK_CODING;
268     if (ch1 == EOF) {
269         return 1;
270     } else if (ch1 == 0xEF) {
271         ch2 = get_char(tok);
272         if (ch2 != 0xBB) {
273             unget_char(ch2, tok);
274             unget_char(ch1, tok);
275             return 1;
276         }
277         ch3 = get_char(tok);
278         if (ch3 != 0xBF) {
279             unget_char(ch3, tok);
280             unget_char(ch2, tok);
281             unget_char(ch1, tok);
282             return 1;
283         }
284 #if 0
285     /* Disable support for UTF-16 BOMs until a decision
286        is made whether this needs to be supported.  */
287     } else if (ch1 == 0xFE) {
288         ch2 = get_char(tok);
289         if (ch2 != 0xFF) {
290             unget_char(ch2, tok);
291             unget_char(ch1, tok);
292             return 1;
293         }
294         if (!set_readline(tok, "utf-16-be"))
295             return 0;
296         tok->decoding_state = STATE_NORMAL;
297     } else if (ch1 == 0xFF) {
298         ch2 = get_char(tok);
299         if (ch2 != 0xFE) {
300             unget_char(ch2, tok);
301             unget_char(ch1, tok);
302             return 1;
303         }
304         if (!set_readline(tok, "utf-16-le"))
305             return 0;
306         tok->decoding_state = STATE_NORMAL;
307 #endif
308     } else {
309         unget_char(ch1, tok);
310         return 1;
311     }
312     if (tok->encoding != NULL)
313         PyMem_Free(tok->encoding);
314     tok->encoding = new_string("utf-8", 5, tok);
315     if (!tok->encoding)
316         return 0;
317     /* No need to set_readline: input is already utf-8 */
318     return 1;
319 }
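/* Editorial illustration (not part of the original source): the UTF-8 BOM is the
   byte sequence 0xEF 0xBB 0xBF.  When present it is consumed and tok->encoding is
   set to "utf-8"; otherwise every byte read here is pushed back.  Sketch assuming
   the string-based callbacks declared later in this file (buf_getc, buf_ungetc,
   buf_setreadl) and a tok freshly set up for string input: */
#if 0
static void example_check_bom(struct tok_state *tok)
{
    tok->str = "\xEF\xBB\xBFx = 1\n";
    if (check_bom(buf_getc, buf_ungetc, buf_setreadl, tok)) {
        assert(strcmp(tok->encoding, "utf-8") == 0);
        assert(*tok->str == 'x');   /* the three BOM bytes were consumed */
    }
}
#endif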
320 
321 static int
322 tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
323     assert(tok->fp_interactive);
324 
325     if (!line) {
326         return 0;
327     }
328 
329     Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330     Py_ssize_t line_size = strlen(line);
331     char* new_str = tok->interactive_src_start;
332 
333     new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334     if (!new_str) {
335         if (tok->interactive_src_start) {
336             PyMem_Free(tok->interactive_src_start);
337         }
338         tok->interactive_src_start = NULL;
339         tok->interactive_src_end = NULL;
340         tok->done = E_NOMEM;
341         return -1;
342     }
343     strcpy(new_str + current_size, line);
344 
345     tok->interactive_src_start = new_str;
346     tok->interactive_src_end = new_str + current_size + line_size;
347     return 0;
348 }
349 
350 
351 /* Read a line of text from TOK into its buffer, using the stream in TOK.
352    Return 0 on failure, 1 on success.
353 
354    On entry, tok->decoding_buffer will be one of:
355      1) NULL: need to call tok->decoding_readline to get a new line
356      2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357        stored the result in tok->decoding_buffer
358      3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
359        (in the s buffer) to copy entire contents of the line read
360        by tok->decoding_readline.  tok->decoding_buffer has the overflow.
361        In this case, tok_readline_recode is called in a loop (with an expanded buffer)
362        until the buffer ends with a '\n' (or until the end of the file is
363        reached): see tok_nextc and its calls to tok_reserve_buf.
364 */
365 
366 static int
367 tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
368 {
369     Py_ssize_t cur = tok->cur - tok->buf;
370     Py_ssize_t oldsize = tok->inp - tok->buf;
371     Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372     if (newsize > tok->end - tok->buf) {
373         char *newbuf = tok->buf;
374         Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
375         Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376         Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
377         newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378         if (newbuf == NULL) {
379             tok->done = E_NOMEM;
380             return 0;
381         }
382         tok->buf = newbuf;
383         tok->cur = tok->buf + cur;
384         tok->inp = tok->buf + oldsize;
385         tok->end = tok->buf + newsize;
386         tok->start = start < 0 ? NULL : tok->buf + start;
387         tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388         tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
389     }
390     return 1;
391 }
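/* Editorial illustration (not part of the original source): the buffer grows by
   at least half of its current size, so repeated refills are amortized, and every
   pointer that was relative to the old buffer is rebased onto the reallocated
   block in the code above. */
#if 0
static void example_reserve_growth(void)
{
    Py_ssize_t oldsize = 1000;
    Py_ssize_t request = 64;
    Py_ssize_t newsize = oldsize + Py_MAX(request, oldsize >> 1);
    assert(newsize == 1500);   /* growth driven by oldsize/2, not by the small request */
}
#endif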
392 
393 static int
394 tok_readline_recode(struct tok_state *tok) {
395     PyObject *line;
396     const  char *buf;
397     Py_ssize_t buflen;
398     line = tok->decoding_buffer;
399     if (line == NULL) {
400         line = PyObject_CallNoArgs(tok->decoding_readline);
401         if (line == NULL) {
402             error_ret(tok);
403             goto error;
404         }
405     }
406     else {
407         tok->decoding_buffer = NULL;
408     }
409     buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410     if (buf == NULL) {
411         error_ret(tok);
412         goto error;
413     }
414     if (!tok_reserve_buf(tok, buflen + 1)) {
415         goto error;
416     }
417     memcpy(tok->inp, buf, buflen);
418     tok->inp += buflen;
419     *tok->inp = '\0';
420     if (tok->fp_interactive &&
421         tok_concatenate_interactive_new_line(tok, buf) == -1) {
422         goto error;
423     }
424     Py_DECREF(line);
425     return 1;
426 error:
427     Py_XDECREF(line);
428     return 0;
429 }
430 
431 /* Set the readline function for TOK to a StreamReader's
432    readline function. The StreamReader is named ENC.
433 
434    This function is called from check_bom and check_coding_spec.
435 
436    ENC is usually identical to the future value of tok->encoding,
437    except for the (currently unsupported) case of UTF-16.
438 
439    Return 1 on success, 0 on failure. */
440 
441 static int
442 fp_setreadl(struct tok_state *tok, const char* enc)
443 {
444     PyObject *readline, *io, *stream;
445     _Py_IDENTIFIER(open);
446     _Py_IDENTIFIER(readline);
447     int fd;
448     long pos;
449 
450     fd = fileno(tok->fp);
451     /* Due to buffering the file offset for fd can be different from the file
452      * position of tok->fp.  If tok->fp was opened in text mode on Windows,
453      * its file position counts CRLF as one char and can't be directly mapped
454      * to the file offset for fd.  Instead we step back one byte and read to
455      * the end of line.*/
456     pos = ftell(tok->fp);
457     if (pos == -1 ||
458         lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
459         PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
460         return 0;
461     }
462 
463     io = PyImport_ImportModuleNoBlock("io");
464     if (io == NULL)
465         return 0;
466 
467     stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
468                     fd, "r", -1, enc, Py_None, Py_None, Py_False);
469     Py_DECREF(io);
470     if (stream == NULL)
471         return 0;
472 
473     readline = _PyObject_GetAttrId(stream, &PyId_readline);
474     Py_DECREF(stream);
475     if (readline == NULL)
476         return 0;
477     Py_XSETREF(tok->decoding_readline, readline);
478 
479     if (pos > 0) {
480         PyObject *bufobj = _PyObject_CallNoArg(readline);
481         if (bufobj == NULL)
482             return 0;
483         Py_DECREF(bufobj);
484     }
485 
486     return 1;
487 }
488 
489 /* Fetch the next byte from TOK. */
490 
491 static int fp_getc(struct tok_state *tok) {
492     return getc(tok->fp);
493 }
494 
495 /* Unfetch the last byte back into TOK.  */
496 
497 static void fp_ungetc(int c, struct tok_state *tok) {
498     ungetc(c, tok->fp);
499 }
500 
501 /* Check whether the characters at s start a valid
502    UTF-8 sequence. Return the number of characters forming
503    the sequence if yes, 0 if not.  */
504 static int valid_utf8(const unsigned char* s)
505 {
506     int expected = 0;
507     int length;
508     if (*s < 0x80)
509         /* single-byte code */
510         return 1;
511     if (*s < 0xc0)
512         /* following byte */
513         return 0;
514     if (*s < 0xE0)
515         expected = 1;
516     else if (*s < 0xF0)
517         expected = 2;
518     else if (*s < 0xF8)
519         expected = 3;
520     else
521         return 0;
522     length = expected + 1;
523     for (; expected; expected--)
524         if (s[expected] < 0x80 || s[expected] >= 0xC0)
525             return 0;
526     return length;
527 }
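/* Editorial illustration (not part of the original source): valid_utf8() returns
   the length of the sequence starting at s, or 0 if the bytes cannot begin a
   well-formed sequence. */
#if 0
static void example_valid_utf8(void)
{
    assert(valid_utf8((const unsigned char *)"A") == 1);            /* ASCII */
    assert(valid_utf8((const unsigned char *)"\xC3\xA9") == 2);     /* U+00E9 */
    assert(valid_utf8((const unsigned char *)"\xE2\x82\xAC") == 3); /* U+20AC */
    assert(valid_utf8((const unsigned char *)"\xA9") == 0);         /* stray continuation byte */
}
#endif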
528 
529 static int
530 ensure_utf8(char *line, struct tok_state *tok)
531 {
532     int badchar = 0;
533     unsigned char *c;
534     int length;
535     for (c = (unsigned char *)line; *c; c += length) {
536         if (!(length = valid_utf8(c))) {
537             badchar = *c;
538             break;
539         }
540     }
541     if (badchar) {
542         /* Need to add 1 to the line number, since this line
543            has not been counted yet.  */
544         PyErr_Format(PyExc_SyntaxError,
545                      "Non-UTF-8 code starting with '\\x%.2x' "
546                      "in file %U on line %i, "
547                      "but no encoding declared; "
548                      "see https://python.org/dev/peps/pep-0263/ for details",
549                      badchar, tok->filename, tok->lineno + 1);
550         return 0;
551     }
552     return 1;
553 }
554 
555 /* Fetch a byte from TOK, using the string buffer. */
556 
557 static int
558 buf_getc(struct tok_state *tok) {
559     return Py_CHARMASK(*tok->str++);
560 }
561 
562 /* Unfetch a byte from TOK, using the string buffer. */
563 
564 static void
565 buf_ungetc(int c, struct tok_state *tok) {
566     tok->str--;
567     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
568 }
569 
570 /* Set the readline function for TOK to ENC. For the string-based
571    tokenizer, this means to just record the encoding. */
572 
573 static int
574 buf_setreadl(struct tok_state *tok, const char* enc) {
575     tok->enc = enc;
576     return 1;
577 }
578 
579 /* Return a UTF-8 encoded Python bytes object decoded from the
580    C byte string STR, which is encoded with ENC. */
581 
582 static PyObject *
583 translate_into_utf8(const char* str, const char* enc) {
584     PyObject *utf8;
585     PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586     if (buf == NULL)
587         return NULL;
588     utf8 = PyUnicode_AsUTF8String(buf);
589     Py_DECREF(buf);
590     return utf8;
591 }
592 
593 
594 static char *
595 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
596     int skip_next_lf = 0;
597     size_t needed_length = strlen(s) + 2, final_length;
598     char *buf, *current;
599     char c = '\0';
600     buf = PyMem_Malloc(needed_length);
601     if (buf == NULL) {
602         tok->done = E_NOMEM;
603         return NULL;
604     }
605     for (current = buf; *s; s++, current++) {
606         c = *s;
607         if (skip_next_lf) {
608             skip_next_lf = 0;
609             if (c == '\n') {
610                 c = *++s;
611                 if (!c)
612                     break;
613             }
614         }
615         if (c == '\r') {
616             skip_next_lf = 1;
617             c = '\n';
618         }
619         *current = c;
620     }
621     /* If this is exec input, add a newline to the end of the string if
622        there isn't one already. */
623     if (exec_input && c != '\n') {
624         *current = '\n';
625         current++;
626     }
627     *current = '\0';
628     final_length = current - buf + 1;
629     if (final_length < needed_length && final_length) {
630         /* should never fail */
631         char* result = PyMem_Realloc(buf, final_length);
632         if (result == NULL) {
633             PyMem_Free(buf);
634         }
635         buf = result;
636     }
637     return buf;
638 }
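/* Editorial illustration (not part of the original source): all newline
   conventions ("\r\n", "\r") are folded to '\n', and exec input is guaranteed to
   end with a newline.  Hedged sketch: */
#if 0
static void example_translate_newlines(struct tok_state *tok)
{
    char *out = translate_newlines("a\r\nb\rc", /* exec_input */ 1, tok);
    if (out != NULL) {
        assert(strcmp(out, "a\nb\nc\n") == 0);
        PyMem_Free(out);
    }
}
#endif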
639 
640 /* Decode a byte string STR for use as the buffer of TOK.
641    Look for encoding declarations inside STR, and record them
642    inside TOK.  */
643 
644 static char *
645 decode_str(const char *input, int single, struct tok_state *tok)
646 {
647     PyObject* utf8 = NULL;
648     char *str;
649     const char *s;
650     const char *newl[2] = {NULL, NULL};
651     int lineno = 0;
652     tok->input = str = translate_newlines(input, single, tok);
653     if (str == NULL)
654         return NULL;
655     tok->enc = NULL;
656     tok->str = str;
657     if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658         return error_ret(tok);
659     str = tok->str;             /* string after BOM if any */
660     assert(str);
661     if (tok->enc != NULL) {
662         utf8 = translate_into_utf8(str, tok->enc);
663         if (utf8 == NULL)
664             return error_ret(tok);
665         str = PyBytes_AsString(utf8);
666     }
667     for (s = str;; s++) {
668         if (*s == '\0') break;
669         else if (*s == '\n') {
670             assert(lineno < 2);
671             newl[lineno] = s;
672             lineno++;
673             if (lineno == 2) break;
674         }
675     }
676     tok->enc = NULL;
677     /* need to check line 1 and 2 separately since check_coding_spec
678        assumes a single line as input */
679     if (newl[0]) {
680         if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681             return NULL;
682         }
683         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
684             if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685                                    tok, buf_setreadl))
686                 return NULL;
687         }
688     }
689     if (tok->enc != NULL) {
690         assert(utf8 == NULL);
691         utf8 = translate_into_utf8(str, tok->enc);
692         if (utf8 == NULL)
693             return error_ret(tok);
694         str = PyBytes_AS_STRING(utf8);
695     }
696     assert(tok->decoding_buffer == NULL);
697     tok->decoding_buffer = utf8; /* CAUTION */
698     return str;
699 }
700 
701 /* Set up tokenizer for string */
702 
703 struct tok_state *
704 PyTokenizer_FromString(const char *str, int exec_input)
705 {
706     struct tok_state *tok = tok_new();
707     char *decoded;
708 
709     if (tok == NULL)
710         return NULL;
711     decoded = decode_str(str, exec_input, tok);
712     if (decoded == NULL) {
713         PyTokenizer_Free(tok);
714         return NULL;
715     }
716 
717     tok->buf = tok->cur = tok->inp = decoded;
718     tok->end = decoded;
719     return tok;
720 }
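/* Editorial illustration (not part of the original source): a minimal usage
   sketch of the string-based entry point, assuming the static tok_get() defined
   later in this file is visible; the exported helpers wrap it for the parser. */
#if 0
static void example_tokenize_string(void)
{
    const char *p_start = NULL, *p_end = NULL;
    struct tok_state *tok = PyTokenizer_FromString("x = 1\n", /* exec_input */ 1);
    if (tok == NULL) {
        return;
    }
    int type;
    do {
        type = tok_get(tok, &p_start, &p_end);   /* NAME, '=', NUMBER, NEWLINE, ... */
    } while (type != ENDMARKER && type != ERRORTOKEN);
    PyTokenizer_Free(tok);
}
#endif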
721 
722 /* Set up tokenizer for UTF-8 string */
723 
724 struct tok_state *
725 PyTokenizer_FromUTF8(const char *str, int exec_input)
726 {
727     struct tok_state *tok = tok_new();
728     char *translated;
729     if (tok == NULL)
730         return NULL;
731     tok->input = translated = translate_newlines(str, exec_input, tok);
732     if (translated == NULL) {
733         PyTokenizer_Free(tok);
734         return NULL;
735     }
736     tok->decoding_state = STATE_NORMAL;
737     tok->enc = NULL;
738     tok->str = translated;
739     tok->encoding = new_string("utf-8", 5, tok);
740     if (!tok->encoding) {
741         PyTokenizer_Free(tok);
742         return NULL;
743     }
744 
745     tok->buf = tok->cur = tok->inp = translated;
746     tok->end = translated;
747     return tok;
748 }
749 
750 /* Set up tokenizer for file */
751 
752 struct tok_state *
753 PyTokenizer_FromFile(FILE *fp, const char* enc,
754                      const char *ps1, const char *ps2)
755 {
756     struct tok_state *tok = tok_new();
757     if (tok == NULL)
758         return NULL;
759     if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
760         PyTokenizer_Free(tok);
761         return NULL;
762     }
763     tok->cur = tok->inp = tok->buf;
764     tok->end = tok->buf + BUFSIZ;
765     tok->fp = fp;
766     tok->prompt = ps1;
767     tok->nextprompt = ps2;
768     if (enc != NULL) {
769         /* Must copy encoding declaration since it
770            gets copied into the parse tree. */
771         tok->encoding = new_string(enc, strlen(enc), tok);
772         if (!tok->encoding) {
773             PyTokenizer_Free(tok);
774             return NULL;
775         }
776         tok->decoding_state = STATE_NORMAL;
777     }
778     return tok;
779 }
780 
781 /* Free a tok_state structure */
782 
783 void
784 PyTokenizer_Free(struct tok_state *tok)
785 {
786     if (tok->encoding != NULL) {
787         PyMem_Free(tok->encoding);
788     }
789     Py_XDECREF(tok->decoding_readline);
790     Py_XDECREF(tok->decoding_buffer);
791     Py_XDECREF(tok->filename);
792     if (tok->fp != NULL && tok->buf != NULL) {
793         PyMem_Free(tok->buf);
794     }
795     if (tok->input) {
796         PyMem_Free(tok->input);
797     }
798     if (tok->interactive_src_start != NULL) {
799         PyMem_Free(tok->interactive_src_start);
800     }
801     PyMem_Free(tok);
802 }
803 
804 static int
805 tok_readline_raw(struct tok_state *tok)
806 {
807     do {
808         if (!tok_reserve_buf(tok, BUFSIZ)) {
809             return 0;
810         }
811         char *line = Py_UniversalNewlineFgets(tok->inp,
812                                               (int)(tok->end - tok->inp),
813                                               tok->fp, NULL);
814         if (line == NULL) {
815             return 1;
816         }
817         if (tok->fp_interactive &&
818             tok_concatenate_interactive_new_line(tok, line) == -1) {
819             return 0;
820         }
821         tok->inp = strchr(tok->inp, '\0');
822         if (tok->inp == tok->buf) {
823             return 0;
824         }
825     } while (tok->inp[-1] != '\n');
826     return 1;
827 }
828 
829 static int
830 tok_underflow_string(struct tok_state *tok) {
831     char *end = strchr(tok->inp, '\n');
832     if (end != NULL) {
833         end++;
834     }
835     else {
836         end = strchr(tok->inp, '\0');
837         if (end == tok->inp) {
838             tok->done = E_EOF;
839             return 0;
840         }
841     }
842     if (tok->start == NULL) {
843         tok->buf = tok->cur;
844     }
845     tok->line_start = tok->cur;
846     tok->lineno++;
847     tok->inp = end;
848     return 1;
849 }
850 
851 static int
852 tok_underflow_interactive(struct tok_state *tok) {
853     if (tok->interactive_underflow == IUNDERFLOW_STOP) {
854         tok->done = E_INTERACT_STOP;
855         return 1;
856     }
857     char *newtok = PyOS_Readline(tok->fp ? tok->fp : stdin, stdout, tok->prompt);
858     if (newtok != NULL) {
859         char *translated = translate_newlines(newtok, 0, tok);
860         PyMem_Free(newtok);
861         if (translated == NULL) {
862             return 0;
863         }
864         newtok = translated;
865     }
866     if (tok->encoding && newtok && *newtok) {
867         /* Recode to UTF-8 */
868         Py_ssize_t buflen;
869         const char* buf;
870         PyObject *u = translate_into_utf8(newtok, tok->encoding);
871         PyMem_Free(newtok);
872         if (u == NULL) {
873             tok->done = E_DECODE;
874             return 0;
875         }
876         buflen = PyBytes_GET_SIZE(u);
877         buf = PyBytes_AS_STRING(u);
878         newtok = PyMem_Malloc(buflen+1);
879         if (newtok == NULL) {
880             Py_DECREF(u);
881             tok->done = E_NOMEM;
882             return 0;
883         }
884         strcpy(newtok, buf);
885         Py_DECREF(u);
886     }
887     if (tok->fp_interactive &&
888         tok_concatenate_interactive_new_line(tok, newtok) == -1) {
889         PyMem_Free(newtok);
890         return 0;
891     }
892     if (tok->nextprompt != NULL) {
893         tok->prompt = tok->nextprompt;
894     }
895     if (newtok == NULL) {
896         tok->done = E_INTR;
897     }
898     else if (*newtok == '\0') {
899         PyMem_Free(newtok);
900         tok->done = E_EOF;
901     }
902     else if (tok->start != NULL) {
903         Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
904         size_t size = strlen(newtok);
905         tok->lineno++;
906         if (!tok_reserve_buf(tok, size + 1)) {
907             PyMem_Free(tok->buf);
908             tok->buf = NULL;
909             PyMem_Free(newtok);
910             return 0;
911         }
912         memcpy(tok->cur, newtok, size + 1);
913         PyMem_Free(newtok);
914         tok->inp += size;
915         tok->multi_line_start = tok->buf + cur_multi_line_start;
916     }
917     else {
918         tok->lineno++;
919         PyMem_Free(tok->buf);
920         tok->buf = newtok;
921         tok->cur = tok->buf;
922         tok->line_start = tok->buf;
923         tok->inp = strchr(tok->buf, '\0');
924         tok->end = tok->inp + 1;
925     }
926     if (tok->done != E_OK) {
927         if (tok->prompt != NULL) {
928             PySys_WriteStderr("\n");
929         }
930         return 0;
931     }
932     return 1;
933 }
934 
935 static int
936 tok_underflow_file(struct tok_state *tok) {
937     if (tok->start == NULL) {
938         tok->cur = tok->inp = tok->buf;
939     }
940     if (tok->decoding_state == STATE_INIT) {
941         /* We have not yet determined the encoding.
942            If an encoding is found, use the file-pointer
943            reader functions from now on. */
944         if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945             error_ret(tok);
946             return 0;
947         }
948         assert(tok->decoding_state != STATE_INIT);
949     }
950     /* Read until '\n' or EOF */
951     if (tok->decoding_readline != NULL) {
952         /* We already have a codec associated with this input. */
953         if (!tok_readline_recode(tok)) {
954             return 0;
955         }
956     }
957     else {
958         /* We want a 'raw' read. */
959         if (!tok_readline_raw(tok)) {
960             return 0;
961         }
962     }
963     if (tok->inp == tok->cur) {
964         tok->done = E_EOF;
965         return 0;
966     }
967     if (tok->inp[-1] != '\n') {
968         /* Last line does not end in \n, fake one */
969         *tok->inp++ = '\n';
970         *tok->inp = '\0';
971     }
972 
973     tok->lineno++;
974     if (tok->decoding_state != STATE_NORMAL) {
975         if (tok->lineno > 2) {
976             tok->decoding_state = STATE_NORMAL;
977         }
978         else if (!check_coding_spec(tok->cur, strlen(tok->cur),
979                                     tok, fp_setreadl))
980         {
981             return 0;
982         }
983     }
984     /* The default encoding is UTF-8, so make sure we don't have any
985        non-UTF-8 sequences in it. */
986     if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
987         error_ret(tok);
988         return 0;
989     }
990     assert(tok->done == E_OK);
991     return tok->done == E_OK;
992 }
993 
994 #if defined(Py_DEBUG)
995 static void
996 print_escape(FILE *f, const char *s, Py_ssize_t size)
997 {
998     if (s == NULL) {
999         fputs("NULL", f);
1000         return;
1001     }
1002     putc('"', f);
1003     while (size-- > 0) {
1004         unsigned char c = *s++;
1005         switch (c) {
1006             case '\n': fputs("\\n", f); break;
1007             case '\r': fputs("\\r", f); break;
1008             case '\t': fputs("\\t", f); break;
1009             case '\f': fputs("\\f", f); break;
1010             case '\'': fputs("\\'", f); break;
1011             case '"': fputs("\\\"", f); break;
1012             default:
1013                 if (0x20 <= c && c <= 0x7f)
1014                     putc(c, f);
1015                 else
1016                     fprintf(f, "\\x%02x", c);
1017         }
1018     }
1019     putc('"', f);
1020 }
1021 #endif
1022 
1023 /* Get next char, updating state; error code goes into tok->done */
1024 
1025 static int
1026 tok_nextc(struct tok_state *tok)
1027 {
1028     int rc;
1029     for (;;) {
1030         if (tok->cur != tok->inp) {
1031             return Py_CHARMASK(*tok->cur++); /* Fast path */
1032         }
1033         if (tok->done != E_OK) {
1034            return EOF;
1035         }
1036         if (tok->fp == NULL) {
1037             rc = tok_underflow_string(tok);
1038         }
1039         else if (tok->prompt != NULL) {
1040             rc = tok_underflow_interactive(tok);
1041         }
1042         else {
1043             rc = tok_underflow_file(tok);
1044         }
1045 #if defined(Py_DEBUG)
1046         if (Py_DebugFlag) {
1047             fprintf(stderr, "line[%d] = ", tok->lineno);
1048             print_escape(stderr, tok->cur, tok->inp - tok->cur);
1049             fprintf(stderr, "  tok->done = %d\n", tok->done);
1050         }
1051 #endif
1052         if (!rc) {
1053             tok->cur = tok->inp;
1054             return EOF;
1055         }
1056         tok->line_start = tok->cur;
1057     }
1058     Py_UNREACHABLE();
1059 }
1060 
1061 /* Back-up one character */
1062 
1063 static void
1064 tok_backup(struct tok_state *tok, int c)
1065 {
1066     if (c != EOF) {
1067         if (--tok->cur < tok->buf) {
1068             Py_FatalError("tokenizer beginning of buffer");
1069         }
1070         if ((int)(unsigned char)*tok->cur != c) {
1071             Py_FatalError("tok_backup: wrong character");
1072         }
1073     }
1074 }
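/* Editorial illustration (not part of the original source): tok_nextc() and
   tok_backup() form a one-character pushback pair, which is how the tokenizer
   peeks ahead without a separate lookahead buffer. */
#if 0
static void example_peek(struct tok_state *tok)
{
    int c = tok_nextc(tok);   /* consume one character */
    tok_backup(tok, c);       /* ...and push the same character back */
}
#endif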
1075 
1076 static int
1077 _syntaxerror_range(struct tok_state *tok, const char *format,
1078                    int col_offset, int end_col_offset,
1079                    va_list vargs)
1080 {
1081     PyObject *errmsg, *errtext, *args;
1082     errmsg = PyUnicode_FromFormatV(format, vargs);
1083     if (!errmsg) {
1084         goto error;
1085     }
1086 
1087     errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1088                                    "replace");
1089     if (!errtext) {
1090         goto error;
1091     }
1092 
1093     if (col_offset == -1) {
1094         col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1095     }
1096     if (end_col_offset == -1) {
1097         end_col_offset = col_offset;
1098     }
1099 
1100     Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1101     if (line_len != tok->cur - tok->line_start) {
1102         Py_DECREF(errtext);
1103         errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1104                                        "replace");
1105     }
1106     if (!errtext) {
1107         goto error;
1108     }
1109 
1110     args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1111                          col_offset, errtext, tok->lineno, end_col_offset);
1112     if (args) {
1113         PyErr_SetObject(PyExc_SyntaxError, args);
1114         Py_DECREF(args);
1115     }
1116 
1117 error:
1118     Py_XDECREF(errmsg);
1119     tok->done = E_ERROR;
1120     return ERRORTOKEN;
1121 }
1122 
1123 static int
1124 syntaxerror(struct tok_state *tok, const char *format, ...)
1125 {
1126     va_list vargs;
1127 #ifdef HAVE_STDARG_PROTOTYPES
1128     va_start(vargs, format);
1129 #else
1130     va_start(vargs);
1131 #endif
1132     int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1133     va_end(vargs);
1134     return ret;
1135 }
1136 
1137 static int
1138 syntaxerror_known_range(struct tok_state *tok,
1139                         int col_offset, int end_col_offset,
1140                         const char *format, ...)
1141 {
1142     va_list vargs;
1143 #ifdef HAVE_STDARG_PROTOTYPES
1144     va_start(vargs, format);
1145 #else
1146     va_start(vargs);
1147 #endif
1148     int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1149     va_end(vargs);
1150     return ret;
1151 }
1152 
1153 
1154 
1155 static int
1156 indenterror(struct tok_state *tok)
1157 {
1158     tok->done = E_TABSPACE;
1159     tok->cur = tok->inp;
1160     return ERRORTOKEN;
1161 }
1162 
1163 static int
1164 parser_warn(struct tok_state *tok, const char *format, ...)
1165 {
1166     PyObject *errmsg;
1167     va_list vargs;
1168 #ifdef HAVE_STDARG_PROTOTYPES
1169     va_start(vargs, format);
1170 #else
1171     va_start(vargs);
1172 #endif
1173     errmsg = PyUnicode_FromFormatV(format, vargs);
1174     va_end(vargs);
1175     if (!errmsg) {
1176         goto error;
1177     }
1178 
1179     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1180                                  tok->lineno, NULL, NULL) < 0) {
1181         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1182             /* Replace the DeprecationWarning exception with a SyntaxError
1183                to get a more accurate error report */
1184             PyErr_Clear();
1185             syntaxerror(tok, "%U", errmsg);
1186         }
1187         goto error;
1188     }
1189     Py_DECREF(errmsg);
1190     return 0;
1191 
1192 error:
1193     Py_XDECREF(errmsg);
1194     tok->done = E_ERROR;
1195     return -1;
1196 }
1197 
1198 static int
1199 lookahead(struct tok_state *tok, const char *test)
1200 {
1201     const char *s = test;
1202     int res = 0;
1203     while (1) {
1204         int c = tok_nextc(tok);
1205         if (*s == 0) {
1206             res = !is_potential_identifier_char(c);
1207         }
1208         else if (c == *s) {
1209             s++;
1210             continue;
1211         }
1212 
1213         tok_backup(tok, c);
1214         while (s != test) {
1215             tok_backup(tok, *--s);
1216         }
1217         return res;
1218     }
1219 }
1220 
1221 static int
1222 verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1223 {
1224     /* Emit a deprecation warning only if the numeric literal is immediately
1225      * followed by one of the keywords which can occur after a numeric literal
1226      * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1227      * This allows gradually deprecating existing valid code without emitting
1228      * a warning before the error in most cases of an invalid numeric literal
1229      * (which would be confusing and break existing tests).
1230      * Raise a syntax error with a slightly better message than plain
1231      * "invalid syntax" if the numeric literal is immediately followed by
1232      * another keyword or identifier.
1233      */
1234     int r = 0;
1235     if (c == 'a') {
1236         r = lookahead(tok, "nd");
1237     }
1238     else if (c == 'e') {
1239         r = lookahead(tok, "lse");
1240     }
1241     else if (c == 'f') {
1242         r = lookahead(tok, "or");
1243     }
1244     else if (c == 'i') {
1245         int c2 = tok_nextc(tok);
1246         if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1247             r = 1;
1248         }
1249         tok_backup(tok, c2);
1250     }
1251     else if (c == 'o') {
1252         r = lookahead(tok, "r");
1253     }
1254     else if (c == 'n') {
1255         r = lookahead(tok, "ot");
1256     }
1257     if (r) {
1258         tok_backup(tok, c);
1259         if (parser_warn(tok, "invalid %s literal", kind)) {
1260             return 0;
1261         }
1262         tok_nextc(tok);
1263     }
1264     else /* In future releases, only error will remain. */
1265     if (is_potential_identifier_char(c)) {
1266         tok_backup(tok, c);
1267         syntaxerror(tok, "invalid %s literal", kind);
1268         return 0;
1269     }
1270     return 1;
1271 }
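/* Editorial illustration (not part of the original source): with this check,
   "1and x" currently only emits a DeprecationWarning because "and" is in the
   keyword list above, while "1abc" raises SyntaxError: invalid decimal literal.
   Per the comment above, a future release is meant to turn both into errors. */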
1272 
1273 /* Verify that the identifier follows PEP 3131.
1274    All identifier strings are guaranteed to be "ready" unicode objects.
1275  */
1276 static int
1277 verify_identifier(struct tok_state *tok)
1278 {
1279     PyObject *s;
1280     if (tok->decoding_erred)
1281         return 0;
1282     s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1283     if (s == NULL) {
1284         if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1285             tok->done = E_DECODE;
1286         }
1287         else {
1288             tok->done = E_ERROR;
1289         }
1290         return 0;
1291     }
1292     Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1293     if (invalid < 0) {
1294         Py_DECREF(s);
1295         tok->done = E_ERROR;
1296         return 0;
1297     }
1298     assert(PyUnicode_GET_LENGTH(s) > 0);
1299     if (invalid < PyUnicode_GET_LENGTH(s)) {
1300         Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1301         if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1302             /* Determine the offset in UTF-8 encoded input */
1303             Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1304             if (s != NULL) {
1305                 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1306             }
1307             if (s == NULL) {
1308                 tok->done = E_ERROR;
1309                 return 0;
1310             }
1311             tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1312         }
1313         Py_DECREF(s);
1314         // PyUnicode_FromFormatV() does not support %X
1315         char hex[9];
1316         (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1317         if (Py_UNICODE_ISPRINTABLE(ch)) {
1318             syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1319         }
1320         else {
1321             syntaxerror(tok, "invalid non-printable character U+%s", hex);
1322         }
1323         return 0;
1324     }
1325     Py_DECREF(s);
1326     return 1;
1327 }
1328 
1329 static int
1330 tok_decimal_tail(struct tok_state *tok)
1331 {
1332     int c;
1333 
1334     while (1) {
1335         do {
1336             c = tok_nextc(tok);
1337         } while (isdigit(c));
1338         if (c != '_') {
1339             break;
1340         }
1341         c = tok_nextc(tok);
1342         if (!isdigit(c)) {
1343             tok_backup(tok, c);
1344             syntaxerror(tok, "invalid decimal literal");
1345             return 0;
1346         }
1347     }
1348     return c;
1349 }
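/* Editorial illustration (not part of the original source): this helper accepts
   underscores only between digits, so "1_000" is read as a single digit run,
   while "1__0" or a trailing "1_" stops here with "invalid decimal literal". */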
1350 
1351 /* Get next token, after space stripping etc. */
1352 
1353 static inline int
1354 tok_continuation_line(struct tok_state *tok) {
1355     int c = tok_nextc(tok);
1356     if (c != '\n') {
1357         tok->done = E_LINECONT;
1358         return -1;
1359     }
1360     c = tok_nextc(tok);
1361     if (c == EOF) {
1362         tok->done = E_EOF;
1363         tok->cur = tok->inp;
1364         return -1;
1365     } else {
1366         tok_backup(tok, c);
1367     }
1368     return c;
1369 }
1370 
1371 static int
1372 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1373 {
1374     int c;
1375     int blankline, nonascii;
1376 
1377     *p_start = *p_end = NULL;
1378   nextline:
1379     tok->start = NULL;
1380     blankline = 0;
1381 
1382     /* Get indentation level */
1383     if (tok->atbol) {
1384         int col = 0;
1385         int altcol = 0;
1386         tok->atbol = 0;
1387         int cont_line_col = 0;
1388         for (;;) {
1389             c = tok_nextc(tok);
1390             if (c == ' ') {
1391                 col++, altcol++;
1392             }
1393             else if (c == '\t') {
1394                 col = (col / tok->tabsize + 1) * tok->tabsize;
1395                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1396             }
1397             else if (c == '\014')  {/* Control-L (formfeed) */
1398                 col = altcol = 0; /* For Emacs users */
1399             }
1400             else if (c == '\\') {
1401                 // Indentation cannot be split over multiple physical lines
1402                 // using backslashes. This means that if we found a backslash
1403                 // preceded by whitespace, **the first one we find** determines
1404                 // the level of indentation of whatever comes next.
1405                 cont_line_col = cont_line_col ? cont_line_col : col;
1406                 if ((c = tok_continuation_line(tok)) == -1) {
1407                     return ERRORTOKEN;
1408                 }
1409             }
1410             else {
1411                 break;
1412             }
1413         }
1414         tok_backup(tok, c);
1415         if (c == '#' || c == '\n') {
1416             /* Lines with only whitespace and/or comments
1417                shouldn't affect the indentation and are
1418                not passed to the parser as NEWLINE tokens,
1419                except *totally* empty lines in interactive
1420                mode, which signal the end of a command group. */
1421             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1422                 blankline = 0; /* Let it through */
1423             }
1424             else if (tok->prompt != NULL && tok->lineno == 1) {
1425                 /* In interactive mode, if the first line contains
1426                    only spaces and/or a comment, let it through. */
1427                 blankline = 0;
1428                 col = altcol = 0;
1429             }
1430             else {
1431                 blankline = 1; /* Ignore completely */
1432             }
1433             /* We can't jump back right here since we still
1434                may need to skip to the end of a comment */
1435         }
1436         if (!blankline && tok->level == 0) {
1437             col = cont_line_col ? cont_line_col : col;
1438             altcol = cont_line_col ? cont_line_col : altcol;
1439             if (col == tok->indstack[tok->indent]) {
1440                 /* No change */
1441                 if (altcol != tok->altindstack[tok->indent]) {
1442                     return indenterror(tok);
1443                 }
1444             }
1445             else if (col > tok->indstack[tok->indent]) {
1446                 /* Indent -- always one */
1447                 if (tok->indent+1 >= MAXINDENT) {
1448                     tok->done = E_TOODEEP;
1449                     tok->cur = tok->inp;
1450                     return ERRORTOKEN;
1451                 }
1452                 if (altcol <= tok->altindstack[tok->indent]) {
1453                     return indenterror(tok);
1454                 }
1455                 tok->pendin++;
1456                 tok->indstack[++tok->indent] = col;
1457                 tok->altindstack[tok->indent] = altcol;
1458             }
1459             else /* col < tok->indstack[tok->indent] */ {
1460                 /* Dedent -- any number, must be consistent */
1461                 while (tok->indent > 0 &&
1462                     col < tok->indstack[tok->indent]) {
1463                     tok->pendin--;
1464                     tok->indent--;
1465                 }
1466                 if (col != tok->indstack[tok->indent]) {
1467                     tok->done = E_DEDENT;
1468                     tok->cur = tok->inp;
1469                     return ERRORTOKEN;
1470                 }
1471                 if (altcol != tok->altindstack[tok->indent]) {
1472                     return indenterror(tok);
1473                 }
1474             }
1475         }
1476     }
1477 
1478     tok->start = tok->cur;
1479 
1480     /* Return pending indents/dedents */
1481     if (tok->pendin != 0) {
1482         if (tok->pendin < 0) {
1483             tok->pendin++;
1484             return DEDENT;
1485         }
1486         else {
1487             tok->pendin--;
1488             return INDENT;
1489         }
1490     }
1491 
1492     /* Peek ahead at the next character */
1493     c = tok_nextc(tok);
1494     tok_backup(tok, c);
1495     /* Check if we are closing an async function */
1496     if (tok->async_def
1497         && !blankline
1498         /* Due to some implementation artifacts of type comments,
1499          * a TYPE_COMMENT at the start of a function won't set an
1500          * indentation level and it will produce a NEWLINE after it.
1501          * To avoid spuriously ending an async function due to this,
1502          * wait until we have some non-newline char in front of us. */
1503         && c != '\n'
1504         && tok->level == 0
1505         /* There was a NEWLINE after ASYNC DEF,
1506            so we're past the signature. */
1507         && tok->async_def_nl
1508         /* Current indentation level is less than where
1509            the async function was defined */
1510         && tok->async_def_indent >= tok->indent)
1511     {
1512         tok->async_def = 0;
1513         tok->async_def_indent = 0;
1514         tok->async_def_nl = 0;
1515     }
1516 
1517  again:
1518     tok->start = NULL;
1519     /* Skip spaces */
1520     do {
1521         c = tok_nextc(tok);
1522     } while (c == ' ' || c == '\t' || c == '\014');
1523 
1524     /* Set start of current token */
1525     tok->start = tok->cur - 1;
1526 
1527     /* Skip comment, unless it's a type comment */
1528     if (c == '#') {
1529         const char *prefix, *p, *type_start;
1530 
1531         while (c != EOF && c != '\n') {
1532             c = tok_nextc(tok);
1533         }
1534 
1535         if (tok->type_comments) {
1536             p = tok->start;
1537             prefix = type_comment_prefix;
1538             while (*prefix && p < tok->cur) {
1539                 if (*prefix == ' ') {
1540                     while (*p == ' ' || *p == '\t') {
1541                         p++;
1542                     }
1543                 } else if (*prefix == *p) {
1544                     p++;
1545                 } else {
1546                     break;
1547                 }
1548 
1549                 prefix++;
1550             }
1551 
1552             /* This is a type comment if we matched all of type_comment_prefix. */
1553             if (!*prefix) {
1554                 int is_type_ignore = 1;
1555                 const char *ignore_end = p + 6;
1556                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1557 
1558                 type_start = p;
1559 
1560                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1561                  * or anything ASCII and non-alphanumeric. */
1562                 is_type_ignore = (
1563                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1564                     && !(tok->cur > ignore_end
1565                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1566 
1567                 if (is_type_ignore) {
1568                     *p_start = ignore_end;
1569                     *p_end = tok->cur;
1570 
1571                     /* If this type ignore is the only thing on the line, consume the newline also. */
1572                     if (blankline) {
1573                         tok_nextc(tok);
1574                         tok->atbol = 1;
1575                     }
1576                     return TYPE_IGNORE;
1577                 } else {
1578                     *p_start = type_start;  /* after type_comment_prefix */
1579                     *p_end = tok->cur;
1580                     return TYPE_COMMENT;
1581                 }
1582             }
1583         }
1584     }
1585 
1586     if (tok->done == E_INTERACT_STOP) {
1587         return ENDMARKER;
1588     }
1589 
1590     /* Check for EOF and errors now */
1591     if (c == EOF) {
1592         if (tok->level) {
1593             return ERRORTOKEN;
1594         }
1595         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1596     }
1597 
1598     /* Identifier (most frequent token!) */
1599     nonascii = 0;
1600     if (is_potential_identifier_start(c)) {
1601         /* Process the various legal combinations of b"", r"", u"", and f"". */
1602         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1603         while (1) {
1604             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1605                 saw_b = 1;
1606             /* Since this is a backwards compatibility support literal we don't
1607                want to support it in arbitrary order like byte literals. */
1608             else if (!(saw_b || saw_u || saw_r || saw_f)
1609                      && (c == 'u'|| c == 'U')) {
1610                 saw_u = 1;
1611             }
1612             /* ur"" and ru"" are not supported */
1613             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1614                 saw_r = 1;
1615             }
1616             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1617                 saw_f = 1;
1618             }
1619             else {
1620                 break;
1621             }
1622             c = tok_nextc(tok);
1623             if (c == '"' || c == '\'') {
1624                 goto letter_quote;
1625             }
1626         }
1627         while (is_potential_identifier_char(c)) {
1628             if (c >= 128) {
1629                 nonascii = 1;
1630             }
1631             c = tok_nextc(tok);
1632         }
1633         tok_backup(tok, c);
1634         if (nonascii && !verify_identifier(tok)) {
1635             return ERRORTOKEN;
1636         }
1637 
1638         *p_start = tok->start;
1639         *p_end = tok->cur;
1640 
1641         /* async/await parsing block. */
1642         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1643             /* May be an 'async' or 'await' token.  For Python 3.7 or
1644                later we recognize them unconditionally.  For Python
1645                3.5 or 3.6 we recognize 'async' in front of 'def', and
1646                either one inside of 'async def'.  (Technically we
1647                shouldn't recognize these at all for 3.4 or earlier,
1648                but there's no *valid* Python 3.4 code that would be
1649                rejected, and async functions will be rejected in a
1650                later phase.) */
1651             if (!tok->async_hacks || tok->async_def) {
1652                 /* Always recognize the keywords. */
1653                 if (memcmp(tok->start, "async", 5) == 0) {
1654                     return ASYNC;
1655                 }
1656                 if (memcmp(tok->start, "await", 5) == 0) {
1657                     return AWAIT;
1658                 }
1659             }
1660             else if (memcmp(tok->start, "async", 5) == 0) {
1661                 /* The current token is 'async'.
1662                    Look ahead one token to see if that is 'def'. */
1663 
1664                 struct tok_state ahead_tok;
1665                 const char *ahead_tok_start = NULL;
1666                 const char *ahead_tok_end = NULL;
1667                 int ahead_tok_kind;
1668 
1669                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
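                     /* ahead_tok is a by-value copy of the tokenizer state, so
                        moving its cursor while peeking at the next token does
                        not advance tok itself. */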
1670                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1671                                          &ahead_tok_end);
1672 
1673                 if (ahead_tok_kind == NAME
1674                     && ahead_tok.cur - ahead_tok.start == 3
1675                     && memcmp(ahead_tok.start, "def", 3) == 0)
1676                 {
1677                     /* The next token is going to be 'def', so instead of
1678                        returning a plain NAME token, return ASYNC. */
1679                     tok->async_def_indent = tok->indent;
1680                     tok->async_def = 1;
1681                     return ASYNC;
1682                 }
1683             }
1684         }
1685 
1686         return NAME;
1687     }
1688 
1689     /* Newline */
1690     if (c == '\n') {
1691         tok->atbol = 1;
1692         if (blankline || tok->level > 0) {
1693             goto nextline;
1694         }
1695         *p_start = tok->start;
1696         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1697         tok->cont_line = 0;
1698         if (tok->async_def) {
1699             /* We're somewhere inside an 'async def' function, and
1700                we've encountered a NEWLINE after its signature. */
1701             tok->async_def_nl = 1;
1702         }
1703         return NEWLINE;
1704     }
1705 
1706     /* Period or number starting with period? */
1707     if (c == '.') {
1708         c = tok_nextc(tok);
1709         if (isdigit(c)) {
1710             goto fraction;
1711         } else if (c == '.') {
1712             c = tok_nextc(tok);
1713             if (c == '.') {
1714                 *p_start = tok->start;
1715                 *p_end = tok->cur;
1716                 return ELLIPSIS;
1717             }
1718             else {
1719                 tok_backup(tok, c);
1720             }
1721             tok_backup(tok, '.');
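                 /* Not an ellipsis: back up so only the first '.' is emitted as
                    DOT; the second '.' is re-read on the next call. */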
1722         }
1723         else {
1724             tok_backup(tok, c);
1725         }
1726         *p_start = tok->start;
1727         *p_end = tok->cur;
1728         return DOT;
1729     }
1730 
1731     /* Number */
1732     if (isdigit(c)) {
1733         if (c == '0') {
1734             /* Hex, octal or binary -- maybe. */
1735             c = tok_nextc(tok);
1736             if (c == 'x' || c == 'X') {
1737                 /* Hex */
1738                 c = tok_nextc(tok);
1739                 do {
1740                     if (c == '_') {
1741                         c = tok_nextc(tok);
1742                     }
1743                     if (!isxdigit(c)) {
1744                         tok_backup(tok, c);
1745                         return syntaxerror(tok, "invalid hexadecimal literal");
1746                     }
1747                     do {
1748                         c = tok_nextc(tok);
1749                     } while (isxdigit(c));
1750                 } while (c == '_');
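                     /* Each '_' must be followed by at least one hex digit, so
                        doubled ("0x1__2") and trailing ("0x1_") underscores are
                        rejected above, while "0x_1" is accepted. */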
1751                 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1752                     return ERRORTOKEN;
1753                 }
1754             }
1755             else if (c == 'o' || c == 'O') {
1756                 /* Octal */
1757                 c = tok_nextc(tok);
1758                 do {
1759                     if (c == '_') {
1760                         c = tok_nextc(tok);
1761                     }
1762                     if (c < '0' || c >= '8') {
1763                         if (isdigit(c)) {
1764                             return syntaxerror(tok,
1765                                     "invalid digit '%c' in octal literal", c);
1766                         }
1767                         else {
1768                             tok_backup(tok, c);
1769                             return syntaxerror(tok, "invalid octal literal");
1770                         }
1771                     }
1772                     do {
1773                         c = tok_nextc(tok);
1774                     } while ('0' <= c && c < '8');
1775                 } while (c == '_');
1776                 if (isdigit(c)) {
1777                     return syntaxerror(tok,
1778                             "invalid digit '%c' in octal literal", c);
1779                 }
1780                 if (!verify_end_of_number(tok, c, "octal")) {
1781                     return ERRORTOKEN;
1782                 }
1783             }
1784             else if (c == 'b' || c == 'B') {
1785                 /* Binary */
1786                 c = tok_nextc(tok);
1787                 do {
1788                     if (c == '_') {
1789                         c = tok_nextc(tok);
1790                     }
1791                     if (c != '0' && c != '1') {
1792                         if (isdigit(c)) {
1793                             return syntaxerror(tok,
1794                                     "invalid digit '%c' in binary literal", c);
1795                         }
1796                         else {
1797                             tok_backup(tok, c);
1798                             return syntaxerror(tok, "invalid binary literal");
1799                         }
1800                     }
1801                     do {
1802                         c = tok_nextc(tok);
1803                     } while (c == '0' || c == '1');
1804                 } while (c == '_');
1805                 if (isdigit(c)) {
1806                     return syntaxerror(tok,
1807                             "invalid digit '%c' in binary literal", c);
1808                 }
1809                 if (!verify_end_of_number(tok, c, "binary")) {
1810                     return ERRORTOKEN;
1811                 }
1812             }
1813             else {
1814                 int nonzero = 0;
1815                 /* maybe old-style octal; c is first char of it */
1816                 /* in any case, allow '0' as a literal */
1817                 while (1) {
1818                     if (c == '_') {
1819                         c = tok_nextc(tok);
1820                         if (!isdigit(c)) {
1821                             tok_backup(tok, c);
1822                             return syntaxerror(tok, "invalid decimal literal");
1823                         }
1824                     }
1825                     if (c != '0') {
1826                         break;
1827                     }
1828                     c = tok_nextc(tok);
1829                 }
1830                 char* zeros_end = tok->cur;
1831                 if (isdigit(c)) {
1832                     nonzero = 1;
1833                     c = tok_decimal_tail(tok);
1834                     if (c == 0) {
1835                         return ERRORTOKEN;
1836                     }
1837                 }
1838                 if (c == '.') {
1839                     c = tok_nextc(tok);
1840                     goto fraction;
1841                 }
1842                 else if (c == 'e' || c == 'E') {
1843                     goto exponent;
1844                 }
1845                 else if (c == 'j' || c == 'J') {
1846                     goto imaginary;
1847                 }
1848                 else if (nonzero) {
1849                     /* Old-style octal: now disallowed. */
1850                     tok_backup(tok, c);
1851                     return syntaxerror_known_range(
1852                             tok, (int)(tok->start + 1 - tok->line_start),
1853                             (int)(zeros_end - tok->line_start),
1854                             "leading zeros in decimal integer "
1855                             "literals are not permitted; "
1856                             "use an 0o prefix for octal integers");
1857                 }
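                     /* For example "0777" is rejected by the branch above, while
                        "0", "0.5", "0e1" and "0j" all fall through as valid
                        literals. */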
1858                 if (!verify_end_of_number(tok, c, "decimal")) {
1859                     return ERRORTOKEN;
1860                 }
1861             }
1862         }
1863         else {
1864             /* Decimal */
1865             c = tok_decimal_tail(tok);
1866             if (c == 0) {
1867                 return ERRORTOKEN;
1868             }
1869             {
1870                 /* Accept floating point numbers. */
1871                 if (c == '.') {
1872                     c = tok_nextc(tok);
1873         fraction:
1874                     /* Fraction */
1875                     if (isdigit(c)) {
1876                         c = tok_decimal_tail(tok);
1877                         if (c == 0) {
1878                             return ERRORTOKEN;
1879                         }
1880                     }
1881                 }
1882                 if (c == 'e' || c == 'E') {
1883                     int e;
1884                   exponent:
1885                     e = c;
1886                     /* Exponent part */
1887                     c = tok_nextc(tok);
1888                     if (c == '+' || c == '-') {
1889                         c = tok_nextc(tok);
1890                         if (!isdigit(c)) {
1891                             tok_backup(tok, c);
1892                             return syntaxerror(tok, "invalid decimal literal");
1893                         }
1894                     } else if (!isdigit(c)) {
1895                         tok_backup(tok, c);
1896                         if (!verify_end_of_number(tok, e, "decimal")) {
1897                             return ERRORTOKEN;
1898                         }
1899                         tok_backup(tok, e);
1900                         *p_start = tok->start;
1901                         *p_end = tok->cur;
1902                         return NUMBER;
1903                     }
1904                     c = tok_decimal_tail(tok);
1905                     if (c == 0) {
1906                         return ERRORTOKEN;
1907                     }
1908                 }
1909                 if (c == 'j' || c == 'J') {
1910                     /* Imaginary part */
1911         imaginary:
1912                     c = tok_nextc(tok);
1913                     if (!verify_end_of_number(tok, c, "imaginary")) {
1914                         return ERRORTOKEN;
1915                     }
1916                 }
1917                 else if (!verify_end_of_number(tok, c, "decimal")) {
1918                     return ERRORTOKEN;
1919                 }
1920             }
1921         }
1922         tok_backup(tok, c);
1923         *p_start = tok->start;
1924         *p_end = tok->cur;
1925         return NUMBER;
1926     }
1927 
1928   letter_quote:
1929     /* String */
1930     if (c == '\'' || c == '"') {
1931         int quote = c;
1932         int quote_size = 1;             /* 1 or 3 */
1933         int end_quote_size = 0;
1934 
1935         /* Nodes of type STRING, especially multi-line strings,
1936            must be handled differently in order to get both
1937            the starting line number and the column offset right.
1938            (cf. issue 16806) */
1939         tok->first_lineno = tok->lineno;
1940         tok->multi_line_start = tok->line_start;
1941 
1942         /* Find the quote size and start of string */
1943         c = tok_nextc(tok);
1944         if (c == quote) {
1945             c = tok_nextc(tok);
1946             if (c == quote) {
1947                 quote_size = 3;
1948             }
1949             else {
1950                 end_quote_size = 1;     /* empty string found */
1951             }
1952         }
1953         if (c != quote) {
1954             tok_backup(tok, c);
1955         }
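             /* quote_size is now 1 or 3; end_quote_size == 1 with quote_size == 1
                means an empty '' or "" literal was already consumed, so the loop
                below is skipped. */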
1956 
1957         /* Get rest of string */
1958         while (end_quote_size != quote_size) {
1959             c = tok_nextc(tok);
1960             if (c == EOF || (quote_size == 1 && c == '\n')) {
1961                 assert(tok->multi_line_start != NULL);
1962                 // shift the tok_state's location back to
1963                 // the start of the string, and report the error
1964                 // from the initial quote character
1965                 tok->cur = (char *)tok->start;
1966                 tok->cur++;
1967                 tok->line_start = tok->multi_line_start;
1968                 int start = tok->lineno;
1969                 tok->lineno = tok->first_lineno;
1970                 if (quote_size == 3) {
1971                     syntaxerror(tok, "unterminated triple-quoted string literal"
1972                                      " (detected at line %d)", start);
1973                     if (c != '\n') {
1974                         tok->done = E_EOFS;
1975                     }
1976                     return ERRORTOKEN;
1977                 }
1978                 else {
1979                     syntaxerror(tok, "unterminated string literal (detected at"
1980                                      " line %d)", start);
1981                     if (c != '\n') {
1982                         tok->done = E_EOLS;
1983                     }
1984                     return ERRORTOKEN;
1985                 }
1986             }
1987             if (c == quote) {
1988                 end_quote_size += 1;
1989             }
1990             else {
1991                 end_quote_size = 0;
1992                 if (c == '\\') {
1993                     tok_nextc(tok);  /* skip escaped char */
1994                 }
1995             }
1996         }
1997 
1998         *p_start = tok->start;
1999         *p_end = tok->cur;
2000         return STRING;
2001     }
2002 
2003     /* Line continuation */
2004     if (c == '\\') {
2005         if ((c = tok_continuation_line(tok)) == -1) {
2006             return ERRORTOKEN;
2007         }
2008         tok->cont_line = 1;
2009         goto again; /* Read next line */
2010     }
2011 
2012     /* Check for two-character token */
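         /* e.g. "!=", "->" and "**"; when a third character extends the pair to
            a token such as "**=" or ">>=", the longer token wins, otherwise the
            third character is pushed back. */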
2013     {
2014         int c2 = tok_nextc(tok);
2015         int token = PyToken_TwoChars(c, c2);
2016         if (token != OP) {
2017             int c3 = tok_nextc(tok);
2018             int token3 = PyToken_ThreeChars(c, c2, c3);
2019             if (token3 != OP) {
2020                 token = token3;
2021             }
2022             else {
2023                 tok_backup(tok, c3);
2024             }
2025             *p_start = tok->start;
2026             *p_end = tok->cur;
2027             return token;
2028         }
2029         tok_backup(tok, c2);
2030     }
2031 
2032     /* Keep track of parentheses nesting level */
2033     switch (c) {
2034     case '(':
2035     case '[':
2036     case '{':
2037         if (tok->level >= MAXLEVEL) {
2038             return syntaxerror(tok, "too many nested parentheses");
2039         }
2040         tok->parenstack[tok->level] = c;
2041         tok->parenlinenostack[tok->level] = tok->lineno;
2042         tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2043         tok->level++;
2044         break;
2045     case ')':
2046     case ']':
2047     case '}':
2048         if (!tok->level) {
2049             return syntaxerror(tok, "unmatched '%c'", c);
2050         }
2051         tok->level--;
2052         int opening = tok->parenstack[tok->level];
2053         if (!((opening == '(' && c == ')') ||
2054               (opening == '[' && c == ']') ||
2055               (opening == '{' && c == '}')))
2056         {
2057             if (tok->parenlinenostack[tok->level] != tok->lineno) {
2058                 return syntaxerror(tok,
2059                         "closing parenthesis '%c' does not match "
2060                         "opening parenthesis '%c' on line %d",
2061                         c, opening, tok->parenlinenostack[tok->level]);
2062             }
2063             else {
2064                 return syntaxerror(tok,
2065                         "closing parenthesis '%c' does not match "
2066                         "opening parenthesis '%c'",
2067                         c, opening);
2068             }
2069         }
2070         break;
2071     }
2072 
2073     /* Punctuation character */
2074     *p_start = tok->start;
2075     *p_end = tok->cur;
2076     return PyToken_OneChar(c);
2077 }
2078 
2079 int
2080 PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
2081 {
2082     int result = tok_get(tok, p_start, p_end);
2083     if (tok->decoding_erred) {
2084         result = ERRORTOKEN;
2085         tok->done = E_DECODE;
2086     }
2087     return result;
2088 }
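
     /* Usage sketch (illustrative only, not part of this file): a caller can
        drive the tokenizer over an in-memory string until ENDMARKER or
        ERRORTOKEN comes back.  It assumes the PyTokenizer_FromString() /
        PyTokenizer_Free() pair declared in tokenizer.h.

            struct tok_state *t = PyTokenizer_FromString("x = 1 + 2\n", 1);
            if (t != NULL) {
                const char *start, *end;
                int type;
                do {
                    type = PyTokenizer_Get(t, &start, &end);
                } while (type != ENDMARKER && type != ERRORTOKEN);
                PyTokenizer_Free(t);
            }
     */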
2089 
2090 /* Get the encoding of a Python file. Check for the coding cookie and check if
2091    the file starts with a BOM.
2092 
2093    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2094    encoding in the first or second line of the file (in which case the encoding
2095    should be assumed to be UTF-8).
2096 
2097    The char* returned is allocated via PyMem_Malloc() and thus must be freed
2098    by the caller with PyMem_Free(). */
2099 
2100 char *
2101 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2102 {
2103     struct tok_state *tok;
2104     FILE *fp;
2105     const char *p_start = NULL;
2106     const char *p_end = NULL;
2107     char *encoding = NULL;
2108 
2109     fd = _Py_dup(fd);
2110     if (fd < 0) {
2111         return NULL;
2112     }
2113 
2114     fp = fdopen(fd, "r");
2115     if (fp == NULL) {
2116         return NULL;
2117     }
2118     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2119     if (tok == NULL) {
2120         fclose(fp);
2121         return NULL;
2122     }
2123     if (filename != NULL) {
2124         Py_INCREF(filename);
2125         tok->filename = filename;
2126     }
2127     else {
2128         tok->filename = PyUnicode_FromString("<string>");
2129         if (tok->filename == NULL) {
2130             fclose(fp);
2131             PyTokenizer_Free(tok);
2132             return encoding;
2133         }
2134     }
2135     while (tok->lineno < 2 && tok->done == E_OK) {
2136         PyTokenizer_Get(tok, &p_start, &p_end);
2137     }
2138     fclose(fp);
2139     if (tok->encoding) {
2140         encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2141         if (encoding) {
2142             strcpy(encoding, tok->encoding);
2143         }
2144     }
2145     PyTokenizer_Free(tok);
2146     return encoding;
2147 }
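
     /* Usage sketch (illustrative only, not part of this file): fd is dup'ed
        internally, so the caller keeps ownership of its descriptor, and the
        returned string must be released with PyMem_Free().  A NULL result
        usually means no cookie/BOM was found and UTF-8 should be assumed.
        The open() call and "script.py" path below are hypothetical caller code.

            int fd = open("script.py", O_RDONLY);
            if (fd >= 0) {
                char *enc = PyTokenizer_FindEncodingFilename(fd, NULL);
                if (enc != NULL) {
                    PyMem_Free(enc);
                }
                close(fd);
            }
     */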
2148 
2149 char *
2150 PyTokenizer_FindEncoding(int fd)
2151 {
2152     return PyTokenizer_FindEncodingFilename(fd, NULL);
2153 }
2154 
2155 #ifdef Py_DEBUG
2156 
2157 void
2158 tok_dump(int type, char *start, char *end)
2159 {
2160     fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2161     if (type == NAME || type == NUMBER || type == STRING || type == OP)
2162         fprintf(stderr, "(%.*s)", (int)(end - start), start);
2163 }
2164 
2165 #endif
2166