
/* Tokenizer implementation */

#define PY_SSIZE_T_CLEAN
#include "Python.h"

#include <ctype.h>
#include <assert.h>

#include "tokenizer.h"
#include "errcode.h"

#include "unicodeobject.h"
#include "bytesobject.h"
#include "fileobject.h"
#include "abstract.h"

/* Alternate tab spacing */
#define ALTTABSIZE 1

#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || c == '_'\
               || (c >= 128))

#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || (c >= '0' && c <= '9')\
               || c == '_'\
               || (c >= 128))


/* Don't ever change this -- it would break the portability of Python code */
#define TABSIZE 8
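
/* The tokenizer measures indentation twice: once with TABSIZE (a tab is 8
   columns) and once with ALTTABSIZE (a tab is 1 column).  If the two
   measurements disagree about how two lines compare, the indentation is
   ambiguous (it depends on the tab size) and indenterror() reports
   E_TABSPACE; see the "Get indentation level" block in tok_get() below. */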

/* Forward */
static struct tok_state *tok_new(void);
static int tok_nextc(struct tok_state *tok);
static void tok_backup(struct tok_state *tok, int c);


/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
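
/* So "# type: int", "#type:int" and "#   type:   int" all match this prefix:
   the '#' and the "type:" part must appear literally, while every space in
   the prefix stands for any run (possibly empty) of spaces and tabs.  See
   the matching loop in tok_get() below. */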

/* Create and initialize a new tok_state structure */

static struct tok_state *
tok_new(void)
{
    struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
                                            sizeof(struct tok_state));
    if (tok == NULL)
        return NULL;
    tok->buf = tok->cur = tok->inp = NULL;
    tok->fp_interactive = 0;
    tok->interactive_src_start = NULL;
    tok->interactive_src_end = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_OK;
    tok->fp = NULL;
    tok->input = NULL;
    tok->tabsize = TABSIZE;
    tok->indent = 0;
    tok->indstack[0] = 0;
    tok->atbol = 1;
    tok->pendin = 0;
    tok->prompt = tok->nextprompt = NULL;
    tok->lineno = 0;
    tok->level = 0;
    tok->altindstack[0] = 0;
    tok->decoding_state = STATE_INIT;
    tok->decoding_erred = 0;
    tok->enc = NULL;
    tok->encoding = NULL;
    tok->cont_line = 0;
    tok->filename = NULL;
    tok->decoding_readline = NULL;
    tok->decoding_buffer = NULL;
    tok->type_comments = 0;
    tok->async_hacks = 0;
    tok->async_def = 0;
    tok->async_def_indent = 0;
    tok->async_def_nl = 0;
    tok->interactive_underflow = IUNDERFLOW_NORMAL;
    tok->str = NULL;
    return tok;
}

static char *
new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
{
    char* result = (char *)PyMem_Malloc(len + 1);
    if (!result) {
        tok->done = E_NOMEM;
        return NULL;
    }
    memcpy(result, s, len);
    result[len] = '\0';
    return result;
}

static char *
error_ret(struct tok_state *tok) /* XXX */
{
    tok->decoding_erred = 1;
    if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
        PyMem_Free(tok->buf);
    tok->buf = tok->cur = tok->inp = NULL;
    tok->start = NULL;
    tok->end = NULL;
    tok->done = E_DECODE;
    return NULL;                /* as if it were EOF */
}


static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        int c = s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
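
/* For example, "UTF_8" and "utf-8-sig" both normalize to "utf-8", and
   "Latin_1" normalizes to "iso-8859-1"; unrecognized names are returned
   unchanged.  Only the first 12 characters are examined. */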

/* Look for a coding spec in S and store a copy of it in *SPEC (set to NULL
   if none is found).  Return 1 on success, 0 on failure (out of memory).  */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (memcmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            if (t[0] != ':' && t[0] != '=')
                continue;
            do {
                t++;
            } while (t[0] == ' ' || t[0] == '\t');

            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_Free(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
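
/* Typical lines accepted here are the PEP 263 forms, e.g.:

       # coding: latin-1
       # -*- coding: utf-8 -*-
       # vim: set fileencoding=utf-8 :

   The comment may be preceded only by whitespace on its line, and the spec
   name is normalized through get_normal_name() above. */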

/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure.  */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->decoding_state = STATE_NORMAL;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok)) {
        return 0;
    }
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->decoding_state = STATE_NORMAL;
                break;
            }
        }
        return 1;
    }
    tok->decoding_state = STATE_NORMAL;
    if (tok->encoding == NULL) {
        assert(tok->decoding_readline == NULL);
        if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
            PyMem_Free(cs);
            return 0;
        }
        tok->encoding = cs;
    } else {                /* then, compare cs with BOM */
        if (strcmp(tok->encoding, cs) != 0) {
            error_ret(tok);
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
            PyMem_Free(cs);
            return 0;
        }
        PyMem_Free(cs);
    }
    return 1;
}

/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure.  */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_SEEK_CODING;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
    /* Disable support for UTF-16 BOMs until a decision
       is made whether this needs to be supported.  */
    } else if (ch1 == 0xFE) {
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_Free(tok->encoding);
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
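
/* In practice only the UTF-8 signature (0xEF 0xBB 0xBF) is acted upon: it is
   consumed and tok->encoding is set to "utf-8".  The UTF-16 branches are
   compiled out above, and any other leading bytes are pushed back intact. */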

static int
tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
    assert(tok->fp_interactive);

    if (!line) {
        return 0;
    }

    Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
    Py_ssize_t line_size = strlen(line);
    char* new_str = tok->interactive_src_start;

    new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
    if (!new_str) {
        if (tok->interactive_src_start) {
            PyMem_Free(tok->interactive_src_start);
        }
        tok->interactive_src_start = NULL;
        tok->interactive_src_end = NULL;
        tok->done = E_NOMEM;
        return -1;
    }
    strcpy(new_str + current_size, line);

    tok->interactive_src_start = new_str;
    tok->interactive_src_end = new_str + current_size + line_size;
    return 0;
}


/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
       stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
       (in the s buffer) to copy entire contents of the line read
       by tok->decoding_readline.  tok->decoding_buffer has the overflow.
       In this case, tok_readline_recode is called in a loop (with an expanded buffer)
       until the buffer ends with a '\n' (or until the end of the file is
       reached): see tok_nextc and its calls to tok_reserve_buf.
*/

static int
tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
{
    Py_ssize_t cur = tok->cur - tok->buf;
    Py_ssize_t oldsize = tok->inp - tok->buf;
    Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
    if (newsize > tok->end - tok->buf) {
        char *newbuf = tok->buf;
        Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
        Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
        Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
        newbuf = (char *)PyMem_Realloc(newbuf, newsize);
        if (newbuf == NULL) {
            tok->done = E_NOMEM;
            return 0;
        }
        tok->buf = newbuf;
        tok->cur = tok->buf + cur;
        tok->inp = tok->buf + oldsize;
        tok->end = tok->buf + newsize;
        tok->start = start < 0 ? NULL : tok->buf + start;
        tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
        tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
    }
    return 1;
}
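
/* When the request does not fit, the new capacity is the filled size plus
   max(size, filled/2), i.e. the buffer grows by at least half of its current
   contents, and every cached pointer into the old block (cur, inp, start,
   line_start, multi_line_start) is rebased onto the reallocated one. */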

static int
tok_readline_recode(struct tok_state *tok) {
    PyObject *line;
    const char *buf;
    Py_ssize_t buflen;
    line = tok->decoding_buffer;
    if (line == NULL) {
        line = PyObject_CallNoArgs(tok->decoding_readline);
        if (line == NULL) {
            error_ret(tok);
            goto error;
        }
    }
    else {
        tok->decoding_buffer = NULL;
    }
    buf = PyUnicode_AsUTF8AndSize(line, &buflen);
    if (buf == NULL) {
        error_ret(tok);
        goto error;
    }
    if (!tok_reserve_buf(tok, buflen + 1)) {
        goto error;
    }
    memcpy(tok->inp, buf, buflen);
    tok->inp += buflen;
    *tok->inp = '\0';
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, buf) == -1) {
        goto error;
    }
    Py_DECREF(line);
    return 1;
error:
    Py_XDECREF(line);
    return 0;
}

/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp.  If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd.  Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
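
/* From here on the underlying file descriptor is wrapped by io.open() with
   the declared encoding, so tok->decoding_readline yields str objects that
   tok_readline_recode() copies back into the UTF-8 working buffer. */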

/* Fetch the next byte from TOK. */

static int fp_getc(struct tok_state *tok) {
    return getc(tok->fp);
}

/* Unfetch the last byte back into TOK.  */

static void fp_ungetc(int c, struct tok_state *tok) {
    ungetc(c, tok->fp);
}

/* Check whether the characters at s start a valid
   UTF-8 sequence. Return the number of characters forming
   the sequence if yes, 0 if not.  */
static int valid_utf8(const unsigned char* s)
{
    int expected = 0;
    int length;
    if (*s < 0x80)
        /* single-byte code */
        return 1;
    if (*s < 0xc0)
        /* following byte */
        return 0;
    if (*s < 0xE0)
        expected = 1;
    else if (*s < 0xF0)
        expected = 2;
    else if (*s < 0xF8)
        expected = 3;
    else
        return 0;
    length = expected + 1;
    for (; expected; expected--)
        if (s[expected] < 0x80 || s[expected] >= 0xC0)
            return 0;
    return length;
}
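
/* Lead bytes accepted above: 0x00-0x7F (length 1), 0xC0-0xDF (2), 0xE0-0xEF
   (3) and 0xF0-0xF7 (4); every continuation byte must fall in 0x80-0xBF.
   This is a structural check only; overlong encodings are not rejected here. */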

static int
ensure_utf8(char *line, struct tok_state *tok)
{
    int badchar = 0;
    unsigned char *c;
    int length;
    for (c = (unsigned char *)line; *c; c += length) {
        if (!(length = valid_utf8(c))) {
            badchar = *c;
            break;
        }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted yet.  */
        PyErr_Format(PyExc_SyntaxError,
                     "Non-UTF-8 code starting with '\\x%.2x' "
                     "in file %U on line %i, "
                     "but no encoding declared; "
                     "see https://python.org/dev/peps/pep-0263/ for details",
                     badchar, tok->filename, tok->lineno + 1);
        return 0;
    }
    return 1;
}

/* Fetch a byte from TOK, using the string buffer. */

static int
buf_getc(struct tok_state *tok) {
    return Py_CHARMASK(*tok->str++);
}

/* Unfetch a byte from TOK, using the string buffer. */

static void
buf_ungetc(int c, struct tok_state *tok) {
    tok->str--;
    assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
}

/* Set the readline function for TOK to ENC. For the string-based
   tokenizer, this means to just record the encoding. */

static int
buf_setreadl(struct tok_state *tok, const char* enc) {
    tok->enc = enc;
    return 1;
}

/* Return a UTF-8 encoded Python bytes object translated from the
   C byte string STR, which is encoded with ENC. */

static PyObject *
translate_into_utf8(const char* str, const char* enc) {
    PyObject *utf8;
    PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (buf == NULL)
        return NULL;
    utf8 = PyUnicode_AsUTF8String(buf);
    Py_DECREF(buf);
    return utf8;
}


static char *
translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
    int skip_next_lf = 0;
    size_t needed_length = strlen(s) + 2, final_length;
    char *buf, *current;
    char c = '\0';
    buf = PyMem_Malloc(needed_length);
    if (buf == NULL) {
        tok->done = E_NOMEM;
        return NULL;
    }
    for (current = buf; *s; s++, current++) {
        c = *s;
        if (skip_next_lf) {
            skip_next_lf = 0;
            if (c == '\n') {
                c = *++s;
                if (!c)
                    break;
            }
        }
        if (c == '\r') {
            skip_next_lf = 1;
            c = '\n';
        }
        *current = c;
    }
    /* If this is exec input, add a newline to the end of the string if
       there isn't one already. */
    if (exec_input && c != '\n') {
        *current = '\n';
        current++;
    }
    *current = '\0';
    final_length = current - buf + 1;
    if (final_length < needed_length && final_length) {
        /* should never fail */
        char* result = PyMem_Realloc(buf, final_length);
        if (result == NULL) {
            PyMem_Free(buf);
        }
        buf = result;
    }
    return buf;
}
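
/* For example, "a\r\nb\r" becomes "a\nb\n": both CRLF and lone CR are folded
   into LF, and for exec input a missing trailing newline is appended so the
   tokenizer always sees complete lines. */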

/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK.  */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
            return NULL;
        }
        if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return NULL;
        }
    }
    if (tok->enc != NULL) {
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION */
    return str;
}

/* Set up tokenizer for string */

struct tok_state *
PyTokenizer_FromString(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    char *decoded;

    if (tok == NULL)
        return NULL;
    decoded = decode_str(str, exec_input, tok);
    if (decoded == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    tok->buf = tok->cur = tok->inp = decoded;
    tok->end = decoded;
    return tok;
}

/* Set up tokenizer for UTF-8 string */

struct tok_state *
PyTokenizer_FromUTF8(const char *str, int exec_input)
{
    struct tok_state *tok = tok_new();
    char *translated;
    if (tok == NULL)
        return NULL;
    tok->input = translated = translate_newlines(str, exec_input, tok);
    if (translated == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->decoding_state = STATE_NORMAL;
    tok->enc = NULL;
    tok->str = translated;
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding) {
        PyTokenizer_Free(tok);
        return NULL;
    }

    tok->buf = tok->cur = tok->inp = translated;
    tok->end = translated;
    return tok;
}

/* Set up tokenizer for file */

struct tok_state *
PyTokenizer_FromFile(FILE *fp, const char* enc,
                     const char *ps1, const char *ps2)
{
    struct tok_state *tok = tok_new();
    if (tok == NULL)
        return NULL;
    if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
        PyTokenizer_Free(tok);
        return NULL;
    }
    tok->cur = tok->inp = tok->buf;
    tok->end = tok->buf + BUFSIZ;
    tok->fp = fp;
    tok->prompt = ps1;
    tok->nextprompt = ps2;
    if (enc != NULL) {
        /* Must copy encoding declaration since it
           gets copied into the parse tree. */
        tok->encoding = new_string(enc, strlen(enc), tok);
        if (!tok->encoding) {
            PyTokenizer_Free(tok);
            return NULL;
        }
        tok->decoding_state = STATE_NORMAL;
    }
    return tok;
}

/* Free a tok_state structure */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL) {
        PyMem_Free(tok->encoding);
    }
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
    if (tok->fp != NULL && tok->buf != NULL) {
        PyMem_Free(tok->buf);
    }
    if (tok->input) {
        PyMem_Free(tok->input);
    }
    if (tok->interactive_src_start != NULL) {
        PyMem_Free(tok->interactive_src_start);
    }
    PyMem_Free(tok);
}

static int
tok_readline_raw(struct tok_state *tok)
{
    do {
        if (!tok_reserve_buf(tok, BUFSIZ)) {
            return 0;
        }
        char *line = Py_UniversalNewlineFgets(tok->inp,
                                              (int)(tok->end - tok->inp),
                                              tok->fp, NULL);
        if (line == NULL) {
            return 1;
        }
        if (tok->fp_interactive &&
            tok_concatenate_interactive_new_line(tok, line) == -1) {
            return 0;
        }
        tok->inp = strchr(tok->inp, '\0');
        if (tok->inp == tok->buf) {
            return 0;
        }
    } while (tok->inp[-1] != '\n');
    return 1;
}

static int
tok_underflow_string(struct tok_state *tok) {
    char *end = strchr(tok->inp, '\n');
    if (end != NULL) {
        end++;
    }
    else {
        end = strchr(tok->inp, '\0');
        if (end == tok->inp) {
            tok->done = E_EOF;
            return 0;
        }
    }
    if (tok->start == NULL) {
        tok->buf = tok->cur;
    }
    tok->line_start = tok->cur;
    tok->lineno++;
    tok->inp = end;
    return 1;
}

static int
tok_underflow_interactive(struct tok_state *tok) {
    if (tok->interactive_underflow == IUNDERFLOW_STOP) {
        tok->done = E_INTERACT_STOP;
        return 1;
    }
    char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
    if (newtok != NULL) {
        char *translated = translate_newlines(newtok, 0, tok);
        PyMem_Free(newtok);
        if (translated == NULL) {
            return 0;
        }
        newtok = translated;
    }
    if (tok->encoding && newtok && *newtok) {
        /* Recode to UTF-8 */
        Py_ssize_t buflen;
        const char* buf;
        PyObject *u = translate_into_utf8(newtok, tok->encoding);
        PyMem_Free(newtok);
        if (u == NULL) {
            tok->done = E_DECODE;
            return 0;
        }
        buflen = PyBytes_GET_SIZE(u);
        buf = PyBytes_AS_STRING(u);
        newtok = PyMem_Malloc(buflen+1);
        if (newtok == NULL) {
            Py_DECREF(u);
            tok->done = E_NOMEM;
            return 0;
        }
        strcpy(newtok, buf);
        Py_DECREF(u);
    }
    if (tok->fp_interactive &&
        tok_concatenate_interactive_new_line(tok, newtok) == -1) {
        PyMem_Free(newtok);
        return 0;
    }
    if (tok->nextprompt != NULL) {
        tok->prompt = tok->nextprompt;
    }
    if (newtok == NULL) {
        tok->done = E_INTR;
    }
    else if (*newtok == '\0') {
        PyMem_Free(newtok);
        tok->done = E_EOF;
    }
    else if (tok->start != NULL) {
        Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
        size_t size = strlen(newtok);
        tok->lineno++;
        if (!tok_reserve_buf(tok, size + 1)) {
            PyMem_Free(tok->buf);
            tok->buf = NULL;
            PyMem_Free(newtok);
            return 0;
        }
        memcpy(tok->cur, newtok, size + 1);
        PyMem_Free(newtok);
        tok->inp += size;
        tok->multi_line_start = tok->buf + cur_multi_line_start;
    }
    else {
        tok->lineno++;
        PyMem_Free(tok->buf);
        tok->buf = newtok;
        tok->cur = tok->buf;
        tok->line_start = tok->buf;
        tok->inp = strchr(tok->buf, '\0');
        tok->end = tok->inp + 1;
    }
    if (tok->done != E_OK) {
        if (tok->prompt != NULL) {
            PySys_WriteStderr("\n");
        }
        return 0;
    }
    return 1;
}

static int
tok_underflow_file(struct tok_state *tok) {
    if (tok->start == NULL) {
        tok->cur = tok->inp = tok->buf;
    }
    if (tok->decoding_state == STATE_INIT) {
        /* We have not yet determined the encoding.
           If an encoding is found, use the file-pointer
           reader functions from now on. */
        if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
            error_ret(tok);
            return 0;
        }
        assert(tok->decoding_state != STATE_INIT);
    }
    /* Read until '\n' or EOF */
    if (tok->decoding_readline != NULL) {
        /* We already have a codec associated with this input. */
        if (!tok_readline_recode(tok)) {
            return 0;
        }
    }
    else {
        /* We want a 'raw' read. */
        if (!tok_readline_raw(tok)) {
            return 0;
        }
    }
    if (tok->inp == tok->cur) {
        tok->done = E_EOF;
        return 0;
    }
    if (tok->inp[-1] != '\n') {
        /* Last line does not end in \n, fake one */
        *tok->inp++ = '\n';
        *tok->inp = '\0';
    }

    tok->lineno++;
    if (tok->decoding_state != STATE_NORMAL) {
        if (tok->lineno > 2) {
            tok->decoding_state = STATE_NORMAL;
        }
        else if (!check_coding_spec(tok->cur, strlen(tok->cur),
                                    tok, fp_setreadl))
        {
            return 0;
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
        error_ret(tok);
        return 0;
    }
    assert(tok->done == E_OK);
    return tok->done == E_OK;
}

#if defined(Py_DEBUG)
static void
print_escape(FILE *f, const char *s, Py_ssize_t size)
{
    if (s == NULL) {
        fputs("NULL", f);
        return;
    }
    putc('"', f);
    while (size-- > 0) {
        unsigned char c = *s++;
        switch (c) {
            case '\n': fputs("\\n", f); break;
            case '\r': fputs("\\r", f); break;
            case '\t': fputs("\\t", f); break;
            case '\f': fputs("\\f", f); break;
            case '\'': fputs("\\'", f); break;
            case '"': fputs("\\\"", f); break;
            default:
                if (0x20 <= c && c <= 0x7f)
                    putc(c, f);
                else
                    fprintf(f, "\\x%02x", c);
        }
    }
    putc('"', f);
}
#endif

/* Get next char, updating state; error code goes into tok->done */

static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            rc = tok_underflow_string(tok);
        }
        else if (tok->prompt != NULL) {
            rc = tok_underflow_interactive(tok);
        }
        else {
            rc = tok_underflow_file(tok);
        }
#if defined(Py_DEBUG)
        if (Py_DebugFlag) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, "  tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;
    }
    Py_UNREACHABLE();
}

/* Back-up one character */

static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        if ((int)(unsigned char)*tok->cur != c) {
            Py_FatalError("tok_backup: wrong character");
        }
    }
}

static int
_syntaxerror_range(struct tok_state *tok, const char *format,
                   int col_offset, int end_col_offset,
                   va_list vargs)
{
    PyObject *errmsg, *errtext, *args;
    errmsg = PyUnicode_FromFormatV(format, vargs);
    if (!errmsg) {
        goto error;
    }

    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }

    if (col_offset == -1) {
        col_offset = (int)PyUnicode_GET_LENGTH(errtext);
    }
    if (end_col_offset == -1) {
        end_col_offset = col_offset;
    }

    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
                         col_offset, errtext, tok->lineno, end_col_offset);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}

static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
    va_end(vargs);
    return ret;
}

static int
syntaxerror_known_range(struct tok_state *tok,
                        int col_offset, int end_col_offset,
                        const char *format, ...)
{
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
    va_end(vargs);
    return ret;
}



static int
indenterror(struct tok_state *tok)
{
    tok->done = E_TABSPACE;
    tok->cur = tok->inp;
    return ERRORTOKEN;
}

static int
parser_warn(struct tok_state *tok, const char *format, ...)
{
    PyObject *errmsg;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
                                 tok->lineno, NULL, NULL) < 0) {
        if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
            /* Replace the DeprecationWarning exception with a SyntaxError
               to get a more accurate error report */
            PyErr_Clear();
            syntaxerror(tok, "%U", errmsg);
        }
        goto error;
    }
    Py_DECREF(errmsg);
    return 0;

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return -1;
}

static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
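
/* lookahead() reports whether the upcoming characters spell TEST followed
   by a non-identifier character, and backs up everything it consumed, so
   the caller's position in the buffer is unchanged either way. */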

static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind)
{
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of the keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * This allows existing valid code to be deprecated gradually without
     * adding a warning before the error in most cases of invalid numeric
     * literals (which would be confusing and break existing tests).
     * Raise a syntax error with a slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * another keyword or identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    if (r) {
        tok_backup(tok, c);
        if (parser_warn(tok, "invalid %s literal", kind)) {
            return 0;
        }
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
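
/* So a literal like "1if cond else 2" currently only takes the
   DeprecationWarning path above, while "1abc" falls through to the
   syntaxerror branch and is rejected as an invalid decimal literal. */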

/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.
 */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}

static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (isdigit(c));
        if (c != '_') {
            break;
        }
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
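
/* Underscores are accepted only between digits: "1_000" scans cleanly, while
   "1__0" or a trailing "1_" takes the syntaxerror branch above. */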
1346 
1347 /* Get next token, after space stripping etc. */
1348 
1349 static int
tok_get(struct tok_state * tok,const char ** p_start,const char ** p_end)1350 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1351 {
1352     int c;
1353     int blankline, nonascii;
1354 
1355     *p_start = *p_end = NULL;
1356   nextline:
1357     tok->start = NULL;
1358     blankline = 0;
1359 
1360     /* Get indentation level */
1361     if (tok->atbol) {
1362         int col = 0;
1363         int altcol = 0;
1364         tok->atbol = 0;
1365         for (;;) {
1366             c = tok_nextc(tok);
1367             if (c == ' ') {
1368                 col++, altcol++;
1369             }
1370             else if (c == '\t') {
1371                 col = (col / tok->tabsize + 1) * tok->tabsize;
1372                 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1373             }
1374             else if (c == '\014')  {/* Control-L (formfeed) */
1375                 col = altcol = 0; /* For Emacs users */
1376             }
1377             else {
1378                 break;
1379             }
1380         }
1381         tok_backup(tok, c);
1382         if (c == '#' || c == '\n' || c == '\\') {
1383             /* Lines with only whitespace and/or comments
1384                and/or a line continuation character
1385                shouldn't affect the indentation and are
1386                not passed to the parser as NEWLINE tokens,
1387                except *totally* empty lines in interactive
1388                mode, which signal the end of a command group. */
1389             if (col == 0 && c == '\n' && tok->prompt != NULL) {
1390                 blankline = 0; /* Let it through */
1391             }
1392             else if (tok->prompt != NULL && tok->lineno == 1) {
1393                 /* In interactive mode, if the first line contains
1394                    only spaces and/or a comment, let it through. */
1395                 blankline = 0;
1396                 col = altcol = 0;
1397             }
1398             else {
1399                 blankline = 1; /* Ignore completely */
1400             }
1401             /* We can't jump back right here since we still
1402                may need to skip to the end of a comment */
1403         }
1404         if (!blankline && tok->level == 0) {
1405             if (col == tok->indstack[tok->indent]) {
1406                 /* No change */
1407                 if (altcol != tok->altindstack[tok->indent]) {
1408                     return indenterror(tok);
1409                 }
1410             }
1411             else if (col > tok->indstack[tok->indent]) {
1412                 /* Indent -- always one */
1413                 if (tok->indent+1 >= MAXINDENT) {
1414                     tok->done = E_TOODEEP;
1415                     tok->cur = tok->inp;
1416                     return ERRORTOKEN;
1417                 }
1418                 if (altcol <= tok->altindstack[tok->indent]) {
1419                     return indenterror(tok);
1420                 }
1421                 tok->pendin++;
1422                 tok->indstack[++tok->indent] = col;
1423                 tok->altindstack[tok->indent] = altcol;
1424             }
1425             else /* col < tok->indstack[tok->indent] */ {
1426                 /* Dedent -- any number, must be consistent */
1427                 while (tok->indent > 0 &&
1428                     col < tok->indstack[tok->indent]) {
1429                     tok->pendin--;
1430                     tok->indent--;
1431                 }
1432                 if (col != tok->indstack[tok->indent]) {
1433                     tok->done = E_DEDENT;
1434                     tok->cur = tok->inp;
1435                     return ERRORTOKEN;
1436                 }
1437                 if (altcol != tok->altindstack[tok->indent]) {
1438                     return indenterror(tok);
1439                 }
1440             }
1441         }
1442     }
1443 
1444     tok->start = tok->cur;
1445 
1446     /* Return pending indents/dedents */
1447     if (tok->pendin != 0) {
1448         if (tok->pendin < 0) {
1449             tok->pendin++;
1450             return DEDENT;
1451         }
1452         else {
1453             tok->pendin--;
1454             return INDENT;
1455         }
1456     }
1457 
1458     /* Peek ahead at the next character */
1459     c = tok_nextc(tok);
1460     tok_backup(tok, c);
1461     /* Check if we are closing an async function */
1462     if (tok->async_def
1463         && !blankline
1464         /* Due to some implementation artifacts of type comments,
1465          * a TYPE_COMMENT at the start of a function won't set an
1466          * indentation level and it will produce a NEWLINE after it.
1467          * To avoid spuriously ending an async function due to this,
1468          * wait until we have some non-newline char in front of us. */
1469         && c != '\n'
1470         && tok->level == 0
1471         /* There was a NEWLINE after ASYNC DEF,
1472            so we're past the signature. */
1473         && tok->async_def_nl
1474         /* Current indentation level is less than where
1475            the async function was defined */
1476         && tok->async_def_indent >= tok->indent)
1477     {
1478         tok->async_def = 0;
1479         tok->async_def_indent = 0;
1480         tok->async_def_nl = 0;
1481     }
1482 
1483  again:
1484     tok->start = NULL;
1485     /* Skip spaces */
1486     do {
1487         c = tok_nextc(tok);
1488     } while (c == ' ' || c == '\t' || c == '\014');
1489 
1490     /* Set start of current token */
1491     tok->start = tok->cur - 1;
1492 
1493     /* Skip comment, unless it's a type comment */
1494     if (c == '#') {
1495         const char *prefix, *p, *type_start;
1496 
1497         while (c != EOF && c != '\n') {
1498             c = tok_nextc(tok);
1499         }
1500 
1501         if (tok->type_comments) {
1502             p = tok->start;
1503             prefix = type_comment_prefix;
1504             while (*prefix && p < tok->cur) {
1505                 if (*prefix == ' ') {
1506                     while (*p == ' ' || *p == '\t') {
1507                         p++;
1508                     }
1509                 } else if (*prefix == *p) {
1510                     p++;
1511                 } else {
1512                     break;
1513                 }
1514 
1515                 prefix++;
1516             }
1517 
1518             /* This is a type comment if we matched all of type_comment_prefix. */
1519             if (!*prefix) {
1520                 int is_type_ignore = 1;
1521                 const char *ignore_end = p + 6;
1522                 tok_backup(tok, c);  /* don't eat the newline or EOF */
1523 
1524                 type_start = p;
1525 
1526                 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1527                  * or anything ASCII and non-alphanumeric. */
1528                 is_type_ignore = (
1529                     tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1530                     && !(tok->cur > ignore_end
1531                          && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1532 
1533                 if (is_type_ignore) {
1534                     *p_start = ignore_end;
1535                     *p_end = tok->cur;
1536 
1537                     /* If this type ignore is the only thing on the line, consume the newline also. */
1538                     if (blankline) {
1539                         tok_nextc(tok);
1540                         tok->atbol = 1;
1541                     }
1542                     return TYPE_IGNORE;
1543                 } else {
1544                     *p_start = type_start;  /* after type_comment_prefix */
1545                     *p_end = tok->cur;
1546                     return TYPE_COMMENT;
1547                 }
1548             }
1549         }
1550     }
1551 
1552     if (tok->done == E_INTERACT_STOP) {
1553         return ENDMARKER;
1554     }
1555 
1556     /* Check for EOF and errors now */
1557     if (c == EOF) {
1558         if (tok->level) {
1559             return ERRORTOKEN;
1560         }
1561         return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1562     }
1563 
1564     /* Identifier (most frequent token!) */
1565     nonascii = 0;
1566     if (is_potential_identifier_start(c)) {
1567         /* Process the various legal combinations of b"", r"", u"", and f"". */
1568         int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1569         while (1) {
1570             if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1571                 saw_b = 1;
1572             /* Since this is a backwards compatibility support literal we don't
1573                want to support it in arbitrary order like byte literals. */
1574             else if (!(saw_b || saw_u || saw_r || saw_f)
1575                      && (c == 'u'|| c == 'U')) {
1576                 saw_u = 1;
1577             }
1578             /* ur"" and ru"" are not supported */
1579             else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1580                 saw_r = 1;
1581             }
1582             else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1583                 saw_f = 1;
1584             }
1585             else {
1586                 break;
1587             }
1588             c = tok_nextc(tok);
1589             if (c == '"' || c == '\'') {
1590                 goto letter_quote;
1591             }
1592         }
1593         while (is_potential_identifier_char(c)) {
1594             if (c >= 128) {
1595                 nonascii = 1;
1596             }
1597             c = tok_nextc(tok);
1598         }
1599         tok_backup(tok, c);
1600         if (nonascii && !verify_identifier(tok)) {
1601             return ERRORTOKEN;
1602         }
1603 
1604         *p_start = tok->start;
1605         *p_end = tok->cur;
1606 
1607         /* async/await parsing block. */
1608         if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1609             /* May be an 'async' or 'await' token.  For Python 3.7 or
1610                later we recognize them unconditionally.  For Python
1611                3.5 or 3.6 we recognize 'async' in front of 'def', and
1612                either one inside of 'async def'.  (Technically we
1613                shouldn't recognize these at all for 3.4 or earlier,
1614                but there's no *valid* Python 3.4 code that would be
1615                rejected, and async functions will be rejected in a
1616                later phase.) */
1617             if (!tok->async_hacks || tok->async_def) {
1618                 /* Always recognize the keywords. */
1619                 if (memcmp(tok->start, "async", 5) == 0) {
1620                     return ASYNC;
1621                 }
1622                 if (memcmp(tok->start, "await", 5) == 0) {
1623                     return AWAIT;
1624                 }
1625             }
1626             else if (memcmp(tok->start, "async", 5) == 0) {
1627                 /* The current token is 'async'.
1628                    Look ahead one token to see if that is 'def'. */
1629 
1630                 struct tok_state ahead_tok;
1631                 const char *ahead_tok_start = NULL;
1632                 const char *ahead_tok_end = NULL;
1633                 int ahead_tok_kind;
1634 
1635                 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1636                 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1637                                          &ahead_tok_end);
1638 
1639                 if (ahead_tok_kind == NAME
1640                     && ahead_tok.cur - ahead_tok.start == 3
1641                     && memcmp(ahead_tok.start, "def", 3) == 0)
1642                 {
1643                     /* The next token is going to be 'def', so instead of
1644                        returning a plain NAME token, return ASYNC. */
1645                     tok->async_def_indent = tok->indent;
1646                     tok->async_def = 1;
1647                     return ASYNC;
1648                 }
1649             }
1650         }
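        /* Example (illustrative): with async_hacks enabled (3.5/3.6 feature
           level), "async def f(): ..." makes the lookahead above see the
           NAME "def" and return ASYNC, whereas "async = 1" leaves 'async'
           as a plain NAME. */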
1651 
1652         return NAME;
1653     }
1654 
1655     /* Newline */
1656     if (c == '\n') {
1657         tok->atbol = 1;
1658         if (blankline || tok->level > 0) {
1659             goto nextline;
1660         }
1661         *p_start = tok->start;
1662         *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1663         tok->cont_line = 0;
1664         if (tok->async_def) {
1665             /* We're somewhere inside an 'async def' function, and
1666                we've encountered a NEWLINE after its signature. */
1667             tok->async_def_nl = 1;
1668         }
1669         return NEWLINE;
1670     }
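    /* Example (illustrative): inside open brackets (tok->level > 0) the
       newline is swallowed via `nextline`, so "t = (1,\n     2)" emits no
       NEWLINE between the elements; only the end of the whole logical line
       produces a NEWLINE token. */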
1671 
1672     /* Period or number starting with period? */
1673     if (c == '.') {
1674         c = tok_nextc(tok);
1675         if (isdigit(c)) {
1676             goto fraction;
1677         } else if (c == '.') {
1678             c = tok_nextc(tok);
1679             if (c == '.') {
1680                 *p_start = tok->start;
1681                 *p_end = tok->cur;
1682                 return ELLIPSIS;
1683             }
1684             else {
1685                 tok_backup(tok, c);
1686             }
1687             tok_backup(tok, '.');
1688         }
1689         else {
1690             tok_backup(tok, c);
1691         }
1692         *p_start = tok->start;
1693         *p_end = tok->cur;
1694         return DOT;
1695     }
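    /* Examples (illustrative): "..." is returned as a single ELLIPSIS token,
       ".." backs up and yields two consecutive DOT tokens, and ".5" jumps to
       `fraction` and comes back as the NUMBER 0.5. */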
1696 
1697     /* Number */
1698     if (isdigit(c)) {
1699         if (c == '0') {
1700             /* Hex, octal or binary -- maybe. */
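            /* Examples (illustrative): "0x_FF", "0o7_7" and "0b1010_1010"
               are accepted, since a single '_' may separate digit groups and
               may directly follow the base prefix, while a trailing
               underscore as in "0b01_" is rejected as an invalid literal. */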
1701             c = tok_nextc(tok);
1702             if (c == 'x' || c == 'X') {
1703                 /* Hex */
1704                 c = tok_nextc(tok);
1705                 do {
1706                     if (c == '_') {
1707                         c = tok_nextc(tok);
1708                     }
1709                     if (!isxdigit(c)) {
1710                         tok_backup(tok, c);
1711                         return syntaxerror(tok, "invalid hexadecimal literal");
1712                     }
1713                     do {
1714                         c = tok_nextc(tok);
1715                     } while (isxdigit(c));
1716                 } while (c == '_');
1717                 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1718                     return ERRORTOKEN;
1719                 }
1720             }
1721             else if (c == 'o' || c == 'O') {
1722                 /* Octal */
1723                 c = tok_nextc(tok);
1724                 do {
1725                     if (c == '_') {
1726                         c = tok_nextc(tok);
1727                     }
1728                     if (c < '0' || c >= '8') {
1729                         if (isdigit(c)) {
1730                             return syntaxerror(tok,
1731                                     "invalid digit '%c' in octal literal", c);
1732                         }
1733                         else {
1734                             tok_backup(tok, c);
1735                             return syntaxerror(tok, "invalid octal literal");
1736                         }
1737                     }
1738                     do {
1739                         c = tok_nextc(tok);
1740                     } while ('0' <= c && c < '8');
1741                 } while (c == '_');
1742                 if (isdigit(c)) {
1743                     return syntaxerror(tok,
1744                             "invalid digit '%c' in octal literal", c);
1745                 }
1746                 if (!verify_end_of_number(tok, c, "octal")) {
1747                     return ERRORTOKEN;
1748                 }
1749             }
1750             else if (c == 'b' || c == 'B') {
1751                 /* Binary */
1752                 c = tok_nextc(tok);
1753                 do {
1754                     if (c == '_') {
1755                         c = tok_nextc(tok);
1756                     }
1757                     if (c != '0' && c != '1') {
1758                         if (isdigit(c)) {
1759                             return syntaxerror(tok,
1760                                     "invalid digit '%c' in binary literal", c);
1761                         }
1762                         else {
1763                             tok_backup(tok, c);
1764                             return syntaxerror(tok, "invalid binary literal");
1765                         }
1766                     }
1767                     do {
1768                         c = tok_nextc(tok);
1769                     } while (c == '0' || c == '1');
1770                 } while (c == '_');
1771                 if (isdigit(c)) {
1772                     return syntaxerror(tok,
1773                             "invalid digit '%c' in binary literal", c);
1774                 }
1775                 if (!verify_end_of_number(tok, c, "binary")) {
1776                     return ERRORTOKEN;
1777                 }
1778             }
1779             else {
1780                 int nonzero = 0;
1781                 /* maybe old-style octal; c is first char of it */
1782                 /* in any case, allow '0' as a literal */
1783                 while (1) {
1784                     if (c == '_') {
1785                         c = tok_nextc(tok);
1786                         if (!isdigit(c)) {
1787                             tok_backup(tok, c);
1788                             return syntaxerror(tok, "invalid decimal literal");
1789                         }
1790                     }
1791                     if (c != '0') {
1792                         break;
1793                     }
1794                     c = tok_nextc(tok);
1795                 }
1796                 char* zeros_end = tok->cur;
1797                 if (isdigit(c)) {
1798                     nonzero = 1;
1799                     c = tok_decimal_tail(tok);
1800                     if (c == 0) {
1801                         return ERRORTOKEN;
1802                     }
1803                 }
1804                 if (c == '.') {
1805                     c = tok_nextc(tok);
1806                     goto fraction;
1807                 }
1808                 else if (c == 'e' || c == 'E') {
1809                     goto exponent;
1810                 }
1811                 else if (c == 'j' || c == 'J') {
1812                     goto imaginary;
1813                 }
1814                 else if (nonzero) {
1815                     /* Old-style octal: now disallowed. */
1816                     tok_backup(tok, c);
1817                     return syntaxerror_known_range(
1818                             tok, (int)(tok->start + 1 - tok->line_start),
1819                             (int)(zeros_end - tok->line_start),
1820                             "leading zeros in decimal integer "
1821                             "literals are not permitted; "
1822                             "use an 0o prefix for octal integers");
1823                 }
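                /* Examples (illustrative): "0", "00" and "0_0" are all valid
                   spellings of zero, whereas "0777" spells a nonzero value
                   with leading zeros and the error above points the user at
                   0o777 instead. */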
1824                 if (!verify_end_of_number(tok, c, "decimal")) {
1825                     return ERRORTOKEN;
1826                 }
1827             }
1828         }
1829         else {
1830             /* Decimal */
1831             c = tok_decimal_tail(tok);
1832             if (c == 0) {
1833                 return ERRORTOKEN;
1834             }
1835             {
1836                 /* Accept floating point numbers. */
1837                 if (c == '.') {
1838                     c = tok_nextc(tok);
1839         fraction:
1840                     /* Fraction */
1841                     if (isdigit(c)) {
1842                         c = tok_decimal_tail(tok);
1843                         if (c == 0) {
1844                             return ERRORTOKEN;
1845                         }
1846                     }
1847                 }
1848                 if (c == 'e' || c == 'E') {
1849                     int e;
1850                   exponent:
1851                     e = c;
1852                     /* Exponent part */
1853                     c = tok_nextc(tok);
1854                     if (c == '+' || c == '-') {
1855                         c = tok_nextc(tok);
1856                         if (!isdigit(c)) {
1857                             tok_backup(tok, c);
1858                             return syntaxerror(tok, "invalid decimal literal");
1859                         }
1860                     } else if (!isdigit(c)) {
1861                         tok_backup(tok, c);
1862                         if (!verify_end_of_number(tok, e, "decimal")) {
1863                             return ERRORTOKEN;
1864                         }
1865                         tok_backup(tok, e);
1866                         *p_start = tok->start;
1867                         *p_end = tok->cur;
1868                         return NUMBER;
1869                     }
1870                     c = tok_decimal_tail(tok);
1871                     if (c == 0) {
1872                         return ERRORTOKEN;
1873                     }
1874                 }
1875                 if (c == 'j' || c == 'J') {
1876                     /* Imaginary part */
1877         imaginary:
1878                     c = tok_nextc(tok);
1879                     if (!verify_end_of_number(tok, c, "imaginary")) {
1880                         return ERRORTOKEN;
1881                     }
1882                 }
1883                 else if (!verify_end_of_number(tok, c, "decimal")) {
1884                     return ERRORTOKEN;
1885                 }
1886             }
1887         }
1888         tok_backup(tok, c);
1889         *p_start = tok->start;
1890         *p_end = tok->cur;
1891         return NUMBER;
1892     }
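    /* Examples of NUMBER tokens produced above (illustrative): "42",
       "1_000", "3.14", "1e-9", "10E+3", "2j" and "3.5J"; malformed forms
       such as "1_" or "1e+" are reported as invalid decimal literals
       instead. */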
1893 
1894   letter_quote:
1895     /* String */
1896     if (c == '\'' || c == '"') {
1897         int quote = c;
1898         int quote_size = 1;             /* 1 or 3 */
1899         int end_quote_size = 0;
1900 
1901         /* Nodes of type STRING, especially multi-line strings,
1902            must be handled differently in order to get both
1903            the starting line number and the column offset right.
1904            (cf. issue 16806) */
1905         tok->first_lineno = tok->lineno;
1906         tok->multi_line_start = tok->line_start;
1907 
1908         /* Find the quote size and start of string */
1909         c = tok_nextc(tok);
1910         if (c == quote) {
1911             c = tok_nextc(tok);
1912             if (c == quote) {
1913                 quote_size = 3;
1914             }
1915             else {
1916                 end_quote_size = 1;     /* empty string found */
1917             }
1918         }
1919         if (c != quote) {
1920             tok_backup(tok, c);
1921         }
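        /* At this point quote_size is 1 for 'x' / "x" style literals and 3
           for triple-quoted ones; for an empty string such as "" the second
           quote has already set end_quote_size to 1, so the loop below exits
           immediately. */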
1922 
1923         /* Get rest of string */
1924         while (end_quote_size != quote_size) {
1925             c = tok_nextc(tok);
1926             if (c == EOF || (quote_size == 1 && c == '\n')) {
1927                 assert(tok->multi_line_start != NULL);
1928                 // shift the tok_state's location to
1929                 // the start of the string, and report the error
1930                 // from the initial quote character
1931                 tok->cur = (char *)tok->start;
1932                 tok->cur++;
1933                 tok->line_start = tok->multi_line_start;
1934                 int start = tok->lineno;
1935                 tok->lineno = tok->first_lineno;
1936 
1937                 if (quote_size == 3) {
1938                     return syntaxerror(tok,
1939                                        "unterminated triple-quoted string literal"
1940                                        " (detected at line %d)", start);
1941                 }
1942                 else {
1943                     return syntaxerror(tok,
1944                                        "unterminated string literal (detected at"
1945                                        " line %d)", start);
1946                 }
1947             }
1948             if (c == quote) {
1949                 end_quote_size += 1;
1950             }
1951             else {
1952                 end_quote_size = 0;
1953                 if (c == '\\') {
1954                     tok_nextc(tok);  /* skip escaped char */
1955                 }
1956             }
1957         }
1958 
1959         *p_start = tok->start;
1960         *p_end = tok->cur;
1961         return STRING;
1962     }
1963 
1964     /* Line continuation */
1965     if (c == '\\') {
1966         c = tok_nextc(tok);
1967         if (c != '\n') {
1968             tok->done = E_LINECONT;
1969             return ERRORTOKEN;
1970         }
1971         c = tok_nextc(tok);
1972         if (c == EOF) {
1973             tok->done = E_EOF;
1974             tok->cur = tok->inp;
1975             return ERRORTOKEN;
1976         } else {
1977             tok_backup(tok, c);
1978         }
1979         tok->cont_line = 1;
1980         goto again; /* Read next line */
1981     }
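    /* Example (illustrative): a backslash at the very end of a line joins it
       with the next one, so "x = 1 + \" followed by "2" reads as the single
       logical line "x = 1 + 2"; a backslash followed by anything other than
       a newline sets E_LINECONT and is reported as an error. */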
1982 
1983     /* Check for two-character token */
1984     {
1985         int c2 = tok_nextc(tok);
1986         int token = PyToken_TwoChars(c, c2);
1987         if (token != OP) {
1988             int c3 = tok_nextc(tok);
1989             int token3 = PyToken_ThreeChars(c, c2, c3);
1990             if (token3 != OP) {
1991                 token = token3;
1992             }
1993             else {
1994                 tok_backup(tok, c3);
1995             }
1996             *p_start = tok->start;
1997             *p_end = tok->cur;
1998             return token;
1999         }
2000         tok_backup(tok, c2);
2001     }
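    /* Examples (illustrative): "<=", "->" and ":=" are resolved by
       PyToken_TwoChars(), and "**=", "//=" and ">>=" additionally consume a
       third character for PyToken_ThreeChars(); when PyToken_TwoChars()
       reports a plain OP, c2 is pushed back and the character is handled as
       a one-character token below. */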
2002 
2003     /* Keep track of parentheses nesting level */
2004     switch (c) {
2005     case '(':
2006     case '[':
2007     case '{':
2008         if (tok->level >= MAXLEVEL) {
2009             return syntaxerror(tok, "too many nested parentheses");
2010         }
2011         tok->parenstack[tok->level] = c;
2012         tok->parenlinenostack[tok->level] = tok->lineno;
2013         tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2014         tok->level++;
2015         break;
2016     case ')':
2017     case ']':
2018     case '}':
2019         if (!tok->level) {
2020             return syntaxerror(tok, "unmatched '%c'", c);
2021         }
2022         tok->level--;
2023         int opening = tok->parenstack[tok->level];
2024         if (!((opening == '(' && c == ')') ||
2025               (opening == '[' && c == ']') ||
2026               (opening == '{' && c == '}')))
2027         {
2028             if (tok->parenlinenostack[tok->level] != tok->lineno) {
2029                 return syntaxerror(tok,
2030                         "closing parenthesis '%c' does not match "
2031                         "opening parenthesis '%c' on line %d",
2032                         c, opening, tok->parenlinenostack[tok->level]);
2033             }
2034             else {
2035                 return syntaxerror(tok,
2036                         "closing parenthesis '%c' does not match "
2037                         "opening parenthesis '%c'",
2038                         c, opening);
2039             }
2040         }
2041         break;
2042     }
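    /* Example (illustrative): tokenizing "(1, 2]" records '(' on the paren
       stack and then fails at ']' with "closing parenthesis ']' does not
       match opening parenthesis '('"; the variant with a line number is used
       when the opener sits on an earlier line. */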
2043 
2044     /* Punctuation character */
2045     *p_start = tok->start;
2046     *p_end = tok->cur;
2047     return PyToken_OneChar(c);
2048 }
2049 
2050 int
2051 PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
2052 {
2053     int result = tok_get(tok, p_start, p_end);
2054     if (tok->decoding_erred) {
2055         result = ERRORTOKEN;
2056         tok->done = E_DECODE;
2057     }
2058     return result;
2059 }
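
/* A minimal caller sketch (illustrative only, kept out of the build with
   "#if 0"): it assumes the PyTokenizer_FromString() and PyTokenizer_Free()
   declarations from tokenizer.h and the token constants already used in
   this file. */
#if 0
static void
example_tokenize_string(void)
{
    const char *p_start = NULL, *p_end = NULL;
    int type;
    struct tok_state *tok = PyTokenizer_FromString("x = 1\n", 1);

    if (tok == NULL) {
        return;
    }
    while ((type = PyTokenizer_Get(tok, &p_start, &p_end)) != ENDMARKER) {
        if (type == ERRORTOKEN) {
            break;  /* tok->done holds the error code */
        }
        /* [p_start, p_end) delimits the token text in tok's buffer. */
    }
    PyTokenizer_Free(tok);
}
#endif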
2060 
2061 /* Get the encoding of a Python file. Check for the coding cookie and check if
2062    the file starts with a BOM.
2063 
2064    PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2065    encoding in the first or second line of the file (in which case the encoding
2066    should be assumed to be UTF-8).
2067 
2068    The char* returned is allocated via PyMem_Malloc() and thus must be freed
2069    by the caller with PyMem_Free(). */
2070 
2071 char *
2072 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2073 {
2074     struct tok_state *tok;
2075     FILE *fp;
2076     const char *p_start = NULL;
2077     const char *p_end = NULL;
2078     char *encoding = NULL;
2079 
2080     fd = _Py_dup(fd);
2081     if (fd < 0) {
2082         return NULL;
2083     }
2084 
2085     fp = fdopen(fd, "r");
2086     if (fp == NULL) {
2087         return NULL;
2088     }
2089     tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2090     if (tok == NULL) {
2091         fclose(fp);
2092         return NULL;
2093     }
2094     if (filename != NULL) {
2095         Py_INCREF(filename);
2096         tok->filename = filename;
2097     }
2098     else {
2099         tok->filename = PyUnicode_FromString("<string>");
2100         if (tok->filename == NULL) {
2101             fclose(fp);
2102             PyTokenizer_Free(tok);
2103             return encoding;
2104         }
2105     }
2106     while (tok->lineno < 2 && tok->done == E_OK) {
2107         PyTokenizer_Get(tok, &p_start, &p_end);
2108     }
2109     fclose(fp);
2110     if (tok->encoding) {
2111         encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2112         if (encoding) {
2113             strcpy(encoding, tok->encoding);
2114         }
2115     }
2116     PyTokenizer_Free(tok);
2117     return encoding;
2118 }
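
/* A minimal caller sketch (illustrative only, not compiled): the path and
   the open()/close() calls are hypothetical and would need <fcntl.h> and
   <unistd.h>; the function dups the descriptor internally, so the caller
   still closes its own fd and frees the result with PyMem_Free(). */
#if 0
static char *
example_find_encoding(const char *path)
{
    char *enc;
    int fd = open(path, O_RDONLY);
    if (fd < 0) {
        return NULL;
    }
    enc = PyTokenizer_FindEncodingFilename(fd, NULL);  /* NULL -> "<string>" */
    close(fd);
    /* enc is the cookie/BOM encoding (e.g. "iso-8859-1"), or NULL when the
       source should be assumed to be UTF-8. */
    return enc;
}
#endif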
2119 
2120 char *
2121 PyTokenizer_FindEncoding(int fd)
2122 {
2123     return PyTokenizer_FindEncodingFilename(fd, NULL);
2124 }
2125 
2126 #ifdef Py_DEBUG
2127 
2128 void
2129 tok_dump(int type, char *start, char *end)
2130 {
2131     fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2132     if (type == NAME || type == NUMBER || type == STRING || type == OP)
2133         fprintf(stderr, "(%.*s)", (int)(end - start), start);
2134 }
2135 
2136 #endif
2137