1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "errcode.h"
12
13 #include "unicodeobject.h"
14 #include "bytesobject.h"
15 #include "fileobject.h"
16 #include "abstract.h"
17
18 /* Alternate tab spacing */
19 #define ALTTABSIZE 1
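/* ALTTABSIZE is used to compute an alternative indentation column (altcol).
   If two lines compare differently under TABSIZE and ALTTABSIZE, their mix
   of tabs and spaces is ambiguous and indenterror() reports E_TABSPACE. */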
20
21 #define is_potential_identifier_start(c) (\
22 (c >= 'a' && c <= 'z')\
23 || (c >= 'A' && c <= 'Z')\
24 || c == '_'\
25 || (c >= 128))
26
27 #define is_potential_identifier_char(c) (\
28 (c >= 'a' && c <= 'z')\
29 || (c >= 'A' && c <= 'Z')\
30 || (c >= '0' && c <= '9')\
31 || c == '_'\
32 || (c >= 128))
33
34
35 /* Don't ever change this -- it would break the portability of Python code */
36 #define TABSIZE 8
37
38 /* Forward */
39 static struct tok_state *tok_new(void);
40 static int tok_nextc(struct tok_state *tok);
41 static void tok_backup(struct tok_state *tok, int c);
42
43
44 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
45 tokenizing. */
46 static const char* type_comment_prefix = "# type: ";
47
48 /* Create and initialize a new tok_state structure */
49
50 static struct tok_state *
51 tok_new(void)
52 {
53 struct tok_state *tok = (struct tok_state *)PyMem_Malloc(
54 sizeof(struct tok_state));
55 if (tok == NULL)
56 return NULL;
57 tok->buf = tok->cur = tok->inp = NULL;
58 tok->fp_interactive = 0;
59 tok->interactive_src_start = NULL;
60 tok->interactive_src_end = NULL;
61 tok->start = NULL;
62 tok->end = NULL;
63 tok->done = E_OK;
64 tok->fp = NULL;
65 tok->input = NULL;
66 tok->tabsize = TABSIZE;
67 tok->indent = 0;
68 tok->indstack[0] = 0;
69 tok->atbol = 1;
70 tok->pendin = 0;
71 tok->prompt = tok->nextprompt = NULL;
72 tok->lineno = 0;
73 tok->level = 0;
74 tok->altindstack[0] = 0;
75 tok->decoding_state = STATE_INIT;
76 tok->decoding_erred = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
80 tok->filename = NULL;
81 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
83 tok->type_comments = 0;
84 tok->async_hacks = 0;
85 tok->async_def = 0;
86 tok->async_def_indent = 0;
87 tok->async_def_nl = 0;
88 tok->interactive_underflow = IUNDERFLOW_NORMAL;
89 tok->str = NULL;
90 return tok;
91 }
92
93 static char *
94 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
95 {
96 char* result = (char *)PyMem_Malloc(len + 1);
97 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
100 }
101 memcpy(result, s, len);
102 result[len] = '\0';
103 return result;
104 }
105
106 static char *
107 error_ret(struct tok_state *tok) /* XXX */
108 {
109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
111 PyMem_Free(tok->buf);
112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
115 tok->done = E_DECODE;
116 return NULL; /* as if it were EOF */
117 }
118
119
120 static const char *
121 get_normal_name(const char *s) /* for utf-8 and latin-1 */
122 {
123 char buf[13];
124 int i;
125 for (i = 0; i < 12; i++) {
126 int c = s[i];
127 if (c == '\0')
128 break;
129 else if (c == '_')
130 buf[i] = '-';
131 else
132 buf[i] = tolower(c);
133 }
134 buf[i] = '\0';
135 if (strcmp(buf, "utf-8") == 0 ||
136 strncmp(buf, "utf-8-", 6) == 0)
137 return "utf-8";
138 else if (strcmp(buf, "latin-1") == 0 ||
139 strcmp(buf, "iso-8859-1") == 0 ||
140 strcmp(buf, "iso-latin-1") == 0 ||
141 strncmp(buf, "latin-1-", 8) == 0 ||
142 strncmp(buf, "iso-8859-1-", 11) == 0 ||
143 strncmp(buf, "iso-latin-1-", 12) == 0)
144 return "iso-8859-1";
145 else
146 return s;
147 }
148
149 /* Set *SPEC to a copy of the coding spec in S, or to NULL if none is found.  Return 1 on success, 0 on memory error. */
150
151 static int
152 get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
153 {
154 Py_ssize_t i;
155 *spec = NULL;
156 /* Coding spec must be in a comment, and that comment must be
157 * the only statement on the source code line. */
158 for (i = 0; i < size - 6; i++) {
159 if (s[i] == '#')
160 break;
161 if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
162 return 1;
163 }
164 for (; i < size - 6; i++) { /* XXX inefficient search */
165 const char* t = s + i;
166 if (memcmp(t, "coding", 6) == 0) {
167 const char* begin = NULL;
168 t += 6;
169 if (t[0] != ':' && t[0] != '=')
170 continue;
171 do {
172 t++;
173 } while (t[0] == ' ' || t[0] == '\t');
174
175 begin = t;
176 while (Py_ISALNUM(t[0]) ||
177 t[0] == '-' || t[0] == '_' || t[0] == '.')
178 t++;
179
180 if (begin < t) {
181 char* r = new_string(begin, t - begin, tok);
182 const char* q;
183 if (!r)
184 return 0;
185 q = get_normal_name(r);
186 if (r != q) {
187 PyMem_Free(r);
188 r = new_string(q, strlen(q), tok);
189 if (!r)
190 return 0;
191 }
192 *spec = r;
193 break;
194 }
195 }
196 }
197 return 1;
198 }
199
200 /* Check whether the line contains a coding spec. If it does,
201 invoke the set_readline function for the new encoding.
202 This function receives the tok_state and the new encoding.
203 Return 1 on success, 0 on failure. */
204
205 static int
206 check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
207 int set_readline(struct tok_state *, const char *))
208 {
209 char *cs;
210 if (tok->cont_line) {
211 /* It's a continuation line, so it can't be a coding spec. */
212 tok->decoding_state = STATE_NORMAL;
213 return 1;
214 }
215 if (!get_coding_spec(line, &cs, size, tok)) {
216 return 0;
217 }
218 if (!cs) {
219 Py_ssize_t i;
220 for (i = 0; i < size; i++) {
221 if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
222 break;
223 if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
224 /* Stop checking coding spec after a line containing
225 * anything except a comment. */
226 tok->decoding_state = STATE_NORMAL;
227 break;
228 }
229 }
230 return 1;
231 }
232 tok->decoding_state = STATE_NORMAL;
233 if (tok->encoding == NULL) {
234 assert(tok->decoding_readline == NULL);
235 if (strcmp(cs, "utf-8") != 0 && !set_readline(tok, cs)) {
236 error_ret(tok);
237 PyErr_Format(PyExc_SyntaxError, "encoding problem: %s", cs);
238 PyMem_Free(cs);
239 return 0;
240 }
241 tok->encoding = cs;
242 } else { /* then, compare cs with BOM */
243 if (strcmp(tok->encoding, cs) != 0) {
244 error_ret(tok);
245 PyErr_Format(PyExc_SyntaxError,
246 "encoding problem: %s with BOM", cs);
247 PyMem_Free(cs);
248 return 0;
249 }
250 PyMem_Free(cs);
251 }
252 return 1;
253 }
254
255 /* See whether the file starts with a BOM. If it does,
256 invoke the set_readline function with the new encoding.
257 Return 1 on success, 0 on failure. */
258
259 static int
260 check_bom(int get_char(struct tok_state *),
261 void unget_char(int, struct tok_state *),
262 int set_readline(struct tok_state *, const char *),
263 struct tok_state *tok)
264 {
265 int ch1, ch2, ch3;
266 ch1 = get_char(tok);
267 tok->decoding_state = STATE_SEEK_CODING;
268 if (ch1 == EOF) {
269 return 1;
270 } else if (ch1 == 0xEF) {
271 ch2 = get_char(tok);
272 if (ch2 != 0xBB) {
273 unget_char(ch2, tok);
274 unget_char(ch1, tok);
275 return 1;
276 }
277 ch3 = get_char(tok);
278 if (ch3 != 0xBF) {
279 unget_char(ch3, tok);
280 unget_char(ch2, tok);
281 unget_char(ch1, tok);
282 return 1;
283 }
284 #if 0
285 /* Disable support for UTF-16 BOMs until a decision
286 is made whether this needs to be supported. */
287 } else if (ch1 == 0xFE) {
288 ch2 = get_char(tok);
289 if (ch2 != 0xFF) {
290 unget_char(ch2, tok);
291 unget_char(ch1, tok);
292 return 1;
293 }
294 if (!set_readline(tok, "utf-16-be"))
295 return 0;
296 tok->decoding_state = STATE_NORMAL;
297 } else if (ch1 == 0xFF) {
298 ch2 = get_char(tok);
299 if (ch2 != 0xFE) {
300 unget_char(ch2, tok);
301 unget_char(ch1, tok);
302 return 1;
303 }
304 if (!set_readline(tok, "utf-16-le"))
305 return 0;
306 tok->decoding_state = STATE_NORMAL;
307 #endif
308 } else {
309 unget_char(ch1, tok);
310 return 1;
311 }
312 if (tok->encoding != NULL)
313 PyMem_Free(tok->encoding);
314 tok->encoding = new_string("utf-8", 5, tok);
315 if (!tok->encoding)
316 return 0;
317 /* No need to set_readline: input is already utf-8 */
318 return 1;
319 }
320
321 static int
322 tok_concatenate_interactive_new_line(struct tok_state *tok, const char *line) {
323 assert(tok->fp_interactive);
324
325 if (!line) {
326 return 0;
327 }
328
329 Py_ssize_t current_size = tok->interactive_src_end - tok->interactive_src_start;
330 Py_ssize_t line_size = strlen(line);
331 char* new_str = tok->interactive_src_start;
332
333 new_str = PyMem_Realloc(new_str, current_size + line_size + 1);
334 if (!new_str) {
335 if (tok->interactive_src_start) {
336 PyMem_Free(tok->interactive_src_start);
337 }
338 tok->interactive_src_start = NULL;
339 tok->interactive_src_end = NULL;
340 tok->done = E_NOMEM;
341 return -1;
342 }
343 strcpy(new_str + current_size, line);
344
345 tok->interactive_src_start = new_str;
346 tok->interactive_src_end = new_str + current_size + line_size;
347 return 0;
348 }
349
350
351 /* Read a line of text from TOK, using tok->decoding_readline, and append
352    it to the token buffer.  Return 0 on failure, 1 on success.
353
354 On entry, tok->decoding_buffer will be one of:
355 1) NULL: need to call tok->decoding_readline to get a new line
356 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
357 stored the result in tok->decoding_buffer
358 3) PyByteArrayObject *: previous call to tok_readline_recode did not have enough room
359 (in the s buffer) to copy entire contents of the line read
360 by tok->decoding_readline. tok->decoding_buffer has the overflow.
361 In this case, tok_readline_recode is called in a loop (with an expanded buffer)
362 until the buffer ends with a '\n' (or until the end of the file is
363 reached): see tok_nextc and its calls to tok_reserve_buf.
364 */
365
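/* Note: tok_reserve_buf ensures that at least SIZE free bytes are available
   after tok->inp, growing tok->buf geometrically (by at least half of its
   current size) and re-basing cur/inp/end/start/line_start/multi_line_start
   into the reallocated buffer. */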
366 static int
367 tok_reserve_buf(struct tok_state *tok, Py_ssize_t size)
368 {
369 Py_ssize_t cur = tok->cur - tok->buf;
370 Py_ssize_t oldsize = tok->inp - tok->buf;
371 Py_ssize_t newsize = oldsize + Py_MAX(size, oldsize >> 1);
372 if (newsize > tok->end - tok->buf) {
373 char *newbuf = tok->buf;
374 Py_ssize_t start = tok->start == NULL ? -1 : tok->start - tok->buf;
375 Py_ssize_t line_start = tok->start == NULL ? -1 : tok->line_start - tok->buf;
376 Py_ssize_t multi_line_start = tok->multi_line_start - tok->buf;
377 newbuf = (char *)PyMem_Realloc(newbuf, newsize);
378 if (newbuf == NULL) {
379 tok->done = E_NOMEM;
380 return 0;
381 }
382 tok->buf = newbuf;
383 tok->cur = tok->buf + cur;
384 tok->inp = tok->buf + oldsize;
385 tok->end = tok->buf + newsize;
386 tok->start = start < 0 ? NULL : tok->buf + start;
387 tok->line_start = line_start < 0 ? NULL : tok->buf + line_start;
388 tok->multi_line_start = multi_line_start < 0 ? NULL : tok->buf + multi_line_start;
389 }
390 return 1;
391 }
392
393 static int
394 tok_readline_recode(struct tok_state *tok) {
395 PyObject *line;
396 const char *buf;
397 Py_ssize_t buflen;
398 line = tok->decoding_buffer;
399 if (line == NULL) {
400 line = PyObject_CallNoArgs(tok->decoding_readline);
401 if (line == NULL) {
402 error_ret(tok);
403 goto error;
404 }
405 }
406 else {
407 tok->decoding_buffer = NULL;
408 }
409 buf = PyUnicode_AsUTF8AndSize(line, &buflen);
410 if (buf == NULL) {
411 error_ret(tok);
412 goto error;
413 }
414 if (!tok_reserve_buf(tok, buflen + 1)) {
415 goto error;
416 }
417 memcpy(tok->inp, buf, buflen);
418 tok->inp += buflen;
419 *tok->inp = '\0';
420 if (tok->fp_interactive &&
421 tok_concatenate_interactive_new_line(tok, buf) == -1) {
422 goto error;
423 }
424 Py_DECREF(line);
425 return 1;
426 error:
427 Py_XDECREF(line);
428 return 0;
429 }
430
431 /* Set the readline function for TOK to a StreamReader's
432 readline function. The StreamReader is named ENC.
433
434 This function is called from check_bom and check_coding_spec.
435
436 ENC is usually identical to the future value of tok->encoding,
437 except for the (currently unsupported) case of UTF-16.
438
439 Return 1 on success, 0 on failure. */
440
441 static int
442 fp_setreadl(struct tok_state *tok, const char* enc)
443 {
444 PyObject *readline, *io, *stream;
445 _Py_IDENTIFIER(open);
446 _Py_IDENTIFIER(readline);
447 int fd;
448 long pos;
449
450 fd = fileno(tok->fp);
451 /* Due to buffering the file offset for fd can be different from the file
452 * position of tok->fp. If tok->fp was opened in text mode on Windows,
453 * its file position counts CRLF as one char and can't be directly mapped
454 * to the file offset for fd. Instead we step back one byte and read to
455 * the end of line.*/
456 pos = ftell(tok->fp);
457 if (pos == -1 ||
458 lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
459 PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
460 return 0;
461 }
462
463 io = PyImport_ImportModuleNoBlock("io");
464 if (io == NULL)
465 return 0;
466
467 stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
468 fd, "r", -1, enc, Py_None, Py_None, Py_False);
469 Py_DECREF(io);
470 if (stream == NULL)
471 return 0;
472
473 readline = _PyObject_GetAttrId(stream, &PyId_readline);
474 Py_DECREF(stream);
475 if (readline == NULL)
476 return 0;
477 Py_XSETREF(tok->decoding_readline, readline);
478
479 if (pos > 0) {
480 PyObject *bufobj = _PyObject_CallNoArg(readline);
481 if (bufobj == NULL)
482 return 0;
483 Py_DECREF(bufobj);
484 }
485
486 return 1;
487 }
488
489 /* Fetch the next byte from TOK. */
490
491 static int fp_getc(struct tok_state *tok) {
492 return getc(tok->fp);
493 }
494
495 /* Unfetch the last byte back into TOK. */
496
497 static void fp_ungetc(int c, struct tok_state *tok) {
498 ungetc(c, tok->fp);
499 }
500
501 /* Check whether the characters at s start a valid
502 UTF-8 sequence. Return the number of characters forming
503 the sequence if yes, 0 if not. */
504 static int valid_utf8(const unsigned char* s)
505 {
506 int expected = 0;
507 int length;
508 if (*s < 0x80)
509 /* single-byte code */
510 return 1;
511 if (*s < 0xc0)
512 /* following byte */
513 return 0;
514 if (*s < 0xE0)
515 expected = 1;
516 else if (*s < 0xF0)
517 expected = 2;
518 else if (*s < 0xF8)
519 expected = 3;
520 else
521 return 0;
522 length = expected + 1;
523 for (; expected; expected--)
524 if (s[expected] < 0x80 || s[expected] >= 0xC0)
525 return 0;
526 return length;
527 }
528
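/* Verify that LINE contains only valid UTF-8 byte sequences.  On failure,
   raise a SyntaxError naming the offending byte and return 0; return 1
   otherwise. */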
529 static int
530 ensure_utf8(char *line, struct tok_state *tok)
531 {
532 int badchar = 0;
533 unsigned char *c;
534 int length;
535 for (c = (unsigned char *)line; *c; c += length) {
536 if (!(length = valid_utf8(c))) {
537 badchar = *c;
538 break;
539 }
540 }
541 if (badchar) {
542 /* Need to add 1 to the line number, since this line
543    has not been counted yet. */
544 PyErr_Format(PyExc_SyntaxError,
545 "Non-UTF-8 code starting with '\\x%.2x' "
546 "in file %U on line %i, "
547 "but no encoding declared; "
548 "see https://python.org/dev/peps/pep-0263/ for details",
549 badchar, tok->filename, tok->lineno + 1);
550 return 0;
551 }
552 return 1;
553 }
554
555 /* Fetch a byte from TOK, using the string buffer. */
556
557 static int
558 buf_getc(struct tok_state *tok) {
559 return Py_CHARMASK(*tok->str++);
560 }
561
562 /* Unfetch a byte from TOK, using the string buffer. */
563
564 static void
565 buf_ungetc(int c, struct tok_state *tok) {
566 tok->str--;
567 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
568 }
569
570 /* Set the readline function for TOK to ENC. For the string-based
571 tokenizer, this means to just record the encoding. */
572
573 static int
574 buf_setreadl(struct tok_state *tok, const char* enc) {
575 tok->enc = enc;
576 return 1;
577 }
578
579 /* Return a UTF-8 encoded Python bytes object decoded from the
580    C byte string STR, which is encoded with ENC. */
581
582 static PyObject *
583 translate_into_utf8(const char* str, const char* enc) {
584 PyObject *utf8;
585 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
586 if (buf == NULL)
587 return NULL;
588 utf8 = PyUnicode_AsUTF8String(buf);
589 Py_DECREF(buf);
590 return utf8;
591 }
592
593
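/* Copy S into a freshly allocated PyMem buffer, translating "\r\n" and "\r"
   line endings into "\n".  If EXEC_INPUT is true, guarantee that the result
   ends with a newline.  Return NULL on memory allocation failure. */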
594 static char *
595 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
596 int skip_next_lf = 0;
597 size_t needed_length = strlen(s) + 2, final_length;
598 char *buf, *current;
599 char c = '\0';
600 buf = PyMem_Malloc(needed_length);
601 if (buf == NULL) {
602 tok->done = E_NOMEM;
603 return NULL;
604 }
605 for (current = buf; *s; s++, current++) {
606 c = *s;
607 if (skip_next_lf) {
608 skip_next_lf = 0;
609 if (c == '\n') {
610 c = *++s;
611 if (!c)
612 break;
613 }
614 }
615 if (c == '\r') {
616 skip_next_lf = 1;
617 c = '\n';
618 }
619 *current = c;
620 }
621 /* If this is exec input, add a newline to the end of the string if
622 there isn't one already. */
623 if (exec_input && c != '\n') {
624 *current = '\n';
625 current++;
626 }
627 *current = '\0';
628 final_length = current - buf + 1;
629 if (final_length < needed_length && final_length) {
630 /* should never fail */
631 char* result = PyMem_Realloc(buf, final_length);
632 if (result == NULL) {
633 PyMem_Free(buf);
634 }
635 buf = result;
636 }
637 return buf;
638 }
639
640 /* Decode a byte string STR for use as the buffer of TOK.
641 Look for encoding declarations inside STR, and record them
642 inside TOK. */
643
644 static char *
645 decode_str(const char *input, int single, struct tok_state *tok)
646 {
647 PyObject* utf8 = NULL;
648 char *str;
649 const char *s;
650 const char *newl[2] = {NULL, NULL};
651 int lineno = 0;
652 tok->input = str = translate_newlines(input, single, tok);
653 if (str == NULL)
654 return NULL;
655 tok->enc = NULL;
656 tok->str = str;
657 if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
658 return error_ret(tok);
659 str = tok->str; /* string after BOM if any */
660 assert(str);
661 if (tok->enc != NULL) {
662 utf8 = translate_into_utf8(str, tok->enc);
663 if (utf8 == NULL)
664 return error_ret(tok);
665 str = PyBytes_AsString(utf8);
666 }
667 for (s = str;; s++) {
668 if (*s == '\0') break;
669 else if (*s == '\n') {
670 assert(lineno < 2);
671 newl[lineno] = s;
672 lineno++;
673 if (lineno == 2) break;
674 }
675 }
676 tok->enc = NULL;
677 /* need to check line 1 and 2 separately since check_coding_spec
678 assumes a single line as input */
679 if (newl[0]) {
680 if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
681 return NULL;
682 }
683 if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
684 if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
685 tok, buf_setreadl))
686 return NULL;
687 }
688 }
689 if (tok->enc != NULL) {
690 assert(utf8 == NULL);
691 utf8 = translate_into_utf8(str, tok->enc);
692 if (utf8 == NULL)
693 return error_ret(tok);
694 str = PyBytes_AS_STRING(utf8);
695 }
696 assert(tok->decoding_buffer == NULL);
697 tok->decoding_buffer = utf8; /* CAUTION */
698 return str;
699 }
700
701 /* Set up tokenizer for string */
702
703 struct tok_state *
704 PyTokenizer_FromString(const char *str, int exec_input)
705 {
706 struct tok_state *tok = tok_new();
707 char *decoded;
708
709 if (tok == NULL)
710 return NULL;
711 decoded = decode_str(str, exec_input, tok);
712 if (decoded == NULL) {
713 PyTokenizer_Free(tok);
714 return NULL;
715 }
716
717 tok->buf = tok->cur = tok->inp = decoded;
718 tok->end = decoded;
719 return tok;
720 }
721
722 /* Set up tokenizer for UTF-8 string */
723
724 struct tok_state *
725 PyTokenizer_FromUTF8(const char *str, int exec_input)
726 {
727 struct tok_state *tok = tok_new();
728 char *translated;
729 if (tok == NULL)
730 return NULL;
731 tok->input = translated = translate_newlines(str, exec_input, tok);
732 if (translated == NULL) {
733 PyTokenizer_Free(tok);
734 return NULL;
735 }
736 tok->decoding_state = STATE_NORMAL;
737 tok->enc = NULL;
738 tok->str = translated;
739 tok->encoding = new_string("utf-8", 5, tok);
740 if (!tok->encoding) {
741 PyTokenizer_Free(tok);
742 return NULL;
743 }
744
745 tok->buf = tok->cur = tok->inp = translated;
746 tok->end = translated;
747 return tok;
748 }
749
750 /* Set up tokenizer for file */
751
752 struct tok_state *
753 PyTokenizer_FromFile(FILE *fp, const char* enc,
754 const char *ps1, const char *ps2)
755 {
756 struct tok_state *tok = tok_new();
757 if (tok == NULL)
758 return NULL;
759 if ((tok->buf = (char *)PyMem_Malloc(BUFSIZ)) == NULL) {
760 PyTokenizer_Free(tok);
761 return NULL;
762 }
763 tok->cur = tok->inp = tok->buf;
764 tok->end = tok->buf + BUFSIZ;
765 tok->fp = fp;
766 tok->prompt = ps1;
767 tok->nextprompt = ps2;
768 if (enc != NULL) {
769 /* Must copy encoding declaration since it
770 gets copied into the parse tree. */
771 tok->encoding = new_string(enc, strlen(enc), tok);
772 if (!tok->encoding) {
773 PyTokenizer_Free(tok);
774 return NULL;
775 }
776 tok->decoding_state = STATE_NORMAL;
777 }
778 return tok;
779 }
780
781 /* Free a tok_state structure */
782
783 void
784 PyTokenizer_Free(struct tok_state *tok)
785 {
786 if (tok->encoding != NULL) {
787 PyMem_Free(tok->encoding);
788 }
789 Py_XDECREF(tok->decoding_readline);
790 Py_XDECREF(tok->decoding_buffer);
791 Py_XDECREF(tok->filename);
792 if (tok->fp != NULL && tok->buf != NULL) {
793 PyMem_Free(tok->buf);
794 }
795 if (tok->input) {
796 PyMem_Free(tok->input);
797 }
798 if (tok->interactive_src_start != NULL) {
799 PyMem_Free(tok->interactive_src_start);
800 }
801 PyMem_Free(tok);
802 }
803
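/* Read one physical line from tok->fp into the token buffer using
   Py_UniversalNewlineFgets, growing the buffer until the line ends in '\n'
   or EOF is reached.  Return 0 on error, 1 otherwise (including EOF). */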
804 static int
805 tok_readline_raw(struct tok_state *tok)
806 {
807 do {
808 if (!tok_reserve_buf(tok, BUFSIZ)) {
809 return 0;
810 }
811 char *line = Py_UniversalNewlineFgets(tok->inp,
812 (int)(tok->end - tok->inp),
813 tok->fp, NULL);
814 if (line == NULL) {
815 return 1;
816 }
817 if (tok->fp_interactive &&
818 tok_concatenate_interactive_new_line(tok, line) == -1) {
819 return 0;
820 }
821 tok->inp = strchr(tok->inp, '\0');
822 if (tok->inp == tok->buf) {
823 return 0;
824 }
825 } while (tok->inp[-1] != '\n');
826 return 1;
827 }
828
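/* Advance the string-based tokenizer to the next line: move tok->inp past
   the next '\n' (or to the end of input) and bump lineno.  Return 0 with
   E_EOF when no input remains, 1 otherwise. */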
829 static int
830 tok_underflow_string(struct tok_state *tok) {
831 char *end = strchr(tok->inp, '\n');
832 if (end != NULL) {
833 end++;
834 }
835 else {
836 end = strchr(tok->inp, '\0');
837 if (end == tok->inp) {
838 tok->done = E_EOF;
839 return 0;
840 }
841 }
842 if (tok->start == NULL) {
843 tok->buf = tok->cur;
844 }
845 tok->line_start = tok->cur;
846 tok->lineno++;
847 tok->inp = end;
848 return 1;
849 }
850
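/* Prompt for and read the next line in interactive mode, recoding it to
   UTF-8 if an encoding was declared, then append it to the token buffer
   (or replace the buffer when no token is in progress). */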
851 static int
852 tok_underflow_interactive(struct tok_state *tok) {
853 if (tok->interactive_underflow == IUNDERFLOW_STOP) {
854 tok->done = E_INTERACT_STOP;
855 return 1;
856 }
857 char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
858 if (newtok != NULL) {
859 char *translated = translate_newlines(newtok, 0, tok);
860 PyMem_Free(newtok);
861 if (translated == NULL) {
862 return 0;
863 }
864 newtok = translated;
865 }
866 if (tok->encoding && newtok && *newtok) {
867 /* Recode to UTF-8 */
868 Py_ssize_t buflen;
869 const char* buf;
870 PyObject *u = translate_into_utf8(newtok, tok->encoding);
871 PyMem_Free(newtok);
872 if (u == NULL) {
873 tok->done = E_DECODE;
874 return 0;
875 }
876 buflen = PyBytes_GET_SIZE(u);
877 buf = PyBytes_AS_STRING(u);
878 newtok = PyMem_Malloc(buflen+1);
879 if (newtok == NULL) {
880 Py_DECREF(u);
881 tok->done = E_NOMEM;
882 return 0;
883 }
884 strcpy(newtok, buf);
885 Py_DECREF(u);
886 }
887 if (tok->fp_interactive &&
888 tok_concatenate_interactive_new_line(tok, newtok) == -1) {
889 PyMem_Free(newtok);
890 return 0;
891 }
892 if (tok->nextprompt != NULL) {
893 tok->prompt = tok->nextprompt;
894 }
895 if (newtok == NULL) {
896 tok->done = E_INTR;
897 }
898 else if (*newtok == '\0') {
899 PyMem_Free(newtok);
900 tok->done = E_EOF;
901 }
902 else if (tok->start != NULL) {
903 Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
904 size_t size = strlen(newtok);
905 tok->lineno++;
906 if (!tok_reserve_buf(tok, size + 1)) {
907 PyMem_Free(tok->buf);
908 tok->buf = NULL;
909 PyMem_Free(newtok);
910 return 0;
911 }
912 memcpy(tok->cur, newtok, size + 1);
913 PyMem_Free(newtok);
914 tok->inp += size;
915 tok->multi_line_start = tok->buf + cur_multi_line_start;
916 }
917 else {
918 tok->lineno++;
919 PyMem_Free(tok->buf);
920 tok->buf = newtok;
921 tok->cur = tok->buf;
922 tok->line_start = tok->buf;
923 tok->inp = strchr(tok->buf, '\0');
924 tok->end = tok->inp + 1;
925 }
926 if (tok->done != E_OK) {
927 if (tok->prompt != NULL) {
928 PySys_WriteStderr("\n");
929 }
930 return 0;
931 }
932 return 1;
933 }
934
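/* Refill the token buffer from tok->fp: handle the BOM and the coding spec
   on the first two lines, read raw bytes or recode through the declared
   codec, and verify that undeclared input is valid UTF-8. */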
935 static int
936 tok_underflow_file(struct tok_state *tok) {
937 if (tok->start == NULL) {
938 tok->cur = tok->inp = tok->buf;
939 }
940 if (tok->decoding_state == STATE_INIT) {
941 /* We have not yet determined the encoding.
942 If an encoding is found, use the file-pointer
943 reader functions from now on. */
944 if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok)) {
945 error_ret(tok);
946 return 0;
947 }
948 assert(tok->decoding_state != STATE_INIT);
949 }
950 /* Read until '\n' or EOF */
951 if (tok->decoding_readline != NULL) {
952 /* We already have a codec associated with this input. */
953 if (!tok_readline_recode(tok)) {
954 return 0;
955 }
956 }
957 else {
958 /* We want a 'raw' read. */
959 if (!tok_readline_raw(tok)) {
960 return 0;
961 }
962 }
963 if (tok->inp == tok->cur) {
964 tok->done = E_EOF;
965 return 0;
966 }
967 if (tok->inp[-1] != '\n') {
968 /* Last line does not end in \n, fake one */
969 *tok->inp++ = '\n';
970 *tok->inp = '\0';
971 }
972
973 tok->lineno++;
974 if (tok->decoding_state != STATE_NORMAL) {
975 if (tok->lineno > 2) {
976 tok->decoding_state = STATE_NORMAL;
977 }
978 else if (!check_coding_spec(tok->cur, strlen(tok->cur),
979 tok, fp_setreadl))
980 {
981 return 0;
982 }
983 }
984 /* The default encoding is UTF-8, so make sure we don't have any
985 non-UTF-8 sequences in it. */
986 if (!tok->encoding && !ensure_utf8(tok->cur, tok)) {
987 error_ret(tok);
988 return 0;
989 }
990 assert(tok->done == E_OK);
991 return tok->done == E_OK;
992 }
993
994 #if defined(Py_DEBUG)
995 static void
996 print_escape(FILE *f, const char *s, Py_ssize_t size)
997 {
998 if (s == NULL) {
999 fputs("NULL", f);
1000 return;
1001 }
1002 putc('"', f);
1003 while (size-- > 0) {
1004 unsigned char c = *s++;
1005 switch (c) {
1006 case '\n': fputs("\\n", f); break;
1007 case '\r': fputs("\\r", f); break;
1008 case '\t': fputs("\\t", f); break;
1009 case '\f': fputs("\\f", f); break;
1010 case '\'': fputs("\\'", f); break;
1011 case '"': fputs("\\\"", f); break;
1012 default:
1013 if (0x20 <= c && c <= 0x7f)
1014 putc(c, f);
1015 else
1016 fprintf(f, "\\x%02x", c);
1017 }
1018 }
1019 putc('"', f);
1020 }
1021 #endif
1022
1023 /* Get next char, updating state; error code goes into tok->done */
1024
1025 static int
1026 tok_nextc(struct tok_state *tok)
1027 {
1028 int rc;
1029 for (;;) {
1030 if (tok->cur != tok->inp) {
1031 return Py_CHARMASK(*tok->cur++); /* Fast path */
1032 }
1033 if (tok->done != E_OK)
1034 return EOF;
1035 if (tok->fp == NULL) {
1036 rc = tok_underflow_string(tok);
1037 }
1038 else if (tok->prompt != NULL) {
1039 rc = tok_underflow_interactive(tok);
1040 }
1041 else {
1042 rc = tok_underflow_file(tok);
1043 }
1044 #if defined(Py_DEBUG)
1045 if (Py_DebugFlag) {
1046 fprintf(stderr, "line[%d] = ", tok->lineno);
1047 print_escape(stderr, tok->cur, tok->inp - tok->cur);
1048 fprintf(stderr, " tok->done = %d\n", tok->done);
1049 }
1050 #endif
1051 if (!rc) {
1052 tok->cur = tok->inp;
1053 return EOF;
1054 }
1055 tok->line_start = tok->cur;
1056 }
1057 Py_UNREACHABLE();
1058 }
1059
1060 /* Back-up one character */
1061
1062 static void
1063 tok_backup(struct tok_state *tok, int c)
1064 {
1065 if (c != EOF) {
1066 if (--tok->cur < tok->buf) {
1067 Py_FatalError("tokenizer beginning of buffer");
1068 }
1069 if ((int)(unsigned char)*tok->cur != c) {
1070 Py_FatalError("tok_backup: wrong character");
1071 }
1072 }
1073 }
1074
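/* Set a SyntaxError whose message is built from FORMAT/VARGS and whose
   location is the current line with the given column range (-1 means the
   current column).  Sets tok->done = E_ERROR and returns ERRORTOKEN. */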
1075 static int
1076 _syntaxerror_range(struct tok_state *tok, const char *format,
1077 int col_offset, int end_col_offset,
1078 va_list vargs)
1079 {
1080 PyObject *errmsg, *errtext, *args;
1081 errmsg = PyUnicode_FromFormatV(format, vargs);
1082 if (!errmsg) {
1083 goto error;
1084 }
1085
1086 errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
1087 "replace");
1088 if (!errtext) {
1089 goto error;
1090 }
1091
1092 if (col_offset == -1) {
1093 col_offset = (int)PyUnicode_GET_LENGTH(errtext);
1094 }
1095 if (end_col_offset == -1) {
1096 end_col_offset = col_offset;
1097 }
1098
1099 Py_ssize_t line_len = strcspn(tok->line_start, "\n");
1100 if (line_len != tok->cur - tok->line_start) {
1101 Py_DECREF(errtext);
1102 errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
1103 "replace");
1104 }
1105 if (!errtext) {
1106 goto error;
1107 }
1108
1109 args = Py_BuildValue("(O(OiiNii))", errmsg, tok->filename, tok->lineno,
1110 col_offset, errtext, tok->lineno, end_col_offset);
1111 if (args) {
1112 PyErr_SetObject(PyExc_SyntaxError, args);
1113 Py_DECREF(args);
1114 }
1115
1116 error:
1117 Py_XDECREF(errmsg);
1118 tok->done = E_ERROR;
1119 return ERRORTOKEN;
1120 }
1121
1122 static int
1123 syntaxerror(struct tok_state *tok, const char *format, ...)
1124 {
1125 va_list vargs;
1126 #ifdef HAVE_STDARG_PROTOTYPES
1127 va_start(vargs, format);
1128 #else
1129 va_start(vargs);
1130 #endif
1131 int ret = _syntaxerror_range(tok, format, -1, -1, vargs);
1132 va_end(vargs);
1133 return ret;
1134 }
1135
1136 static int
1137 syntaxerror_known_range(struct tok_state *tok,
1138 int col_offset, int end_col_offset,
1139 const char *format, ...)
1140 {
1141 va_list vargs;
1142 #ifdef HAVE_STDARG_PROTOTYPES
1143 va_start(vargs, format);
1144 #else
1145 va_start(vargs);
1146 #endif
1147 int ret = _syntaxerror_range(tok, format, col_offset, end_col_offset, vargs);
1148 va_end(vargs);
1149 return ret;
1150 }
1151
1152
1153
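/* Report inconsistent use of tabs and spaces in indentation: set
   E_TABSPACE, skip the rest of the line and return ERRORTOKEN. */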
1154 static int
1155 indenterror(struct tok_state *tok)
1156 {
1157 tok->done = E_TABSPACE;
1158 tok->cur = tok->inp;
1159 return ERRORTOKEN;
1160 }
1161
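/* Emit a DeprecationWarning with the formatted message; if the warning is
   turned into an error, replace it with a SyntaxError for a more accurate
   report.  Return 0 on success, -1 on error. */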
1162 static int
1163 parser_warn(struct tok_state *tok, const char *format, ...)
1164 {
1165 PyObject *errmsg;
1166 va_list vargs;
1167 #ifdef HAVE_STDARG_PROTOTYPES
1168 va_start(vargs, format);
1169 #else
1170 va_start(vargs);
1171 #endif
1172 errmsg = PyUnicode_FromFormatV(format, vargs);
1173 va_end(vargs);
1174 if (!errmsg) {
1175 goto error;
1176 }
1177
1178 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, errmsg, tok->filename,
1179 tok->lineno, NULL, NULL) < 0) {
1180 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
1181 /* Replace the DeprecationWarning exception with a SyntaxError
1182 to get a more accurate error report */
1183 PyErr_Clear();
1184 syntaxerror(tok, "%U", errmsg);
1185 }
1186 goto error;
1187 }
1188 Py_DECREF(errmsg);
1189 return 0;
1190
1191 error:
1192 Py_XDECREF(errmsg);
1193 tok->done = E_ERROR;
1194 return -1;
1195 }
1196
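/* Check whether the characters after the current position spell TEST and
   are not followed by a potential identifier character.  The input position
   is restored before returning. */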
1197 static int
1198 lookahead(struct tok_state *tok, const char *test)
1199 {
1200 const char *s = test;
1201 int res = 0;
1202 while (1) {
1203 int c = tok_nextc(tok);
1204 if (*s == 0) {
1205 res = !is_potential_identifier_char(c);
1206 }
1207 else if (c == *s) {
1208 s++;
1209 continue;
1210 }
1211
1212 tok_backup(tok, c);
1213 while (s != test) {
1214 tok_backup(tok, *--s);
1215 }
1216 return res;
1217 }
1218 }
1219
1220 static int
1221 verify_end_of_number(struct tok_state *tok, int c, const char *kind)
1222 {
1223 /* Emit a deprecation warning only if the numeric literal is immediately
1224  * followed by one of the keywords which can occur after a numeric literal
1225  * in valid code: "and", "else", "for", "if", "in", "is" and "or".
1226  * This allows gradually deprecating existing valid code without emitting a
1227  * warning before the error in most cases of invalid numeric literals (which
1228  * would be confusing and break existing tests).
1229  * Raise a syntax error with a slightly better message than plain
1230 * "invalid syntax" if the numeric literal is immediately followed by
1231 * other keyword or identifier.
1232 */
1233 int r = 0;
1234 if (c == 'a') {
1235 r = lookahead(tok, "nd");
1236 }
1237 else if (c == 'e') {
1238 r = lookahead(tok, "lse");
1239 }
1240 else if (c == 'f') {
1241 r = lookahead(tok, "or");
1242 }
1243 else if (c == 'i') {
1244 int c2 = tok_nextc(tok);
1245 if (c2 == 'f' || c2 == 'n' || c2 == 's') {
1246 r = 1;
1247 }
1248 tok_backup(tok, c2);
1249 }
1250 else if (c == 'o') {
1251 r = lookahead(tok, "r");
1252 }
1253 if (r) {
1254 tok_backup(tok, c);
1255 if (parser_warn(tok, "invalid %s literal", kind)) {
1256 return 0;
1257 }
1258 tok_nextc(tok);
1259 }
1260 else /* In future releases, only error will remain. */
1261 if (is_potential_identifier_char(c)) {
1262 tok_backup(tok, c);
1263 syntaxerror(tok, "invalid %s literal", kind);
1264 return 0;
1265 }
1266 return 1;
1267 }
1268
1269 /* Verify that the identifier follows PEP 3131.
1270 All identifier strings are guaranteed to be "ready" unicode objects.
1271 */
1272 static int
1273 verify_identifier(struct tok_state *tok)
1274 {
1275 PyObject *s;
1276 if (tok->decoding_erred)
1277 return 0;
1278 s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
1279 if (s == NULL) {
1280 if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
1281 tok->done = E_DECODE;
1282 }
1283 else {
1284 tok->done = E_ERROR;
1285 }
1286 return 0;
1287 }
1288 Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
1289 if (invalid < 0) {
1290 Py_DECREF(s);
1291 tok->done = E_ERROR;
1292 return 0;
1293 }
1294 assert(PyUnicode_GET_LENGTH(s) > 0);
1295 if (invalid < PyUnicode_GET_LENGTH(s)) {
1296 Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
1297 if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
1298 /* Determine the offset in UTF-8 encoded input */
1299 Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
1300 if (s != NULL) {
1301 Py_SETREF(s, PyUnicode_AsUTF8String(s));
1302 }
1303 if (s == NULL) {
1304 tok->done = E_ERROR;
1305 return 0;
1306 }
1307 tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
1308 }
1309 Py_DECREF(s);
1310 // PyUnicode_FromFormatV() does not support %X
1311 char hex[9];
1312 (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
1313 if (Py_UNICODE_ISPRINTABLE(ch)) {
1314 syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
1315 }
1316 else {
1317 syntaxerror(tok, "invalid non-printable character U+%s", hex);
1318 }
1319 return 0;
1320 }
1321 Py_DECREF(s);
1322 return 1;
1323 }
1324
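/* Consume a run of decimal digits, where single underscores may appear
   between groups of digits.  Return the first character after the run, or
   0 after reporting a syntax error for a misplaced underscore. */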
1325 static int
1326 tok_decimal_tail(struct tok_state *tok)
1327 {
1328 int c;
1329
1330 while (1) {
1331 do {
1332 c = tok_nextc(tok);
1333 } while (isdigit(c));
1334 if (c != '_') {
1335 break;
1336 }
1337 c = tok_nextc(tok);
1338 if (!isdigit(c)) {
1339 tok_backup(tok, c);
1340 syntaxerror(tok, "invalid decimal literal");
1341 return 0;
1342 }
1343 }
1344 return c;
1345 }
1346
1347 /* Get next token, after space stripping etc. */
1348
1349 static int
1350 tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
1351 {
1352 int c;
1353 int blankline, nonascii;
1354
1355 *p_start = *p_end = NULL;
1356 nextline:
1357 tok->start = NULL;
1358 blankline = 0;
1359
1360 /* Get indentation level */
1361 if (tok->atbol) {
1362 int col = 0;
1363 int altcol = 0;
1364 tok->atbol = 0;
1365 for (;;) {
1366 c = tok_nextc(tok);
1367 if (c == ' ') {
1368 col++, altcol++;
1369 }
1370 else if (c == '\t') {
1371 col = (col / tok->tabsize + 1) * tok->tabsize;
1372 altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
1373 }
1374 else if (c == '\014') {/* Control-L (formfeed) */
1375 col = altcol = 0; /* For Emacs users */
1376 }
1377 else {
1378 break;
1379 }
1380 }
1381 tok_backup(tok, c);
1382 if (c == '#' || c == '\n' || c == '\\') {
1383 /* Lines with only whitespace and/or comments
1384 and/or a line continuation character
1385 shouldn't affect the indentation and are
1386 not passed to the parser as NEWLINE tokens,
1387 except *totally* empty lines in interactive
1388 mode, which signal the end of a command group. */
1389 if (col == 0 && c == '\n' && tok->prompt != NULL) {
1390 blankline = 0; /* Let it through */
1391 }
1392 else if (tok->prompt != NULL && tok->lineno == 1) {
1393 /* In interactive mode, if the first line contains
1394 only spaces and/or a comment, let it through. */
1395 blankline = 0;
1396 col = altcol = 0;
1397 }
1398 else {
1399 blankline = 1; /* Ignore completely */
1400 }
1401 /* We can't jump back right here since we still
1402 may need to skip to the end of a comment */
1403 }
1404 if (!blankline && tok->level == 0) {
1405 if (col == tok->indstack[tok->indent]) {
1406 /* No change */
1407 if (altcol != tok->altindstack[tok->indent]) {
1408 return indenterror(tok);
1409 }
1410 }
1411 else if (col > tok->indstack[tok->indent]) {
1412 /* Indent -- always one */
1413 if (tok->indent+1 >= MAXINDENT) {
1414 tok->done = E_TOODEEP;
1415 tok->cur = tok->inp;
1416 return ERRORTOKEN;
1417 }
1418 if (altcol <= tok->altindstack[tok->indent]) {
1419 return indenterror(tok);
1420 }
1421 tok->pendin++;
1422 tok->indstack[++tok->indent] = col;
1423 tok->altindstack[tok->indent] = altcol;
1424 }
1425 else /* col < tok->indstack[tok->indent] */ {
1426 /* Dedent -- any number, must be consistent */
1427 while (tok->indent > 0 &&
1428 col < tok->indstack[tok->indent]) {
1429 tok->pendin--;
1430 tok->indent--;
1431 }
1432 if (col != tok->indstack[tok->indent]) {
1433 tok->done = E_DEDENT;
1434 tok->cur = tok->inp;
1435 return ERRORTOKEN;
1436 }
1437 if (altcol != tok->altindstack[tok->indent]) {
1438 return indenterror(tok);
1439 }
1440 }
1441 }
1442 }
1443
1444 tok->start = tok->cur;
1445
1446 /* Return pending indents/dedents */
1447 if (tok->pendin != 0) {
1448 if (tok->pendin < 0) {
1449 tok->pendin++;
1450 return DEDENT;
1451 }
1452 else {
1453 tok->pendin--;
1454 return INDENT;
1455 }
1456 }
1457
1458 /* Peek ahead at the next character */
1459 c = tok_nextc(tok);
1460 tok_backup(tok, c);
1461 /* Check if we are closing an async function */
1462 if (tok->async_def
1463 && !blankline
1464 /* Due to some implementation artifacts of type comments,
1465 * a TYPE_COMMENT at the start of a function won't set an
1466 * indentation level and it will produce a NEWLINE after it.
1467 * To avoid spuriously ending an async function due to this,
1468 * wait until we have some non-newline char in front of us. */
1469 && c != '\n'
1470 && tok->level == 0
1471 /* There was a NEWLINE after ASYNC DEF,
1472 so we're past the signature. */
1473 && tok->async_def_nl
1474 /* Current indentation level is less than where
1475 the async function was defined */
1476 && tok->async_def_indent >= tok->indent)
1477 {
1478 tok->async_def = 0;
1479 tok->async_def_indent = 0;
1480 tok->async_def_nl = 0;
1481 }
1482
1483 again:
1484 tok->start = NULL;
1485 /* Skip spaces */
1486 do {
1487 c = tok_nextc(tok);
1488 } while (c == ' ' || c == '\t' || c == '\014');
1489
1490 /* Set start of current token */
1491 tok->start = tok->cur - 1;
1492
1493 /* Skip comment, unless it's a type comment */
1494 if (c == '#') {
1495 const char *prefix, *p, *type_start;
1496
1497 while (c != EOF && c != '\n') {
1498 c = tok_nextc(tok);
1499 }
1500
1501 if (tok->type_comments) {
1502 p = tok->start;
1503 prefix = type_comment_prefix;
1504 while (*prefix && p < tok->cur) {
1505 if (*prefix == ' ') {
1506 while (*p == ' ' || *p == '\t') {
1507 p++;
1508 }
1509 } else if (*prefix == *p) {
1510 p++;
1511 } else {
1512 break;
1513 }
1514
1515 prefix++;
1516 }
1517
1518 /* This is a type comment if we matched all of type_comment_prefix. */
1519 if (!*prefix) {
1520 int is_type_ignore = 1;
1521 const char *ignore_end = p + 6;
1522 tok_backup(tok, c); /* don't eat the newline or EOF */
1523
1524 type_start = p;
1525
1526 /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
1527 * or anything ASCII and non-alphanumeric. */
1528 is_type_ignore = (
1529 tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
1530 && !(tok->cur > ignore_end
1531 && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));
1532
1533 if (is_type_ignore) {
1534 *p_start = ignore_end;
1535 *p_end = tok->cur;
1536
1537 /* If this type ignore is the only thing on the line, consume the newline also. */
1538 if (blankline) {
1539 tok_nextc(tok);
1540 tok->atbol = 1;
1541 }
1542 return TYPE_IGNORE;
1543 } else {
1544 *p_start = type_start; /* after type_comment_prefix */
1545 *p_end = tok->cur;
1546 return TYPE_COMMENT;
1547 }
1548 }
1549 }
1550 }
1551
1552 if (tok->done == E_INTERACT_STOP) {
1553 return ENDMARKER;
1554 }
1555
1556 /* Check for EOF and errors now */
1557 if (c == EOF) {
1558 if (tok->level) {
1559 return ERRORTOKEN;
1560 }
1561 return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
1562 }
1563
1564 /* Identifier (most frequent token!) */
1565 nonascii = 0;
1566 if (is_potential_identifier_start(c)) {
1567 /* Process the various legal combinations of b"", r"", u"", and f"". */
1568 int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
1569 while (1) {
1570 if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
1571 saw_b = 1;
1572 /* Since this is a backwards compatibility support literal we don't
1573 want to support it in arbitrary order like byte literals. */
1574 else if (!(saw_b || saw_u || saw_r || saw_f)
1575 && (c == 'u'|| c == 'U')) {
1576 saw_u = 1;
1577 }
1578 /* ur"" and ru"" are not supported */
1579 else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
1580 saw_r = 1;
1581 }
1582 else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
1583 saw_f = 1;
1584 }
1585 else {
1586 break;
1587 }
1588 c = tok_nextc(tok);
1589 if (c == '"' || c == '\'') {
1590 goto letter_quote;
1591 }
1592 }
1593 while (is_potential_identifier_char(c)) {
1594 if (c >= 128) {
1595 nonascii = 1;
1596 }
1597 c = tok_nextc(tok);
1598 }
1599 tok_backup(tok, c);
1600 if (nonascii && !verify_identifier(tok)) {
1601 return ERRORTOKEN;
1602 }
1603
1604 *p_start = tok->start;
1605 *p_end = tok->cur;
1606
1607 /* async/await parsing block. */
1608 if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
1609 /* May be an 'async' or 'await' token. For Python 3.7 or
1610 later we recognize them unconditionally. For Python
1611 3.5 or 3.6 we recognize 'async' in front of 'def', and
1612 either one inside of 'async def'. (Technically we
1613 shouldn't recognize these at all for 3.4 or earlier,
1614 but there's no *valid* Python 3.4 code that would be
1615 rejected, and async functions will be rejected in a
1616 later phase.) */
1617 if (!tok->async_hacks || tok->async_def) {
1618 /* Always recognize the keywords. */
1619 if (memcmp(tok->start, "async", 5) == 0) {
1620 return ASYNC;
1621 }
1622 if (memcmp(tok->start, "await", 5) == 0) {
1623 return AWAIT;
1624 }
1625 }
1626 else if (memcmp(tok->start, "async", 5) == 0) {
1627 /* The current token is 'async'.
1628 Look ahead one token to see if that is 'def'. */
1629
1630 struct tok_state ahead_tok;
1631 const char *ahead_tok_start = NULL;
1632 const char *ahead_tok_end = NULL;
1633 int ahead_tok_kind;
1634
1635 memcpy(&ahead_tok, tok, sizeof(ahead_tok));
1636 ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
1637 &ahead_tok_end);
1638
1639 if (ahead_tok_kind == NAME
1640 && ahead_tok.cur - ahead_tok.start == 3
1641 && memcmp(ahead_tok.start, "def", 3) == 0)
1642 {
1643 /* The next token is going to be 'def', so instead of
1644 returning a plain NAME token, return ASYNC. */
1645 tok->async_def_indent = tok->indent;
1646 tok->async_def = 1;
1647 return ASYNC;
1648 }
1649 }
1650 }
1651
1652 return NAME;
1653 }
1654
1655 /* Newline */
1656 if (c == '\n') {
1657 tok->atbol = 1;
1658 if (blankline || tok->level > 0) {
1659 goto nextline;
1660 }
1661 *p_start = tok->start;
1662 *p_end = tok->cur - 1; /* Leave '\n' out of the string */
1663 tok->cont_line = 0;
1664 if (tok->async_def) {
1665 /* We're somewhere inside an 'async def' function, and
1666 we've encountered a NEWLINE after its signature. */
1667 tok->async_def_nl = 1;
1668 }
1669 return NEWLINE;
1670 }
1671
1672 /* Period or number starting with period? */
1673 if (c == '.') {
1674 c = tok_nextc(tok);
1675 if (isdigit(c)) {
1676 goto fraction;
1677 } else if (c == '.') {
1678 c = tok_nextc(tok);
1679 if (c == '.') {
1680 *p_start = tok->start;
1681 *p_end = tok->cur;
1682 return ELLIPSIS;
1683 }
1684 else {
1685 tok_backup(tok, c);
1686 }
1687 tok_backup(tok, '.');
1688 }
1689 else {
1690 tok_backup(tok, c);
1691 }
1692 *p_start = tok->start;
1693 *p_end = tok->cur;
1694 return DOT;
1695 }
1696
1697 /* Number */
1698 if (isdigit(c)) {
1699 if (c == '0') {
1700 /* Hex, octal or binary -- maybe. */
1701 c = tok_nextc(tok);
1702 if (c == 'x' || c == 'X') {
1703 /* Hex */
1704 c = tok_nextc(tok);
1705 do {
1706 if (c == '_') {
1707 c = tok_nextc(tok);
1708 }
1709 if (!isxdigit(c)) {
1710 tok_backup(tok, c);
1711 return syntaxerror(tok, "invalid hexadecimal literal");
1712 }
1713 do {
1714 c = tok_nextc(tok);
1715 } while (isxdigit(c));
1716 } while (c == '_');
1717 if (!verify_end_of_number(tok, c, "hexadecimal")) {
1718 return ERRORTOKEN;
1719 }
1720 }
1721 else if (c == 'o' || c == 'O') {
1722 /* Octal */
1723 c = tok_nextc(tok);
1724 do {
1725 if (c == '_') {
1726 c = tok_nextc(tok);
1727 }
1728 if (c < '0' || c >= '8') {
1729 if (isdigit(c)) {
1730 return syntaxerror(tok,
1731 "invalid digit '%c' in octal literal", c);
1732 }
1733 else {
1734 tok_backup(tok, c);
1735 return syntaxerror(tok, "invalid octal literal");
1736 }
1737 }
1738 do {
1739 c = tok_nextc(tok);
1740 } while ('0' <= c && c < '8');
1741 } while (c == '_');
1742 if (isdigit(c)) {
1743 return syntaxerror(tok,
1744 "invalid digit '%c' in octal literal", c);
1745 }
1746 if (!verify_end_of_number(tok, c, "octal")) {
1747 return ERRORTOKEN;
1748 }
1749 }
1750 else if (c == 'b' || c == 'B') {
1751 /* Binary */
1752 c = tok_nextc(tok);
1753 do {
1754 if (c == '_') {
1755 c = tok_nextc(tok);
1756 }
1757 if (c != '0' && c != '1') {
1758 if (isdigit(c)) {
1759 return syntaxerror(tok,
1760 "invalid digit '%c' in binary literal", c);
1761 }
1762 else {
1763 tok_backup(tok, c);
1764 return syntaxerror(tok, "invalid binary literal");
1765 }
1766 }
1767 do {
1768 c = tok_nextc(tok);
1769 } while (c == '0' || c == '1');
1770 } while (c == '_');
1771 if (isdigit(c)) {
1772 return syntaxerror(tok,
1773 "invalid digit '%c' in binary literal", c);
1774 }
1775 if (!verify_end_of_number(tok, c, "binary")) {
1776 return ERRORTOKEN;
1777 }
1778 }
1779 else {
1780 int nonzero = 0;
1781 /* maybe old-style octal; c is first char of it */
1782 /* in any case, allow '0' as a literal */
1783 while (1) {
1784 if (c == '_') {
1785 c = tok_nextc(tok);
1786 if (!isdigit(c)) {
1787 tok_backup(tok, c);
1788 return syntaxerror(tok, "invalid decimal literal");
1789 }
1790 }
1791 if (c != '0') {
1792 break;
1793 }
1794 c = tok_nextc(tok);
1795 }
1796 char* zeros_end = tok->cur;
1797 if (isdigit(c)) {
1798 nonzero = 1;
1799 c = tok_decimal_tail(tok);
1800 if (c == 0) {
1801 return ERRORTOKEN;
1802 }
1803 }
1804 if (c == '.') {
1805 c = tok_nextc(tok);
1806 goto fraction;
1807 }
1808 else if (c == 'e' || c == 'E') {
1809 goto exponent;
1810 }
1811 else if (c == 'j' || c == 'J') {
1812 goto imaginary;
1813 }
1814 else if (nonzero) {
1815 /* Old-style octal: now disallowed. */
1816 tok_backup(tok, c);
1817 return syntaxerror_known_range(
1818 tok, (int)(tok->start + 1 - tok->line_start),
1819 (int)(zeros_end - tok->line_start),
1820 "leading zeros in decimal integer "
1821 "literals are not permitted; "
1822 "use an 0o prefix for octal integers");
1823 }
1824 if (!verify_end_of_number(tok, c, "decimal")) {
1825 return ERRORTOKEN;
1826 }
1827 }
1828 }
1829 else {
1830 /* Decimal */
1831 c = tok_decimal_tail(tok);
1832 if (c == 0) {
1833 return ERRORTOKEN;
1834 }
1835 {
1836 /* Accept floating point numbers. */
1837 if (c == '.') {
1838 c = tok_nextc(tok);
1839 fraction:
1840 /* Fraction */
1841 if (isdigit(c)) {
1842 c = tok_decimal_tail(tok);
1843 if (c == 0) {
1844 return ERRORTOKEN;
1845 }
1846 }
1847 }
1848 if (c == 'e' || c == 'E') {
1849 int e;
1850 exponent:
1851 e = c;
1852 /* Exponent part */
1853 c = tok_nextc(tok);
1854 if (c == '+' || c == '-') {
1855 c = tok_nextc(tok);
1856 if (!isdigit(c)) {
1857 tok_backup(tok, c);
1858 return syntaxerror(tok, "invalid decimal literal");
1859 }
1860 } else if (!isdigit(c)) {
1861 tok_backup(tok, c);
1862 if (!verify_end_of_number(tok, e, "decimal")) {
1863 return ERRORTOKEN;
1864 }
1865 tok_backup(tok, e);
1866 *p_start = tok->start;
1867 *p_end = tok->cur;
1868 return NUMBER;
1869 }
1870 c = tok_decimal_tail(tok);
1871 if (c == 0) {
1872 return ERRORTOKEN;
1873 }
1874 }
1875 if (c == 'j' || c == 'J') {
1876 /* Imaginary part */
1877 imaginary:
1878 c = tok_nextc(tok);
1879 if (!verify_end_of_number(tok, c, "imaginary")) {
1880 return ERRORTOKEN;
1881 }
1882 }
1883 else if (!verify_end_of_number(tok, c, "decimal")) {
1884 return ERRORTOKEN;
1885 }
1886 }
1887 }
1888 tok_backup(tok, c);
1889 *p_start = tok->start;
1890 *p_end = tok->cur;
1891 return NUMBER;
1892 }
1893
1894 letter_quote:
1895 /* String */
1896 if (c == '\'' || c == '"') {
1897 int quote = c;
1898 int quote_size = 1; /* 1 or 3 */
1899 int end_quote_size = 0;
1900
1901 /* Nodes of type STRING, especially multi line strings
1902 must be handled differently in order to get both
1903 the starting line number and the column offset right.
1904 (cf. issue 16806) */
1905 tok->first_lineno = tok->lineno;
1906 tok->multi_line_start = tok->line_start;
1907
1908 /* Find the quote size and start of string */
1909 c = tok_nextc(tok);
1910 if (c == quote) {
1911 c = tok_nextc(tok);
1912 if (c == quote) {
1913 quote_size = 3;
1914 }
1915 else {
1916 end_quote_size = 1; /* empty string found */
1917 }
1918 }
1919 if (c != quote) {
1920 tok_backup(tok, c);
1921 }
1922
1923 /* Get rest of string */
1924 while (end_quote_size != quote_size) {
1925 c = tok_nextc(tok);
1926 if (c == EOF || (quote_size == 1 && c == '\n')) {
1927 assert(tok->multi_line_start != NULL);
1928 // shift the tok_state's location into
1929 // the start of string, and report the error
1930 // from the initial quote character
1931 tok->cur = (char *)tok->start;
1932 tok->cur++;
1933 tok->line_start = tok->multi_line_start;
1934 int start = tok->lineno;
1935 tok->lineno = tok->first_lineno;
1936
1937 if (quote_size == 3) {
1938 return syntaxerror(tok,
1939 "unterminated triple-quoted string literal"
1940 " (detected at line %d)", start);
1941 }
1942 else {
1943 return syntaxerror(tok,
1944 "unterminated string literal (detected at"
1945 " line %d)", start);
1946 }
1947 }
1948 if (c == quote) {
1949 end_quote_size += 1;
1950 }
1951 else {
1952 end_quote_size = 0;
1953 if (c == '\\') {
1954 tok_nextc(tok); /* skip escaped char */
1955 }
1956 }
1957 }
1958
1959 *p_start = tok->start;
1960 *p_end = tok->cur;
1961 return STRING;
1962 }
1963
1964 /* Line continuation */
1965 if (c == '\\') {
1966 c = tok_nextc(tok);
1967 if (c != '\n') {
1968 tok->done = E_LINECONT;
1969 return ERRORTOKEN;
1970 }
1971 c = tok_nextc(tok);
1972 if (c == EOF) {
1973 tok->done = E_EOF;
1974 tok->cur = tok->inp;
1975 return ERRORTOKEN;
1976 } else {
1977 tok_backup(tok, c);
1978 }
1979 tok->cont_line = 1;
1980 goto again; /* Read next line */
1981 }
1982
1983 /* Check for two-character token */
1984 {
1985 int c2 = tok_nextc(tok);
1986 int token = PyToken_TwoChars(c, c2);
1987 if (token != OP) {
1988 int c3 = tok_nextc(tok);
1989 int token3 = PyToken_ThreeChars(c, c2, c3);
1990 if (token3 != OP) {
1991 token = token3;
1992 }
1993 else {
1994 tok_backup(tok, c3);
1995 }
1996 *p_start = tok->start;
1997 *p_end = tok->cur;
1998 return token;
1999 }
2000 tok_backup(tok, c2);
2001 }
2002
2003 /* Keep track of parentheses nesting level */
2004 switch (c) {
2005 case '(':
2006 case '[':
2007 case '{':
2008 if (tok->level >= MAXLEVEL) {
2009 return syntaxerror(tok, "too many nested parentheses");
2010 }
2011 tok->parenstack[tok->level] = c;
2012 tok->parenlinenostack[tok->level] = tok->lineno;
2013 tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
2014 tok->level++;
2015 break;
2016 case ')':
2017 case ']':
2018 case '}':
2019 if (!tok->level) {
2020 return syntaxerror(tok, "unmatched '%c'", c);
2021 }
2022 tok->level--;
2023 int opening = tok->parenstack[tok->level];
2024 if (!((opening == '(' && c == ')') ||
2025 (opening == '[' && c == ']') ||
2026 (opening == '{' && c == '}')))
2027 {
2028 if (tok->parenlinenostack[tok->level] != tok->lineno) {
2029 return syntaxerror(tok,
2030 "closing parenthesis '%c' does not match "
2031 "opening parenthesis '%c' on line %d",
2032 c, opening, tok->parenlinenostack[tok->level]);
2033 }
2034 else {
2035 return syntaxerror(tok,
2036 "closing parenthesis '%c' does not match "
2037 "opening parenthesis '%c'",
2038 c, opening);
2039 }
2040 }
2041 break;
2042 }
2043
2044 /* Punctuation character */
2045 *p_start = tok->start;
2046 *p_end = tok->cur;
2047 return PyToken_OneChar(c);
2048 }
2049
2050 int
2051 PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
2052 {
2053 int result = tok_get(tok, p_start, p_end);
2054 if (tok->decoding_erred) {
2055 result = ERRORTOKEN;
2056 tok->done = E_DECODE;
2057 }
2058 return result;
2059 }
2060
2061 /* Get the encoding of a Python file. Check for the coding cookie and check if
2062 the file starts with a BOM.
2063
2064 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
2065 encoding in the first or second line of the file (in which case the encoding
2066 should be assumed to be UTF-8).
2067
2068 The char* returned is malloc'ed via PyMem_Malloc() and thus must be freed
2069 by the caller. */
2070
2071 char *
2072 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
2073 {
2074 struct tok_state *tok;
2075 FILE *fp;
2076 const char *p_start = NULL;
2077 const char *p_end = NULL;
2078 char *encoding = NULL;
2079
2080 fd = _Py_dup(fd);
2081 if (fd < 0) {
2082 return NULL;
2083 }
2084
2085 fp = fdopen(fd, "r");
2086 if (fp == NULL) {
2087 return NULL;
2088 }
2089 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
2090 if (tok == NULL) {
2091 fclose(fp);
2092 return NULL;
2093 }
2094 if (filename != NULL) {
2095 Py_INCREF(filename);
2096 tok->filename = filename;
2097 }
2098 else {
2099 tok->filename = PyUnicode_FromString("<string>");
2100 if (tok->filename == NULL) {
2101 fclose(fp);
2102 PyTokenizer_Free(tok);
2103 return encoding;
2104 }
2105 }
2106 while (tok->lineno < 2 && tok->done == E_OK) {
2107 PyTokenizer_Get(tok, &p_start, &p_end);
2108 }
2109 fclose(fp);
2110 if (tok->encoding) {
2111 encoding = (char *)PyMem_Malloc(strlen(tok->encoding) + 1);
2112 if (encoding) {
2113 strcpy(encoding, tok->encoding);
2114 }
2115 }
2116 PyTokenizer_Free(tok);
2117 return encoding;
2118 }
2119
2120 char *
2121 PyTokenizer_FindEncoding(int fd)
2122 {
2123 return PyTokenizer_FindEncodingFilename(fd, NULL);
2124 }
2125
2126 #ifdef Py_DEBUG
2127
2128 void
2129 tok_dump(int type, char *start, char *end)
2130 {
2131 fprintf(stderr, "%s", _PyParser_TokenNames[type]);
2132 if (type == NAME || type == NUMBER || type == STRING || type == OP)
2133 fprintf(stderr, "(%.*s)", (int)(end - start), start);
2134 }
2135
2136 #endif
2137