1
2 /* Tokenizer implementation */
3
4 #define PY_SSIZE_T_CLEAN
5 #include "Python.h"
6
7 #include <ctype.h>
8 #include <assert.h>
9
10 #include "tokenizer.h"
11 #include "errcode.h"
12
13 #include "unicodeobject.h"
14 #include "bytesobject.h"
15 #include "fileobject.h"
16 #include "codecs.h"
17 #include "abstract.h"
18
19 /* Alternate tab spacing */
20 #define ALTTABSIZE 1
21
22 #define is_potential_identifier_start(c) (\
23 (c >= 'a' && c <= 'z')\
24 || (c >= 'A' && c <= 'Z')\
25 || c == '_'\
26 || (c >= 128))
27
28 #define is_potential_identifier_char(c) (\
29 (c >= 'a' && c <= 'z')\
30 || (c >= 'A' && c <= 'Z')\
31 || (c >= '0' && c <= '9')\
32 || c == '_'\
33 || (c >= 128))
34
35
36 /* Don't ever change this -- it would break the portability of Python code */
37 #define TABSIZE 8
38
39 /* Forward */
40 static struct tok_state *tok_new(void);
41 static int tok_nextc(struct tok_state *tok);
42 static void tok_backup(struct tok_state *tok, int c);
43
44
45 /* Spaces in this constant are treated as "zero or more spaces or tabs" when
46 tokenizing. */
47 static const char* type_comment_prefix = "# type: ";
48
49 /* Create and initialize a new tok_state structure */
50
51 static struct tok_state *
tok_new(void)52 tok_new(void)
53 {
54 struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(
55 sizeof(struct tok_state));
56 if (tok == NULL)
57 return NULL;
58 tok->buf = tok->cur = tok->inp = NULL;
59 tok->start = NULL;
60 tok->end = NULL;
61 tok->done = E_OK;
62 tok->fp = NULL;
63 tok->input = NULL;
64 tok->tabsize = TABSIZE;
65 tok->indent = 0;
66 tok->indstack[0] = 0;
67
68 tok->atbol = 1;
69 tok->pendin = 0;
70 tok->prompt = tok->nextprompt = NULL;
71 tok->lineno = 0;
72 tok->level = 0;
73 tok->altindstack[0] = 0;
74 tok->decoding_state = STATE_INIT;
75 tok->decoding_erred = 0;
76 tok->read_coding_spec = 0;
77 tok->enc = NULL;
78 tok->encoding = NULL;
79 tok->cont_line = 0;
80 tok->filename = NULL;
81 tok->decoding_readline = NULL;
82 tok->decoding_buffer = NULL;
83 tok->type_comments = 0;
84
85 tok->async_hacks = 0;
86 tok->async_def = 0;
87 tok->async_def_indent = 0;
88 tok->async_def_nl = 0;
89
90 return tok;
91 }
92
93 static char *
new_string(const char * s,Py_ssize_t len,struct tok_state * tok)94 new_string(const char *s, Py_ssize_t len, struct tok_state *tok)
95 {
96 char* result = (char *)PyMem_MALLOC(len + 1);
97 if (!result) {
98 tok->done = E_NOMEM;
99 return NULL;
100 }
101 memcpy(result, s, len);
102 result[len] = '\0';
103 return result;
104 }
105
106 static char *
error_ret(struct tok_state * tok)107 error_ret(struct tok_state *tok) /* XXX */
108 {
109 tok->decoding_erred = 1;
110 if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */
111 PyMem_FREE(tok->buf);
112 tok->buf = tok->cur = tok->inp = NULL;
113 tok->start = NULL;
114 tok->end = NULL;
115 tok->done = E_DECODE;
116 return NULL; /* as if it were EOF */
117 }
118
119
/* Normalize an encoding name for the two encodings the tokenizer can
   handle natively.  Lower-cases up to the first 12 characters of S
   (mapping '_' to '-'), then maps any "utf-8"/"utf-8-*" spelling to
   "utf-8" and any latin-1 spelling to "iso-8859-1".  Any other name is
   returned unchanged (the original pointer S). */
static const char *
get_normal_name(const char *s)  /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;
    for (i = 0; i < 12; i++) {
        /* Cast through unsigned char: passing a negative char value
           to tolower() is undefined behavior (C11 7.4p1). */
        int c = (unsigned char)s[i];
        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = tolower(c);
    }
    buf[i] = '\0';
    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
148
/* Look for a "coding: <name>" declaration in the line S of SIZE bytes.
   On success return 1; the normalized, heap-allocated encoding name is
   stored in *spec (caller frees), or *spec stays NULL when the line has
   no coding declaration.  Return 0 only on memory failure (tok->done is
   then E_NOMEM). */

static int
get_coding_spec(const char *s, char **spec, Py_ssize_t size, struct tok_state *tok)
{
    Py_ssize_t i;
    *spec = NULL;
    /* Coding spec must be in a comment, and that comment must be
     * the only statement on the source code line. */
    for (i = 0; i < size - 6; i++) {
        if (s[i] == '#')
            break;
        if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')
            return 1;   /* real code before any '#': no spec possible */
    }
    for (; i < size - 6; i++) { /* XXX inefficient search */
        const char* t = s + i;
        if (strncmp(t, "coding", 6) == 0) {
            const char* begin = NULL;
            t += 6;
            /* "coding" must be followed by ':' or '=' ... */
            if (t[0] != ':' && t[0] != '=')
                continue;
            /* ... then optional spaces or tabs ... */
            do {
                t++;
            } while (t[0] == '\x20' || t[0] == '\t');

            /* ... then the encoding name itself: [A-Za-z0-9._-]+ */
            begin = t;
            while (Py_ISALNUM(t[0]) ||
                   t[0] == '-' || t[0] == '_' || t[0] == '.')
                t++;

            if (begin < t) {
                char* r = new_string(begin, t - begin, tok);
                const char* q;
                if (!r)
                    return 0;
                /* Canonicalize utf-8 / latin-1 spellings. */
                q = get_normal_name(r);
                if (r != q) {
                    PyMem_FREE(r);
                    r = new_string(q, strlen(q), tok);
                    if (!r)
                        return 0;
                }
                *spec = r;
                break;
            }
        }
    }
    return 1;
}
199
/* Check whether the line contains a coding spec. If it does,
   invoke the set_readline function for the new encoding.
   This function receives the tok_state and the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,
                  int set_readline(struct tok_state *, const char *))
{
    char *cs;
    int r = 1;

    if (tok->cont_line) {
        /* It's a continuation line, so it can't be a coding spec. */
        tok->read_coding_spec = 1;
        return 1;
    }
    if (!get_coding_spec(line, &cs, size, tok))
        return 0;
    if (!cs) {
        Py_ssize_t i;
        for (i = 0; i < size; i++) {
            if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')
                break;
            if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {
                /* Stop checking coding spec after a line containing
                 * anything except a comment. */
                tok->read_coding_spec = 1;
                break;
            }
        }
        return 1;
    }
    tok->read_coding_spec = 1;
    if (tok->encoding == NULL) {
        assert(tok->decoding_state == STATE_RAW);
        if (strcmp(cs, "utf-8") == 0) {
            /* Input is already UTF-8: no re-decoding is needed;
               tok takes ownership of cs. */
            tok->encoding = cs;
        } else {
            r = set_readline(tok, cs);
            if (r) {
                tok->encoding = cs;
                tok->decoding_state = STATE_NORMAL;
            }
            else {
                /* Codec lookup/setup failed: report and drop cs. */
                PyErr_Format(PyExc_SyntaxError,
                             "encoding problem: %s", cs);
                PyMem_FREE(cs);
            }
        }
    } else {                /* then, compare cs with BOM */
        /* A BOM already fixed the encoding; the declared spec
           must agree with it. */
        r = (strcmp(tok->encoding, cs) == 0);
        if (!r)
            PyErr_Format(PyExc_SyntaxError,
                         "encoding problem: %s with BOM", cs);
        PyMem_FREE(cs);
    }
    return r;
}
259
/* See whether the file starts with a BOM. If it does,
   invoke the set_readline function with the new encoding.
   Return 1 on success, 0 on failure. */

static int
check_bom(int get_char(struct tok_state *),
          void unget_char(int, struct tok_state *),
          int set_readline(struct tok_state *, const char *),
          struct tok_state *tok)
{
    int ch1, ch2, ch3;
    ch1 = get_char(tok);
    tok->decoding_state = STATE_RAW;
    if (ch1 == EOF) {
        return 1;
    } else if (ch1 == 0xEF) {
        /* Possible UTF-8 BOM (EF BB BF); push bytes back if it
           turns out not to be one. */
        ch2 = get_char(tok);
        if (ch2 != 0xBB) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        ch3 = get_char(tok);
        if (ch3 != 0xBF) {
            unget_char(ch3, tok);
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
#if 0
        /* Disable support for UTF-16 BOMs until a decision
           is made whether this needs to be supported. */
    } else if (ch1 == 0xFE) {
        /* UTF-16 big-endian BOM (FE FF). */
        ch2 = get_char(tok);
        if (ch2 != 0xFF) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-be"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
    } else if (ch1 == 0xFF) {
        /* UTF-16 little-endian BOM (FF FE). */
        ch2 = get_char(tok);
        if (ch2 != 0xFE) {
            unget_char(ch2, tok);
            unget_char(ch1, tok);
            return 1;
        }
        if (!set_readline(tok, "utf-16-le"))
            return 0;
        tok->decoding_state = STATE_NORMAL;
#endif
    } else {
        unget_char(ch1, tok);
        return 1;
    }
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    /* UTF-8 BOM seen: record the encoding.  The stream itself needs
       no re-decoding since the default source encoding is UTF-8. */
    tok->encoding = new_string("utf-8", 5, tok);
    if (!tok->encoding)
        return 0;
    /* No need to set_readline: input is already utf-8 */
    return 1;
}
325
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
     1) NULL: need to call tok->decoding_readline to get a new line
     2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
        stored the result in tok->decoding_buffer
     3) PyByteArrayObject *: previous call to fp_readl did not have enough room
        (in the s buffer) to copy entire contents of the line read
        by tok->decoding_readline. tok->decoding_buffer has the overflow.
        In this case, fp_readl is called in a loop (with an expanded buffer)
        until the buffer ends with a '\n' (or until the end of the file is
        reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
    PyObject* bufobj;
    const char *buf;
    Py_ssize_t buflen;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (tok->decoding_buffer) {
        /* Cases 2 and 3 above: consume the cached object (hold our
           own reference; the tok reference is dropped below). */
        bufobj = tok->decoding_buffer;
        Py_INCREF(bufobj);
    }
    else
    {
        /* Case 1: pull a fresh line from the readline callable. */
        bufobj = _PyObject_CallNoArg(tok->decoding_readline);
        if (bufobj == NULL)
            goto error;
    }
    if (PyUnicode_CheckExact(bufobj))
    {
        /* A decoded line: borrow its cached UTF-8 representation. */
        buf = PyUnicode_AsUTF8AndSize(bufobj, &buflen);
        if (buf == NULL) {
            goto error;
        }
    }
    else
    {
        /* Overflow bytes stashed by a previous call (case 3). */
        buf = PyByteArray_AsString(bufobj);
        if (buf == NULL) {
            goto error;
        }
        buflen = PyByteArray_GET_SIZE(bufobj);
    }

    Py_XDECREF(tok->decoding_buffer);
    if (buflen > size) {
        /* Too many chars, the rest goes into tok->decoding_buffer */
        tok->decoding_buffer = PyByteArray_FromStringAndSize(buf+size,
                                                             buflen-size);
        if (tok->decoding_buffer == NULL)
            goto error;
        buflen = size;
    }
    else
        tok->decoding_buffer = NULL;

    memcpy(s, buf, buflen);
    s[buflen] = '\0';
    if (buflen == 0) /* EOF */
        s = NULL;
    Py_DECREF(bufobj);
    return s;

  error:
    Py_XDECREF(bufobj);
    return error_ret(tok);
}
401
/* Set the readline function for TOK to a StreamReader's
   readline function. The StreamReader is named ENC.

   This function is called from check_bom and check_coding_spec.

   ENC is usually identical to the future value of tok->encoding,
   except for the (currently unsupported) case of UTF-16.

   Return 1 on success, 0 on failure. */

static int
fp_setreadl(struct tok_state *tok, const char* enc)
{
    PyObject *readline, *io, *stream;
    _Py_IDENTIFIER(open);
    _Py_IDENTIFIER(readline);
    int fd;
    long pos;

    fd = fileno(tok->fp);
    /* Due to buffering the file offset for fd can be different from the file
     * position of tok->fp. If tok->fp was opened in text mode on Windows,
     * its file position counts CRLF as one char and can't be directly mapped
     * to the file offset for fd. Instead we step back one byte and read to
     * the end of line.*/
    pos = ftell(tok->fp);
    if (pos == -1 ||
        lseek(fd, (off_t)(pos > 0 ? pos - 1 : pos), SEEK_SET) == (off_t)-1) {
        PyErr_SetFromErrnoWithFilename(PyExc_OSError, NULL);
        return 0;
    }

    io = PyImport_ImportModuleNoBlock("io");
    if (io == NULL)
        return 0;

    /* Equivalent to io.open(fd, "r", -1, enc, None, None, closefd=False):
       closefd=False because tok->fp still owns the descriptor. */
    stream = _PyObject_CallMethodId(io, &PyId_open, "isisOOO",
                    fd, "r", -1, enc, Py_None, Py_None, Py_False);
    Py_DECREF(io);
    if (stream == NULL)
        return 0;

    readline = _PyObject_GetAttrId(stream, &PyId_readline);
    Py_DECREF(stream);
    if (readline == NULL)
        return 0;
    Py_XSETREF(tok->decoding_readline, readline);

    if (pos > 0) {
        /* Discard the partial line produced by stepping back one byte. */
        PyObject *bufobj = _PyObject_CallNoArg(readline);
        if (bufobj == NULL)
            return 0;
        Py_DECREF(bufobj);
    }

    return 1;
}
459
460 /* Fetch the next byte from TOK. */
461
fp_getc(struct tok_state * tok)462 static int fp_getc(struct tok_state *tok) {
463 return getc(tok->fp);
464 }
465
466 /* Unfetch the last byte back into TOK. */
467
fp_ungetc(int c,struct tok_state * tok)468 static void fp_ungetc(int c, struct tok_state *tok) {
469 ungetc(c, tok->fp);
470 }
471
/* Decide whether S begins a structurally valid UTF-8 sequence.
   Returns the sequence length in bytes (1..4) when it does, 0 when it
   does not.  Only the byte-pattern shape is checked (lead byte plus
   the right number of continuation bytes). */
static int
valid_utf8(const unsigned char *s)
{
    unsigned char lead = *s;
    int follow;                 /* continuation bytes expected */

    if (lead < 0x80) {
        return 1;               /* ASCII: single byte */
    }
    if (lead < 0xC0) {
        return 0;               /* stray continuation byte */
    }
    if (lead < 0xE0) {
        follow = 1;
    }
    else if (lead < 0xF0) {
        follow = 2;
    }
    else if (lead < 0xF8) {
        follow = 3;
    }
    else {
        return 0;               /* 0xF8..0xFF is never a lead byte */
    }
    /* Every continuation byte must be in 0x80..0xBF. */
    for (int k = 1; k <= follow; k++) {
        if (s[k] < 0x80 || s[k] >= 0xC0) {
            return 0;
        }
    }
    return follow + 1;
}
499
/* Read a line of input from TOK. Determine encoding
   if necessary. */

static char *
decoding_fgets(char *s, int size, struct tok_state *tok)
{
    char *line = NULL;
    int badchar = 0;
    for (;;) {
        if (tok->decoding_state == STATE_NORMAL) {
            /* We already have a codec associated with
               this input. */
            line = fp_readl(s, size, tok);
            break;
        } else if (tok->decoding_state == STATE_RAW) {
            /* We want a 'raw' read. */
            line = Py_UniversalNewlineFgets(s, size,
                                            tok->fp, NULL);
            break;
        } else {
            /* We have not yet determined the encoding.
               If an encoding is found, use the file-pointer
               reader functions from now on. */
            if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))
                return error_ret(tok);
            assert(tok->decoding_state != STATE_INIT);
        }
    }
    /* PEP 263: a coding spec may only appear on the first two lines. */
    if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {
        if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {
            return error_ret(tok);
        }
    }
    /* The default encoding is UTF-8, so make sure we don't have any
       non-UTF-8 sequences in it. */
    if (line && !tok->encoding) {
        unsigned char *c;
        int length;
        for (c = (unsigned char *)line; *c; c += length)
            if (!(length = valid_utf8(c))) {
                badchar = *c;
                break;
            }
    }
    if (badchar) {
        /* Need to add 1 to the line number, since this line
           has not been counted, yet. */
        PyErr_Format(PyExc_SyntaxError,
                "Non-UTF-8 code starting with '\\x%.2x' "
                "in file %U on line %i, "
                "but no encoding declared; "
                "see http://python.org/dev/peps/pep-0263/ for details",
                badchar, tok->filename, tok->lineno + 1);
        return error_ret(tok);
    }
    return line;
}
557
558 static int
decoding_feof(struct tok_state * tok)559 decoding_feof(struct tok_state *tok)
560 {
561 if (tok->decoding_state != STATE_NORMAL) {
562 return feof(tok->fp);
563 } else {
564 PyObject* buf = tok->decoding_buffer;
565 if (buf == NULL) {
566 buf = _PyObject_CallNoArg(tok->decoding_readline);
567 if (buf == NULL) {
568 error_ret(tok);
569 return 1;
570 } else {
571 tok->decoding_buffer = buf;
572 }
573 }
574 return PyObject_Length(buf) == 0;
575 }
576 }
577
578 /* Fetch a byte from TOK, using the string buffer. */
579
580 static int
buf_getc(struct tok_state * tok)581 buf_getc(struct tok_state *tok) {
582 return Py_CHARMASK(*tok->str++);
583 }
584
585 /* Unfetch a byte from TOK, using the string buffer. */
586
587 static void
buf_ungetc(int c,struct tok_state * tok)588 buf_ungetc(int c, struct tok_state *tok) {
589 tok->str--;
590 assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */
591 }
592
593 /* Set the readline function for TOK to ENC. For the string-based
594 tokenizer, this means to just record the encoding. */
595
596 static int
buf_setreadl(struct tok_state * tok,const char * enc)597 buf_setreadl(struct tok_state *tok, const char* enc) {
598 tok->enc = enc;
599 return 1;
600 }
601
602 /* Return a UTF-8 encoding Python string object from the
603 C byte string STR, which is encoded with ENC. */
604
605 static PyObject *
translate_into_utf8(const char * str,const char * enc)606 translate_into_utf8(const char* str, const char* enc) {
607 PyObject *utf8;
608 PyObject* buf = PyUnicode_Decode(str, strlen(str), enc, NULL);
609 if (buf == NULL)
610 return NULL;
611 utf8 = PyUnicode_AsUTF8String(buf);
612 Py_DECREF(buf);
613 return utf8;
614 }
615
616
617 static char *
translate_newlines(const char * s,int exec_input,struct tok_state * tok)618 translate_newlines(const char *s, int exec_input, struct tok_state *tok) {
619 int skip_next_lf = 0;
620 size_t needed_length = strlen(s) + 2, final_length;
621 char *buf, *current;
622 char c = '\0';
623 buf = PyMem_MALLOC(needed_length);
624 if (buf == NULL) {
625 tok->done = E_NOMEM;
626 return NULL;
627 }
628 for (current = buf; *s; s++, current++) {
629 c = *s;
630 if (skip_next_lf) {
631 skip_next_lf = 0;
632 if (c == '\n') {
633 c = *++s;
634 if (!c)
635 break;
636 }
637 }
638 if (c == '\r') {
639 skip_next_lf = 1;
640 c = '\n';
641 }
642 *current = c;
643 }
644 /* If this is exec input, add a newline to the end of the string if
645 there isn't one already. */
646 if (exec_input && c != '\n') {
647 *current = '\n';
648 current++;
649 }
650 *current = '\0';
651 final_length = current - buf + 1;
652 if (final_length < needed_length && final_length) {
653 /* should never fail */
654 char* result = PyMem_REALLOC(buf, final_length);
655 if (result == NULL) {
656 PyMem_FREE(buf);
657 }
658 buf = result;
659 }
660 return buf;
661 }
662
/* Decode a byte string STR for use as the buffer of TOK.
   Look for encoding declarations inside STR, and record them
   inside TOK. */

static char *
decode_str(const char *input, int single, struct tok_state *tok)
{
    PyObject* utf8 = NULL;
    char *str;
    const char *s;
    const char *newl[2] = {NULL, NULL};  /* ends of the first two lines */
    int lineno = 0;
    tok->input = str = translate_newlines(input, single, tok);
    if (str == NULL)
        return NULL;
    tok->enc = NULL;
    tok->str = str;
    if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
        return error_ret(tok);
    str = tok->str;             /* string after BOM if any */
    assert(str);
    if (tok->enc != NULL) {
        /* An encoding was set via the BOM path: re-encode as UTF-8. */
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AsString(utf8);
    }
    /* Locate the ends of the first two lines for the PEP 263 check. */
    for (s = str;; s++) {
        if (*s == '\0') break;
        else if (*s == '\n') {
            assert(lineno < 2);
            newl[lineno] = s;
            lineno++;
            if (lineno == 2) break;
        }
    }
    tok->enc = NULL;
    /* need to check line 1 and 2 separately since check_coding_spec
       assumes a single line as input */
    if (newl[0]) {
        if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))
            return error_ret(tok);
        if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {
            if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                   tok, buf_setreadl))
                return error_ret(tok);
        }
    }
    if (tok->enc != NULL) {
        /* A coding spec declared a non-UTF-8 encoding: re-encode. */
        assert(utf8 == NULL);
        utf8 = translate_into_utf8(str, tok->enc);
        if (utf8 == NULL)
            return error_ret(tok);
        str = PyBytes_AS_STRING(utf8);
    }
    assert(tok->decoding_buffer == NULL);
    tok->decoding_buffer = utf8; /* CAUTION: keeps str's storage alive */
    return str;
}
722
723 /* Set up tokenizer for string */
724
725 struct tok_state *
PyTokenizer_FromString(const char * str,int exec_input)726 PyTokenizer_FromString(const char *str, int exec_input)
727 {
728 struct tok_state *tok = tok_new();
729 char *decoded;
730
731 if (tok == NULL)
732 return NULL;
733 decoded = decode_str(str, exec_input, tok);
734 if (decoded == NULL) {
735 PyTokenizer_Free(tok);
736 return NULL;
737 }
738
739 tok->buf = tok->cur = tok->inp = decoded;
740 tok->end = decoded;
741 return tok;
742 }
743
744 struct tok_state *
PyTokenizer_FromUTF8(const char * str,int exec_input)745 PyTokenizer_FromUTF8(const char *str, int exec_input)
746 {
747 struct tok_state *tok = tok_new();
748 char *translated;
749 if (tok == NULL)
750 return NULL;
751 tok->input = translated = translate_newlines(str, exec_input, tok);
752 if (translated == NULL) {
753 PyTokenizer_Free(tok);
754 return NULL;
755 }
756 tok->decoding_state = STATE_RAW;
757 tok->read_coding_spec = 1;
758 tok->enc = NULL;
759 tok->str = translated;
760 tok->encoding = (char *)PyMem_MALLOC(6);
761 if (!tok->encoding) {
762 PyTokenizer_Free(tok);
763 return NULL;
764 }
765 strcpy(tok->encoding, "utf-8");
766
767 tok->buf = tok->cur = tok->inp = translated;
768 tok->end = translated;
769 return tok;
770 }
771
772 /* Set up tokenizer for file */
773
774 struct tok_state *
PyTokenizer_FromFile(FILE * fp,const char * enc,const char * ps1,const char * ps2)775 PyTokenizer_FromFile(FILE *fp, const char* enc,
776 const char *ps1, const char *ps2)
777 {
778 struct tok_state *tok = tok_new();
779 if (tok == NULL)
780 return NULL;
781 if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {
782 PyTokenizer_Free(tok);
783 return NULL;
784 }
785 tok->cur = tok->inp = tok->buf;
786 tok->end = tok->buf + BUFSIZ;
787 tok->fp = fp;
788 tok->prompt = ps1;
789 tok->nextprompt = ps2;
790 if (enc != NULL) {
791 /* Must copy encoding declaration since it
792 gets copied into the parse tree. */
793 tok->encoding = PyMem_MALLOC(strlen(enc)+1);
794 if (!tok->encoding) {
795 PyTokenizer_Free(tok);
796 return NULL;
797 }
798 strcpy(tok->encoding, enc);
799 tok->decoding_state = STATE_NORMAL;
800 }
801 return tok;
802 }
803
804
/* Free a tok_state structure and everything it owns. */

void
PyTokenizer_Free(struct tok_state *tok)
{
    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    Py_XDECREF(tok->decoding_readline);
    Py_XDECREF(tok->decoding_buffer);
    Py_XDECREF(tok->filename);
    /* tok->buf is owned here only for file-based input; for string
       input it aliases tok->input (freed just below). */
    if (tok->fp != NULL && tok->buf != NULL)
        PyMem_FREE(tok->buf);
    if (tok->input)
        PyMem_FREE(tok->input);
    PyMem_FREE(tok);
}
821
/* Get next char, updating state; error code goes into tok->done.
   Three input modes are handled: in-memory string (tok->fp == NULL),
   interactive prompt (tok->prompt != NULL), and plain file input. */

static int
tok_nextc(struct tok_state *tok)
{
    for (;;) {
        if (tok->cur != tok->inp) {
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK)
            return EOF;
        if (tok->fp == NULL) {
            /* String input: advance tok->inp to the end of the next
               line already present in the buffer. */
            char *end = strchr(tok->inp, '\n');
            if (end != NULL)
                end++;
            else {
                end = strchr(tok->inp, '\0');
                if (end == tok->inp) {
                    tok->done = E_EOF;
                    return EOF;
                }
            }
            if (tok->start == NULL)
                tok->buf = tok->cur;
            tok->line_start = tok->cur;
            tok->lineno++;
            tok->inp = end;
            return Py_CHARMASK(*tok->cur++);
        }
        if (tok->prompt != NULL) {
            /* Interactive input: read one line via PyOS_Readline. */
            char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);
            if (newtok != NULL) {
                char *translated = translate_newlines(newtok, 0, tok);
                PyMem_FREE(newtok);
                if (translated == NULL)
                    return EOF;
                newtok = translated;
            }
            if (tok->encoding && newtok && *newtok) {
                /* Recode to UTF-8 */
                Py_ssize_t buflen;
                const char* buf;
                PyObject *u = translate_into_utf8(newtok, tok->encoding);
                PyMem_FREE(newtok);
                if (!u) {
                    tok->done = E_DECODE;
                    return EOF;
                }
                buflen = PyBytes_GET_SIZE(u);
                buf = PyBytes_AS_STRING(u);
                newtok = PyMem_MALLOC(buflen+1);
                if (newtok == NULL) {
                    Py_DECREF(u);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                strcpy(newtok, buf);
                Py_DECREF(u);
            }
            if (tok->nextprompt != NULL)
                tok->prompt = tok->nextprompt;
            if (newtok == NULL)
                tok->done = E_INTR;     /* readline was interrupted */
            else if (*newtok == '\0') {
                PyMem_FREE(newtok);
                tok->done = E_EOF;      /* empty read means EOF */
            }
            else if (tok->start != NULL) {
                /* A token is in progress (e.g. inside a multi-line
                   string): append the new line to the existing buffer,
                   re-basing every saved pointer across the realloc. */
                size_t start = tok->start - tok->buf;
                size_t oldlen = tok->cur - tok->buf;
                size_t newlen = oldlen + strlen(newtok);
                Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
                char *buf = tok->buf;
                buf = (char *)PyMem_REALLOC(buf, newlen+1);
                tok->lineno++;
                if (buf == NULL) {
                    PyMem_FREE(tok->buf);
                    tok->buf = NULL;
                    PyMem_FREE(newtok);
                    tok->done = E_NOMEM;
                    return EOF;
                }
                tok->buf = buf;
                tok->cur = tok->buf + oldlen;
                tok->multi_line_start = tok->buf + cur_multi_line_start;
                tok->line_start = tok->cur;
                strcpy(tok->buf + oldlen, newtok);
                PyMem_FREE(newtok);
                tok->inp = tok->buf + newlen;
                tok->end = tok->inp + 1;
                tok->start = tok->buf + start;
            }
            else {
                /* No token in progress: the new line becomes the buffer. */
                tok->lineno++;
                if (tok->buf != NULL)
                    PyMem_FREE(tok->buf);
                tok->buf = newtok;
                tok->cur = tok->buf;
                tok->line_start = tok->buf;
                tok->inp = strchr(tok->buf, '\0');
                tok->end = tok->inp + 1;
            }
        }
        else {
            /* Non-interactive file input. */
            int done = 0;
            Py_ssize_t cur = 0;
            char *pt;
            if (tok->start == NULL) {
                /* No token in progress: overwrite the buffer. */
                if (tok->buf == NULL) {
                    tok->buf = (char *)
                        PyMem_MALLOC(BUFSIZ);
                    if (tok->buf == NULL) {
                        tok->done = E_NOMEM;
                        return EOF;
                    }
                    tok->end = tok->buf + BUFSIZ;
                }
                if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),
                          tok) == NULL) {
                    if (!tok->decoding_erred)
                        tok->done = E_EOF;
                    done = 1;
                }
                else {
                    tok->done = E_OK;
                    tok->inp = strchr(tok->buf, '\0');
                    done = tok->inp == tok->buf || tok->inp[-1] == '\n';
                }
            }
            else {
                /* Token in progress: keep the buffer, append below. */
                cur = tok->cur - tok->buf;
                if (decoding_feof(tok)) {
                    tok->done = E_EOF;
                    done = 1;
                }
                else
                    tok->done = E_OK;
            }
            tok->lineno++;
            /* Read until '\n' or EOF */
            while (!done) {
                /* Grow the buffer by BUFSIZ and refill; all saved
                   pointers are carried across the realloc as offsets. */
                Py_ssize_t curstart = tok->start == NULL ? -1 :
                          tok->start - tok->buf;
                Py_ssize_t cur_multi_line_start = tok->multi_line_start - tok->buf;
                Py_ssize_t curvalid = tok->inp - tok->buf;
                Py_ssize_t newsize = curvalid + BUFSIZ;
                char *newbuf = tok->buf;
                newbuf = (char *)PyMem_REALLOC(newbuf,
                                               newsize);
                if (newbuf == NULL) {
                    tok->done = E_NOMEM;
                    tok->cur = tok->inp;
                    return EOF;
                }
                tok->buf = newbuf;
                tok->cur = tok->buf + cur;
                tok->multi_line_start = tok->buf + cur_multi_line_start;
                tok->line_start = tok->cur;
                tok->inp = tok->buf + curvalid;
                tok->end = tok->buf + newsize;
                tok->start = curstart < 0 ? NULL :
                             tok->buf + curstart;
                if (decoding_fgets(tok->inp,
                               (int)(tok->end - tok->inp),
                               tok) == NULL) {
                    /* Break out early on decoding
                       errors, as tok->buf will be NULL
                     */
                    if (tok->decoding_erred)
                        return EOF;
                    /* Last line does not end in \n,
                       fake one */
                    if (tok->inp[-1] != '\n')
                        strcpy(tok->inp, "\n");
                }
                tok->inp = strchr(tok->inp, '\0');
                done = tok->inp[-1] == '\n';
            }
            if (tok->buf != NULL) {
                tok->cur = tok->buf + cur;
                tok->line_start = tok->cur;
                /* replace "\r\n" with "\n" */
                /* For Mac leave the \r, giving a syntax error */
                pt = tok->inp - 2;
                if (pt >= tok->buf && *pt == '\r') {
                    *pt++ = '\n';
                    *pt = '\0';
                    tok->inp = pt;
                }
            }
        }
        if (tok->done != E_OK) {
            if (tok->prompt != NULL)
                PySys_WriteStderr("\n");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    /*NOTREACHED*/
}
1022
1023
1024 /* Back-up one character */
1025
1026 static void
tok_backup(struct tok_state * tok,int c)1027 tok_backup(struct tok_state *tok, int c)
1028 {
1029 if (c != EOF) {
1030 if (--tok->cur < tok->buf) {
1031 Py_FatalError("tokenizer beginning of buffer");
1032 }
1033 if (*tok->cur != c) {
1034 *tok->cur = c;
1035 }
1036 }
1037 }
1038
1039
/* Set a SyntaxError at the current tokenizer position, with the
   message built from FORMAT and the varargs.  Always sets
   tok->done = E_ERROR and returns ERRORTOKEN. */
static int
syntaxerror(struct tok_state *tok, const char *format, ...)
{
    PyObject *errmsg, *errtext, *args;
    va_list vargs;
#ifdef HAVE_STDARG_PROTOTYPES
    va_start(vargs, format);
#else
    va_start(vargs);
#endif
    errmsg = PyUnicode_FromFormatV(format, vargs);
    va_end(vargs);
    if (!errmsg) {
        goto error;
    }

    /* Decode from line start to the cursor; its character length (not
       byte length) is the column offset reported to the user. */
    errtext = PyUnicode_DecodeUTF8(tok->line_start, tok->cur - tok->line_start,
                                   "replace");
    if (!errtext) {
        goto error;
    }
    int offset = (int)PyUnicode_GET_LENGTH(errtext);
    Py_ssize_t line_len = strcspn(tok->line_start, "\n");
    if (line_len != tok->cur - tok->line_start) {
        /* Show the whole physical line, not just up to the cursor. */
        Py_DECREF(errtext);
        errtext = PyUnicode_DecodeUTF8(tok->line_start, line_len,
                                       "replace");
    }
    if (!errtext) {
        goto error;
    }

    /* "N" transfers the errtext reference into the tuple. */
    args = Py_BuildValue("(O(OiiN))", errmsg,
                         tok->filename, tok->lineno, offset, errtext);
    if (args) {
        PyErr_SetObject(PyExc_SyntaxError, args);
        Py_DECREF(args);
    }

error:
    Py_XDECREF(errmsg);
    tok->done = E_ERROR;
    return ERRORTOKEN;
}
1084
1085 static int
indenterror(struct tok_state * tok)1086 indenterror(struct tok_state *tok)
1087 {
1088 tok->done = E_TABSPACE;
1089 tok->cur = tok->inp;
1090 return ERRORTOKEN;
1091 }
1092
/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.
   Returns 1 when valid; returns 0 and sets tok->done (and usually a
   SyntaxError) otherwise. */
static int
verify_identifier(struct tok_state *tok)
{
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    /* Index of the first char not valid in an identifier; equals the
       string length when the whole identifier is valid, and is
       negative when an internal error occurred. */
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            /* Point tok->cur just past the offending character so the
               reported error location is precise. */
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        // PyUnicode_FromFormatV() does not support %X
        char hex[9];
        (void)PyOS_snprintf(hex, sizeof(hex), "%04X", ch);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            syntaxerror(tok, "invalid character '%c' (U+%s)", ch, hex);
        }
        else {
            syntaxerror(tok, "invalid non-printable character U+%s", hex);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}
1148
/* Consume the remaining digits of a decimal literal, allowing single
   underscores between digit groups (PEP 515).  Returns the first
   character after the literal, or 0 after reporting a syntax error
   for a misplaced underscore. */
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c = tok_nextc(tok);
    for (;;) {
        while (isdigit(c)) {
            c = tok_nextc(tok);
        }
        if (c != '_') {
            break;
        }
        /* An underscore must be followed by at least one digit. */
        c = tok_nextc(tok);
        if (!isdigit(c)) {
            tok_backup(tok, c);
            syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
1170
1171 /* Get next token, after space stripping etc. */
1172
/* Fetch the next token from tok's input.

   Returns the token kind (NAME, NUMBER, STRING, OP, NEWLINE, INDENT,
   DEDENT, ENDMARKER, ..., or ERRORTOKEN with the failure reason stored
   in tok->done).  On return *p_start/*p_end delimit the token text
   inside the tokenizer's input buffer; for NEWLINE the trailing '\n'
   is excluded, and for INDENT/DEDENT the range is not meaningful.
   Uses tok_nextc()/tok_backup() as a one-character pushback stream. */
static int
tok_get(struct tok_state *tok, const char **p_start, const char **p_end)
{
    int c;
    int blankline, nonascii;

    *p_start = *p_end = NULL;
  nextline:
    tok->start = NULL;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;      /* column with tabs expanded to tok->tabsize */
        int altcol = 0;   /* column with the alternate tab size; comparing
                             both catches indentation whose meaning would
                             depend on the tab width (see indenterror) */
        tok->atbol = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014')  {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n' || c == '\\') {
            /* Lines with only whitespace and/or comments
               and/or a line continuation character
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        /* Indentation only matters outside parentheses (level == 0). */
        if (!blankline && tok->level == 0) {
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return ERRORTOKEN;
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return indenterror(tok);
                }
            }
        }
    }

    tok->start = tok->cur;

    /* Return pending indents/dedents */
    /* pendin counts queued INDENT (>0) or DEDENT (<0) tokens; emit one
       per call until the count reaches zero. */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            tok->pendin++;
            return DEDENT;
        }
        else {
            tok->pendin--;
            return INDENT;
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);
    /* Check if we are closing an async function */
    if (tok->async_def
        && !blankline
        /* Due to some implementation artifacts of type comments,
         * a TYPE_COMMENT at the start of a function won't set an
         * indentation level and it will produce a NEWLINE after it.
         * To avoid spuriously ending an async function due to this,
         * wait until we have some non-newline char in front of us. */
        && c != '\n'
        && tok->level == 0
        /* There was a NEWLINE after ASYNC DEF,
           so we're past the signature. */
        && tok->async_def_nl
        /* Current indentation level is less than where
           the async function was defined */
        && tok->async_def_indent >= tok->indent)
    {
        tok->async_def = 0;
        tok->async_def_indent = 0;
        tok->async_def_nl = 0;
    }

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {
        const char *prefix, *p, *type_start;

        while (c != EOF && c != '\n') {
            c = tok_nextc(tok);
        }

        if (tok->type_comments) {
            p = tok->start;
            prefix = type_comment_prefix;
            /* Match "# type: " where each space in the prefix stands for
               "zero or more spaces or tabs" in the input. */
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                    }
                } else if (*prefix == *p) {
                    p++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                const char *ignore_end = p + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    *p_start = ignore_end;
                    *p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return TYPE_IGNORE;
                } else {
                    *p_start = type_start;  /* after type_comment_prefix */
                    *p_end = tok->cur;
                    return TYPE_COMMENT;
                }
            }
        }
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                /* Prefix letters followed by a quote: string literal. */
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        /* Non-ASCII identifiers need a full Unicode validity check. */
        if (nonascii && !verify_identifier(tok)) {
            return ERRORTOKEN;
        }

        *p_start = tok->start;
        *p_end = tok->cur;

        /* async/await parsing block. */
        if (tok->cur - tok->start == 5 && tok->start[0] == 'a') {
            /* May be an 'async' or 'await' token.  For Python 3.7 or
               later we recognize them unconditionally.  For Python
               3.5 or 3.6 we recognize 'async' in front of 'def', and
               either one inside of 'async def'.  (Technically we
               shouldn't recognize these at all for 3.4 or earlier,
               but there's no *valid* Python 3.4 code that would be
               rejected, and async functions will be rejected in a
               later phase.) */
            if (!tok->async_hacks || tok->async_def) {
                /* Always recognize the keywords. */
                if (memcmp(tok->start, "async", 5) == 0) {
                    return ASYNC;
                }
                if (memcmp(tok->start, "await", 5) == 0) {
                    return AWAIT;
                }
            }
            else if (memcmp(tok->start, "async", 5) == 0) {
                /* The current token is 'async'.
                   Look ahead one token to see if that is 'def'. */

                struct tok_state ahead_tok;
                const char *ahead_tok_start = NULL;
                const char *ahead_tok_end = NULL;
                int ahead_tok_kind;

                /* Lookahead is done on a shallow copy of the tokenizer
                   state, so the real stream position is untouched. */
                memcpy(&ahead_tok, tok, sizeof(ahead_tok));
                ahead_tok_kind = tok_get(&ahead_tok, &ahead_tok_start,
                                         &ahead_tok_end);

                if (ahead_tok_kind == NAME
                    && ahead_tok.cur - ahead_tok.start == 3
                    && memcmp(ahead_tok.start, "def", 3) == 0)
                {
                    /* The next token is going to be 'def', so instead of
                       returning a plain NAME token, return ASYNC. */
                    tok->async_def_indent = tok->indent;
                    tok->async_def = 1;
                    return ASYNC;
                }
            }
        }

        return NAME;
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        /* Inside brackets or on a blank line, newlines are not tokens. */
        if (blankline || tok->level > 0) {
            goto nextline;
        }
        *p_start = tok->start;
        *p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        if (tok->async_def) {
            /* We're somewhere inside an 'async def' function, and
               we've encountered a NEWLINE after its signature. */
            tok->async_def_nl = 1;
        }
        return NEWLINE;
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (isdigit(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                *p_start = tok->start;
                *p_end = tok->cur;
                return ELLIPSIS;
            }
            else {
                tok_backup(tok, c);
            }
            /* Only two dots seen: push the second one back too. */
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        *p_start = tok->start;
        *p_end = tok->cur;
        return DOT;
    }

    /* Number */
    if (isdigit(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                /* Outer loop per underscore-separated digit group. */
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!isxdigit(c)) {
                        tok_backup(tok, c);
                        return syntaxerror(tok, "invalid hexadecimal literal");
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (isxdigit(c));
                } while (c == '_');
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        tok_backup(tok, c);
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c);
                        }
                        else {
                            return syntaxerror(tok, "invalid octal literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                /* A trailing 8 or 9 is an error, not a new token. */
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c);
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        tok_backup(tok, c);
                        if (isdigit(c)) {
                            return syntaxerror(tok,
                                    "invalid digit '%c' in binary literal", c);
                        }
                        else {
                            return syntaxerror(tok, "invalid binary literal");
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (isdigit(c)) {
                    return syntaxerror(tok,
                            "invalid digit '%c' in binary literal", c);
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                if (isdigit(c)) {
                    /* Nonzero digit after leading zeros: only legal if this
                       turns out to be a float/imaginary literal below. */
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return syntaxerror(tok,
                                       "leading zeros in decimal integer "
                                       "literals are not permitted; "
                                       "use an 0o prefix for octal integers");
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return ERRORTOKEN;
            }
            {
                /* Accept floating point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (isdigit(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return ERRORTOKEN;
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
          exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!isdigit(c)) {
                            tok_backup(tok, c);
                            return syntaxerror(tok, "invalid decimal literal");
                        }
                    } else if (!isdigit(c)) {
                        /* Bare 'e'/'E' is not an exponent: push both chars
                           back and end the number before the 'e'. */
                        tok_backup(tok, c);
                        tok_backup(tok, e);
                        *p_start = tok->start;
                        *p_end = tok->cur;
                        return NUMBER;
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return ERRORTOKEN;
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                }
            }
        }
        tok_backup(tok, c);
        *p_start = tok->start;
        *p_end = tok->cur;
        return NUMBER;
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;         /* consecutive closing quotes seen */

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (c == EOF) {
                if (quote_size == 3) {
                    tok->done = E_EOFS;
                }
                else {
                    tok->done = E_EOLS;
                }
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (quote_size == 1 && c == '\n') {
                /* Single-quoted strings may not span lines. */
                tok->done = E_EOLS;
                tok->cur = tok->inp;
                return ERRORTOKEN;
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    tok_nextc(tok);  /* skip escaped char */
                }
            }
        }

        *p_start = tok->start;
        *p_end = tok->cur;
        return STRING;
    }

    /* Line continuation */
    if (c == '\\') {
        c = tok_nextc(tok);
        if (c != '\n') {
            tok->done = E_LINECONT;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        }
        c = tok_nextc(tok);
        if (c == EOF) {
            tok->done = E_EOF;
            tok->cur = tok->inp;
            return ERRORTOKEN;
        } else {
            tok_backup(tok, c);
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int token = PyToken_TwoChars(c, c2);
        if (token != OP) {
            /* A two-char operator may extend to three chars (e.g. '**='). */
            int c3 = tok_nextc(tok);
            int token3 = PyToken_ThreeChars(c, c2, c3);
            if (token3 != OP) {
                token = token3;
            }
            else {
                tok_backup(tok, c3);
            }
            *p_start = tok->start;
            *p_end = tok->cur;
            return token;
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return syntaxerror(tok, "too many nested parentheses");
        }
        /* Remember the bracket and its line for mismatch diagnostics. */
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->level++;
        break;
    case ')':
    case ']':
    case '}':
        if (!tok->level) {
            return syntaxerror(tok, "unmatched '%c'", c);
        }
        tok->level--;
        int opening = tok->parenstack[tok->level];
        if (!((opening == '(' && c == ')') ||
              (opening == '[' && c == ']') ||
              (opening == '{' && c == '}')))
        {
            /* Mention the opening line only when it differs. */
            if (tok->parenlinenostack[tok->level] != tok->lineno) {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c' on line %d",
                        c, opening, tok->parenlinenostack[tok->level]);
            }
            else {
                return syntaxerror(tok,
                        "closing parenthesis '%c' does not match "
                        "opening parenthesis '%c'",
                        c, opening);
            }
        }
        break;
    }

    /* Punctuation character */
    *p_start = tok->start;
    *p_end = tok->cur;
    return PyToken_OneChar(c);
}
1835
1836 int
PyTokenizer_Get(struct tok_state * tok,const char ** p_start,const char ** p_end)1837 PyTokenizer_Get(struct tok_state *tok, const char **p_start, const char **p_end)
1838 {
1839 int result = tok_get(tok, p_start, p_end);
1840 if (tok->decoding_erred) {
1841 result = ERRORTOKEN;
1842 tok->done = E_DECODE;
1843 }
1844 return result;
1845 }
1846
1847 /* Get the encoding of a Python file. Check for the coding cookie and check if
1848 the file starts with a BOM.
1849
1850 PyTokenizer_FindEncodingFilename() returns NULL when it can't find the
1851 encoding in the first or second line of the file (in which case the encoding
1852 should be assumed to be UTF-8).
1853
1854 The char* returned is malloc'ed via PyMem_MALLOC() and thus must be freed
1855 by the caller. */
1856
1857 char *
PyTokenizer_FindEncodingFilename(int fd,PyObject * filename)1858 PyTokenizer_FindEncodingFilename(int fd, PyObject *filename)
1859 {
1860 struct tok_state *tok;
1861 FILE *fp;
1862 const char *p_start = NULL;
1863 const char *p_end = NULL;
1864 char *encoding = NULL;
1865
1866 fd = _Py_dup(fd);
1867 if (fd < 0) {
1868 return NULL;
1869 }
1870
1871 fp = fdopen(fd, "r");
1872 if (fp == NULL) {
1873 return NULL;
1874 }
1875 tok = PyTokenizer_FromFile(fp, NULL, NULL, NULL);
1876 if (tok == NULL) {
1877 fclose(fp);
1878 return NULL;
1879 }
1880 if (filename != NULL) {
1881 Py_INCREF(filename);
1882 tok->filename = filename;
1883 }
1884 else {
1885 tok->filename = PyUnicode_FromString("<string>");
1886 if (tok->filename == NULL) {
1887 fclose(fp);
1888 PyTokenizer_Free(tok);
1889 return encoding;
1890 }
1891 }
1892 while (tok->lineno < 2 && tok->done == E_OK) {
1893 PyTokenizer_Get(tok, &p_start, &p_end);
1894 }
1895 fclose(fp);
1896 if (tok->encoding) {
1897 encoding = (char *)PyMem_MALLOC(strlen(tok->encoding) + 1);
1898 if (encoding)
1899 strcpy(encoding, tok->encoding);
1900 }
1901 PyTokenizer_Free(tok);
1902 return encoding;
1903 }
1904
1905 char *
PyTokenizer_FindEncoding(int fd)1906 PyTokenizer_FindEncoding(int fd)
1907 {
1908 return PyTokenizer_FindEncodingFilename(fd, NULL);
1909 }
1910
1911 #ifdef Py_DEBUG
1912
1913 void
tok_dump(int type,char * start,char * end)1914 tok_dump(int type, char *start, char *end)
1915 {
1916 printf("%s", _PyParser_TokenNames[type]);
1917 if (type == NAME || type == NUMBER || type == STRING || type == OP)
1918 printf("(%.*s)", (int)(end - start), start);
1919 }
1920
1921 #endif
1922