#include "Python.h"
#include "pycore_token.h"
#include "pycore_unicodeobject.h"
#include "errcode.h"

#include "state.h"
#include "../tokenizer/helpers.h"

/* Alternate tab spacing. A tab size of 1 gives a second reading of the
   indentation ("altcol" below); if the two readings disagree across lines,
   the tab/space mix is ambiguous and an indentation error is raised. */
#define ALTTABSIZE 1

#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || c == '_'\
               || (c >= 128))

#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || (c >= '0' && c <= '9')\
               || c == '_'\
               || (c >= 128))

#ifdef Py_DEBUG
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#endif

#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
                _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))

/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
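/* For example, "#type:int", "# type:  int" and "#   type: int" all match
   the prefix above, since each space in it matches any run of spaces and
   tabs, including none. */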

static inline int
contains_null_bytes(const char* str, size_t size)
{
    return memchr(str, 0, size) != NULL;
}

/* Get next char, updating state; error code goes into tok->done */
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
                tok->done = E_COLUMNOVERFLOW;
                return EOF;
            }
            tok->col_offset++;
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        rc = tok->underflow(tok);
#if defined(Py_DEBUG)
        if (tok->debug) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, " tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}

/* Back-up one character */
static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
            Py_FatalError("tok_backup: wrong character");
        }
        tok->col_offset--;
    }
}

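/* Record the source text of the last f-string expression as token metadata.
   This feeds f-string "debug" expressions such as f"{x=}", where the text of
   the expression must be echoed into the output; any '#' comment inside the
   expression is stripped up to the following newline. */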
static int
set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
    assert(token != NULL);
    assert(c == '}' || c == ':' || c == '!');
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    if (!tok_mode->f_string_debug || token->metadata) {
        return 0;
    }
    PyObject *res = NULL;

    // Check if there is a # character in the expression
    int hash_detected = 0;
    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
        if (tok_mode->last_expr_buffer[i] == '#') {
            hash_detected = 1;
            break;
        }
    }

    if (hash_detected) {
        Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end;
        char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char));
        if (!result) {
            return -1;
        }

        Py_ssize_t i = 0;
        Py_ssize_t j = 0;

        for (i = 0, j = 0; i < input_length; i++) {
            if (tok_mode->last_expr_buffer[i] == '#') {
                // Skip characters until newline or end of string
                // (bounds check first, to avoid reading past the buffer)
                while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') {
                    if (tok_mode->last_expr_buffer[i] == '\n') {
                        result[j++] = tok_mode->last_expr_buffer[i];
                        break;
                    }
                    i++;
                }
            } else {
                result[j++] = tok_mode->last_expr_buffer[i];
            }
        }

        result[j] = '\0';  // Null-terminate the result string
        res = PyUnicode_DecodeUTF8(result, j, NULL);
        PyMem_Free(result);
    } else {
        res = PyUnicode_DecodeUTF8(
            tok_mode->last_expr_buffer,
            tok_mode->last_expr_size - tok_mode->last_expr_end,
            NULL
        );
    }

    if (!res) {
        return -1;
    }
    token->metadata = res;
    return 0;
}

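/* Keep the running buffer of f-string expression text in sync with the lexer
   position: '{' starts a fresh buffer, a NUL (end of the current line)
   appends the rest of the line to it, and '}'/'!'/':' record where the
   expression proper ends. Returns 0 only on memory error. */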
int
_PyLexer_update_fstring_expr(struct tok_state *tok, char cur)
{
    assert(tok->cur != NULL);

    Py_ssize_t size = strlen(tok->cur);
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    switch (cur) {
        case 0:
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
                return 1;
            }
            char *new_buffer = PyMem_Realloc(
                tok_mode->last_expr_buffer,
                tok_mode->last_expr_size + size
            );
            if (new_buffer == NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
                goto error;
            }
            tok_mode->last_expr_buffer = new_buffer;
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
            tok_mode->last_expr_size += size;
            break;
        case '{':
            if (tok_mode->last_expr_buffer != NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
            }
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
            if (tok_mode->last_expr_buffer == NULL) {
                goto error;
            }
            tok_mode->last_expr_size = size;
            tok_mode->last_expr_end = -1;
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
            break;
        case '}':
        case '!':
        case ':':
            if (tok_mode->last_expr_end == -1) {
                tok_mode->last_expr_end = strlen(tok->start);
            }
            break;
        default:
            Py_UNREACHABLE();
    }
    return 1;
error:
    tok->done = E_NOMEM;
    return 0;
}

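/* Check whether the input continues with the string `test` followed by a
   non-identifier character; e.g. lookahead(tok, "nd") after consuming an 'a'
   detects the keyword "and". The consumed characters are pushed back either
   way. */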
static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}

static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
    if (tok->tok_extra_tokens) {
        // When we are parsing extra tokens, we don't want to emit warnings
        // about invalid literals, because we want to be a bit more liberal.
        return 1;
    }
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of the keywords which can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * This allows existing valid code to be deprecated gradually, without
     * adding a warning before the error in most cases of invalid numeric
     * literals (which would be confusing and break existing tests).
     * Raise a syntax error with a slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * another keyword or an identifier.
     */
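    /* For example, "1if x else y" currently produces a SyntaxWarning for the
       "1if" part, while "1xyz" raises a SyntaxError immediately. */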
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        tok_backup(tok, c);
        if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning,
                "invalid %s literal", kind))
        {
            return 0;
        }
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (c < 128 && is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}

/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects. */
static int
verify_identifier(struct tok_state *tok)
{
    if (tok->tok_extra_tokens) {
        return 1;
    }
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
        }
        else {
            _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}

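/* Consume the digits of a decimal literal, allowing single underscores
   between digit groups: "1_000" is accepted, while "1__000" or a trailing
   "1_" are reported as invalid decimal literals. Returns the first character
   after the literal, or 0 on error. */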
static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (Py_ISDIGIT(c));
        if (c != '_') {
            break;
        }
        c = tok_nextc(tok);
        if (!Py_ISDIGIT(c)) {
            tok_backup(tok, c);
            _PyTokenizer_syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}

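/* Consume the line break after a backslash continuation (accepting both LF
   and CRLF), then peek at the first character of the next line. That
   character is pushed back and returned; on error, -1 is returned with
   tok->done set to E_LINECONT or E_EOF. */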
static inline int
tok_continuation_line(struct tok_state *tok) {
    int c = tok_nextc(tok);
    if (c == '\r') {
        c = tok_nextc(tok);
    }
    if (c != '\n') {
        tok->done = E_LINECONT;
        return -1;
    }
    c = tok_nextc(tok);
    if (c == EOF) {
        tok->done = E_EOF;
        tok->cur = tok->inp;
        return -1;
    } else {
        tok_backup(tok, c);
    }
    return c;
}

static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    int c;
    int blankline, nonascii;

    const char *p_start = NULL;
    const char *p_end = NULL;
  nextline:
    tok->start = NULL;
    tok->starting_col_offset = -1;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;
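        /* "col" measures the indentation with the configured tab size and
           "altcol" with ALTTABSIZE; a line whose relative indentation differs
           between the two readings mixes tabs and spaces ambiguously and is
           rejected below via _PyTokenizer_indenterror(). */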
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') {/* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n' || c == '\r') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                       col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
            }
        }
    }

    tok->start = tok->cur;
    tok->starting_col_offset = tok->col_offset;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            if (tok->tok_extra_tokens) {
                p_start = tok->cur;
                p_end = tok->cur;
            }
            tok->pendin++;
            return MAKE_TOKEN(DEDENT);
        }
        else {
            if (tok->tok_extra_tokens) {
                p_start = tok->buf;
                p_end = tok->cur;
            }
            tok->pendin--;
            return MAKE_TOKEN(INDENT);
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);

  again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
    tok->starting_col_offset = tok->col_offset - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {

        const char* p = NULL;
        const char *prefix, *type_start;
        int current_starting_col_offset;

        while (c != EOF && c != '\n' && c != '\r') {
            c = tok_nextc(tok);
        }

        if (tok->tok_extra_tokens) {
            p = tok->start;
        }

        if (tok->type_comments) {
            p = tok->start;
            current_starting_col_offset = tok->starting_col_offset;
            prefix = type_comment_prefix;
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                        current_starting_col_offset++;
                    }
                } else if (*prefix == *p) {
                    p++;
                    current_starting_col_offset++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                // +6 in order to skip the word 'ignore'
                const char *ignore_end = p + 6;
                const int ignore_end_col_offset = current_starting_col_offset + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    p_start = ignore_end;
                    p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
                } else {
                    p_start = type_start;
                    p_end = tok->cur;
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
                }
            }
        }
        if (tok->tok_extra_tokens) {
            tok_backup(tok, c);  /* don't eat the newline or EOF */
            p_start = p;
            p_end = tok->cur;
            tok->comment_newline = blankline;
            return MAKE_TOKEN(COMMENT);
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return MAKE_TOKEN(ENDMARKER);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        if (tok->level) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
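        /* "b" and "f" each combine with "r" in any order and case (rb"",
           bR"", fR"", Rf""), but not with each other; "u" must stand alone,
           so ur"" and ru"" are rejected by the checks below. */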
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u'|| c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                if (saw_f) {
                    goto f_string_quote;
                }
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        if (nonascii && !verify_identifier(tok)) {
            return MAKE_TOKEN(ERRORTOKEN);
        }

        p_start = tok->start;
        p_end = tok->cur;

        return MAKE_TOKEN(NAME);
    }

    if (c == '\r') {
        c = tok_nextc(tok);
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            if (tok->tok_extra_tokens) {
                if (tok->comment_newline) {
                    tok->comment_newline = 0;
                }
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(NL);
            }
            goto nextline;
        }
        if (tok->comment_newline && tok->tok_extra_tokens) {
            tok->comment_newline = 0;
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(NL);
        }
        p_start = tok->start;
        p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return MAKE_TOKEN(NEWLINE);
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (Py_ISDIGIT(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(ELLIPSIS);
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(DOT);
    }

    /* Number */
    if (Py_ISDIGIT(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!Py_ISXDIGIT(c)) {
                        tok_backup(tok, c);
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal"));
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (Py_ISXDIGIT(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (Py_ISDIGIT(c)) {
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                                "invalid digit '%c' in octal literal", c));
                        }
                        else {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal"));
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (Py_ISDIGIT(c)) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                        "invalid digit '%c' in octal literal", c));
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (Py_ISDIGIT(c)) {
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
                        }
                        else {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal"));
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (Py_ISDIGIT(c)) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!Py_ISDIGIT(c)) {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                char* zeros_end = tok->cur;
                if (Py_ISDIGIT(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero && !tok->tok_extra_tokens) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range(
                        tok, (int)(tok->start + 1 - tok->line_start),
                        (int)(zeros_end - tok->line_start),
                        "leading zeros in decimal integer "
                        "literals are not permitted; "
                        "use an 0o prefix for octal integers"));
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return MAKE_TOKEN(ERRORTOKEN);
            }
            {
                /* Accept floating-point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
  fraction:
                    /* Fraction */
                    if (Py_ISDIGIT(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return MAKE_TOKEN(ERRORTOKEN);
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!Py_ISDIGIT(c)) {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
                        }
                    } else if (!Py_ISDIGIT(c)) {
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return MAKE_TOKEN(ERRORTOKEN);
                        }
                        tok_backup(tok, e);
                        p_start = tok->start;
                        p_end = tok->cur;
                        return MAKE_TOKEN(NUMBER);
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
  imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
        }
        tok_backup(tok, c);
        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(NUMBER);
    }

  f_string_quote:
    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        int after_quote = tok_nextc(tok);
        if (after_quote == quote) {
            int after_after_quote = tok_nextc(tok);
            if (after_after_quote == quote) {
                quote_size = 3;
            }
            else {
                // TODO: Check this
                tok_backup(tok, after_after_quote);
                tok_backup(tok, after_quote);
            }
        }
        if (after_quote != quote) {
            tok_backup(tok, after_quote);
        }

        p_start = tok->start;
        p_end = tok->cur;
        if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings"));
        }
        tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
        the_current_tok->kind = TOK_FSTRING_MODE;
        the_current_tok->f_string_quote = quote;
        the_current_tok->f_string_quote_size = quote_size;
        the_current_tok->f_string_start = tok->start;
        the_current_tok->f_string_multi_line_start = tok->line_start;
        the_current_tok->f_string_line_start = tok->lineno;
        the_current_tok->f_string_start_offset = -1;
        the_current_tok->f_string_multi_line_start_offset = -1;
        the_current_tok->last_expr_buffer = NULL;
        the_current_tok->last_expr_size = 0;
        the_current_tok->last_expr_end = -1;
        the_current_tok->in_format_spec = 0;
        the_current_tok->f_string_debug = 0;

        switch (*tok->start) {
            case 'F':
            case 'f':
                the_current_tok->f_string_raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
                break;
            case 'R':
            case 'r':
                the_current_tok->f_string_raw = 1;
                break;
            default:
                Py_UNREACHABLE();
        }

        the_current_tok->curly_bracket_depth = 0;
        the_current_tok->curly_bracket_expr_start_depth = -1;
        return MAKE_TOKEN(FSTRING_START);
    }

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;
        int has_escaped_quote = 0;

        /* Nodes of type STRING, especially multi line strings
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (tok->done == E_ERROR) {
                return MAKE_TOKEN(ERRORTOKEN);
            }
            if (tok->done == E_DECODE) {
                break;
            }
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;

                if (INSIDE_FSTRING(tok)) {
                    /* When we are in an f-string, before raising the
                     * unterminated string literal error, check whether the
                     * initial quote matches the f-string's quote; if it
                     * does, this must be a missing '}' token, so raise the
                     * proper error. */
                    tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
                    if (the_current_tok->f_string_quote == quote &&
                        the_current_tok->f_string_quote_size == quote_size) {
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expecting '}'", start));
                    }
                }

                if (quote_size == 3) {
                    _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                else {
                    if (has_escaped_quote) {
                        _PyTokenizer_syntaxerror(
                            tok,
                            "unterminated string literal (detected at line %d); "
                            "perhaps you escaped the end quote?",
                            start
                        );
                    } else {
                        _PyTokenizer_syntaxerror(
                            tok, "unterminated string literal (detected at line %d)", start
                        );
                    }
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    c = tok_nextc(tok);  /* skip escaped char */
                    if (c == quote) {    /* but record whether the escaped char was a quote */
                        has_escaped_quote = 1;
                    }
                    if (c == '\r') {
                        c = tok_nextc(tok);
                    }
                }
            }
        }

        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(STRING);
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Punctuation character */
    int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
    if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
        /* This code block gets executed before curly_bracket_depth is
         * incremented by the '{' case, so to check that we are at the 0th
         * level we need to adjust it manually. */
        int cursor = current_tok->curly_bracket_depth - (c != '{');
        int in_format_spec = current_tok->in_format_spec;
        int cursor_in_format_with_debug =
            cursor == 1 && (current_tok->f_string_debug || in_format_spec);
        int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
        if ((cursor_valid) && !_PyLexer_update_fstring_expr(tok, c)) {
            return MAKE_TOKEN(ENDMARKER);
        }
        if ((cursor_valid) && c != '{' && set_fstring_expr(tok, token, c)) {
            return MAKE_TOKEN(ERRORTOKEN);
        }

        if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
            current_tok->kind = TOK_FSTRING_MODE;
            current_tok->in_format_spec = 1;
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(_PyToken_OneChar(c));
        }
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int current_token = _PyToken_TwoChars(c, c2);
        if (current_token != OP) {
            int c3 = tok_nextc(tok);
            int current_token3 = _PyToken_ThreeChars(c, c2, c3);
            if (current_token3 != OP) {
                current_token = current_token3;
            }
            else {
                tok_backup(tok, c3);
            }
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(current_token);
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses"));
        }
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        if (INSIDE_FSTRING(tok)) {
            current_tok->curly_bracket_depth++;
        }
        break;
    case ')':
    case ']':
    case '}':
        if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: single '}' is not allowed"));
        }
        if (!tok->tok_extra_tokens && !tok->level) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c));
        }
        if (tok->level > 0) {
            tok->level--;
            int opening = tok->parenstack[tok->level];
            if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
                                            (opening == '[' && c == ']') ||
                                            (opening == '{' && c == '}'))) {
                /* If the opening bracket belongs to an f-string's expression
                   part (e.g. f"{)}") and the closing bracket is an arbitrary
                   nested expression, then instead of matching a different
                   syntactical construct with it, we'll throw an unmatched
                   parentheses error. */
                if (INSIDE_FSTRING(tok) && opening == '{') {
                    assert(current_tok->curly_bracket_depth >= 0);
                    int previous_bracket = current_tok->curly_bracket_depth - 1;
                    if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: unmatched '%c'", c));
                    }
                }
                if (tok->parenlinenostack[tok->level] != tok->lineno) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                            "closing parenthesis '%c' does not match "
                            "opening parenthesis '%c' on line %d",
                            c, opening, tok->parenlinenostack[tok->level]));
                }
                else {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                            "closing parenthesis '%c' does not match "
                            "opening parenthesis '%c'",
                            c, opening));
                }
            }
        }

        if (INSIDE_FSTRING(tok)) {
            current_tok->curly_bracket_depth--;
            if (current_tok->curly_bracket_depth < 0) {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: unmatched '%c'", c));
            }
            if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
                current_tok->curly_bracket_expr_start_depth--;
                current_tok->kind = TOK_FSTRING_MODE;
                current_tok->in_format_spec = 0;
                current_tok->f_string_debug = 0;
            }
        }
        break;
    default:
        break;
    }

    if (!Py_UNICODE_ISPRINTABLE(c)) {
        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
    }

    if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
        current_tok->f_string_debug = 1;
    }

    /* Punctuation character */
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(_PyToken_OneChar(c));
}

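/* Tokenize the literal ("middle") part of an f-string: everything up to the
   next '{', the matching closing quote(s), or an error. Expression parts are
   handled by switching back to tok_get_normal_mode() via the mode stack. */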
static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    const char *p_start = NULL;
    const char *p_end = NULL;
    int end_quote_size = 0;
    int unicode_escape = 0;

    tok->start = tok->cur;
    tok->first_lineno = tok->lineno;
    tok->starting_col_offset = tok->col_offset;

    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
    // before it.
    int start_char = tok_nextc(tok);
    if (start_char == '{') {
        int peek1 = tok_nextc(tok);
        tok_backup(tok, peek1);
        tok_backup(tok, start_char);
        if (peek1 != '{') {
            current_tok->curly_bracket_expr_start_depth++;
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
            }
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
            return tok_get_normal_mode(tok, current_tok, token);
        }
    }
    else {
        tok_backup(tok, start_char);
    }

    // Check if we are at the end of the string
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        int quote = tok_nextc(tok);
        if (quote != current_tok->f_string_quote) {
            tok_backup(tok, quote);
            goto f_string_middle;
        }
    }

    if (current_tok->last_expr_buffer != NULL) {
        PyMem_Free(current_tok->last_expr_buffer);
        current_tok->last_expr_buffer = NULL;
        current_tok->last_expr_size = 0;
        current_tok->last_expr_end = -1;
    }

    p_start = tok->start;
    p_end = tok->cur;
    tok->tok_mode_stack_index--;
    return MAKE_TOKEN(FSTRING_END);

  f_string_middle:

    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
    // this.
    tok->multi_line_start = tok->line_start;
    while (end_quote_size != current_tok->f_string_quote_size) {
        int c = tok_nextc(tok);
        if (tok->done == E_ERROR || tok->done == E_DECODE) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        int in_format_spec = (
            current_tok->in_format_spec
            &&
            INSIDE_FSTRING_EXPR(current_tok)
        );

        if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
            if (tok->decoding_erred) {
                return MAKE_TOKEN(ERRORTOKEN);
            }

            // If we are in a format spec and we found a newline,
            // it means that the format spec ends here and we should
            // return to the regular mode.
            if (in_format_spec && c == '\n') {
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }

            assert(tok->multi_line_start != NULL);
            // shift the tok_state's location into
            // the start of string, and report the error
            // from the initial quote character
            tok->cur = (char *)current_tok->f_string_start;
            tok->cur++;
            tok->line_start = current_tok->f_string_multi_line_start;
            int start = tok->lineno;

            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
            tok->lineno = the_current_tok->f_string_line_start;

            if (current_tok->f_string_quote_size == 3) {
                _PyTokenizer_syntaxerror(tok,
                                    "unterminated triple-quoted f-string literal"
                                    " (detected at line %d)", start);
                if (c != '\n') {
                    tok->done = E_EOFS;
                }
                return MAKE_TOKEN(ERRORTOKEN);
            }
            else {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                    "unterminated f-string literal (detected at"
                    " line %d)", start));
            }
        }

        if (c == current_tok->f_string_quote) {
            end_quote_size += 1;
            continue;
        } else {
            end_quote_size = 0;
        }

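        /* A doubled brace ("{{" or "}}") is a literal brace and remains part
           of the FSTRING_MIDDLE text; a single '{' starts a nested expression
           and hands control back to the regular tokenizer. */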
        if (c == '{') {
            if (!_PyLexer_update_fstring_expr(tok, c)) {
                return MAKE_TOKEN(ENDMARKER);
            }
            int peek = tok_nextc(tok);
            if (peek != '{' || in_format_spec) {
                tok_backup(tok, peek);
                tok_backup(tok, c);
                current_tok->curly_bracket_expr_start_depth++;
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
                }
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            } else {
                p_start = tok->start;
                p_end = tok->cur - 1;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '}') {
            if (unicode_escape) {
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }
            int peek = tok_nextc(tok);

            // The tokenizer can only be in the format spec if we have already completed the expression
            // scanning (indicated by the end of the expression being set) and we are not at the top level
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
            // brackets, we can bypass it here.
            int cursor = current_tok->curly_bracket_depth;
            if (peek == '}' && !in_format_spec && cursor == 0) {
                p_start = tok->start;
                p_end = tok->cur - 1;
            } else {
                tok_backup(tok, peek);
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '\\') {
            int peek = tok_nextc(tok);
            if (peek == '\r') {
                peek = tok_nextc(tok);
            }
            // Special case when the backslash is right before a curly
            // brace. We have to restore and return the control back
            // to the loop for the next iteration.
            if (peek == '{' || peek == '}') {
                if (!current_tok->f_string_raw) {
                    if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                tok_backup(tok, peek);
                continue;
            }

            if (!current_tok->f_string_raw) {
                if (peek == 'N') {
                    /* Handle named unicode escapes (\N{BULLET}) */
                    peek = tok_nextc(tok);
                    if (peek == '{') {
                        unicode_escape = 1;
                    } else {
                        tok_backup(tok, peek);
                    }
                }
            } /* else {
                 skip the escaped character
            } */
        }
    }

    // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        tok_backup(tok, current_tok->f_string_quote);
    }
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(FSTRING_MIDDLE);
}

static int
tok_get(struct tok_state *tok, struct token *token)
{
    tokenizer_mode *current_tok = TOK_GET_MODE(tok);
    if (current_tok->kind == TOK_REGULAR_MODE) {
        return tok_get_normal_mode(tok, current_tok, token);
    } else {
        return tok_get_fstring_mode(tok, current_tok, token);
    }
}

int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
    int result = tok_get(tok, token);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}

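/* A minimal sketch of how a caller might drive this lexer; the tok_state is
   assumed to be constructed elsewhere (e.g. by the string or file tokenizer
   constructors in the neighbouring tokenizer/ directory), and
   _PyToken_Init/_PyToken_Free are the pegen token helpers:

       struct token token;
       _PyToken_Init(&token);
       int type;
       do {
           type = _PyTokenizer_Get(tok, &token);
           // consume `type` and the token's [token.start, token.end) span
       } while (type != ENDMARKER && type != ERRORTOKEN);
       _PyToken_Free(&token);
*/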