#include "Python.h"
#include "pycore_token.h"
#include "pycore_unicodeobject.h"
#include "errcode.h"

#include "state.h"
#include "../tokenizer/helpers.h"

/* Alternate tab spacing */
#define ALTTABSIZE 1
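/* Indentation is tracked in two parallel columns: `col`, computed with the
   configurable tab size (tok->tabsize), and `altcol`, computed with
   ALTTABSIZE.  Since ALTTABSIZE is 1, a tab advances altcol by exactly one
   column, so two lines that only line up for one particular tab size end up
   with equal `col` but different `altcol` values, which the indentation code
   in tok_get_normal_mode() reports via _PyTokenizer_indenterror(). */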

#define is_potential_identifier_start(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || c == '_'\
               || (c >= 128))

#define is_potential_identifier_char(c) (\
              (c >= 'a' && c <= 'z')\
               || (c >= 'A' && c <= 'Z')\
               || (c >= '0' && c <= '9')\
               || c == '_'\
               || (c >= 128))
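
/* Any byte >= 128 is provisionally accepted here so that UTF-8-encoded
   non-ASCII identifiers can flow through the fast path; the full PEP 3131
   validation happens later in verify_identifier(). */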

#ifdef Py_DEBUG
static inline tokenizer_mode* TOK_GET_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[tok->tok_mode_stack_index]);
}
static inline tokenizer_mode* TOK_NEXT_MODE(struct tok_state* tok) {
    assert(tok->tok_mode_stack_index >= 0);
    assert(tok->tok_mode_stack_index + 1 < MAXFSTRINGLEVEL);
    return &(tok->tok_mode_stack[++tok->tok_mode_stack_index]);
}
#else
#define TOK_GET_MODE(tok) (&(tok->tok_mode_stack[tok->tok_mode_stack_index]))
#define TOK_NEXT_MODE(tok) (&(tok->tok_mode_stack[++tok->tok_mode_stack_index]))
#endif

#define MAKE_TOKEN(token_type) _PyLexer_token_setup(tok, token, token_type, p_start, p_end)
#define MAKE_TYPE_COMMENT_TOKEN(token_type, col_offset, end_col_offset) (\
                _PyLexer_type_comment_token_setup(tok, token, token_type, col_offset, end_col_offset, p_start, p_end))
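
/* Both macros capture `tok`, `token`, `p_start` and `p_end` from the
   enclosing scope, so they are only usable inside functions that declare
   those locals (the tok_get_* functions below). */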

/* Spaces in this constant are treated as "zero or more spaces or tabs" when
   tokenizing. */
static const char* type_comment_prefix = "# type: ";
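
/* So, for example, both "# type: int" and "#   type:    ignore" match,
   because each space in type_comment_prefix matches any run of spaces and
   tabs in the source (see the matching loop in tok_get_normal_mode()). */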

static inline int
contains_null_bytes(const char* str, size_t size)
{
    return memchr(str, 0, size) != NULL;
}

/* Get next char, updating state; error code goes into tok->done */
static int
tok_nextc(struct tok_state *tok)
{
    int rc;
    for (;;) {
        if (tok->cur != tok->inp) {
            if ((unsigned int) tok->col_offset >= (unsigned int) INT_MAX) {
                tok->done = E_COLUMNOVERFLOW;
                return EOF;
            }
            tok->col_offset++;
            return Py_CHARMASK(*tok->cur++); /* Fast path */
        }
        if (tok->done != E_OK) {
            return EOF;
        }
        rc = tok->underflow(tok);
#if defined(Py_DEBUG)
        if (tok->debug) {
            fprintf(stderr, "line[%d] = ", tok->lineno);
            _PyTokenizer_print_escape(stderr, tok->cur, tok->inp - tok->cur);
            fprintf(stderr, "  tok->done = %d\n", tok->done);
        }
#endif
        if (!rc) {
            tok->cur = tok->inp;
            return EOF;
        }
        tok->line_start = tok->cur;

        if (contains_null_bytes(tok->line_start, tok->inp - tok->line_start)) {
            _PyTokenizer_syntaxerror(tok, "source code cannot contain null bytes");
            tok->cur = tok->inp;
            return EOF;
        }
    }
    Py_UNREACHABLE();
}

/* Back-up one character */
static void
tok_backup(struct tok_state *tok, int c)
{
    if (c != EOF) {
        if (--tok->cur < tok->buf) {
            Py_FatalError("tokenizer beginning of buffer");
        }
        if ((int)(unsigned char)*tok->cur != Py_CHARMASK(c)) {
            Py_FatalError("tok_backup: wrong character");
        }
        tok->col_offset--;
    }
}
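
/* tok_nextc()/tok_backup() form a one-character pushback interface; callers
   that need more lookahead (e.g. lookahead() below) push several characters
   back by calling tok_backup() in reverse order. */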

static int
set_fstring_expr(struct tok_state* tok, struct token *token, char c) {
    assert(token != NULL);
    assert(c == '}' || c == ':' || c == '!');
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    if (!tok_mode->f_string_debug || token->metadata) {
        return 0;
    }
    PyObject *res = NULL;

    // Check if there is a # character in the expression
    int hash_detected = 0;
    for (Py_ssize_t i = 0; i < tok_mode->last_expr_size - tok_mode->last_expr_end; i++) {
        if (tok_mode->last_expr_buffer[i] == '#') {
            hash_detected = 1;
            break;
        }
    }

    if (hash_detected) {
        Py_ssize_t input_length = tok_mode->last_expr_size - tok_mode->last_expr_end;
        char *result = (char *)PyMem_Malloc((input_length + 1) * sizeof(char));
        if (!result) {
            return -1;
        }

        Py_ssize_t i = 0;
        Py_ssize_t j = 0;

        for (i = 0, j = 0; i < input_length; i++) {
            if (tok_mode->last_expr_buffer[i] == '#') {
                // Skip characters until newline or end of string
                while (i < input_length && tok_mode->last_expr_buffer[i] != '\0') {
                    if (tok_mode->last_expr_buffer[i] == '\n') {
                        result[j++] = tok_mode->last_expr_buffer[i];
                        break;
                    }
                    i++;
                }
            } else {
                result[j++] = tok_mode->last_expr_buffer[i];
            }
        }

        result[j] = '\0';  // Null-terminate the result string
        res = PyUnicode_DecodeUTF8(result, j, NULL);
        PyMem_Free(result);
    } else {
        res = PyUnicode_DecodeUTF8(
            tok_mode->last_expr_buffer,
            tok_mode->last_expr_size - tok_mode->last_expr_end,
            NULL
        );
    }

    if (!res) {
        return -1;
    }
    token->metadata = res;
    return 0;
}
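
/* token->metadata ends up holding the source text of the last f-string
   expression; this is what lets "debug" specifiers such as f"{x+1=}" (see
   the f_string_debug flag set in tok_get_normal_mode()) reproduce the
   expression verbatim in the output.  Stripping '#'-comments first prevents
   a trailing comment from leaking into that text. */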

int
_PyLexer_update_fstring_expr(struct tok_state *tok, char cur)
{
    assert(tok->cur != NULL);

    Py_ssize_t size = strlen(tok->cur);
    tokenizer_mode *tok_mode = TOK_GET_MODE(tok);

    switch (cur) {
        case 0:
            if (!tok_mode->last_expr_buffer || tok_mode->last_expr_end >= 0) {
                return 1;
            }
            char *new_buffer = PyMem_Realloc(
                tok_mode->last_expr_buffer,
                tok_mode->last_expr_size + size
            );
            if (new_buffer == NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
                goto error;
            }
            tok_mode->last_expr_buffer = new_buffer;
            strncpy(tok_mode->last_expr_buffer + tok_mode->last_expr_size, tok->cur, size);
            tok_mode->last_expr_size += size;
            break;
        case '{':
            if (tok_mode->last_expr_buffer != NULL) {
                PyMem_Free(tok_mode->last_expr_buffer);
            }
            tok_mode->last_expr_buffer = PyMem_Malloc(size);
            if (tok_mode->last_expr_buffer == NULL) {
                goto error;
            }
            tok_mode->last_expr_size = size;
            tok_mode->last_expr_end = -1;
            strncpy(tok_mode->last_expr_buffer, tok->cur, size);
            break;
        case '}':
        case '!':
        case ':':
            if (tok_mode->last_expr_end == -1) {
                tok_mode->last_expr_end = strlen(tok->start);
            }
            break;
        default:
            Py_UNREACHABLE();
    }
    return 1;
error:
    tok->done = E_NOMEM;
    return 0;
}
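
/* The three cases above maintain the expression snapshot consumed by
   set_fstring_expr(): '{' starts recording a fresh buffer, and '}'/'!'/':'
   mark where the expression ends.  Case 0 appends the text at tok->cur to
   the saved buffer; no caller in this file passes 0, but presumably the
   buffer-refill path elsewhere in the tokenizer does, so that multi-line
   expressions survive a refill. */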

static int
lookahead(struct tok_state *tok, const char *test)
{
    const char *s = test;
    int res = 0;
    while (1) {
        int c = tok_nextc(tok);
        if (*s == 0) {
            res = !is_potential_identifier_char(c);
        }
        else if (c == *s) {
            s++;
            continue;
        }

        tok_backup(tok, c);
        while (s != test) {
            tok_backup(tok, *--s);
        }
        return res;
    }
}
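
/* Example: after verify_end_of_number() below has consumed "1" and seen an
   'a', lookahead(tok, "nd") reports whether the input continues with "nd"
   not followed by an identifier character -- i.e. whether the next word is
   the keyword "and" -- and restores the tokenizer position either way. */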

static int
verify_end_of_number(struct tok_state *tok, int c, const char *kind) {
    if (tok->tok_extra_tokens) {
        // When we are parsing extra tokens, we don't want to emit warnings
        // about invalid literals, because we want to be a bit more liberal.
        return 1;
    }
    /* Emit a deprecation warning only if the numeric literal is immediately
     * followed by one of the keywords that can occur after a numeric literal
     * in valid code: "and", "else", "for", "if", "in", "is" and "or".
     * This allows gradually deprecating existing valid code without emitting
     * a warning before the error in most cases of invalid numeric literals
     * (which would be confusing and break existing tests).
     * Raise a syntax error with a slightly better message than plain
     * "invalid syntax" if the numeric literal is immediately followed by
     * any other keyword or an identifier.
     */
    int r = 0;
    if (c == 'a') {
        r = lookahead(tok, "nd");
    }
    else if (c == 'e') {
        r = lookahead(tok, "lse");
    }
    else if (c == 'f') {
        r = lookahead(tok, "or");
    }
    else if (c == 'i') {
        int c2 = tok_nextc(tok);
        if (c2 == 'f' || c2 == 'n' || c2 == 's') {
            r = 1;
        }
        tok_backup(tok, c2);
    }
    else if (c == 'o') {
        r = lookahead(tok, "r");
    }
    else if (c == 'n') {
        r = lookahead(tok, "ot");
    }
    if (r) {
        tok_backup(tok, c);
        if (_PyTokenizer_parser_warn(tok, PyExc_SyntaxWarning,
                "invalid %s literal", kind))
        {
            return 0;
        }
        tok_nextc(tok);
    }
    else /* In future releases, only error will remain. */
    if (c < 128 && is_potential_identifier_char(c)) {
        tok_backup(tok, c);
        _PyTokenizer_syntaxerror(tok, "invalid %s literal", kind);
        return 0;
    }
    return 1;
}
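
/* Illustratively: "1if x else 2" only triggers a SyntaxWarning ("invalid
   decimal literal") because "if" may legally follow a number, whereas
   "1abc" is rejected with a syntax error right away. */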

/* Verify that the identifier follows PEP 3131.
   All identifier strings are guaranteed to be "ready" unicode objects.
 */
static int
verify_identifier(struct tok_state *tok)
{
    if (tok->tok_extra_tokens) {
        return 1;
    }
    PyObject *s;
    if (tok->decoding_erred)
        return 0;
    s = PyUnicode_DecodeUTF8(tok->start, tok->cur - tok->start, NULL);
    if (s == NULL) {
        if (PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
            tok->done = E_DECODE;
        }
        else {
            tok->done = E_ERROR;
        }
        return 0;
    }
    Py_ssize_t invalid = _PyUnicode_ScanIdentifier(s);
    if (invalid < 0) {
        Py_DECREF(s);
        tok->done = E_ERROR;
        return 0;
    }
    assert(PyUnicode_GET_LENGTH(s) > 0);
    if (invalid < PyUnicode_GET_LENGTH(s)) {
        Py_UCS4 ch = PyUnicode_READ_CHAR(s, invalid);
        if (invalid + 1 < PyUnicode_GET_LENGTH(s)) {
            /* Determine the offset in UTF-8 encoded input */
            Py_SETREF(s, PyUnicode_Substring(s, 0, invalid + 1));
            if (s != NULL) {
                Py_SETREF(s, PyUnicode_AsUTF8String(s));
            }
            if (s == NULL) {
                tok->done = E_ERROR;
                return 0;
            }
            tok->cur = (char *)tok->start + PyBytes_GET_SIZE(s);
        }
        Py_DECREF(s);
        if (Py_UNICODE_ISPRINTABLE(ch)) {
            _PyTokenizer_syntaxerror(tok, "invalid character '%c' (U+%04X)", ch, ch);
        }
        else {
            _PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", ch);
        }
        return 0;
    }
    Py_DECREF(s);
    return 1;
}

static int
tok_decimal_tail(struct tok_state *tok)
{
    int c;

    while (1) {
        do {
            c = tok_nextc(tok);
        } while (Py_ISDIGIT(c));
        if (c != '_') {
            break;
        }
        c = tok_nextc(tok);
        if (!Py_ISDIGIT(c)) {
            tok_backup(tok, c);
            _PyTokenizer_syntaxerror(tok, "invalid decimal literal");
            return 0;
        }
    }
    return c;
}
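
/* Underscores are accepted only between digits: "1_000" scans cleanly, while
   a trailing or doubled underscore ("1_", "1__0") hits the !Py_ISDIGIT branch
   above and is reported as "invalid decimal literal". */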

static inline int
tok_continuation_line(struct tok_state *tok) {
    int c = tok_nextc(tok);
    if (c == '\r') {
        c = tok_nextc(tok);
    }
    if (c != '\n') {
        tok->done = E_LINECONT;
        return -1;
    }
    c = tok_nextc(tok);
    if (c == EOF) {
        tok->done = E_EOF;
        tok->cur = tok->inp;
        return -1;
    } else {
        tok_backup(tok, c);
    }
    return c;
}
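
/* Consumes the remainder of an explicit line join: the backslash must be
   followed directly by a newline ('\r' is accepted as part of '\r\n'),
   otherwise tok->done is set to E_LINECONT.  On success, the first character
   of the next line is peeked at and pushed back. */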

static int
tok_get_normal_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    int c;
    int blankline, nonascii;

    const char *p_start = NULL;
    const char *p_end = NULL;
  nextline:
    tok->start = NULL;
    tok->starting_col_offset = -1;
    blankline = 0;

    /* Get indentation level */
    if (tok->atbol) {
        int col = 0;
        int altcol = 0;
        tok->atbol = 0;
        int cont_line_col = 0;
        for (;;) {
            c = tok_nextc(tok);
            if (c == ' ') {
                col++, altcol++;
            }
            else if (c == '\t') {
                col = (col / tok->tabsize + 1) * tok->tabsize;
                altcol = (altcol / ALTTABSIZE + 1) * ALTTABSIZE;
            }
            else if (c == '\014') { /* Control-L (formfeed) */
                col = altcol = 0; /* For Emacs users */
            }
            else if (c == '\\') {
                // Indentation cannot be split over multiple physical lines
                // using backslashes. This means that if we found a backslash
                // preceded by whitespace, **the first one we find** determines
                // the level of indentation of whatever comes next.
                cont_line_col = cont_line_col ? cont_line_col : col;
                if ((c = tok_continuation_line(tok)) == -1) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else {
                break;
            }
        }
        tok_backup(tok, c);
        if (c == '#' || c == '\n' || c == '\r') {
            /* Lines with only whitespace and/or comments
               shouldn't affect the indentation and are
               not passed to the parser as NEWLINE tokens,
               except *totally* empty lines in interactive
               mode, which signal the end of a command group. */
            if (col == 0 && c == '\n' && tok->prompt != NULL) {
                blankline = 0; /* Let it through */
            }
            else if (tok->prompt != NULL && tok->lineno == 1) {
                /* In interactive mode, if the first line contains
                   only spaces and/or a comment, let it through. */
                blankline = 0;
                col = altcol = 0;
            }
            else {
                blankline = 1; /* Ignore completely */
            }
            /* We can't jump back right here since we still
               may need to skip to the end of a comment */
        }
        if (!blankline && tok->level == 0) {
            col = cont_line_col ? cont_line_col : col;
            altcol = cont_line_col ? cont_line_col : altcol;
            if (col == tok->indstack[tok->indent]) {
                /* No change */
                if (altcol != tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
            }
            else if (col > tok->indstack[tok->indent]) {
                /* Indent -- always one */
                if (tok->indent+1 >= MAXINDENT) {
                    tok->done = E_TOODEEP;
                    tok->cur = tok->inp;
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                if (altcol <= tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
                tok->pendin++;
                tok->indstack[++tok->indent] = col;
                tok->altindstack[tok->indent] = altcol;
            }
            else /* col < tok->indstack[tok->indent] */ {
                /* Dedent -- any number, must be consistent */
                while (tok->indent > 0 &&
                    col < tok->indstack[tok->indent]) {
                    tok->pendin--;
                    tok->indent--;
                }
                if (col != tok->indstack[tok->indent]) {
                    tok->done = E_DEDENT;
                    tok->cur = tok->inp;
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                if (altcol != tok->altindstack[tok->indent]) {
                    return MAKE_TOKEN(_PyTokenizer_indenterror(tok));
                }
            }
        }
    }

    tok->start = tok->cur;
    tok->starting_col_offset = tok->col_offset;

    /* Return pending indents/dedents */
    if (tok->pendin != 0) {
        if (tok->pendin < 0) {
            if (tok->tok_extra_tokens) {
                p_start = tok->cur;
                p_end = tok->cur;
            }
            tok->pendin++;
            return MAKE_TOKEN(DEDENT);
        }
        else {
            if (tok->tok_extra_tokens) {
                p_start = tok->buf;
                p_end = tok->cur;
            }
            tok->pendin--;
            return MAKE_TOKEN(INDENT);
        }
    }

    /* Peek ahead at the next character */
    c = tok_nextc(tok);
    tok_backup(tok, c);

 again:
    tok->start = NULL;
    /* Skip spaces */
    do {
        c = tok_nextc(tok);
    } while (c == ' ' || c == '\t' || c == '\014');

    /* Set start of current token */
    tok->start = tok->cur == NULL ? NULL : tok->cur - 1;
    tok->starting_col_offset = tok->col_offset - 1;

    /* Skip comment, unless it's a type comment */
    if (c == '#') {

        const char* p = NULL;
        const char *prefix, *type_start;
        int current_starting_col_offset;

        while (c != EOF && c != '\n' && c != '\r') {
            c = tok_nextc(tok);
        }

        if (tok->tok_extra_tokens) {
            p = tok->start;
        }

        if (tok->type_comments) {
            p = tok->start;
            current_starting_col_offset = tok->starting_col_offset;
            prefix = type_comment_prefix;
            while (*prefix && p < tok->cur) {
                if (*prefix == ' ') {
                    while (*p == ' ' || *p == '\t') {
                        p++;
                        current_starting_col_offset++;
                    }
                } else if (*prefix == *p) {
                    p++;
                    current_starting_col_offset++;
                } else {
                    break;
                }

                prefix++;
            }

            /* This is a type comment if we matched all of type_comment_prefix. */
            if (!*prefix) {
                int is_type_ignore = 1;
                // +6 in order to skip the word 'ignore'
                const char *ignore_end = p + 6;
                const int ignore_end_col_offset = current_starting_col_offset + 6;
                tok_backup(tok, c);  /* don't eat the newline or EOF */

                type_start = p;

                /* A TYPE_IGNORE is "type: ignore" followed by the end of the token
                 * or anything ASCII and non-alphanumeric. */
                is_type_ignore = (
                    tok->cur >= ignore_end && memcmp(p, "ignore", 6) == 0
                    && !(tok->cur > ignore_end
                         && ((unsigned char)ignore_end[0] >= 128 || Py_ISALNUM(ignore_end[0]))));

                if (is_type_ignore) {
                    p_start = ignore_end;
                    p_end = tok->cur;

                    /* If this type ignore is the only thing on the line, consume the newline also. */
                    if (blankline) {
                        tok_nextc(tok);
                        tok->atbol = 1;
                    }
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_IGNORE, ignore_end_col_offset, tok->col_offset);
                } else {
                    p_start = type_start;
                    p_end = tok->cur;
                    return MAKE_TYPE_COMMENT_TOKEN(TYPE_COMMENT, current_starting_col_offset, tok->col_offset);
                }
            }
        }
        if (tok->tok_extra_tokens) {
            tok_backup(tok, c);  /* don't eat the newline or EOF */
            p_start = p;
            p_end = tok->cur;
            tok->comment_newline = blankline;
            return MAKE_TOKEN(COMMENT);
        }
    }

    if (tok->done == E_INTERACT_STOP) {
        return MAKE_TOKEN(ENDMARKER);
    }

    /* Check for EOF and errors now */
    if (c == EOF) {
        if (tok->level) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        return MAKE_TOKEN(tok->done == E_EOF ? ENDMARKER : ERRORTOKEN);
    }

    /* Identifier (most frequent token!) */
    nonascii = 0;
    if (is_potential_identifier_start(c)) {
        /* Process the various legal combinations of b"", r"", u"", and f"". */
        int saw_b = 0, saw_r = 0, saw_u = 0, saw_f = 0;
        while (1) {
            if (!(saw_b || saw_u || saw_f) && (c == 'b' || c == 'B'))
                saw_b = 1;
            /* Since this is a backwards compatibility support literal we don't
               want to support it in arbitrary order like byte literals. */
            else if (!(saw_b || saw_u || saw_r || saw_f)
                     && (c == 'u' || c == 'U')) {
                saw_u = 1;
            }
            /* ur"" and ru"" are not supported */
            else if (!(saw_r || saw_u) && (c == 'r' || c == 'R')) {
                saw_r = 1;
            }
            else if (!(saw_f || saw_b || saw_u) && (c == 'f' || c == 'F')) {
                saw_f = 1;
            }
            else {
                break;
            }
            c = tok_nextc(tok);
            if (c == '"' || c == '\'') {
                if (saw_f) {
                    goto f_string_quote;
                }
                goto letter_quote;
            }
        }
        while (is_potential_identifier_char(c)) {
            if (c >= 128) {
                nonascii = 1;
            }
            c = tok_nextc(tok);
        }
        tok_backup(tok, c);
        if (nonascii && !verify_identifier(tok)) {
            return MAKE_TOKEN(ERRORTOKEN);
        }

        p_start = tok->start;
        p_end = tok->cur;

        return MAKE_TOKEN(NAME);
    }

    if (c == '\r') {
        c = tok_nextc(tok);
    }

    /* Newline */
    if (c == '\n') {
        tok->atbol = 1;
        if (blankline || tok->level > 0) {
            if (tok->tok_extra_tokens) {
                if (tok->comment_newline) {
                    tok->comment_newline = 0;
                }
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(NL);
            }
            goto nextline;
        }
        if (tok->comment_newline && tok->tok_extra_tokens) {
            tok->comment_newline = 0;
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(NL);
        }
        p_start = tok->start;
        p_end = tok->cur - 1; /* Leave '\n' out of the string */
        tok->cont_line = 0;
        return MAKE_TOKEN(NEWLINE);
    }

    /* Period or number starting with period? */
    if (c == '.') {
        c = tok_nextc(tok);
        if (Py_ISDIGIT(c)) {
            goto fraction;
        } else if (c == '.') {
            c = tok_nextc(tok);
            if (c == '.') {
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(ELLIPSIS);
            }
            else {
                tok_backup(tok, c);
            }
            tok_backup(tok, '.');
        }
        else {
            tok_backup(tok, c);
        }
        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(DOT);
    }

    /* Number */
    if (Py_ISDIGIT(c)) {
        if (c == '0') {
            /* Hex, octal or binary -- maybe. */
            c = tok_nextc(tok);
            if (c == 'x' || c == 'X') {
                /* Hex */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (!Py_ISXDIGIT(c)) {
                        tok_backup(tok, c);
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid hexadecimal literal"));
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (Py_ISXDIGIT(c));
                } while (c == '_');
                if (!verify_end_of_number(tok, c, "hexadecimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else if (c == 'o' || c == 'O') {
                /* Octal */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c < '0' || c >= '8') {
                        if (Py_ISDIGIT(c)) {
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                                    "invalid digit '%c' in octal literal", c));
                        }
                        else {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid octal literal"));
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while ('0' <= c && c < '8');
                } while (c == '_');
                if (Py_ISDIGIT(c)) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                            "invalid digit '%c' in octal literal", c));
                }
                if (!verify_end_of_number(tok, c, "octal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else if (c == 'b' || c == 'B') {
                /* Binary */
                c = tok_nextc(tok);
                do {
                    if (c == '_') {
                        c = tok_nextc(tok);
                    }
                    if (c != '0' && c != '1') {
                        if (Py_ISDIGIT(c)) {
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
                        }
                        else {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid binary literal"));
                        }
                    }
                    do {
                        c = tok_nextc(tok);
                    } while (c == '0' || c == '1');
                } while (c == '_');
                if (Py_ISDIGIT(c)) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid digit '%c' in binary literal", c));
                }
                if (!verify_end_of_number(tok, c, "binary")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            else {
                int nonzero = 0;
                /* maybe old-style octal; c is first char of it */
                /* in any case, allow '0' as a literal */
                while (1) {
                    if (c == '_') {
                        c = tok_nextc(tok);
                        if (!Py_ISDIGIT(c)) {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
                        }
                    }
                    if (c != '0') {
                        break;
                    }
                    c = tok_nextc(tok);
                }
                char* zeros_end = tok->cur;
                if (Py_ISDIGIT(c)) {
                    nonzero = 1;
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                if (c == '.') {
                    c = tok_nextc(tok);
                    goto fraction;
                }
                else if (c == 'e' || c == 'E') {
                    goto exponent;
                }
                else if (c == 'j' || c == 'J') {
                    goto imaginary;
                }
                else if (nonzero && !tok->tok_extra_tokens) {
                    /* Old-style octal: now disallowed. */
                    tok_backup(tok, c);
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror_known_range(
                            tok, (int)(tok->start + 1 - tok->line_start),
                            (int)(zeros_end - tok->line_start),
                            "leading zeros in decimal integer "
                            "literals are not permitted; "
                            "use an 0o prefix for octal integers"));
                }
                if (!verify_end_of_number(tok, c, "decimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
        }
        else {
            /* Decimal */
            c = tok_decimal_tail(tok);
            if (c == 0) {
                return MAKE_TOKEN(ERRORTOKEN);
            }
            {
                /* Accept floating-point numbers. */
                if (c == '.') {
                    c = tok_nextc(tok);
        fraction:
                    /* Fraction */
                    if (Py_ISDIGIT(c)) {
                        c = tok_decimal_tail(tok);
                        if (c == 0) {
                            return MAKE_TOKEN(ERRORTOKEN);
                        }
                    }
                }
                if (c == 'e' || c == 'E') {
                    int e;
                  exponent:
                    e = c;
                    /* Exponent part */
                    c = tok_nextc(tok);
                    if (c == '+' || c == '-') {
                        c = tok_nextc(tok);
                        if (!Py_ISDIGIT(c)) {
                            tok_backup(tok, c);
                            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid decimal literal"));
                        }
                    } else if (!Py_ISDIGIT(c)) {
                        tok_backup(tok, c);
                        if (!verify_end_of_number(tok, e, "decimal")) {
                            return MAKE_TOKEN(ERRORTOKEN);
                        }
                        tok_backup(tok, e);
                        p_start = tok->start;
                        p_end = tok->cur;
                        return MAKE_TOKEN(NUMBER);
                    }
                    c = tok_decimal_tail(tok);
                    if (c == 0) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                if (c == 'j' || c == 'J') {
                    /* Imaginary part */
        imaginary:
                    c = tok_nextc(tok);
                    if (!verify_end_of_number(tok, c, "imaginary")) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                else if (!verify_end_of_number(tok, c, "decimal")) {
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
        }
        tok_backup(tok, c);
        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(NUMBER);
    }

  f_string_quote:
    if (((Py_TOLOWER(*tok->start) == 'f' || Py_TOLOWER(*tok->start) == 'r') && (c == '\'' || c == '"'))) {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */

        /* Nodes of type STRING, especially multi-line strings,
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        int after_quote = tok_nextc(tok);
        if (after_quote == quote) {
            int after_after_quote = tok_nextc(tok);
            if (after_after_quote == quote) {
                quote_size = 3;
            }
            else {
                // TODO: Check this
                tok_backup(tok, after_after_quote);
                tok_backup(tok, after_quote);
            }
        }
        if (after_quote != quote) {
            tok_backup(tok, after_quote);
        }

        p_start = tok->start;
        p_end = tok->cur;
        if (tok->tok_mode_stack_index + 1 >= MAXFSTRINGLEVEL) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested f-strings"));
        }
        tokenizer_mode *the_current_tok = TOK_NEXT_MODE(tok);
        the_current_tok->kind = TOK_FSTRING_MODE;
        the_current_tok->f_string_quote = quote;
        the_current_tok->f_string_quote_size = quote_size;
        the_current_tok->f_string_start = tok->start;
        the_current_tok->f_string_multi_line_start = tok->line_start;
        the_current_tok->f_string_line_start = tok->lineno;
        the_current_tok->f_string_start_offset = -1;
        the_current_tok->f_string_multi_line_start_offset = -1;
        the_current_tok->last_expr_buffer = NULL;
        the_current_tok->last_expr_size = 0;
        the_current_tok->last_expr_end = -1;
        the_current_tok->in_format_spec = 0;
        the_current_tok->f_string_debug = 0;

        switch (*tok->start) {
            case 'F':
            case 'f':
                the_current_tok->f_string_raw = Py_TOLOWER(*(tok->start + 1)) == 'r';
                break;
            case 'R':
            case 'r':
                the_current_tok->f_string_raw = 1;
                break;
            default:
                Py_UNREACHABLE();
        }

        the_current_tok->curly_bracket_depth = 0;
        the_current_tok->curly_bracket_expr_start_depth = -1;
        return MAKE_TOKEN(FSTRING_START);
    }
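
    /* An illustrative token stream (assuming standard PEP 701 behaviour):
       f"a{x}b" tokenizes roughly as FSTRING_START 'f"', FSTRING_MIDDLE 'a',
       '{', NAME 'x', '}', FSTRING_MIDDLE 'b', FSTRING_END '"'; the middle
       and end pieces are produced by tok_get_fstring_mode() below. */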

  letter_quote:
    /* String */
    if (c == '\'' || c == '"') {
        int quote = c;
        int quote_size = 1;             /* 1 or 3 */
        int end_quote_size = 0;
        int has_escaped_quote = 0;

        /* Nodes of type STRING, especially multi-line strings,
           must be handled differently in order to get both
           the starting line number and the column offset right.
           (cf. issue 16806) */
        tok->first_lineno = tok->lineno;
        tok->multi_line_start = tok->line_start;

        /* Find the quote size and start of string */
        c = tok_nextc(tok);
        if (c == quote) {
            c = tok_nextc(tok);
            if (c == quote) {
                quote_size = 3;
            }
            else {
                end_quote_size = 1;     /* empty string found */
            }
        }
        if (c != quote) {
            tok_backup(tok, c);
        }

        /* Get rest of string */
        while (end_quote_size != quote_size) {
            c = tok_nextc(tok);
            if (tok->done == E_ERROR) {
                return MAKE_TOKEN(ERRORTOKEN);
            }
            if (tok->done == E_DECODE) {
                break;
            }
            if (c == EOF || (quote_size == 1 && c == '\n')) {
                assert(tok->multi_line_start != NULL);
                // shift the tok_state's location into
                // the start of string, and report the error
                // from the initial quote character
                tok->cur = (char *)tok->start;
                tok->cur++;
                tok->line_start = tok->multi_line_start;
                int start = tok->lineno;
                tok->lineno = tok->first_lineno;

                if (INSIDE_FSTRING(tok)) {
                    /* When we are in an f-string, before raising the
                     * unterminated string literal error, check whether
                     * the initial quote matches the f-string's quotes;
                     * if it does, this must be a missing '}' token, so
                     * raise the proper error. */
                    tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
                    if (the_current_tok->f_string_quote == quote &&
                        the_current_tok->f_string_quote_size == quote_size) {
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expecting '}'", start));
                    }
                }

                if (quote_size == 3) {
                    _PyTokenizer_syntaxerror(tok, "unterminated triple-quoted string literal"
                                     " (detected at line %d)", start);
                    if (c != '\n') {
                        tok->done = E_EOFS;
                    }
                    return MAKE_TOKEN(ERRORTOKEN);
                }
                else {
                    if (has_escaped_quote) {
                        _PyTokenizer_syntaxerror(
                            tok,
                            "unterminated string literal (detected at line %d); "
                            "perhaps you escaped the end quote?",
                            start
                        );
                    } else {
                        _PyTokenizer_syntaxerror(
                            tok, "unterminated string literal (detected at line %d)", start
                        );
                    }
                    if (c != '\n') {
                        tok->done = E_EOLS;
                    }
                    return MAKE_TOKEN(ERRORTOKEN);
                }
            }
            if (c == quote) {
                end_quote_size += 1;
            }
            else {
                end_quote_size = 0;
                if (c == '\\') {
                    c = tok_nextc(tok);  /* skip escaped char */
                    if (c == quote) {  /* but record whether the escaped char was a quote */
                        has_escaped_quote = 1;
                    }
                    if (c == '\r') {
                        c = tok_nextc(tok);
                    }
                }
            }
        }

        p_start = tok->start;
        p_end = tok->cur;
        return MAKE_TOKEN(STRING);
    }

    /* Line continuation */
    if (c == '\\') {
        if ((c = tok_continuation_line(tok)) == -1) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        tok->cont_line = 1;
        goto again; /* Read next line */
    }

    /* Punctuation character */
    int is_punctuation = (c == ':' || c == '}' || c == '!' || c == '{');
    if (is_punctuation && INSIDE_FSTRING(tok) && INSIDE_FSTRING_EXPR(current_tok)) {
        /* This code block gets executed before curly_bracket_depth is
         * incremented by the `{` case, so to check that we are on the 0th
         * level we need to adjust it manually. */
        int cursor = current_tok->curly_bracket_depth - (c != '{');
        int in_format_spec = current_tok->in_format_spec;
        int cursor_in_format_with_debug =
            cursor == 1 && (current_tok->f_string_debug || in_format_spec);
        int cursor_valid = cursor == 0 || cursor_in_format_with_debug;
        if (cursor_valid && !_PyLexer_update_fstring_expr(tok, c)) {
            return MAKE_TOKEN(ENDMARKER);
        }
        if (cursor_valid && c != '{' && set_fstring_expr(tok, token, c)) {
            return MAKE_TOKEN(ERRORTOKEN);
        }

        if (c == ':' && cursor == current_tok->curly_bracket_expr_start_depth) {
            current_tok->kind = TOK_FSTRING_MODE;
            current_tok->in_format_spec = 1;
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(_PyToken_OneChar(c));
        }
    }

    /* Check for two-character token */
    {
        int c2 = tok_nextc(tok);
        int current_token = _PyToken_TwoChars(c, c2);
        if (current_token != OP) {
            int c3 = tok_nextc(tok);
            int current_token3 = _PyToken_ThreeChars(c, c2, c3);
            if (current_token3 != OP) {
                current_token = current_token3;
            }
            else {
                tok_backup(tok, c3);
            }
            p_start = tok->start;
            p_end = tok->cur;
            return MAKE_TOKEN(current_token);
        }
        tok_backup(tok, c2);
    }

    /* Keep track of parentheses nesting level */
    switch (c) {
    case '(':
    case '[':
    case '{':
        if (tok->level >= MAXLEVEL) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "too many nested parentheses"));
        }
        tok->parenstack[tok->level] = c;
        tok->parenlinenostack[tok->level] = tok->lineno;
        tok->parencolstack[tok->level] = (int)(tok->start - tok->line_start);
        tok->level++;
        if (INSIDE_FSTRING(tok)) {
            current_tok->curly_bracket_depth++;
        }
        break;
    case ')':
    case ']':
    case '}':
        if (INSIDE_FSTRING(tok) && !current_tok->curly_bracket_depth && c == '}') {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: single '}' is not allowed"));
        }
        if (!tok->tok_extra_tokens && !tok->level) {
            return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "unmatched '%c'", c));
        }
        if (tok->level > 0) {
            tok->level--;
            int opening = tok->parenstack[tok->level];
            if (!tok->tok_extra_tokens && !((opening == '(' && c == ')') ||
                                            (opening == '[' && c == ']') ||
                                            (opening == '{' && c == '}'))) {
                /* If the opening bracket belongs to an f-string's expression
                   part (e.g. f"{)}") and the closing bracket is inside an
                   arbitrary nested expression, then instead of matching it
                   with a different syntactical construct, we throw an
                   unmatched-parentheses error. */
                if (INSIDE_FSTRING(tok) && opening == '{') {
                    assert(current_tok->curly_bracket_depth >= 0);
                    int previous_bracket = current_tok->curly_bracket_depth - 1;
                    if (previous_bracket == current_tok->curly_bracket_expr_start_depth) {
                        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: unmatched '%c'", c));
                    }
                }
                if (tok->parenlinenostack[tok->level] != tok->lineno) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                            "closing parenthesis '%c' does not match "
                            "opening parenthesis '%c' on line %d",
                            c, opening, tok->parenlinenostack[tok->level]));
                }
                else {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                            "closing parenthesis '%c' does not match "
                            "opening parenthesis '%c'",
                            c, opening));
                }
            }
        }

        if (INSIDE_FSTRING(tok)) {
            current_tok->curly_bracket_depth--;
            if (current_tok->curly_bracket_depth < 0) {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: unmatched '%c'", c));
            }
            if (c == '}' && current_tok->curly_bracket_depth == current_tok->curly_bracket_expr_start_depth) {
                current_tok->curly_bracket_expr_start_depth--;
                current_tok->kind = TOK_FSTRING_MODE;
                current_tok->in_format_spec = 0;
                current_tok->f_string_debug = 0;
            }
        }
        break;
    default:
        break;
    }

    if (!Py_UNICODE_ISPRINTABLE(c)) {
        return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "invalid non-printable character U+%04X", c));
    }

    if (c == '=' && INSIDE_FSTRING_EXPR(current_tok)) {
        current_tok->f_string_debug = 1;
    }

    /* Punctuation character */
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(_PyToken_OneChar(c));
}

static int
tok_get_fstring_mode(struct tok_state *tok, tokenizer_mode* current_tok, struct token *token)
{
    const char *p_start = NULL;
    const char *p_end = NULL;
    int end_quote_size = 0;
    int unicode_escape = 0;

    tok->start = tok->cur;
    tok->first_lineno = tok->lineno;
    tok->starting_col_offset = tok->col_offset;

    // If we start with a bracket, we defer to the normal mode as there is nothing for us to tokenize
    // before it.
    int start_char = tok_nextc(tok);
    if (start_char == '{') {
        int peek1 = tok_nextc(tok);
        tok_backup(tok, peek1);
        tok_backup(tok, start_char);
        if (peek1 != '{') {
            current_tok->curly_bracket_expr_start_depth++;
            if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
            }
            TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
            return tok_get_normal_mode(tok, current_tok, token);
        }
    }
    else {
        tok_backup(tok, start_char);
    }

    // Check if we are at the end of the string
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        int quote = tok_nextc(tok);
        if (quote != current_tok->f_string_quote) {
            tok_backup(tok, quote);
            goto f_string_middle;
        }
    }

    if (current_tok->last_expr_buffer != NULL) {
        PyMem_Free(current_tok->last_expr_buffer);
        current_tok->last_expr_buffer = NULL;
        current_tok->last_expr_size = 0;
        current_tok->last_expr_end = -1;
    }

    p_start = tok->start;
    p_end = tok->cur;
    tok->tok_mode_stack_index--;
    return MAKE_TOKEN(FSTRING_END);

f_string_middle:

    // TODO: This is a bit of a hack, but it works for now. We need to find a better way to handle
    // this.
    tok->multi_line_start = tok->line_start;
    while (end_quote_size != current_tok->f_string_quote_size) {
        int c = tok_nextc(tok);
        if (tok->done == E_ERROR || tok->done == E_DECODE) {
            return MAKE_TOKEN(ERRORTOKEN);
        }
        int in_format_spec = (
                current_tok->in_format_spec
                && INSIDE_FSTRING_EXPR(current_tok)
        );

        if (c == EOF || (current_tok->f_string_quote_size == 1 && c == '\n')) {
            if (tok->decoding_erred) {
                return MAKE_TOKEN(ERRORTOKEN);
            }

            // If we are in a format spec and we found a newline,
            // it means that the format spec ends here and we should
            // return to the regular mode.
            if (in_format_spec && c == '\n') {
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }

            assert(tok->multi_line_start != NULL);
            // shift the tok_state's location into
            // the start of string, and report the error
            // from the initial quote character
            tok->cur = (char *)current_tok->f_string_start;
            tok->cur++;
            tok->line_start = current_tok->f_string_multi_line_start;
            int start = tok->lineno;

            tokenizer_mode *the_current_tok = TOK_GET_MODE(tok);
            tok->lineno = the_current_tok->f_string_line_start;

            if (current_tok->f_string_quote_size == 3) {
                _PyTokenizer_syntaxerror(tok,
                                    "unterminated triple-quoted f-string literal"
                                    " (detected at line %d)", start);
                if (c != '\n') {
                    tok->done = E_EOFS;
                }
                return MAKE_TOKEN(ERRORTOKEN);
            }
            else {
                return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok,
                                    "unterminated f-string literal (detected at"
                                    " line %d)", start));
            }
        }

        if (c == current_tok->f_string_quote) {
            end_quote_size += 1;
            continue;
        } else {
            end_quote_size = 0;
        }

        if (c == '{') {
            if (!_PyLexer_update_fstring_expr(tok, c)) {
                return MAKE_TOKEN(ENDMARKER);
            }
            int peek = tok_nextc(tok);
            if (peek != '{' || in_format_spec) {
                tok_backup(tok, peek);
                tok_backup(tok, c);
                current_tok->curly_bracket_expr_start_depth++;
                if (current_tok->curly_bracket_expr_start_depth >= MAX_EXPR_NESTING) {
                    return MAKE_TOKEN(_PyTokenizer_syntaxerror(tok, "f-string: expressions nested too deeply"));
                }
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            } else {
                p_start = tok->start;
                p_end = tok->cur - 1;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '}') {
            if (unicode_escape) {
                p_start = tok->start;
                p_end = tok->cur;
                return MAKE_TOKEN(FSTRING_MIDDLE);
            }
            int peek = tok_nextc(tok);

            // The tokenizer can only be in the format spec if we have already completed the expression
            // scanning (indicated by the end of the expression being set) and we are not at the top level
            // of the bracket stack (-1 is the top level). Since format specifiers can't legally use double
            // brackets, we can bypass it here.
            int cursor = current_tok->curly_bracket_depth;
            if (peek == '}' && !in_format_spec && cursor == 0) {
                p_start = tok->start;
                p_end = tok->cur - 1;
            } else {
                tok_backup(tok, peek);
                tok_backup(tok, c);
                TOK_GET_MODE(tok)->kind = TOK_REGULAR_MODE;
                current_tok->in_format_spec = 0;
                p_start = tok->start;
                p_end = tok->cur;
            }
            return MAKE_TOKEN(FSTRING_MIDDLE);
        } else if (c == '\\') {
            int peek = tok_nextc(tok);
            if (peek == '\r') {
                peek = tok_nextc(tok);
            }
            // Special case when the backslash is right before a curly
            // brace. We have to restore it and return control back to
            // the loop for the next iteration.
            if (peek == '{' || peek == '}') {
                if (!current_tok->f_string_raw) {
                    if (_PyTokenizer_warn_invalid_escape_sequence(tok, peek)) {
                        return MAKE_TOKEN(ERRORTOKEN);
                    }
                }
                tok_backup(tok, peek);
                continue;
            }

            if (!current_tok->f_string_raw) {
                if (peek == 'N') {
                    /* Handle named unicode escapes (\N{BULLET}) */
                    peek = tok_nextc(tok);
                    if (peek == '{') {
                        unicode_escape = 1;
                    } else {
                        tok_backup(tok, peek);
                    }
                }
            } /* else {
                skip the escaped character
            }*/
        }
    }

    // Backup the f-string quotes to emit a final FSTRING_MIDDLE and
    // add the quotes to the FSTRING_END in the next tokenizer iteration.
    for (int i = 0; i < current_tok->f_string_quote_size; i++) {
        tok_backup(tok, current_tok->f_string_quote);
    }
    p_start = tok->start;
    p_end = tok->cur;
    return MAKE_TOKEN(FSTRING_MIDDLE);
}

static int
tok_get(struct tok_state *tok, struct token *token)
{
    tokenizer_mode *current_tok = TOK_GET_MODE(tok);
    if (current_tok->kind == TOK_REGULAR_MODE) {
        return tok_get_normal_mode(tok, current_tok, token);
    } else {
        return tok_get_fstring_mode(tok, current_tok, token);
    }
}
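
/* Dispatch on the current tokenizer mode: each f-string opening pushes a
   tokenizer_mode entry via TOK_NEXT_MODE() (see FSTRING_START above) and
   FSTRING_END pops it by decrementing tok_mode_stack_index, so nested
   f-strings simply stack modes up to MAXFSTRINGLEVEL. */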

int
_PyTokenizer_Get(struct tok_state *tok, struct token *token)
{
    int result = tok_get(tok, token);
    if (tok->decoding_erred) {
        result = ERRORTOKEN;
        tok->done = E_DECODE;
    }
    return result;
}