• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdbool.h>
2 
3 #include <Python.h>
4 
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8 
9 //// STRING HANDLING FUNCTIONS ////
10 
11 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)12 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
13 {
14     PyObject *msg =
15         PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
16     if (msg == NULL) {
17         return -1;
18     }
19     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
20                                  t->lineno, NULL, NULL) < 0) {
21         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22             /* Replace the DeprecationWarning exception with a SyntaxError
23                to get a more accurate error report */
24             PyErr_Clear();
25 
26             /* This is needed, in order for the SyntaxError to point to the token t,
27                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28                error location, if p->known_err_token is not set. */
29             p->known_err_token = t;
30             RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
31         }
32         Py_DECREF(msg);
33         return -1;
34     }
35     Py_DECREF(msg);
36     return 0;
37 }
38 
39 static PyObject *
decode_utf8(const char ** sPtr,const char * end)40 decode_utf8(const char **sPtr, const char *end)
41 {
42     const char *s;
43     const char *t;
44     t = s = *sPtr;
45     while (s < end && (*s & 0x80)) {
46         s++;
47     }
48     *sPtr = s;
49     return PyUnicode_DecodeUTF8(t, s - t, NULL);
50 }
51 
52 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)53 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
54 {
55     PyObject *v;
56     PyObject *u;
57     char *buf;
58     char *p;
59     const char *end;
60 
61     /* check for integer overflow */
62     if (len > SIZE_MAX / 6) {
63         return NULL;
64     }
65     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67     u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68     if (u == NULL) {
69         return NULL;
70     }
71     p = buf = PyBytes_AsString(u);
72     if (p == NULL) {
73         return NULL;
74     }
75     end = s + len;
76     while (s < end) {
77         if (*s == '\\') {
78             *p++ = *s++;
79             if (s >= end || *s & 0x80) {
80                 strcpy(p, "u005c");
81                 p += 5;
82                 if (s >= end) {
83                     break;
84                 }
85             }
86         }
87         if (*s & 0x80) {
88             PyObject *w;
89             int kind;
90             const void *data;
91             Py_ssize_t w_len;
92             Py_ssize_t i;
93             w = decode_utf8(&s, end);
94             if (w == NULL) {
95                 Py_DECREF(u);
96                 return NULL;
97             }
98             kind = PyUnicode_KIND(w);
99             data = PyUnicode_DATA(w);
100             w_len = PyUnicode_GET_LENGTH(w);
101             for (i = 0; i < w_len; i++) {
102                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103                 sprintf(p, "\\U%08x", chr);
104                 p += 10;
105             }
106             /* Should be impossible to overflow */
107             assert(p - buf <= PyBytes_GET_SIZE(u));
108             Py_DECREF(w);
109         }
110         else {
111             *p++ = *s++;
112         }
113     }
114     len = p - buf;
115     s = buf;
116 
117     const char *first_invalid_escape;
118     v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119 
120     if (v != NULL && first_invalid_escape != NULL) {
121         if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122             /* We have not decref u before because first_invalid_escape points
123                inside u. */
124             Py_XDECREF(u);
125             Py_DECREF(v);
126             return NULL;
127         }
128     }
129     Py_XDECREF(u);
130     return v;
131 }
132 
133 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)134 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135 {
136     const char *first_invalid_escape;
137     PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138     if (result == NULL) {
139         return NULL;
140     }
141 
142     if (first_invalid_escape != NULL) {
143         if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
144             Py_DECREF(result);
145             return NULL;
146         }
147     }
148     return result;
149 }
150 
151 /* s must include the bracketing quote characters, and r, b, u,
152    &/or f prefixes (if any), and embedded escape sequences (if any).
153    _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154    If the string is an f-string, set *fstr and *fstrlen to the unparsed
155    string object.  Return 0 if no errors occurred.  */
156 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)157 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158                   const char **fstr, Py_ssize_t *fstrlen, Token *t)
159 {
160     const char *s = PyBytes_AsString(t->bytes);
161     if (s == NULL) {
162         return -1;
163     }
164 
165     size_t len;
166     int quote = Py_CHARMASK(*s);
167     int fmode = 0;
168     *bytesmode = 0;
169     *rawmode = 0;
170     *result = NULL;
171     *fstr = NULL;
172     if (Py_ISALPHA(quote)) {
173         while (!*bytesmode || !*rawmode) {
174             if (quote == 'b' || quote == 'B') {
175                 quote =(unsigned char)*++s;
176                 *bytesmode = 1;
177             }
178             else if (quote == 'u' || quote == 'U') {
179                 quote = (unsigned char)*++s;
180             }
181             else if (quote == 'r' || quote == 'R') {
182                 quote = (unsigned char)*++s;
183                 *rawmode = 1;
184             }
185             else if (quote == 'f' || quote == 'F') {
186                 quote = (unsigned char)*++s;
187                 fmode = 1;
188             }
189             else {
190                 break;
191             }
192         }
193     }
194 
195     /* fstrings are only allowed in Python 3.6 and greater */
196     if (fmode && p->feature_version < 6) {
197         p->error_indicator = 1;
198         RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199         return -1;
200     }
201 
202     if (fmode && *bytesmode) {
203         PyErr_BadInternalCall();
204         return -1;
205     }
206     if (quote != '\'' && quote != '\"') {
207         PyErr_BadInternalCall();
208         return -1;
209     }
210     /* Skip the leading quote char. */
211     s++;
212     len = strlen(s);
213     if (len > INT_MAX) {
214         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215         return -1;
216     }
217     if (s[--len] != quote) {
218         /* Last quote char must match the first. */
219         PyErr_BadInternalCall();
220         return -1;
221     }
222     if (len >= 4 && s[0] == quote && s[1] == quote) {
223         /* A triple quoted string. We've already skipped one quote at
224            the start and one at the end of the string. Now skip the
225            two at the start. */
226         s += 2;
227         len -= 2;
228         /* And check that the last two match. */
229         if (s[--len] != quote || s[--len] != quote) {
230             PyErr_BadInternalCall();
231             return -1;
232         }
233     }
234 
235     if (fmode) {
236         /* Just return the bytes. The caller will parse the resulting
237            string. */
238         *fstr = s;
239         *fstrlen = len;
240         return 0;
241     }
242 
243     /* Not an f-string. */
244     /* Avoid invoking escape decoding routines if possible. */
245     *rawmode = *rawmode || strchr(s, '\\') == NULL;
246     if (*bytesmode) {
247         /* Disallow non-ASCII characters. */
248         const char *ch;
249         for (ch = s; *ch; ch++) {
250             if (Py_CHARMASK(*ch) >= 0x80) {
251                 RAISE_SYNTAX_ERROR(
252                                    "bytes can only contain ASCII "
253                                    "literal characters");
254                 return -1;
255             }
256         }
257         if (*rawmode) {
258             *result = PyBytes_FromStringAndSize(s, len);
259         }
260         else {
261             *result = decode_bytes_with_escapes(p, s, len, t);
262         }
263     }
264     else {
265         if (*rawmode) {
266             *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267         }
268         else {
269             *result = decode_unicode_with_escapes(p, s, len, t);
270         }
271     }
272     return *result == NULL ? -1 : 0;
273 }
274 
275 
276 
277 // FSTRING STUFF
278 
279 /* Fix locations for the given node and its children.
280 
281    `parent` is the enclosing node.
282    `expr_start` is the starting position of the expression (pointing to the open brace).
283    `n` is the node which locations are going to be fixed relative to parent.
284    `expr_str` is the child node's string representation, including braces.
285 */
286 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)287 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
288 {
289     *p_lines = 0;
290     *p_cols = 0;
291     assert(expr_start != NULL && *expr_start == '{');
292     if (parent && parent->bytes) {
293         const char *parent_str = PyBytes_AsString(parent->bytes);
294         if (!parent_str) {
295             return false;
296         }
297         // The following is needed, in order to correctly shift the column
298         // offset, in the case that (disregarding any whitespace) a newline
299         // immediately follows the opening curly brace of the fstring expression.
300         bool newline_after_brace = 1;
301         const char *start = expr_start + 1;
302         while (start && *start != '}' && *start != '\n') {
303             if (*start != ' ' && *start != '\t' && *start != '\f') {
304                 newline_after_brace = 0;
305                 break;
306             }
307             start++;
308         }
309 
310         // Account for the characters from the last newline character to our
311         // left until the beginning of expr_start.
312         if (!newline_after_brace) {
313             start = expr_start;
314             while (start > parent_str && *start != '\n') {
315                 start--;
316             }
317             *p_cols += (int)(expr_start - start);
318         }
319         /* adjust the start based on the number of newlines encountered
320            before the f-string expression */
321         for (const char *p = parent_str; p < expr_start; p++) {
322             if (*p == '\n') {
323                 (*p_lines)++;
324             }
325         }
326     }
327     return true;
328 }
329 
330 
331 /* Compile this expression in to an expr_ty.  Add parens around the
332    expression, in order to allow leading spaces in the expression. */
333 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)334 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335                      Token *t)
336 {
337     expr_ty expr = NULL;
338     char *str;
339     Py_ssize_t len;
340     const char *s;
341     expr_ty result = NULL;
342 
343     assert(expr_end >= expr_start);
344     assert(*(expr_start-1) == '{');
345     assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346            *expr_end == '=');
347 
348     /* If the substring is all whitespace, it's an error.  We need to catch this
349        here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350        because turning the expression '' in to '()' would go from being invalid
351        to valid. */
352     for (s = expr_start; s != expr_end; s++) {
353         char c = *s;
354         /* The Python parser ignores only the following whitespace
355            characters (\r already is converted to \n). */
356         if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357             break;
358         }
359     }
360     if (s == expr_end) {
361         RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362         return NULL;
363     }
364 
365     len = expr_end - expr_start;
366     /* Allocate 3 extra bytes: open paren, close paren, null byte. */
367     str = PyMem_Calloc(len + 3, sizeof(char));
368     if (str == NULL) {
369         PyErr_NoMemory();
370         return NULL;
371     }
372 
373     // The call to fstring_find_expr_location is responsible for finding the column offset
374     // the generated AST nodes need to be shifted to the right, which is equal to the number
375     // of the f-string characters before the expression starts.
376     memcpy(str+1, expr_start, len);
377     int lines, cols;
378     if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
379         PyMem_Free(str);
380         return NULL;
381     }
382 
383     // The parentheses are needed in order to allow for leading whitespace within
384     // the f-string expression. This consequently gets parsed as a group (see the
385     // group rule in python.gram).
386     str[0] = '(';
387     str[len+1] = ')';
388 
389     struct tok_state* tok = PyTokenizer_FromString(str, 1);
390     if (tok == NULL) {
391         PyMem_Free(str);
392         return NULL;
393     }
394     Py_INCREF(p->tok->filename);
395 
396     tok->filename = p->tok->filename;
397     tok->lineno = t->lineno + lines - 1;
398 
399     Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
400                                      NULL, p->arena);
401 
402     p2->starting_lineno = t->lineno + lines;
403     p2->starting_col_offset = t->col_offset + cols;
404 
405     expr = _PyPegen_run_parser(p2);
406 
407     if (expr == NULL) {
408         goto exit;
409     }
410     result = expr;
411 
412 exit:
413     PyMem_Free(str);
414     _PyPegen_Parser_Free(p2);
415     PyTokenizer_Free(tok);
416     return result;
417 }
418 
419 /* Return -1 on error.
420 
421    Return 0 if we reached the end of the literal.
422 
423    Return 1 if we haven't reached the end of the literal, but we want
424    the caller to process the literal up to this point. Used for
425    doubled braces.
426 */
427 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)428 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
429                      PyObject **literal, int recurse_lvl, Token *t)
430 {
431     /* Get any literal string. It ends when we hit an un-doubled left
432        brace (which isn't part of a unicode name escape such as
433        "\N{EULER CONSTANT}"), or the end of the string. */
434 
435     const char *s = *str;
436     const char *literal_start = s;
437     int result = 0;
438 
439     assert(*literal == NULL);
440     while (s < end) {
441         char ch = *s++;
442         if (!raw && ch == '\\' && s < end) {
443             ch = *s++;
444             if (ch == 'N') {
445                 if (s < end && *s++ == '{') {
446                     while (s < end && *s++ != '}') {
447                     }
448                     continue;
449                 }
450                 break;
451             }
452             if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
453                 return -1;
454             }
455         }
456         if (ch == '{' || ch == '}') {
457             /* Check for doubled braces, but only at the top level. If
458                we checked at every level, then f'{0:{3}}' would fail
459                with the two closing braces. */
460             if (recurse_lvl == 0) {
461                 if (s < end && *s == ch) {
462                     /* We're going to tell the caller that the literal ends
463                        here, but that they should continue scanning. But also
464                        skip over the second brace when we resume scanning. */
465                     *str = s + 1;
466                     result = 1;
467                     goto done;
468                 }
469 
470                 /* Where a single '{' is the start of a new expression, a
471                    single '}' is not allowed. */
472                 if (ch == '}') {
473                     *str = s - 1;
474                     RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
475                     return -1;
476                 }
477             }
478             /* We're either at a '{', which means we're starting another
479                expression; or a '}', which means we're at the end of this
480                f-string (for a nested format_spec). */
481             s--;
482             break;
483         }
484     }
485     *str = s;
486     assert(s <= end);
487     assert(s == end || *s == '{' || *s == '}');
488 done:
489     if (literal_start != s) {
490         if (raw) {
491             *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
492                                                     s - literal_start,
493                                                     NULL, NULL);
494         } else {
495             *literal = decode_unicode_with_escapes(p, literal_start,
496                                                    s - literal_start, t);
497         }
498         if (!*literal) {
499             return -1;
500         }
501     }
502     return result;
503 }
504 
505 /* Forward declaration because parsing is recursive. */
506 static expr_ty
507 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
508               Token *first_token, Token* t, Token *last_token);
509 
510 /* Parse the f-string at *str, ending at end.  We know *str starts an
511    expression (so it must be a '{'). Returns the FormattedValue node, which
512    includes the expression, conversion character, format_spec expression, and
513    optionally the text of the expression (if = is used).
514 
515    Note that I don't do a perfect job here: I don't make sure that a
516    closing brace doesn't match an opening paren, for example. It
517    doesn't need to error on all invalid expressions, just correctly
518    find the end of all valid ones. Any errors inside the expression
519    will be caught when we parse it later.
520 
521    *expression is set to the expression.  For an '=' "debug" expression,
522    *expr_text is set to the debug text (the original text of the expression,
523    including the '=' and any whitespace around it, as a string object).  If
524    not a debug expression, *expr_text set to NULL. */
525 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)526 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
527                   PyObject **expr_text, expr_ty *expression, Token *first_token,
528                   Token *t, Token *last_token)
529 {
530     /* Return -1 on error, else 0. */
531 
532     const char *expr_start;
533     const char *expr_end;
534     expr_ty simple_expression;
535     expr_ty format_spec = NULL; /* Optional format specifier. */
536     int conversion = -1; /* The conversion char.  Use default if not
537                             specified, or !r if using = and no format
538                             spec. */
539 
540     /* 0 if we're not in a string, else the quote char we're trying to
541        match (single or double quote). */
542     char quote_char = 0;
543 
544     /* If we're inside a string, 1=normal, 3=triple-quoted. */
545     int string_type = 0;
546 
547     /* Keep track of nesting level for braces/parens/brackets in
548        expressions. */
549     Py_ssize_t nested_depth = 0;
550     char parenstack[MAXLEVEL];
551 
552     *expr_text = NULL;
553 
554     /* Can only nest one level deep. */
555     if (recurse_lvl >= 2) {
556         RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
557         goto error;
558     }
559 
560     /* The first char must be a left brace, or we wouldn't have gotten
561        here. Skip over it. */
562     assert(**str == '{');
563     *str += 1;
564 
565     expr_start = *str;
566     for (; *str < end; (*str)++) {
567         char ch;
568 
569         /* Loop invariants. */
570         assert(nested_depth >= 0);
571         assert(*str >= expr_start && *str < end);
572         if (quote_char) {
573             assert(string_type == 1 || string_type == 3);
574         } else {
575             assert(string_type == 0);
576         }
577 
578         ch = **str;
579         /* Nowhere inside an expression is a backslash allowed. */
580         if (ch == '\\') {
581             /* Error: can't include a backslash character, inside
582                parens or strings or not. */
583             RAISE_SYNTAX_ERROR(
584                       "f-string expression part "
585                       "cannot include a backslash");
586             goto error;
587         }
588         if (quote_char) {
589             /* We're inside a string. See if we're at the end. */
590             /* This code needs to implement the same non-error logic
591                as tok_get from tokenizer.c, at the letter_quote
592                label. To actually share that code would be a
593                nightmare. But, it's unlikely to change and is small,
594                so duplicate it here. Note we don't need to catch all
595                of the errors, since they'll be caught when parsing the
596                expression. We just need to match the non-error
597                cases. Thus we can ignore \n in single-quoted strings,
598                for example. Or non-terminated strings. */
599             if (ch == quote_char) {
600                 /* Does this match the string_type (single or triple
601                    quoted)? */
602                 if (string_type == 3) {
603                     if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
604                         /* We're at the end of a triple quoted string. */
605                         *str += 2;
606                         string_type = 0;
607                         quote_char = 0;
608                         continue;
609                     }
610                 } else {
611                     /* We're at the end of a normal string. */
612                     quote_char = 0;
613                     string_type = 0;
614                     continue;
615                 }
616             }
617         } else if (ch == '\'' || ch == '"') {
618             /* Is this a triple quoted string? */
619             if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
620                 string_type = 3;
621                 *str += 2;
622             } else {
623                 /* Start of a normal string. */
624                 string_type = 1;
625             }
626             /* Start looking for the end of the string. */
627             quote_char = ch;
628         } else if (ch == '[' || ch == '{' || ch == '(') {
629             if (nested_depth >= MAXLEVEL) {
630                 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
631                 goto error;
632             }
633             parenstack[nested_depth] = ch;
634             nested_depth++;
635         } else if (ch == '#') {
636             /* Error: can't include a comment character, inside parens
637                or not. */
638             RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
639             goto error;
640         } else if (nested_depth == 0 &&
641                    (ch == '!' || ch == ':' || ch == '}' ||
642                     ch == '=' || ch == '>' || ch == '<')) {
643             /* See if there's a next character. */
644             if (*str+1 < end) {
645                 char next = *(*str+1);
646 
647                 /* For "!=". since '=' is not an allowed conversion character,
648                    nothing is lost in this test. */
649                 if ((ch == '!' && next == '=') ||   /* != */
650                     (ch == '=' && next == '=') ||   /* == */
651                     (ch == '<' && next == '=') ||   /* <= */
652                     (ch == '>' && next == '=')      /* >= */
653                     ) {
654                     *str += 1;
655                     continue;
656                 }
657                 /* Don't get out of the loop for these, if they're single
658                    chars (not part of 2-char tokens). If by themselves, they
659                    don't end an expression (unlike say '!'). */
660                 if (ch == '>' || ch == '<') {
661                     continue;
662                 }
663             }
664 
665             /* Normal way out of this loop. */
666             break;
667         } else if (ch == ']' || ch == '}' || ch == ')') {
668             if (!nested_depth) {
669                 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
670                 goto error;
671             }
672             nested_depth--;
673             int opening = (unsigned char)parenstack[nested_depth];
674             if (!((opening == '(' && ch == ')') ||
675                   (opening == '[' && ch == ']') ||
676                   (opening == '{' && ch == '}')))
677             {
678                 RAISE_SYNTAX_ERROR(
679                           "f-string: closing parenthesis '%c' "
680                           "does not match opening parenthesis '%c'",
681                           ch, opening);
682                 goto error;
683             }
684         } else {
685             /* Just consume this char and loop around. */
686         }
687     }
688     expr_end = *str;
689     /* If we leave this loop in a string or with mismatched parens, we
690        don't care. We'll get a syntax error when compiling the
691        expression. But, we can produce a better error message, so
692        let's just do that.*/
693     if (quote_char) {
694         RAISE_SYNTAX_ERROR("f-string: unterminated string");
695         goto error;
696     }
697     if (nested_depth) {
698         int opening = (unsigned char)parenstack[nested_depth - 1];
699         RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
700         goto error;
701     }
702 
703     if (*str >= end) {
704         goto unexpected_end_of_string;
705     }
706 
707     /* Compile the expression as soon as possible, so we show errors
708        related to the expression before errors related to the
709        conversion or format_spec. */
710     simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
711     if (!simple_expression) {
712         goto error;
713     }
714 
715     /* Check for =, which puts the text value of the expression in
716        expr_text. */
717     if (**str == '=') {
718         if (p->feature_version < 8) {
719             RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
720                                "only supported in Python 3.8 and greater");
721             goto error;
722         }
723         *str += 1;
724 
725         /* Skip over ASCII whitespace.  No need to test for end of string
726            here, since we know there's at least a trailing quote somewhere
727            ahead. */
728         while (Py_ISSPACE(**str)) {
729             *str += 1;
730         }
731 
732         /* Set *expr_text to the text of the expression. */
733         *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
734         if (!*expr_text) {
735             goto error;
736         }
737     }
738 
739     /* Check for a conversion char, if present. */
740     if (**str == '!') {
741         *str += 1;
742         if (*str >= end) {
743             goto unexpected_end_of_string;
744         }
745 
746         conversion = (unsigned char)**str;
747         *str += 1;
748 
749         /* Validate the conversion. */
750         if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
751             RAISE_SYNTAX_ERROR(
752                       "f-string: invalid conversion character: "
753                       "expected 's', 'r', or 'a'");
754             goto error;
755         }
756 
757     }
758 
759     /* Check for the format spec, if present. */
760     if (*str >= end) {
761         goto unexpected_end_of_string;
762     }
763     if (**str == ':') {
764         *str += 1;
765         if (*str >= end) {
766             goto unexpected_end_of_string;
767         }
768 
769         /* Parse the format spec. */
770         format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
771                                     first_token, t, last_token);
772         if (!format_spec) {
773             goto error;
774         }
775     }
776 
777     if (*str >= end || **str != '}') {
778         goto unexpected_end_of_string;
779     }
780 
781     /* We're at a right brace. Consume it. */
782     assert(*str < end);
783     assert(**str == '}');
784     *str += 1;
785 
786     /* If we're in = mode (detected by non-NULL expr_text), and have no format
787        spec and no explicit conversion, set the conversion to 'r'. */
788     if (*expr_text && format_spec == NULL && conversion == -1) {
789         conversion = 'r';
790     }
791 
792     /* And now create the FormattedValue node that represents this
793        entire expression with the conversion and format spec. */
794     //TODO: Fix this
795     *expression = _PyAST_FormattedValue(simple_expression, conversion,
796                                         format_spec, first_token->lineno,
797                                         first_token->col_offset,
798                                         last_token->end_lineno,
799                                         last_token->end_col_offset, p->arena);
800     if (!*expression) {
801         goto error;
802     }
803 
804     return 0;
805 
806 unexpected_end_of_string:
807     RAISE_SYNTAX_ERROR("f-string: expecting '}'");
808     /* Falls through to error. */
809 
810 error:
811     Py_XDECREF(*expr_text);
812     return -1;
813 
814 }
815 
816 /* Return -1 on error.
817 
818    Return 0 if we have a literal (possible zero length) and an
819    expression (zero length if at the end of the string.
820 
821    Return 1 if we have a literal, but no expression, and we want the
822    caller to call us again. This is used to deal with doubled
823    braces.
824 
825    When called multiple times on the string 'a{{b{0}c', this function
826    will return:
827 
828    1. the literal 'a{' with no expression, and a return value
829       of 1. Despite the fact that there's no expression, the return
830       value of 1 means we're not finished yet.
831 
832    2. the literal 'b' and the expression '0', with a return value of
833       0. The fact that there's an expression means we're not finished.
834 
835    3. literal 'c' with no expression and a return value of 0. The
836       combination of the return value of 0 with no expression means
837       we're finished.
838 */
839 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)840 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
841                               int recurse_lvl, PyObject **literal,
842                               PyObject **expr_text, expr_ty *expression,
843                               Token *first_token, Token *t, Token *last_token)
844 {
845     int result;
846 
847     assert(*literal == NULL && *expression == NULL);
848 
849     /* Get any literal string. */
850     result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
851     if (result < 0) {
852         goto error;
853     }
854 
855     assert(result == 0 || result == 1);
856 
857     if (result == 1) {
858         /* We have a literal, but don't look at the expression. */
859         return 1;
860     }
861 
862     if (*str >= end || **str == '}') {
863         /* We're at the end of the string or the end of a nested
864            f-string: no expression. The top-level error case where we
865            expect to be at the end of the string but we're at a '}' is
866            handled later. */
867         return 0;
868     }
869 
870     /* We must now be the start of an expression, on a '{'. */
871     assert(**str == '{');
872 
873     if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
874                           expression, first_token, t, last_token) < 0) {
875         goto error;
876     }
877 
878     return 0;
879 
880 error:
881     Py_CLEAR(*literal);
882     return -1;
883 }
884 
885 #ifdef NDEBUG
886 #define ExprList_check_invariants(l)
887 #else
888 static void
ExprList_check_invariants(ExprList * l)889 ExprList_check_invariants(ExprList *l)
890 {
891     /* Check our invariants. Make sure this object is "live", and
892        hasn't been deallocated. */
893     assert(l->size >= 0);
894     assert(l->p != NULL);
895     if (l->size <= EXPRLIST_N_CACHED) {
896         assert(l->data == l->p);
897     }
898 }
899 #endif
900 
901 static void
ExprList_Init(ExprList * l)902 ExprList_Init(ExprList *l)
903 {
904     l->allocated = EXPRLIST_N_CACHED;
905     l->size = 0;
906 
907     /* Until we start allocating dynamically, p points to data. */
908     l->p = l->data;
909 
910     ExprList_check_invariants(l);
911 }
912 
913 static int
ExprList_Append(ExprList * l,expr_ty exp)914 ExprList_Append(ExprList *l, expr_ty exp)
915 {
916     ExprList_check_invariants(l);
917     if (l->size >= l->allocated) {
918         /* We need to alloc (or realloc) the memory. */
919         Py_ssize_t new_size = l->allocated * 2;
920 
921         /* See if we've ever allocated anything dynamically. */
922         if (l->p == l->data) {
923             Py_ssize_t i;
924             /* We're still using the cached data. Switch to
925                alloc-ing. */
926             l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
927             if (!l->p) {
928                 return -1;
929             }
930             /* Copy the cached data into the new buffer. */
931             for (i = 0; i < l->size; i++) {
932                 l->p[i] = l->data[i];
933             }
934         } else {
935             /* Just realloc. */
936             expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
937             if (!tmp) {
938                 PyMem_Free(l->p);
939                 l->p = NULL;
940                 return -1;
941             }
942             l->p = tmp;
943         }
944 
945         l->allocated = new_size;
946         assert(l->allocated == 2 * l->size);
947     }
948 
949     l->p[l->size++] = exp;
950 
951     ExprList_check_invariants(l);
952     return 0;
953 }
954 
955 static void
ExprList_Dealloc(ExprList * l)956 ExprList_Dealloc(ExprList *l)
957 {
958     ExprList_check_invariants(l);
959 
960     /* If there's been an error, or we've never dynamically allocated,
961        do nothing. */
962     if (!l->p || l->p == l->data) {
963         /* Do nothing. */
964     } else {
965         /* We have dynamically allocated. Free the memory. */
966         PyMem_Free(l->p);
967     }
968     l->p = NULL;
969     l->size = -1;
970 }
971 
972 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)973 ExprList_Finish(ExprList *l, PyArena *arena)
974 {
975     asdl_expr_seq *seq;
976 
977     ExprList_check_invariants(l);
978 
979     /* Allocate the asdl_seq and copy the expressions in to it. */
980     seq = _Py_asdl_expr_seq_new(l->size, arena);
981     if (seq) {
982         Py_ssize_t i;
983         for (i = 0; i < l->size; i++) {
984             asdl_seq_SET(seq, i, l->p[i]);
985         }
986     }
987     ExprList_Dealloc(l);
988     return seq;
989 }
990 
991 #ifdef NDEBUG
992 #define FstringParser_check_invariants(state)
993 #else
994 static void
FstringParser_check_invariants(FstringParser * state)995 FstringParser_check_invariants(FstringParser *state)
996 {
997     if (state->last_str) {
998         assert(PyUnicode_CheckExact(state->last_str));
999     }
1000     ExprList_check_invariants(&state->expr_list);
1001 }
1002 #endif
1003 
1004 void
_PyPegen_FstringParser_Init(FstringParser * state)1005 _PyPegen_FstringParser_Init(FstringParser *state)
1006 {
1007     state->last_str = NULL;
1008     state->fmode = 0;
1009     ExprList_Init(&state->expr_list);
1010     FstringParser_check_invariants(state);
1011 }
1012 
1013 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1014 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1015 {
1016     FstringParser_check_invariants(state);
1017 
1018     Py_XDECREF(state->last_str);
1019     ExprList_Dealloc(&state->expr_list);
1020 }
1021 
1022 /* Make a Constant node, but decref the PyUnicode object being added. */
1023 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1024 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1025 {
1026     PyObject *s = *str;
1027     PyObject *kind = NULL;
1028     *str = NULL;
1029     assert(PyUnicode_CheckExact(s));
1030     if (_PyArena_AddPyObject(p->arena, s) < 0) {
1031         Py_DECREF(s);
1032         return NULL;
1033     }
1034     const char* the_str = PyBytes_AsString(first_token->bytes);
1035     if (the_str && the_str[0] == 'u') {
1036         kind = _PyPegen_new_identifier(p, "u");
1037     }
1038 
1039     if (kind == NULL && PyErr_Occurred()) {
1040         return NULL;
1041     }
1042 
1043     return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1044                            last_token->end_lineno, last_token->end_col_offset,
1045                            p->arena);
1046 
1047 }
1048 
1049 
1050 /* Add a non-f-string (that is, a regular literal string). str is
1051    decref'd. */
1052 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1053 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1054 {
1055     FstringParser_check_invariants(state);
1056 
1057     assert(PyUnicode_CheckExact(str));
1058 
1059     if (PyUnicode_GET_LENGTH(str) == 0) {
1060         Py_DECREF(str);
1061         return 0;
1062     }
1063 
1064     if (!state->last_str) {
1065         /* We didn't have a string before, so just remember this one. */
1066         state->last_str = str;
1067     } else {
1068         /* Concatenate this with the previous string. */
1069         PyUnicode_AppendAndDel(&state->last_str, str);
1070         if (!state->last_str) {
1071             return -1;
1072         }
1073     }
1074     FstringParser_check_invariants(state);
1075     return 0;
1076 }
1077 
1078 /* Parse an f-string. The f-string is in *str to end, with no
1079    'f' or quotes. */
1080 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1081 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1082                             const char *end, int raw, int recurse_lvl,
1083                             Token *first_token, Token* t, Token *last_token)
1084 {
1085     FstringParser_check_invariants(state);
1086     state->fmode = 1;
1087 
1088     /* Parse the f-string. */
1089     while (1) {
1090         PyObject *literal = NULL;
1091         PyObject *expr_text = NULL;
1092         expr_ty expression = NULL;
1093 
1094         /* If there's a zero length literal in front of the
1095            expression, literal will be NULL. If we're at the end of
1096            the f-string, expression will be NULL (unless result == 1,
1097            see below). */
1098         int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1099                                                    &literal, &expr_text,
1100                                                    &expression, first_token, t, last_token);
1101         if (result < 0) {
1102             return -1;
1103         }
1104 
1105         /* Add the literal, if any. */
1106         if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1107             Py_XDECREF(expr_text);
1108             return -1;
1109         }
1110         /* Add the expr_text, if any. */
1111         if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1112             return -1;
1113         }
1114 
1115         /* We've dealt with the literal and expr_text, their ownership has
1116            been transferred to the state object.  Don't look at them again. */
1117 
1118         /* See if we should just loop around to get the next literal
1119            and expression, while ignoring the expression this
1120            time. This is used for un-doubling braces, as an
1121            optimization. */
1122         if (result == 1) {
1123             continue;
1124         }
1125 
1126         if (!expression) {
1127             /* We're done with this f-string. */
1128             break;
1129         }
1130 
1131         /* We know we have an expression. Convert any existing string
1132            to a Constant node. */
1133         if (!state->last_str) {
1134             /* Do nothing. No previous literal. */
1135         } else {
1136             /* Convert the existing last_str literal to a Constant node. */
1137             expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1138             if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1139                 return -1;
1140             }
1141         }
1142 
1143         if (ExprList_Append(&state->expr_list, expression) < 0) {
1144             return -1;
1145         }
1146     }
1147 
1148     /* If recurse_lvl is zero, then we must be at the end of the
1149        string. Otherwise, we must be at a right brace. */
1150 
1151     if (recurse_lvl == 0 && *str < end-1) {
1152         RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1153         return -1;
1154     }
1155     if (recurse_lvl != 0 && **str != '}') {
1156         RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1157         return -1;
1158     }
1159 
1160     FstringParser_check_invariants(state);
1161     return 0;
1162 }
1163 
1164 /* Convert the partial state reflected in last_str and expr_list to an
1165    expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1166 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1167 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1168                      Token *last_token)
1169 {
1170     asdl_expr_seq *seq;
1171 
1172     FstringParser_check_invariants(state);
1173 
1174     /* If we're just a constant string with no expressions, return
1175        that. */
1176     if (!state->fmode) {
1177         assert(!state->expr_list.size);
1178         if (!state->last_str) {
1179             /* Create a zero length string. */
1180             state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1181             if (!state->last_str) {
1182                 goto error;
1183             }
1184         }
1185         return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1186     }
1187 
1188     /* Create a Constant node out of last_str, if needed. It will be the
1189        last node in our expression list. */
1190     if (state->last_str) {
1191         expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1192         if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1193             goto error;
1194         }
1195     }
1196     /* This has already been freed. */
1197     assert(state->last_str == NULL);
1198 
1199     seq = ExprList_Finish(&state->expr_list, p->arena);
1200     if (!seq) {
1201         goto error;
1202     }
1203 
1204     return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1205                             last_token->end_lineno, last_token->end_col_offset,
1206                             p->arena);
1207 
1208 error:
1209     _PyPegen_FstringParser_Dealloc(state);
1210     return NULL;
1211 }
1212 
1213 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1214    at end, parse it into an expr_ty.  Return NULL on error.  Adjust
1215    str to point past the parsed portion. */
1216 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1217 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1218               int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1219 {
1220     FstringParser state;
1221 
1222     _PyPegen_FstringParser_Init(&state);
1223     if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1224                                     first_token, t, last_token) < 0) {
1225         _PyPegen_FstringParser_Dealloc(&state);
1226         return NULL;
1227     }
1228 
1229     return _PyPegen_FstringParser_Finish(p, &state, t, t);
1230 }
1231