• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdbool.h>
2 
3 #include <Python.h>
4 
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8 
9 //// STRING HANDLING FUNCTIONS ////
10 
11 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)12 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
13 {
14     PyObject *msg =
15         PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
16     if (msg == NULL) {
17         return -1;
18     }
19     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
20                                  t->lineno, NULL, NULL) < 0) {
21         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22             /* Replace the DeprecationWarning exception with a SyntaxError
23                to get a more accurate error report */
24             PyErr_Clear();
25 
26             /* This is needed, in order for the SyntaxError to point to the token t,
27                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28                error location, if p->known_err_token is not set. */
29             p->known_err_token = t;
30             RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
31         }
32         Py_DECREF(msg);
33         return -1;
34     }
35     Py_DECREF(msg);
36     return 0;
37 }
38 
39 static PyObject *
decode_utf8(const char ** sPtr,const char * end)40 decode_utf8(const char **sPtr, const char *end)
41 {
42     const char *s;
43     const char *t;
44     t = s = *sPtr;
45     while (s < end && (*s & 0x80)) {
46         s++;
47     }
48     *sPtr = s;
49     return PyUnicode_DecodeUTF8(t, s - t, NULL);
50 }
51 
52 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)53 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
54 {
55     PyObject *v;
56     PyObject *u;
57     char *buf;
58     char *p;
59     const char *end;
60 
61     /* check for integer overflow */
62     if (len > SIZE_MAX / 6) {
63         return NULL;
64     }
65     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67     u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68     if (u == NULL) {
69         return NULL;
70     }
71     p = buf = PyBytes_AsString(u);
72     if (p == NULL) {
73         return NULL;
74     }
75     end = s + len;
76     while (s < end) {
77         if (*s == '\\') {
78             *p++ = *s++;
79             if (s >= end || *s & 0x80) {
80                 strcpy(p, "u005c");
81                 p += 5;
82                 if (s >= end) {
83                     break;
84                 }
85             }
86         }
87         if (*s & 0x80) {
88             PyObject *w;
89             int kind;
90             const void *data;
91             Py_ssize_t w_len;
92             Py_ssize_t i;
93             w = decode_utf8(&s, end);
94             if (w == NULL) {
95                 Py_DECREF(u);
96                 return NULL;
97             }
98             kind = PyUnicode_KIND(w);
99             data = PyUnicode_DATA(w);
100             w_len = PyUnicode_GET_LENGTH(w);
101             for (i = 0; i < w_len; i++) {
102                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103                 sprintf(p, "\\U%08x", chr);
104                 p += 10;
105             }
106             /* Should be impossible to overflow */
107             assert(p - buf <= PyBytes_GET_SIZE(u));
108             Py_DECREF(w);
109         }
110         else {
111             *p++ = *s++;
112         }
113     }
114     len = p - buf;
115     s = buf;
116 
117     const char *first_invalid_escape;
118     v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119 
120     if (v != NULL && first_invalid_escape != NULL) {
121         if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122             /* We have not decref u before because first_invalid_escape points
123                inside u. */
124             Py_XDECREF(u);
125             Py_DECREF(v);
126             return NULL;
127         }
128     }
129     Py_XDECREF(u);
130     return v;
131 }
132 
133 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)134 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135 {
136     const char *first_invalid_escape;
137     PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138     if (result == NULL) {
139         return NULL;
140     }
141 
142     if (first_invalid_escape != NULL) {
143         if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
144             Py_DECREF(result);
145             return NULL;
146         }
147     }
148     return result;
149 }
150 
151 /* s must include the bracketing quote characters, and r, b, u,
152    &/or f prefixes (if any), and embedded escape sequences (if any).
153    _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154    If the string is an f-string, set *fstr and *fstrlen to the unparsed
155    string object.  Return 0 if no errors occurred.  */
156 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)157 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158                   const char **fstr, Py_ssize_t *fstrlen, Token *t)
159 {
160     const char *s = PyBytes_AsString(t->bytes);
161     if (s == NULL) {
162         return -1;
163     }
164 
165     size_t len;
166     int quote = Py_CHARMASK(*s);
167     int fmode = 0;
168     *bytesmode = 0;
169     *rawmode = 0;
170     *result = NULL;
171     *fstr = NULL;
172     if (Py_ISALPHA(quote)) {
173         while (!*bytesmode || !*rawmode) {
174             if (quote == 'b' || quote == 'B') {
175                 quote =(unsigned char)*++s;
176                 *bytesmode = 1;
177             }
178             else if (quote == 'u' || quote == 'U') {
179                 quote = (unsigned char)*++s;
180             }
181             else if (quote == 'r' || quote == 'R') {
182                 quote = (unsigned char)*++s;
183                 *rawmode = 1;
184             }
185             else if (quote == 'f' || quote == 'F') {
186                 quote = (unsigned char)*++s;
187                 fmode = 1;
188             }
189             else {
190                 break;
191             }
192         }
193     }
194 
195     /* fstrings are only allowed in Python 3.6 and greater */
196     if (fmode && p->feature_version < 6) {
197         p->error_indicator = 1;
198         RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199         return -1;
200     }
201 
202     if (fmode && *bytesmode) {
203         PyErr_BadInternalCall();
204         return -1;
205     }
206     if (quote != '\'' && quote != '\"') {
207         PyErr_BadInternalCall();
208         return -1;
209     }
210     /* Skip the leading quote char. */
211     s++;
212     len = strlen(s);
213     if (len > INT_MAX) {
214         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215         return -1;
216     }
217     if (s[--len] != quote) {
218         /* Last quote char must match the first. */
219         PyErr_BadInternalCall();
220         return -1;
221     }
222     if (len >= 4 && s[0] == quote && s[1] == quote) {
223         /* A triple quoted string. We've already skipped one quote at
224            the start and one at the end of the string. Now skip the
225            two at the start. */
226         s += 2;
227         len -= 2;
228         /* And check that the last two match. */
229         if (s[--len] != quote || s[--len] != quote) {
230             PyErr_BadInternalCall();
231             return -1;
232         }
233     }
234 
235     if (fmode) {
236         /* Just return the bytes. The caller will parse the resulting
237            string. */
238         *fstr = s;
239         *fstrlen = len;
240         return 0;
241     }
242 
243     /* Not an f-string. */
244     /* Avoid invoking escape decoding routines if possible. */
245     *rawmode = *rawmode || strchr(s, '\\') == NULL;
246     if (*bytesmode) {
247         /* Disallow non-ASCII characters. */
248         const char *ch;
249         for (ch = s; *ch; ch++) {
250             if (Py_CHARMASK(*ch) >= 0x80) {
251                 RAISE_SYNTAX_ERROR(
252                                    "bytes can only contain ASCII "
253                                    "literal characters");
254                 return -1;
255             }
256         }
257         if (*rawmode) {
258             *result = PyBytes_FromStringAndSize(s, len);
259         }
260         else {
261             *result = decode_bytes_with_escapes(p, s, len, t);
262         }
263     }
264     else {
265         if (*rawmode) {
266             *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267         }
268         else {
269             *result = decode_unicode_with_escapes(p, s, len, t);
270         }
271     }
272     return *result == NULL ? -1 : 0;
273 }
274 
275 
276 
277 // FSTRING STUFF
278 
279 /* Fix locations for the given node and its children.
280 
281    `parent` is the enclosing node.
282    `expr_start` is the starting position of the expression (pointing to the open brace).
283    `n` is the node which locations are going to be fixed relative to parent.
284    `expr_str` is the child node's string representation, including braces.
285 */
286 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)287 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
288 {
289     *p_lines = 0;
290     *p_cols = 0;
291     assert(expr_start != NULL && *expr_start == '{');
292     if (parent && parent->bytes) {
293         const char *parent_str = PyBytes_AsString(parent->bytes);
294         if (!parent_str) {
295             return false;
296         }
297         // The following is needed, in order to correctly shift the column
298         // offset, in the case that (disregarding any whitespace) a newline
299         // immediately follows the opening curly brace of the fstring expression.
300         bool newline_after_brace = 1;
301         const char *start = expr_start + 1;
302         while (start && *start != '}' && *start != '\n') {
303             if (*start != ' ' && *start != '\t' && *start != '\f') {
304                 newline_after_brace = 0;
305                 break;
306             }
307             start++;
308         }
309 
310         // Account for the characters from the last newline character to our
311         // left until the beginning of expr_start.
312         if (!newline_after_brace) {
313             start = expr_start;
314             while (start > parent_str && *start != '\n') {
315                 start--;
316             }
317             *p_cols += (int)(expr_start - start);
318         }
319         /* adjust the start based on the number of newlines encountered
320            before the f-string expression */
321         for (const char *p = parent_str; p < expr_start; p++) {
322             if (*p == '\n') {
323                 (*p_lines)++;
324             }
325         }
326     }
327     return true;
328 }
329 
330 
331 /* Compile this expression in to an expr_ty.  Add parens around the
332    expression, in order to allow leading spaces in the expression. */
333 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)334 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335                      Token *t)
336 {
337     expr_ty expr = NULL;
338     char *str;
339     Py_ssize_t len;
340     const char *s;
341     expr_ty result = NULL;
342 
343     assert(expr_end >= expr_start);
344     assert(*(expr_start-1) == '{');
345     assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346            *expr_end == '=');
347 
348     /* If the substring is all whitespace, it's an error.  We need to catch this
349        here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350        because turning the expression '' in to '()' would go from being invalid
351        to valid. */
352     for (s = expr_start; s != expr_end; s++) {
353         char c = *s;
354         /* The Python parser ignores only the following whitespace
355            characters (\r already is converted to \n). */
356         if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357             break;
358         }
359     }
360     if (s == expr_end) {
361         RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362         return NULL;
363     }
364 
365     len = expr_end - expr_start;
366     /* Allocate 3 extra bytes: open paren, close paren, null byte. */
367     str = PyMem_Calloc(len + 3, sizeof(char));
368     if (str == NULL) {
369         PyErr_NoMemory();
370         return NULL;
371     }
372 
373     // The call to fstring_find_expr_location is responsible for finding the column offset
374     // the generated AST nodes need to be shifted to the right, which is equal to the number
375     // of the f-string characters before the expression starts.
376     memcpy(str+1, expr_start, len);
377     int lines, cols;
378     if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
379         PyMem_Free(str);
380         return NULL;
381     }
382 
383     // The parentheses are needed in order to allow for leading whitespace within
384     // the f-string expression. This consequently gets parsed as a group (see the
385     // group rule in python.gram).
386     str[0] = '(';
387     str[len+1] = ')';
388 
389     struct tok_state* tok = PyTokenizer_FromString(str, 1);
390     if (tok == NULL) {
391         PyMem_Free(str);
392         return NULL;
393     }
394     Py_INCREF(p->tok->filename);
395 
396     tok->filename = p->tok->filename;
397     tok->lineno = t->lineno + lines - 1;
398 
399     Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
400                                      NULL, p->arena);
401 
402     p2->starting_lineno = t->lineno + lines;
403     p2->starting_col_offset = t->col_offset + cols;
404 
405     expr = _PyPegen_run_parser(p2);
406 
407     if (expr == NULL) {
408         goto exit;
409     }
410     result = expr;
411 
412 exit:
413     PyMem_Free(str);
414     _PyPegen_Parser_Free(p2);
415     PyTokenizer_Free(tok);
416     return result;
417 }
418 
419 /* Return -1 on error.
420 
421    Return 0 if we reached the end of the literal.
422 
423    Return 1 if we haven't reached the end of the literal, but we want
424    the caller to process the literal up to this point. Used for
425    doubled braces.
426 */
427 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)428 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
429                      PyObject **literal, int recurse_lvl, Token *t)
430 {
431     /* Get any literal string. It ends when we hit an un-doubled left
432        brace (which isn't part of a unicode name escape such as
433        "\N{EULER CONSTANT}"), or the end of the string. */
434 
435     const char *s = *str;
436     const char *literal_start = s;
437     int result = 0;
438 
439     assert(*literal == NULL);
440     while (s < end) {
441         char ch = *s++;
442         if (!raw && ch == '\\' && s < end) {
443             ch = *s++;
444             if (ch == 'N') {
445                 /* We need to look at and skip matching braces for "\N{name}"
446                    sequences because otherwise we'll think the opening '{'
447                    starts an expression, which is not the case with "\N".
448                    Keep looking for either a matched '{' '}' pair, or the end
449                    of the string. */
450 
451                 if (s < end && *s++ == '{') {
452                     while (s < end && *s++ != '}') {
453                     }
454                     continue;
455                 }
456 
457                 /* This is an invalid "\N" sequence, since it's a "\N" not
458                    followed by a "{".  Just keep parsing this literal.  This
459                    error will be caught later by
460                    decode_unicode_with_escapes(). */
461                 continue;
462             }
463             if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
464                 return -1;
465             }
466         }
467         if (ch == '{' || ch == '}') {
468             /* Check for doubled braces, but only at the top level. If
469                we checked at every level, then f'{0:{3}}' would fail
470                with the two closing braces. */
471             if (recurse_lvl == 0) {
472                 if (s < end && *s == ch) {
473                     /* We're going to tell the caller that the literal ends
474                        here, but that they should continue scanning. But also
475                        skip over the second brace when we resume scanning. */
476                     *str = s + 1;
477                     result = 1;
478                     goto done;
479                 }
480 
481                 /* Where a single '{' is the start of a new expression, a
482                    single '}' is not allowed. */
483                 if (ch == '}') {
484                     *str = s - 1;
485                     RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
486                     return -1;
487                 }
488             }
489             /* We're either at a '{', which means we're starting another
490                expression; or a '}', which means we're at the end of this
491                f-string (for a nested format_spec). */
492             s--;
493             break;
494         }
495     }
496     *str = s;
497     assert(s <= end);
498     assert(s == end || *s == '{' || *s == '}');
499 done:
500     if (literal_start != s) {
501         if (raw) {
502             *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
503                                                     s - literal_start,
504                                                     NULL, NULL);
505         }
506         else {
507             *literal = decode_unicode_with_escapes(p, literal_start,
508                                                    s - literal_start, t);
509         }
510         if (!*literal) {
511             return -1;
512         }
513     }
514     return result;
515 }
516 
517 /* Forward declaration because parsing is recursive. */
518 static expr_ty
519 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
520               Token *first_token, Token* t, Token *last_token);
521 
522 /* Parse the f-string at *str, ending at end.  We know *str starts an
523    expression (so it must be a '{'). Returns the FormattedValue node, which
524    includes the expression, conversion character, format_spec expression, and
525    optionally the text of the expression (if = is used).
526 
527    Note that I don't do a perfect job here: I don't make sure that a
528    closing brace doesn't match an opening paren, for example. It
529    doesn't need to error on all invalid expressions, just correctly
530    find the end of all valid ones. Any errors inside the expression
531    will be caught when we parse it later.
532 
533    *expression is set to the expression.  For an '=' "debug" expression,
534    *expr_text is set to the debug text (the original text of the expression,
535    including the '=' and any whitespace around it, as a string object).  If
536    not a debug expression, *expr_text set to NULL. */
537 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)538 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
539                   PyObject **expr_text, expr_ty *expression, Token *first_token,
540                   Token *t, Token *last_token)
541 {
542     /* Return -1 on error, else 0. */
543 
544     const char *expr_start;
545     const char *expr_end;
546     expr_ty simple_expression;
547     expr_ty format_spec = NULL; /* Optional format specifier. */
548     int conversion = -1; /* The conversion char.  Use default if not
549                             specified, or !r if using = and no format
550                             spec. */
551 
552     /* 0 if we're not in a string, else the quote char we're trying to
553        match (single or double quote). */
554     char quote_char = 0;
555 
556     /* If we're inside a string, 1=normal, 3=triple-quoted. */
557     int string_type = 0;
558 
559     /* Keep track of nesting level for braces/parens/brackets in
560        expressions. */
561     Py_ssize_t nested_depth = 0;
562     char parenstack[MAXLEVEL];
563 
564     *expr_text = NULL;
565 
566     /* Can only nest one level deep. */
567     if (recurse_lvl >= 2) {
568         RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
569         goto error;
570     }
571 
572     /* The first char must be a left brace, or we wouldn't have gotten
573        here. Skip over it. */
574     assert(**str == '{');
575     *str += 1;
576 
577     expr_start = *str;
578     for (; *str < end; (*str)++) {
579         char ch;
580 
581         /* Loop invariants. */
582         assert(nested_depth >= 0);
583         assert(*str >= expr_start && *str < end);
584         if (quote_char) {
585             assert(string_type == 1 || string_type == 3);
586         } else {
587             assert(string_type == 0);
588         }
589 
590         ch = **str;
591         /* Nowhere inside an expression is a backslash allowed. */
592         if (ch == '\\') {
593             /* Error: can't include a backslash character, inside
594                parens or strings or not. */
595             RAISE_SYNTAX_ERROR(
596                       "f-string expression part "
597                       "cannot include a backslash");
598             goto error;
599         }
600         if (quote_char) {
601             /* We're inside a string. See if we're at the end. */
602             /* This code needs to implement the same non-error logic
603                as tok_get from tokenizer.c, at the letter_quote
604                label. To actually share that code would be a
605                nightmare. But, it's unlikely to change and is small,
606                so duplicate it here. Note we don't need to catch all
607                of the errors, since they'll be caught when parsing the
608                expression. We just need to match the non-error
609                cases. Thus we can ignore \n in single-quoted strings,
610                for example. Or non-terminated strings. */
611             if (ch == quote_char) {
612                 /* Does this match the string_type (single or triple
613                    quoted)? */
614                 if (string_type == 3) {
615                     if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
616                         /* We're at the end of a triple quoted string. */
617                         *str += 2;
618                         string_type = 0;
619                         quote_char = 0;
620                         continue;
621                     }
622                 } else {
623                     /* We're at the end of a normal string. */
624                     quote_char = 0;
625                     string_type = 0;
626                     continue;
627                 }
628             }
629         } else if (ch == '\'' || ch == '"') {
630             /* Is this a triple quoted string? */
631             if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
632                 string_type = 3;
633                 *str += 2;
634             } else {
635                 /* Start of a normal string. */
636                 string_type = 1;
637             }
638             /* Start looking for the end of the string. */
639             quote_char = ch;
640         } else if (ch == '[' || ch == '{' || ch == '(') {
641             if (nested_depth >= MAXLEVEL) {
642                 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
643                 goto error;
644             }
645             parenstack[nested_depth] = ch;
646             nested_depth++;
647         } else if (ch == '#') {
648             /* Error: can't include a comment character, inside parens
649                or not. */
650             RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
651             goto error;
652         } else if (nested_depth == 0 &&
653                    (ch == '!' || ch == ':' || ch == '}' ||
654                     ch == '=' || ch == '>' || ch == '<')) {
655             /* See if there's a next character. */
656             if (*str+1 < end) {
657                 char next = *(*str+1);
658 
659                 /* For "!=". since '=' is not an allowed conversion character,
660                    nothing is lost in this test. */
661                 if ((ch == '!' && next == '=') ||   /* != */
662                     (ch == '=' && next == '=') ||   /* == */
663                     (ch == '<' && next == '=') ||   /* <= */
664                     (ch == '>' && next == '=')      /* >= */
665                     ) {
666                     *str += 1;
667                     continue;
668                 }
669             }
670             /* Don't get out of the loop for these, if they're single
671                chars (not part of 2-char tokens). If by themselves, they
672                don't end an expression (unlike say '!'). */
673             if (ch == '>' || ch == '<') {
674                 continue;
675             }
676 
677             /* Normal way out of this loop. */
678             break;
679         } else if (ch == ']' || ch == '}' || ch == ')') {
680             if (!nested_depth) {
681                 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
682                 goto error;
683             }
684             nested_depth--;
685             int opening = (unsigned char)parenstack[nested_depth];
686             if (!((opening == '(' && ch == ')') ||
687                   (opening == '[' && ch == ']') ||
688                   (opening == '{' && ch == '}')))
689             {
690                 RAISE_SYNTAX_ERROR(
691                           "f-string: closing parenthesis '%c' "
692                           "does not match opening parenthesis '%c'",
693                           ch, opening);
694                 goto error;
695             }
696         } else {
697             /* Just consume this char and loop around. */
698         }
699     }
700     expr_end = *str;
701     /* If we leave the above loop in a string or with mismatched parens, we
702        don't really care. We'll get a syntax error when compiling the
703        expression. But, we can produce a better error message, so let's just
704        do that.*/
705     if (quote_char) {
706         RAISE_SYNTAX_ERROR("f-string: unterminated string");
707         goto error;
708     }
709     if (nested_depth) {
710         int opening = (unsigned char)parenstack[nested_depth - 1];
711         RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
712         goto error;
713     }
714 
715     if (*str >= end) {
716         goto unexpected_end_of_string;
717     }
718 
719     /* Compile the expression as soon as possible, so we show errors
720        related to the expression before errors related to the
721        conversion or format_spec. */
722     simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
723     if (!simple_expression) {
724         goto error;
725     }
726 
727     /* Check for =, which puts the text value of the expression in
728        expr_text. */
729     if (**str == '=') {
730         if (p->feature_version < 8) {
731             RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
732                                "only supported in Python 3.8 and greater");
733             goto error;
734         }
735         *str += 1;
736 
737         /* Skip over ASCII whitespace.  No need to test for end of string
738            here, since we know there's at least a trailing quote somewhere
739            ahead. */
740         while (Py_ISSPACE(**str)) {
741             *str += 1;
742         }
743 
744         /* Set *expr_text to the text of the expression. */
745         *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
746         if (!*expr_text) {
747             goto error;
748         }
749     }
750 
751     /* Check for a conversion char, if present. */
752     if (**str == '!') {
753         *str += 1;
754         if (*str >= end) {
755             goto unexpected_end_of_string;
756         }
757 
758         conversion = (unsigned char)**str;
759         *str += 1;
760 
761         /* Validate the conversion. */
762         if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
763             RAISE_SYNTAX_ERROR(
764                       "f-string: invalid conversion character: "
765                       "expected 's', 'r', or 'a'");
766             goto error;
767         }
768 
769     }
770 
771     /* Check for the format spec, if present. */
772     if (*str >= end) {
773         goto unexpected_end_of_string;
774     }
775     if (**str == ':') {
776         *str += 1;
777         if (*str >= end) {
778             goto unexpected_end_of_string;
779         }
780 
781         /* Parse the format spec. */
782         format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
783                                     first_token, t, last_token);
784         if (!format_spec) {
785             goto error;
786         }
787     }
788 
789     if (*str >= end || **str != '}') {
790         goto unexpected_end_of_string;
791     }
792 
793     /* We're at a right brace. Consume it. */
794     assert(*str < end);
795     assert(**str == '}');
796     *str += 1;
797 
798     /* If we're in = mode (detected by non-NULL expr_text), and have no format
799        spec and no explicit conversion, set the conversion to 'r'. */
800     if (*expr_text && format_spec == NULL && conversion == -1) {
801         conversion = 'r';
802     }
803 
804     /* And now create the FormattedValue node that represents this
805        entire expression with the conversion and format spec. */
806     //TODO: Fix this
807     *expression = _PyAST_FormattedValue(simple_expression, conversion,
808                                         format_spec, first_token->lineno,
809                                         first_token->col_offset,
810                                         last_token->end_lineno,
811                                         last_token->end_col_offset, p->arena);
812     if (!*expression) {
813         goto error;
814     }
815 
816     return 0;
817 
818 unexpected_end_of_string:
819     RAISE_SYNTAX_ERROR("f-string: expecting '}'");
820     /* Falls through to error. */
821 
822 error:
823     Py_XDECREF(*expr_text);
824     return -1;
825 
826 }
827 
828 /* Return -1 on error.
829 
830    Return 0 if we have a literal (possible zero length) and an
831    expression (zero length if at the end of the string.
832 
833    Return 1 if we have a literal, but no expression, and we want the
834    caller to call us again. This is used to deal with doubled
835    braces.
836 
837    When called multiple times on the string 'a{{b{0}c', this function
838    will return:
839 
840    1. the literal 'a{' with no expression, and a return value
841       of 1. Despite the fact that there's no expression, the return
842       value of 1 means we're not finished yet.
843 
844    2. the literal 'b' and the expression '0', with a return value of
845       0. The fact that there's an expression means we're not finished.
846 
847    3. literal 'c' with no expression and a return value of 0. The
848       combination of the return value of 0 with no expression means
849       we're finished.
850 */
851 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)852 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
853                               int recurse_lvl, PyObject **literal,
854                               PyObject **expr_text, expr_ty *expression,
855                               Token *first_token, Token *t, Token *last_token)
856 {
857     int result;
858 
859     assert(*literal == NULL && *expression == NULL);
860 
861     /* Get any literal string. */
862     result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
863     if (result < 0) {
864         goto error;
865     }
866 
867     assert(result == 0 || result == 1);
868 
869     if (result == 1) {
870         /* We have a literal, but don't look at the expression. */
871         return 1;
872     }
873 
874     if (*str >= end || **str == '}') {
875         /* We're at the end of the string or the end of a nested
876            f-string: no expression. The top-level error case where we
877            expect to be at the end of the string but we're at a '}' is
878            handled later. */
879         return 0;
880     }
881 
882     /* We must now be the start of an expression, on a '{'. */
883     assert(**str == '{');
884 
885     if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
886                           expression, first_token, t, last_token) < 0) {
887         goto error;
888     }
889 
890     return 0;
891 
892 error:
893     Py_CLEAR(*literal);
894     return -1;
895 }
896 
897 #ifdef NDEBUG
898 #define ExprList_check_invariants(l)
899 #else
900 static void
ExprList_check_invariants(ExprList * l)901 ExprList_check_invariants(ExprList *l)
902 {
903     /* Check our invariants. Make sure this object is "live", and
904        hasn't been deallocated. */
905     assert(l->size >= 0);
906     assert(l->p != NULL);
907     if (l->size <= EXPRLIST_N_CACHED) {
908         assert(l->data == l->p);
909     }
910 }
911 #endif
912 
913 static void
ExprList_Init(ExprList * l)914 ExprList_Init(ExprList *l)
915 {
916     l->allocated = EXPRLIST_N_CACHED;
917     l->size = 0;
918 
919     /* Until we start allocating dynamically, p points to data. */
920     l->p = l->data;
921 
922     ExprList_check_invariants(l);
923 }
924 
925 static int
ExprList_Append(ExprList * l,expr_ty exp)926 ExprList_Append(ExprList *l, expr_ty exp)
927 {
928     ExprList_check_invariants(l);
929     if (l->size >= l->allocated) {
930         /* We need to alloc (or realloc) the memory. */
931         Py_ssize_t new_size = l->allocated * 2;
932 
933         /* See if we've ever allocated anything dynamically. */
934         if (l->p == l->data) {
935             Py_ssize_t i;
936             /* We're still using the cached data. Switch to
937                alloc-ing. */
938             l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
939             if (!l->p) {
940                 return -1;
941             }
942             /* Copy the cached data into the new buffer. */
943             for (i = 0; i < l->size; i++) {
944                 l->p[i] = l->data[i];
945             }
946         } else {
947             /* Just realloc. */
948             expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
949             if (!tmp) {
950                 PyMem_Free(l->p);
951                 l->p = NULL;
952                 return -1;
953             }
954             l->p = tmp;
955         }
956 
957         l->allocated = new_size;
958         assert(l->allocated == 2 * l->size);
959     }
960 
961     l->p[l->size++] = exp;
962 
963     ExprList_check_invariants(l);
964     return 0;
965 }
966 
967 static void
ExprList_Dealloc(ExprList * l)968 ExprList_Dealloc(ExprList *l)
969 {
970     ExprList_check_invariants(l);
971 
972     /* If there's been an error, or we've never dynamically allocated,
973        do nothing. */
974     if (!l->p || l->p == l->data) {
975         /* Do nothing. */
976     } else {
977         /* We have dynamically allocated. Free the memory. */
978         PyMem_Free(l->p);
979     }
980     l->p = NULL;
981     l->size = -1;
982 }
983 
984 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)985 ExprList_Finish(ExprList *l, PyArena *arena)
986 {
987     asdl_expr_seq *seq;
988 
989     ExprList_check_invariants(l);
990 
991     /* Allocate the asdl_seq and copy the expressions in to it. */
992     seq = _Py_asdl_expr_seq_new(l->size, arena);
993     if (seq) {
994         Py_ssize_t i;
995         for (i = 0; i < l->size; i++) {
996             asdl_seq_SET(seq, i, l->p[i]);
997         }
998     }
999     ExprList_Dealloc(l);
1000     return seq;
1001 }
1002 
1003 #ifdef NDEBUG
1004 #define FstringParser_check_invariants(state)
1005 #else
1006 static void
FstringParser_check_invariants(FstringParser * state)1007 FstringParser_check_invariants(FstringParser *state)
1008 {
1009     if (state->last_str) {
1010         assert(PyUnicode_CheckExact(state->last_str));
1011     }
1012     ExprList_check_invariants(&state->expr_list);
1013 }
1014 #endif
1015 
1016 void
_PyPegen_FstringParser_Init(FstringParser * state)1017 _PyPegen_FstringParser_Init(FstringParser *state)
1018 {
1019     state->last_str = NULL;
1020     state->fmode = 0;
1021     ExprList_Init(&state->expr_list);
1022     FstringParser_check_invariants(state);
1023 }
1024 
1025 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1026 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1027 {
1028     FstringParser_check_invariants(state);
1029 
1030     Py_XDECREF(state->last_str);
1031     ExprList_Dealloc(&state->expr_list);
1032 }
1033 
1034 /* Make a Constant node, but decref the PyUnicode object being added. */
1035 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1036 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1037 {
1038     PyObject *s = *str;
1039     PyObject *kind = NULL;
1040     *str = NULL;
1041     assert(PyUnicode_CheckExact(s));
1042     if (_PyArena_AddPyObject(p->arena, s) < 0) {
1043         Py_DECREF(s);
1044         return NULL;
1045     }
1046     const char* the_str = PyBytes_AsString(first_token->bytes);
1047     if (the_str && the_str[0] == 'u') {
1048         kind = _PyPegen_new_identifier(p, "u");
1049     }
1050 
1051     if (kind == NULL && PyErr_Occurred()) {
1052         return NULL;
1053     }
1054 
1055     return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1056                            last_token->end_lineno, last_token->end_col_offset,
1057                            p->arena);
1058 
1059 }
1060 
1061 
1062 /* Add a non-f-string (that is, a regular literal string). str is
1063    decref'd. */
1064 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1065 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1066 {
1067     FstringParser_check_invariants(state);
1068 
1069     assert(PyUnicode_CheckExact(str));
1070 
1071     if (PyUnicode_GET_LENGTH(str) == 0) {
1072         Py_DECREF(str);
1073         return 0;
1074     }
1075 
1076     if (!state->last_str) {
1077         /* We didn't have a string before, so just remember this one. */
1078         state->last_str = str;
1079     } else {
1080         /* Concatenate this with the previous string. */
1081         PyUnicode_AppendAndDel(&state->last_str, str);
1082         if (!state->last_str) {
1083             return -1;
1084         }
1085     }
1086     FstringParser_check_invariants(state);
1087     return 0;
1088 }
1089 
1090 /* Parse an f-string. The f-string is in *str to end, with no
1091    'f' or quotes. */
1092 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1093 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1094                             const char *end, int raw, int recurse_lvl,
1095                             Token *first_token, Token* t, Token *last_token)
1096 {
1097     FstringParser_check_invariants(state);
1098     state->fmode = 1;
1099 
1100     /* Parse the f-string. */
1101     while (1) {
1102         PyObject *literal = NULL;
1103         PyObject *expr_text = NULL;
1104         expr_ty expression = NULL;
1105 
1106         /* If there's a zero length literal in front of the
1107            expression, literal will be NULL. If we're at the end of
1108            the f-string, expression will be NULL (unless result == 1,
1109            see below). */
1110         int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1111                                                    &literal, &expr_text,
1112                                                    &expression, first_token, t, last_token);
1113         if (result < 0) {
1114             return -1;
1115         }
1116 
1117         /* Add the literal, if any. */
1118         if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1119             Py_XDECREF(expr_text);
1120             return -1;
1121         }
1122         /* Add the expr_text, if any. */
1123         if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1124             return -1;
1125         }
1126 
1127         /* We've dealt with the literal and expr_text, their ownership has
1128            been transferred to the state object.  Don't look at them again. */
1129 
1130         /* See if we should just loop around to get the next literal
1131            and expression, while ignoring the expression this
1132            time. This is used for un-doubling braces, as an
1133            optimization. */
1134         if (result == 1) {
1135             continue;
1136         }
1137 
1138         if (!expression) {
1139             /* We're done with this f-string. */
1140             break;
1141         }
1142 
1143         /* We know we have an expression. Convert any existing string
1144            to a Constant node. */
1145         if (!state->last_str) {
1146             /* Do nothing. No previous literal. */
1147         } else {
1148             /* Convert the existing last_str literal to a Constant node. */
1149             expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1150             if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1151                 return -1;
1152             }
1153         }
1154 
1155         if (ExprList_Append(&state->expr_list, expression) < 0) {
1156             return -1;
1157         }
1158     }
1159 
1160     /* If recurse_lvl is zero, then we must be at the end of the
1161        string. Otherwise, we must be at a right brace. */
1162 
1163     if (recurse_lvl == 0 && *str < end-1) {
1164         RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1165         return -1;
1166     }
1167     if (recurse_lvl != 0 && **str != '}') {
1168         RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1169         return -1;
1170     }
1171 
1172     FstringParser_check_invariants(state);
1173     return 0;
1174 }
1175 
1176 /* Convert the partial state reflected in last_str and expr_list to an
1177    expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1178 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1179 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1180                      Token *last_token)
1181 {
1182     asdl_expr_seq *seq;
1183 
1184     FstringParser_check_invariants(state);
1185 
1186     /* If we're just a constant string with no expressions, return
1187        that. */
1188     if (!state->fmode) {
1189         assert(!state->expr_list.size);
1190         if (!state->last_str) {
1191             /* Create a zero length string. */
1192             state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1193             if (!state->last_str) {
1194                 goto error;
1195             }
1196         }
1197         return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1198     }
1199 
1200     /* Create a Constant node out of last_str, if needed. It will be the
1201        last node in our expression list. */
1202     if (state->last_str) {
1203         expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1204         if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1205             goto error;
1206         }
1207     }
1208     /* This has already been freed. */
1209     assert(state->last_str == NULL);
1210 
1211     seq = ExprList_Finish(&state->expr_list, p->arena);
1212     if (!seq) {
1213         goto error;
1214     }
1215 
1216     return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1217                             last_token->end_lineno, last_token->end_col_offset,
1218                             p->arena);
1219 
1220 error:
1221     _PyPegen_FstringParser_Dealloc(state);
1222     return NULL;
1223 }
1224 
1225 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1226    at end, parse it into an expr_ty.  Return NULL on error.  Adjust
1227    str to point past the parsed portion. */
1228 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1229 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1230               int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1231 {
1232     FstringParser state;
1233 
1234     _PyPegen_FstringParser_Init(&state);
1235     if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1236                                     first_token, t, last_token) < 0) {
1237         _PyPegen_FstringParser_Dealloc(&state);
1238         return NULL;
1239     }
1240 
1241     return _PyPegen_FstringParser_Finish(p, &state, t, t);
1242 }
1243