• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdbool.h>
2 
3 #include <Python.h>
4 
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8 
9 //// STRING HANDLING FUNCTIONS ////
10 
11 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)12 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 {
14     unsigned char c = *first_invalid_escape;
15     int octal = ('4' <= c && c <= '7');
16     PyObject *msg =
17         octal
18         ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
19                                first_invalid_escape)
20         : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
21     if (msg == NULL) {
22         return -1;
23     }
24     if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25                                  t->lineno, NULL, NULL) < 0) {
26         if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27             /* Replace the DeprecationWarning exception with a SyntaxError
28                to get a more accurate error report */
29             PyErr_Clear();
30 
31             /* This is needed, in order for the SyntaxError to point to the token t,
32                since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33                error location, if p->known_err_token is not set. */
34             p->known_err_token = t;
35             if (octal) {
36                 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
37                                    first_invalid_escape);
38             }
39             else {
40                 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
41             }
42         }
43         Py_DECREF(msg);
44         return -1;
45     }
46     Py_DECREF(msg);
47     return 0;
48 }
49 
50 static PyObject *
decode_utf8(const char ** sPtr,const char * end)51 decode_utf8(const char **sPtr, const char *end)
52 {
53     const char *s;
54     const char *t;
55     t = s = *sPtr;
56     while (s < end && (*s & 0x80)) {
57         s++;
58     }
59     *sPtr = s;
60     return PyUnicode_DecodeUTF8(t, s - t, NULL);
61 }
62 
63 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)64 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
65 {
66     PyObject *v;
67     PyObject *u;
68     char *buf;
69     char *p;
70     const char *end;
71 
72     /* check for integer overflow */
73     if (len > SIZE_MAX / 6) {
74         return NULL;
75     }
76     /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
77        "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
78     u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
79     if (u == NULL) {
80         return NULL;
81     }
82     p = buf = PyBytes_AsString(u);
83     if (p == NULL) {
84         return NULL;
85     }
86     end = s + len;
87     while (s < end) {
88         if (*s == '\\') {
89             *p++ = *s++;
90             if (s >= end || *s & 0x80) {
91                 strcpy(p, "u005c");
92                 p += 5;
93                 if (s >= end) {
94                     break;
95                 }
96             }
97         }
98         if (*s & 0x80) {
99             PyObject *w;
100             int kind;
101             const void *data;
102             Py_ssize_t w_len;
103             Py_ssize_t i;
104             w = decode_utf8(&s, end);
105             if (w == NULL) {
106                 Py_DECREF(u);
107                 return NULL;
108             }
109             kind = PyUnicode_KIND(w);
110             data = PyUnicode_DATA(w);
111             w_len = PyUnicode_GET_LENGTH(w);
112             for (i = 0; i < w_len; i++) {
113                 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
114                 sprintf(p, "\\U%08x", chr);
115                 p += 10;
116             }
117             /* Should be impossible to overflow */
118             assert(p - buf <= PyBytes_GET_SIZE(u));
119             Py_DECREF(w);
120         }
121         else {
122             *p++ = *s++;
123         }
124     }
125     len = p - buf;
126     s = buf;
127 
128     int first_invalid_escape_char;
129     const char *first_invalid_escape_ptr;
130     v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
131                                                 &first_invalid_escape_char,
132                                                 &first_invalid_escape_ptr);
133 
134     if (v != NULL && first_invalid_escape_ptr != NULL) {
135         if (warn_invalid_escape_sequence(parser, first_invalid_escape_ptr, t) < 0) {
136             /* We have not decref u before because first_invalid_escape_ptr points
137                inside u. */
138             Py_XDECREF(u);
139             Py_DECREF(v);
140             return NULL;
141         }
142     }
143     Py_XDECREF(u);
144     return v;
145 }
146 
147 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)148 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
149 {
150     int first_invalid_escape_char;
151     const char *first_invalid_escape_ptr;
152     PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
153                                               &first_invalid_escape_char,
154                                               &first_invalid_escape_ptr);
155     if (result == NULL) {
156         return NULL;
157     }
158 
159     if (first_invalid_escape_ptr != NULL) {
160         if (warn_invalid_escape_sequence(p, first_invalid_escape_ptr, t) < 0) {
161             Py_DECREF(result);
162             return NULL;
163         }
164     }
165     return result;
166 }
167 
168 /* s must include the bracketing quote characters, and r, b, u,
169    &/or f prefixes (if any), and embedded escape sequences (if any).
170    _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
171    If the string is an f-string, set *fstr and *fstrlen to the unparsed
172    string object.  Return 0 if no errors occurred.  */
173 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)174 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
175                   const char **fstr, Py_ssize_t *fstrlen, Token *t)
176 {
177     const char *s = PyBytes_AsString(t->bytes);
178     if (s == NULL) {
179         return -1;
180     }
181 
182     size_t len;
183     int quote = Py_CHARMASK(*s);
184     int fmode = 0;
185     *bytesmode = 0;
186     *rawmode = 0;
187     *result = NULL;
188     *fstr = NULL;
189     if (Py_ISALPHA(quote)) {
190         while (!*bytesmode || !*rawmode) {
191             if (quote == 'b' || quote == 'B') {
192                 quote =(unsigned char)*++s;
193                 *bytesmode = 1;
194             }
195             else if (quote == 'u' || quote == 'U') {
196                 quote = (unsigned char)*++s;
197             }
198             else if (quote == 'r' || quote == 'R') {
199                 quote = (unsigned char)*++s;
200                 *rawmode = 1;
201             }
202             else if (quote == 'f' || quote == 'F') {
203                 quote = (unsigned char)*++s;
204                 fmode = 1;
205             }
206             else {
207                 break;
208             }
209         }
210     }
211 
212     /* fstrings are only allowed in Python 3.6 and greater */
213     if (fmode && p->feature_version < 6) {
214         p->error_indicator = 1;
215         RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
216         return -1;
217     }
218 
219     if (fmode && *bytesmode) {
220         PyErr_BadInternalCall();
221         return -1;
222     }
223     if (quote != '\'' && quote != '\"') {
224         PyErr_BadInternalCall();
225         return -1;
226     }
227     /* Skip the leading quote char. */
228     s++;
229     len = strlen(s);
230     if (len > INT_MAX) {
231         PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
232         return -1;
233     }
234     if (s[--len] != quote) {
235         /* Last quote char must match the first. */
236         PyErr_BadInternalCall();
237         return -1;
238     }
239     if (len >= 4 && s[0] == quote && s[1] == quote) {
240         /* A triple quoted string. We've already skipped one quote at
241            the start and one at the end of the string. Now skip the
242            two at the start. */
243         s += 2;
244         len -= 2;
245         /* And check that the last two match. */
246         if (s[--len] != quote || s[--len] != quote) {
247             PyErr_BadInternalCall();
248             return -1;
249         }
250     }
251 
252     if (fmode) {
253         /* Just return the bytes. The caller will parse the resulting
254            string. */
255         *fstr = s;
256         *fstrlen = len;
257         return 0;
258     }
259 
260     /* Not an f-string. */
261     /* Avoid invoking escape decoding routines if possible. */
262     *rawmode = *rawmode || strchr(s, '\\') == NULL;
263     if (*bytesmode) {
264         /* Disallow non-ASCII characters. */
265         const char *ch;
266         for (ch = s; *ch; ch++) {
267             if (Py_CHARMASK(*ch) >= 0x80) {
268                 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
269                                    t,
270                                    "bytes can only contain ASCII "
271                                    "literal characters");
272                 return -1;
273             }
274         }
275         if (*rawmode) {
276             *result = PyBytes_FromStringAndSize(s, len);
277         }
278         else {
279             *result = decode_bytes_with_escapes(p, s, len, t);
280         }
281     }
282     else {
283         if (*rawmode) {
284             *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
285         }
286         else {
287             *result = decode_unicode_with_escapes(p, s, len, t);
288         }
289     }
290     return *result == NULL ? -1 : 0;
291 }
292 
293 
294 
295 // FSTRING STUFF
296 
297 /* Fix locations for the given node and its children.
298 
299    `parent` is the enclosing node.
300    `expr_start` is the starting position of the expression (pointing to the open brace).
301    `n` is the node which locations are going to be fixed relative to parent.
302    `expr_str` is the child node's string representation, including braces.
303 */
304 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)305 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
306 {
307     *p_lines = 0;
308     *p_cols = 0;
309     assert(expr_start != NULL && *expr_start == '{');
310     if (parent && parent->bytes) {
311         const char *parent_str = PyBytes_AsString(parent->bytes);
312         if (!parent_str) {
313             return false;
314         }
315         // The following is needed, in order to correctly shift the column
316         // offset, in the case that (disregarding any whitespace) a newline
317         // immediately follows the opening curly brace of the fstring expression.
318         bool newline_after_brace = 1;
319         const char *start = expr_start + 1;
320         while (start && *start != '}' && *start != '\n') {
321             if (*start != ' ' && *start != '\t' && *start != '\f') {
322                 newline_after_brace = 0;
323                 break;
324             }
325             start++;
326         }
327 
328         // Account for the characters from the last newline character to our
329         // left until the beginning of expr_start.
330         if (!newline_after_brace) {
331             start = expr_start;
332             while (start > parent_str && *start != '\n') {
333                 start--;
334             }
335             *p_cols += (int)(expr_start - start);
336             if (*start == '\n') {
337                 *p_cols -= 1;
338             }
339         }
340         /* adjust the start based on the number of newlines encountered
341            before the f-string expression */
342         for (const char *p = parent_str; p < expr_start; p++) {
343             if (*p == '\n') {
344                 (*p_lines)++;
345             }
346         }
347     }
348     return true;
349 }
350 
351 
352 /* Compile this expression in to an expr_ty.  Add parens around the
353    expression, in order to allow leading spaces in the expression. */
354 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)355 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
356                      Token *t)
357 {
358     expr_ty expr = NULL;
359     char *str;
360     Py_ssize_t len;
361     const char *s;
362     expr_ty result = NULL;
363 
364     assert(expr_end >= expr_start);
365     assert(*(expr_start-1) == '{');
366     assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
367            *expr_end == '=');
368 
369     /* If the substring is all whitespace, it's an error.  We need to catch this
370        here, and not when we call PyParser_SimpleParseStringFlagsFilename,
371        because turning the expression '' in to '()' would go from being invalid
372        to valid. */
373     for (s = expr_start; s != expr_end; s++) {
374         char c = *s;
375         /* The Python parser ignores only the following whitespace
376            characters (\r already is converted to \n). */
377         if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
378             break;
379         }
380     }
381 
382     if (s == expr_end) {
383         if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
384             RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
385             return NULL;
386         }
387         RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
388         return NULL;
389     }
390 
391     len = expr_end - expr_start;
392     /* Allocate 3 extra bytes: open paren, close paren, null byte. */
393     str = PyMem_Calloc(len + 3, sizeof(char));
394     if (str == NULL) {
395         PyErr_NoMemory();
396         return NULL;
397     }
398 
399     // The call to fstring_find_expr_location is responsible for finding the column offset
400     // the generated AST nodes need to be shifted to the right, which is equal to the number
401     // of the f-string characters before the expression starts.
402     memcpy(str+1, expr_start, len);
403     int lines, cols;
404     if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
405         PyMem_Free(str);
406         return NULL;
407     }
408 
409     // The parentheses are needed in order to allow for leading whitespace within
410     // the f-string expression. This consequently gets parsed as a group (see the
411     // group rule in python.gram).
412     str[0] = '(';
413     str[len+1] = ')';
414 
415     struct tok_state* tok = _PyTokenizer_FromString(str, 1);
416     if (tok == NULL) {
417         PyMem_Free(str);
418         return NULL;
419     }
420     Py_INCREF(p->tok->filename);
421 
422     tok->filename = p->tok->filename;
423     tok->lineno = t->lineno + lines - 1;
424 
425     Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
426                                      NULL, p->arena);
427 
428     p2->starting_lineno = t->lineno + lines;
429     p2->starting_col_offset = lines != 0 ? cols : t->col_offset + cols;
430 
431     expr = _PyPegen_run_parser(p2);
432 
433     if (expr == NULL) {
434         goto exit;
435     }
436     result = expr;
437 
438 exit:
439     PyMem_Free(str);
440     _PyPegen_Parser_Free(p2);
441     _PyTokenizer_Free(tok);
442     return result;
443 }
444 
445 /* Return -1 on error.
446 
447    Return 0 if we reached the end of the literal.
448 
449    Return 1 if we haven't reached the end of the literal, but we want
450    the caller to process the literal up to this point. Used for
451    doubled braces.
452 */
453 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)454 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
455                      PyObject **literal, int recurse_lvl, Token *t)
456 {
457     /* Get any literal string. It ends when we hit an un-doubled left
458        brace (which isn't part of a unicode name escape such as
459        "\N{EULER CONSTANT}"), or the end of the string. */
460 
461     const char *s = *str;
462     const char *literal_start = s;
463     int result = 0;
464 
465     assert(*literal == NULL);
466     while (s < end) {
467         char ch = *s++;
468         if (!raw && ch == '\\' && s < end) {
469             ch = *s++;
470             if (ch == 'N') {
471                 /* We need to look at and skip matching braces for "\N{name}"
472                    sequences because otherwise we'll think the opening '{'
473                    starts an expression, which is not the case with "\N".
474                    Keep looking for either a matched '{' '}' pair, or the end
475                    of the string. */
476 
477                 if (s < end && *s++ == '{') {
478                     while (s < end && *s++ != '}') {
479                     }
480                     continue;
481                 }
482 
483                 /* This is an invalid "\N" sequence, since it's a "\N" not
484                    followed by a "{".  Just keep parsing this literal.  This
485                    error will be caught later by
486                    decode_unicode_with_escapes(). */
487                 continue;
488             }
489             if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
490                 return -1;
491             }
492         }
493         if (ch == '{' || ch == '}') {
494             /* Check for doubled braces, but only at the top level. If
495                we checked at every level, then f'{0:{3}}' would fail
496                with the two closing braces. */
497             if (recurse_lvl == 0) {
498                 if (s < end && *s == ch) {
499                     /* We're going to tell the caller that the literal ends
500                        here, but that they should continue scanning. But also
501                        skip over the second brace when we resume scanning. */
502                     *str = s + 1;
503                     result = 1;
504                     goto done;
505                 }
506 
507                 /* Where a single '{' is the start of a new expression, a
508                    single '}' is not allowed. */
509                 if (ch == '}') {
510                     *str = s - 1;
511                     RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
512                     return -1;
513                 }
514             }
515             /* We're either at a '{', which means we're starting another
516                expression; or a '}', which means we're at the end of this
517                f-string (for a nested format_spec). */
518             s--;
519             break;
520         }
521     }
522     *str = s;
523     assert(s <= end);
524     assert(s == end || *s == '{' || *s == '}');
525 done:
526     if (literal_start != s) {
527         if (raw) {
528             *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
529                                                     s - literal_start,
530                                                     NULL, NULL);
531         }
532         else {
533             *literal = decode_unicode_with_escapes(p, literal_start,
534                                                    s - literal_start, t);
535         }
536         if (!*literal) {
537             return -1;
538         }
539     }
540     return result;
541 }
542 
543 /* Forward declaration because parsing is recursive. */
544 static expr_ty
545 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
546               Token *first_token, Token* t, Token *last_token);
547 
548 /* Parse the f-string at *str, ending at end.  We know *str starts an
549    expression (so it must be a '{'). Returns the FormattedValue node, which
550    includes the expression, conversion character, format_spec expression, and
551    optionally the text of the expression (if = is used).
552 
553    Note that I don't do a perfect job here: I don't make sure that a
554    closing brace doesn't match an opening paren, for example. It
555    doesn't need to error on all invalid expressions, just correctly
556    find the end of all valid ones. Any errors inside the expression
557    will be caught when we parse it later.
558 
559    *expression is set to the expression.  For an '=' "debug" expression,
560    *expr_text is set to the debug text (the original text of the expression,
561    including the '=' and any whitespace around it, as a string object).  If
562    not a debug expression, *expr_text set to NULL. */
563 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)564 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
565                   PyObject **expr_text, expr_ty *expression, Token *first_token,
566                   Token *t, Token *last_token)
567 {
568     /* Return -1 on error, else 0. */
569 
570     const char *expr_start;
571     const char *expr_end;
572     expr_ty simple_expression;
573     expr_ty format_spec = NULL; /* Optional format specifier. */
574     int conversion = -1; /* The conversion char.  Use default if not
575                             specified, or !r if using = and no format
576                             spec. */
577 
578     /* 0 if we're not in a string, else the quote char we're trying to
579        match (single or double quote). */
580     char quote_char = 0;
581 
582     /* If we're inside a string, 1=normal, 3=triple-quoted. */
583     int string_type = 0;
584 
585     /* Keep track of nesting level for braces/parens/brackets in
586        expressions. */
587     Py_ssize_t nested_depth = 0;
588     char parenstack[MAXLEVEL];
589 
590     *expr_text = NULL;
591 
592     /* Can only nest one level deep. */
593     if (recurse_lvl >= 2) {
594         RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
595         goto error;
596     }
597 
598     /* The first char must be a left brace, or we wouldn't have gotten
599        here. Skip over it. */
600     assert(**str == '{');
601     *str += 1;
602 
603     expr_start = *str;
604     for (; *str < end; (*str)++) {
605         char ch;
606 
607         /* Loop invariants. */
608         assert(nested_depth >= 0);
609         assert(*str >= expr_start && *str < end);
610         if (quote_char) {
611             assert(string_type == 1 || string_type == 3);
612         } else {
613             assert(string_type == 0);
614         }
615 
616         ch = **str;
617         /* Nowhere inside an expression is a backslash allowed. */
618         if (ch == '\\') {
619             /* Error: can't include a backslash character, inside
620                parens or strings or not. */
621             RAISE_SYNTAX_ERROR(
622                       "f-string expression part "
623                       "cannot include a backslash");
624             goto error;
625         }
626         if (quote_char) {
627             /* We're inside a string. See if we're at the end. */
628             /* This code needs to implement the same non-error logic
629                as tok_get from tokenizer.c, at the letter_quote
630                label. To actually share that code would be a
631                nightmare. But, it's unlikely to change and is small,
632                so duplicate it here. Note we don't need to catch all
633                of the errors, since they'll be caught when parsing the
634                expression. We just need to match the non-error
635                cases. Thus we can ignore \n in single-quoted strings,
636                for example. Or non-terminated strings. */
637             if (ch == quote_char) {
638                 /* Does this match the string_type (single or triple
639                    quoted)? */
640                 if (string_type == 3) {
641                     if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
642                         /* We're at the end of a triple quoted string. */
643                         *str += 2;
644                         string_type = 0;
645                         quote_char = 0;
646                         continue;
647                     }
648                 } else {
649                     /* We're at the end of a normal string. */
650                     quote_char = 0;
651                     string_type = 0;
652                     continue;
653                 }
654             }
655         } else if (ch == '\'' || ch == '"') {
656             /* Is this a triple quoted string? */
657             if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
658                 string_type = 3;
659                 *str += 2;
660             } else {
661                 /* Start of a normal string. */
662                 string_type = 1;
663             }
664             /* Start looking for the end of the string. */
665             quote_char = ch;
666         } else if (ch == '[' || ch == '{' || ch == '(') {
667             if (nested_depth >= MAXLEVEL) {
668                 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
669                 goto error;
670             }
671             parenstack[nested_depth] = ch;
672             nested_depth++;
673         } else if (ch == '#') {
674             /* Error: can't include a comment character, inside parens
675                or not. */
676             RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
677             goto error;
678         } else if (nested_depth == 0 &&
679                    (ch == '!' || ch == ':' || ch == '}' ||
680                     ch == '=' || ch == '>' || ch == '<')) {
681             /* See if there's a next character. */
682             if (*str+1 < end) {
683                 char next = *(*str+1);
684 
685                 /* For "!=". since '=' is not an allowed conversion character,
686                    nothing is lost in this test. */
687                 if ((ch == '!' && next == '=') ||   /* != */
688                     (ch == '=' && next == '=') ||   /* == */
689                     (ch == '<' && next == '=') ||   /* <= */
690                     (ch == '>' && next == '=')      /* >= */
691                     ) {
692                     *str += 1;
693                     continue;
694                 }
695             }
696             /* Don't get out of the loop for these, if they're single
697                chars (not part of 2-char tokens). If by themselves, they
698                don't end an expression (unlike say '!'). */
699             if (ch == '>' || ch == '<') {
700                 continue;
701             }
702 
703             /* Normal way out of this loop. */
704             break;
705         } else if (ch == ']' || ch == '}' || ch == ')') {
706             if (!nested_depth) {
707                 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
708                 goto error;
709             }
710             nested_depth--;
711             int opening = (unsigned char)parenstack[nested_depth];
712             if (!((opening == '(' && ch == ')') ||
713                   (opening == '[' && ch == ']') ||
714                   (opening == '{' && ch == '}')))
715             {
716                 RAISE_SYNTAX_ERROR(
717                           "f-string: closing parenthesis '%c' "
718                           "does not match opening parenthesis '%c'",
719                           ch, opening);
720                 goto error;
721             }
722         } else {
723             /* Just consume this char and loop around. */
724         }
725     }
726     expr_end = *str;
727     /* If we leave the above loop in a string or with mismatched parens, we
728        don't really care. We'll get a syntax error when compiling the
729        expression. But, we can produce a better error message, so let's just
730        do that.*/
731     if (quote_char) {
732         RAISE_SYNTAX_ERROR("f-string: unterminated string");
733         goto error;
734     }
735     if (nested_depth) {
736         int opening = (unsigned char)parenstack[nested_depth - 1];
737         RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
738         goto error;
739     }
740 
741     if (*str >= end) {
742         goto unexpected_end_of_string;
743     }
744 
745     /* Compile the expression as soon as possible, so we show errors
746        related to the expression before errors related to the
747        conversion or format_spec. */
748     simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
749     if (!simple_expression) {
750         goto error;
751     }
752 
753     /* Check for =, which puts the text value of the expression in
754        expr_text. */
755     if (**str == '=') {
756         if (p->feature_version < 8) {
757             RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
758                                "only supported in Python 3.8 and greater");
759             goto error;
760         }
761         *str += 1;
762 
763         /* Skip over ASCII whitespace.  No need to test for end of string
764            here, since we know there's at least a trailing quote somewhere
765            ahead. */
766         while (Py_ISSPACE(**str)) {
767             *str += 1;
768         }
769         if (*str >= end) {
770             goto unexpected_end_of_string;
771         }
772         /* Set *expr_text to the text of the expression. */
773         *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
774         if (!*expr_text) {
775             goto error;
776         }
777     }
778 
779     /* Check for a conversion char, if present. */
780     if (**str == '!') {
781         *str += 1;
782         if (*str >= end) {
783             goto unexpected_end_of_string;
784         }
785 
786         conversion = (unsigned char)**str;
787         *str += 1;
788 
789         /* Validate the conversion. */
790         if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
791             RAISE_SYNTAX_ERROR(
792                       "f-string: invalid conversion character: "
793                       "expected 's', 'r', or 'a'");
794             goto error;
795         }
796 
797     }
798 
799     /* Check for the format spec, if present. */
800     if (*str >= end) {
801         goto unexpected_end_of_string;
802     }
803     if (**str == ':') {
804         *str += 1;
805         if (*str >= end) {
806             goto unexpected_end_of_string;
807         }
808 
809         /* Parse the format spec. */
810         format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
811                                     first_token, t, last_token);
812         if (!format_spec) {
813             goto error;
814         }
815     }
816 
817     if (*str >= end || **str != '}') {
818         goto unexpected_end_of_string;
819     }
820 
821     /* We're at a right brace. Consume it. */
822     assert(*str < end);
823     assert(**str == '}');
824     *str += 1;
825 
826     /* If we're in = mode (detected by non-NULL expr_text), and have no format
827        spec and no explicit conversion, set the conversion to 'r'. */
828     if (*expr_text && format_spec == NULL && conversion == -1) {
829         conversion = 'r';
830     }
831 
832     /* And now create the FormattedValue node that represents this
833        entire expression with the conversion and format spec. */
834     //TODO: Fix this
835     *expression = _PyAST_FormattedValue(simple_expression, conversion,
836                                         format_spec, first_token->lineno,
837                                         first_token->col_offset,
838                                         last_token->end_lineno,
839                                         last_token->end_col_offset, p->arena);
840     if (!*expression) {
841         goto error;
842     }
843 
844     return 0;
845 
846 unexpected_end_of_string:
847     RAISE_SYNTAX_ERROR("f-string: expecting '}'");
848     /* Falls through to error. */
849 
850 error:
851     Py_XDECREF(*expr_text);
852     return -1;
853 
854 }
855 
856 /* Return -1 on error.
857 
858    Return 0 if we have a literal (possible zero length) and an
859    expression (zero length if at the end of the string.
860 
861    Return 1 if we have a literal, but no expression, and we want the
862    caller to call us again. This is used to deal with doubled
863    braces.
864 
865    When called multiple times on the string 'a{{b{0}c', this function
866    will return:
867 
868    1. the literal 'a{' with no expression, and a return value
869       of 1. Despite the fact that there's no expression, the return
870       value of 1 means we're not finished yet.
871 
872    2. the literal 'b' and the expression '0', with a return value of
873       0. The fact that there's an expression means we're not finished.
874 
875    3. literal 'c' with no expression and a return value of 0. The
876       combination of the return value of 0 with no expression means
877       we're finished.
878 */
879 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)880 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
881                               int recurse_lvl, PyObject **literal,
882                               PyObject **expr_text, expr_ty *expression,
883                               Token *first_token, Token *t, Token *last_token)
884 {
885     int result;
886 
887     assert(*literal == NULL && *expression == NULL);
888 
889     /* Get any literal string. */
890     result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
891     if (result < 0) {
892         goto error;
893     }
894 
895     assert(result == 0 || result == 1);
896 
897     if (result == 1) {
898         /* We have a literal, but don't look at the expression. */
899         return 1;
900     }
901 
902     if (*str >= end || **str == '}') {
903         /* We're at the end of the string or the end of a nested
904            f-string: no expression. The top-level error case where we
905            expect to be at the end of the string but we're at a '}' is
906            handled later. */
907         return 0;
908     }
909 
910     /* We must now be the start of an expression, on a '{'. */
911     assert(**str == '{');
912 
913     if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
914                           expression, first_token, t, last_token) < 0) {
915         goto error;
916     }
917 
918     return 0;
919 
920 error:
921     Py_CLEAR(*literal);
922     return -1;
923 }
924 
925 #ifdef NDEBUG
926 #define ExprList_check_invariants(l)
927 #else
928 static void
ExprList_check_invariants(ExprList * l)929 ExprList_check_invariants(ExprList *l)
930 {
931     /* Check our invariants. Make sure this object is "live", and
932        hasn't been deallocated. */
933     assert(l->size >= 0);
934     assert(l->p != NULL);
935     if (l->size <= EXPRLIST_N_CACHED) {
936         assert(l->data == l->p);
937     }
938 }
939 #endif
940 
941 static void
ExprList_Init(ExprList * l)942 ExprList_Init(ExprList *l)
943 {
944     l->allocated = EXPRLIST_N_CACHED;
945     l->size = 0;
946 
947     /* Until we start allocating dynamically, p points to data. */
948     l->p = l->data;
949 
950     ExprList_check_invariants(l);
951 }
952 
953 static int
ExprList_Append(ExprList * l,expr_ty exp)954 ExprList_Append(ExprList *l, expr_ty exp)
955 {
956     ExprList_check_invariants(l);
957     if (l->size >= l->allocated) {
958         /* We need to alloc (or realloc) the memory. */
959         Py_ssize_t new_size = l->allocated * 2;
960 
961         /* See if we've ever allocated anything dynamically. */
962         if (l->p == l->data) {
963             Py_ssize_t i;
964             /* We're still using the cached data. Switch to
965                alloc-ing. */
966             l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
967             if (!l->p) {
968                 return -1;
969             }
970             /* Copy the cached data into the new buffer. */
971             for (i = 0; i < l->size; i++) {
972                 l->p[i] = l->data[i];
973             }
974         } else {
975             /* Just realloc. */
976             expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
977             if (!tmp) {
978                 PyMem_Free(l->p);
979                 l->p = NULL;
980                 return -1;
981             }
982             l->p = tmp;
983         }
984 
985         l->allocated = new_size;
986         assert(l->allocated == 2 * l->size);
987     }
988 
989     l->p[l->size++] = exp;
990 
991     ExprList_check_invariants(l);
992     return 0;
993 }
994 
995 static void
ExprList_Dealloc(ExprList * l)996 ExprList_Dealloc(ExprList *l)
997 {
998     ExprList_check_invariants(l);
999 
1000     /* If there's been an error, or we've never dynamically allocated,
1001        do nothing. */
1002     if (!l->p || l->p == l->data) {
1003         /* Do nothing. */
1004     } else {
1005         /* We have dynamically allocated. Free the memory. */
1006         PyMem_Free(l->p);
1007     }
1008     l->p = NULL;
1009     l->size = -1;
1010 }
1011 
1012 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)1013 ExprList_Finish(ExprList *l, PyArena *arena)
1014 {
1015     asdl_expr_seq *seq;
1016 
1017     ExprList_check_invariants(l);
1018 
1019     /* Allocate the asdl_seq and copy the expressions in to it. */
1020     seq = _Py_asdl_expr_seq_new(l->size, arena);
1021     if (seq) {
1022         Py_ssize_t i;
1023         for (i = 0; i < l->size; i++) {
1024             asdl_seq_SET(seq, i, l->p[i]);
1025         }
1026     }
1027     ExprList_Dealloc(l);
1028     return seq;
1029 }
1030 
1031 #ifdef NDEBUG
1032 #define FstringParser_check_invariants(state)
1033 #else
1034 static void
FstringParser_check_invariants(FstringParser * state)1035 FstringParser_check_invariants(FstringParser *state)
1036 {
1037     if (state->last_str) {
1038         assert(PyUnicode_CheckExact(state->last_str));
1039     }
1040     ExprList_check_invariants(&state->expr_list);
1041 }
1042 #endif
1043 
1044 void
_PyPegen_FstringParser_Init(FstringParser * state)1045 _PyPegen_FstringParser_Init(FstringParser *state)
1046 {
1047     state->last_str = NULL;
1048     state->fmode = 0;
1049     ExprList_Init(&state->expr_list);
1050     FstringParser_check_invariants(state);
1051 }
1052 
1053 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1054 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1055 {
1056     FstringParser_check_invariants(state);
1057 
1058     Py_XDECREF(state->last_str);
1059     ExprList_Dealloc(&state->expr_list);
1060 }
1061 
1062 /* Make a Constant node, but decref the PyUnicode object being added. */
1063 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1064 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1065 {
1066     PyObject *s = *str;
1067     PyObject *kind = NULL;
1068     *str = NULL;
1069     assert(PyUnicode_CheckExact(s));
1070     if (_PyArena_AddPyObject(p->arena, s) < 0) {
1071         Py_DECREF(s);
1072         return NULL;
1073     }
1074     const char* the_str = PyBytes_AsString(first_token->bytes);
1075     if (the_str && the_str[0] == 'u') {
1076         kind = _PyPegen_new_identifier(p, "u");
1077     }
1078 
1079     if (kind == NULL && PyErr_Occurred()) {
1080         return NULL;
1081     }
1082 
1083     return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1084                            last_token->end_lineno, last_token->end_col_offset,
1085                            p->arena);
1086 
1087 }
1088 
1089 
1090 /* Add a non-f-string (that is, a regular literal string). str is
1091    decref'd. */
1092 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1093 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1094 {
1095     FstringParser_check_invariants(state);
1096 
1097     assert(PyUnicode_CheckExact(str));
1098 
1099     if (PyUnicode_GET_LENGTH(str) == 0) {
1100         Py_DECREF(str);
1101         return 0;
1102     }
1103 
1104     if (!state->last_str) {
1105         /* We didn't have a string before, so just remember this one. */
1106         state->last_str = str;
1107     } else {
1108         /* Concatenate this with the previous string. */
1109         PyUnicode_AppendAndDel(&state->last_str, str);
1110         if (!state->last_str) {
1111             return -1;
1112         }
1113     }
1114     FstringParser_check_invariants(state);
1115     return 0;
1116 }
1117 
1118 /* Parse an f-string. The f-string is in *str to end, with no
1119    'f' or quotes. */
1120 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1121 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1122                             const char *end, int raw, int recurse_lvl,
1123                             Token *first_token, Token* t, Token *last_token)
1124 {
1125     FstringParser_check_invariants(state);
1126     state->fmode = 1;
1127 
1128     /* Parse the f-string. */
1129     while (1) {
1130         PyObject *literal = NULL;
1131         PyObject *expr_text = NULL;
1132         expr_ty expression = NULL;
1133 
1134         /* If there's a zero length literal in front of the
1135            expression, literal will be NULL. If we're at the end of
1136            the f-string, expression will be NULL (unless result == 1,
1137            see below). */
1138         int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1139                                                    &literal, &expr_text,
1140                                                    &expression, first_token, t, last_token);
1141         if (result < 0) {
1142             return -1;
1143         }
1144 
1145         /* Add the literal, if any. */
1146         if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1147             Py_XDECREF(expr_text);
1148             return -1;
1149         }
1150         /* Add the expr_text, if any. */
1151         if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1152             return -1;
1153         }
1154 
1155         /* We've dealt with the literal and expr_text, their ownership has
1156            been transferred to the state object.  Don't look at them again. */
1157 
1158         /* See if we should just loop around to get the next literal
1159            and expression, while ignoring the expression this
1160            time. This is used for un-doubling braces, as an
1161            optimization. */
1162         if (result == 1) {
1163             continue;
1164         }
1165 
1166         if (!expression) {
1167             /* We're done with this f-string. */
1168             break;
1169         }
1170 
1171         /* We know we have an expression. Convert any existing string
1172            to a Constant node. */
1173         if (state->last_str) {
1174             /* Convert the existing last_str literal to a Constant node. */
1175             expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1176             if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1177                 return -1;
1178             }
1179         }
1180 
1181         if (ExprList_Append(&state->expr_list, expression) < 0) {
1182             return -1;
1183         }
1184     }
1185 
1186     /* If recurse_lvl is zero, then we must be at the end of the
1187        string. Otherwise, we must be at a right brace. */
1188 
1189     if (recurse_lvl == 0 && *str < end-1) {
1190         RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1191         return -1;
1192     }
1193     if (recurse_lvl != 0 && **str != '}') {
1194         RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1195         return -1;
1196     }
1197 
1198     FstringParser_check_invariants(state);
1199     return 0;
1200 }
1201 
1202 /* Convert the partial state reflected in last_str and expr_list to an
1203    expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1204 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1205 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1206                      Token *last_token)
1207 {
1208     asdl_expr_seq *seq;
1209 
1210     FstringParser_check_invariants(state);
1211 
1212     /* If we're just a constant string with no expressions, return
1213        that. */
1214     if (!state->fmode) {
1215         assert(!state->expr_list.size);
1216         if (!state->last_str) {
1217             /* Create a zero length string. */
1218             state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1219             if (!state->last_str) {
1220                 goto error;
1221             }
1222         }
1223         return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1224     }
1225 
1226     /* Create a Constant node out of last_str, if needed. It will be the
1227        last node in our expression list. */
1228     if (state->last_str) {
1229         expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1230         if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1231             goto error;
1232         }
1233     }
1234     /* This has already been freed. */
1235     assert(state->last_str == NULL);
1236 
1237     seq = ExprList_Finish(&state->expr_list, p->arena);
1238     if (!seq) {
1239         goto error;
1240     }
1241 
1242     return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1243                             last_token->end_lineno, last_token->end_col_offset,
1244                             p->arena);
1245 
1246 error:
1247     _PyPegen_FstringParser_Dealloc(state);
1248     return NULL;
1249 }
1250 
1251 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1252    at end, parse it into an expr_ty.  Return NULL on error.  Adjust
1253    str to point past the parsed portion. */
1254 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1255 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1256               int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1257 {
1258     FstringParser state;
1259 
1260     _PyPegen_FstringParser_Init(&state);
1261     if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1262                                     first_token, t, last_token) < 0) {
1263         _PyPegen_FstringParser_Dealloc(&state);
1264         return NULL;
1265     }
1266 
1267     return _PyPegen_FstringParser_Finish(p, &state, t, t);
1268 }
1269