1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
11 static int
warn_invalid_escape_sequence(Parser * p,const char * first_invalid_escape,Token * t)12 warn_invalid_escape_sequence(Parser *p, const char *first_invalid_escape, Token *t)
13 {
14 unsigned char c = *first_invalid_escape;
15 int octal = ('4' <= c && c <= '7');
16 PyObject *msg =
17 octal
18 ? PyUnicode_FromFormat("invalid octal escape sequence '\\%.3s'",
19 first_invalid_escape)
20 : PyUnicode_FromFormat("invalid escape sequence '\\%c'", c);
21 if (msg == NULL) {
22 return -1;
23 }
24 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
25 t->lineno, NULL, NULL) < 0) {
26 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
27 /* Replace the DeprecationWarning exception with a SyntaxError
28 to get a more accurate error report */
29 PyErr_Clear();
30
31 /* This is needed, in order for the SyntaxError to point to the token t,
32 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
33 error location, if p->known_err_token is not set. */
34 p->known_err_token = t;
35 if (octal) {
36 RAISE_SYNTAX_ERROR("invalid octal escape sequence '\\%.3s'",
37 first_invalid_escape);
38 }
39 else {
40 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", c);
41 }
42 }
43 Py_DECREF(msg);
44 return -1;
45 }
46 Py_DECREF(msg);
47 return 0;
48 }
49
50 static PyObject *
decode_utf8(const char ** sPtr,const char * end)51 decode_utf8(const char **sPtr, const char *end)
52 {
53 const char *s;
54 const char *t;
55 t = s = *sPtr;
56 while (s < end && (*s & 0x80)) {
57 s++;
58 }
59 *sPtr = s;
60 return PyUnicode_DecodeUTF8(t, s - t, NULL);
61 }
62
63 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)64 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
65 {
66 PyObject *v;
67 PyObject *u;
68 char *buf;
69 char *p;
70 const char *end;
71
72 /* check for integer overflow */
73 if (len > SIZE_MAX / 6) {
74 return NULL;
75 }
76 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
77 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
78 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
79 if (u == NULL) {
80 return NULL;
81 }
82 p = buf = PyBytes_AsString(u);
83 if (p == NULL) {
84 return NULL;
85 }
86 end = s + len;
87 while (s < end) {
88 if (*s == '\\') {
89 *p++ = *s++;
90 if (s >= end || *s & 0x80) {
91 strcpy(p, "u005c");
92 p += 5;
93 if (s >= end) {
94 break;
95 }
96 }
97 }
98 if (*s & 0x80) {
99 PyObject *w;
100 int kind;
101 const void *data;
102 Py_ssize_t w_len;
103 Py_ssize_t i;
104 w = decode_utf8(&s, end);
105 if (w == NULL) {
106 Py_DECREF(u);
107 return NULL;
108 }
109 kind = PyUnicode_KIND(w);
110 data = PyUnicode_DATA(w);
111 w_len = PyUnicode_GET_LENGTH(w);
112 for (i = 0; i < w_len; i++) {
113 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
114 sprintf(p, "\\U%08x", chr);
115 p += 10;
116 }
117 /* Should be impossible to overflow */
118 assert(p - buf <= PyBytes_GET_SIZE(u));
119 Py_DECREF(w);
120 }
121 else {
122 *p++ = *s++;
123 }
124 }
125 len = p - buf;
126 s = buf;
127
128 int first_invalid_escape_char;
129 const char *first_invalid_escape_ptr;
130 v = _PyUnicode_DecodeUnicodeEscapeInternal2(s, (Py_ssize_t)len, NULL, NULL,
131 &first_invalid_escape_char,
132 &first_invalid_escape_ptr);
133
134 if (v != NULL && first_invalid_escape_ptr != NULL) {
135 if (warn_invalid_escape_sequence(parser, first_invalid_escape_ptr, t) < 0) {
136 /* We have not decref u before because first_invalid_escape_ptr points
137 inside u. */
138 Py_XDECREF(u);
139 Py_DECREF(v);
140 return NULL;
141 }
142 }
143 Py_XDECREF(u);
144 return v;
145 }
146
147 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)148 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
149 {
150 int first_invalid_escape_char;
151 const char *first_invalid_escape_ptr;
152 PyObject *result = _PyBytes_DecodeEscape2(s, len, NULL,
153 &first_invalid_escape_char,
154 &first_invalid_escape_ptr);
155 if (result == NULL) {
156 return NULL;
157 }
158
159 if (first_invalid_escape_ptr != NULL) {
160 if (warn_invalid_escape_sequence(p, first_invalid_escape_ptr, t) < 0) {
161 Py_DECREF(result);
162 return NULL;
163 }
164 }
165 return result;
166 }
167
168 /* s must include the bracketing quote characters, and r, b, u,
169 &/or f prefixes (if any), and embedded escape sequences (if any).
170 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
171 If the string is an f-string, set *fstr and *fstrlen to the unparsed
172 string object. Return 0 if no errors occurred. */
173 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)174 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
175 const char **fstr, Py_ssize_t *fstrlen, Token *t)
176 {
177 const char *s = PyBytes_AsString(t->bytes);
178 if (s == NULL) {
179 return -1;
180 }
181
182 size_t len;
183 int quote = Py_CHARMASK(*s);
184 int fmode = 0;
185 *bytesmode = 0;
186 *rawmode = 0;
187 *result = NULL;
188 *fstr = NULL;
189 if (Py_ISALPHA(quote)) {
190 while (!*bytesmode || !*rawmode) {
191 if (quote == 'b' || quote == 'B') {
192 quote =(unsigned char)*++s;
193 *bytesmode = 1;
194 }
195 else if (quote == 'u' || quote == 'U') {
196 quote = (unsigned char)*++s;
197 }
198 else if (quote == 'r' || quote == 'R') {
199 quote = (unsigned char)*++s;
200 *rawmode = 1;
201 }
202 else if (quote == 'f' || quote == 'F') {
203 quote = (unsigned char)*++s;
204 fmode = 1;
205 }
206 else {
207 break;
208 }
209 }
210 }
211
212 /* fstrings are only allowed in Python 3.6 and greater */
213 if (fmode && p->feature_version < 6) {
214 p->error_indicator = 1;
215 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
216 return -1;
217 }
218
219 if (fmode && *bytesmode) {
220 PyErr_BadInternalCall();
221 return -1;
222 }
223 if (quote != '\'' && quote != '\"') {
224 PyErr_BadInternalCall();
225 return -1;
226 }
227 /* Skip the leading quote char. */
228 s++;
229 len = strlen(s);
230 if (len > INT_MAX) {
231 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
232 return -1;
233 }
234 if (s[--len] != quote) {
235 /* Last quote char must match the first. */
236 PyErr_BadInternalCall();
237 return -1;
238 }
239 if (len >= 4 && s[0] == quote && s[1] == quote) {
240 /* A triple quoted string. We've already skipped one quote at
241 the start and one at the end of the string. Now skip the
242 two at the start. */
243 s += 2;
244 len -= 2;
245 /* And check that the last two match. */
246 if (s[--len] != quote || s[--len] != quote) {
247 PyErr_BadInternalCall();
248 return -1;
249 }
250 }
251
252 if (fmode) {
253 /* Just return the bytes. The caller will parse the resulting
254 string. */
255 *fstr = s;
256 *fstrlen = len;
257 return 0;
258 }
259
260 /* Not an f-string. */
261 /* Avoid invoking escape decoding routines if possible. */
262 *rawmode = *rawmode || strchr(s, '\\') == NULL;
263 if (*bytesmode) {
264 /* Disallow non-ASCII characters. */
265 const char *ch;
266 for (ch = s; *ch; ch++) {
267 if (Py_CHARMASK(*ch) >= 0x80) {
268 RAISE_SYNTAX_ERROR_KNOWN_LOCATION(
269 t,
270 "bytes can only contain ASCII "
271 "literal characters");
272 return -1;
273 }
274 }
275 if (*rawmode) {
276 *result = PyBytes_FromStringAndSize(s, len);
277 }
278 else {
279 *result = decode_bytes_with_escapes(p, s, len, t);
280 }
281 }
282 else {
283 if (*rawmode) {
284 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
285 }
286 else {
287 *result = decode_unicode_with_escapes(p, s, len, t);
288 }
289 }
290 return *result == NULL ? -1 : 0;
291 }
292
293
294
295 // FSTRING STUFF
296
297 /* Fix locations for the given node and its children.
298
299 `parent` is the enclosing node.
300 `expr_start` is the starting position of the expression (pointing to the open brace).
301 `n` is the node which locations are going to be fixed relative to parent.
302 `expr_str` is the child node's string representation, including braces.
303 */
304 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)305 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
306 {
307 *p_lines = 0;
308 *p_cols = 0;
309 assert(expr_start != NULL && *expr_start == '{');
310 if (parent && parent->bytes) {
311 const char *parent_str = PyBytes_AsString(parent->bytes);
312 if (!parent_str) {
313 return false;
314 }
315 // The following is needed, in order to correctly shift the column
316 // offset, in the case that (disregarding any whitespace) a newline
317 // immediately follows the opening curly brace of the fstring expression.
318 bool newline_after_brace = 1;
319 const char *start = expr_start + 1;
320 while (start && *start != '}' && *start != '\n') {
321 if (*start != ' ' && *start != '\t' && *start != '\f') {
322 newline_after_brace = 0;
323 break;
324 }
325 start++;
326 }
327
328 // Account for the characters from the last newline character to our
329 // left until the beginning of expr_start.
330 if (!newline_after_brace) {
331 start = expr_start;
332 while (start > parent_str && *start != '\n') {
333 start--;
334 }
335 *p_cols += (int)(expr_start - start);
336 if (*start == '\n') {
337 *p_cols -= 1;
338 }
339 }
340 /* adjust the start based on the number of newlines encountered
341 before the f-string expression */
342 for (const char *p = parent_str; p < expr_start; p++) {
343 if (*p == '\n') {
344 (*p_lines)++;
345 }
346 }
347 }
348 return true;
349 }
350
351
352 /* Compile this expression in to an expr_ty. Add parens around the
353 expression, in order to allow leading spaces in the expression. */
354 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)355 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
356 Token *t)
357 {
358 expr_ty expr = NULL;
359 char *str;
360 Py_ssize_t len;
361 const char *s;
362 expr_ty result = NULL;
363
364 assert(expr_end >= expr_start);
365 assert(*(expr_start-1) == '{');
366 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
367 *expr_end == '=');
368
369 /* If the substring is all whitespace, it's an error. We need to catch this
370 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
371 because turning the expression '' in to '()' would go from being invalid
372 to valid. */
373 for (s = expr_start; s != expr_end; s++) {
374 char c = *s;
375 /* The Python parser ignores only the following whitespace
376 characters (\r already is converted to \n). */
377 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
378 break;
379 }
380 }
381
382 if (s == expr_end) {
383 if (*expr_end == '!' || *expr_end == ':' || *expr_end == '=') {
384 RAISE_SYNTAX_ERROR("f-string: expression required before '%c'", *expr_end);
385 return NULL;
386 }
387 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
388 return NULL;
389 }
390
391 len = expr_end - expr_start;
392 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
393 str = PyMem_Calloc(len + 3, sizeof(char));
394 if (str == NULL) {
395 PyErr_NoMemory();
396 return NULL;
397 }
398
399 // The call to fstring_find_expr_location is responsible for finding the column offset
400 // the generated AST nodes need to be shifted to the right, which is equal to the number
401 // of the f-string characters before the expression starts.
402 memcpy(str+1, expr_start, len);
403 int lines, cols;
404 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
405 PyMem_Free(str);
406 return NULL;
407 }
408
409 // The parentheses are needed in order to allow for leading whitespace within
410 // the f-string expression. This consequently gets parsed as a group (see the
411 // group rule in python.gram).
412 str[0] = '(';
413 str[len+1] = ')';
414
415 struct tok_state* tok = _PyTokenizer_FromString(str, 1);
416 if (tok == NULL) {
417 PyMem_Free(str);
418 return NULL;
419 }
420 Py_INCREF(p->tok->filename);
421
422 tok->filename = p->tok->filename;
423 tok->lineno = t->lineno + lines - 1;
424
425 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
426 NULL, p->arena);
427
428 p2->starting_lineno = t->lineno + lines;
429 p2->starting_col_offset = lines != 0 ? cols : t->col_offset + cols;
430
431 expr = _PyPegen_run_parser(p2);
432
433 if (expr == NULL) {
434 goto exit;
435 }
436 result = expr;
437
438 exit:
439 PyMem_Free(str);
440 _PyPegen_Parser_Free(p2);
441 _PyTokenizer_Free(tok);
442 return result;
443 }
444
445 /* Return -1 on error.
446
447 Return 0 if we reached the end of the literal.
448
449 Return 1 if we haven't reached the end of the literal, but we want
450 the caller to process the literal up to this point. Used for
451 doubled braces.
452 */
453 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)454 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
455 PyObject **literal, int recurse_lvl, Token *t)
456 {
457 /* Get any literal string. It ends when we hit an un-doubled left
458 brace (which isn't part of a unicode name escape such as
459 "\N{EULER CONSTANT}"), or the end of the string. */
460
461 const char *s = *str;
462 const char *literal_start = s;
463 int result = 0;
464
465 assert(*literal == NULL);
466 while (s < end) {
467 char ch = *s++;
468 if (!raw && ch == '\\' && s < end) {
469 ch = *s++;
470 if (ch == 'N') {
471 /* We need to look at and skip matching braces for "\N{name}"
472 sequences because otherwise we'll think the opening '{'
473 starts an expression, which is not the case with "\N".
474 Keep looking for either a matched '{' '}' pair, or the end
475 of the string. */
476
477 if (s < end && *s++ == '{') {
478 while (s < end && *s++ != '}') {
479 }
480 continue;
481 }
482
483 /* This is an invalid "\N" sequence, since it's a "\N" not
484 followed by a "{". Just keep parsing this literal. This
485 error will be caught later by
486 decode_unicode_with_escapes(). */
487 continue;
488 }
489 if (ch == '{' && warn_invalid_escape_sequence(p, s-1, t) < 0) {
490 return -1;
491 }
492 }
493 if (ch == '{' || ch == '}') {
494 /* Check for doubled braces, but only at the top level. If
495 we checked at every level, then f'{0:{3}}' would fail
496 with the two closing braces. */
497 if (recurse_lvl == 0) {
498 if (s < end && *s == ch) {
499 /* We're going to tell the caller that the literal ends
500 here, but that they should continue scanning. But also
501 skip over the second brace when we resume scanning. */
502 *str = s + 1;
503 result = 1;
504 goto done;
505 }
506
507 /* Where a single '{' is the start of a new expression, a
508 single '}' is not allowed. */
509 if (ch == '}') {
510 *str = s - 1;
511 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
512 return -1;
513 }
514 }
515 /* We're either at a '{', which means we're starting another
516 expression; or a '}', which means we're at the end of this
517 f-string (for a nested format_spec). */
518 s--;
519 break;
520 }
521 }
522 *str = s;
523 assert(s <= end);
524 assert(s == end || *s == '{' || *s == '}');
525 done:
526 if (literal_start != s) {
527 if (raw) {
528 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
529 s - literal_start,
530 NULL, NULL);
531 }
532 else {
533 *literal = decode_unicode_with_escapes(p, literal_start,
534 s - literal_start, t);
535 }
536 if (!*literal) {
537 return -1;
538 }
539 }
540 return result;
541 }
542
543 /* Forward declaration because parsing is recursive. */
544 static expr_ty
545 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
546 Token *first_token, Token* t, Token *last_token);
547
548 /* Parse the f-string at *str, ending at end. We know *str starts an
549 expression (so it must be a '{'). Returns the FormattedValue node, which
550 includes the expression, conversion character, format_spec expression, and
551 optionally the text of the expression (if = is used).
552
553 Note that I don't do a perfect job here: I don't make sure that a
554 closing brace doesn't match an opening paren, for example. It
555 doesn't need to error on all invalid expressions, just correctly
556 find the end of all valid ones. Any errors inside the expression
557 will be caught when we parse it later.
558
559 *expression is set to the expression. For an '=' "debug" expression,
560 *expr_text is set to the debug text (the original text of the expression,
561 including the '=' and any whitespace around it, as a string object). If
562 not a debug expression, *expr_text set to NULL. */
563 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)564 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
565 PyObject **expr_text, expr_ty *expression, Token *first_token,
566 Token *t, Token *last_token)
567 {
568 /* Return -1 on error, else 0. */
569
570 const char *expr_start;
571 const char *expr_end;
572 expr_ty simple_expression;
573 expr_ty format_spec = NULL; /* Optional format specifier. */
574 int conversion = -1; /* The conversion char. Use default if not
575 specified, or !r if using = and no format
576 spec. */
577
578 /* 0 if we're not in a string, else the quote char we're trying to
579 match (single or double quote). */
580 char quote_char = 0;
581
582 /* If we're inside a string, 1=normal, 3=triple-quoted. */
583 int string_type = 0;
584
585 /* Keep track of nesting level for braces/parens/brackets in
586 expressions. */
587 Py_ssize_t nested_depth = 0;
588 char parenstack[MAXLEVEL];
589
590 *expr_text = NULL;
591
592 /* Can only nest one level deep. */
593 if (recurse_lvl >= 2) {
594 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
595 goto error;
596 }
597
598 /* The first char must be a left brace, or we wouldn't have gotten
599 here. Skip over it. */
600 assert(**str == '{');
601 *str += 1;
602
603 expr_start = *str;
604 for (; *str < end; (*str)++) {
605 char ch;
606
607 /* Loop invariants. */
608 assert(nested_depth >= 0);
609 assert(*str >= expr_start && *str < end);
610 if (quote_char) {
611 assert(string_type == 1 || string_type == 3);
612 } else {
613 assert(string_type == 0);
614 }
615
616 ch = **str;
617 /* Nowhere inside an expression is a backslash allowed. */
618 if (ch == '\\') {
619 /* Error: can't include a backslash character, inside
620 parens or strings or not. */
621 RAISE_SYNTAX_ERROR(
622 "f-string expression part "
623 "cannot include a backslash");
624 goto error;
625 }
626 if (quote_char) {
627 /* We're inside a string. See if we're at the end. */
628 /* This code needs to implement the same non-error logic
629 as tok_get from tokenizer.c, at the letter_quote
630 label. To actually share that code would be a
631 nightmare. But, it's unlikely to change and is small,
632 so duplicate it here. Note we don't need to catch all
633 of the errors, since they'll be caught when parsing the
634 expression. We just need to match the non-error
635 cases. Thus we can ignore \n in single-quoted strings,
636 for example. Or non-terminated strings. */
637 if (ch == quote_char) {
638 /* Does this match the string_type (single or triple
639 quoted)? */
640 if (string_type == 3) {
641 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
642 /* We're at the end of a triple quoted string. */
643 *str += 2;
644 string_type = 0;
645 quote_char = 0;
646 continue;
647 }
648 } else {
649 /* We're at the end of a normal string. */
650 quote_char = 0;
651 string_type = 0;
652 continue;
653 }
654 }
655 } else if (ch == '\'' || ch == '"') {
656 /* Is this a triple quoted string? */
657 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
658 string_type = 3;
659 *str += 2;
660 } else {
661 /* Start of a normal string. */
662 string_type = 1;
663 }
664 /* Start looking for the end of the string. */
665 quote_char = ch;
666 } else if (ch == '[' || ch == '{' || ch == '(') {
667 if (nested_depth >= MAXLEVEL) {
668 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
669 goto error;
670 }
671 parenstack[nested_depth] = ch;
672 nested_depth++;
673 } else if (ch == '#') {
674 /* Error: can't include a comment character, inside parens
675 or not. */
676 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
677 goto error;
678 } else if (nested_depth == 0 &&
679 (ch == '!' || ch == ':' || ch == '}' ||
680 ch == '=' || ch == '>' || ch == '<')) {
681 /* See if there's a next character. */
682 if (*str+1 < end) {
683 char next = *(*str+1);
684
685 /* For "!=". since '=' is not an allowed conversion character,
686 nothing is lost in this test. */
687 if ((ch == '!' && next == '=') || /* != */
688 (ch == '=' && next == '=') || /* == */
689 (ch == '<' && next == '=') || /* <= */
690 (ch == '>' && next == '=') /* >= */
691 ) {
692 *str += 1;
693 continue;
694 }
695 }
696 /* Don't get out of the loop for these, if they're single
697 chars (not part of 2-char tokens). If by themselves, they
698 don't end an expression (unlike say '!'). */
699 if (ch == '>' || ch == '<') {
700 continue;
701 }
702
703 /* Normal way out of this loop. */
704 break;
705 } else if (ch == ']' || ch == '}' || ch == ')') {
706 if (!nested_depth) {
707 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
708 goto error;
709 }
710 nested_depth--;
711 int opening = (unsigned char)parenstack[nested_depth];
712 if (!((opening == '(' && ch == ')') ||
713 (opening == '[' && ch == ']') ||
714 (opening == '{' && ch == '}')))
715 {
716 RAISE_SYNTAX_ERROR(
717 "f-string: closing parenthesis '%c' "
718 "does not match opening parenthesis '%c'",
719 ch, opening);
720 goto error;
721 }
722 } else {
723 /* Just consume this char and loop around. */
724 }
725 }
726 expr_end = *str;
727 /* If we leave the above loop in a string or with mismatched parens, we
728 don't really care. We'll get a syntax error when compiling the
729 expression. But, we can produce a better error message, so let's just
730 do that.*/
731 if (quote_char) {
732 RAISE_SYNTAX_ERROR("f-string: unterminated string");
733 goto error;
734 }
735 if (nested_depth) {
736 int opening = (unsigned char)parenstack[nested_depth - 1];
737 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
738 goto error;
739 }
740
741 if (*str >= end) {
742 goto unexpected_end_of_string;
743 }
744
745 /* Compile the expression as soon as possible, so we show errors
746 related to the expression before errors related to the
747 conversion or format_spec. */
748 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
749 if (!simple_expression) {
750 goto error;
751 }
752
753 /* Check for =, which puts the text value of the expression in
754 expr_text. */
755 if (**str == '=') {
756 if (p->feature_version < 8) {
757 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
758 "only supported in Python 3.8 and greater");
759 goto error;
760 }
761 *str += 1;
762
763 /* Skip over ASCII whitespace. No need to test for end of string
764 here, since we know there's at least a trailing quote somewhere
765 ahead. */
766 while (Py_ISSPACE(**str)) {
767 *str += 1;
768 }
769 if (*str >= end) {
770 goto unexpected_end_of_string;
771 }
772 /* Set *expr_text to the text of the expression. */
773 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
774 if (!*expr_text) {
775 goto error;
776 }
777 }
778
779 /* Check for a conversion char, if present. */
780 if (**str == '!') {
781 *str += 1;
782 if (*str >= end) {
783 goto unexpected_end_of_string;
784 }
785
786 conversion = (unsigned char)**str;
787 *str += 1;
788
789 /* Validate the conversion. */
790 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
791 RAISE_SYNTAX_ERROR(
792 "f-string: invalid conversion character: "
793 "expected 's', 'r', or 'a'");
794 goto error;
795 }
796
797 }
798
799 /* Check for the format spec, if present. */
800 if (*str >= end) {
801 goto unexpected_end_of_string;
802 }
803 if (**str == ':') {
804 *str += 1;
805 if (*str >= end) {
806 goto unexpected_end_of_string;
807 }
808
809 /* Parse the format spec. */
810 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
811 first_token, t, last_token);
812 if (!format_spec) {
813 goto error;
814 }
815 }
816
817 if (*str >= end || **str != '}') {
818 goto unexpected_end_of_string;
819 }
820
821 /* We're at a right brace. Consume it. */
822 assert(*str < end);
823 assert(**str == '}');
824 *str += 1;
825
826 /* If we're in = mode (detected by non-NULL expr_text), and have no format
827 spec and no explicit conversion, set the conversion to 'r'. */
828 if (*expr_text && format_spec == NULL && conversion == -1) {
829 conversion = 'r';
830 }
831
832 /* And now create the FormattedValue node that represents this
833 entire expression with the conversion and format spec. */
834 //TODO: Fix this
835 *expression = _PyAST_FormattedValue(simple_expression, conversion,
836 format_spec, first_token->lineno,
837 first_token->col_offset,
838 last_token->end_lineno,
839 last_token->end_col_offset, p->arena);
840 if (!*expression) {
841 goto error;
842 }
843
844 return 0;
845
846 unexpected_end_of_string:
847 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
848 /* Falls through to error. */
849
850 error:
851 Py_XDECREF(*expr_text);
852 return -1;
853
854 }
855
856 /* Return -1 on error.
857
858 Return 0 if we have a literal (possible zero length) and an
859 expression (zero length if at the end of the string.
860
861 Return 1 if we have a literal, but no expression, and we want the
862 caller to call us again. This is used to deal with doubled
863 braces.
864
865 When called multiple times on the string 'a{{b{0}c', this function
866 will return:
867
868 1. the literal 'a{' with no expression, and a return value
869 of 1. Despite the fact that there's no expression, the return
870 value of 1 means we're not finished yet.
871
872 2. the literal 'b' and the expression '0', with a return value of
873 0. The fact that there's an expression means we're not finished.
874
875 3. literal 'c' with no expression and a return value of 0. The
876 combination of the return value of 0 with no expression means
877 we're finished.
878 */
879 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)880 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
881 int recurse_lvl, PyObject **literal,
882 PyObject **expr_text, expr_ty *expression,
883 Token *first_token, Token *t, Token *last_token)
884 {
885 int result;
886
887 assert(*literal == NULL && *expression == NULL);
888
889 /* Get any literal string. */
890 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
891 if (result < 0) {
892 goto error;
893 }
894
895 assert(result == 0 || result == 1);
896
897 if (result == 1) {
898 /* We have a literal, but don't look at the expression. */
899 return 1;
900 }
901
902 if (*str >= end || **str == '}') {
903 /* We're at the end of the string or the end of a nested
904 f-string: no expression. The top-level error case where we
905 expect to be at the end of the string but we're at a '}' is
906 handled later. */
907 return 0;
908 }
909
910 /* We must now be the start of an expression, on a '{'. */
911 assert(**str == '{');
912
913 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
914 expression, first_token, t, last_token) < 0) {
915 goto error;
916 }
917
918 return 0;
919
920 error:
921 Py_CLEAR(*literal);
922 return -1;
923 }
924
925 #ifdef NDEBUG
926 #define ExprList_check_invariants(l)
927 #else
928 static void
ExprList_check_invariants(ExprList * l)929 ExprList_check_invariants(ExprList *l)
930 {
931 /* Check our invariants. Make sure this object is "live", and
932 hasn't been deallocated. */
933 assert(l->size >= 0);
934 assert(l->p != NULL);
935 if (l->size <= EXPRLIST_N_CACHED) {
936 assert(l->data == l->p);
937 }
938 }
939 #endif
940
941 static void
ExprList_Init(ExprList * l)942 ExprList_Init(ExprList *l)
943 {
944 l->allocated = EXPRLIST_N_CACHED;
945 l->size = 0;
946
947 /* Until we start allocating dynamically, p points to data. */
948 l->p = l->data;
949
950 ExprList_check_invariants(l);
951 }
952
953 static int
ExprList_Append(ExprList * l,expr_ty exp)954 ExprList_Append(ExprList *l, expr_ty exp)
955 {
956 ExprList_check_invariants(l);
957 if (l->size >= l->allocated) {
958 /* We need to alloc (or realloc) the memory. */
959 Py_ssize_t new_size = l->allocated * 2;
960
961 /* See if we've ever allocated anything dynamically. */
962 if (l->p == l->data) {
963 Py_ssize_t i;
964 /* We're still using the cached data. Switch to
965 alloc-ing. */
966 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
967 if (!l->p) {
968 return -1;
969 }
970 /* Copy the cached data into the new buffer. */
971 for (i = 0; i < l->size; i++) {
972 l->p[i] = l->data[i];
973 }
974 } else {
975 /* Just realloc. */
976 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
977 if (!tmp) {
978 PyMem_Free(l->p);
979 l->p = NULL;
980 return -1;
981 }
982 l->p = tmp;
983 }
984
985 l->allocated = new_size;
986 assert(l->allocated == 2 * l->size);
987 }
988
989 l->p[l->size++] = exp;
990
991 ExprList_check_invariants(l);
992 return 0;
993 }
994
995 static void
ExprList_Dealloc(ExprList * l)996 ExprList_Dealloc(ExprList *l)
997 {
998 ExprList_check_invariants(l);
999
1000 /* If there's been an error, or we've never dynamically allocated,
1001 do nothing. */
1002 if (!l->p || l->p == l->data) {
1003 /* Do nothing. */
1004 } else {
1005 /* We have dynamically allocated. Free the memory. */
1006 PyMem_Free(l->p);
1007 }
1008 l->p = NULL;
1009 l->size = -1;
1010 }
1011
1012 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)1013 ExprList_Finish(ExprList *l, PyArena *arena)
1014 {
1015 asdl_expr_seq *seq;
1016
1017 ExprList_check_invariants(l);
1018
1019 /* Allocate the asdl_seq and copy the expressions in to it. */
1020 seq = _Py_asdl_expr_seq_new(l->size, arena);
1021 if (seq) {
1022 Py_ssize_t i;
1023 for (i = 0; i < l->size; i++) {
1024 asdl_seq_SET(seq, i, l->p[i]);
1025 }
1026 }
1027 ExprList_Dealloc(l);
1028 return seq;
1029 }
1030
1031 #ifdef NDEBUG
1032 #define FstringParser_check_invariants(state)
1033 #else
1034 static void
FstringParser_check_invariants(FstringParser * state)1035 FstringParser_check_invariants(FstringParser *state)
1036 {
1037 if (state->last_str) {
1038 assert(PyUnicode_CheckExact(state->last_str));
1039 }
1040 ExprList_check_invariants(&state->expr_list);
1041 }
1042 #endif
1043
1044 void
_PyPegen_FstringParser_Init(FstringParser * state)1045 _PyPegen_FstringParser_Init(FstringParser *state)
1046 {
1047 state->last_str = NULL;
1048 state->fmode = 0;
1049 ExprList_Init(&state->expr_list);
1050 FstringParser_check_invariants(state);
1051 }
1052
1053 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1054 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1055 {
1056 FstringParser_check_invariants(state);
1057
1058 Py_XDECREF(state->last_str);
1059 ExprList_Dealloc(&state->expr_list);
1060 }
1061
1062 /* Make a Constant node, but decref the PyUnicode object being added. */
1063 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1064 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1065 {
1066 PyObject *s = *str;
1067 PyObject *kind = NULL;
1068 *str = NULL;
1069 assert(PyUnicode_CheckExact(s));
1070 if (_PyArena_AddPyObject(p->arena, s) < 0) {
1071 Py_DECREF(s);
1072 return NULL;
1073 }
1074 const char* the_str = PyBytes_AsString(first_token->bytes);
1075 if (the_str && the_str[0] == 'u') {
1076 kind = _PyPegen_new_identifier(p, "u");
1077 }
1078
1079 if (kind == NULL && PyErr_Occurred()) {
1080 return NULL;
1081 }
1082
1083 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1084 last_token->end_lineno, last_token->end_col_offset,
1085 p->arena);
1086
1087 }
1088
1089
1090 /* Add a non-f-string (that is, a regular literal string). str is
1091 decref'd. */
1092 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1093 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1094 {
1095 FstringParser_check_invariants(state);
1096
1097 assert(PyUnicode_CheckExact(str));
1098
1099 if (PyUnicode_GET_LENGTH(str) == 0) {
1100 Py_DECREF(str);
1101 return 0;
1102 }
1103
1104 if (!state->last_str) {
1105 /* We didn't have a string before, so just remember this one. */
1106 state->last_str = str;
1107 } else {
1108 /* Concatenate this with the previous string. */
1109 PyUnicode_AppendAndDel(&state->last_str, str);
1110 if (!state->last_str) {
1111 return -1;
1112 }
1113 }
1114 FstringParser_check_invariants(state);
1115 return 0;
1116 }
1117
1118 /* Parse an f-string. The f-string is in *str to end, with no
1119 'f' or quotes. */
1120 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1121 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1122 const char *end, int raw, int recurse_lvl,
1123 Token *first_token, Token* t, Token *last_token)
1124 {
1125 FstringParser_check_invariants(state);
1126 state->fmode = 1;
1127
1128 /* Parse the f-string. */
1129 while (1) {
1130 PyObject *literal = NULL;
1131 PyObject *expr_text = NULL;
1132 expr_ty expression = NULL;
1133
1134 /* If there's a zero length literal in front of the
1135 expression, literal will be NULL. If we're at the end of
1136 the f-string, expression will be NULL (unless result == 1,
1137 see below). */
1138 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1139 &literal, &expr_text,
1140 &expression, first_token, t, last_token);
1141 if (result < 0) {
1142 return -1;
1143 }
1144
1145 /* Add the literal, if any. */
1146 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1147 Py_XDECREF(expr_text);
1148 return -1;
1149 }
1150 /* Add the expr_text, if any. */
1151 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1152 return -1;
1153 }
1154
1155 /* We've dealt with the literal and expr_text, their ownership has
1156 been transferred to the state object. Don't look at them again. */
1157
1158 /* See if we should just loop around to get the next literal
1159 and expression, while ignoring the expression this
1160 time. This is used for un-doubling braces, as an
1161 optimization. */
1162 if (result == 1) {
1163 continue;
1164 }
1165
1166 if (!expression) {
1167 /* We're done with this f-string. */
1168 break;
1169 }
1170
1171 /* We know we have an expression. Convert any existing string
1172 to a Constant node. */
1173 if (state->last_str) {
1174 /* Convert the existing last_str literal to a Constant node. */
1175 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1176 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1177 return -1;
1178 }
1179 }
1180
1181 if (ExprList_Append(&state->expr_list, expression) < 0) {
1182 return -1;
1183 }
1184 }
1185
1186 /* If recurse_lvl is zero, then we must be at the end of the
1187 string. Otherwise, we must be at a right brace. */
1188
1189 if (recurse_lvl == 0 && *str < end-1) {
1190 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1191 return -1;
1192 }
1193 if (recurse_lvl != 0 && **str != '}') {
1194 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1195 return -1;
1196 }
1197
1198 FstringParser_check_invariants(state);
1199 return 0;
1200 }
1201
1202 /* Convert the partial state reflected in last_str and expr_list to an
1203 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1204 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1205 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1206 Token *last_token)
1207 {
1208 asdl_expr_seq *seq;
1209
1210 FstringParser_check_invariants(state);
1211
1212 /* If we're just a constant string with no expressions, return
1213 that. */
1214 if (!state->fmode) {
1215 assert(!state->expr_list.size);
1216 if (!state->last_str) {
1217 /* Create a zero length string. */
1218 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1219 if (!state->last_str) {
1220 goto error;
1221 }
1222 }
1223 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1224 }
1225
1226 /* Create a Constant node out of last_str, if needed. It will be the
1227 last node in our expression list. */
1228 if (state->last_str) {
1229 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1230 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1231 goto error;
1232 }
1233 }
1234 /* This has already been freed. */
1235 assert(state->last_str == NULL);
1236
1237 seq = ExprList_Finish(&state->expr_list, p->arena);
1238 if (!seq) {
1239 goto error;
1240 }
1241
1242 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1243 last_token->end_lineno, last_token->end_col_offset,
1244 p->arena);
1245
1246 error:
1247 _PyPegen_FstringParser_Dealloc(state);
1248 return NULL;
1249 }
1250
1251 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1252 at end, parse it into an expr_ty. Return NULL on error. Adjust
1253 str to point past the parsed portion. */
1254 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1255 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1256 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1257 {
1258 FstringParser state;
1259
1260 _PyPegen_FstringParser_Init(&state);
1261 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1262 first_token, t, last_token) < 0) {
1263 _PyPegen_FstringParser_Dealloc(&state);
1264 return NULL;
1265 }
1266
1267 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1268 }
1269