1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
11 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)12 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
13 {
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
20 t->lineno, NULL, NULL) < 0) {
21 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
25
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
30 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37 }
38
39 static PyObject *
decode_utf8(const char ** sPtr,const char * end)40 decode_utf8(const char **sPtr, const char *end)
41 {
42 const char *s;
43 const char *t;
44 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50 }
51
52 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)53 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
54 {
55 PyObject *v;
56 PyObject *u;
57 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
72 if (p == NULL) {
73 return NULL;
74 }
75 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 const void *data;
91 Py_ssize_t w_len;
92 Py_ssize_t i;
93 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131 }
132
133 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)134 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135 {
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149 }
150
151 /* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)157 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
159 {
160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
175 quote =(unsigned char)*++s;
176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
179 quote = (unsigned char)*++s;
180 }
181 else if (quote == 'r' || quote == 'R') {
182 quote = (unsigned char)*++s;
183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
186 quote = (unsigned char)*++s;
187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
261 *result = decode_bytes_with_escapes(p, s, len, t);
262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
269 *result = decode_unicode_with_escapes(p, s, len, t);
270 }
271 }
272 return *result == NULL ? -1 : 0;
273 }
274
275
276
277 // FSTRING STUFF
278
279 /* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `expr_start` is the starting position of the expression (pointing to the open brace).
283 `n` is the node which locations are going to be fixed relative to parent.
284 `expr_str` is the child node's string representation, including braces.
285 */
286 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)287 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
288 {
289 *p_lines = 0;
290 *p_cols = 0;
291 assert(expr_start != NULL && *expr_start == '{');
292 if (parent && parent->bytes) {
293 const char *parent_str = PyBytes_AsString(parent->bytes);
294 if (!parent_str) {
295 return false;
296 }
297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
300 bool newline_after_brace = 1;
301 const char *start = expr_start + 1;
302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
306 }
307 start++;
308 }
309
310 // Account for the characters from the last newline character to our
311 // left until the beginning of expr_start.
312 if (!newline_after_brace) {
313 start = expr_start;
314 while (start > parent_str && *start != '\n') {
315 start--;
316 }
317 *p_cols += (int)(expr_start - start);
318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
321 for (const char *p = parent_str; p < expr_start; p++) {
322 if (*p == '\n') {
323 (*p_lines)++;
324 }
325 }
326 }
327 return true;
328 }
329
330
331 /* Compile this expression in to an expr_ty. Add parens around the
332 expression, in order to allow leading spaces in the expression. */
333 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)334 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335 Token *t)
336 {
337 expr_ty expr = NULL;
338 char *str;
339 Py_ssize_t len;
340 const char *s;
341 expr_ty result = NULL;
342
343 assert(expr_end >= expr_start);
344 assert(*(expr_start-1) == '{');
345 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346 *expr_end == '=');
347
348 /* If the substring is all whitespace, it's an error. We need to catch this
349 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350 because turning the expression '' in to '()' would go from being invalid
351 to valid. */
352 for (s = expr_start; s != expr_end; s++) {
353 char c = *s;
354 /* The Python parser ignores only the following whitespace
355 characters (\r already is converted to \n). */
356 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357 break;
358 }
359 }
360 if (s == expr_end) {
361 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362 return NULL;
363 }
364
365 len = expr_end - expr_start;
366 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
367 str = PyMem_Calloc(len + 3, sizeof(char));
368 if (str == NULL) {
369 PyErr_NoMemory();
370 return NULL;
371 }
372
373 // The call to fstring_find_expr_location is responsible for finding the column offset
374 // the generated AST nodes need to be shifted to the right, which is equal to the number
375 // of the f-string characters before the expression starts.
376 memcpy(str+1, expr_start, len);
377 int lines, cols;
378 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
379 PyMem_Free(str);
380 return NULL;
381 }
382
383 // The parentheses are needed in order to allow for leading whitespace within
384 // the f-string expression. This consequently gets parsed as a group (see the
385 // group rule in python.gram).
386 str[0] = '(';
387 str[len+1] = ')';
388
389 struct tok_state* tok = PyTokenizer_FromString(str, 1);
390 if (tok == NULL) {
391 PyMem_Free(str);
392 return NULL;
393 }
394 Py_INCREF(p->tok->filename);
395
396 tok->filename = p->tok->filename;
397 tok->lineno = t->lineno + lines - 1;
398
399 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
400 NULL, p->arena);
401
402 p2->starting_lineno = t->lineno + lines;
403 p2->starting_col_offset = t->col_offset + cols;
404
405 expr = _PyPegen_run_parser(p2);
406
407 if (expr == NULL) {
408 goto exit;
409 }
410 result = expr;
411
412 exit:
413 PyMem_Free(str);
414 _PyPegen_Parser_Free(p2);
415 PyTokenizer_Free(tok);
416 return result;
417 }
418
419 /* Return -1 on error.
420
421 Return 0 if we reached the end of the literal.
422
423 Return 1 if we haven't reached the end of the literal, but we want
424 the caller to process the literal up to this point. Used for
425 doubled braces.
426 */
427 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)428 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
429 PyObject **literal, int recurse_lvl, Token *t)
430 {
431 /* Get any literal string. It ends when we hit an un-doubled left
432 brace (which isn't part of a unicode name escape such as
433 "\N{EULER CONSTANT}"), or the end of the string. */
434
435 const char *s = *str;
436 const char *literal_start = s;
437 int result = 0;
438
439 assert(*literal == NULL);
440 while (s < end) {
441 char ch = *s++;
442 if (!raw && ch == '\\' && s < end) {
443 ch = *s++;
444 if (ch == 'N') {
445 /* We need to look at and skip matching braces for "\N{name}"
446 sequences because otherwise we'll think the opening '{'
447 starts an expression, which is not the case with "\N".
448 Keep looking for either a matched '{' '}' pair, or the end
449 of the string. */
450
451 if (s < end && *s++ == '{') {
452 while (s < end && *s++ != '}') {
453 }
454 continue;
455 }
456
457 /* This is an invalid "\N" sequence, since it's a "\N" not
458 followed by a "{". Just keep parsing this literal. This
459 error will be caught later by
460 decode_unicode_with_escapes(). */
461 continue;
462 }
463 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
464 return -1;
465 }
466 }
467 if (ch == '{' || ch == '}') {
468 /* Check for doubled braces, but only at the top level. If
469 we checked at every level, then f'{0:{3}}' would fail
470 with the two closing braces. */
471 if (recurse_lvl == 0) {
472 if (s < end && *s == ch) {
473 /* We're going to tell the caller that the literal ends
474 here, but that they should continue scanning. But also
475 skip over the second brace when we resume scanning. */
476 *str = s + 1;
477 result = 1;
478 goto done;
479 }
480
481 /* Where a single '{' is the start of a new expression, a
482 single '}' is not allowed. */
483 if (ch == '}') {
484 *str = s - 1;
485 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
486 return -1;
487 }
488 }
489 /* We're either at a '{', which means we're starting another
490 expression; or a '}', which means we're at the end of this
491 f-string (for a nested format_spec). */
492 s--;
493 break;
494 }
495 }
496 *str = s;
497 assert(s <= end);
498 assert(s == end || *s == '{' || *s == '}');
499 done:
500 if (literal_start != s) {
501 if (raw) {
502 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
503 s - literal_start,
504 NULL, NULL);
505 }
506 else {
507 *literal = decode_unicode_with_escapes(p, literal_start,
508 s - literal_start, t);
509 }
510 if (!*literal) {
511 return -1;
512 }
513 }
514 return result;
515 }
516
517 /* Forward declaration because parsing is recursive. */
518 static expr_ty
519 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
520 Token *first_token, Token* t, Token *last_token);
521
522 /* Parse the f-string at *str, ending at end. We know *str starts an
523 expression (so it must be a '{'). Returns the FormattedValue node, which
524 includes the expression, conversion character, format_spec expression, and
525 optionally the text of the expression (if = is used).
526
527 Note that I don't do a perfect job here: I don't make sure that a
528 closing brace doesn't match an opening paren, for example. It
529 doesn't need to error on all invalid expressions, just correctly
530 find the end of all valid ones. Any errors inside the expression
531 will be caught when we parse it later.
532
533 *expression is set to the expression. For an '=' "debug" expression,
534 *expr_text is set to the debug text (the original text of the expression,
535 including the '=' and any whitespace around it, as a string object). If
536 not a debug expression, *expr_text set to NULL. */
537 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)538 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
539 PyObject **expr_text, expr_ty *expression, Token *first_token,
540 Token *t, Token *last_token)
541 {
542 /* Return -1 on error, else 0. */
543
544 const char *expr_start;
545 const char *expr_end;
546 expr_ty simple_expression;
547 expr_ty format_spec = NULL; /* Optional format specifier. */
548 int conversion = -1; /* The conversion char. Use default if not
549 specified, or !r if using = and no format
550 spec. */
551
552 /* 0 if we're not in a string, else the quote char we're trying to
553 match (single or double quote). */
554 char quote_char = 0;
555
556 /* If we're inside a string, 1=normal, 3=triple-quoted. */
557 int string_type = 0;
558
559 /* Keep track of nesting level for braces/parens/brackets in
560 expressions. */
561 Py_ssize_t nested_depth = 0;
562 char parenstack[MAXLEVEL];
563
564 *expr_text = NULL;
565
566 /* Can only nest one level deep. */
567 if (recurse_lvl >= 2) {
568 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
569 goto error;
570 }
571
572 /* The first char must be a left brace, or we wouldn't have gotten
573 here. Skip over it. */
574 assert(**str == '{');
575 *str += 1;
576
577 expr_start = *str;
578 for (; *str < end; (*str)++) {
579 char ch;
580
581 /* Loop invariants. */
582 assert(nested_depth >= 0);
583 assert(*str >= expr_start && *str < end);
584 if (quote_char) {
585 assert(string_type == 1 || string_type == 3);
586 } else {
587 assert(string_type == 0);
588 }
589
590 ch = **str;
591 /* Nowhere inside an expression is a backslash allowed. */
592 if (ch == '\\') {
593 /* Error: can't include a backslash character, inside
594 parens or strings or not. */
595 RAISE_SYNTAX_ERROR(
596 "f-string expression part "
597 "cannot include a backslash");
598 goto error;
599 }
600 if (quote_char) {
601 /* We're inside a string. See if we're at the end. */
602 /* This code needs to implement the same non-error logic
603 as tok_get from tokenizer.c, at the letter_quote
604 label. To actually share that code would be a
605 nightmare. But, it's unlikely to change and is small,
606 so duplicate it here. Note we don't need to catch all
607 of the errors, since they'll be caught when parsing the
608 expression. We just need to match the non-error
609 cases. Thus we can ignore \n in single-quoted strings,
610 for example. Or non-terminated strings. */
611 if (ch == quote_char) {
612 /* Does this match the string_type (single or triple
613 quoted)? */
614 if (string_type == 3) {
615 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
616 /* We're at the end of a triple quoted string. */
617 *str += 2;
618 string_type = 0;
619 quote_char = 0;
620 continue;
621 }
622 } else {
623 /* We're at the end of a normal string. */
624 quote_char = 0;
625 string_type = 0;
626 continue;
627 }
628 }
629 } else if (ch == '\'' || ch == '"') {
630 /* Is this a triple quoted string? */
631 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
632 string_type = 3;
633 *str += 2;
634 } else {
635 /* Start of a normal string. */
636 string_type = 1;
637 }
638 /* Start looking for the end of the string. */
639 quote_char = ch;
640 } else if (ch == '[' || ch == '{' || ch == '(') {
641 if (nested_depth >= MAXLEVEL) {
642 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
643 goto error;
644 }
645 parenstack[nested_depth] = ch;
646 nested_depth++;
647 } else if (ch == '#') {
648 /* Error: can't include a comment character, inside parens
649 or not. */
650 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
651 goto error;
652 } else if (nested_depth == 0 &&
653 (ch == '!' || ch == ':' || ch == '}' ||
654 ch == '=' || ch == '>' || ch == '<')) {
655 /* See if there's a next character. */
656 if (*str+1 < end) {
657 char next = *(*str+1);
658
659 /* For "!=". since '=' is not an allowed conversion character,
660 nothing is lost in this test. */
661 if ((ch == '!' && next == '=') || /* != */
662 (ch == '=' && next == '=') || /* == */
663 (ch == '<' && next == '=') || /* <= */
664 (ch == '>' && next == '=') /* >= */
665 ) {
666 *str += 1;
667 continue;
668 }
669 }
670 /* Don't get out of the loop for these, if they're single
671 chars (not part of 2-char tokens). If by themselves, they
672 don't end an expression (unlike say '!'). */
673 if (ch == '>' || ch == '<') {
674 continue;
675 }
676
677 /* Normal way out of this loop. */
678 break;
679 } else if (ch == ']' || ch == '}' || ch == ')') {
680 if (!nested_depth) {
681 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
682 goto error;
683 }
684 nested_depth--;
685 int opening = (unsigned char)parenstack[nested_depth];
686 if (!((opening == '(' && ch == ')') ||
687 (opening == '[' && ch == ']') ||
688 (opening == '{' && ch == '}')))
689 {
690 RAISE_SYNTAX_ERROR(
691 "f-string: closing parenthesis '%c' "
692 "does not match opening parenthesis '%c'",
693 ch, opening);
694 goto error;
695 }
696 } else {
697 /* Just consume this char and loop around. */
698 }
699 }
700 expr_end = *str;
701 /* If we leave the above loop in a string or with mismatched parens, we
702 don't really care. We'll get a syntax error when compiling the
703 expression. But, we can produce a better error message, so let's just
704 do that.*/
705 if (quote_char) {
706 RAISE_SYNTAX_ERROR("f-string: unterminated string");
707 goto error;
708 }
709 if (nested_depth) {
710 int opening = (unsigned char)parenstack[nested_depth - 1];
711 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
712 goto error;
713 }
714
715 if (*str >= end) {
716 goto unexpected_end_of_string;
717 }
718
719 /* Compile the expression as soon as possible, so we show errors
720 related to the expression before errors related to the
721 conversion or format_spec. */
722 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
723 if (!simple_expression) {
724 goto error;
725 }
726
727 /* Check for =, which puts the text value of the expression in
728 expr_text. */
729 if (**str == '=') {
730 if (p->feature_version < 8) {
731 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
732 "only supported in Python 3.8 and greater");
733 goto error;
734 }
735 *str += 1;
736
737 /* Skip over ASCII whitespace. No need to test for end of string
738 here, since we know there's at least a trailing quote somewhere
739 ahead. */
740 while (Py_ISSPACE(**str)) {
741 *str += 1;
742 }
743
744 /* Set *expr_text to the text of the expression. */
745 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
746 if (!*expr_text) {
747 goto error;
748 }
749 }
750
751 /* Check for a conversion char, if present. */
752 if (**str == '!') {
753 *str += 1;
754 if (*str >= end) {
755 goto unexpected_end_of_string;
756 }
757
758 conversion = (unsigned char)**str;
759 *str += 1;
760
761 /* Validate the conversion. */
762 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
763 RAISE_SYNTAX_ERROR(
764 "f-string: invalid conversion character: "
765 "expected 's', 'r', or 'a'");
766 goto error;
767 }
768
769 }
770
771 /* Check for the format spec, if present. */
772 if (*str >= end) {
773 goto unexpected_end_of_string;
774 }
775 if (**str == ':') {
776 *str += 1;
777 if (*str >= end) {
778 goto unexpected_end_of_string;
779 }
780
781 /* Parse the format spec. */
782 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
783 first_token, t, last_token);
784 if (!format_spec) {
785 goto error;
786 }
787 }
788
789 if (*str >= end || **str != '}') {
790 goto unexpected_end_of_string;
791 }
792
793 /* We're at a right brace. Consume it. */
794 assert(*str < end);
795 assert(**str == '}');
796 *str += 1;
797
798 /* If we're in = mode (detected by non-NULL expr_text), and have no format
799 spec and no explicit conversion, set the conversion to 'r'. */
800 if (*expr_text && format_spec == NULL && conversion == -1) {
801 conversion = 'r';
802 }
803
804 /* And now create the FormattedValue node that represents this
805 entire expression with the conversion and format spec. */
806 //TODO: Fix this
807 *expression = _PyAST_FormattedValue(simple_expression, conversion,
808 format_spec, first_token->lineno,
809 first_token->col_offset,
810 last_token->end_lineno,
811 last_token->end_col_offset, p->arena);
812 if (!*expression) {
813 goto error;
814 }
815
816 return 0;
817
818 unexpected_end_of_string:
819 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
820 /* Falls through to error. */
821
822 error:
823 Py_XDECREF(*expr_text);
824 return -1;
825
826 }
827
828 /* Return -1 on error.
829
830 Return 0 if we have a literal (possible zero length) and an
831 expression (zero length if at the end of the string.
832
833 Return 1 if we have a literal, but no expression, and we want the
834 caller to call us again. This is used to deal with doubled
835 braces.
836
837 When called multiple times on the string 'a{{b{0}c', this function
838 will return:
839
840 1. the literal 'a{' with no expression, and a return value
841 of 1. Despite the fact that there's no expression, the return
842 value of 1 means we're not finished yet.
843
844 2. the literal 'b' and the expression '0', with a return value of
845 0. The fact that there's an expression means we're not finished.
846
847 3. literal 'c' with no expression and a return value of 0. The
848 combination of the return value of 0 with no expression means
849 we're finished.
850 */
851 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)852 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
853 int recurse_lvl, PyObject **literal,
854 PyObject **expr_text, expr_ty *expression,
855 Token *first_token, Token *t, Token *last_token)
856 {
857 int result;
858
859 assert(*literal == NULL && *expression == NULL);
860
861 /* Get any literal string. */
862 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
863 if (result < 0) {
864 goto error;
865 }
866
867 assert(result == 0 || result == 1);
868
869 if (result == 1) {
870 /* We have a literal, but don't look at the expression. */
871 return 1;
872 }
873
874 if (*str >= end || **str == '}') {
875 /* We're at the end of the string or the end of a nested
876 f-string: no expression. The top-level error case where we
877 expect to be at the end of the string but we're at a '}' is
878 handled later. */
879 return 0;
880 }
881
882 /* We must now be the start of an expression, on a '{'. */
883 assert(**str == '{');
884
885 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
886 expression, first_token, t, last_token) < 0) {
887 goto error;
888 }
889
890 return 0;
891
892 error:
893 Py_CLEAR(*literal);
894 return -1;
895 }
896
897 #ifdef NDEBUG
898 #define ExprList_check_invariants(l)
899 #else
900 static void
ExprList_check_invariants(ExprList * l)901 ExprList_check_invariants(ExprList *l)
902 {
903 /* Check our invariants. Make sure this object is "live", and
904 hasn't been deallocated. */
905 assert(l->size >= 0);
906 assert(l->p != NULL);
907 if (l->size <= EXPRLIST_N_CACHED) {
908 assert(l->data == l->p);
909 }
910 }
911 #endif
912
913 static void
ExprList_Init(ExprList * l)914 ExprList_Init(ExprList *l)
915 {
916 l->allocated = EXPRLIST_N_CACHED;
917 l->size = 0;
918
919 /* Until we start allocating dynamically, p points to data. */
920 l->p = l->data;
921
922 ExprList_check_invariants(l);
923 }
924
925 static int
ExprList_Append(ExprList * l,expr_ty exp)926 ExprList_Append(ExprList *l, expr_ty exp)
927 {
928 ExprList_check_invariants(l);
929 if (l->size >= l->allocated) {
930 /* We need to alloc (or realloc) the memory. */
931 Py_ssize_t new_size = l->allocated * 2;
932
933 /* See if we've ever allocated anything dynamically. */
934 if (l->p == l->data) {
935 Py_ssize_t i;
936 /* We're still using the cached data. Switch to
937 alloc-ing. */
938 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
939 if (!l->p) {
940 return -1;
941 }
942 /* Copy the cached data into the new buffer. */
943 for (i = 0; i < l->size; i++) {
944 l->p[i] = l->data[i];
945 }
946 } else {
947 /* Just realloc. */
948 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
949 if (!tmp) {
950 PyMem_Free(l->p);
951 l->p = NULL;
952 return -1;
953 }
954 l->p = tmp;
955 }
956
957 l->allocated = new_size;
958 assert(l->allocated == 2 * l->size);
959 }
960
961 l->p[l->size++] = exp;
962
963 ExprList_check_invariants(l);
964 return 0;
965 }
966
967 static void
ExprList_Dealloc(ExprList * l)968 ExprList_Dealloc(ExprList *l)
969 {
970 ExprList_check_invariants(l);
971
972 /* If there's been an error, or we've never dynamically allocated,
973 do nothing. */
974 if (!l->p || l->p == l->data) {
975 /* Do nothing. */
976 } else {
977 /* We have dynamically allocated. Free the memory. */
978 PyMem_Free(l->p);
979 }
980 l->p = NULL;
981 l->size = -1;
982 }
983
984 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)985 ExprList_Finish(ExprList *l, PyArena *arena)
986 {
987 asdl_expr_seq *seq;
988
989 ExprList_check_invariants(l);
990
991 /* Allocate the asdl_seq and copy the expressions in to it. */
992 seq = _Py_asdl_expr_seq_new(l->size, arena);
993 if (seq) {
994 Py_ssize_t i;
995 for (i = 0; i < l->size; i++) {
996 asdl_seq_SET(seq, i, l->p[i]);
997 }
998 }
999 ExprList_Dealloc(l);
1000 return seq;
1001 }
1002
1003 #ifdef NDEBUG
1004 #define FstringParser_check_invariants(state)
1005 #else
1006 static void
FstringParser_check_invariants(FstringParser * state)1007 FstringParser_check_invariants(FstringParser *state)
1008 {
1009 if (state->last_str) {
1010 assert(PyUnicode_CheckExact(state->last_str));
1011 }
1012 ExprList_check_invariants(&state->expr_list);
1013 }
1014 #endif
1015
1016 void
_PyPegen_FstringParser_Init(FstringParser * state)1017 _PyPegen_FstringParser_Init(FstringParser *state)
1018 {
1019 state->last_str = NULL;
1020 state->fmode = 0;
1021 ExprList_Init(&state->expr_list);
1022 FstringParser_check_invariants(state);
1023 }
1024
1025 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1026 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1027 {
1028 FstringParser_check_invariants(state);
1029
1030 Py_XDECREF(state->last_str);
1031 ExprList_Dealloc(&state->expr_list);
1032 }
1033
1034 /* Make a Constant node, but decref the PyUnicode object being added. */
1035 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1036 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1037 {
1038 PyObject *s = *str;
1039 PyObject *kind = NULL;
1040 *str = NULL;
1041 assert(PyUnicode_CheckExact(s));
1042 if (_PyArena_AddPyObject(p->arena, s) < 0) {
1043 Py_DECREF(s);
1044 return NULL;
1045 }
1046 const char* the_str = PyBytes_AsString(first_token->bytes);
1047 if (the_str && the_str[0] == 'u') {
1048 kind = _PyPegen_new_identifier(p, "u");
1049 }
1050
1051 if (kind == NULL && PyErr_Occurred()) {
1052 return NULL;
1053 }
1054
1055 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1056 last_token->end_lineno, last_token->end_col_offset,
1057 p->arena);
1058
1059 }
1060
1061
1062 /* Add a non-f-string (that is, a regular literal string). str is
1063 decref'd. */
1064 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1065 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1066 {
1067 FstringParser_check_invariants(state);
1068
1069 assert(PyUnicode_CheckExact(str));
1070
1071 if (PyUnicode_GET_LENGTH(str) == 0) {
1072 Py_DECREF(str);
1073 return 0;
1074 }
1075
1076 if (!state->last_str) {
1077 /* We didn't have a string before, so just remember this one. */
1078 state->last_str = str;
1079 } else {
1080 /* Concatenate this with the previous string. */
1081 PyUnicode_AppendAndDel(&state->last_str, str);
1082 if (!state->last_str) {
1083 return -1;
1084 }
1085 }
1086 FstringParser_check_invariants(state);
1087 return 0;
1088 }
1089
1090 /* Parse an f-string. The f-string is in *str to end, with no
1091 'f' or quotes. */
1092 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1093 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1094 const char *end, int raw, int recurse_lvl,
1095 Token *first_token, Token* t, Token *last_token)
1096 {
1097 FstringParser_check_invariants(state);
1098 state->fmode = 1;
1099
1100 /* Parse the f-string. */
1101 while (1) {
1102 PyObject *literal = NULL;
1103 PyObject *expr_text = NULL;
1104 expr_ty expression = NULL;
1105
1106 /* If there's a zero length literal in front of the
1107 expression, literal will be NULL. If we're at the end of
1108 the f-string, expression will be NULL (unless result == 1,
1109 see below). */
1110 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1111 &literal, &expr_text,
1112 &expression, first_token, t, last_token);
1113 if (result < 0) {
1114 return -1;
1115 }
1116
1117 /* Add the literal, if any. */
1118 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1119 Py_XDECREF(expr_text);
1120 return -1;
1121 }
1122 /* Add the expr_text, if any. */
1123 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1124 return -1;
1125 }
1126
1127 /* We've dealt with the literal and expr_text, their ownership has
1128 been transferred to the state object. Don't look at them again. */
1129
1130 /* See if we should just loop around to get the next literal
1131 and expression, while ignoring the expression this
1132 time. This is used for un-doubling braces, as an
1133 optimization. */
1134 if (result == 1) {
1135 continue;
1136 }
1137
1138 if (!expression) {
1139 /* We're done with this f-string. */
1140 break;
1141 }
1142
1143 /* We know we have an expression. Convert any existing string
1144 to a Constant node. */
1145 if (!state->last_str) {
1146 /* Do nothing. No previous literal. */
1147 } else {
1148 /* Convert the existing last_str literal to a Constant node. */
1149 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1150 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1151 return -1;
1152 }
1153 }
1154
1155 if (ExprList_Append(&state->expr_list, expression) < 0) {
1156 return -1;
1157 }
1158 }
1159
1160 /* If recurse_lvl is zero, then we must be at the end of the
1161 string. Otherwise, we must be at a right brace. */
1162
1163 if (recurse_lvl == 0 && *str < end-1) {
1164 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1165 return -1;
1166 }
1167 if (recurse_lvl != 0 && **str != '}') {
1168 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1169 return -1;
1170 }
1171
1172 FstringParser_check_invariants(state);
1173 return 0;
1174 }
1175
1176 /* Convert the partial state reflected in last_str and expr_list to an
1177 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1178 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1179 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1180 Token *last_token)
1181 {
1182 asdl_expr_seq *seq;
1183
1184 FstringParser_check_invariants(state);
1185
1186 /* If we're just a constant string with no expressions, return
1187 that. */
1188 if (!state->fmode) {
1189 assert(!state->expr_list.size);
1190 if (!state->last_str) {
1191 /* Create a zero length string. */
1192 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1193 if (!state->last_str) {
1194 goto error;
1195 }
1196 }
1197 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1198 }
1199
1200 /* Create a Constant node out of last_str, if needed. It will be the
1201 last node in our expression list. */
1202 if (state->last_str) {
1203 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1204 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1205 goto error;
1206 }
1207 }
1208 /* This has already been freed. */
1209 assert(state->last_str == NULL);
1210
1211 seq = ExprList_Finish(&state->expr_list, p->arena);
1212 if (!seq) {
1213 goto error;
1214 }
1215
1216 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1217 last_token->end_lineno, last_token->end_col_offset,
1218 p->arena);
1219
1220 error:
1221 _PyPegen_FstringParser_Dealloc(state);
1222 return NULL;
1223 }
1224
1225 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1226 at end, parse it into an expr_ty. Return NULL on error. Adjust
1227 str to point past the parsed portion. */
1228 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1229 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1230 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1231 {
1232 FstringParser state;
1233
1234 _PyPegen_FstringParser_Init(&state);
1235 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1236 first_token, t, last_token) < 0) {
1237 _PyPegen_FstringParser_Dealloc(&state);
1238 return NULL;
1239 }
1240
1241 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1242 }
1243