1 #include <stdbool.h>
2
3 #include <Python.h>
4
5 #include "tokenizer.h"
6 #include "pegen.h"
7 #include "string_parser.h"
8
9 //// STRING HANDLING FUNCTIONS ////
10
11 static int
warn_invalid_escape_sequence(Parser * p,unsigned char first_invalid_escape_char,Token * t)12 warn_invalid_escape_sequence(Parser *p, unsigned char first_invalid_escape_char, Token *t)
13 {
14 PyObject *msg =
15 PyUnicode_FromFormat("invalid escape sequence '\\%c'", first_invalid_escape_char);
16 if (msg == NULL) {
17 return -1;
18 }
19 if (PyErr_WarnExplicitObject(PyExc_DeprecationWarning, msg, p->tok->filename,
20 t->lineno, NULL, NULL) < 0) {
21 if (PyErr_ExceptionMatches(PyExc_DeprecationWarning)) {
22 /* Replace the DeprecationWarning exception with a SyntaxError
23 to get a more accurate error report */
24 PyErr_Clear();
25
26 /* This is needed, in order for the SyntaxError to point to the token t,
27 since _PyPegen_raise_error uses p->tokens[p->fill - 1] for the
28 error location, if p->known_err_token is not set. */
29 p->known_err_token = t;
30 RAISE_SYNTAX_ERROR("invalid escape sequence '\\%c'", first_invalid_escape_char);
31 }
32 Py_DECREF(msg);
33 return -1;
34 }
35 Py_DECREF(msg);
36 return 0;
37 }
38
39 static PyObject *
decode_utf8(const char ** sPtr,const char * end)40 decode_utf8(const char **sPtr, const char *end)
41 {
42 const char *s;
43 const char *t;
44 t = s = *sPtr;
45 while (s < end && (*s & 0x80)) {
46 s++;
47 }
48 *sPtr = s;
49 return PyUnicode_DecodeUTF8(t, s - t, NULL);
50 }
51
52 static PyObject *
decode_unicode_with_escapes(Parser * parser,const char * s,size_t len,Token * t)53 decode_unicode_with_escapes(Parser *parser, const char *s, size_t len, Token *t)
54 {
55 PyObject *v;
56 PyObject *u;
57 char *buf;
58 char *p;
59 const char *end;
60
61 /* check for integer overflow */
62 if (len > SIZE_MAX / 6) {
63 return NULL;
64 }
65 /* "ä" (2 bytes) may become "\U000000E4" (10 bytes), or 1:5
66 "\ä" (3 bytes) may become "\u005c\U000000E4" (16 bytes), or ~1:6 */
67 u = PyBytes_FromStringAndSize((char *)NULL, len * 6);
68 if (u == NULL) {
69 return NULL;
70 }
71 p = buf = PyBytes_AsString(u);
72 if (p == NULL) {
73 return NULL;
74 }
75 end = s + len;
76 while (s < end) {
77 if (*s == '\\') {
78 *p++ = *s++;
79 if (s >= end || *s & 0x80) {
80 strcpy(p, "u005c");
81 p += 5;
82 if (s >= end) {
83 break;
84 }
85 }
86 }
87 if (*s & 0x80) {
88 PyObject *w;
89 int kind;
90 const void *data;
91 Py_ssize_t w_len;
92 Py_ssize_t i;
93 w = decode_utf8(&s, end);
94 if (w == NULL) {
95 Py_DECREF(u);
96 return NULL;
97 }
98 kind = PyUnicode_KIND(w);
99 data = PyUnicode_DATA(w);
100 w_len = PyUnicode_GET_LENGTH(w);
101 for (i = 0; i < w_len; i++) {
102 Py_UCS4 chr = PyUnicode_READ(kind, data, i);
103 sprintf(p, "\\U%08x", chr);
104 p += 10;
105 }
106 /* Should be impossible to overflow */
107 assert(p - buf <= PyBytes_GET_SIZE(u));
108 Py_DECREF(w);
109 }
110 else {
111 *p++ = *s++;
112 }
113 }
114 len = p - buf;
115 s = buf;
116
117 const char *first_invalid_escape;
118 v = _PyUnicode_DecodeUnicodeEscapeInternal(s, len, NULL, NULL, &first_invalid_escape);
119
120 if (v != NULL && first_invalid_escape != NULL) {
121 if (warn_invalid_escape_sequence(parser, *first_invalid_escape, t) < 0) {
122 /* We have not decref u before because first_invalid_escape points
123 inside u. */
124 Py_XDECREF(u);
125 Py_DECREF(v);
126 return NULL;
127 }
128 }
129 Py_XDECREF(u);
130 return v;
131 }
132
133 static PyObject *
decode_bytes_with_escapes(Parser * p,const char * s,Py_ssize_t len,Token * t)134 decode_bytes_with_escapes(Parser *p, const char *s, Py_ssize_t len, Token *t)
135 {
136 const char *first_invalid_escape;
137 PyObject *result = _PyBytes_DecodeEscape(s, len, NULL, &first_invalid_escape);
138 if (result == NULL) {
139 return NULL;
140 }
141
142 if (first_invalid_escape != NULL) {
143 if (warn_invalid_escape_sequence(p, *first_invalid_escape, t) < 0) {
144 Py_DECREF(result);
145 return NULL;
146 }
147 }
148 return result;
149 }
150
151 /* s must include the bracketing quote characters, and r, b, u,
152 &/or f prefixes (if any), and embedded escape sequences (if any).
153 _PyPegen_parsestr parses it, and sets *result to decoded Python string object.
154 If the string is an f-string, set *fstr and *fstrlen to the unparsed
155 string object. Return 0 if no errors occurred. */
156 int
_PyPegen_parsestr(Parser * p,int * bytesmode,int * rawmode,PyObject ** result,const char ** fstr,Py_ssize_t * fstrlen,Token * t)157 _PyPegen_parsestr(Parser *p, int *bytesmode, int *rawmode, PyObject **result,
158 const char **fstr, Py_ssize_t *fstrlen, Token *t)
159 {
160 const char *s = PyBytes_AsString(t->bytes);
161 if (s == NULL) {
162 return -1;
163 }
164
165 size_t len;
166 int quote = Py_CHARMASK(*s);
167 int fmode = 0;
168 *bytesmode = 0;
169 *rawmode = 0;
170 *result = NULL;
171 *fstr = NULL;
172 if (Py_ISALPHA(quote)) {
173 while (!*bytesmode || !*rawmode) {
174 if (quote == 'b' || quote == 'B') {
175 quote =(unsigned char)*++s;
176 *bytesmode = 1;
177 }
178 else if (quote == 'u' || quote == 'U') {
179 quote = (unsigned char)*++s;
180 }
181 else if (quote == 'r' || quote == 'R') {
182 quote = (unsigned char)*++s;
183 *rawmode = 1;
184 }
185 else if (quote == 'f' || quote == 'F') {
186 quote = (unsigned char)*++s;
187 fmode = 1;
188 }
189 else {
190 break;
191 }
192 }
193 }
194
195 /* fstrings are only allowed in Python 3.6 and greater */
196 if (fmode && p->feature_version < 6) {
197 p->error_indicator = 1;
198 RAISE_SYNTAX_ERROR("Format strings are only supported in Python 3.6 and greater");
199 return -1;
200 }
201
202 if (fmode && *bytesmode) {
203 PyErr_BadInternalCall();
204 return -1;
205 }
206 if (quote != '\'' && quote != '\"') {
207 PyErr_BadInternalCall();
208 return -1;
209 }
210 /* Skip the leading quote char. */
211 s++;
212 len = strlen(s);
213 if (len > INT_MAX) {
214 PyErr_SetString(PyExc_OverflowError, "string to parse is too long");
215 return -1;
216 }
217 if (s[--len] != quote) {
218 /* Last quote char must match the first. */
219 PyErr_BadInternalCall();
220 return -1;
221 }
222 if (len >= 4 && s[0] == quote && s[1] == quote) {
223 /* A triple quoted string. We've already skipped one quote at
224 the start and one at the end of the string. Now skip the
225 two at the start. */
226 s += 2;
227 len -= 2;
228 /* And check that the last two match. */
229 if (s[--len] != quote || s[--len] != quote) {
230 PyErr_BadInternalCall();
231 return -1;
232 }
233 }
234
235 if (fmode) {
236 /* Just return the bytes. The caller will parse the resulting
237 string. */
238 *fstr = s;
239 *fstrlen = len;
240 return 0;
241 }
242
243 /* Not an f-string. */
244 /* Avoid invoking escape decoding routines if possible. */
245 *rawmode = *rawmode || strchr(s, '\\') == NULL;
246 if (*bytesmode) {
247 /* Disallow non-ASCII characters. */
248 const char *ch;
249 for (ch = s; *ch; ch++) {
250 if (Py_CHARMASK(*ch) >= 0x80) {
251 RAISE_SYNTAX_ERROR(
252 "bytes can only contain ASCII "
253 "literal characters");
254 return -1;
255 }
256 }
257 if (*rawmode) {
258 *result = PyBytes_FromStringAndSize(s, len);
259 }
260 else {
261 *result = decode_bytes_with_escapes(p, s, len, t);
262 }
263 }
264 else {
265 if (*rawmode) {
266 *result = PyUnicode_DecodeUTF8Stateful(s, len, NULL, NULL);
267 }
268 else {
269 *result = decode_unicode_with_escapes(p, s, len, t);
270 }
271 }
272 return *result == NULL ? -1 : 0;
273 }
274
275
276
277 // FSTRING STUFF
278
279 /* Fix locations for the given node and its children.
280
281 `parent` is the enclosing node.
282 `expr_start` is the starting position of the expression (pointing to the open brace).
283 `n` is the node which locations are going to be fixed relative to parent.
284 `expr_str` is the child node's string representation, including braces.
285 */
286 static bool
fstring_find_expr_location(Token * parent,const char * expr_start,char * expr_str,int * p_lines,int * p_cols)287 fstring_find_expr_location(Token *parent, const char* expr_start, char *expr_str, int *p_lines, int *p_cols)
288 {
289 *p_lines = 0;
290 *p_cols = 0;
291 assert(expr_start != NULL && *expr_start == '{');
292 if (parent && parent->bytes) {
293 const char *parent_str = PyBytes_AsString(parent->bytes);
294 if (!parent_str) {
295 return false;
296 }
297 // The following is needed, in order to correctly shift the column
298 // offset, in the case that (disregarding any whitespace) a newline
299 // immediately follows the opening curly brace of the fstring expression.
300 bool newline_after_brace = 1;
301 const char *start = expr_start + 1;
302 while (start && *start != '}' && *start != '\n') {
303 if (*start != ' ' && *start != '\t' && *start != '\f') {
304 newline_after_brace = 0;
305 break;
306 }
307 start++;
308 }
309
310 // Account for the characters from the last newline character to our
311 // left until the beginning of expr_start.
312 if (!newline_after_brace) {
313 start = expr_start;
314 while (start > parent_str && *start != '\n') {
315 start--;
316 }
317 *p_cols += (int)(expr_start - start);
318 }
319 /* adjust the start based on the number of newlines encountered
320 before the f-string expression */
321 for (const char *p = parent_str; p < expr_start; p++) {
322 if (*p == '\n') {
323 (*p_lines)++;
324 }
325 }
326 }
327 return true;
328 }
329
330
331 /* Compile this expression in to an expr_ty. Add parens around the
332 expression, in order to allow leading spaces in the expression. */
333 static expr_ty
fstring_compile_expr(Parser * p,const char * expr_start,const char * expr_end,Token * t)334 fstring_compile_expr(Parser *p, const char *expr_start, const char *expr_end,
335 Token *t)
336 {
337 expr_ty expr = NULL;
338 char *str;
339 Py_ssize_t len;
340 const char *s;
341 expr_ty result = NULL;
342
343 assert(expr_end >= expr_start);
344 assert(*(expr_start-1) == '{');
345 assert(*expr_end == '}' || *expr_end == '!' || *expr_end == ':' ||
346 *expr_end == '=');
347
348 /* If the substring is all whitespace, it's an error. We need to catch this
349 here, and not when we call PyParser_SimpleParseStringFlagsFilename,
350 because turning the expression '' in to '()' would go from being invalid
351 to valid. */
352 for (s = expr_start; s != expr_end; s++) {
353 char c = *s;
354 /* The Python parser ignores only the following whitespace
355 characters (\r already is converted to \n). */
356 if (!(c == ' ' || c == '\t' || c == '\n' || c == '\f')) {
357 break;
358 }
359 }
360 if (s == expr_end) {
361 RAISE_SYNTAX_ERROR("f-string: empty expression not allowed");
362 return NULL;
363 }
364
365 len = expr_end - expr_start;
366 /* Allocate 3 extra bytes: open paren, close paren, null byte. */
367 str = PyMem_Calloc(len + 3, sizeof(char));
368 if (str == NULL) {
369 PyErr_NoMemory();
370 return NULL;
371 }
372
373 // The call to fstring_find_expr_location is responsible for finding the column offset
374 // the generated AST nodes need to be shifted to the right, which is equal to the number
375 // of the f-string characters before the expression starts.
376 memcpy(str+1, expr_start, len);
377 int lines, cols;
378 if (!fstring_find_expr_location(t, expr_start-1, str+1, &lines, &cols)) {
379 PyMem_Free(str);
380 return NULL;
381 }
382
383 // The parentheses are needed in order to allow for leading whitespace within
384 // the f-string expression. This consequently gets parsed as a group (see the
385 // group rule in python.gram).
386 str[0] = '(';
387 str[len+1] = ')';
388
389 struct tok_state* tok = PyTokenizer_FromString(str, 1);
390 if (tok == NULL) {
391 PyMem_Free(str);
392 return NULL;
393 }
394 Py_INCREF(p->tok->filename);
395
396 tok->filename = p->tok->filename;
397 tok->lineno = t->lineno + lines - 1;
398
399 Parser *p2 = _PyPegen_Parser_New(tok, Py_fstring_input, p->flags, p->feature_version,
400 NULL, p->arena);
401
402 p2->starting_lineno = t->lineno + lines;
403 p2->starting_col_offset = t->col_offset + cols;
404
405 expr = _PyPegen_run_parser(p2);
406
407 if (expr == NULL) {
408 goto exit;
409 }
410 result = expr;
411
412 exit:
413 PyMem_Free(str);
414 _PyPegen_Parser_Free(p2);
415 PyTokenizer_Free(tok);
416 return result;
417 }
418
419 /* Return -1 on error.
420
421 Return 0 if we reached the end of the literal.
422
423 Return 1 if we haven't reached the end of the literal, but we want
424 the caller to process the literal up to this point. Used for
425 doubled braces.
426 */
427 static int
fstring_find_literal(Parser * p,const char ** str,const char * end,int raw,PyObject ** literal,int recurse_lvl,Token * t)428 fstring_find_literal(Parser *p, const char **str, const char *end, int raw,
429 PyObject **literal, int recurse_lvl, Token *t)
430 {
431 /* Get any literal string. It ends when we hit an un-doubled left
432 brace (which isn't part of a unicode name escape such as
433 "\N{EULER CONSTANT}"), or the end of the string. */
434
435 const char *s = *str;
436 const char *literal_start = s;
437 int result = 0;
438
439 assert(*literal == NULL);
440 while (s < end) {
441 char ch = *s++;
442 if (!raw && ch == '\\' && s < end) {
443 ch = *s++;
444 if (ch == 'N') {
445 if (s < end && *s++ == '{') {
446 while (s < end && *s++ != '}') {
447 }
448 continue;
449 }
450 break;
451 }
452 if (ch == '{' && warn_invalid_escape_sequence(p, ch, t) < 0) {
453 return -1;
454 }
455 }
456 if (ch == '{' || ch == '}') {
457 /* Check for doubled braces, but only at the top level. If
458 we checked at every level, then f'{0:{3}}' would fail
459 with the two closing braces. */
460 if (recurse_lvl == 0) {
461 if (s < end && *s == ch) {
462 /* We're going to tell the caller that the literal ends
463 here, but that they should continue scanning. But also
464 skip over the second brace when we resume scanning. */
465 *str = s + 1;
466 result = 1;
467 goto done;
468 }
469
470 /* Where a single '{' is the start of a new expression, a
471 single '}' is not allowed. */
472 if (ch == '}') {
473 *str = s - 1;
474 RAISE_SYNTAX_ERROR("f-string: single '}' is not allowed");
475 return -1;
476 }
477 }
478 /* We're either at a '{', which means we're starting another
479 expression; or a '}', which means we're at the end of this
480 f-string (for a nested format_spec). */
481 s--;
482 break;
483 }
484 }
485 *str = s;
486 assert(s <= end);
487 assert(s == end || *s == '{' || *s == '}');
488 done:
489 if (literal_start != s) {
490 if (raw) {
491 *literal = PyUnicode_DecodeUTF8Stateful(literal_start,
492 s - literal_start,
493 NULL, NULL);
494 } else {
495 *literal = decode_unicode_with_escapes(p, literal_start,
496 s - literal_start, t);
497 }
498 if (!*literal) {
499 return -1;
500 }
501 }
502 return result;
503 }
504
505 /* Forward declaration because parsing is recursive. */
506 static expr_ty
507 fstring_parse(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
508 Token *first_token, Token* t, Token *last_token);
509
510 /* Parse the f-string at *str, ending at end. We know *str starts an
511 expression (so it must be a '{'). Returns the FormattedValue node, which
512 includes the expression, conversion character, format_spec expression, and
513 optionally the text of the expression (if = is used).
514
515 Note that I don't do a perfect job here: I don't make sure that a
516 closing brace doesn't match an opening paren, for example. It
517 doesn't need to error on all invalid expressions, just correctly
518 find the end of all valid ones. Any errors inside the expression
519 will be caught when we parse it later.
520
521 *expression is set to the expression. For an '=' "debug" expression,
522 *expr_text is set to the debug text (the original text of the expression,
523 including the '=' and any whitespace around it, as a string object). If
524 not a debug expression, *expr_text set to NULL. */
525 static int
fstring_find_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)526 fstring_find_expr(Parser *p, const char **str, const char *end, int raw, int recurse_lvl,
527 PyObject **expr_text, expr_ty *expression, Token *first_token,
528 Token *t, Token *last_token)
529 {
530 /* Return -1 on error, else 0. */
531
532 const char *expr_start;
533 const char *expr_end;
534 expr_ty simple_expression;
535 expr_ty format_spec = NULL; /* Optional format specifier. */
536 int conversion = -1; /* The conversion char. Use default if not
537 specified, or !r if using = and no format
538 spec. */
539
540 /* 0 if we're not in a string, else the quote char we're trying to
541 match (single or double quote). */
542 char quote_char = 0;
543
544 /* If we're inside a string, 1=normal, 3=triple-quoted. */
545 int string_type = 0;
546
547 /* Keep track of nesting level for braces/parens/brackets in
548 expressions. */
549 Py_ssize_t nested_depth = 0;
550 char parenstack[MAXLEVEL];
551
552 *expr_text = NULL;
553
554 /* Can only nest one level deep. */
555 if (recurse_lvl >= 2) {
556 RAISE_SYNTAX_ERROR("f-string: expressions nested too deeply");
557 goto error;
558 }
559
560 /* The first char must be a left brace, or we wouldn't have gotten
561 here. Skip over it. */
562 assert(**str == '{');
563 *str += 1;
564
565 expr_start = *str;
566 for (; *str < end; (*str)++) {
567 char ch;
568
569 /* Loop invariants. */
570 assert(nested_depth >= 0);
571 assert(*str >= expr_start && *str < end);
572 if (quote_char) {
573 assert(string_type == 1 || string_type == 3);
574 } else {
575 assert(string_type == 0);
576 }
577
578 ch = **str;
579 /* Nowhere inside an expression is a backslash allowed. */
580 if (ch == '\\') {
581 /* Error: can't include a backslash character, inside
582 parens or strings or not. */
583 RAISE_SYNTAX_ERROR(
584 "f-string expression part "
585 "cannot include a backslash");
586 goto error;
587 }
588 if (quote_char) {
589 /* We're inside a string. See if we're at the end. */
590 /* This code needs to implement the same non-error logic
591 as tok_get from tokenizer.c, at the letter_quote
592 label. To actually share that code would be a
593 nightmare. But, it's unlikely to change and is small,
594 so duplicate it here. Note we don't need to catch all
595 of the errors, since they'll be caught when parsing the
596 expression. We just need to match the non-error
597 cases. Thus we can ignore \n in single-quoted strings,
598 for example. Or non-terminated strings. */
599 if (ch == quote_char) {
600 /* Does this match the string_type (single or triple
601 quoted)? */
602 if (string_type == 3) {
603 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
604 /* We're at the end of a triple quoted string. */
605 *str += 2;
606 string_type = 0;
607 quote_char = 0;
608 continue;
609 }
610 } else {
611 /* We're at the end of a normal string. */
612 quote_char = 0;
613 string_type = 0;
614 continue;
615 }
616 }
617 } else if (ch == '\'' || ch == '"') {
618 /* Is this a triple quoted string? */
619 if (*str+2 < end && *(*str+1) == ch && *(*str+2) == ch) {
620 string_type = 3;
621 *str += 2;
622 } else {
623 /* Start of a normal string. */
624 string_type = 1;
625 }
626 /* Start looking for the end of the string. */
627 quote_char = ch;
628 } else if (ch == '[' || ch == '{' || ch == '(') {
629 if (nested_depth >= MAXLEVEL) {
630 RAISE_SYNTAX_ERROR("f-string: too many nested parenthesis");
631 goto error;
632 }
633 parenstack[nested_depth] = ch;
634 nested_depth++;
635 } else if (ch == '#') {
636 /* Error: can't include a comment character, inside parens
637 or not. */
638 RAISE_SYNTAX_ERROR("f-string expression part cannot include '#'");
639 goto error;
640 } else if (nested_depth == 0 &&
641 (ch == '!' || ch == ':' || ch == '}' ||
642 ch == '=' || ch == '>' || ch == '<')) {
643 /* See if there's a next character. */
644 if (*str+1 < end) {
645 char next = *(*str+1);
646
647 /* For "!=". since '=' is not an allowed conversion character,
648 nothing is lost in this test. */
649 if ((ch == '!' && next == '=') || /* != */
650 (ch == '=' && next == '=') || /* == */
651 (ch == '<' && next == '=') || /* <= */
652 (ch == '>' && next == '=') /* >= */
653 ) {
654 *str += 1;
655 continue;
656 }
657 /* Don't get out of the loop for these, if they're single
658 chars (not part of 2-char tokens). If by themselves, they
659 don't end an expression (unlike say '!'). */
660 if (ch == '>' || ch == '<') {
661 continue;
662 }
663 }
664
665 /* Normal way out of this loop. */
666 break;
667 } else if (ch == ']' || ch == '}' || ch == ')') {
668 if (!nested_depth) {
669 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", ch);
670 goto error;
671 }
672 nested_depth--;
673 int opening = (unsigned char)parenstack[nested_depth];
674 if (!((opening == '(' && ch == ')') ||
675 (opening == '[' && ch == ']') ||
676 (opening == '{' && ch == '}')))
677 {
678 RAISE_SYNTAX_ERROR(
679 "f-string: closing parenthesis '%c' "
680 "does not match opening parenthesis '%c'",
681 ch, opening);
682 goto error;
683 }
684 } else {
685 /* Just consume this char and loop around. */
686 }
687 }
688 expr_end = *str;
689 /* If we leave this loop in a string or with mismatched parens, we
690 don't care. We'll get a syntax error when compiling the
691 expression. But, we can produce a better error message, so
692 let's just do that.*/
693 if (quote_char) {
694 RAISE_SYNTAX_ERROR("f-string: unterminated string");
695 goto error;
696 }
697 if (nested_depth) {
698 int opening = (unsigned char)parenstack[nested_depth - 1];
699 RAISE_SYNTAX_ERROR("f-string: unmatched '%c'", opening);
700 goto error;
701 }
702
703 if (*str >= end) {
704 goto unexpected_end_of_string;
705 }
706
707 /* Compile the expression as soon as possible, so we show errors
708 related to the expression before errors related to the
709 conversion or format_spec. */
710 simple_expression = fstring_compile_expr(p, expr_start, expr_end, t);
711 if (!simple_expression) {
712 goto error;
713 }
714
715 /* Check for =, which puts the text value of the expression in
716 expr_text. */
717 if (**str == '=') {
718 if (p->feature_version < 8) {
719 RAISE_SYNTAX_ERROR("f-string: self documenting expressions are "
720 "only supported in Python 3.8 and greater");
721 goto error;
722 }
723 *str += 1;
724
725 /* Skip over ASCII whitespace. No need to test for end of string
726 here, since we know there's at least a trailing quote somewhere
727 ahead. */
728 while (Py_ISSPACE(**str)) {
729 *str += 1;
730 }
731
732 /* Set *expr_text to the text of the expression. */
733 *expr_text = PyUnicode_FromStringAndSize(expr_start, *str-expr_start);
734 if (!*expr_text) {
735 goto error;
736 }
737 }
738
739 /* Check for a conversion char, if present. */
740 if (**str == '!') {
741 *str += 1;
742 if (*str >= end) {
743 goto unexpected_end_of_string;
744 }
745
746 conversion = (unsigned char)**str;
747 *str += 1;
748
749 /* Validate the conversion. */
750 if (!(conversion == 's' || conversion == 'r' || conversion == 'a')) {
751 RAISE_SYNTAX_ERROR(
752 "f-string: invalid conversion character: "
753 "expected 's', 'r', or 'a'");
754 goto error;
755 }
756
757 }
758
759 /* Check for the format spec, if present. */
760 if (*str >= end) {
761 goto unexpected_end_of_string;
762 }
763 if (**str == ':') {
764 *str += 1;
765 if (*str >= end) {
766 goto unexpected_end_of_string;
767 }
768
769 /* Parse the format spec. */
770 format_spec = fstring_parse(p, str, end, raw, recurse_lvl+1,
771 first_token, t, last_token);
772 if (!format_spec) {
773 goto error;
774 }
775 }
776
777 if (*str >= end || **str != '}') {
778 goto unexpected_end_of_string;
779 }
780
781 /* We're at a right brace. Consume it. */
782 assert(*str < end);
783 assert(**str == '}');
784 *str += 1;
785
786 /* If we're in = mode (detected by non-NULL expr_text), and have no format
787 spec and no explicit conversion, set the conversion to 'r'. */
788 if (*expr_text && format_spec == NULL && conversion == -1) {
789 conversion = 'r';
790 }
791
792 /* And now create the FormattedValue node that represents this
793 entire expression with the conversion and format spec. */
794 //TODO: Fix this
795 *expression = _PyAST_FormattedValue(simple_expression, conversion,
796 format_spec, first_token->lineno,
797 first_token->col_offset,
798 last_token->end_lineno,
799 last_token->end_col_offset, p->arena);
800 if (!*expression) {
801 goto error;
802 }
803
804 return 0;
805
806 unexpected_end_of_string:
807 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
808 /* Falls through to error. */
809
810 error:
811 Py_XDECREF(*expr_text);
812 return -1;
813
814 }
815
816 /* Return -1 on error.
817
818 Return 0 if we have a literal (possible zero length) and an
819 expression (zero length if at the end of the string.
820
821 Return 1 if we have a literal, but no expression, and we want the
822 caller to call us again. This is used to deal with doubled
823 braces.
824
825 When called multiple times on the string 'a{{b{0}c', this function
826 will return:
827
828 1. the literal 'a{' with no expression, and a return value
829 of 1. Despite the fact that there's no expression, the return
830 value of 1 means we're not finished yet.
831
832 2. the literal 'b' and the expression '0', with a return value of
833 0. The fact that there's an expression means we're not finished.
834
835 3. literal 'c' with no expression and a return value of 0. The
836 combination of the return value of 0 with no expression means
837 we're finished.
838 */
839 static int
fstring_find_literal_and_expr(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,PyObject ** literal,PyObject ** expr_text,expr_ty * expression,Token * first_token,Token * t,Token * last_token)840 fstring_find_literal_and_expr(Parser *p, const char **str, const char *end, int raw,
841 int recurse_lvl, PyObject **literal,
842 PyObject **expr_text, expr_ty *expression,
843 Token *first_token, Token *t, Token *last_token)
844 {
845 int result;
846
847 assert(*literal == NULL && *expression == NULL);
848
849 /* Get any literal string. */
850 result = fstring_find_literal(p, str, end, raw, literal, recurse_lvl, t);
851 if (result < 0) {
852 goto error;
853 }
854
855 assert(result == 0 || result == 1);
856
857 if (result == 1) {
858 /* We have a literal, but don't look at the expression. */
859 return 1;
860 }
861
862 if (*str >= end || **str == '}') {
863 /* We're at the end of the string or the end of a nested
864 f-string: no expression. The top-level error case where we
865 expect to be at the end of the string but we're at a '}' is
866 handled later. */
867 return 0;
868 }
869
870 /* We must now be the start of an expression, on a '{'. */
871 assert(**str == '{');
872
873 if (fstring_find_expr(p, str, end, raw, recurse_lvl, expr_text,
874 expression, first_token, t, last_token) < 0) {
875 goto error;
876 }
877
878 return 0;
879
880 error:
881 Py_CLEAR(*literal);
882 return -1;
883 }
884
885 #ifdef NDEBUG
886 #define ExprList_check_invariants(l)
887 #else
888 static void
ExprList_check_invariants(ExprList * l)889 ExprList_check_invariants(ExprList *l)
890 {
891 /* Check our invariants. Make sure this object is "live", and
892 hasn't been deallocated. */
893 assert(l->size >= 0);
894 assert(l->p != NULL);
895 if (l->size <= EXPRLIST_N_CACHED) {
896 assert(l->data == l->p);
897 }
898 }
899 #endif
900
901 static void
ExprList_Init(ExprList * l)902 ExprList_Init(ExprList *l)
903 {
904 l->allocated = EXPRLIST_N_CACHED;
905 l->size = 0;
906
907 /* Until we start allocating dynamically, p points to data. */
908 l->p = l->data;
909
910 ExprList_check_invariants(l);
911 }
912
913 static int
ExprList_Append(ExprList * l,expr_ty exp)914 ExprList_Append(ExprList *l, expr_ty exp)
915 {
916 ExprList_check_invariants(l);
917 if (l->size >= l->allocated) {
918 /* We need to alloc (or realloc) the memory. */
919 Py_ssize_t new_size = l->allocated * 2;
920
921 /* See if we've ever allocated anything dynamically. */
922 if (l->p == l->data) {
923 Py_ssize_t i;
924 /* We're still using the cached data. Switch to
925 alloc-ing. */
926 l->p = PyMem_Malloc(sizeof(expr_ty) * new_size);
927 if (!l->p) {
928 return -1;
929 }
930 /* Copy the cached data into the new buffer. */
931 for (i = 0; i < l->size; i++) {
932 l->p[i] = l->data[i];
933 }
934 } else {
935 /* Just realloc. */
936 expr_ty *tmp = PyMem_Realloc(l->p, sizeof(expr_ty) * new_size);
937 if (!tmp) {
938 PyMem_Free(l->p);
939 l->p = NULL;
940 return -1;
941 }
942 l->p = tmp;
943 }
944
945 l->allocated = new_size;
946 assert(l->allocated == 2 * l->size);
947 }
948
949 l->p[l->size++] = exp;
950
951 ExprList_check_invariants(l);
952 return 0;
953 }
954
955 static void
ExprList_Dealloc(ExprList * l)956 ExprList_Dealloc(ExprList *l)
957 {
958 ExprList_check_invariants(l);
959
960 /* If there's been an error, or we've never dynamically allocated,
961 do nothing. */
962 if (!l->p || l->p == l->data) {
963 /* Do nothing. */
964 } else {
965 /* We have dynamically allocated. Free the memory. */
966 PyMem_Free(l->p);
967 }
968 l->p = NULL;
969 l->size = -1;
970 }
971
972 static asdl_expr_seq *
ExprList_Finish(ExprList * l,PyArena * arena)973 ExprList_Finish(ExprList *l, PyArena *arena)
974 {
975 asdl_expr_seq *seq;
976
977 ExprList_check_invariants(l);
978
979 /* Allocate the asdl_seq and copy the expressions in to it. */
980 seq = _Py_asdl_expr_seq_new(l->size, arena);
981 if (seq) {
982 Py_ssize_t i;
983 for (i = 0; i < l->size; i++) {
984 asdl_seq_SET(seq, i, l->p[i]);
985 }
986 }
987 ExprList_Dealloc(l);
988 return seq;
989 }
990
991 #ifdef NDEBUG
992 #define FstringParser_check_invariants(state)
993 #else
994 static void
FstringParser_check_invariants(FstringParser * state)995 FstringParser_check_invariants(FstringParser *state)
996 {
997 if (state->last_str) {
998 assert(PyUnicode_CheckExact(state->last_str));
999 }
1000 ExprList_check_invariants(&state->expr_list);
1001 }
1002 #endif
1003
1004 void
_PyPegen_FstringParser_Init(FstringParser * state)1005 _PyPegen_FstringParser_Init(FstringParser *state)
1006 {
1007 state->last_str = NULL;
1008 state->fmode = 0;
1009 ExprList_Init(&state->expr_list);
1010 FstringParser_check_invariants(state);
1011 }
1012
1013 void
_PyPegen_FstringParser_Dealloc(FstringParser * state)1014 _PyPegen_FstringParser_Dealloc(FstringParser *state)
1015 {
1016 FstringParser_check_invariants(state);
1017
1018 Py_XDECREF(state->last_str);
1019 ExprList_Dealloc(&state->expr_list);
1020 }
1021
1022 /* Make a Constant node, but decref the PyUnicode object being added. */
1023 static expr_ty
make_str_node_and_del(Parser * p,PyObject ** str,Token * first_token,Token * last_token)1024 make_str_node_and_del(Parser *p, PyObject **str, Token* first_token, Token *last_token)
1025 {
1026 PyObject *s = *str;
1027 PyObject *kind = NULL;
1028 *str = NULL;
1029 assert(PyUnicode_CheckExact(s));
1030 if (_PyArena_AddPyObject(p->arena, s) < 0) {
1031 Py_DECREF(s);
1032 return NULL;
1033 }
1034 const char* the_str = PyBytes_AsString(first_token->bytes);
1035 if (the_str && the_str[0] == 'u') {
1036 kind = _PyPegen_new_identifier(p, "u");
1037 }
1038
1039 if (kind == NULL && PyErr_Occurred()) {
1040 return NULL;
1041 }
1042
1043 return _PyAST_Constant(s, kind, first_token->lineno, first_token->col_offset,
1044 last_token->end_lineno, last_token->end_col_offset,
1045 p->arena);
1046
1047 }
1048
1049
1050 /* Add a non-f-string (that is, a regular literal string). str is
1051 decref'd. */
1052 int
_PyPegen_FstringParser_ConcatAndDel(FstringParser * state,PyObject * str)1053 _PyPegen_FstringParser_ConcatAndDel(FstringParser *state, PyObject *str)
1054 {
1055 FstringParser_check_invariants(state);
1056
1057 assert(PyUnicode_CheckExact(str));
1058
1059 if (PyUnicode_GET_LENGTH(str) == 0) {
1060 Py_DECREF(str);
1061 return 0;
1062 }
1063
1064 if (!state->last_str) {
1065 /* We didn't have a string before, so just remember this one. */
1066 state->last_str = str;
1067 } else {
1068 /* Concatenate this with the previous string. */
1069 PyUnicode_AppendAndDel(&state->last_str, str);
1070 if (!state->last_str) {
1071 return -1;
1072 }
1073 }
1074 FstringParser_check_invariants(state);
1075 return 0;
1076 }
1077
1078 /* Parse an f-string. The f-string is in *str to end, with no
1079 'f' or quotes. */
1080 int
_PyPegen_FstringParser_ConcatFstring(Parser * p,FstringParser * state,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1081 _PyPegen_FstringParser_ConcatFstring(Parser *p, FstringParser *state, const char **str,
1082 const char *end, int raw, int recurse_lvl,
1083 Token *first_token, Token* t, Token *last_token)
1084 {
1085 FstringParser_check_invariants(state);
1086 state->fmode = 1;
1087
1088 /* Parse the f-string. */
1089 while (1) {
1090 PyObject *literal = NULL;
1091 PyObject *expr_text = NULL;
1092 expr_ty expression = NULL;
1093
1094 /* If there's a zero length literal in front of the
1095 expression, literal will be NULL. If we're at the end of
1096 the f-string, expression will be NULL (unless result == 1,
1097 see below). */
1098 int result = fstring_find_literal_and_expr(p, str, end, raw, recurse_lvl,
1099 &literal, &expr_text,
1100 &expression, first_token, t, last_token);
1101 if (result < 0) {
1102 return -1;
1103 }
1104
1105 /* Add the literal, if any. */
1106 if (literal && _PyPegen_FstringParser_ConcatAndDel(state, literal) < 0) {
1107 Py_XDECREF(expr_text);
1108 return -1;
1109 }
1110 /* Add the expr_text, if any. */
1111 if (expr_text && _PyPegen_FstringParser_ConcatAndDel(state, expr_text) < 0) {
1112 return -1;
1113 }
1114
1115 /* We've dealt with the literal and expr_text, their ownership has
1116 been transferred to the state object. Don't look at them again. */
1117
1118 /* See if we should just loop around to get the next literal
1119 and expression, while ignoring the expression this
1120 time. This is used for un-doubling braces, as an
1121 optimization. */
1122 if (result == 1) {
1123 continue;
1124 }
1125
1126 if (!expression) {
1127 /* We're done with this f-string. */
1128 break;
1129 }
1130
1131 /* We know we have an expression. Convert any existing string
1132 to a Constant node. */
1133 if (!state->last_str) {
1134 /* Do nothing. No previous literal. */
1135 } else {
1136 /* Convert the existing last_str literal to a Constant node. */
1137 expr_ty last_str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1138 if (!last_str || ExprList_Append(&state->expr_list, last_str) < 0) {
1139 return -1;
1140 }
1141 }
1142
1143 if (ExprList_Append(&state->expr_list, expression) < 0) {
1144 return -1;
1145 }
1146 }
1147
1148 /* If recurse_lvl is zero, then we must be at the end of the
1149 string. Otherwise, we must be at a right brace. */
1150
1151 if (recurse_lvl == 0 && *str < end-1) {
1152 RAISE_SYNTAX_ERROR("f-string: unexpected end of string");
1153 return -1;
1154 }
1155 if (recurse_lvl != 0 && **str != '}') {
1156 RAISE_SYNTAX_ERROR("f-string: expecting '}'");
1157 return -1;
1158 }
1159
1160 FstringParser_check_invariants(state);
1161 return 0;
1162 }
1163
1164 /* Convert the partial state reflected in last_str and expr_list to an
1165 expr_ty. The expr_ty can be a Constant, or a JoinedStr. */
1166 expr_ty
_PyPegen_FstringParser_Finish(Parser * p,FstringParser * state,Token * first_token,Token * last_token)1167 _PyPegen_FstringParser_Finish(Parser *p, FstringParser *state, Token* first_token,
1168 Token *last_token)
1169 {
1170 asdl_expr_seq *seq;
1171
1172 FstringParser_check_invariants(state);
1173
1174 /* If we're just a constant string with no expressions, return
1175 that. */
1176 if (!state->fmode) {
1177 assert(!state->expr_list.size);
1178 if (!state->last_str) {
1179 /* Create a zero length string. */
1180 state->last_str = PyUnicode_FromStringAndSize(NULL, 0);
1181 if (!state->last_str) {
1182 goto error;
1183 }
1184 }
1185 return make_str_node_and_del(p, &state->last_str, first_token, last_token);
1186 }
1187
1188 /* Create a Constant node out of last_str, if needed. It will be the
1189 last node in our expression list. */
1190 if (state->last_str) {
1191 expr_ty str = make_str_node_and_del(p, &state->last_str, first_token, last_token);
1192 if (!str || ExprList_Append(&state->expr_list, str) < 0) {
1193 goto error;
1194 }
1195 }
1196 /* This has already been freed. */
1197 assert(state->last_str == NULL);
1198
1199 seq = ExprList_Finish(&state->expr_list, p->arena);
1200 if (!seq) {
1201 goto error;
1202 }
1203
1204 return _PyAST_JoinedStr(seq, first_token->lineno, first_token->col_offset,
1205 last_token->end_lineno, last_token->end_col_offset,
1206 p->arena);
1207
1208 error:
1209 _PyPegen_FstringParser_Dealloc(state);
1210 return NULL;
1211 }
1212
1213 /* Given an f-string (with no 'f' or quotes) that's in *str and ends
1214 at end, parse it into an expr_ty. Return NULL on error. Adjust
1215 str to point past the parsed portion. */
1216 static expr_ty
fstring_parse(Parser * p,const char ** str,const char * end,int raw,int recurse_lvl,Token * first_token,Token * t,Token * last_token)1217 fstring_parse(Parser *p, const char **str, const char *end, int raw,
1218 int recurse_lvl, Token *first_token, Token* t, Token *last_token)
1219 {
1220 FstringParser state;
1221
1222 _PyPegen_FstringParser_Init(&state);
1223 if (_PyPegen_FstringParser_ConcatFstring(p, &state, str, end, raw, recurse_lvl,
1224 first_token, t, last_token) < 0) {
1225 _PyPegen_FstringParser_Dealloc(&state);
1226 return NULL;
1227 }
1228
1229 return _PyPegen_FstringParser_Finish(p, &state, t, t);
1230 }
1231