#include <Python.h>
#include "pycore_ast.h"           // _PyAST_Validate()
#include "pycore_pystate.h"       // _PyThreadState_GET()
#include "pycore_pyerrors.h"      // PyExc_IncompleteInputError
#include <errcode.h>

#include "lexer/lexer.h"
#include "tokenizer/tokenizer.h"
#include "pegen.h"

// Internal parser functions

asdl_stmt_seq*
_PyPegen_interactive_exit(Parser *p)
{
    if (p->errcode) {
        *(p->errcode) = E_EOF;
    }
    return NULL;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
{
    const char *data = PyUnicode_AsUTF8(line);
    if (!data) {
        return -1;
    }

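    // The loop below decodes UTF-8 lead bytes by hand instead of building a
    // temporary str object: the top bits of the first byte of a sequence
    // encode its total length (0xxxxxxx = 1 byte, 110xxxxx = 2, 1110xxxx = 3,
    // 11110xxx = 4), so we can step over whole code points and count them.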
    Py_ssize_t len = 0;
    while (col_offset < end_col_offset) {
        Py_UCS4 ch = data[col_offset];
        if (ch < 0x80) {
            col_offset += 1;
        } else if ((ch & 0xe0) == 0xc0) {
            col_offset += 2;
        } else if ((ch & 0xf0) == 0xe0) {
            col_offset += 3;
        } else if ((ch & 0xf8) == 0xf0) {
            col_offset += 4;
        } else {
            PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
            return -1;
        }
        len++;
    }
    return len;
}

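// Informal example: for the UTF-8 string "a\xC3\xBCb" ("aüb"), a byte offset
// of 3 maps to a character offset of 2, since "ü" occupies two bytes but
// counts as a single character.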
Py_ssize_t
_PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
{
    Py_ssize_t len = strlen(str);
    if (col_offset > len + 1) {
        col_offset = len + 1;
    }
    assert(col_offset >= 0);
    PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
    if (!text) {
        return -1;
    }
    Py_ssize_t size = PyUnicode_GET_LENGTH(text);
    Py_DECREF(text);
    return size;
}

Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
{
    const char *str = PyUnicode_AsUTF8(line);
    if (!str) {
        return -1;
    }
    return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
}

// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
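// Each token owns a singly linked list of Memo entries (rule type -> result
// node, plus the mark where parsing stopped); new entries are pushed onto the
// front, so the most recently memoized result for a rule is found first.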
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    m->mark = p->mark;
    m->next = p->tokens[mark]->memo;
    p->tokens[mark]->memo = m;
    return 0;
}

// Like _PyPegen_insert_memo(), but updates an existing node if found.
int
_PyPegen_update_memo(Parser *p, int mark, int type, void *node)
{
    for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
        if (m->type == type) {
            // Update existing node.
            m->node = node;
            m->mark = p->mark;
            return 0;
        }
    }
    // Insert new node.
    return _PyPegen_insert_memo(p, mark, type, node);
}

static int
init_normalization(Parser *p)
{
    if (p->normalize) {
        return 1;
    }
    p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
    if (!p->normalize)
    {
        return 0;
    }
    return 1;
}

static int
growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
    assert(initial_size > 0);
    arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
    arr->size = initial_size;
    arr->num_items = 0;

    return arr->items != NULL;
}

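// growable_comment_array_add() grows the backing array geometrically
// (doubling its capacity), so a sequence of n appends costs O(n) amortized
// rather than O(n^2).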
static int
growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
    if (arr->num_items >= arr->size) {
        size_t new_size = arr->size * 2;
        void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
        if (!new_items_array) {
            return 0;
        }
        arr->items = new_items_array;
        arr->size = new_size;
    }

    arr->items[arr->num_items].lineno = lineno;
    arr->items[arr->num_items].comment = comment;  // Take ownership
    arr->num_items++;
    return 1;
}

static void
growable_comment_array_deallocate(growable_comment_array *arr) {
    for (unsigned i = 0; i < arr->num_items; i++) {
        PyMem_Free(arr->items[i].comment);
    }
    PyMem_Free(arr->items);
}

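// Keyword lookup: p->keywords is indexed by keyword length, so only
// candidates of exactly the right length are compared; each per-length list
// is terminated by a sentinel entry with type == -1.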
static int
_get_keyword_or_name_type(Parser *p, struct token *new_token)
{
    int name_len = new_token->end_col_offset - new_token->col_offset;
    assert(name_len > 0);

    if (name_len >= p->n_keyword_lists ||
        p->keywords[name_len] == NULL ||
        p->keywords[name_len]->type == -1) {
        return NAME;
    }
    for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
        if (strncmp(k->str, new_token->start, name_len) == 0) {
            return k->type;
        }
    }
    return NAME;
}

static int
initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
    assert(parser_token != NULL);

    parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
    parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
    if (parser_token->bytes == NULL) {
        return -1;
    }
    if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
        Py_DECREF(parser_token->bytes);
        return -1;
    }

    parser_token->metadata = NULL;
    if (new_token->metadata != NULL) {
        if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
            Py_DECREF(new_token->metadata);
            return -1;
        }
        parser_token->metadata = new_token->metadata;
        new_token->metadata = NULL;
    }

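    // Tokens that fall on the parser's starting line are shifted by the
    // starting column offset; this compensates for source that does not
    // begin at column 0 of its enclosing input.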
    parser_token->level = new_token->level;
    parser_token->lineno = new_token->lineno;
    parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
                                                                    : new_token->col_offset;
    parser_token->end_lineno = new_token->end_lineno;
    parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
                                                                        : new_token->end_col_offset;

    p->fill += 1;

    if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
        return _Pypegen_raise_decode_error(p);
    }

    return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
}

static int
_resize_tokens_array(Parser *p) {
    int newsize = p->size * 2;
    Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
    if (new_tokens == NULL) {
        PyErr_NoMemory();
        return -1;
    }
    p->tokens = new_tokens;

    for (int i = p->size; i < newsize; i++) {
        p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
        if (p->tokens[i] == NULL) {
            p->size = i;  // Needed, in order to cleanup correctly after parser fails
            PyErr_NoMemory();
            return -1;
        }
    }
    p->size = newsize;
    return 0;
}

int
_PyPegen_fill_token(Parser *p)
{
    struct token new_token;
    _PyToken_Init(&new_token);
    int type = _PyTokenizer_Get(p->tok, &new_token);

    // Record and skip '# type: ignore' comments
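    // (The tokenizer only emits TYPE_IGNORE tokens when it runs with
    // tok->type_comments set, i.e. when parsing with PyCF_TYPE_COMMENTS.)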
    while (type == TYPE_IGNORE) {
        Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
        char *tag = PyMem_Malloc(len + 1);
        if (tag == NULL) {
            PyErr_NoMemory();
            goto error;
        }
        strncpy(tag, new_token.start, len);
        tag[len] = '\0';
        // Ownership of tag passes to the growable array
        if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
            PyErr_NoMemory();
            goto error;
        }
        type = _PyTokenizer_Get(p->tok, &new_token);
    }

    // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
    if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
        type = NEWLINE; /* Add an extra newline */
        p->parsing_started = 0;

        if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
            p->tok->pendin = -p->tok->indent;
            p->tok->indent = 0;
        }
    }
    else {
        p->parsing_started = 1;
    }

    // Check if we are at the limit of the token array capacity and resize if needed
    if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
        goto error;
    }

    Token *t = p->tokens[p->fill];
    return initialize_token(p, t, &new_token, type);
error:
    _PyToken_Free(&new_token);
    return -1;
}

#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
// indexed by type.

#define NSTATISTICS _PYPEGEN_NSTATISTICS
#define memo_statistics _PyRuntime.parser.memo_statistics

#ifdef Py_GIL_DISABLED
#define MUTEX_LOCK() PyMutex_Lock(&_PyRuntime.parser.mutex)
#define MUTEX_UNLOCK() PyMutex_Unlock(&_PyRuntime.parser.mutex)
#else
#define MUTEX_LOCK()
#define MUTEX_UNLOCK()
#endif
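// Under the free-threaded (Py_GIL_DISABLED) build the statistics array is
// shared across threads, so it is guarded by a runtime mutex; with the GIL,
// MUTEX_LOCK/MUTEX_UNLOCK expand to nothing.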

void
_PyPegen_clear_memo_statistics(void)
{
    MUTEX_LOCK();
    for (int i = 0; i < NSTATISTICS; i++) {
        memo_statistics[i] = 0;
    }
    MUTEX_UNLOCK();
}

PyObject *
_PyPegen_get_memo_statistics(void)
{
    PyObject *ret = PyList_New(NSTATISTICS);
    if (ret == NULL) {
        return NULL;
    }

    MUTEX_LOCK();
    for (int i = 0; i < NSTATISTICS; i++) {
        PyObject *value = PyLong_FromLong(memo_statistics[i]);
        if (value == NULL) {
            MUTEX_UNLOCK();
            Py_DECREF(ret);
            return NULL;
        }
        // PyList_SetItem steals the reference to value (even on failure).
        if (PyList_SetItem(ret, i, value) < 0) {
            MUTEX_UNLOCK();
            Py_DECREF(ret);
            return NULL;
        }
    }
    MUTEX_UNLOCK();
    return ret;
}
#endif

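// Returns 1 and sets *pres on a memo hit (advancing p->mark to the stored
// end position), 0 on a miss, and -1 if refilling the token buffer failed.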
int // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }

    Token *t = p->tokens[p->mark];

    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                MUTEX_LOCK();
                memo_statistics[type] += count;
                MUTEX_UNLOCK();
            }
#endif
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}

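// The lookahead helpers below run a sub-rule and then restore p->mark, so
// they test what follows without consuming it. In the generated grammar
// code, positive == 1 corresponds to a PEG positive lookahead (&rule) and
// positive == 0 to a negative lookahead (!rule).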
int
_PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
{
    int mark = p->mark;
    void *res = func(p, arg);
    p->mark = mark;
    return (res != NULL) == positive;
}

int
_PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
{
    int mark = p->mark;
    void *res = (void*)func(p);
    p->mark = mark;
    return (res != NULL) == positive;
}

Token *
_PyPegen_expect_token(Parser *p, int type)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        return NULL;
    }
    p->mark += 1;
    return t;
}

void*
_PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }
    if (result == NULL) {
        RAISE_SYNTAX_ERROR("expected (%s)", expected);
        return NULL;
    }
    return result;
}

Token *
_PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {

    if (p->error_indicator == 1) {
        return NULL;
    }

    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != type) {
        RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
        return NULL;
    }
    p->mark += 1;
    return t;
}

expr_ty
_PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
{
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return NULL;
        }
    }
    Token *t = p->tokens[p->mark];
    if (t->type != NAME) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    if (strcmp(s, keyword) != 0) {
        return NULL;
    }
    return _PyPegen_name_token(p);
}

Token *
_PyPegen_get_last_nonnwhitespace_token(Parser *p)
{
    assert(p->mark >= 0);
    Token *token = NULL;
    for (int m = p->mark - 1; m >= 0; m--) {
        token = p->tokens[m];
        if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
            break;
        }
    }
    return token;
}

PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
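    /* (PEP 3131: non-ASCII identifiers compare equal after NFKC
       normalization; for example "ſ" (LATIN SMALL LETTER LONG S)
       normalizes to "s".) */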
    if (!PyUnicode_IS_ASCII(id))
    {
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        PyObject *id2 = PyObject_Vectorcall(p->normalize, args, 2, NULL);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }

        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    PyInterpreterState *interp = _PyInterpreterState_GET();
    _PyUnicode_InternImmortal(interp, &id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}


static expr_ty
_PyPegen_name_from_token(Parser *p, Token* t)
{
    if (t == NULL) {
        return NULL;
    }
    const char *s = PyBytes_AsString(t->bytes);
    if (!s) {
        p->error_indicator = 1;
        return NULL;
    }
    PyObject *id = _PyPegen_new_identifier(p, s);
    if (id == NULL) {
        p->error_indicator = 1;
        return NULL;
    }
    return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
                       t->end_col_offset, p->arena);
}

expr_ty
_PyPegen_name_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NAME);
    return _PyPegen_name_from_token(p, t);
}

void *
_PyPegen_string_token(Parser *p)
{
    return _PyPegen_expect_token(p, STRING);
}

expr_ty _PyPegen_soft_keyword_token(Parser *p) {
    Token *t = _PyPegen_expect_token(p, NAME);
    if (t == NULL) {
        return NULL;
    }
    char *the_token;
    Py_ssize_t size;
    PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
    for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
        if (strncmp(*keyword, the_token, size) == 0) {
            return _PyPegen_name_from_token(p, t);
        }
    }
    return NULL;
}

static PyObject *
parsenumber_raw(const char *s)
{
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        if (errno != 0) {
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}

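// parsenumber() handles PEP 515 underscore separators by copying the literal
// without underscores before numeric conversion; e.g. the token "1_000_000"
// is parsed as "1000000". The tokenizer has already validated underscore
// placement, so a plain strip is sufficient here.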
static PyObject *
parsenumber(const char *s)
{
    char *dup;
    char *end;
    PyObject *res = NULL;

    assert(s != NULL);

    if (strchr(s, '_') == NULL) {
        return parsenumber_raw(s);
    }
    /* Create a duplicate without underscores. */
    dup = PyMem_Malloc(strlen(s) + 1);
    if (dup == NULL) {
        return PyErr_NoMemory();
    }
    end = dup;
    for (; *s; s++) {
        if (*s != '_') {
            *end++ = *s;
        }
    }
    *end = '\0';
    res = parsenumber_raw(dup);
    PyMem_Free(dup);
    return res;
}

expr_ty
_PyPegen_number_token(Parser *p)
{
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }

    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}

/* Check that the source for a single input statement really is a single
   statement by looking at what is left in the buffer after parsing.
   Trailing whitespace and comments are OK. */
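/* For example, after parsing "x = 1" in single-input mode, a leftover
   "# comment\n" is accepted, while a second statement such as "y = 2"
   makes this return true and triggers the "multiple statements" error. */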
static int // bool
bad_single_statement(Parser *p)
{
    char *cur = p->tok->cur;
    char c = *cur;

    for (;;) {
        while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
            c = *++cur;
        }

        if (!c) {
            return 0;
        }

        if (c != '#') {
            return 1;
        }

        /* Suck up comment. */
        while (c && c != '\n') {
            c = *++cur;
        }
    }
}

static int
compute_parser_flags(PyCompilerFlags *flags)
{
    int parser_flags = 0;
    if (!flags) {
        return 0;
    }
    if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
        parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
    }
    if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
        parser_flags |= PyPARSE_IGNORE_COOKIE;
    }
    if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
        parser_flags |= PyPARSE_BARRY_AS_BDFL;
    }
    if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
        parser_flags |= PyPARSE_TYPE_COMMENTS;
    }
    if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
        parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
    }
    return parser_flags;
}

// Parser API

Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }

    p->mark = 0;
    p->fill = 0;
    p->size = 1;

    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;

    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
#ifdef Py_DEBUG
    p->debug = _Py_GetConfig()->parser_debug;
#endif
    return p;
}

void
_PyPegen_Parser_Free(Parser *p)
{
    Py_XDECREF(p->normalize);
    for (int i = 0; i < p->size; i++) {
        PyMem_Free(p->tokens[i]);
    }
    PyMem_Free(p->tokens);
    growable_comment_array_deallocate(&p->type_ignore_comments);
    PyMem_Free(p);
}

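// Before the second (error-reporting) pass the memo lists must be dropped:
// results memoized during the first pass would otherwise short-circuit the
// extra invalid_* rules that this pass enables. The Memo structs live in the
// arena, so clearing the list heads is enough; nothing leaks here.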
static void
reset_parser_state_for_error_pass(Parser *p)
{
    for (int i = 0; i < p->fill; i++) {
        p->tokens[i]->memo = NULL;
    }
    p->mark = 0;
    p->call_invalid_rules = 1;
    // Don't try to get extra tokens in interactive mode when trying to
    // raise specialized errors in the second pass.
    p->tok->interactive_underflow = IUNDERFLOW_STOP;
}

static inline int
_is_end_of_source(Parser *p) {
    int err = p->tok->done;
    return err == E_EOF || err == E_EOFS || err == E_EOLS;
}

void *
_PyPegen_run_parser(Parser *p)
{
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) && _is_end_of_source(p)) {
            PyErr_Clear();
            return _PyPegen_raise_error(p, PyExc_IncompleteInputError, 0, "incomplete input");
        }
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
        // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
        return NULL;
    }

    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}

mod_ty
_PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
                                      const char *enc, const char *ps1, const char *ps2,
                                      PyCompilerFlags *flags, int *errcode,
                                      PyObject **interactive_src, PyArena *arena)
{
    struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
            return NULL;
        }
        return NULL;
    }
    if (!tok->fp || ps1 != NULL || ps2 != NULL ||
        PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
        tok->fp_interactive = 1;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // From here on we need to clean up even if there's an error
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
                                    errcode, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

    if (tok->fp_interactive && tok->interactive_src_start && result && interactive_src != NULL) {
        *interactive_src = PyUnicode_FromString(tok->interactive_src_start);
        if (!*interactive_src || _PyArena_AddPyObject(arena, *interactive_src) < 0) {
            Py_XDECREF(*interactive_src);
            result = NULL;
            goto error;
        }
    }

error:
    _PyTokenizer_Free(tok);
    return result;
}

mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                                PyCompilerFlags *flags, PyArena *arena)
{
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input, 0);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // We need to clean up from here on
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}