• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <Python.h>
2 #include "pycore_ast.h"           // _PyAST_Validate(),
3 #include "pycore_pystate.h"       // _PyThreadState_GET()
4 #include "pycore_pyerrors.h"      // PyExc_IncompleteInputError
5 #include <errcode.h>
6 
7 #include "lexer/lexer.h"
8 #include "tokenizer/tokenizer.h"
9 #include "pegen.h"
10 
11 // Internal parser functions
12 
13 asdl_stmt_seq*
_PyPegen_interactive_exit(Parser * p)14 _PyPegen_interactive_exit(Parser *p)
15 {
16     if (p->errcode) {
17         *(p->errcode) = E_EOF;
18     }
19     return NULL;
20 }
21 
22 Py_ssize_t
_PyPegen_byte_offset_to_character_offset_line(PyObject * line,Py_ssize_t col_offset,Py_ssize_t end_col_offset)23 _PyPegen_byte_offset_to_character_offset_line(PyObject *line, Py_ssize_t col_offset, Py_ssize_t end_col_offset)
24 {
25     const char *data = PyUnicode_AsUTF8(line);
26 
27     Py_ssize_t len = 0;
28     while (col_offset < end_col_offset) {
29         Py_UCS4 ch = data[col_offset];
30         if (ch < 0x80) {
31             col_offset += 1;
32         } else if ((ch & 0xe0) == 0xc0) {
33             col_offset += 2;
34         } else if ((ch & 0xf0) == 0xe0) {
35             col_offset += 3;
36         } else if ((ch & 0xf8) == 0xf0) {
37             col_offset += 4;
38         } else {
39             PyErr_SetString(PyExc_ValueError, "Invalid UTF-8 sequence");
40             return -1;
41         }
42         len++;
43     }
44     return len;
45 }
46 
47 Py_ssize_t
_PyPegen_byte_offset_to_character_offset_raw(const char * str,Py_ssize_t col_offset)48 _PyPegen_byte_offset_to_character_offset_raw(const char* str, Py_ssize_t col_offset)
49 {
50     Py_ssize_t len = strlen(str);
51     if (col_offset > len + 1) {
52         col_offset = len + 1;
53     }
54     assert(col_offset >= 0);
55     PyObject *text = PyUnicode_DecodeUTF8(str, col_offset, "replace");
56     if (!text) {
57         return -1;
58     }
59     Py_ssize_t size = PyUnicode_GET_LENGTH(text);
60     Py_DECREF(text);
61     return size;
62 }
63 
64 Py_ssize_t
_PyPegen_byte_offset_to_character_offset(PyObject * line,Py_ssize_t col_offset)65 _PyPegen_byte_offset_to_character_offset(PyObject *line, Py_ssize_t col_offset)
66 {
67     const char *str = PyUnicode_AsUTF8(line);
68     if (!str) {
69         return -1;
70     }
71     return _PyPegen_byte_offset_to_character_offset_raw(str, col_offset);
72 }
73 
// Here, mark is the start of the node, while p->mark is the end.
// If node==NULL, they should be the same.
int
_PyPegen_insert_memo(Parser *p, int mark, int type, void *node)
{
    // Insert in front
    // Memos live in the arena, so they are never freed individually.
    Memo *m = _PyArena_Malloc(p->arena, sizeof(Memo));
    if (m == NULL) {
        return -1;
    }
    m->type = type;
    m->node = node;
    // Record where parsing of `node` ended so a later hit can fast-forward.
    m->mark = p->mark;
    // Prepend to the memo list of the token where the node started.
    m->next = p->tokens[mark]->memo;
    p->tokens[mark]->memo = m;
    return 0;
}
91 
92 // Like _PyPegen_insert_memo(), but updates an existing node if found.
93 int
_PyPegen_update_memo(Parser * p,int mark,int type,void * node)94 _PyPegen_update_memo(Parser *p, int mark, int type, void *node)
95 {
96     for (Memo *m = p->tokens[mark]->memo; m != NULL; m = m->next) {
97         if (m->type == type) {
98             // Update existing node.
99             m->node = node;
100             m->mark = p->mark;
101             return 0;
102         }
103     }
104     // Insert new node.
105     return _PyPegen_insert_memo(p, mark, type, node);
106 }
107 
108 static int
init_normalization(Parser * p)109 init_normalization(Parser *p)
110 {
111     if (p->normalize) {
112         return 1;
113     }
114     p->normalize = _PyImport_GetModuleAttrString("unicodedata", "normalize");
115     if (!p->normalize)
116     {
117         return 0;
118     }
119     return 1;
120 }
121 
122 static int
growable_comment_array_init(growable_comment_array * arr,size_t initial_size)123 growable_comment_array_init(growable_comment_array *arr, size_t initial_size) {
124     assert(initial_size > 0);
125     arr->items = PyMem_Malloc(initial_size * sizeof(*arr->items));
126     arr->size = initial_size;
127     arr->num_items = 0;
128 
129     return arr->items != NULL;
130 }
131 
132 static int
growable_comment_array_add(growable_comment_array * arr,int lineno,char * comment)133 growable_comment_array_add(growable_comment_array *arr, int lineno, char *comment) {
134     if (arr->num_items >= arr->size) {
135         size_t new_size = arr->size * 2;
136         void *new_items_array = PyMem_Realloc(arr->items, new_size * sizeof(*arr->items));
137         if (!new_items_array) {
138             return 0;
139         }
140         arr->items = new_items_array;
141         arr->size = new_size;
142     }
143 
144     arr->items[arr->num_items].lineno = lineno;
145     arr->items[arr->num_items].comment = comment;  // Take ownership
146     arr->num_items++;
147     return 1;
148 }
149 
150 static void
growable_comment_array_deallocate(growable_comment_array * arr)151 growable_comment_array_deallocate(growable_comment_array *arr) {
152     for (unsigned i = 0; i < arr->num_items; i++) {
153         PyMem_Free(arr->items[i].comment);
154     }
155     PyMem_Free(arr->items);
156 }
157 
158 static int
_get_keyword_or_name_type(Parser * p,struct token * new_token)159 _get_keyword_or_name_type(Parser *p, struct token *new_token)
160 {
161     int name_len = new_token->end_col_offset - new_token->col_offset;
162     assert(name_len > 0);
163 
164     if (name_len >= p->n_keyword_lists ||
165         p->keywords[name_len] == NULL ||
166         p->keywords[name_len]->type == -1) {
167         return NAME;
168     }
169     for (KeywordToken *k = p->keywords[name_len]; k != NULL && k->type != -1; k++) {
170         if (strncmp(k->str, new_token->start, name_len) == 0) {
171             return k->type;
172         }
173     }
174     return NAME;
175 }
176 
177 static int
initialize_token(Parser * p,Token * parser_token,struct token * new_token,int token_type)178 initialize_token(Parser *p, Token *parser_token, struct token *new_token, int token_type) {
179     assert(parser_token != NULL);
180 
181     parser_token->type = (token_type == NAME) ? _get_keyword_or_name_type(p, new_token) : token_type;
182     parser_token->bytes = PyBytes_FromStringAndSize(new_token->start, new_token->end - new_token->start);
183     if (parser_token->bytes == NULL) {
184         return -1;
185     }
186     if (_PyArena_AddPyObject(p->arena, parser_token->bytes) < 0) {
187         Py_DECREF(parser_token->bytes);
188         return -1;
189     }
190 
191     parser_token->metadata = NULL;
192     if (new_token->metadata != NULL) {
193         if (_PyArena_AddPyObject(p->arena, new_token->metadata) < 0) {
194             Py_DECREF(parser_token->metadata);
195             return -1;
196         }
197         parser_token->metadata = new_token->metadata;
198         new_token->metadata = NULL;
199     }
200 
201     parser_token->level = new_token->level;
202     parser_token->lineno = new_token->lineno;
203     parser_token->col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->col_offset
204                                                                     : new_token->col_offset;
205     parser_token->end_lineno = new_token->end_lineno;
206     parser_token->end_col_offset = p->tok->lineno == p->starting_lineno ? p->starting_col_offset + new_token->end_col_offset
207                                                                  : new_token->end_col_offset;
208 
209     p->fill += 1;
210 
211     if (token_type == ERRORTOKEN && p->tok->done == E_DECODE) {
212         return _Pypegen_raise_decode_error(p);
213     }
214 
215     return (token_type == ERRORTOKEN ? _Pypegen_tokenizer_error(p) : 0);
216 }
217 
218 static int
_resize_tokens_array(Parser * p)219 _resize_tokens_array(Parser *p) {
220     int newsize = p->size * 2;
221     Token **new_tokens = PyMem_Realloc(p->tokens, newsize * sizeof(Token *));
222     if (new_tokens == NULL) {
223         PyErr_NoMemory();
224         return -1;
225     }
226     p->tokens = new_tokens;
227 
228     for (int i = p->size; i < newsize; i++) {
229         p->tokens[i] = PyMem_Calloc(1, sizeof(Token));
230         if (p->tokens[i] == NULL) {
231             p->size = i; // Needed, in order to cleanup correctly after parser fails
232             PyErr_NoMemory();
233             return -1;
234         }
235     }
236     p->size = newsize;
237     return 0;
238 }
239 
240 int
_PyPegen_fill_token(Parser * p)241 _PyPegen_fill_token(Parser *p)
242 {
243     struct token new_token;
244     _PyToken_Init(&new_token);
245     int type = _PyTokenizer_Get(p->tok, &new_token);
246 
247     // Record and skip '# type: ignore' comments
248     while (type == TYPE_IGNORE) {
249         Py_ssize_t len = new_token.end_col_offset - new_token.col_offset;
250         char *tag = PyMem_Malloc(len + 1);
251         if (tag == NULL) {
252             PyErr_NoMemory();
253             goto error;
254         }
255         strncpy(tag, new_token.start, len);
256         tag[len] = '\0';
257         // Ownership of tag passes to the growable array
258         if (!growable_comment_array_add(&p->type_ignore_comments, p->tok->lineno, tag)) {
259             PyErr_NoMemory();
260             goto error;
261         }
262         type = _PyTokenizer_Get(p->tok, &new_token);
263     }
264 
265     // If we have reached the end and we are in single input mode we need to insert a newline and reset the parsing
266     if (p->start_rule == Py_single_input && type == ENDMARKER && p->parsing_started) {
267         type = NEWLINE; /* Add an extra newline */
268         p->parsing_started = 0;
269 
270         if (p->tok->indent && !(p->flags & PyPARSE_DONT_IMPLY_DEDENT)) {
271             p->tok->pendin = -p->tok->indent;
272             p->tok->indent = 0;
273         }
274     }
275     else {
276         p->parsing_started = 1;
277     }
278 
279     // Check if we are at the limit of the token array capacity and resize if needed
280     if ((p->fill == p->size) && (_resize_tokens_array(p) != 0)) {
281         goto error;
282     }
283 
284     Token *t = p->tokens[p->fill];
285     return initialize_token(p, t, &new_token, type);
286 error:
287     _PyToken_Free(&new_token);
288     return -1;
289 }
290 
#if defined(Py_DEBUG)
// Instrumentation to count the effectiveness of memoization.
// The array counts the number of tokens skipped by memoization,
// indexed by type.

#define NSTATISTICS _PYPEGEN_NSTATISTICS
#define memo_statistics _PyRuntime.parser.memo_statistics

// On free-threaded builds the statistics live in runtime-global state
// shared across threads, so updates are guarded with the runtime's parser
// mutex; on GIL builds the macros expand to nothing.
#ifdef Py_GIL_DISABLED
#define MUTEX_LOCK() PyMutex_Lock(&_PyRuntime.parser.mutex)
#define MUTEX_UNLOCK() PyMutex_Unlock(&_PyRuntime.parser.mutex)
#else
#define MUTEX_LOCK()
#define MUTEX_UNLOCK()
#endif
306 
307 void
_PyPegen_clear_memo_statistics(void)308 _PyPegen_clear_memo_statistics(void)
309 {
310     MUTEX_LOCK();
311     for (int i = 0; i < NSTATISTICS; i++) {
312         memo_statistics[i] = 0;
313     }
314     MUTEX_UNLOCK();
315 }
316 
317 PyObject *
_PyPegen_get_memo_statistics(void)318 _PyPegen_get_memo_statistics(void)
319 {
320     PyObject *ret = PyList_New(NSTATISTICS);
321     if (ret == NULL) {
322         return NULL;
323     }
324 
325     MUTEX_LOCK();
326     for (int i = 0; i < NSTATISTICS; i++) {
327         PyObject *value = PyLong_FromLong(memo_statistics[i]);
328         if (value == NULL) {
329             MUTEX_UNLOCK();
330             Py_DECREF(ret);
331             return NULL;
332         }
333         // PyList_SetItem borrows a reference to value.
334         if (PyList_SetItem(ret, i, value) < 0) {
335             MUTEX_UNLOCK();
336             Py_DECREF(ret);
337             return NULL;
338         }
339     }
340     MUTEX_UNLOCK();
341     return ret;
342 }
343 #endif
344 
int  // bool
_PyPegen_is_memoized(Parser *p, int type, void *pres)
{
    // Make sure the token at the current position exists so we can read
    // its memo list.
    if (p->mark == p->fill) {
        if (_PyPegen_fill_token(p) < 0) {
            p->error_indicator = 1;
            return -1;
        }
    }

    Token *t = p->tokens[p->mark];

    for (Memo *m = t->memo; m != NULL; m = m->next) {
        if (m->type == type) {
#if defined(Py_DEBUG)
            if (0 <= type && type < NSTATISTICS) {
                // Tokens skipped thanks to this memo hit.
                long count = m->mark - p->mark;
                // A memoized negative result counts for one.
                if (count <= 0) {
                    count = 1;
                }
                MUTEX_LOCK();
                memo_statistics[type] += count;
                MUTEX_UNLOCK();
            }
#endif
            // Fast-forward to where the memoized parse ended and hand the
            // cached node (may be NULL for a memoized failure) to the caller.
            p->mark = m->mark;
            *(void **)(pres) = m->node;
            return 1;
        }
    }
    return 0;
}
378 
379 int
_PyPegen_lookahead_with_name(int positive,expr_ty (func)(Parser *),Parser * p)380 _PyPegen_lookahead_with_name(int positive, expr_ty (func)(Parser *), Parser *p)
381 {
382     int mark = p->mark;
383     void *res = func(p);
384     p->mark = mark;
385     return (res != NULL) == positive;
386 }
387 
388 int
_PyPegen_lookahead_with_string(int positive,expr_ty (func)(Parser *,const char *),Parser * p,const char * arg)389 _PyPegen_lookahead_with_string(int positive, expr_ty (func)(Parser *, const char*), Parser *p, const char* arg)
390 {
391     int mark = p->mark;
392     void *res = func(p, arg);
393     p->mark = mark;
394     return (res != NULL) == positive;
395 }
396 
397 int
_PyPegen_lookahead_with_int(int positive,Token * (func)(Parser *,int),Parser * p,int arg)398 _PyPegen_lookahead_with_int(int positive, Token *(func)(Parser *, int), Parser *p, int arg)
399 {
400     int mark = p->mark;
401     void *res = func(p, arg);
402     p->mark = mark;
403     return (res != NULL) == positive;
404 }
405 
406 int
_PyPegen_lookahead(int positive,void * (func)(Parser *),Parser * p)407 _PyPegen_lookahead(int positive, void *(func)(Parser *), Parser *p)
408 {
409     int mark = p->mark;
410     void *res = (void*)func(p);
411     p->mark = mark;
412     return (res != NULL) == positive;
413 }
414 
415 Token *
_PyPegen_expect_token(Parser * p,int type)416 _PyPegen_expect_token(Parser *p, int type)
417 {
418     if (p->mark == p->fill) {
419         if (_PyPegen_fill_token(p) < 0) {
420             p->error_indicator = 1;
421             return NULL;
422         }
423     }
424     Token *t = p->tokens[p->mark];
425     if (t->type != type) {
426        return NULL;
427     }
428     p->mark += 1;
429     return t;
430 }
431 
432 void*
_PyPegen_expect_forced_result(Parser * p,void * result,const char * expected)433 _PyPegen_expect_forced_result(Parser *p, void* result, const char* expected) {
434 
435     if (p->error_indicator == 1) {
436         return NULL;
437     }
438     if (result == NULL) {
439         RAISE_SYNTAX_ERROR("expected (%s)", expected);
440         return NULL;
441     }
442     return result;
443 }
444 
445 Token *
_PyPegen_expect_forced_token(Parser * p,int type,const char * expected)446 _PyPegen_expect_forced_token(Parser *p, int type, const char* expected) {
447 
448     if (p->error_indicator == 1) {
449         return NULL;
450     }
451 
452     if (p->mark == p->fill) {
453         if (_PyPegen_fill_token(p) < 0) {
454             p->error_indicator = 1;
455             return NULL;
456         }
457     }
458     Token *t = p->tokens[p->mark];
459     if (t->type != type) {
460         RAISE_SYNTAX_ERROR_KNOWN_LOCATION(t, "expected '%s'", expected);
461         return NULL;
462     }
463     p->mark += 1;
464     return t;
465 }
466 
467 expr_ty
_PyPegen_expect_soft_keyword(Parser * p,const char * keyword)468 _PyPegen_expect_soft_keyword(Parser *p, const char *keyword)
469 {
470     if (p->mark == p->fill) {
471         if (_PyPegen_fill_token(p) < 0) {
472             p->error_indicator = 1;
473             return NULL;
474         }
475     }
476     Token *t = p->tokens[p->mark];
477     if (t->type != NAME) {
478         return NULL;
479     }
480     const char *s = PyBytes_AsString(t->bytes);
481     if (!s) {
482         p->error_indicator = 1;
483         return NULL;
484     }
485     if (strcmp(s, keyword) != 0) {
486         return NULL;
487     }
488     return _PyPegen_name_token(p);
489 }
490 
491 Token *
_PyPegen_get_last_nonnwhitespace_token(Parser * p)492 _PyPegen_get_last_nonnwhitespace_token(Parser *p)
493 {
494     assert(p->mark >= 0);
495     Token *token = NULL;
496     for (int m = p->mark - 1; m >= 0; m--) {
497         token = p->tokens[m];
498         if (token->type != ENDMARKER && (token->type < NEWLINE || token->type > DEDENT)) {
499             break;
500         }
501     }
502     return token;
503 }
504 
PyObject *
_PyPegen_new_identifier(Parser *p, const char *n)
{
    // Build an interned identifier string from UTF-8 bytes, applying NFKC
    // normalization to non-ASCII names (per the language reference).
    // The result is owned by the arena; on failure returns NULL and sets
    // p->error_indicator.
    PyObject *id = PyUnicode_DecodeUTF8(n, strlen(n), NULL);
    if (!id) {
        goto error;
    }
    /* PyUnicode_DecodeUTF8 should always return a ready string. */
    assert(PyUnicode_IS_READY(id));
    /* Check whether there are non-ASCII characters in the
       identifier; if so, normalize to NFKC. */
    if (!PyUnicode_IS_ASCII(id))
    {
        // Lazily loads unicodedata.normalize into p->normalize.
        if (!init_normalization(p))
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *form = PyUnicode_InternFromString("NFKC");
        if (form == NULL)
        {
            Py_DECREF(id);
            goto error;
        }
        PyObject *args[2] = {form, id};
        PyObject *id2 = PyObject_Vectorcall(p->normalize, args, 2, NULL);
        Py_DECREF(id);
        Py_DECREF(form);
        if (!id2) {
            goto error;
        }

        // Defensive: normalize() is Python code and could return anything.
        if (!PyUnicode_Check(id2))
        {
            PyErr_Format(PyExc_TypeError,
                         "unicodedata.normalize() must return a string, not "
                         "%.200s",
                         _PyType_Name(Py_TYPE(id2)));
            Py_DECREF(id2);
            goto error;
        }
        id = id2;
    }
    // Intern (immortalize) the identifier, then hand lifetime management
    // to the arena so AST nodes can borrow it safely.
    PyInterpreterState *interp = _PyInterpreterState_GET();
    _PyUnicode_InternImmortal(interp, &id);
    if (_PyArena_AddPyObject(p->arena, id) < 0)
    {
        Py_DECREF(id);
        goto error;
    }
    return id;

error:
    p->error_indicator = 1;
    return NULL;
}
561 
562 static expr_ty
_PyPegen_name_from_token(Parser * p,Token * t)563 _PyPegen_name_from_token(Parser *p, Token* t)
564 {
565     if (t == NULL) {
566         return NULL;
567     }
568     const char *s = PyBytes_AsString(t->bytes);
569     if (!s) {
570         p->error_indicator = 1;
571         return NULL;
572     }
573     PyObject *id = _PyPegen_new_identifier(p, s);
574     if (id == NULL) {
575         p->error_indicator = 1;
576         return NULL;
577     }
578     return _PyAST_Name(id, Load, t->lineno, t->col_offset, t->end_lineno,
579                        t->end_col_offset, p->arena);
580 }
581 
582 expr_ty
_PyPegen_name_token(Parser * p)583 _PyPegen_name_token(Parser *p)
584 {
585     Token *t = _PyPegen_expect_token(p, NAME);
586     return _PyPegen_name_from_token(p, t);
587 }
588 
589 void *
_PyPegen_string_token(Parser * p)590 _PyPegen_string_token(Parser *p)
591 {
592     return _PyPegen_expect_token(p, STRING);
593 }
594 
_PyPegen_soft_keyword_token(Parser * p)595 expr_ty _PyPegen_soft_keyword_token(Parser *p) {
596     Token *t = _PyPegen_expect_token(p, NAME);
597     if (t == NULL) {
598         return NULL;
599     }
600     char *the_token;
601     Py_ssize_t size;
602     PyBytes_AsStringAndSize(t->bytes, &the_token, &size);
603     for (char **keyword = p->soft_keywords; *keyword != NULL; keyword++) {
604         if (strncmp(*keyword, the_token, size) == 0) {
605             return _PyPegen_name_from_token(p, t);
606         }
607     }
608     return NULL;
609 }
610 
static PyObject *
parsenumber_raw(const char *s)
{
    // Convert a numeric literal (no underscores) to int, float, or complex.
    // Returns a new reference, or NULL with an exception set.
    const char *end;
    long x;
    double dx;
    Py_complex compl;
    int imflag;

    assert(s != NULL);
    errno = 0;
    end = s + strlen(s) - 1;
    // Trailing 'j'/'J' marks an imaginary literal.
    imflag = *end == 'j' || *end == 'J';
    if (s[0] == '0') {
        // Leading zero: base prefix (0x/0o/0b) — let strtoul detect it.
        x = (long)PyOS_strtoul(s, (char **)&end, 0);
        if (x < 0 && errno == 0) {
            // Value fit in unsigned long but overflows signed long:
            // fall back to an arbitrary-precision int.
            return PyLong_FromString(s, (char **)0, 0);
        }
    }
    else {
        x = PyOS_strtol(s, (char **)&end, 0);
    }
    if (*end == '\0') {
        // The whole string parsed as an integer.
        if (errno != 0) {
            // Overflowed long: use arbitrary precision.
            return PyLong_FromString(s, (char **)0, 0);
        }
        return PyLong_FromLong(x);
    }
    /* XXX Huge floats may silently fail */
    if (imflag) {
        compl.real = 0.;
        compl.imag = PyOS_string_to_double(s, (char **)&end, NULL);
        if (compl.imag == -1.0 && PyErr_Occurred()) {
            return NULL;
        }
        return PyComplex_FromCComplex(compl);
    }
    dx = PyOS_string_to_double(s, NULL, NULL);
    if (dx == -1.0 && PyErr_Occurred()) {
        return NULL;
    }
    return PyFloat_FromDouble(dx);
}
654 
655 static PyObject *
parsenumber(const char * s)656 parsenumber(const char *s)
657 {
658     char *dup;
659     char *end;
660     PyObject *res = NULL;
661 
662     assert(s != NULL);
663 
664     if (strchr(s, '_') == NULL) {
665         return parsenumber_raw(s);
666     }
667     /* Create a duplicate without underscores. */
668     dup = PyMem_Malloc(strlen(s) + 1);
669     if (dup == NULL) {
670         return PyErr_NoMemory();
671     }
672     end = dup;
673     for (; *s; s++) {
674         if (*s != '_') {
675             *end++ = *s;
676         }
677     }
678     *end = '\0';
679     res = parsenumber_raw(dup);
680     PyMem_Free(dup);
681     return res;
682 }
683 
expr_ty
_PyPegen_number_token(Parser *p)
{
    // Consume a NUMBER token and produce a Constant AST node holding the
    // parsed int/float/complex value (arena-owned). Returns NULL on error
    // with p->error_indicator set.
    Token *t = _PyPegen_expect_token(p, NUMBER);
    if (t == NULL) {
        return NULL;
    }

    const char *num_raw = PyBytes_AsString(t->bytes);
    if (num_raw == NULL) {
        p->error_indicator = 1;
        return NULL;
    }

    // PEP 515 underscores require feature_version >= 3.6.
    if (p->feature_version < 6 && strchr(num_raw, '_') != NULL) {
        p->error_indicator = 1;
        return RAISE_SYNTAX_ERROR("Underscores in numeric literals are only supported "
                                  "in Python 3.6 and greater");
    }

    PyObject *c = parsenumber(num_raw);

    if (c == NULL) {
        p->error_indicator = 1;
        PyThreadState *tstate = _PyThreadState_GET();
        // The only way a ValueError should happen in _this_ code is via
        // PyLong_FromString hitting a length limit.
        if (tstate->current_exception != NULL &&
            Py_TYPE(tstate->current_exception) == (PyTypeObject *)PyExc_ValueError
        ) {
            // Re-raise the int-conversion limit error as a SyntaxError with
            // location info and a hint.
            PyObject *exc = PyErr_GetRaisedException();
            /* Intentionally omitting columns to avoid a wall of 1000s of '^'s
             * on the error message. Nobody is going to overlook their huge
             * numeric literal once given the line. */
            RAISE_ERROR_KNOWN_LOCATION(
                p, PyExc_SyntaxError,
                t->lineno, -1 /* col_offset */,
                t->end_lineno, -1 /* end_col_offset */,
                "%S - Consider hexadecimal for huge integer literals "
                "to avoid decimal conversion limits.",
                exc);
            Py_DECREF(exc);
        }
        return NULL;
    }

    // Hand the value's lifetime to the arena before embedding it in the AST.
    if (_PyArena_AddPyObject(p->arena, c) < 0) {
        Py_DECREF(c);
        p->error_indicator = 1;
        return NULL;
    }

    return _PyAST_Constant(c, NULL, t->lineno, t->col_offset, t->end_lineno,
                           t->end_col_offset, p->arena);
}
739 
740 /* Check that the source for a single input statement really is a single
741    statement by looking at what is left in the buffer after parsing.
742    Trailing whitespace and comments are OK. */
743 static int // bool
bad_single_statement(Parser * p)744 bad_single_statement(Parser *p)
745 {
746     char *cur = p->tok->cur;
747     char c = *cur;
748 
749     for (;;) {
750         while (c == ' ' || c == '\t' || c == '\n' || c == '\014') {
751             c = *++cur;
752         }
753 
754         if (!c) {
755             return 0;
756         }
757 
758         if (c != '#') {
759             return 1;
760         }
761 
762         /* Suck up comment. */
763         while (c && c != '\n') {
764             c = *++cur;
765         }
766     }
767 }
768 
769 static int
compute_parser_flags(PyCompilerFlags * flags)770 compute_parser_flags(PyCompilerFlags *flags)
771 {
772     int parser_flags = 0;
773     if (!flags) {
774         return 0;
775     }
776     if (flags->cf_flags & PyCF_DONT_IMPLY_DEDENT) {
777         parser_flags |= PyPARSE_DONT_IMPLY_DEDENT;
778     }
779     if (flags->cf_flags & PyCF_IGNORE_COOKIE) {
780         parser_flags |= PyPARSE_IGNORE_COOKIE;
781     }
782     if (flags->cf_flags & CO_FUTURE_BARRY_AS_BDFL) {
783         parser_flags |= PyPARSE_BARRY_AS_BDFL;
784     }
785     if (flags->cf_flags & PyCF_TYPE_COMMENTS) {
786         parser_flags |= PyPARSE_TYPE_COMMENTS;
787     }
788     if (flags->cf_flags & PyCF_ALLOW_INCOMPLETE_INPUT) {
789         parser_flags |= PyPARSE_ALLOW_INCOMPLETE_INPUT;
790     }
791     return parser_flags;
792 }
793 
794 // Parser API
795 
Parser *
_PyPegen_Parser_New(struct tok_state *tok, int start_rule, int flags,
                    int feature_version, int *errcode, PyArena *arena)
{
    // Allocate and initialize a Parser. Does NOT take ownership of `tok`,
    // `errcode`, or `arena`; on failure returns NULL with MemoryError set
    // and frees everything allocated so far.
    Parser *p = PyMem_Malloc(sizeof(Parser));
    if (p == NULL) {
        return (Parser *) PyErr_NoMemory();
    }
    assert(tok != NULL);
    tok->type_comments = (flags & PyPARSE_TYPE_COMMENTS) > 0;
    p->tok = tok;
    p->keywords = NULL;
    p->n_keyword_lists = -1;
    p->soft_keywords = NULL;
    // Token array starts with capacity 1 and doubles on demand
    // (see _resize_tokens_array()).
    p->tokens = PyMem_Malloc(sizeof(Token *));
    if (!p->tokens) {
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    p->tokens[0] = PyMem_Calloc(1, sizeof(Token));
    if (!p->tokens[0]) {
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }
    if (!growable_comment_array_init(&p->type_ignore_comments, 10)) {
        PyMem_Free(p->tokens[0]);
        PyMem_Free(p->tokens);
        PyMem_Free(p);
        return (Parser *) PyErr_NoMemory();
    }

    p->mark = 0;
    p->fill = 0;
    p->size = 1;

    p->errcode = errcode;
    p->arena = arena;
    p->start_rule = start_rule;
    p->parsing_started = 0;
    p->normalize = NULL;
    p->error_indicator = 0;

    p->starting_lineno = 0;
    p->starting_col_offset = 0;
    p->flags = flags;
    p->feature_version = feature_version;
    p->known_err_token = NULL;
    p->level = 0;
    p->call_invalid_rules = 0;
#ifdef Py_DEBUG
    p->debug = _Py_GetConfig()->parser_debug;
#endif
    return p;
}
851 
852 void
_PyPegen_Parser_Free(Parser * p)853 _PyPegen_Parser_Free(Parser *p)
854 {
855     Py_XDECREF(p->normalize);
856     for (int i = 0; i < p->size; i++) {
857         PyMem_Free(p->tokens[i]);
858     }
859     PyMem_Free(p->tokens);
860     growable_comment_array_deallocate(&p->type_ignore_comments);
861     PyMem_Free(p);
862 }
863 
864 static void
reset_parser_state_for_error_pass(Parser * p)865 reset_parser_state_for_error_pass(Parser *p)
866 {
867     for (int i = 0; i < p->fill; i++) {
868         p->tokens[i]->memo = NULL;
869     }
870     p->mark = 0;
871     p->call_invalid_rules = 1;
872     // Don't try to get extra tokens in interactive mode when trying to
873     // raise specialized errors in the second pass.
874     p->tok->interactive_underflow = IUNDERFLOW_STOP;
875 }
876 
877 static inline int
_is_end_of_source(Parser * p)878 _is_end_of_source(Parser *p) {
879     int err = p->tok->done;
880     return err == E_EOF || err == E_EOFS || err == E_EOLS;
881 }
882 
void *
_PyPegen_run_parser(Parser *p)
{
    // Drive the generated parser. On failure, optionally run a second,
    // slower pass to produce a better SyntaxError. Returns the AST root
    // or NULL with an exception set.
    void *res = _PyPegen_parse(p);
    assert(p->level == 0);
    if (res == NULL) {
        // Incomplete-input mode (e.g. codeop): report IncompleteInputError
        // instead of a SyntaxError when we simply ran out of source.
        if ((p->flags & PyPARSE_ALLOW_INCOMPLETE_INPUT) &&  _is_end_of_source(p)) {
            PyErr_Clear();
            return _PyPegen_raise_error(p, PyExc_IncompleteInputError, 0, "incomplete input");
        }
        // Non-syntax errors (e.g. MemoryError) propagate as-is.
        if (PyErr_Occurred() && !PyErr_ExceptionMatches(PyExc_SyntaxError)) {
            return NULL;
        }
       // Make a second parser pass. In this pass we activate heavier and slower checks
        // to produce better error messages and more complete diagnostics. Extra "invalid_*"
        // rules will be active during parsing.
        Token *last_token = p->tokens[p->fill - 1];
        reset_parser_state_for_error_pass(p);
        _PyPegen_parse(p);

        // Set SyntaxErrors accordingly depending on the parser/tokenizer status at the failure
        // point.
        _Pypegen_set_syntax_error(p, last_token);
       return NULL;
    }

    if (p->start_rule == Py_single_input && bad_single_statement(p)) {
        p->tok->done = E_BADSINGLE; // This is not necessary for now, but might be in the future
        return RAISE_SYNTAX_ERROR("multiple statements found while compiling a single statement");
    }

    // test_peg_generator defines _Py_TEST_PEGEN to not call PyAST_Validate()
#if defined(Py_DEBUG) && !defined(_Py_TEST_PEGEN)
    if (p->start_rule == Py_single_input ||
        p->start_rule == Py_file_input ||
        p->start_rule == Py_eval_input)
    {
        if (!_PyAST_Validate(res)) {
            return NULL;
        }
    }
#endif
    return res;
}
927 
928 mod_ty
_PyPegen_run_parser_from_file_pointer(FILE * fp,int start_rule,PyObject * filename_ob,const char * enc,const char * ps1,const char * ps2,PyCompilerFlags * flags,int * errcode,PyObject ** interactive_src,PyArena * arena)929 _PyPegen_run_parser_from_file_pointer(FILE *fp, int start_rule, PyObject *filename_ob,
930                              const char *enc, const char *ps1, const char *ps2,
931                              PyCompilerFlags *flags, int *errcode,
932                              PyObject **interactive_src, PyArena *arena)
933 {
934     struct tok_state *tok = _PyTokenizer_FromFile(fp, enc, ps1, ps2);
935     if (tok == NULL) {
936         if (PyErr_Occurred()) {
937             _PyPegen_raise_tokenizer_init_error(filename_ob);
938             return NULL;
939         }
940         return NULL;
941     }
942     if (!tok->fp || ps1 != NULL || ps2 != NULL ||
943         PyUnicode_CompareWithASCIIString(filename_ob, "<stdin>") == 0) {
944         tok->fp_interactive = 1;
945     }
946     // This transfers the ownership to the tokenizer
947     tok->filename = Py_NewRef(filename_ob);
948 
949     // From here on we need to clean up even if there's an error
950     mod_ty result = NULL;
951 
952     int parser_flags = compute_parser_flags(flags);
953     Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, PY_MINOR_VERSION,
954                                     errcode, arena);
955     if (p == NULL) {
956         goto error;
957     }
958 
959     result = _PyPegen_run_parser(p);
960     _PyPegen_Parser_Free(p);
961 
962     if (tok->fp_interactive && tok->interactive_src_start && result && interactive_src != NULL) {
963         *interactive_src = PyUnicode_FromString(tok->interactive_src_start);
964         if (!interactive_src || _PyArena_AddPyObject(arena, *interactive_src) < 0) {
965             Py_XDECREF(interactive_src);
966             result = NULL;
967             goto error;
968         }
969     }
970 
971 error:
972     _PyTokenizer_Free(tok);
973     return result;
974 }
975 
mod_ty
_PyPegen_run_parser_from_string(const char *str, int start_rule, PyObject *filename_ob,
                       PyCompilerFlags *flags, PyArena *arena)
{
    // Parse source held in a C string. Returns the module AST (arena-owned)
    // or NULL with an exception set.
    // exec (file) input gets the "type comments / newline at EOF" handling.
    int exec_input = start_rule == Py_file_input;

    struct tok_state *tok;
    if (flags != NULL && flags->cf_flags & PyCF_IGNORE_COOKIE) {
        // Caller guarantees str is already UTF-8; skip coding-cookie detection.
        tok = _PyTokenizer_FromUTF8(str, exec_input, 0);
    } else {
        tok = _PyTokenizer_FromString(str, exec_input, 0);
    }
    if (tok == NULL) {
        if (PyErr_Occurred()) {
            _PyPegen_raise_tokenizer_init_error(filename_ob);
        }
        return NULL;
    }
    // This transfers the ownership to the tokenizer
    tok->filename = Py_NewRef(filename_ob);

    // We need to clear up from here on
    mod_ty result = NULL;

    int parser_flags = compute_parser_flags(flags);
    // ast.parse() can request an older grammar via cf_feature_version.
    int feature_version = flags && (flags->cf_flags & PyCF_ONLY_AST) ?
        flags->cf_feature_version : PY_MINOR_VERSION;
    Parser *p = _PyPegen_Parser_New(tok, start_rule, parser_flags, feature_version,
                                    NULL, arena);
    if (p == NULL) {
        goto error;
    }

    result = _PyPegen_run_parser(p);
    _PyPegen_Parser_Free(p);

error:
    _PyTokenizer_Free(tok);
    return result;
}
1016